diff --git a/backend/.env.dist.composed b/backend/.env.dist.composed index 0bf0dd50d7..b0af8d7a05 100644 --- a/backend/.env.dist.composed +++ b/backend/.env.dist.composed @@ -35,3 +35,6 @@ CROWD_PACKAGES_DB_PORT=5432 CROWD_PACKAGES_DB_USERNAME=postgres CROWD_PACKAGES_DB_PASSWORD=example CROWD_PACKAGES_DB_DATABASE=packages-db + +# security-contacts-worker +SECURITY_CONTACTS_USER_AGENT="lfx-security-contacts-worker" diff --git a/backend/.env.dist.local b/backend/.env.dist.local index bcba7bc5d3..adca4976c3 100755 --- a/backend/.env.dist.local +++ b/backend/.env.dist.local @@ -185,6 +185,9 @@ ENRICHER_BATCH_SIZE=100 ENRICHER_REPO_UPDATE_INTERVAL_HOURS=24 ENRICHER_IDLE_SLEEP_SEC=60 +# security-contacts-worker +SECURITY_CONTACTS_USER_AGENT="lfx-security-contacts-worker" + OSSPCKGS_GCP_PROJECT=local-dev OSSPCKGS_GCS_BUCKET=local-dev OSSPCKGS_GCP_CREDENTIALS_B64=e30= diff --git a/backend/src/osspckgs/migrations/V1782950400__security_contacts.sql b/backend/src/osspckgs/migrations/V1782950400__security_contacts.sql new file mode 100644 index 0000000000..fc1d62410b --- /dev/null +++ b/backend/src/osspckgs/migrations/V1782950400__security_contacts.sql @@ -0,0 +1,27 @@ +CREATE TABLE IF NOT EXISTS security_contacts ( + id BIGSERIAL PRIMARY KEY, + repo_id BIGINT NOT NULL REFERENCES repos(id) ON DELETE CASCADE, + channel TEXT NOT NULL, + value TEXT NOT NULL, + role TEXT NOT NULL, + name TEXT, + score NUMERIC(4,3) NOT NULL, + confidence TEXT NOT NULL, + provenance JSONB NOT NULL DEFAULT '[]', + last_refreshed TIMESTAMPTZ NOT NULL DEFAULT NOW(), + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW() +); + +CREATE UNIQUE INDEX IF NOT EXISTS security_contacts_repo_channel_value_uq + ON security_contacts (repo_id, channel, value); + +CREATE INDEX IF NOT EXISTS security_contacts_repo_confidence_idx + ON security_contacts (repo_id, confidence); + +ALTER TABLE repos ADD COLUMN IF NOT EXISTS pvr_enabled BOOL; +ALTER TABLE repos ADD COLUMN IF NOT EXISTS security_policy_url TEXT; +ALTER TABLE repos ADD COLUMN IF NOT EXISTS vulnerability_reporting_url TEXT; +ALTER TABLE repos ADD COLUMN IF NOT EXISTS bug_bounty_url TEXT; +ALTER TABLE repos ADD COLUMN IF NOT EXISTS security_txt_url TEXT; +ALTER TABLE repos ADD COLUMN IF NOT EXISTS contacts_last_refreshed TIMESTAMPTZ; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index ec3949f97c..65ea4b191d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1345,6 +1345,9 @@ importers: fast-xml-parser: specifier: ^5.8.0 version: 5.8.0 + js-yaml: + specifier: ^4.1.1 + version: 4.1.1 jsonwebtoken: specifier: ^9.0.0 version: 9.0.3 @@ -1367,6 +1370,9 @@ importers: specifier: ^0.12.3 version: 0.12.3 devDependencies: + '@types/js-yaml': + specifier: ^4.0.9 + version: 4.0.9 '@types/jsonwebtoken': specifier: ^9.0.0 version: 9.0.6 @@ -7616,10 +7622,6 @@ packages: resolution: {integrity: sha512-PMSmkqxr106Xa156c2M265Z+FTrPl+oxd/rgOQy2tijQeK5TxQ43psO1ZCwhVOSdnn+RzkzlRz/eY4BgJBYVpg==} hasBin: true - js-yaml@4.1.0: - resolution: {integrity: sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==} - hasBin: true - js-yaml@4.1.1: resolution: {integrity: sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==} hasBin: true @@ -16044,7 +16046,7 @@ snapshots: imurmurhash: 0.1.4 is-glob: 4.0.3 is-path-inside: 3.0.3 - js-yaml: 4.1.0 + js-yaml: 4.1.1 json-stable-stringify-without-jsonify: 1.0.1 levn: 0.4.1 lodash.merge: 4.6.2 @@ -17279,10 +17281,6 @@ snapshots: argparse: 1.0.10 esprima: 4.0.1 - js-yaml@4.1.0: - dependencies: - argparse: 2.0.1 - js-yaml@4.1.1: dependencies: argparse: 2.0.1 diff --git a/scripts/builders/packages.env b/scripts/builders/packages.env index 191e44ea95..07536c5ef9 100644 --- a/scripts/builders/packages.env +++ b/scripts/builders/packages.env @@ -1,4 +1,4 @@ DOCKERFILE="./services/docker/Dockerfile.packages" CONTEXT="../" REPO="sjc.ocir.io/axbydjxa5zuh/packages" -SERVICES="github-repos-enricher bq-dataset-ingest npm-worker maven-worker osv-worker dockerhub-sync cargo-worker go-worker nuget-worker" +SERVICES="github-repos-enricher bq-dataset-ingest npm-worker maven-worker osv-worker dockerhub-sync cargo-worker go-worker nuget-worker security-contacts-worker" diff --git a/scripts/services/security-contacts-worker.yaml b/scripts/services/security-contacts-worker.yaml new file mode 100644 index 0000000000..934afe2c69 --- /dev/null +++ b/scripts/services/security-contacts-worker.yaml @@ -0,0 +1,65 @@ +version: '3.1' + +x-env-args: &env-args + DOCKER_BUILDKIT: 1 + NODE_ENV: docker + SERVICE: security-contacts-worker + SHELL: /bin/sh + SUPPRESS_NO_CONFIG_WARNING: 'true' + CROWD_TEMPORAL_TASKQUEUE: packages-worker + +services: + security-contacts-worker: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages + command: 'pnpm run start:security-contacts-worker' + working_dir: /usr/crowd/app/services/apps/packages_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + restart: always + networks: + - crowd-bridge + + security-contacts-worker-dev: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.packages + command: 'pnpm run dev:security-contacts-worker' + working_dir: /usr/crowd/app/services/apps/packages_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + hostname: security-contacts-worker + networks: + - crowd-bridge + volumes: + - ../../services/libs/audit-logs/src:/usr/crowd/app/services/libs/audit-logs/src + - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src + - ../../services/libs/common_services/src:/usr/crowd/app/services/libs/common_services/src + - ../../services/libs/data-access-layer/src:/usr/crowd/app/services/libs/data-access-layer/src + - ../../services/libs/database/src:/usr/crowd/app/services/libs/database/src + - ../../services/libs/integrations/src:/usr/crowd/app/services/libs/integrations/src + - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src + - ../../services/libs/nango/src:/usr/crowd/app/services/libs/nango/src + - ../../services/libs/opensearch/src:/usr/crowd/app/services/libs/opensearch/src + - ../../services/libs/queue/src:/usr/crowd/app/services/libs/queue/src + - ../../services/libs/redis/src:/usr/crowd/app/services/libs/redis/src + - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src + - ../../services/libs/telemetry/src:/usr/crowd/app/services/libs/telemetry/src + - ../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src + - ../../services/libs/types/src:/usr/crowd/app/services/libs/types/src + - ../../services/apps/packages_worker/src:/usr/crowd/app/services/apps/packages_worker/src + +networks: + crowd-bridge: + external: true diff --git a/services/apps/packages_worker/package.json b/services/apps/packages_worker/package.json index 88351e70ec..1d3f345578 100644 --- a/services/apps/packages_worker/package.json +++ b/services/apps/packages_worker/package.json @@ -40,6 +40,9 @@ "start:go-worker": "CROWD_TEMPORAL_TASKQUEUE=go-worker SERVICE=go-worker tsx src/bin/go-worker.ts", "dev:go-worker": "CROWD_TEMPORAL_TASKQUEUE=go-worker SERVICE=go-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9241 src/bin/go-worker.ts", "dev:go-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=go-worker SERVICE=go-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9241 src/bin/go-worker.ts", + "start:security-contacts-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker SERVICE=security-contacts-worker tsx src/bin/security-contacts-worker.ts", + "dev:security-contacts-worker": "CROWD_TEMPORAL_TASKQUEUE=packages-worker SERVICE=security-contacts-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9243 src/bin/security-contacts-worker.ts", + "dev:security-contacts-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=packages-worker SERVICE=security-contacts-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9243 src/bin/security-contacts-worker.ts", "start:nuget-worker": "CROWD_TEMPORAL_TASKQUEUE=nuget-worker SERVICE=nuget-worker tsx src/bin/nuget-worker.ts", "dev:nuget-worker": "CROWD_TEMPORAL_TASKQUEUE=nuget-worker SERVICE=nuget-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9242 src/bin/nuget-worker.ts", "dev:nuget-worker:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=nuget-worker SERVICE=nuget-worker LOG_LEVEL=trace nodemon --watch src --watch ../../libs --ext ts --exec tsx --inspect=0.0.0.0:9242 src/bin/nuget-worker.ts", @@ -72,6 +75,7 @@ "@temporalio/activity": "~1.17.2", "@temporalio/client": "~1.17.2", "@temporalio/workflow": "~1.17.2", + "js-yaml": "^4.1.1", "jsonwebtoken": "^9.0.0", "semver": "^7.6.0", "axios": "^1.16.1", @@ -83,6 +87,7 @@ "unzipper": "^0.12.3" }, "devDependencies": { + "@types/js-yaml": "^4.0.9", "@types/jsonwebtoken": "^9.0.0", "@types/node": "^20.8.2", "@types/semver": "^7.5.8", diff --git a/services/apps/packages_worker/src/activities.ts b/services/apps/packages_worker/src/activities.ts index aa564ce3fa..01791b31bc 100644 --- a/services/apps/packages_worker/src/activities.ts +++ b/services/apps/packages_worker/src/activities.ts @@ -26,3 +26,4 @@ export { } from './cargo/activities' export { enrichGoVersionsBatch, enrichGoStatusBatch } from './go/activities' export { processNuGetBatch } from './nuget/activities' +export { processSecurityContactsBatch } from './security-contacts/activities' diff --git a/services/apps/packages_worker/src/bin/security-contacts-worker.ts b/services/apps/packages_worker/src/bin/security-contacts-worker.ts new file mode 100644 index 0000000000..387b0a7c1d --- /dev/null +++ b/services/apps/packages_worker/src/bin/security-contacts-worker.ts @@ -0,0 +1,8 @@ +import { scheduleSecurityContactsIngestion } from '../security-contacts/schedule' +import { svc } from '../service' + +setImmediate(async () => { + await svc.init() + await scheduleSecurityContactsIngestion() + await svc.start() +}) diff --git a/services/apps/packages_worker/src/config.ts b/services/apps/packages_worker/src/config.ts index ab5de33808..cc5307f161 100644 --- a/services/apps/packages_worker/src/config.ts +++ b/services/apps/packages_worker/src/config.ts @@ -46,6 +46,13 @@ export function getEnricherConfig() { } } +export function getSecurityContactsConfig() { + return { + // Sent on all registry calls; crates.io rejects requests without an identifying UA. + userAgent: requireEnv('SECURITY_CONTACTS_USER_AGENT'), + } +} + export function getMavenConfig() { return { batchSize: requireEnvInt('MAVEN_FETCHER_BATCH_SIZE'), diff --git a/services/apps/packages_worker/src/enricher/installationPool.ts b/services/apps/packages_worker/src/enricher/installationPool.ts new file mode 100644 index 0000000000..7ce35016fd --- /dev/null +++ b/services/apps/packages_worker/src/enricher/installationPool.ts @@ -0,0 +1,56 @@ +import { getServiceChildLogger } from '@crowd/logging' + +const log = getServiceChildLogger('installation-pool') + +// Park an installation before GitHub starts rejecting — avoids a failed request + requeue +const PROACTIVE_PARK_REMAINING = 50 + +/** Round-robins over installations, skipping ones parked until their rate-limit reset. */ +export class InstallationPool { + private readonly parkedUntil = new Map() + private roundRobinIdx = 0 + + constructor(private readonly ids: number[]) {} + + select(): { installationId: number; waitMs: number } { + const now = Date.now() + const n = this.ids.length + + for (let i = 0; i < n; i++) { + const idx = (this.roundRobinIdx + i) % n + const id = this.ids[idx] + if ((this.parkedUntil.get(id) ?? 0) <= now) { + this.roundRobinIdx = (idx + 1) % n + return { installationId: id, waitMs: 0 } + } + } + + let soonestReset = Infinity + let soonestId = this.ids[0] + for (const id of this.ids) { + const reset = this.parkedUntil.get(id) ?? 0 + if (reset < soonestReset) { + soonestReset = reset + soonestId = id + } + } + return { installationId: soonestId, waitMs: Math.max(1_000, soonestReset - now) } + } + + park(installationId: number, untilMs: number): void { + this.parkedUntil.set(installationId, untilMs) + } + + parkIfBudgetLow( + installationId: number, + remaining: number | null | undefined, + resetAt: string | null | undefined, + ): void { + if (remaining == null || resetAt == null || remaining >= PROACTIVE_PARK_REMAINING) return + this.park(installationId, new Date(resetAt).getTime() + 5_000) + log.info( + { installationId, remaining, resetAt }, + 'Budget low — proactively parking installation', + ) + } +} diff --git a/services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts b/services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts index 0085e0bff9..bb68a380b9 100644 --- a/services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts +++ b/services/apps/packages_worker/src/enricher/runEnrichmentLoop.ts @@ -6,6 +6,7 @@ import { getEnricherConfig } from '../config' import { fetchActivitySnapshot } from './fetchActivitySnapshot' import { fetchLightRepo, parseGithubUrl } from './fetchLightRepo' import { GithubAppConfig, getInstallationToken } from './githubAppAuth' +import { InstallationPool } from './installationPool' import { FetchError, LightRepoResult, RepoActivitySnapshot } from './types' import { bulkUpdateEnrichedRepos, markReposSkipped } from './updateEnrichedRepos' import { bulkUpsertRepoActivitySnapshot } from './updateRepoActivitySnapshot' @@ -17,65 +18,11 @@ const DB_FETCH_SIZE = 2000 const WRITE_FLUSH_SIZE = 500 const WRITE_FLUSH_MS = 5000 const MAX_FLUSH_FAILURES = 3 -// Park an installation before GitHub starts rejecting — avoids a failed request + requeue -const PROACTIVE_PARK_REMAINING = 50 // Rate-limited snapshots retry once with another installation before being skipped const SNAPSHOT_RATE_LIMIT_RETRIES = 1 // Installations whose token mint fails (e.g. org IP allowlist) sit out for an hour const MINT_FAILURE_PARK_MS = 60 * 60 * 1000 -// ─── Installation pool ──────────────────────────────────────────────────────── - -/** Round-robins over installations, skipping ones parked until their rate-limit reset. */ -class InstallationPool { - private readonly parkedUntil = new Map() - private roundRobinIdx = 0 - - constructor(private readonly ids: number[]) {} - - select(): { installationId: number; waitMs: number } { - const now = Date.now() - const n = this.ids.length - - for (let i = 0; i < n; i++) { - const idx = (this.roundRobinIdx + i) % n - const id = this.ids[idx] - if ((this.parkedUntil.get(id) ?? 0) <= now) { - this.roundRobinIdx = (idx + 1) % n - return { installationId: id, waitMs: 0 } - } - } - - let soonestReset = Infinity - let soonestId = this.ids[0] - for (const id of this.ids) { - const reset = this.parkedUntil.get(id) ?? 0 - if (reset < soonestReset) { - soonestReset = reset - soonestId = id - } - } - return { installationId: soonestId, waitMs: Math.max(1_000, soonestReset - now) } - } - - park(installationId: number, untilMs: number): void { - this.parkedUntil.set(installationId, untilMs) - } - - parkIfBudgetLow( - installationId: number, - remaining: number | null | undefined, - resetAt: string | null | undefined, - ): void { - if (remaining == null || resetAt == null || remaining >= PROACTIVE_PARK_REMAINING) return - this.park(installationId, new Date(resetAt).getTime() + 5_000) - log.info( - { installationId, remaining, resetAt }, - 'Budget low — proactively parking installation', - ) - } -} - // ─── Fetch with retries ─────────────────────────────────────────────────────── type FetchOutcome = diff --git a/services/apps/packages_worker/src/security-contacts/activities.ts b/services/apps/packages_worker/src/security-contacts/activities.ts new file mode 100644 index 0000000000..e6be0fecde --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/activities.ts @@ -0,0 +1,17 @@ +import { getServiceChildLogger } from '@crowd/logging' + +import { getSecurityContactsConfig } from '../config' +import { getPackagesDb } from '../db' + +import { BatchResult, processBatch } from './processBatch' + +const log = getServiceChildLogger('security-contacts-activity') + +export async function processSecurityContactsBatch(): Promise { + const config = getSecurityContactsConfig() + const qx = await getPackagesDb() + + const result = await processBatch(qx, config) + log.info({ ...result }, 'Security contacts batch activity complete') + return result +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/http.ts b/services/apps/packages_worker/src/security-contacts/extractors/http.ts new file mode 100644 index 0000000000..8f1c625e20 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/http.ts @@ -0,0 +1,87 @@ +// crates.io rejects requests without a descriptive User-Agent (HTTP 403); harmless elsewhere. +// https://crates.io/policies#crawlers +export function registryHeaders(userAgent: string): Record { + return { 'User-Agent': userAgent } +} + +// Genuinely-absent / not-determinable → null body; every other non-200 throws so transient +// failures (5xx/...) are treated as failures and the pipeline preserves data instead of +// wiping it. 422 is included because GitHub's PVR endpoint returns it (per-repo, non-transient) +// when the flag can't be determined — that must read as "unknown", not block the whole repo. +const ABSENT_STATUSES = new Set([404, 410, 422]) + +// Registry rate-limit / overload responses: retried in-process (honoring Retry-After) so a brief +// throttle doesn't fail the extractor and cost the repo a whole refresh cadence. +const RATE_LIMIT_STATUSES = new Set([429, 503]) +const MAX_RATE_LIMIT_RETRIES = 3 + +async function fetchWithRetry( + url: string, + timeoutMs: number, + headers: Record, +): Promise { + for (let attempt = 0; ; attempt++) { + const controller = new AbortController() + const timeoutId = setTimeout(() => controller.abort(), timeoutMs) + let res: Response + try { + res = await fetch(url, { headers, signal: controller.signal }) + } finally { + clearTimeout(timeoutId) + } + if (!RATE_LIMIT_STATUSES.has(res.status) || attempt >= MAX_RATE_LIMIT_RETRIES) return res + const retryAfterSec = parseInt(res.headers.get('retry-after') ?? '0', 10) + const waitMs = retryAfterSec ? retryAfterSec * 1000 : Math.min(30_000, 1_000 * 2 ** attempt) + await new Promise((r) => setTimeout(r, waitMs)) + } +} + +export interface FetchTextResult { + status: number + text: string | null +} + +export async function fetchText( + url: string, + timeoutMs: number, + headers: Record = {}, +): Promise { + const res = await fetchWithRetry(url, timeoutMs, headers) + if (res.status === 200) return { status: 200, text: await res.text() } + if (ABSENT_STATUSES.has(res.status)) return { status: res.status, text: null } + throw new Error(`fetchText ${url} failed: HTTP ${res.status}`) +} + +export interface FetchJsonResult { + status: number + json: unknown | null +} + +export async function fetchJson( + url: string, + timeoutMs: number, + headers: Record = {}, +): Promise { + const res = await fetchWithRetry(url, timeoutMs, headers) + if (res.status === 200) return { status: 200, json: await res.json() } + if (ABSENT_STATUSES.has(res.status)) return { status: res.status, json: null } + throw new Error(`fetchJson ${url} failed: HTTP ${res.status}`) +} + +const EMAIL_RE = /^[^@\s]+@[^@\s]+\.[^@\s]+$/ +const EMAIL_GLOBAL_RE = /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/g + +export function isEmail(value: string): boolean { + return EMAIL_RE.test(value.trim()) +} + +// Pulls all email addresses out of free-form strings like "Name , Name2 ". +export function extractEmails(value: string): string[] { + return value.match(EMAIL_GLOBAL_RE) ?? [] +} + +/** Returns the login if the URL is a bare github profile (github.com/), else null. */ +export function githubHandleFromUrl(value: string): string | null { + const m = value.trim().match(/^https?:\/\/github\.com\/([^/\s]+)\/?$/i) + return m ? m[1] : null +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts b/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts new file mode 100644 index 0000000000..b573a2cdf1 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/pvr.ts @@ -0,0 +1,53 @@ +import { parseGithubUrl } from '../../enricher/fetchLightRepo' +import { Extractor, ExtractorResult } from '../types' + +const SOURCE = 'pvr' + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +export function mapPvr( + body: unknown, + owner: string, + name: string, + fetchedAt: string, +): ExtractorResult { + const enabled = (body as any)?.enabled + + if (typeof enabled !== 'boolean') return { contacts: [], policies: {} } + if (!enabled) return { contacts: [], policies: { pvrEnabled: false } } + + const url = `https://github.com/${owner}/${name}/security/advisories/new` + return { + contacts: [ + { + channel: 'github-pvr', + value: url, + role: 'security-team', + tier: 'A', + provenance: [{ source: SOURCE, sourceTier: 'A', fetchedAt }], + }, + ], + policies: { pvrEnabled: true, vulnerabilityReportingUrl: url }, + } +} + +export const extractPvr: Extractor = async (target, deps) => { + // GitHub's PVR endpoint rejects archived (and private) repos with 422; skip the call for + // known-archived repos. Non-archived-yet-unknown repos still get the 422→unknown safety net. + if (target.archived) return { contacts: [], policies: {} } + + let owner: string + let name: string + try { + ;({ owner, name } = parseGithubUrl(target.url)) + } catch { + return { contacts: [], policies: {} } + } + + const { text } = await deps.githubGet( + `/repos/${owner}/${name}/private-vulnerability-reporting`, + ) + if (!text) return { contacts: [], policies: {} } + + return mapPvr(JSON.parse(text), owner, name, new Date().toISOString()) +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/cargo.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/cargo.ts new file mode 100644 index 0000000000..080a383643 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/cargo.ts @@ -0,0 +1,60 @@ +import { ExtractorResult, ProvenanceEntry, RawContact } from '../../types' +import { fetchJson, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'crates.io' + +// crates.io policy: max 1 request/second. Serialize calls process-wide. +// https://crates.io/policies#crawlers +let nextAllowedAt = 0 +async function throttle(): Promise { + const now = Date.now() + const waitMs = Math.max(0, nextAllowedAt - now) + nextAllowedAt = Math.max(now, nextAllowedAt) + 1000 + if (waitMs > 0) await new Promise((r) => setTimeout(r, waitMs)) +} + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +export function mapCargoOwners(doc: unknown, sourceUrl: string, fetchedAt: string): RawContact[] { + if (!doc || typeof doc !== 'object') return [] + const users = (doc as any).users + if (!Array.isArray(users)) return [] + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt }, + ] + const contacts: RawContact[] = [] + const seen = new Set() + for (const u of users) { + const login = u?.login + // crates.io logins like "github:org:team" are teams, not personal handles. + if (typeof login !== 'string' || login.includes(':')) continue + const key = login.toLowerCase() + if (seen.has(key)) continue + seen.add(key) + contacts.push({ + channel: 'github-handle', + value: login, + name: typeof u.name === 'string' ? u.name : undefined, + role: 'maintainer', + tier: 'B', + provenance: prov(), + }) + } + return contacts +} + +export async function fetchCargo( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + // crates.io does not expose author emails; owners give GitHub handles. + await throttle() + const url = `https://crates.io/api/v1/crates/${encodeURIComponent(parsed.name)}/owners` + const { json } = await fetchJson(url, timeoutMs, registryHeaders(userAgent)) + if (!json) return { contacts: [], policies: {} } + return { contacts: mapCargoOwners(json, url, new Date().toISOString()), policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/composer.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/composer.ts new file mode 100644 index 0000000000..3f5a72d506 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/composer.ts @@ -0,0 +1,72 @@ +import { ExtractorResult, ProvenanceEntry, RawContact, RepoPolicies } from '../../types' +import { fetchJson, isEmail, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'packagist' + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +export function mapComposer( + doc: unknown, + fullName: string, + sourceUrl: string, + fetchedAt: string, +): ExtractorResult { + const contacts: RawContact[] = [] + const policies: Partial = {} + if (!doc || typeof doc !== 'object') return { contacts, policies } + + const versions = (doc as any).packages?.[fullName] + const latest = Array.isArray(versions) ? versions[0] : undefined + if (!latest || typeof latest !== 'object') return { contacts, policies } + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt }, + ] + const seen = new Set() + const add = ( + channel: RawContact['channel'], + value: string, + role: RawContact['role'], + name?: string, + ): void => { + const key = `${channel}:${value.toLowerCase()}` + if (seen.has(key)) return + seen.add(key) + contacts.push({ channel, value, name, role, tier: 'B', provenance: prov() }) + } + + for (const a of Array.isArray(latest.authors) ? latest.authors : []) { + if (typeof a?.email === 'string' && isEmail(a.email)) { + add('email', a.email, 'maintainer', typeof a.name === 'string' ? a.name : undefined) + } + } + + const support = latest.support + if (support && typeof support === 'object') { + // support.security is Composer's dedicated security channel. + if (typeof support.security === 'string' && /^https?:\/\//i.test(support.security)) { + policies.securityPolicyUrl = support.security + add('url', support.security, 'security-team') + } + if (typeof support.email === 'string' && isEmail(support.email)) { + add('email', support.email, 'security-team') + } + } + + return { contacts, policies } +} + +export async function fetchComposer( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + if (!parsed.namespace) return { contacts: [], policies: {} } + const fullName = `${parsed.namespace}/${parsed.name}` + const url = `https://repo.packagist.org/p2/${fullName}.json` + const { json } = await fetchJson(url, timeoutMs, registryHeaders(userAgent)) + if (!json) return { contacts: [], policies: {} } + return mapComposer(json, fullName, url, new Date().toISOString()) +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/index.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/index.ts new file mode 100644 index 0000000000..42b83cf330 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/index.ts @@ -0,0 +1,59 @@ +import { Extractor, ExtractorResult, RawContact, RepoPolicies } from '../../types' + +import { fetchCargo } from './cargo' +import { fetchComposer } from './composer' +import { fetchMaven } from './maven' +import { fetchNpm } from './npm' +import { fetchNuget } from './nuget' +import { ParsedPurl, parsePurl } from './purl' +import { fetchPypi } from './pypi' +import { fetchRubygems } from './rubygems' + +type EcosystemFetcher = ( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +) => Promise + +// Keyed by the lowercased packages.ecosystem value. go has no package-manifest contacts. +const FETCHERS: Record = { + npm: fetchNpm, + pypi: fetchPypi, + maven: fetchMaven, + cargo: fetchCargo, + nuget: fetchNuget, + rubygems: fetchRubygems, + composer: fetchComposer, +} + +export const extractManifest: Extractor = async (target, deps) => { + const contacts: RawContact[] = [] + const policies: Partial = {} + const seenPurls = new Set() + + for (const pkg of target.packages) { + if (seenPurls.has(pkg.purl)) continue + seenPurls.add(pkg.purl) + + const fetcher = FETCHERS[pkg.ecosystem?.toLowerCase()] + if (!fetcher) continue + + const parsed = parsePurl(pkg.purl) + if (!parsed) continue + + try { + const result = await fetcher(parsed, deps.fetchTimeoutMs, deps.userAgent) + contacts.push(...result.contacts) + for (const [key, value] of Object.entries(result.policies)) { + if (!(policies as Record)[key] && value != null) { + ;(policies as Record)[key] = value + } + } + } catch { + // one bad package must not sink the rest + continue + } + } + + return { contacts, policies } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/maven.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/maven.ts new file mode 100644 index 0000000000..2a268d326a --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/maven.ts @@ -0,0 +1,90 @@ +import { XMLParser } from 'fast-xml-parser' + +import { ExtractorResult, ProvenanceEntry, RawContact } from '../../types' +import { fetchText, isEmail, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'maven-pom' +const BASE = 'https://repo1.maven.org/maven2' +const parser = new XMLParser({ ignoreAttributes: true }) + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +function asArray(x: T | T[] | undefined): T[] { + if (x === undefined || x === null) return [] + return Array.isArray(x) ? x : [x] +} + +export function mapMavenPom(xml: string, sourceUrl: string, fetchedAt: string): RawContact[] { + let doc: any + try { + doc = parser.parse(xml) + } catch { + return [] + } + const project = doc?.project + if (!project) return [] + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt }, + ] + const contacts: RawContact[] = [] + const seen = new Set() + for (const dev of asArray(project.developers?.developer)) { + const email = dev?.email + if (typeof email !== 'string' || !isEmail(email)) continue + const key = email.toLowerCase() + if (seen.has(key)) continue + seen.add(key) + contacts.push({ + channel: 'email', + value: email, + name: typeof dev.name === 'string' ? dev.name : undefined, + role: 'maintainer', + tier: 'B', + provenance: prov(), + }) + } + return contacts +} + +async function resolveVersion( + groupPath: string, + artifact: string, + timeoutMs: number, + userAgent: string, +): Promise { + const url = `${BASE}/${groupPath}/${artifact}/maven-metadata.xml` + const { text } = await fetchText(url, timeoutMs, registryHeaders(userAgent)) + if (!text) return null + let doc: any + try { + doc = parser.parse(text) + } catch { + return null + } + const versioning = doc?.metadata?.versioning + const release = versioning?.release ?? versioning?.latest + if (typeof release === 'string') return release + const versions = asArray(versioning?.versions?.version) + return versions.length ? String(versions[versions.length - 1]) : null +} + +export async function fetchMaven( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + if (!parsed.namespace) return { contacts: [], policies: {} } + const groupPath = parsed.namespace.replace(/\./g, '/') + const artifact = parsed.name + + const version = await resolveVersion(groupPath, artifact, timeoutMs, userAgent) + if (!version) return { contacts: [], policies: {} } + + const url = `${BASE}/${groupPath}/${artifact}/${version}/${artifact}-${version}.pom` + const { text } = await fetchText(url, timeoutMs, registryHeaders(userAgent)) + if (!text) return { contacts: [], policies: {} } + return { contacts: mapMavenPom(text, url, new Date().toISOString()), policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts new file mode 100644 index 0000000000..8bfe8a29fb --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/npm.ts @@ -0,0 +1,60 @@ +import { ExtractorResult, ProvenanceEntry, RawContact } from '../../types' +import { fetchJson, isEmail, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'npm-registry' + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +function npmPackagePath(parsed: ParsedPurl): string { + const full = parsed.namespace ? `${parsed.namespace}/${parsed.name}` : parsed.name + // Scoped packages must percent-encode the slash for the registry path. + return full.startsWith('@') ? full.replaceAll('/', '%2F') : full +} + +export function mapNpm(doc: unknown, sourceUrl: string, fetchedAt: string): RawContact[] { + if (!doc || typeof doc !== 'object') return [] + const d = doc as any + + const declaredAt = typeof d.time?.modified === 'string' ? d.time.modified : undefined + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt, declaredAt }, + ] + + const contacts: RawContact[] = [] + const seen = new Set() + const addEmail = (email: string, name?: string): void => { + if (!isEmail(email)) return + const key = email.toLowerCase() + if (seen.has(key)) return + seen.add(key) + contacts.push({ + channel: 'email', + value: email, + name, + role: 'maintainer', + tier: 'B', + provenance: prov(), + }) + } + + if (typeof d.bugs?.email === 'string') addEmail(d.bugs.email) + for (const m of Array.isArray(d.maintainers) ? d.maintainers : []) { + if (typeof m?.email === 'string') + addEmail(m.email, typeof m.name === 'string' ? m.name : undefined) + } + + return contacts +} + +export async function fetchNpm( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + const url = `https://registry.npmjs.org/${npmPackagePath(parsed)}` + const { json } = await fetchJson(url, timeoutMs, registryHeaders(userAgent)) + if (!json) return { contacts: [], policies: {} } + return { contacts: mapNpm(json, url, new Date().toISOString()), policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/nuget.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/nuget.ts new file mode 100644 index 0000000000..908dfe461a --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/nuget.ts @@ -0,0 +1,77 @@ +import { XMLParser } from 'fast-xml-parser' + +import { ExtractorResult, ProvenanceEntry, RawContact } from '../../types' +import { extractEmails, fetchJson, fetchText, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'nuget-nuspec' +const BASE = 'https://api.nuget.org/v3-flatcontainer' +const parser = new XMLParser({ ignoreAttributes: true }) + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +// NuGet authors/owners are display names; emails appear only when an author string embeds one. +export function mapNuspec(xml: string, sourceUrl: string, fetchedAt: string): RawContact[] { + let doc: any + try { + doc = parser.parse(xml) + } catch { + return [] + } + const meta = doc?.package?.metadata + if (!meta) return [] + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt }, + ] + const contacts: RawContact[] = [] + const seen = new Set() + for (const field of ['authors', 'owners']) { + if (typeof meta[field] !== 'string') continue + for (const email of extractEmails(meta[field])) { + const key = email.toLowerCase() + if (seen.has(key)) continue + seen.add(key) + contacts.push({ + channel: 'email', + value: email, + role: 'maintainer', + tier: 'B', + provenance: prov(), + }) + } + } + return contacts +} + +async function latestStableVersion( + id: string, + timeoutMs: number, + userAgent: string, +): Promise { + const { json } = await fetchJson( + `${BASE}/${id}/index.json`, + timeoutMs, + registryHeaders(userAgent), + ) + const versions = (json as { versions?: unknown } | null)?.versions + if (!Array.isArray(versions) || versions.length === 0) return null + const stable = [...versions].reverse().find((v) => typeof v === 'string' && !v.includes('-')) + return (stable as string) ?? (versions[versions.length - 1] as string) +} + +export async function fetchNuget( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + const id = parsed.name.toLowerCase() + const version = await latestStableVersion(id, timeoutMs, userAgent) + if (!version) return { contacts: [], policies: {} } + + const url = `${BASE}/${id}/${version}/${id}.nuspec` + const { text } = await fetchText(url, timeoutMs, registryHeaders(userAgent)) + if (!text) return { contacts: [], policies: {} } + return { contacts: mapNuspec(text, url, new Date().toISOString()), policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/purl.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/purl.ts new file mode 100644 index 0000000000..a0ea5c0454 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/purl.ts @@ -0,0 +1,33 @@ +export interface ParsedPurl { + type: string + namespace: string | null + name: string +} + +/** + * Minimal Package URL parser: pkg:TYPE/NAMESPACE/NAME@VERSION?QUALIFIERS#SUBPATH. + * Only type/namespace/name are needed to address a registry. + */ +export function parsePurl(purl: string): ParsedPurl | null { + if (!purl || !purl.startsWith('pkg:')) return null + + let rest = purl.slice('pkg:'.length) + rest = rest.split('#')[0].split('?')[0] + + const slash = rest.indexOf('/') + if (slash === -1) return null + + const type = rest.slice(0, slash).toLowerCase() + let path = rest.slice(slash + 1) + + // Version is the last @ segment (but not a leading @scope on the name). + const at = path.lastIndexOf('@') + if (at > 0) path = path.slice(0, at) + + const segments = path.split('/').filter(Boolean).map(decodeURIComponent) + if (segments.length === 0) return null + + const name = segments[segments.length - 1] + const namespace = segments.length > 1 ? segments.slice(0, -1).join('/') : null + return { type, namespace, name } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/pypi.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/pypi.ts new file mode 100644 index 0000000000..ca53b7b226 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/pypi.ts @@ -0,0 +1,67 @@ +import { ExtractorResult, ProvenanceEntry, RawContact, RepoPolicies } from '../../types' +import { extractEmails, fetchJson, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'pypi' + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +export function mapPypi(doc: unknown, sourceUrl: string, fetchedAt: string): ExtractorResult { + const contacts: RawContact[] = [] + const policies: Partial = {} + if (!doc || typeof doc !== 'object') return { contacts, policies } + const info = (doc as any).info + if (!info || typeof info !== 'object') return { contacts, policies } + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt }, + ] + const seen = new Set() + const addEmail = (email: string): void => { + const key = email.toLowerCase() + if (seen.has(key)) return + seen.add(key) + contacts.push({ + channel: 'email', + value: email, + role: 'maintainer', + tier: 'B', + provenance: prov(), + }) + } + + // author_email / maintainer_email are RFC-5322 lists: "Name , Name2 " + for (const field of ['author_email', 'maintainer_email']) { + if (typeof info[field] === 'string') for (const e of extractEmails(info[field])) addEmail(e) + } + + const projectUrls = info.project_urls + if (projectUrls && typeof projectUrls === 'object') { + for (const [key, value] of Object.entries(projectUrls)) { + if (/security/i.test(key) && typeof value === 'string' && /^https?:\/\//i.test(value)) { + if (!policies.securityPolicyUrl) policies.securityPolicyUrl = value + contacts.push({ + channel: 'url', + value, + role: 'security-team', + tier: 'B', + provenance: prov(), + }) + } + } + } + + return { contacts, policies } +} + +export async function fetchPypi( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + const url = `https://pypi.org/pypi/${encodeURIComponent(parsed.name)}/json` + const { json } = await fetchJson(url, timeoutMs, registryHeaders(userAgent)) + if (!json) return { contacts: [], policies: {} } + return mapPypi(json, url, new Date().toISOString()) +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/registry/rubygems.ts b/services/apps/packages_worker/src/security-contacts/extractors/registry/rubygems.ts new file mode 100644 index 0000000000..63375d36e0 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/registry/rubygems.ts @@ -0,0 +1,46 @@ +import { ExtractorResult, ProvenanceEntry, RawContact } from '../../types' +import { extractEmails, fetchJson, registryHeaders } from '../http' + +import { ParsedPurl } from './purl' + +const SOURCE = 'rubygems' + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +// RubyGems hides author emails; we can only surface emails when an author string embeds one. +// bug_tracker_uri is an issue tracker, not a security contact, so it is intentionally skipped. +export function mapRubygems(doc: unknown, sourceUrl: string, fetchedAt: string): RawContact[] { + if (!doc || typeof doc !== 'object') return [] + const authors = (doc as any).authors + if (typeof authors !== 'string') return [] + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: sourceUrl, fetchedAt }, + ] + const contacts: RawContact[] = [] + const seen = new Set() + for (const email of extractEmails(authors)) { + const key = email.toLowerCase() + if (seen.has(key)) continue + seen.add(key) + contacts.push({ + channel: 'email', + value: email, + role: 'maintainer', + tier: 'B', + provenance: prov(), + }) + } + return contacts +} + +export async function fetchRubygems( + parsed: ParsedPurl, + timeoutMs: number, + userAgent: string, +): Promise { + const url = `https://rubygems.org/api/v1/gems/${encodeURIComponent(parsed.name)}.json` + const { json } = await fetchJson(url, timeoutMs, registryHeaders(userAgent)) + if (!json) return { contacts: [], policies: {} } + return { contacts: mapRubygems(json, url, new Date().toISOString()), policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts new file mode 100644 index 0000000000..b54860f718 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityContactsFile.ts @@ -0,0 +1,90 @@ +import { getServiceChildLogger } from '@crowd/logging' + +import { parseGithubUrl } from '../../enricher/fetchLightRepo' +import { ExtractorDeps, ProvenanceEntry, RawContact, Extractor } from '../types' + +import { isEmail } from './http' + +const log = getServiceChildLogger('security-contacts:security_contacts-file') + +const SOURCE = 'security_contacts' +const PATH = 'SECURITY_CONTACTS' +const HANDLE_RE = /^[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?$/ + +export interface SecurityContactEntry { + handle: string + email?: string +} + +export function parseSecurityContacts(text: string): SecurityContactEntry[] { + const entries: SecurityContactEntry[] = [] + for (const rawLine of text.split('\n')) { + const line = rawLine.trim() + if (!line || line.startsWith('#')) continue + + const tokens = line.replace(/^-\s*/, '').split(/\s+/) + const handle = tokens[0].replace(/^@/, '') + if (!HANDLE_RE.test(handle)) continue + + const email = tokens.slice(1).find((t) => isEmail(t)) + entries.push(email ? { handle, email } : { handle }) + } + return entries +} + +async function resolvePublicEmail( + login: string, + githubGet: ExtractorDeps['githubGet'], +): Promise { + try { + const { text } = await githubGet(`/users/${login}`) + const email = (text ? (JSON.parse(text) as { email?: unknown }) : null)?.email + return typeof email === 'string' && isEmail(email) ? email : null + } catch (err) { + log.warn({ login, errMsg: (err as Error).message }, 'Handle email resolution failed') + return null + } +} + +export const extractSecurityContactsFile: Extractor = async (target, deps) => { + let owner: string + let name: string + try { + ;({ owner, name } = parseGithubUrl(target.url)) + } catch { + return { contacts: [], policies: {} } + } + + const { text } = await deps.githubGet(`/repos/${owner}/${name}/contents/${PATH}`, { raw: true }) + if (!text) return { contacts: [], policies: {} } + + const fetchedAt = new Date().toISOString() + const prov = (): ProvenanceEntry[] => [{ source: SOURCE, sourceTier: 'A', path: PATH, fetchedAt }] + + const contacts: RawContact[] = [] + + for (const entry of parseSecurityContacts(text)) { + const email = entry.email ?? (await resolvePublicEmail(entry.handle, deps.githubGet)) + if (email) { + // handle = the username this email was resolved from (used for identity-linking). + contacts.push({ + channel: 'email', + value: email, + handle: entry.handle, + role: 'security-team', + tier: 'A', + provenance: prov(), + }) + } else { + contacts.push({ + channel: 'github-handle', + value: entry.handle, + role: 'security-team', + tier: 'A', + provenance: prov(), + }) + } + } + + return { contacts, policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts new file mode 100644 index 0000000000..86ebf4abb0 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityInsights.ts @@ -0,0 +1,238 @@ +import yaml from 'js-yaml' + +import { getServiceChildLogger } from '@crowd/logging' + +import { parseGithubUrl } from '../../enricher/fetchLightRepo' +import { + ContactChannel, + ContactRole, + Extractor, + ExtractorResult, + ProvenanceEntry, + RawContact, + RepoPolicies, +} from '../types' + +import { fetchText, githubHandleFromUrl, isEmail } from './http' + +const log = getServiceChildLogger('security-contacts:security-insights') + +const SOURCE = 'security-insights' +const PATHS = [ + 'SECURITY-INSIGHTS.yml', + '.github/SECURITY-INSIGHTS.yml', + '.gitlab/SECURITY-INSIGHTS.yml', +] + +/* eslint-disable @typescript-eslint/no-explicit-any */ + +function arr(x: unknown): any[] { + return Array.isArray(x) ? x : [] +} + +function classifyValue(raw: string): { channel: ContactChannel; value: string } | null { + const v = raw.trim() + if (!v) return null + if (v.toLowerCase().startsWith('github:')) { + const handle = v.slice('github:'.length).trim() + return handle ? { channel: 'github-handle', value: handle } : null + } + if (isEmail(v)) return { channel: 'email', value: v } + const handle = githubHandleFromUrl(v) + if (handle) return { channel: 'github-handle', value: handle } + if (/^https?:\/\//i.test(v)) return { channel: 'url', value: v } + if (v.startsWith('@')) return { channel: 'github-handle', value: v.slice(1) } + return { channel: 'github-handle', value: v } +} + +// project-si-source may only point at trusted raw-content hosts (SSRF guard). +const REDIRECT_ALLOWED_HOSTS = new Set([ + 'raw.githubusercontent.com', + 'gist.githubusercontent.com', + 'gitlab.com', +]) + +function isAllowedRedirect(url: string): boolean { + try { + const u = new URL(url) + return u.protocol === 'https:' && REDIRECT_ALLOWED_HOSTS.has(u.hostname.toLowerCase()) + } catch { + return false + } +} + +// MAINTAINERS.md / OWNERS.md links are documents, not contacts. +function isDocumentUrl(value: string): boolean { + const v = value.toLowerCase() + return v.includes('/blob/') || v.endsWith('.md') +} + +type Mapped = { channel: ContactChannel; value: string; name?: string } + +// security-contacts entry: typed object {type,value} or a bare string +function fromContactEntry(entry: unknown): Mapped | null { + if (typeof entry === 'string') return classifyValue(entry) + if (entry && typeof entry === 'object') { + const o = entry as any + const value = o.value ?? o.email ?? o.url + if (typeof value !== 'string') return null + const name = typeof o.name === 'string' ? o.name : undefined + if (o.type === 'email') return { channel: 'email', value, name } + if (o.type === 'url') return { channel: 'url', value, name } + const c = classifyValue(value) + return c ? { ...c, name } : null + } + return null +} + +// people objects (core-team / administrators / champions / v2 contact): prefer email, else social handle +function fromPerson(p: unknown): Mapped | null { + if (!p || typeof p !== 'object') return null + const o = p as any + const name = typeof o.name === 'string' ? o.name : undefined + if (typeof o.email === 'string' && isEmail(o.email)) + return { channel: 'email', value: o.email, name } + if (typeof o.social === 'string') { + const c = classifyValue(o.social) + if (c) return { ...c, name } + } + return null +} + +export function mapSecurityInsights( + doc: unknown, + provPath: string, + fetchedAt: string, +): ExtractorResult { + const contacts: RawContact[] = [] + const policies: Partial = {} + if (doc == null || typeof doc !== 'object') return { contacts, policies } + const root = doc as any + + const declaredAt: string | undefined = + typeof root.header?.['last-updated'] === 'string' + ? root.header['last-updated'] + : typeof root.header?.['last-reviewed'] === 'string' + ? root.header['last-reviewed'] + : undefined + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'A', path: provPath, fetchedAt, declaredAt }, + ] + + const add = (m: Mapped | null, role: ContactRole): void => { + if (m) contacts.push({ ...m, role, tier: 'A', provenance: prov() }) + } + const setPolicy = (key: keyof RepoPolicies, value: unknown): void => { + if (!policies[key] && typeof value === 'string' && value.trim()) { + ;(policies as any)[key] = value.trim() + } + } + + // Scan flat (v1) and nested (v2 project/repository) layouts uniformly. + const sections = [root, root.project, root.repository].filter((s) => s && typeof s === 'object') + + for (const section of sections) { + for (const e of arr(section['security-contacts'])) add(fromContactEntry(e), 'security-team') + + const vr = section['vulnerability-reporting'] + if (vr && typeof vr === 'object') { + if (typeof vr['email-contact'] === 'string' && isEmail(vr['email-contact'])) { + add({ channel: 'email', value: vr['email-contact'] }, 'security-team') + } + add(fromPerson(vr['contact']), 'security-team') + for (const e of arr(vr['security-contacts'])) add(fromContactEntry(e), 'security-team') + setPolicy('securityPolicyUrl', vr['security-policy']) + const programUrl = arr(vr['bug-bounty-programs']).find((p) => typeof p?.url === 'string')?.url + setPolicy('bugBountyUrl', vr['bug-bounty-url'] ?? programUrl) + } + + for (const p of arr(section['core-team'])) add(fromPerson(p), 'maintainer') + for (const p of arr(section['administrators'])) add(fromPerson(p), 'maintainer') + + const security = section['security'] + if (security && typeof security === 'object') { + for (const p of arr(security['champions'])) add(fromPerson(p), 'security-team') + } + + const pl = section['project-lifecycle'] + if (pl && typeof pl === 'object') { + for (const m of arr(pl['core-maintainers'])) { + if (typeof m !== 'string') { + add(fromPerson(m), 'maintainer') + continue + } + const c = classifyValue(m) + if (!c) continue + // MAINTAINERS.md/OWNERS.md are non-standardized markdown — not safely parseable. + // TODO(CM-1243): follow + extract maintainers from these documents in a later pass. + if (c.channel === 'url' && isDocumentUrl(c.value)) { + log.info( + { provPath, document: c.value }, + 'TODO(security-contacts): unparsed core-maintainers document', + ) + continue + } + add(c, 'maintainer') + } + } + + const documentation = section['documentation'] + if (documentation && typeof documentation === 'object') { + setPolicy('securityPolicyUrl', documentation['security-policy']) + } + } + + return { contacts, policies } +} + +export function parseSecurityInsights( + text: string, + provPath: string, + fetchedAt: string, +): ExtractorResult { + let doc: unknown + try { + doc = yaml.load(text) + } catch (err) { + log.warn({ provPath, errMsg: (err as Error).message }, 'Failed to parse SECURITY-INSIGHTS.yml') + return { contacts: [], policies: {} } + } + return mapSecurityInsights(doc, provPath, fetchedAt) +} + +export const extractSecurityInsights: Extractor = async (target, deps) => { + let owner: string + let name: string + try { + ;({ owner, name } = parseGithubUrl(target.url)) + } catch { + return { contacts: [], policies: {} } + } + + const fetchedAt = new Date().toISOString() + + for (const path of PATHS) { + const { text } = await deps.githubGet(`/repos/${owner}/${name}/contents/${path}`, { raw: true }) + if (!text) continue + + let doc: unknown + try { + doc = yaml.load(text) + } catch { + continue + } + + const redirect = (doc as any)?.header?.['project-si-source'] + // Only follow redirects to trusted raw-content hosts — a repo-controlled URL must not + // be able to point the worker at internal/metadata endpoints (SSRF). + if (typeof redirect === 'string' && isAllowedRedirect(redirect)) { + const redirected = await fetchText(redirect, deps.fetchTimeoutMs) + if (redirected.text) return parseSecurityInsights(redirected.text, redirect, fetchedAt) + } + + return mapSecurityInsights(doc, path, fetchedAt) + } + + return { contacts: [], policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts new file mode 100644 index 0000000000..89d7e7d3ef --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityMd.ts @@ -0,0 +1,118 @@ +import { parseGithubUrl } from '../../enricher/fetchLightRepo' +import { + ContactChannel, + Extractor, + ExtractorResult, + ProvenanceEntry, + RawContact, + RepoPolicies, +} from '../types' + +import { githubHandleFromUrl } from './http' + +const SOURCE = 'security.md' +const PATHS = ['SECURITY.md', '.github/SECURITY.md', 'docs/SECURITY.md'] + +const KEYWORD_RE = + /\b(report|security|vulnerabilit(?:y|ies)|disclosure|contact|advisor(?:y|ies))\b/i +const NEGATIVE_SECTION_RE = /acknowledg|hall of fame|thanks|credits|honou?r|researchers/i +const EMAIL_RE = /[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}/g +const URL_RE = /https?:\/\/[^\s)<>\]"']+/g +const HEADING_RE = /^#{1,6}\s+(.*)$/ + +const PVR_RE = /private vulnerability reporting|\/security\/advisories\/new/i + +function cleanUrl(url: string): string { + return url.replace(/[.,;:]+$/, '') +} + +export function parseSecurityMd( + text: string, + owner: string, + name: string, + provPath: string, + fetchedAt: string, +): ExtractorResult { + const policies: Partial = { + securityPolicyUrl: `https://github.com/${owner}/${name}/blob/HEAD/${provPath}`, + } + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'B', path: provPath, fetchedAt }, + ] + + const seen = new Set() + const contacts: RawContact[] = [] + const add = (channel: ContactChannel, value: string): boolean => { + const key = `${channel}:${value.toLowerCase()}` + if (seen.has(key)) return false + seen.add(key) + contacts.push({ channel, value, role: 'security-team', tier: 'B', provenance: prov() }) + return true + } + + // Verbose policies list many references/people; bound how many B1 promotes. + const MAX_PER_CHANNEL = 3 + let emailCount = 0 + let urlCount = 0 + + const lines = text.split('\n') + let heading = '' + for (let i = 0; i < lines.length; i++) { + const line = lines[i] + const headingMatch = HEADING_RE.exec(line.trim()) + if (headingMatch) { + heading = headingMatch[1] + continue + } + + // Skip researcher credits / acknowledgements sections — those are not contacts. + if (NEGATIVE_SECTION_RE.test(heading)) continue + // Paragraph-local proximity: a keyword must appear within ±2 lines of the candidate. + const windowText = lines.slice(Math.max(0, i - 2), i + 3).join('\n') + if (!KEYWORD_RE.test(windowText)) continue + + if (emailCount < MAX_PER_CHANNEL) { + for (const email of line.match(EMAIL_RE) ?? []) { + if (emailCount >= MAX_PER_CHANNEL) break + if (add('email', email)) emailCount++ + } + } + if (urlCount < MAX_PER_CHANNEL) { + for (const rawUrl of line.match(URL_RE) ?? []) { + if (urlCount >= MAX_PER_CHANNEL) break + const url = cleanUrl(rawUrl) + // The canonical PVR advisory URL is emitted as a github-pvr contact below. + if (/\/security\/advisories/i.test(url)) continue + // Bare github.com/ profile links are people listings, not reporting channels. + if (githubHandleFromUrl(url)) continue + if (add('url', url)) urlCount++ + } + } + } + + // PVR redirect language corroborates A2. Emitted unconditionally; processBatch vetoes it + // when A2 authoritatively reports PVR disabled. + if (PVR_RE.test(text)) { + add('github-pvr', `https://github.com/${owner}/${name}/security/advisories/new`) + } + + return { contacts, policies } +} + +export const extractSecurityMd: Extractor = async (target, deps) => { + let owner: string + let name: string + try { + ;({ owner, name } = parseGithubUrl(target.url)) + } catch { + return { contacts: [], policies: {} } + } + + const fetchedAt = new Date().toISOString() + for (const path of PATHS) { + const { text } = await deps.githubGet(`/repos/${owner}/${name}/contents/${path}`, { raw: true }) + if (text) return parseSecurityMd(text, owner, name, path, fetchedAt) + } + + return { contacts: [], policies: {} } +} diff --git a/services/apps/packages_worker/src/security-contacts/extractors/securityTxt.ts b/services/apps/packages_worker/src/security-contacts/extractors/securityTxt.ts new file mode 100644 index 0000000000..cd596247ee --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/extractors/securityTxt.ts @@ -0,0 +1,87 @@ +import { Extractor, ExtractorResult, ProvenanceEntry, RawContact, RepoPolicies } from '../types' + +import { fetchText, isEmail } from './http' + +const SOURCE = 'security.txt' + +// Platform hosts serve their own security.txt, not the project's — never attribute it to a repo. +const PLATFORM_HOSTS = new Set(['github.com', 'www.github.com', 'gitlab.com', 'bitbucket.org']) + +const FIELD_RE = /^([A-Za-z-]+):\s*(.+?)\s*$/ + +// target.homepage is externally-sourced. Requiring https already blocks the classic SSRF target +// (cloud-metadata IMDS is http-only); we also reject obvious loopback/localhost. +function isBlockedHost(h: string): boolean { + return h === 'localhost' || h === '::1' || h === '0.0.0.0' || h.startsWith('127.') +} + +export function parseSecurityTxt( + text: string, + sourceUrl: string, + fetchedAt: string, +): ExtractorResult { + const contacts: RawContact[] = [] + const policies: Partial = {} + + // Reject HTML served for a missing file (e.g. SPA 200s). + if (/]/i.test(text)) return { contacts, policies } + + const prov = (): ProvenanceEntry[] => [ + { source: SOURCE, sourceTier: 'A', path: sourceUrl, fetchedAt }, + ] + const add = (channel: RawContact['channel'], value: string): void => { + contacts.push({ channel, value, role: 'security-team', tier: 'A', provenance: prov() }) + } + + let sawField = false + for (const rawLine of text.split('\n')) { + const line = rawLine.trim() + if (!line || line.startsWith('#')) continue + const m = FIELD_RE.exec(line) + if (!m) continue + const field = m[1].toLowerCase() + const value = m[2].trim() + + if (field === 'contact') { + sawField = true + if (value.toLowerCase().startsWith('mailto:')) { + const email = value.slice('mailto:'.length).trim() + if (isEmail(email)) add('email', email) + } else if (isEmail(value)) { + add('email', value) + } else if (/^https?:\/\//i.test(value)) { + add('url', value) + } + // tel: and other schemes are not actionable channels — skip + } else if (field === 'policy') { + sawField = true + if (!policies.securityPolicyUrl && /^https?:\/\//i.test(value)) + policies.securityPolicyUrl = value + } + } + + if (sawField) policies.securityTxtUrl = sourceUrl + return { contacts, policies } +} + +export const extractSecurityTxt: Extractor = async (target, deps) => { + if (!target.homepage) return { contacts: [], policies: {} } + + let origin: string + let host: string + try { + const u = new URL(target.homepage) + if (u.protocol !== 'https:') return { contacts: [], policies: {} } + origin = u.origin + host = u.hostname.toLowerCase() + } catch { + return { contacts: [], policies: {} } + } + if (PLATFORM_HOSTS.has(host) || isBlockedHost(host)) return { contacts: [], policies: {} } + + const url = `${origin}/.well-known/security.txt` + const { text } = await fetchText(url, deps.fetchTimeoutMs) + if (!text) return { contacts: [], policies: {} } + + return parseSecurityTxt(text, url, new Date().toISOString()) +} diff --git a/services/apps/packages_worker/src/security-contacts/githubToken.ts b/services/apps/packages_worker/src/security-contacts/githubToken.ts new file mode 100644 index 0000000000..7c5cbc38da --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/githubToken.ts @@ -0,0 +1,211 @@ +import { getServiceChildLogger } from '@crowd/logging' + +import { getGithubAppConfig } from '../config' +import { + GithubAppConfig, + fetchRateLimitDiagnostics, + getInstallationToken, + resolveInstallations, +} from '../enricher/githubAppAuth' +import { InstallationPool } from '../enricher/installationPool' + +import { GithubGetResult } from './types' + +const log = getServiceChildLogger('security-contacts:github-token') + +const GITHUB_API = 'https://api.github.com' + +// Genuinely-absent / not-determinable → null body (see http.ts for the same set). 422 covers the +// PVR endpoint returning "can't determine" per-repo; it must read as unknown, not a hard failure. +const ABSENT_STATUSES = new Set([404, 410, 422]) + +// App-wide ceiling on concurrent GitHub requests. GitHub's secondary limit rejects bursts of +// >100 concurrent requests from one app; staying under that (across all installations) is the +// single most effective guard against secondary-limit 429s at high repo concurrency. +const MAX_CONCURRENT_GITHUB_REQUESTS = 50 + +// Bound the park/switch/backoff retry loop so a persistently-limited request eventually surfaces +// as a failure (which the pipeline treats as transient and preserves existing data). +const MAX_RATE_LIMIT_RETRIES = 6 + +/** Minimal async semaphore with fair FIFO hand-off, used to cap concurrent GitHub requests. */ +class Semaphore { + private active = 0 + private readonly waiters: Array<() => void> = [] + + constructor(private readonly max: number) {} + + async acquire(): Promise { + if (this.active < this.max) { + this.active++ + return + } + await new Promise((resolve) => this.waiters.push(resolve)) + } + + release(): void { + const next = this.waiters.shift() + if (next) next() + else this.active-- + } +} + +const gate = new Semaphore(MAX_CONCURRENT_GITHUB_REQUESTS) + +interface Pool { + pool: InstallationPool + appConfig: GithubAppConfig +} + +// Module-scoped so installations are resolved once and reused across activity invocations. +let cached: Pool | null = null +let initPromise: Promise | null = null + +async function ensurePool(): Promise { + if (cached) return cached + if (!initPromise) { + initPromise = (async () => { + try { + const appConfig = getGithubAppConfig() + const discovered = await resolveInstallations(appConfig) + if (discovered.length === 0) { + log.warn('No GitHub App installations — authed extractors will run unauthenticated') + return null + } + const healthy = await fetchRateLimitDiagnostics( + appConfig.appId, + appConfig.privateKeyPem, + discovered, + ) + cached = { pool: new InstallationPool(healthy.length ? healthy : discovered), appConfig } + return cached + } catch (err) { + log.warn( + { errMsg: (err as Error).message }, + 'GitHub token pool unavailable — running unauthenticated', + ) + return null + } + })() + } + return initPromise +} + +const sleep = (ms: number): Promise => new Promise((r) => setTimeout(r, ms)) + +function numOrNull(v: string | null): number | null { + if (v == null) return null + const n = parseInt(v, 10) + return Number.isFinite(n) ? n : null +} + +function resetIso(v: string | null): string | null { + const sec = numOrNull(v) + return sec == null ? null : new Date(sec * 1000).toISOString() +} + +function isRateLimited(status: number, body: string): boolean { + // Primary limit → 403/429 with x-ratelimit-remaining: 0; secondary → 403/429 mentioning it. + return status === 429 || (status === 403 && /rate limit|secondary/i.test(body)) +} + +async function fetchOnce( + url: string, + timeoutMs: number, + headers: Record, +): Promise { + const controller = new AbortController() + const timeoutId = setTimeout(() => controller.abort(), timeoutMs) + await gate.acquire() + try { + return await fetch(url, { headers, signal: controller.signal }) + } finally { + gate.release() + clearTimeout(timeoutId) + } +} + +/** + * Rate-limit-safe GitHub API GET. Selects an installation from the pool, sleeps if all are parked, + * feeds response budget headers back so exhausted installations get parked before they 403, and on + * a rate-limit response parks (primary) or waits out Retry-After (secondary, app-wide) then retries + * on another installation. Falls back to a single unauthenticated request when no App is configured. + * + * Returns text on 200; null body for absent resources (404/410/422); throws on other non-200s and + * once the retry budget is exhausted (callers treat throws as transient and preserve existing data). + */ +export async function githubApiGet( + path: string, + timeoutMs: number, + opts: { raw?: boolean } = {}, +): Promise { + const accept = opts.raw ? 'application/vnd.github.raw' : 'application/vnd.github+json' + const url = `${GITHUB_API}${path}` + const resolved = await ensurePool() + + if (!resolved) { + const res = await fetchOnce(url, timeoutMs, { Accept: accept }) + if (res.status === 200) return { status: 200, text: await res.text() } + if (ABSENT_STATUSES.has(res.status)) return { status: res.status, text: null } + throw new Error(`githubApiGet ${path} failed: HTTP ${res.status}`) + } + + const { pool, appConfig } = resolved + + for (let attempt = 0; attempt <= MAX_RATE_LIMIT_RETRIES; attempt++) { + const { installationId, waitMs } = pool.select() + if (waitMs > 0) { + log.warn({ waitMs: Math.round(waitMs / 1000) }, 'All installations parked — waiting') + await sleep(waitMs) + } + + let token: string + try { + token = await getInstallationToken(appConfig.appId, appConfig.privateKeyPem, installationId) + } catch (err) { + // Mint failure (rate-limited or auth) — park this installation and try another. + pool.park(installationId, Date.now() + 60_000) + if (attempt === MAX_RATE_LIMIT_RETRIES) throw err + continue + } + + const res = await fetchOnce(url, timeoutMs, { + Authorization: `bearer ${token}`, + Accept: accept, + }) + + if (res.status === 200 || ABSENT_STATUSES.has(res.status)) { + pool.parkIfBudgetLow( + installationId, + numOrNull(res.headers.get('x-ratelimit-remaining')), + resetIso(res.headers.get('x-ratelimit-reset')), + ) + return res.status === 200 + ? { status: 200, text: await res.text() } + : { status: res.status, text: null } + } + + const body = await res.text().catch(() => '') + if (isRateLimited(res.status, body)) { + const retryAfterSec = numOrNull(res.headers.get('retry-after')) + if (retryAfterSec) { + // Secondary limits are app-wide, so switching installations won't help — wait it out. + log.warn({ retryAfterSec }, 'GitHub secondary rate limit — backing off') + await sleep(retryAfterSec * 1000 + 1_000) + } else { + // Primary limit — park this installation until its reset and switch to another. + const resetSec = numOrNull(res.headers.get('x-ratelimit-reset')) + pool.park(installationId, resetSec ? resetSec * 1000 + 5_000 : Date.now() + 60_000) + } + if (attempt === MAX_RATE_LIMIT_RETRIES) { + throw new Error(`githubApiGet ${path} rate limited after ${attempt + 1} attempts`) + } + continue + } + + throw new Error(`githubApiGet ${path} failed: HTTP ${res.status}`) + } + + // Unreachable: the loop either returns or throws on the final attempt. + throw new Error(`githubApiGet ${path} exhausted retries`) +} diff --git a/services/apps/packages_worker/src/security-contacts/processBatch.ts b/services/apps/packages_worker/src/security-contacts/processBatch.ts new file mode 100644 index 0000000000..b01d84a381 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/processBatch.ts @@ -0,0 +1,184 @@ +import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' +import { getServiceChildLogger } from '@crowd/logging' + +import { getSecurityContactsConfig } from '../config' +import { mapWithConcurrency } from '../utils/concurrency' + +import { extractPvr } from './extractors/pvr' +import { extractManifest } from './extractors/registry' +import { extractSecurityContactsFile } from './extractors/securityContactsFile' +import { extractSecurityInsights } from './extractors/securityInsights' +import { extractSecurityMd } from './extractors/securityMd' +import { extractSecurityTxt } from './extractors/securityTxt' +import { githubApiGet } from './githubToken' +import { reconcile } from './reconcile' +import { + Extractor, + ExtractorDeps, + RawContact, + RepoPackage, + RepoPolicies, + RepoTarget, +} from './types' +import { markRepoAttempted, writeContacts } from './writeContacts' + +const log = getServiceChildLogger('security-contacts') + +type Config = ReturnType + +export interface BatchResult { + /** Repos evaluated in this batch. 0 signals the workflow there is no more work. */ + processed: number +} + +// Two-tier refresh cadence (the worker runs on a daily cron). Just under 24h/168h so a repo +// processed at ~06:00 is eligible again at the next daily/weekly tick rather than slipping a day. +const DAILY_INTERVAL_HOURS = 20 // repos never evaluated or with no contacts yet +const WEEKLY_INTERVAL_HOURS = 156 // already-enriched repos (have contacts) + +// Tuned for throughput within the platform ceilings: +// - CONCURRENCY: parallel repos. GitHub calls go through the authed Contents API via a +// rate-limit-aware pool (per-installation budget parking + app-wide concurrency gate + +// Retry-After backoff in githubToken), so high repo concurrency won't trip GitHub limits. +// - FETCH_TIMEOUT_MS: generous enough for slow registries (Maven metadata/POM) without hanging slots. +// - BATCH_SIZE: bounded by the 30-min activity timeout (worst case: an all-cargo batch throttled +// to crates.io's 1 req/s finishes ~8 min). +const CONCURRENCY = 100 +const FETCH_TIMEOUT_MS = 15000 +const BATCH_SIZE = 500 + +const EXTRACTORS: Extractor[] = [ + extractSecurityInsights, // A1 + extractPvr, // A2 + extractSecurityContactsFile, // A3 + extractSecurityTxt, // A4 + extractSecurityMd, // B1 + extractManifest, // B2 +] + +interface SweepRow { + id: string + url: string + homepage: string | null + archived: boolean | null + packages: RepoPackage[] | null +} + +async function fetchBatch(qx: QueryExecutor): Promise { + return qx.select( + ` + SELECT r.id::text AS id, + r.url, + r.homepage, + r.archived, + json_agg(json_build_object('purl', p.purl, 'ecosystem', p.ecosystem)) AS packages + FROM repos r + JOIN package_repos pr ON pr.repo_id = r.id + JOIN packages p ON p.id = pr.package_id AND p.is_critical + WHERE r.host = 'github' + AND ( + -- never evaluated → always eligible + r.contacts_last_refreshed IS NULL + -- evaluated but no contacts found yet → retry on the daily cadence + OR ( + NOT EXISTS (SELECT 1 FROM security_contacts sc WHERE sc.repo_id = r.id) + AND r.contacts_last_refreshed < NOW() - INTERVAL '$(dailyIntervalHours) hours' + ) + -- already enriched (has contacts) → refresh on the weekly cadence + OR ( + EXISTS (SELECT 1 FROM security_contacts sc WHERE sc.repo_id = r.id) + AND r.contacts_last_refreshed < NOW() - INTERVAL '$(weeklyIntervalHours) hours' + ) + ) + GROUP BY r.id + ORDER BY r.id + LIMIT $(batchSize) + `, + { + batchSize: BATCH_SIZE, + dailyIntervalHours: DAILY_INTERVAL_HOURS, + weeklyIntervalHours: WEEKLY_INTERVAL_HOURS, + }, + ) +} + +function toTarget(row: SweepRow): RepoTarget { + return { + repoId: row.id, + url: row.url, + homepage: row.homepage, + archived: row.archived, + packages: row.packages ?? [], + } +} + +async function processRepo( + target: RepoTarget, + deps: ExtractorDeps, + qx: QueryExecutor, +): Promise { + const results = await Promise.allSettled(EXTRACTORS.map((extract) => extract(target, deps))) + + // Replace the repo's contacts only when every extractor succeeded. If any failed (transient + // error — non-200s throw), a destructive rewrite would drop contacts a failed tier-A/B extractor + // still has, so preserve existing data and just record the attempt; retried next cadence. + const failed = results.find((r) => r.status === 'rejected') as PromiseRejectedResult | undefined + if (failed) { + log.warn( + { repoId: target.repoId, errMsg: failed.reason?.message }, + 'Extractor failed — preserving existing data', + ) + await markRepoAttempted(qx, target.repoId) + return + } + + let contacts: RawContact[] = [] + const policies: Partial = {} + for (const r of results) { + if (r.status !== 'fulfilled') continue + contacts.push(...r.value.contacts) + for (const [key, value] of Object.entries(r.value.policies)) { + if (!(policies as Record)[key] && value != null) { + ;(policies as Record)[key] = value + } + } + } + + // A2 veto: B1 may emit a github-pvr contact from redirect language; drop it when A2 + // authoritatively reports PVR disabled (Option C from the design discussion). + if (policies.pvrEnabled === false) { + contacts = contacts.filter((c) => c.channel !== 'github-pvr') + } + + const scored = reconcile(contacts) + await writeContacts(qx, target.repoId, scored, policies) +} + +export async function processBatch(qx: QueryExecutor, config: Config): Promise { + const batch = await fetchBatch(qx) + if (batch.length === 0) return { processed: 0 } + + const deps: ExtractorDeps = { + fetchTimeoutMs: FETCH_TIMEOUT_MS, + userAgent: config.userAgent, + githubGet: (path, opts) => githubApiGet(path, FETCH_TIMEOUT_MS, opts), + } + + const targets = batch.map(toTarget) + // Isolate per-repo failures: mapWithConcurrency is fail-fast, so a single repo's DB error must + // not abort the batch (and, across Temporal retries, halt the whole sweep). Unmarked repos on + // error are simply retried next sweep. + await mapWithConcurrency(targets, CONCURRENCY, async (target) => { + try { + await processRepo(target, deps, qx) + } catch (err) { + log.error({ repoId: target.repoId, errMsg: (err as Error).message }, 'Repo processing failed') + // Best-effort mark so a persistently-failing repo drains on cadence rather than making the + // sweep hot-loop (it would otherwise stay eligible and keep processed > 0 forever). + await markRepoAttempted(qx, target.repoId).catch(() => undefined) + } + }) + + log.info({ processed: targets.length }, 'Security contacts batch complete') + return { processed: targets.length } +} diff --git a/services/apps/packages_worker/src/security-contacts/reconcile.ts b/services/apps/packages_worker/src/security-contacts/reconcile.ts new file mode 100644 index 0000000000..545c343007 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/reconcile.ts @@ -0,0 +1,109 @@ +import { scoreContact } from './score' +import { + ContactChannel, + ContactRole, + ProvenanceEntry, + RawContact, + ScoredContact, + SourceTier, +} from './types' + +const MAX_CONTACTS = 5 + +const ROLE_PRIORITY: Record = { + 'security-team': 5, + maintainer: 4, + admin: 3, + committer: 2, + 'org-owner': 1, +} + +const TIER_RANK: Record = { A: 4, B: 3, C: 2, D: 1 } + +function normalizeValue(channel: ContactChannel, value: string): string { + const v = value.trim() + return channel === 'email' || channel === 'github-handle' ? v.toLowerCase() : v +} + +function higherRole(a: ContactRole, b: ContactRole): ContactRole { + return ROLE_PRIORITY[a] >= ROLE_PRIORITY[b] ? a : b +} + +function higherTier(a: SourceTier, b: SourceTier): SourceTier { + return TIER_RANK[a] >= TIER_RANK[b] ? a : b +} + +function dedupeProvenance(entries: ProvenanceEntry[]): ProvenanceEntry[] { + const seen = new Set() + const out: ProvenanceEntry[] = [] + for (const e of entries) { + const key = `${e.source}|${e.path ?? ''}|${e.fetchedAt}` + if (seen.has(key)) continue + seen.add(key) + out.push(e) + } + return out +} + +function mergeInto(target: RawContact, src: RawContact): void { + target.provenance.push(...src.provenance) + target.role = higherRole(target.role, src.role) + target.tier = higherTier(target.tier, src.tier) + if (!target.name && src.name) target.name = src.name + if (!target.handle && src.handle) target.handle = src.handle +} + +function exactMatchMerge(contacts: RawContact[]): RawContact[] { + const byKey = new Map() + for (const c of contacts) { + const key = `${c.channel}:${normalizeValue(c.channel, c.value)}` + const existing = byKey.get(key) + if (existing) { + mergeInto(existing, c) + } else { + byKey.set(key, { ...c, provenance: [...c.provenance] }) + } + } + return [...byKey.values()] +} + +// Collapse a bare github-handle into the email A3 resolved it from, matched via the explicit +// `handle` field (never the display name, to avoid merging unrelated people who share a name). +function identityLinkMerge(contacts: RawContact[]): RawContact[] { + const emailByHandle = new Map() + for (const c of contacts) { + if (c.channel === 'email' && c.handle) emailByHandle.set(c.handle.toLowerCase(), c) + } + + const out: RawContact[] = [] + for (const c of contacts) { + if (c.channel === 'github-handle') { + const email = emailByHandle.get(normalizeValue(c.channel, c.value)) + if (email) { + mergeInto(email, c) + continue + } + } + out.push(c) + } + return out +} + +export function reconcile(contacts: RawContact[], now: Date = new Date()): ScoredContact[] { + const merged = identityLinkMerge(exactMatchMerge(contacts)) + + const scored: ScoredContact[] = merged.map((c) => { + const contact = { ...c, provenance: dedupeProvenance(c.provenance) } + return { ...contact, ...scoreContact(contact, now) } + }) + + scored.sort( + (a, b) => + b.score - a.score || + ROLE_PRIORITY[b.role] - ROLE_PRIORITY[a.role] || + TIER_RANK[b.tier] - TIER_RANK[a.tier] || + a.value.localeCompare(b.value), + ) + + return scored.slice(0, MAX_CONTACTS) +} diff --git a/services/apps/packages_worker/src/security-contacts/schedule.ts b/services/apps/packages_worker/src/security-contacts/schedule.ts new file mode 100644 index 0000000000..1ea3a4ec5b --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/schedule.ts @@ -0,0 +1,41 @@ +import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' + +import { svc } from '../service' +import { ingestSecurityContacts } from '../workflows' + +export async function scheduleSecurityContactsIngestion(): Promise { + const { temporal } = svc + if (!temporal) throw new Error('Temporal client not initialized') + + try { + await temporal.schedule.create({ + scheduleId: 'security-contacts-ingestion', + spec: { + cronExpressions: ['0 6 * * *'], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + catchupWindow: '1 hour', + }, + action: { + type: 'startWorkflow', + workflowType: ingestSecurityContacts, + workflowId: 'security-contacts-daily', + taskQueue: 'packages-worker', + workflowRunTimeout: '24 hours', + retry: { + initialInterval: '30 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + args: [], + }, + }) + } catch (err) { + if (err instanceof ScheduleAlreadyRunning) { + svc.log.info('Schedule security-contacts-ingestion already exists, skipping creation.') + } else { + throw err + } + } +} diff --git a/services/apps/packages_worker/src/security-contacts/score.ts b/services/apps/packages_worker/src/security-contacts/score.ts new file mode 100644 index 0000000000..1d14aa4578 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/score.ts @@ -0,0 +1,107 @@ +import { ConfidenceBand, ContactChannel, ProvenanceEntry, RawContact, SourceTier } from './types' + +const WEIGHTS = { tier: 0.55, channel: 0.2, freshness: 0.15, corroboration: 0.1 } + +const TIER_SCORE: Record = { A: 1.0, B: 0.7, C: 0.4, D: 0.2 } + +// github-handle contacts carry no resolved email; nudge them below an equivalent email +const HANDLE_ONLY_PENALTY = 0.05 + +const FRESH_DAYS = 90 +const STALE_DAYS = 730 +const DAY_MS = 24 * 60 * 60 * 1000 + +const SECURITY_LOCALPARTS = new Set([ + 'security', + 'secure', + 'psirt', + 'sirt', + 'cert', + 'cve', + 'abuse', + 'vuln', + 'vulnerability', + 'vulnerabilities', + 'disclosure', +]) + +const GENERIC_LOCALPARTS = new Set([ + 'info', + 'team', + 'contact', + 'hello', + 'hi', + 'support', + 'admin', + 'help', + 'maintainers', + 'dev', + 'devs', + 'opensource', + 'open-source', + 'office', + 'mail', +]) + +function emailQuality(value: string): number { + const localPart = value.split('@')[0]?.toLowerCase().trim() ?? '' + if (SECURITY_LOCALPARTS.has(localPart) || localPart.startsWith('security')) return 1.0 + if (GENERIC_LOCALPARTS.has(localPart)) return 0.7 + return 0.6 +} + +function channelQuality(channel: ContactChannel, value: string): number { + switch (channel) { + case 'email': + return emailQuality(value) + case 'github-pvr': + return 0.95 + case 'web-form': + case 'url': + return 0.5 + case 'github-handle': + return 0.4 + } +} + +function freshnessScore(provenance: ProvenanceEntry[], now: Date): number { + const declaredTimes = provenance + .map((p) => new Date(p.declaredAt ?? p.fetchedAt).getTime()) + .filter((t) => !Number.isNaN(t)) + if (declaredTimes.length === 0) return 0 + + const ageDays = (now.getTime() - Math.max(...declaredTimes)) / DAY_MS + if (ageDays <= FRESH_DAYS) return 1.0 + if (ageDays >= STALE_DAYS) return 0 + return 1 - (ageDays - FRESH_DAYS) / (STALE_DAYS - FRESH_DAYS) +} + +// Independent = distinct extractors, not the same file re-fetched +function corroborationScore(provenance: ProvenanceEntry[]): number { + const sources = new Set(provenance.map((p) => p.source)) + if (sources.size >= 3) return 1.0 + if (sources.size === 2) return 0.5 + return 0 +} + +export function confidenceBand(score: number): ConfidenceBand { + if (score >= 0.8) return 'PRIMARY' + if (score >= 0.55) return 'SECONDARY' + if (score >= 0.3) return 'FALLBACK' + return 'NONE' +} + +export function scoreContact( + contact: RawContact, + now: Date = new Date(), +): { score: number; confidence: ConfidenceBand } { + const raw = + WEIGHTS.tier * TIER_SCORE[contact.tier] + + WEIGHTS.channel * channelQuality(contact.channel, contact.value) + + WEIGHTS.freshness * freshnessScore(contact.provenance, now) + + WEIGHTS.corroboration * corroborationScore(contact.provenance) - + (contact.channel === 'github-handle' ? HANDLE_ONLY_PENALTY : 0) + + const score = Math.round(Math.min(1, Math.max(0, raw)) * 1000) / 1000 + return { score, confidence: confidenceBand(score) } +} diff --git a/services/apps/packages_worker/src/security-contacts/types.ts b/services/apps/packages_worker/src/security-contacts/types.ts new file mode 100644 index 0000000000..b3cd480732 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/types.ts @@ -0,0 +1,84 @@ +export type ContactChannel = 'email' | 'github-pvr' | 'url' | 'github-handle' | 'web-form' + +export type ContactRole = 'security-team' | 'maintainer' | 'admin' | 'committer' | 'org-owner' + +export type ConfidenceBand = 'PRIMARY' | 'SECONDARY' | 'FALLBACK' | 'NONE' + +export type SourceTier = 'A' | 'B' | 'C' | 'D' + +/** Where a single contact value was observed, for auditability and corroboration scoring. */ +export interface ProvenanceEntry { + /** Human-readable source identifier, e.g. 'security-insights', 'pvr', 'security.md'. */ + source: string + sourceTier: SourceTier + /** Path or URL the value was read from, when applicable. */ + path?: string + /** When this worker fetched the source. ISO-8601. */ + fetchedAt: string + /** When the source itself declared the value (e.g. SECURITY-INSIGHTS last-updated). ISO-8601. */ + declaredAt?: string +} + +/** Extractor output, before reconciliation and scoring. */ +export interface RawContact { + channel: ContactChannel + value: string + role: ContactRole + name?: string + /** Username an A3 email was resolved from — used only to identity-link a bare github-handle. */ + handle?: string + tier: SourceTier + provenance: ProvenanceEntry[] +} + +export interface ScoredContact extends RawContact { + score: number + confidence: ConfidenceBand +} + +export interface RepoPolicies { + securityPolicyUrl?: string + vulnerabilityReportingUrl?: string + bugBountyUrl?: string + securityTxtUrl?: string + pvrEnabled?: boolean +} + +export interface ExtractorResult { + contacts: RawContact[] + policies: Partial +} + +export interface RepoPackage { + purl: string + ecosystem: string +} + +export interface RepoTarget { + repoId: string + url: string + homepage: string | null + /** From the enricher; null = not yet enriched. Archived repos can't be queried for PVR. */ + archived: boolean | null + packages: RepoPackage[] +} + +/** Result of a GitHub API GET routed through the rate-limit-aware pool. */ +export interface GithubGetResult { + status: number + /** Response body (raw file text or JSON string); null for absent resources (404/410/422). */ + text: string | null +} + +export interface ExtractorDeps { + fetchTimeoutMs: number + /** Sent on registry calls; required (crates.io rejects requests without an identifying UA). */ + userAgent: string + /** + * Pool-aware, rate-limit-safe GitHub API GET. Handles installation selection, budget parking, + * and 429/secondary-limit backoff internally. `raw` selects the raw media type (file contents). + */ + githubGet: (path: string, opts?: { raw?: boolean }) => Promise +} + +export type Extractor = (target: RepoTarget, deps: ExtractorDeps) => Promise diff --git a/services/apps/packages_worker/src/security-contacts/workflows.ts b/services/apps/packages_worker/src/security-contacts/workflows.ts new file mode 100644 index 0000000000..2b15660963 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/workflows.ts @@ -0,0 +1,21 @@ +import { continueAsNew, log, proxyActivities } from '@temporalio/workflow' + +import type * as activities from './activities' + +const acts = proxyActivities({ + startToCloseTimeout: '30 minutes', + retry: { + initialInterval: '30 seconds', + backoffCoefficient: 2, + maximumAttempts: 5, + }, +}) + +export async function ingestSecurityContacts(): Promise { + const result = await acts.processSecurityContactsBatch() + if (result.processed === 0) { + log.info('Security contacts ingestion complete — no more work, exiting.') + return + } + await continueAsNew() +} diff --git a/services/apps/packages_worker/src/security-contacts/writeContacts.ts b/services/apps/packages_worker/src/security-contacts/writeContacts.ts new file mode 100644 index 0000000000..6d2196fc53 --- /dev/null +++ b/services/apps/packages_worker/src/security-contacts/writeContacts.ts @@ -0,0 +1,73 @@ +import { QueryExecutor } from '@crowd/data-access-layer/src/queryExecutor' + +import { RepoPolicies, ScoredContact } from './types' + +// Records the attempt without touching contacts/policies — preserves existing data on total +// failure while advancing contacts_last_refreshed so the repo isn't reprocessed this sweep. +export async function markRepoAttempted(qx: QueryExecutor, repoId: string): Promise { + await qx.result('UPDATE repos SET contacts_last_refreshed = NOW() WHERE id = $(repoId)', { + repoId, + }) +} + +/** + * Idempotent per-repo recompute: replace the repo's security_contacts rows and refresh + * the policy columns in one transaction. Policy columns use COALESCE so a run that doesn't + * rediscover a field (partial/failed extractor pass) never clears a previously known value. + */ +export async function writeContacts( + qx: QueryExecutor, + repoId: string, + contacts: ScoredContact[], + policies: Partial, +): Promise { + await qx.tx(async (tx) => { + await tx.result('DELETE FROM security_contacts WHERE repo_id = $(repoId)', { repoId }) + + for (const c of contacts) { + await tx.result( + `INSERT INTO security_contacts + (repo_id, channel, value, role, name, score, confidence, provenance, last_refreshed) + VALUES + ($(repoId), $(channel), $(value), $(role), $(name), $(score), $(confidence), $(provenance)::jsonb, NOW())`, + { + repoId, + channel: c.channel, + value: c.value, + role: c.role, + name: c.name ?? null, + score: c.score, + confidence: c.confidence, + provenance: JSON.stringify(c.provenance), + }, + ) + } + + await tx.result( + // COALESCE preserves previously stored values when a run doesn't (re)discover a field — + // a partial/failed extractor pass must not wipe still-valid policy URLs or the PVR flag. + // vulnerability_reporting_url is PVR-derived, so when PVR is authoritatively resolved we + // overwrite it (clearing it once PVR is disabled); otherwise it is preserved. + `UPDATE repos SET + security_policy_url = COALESCE($(securityPolicyUrl), security_policy_url), + vulnerability_reporting_url = CASE WHEN $(pvrResolved) + THEN $(vulnerabilityReportingUrl) + ELSE COALESCE($(vulnerabilityReportingUrl), vulnerability_reporting_url) + END, + bug_bounty_url = COALESCE($(bugBountyUrl), bug_bounty_url), + security_txt_url = COALESCE($(securityTxtUrl), security_txt_url), + pvr_enabled = COALESCE($(pvrEnabled), pvr_enabled), + contacts_last_refreshed = NOW() + WHERE id = $(repoId)`, + { + repoId, + securityPolicyUrl: policies.securityPolicyUrl ?? null, + vulnerabilityReportingUrl: policies.vulnerabilityReportingUrl ?? null, + pvrResolved: policies.pvrEnabled !== undefined, + bugBountyUrl: policies.bugBountyUrl ?? null, + securityTxtUrl: policies.securityTxtUrl ?? null, + pvrEnabled: policies.pvrEnabled ?? null, + }, + ) + }) +} diff --git a/services/apps/packages_worker/src/workflows/index.ts b/services/apps/packages_worker/src/workflows/index.ts index 09416fdaba..ffdef936b3 100644 --- a/services/apps/packages_worker/src/workflows/index.ts +++ b/services/apps/packages_worker/src/workflows/index.ts @@ -20,3 +20,4 @@ export { rankPackagesWorkflow } from '../criticality/workflow' export { cargoSyncWorkflow } from '../cargo/workflows' export { enrichGoVersions, enrichGoStatus } from '../go/workflows' export { ingestNuGetPackages } from '../nuget/workflows' +export { ingestSecurityContacts } from '../security-contacts/workflows'