From a43be14c2352718d354aa96d52d8d17c87021350 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Mon, 6 Apr 2026 14:27:28 +0200 Subject: [PATCH 01/19] chore: pcc sync worker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../U1775312770__pcc-sync-worker-setup.sql | 5 + .../V1775312770__pcc-sync-worker-setup.sql | 20 + .../docker/Dockerfile.pcc_sync_worker | 25 ++ .../Dockerfile.pcc_sync_worker.dockerignore | 18 + scripts/services/pcc-sync-worker.yaml | 53 +++ services/apps/pcc_sync_worker/package.json | 33 ++ .../src/activities/cleanupActivity.ts | 33 ++ .../src/activities/exportActivity.ts | 100 +++++ .../pcc_sync_worker/src/activities/index.ts | 2 + .../pcc_sync_worker/src/config/settings.ts | 6 + .../src/consumer/pccProjectConsumer.ts | 395 ++++++++++++++++++ services/apps/pcc_sync_worker/src/index.ts | 42 ++ services/apps/pcc_sync_worker/src/main.ts | 38 ++ .../apps/pcc_sync_worker/src/parser/index.ts | 8 + .../pcc_sync_worker/src/parser/rowParser.ts | 157 +++++++ .../apps/pcc_sync_worker/src/parser/types.ts | 62 +++ .../pcc_sync_worker/src/schedules/index.ts | 2 + .../src/schedules/pccS3Cleanup.ts | 46 ++ .../src/schedules/pccS3Export.ts | 46 ++ .../src/scripts/triggerCleanup.ts | 26 ++ .../src/scripts/triggerExport.ts | 26 ++ .../src/workflows/cleanupWorkflow.ts | 17 + .../src/workflows/exportWorkflow.ts | 17 + .../pcc_sync_worker/src/workflows/index.ts | 2 + services/apps/pcc_sync_worker/tsconfig.json | 4 + .../apps/snowflake_connectors/package.json | 2 - .../src/activities/cleanupActivity.ts | 11 +- .../src/activities/exportActivity.ts | 3 +- .../src/consumer/transformerConsumer.ts | 17 +- services/libs/snowflake/package.json | 3 + services/libs/snowflake/src/index.ts | 3 + .../snowflake/src}/metadataStore.ts | 73 +++- .../core => libs/snowflake/src}/s3Service.ts | 0 .../snowflake/src}/snowflakeExporter.ts | 3 +- 34 files changed, 1266 insertions(+), 32 deletions(-) create mode 100644 backend/src/database/migrations/U1775312770__pcc-sync-worker-setup.sql create mode 100644 backend/src/database/migrations/V1775312770__pcc-sync-worker-setup.sql create mode 100644 scripts/services/docker/Dockerfile.pcc_sync_worker create mode 100644 scripts/services/docker/Dockerfile.pcc_sync_worker.dockerignore create mode 100644 scripts/services/pcc-sync-worker.yaml create mode 100644 services/apps/pcc_sync_worker/package.json create mode 100644 services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts create mode 100644 services/apps/pcc_sync_worker/src/activities/exportActivity.ts create mode 100644 services/apps/pcc_sync_worker/src/activities/index.ts create mode 100644 services/apps/pcc_sync_worker/src/config/settings.ts create mode 100644 services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts create mode 100644 services/apps/pcc_sync_worker/src/index.ts create mode 100644 services/apps/pcc_sync_worker/src/main.ts create mode 100644 services/apps/pcc_sync_worker/src/parser/index.ts create mode 100644 services/apps/pcc_sync_worker/src/parser/rowParser.ts create mode 100644 services/apps/pcc_sync_worker/src/parser/types.ts create mode 100644 services/apps/pcc_sync_worker/src/schedules/index.ts create mode 100644 services/apps/pcc_sync_worker/src/schedules/pccS3Cleanup.ts create mode 100644 services/apps/pcc_sync_worker/src/schedules/pccS3Export.ts create mode 100644 services/apps/pcc_sync_worker/src/scripts/triggerCleanup.ts create mode 100644 
services/apps/pcc_sync_worker/src/scripts/triggerExport.ts create mode 100644 services/apps/pcc_sync_worker/src/workflows/cleanupWorkflow.ts create mode 100644 services/apps/pcc_sync_worker/src/workflows/exportWorkflow.ts create mode 100644 services/apps/pcc_sync_worker/src/workflows/index.ts create mode 100644 services/apps/pcc_sync_worker/tsconfig.json rename services/{apps/snowflake_connectors/src/core => libs/snowflake/src}/metadataStore.ts (70%) rename services/{apps/snowflake_connectors/src/core => libs/snowflake/src}/s3Service.ts (100%) rename services/{apps/snowflake_connectors/src/core => libs/snowflake/src}/snowflakeExporter.ts (98%) diff --git a/backend/src/database/migrations/U1775312770__pcc-sync-worker-setup.sql b/backend/src/database/migrations/U1775312770__pcc-sync-worker-setup.sql new file mode 100644 index 0000000000..1b84cbc1c9 --- /dev/null +++ b/backend/src/database/migrations/U1775312770__pcc-sync-worker-setup.sql @@ -0,0 +1,5 @@ +DROP INDEX IF EXISTS pcc_sync_errors_dedup_idx; + +DROP TABLE IF EXISTS pcc_projects_sync_errors; + +ALTER TABLE segments DROP COLUMN IF EXISTS maturity; diff --git a/backend/src/database/migrations/V1775312770__pcc-sync-worker-setup.sql b/backend/src/database/migrations/V1775312770__pcc-sync-worker-setup.sql new file mode 100644 index 0000000000..239ec3468d --- /dev/null +++ b/backend/src/database/migrations/V1775312770__pcc-sync-worker-setup.sql @@ -0,0 +1,20 @@ +-- Add maturity field to segments for PCC project_maturity_level sync +ALTER TABLE segments ADD COLUMN IF NOT EXISTS maturity TEXT NULL; + +-- Catch-all table for PCC sync issues that require manual review +CREATE TABLE IF NOT EXISTS pcc_projects_sync_errors ( + id BIGSERIAL PRIMARY KEY, + run_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + external_project_id TEXT, + external_project_slug TEXT, + error_type TEXT NOT NULL, + details JSONB, + resolved BOOLEAN NOT NULL DEFAULT FALSE +); + +-- Deduplication index: one unresolved error per (project, error_type). +-- On repeated daily exports the same error upserts in place instead of accumulating rows. +-- Excludes rows where external_project_id IS NULL (e.g. SCHEMA_MISMATCH with no project id). 
+CREATE UNIQUE INDEX IF NOT EXISTS pcc_sync_errors_dedup_idx + ON pcc_projects_sync_errors (external_project_id, error_type) + WHERE NOT resolved AND external_project_id IS NOT NULL; diff --git a/scripts/services/docker/Dockerfile.pcc_sync_worker b/scripts/services/docker/Dockerfile.pcc_sync_worker new file mode 100644 index 0000000000..ae54539255 --- /dev/null +++ b/scripts/services/docker/Dockerfile.pcc_sync_worker @@ -0,0 +1,25 @@ +FROM node:20-bullseye-slim AS builder + +RUN apt-get update && apt-get install -y python3 make g++ && rm -rf /var/lib/apt/lists/* + +WORKDIR /usr/crowd/app +RUN npm install -g corepack@latest && corepack enable pnpm && corepack prepare pnpm@9.15.0 --activate + +COPY ./pnpm-workspace.yaml ./pnpm-lock.yaml ./ +RUN pnpm fetch + +COPY ./services ./services +RUN pnpm i --frozen-lockfile + +FROM node:20-bullseye-slim AS runner + +RUN apt-get update && apt-get install -y ca-certificates && rm -rf /var/lib/apt/lists/* + +WORKDIR /usr/crowd/app +RUN npm install -g corepack@latest && corepack enable pnpm && corepack prepare pnpm@9.15.0 --activate + +COPY --from=builder /usr/crowd/app/node_modules ./node_modules +COPY --from=builder /usr/crowd/app/services/base.tsconfig.json ./services/base.tsconfig.json +COPY --from=builder /usr/crowd/app/services/libs ./services/libs +COPY --from=builder /usr/crowd/app/services/archetypes/ ./services/archetypes +COPY --from=builder /usr/crowd/app/services/apps/pcc_sync_worker/ ./services/apps/pcc_sync_worker diff --git a/scripts/services/docker/Dockerfile.pcc_sync_worker.dockerignore b/scripts/services/docker/Dockerfile.pcc_sync_worker.dockerignore new file mode 100644 index 0000000000..4b74fc87af --- /dev/null +++ b/scripts/services/docker/Dockerfile.pcc_sync_worker.dockerignore @@ -0,0 +1,18 @@ +**/.git +**/node_modules +**/venv* +**/.webpack +**/.serverless +**/.env +**/.env.* +**/.idea +**/.vscode +**/dist +.vscode/ +.github/ +frontend/ +scripts/ +.flake8 +*.md +Makefile +backend/ diff --git a/scripts/services/pcc-sync-worker.yaml b/scripts/services/pcc-sync-worker.yaml new file mode 100644 index 0000000000..8e30e081eb --- /dev/null +++ b/scripts/services/pcc-sync-worker.yaml @@ -0,0 +1,53 @@ +version: '3.1' + +x-env-args: &env-args + DOCKER_BUILDKIT: 1 + NODE_ENV: docker + SERVICE: pcc-sync-worker + CROWD_TEMPORAL_TASKQUEUE: pccSync + SHELL: /bin/sh + +services: + pcc-sync-worker: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.pcc_sync_worker + command: 'pnpm run start' + working_dir: /usr/crowd/app/services/apps/pcc_sync_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + restart: always + networks: + - crowd-bridge + + pcc-sync-worker-dev: + build: + context: ../../ + dockerfile: ./scripts/services/docker/Dockerfile.pcc_sync_worker + command: 'pnpm run dev' + working_dir: /usr/crowd/app/services/apps/pcc_sync_worker + env_file: + - ../../backend/.env.dist.local + - ../../backend/.env.dist.composed + - ../../backend/.env.override.local + - ../../backend/.env.override.composed + environment: + <<: *env-args + hostname: pcc-sync-worker + networks: + - crowd-bridge + volumes: + - ../../services/libs/common/src:/usr/crowd/app/services/libs/common/src + - ../../services/libs/logging/src:/usr/crowd/app/services/libs/logging/src + - ../../services/libs/snowflake/src:/usr/crowd/app/services/libs/snowflake/src + - 
../../services/libs/temporal/src:/usr/crowd/app/services/libs/temporal/src + - ../../services/apps/pcc_sync_worker/src:/usr/crowd/app/services/apps/pcc_sync_worker/src + +networks: + crowd-bridge: + external: true diff --git a/services/apps/pcc_sync_worker/package.json b/services/apps/pcc_sync_worker/package.json new file mode 100644 index 0000000000..4d25c97efb --- /dev/null +++ b/services/apps/pcc_sync_worker/package.json @@ -0,0 +1,33 @@ +{ + "name": "@crowd/pcc-sync-worker", + "scripts": { + "start": "CROWD_TEMPORAL_TASKQUEUE=pccSync SERVICE=pcc-sync-worker tsx src/index.ts", + "start:debug": "CROWD_TEMPORAL_TASKQUEUE=pccSync SERVICE=pcc-sync-worker LOG_LEVEL=debug tsx src/index.ts", + "start:debug:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && CROWD_TEMPORAL_TASKQUEUE=pccSync SERVICE=pcc-sync-worker LOG_LEVEL=debug tsx src/index.ts", + "dev": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug", + "dev:local": "nodemon --watch src --watch ../../libs --ext ts --exec pnpm run start:debug:local", + "lint": "npx eslint --ext .ts src --max-warnings=0", + "format": "npx prettier --write \"src/**/*.ts\"", + "format-check": "npx prettier --check .", + "tsc-check": "tsc --noEmit", + "trigger-export": "SERVICE=pcc-sync-worker tsx src/scripts/triggerExport.ts", + "trigger-cleanup": "SERVICE=pcc-sync-worker tsx src/scripts/triggerCleanup.ts" + }, + "dependencies": { + "@crowd/archetype-standard": "workspace:*", + "@crowd/archetype-worker": "workspace:*", + "@crowd/common": "workspace:*", + "@crowd/database": "workspace:*", + "@crowd/logging": "workspace:*", + "@crowd/slack": "workspace:*", + "@crowd/snowflake": "workspace:*", + "@crowd/temporal": "workspace:*", + "@temporalio/client": "~1.11.8", + "@temporalio/workflow": "~1.11.8", + "tsx": "^4.7.1", + "typescript": "^5.6.3" + }, + "devDependencies": { + "nodemon": "^3.0.1" + } +} diff --git a/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts b/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts new file mode 100644 index 0000000000..4a51f0f83e --- /dev/null +++ b/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts @@ -0,0 +1,33 @@ +import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' +import { getServiceChildLogger } from '@crowd/logging' +import { SlackChannel, SlackPersona, sendSlackNotification } from '@crowd/slack' +import { MetadataStore, S3Service } from '@crowd/snowflake' + +const log = getServiceChildLogger('cleanupActivity') + +const PLATFORM = 'pcc' + +export async function executeCleanup(intervalHours = 24): Promise { + const db = await getDbConnection(WRITE_DB_CONFIG()) + const metadataStore = new MetadataStore(db) + const s3Service = new S3Service() + + const jobs = await metadataStore.getCleanableJobS3Paths(intervalHours, PLATFORM, false) + log.info({ jobCount: jobs.length, intervalHours }, 'Found cleanable PCC jobs') + + for (const job of jobs) { + try { + await s3Service.deleteFile(job.s3Path) + await metadataStore.markCleaned(job.id) + log.info({ jobId: job.id, s3Path: job.s3Path }, 'Cleaned PCC job') + } catch (err) { + log.error({ jobId: job.id, s3Path: job.s3Path, err }, 'Failed to clean PCC job, skipping') + sendSlackNotification( + SlackChannel.CDP_INTEGRATIONS_ALERTS, + SlackPersona.ERROR_REPORTER, + 'PCC S3 Cleanup Failed', + `Failed to clean job \`${job.id}\` at \`${job.s3Path}\`.\n\n*Error:* ${err instanceof Error ? 
err.message : err}`, + ) + } + } +} diff --git a/services/apps/pcc_sync_worker/src/activities/exportActivity.ts b/services/apps/pcc_sync_worker/src/activities/exportActivity.ts new file mode 100644 index 0000000000..e811786143 --- /dev/null +++ b/services/apps/pcc_sync_worker/src/activities/exportActivity.ts @@ -0,0 +1,100 @@ +/** + * Export activity: Execute PCC recursive CTE COPY INTO + write metadata. + * + * Full daily export of ANALYTICS.SILVER_DIM.PROJECTS via recursive CTE. + * No incremental logic — at ~1,538 leaf rows, a full daily export is simpler + * and more reliable than incremental (a parent name change would require + * re-exporting all descendants). + */ +import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' +import { getServiceChildLogger } from '@crowd/logging' +import { MetadataStore, SnowflakeExporter } from '@crowd/snowflake' + +const log = getServiceChildLogger('exportActivity') + +const PLATFORM = 'pcc' +const SOURCE_NAME = 'project-hierarchy' + +function buildSourceQuery(): string { + return ` + WITH RECURSIVE project_hierarchy AS ( + SELECT project_id, name, description, project_logo, project_status, + project_maturity_level, repository_url, slug, parent_id, + 1 AS depth, + name AS depth_1, NULL::VARCHAR AS depth_2, NULL::VARCHAR AS depth_3, + NULL::VARCHAR AS depth_4, NULL::VARCHAR AS depth_5 + FROM ANALYTICS.SILVER_DIM.PROJECTS + WHERE parent_id IS NULL + UNION ALL + SELECT p.project_id, p.name, p.description, p.project_logo, p.project_status, + p.project_maturity_level, p.repository_url, p.slug, p.parent_id, + h.depth + 1, + h.depth_1, + CASE WHEN h.depth + 1 = 2 THEN p.name ELSE h.depth_2 END, + CASE WHEN h.depth + 1 = 3 THEN p.name ELSE h.depth_3 END, + CASE WHEN h.depth + 1 = 4 THEN p.name ELSE h.depth_4 END, + CASE WHEN h.depth + 1 = 5 THEN p.name ELSE h.depth_5 END + FROM ANALYTICS.SILVER_DIM.PROJECTS p + INNER JOIN project_hierarchy h ON p.parent_id = h.project_id + ) + SELECT ph.project_id, ph.name, ph.slug, ph.description, ph.project_logo, ph.repository_url, + ph.project_status, ph.project_maturity_level, ph.depth, + ph.depth_1, ph.depth_2, ph.depth_3, ph.depth_4, ph.depth_5, + s.segment_id + FROM project_hierarchy ph + LEFT JOIN ANALYTICS.SILVER_DIM.ACTIVE_SEGMENTS s + ON s.source_id = ph.project_id AND s.project_type = 'subproject' + WHERE ph.project_id NOT IN ( + SELECT DISTINCT parent_id FROM ANALYTICS.SILVER_DIM.PROJECTS + WHERE parent_id IS NOT NULL + ) + ` +} + +function buildS3FilenamePrefix(): string { + const now = new Date() + const year = now.getFullYear() + const month = String(now.getMonth() + 1).padStart(2, '0') + const day = String(now.getDate()).padStart(2, '0') + const s3BucketPath = process.env.CROWD_SNOWFLAKE_S3_BUCKET_PATH + if (!s3BucketPath) { + throw new Error('Missing required env var CROWD_SNOWFLAKE_S3_BUCKET_PATH') + } + return `${s3BucketPath}/${PLATFORM}/${SOURCE_NAME}/${year}/${month}/${day}` +} + +export async function executeExport(): Promise { + log.info({ platform: PLATFORM, sourceName: SOURCE_NAME }, 'Starting PCC export') + + const exporter = new SnowflakeExporter() + const db = await getDbConnection(WRITE_DB_CONFIG()) + + try { + const metadataStore = new MetadataStore(db) + const sourceQuery = buildSourceQuery() + const s3FilenamePrefix = buildS3FilenamePrefix() + const exportStartedAt = new Date() + + const onBatchComplete = async (s3Path: string, totalRows: number, totalBytes: number) => { + await metadataStore.insertExportJob( + PLATFORM, + SOURCE_NAME, + s3Path, + totalRows, + totalBytes, + 
exportStartedAt, + ) + } + + await exporter.executeBatchedCopyInto(sourceQuery, s3FilenamePrefix, onBatchComplete) + + log.info({ platform: PLATFORM, sourceName: SOURCE_NAME }, 'PCC export completed') + } catch (err) { + log.error({ platform: PLATFORM, sourceName: SOURCE_NAME, err }, 'PCC export failed') + throw err + } finally { + await exporter + .destroy() + .catch((err) => log.warn({ err }, 'Failed to close Snowflake connection')) + } +} diff --git a/services/apps/pcc_sync_worker/src/activities/index.ts b/services/apps/pcc_sync_worker/src/activities/index.ts new file mode 100644 index 0000000000..1fd1f65a10 --- /dev/null +++ b/services/apps/pcc_sync_worker/src/activities/index.ts @@ -0,0 +1,2 @@ +export { executeExport } from './exportActivity' +export { executeCleanup } from './cleanupActivity' diff --git a/services/apps/pcc_sync_worker/src/config/settings.ts b/services/apps/pcc_sync_worker/src/config/settings.ts new file mode 100644 index 0000000000..12d00867d4 --- /dev/null +++ b/services/apps/pcc_sync_worker/src/config/settings.ts @@ -0,0 +1,6 @@ +/** + * Centralized configuration: Temporal. + */ + +export { TEMPORAL_CONFIG, getTemporalClient } from '@crowd/temporal' +export type { ITemporalConfig } from '@crowd/temporal' diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts new file mode 100644 index 0000000000..a675d0384f --- /dev/null +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -0,0 +1,395 @@ +/** + * PCC project consumer: polls snowflakeExportJobs for platform='pcc' jobs, + * streams each Parquet file, runs the matching cascade, and writes to DB. + * + * One DB transaction per job — all segment + insightsProject writes roll back + * together on any failure. Errors that can't be auto-resolved are written to + * pcc_projects_sync_errors for manual review. 
+ */ +import { DEFAULT_TENANT_ID } from '@crowd/common' +import { DbConnOrTx, DbConnection, WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' +import { getServiceChildLogger } from '@crowd/logging' +import { MetadataStore, S3Service, SnowflakeExportJob } from '@crowd/snowflake' + +import { parsePccRow } from '../parser' +import type { CdpHierarchyTarget, ParsedPccProject } from '../parser' + +const log = getServiceChildLogger('pccProjectConsumer') + +const PLATFORM = 'pcc' +const MAX_POLLING_INTERVAL_MS = 30 * 60 * 1000 // 30 minutes + +// ───────────────────────────────────────────────────────────────────────────── +// Consumer loop +// ───────────────────────────────────────────────────────────────────────────── + +export class PccProjectConsumer { + private running = false + private currentPollingIntervalMs: number + + constructor( + private readonly metadataStore: MetadataStore, + private readonly s3Service: S3Service, + private readonly db: DbConnection, + private readonly pollingIntervalMs: number, + readonly dryRun: boolean = false, + ) { + this.currentPollingIntervalMs = pollingIntervalMs + } + + async start(): Promise { + this.running = true + log.info({ dryRun: this.dryRun }, 'PCC project consumer started') + + while (this.running) { + try { + const job = await this.metadataStore.claimOldestPendingJob(PLATFORM) + + if (job) { + this.currentPollingIntervalMs = this.pollingIntervalMs + await this.processJob(job) + await new Promise((resolve) => setImmediate(resolve)) + continue + } + } catch (err) { + log.error({ err }, 'Error in consumer loop') + await this.sleep(this.pollingIntervalMs) + continue + } + + log.info({ currentPollingIntervalMs: this.currentPollingIntervalMs }, 'No pending PCC jobs') + await this.sleep(this.currentPollingIntervalMs) + this.currentPollingIntervalMs = Math.min( + this.currentPollingIntervalMs * 2, + MAX_POLLING_INTERVAL_MS, + ) + } + + log.info('PCC project consumer stopped') + } + + stop(): void { + this.running = false + } + + // ───────────────────────────────────────────────────────────────────────── + // Job processing + // ───────────────────────────────────────────────────────────────────────── + + private async processJob(job: SnowflakeExportJob): Promise { + log.info({ jobId: job.id, s3Path: job.s3Path, dryRun: this.dryRun }, 'Processing PCC job') + + const startTime = Date.now() + let upsertedCount = 0 + let skippedCount = 0 + let mismatchCount = 0 + let errorCount = 0 + + try { + await this.db.tx(async (tx) => { + for await (const raw of this.s3Service.streamParquetRows(job.s3Path)) { + const parsed = parsePccRow(raw) + + if (parsed.ok === false) { + errorCount++ + log.warn({ jobId: job.id, details: parsed.details }, 'Row schema mismatch — skipping') + if (!this.dryRun) { + await insertSyncError(tx, null, null, 'SCHEMA_MISMATCH', parsed.details) + } + continue + } + + const { project } = parsed + const result = await this.processRow(tx, project) + + switch (result.action) { + case 'UPSERTED': + upsertedCount++ + break + case 'SKIPPED': + skippedCount++ + break + case 'MISMATCH': + mismatchCount++ + if (!this.dryRun) { + await insertSyncError( + tx, + project.pccProjectId, + project.pccSlug, + 'HIERARCHY_MISMATCH', + result.details, + ) + } + break + } + } + }) + + const metrics = { upsertedCount, skippedCount, mismatchCount, errorCount } + log.info({ jobId: job.id, ...metrics, dryRun: this.dryRun }, 'PCC job completed') + + await this.metadataStore.markCompleted(job.id, { + transformedCount: upsertedCount, + skippedCount: skippedCount 
+ mismatchCount + errorCount, + processingDurationMs: Date.now() - startTime, + }) + } catch (err) { + const errorMessage = err instanceof Error ? err.message : String(err) + log.error({ jobId: job.id, err }, 'PCC job failed') + + try { + await this.metadataStore.markFailed(job.id, errorMessage, { + processingDurationMs: Date.now() - startTime, + }) + } catch (updateErr) { + log.error({ jobId: job.id, updateErr }, 'Failed to mark job as failed') + } + } + } + + // ───────────────────────────────────────────────────────────────────────── + // Per-row matching cascade + writes + // ───────────────────────────────────────────────────────────────────────── + + private async processRow( + tx: DbConnOrTx, + project: ParsedPccProject, + ): Promise< + | { action: 'UPSERTED' } + | { action: 'SKIPPED' } + | { action: 'MISMATCH'; details: Record } + > { + // Step 1: segment_id from Snowflake ACTIVE_SEGMENTS JOIN + let segment = project.segmentIdFromSnowflake + ? await findSegmentById(tx, project.segmentIdFromSnowflake) + : null + + // Step 2: sourceId fallback + if (!segment) { + segment = await findSegmentBySourceId(tx, project.pccProjectId) + } + + // Step 3: derived slug match + if (!segment && project.pccSlug) { + segment = await findSegmentBySlug(tx, project.pccSlug) + } + + // Step 4: no match → SKIP (Phase 1: project doesn't exist in CDP yet) + if (!segment) { + return { action: 'SKIPPED' } + } + + // Hierarchy mismatch check: segment was matched but parent/group differs + const mismatchFields = detectHierarchyMismatch(segment, project.cdpTarget) + if (mismatchFields.length > 0) { + return { + action: 'MISMATCH', + details: { + segmentId: segment.id, + segmentName: segment.name, + pccProjectId: project.pccProjectId, + mismatchFields, + cdpTarget: project.cdpTarget, + currentHierarchy: { + group: segment.grandparentName ?? segment.parentName ?? segment.name, + project: segment.parentName ?? 
segment.name, + subproject: segment.name, + }, + }, + } + } + + if (!this.dryRun) { + await upsertSegment(tx, segment.id, project) + const nameConflict = await upsertInsightsProject(tx, segment.id, project) + if (nameConflict) { + log.warn( + { segmentId: segment.id, name: project.name }, + 'insightsProject name conflict — segment synced, insights project skipped', + ) + await insertSyncError(tx, project.pccProjectId, project.pccSlug, 'INSIGHTS_NAME_CONFLICT', { + segmentId: segment.id, + name: project.name, + }) + } + } else { + log.info( + { + segmentId: segment.id, + pccProjectId: project.pccProjectId, + name: project.name, + status: project.status, + maturity: project.maturity, + }, + '[dry-run] Would upsert segment', + ) + } + + return { action: 'UPSERTED' } + } + + private sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// DB helpers +// ───────────────────────────────────────────────────────────────────────────── + +interface SegmentRow { + id: string + name: string + parentName: string | null + grandparentName: string | null +} + +async function findSegmentById(db: DbConnOrTx, segmentId: string): Promise { + return db.oneOrNone( + `SELECT id, name, "parentName", "grandparentName" + FROM segments + WHERE id = $(segmentId) AND type = 'subproject' AND "tenantId" = $(tenantId)`, + { segmentId, tenantId: DEFAULT_TENANT_ID }, + ) +} + +async function findSegmentBySourceId(db: DbConnOrTx, sourceId: string): Promise { + return db.oneOrNone( + `SELECT id, name, "parentName", "grandparentName" + FROM segments + WHERE "sourceId" = $(sourceId) AND type = 'subproject' AND "tenantId" = $(tenantId)`, + { sourceId, tenantId: DEFAULT_TENANT_ID }, + ) +} + +async function findSegmentBySlug(db: DbConnOrTx, slug: string): Promise { + const rows = await db.manyOrNone( + `SELECT id, name, "parentName", "grandparentName" + FROM segments + WHERE slug = $(slug) AND type = 'subproject' AND "tenantId" = $(tenantId)`, + { slug, tenantId: DEFAULT_TENANT_ID }, + ) + if (rows.length === 1) return rows[0] + if (rows.length > 1) { + log.warn({ slug, count: rows.length }, 'Ambiguous slug match — skipping') + } + return null +} + +function detectHierarchyMismatch(segment: SegmentRow, cdpTarget: CdpHierarchyTarget): string[] { + // Only check structural hierarchy (parent/grandparent placement), not the leaf name. + // The leaf name is a metadata field we're here to sync — a difference there is an UPDATE, + // not a mismatch. Mismatches indicate the project is in the wrong place in the hierarchy, + // which requires manual review before auto-fixing (per Phase 1 spec). + const mismatches: string[] = [] + if (segment.grandparentName && segment.grandparentName !== cdpTarget.group) { + mismatches.push('group_name') + } + if (segment.parentName && segment.parentName !== cdpTarget.project) { + mismatches.push('project_name') + } + return mismatches +} + +async function upsertSegment( + db: DbConnOrTx, + segmentId: string, + project: ParsedPccProject, +): Promise { + await db.none( + `UPDATE segments + SET name = $(name), + status = $(status)::"segmentsStatus_type", + maturity = $(maturity), + description = $(description), + "updatedAt" = NOW() + WHERE id = $(segmentId) AND "tenantId" = $(tenantId)`, + { + segmentId, + name: project.name, + status: project.status ?? 
'active', + maturity: project.maturity, + description: project.description, + tenantId: DEFAULT_TENANT_ID, + }, + ) +} + +// Returns true if a name conflict prevented creating the insightsProject row. +async function upsertInsightsProject( + db: DbConnOrTx, + segmentId: string, + project: ParsedPccProject, +): Promise { + // Partial unique index on segmentId WHERE deletedAt IS NULL means + // ON CONFLICT won't fire for soft-deleted rows. Use UPDATE-then-INSERT. + // Slug is intentionally not updated on name changes — it is a stable identifier + // referenced by FK from securityInsightsEvaluations and related tables. + // Guard the UPDATE against the partial unique index on (name) WHERE deletedAt IS NULL. + // If another active row already holds the new name, the NOT EXISTS subquery causes the + // UPDATE to match 0 rows instead of throwing a 23505 unique violation. + const updated = await db.result( + `UPDATE "insightsProjects" + SET name = $(name), + description = $(description), + "logoUrl" = $(logoUrl), + "updatedAt" = NOW() + WHERE "segmentId" = $(segmentId) AND "deletedAt" IS NULL + AND NOT EXISTS ( + SELECT 1 FROM "insightsProjects" + WHERE name = $(name) AND "deletedAt" IS NULL AND "segmentId" != $(segmentId) + )`, + { segmentId, name: project.name, description: project.description, logoUrl: project.logoUrl }, + ) + + if (updated.rowCount === 0) { + // Either (a) no active row exists yet → proceed to INSERT, + // or (b) a row exists but its name collides with another segment → return conflict. + const exists = await db.oneOrNone<{ id: string }>( + `SELECT id FROM "insightsProjects" WHERE "segmentId" = $(segmentId) AND "deletedAt" IS NULL`, + { segmentId }, + ) + if (exists) return true + + const inserted = await db.result( + `INSERT INTO "insightsProjects" (name, slug, description, "segmentId", "logoUrl", "isLF") + VALUES ($(name), generate_slug('insightsProjects', $(name)), $(description), $(segmentId), $(logoUrl), TRUE) + ON CONFLICT (name) WHERE "deletedAt" IS NULL DO NOTHING`, + { name: project.name, description: project.description, segmentId, logoUrl: project.logoUrl }, + ) + if (inserted.rowCount === 0) return true + } + + return false +} + +async function insertSyncError( + db: DbConnOrTx, + externalProjectId: string | null, + externalProjectSlug: string | null, + errorType: string, + details: Record, +): Promise { + await db.none( + `INSERT INTO pcc_projects_sync_errors + (external_project_id, external_project_slug, error_type, details) + VALUES ($(externalProjectId), $(externalProjectSlug), $(errorType), $(details)::jsonb) + ON CONFLICT (external_project_id, error_type) + WHERE NOT resolved AND external_project_id IS NOT NULL + DO UPDATE SET details = EXCLUDED.details, run_at = NOW()`, + { externalProjectId, externalProjectSlug, errorType, details: JSON.stringify(details) }, + ) +} + +// ───────────────────────────────────────────────────────────────────────────── +// Factory +// ───────────────────────────────────────────────────────────────────────────── + +export async function createPccProjectConsumer(dryRun = false): Promise { + const db = await getDbConnection(WRITE_DB_CONFIG()) + const metadataStore = new MetadataStore(db) + const s3Service = new S3Service() + const pollingIntervalMs = 10_000 // 10 seconds + + return new PccProjectConsumer(metadataStore, s3Service, db, pollingIntervalMs, dryRun) +} diff --git a/services/apps/pcc_sync_worker/src/index.ts b/services/apps/pcc_sync_worker/src/index.ts new file mode 100644 index 0000000000..31a0e09b35 --- /dev/null +++ 
b/services/apps/pcc_sync_worker/src/index.ts @@ -0,0 +1,42 @@ +/** + * Entry point: Start Temporal worker + PCC project consumer loop. + */ +import { getServiceChildLogger } from '@crowd/logging' + +import { createPccProjectConsumer } from './consumer/pccProjectConsumer' +import { svc } from './main' +import { schedulePccS3Cleanup, schedulePccS3Export } from './schedules' + +const log = getServiceChildLogger('main') + +const DRY_RUN = process.env.PCC_DRY_RUN === 'true' + +setImmediate(async () => { + await svc.init() + + await schedulePccS3Export() + await schedulePccS3Cleanup() + + const consumer = await createPccProjectConsumer(DRY_RUN) + consumer.start().catch((err) => { + log.error({ err }, 'Consumer loop crashed') + process.exit(1) + }) + + const HARD_TIMEOUT_MS = 2 * 60 * 60 * 1000 + + const shutdown = () => { + log.info('Shutdown signal received, stopping consumer...') + consumer.stop() + + setTimeout(() => { + log.warn('Graceful shutdown timed out after 2 hours, forcing exit') + process.exit(1) + }, HARD_TIMEOUT_MS).unref() + } + + process.on('SIGINT', shutdown) + process.on('SIGTERM', shutdown) + + await svc.start() +}) diff --git a/services/apps/pcc_sync_worker/src/main.ts b/services/apps/pcc_sync_worker/src/main.ts new file mode 100644 index 0000000000..a08bb41cbc --- /dev/null +++ b/services/apps/pcc_sync_worker/src/main.ts @@ -0,0 +1,38 @@ +/** + * Temporal worker setup. + * + * Uses the ServiceWorker archetype which handles Temporal connection, + * workflow bundling, and activity registration automatically. + */ +import { Config } from '@crowd/archetype-standard' +import { Options, ServiceWorker } from '@crowd/archetype-worker' + +const config: Config = { + envvars: [ + 'CROWD_SNOWFLAKE_S3_BUCKET_PATH', + 'CROWD_SNOWFLAKE_STORAGE_INTEGRATION', + 'CROWD_SNOWFLAKE_S3_REGION', + 'CROWD_SNOWFLAKE_S3_ACCESS_KEY_ID', + 'CROWD_SNOWFLAKE_S3_SECRET_ACCESS_KEY', + ], + producer: { + enabled: false, + }, + temporal: { + enabled: true, + }, + redis: { + enabled: false, + }, +} + +const options: Options = { + postgres: { + enabled: true, + }, + opensearch: { + enabled: false, + }, +} + +export const svc = new ServiceWorker(config, options) diff --git a/services/apps/pcc_sync_worker/src/parser/index.ts b/services/apps/pcc_sync_worker/src/parser/index.ts new file mode 100644 index 0000000000..0e3d9a7617 --- /dev/null +++ b/services/apps/pcc_sync_worker/src/parser/index.ts @@ -0,0 +1,8 @@ +export { parsePccRow } from './rowParser' +export type { + CdpHierarchyTarget, + MappingRule, + ParseResult, + ParsedPccProject, + PccParquetRow, +} from './types' diff --git a/services/apps/pcc_sync_worker/src/parser/rowParser.ts b/services/apps/pcc_sync_worker/src/parser/rowParser.ts new file mode 100644 index 0000000000..74ada6a415 --- /dev/null +++ b/services/apps/pcc_sync_worker/src/parser/rowParser.ts @@ -0,0 +1,157 @@ +/** + * PCC project row parser and hierarchy mapper. + * + * Transforms a raw Parquet row from the PCC Snowflake export into a + * structured ParsedPccProject, applying the CDP hierarchy mapping rules. + * + * Pure function — no DB access, no I/O. Fully unit-testable against + * CDP__PCC Integration - proposal-for-migration.csv (project root). 
+ * + * Mapping rules (effective_depth = raw DEPTH - 1, stripping TLF root): + * Rule 1 (eff=1): group=D2, project=D2, subproject=D2 + * Rule 2 (eff=2): group=D2, project=D3, subproject=D3 + * Rule 3 (eff=3): group=D2, project=D3, subproject=D4 + * Rule 4 (eff=4): group=D3, project=D4, subproject=D5 (drops D2 intermediate) + * + * Depth > 4 (raw > 5): SCHEMA_MISMATCH — surfaced to pcc_projects_sync_errors. + */ +import type { MappingRule, ParseResult, PccParquetRow } from './types' + +/** + * PCC PROJECT_STATUS → CDP segmentsStatus_type enum. + * CDP enum values: active | archived | formation | prospect + */ +const STATUS_MAP: Record = { + Active: 'active', + Archived: 'archived', + 'Formation - Disengaged': 'formation', + 'Formation - Engaged': 'formation', + 'Formation - Exploratory': 'formation', + 'Formation - On Hold': 'formation', + Prospect: 'prospect', +} + +/** + * Intermediate PCC nodes that are transparent in the CDP hierarchy. + * When D2 equals one of these, it is skipped and D1 ("The Linux Foundation") + * is used as the CDP project group instead. + */ +const TRANSPARENT_INTERMEDIATES = new Set(['LF Projects, LLC']) + +/** + * Parse and validate a raw Parquet row from the PCC export. + * Returns ok=false with SCHEMA_MISMATCH if the row is malformed or + * has an unsupported depth (> 5 raw / > 4 effective). + */ +export function parsePccRow(raw: Record): ParseResult { + const row = raw as Partial + + const projectId = row.PROJECT_ID + const name = row.NAME + const depth = typeof row.DEPTH === 'number' ? row.DEPTH : Number(row.DEPTH) + + if (!projectId || !name || !Number.isFinite(depth)) { + return { + ok: false, + errorType: 'SCHEMA_MISMATCH', + details: { reason: 'missing required fields', raw }, + } + } + + const effectiveDepth = depth - 1 + + if (effectiveDepth < 1 || effectiveDepth > 4) { + return { + ok: false, + errorType: 'SCHEMA_MISMATCH', + details: { + reason: effectiveDepth < 1 ? 'unexpected root node (depth=1)' : 'unsupported depth > 5', + rawDepth: depth, + effectiveDepth, + projectId, + name, + }, + } + } + + const d1 = row.DEPTH_1 ?? null + const d2 = row.DEPTH_2 ?? null + const d3 = row.DEPTH_3 ?? null + const d4 = row.DEPTH_4 ?? null + const d5 = row.DEPTH_5 ?? null + + const cdpTargetResult = buildCdpTarget(effectiveDepth as 1 | 2 | 3 | 4, name, d1, d2, d3, d4, d5) + if (cdpTargetResult.ok === false) { + return { + ok: false, + errorType: 'SCHEMA_MISMATCH', + details: { reason: cdpTargetResult.reason, rawDepth: depth, effectiveDepth, projectId, name }, + } + } + + const rawStatus = row.PROJECT_STATUS ?? null + const mappedStatus = rawStatus ? (STATUS_MAP[rawStatus] ?? null) : null + + return { + ok: true, + project: { + pccProjectId: projectId, + pccSlug: row.SLUG ?? null, + name, + status: mappedStatus, + maturity: row.PROJECT_MATURITY_LEVEL ?? null, + description: row.DESCRIPTION ?? null, + logoUrl: row.PROJECT_LOGO ?? null, + repositoryUrl: row.REPOSITORY_URL ?? null, + segmentIdFromSnowflake: row.SEGMENT_ID ?? null, + effectiveDepth, + mappingRule: effectiveDepth as MappingRule, + cdpTarget: cdpTargetResult.target, + }, + } +} + +function buildCdpTarget( + effectiveDepth: 1 | 2 | 3 | 4, + _leafName: string, + d1: string | null, + d2: string | null, + d3: string | null, + d4: string | null, + d5: string | null, +): + | { ok: true; target: { group: string; project: string; subproject: string } } + | { ok: false; reason: string } { + // When D2 is a transparent intermediate (e.g. 
"LF Projects, LLC"), skip it + // and promote D1 ("The Linux Foundation") to be the CDP project group. + const d2IsTransparent = !!d2 && TRANSPARENT_INTERMEDIATES.has(d2) + const group2 = d2IsTransparent ? d1 : d2 + + switch (effectiveDepth) { + case 1: + // D1=TLF (stripped), leaf=D2 → all three CDP levels are the same node. + // Apply transparency: if D2 is a transparent intermediate, promote D1 as the group. + if (!group2) return { ok: false, reason: 'missing DEPTH_2 for effective_depth=1' } + return { ok: true, target: { group: group2, project: group2, subproject: group2 } } + + case 2: + // D1=TLF, D2=group (or transparent→D1), leaf=D3 + if (!group2) return { ok: false, reason: 'missing DEPTH_2 for effective_depth=2' } + if (!d3) return { ok: false, reason: 'missing DEPTH_3 for effective_depth=2' } + return { ok: true, target: { group: group2, project: d3, subproject: d3 } } + + case 3: + // D1=TLF, D2=group (or transparent→D1), D3=project, leaf=D4 + if (!group2) return { ok: false, reason: 'missing DEPTH_2 for effective_depth=3' } + if (!d3) return { ok: false, reason: 'missing DEPTH_3 for effective_depth=3' } + if (!d4) return { ok: false, reason: 'missing DEPTH_4 for effective_depth=3' } + return { ok: true, target: { group: group2, project: d3, subproject: d4 } } + + case 4: + // D1=TLF, D2=intermediate (always dropped at this depth), D3=group, D4=project, leaf=D5 + if (!d3) return { ok: false, reason: 'missing DEPTH_3 for effective_depth=4' } + if (!d4) return { ok: false, reason: 'missing DEPTH_4 for effective_depth=4' } + if (!d5) return { ok: false, reason: 'missing DEPTH_5 for effective_depth=4' } + return { ok: true, target: { group: d3, project: d4, subproject: d5 } } + } +} diff --git a/services/apps/pcc_sync_worker/src/parser/types.ts b/services/apps/pcc_sync_worker/src/parser/types.ts new file mode 100644 index 0000000000..1a212f4af6 --- /dev/null +++ b/services/apps/pcc_sync_worker/src/parser/types.ts @@ -0,0 +1,62 @@ +/** + * Types for the PCC project parser. + * + * Parquet rows come from Snowflake COPY INTO with HEADER=TRUE, + * so all column names are uppercase. + */ + +/** Raw Parquet row from the PCC recursive CTE export. */ +export interface PccParquetRow { + PROJECT_ID: string + NAME: string + SLUG: string | null + DESCRIPTION: string | null + PROJECT_LOGO: string | null + REPOSITORY_URL: string | null + PROJECT_STATUS: string | null + PROJECT_MATURITY_LEVEL: string | null + DEPTH: number + DEPTH_1: string | null + DEPTH_2: string | null + DEPTH_3: string | null + DEPTH_4: string | null + DEPTH_5: string | null + SEGMENT_ID: string | null +} + +/** + * CDP hierarchy target derived from PCC depth levels. + * Phase 1 only updates existing segments — all three levels refer to + * existing CDP segment names (group / project / subproject). + */ +export interface CdpHierarchyTarget { + group: string + project: string + subproject: string +} + +/** Which depth-mapping rule was applied (effective depth 1–4). */ +export type MappingRule = 1 | 2 | 3 | 4 + +/** Structured result after parsing and transforming a single Parquet row. */ +export interface ParsedPccProject { + pccProjectId: string + /** Raw PCC slug — used for step-3 segment matching in the consumer. */ + pccSlug: string | null + name: string + /** Mapped to CDP segmentsStatus_type enum value, or null if unknown. 
*/ + status: string | null + maturity: string | null + description: string | null + logoUrl: string | null + repositoryUrl: string | null + /** segment_id from Snowflake ACTIVE_SEGMENTS JOIN — used for step-1 matching. */ + segmentIdFromSnowflake: string | null + effectiveDepth: number + mappingRule: MappingRule + cdpTarget: CdpHierarchyTarget +} + +export type ParseResult = + | { ok: true; project: ParsedPccProject } + | { ok: false; errorType: 'SCHEMA_MISMATCH'; details: Record } diff --git a/services/apps/pcc_sync_worker/src/schedules/index.ts b/services/apps/pcc_sync_worker/src/schedules/index.ts new file mode 100644 index 0000000000..69a828b61e --- /dev/null +++ b/services/apps/pcc_sync_worker/src/schedules/index.ts @@ -0,0 +1,2 @@ +export { schedulePccS3Export } from './pccS3Export' +export { schedulePccS3Cleanup } from './pccS3Cleanup' diff --git a/services/apps/pcc_sync_worker/src/schedules/pccS3Cleanup.ts b/services/apps/pcc_sync_worker/src/schedules/pccS3Cleanup.ts new file mode 100644 index 0000000000..b5be4cc095 --- /dev/null +++ b/services/apps/pcc_sync_worker/src/schedules/pccS3Cleanup.ts @@ -0,0 +1,46 @@ +import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' + +import { SlackChannel, SlackPersona, sendSlackNotification } from '@crowd/slack' + +import { svc } from '../main' +import { pccS3CleanupScheduler } from '../workflows' + +export const schedulePccS3Cleanup = async () => { + try { + await svc.temporal.schedule.create({ + scheduleId: 'pcc-s3-cleanup', + spec: { + // Run at 03:00 every day + cronExpressions: ['00 3 * * *'], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + catchupWindow: '1 minute', + }, + action: { + type: 'startWorkflow', + workflowType: pccS3CleanupScheduler, + taskQueue: 'pccSync', + retry: { + initialInterval: '15 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + args: [], + }, + }) + } catch (err) { + if (err instanceof ScheduleAlreadyRunning) { + svc.log.info('PCC cleanup schedule already registered in Temporal.') + svc.log.info('Configuration may have changed since. Please make sure they are in sync.') + } else { + svc.log.error({ err }, 'Failed to create pcc-s3-cleanup schedule') + sendSlackNotification( + SlackChannel.CDP_INTEGRATIONS_ALERTS, + SlackPersona.ERROR_REPORTER, + 'PCC S3 Cleanup Schedule Failed', + `Failed to create the \`pcc-s3-cleanup\` Temporal schedule.\n\n*Error:* ${err instanceof Error ? 
err.message : String(err)}`, + ) + } + } +} diff --git a/services/apps/pcc_sync_worker/src/schedules/pccS3Export.ts b/services/apps/pcc_sync_worker/src/schedules/pccS3Export.ts new file mode 100644 index 0000000000..8b5fbcaedf --- /dev/null +++ b/services/apps/pcc_sync_worker/src/schedules/pccS3Export.ts @@ -0,0 +1,46 @@ +import { ScheduleAlreadyRunning, ScheduleOverlapPolicy } from '@temporalio/client' + +import { SlackChannel, SlackPersona, sendSlackNotification } from '@crowd/slack' + +import { svc } from '../main' +import { pccS3ExportScheduler } from '../workflows' + +export const schedulePccS3Export = async () => { + try { + await svc.temporal.schedule.create({ + scheduleId: 'pcc-s3-export', + spec: { + // Run at 01:00 every day, after the snowflake connectors export at 00:20 + cronExpressions: ['00 1 * * *'], + }, + policies: { + overlap: ScheduleOverlapPolicy.SKIP, + catchupWindow: '1 minute', + }, + action: { + type: 'startWorkflow', + workflowType: pccS3ExportScheduler, + taskQueue: 'pccSync', + retry: { + initialInterval: '15 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + args: [], + }, + }) + } catch (err) { + if (err instanceof ScheduleAlreadyRunning) { + svc.log.info('PCC export schedule already registered in Temporal.') + svc.log.info('Configuration may have changed since. Please make sure they are in sync.') + } else { + svc.log.error({ err }, 'Failed to create pcc-s3-export schedule') + sendSlackNotification( + SlackChannel.CDP_INTEGRATIONS_ALERTS, + SlackPersona.ERROR_REPORTER, + 'PCC S3 Export Schedule Failed', + `Failed to create the \`pcc-s3-export\` Temporal schedule.\n\n*Error:* ${err instanceof Error ? err.message : String(err)}`, + ) + } + } +} diff --git a/services/apps/pcc_sync_worker/src/scripts/triggerCleanup.ts b/services/apps/pcc_sync_worker/src/scripts/triggerCleanup.ts new file mode 100644 index 0000000000..ef71f7aa8d --- /dev/null +++ b/services/apps/pcc_sync_worker/src/scripts/triggerCleanup.ts @@ -0,0 +1,26 @@ +import { TEMPORAL_CONFIG, getTemporalClient } from '../config/settings' + +async function main() { + const client = await getTemporalClient(TEMPORAL_CONFIG()) + + const workflowId = `pcc-cleanup/manual/${new Date().toISOString().slice(0, 19)}` + + await client.workflow.start('pccS3CleanupScheduler', { + taskQueue: 'pccSync', + workflowId, + retry: { + initialInterval: '15 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + args: [], + }) + + console.log(`PCC S3 cleanup workflow started: ${workflowId}`) + process.exit(0) +} + +main().catch((err) => { + console.error('Failed to trigger workflow:', err) + process.exit(1) +}) diff --git a/services/apps/pcc_sync_worker/src/scripts/triggerExport.ts b/services/apps/pcc_sync_worker/src/scripts/triggerExport.ts new file mode 100644 index 0000000000..cb3f1bc88c --- /dev/null +++ b/services/apps/pcc_sync_worker/src/scripts/triggerExport.ts @@ -0,0 +1,26 @@ +import { TEMPORAL_CONFIG, getTemporalClient } from '../config/settings' + +async function main() { + const client = await getTemporalClient(TEMPORAL_CONFIG()) + + const workflowId = `pcc-export/manual/${new Date().toISOString().slice(0, 19)}` + + await client.workflow.start('pccS3ExportScheduler', { + taskQueue: 'pccSync', + workflowId, + retry: { + initialInterval: '15 seconds', + backoffCoefficient: 2, + maximumAttempts: 3, + }, + args: [], + }) + + console.log(`PCC S3 export workflow started: ${workflowId}`) + process.exit(0) +} + +main().catch((err) => { + console.error('Failed to trigger workflow:', err) + process.exit(1) 
+}) diff --git a/services/apps/pcc_sync_worker/src/workflows/cleanupWorkflow.ts b/services/apps/pcc_sync_worker/src/workflows/cleanupWorkflow.ts new file mode 100644 index 0000000000..1698af6368 --- /dev/null +++ b/services/apps/pcc_sync_worker/src/workflows/cleanupWorkflow.ts @@ -0,0 +1,17 @@ +import { proxyActivities } from '@temporalio/workflow' + +import type * as activities from '../activities/cleanupActivity' + +const { executeCleanup } = proxyActivities({ + startToCloseTimeout: '1 hour', + retry: { + initialInterval: '2s', + backoffCoefficient: 2, + maximumInterval: '60s', + maximumAttempts: 3, + }, +}) + +export async function pccS3CleanupScheduler(): Promise { + await executeCleanup() +} diff --git a/services/apps/pcc_sync_worker/src/workflows/exportWorkflow.ts b/services/apps/pcc_sync_worker/src/workflows/exportWorkflow.ts new file mode 100644 index 0000000000..5719280bb4 --- /dev/null +++ b/services/apps/pcc_sync_worker/src/workflows/exportWorkflow.ts @@ -0,0 +1,17 @@ +import { proxyActivities } from '@temporalio/workflow' + +import type * as activities from '../activities/exportActivity' + +const { executeExport } = proxyActivities({ + startToCloseTimeout: '1 hour', + retry: { + initialInterval: '2s', + backoffCoefficient: 2, + maximumInterval: '60s', + maximumAttempts: 3, + }, +}) + +export async function pccS3ExportScheduler(): Promise { + await executeExport() +} diff --git a/services/apps/pcc_sync_worker/src/workflows/index.ts b/services/apps/pcc_sync_worker/src/workflows/index.ts new file mode 100644 index 0000000000..9dd43218e4 --- /dev/null +++ b/services/apps/pcc_sync_worker/src/workflows/index.ts @@ -0,0 +1,2 @@ +export { pccS3ExportScheduler } from './exportWorkflow' +export { pccS3CleanupScheduler } from './cleanupWorkflow' diff --git a/services/apps/pcc_sync_worker/tsconfig.json b/services/apps/pcc_sync_worker/tsconfig.json new file mode 100644 index 0000000000..bf7f183850 --- /dev/null +++ b/services/apps/pcc_sync_worker/tsconfig.json @@ -0,0 +1,4 @@ +{ + "extends": "../../base.tsconfig.json", + "include": ["src/**/*"] +} diff --git a/services/apps/snowflake_connectors/package.json b/services/apps/snowflake_connectors/package.json index 5f37b6662e..a321499447 100644 --- a/services/apps/snowflake_connectors/package.json +++ b/services/apps/snowflake_connectors/package.json @@ -27,8 +27,6 @@ "@crowd/snowflake": "workspace:*", "@crowd/temporal": "workspace:*", "@crowd/types": "workspace:*", - "@aws-sdk/client-s3": "^3.700.0", - "@dsnp/parquetjs": "^1.7.0", "@temporalio/client": "~1.11.8", "@temporalio/workflow": "~1.11.8", "tsx": "^4.7.1", diff --git a/services/apps/snowflake_connectors/src/activities/cleanupActivity.ts b/services/apps/snowflake_connectors/src/activities/cleanupActivity.ts index 373494a1fc..34f54e9c97 100644 --- a/services/apps/snowflake_connectors/src/activities/cleanupActivity.ts +++ b/services/apps/snowflake_connectors/src/activities/cleanupActivity.ts @@ -1,9 +1,9 @@ import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' import { getServiceChildLogger } from '@crowd/logging' import { SlackChannel, SlackPersona, sendSlackNotification } from '@crowd/slack' +import { MetadataStore, S3Service } from '@crowd/snowflake' -import { MetadataStore } from '../core/metadataStore' -import { S3Service } from '../core/s3Service' +import { getEnabledPlatforms } from '../integrations' const log = getServiceChildLogger('cleanupActivity') @@ -12,7 +12,12 @@ export async function executeCleanup(intervalHours = 24): Promise { const metadataStore = new 
MetadataStore(db) const s3Service = new S3Service() - const jobs = await metadataStore.getCleanableJobS3Paths(intervalHours) + const jobs = await metadataStore.getCleanableJobS3Paths( + intervalHours, + undefined, + true, + getEnabledPlatforms(), + ) log.info({ jobCount: jobs.length, intervalHours }, 'Found cleanable jobs') for (const job of jobs) { diff --git a/services/apps/snowflake_connectors/src/activities/exportActivity.ts b/services/apps/snowflake_connectors/src/activities/exportActivity.ts index f4fca93164..2299ccd568 100644 --- a/services/apps/snowflake_connectors/src/activities/exportActivity.ts +++ b/services/apps/snowflake_connectors/src/activities/exportActivity.ts @@ -6,10 +6,9 @@ */ import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' import { getServiceChildLogger } from '@crowd/logging' +import { MetadataStore, SnowflakeExporter } from '@crowd/snowflake' import { PlatformType } from '@crowd/types' -import { MetadataStore } from '../core/metadataStore' -import { SnowflakeExporter } from '../core/snowflakeExporter' import { getDataSourceNames as _getDataSourceNames, getEnabledPlatforms as _getEnabledPlatforms, diff --git a/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts b/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts index 0161048e2f..ed00b9e056 100644 --- a/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts +++ b/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts @@ -9,11 +9,10 @@ import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' import { getServiceChildLogger } from '@crowd/logging' import { QUEUE_CONFIG, QueueFactory } from '@crowd/queue' import { REDIS_CONFIG, RedisCache, getRedisClient } from '@crowd/redis' +import { MetadataStore, S3Service, SnowflakeExportJob } from '@crowd/snowflake' import { PlatformType } from '@crowd/types' import { IntegrationResolver } from '../core/integrationResolver' -import { MetadataStore, SnowflakeExportJob } from '../core/metadataStore' -import { S3Service } from '../core/s3Service' import { getDataSource, getEnabledPlatforms } from '../integrations' const log = getServiceChildLogger('transformerConsumer') @@ -30,6 +29,7 @@ export class TransformerConsumer { private readonly integrationResolver: IntegrationResolver, private readonly emitter: DataSinkWorkerEmitter, private readonly pollingIntervalMs: number, + private readonly enabledPlatforms: string[], ) { this.currentPollingIntervalMs = pollingIntervalMs } @@ -40,7 +40,7 @@ export class TransformerConsumer { while (this.running) { try { - const job = await this.metadataStore.claimOldestPendingJob() + const job = await this.metadataStore.claimOldestPendingJob(undefined, this.enabledPlatforms) log.info('Claiming job from metadata store', { job }) if (job) { @@ -155,5 +155,14 @@ export async function createTransformerConsumer(): Promise const pollingIntervalMs = 10_000 // 10 seconds - return new TransformerConsumer(metadataStore, s3Service, resolver, emitter, pollingIntervalMs) + const enabledPlatforms = getEnabledPlatforms() as string[] + + return new TransformerConsumer( + metadataStore, + s3Service, + resolver, + emitter, + pollingIntervalMs, + enabledPlatforms, + ) } diff --git a/services/libs/snowflake/package.json b/services/libs/snowflake/package.json index caf9081de6..cc06c1b139 100644 --- a/services/libs/snowflake/package.json +++ b/services/libs/snowflake/package.json @@ -13,7 +13,10 @@ "tsx": "^4.7.1" }, "dependencies": { + "@aws-sdk/client-s3": "^3.700.0", + 
"@crowd/database": "workspace:*", "@crowd/logging": "workspace:*", + "@dsnp/parquetjs": "^1.7.0", "snowflake-sdk": "^2.3.3" } } diff --git a/services/libs/snowflake/src/index.ts b/services/libs/snowflake/src/index.ts index 025482c309..174071f858 100644 --- a/services/libs/snowflake/src/index.ts +++ b/services/libs/snowflake/src/index.ts @@ -1,3 +1,6 @@ export * from './client' export * from './github' +export * from './metadataStore' +export * from './s3Service' +export * from './snowflakeExporter' export * from './types' diff --git a/services/apps/snowflake_connectors/src/core/metadataStore.ts b/services/libs/snowflake/src/metadataStore.ts similarity index 70% rename from services/apps/snowflake_connectors/src/core/metadataStore.ts rename to services/libs/snowflake/src/metadataStore.ts index 91fe1c16f0..1353453adf 100644 --- a/services/apps/snowflake_connectors/src/core/metadataStore.ts +++ b/services/libs/snowflake/src/metadataStore.ts @@ -45,7 +45,7 @@ export class MetadataStore { const metrics: JobMetrics = { exportedRows: totalRows, exportedBytes: totalBytes } await this.db.none( `INSERT INTO integration."snowflakeExportJobs" (platform, "sourceName", s3_path, "exportStartedAt", metrics) - VALUES ($1, $2, $3, $4, $5::jsonb) + VALUES ($(platform), $(sourceName), $(s3Path), $(exportStartedAt), $(metrics)::jsonb) ON CONFLICT (s3_path) DO UPDATE SET "exportStartedAt" = EXCLUDED."exportStartedAt", "processingStartedAt" = NULL, @@ -54,7 +54,7 @@ export class MetadataStore { error = NULL, metrics = EXCLUDED.metrics, "updatedAt" = NOW()`, - [platform, sourceName, s3Path, exportStartedAt, JSON.stringify(metrics)], + { platform, sourceName, s3Path, exportStartedAt, metrics: JSON.stringify(metrics) }, ) } @@ -62,7 +62,19 @@ export class MetadataStore { * Atomically claim the oldest pending job by setting processingStartedAt. * Uses FOR UPDATE SKIP LOCKED so concurrent consumers never pick the same row. */ - async claimOldestPendingJob(): Promise { + async claimOldestPendingJob( + platform?: string, + platforms?: string[], + ): Promise { + let platformFilter = '' + let params: Record = {} + if (platform) { + platformFilter = 'AND platform = $(platform)' + params = { platform } + } else if (platforms && platforms.length > 0) { + platformFilter = 'AND platform = ANY($(platforms)::text[])' + params = { platforms } + } const row = await this.db.oneOrNone<{ id: number platform: string @@ -82,17 +94,38 @@ export class MetadataStore { WHERE id = ( SELECT id FROM integration."snowflakeExportJobs" WHERE "processingStartedAt" IS NULL + ${platformFilter} ORDER BY "createdAt" ASC LIMIT 1 FOR UPDATE SKIP LOCKED ) RETURNING id, platform, "sourceName", s3_path, "exportStartedAt", "createdAt", "updatedAt", "processingStartedAt", "completedAt", "cleanedAt", error, metrics`, + params, ) return row ? mapRowToJob(row) : null } - async getCleanableJobS3Paths(intervalHours = 24): Promise<{ id: number; s3Path: string }[]> { + async getCleanableJobS3Paths( + intervalHours = 24, + platform?: string, + requireZeroSkipped = true, + platforms?: string[], + ): Promise<{ id: number; s3Path: string }[]> { + let platformFilter = '' + const params: { intervalHours: number; platform?: string; platforms?: string[] } = { + intervalHours, + } + if (platform) { + platformFilter = 'AND platform = $(platform)' + params.platform = platform + } else if (platforms && platforms.length > 0) { + platformFilter = 'AND platform = ANY($(platforms)::text[])' + params.platforms = platforms + } + const skippedFilter = requireZeroSkipped + ? 
`AND metrics ? 'skippedCount' AND (metrics->>'skippedCount')::int = 0` + : '' const rows = await this.db.manyOrNone<{ id: number; s3_path: string }>( `SELECT id, s3_path FROM integration."snowflakeExportJobs" @@ -100,11 +133,11 @@ export class MetadataStore { AND "cleanedAt" IS NULL AND error IS NULL AND metrics IS NOT NULL - AND metrics ? 'skippedCount' - AND (metrics->>'skippedCount')::int = 0 - AND "completedAt" <= NOW() - make_interval(hours => $1) + ${skippedFilter} + AND "completedAt" <= NOW() - make_interval(hours => $(intervalHours)) + ${platformFilter} ORDER BY "completedAt" ASC`, - [intervalHours], + params, ) return rows.map((r) => ({ id: r.id, s3Path: r.s3_path })) } @@ -113,8 +146,8 @@ export class MetadataStore { await this.db.none( `UPDATE integration."snowflakeExportJobs" SET "cleanedAt" = NOW(), "updatedAt" = NOW() - WHERE id = $1`, - [jobId], + WHERE id = $(jobId)`, + { jobId }, ) } @@ -122,21 +155,21 @@ export class MetadataStore { await this.db.none( `UPDATE integration."snowflakeExportJobs" SET "completedAt" = NOW(), - metrics = COALESCE(metrics, '{}'::jsonb) || COALESCE($2::jsonb, '{}'::jsonb), + metrics = COALESCE(metrics, '{}'::jsonb) || COALESCE($(metrics)::jsonb, '{}'::jsonb), "updatedAt" = NOW() - WHERE id = $1`, - [jobId, metrics ? JSON.stringify(metrics) : null], + WHERE id = $(jobId)`, + { jobId, metrics: metrics ? JSON.stringify(metrics) : null }, ) } async markFailed(jobId: number, error: string, metrics?: Partial): Promise { await this.db.none( `UPDATE integration."snowflakeExportJobs" - SET error = $2, "completedAt" = NOW(), - metrics = COALESCE(metrics, '{}'::jsonb) || COALESCE($3::jsonb, '{}'::jsonb), + SET error = $(error), "completedAt" = NOW(), + metrics = COALESCE(metrics, '{}'::jsonb) || COALESCE($(metrics)::jsonb, '{}'::jsonb), "updatedAt" = NOW() - WHERE id = $1`, - [jobId, error, metrics ? JSON.stringify(metrics) : null], + WHERE id = $(jobId)`, + { jobId, error, metrics: metrics ? JSON.stringify(metrics) : null }, ) } @@ -144,11 +177,11 @@ export class MetadataStore { const row = await this.db.oneOrNone<{ max: Date | null }>( `SELECT MAX("exportStartedAt") AS max FROM integration."snowflakeExportJobs" - WHERE platform = $1 - AND "sourceName" = $2 + WHERE platform = $(platform) + AND "sourceName" = $(sourceName) AND "completedAt" IS NOT NULL AND error IS NULL`, - [platform, sourceName], + { platform, sourceName }, ) return row?.max ?? null } diff --git a/services/apps/snowflake_connectors/src/core/s3Service.ts b/services/libs/snowflake/src/s3Service.ts similarity index 100% rename from services/apps/snowflake_connectors/src/core/s3Service.ts rename to services/libs/snowflake/src/s3Service.ts diff --git a/services/apps/snowflake_connectors/src/core/snowflakeExporter.ts b/services/libs/snowflake/src/snowflakeExporter.ts similarity index 98% rename from services/apps/snowflake_connectors/src/core/snowflakeExporter.ts rename to services/libs/snowflake/src/snowflakeExporter.ts index c210bbfc81..f8d951ee76 100644 --- a/services/apps/snowflake_connectors/src/core/snowflakeExporter.ts +++ b/services/libs/snowflake/src/snowflakeExporter.ts @@ -5,7 +5,8 @@ * to export data into S3 as Parquet files. 
*/ import { getServiceChildLogger } from '@crowd/logging' -import { SnowflakeClient } from '@crowd/snowflake' + +import { SnowflakeClient } from './client' const log = getServiceChildLogger('snowflakeExporter') From ad1fb1b782b868cd36a3214b4a5aed2cc3e553fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Tue, 14 Apr 2026 08:23:23 +0200 Subject: [PATCH 02/19] chore: pcc sync worker wip1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../V1775312770__pcc-sync-worker-setup.sql | 7 + pnpm-lock.yaml | 58 ++++- services/apps/pcc_sync_worker/package.json | 4 +- .../src/consumer/pccProjectConsumer.ts | 243 ++++++++++++------ .../pcc_sync_worker/src/parser/rowParser.ts | 30 ++- .../apps/pcc_sync_worker/src/parser/types.ts | 10 +- 6 files changed, 266 insertions(+), 86 deletions(-) diff --git a/backend/src/database/migrations/V1775312770__pcc-sync-worker-setup.sql b/backend/src/database/migrations/V1775312770__pcc-sync-worker-setup.sql index 239ec3468d..16cecdac82 100644 --- a/backend/src/database/migrations/V1775312770__pcc-sync-worker-setup.sql +++ b/backend/src/database/migrations/V1775312770__pcc-sync-worker-setup.sql @@ -18,3 +18,10 @@ CREATE TABLE IF NOT EXISTS pcc_projects_sync_errors ( CREATE UNIQUE INDEX IF NOT EXISTS pcc_sync_errors_dedup_idx ON pcc_projects_sync_errors (external_project_id, error_type) WHERE NOT resolved AND external_project_id IS NOT NULL; + +-- Deduplication index for unidentifiable rows (no external_project_id). +-- Keyed on (error_type, reason) so repeated daily exports don't accumulate duplicate rows +-- for the same class of malformed input (e.g. rows missing PROJECT_ID/NAME/DEPTH). +CREATE UNIQUE INDEX IF NOT EXISTS pcc_sync_errors_dedup_unknown_idx + ON pcc_projects_sync_errors (error_type, (details->>'reason')) + WHERE NOT resolved AND external_project_id IS NULL; diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 884ea6f6bc..46e0b9c92a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1310,6 +1310,49 @@ importers: specifier: ^3.0.1 version: 3.1.0 + services/apps/pcc_sync_worker: + dependencies: + '@crowd/archetype-standard': + specifier: workspace:* + version: link:../../archetypes/standard + '@crowd/archetype-worker': + specifier: workspace:* + version: link:../../archetypes/worker + '@crowd/common': + specifier: workspace:* + version: link:../../libs/common + '@crowd/database': + specifier: workspace:* + version: link:../../libs/database + '@crowd/logging': + specifier: workspace:* + version: link:../../libs/logging + '@crowd/slack': + specifier: workspace:* + version: link:../../libs/slack + '@crowd/snowflake': + specifier: workspace:* + version: link:../../libs/snowflake + '@crowd/temporal': + specifier: workspace:* + version: link:../../libs/temporal + '@temporalio/client': + specifier: ~1.11.8 + version: 1.11.8 + '@temporalio/workflow': + specifier: ~1.11.8 + version: 1.11.8 + tsx: + specifier: ^4.7.1 + version: 4.7.3 + typescript: + specifier: ^5.6.3 + version: 5.6.3 + devDependencies: + nodemon: + specifier: ^3.0.1 + version: 3.1.0 + services/apps/profiles_worker: dependencies: '@crowd/archetype-standard': @@ -1572,9 +1615,6 @@ importers: services/apps/snowflake_connectors: dependencies: - '@aws-sdk/client-s3': - specifier: ^3.700.0 - version: 3.985.0 '@crowd/archetype-standard': specifier: workspace:* version: link:../../archetypes/standard @@ -1614,9 +1654,6 @@ importers: '@crowd/types': specifier: workspace:* version: link:../../libs/types - 
'@dsnp/parquetjs': - specifier: ^1.7.0 - version: 1.8.7(bufferutil@4.0.8)(utf-8-validate@5.0.10) '@temporalio/client': specifier: ~1.11.8 version: 1.11.8 @@ -2360,9 +2397,18 @@ importers: services/libs/snowflake: dependencies: + '@aws-sdk/client-s3': + specifier: ^3.700.0 + version: 3.985.0 + '@crowd/database': + specifier: workspace:* + version: link:../database '@crowd/logging': specifier: workspace:* version: link:../logging + '@dsnp/parquetjs': + specifier: ^1.7.0 + version: 1.8.7(bufferutil@4.0.8)(utf-8-validate@5.0.10) snowflake-sdk: specifier: ^2.3.3 version: 2.3.4(asn1.js@5.4.1)(encoding@0.1.13) diff --git a/services/apps/pcc_sync_worker/package.json b/services/apps/pcc_sync_worker/package.json index 4d25c97efb..f27943916f 100644 --- a/services/apps/pcc_sync_worker/package.json +++ b/services/apps/pcc_sync_worker/package.json @@ -11,7 +11,9 @@ "format-check": "npx prettier --check .", "tsc-check": "tsc --noEmit", "trigger-export": "SERVICE=pcc-sync-worker tsx src/scripts/triggerExport.ts", - "trigger-cleanup": "SERVICE=pcc-sync-worker tsx src/scripts/triggerCleanup.ts" + "trigger-export:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=pcc-sync-worker tsx src/scripts/triggerExport.ts", + "trigger-cleanup": "SERVICE=pcc-sync-worker tsx src/scripts/triggerCleanup.ts", + "trigger-cleanup:local": "set -a && . ../../../backend/.env.dist.local && . ../../../backend/.env.override.local && set +a && SERVICE=pcc-sync-worker tsx src/scripts/triggerCleanup.ts" }, "dependencies": { "@crowd/archetype-standard": "workspace:*", diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index a675d0384f..e32db0d076 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -80,21 +80,49 @@ export class PccProjectConsumer { log.info({ jobId: job.id, s3Path: job.s3Path, dryRun: this.dryRun }, 'Processing PCC job') const startTime = Date.now() + let totalCount = 0 let upsertedCount = 0 let skippedCount = 0 let mismatchCount = 0 - let errorCount = 0 + let schemaMismatchCount = 0 + let schemaMismatchMatchedCount = 0 // SCHEMA_MISMATCH rows that still have a CDP segment match try { await this.db.tx(async (tx) => { for await (const raw of this.s3Service.streamParquetRows(job.s3Path)) { const parsed = parsePccRow(raw) + totalCount++ + if (parsed.ok === false) { - errorCount++ - log.warn({ jobId: job.id, details: parsed.details }, 'Row schema mismatch — skipping') + schemaMismatchCount++ + const errorDetails: Record = { ...parsed.details } + + // If the row had identifiable fields (depth-range errors), attempt a segment + // match so the error record reflects whether a CDP segment exists for this + // project — useful for triage even when the depth rule is unsupported. + if (parsed.pccProjectId) { + const matched = await findSegmentBySourceId(tx, parsed.pccProjectId) + if (matched) { + schemaMismatchMatchedCount++ + errorDetails.matchedSegmentId = matched.id + errorDetails.matchedSegmentName = matched.name + errorDetails.matchedVia = 'sourceId' + } + } + + log.warn( + { jobId: job.id, details: errorDetails }, + 'Row schema mismatch — skipping', + ) if (!this.dryRun) { - await insertSyncError(tx, null, null, 'SCHEMA_MISMATCH', parsed.details) + await insertSyncError( + tx, + parsed.pccProjectId ?? null, + parsed.pccSlug ?? 
null, + 'SCHEMA_MISMATCH', + errorDetails, + ) } continue } @@ -125,13 +153,27 @@ export class PccProjectConsumer { } }) - const metrics = { upsertedCount, skippedCount, mismatchCount, errorCount } - log.info({ jobId: job.id, ...metrics, dryRun: this.dryRun }, 'PCC job completed') + const durationMs = Date.now() - startTime + log.info( + { + jobId: job.id, + dryRun: this.dryRun, + durationMs, + total: totalCount, + upserted: upsertedCount, + skipped: skippedCount, + hierarchyMismatch: mismatchCount, + schemaMismatch: schemaMismatchCount, + schemaMismatchWithCdpMatch: schemaMismatchMatchedCount, + schemaMismatchNoCdpMatch: schemaMismatchCount - schemaMismatchMatchedCount, + }, + 'PCC job completed', + ) await this.metadataStore.markCompleted(job.id, { transformedCount: upsertedCount, - skippedCount: skippedCount + mismatchCount + errorCount, - processingDurationMs: Date.now() - startTime, + skippedCount: skippedCount + mismatchCount + schemaMismatchCount, + processingDurationMs: durationMs, }) } catch (err) { const errorMessage = err instanceof Error ? err.message : String(err) @@ -169,12 +211,7 @@ export class PccProjectConsumer { segment = await findSegmentBySourceId(tx, project.pccProjectId) } - // Step 3: derived slug match - if (!segment && project.pccSlug) { - segment = await findSegmentBySlug(tx, project.pccSlug) - } - - // Step 4: no match → SKIP (Phase 1: project doesn't exist in CDP yet) + // Step 3: no match → SKIP (Phase 1: project doesn't exist in CDP yet) if (!segment) { return { action: 'SKIPPED' } } @@ -199,9 +236,32 @@ export class PccProjectConsumer { } } + // Slug drift detection: log when PCC slug differs from the CDP segment slug. + // We do NOT update the slug — it is a stable identifier referenced by FK from + // securityInsightsEvaluations and related tables. The mismatch is recorded for + // manual review but does not block the sync. 
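      // Illustrative only (slug values are hypothetical): a leaf whose PCC slug is
      // 'example-project' while the matched CDP segment slug is 'example' is recorded as
      //   error_type = 'SLUG_CHANGED'
      //   details    = { segmentId, pccSlug: 'example-project', cdpSlug: 'example' }
      // and, like other sync errors, deduplicated per (external_project_id, error_type)
      // while unresolved.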
+ if (project.pccSlug && segment.slug && project.pccSlug !== segment.slug) { + log.warn( + { segmentId: segment.id, pccSlug: project.pccSlug, cdpSlug: segment.slug }, + 'Slug drift detected — PCC slug differs from CDP segment slug', + ) + if (!this.dryRun) { + await insertSyncError(tx, project.pccProjectId, project.pccSlug, 'SLUG_CHANGED', { + segmentId: segment.id, + pccSlug: project.pccSlug, + cdpSlug: segment.slug, + }) + } + } + if (!this.dryRun) { - await upsertSegment(tx, segment.id, project) - const nameConflict = await upsertInsightsProject(tx, segment.id, project) + await upsertSegment(tx, project.pccProjectId, project) + const nameConflict = await upsertInsightsProject( + tx, + segment.id, + project.pccProjectId, + project, + ) if (nameConflict) { log.warn( { segmentId: segment.id, name: project.name }, @@ -240,42 +300,29 @@ export class PccProjectConsumer { interface SegmentRow { id: string name: string + slug: string | null parentName: string | null grandparentName: string | null } async function findSegmentById(db: DbConnOrTx, segmentId: string): Promise { return db.oneOrNone( - `SELECT id, name, "parentName", "grandparentName" + `SELECT id, name, slug, "parentName", "grandparentName" FROM segments - WHERE id = $(segmentId) AND type = 'subproject' AND "tenantId" = $(tenantId)`, + WHERE id = $(segmentId) AND "tenantId" = $(tenantId)`, { segmentId, tenantId: DEFAULT_TENANT_ID }, ) } async function findSegmentBySourceId(db: DbConnOrTx, sourceId: string): Promise { return db.oneOrNone( - `SELECT id, name, "parentName", "grandparentName" + `SELECT id, name, slug, "parentName", "grandparentName" FROM segments WHERE "sourceId" = $(sourceId) AND type = 'subproject' AND "tenantId" = $(tenantId)`, { sourceId, tenantId: DEFAULT_TENANT_ID }, ) } -async function findSegmentBySlug(db: DbConnOrTx, slug: string): Promise { - const rows = await db.manyOrNone( - `SELECT id, name, "parentName", "grandparentName" - FROM segments - WHERE slug = $(slug) AND type = 'subproject' AND "tenantId" = $(tenantId)`, - { slug, tenantId: DEFAULT_TENANT_ID }, - ) - if (rows.length === 1) return rows[0] - if (rows.length > 1) { - log.warn({ slug, count: rows.length }, 'Ambiguous slug match — skipping') - } - return null -} - function detectHierarchyMismatch(segment: SegmentRow, cdpTarget: CdpHierarchyTarget): string[] { // Only check structural hierarchy (parent/grandparent placement), not the leaf name. // The leaf name is a metadata field we're here to sync — a difference there is an UPDATE, @@ -293,9 +340,12 @@ function detectHierarchyMismatch(segment: SegmentRow, cdpTarget: CdpHierarchyTar async function upsertSegment( db: DbConnOrTx, - segmentId: string, + sourceId: string, project: ParsedPccProject, ): Promise { + // Update all segment levels (group, project, subproject) that share the same sourceId. + // PCC exports every level of the hierarchy with the same PROJECT_ID, so all three CDP + // segment levels are updated in one pass. await db.none( `UPDATE segments SET name = $(name), @@ -303,9 +353,9 @@ async function upsertSegment( maturity = $(maturity), description = $(description), "updatedAt" = NOW() - WHERE id = $(segmentId) AND "tenantId" = $(tenantId)`, + WHERE "sourceId" = $(sourceId) AND "tenantId" = $(tenantId)`, { - segmentId, + sourceId, name: project.name, status: project.status ?? 'active', maturity: project.maturity, @@ -316,49 +366,73 @@ async function upsertSegment( } // Returns true if a name conflict prevented creating the insightsProject row. 
+// Updates insightsProject rows for ALL segment levels sharing the same sourceId +// (group, project, subproject). The INSERT is restricted to the matched subproject +// segment (identified by segmentId) to avoid duplicating insights projects for +// hierarchy-only segments. async function upsertInsightsProject( db: DbConnOrTx, segmentId: string, + sourceId: string, project: ParsedPccProject, ): Promise { - // Partial unique index on segmentId WHERE deletedAt IS NULL means - // ON CONFLICT won't fire for soft-deleted rows. Use UPDATE-then-INSERT. - // Slug is intentionally not updated on name changes — it is a stable identifier - // referenced by FK from securityInsightsEvaluations and related tables. - // Guard the UPDATE against the partial unique index on (name) WHERE deletedAt IS NULL. - // If another active row already holds the new name, the NOT EXISTS subquery causes the - // UPDATE to match 0 rows instead of throwing a 23505 unique violation. - const updated = await db.result( - `UPDATE "insightsProjects" + // Check for a name conflict upfront — an active insightsProject belonging to a segment + // outside this PCC project's sourceId group already holds this name. + // We must exclude all segments sharing the same sourceId (not just the subproject), + // because on repeat syncs the group/project levels already carry the same name and + // would produce false positives if only the subproject segmentId were excluded. + const conflicting = await db.oneOrNone<{ id: string }>( + `SELECT ip.id + FROM "insightsProjects" ip + JOIN segments s ON s.id = ip."segmentId" + WHERE ip.name = $(name) + AND ip."deletedAt" IS NULL + AND s."sourceId" != $(sourceId) + AND s."tenantId" = $(tenantId)`, + { name: project.name, sourceId, tenantId: DEFAULT_TENANT_ID }, + ) + if (conflicting) return true + + // No conflict — update all active insightsProject rows linked to any segment that + // shares the PCC sourceId (group, project, subproject levels). + // Slug is intentionally not updated — it is a stable identifier referenced by FK from + // securityInsightsEvaluations and related tables. + await db.none( + `UPDATE "insightsProjects" ip SET name = $(name), description = $(description), "logoUrl" = $(logoUrl), "updatedAt" = NOW() - WHERE "segmentId" = $(segmentId) AND "deletedAt" IS NULL - AND NOT EXISTS ( - SELECT 1 FROM "insightsProjects" - WHERE name = $(name) AND "deletedAt" IS NULL AND "segmentId" != $(segmentId) - )`, - { segmentId, name: project.name, description: project.description, logoUrl: project.logoUrl }, + FROM segments s + WHERE ip."segmentId" = s.id + AND s."sourceId" = $(sourceId) + AND s."tenantId" = $(tenantId) + AND ip."deletedAt" IS NULL`, + { + sourceId, + tenantId: DEFAULT_TENANT_ID, + name: project.name, + description: project.description, + logoUrl: project.logoUrl, + }, ) - if (updated.rowCount === 0) { - // Either (a) no active row exists yet → proceed to INSERT, - // or (b) a row exists but its name collides with another segment → return conflict. - const exists = await db.oneOrNone<{ id: string }>( - `SELECT id FROM "insightsProjects" WHERE "segmentId" = $(segmentId) AND "deletedAt" IS NULL`, - { segmentId }, - ) - if (exists) return true + // INSERT for the subproject segment only (the matched leaf). + // Partial unique index on segmentId WHERE deletedAt IS NULL means ON CONFLICT won't fire + // for soft-deleted rows — use UPDATE-then-INSERT pattern (UPDATE already done above). 
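  // Sketch of the constraints assumed by this pattern (index names are illustrative,
  // their definitions are not part of this change):
  //   CREATE UNIQUE INDEX ... ON "insightsProjects" ("segmentId") WHERE "deletedAt" IS NULL;
  //   CREATE UNIQUE INDEX ... ON "insightsProjects" (name)        WHERE "deletedAt" IS NULL;
  // Because the arbiter is a partial index, conflicts are only detected against rows it
  // covers, so a soft-deleted row with the same name or segmentId neither blocks nor
  // resolves the INSERT; hence the explicit active-row existence check below.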
+ const exists = await db.oneOrNone<{ id: string }>( + `SELECT id FROM "insightsProjects" WHERE "segmentId" = $(segmentId) AND "deletedAt" IS NULL`, + { segmentId }, + ) + if (exists) return false - const inserted = await db.result( - `INSERT INTO "insightsProjects" (name, slug, description, "segmentId", "logoUrl", "isLF") - VALUES ($(name), generate_slug('insightsProjects', $(name)), $(description), $(segmentId), $(logoUrl), TRUE) - ON CONFLICT (name) WHERE "deletedAt" IS NULL DO NOTHING`, - { name: project.name, description: project.description, segmentId, logoUrl: project.logoUrl }, - ) - if (inserted.rowCount === 0) return true - } + const inserted = await db.result( + `INSERT INTO "insightsProjects" (name, slug, description, "segmentId", "logoUrl", "isLF") + VALUES ($(name), generate_slug('insightsProjects', $(name)), $(description), $(segmentId), $(logoUrl), TRUE) + ON CONFLICT (name) WHERE "deletedAt" IS NULL DO NOTHING`, + { name: project.name, description: project.description, segmentId, logoUrl: project.logoUrl }, + ) + if (inserted.rowCount === 0) return true return false } @@ -370,15 +444,32 @@ async function insertSyncError( errorType: string, details: Record, ): Promise { - await db.none( - `INSERT INTO pcc_projects_sync_errors - (external_project_id, external_project_slug, error_type, details) - VALUES ($(externalProjectId), $(externalProjectSlug), $(errorType), $(details)::jsonb) - ON CONFLICT (external_project_id, error_type) - WHERE NOT resolved AND external_project_id IS NOT NULL - DO UPDATE SET details = EXCLUDED.details, run_at = NOW()`, - { externalProjectId, externalProjectSlug, errorType, details: JSON.stringify(details) }, - ) + const serialized = JSON.stringify(details) + if (externalProjectId !== null) { + // Known project: deduplicate on (external_project_id, error_type). + await db.none( + `INSERT INTO pcc_projects_sync_errors + (external_project_id, external_project_slug, error_type, details) + VALUES ($(externalProjectId), $(externalProjectSlug), $(errorType), $(details)::jsonb) + ON CONFLICT (external_project_id, error_type) + WHERE NOT resolved AND external_project_id IS NOT NULL + DO UPDATE SET details = EXCLUDED.details, external_project_slug = EXCLUDED.external_project_slug, run_at = NOW()`, + { externalProjectId, externalProjectSlug, errorType, details: serialized }, + ) + } else { + // Unidentifiable row (no PROJECT_ID): deduplicate on (error_type, details->>'reason') + // so repeated daily exports don't accumulate duplicate rows for the same class of + // malformed input. Each distinct failure reason gets one unresolved row. 
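  // For example (values illustrative): every malformed row in a daily export that fails
  // with reason 'missing required fields' collapses into a single unresolved
  // ('SCHEMA_MISMATCH', 'missing required fields') row; later runs only refresh its
  // details, external_project_slug and run_at instead of inserting new rows.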
+ await db.none( + `INSERT INTO pcc_projects_sync_errors + (external_project_slug, error_type, details) + VALUES ($(externalProjectSlug), $(errorType), $(details)::jsonb) + ON CONFLICT (error_type, (details->>'reason')) + WHERE NOT resolved AND external_project_id IS NULL + DO UPDATE SET details = EXCLUDED.details, external_project_slug = EXCLUDED.external_project_slug, run_at = NOW()`, + { externalProjectSlug, errorType, details: serialized }, + ) + } } // ───────────────────────────────────────────────────────────────────────────── diff --git a/services/apps/pcc_sync_worker/src/parser/rowParser.ts b/services/apps/pcc_sync_worker/src/parser/rowParser.ts index 74ada6a415..140a51772e 100644 --- a/services/apps/pcc_sync_worker/src/parser/rowParser.ts +++ b/services/apps/pcc_sync_worker/src/parser/rowParser.ts @@ -38,6 +38,23 @@ const STATUS_MAP: Record = { */ const TRANSPARENT_INTERMEDIATES = new Set(['LF Projects, LLC']) +/** + * Parquet serializes Snowflake NUMBER columns as fixed-width big-endian Buffers. + * Handle both the plain number case and the Buffer case. + */ +function parseParquetInt(value: unknown): number { + if (typeof value === 'number') return value + // Parquet serializes Snowflake NUMBER as a Node.js Buffer (big-endian bytes) + if (Buffer.isBuffer(value)) { + let result = 0 + for (const byte of value) { + result = result * 256 + byte + } + return result + } + return Number(value) +} + /** * Parse and validate a raw Parquet row from the PCC export. * Returns ok=false with SCHEMA_MISMATCH if the row is malformed or @@ -48,13 +65,20 @@ export function parsePccRow(raw: Record): ParseResult { const projectId = row.PROJECT_ID const name = row.NAME - const depth = typeof row.DEPTH === 'number' ? row.DEPTH : Number(row.DEPTH) + const depth = parseParquetInt(row.DEPTH) if (!projectId || !name || !Number.isFinite(depth)) { return { ok: false, errorType: 'SCHEMA_MISMATCH', - details: { reason: 'missing required fields', raw }, + details: { + reason: 'missing required fields', + missingFields: [ + ...(!projectId ? ['PROJECT_ID'] : []), + ...(!name ? ['NAME'] : []), + ...(!Number.isFinite(depth) ? ['DEPTH'] : []), + ], + }, } } @@ -64,6 +88,8 @@ export function parsePccRow(raw: Record): ParseResult { return { ok: false, errorType: 'SCHEMA_MISMATCH', + pccProjectId: String(projectId), + pccSlug: (row.SLUG ?? null) as string | null, details: { reason: effectiveDepth < 1 ? 'unexpected root node (depth=1)' : 'unsupported depth > 5', rawDepth: depth, diff --git a/services/apps/pcc_sync_worker/src/parser/types.ts b/services/apps/pcc_sync_worker/src/parser/types.ts index 1a212f4af6..9e42f97580 100644 --- a/services/apps/pcc_sync_worker/src/parser/types.ts +++ b/services/apps/pcc_sync_worker/src/parser/types.ts @@ -59,4 +59,12 @@ export interface ParsedPccProject { export type ParseResult = | { ok: true; project: ParsedPccProject } - | { ok: false; errorType: 'SCHEMA_MISMATCH'; details: Record } + | { + ok: false + errorType: 'SCHEMA_MISMATCH' + details: Record + /** Present when the row had a valid PROJECT_ID (depth-range errors). Used for segment lookup. */ + pccProjectId?: string + /** Present when the row had a valid SLUG (depth-range errors). Used for segment lookup. 
*/ + pccSlug?: string | null + } From b7bc7a551835f19c31e32f47cb8705827af0ec9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Tue, 14 Apr 2026 09:26:24 +0200 Subject: [PATCH 03/19] fix: comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../src/consumer/pccProjectConsumer.ts | 26 +++++++++++-------- .../pcc_sync_worker/src/parser/rowParser.ts | 2 ++ 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index e32db0d076..3c6101ef9e 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -170,21 +170,25 @@ export class PccProjectConsumer { 'PCC job completed', ) - await this.metadataStore.markCompleted(job.id, { - transformedCount: upsertedCount, - skippedCount: skippedCount + mismatchCount + schemaMismatchCount, - processingDurationMs: durationMs, - }) + if (!this.dryRun) { + await this.metadataStore.markCompleted(job.id, { + transformedCount: upsertedCount, + skippedCount: skippedCount + mismatchCount + schemaMismatchCount, + processingDurationMs: durationMs, + }) + } } catch (err) { const errorMessage = err instanceof Error ? err.message : String(err) log.error({ jobId: job.id, err }, 'PCC job failed') - try { - await this.metadataStore.markFailed(job.id, errorMessage, { - processingDurationMs: Date.now() - startTime, - }) - } catch (updateErr) { - log.error({ jobId: job.id, updateErr }, 'Failed to mark job as failed') + if (!this.dryRun) { + try { + await this.metadataStore.markFailed(job.id, errorMessage, { + processingDurationMs: Date.now() - startTime, + }) + } catch (updateErr) { + log.error({ jobId: job.id, updateErr }, 'Failed to mark job as failed') + } } } } diff --git a/services/apps/pcc_sync_worker/src/parser/rowParser.ts b/services/apps/pcc_sync_worker/src/parser/rowParser.ts index 140a51772e..f22977e7d4 100644 --- a/services/apps/pcc_sync_worker/src/parser/rowParser.ts +++ b/services/apps/pcc_sync_worker/src/parser/rowParser.ts @@ -111,6 +111,8 @@ export function parsePccRow(raw: Record): ParseResult { return { ok: false, errorType: 'SCHEMA_MISMATCH', + pccProjectId: String(projectId), + pccSlug: (row.SLUG ?? null) as string | null, details: { reason: cdpTargetResult.reason, rawDepth: depth, effectiveDepth, projectId, name }, } } From d9c3a75e26eb729f5fd0470ec40c979c87032d17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Tue, 14 Apr 2026 12:38:41 +0200 Subject: [PATCH 04/19] fix: lint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../pcc_sync_worker/src/consumer/pccProjectConsumer.ts | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index 3c6101ef9e..b67a433aae 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -111,13 +111,7 @@ export class PccProjectConsumer { } } - log.warn( - { jobId: job.id, details: errorDetails }, - 'Row schema mismatch — skipping', - ) - if (!this.dryRun) { - await insertSyncError( - tx, + log.warn( (!this.dryRun) { await insertSyncError( tx, parsed.pccProjectId ?? 
null, parsed.pccSlug ?? null, 'SCHEMA_MISMATCH', From e1c45e71bc7b65e396281b2fd5713e35e9859d43 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Tue, 14 Apr 2026 13:07:19 +0200 Subject: [PATCH 05/19] fix: lint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../src/consumer/pccProjectConsumer.ts | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index b67a433aae..37b3d9f2b1 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -111,7 +111,17 @@ export class PccProjectConsumer { } } - log.warn( (!this.dryRun) { await insertSyncError( tx, + log.warn( + { + pccProjectId: parsed.pccProjectId ?? null, + pccSlug: parsed.pccSlug ?? null, + ...errorDetails, + }, + 'Schema mismatch in PCC row', + ) + if (!this.dryRun) { + await insertSyncError( + tx, parsed.pccProjectId ?? null, parsed.pccSlug ?? null, 'SCHEMA_MISMATCH', @@ -341,9 +351,13 @@ async function upsertSegment( sourceId: string, project: ParsedPccProject, ): Promise { - // Update all segment levels (group, project, subproject) that share the same sourceId. - // PCC exports every level of the hierarchy with the same PROJECT_ID, so all three CDP - // segment levels are updated in one pass. + // Update all CDP segments whose sourceId equals this PCC PROJECT_ID. + // Each PCC node has its own PROJECT_ID. In CDP, how many segment levels share this + // sourceId depends on the effective depth: + // eff=1 → group+project+subproject all share the same PROJECT_ID (same name for all) + // eff=2 → project+subproject share the leaf's PROJECT_ID; group has a different one + // eff=3 or 4 → only the subproject segment carries this PROJECT_ID + // So this UPDATE always writes the correct name and never touches unrelated levels. 
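  // Illustrative shape of the eff=2 case (ids and names are hypothetical):
  //   { type: 'project',    sourceId: 'a1b2', name: 'Example Project' }
  //   { type: 'subproject', sourceId: 'a1b2', name: 'Example Project' }
  // both match sourceId 'a1b2' and are renamed together, while the group row carries a
  // different sourceId and is left untouched by this statement.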
await db.none( `UPDATE segments SET name = $(name), From 34463ab1ece342e8dacde96415e239d4b67a6337 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Wed, 15 Apr 2026 20:05:11 +0200 Subject: [PATCH 06/19] fix: mouads comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../src/activities/cleanupActivity.ts | 4 +- .../src/activities/exportActivity.ts | 16 +----- .../src/consumer/pccProjectConsumer.ts | 4 +- .../src/activities/cleanupActivity.ts | 5 +- .../src/activities/exportActivity.ts | 13 +---- .../src/consumer/transformerConsumer.ts | 4 +- services/libs/snowflake/src/metadataStore.ts | 53 ++++++++++--------- .../libs/snowflake/src/snowflakeExporter.ts | 16 ++++++ 8 files changed, 54 insertions(+), 61 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts b/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts index 4a51f0f83e..a12728442b 100644 --- a/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts +++ b/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts @@ -1,7 +1,7 @@ import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' import { getServiceChildLogger } from '@crowd/logging' import { SlackChannel, SlackPersona, sendSlackNotification } from '@crowd/slack' -import { MetadataStore, S3Service } from '@crowd/snowflake' +import { MetadataStore, S3Service, buildPlatformFilter } from '@crowd/snowflake' const log = getServiceChildLogger('cleanupActivity') @@ -12,7 +12,7 @@ export async function executeCleanup(intervalHours = 24): Promise { const metadataStore = new MetadataStore(db) const s3Service = new S3Service() - const jobs = await metadataStore.getCleanableJobS3Paths(intervalHours, PLATFORM, false) + const jobs = await metadataStore.getCleanableJobS3Paths(intervalHours, buildPlatformFilter([PLATFORM]), false) log.info({ jobCount: jobs.length, intervalHours }, 'Found cleanable PCC jobs') for (const job of jobs) { diff --git a/services/apps/pcc_sync_worker/src/activities/exportActivity.ts b/services/apps/pcc_sync_worker/src/activities/exportActivity.ts index e811786143..a293636d1c 100644 --- a/services/apps/pcc_sync_worker/src/activities/exportActivity.ts +++ b/services/apps/pcc_sync_worker/src/activities/exportActivity.ts @@ -8,7 +8,7 @@ */ import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' import { getServiceChildLogger } from '@crowd/logging' -import { MetadataStore, SnowflakeExporter } from '@crowd/snowflake' +import { MetadataStore, SnowflakeExporter, buildS3FilenamePrefix } from '@crowd/snowflake' const log = getServiceChildLogger('exportActivity') @@ -51,18 +51,6 @@ function buildSourceQuery(): string { ` } -function buildS3FilenamePrefix(): string { - const now = new Date() - const year = now.getFullYear() - const month = String(now.getMonth() + 1).padStart(2, '0') - const day = String(now.getDate()).padStart(2, '0') - const s3BucketPath = process.env.CROWD_SNOWFLAKE_S3_BUCKET_PATH - if (!s3BucketPath) { - throw new Error('Missing required env var CROWD_SNOWFLAKE_S3_BUCKET_PATH') - } - return `${s3BucketPath}/${PLATFORM}/${SOURCE_NAME}/${year}/${month}/${day}` -} - export async function executeExport(): Promise { log.info({ platform: PLATFORM, sourceName: SOURCE_NAME }, 'Starting PCC export') @@ -72,7 +60,7 @@ export async function executeExport(): Promise { try { const metadataStore = new MetadataStore(db) const sourceQuery = buildSourceQuery() - const s3FilenamePrefix = 
buildS3FilenamePrefix() + const s3FilenamePrefix = buildS3FilenamePrefix(PLATFORM, SOURCE_NAME) const exportStartedAt = new Date() const onBatchComplete = async (s3Path: string, totalRows: number, totalBytes: number) => { diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index 37b3d9f2b1..a3c163ef24 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -9,7 +9,7 @@ import { DEFAULT_TENANT_ID } from '@crowd/common' import { DbConnOrTx, DbConnection, WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' import { getServiceChildLogger } from '@crowd/logging' -import { MetadataStore, S3Service, SnowflakeExportJob } from '@crowd/snowflake' +import { MetadataStore, S3Service, SnowflakeExportJob, buildPlatformFilter } from '@crowd/snowflake' import { parsePccRow } from '../parser' import type { CdpHierarchyTarget, ParsedPccProject } from '../parser' @@ -43,7 +43,7 @@ export class PccProjectConsumer { while (this.running) { try { - const job = await this.metadataStore.claimOldestPendingJob(PLATFORM) + const job = await this.metadataStore.claimOldestPendingJob(buildPlatformFilter([PLATFORM])) if (job) { this.currentPollingIntervalMs = this.pollingIntervalMs diff --git a/services/apps/snowflake_connectors/src/activities/cleanupActivity.ts b/services/apps/snowflake_connectors/src/activities/cleanupActivity.ts index 34f54e9c97..86124fa941 100644 --- a/services/apps/snowflake_connectors/src/activities/cleanupActivity.ts +++ b/services/apps/snowflake_connectors/src/activities/cleanupActivity.ts @@ -1,7 +1,7 @@ import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' import { getServiceChildLogger } from '@crowd/logging' import { SlackChannel, SlackPersona, sendSlackNotification } from '@crowd/slack' -import { MetadataStore, S3Service } from '@crowd/snowflake' +import { MetadataStore, S3Service, buildPlatformFilter } from '@crowd/snowflake' import { getEnabledPlatforms } from '../integrations' @@ -14,9 +14,8 @@ export async function executeCleanup(intervalHours = 24): Promise { const jobs = await metadataStore.getCleanableJobS3Paths( intervalHours, - undefined, + buildPlatformFilter(getEnabledPlatforms()), true, - getEnabledPlatforms(), ) log.info({ jobCount: jobs.length, intervalHours }, 'Found cleanable jobs') diff --git a/services/apps/snowflake_connectors/src/activities/exportActivity.ts b/services/apps/snowflake_connectors/src/activities/exportActivity.ts index 2299ccd568..e61fdf7604 100644 --- a/services/apps/snowflake_connectors/src/activities/exportActivity.ts +++ b/services/apps/snowflake_connectors/src/activities/exportActivity.ts @@ -6,7 +6,7 @@ */ import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' import { getServiceChildLogger } from '@crowd/logging' -import { MetadataStore, SnowflakeExporter } from '@crowd/snowflake' +import { MetadataStore, SnowflakeExporter, buildS3FilenamePrefix } from '@crowd/snowflake' import { PlatformType } from '@crowd/types' import { @@ -26,17 +26,6 @@ export async function getDataSourceNamesForPlatform(platform: PlatformType): Pro const log = getServiceChildLogger('exportActivity') -function buildS3FilenamePrefix(platform: string, sourceName: string): string { - const now = new Date() - const year = now.getFullYear() - const month = String(now.getMonth() + 1).padStart(2, '0') - const day = String(now.getDate()).padStart(2, '0') - const 
s3BucketPath = process.env.CROWD_SNOWFLAKE_S3_BUCKET_PATH - if (!s3BucketPath) { - throw new Error('Missing required env var CROWD_SNOWFLAKE_S3_BUCKET_PATH') - } - return `${s3BucketPath}/${platform}/${sourceName}/${year}/${month}/${day}` -} export async function executeExport( platform: PlatformType, diff --git a/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts b/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts index bcac3d5402..f091e89678 100644 --- a/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts +++ b/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts @@ -9,7 +9,7 @@ import { WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' import { getServiceChildLogger } from '@crowd/logging' import { QUEUE_CONFIG, QueueFactory } from '@crowd/queue' import { REDIS_CONFIG, RedisCache, getRedisClient } from '@crowd/redis' -import { MetadataStore, S3Service, SnowflakeExportJob } from '@crowd/snowflake' +import { MetadataStore, S3Service, SnowflakeExportJob, buildPlatformFilter } from '@crowd/snowflake' import { PlatformType } from '@crowd/types' import { IntegrationResolver } from '../core/integrationResolver' @@ -40,7 +40,7 @@ export class TransformerConsumer { while (this.running) { try { - const job = await this.metadataStore.claimOldestPendingJob(undefined, this.enabledPlatforms) + const job = await this.metadataStore.claimOldestPendingJob(buildPlatformFilter(this.enabledPlatforms)) log.info('Claiming job from metadata store', { job }) if (job) { diff --git a/services/libs/snowflake/src/metadataStore.ts b/services/libs/snowflake/src/metadataStore.ts index 1353453adf..eb5fb8bf19 100644 --- a/services/libs/snowflake/src/metadataStore.ts +++ b/services/libs/snowflake/src/metadataStore.ts @@ -31,6 +31,27 @@ export interface SnowflakeExportJob { metrics: JobMetrics | null } +export interface PlatformFilter { + clause: string + params: Record +} + +/** + * Build a SQL platform filter for use in metadataStore queries. + * + * An empty array returns `AND FALSE` (matches nothing), preventing + * accidental full-table scans when no platforms are configured. + */ +export function buildPlatformFilter(platforms: string[]): PlatformFilter { + if (platforms.length === 0) { + return { clause: 'AND FALSE', params: {} } + } + return { + clause: 'AND platform = ANY($(platforms)::text[])', + params: { platforms }, + } +} + export class MetadataStore { constructor(private readonly db: DbConnection) {} @@ -62,19 +83,9 @@ export class MetadataStore { * Atomically claim the oldest pending job by setting processingStartedAt. * Uses FOR UPDATE SKIP LOCKED so concurrent consumers never pick the same row. */ - async claimOldestPendingJob( - platform?: string, - platforms?: string[], - ): Promise { - let platformFilter = '' - let params: Record = {} - if (platform) { - platformFilter = 'AND platform = $(platform)' - params = { platform } - } else if (platforms && platforms.length > 0) { - platformFilter = 'AND platform = ANY($(platforms)::text[])' - params = { platforms } - } + async claimOldestPendingJob(filter?: PlatformFilter): Promise { + const platformFilter = filter?.clause ?? '' + const params: Record = filter?.params ?? 
{} const row = await this.db.oneOrNone<{ id: number platform: string @@ -108,21 +119,11 @@ export class MetadataStore { async getCleanableJobS3Paths( intervalHours = 24, - platform?: string, + filter?: PlatformFilter, requireZeroSkipped = true, - platforms?: string[], ): Promise<{ id: number; s3Path: string }[]> { - let platformFilter = '' - const params: { intervalHours: number; platform?: string; platforms?: string[] } = { - intervalHours, - } - if (platform) { - platformFilter = 'AND platform = $(platform)' - params.platform = platform - } else if (platforms && platforms.length > 0) { - platformFilter = 'AND platform = ANY($(platforms)::text[])' - params.platforms = platforms - } + const platformFilter = filter?.clause ?? '' + const params: Record = { intervalHours, ...filter?.params } const skippedFilter = requireZeroSkipped ? `AND metrics ? 'skippedCount' AND (metrics->>'skippedCount')::int = 0` : '' diff --git a/services/libs/snowflake/src/snowflakeExporter.ts b/services/libs/snowflake/src/snowflakeExporter.ts index f8d951ee76..8f1dc7ac9c 100644 --- a/services/libs/snowflake/src/snowflakeExporter.ts +++ b/services/libs/snowflake/src/snowflakeExporter.ts @@ -25,6 +25,22 @@ interface CopyIntoRow { fileName: string } +/** + * Build the S3 filename prefix for a batched COPY INTO export. + * Format: {CROWD_SNOWFLAKE_S3_BUCKET_PATH}/{platform}/{sourceName}/{yyyy}/{mm}/{dd} + */ +export function buildS3FilenamePrefix(platform: string, sourceName: string): string { + const s3BucketPath = process.env.CROWD_SNOWFLAKE_S3_BUCKET_PATH + if (!s3BucketPath) { + throw new Error('Missing required env var CROWD_SNOWFLAKE_S3_BUCKET_PATH') + } + const now = new Date() + const year = now.getFullYear() + const month = String(now.getMonth() + 1).padStart(2, '0') + const day = String(now.getDate()).padStart(2, '0') + return `${s3BucketPath}/${platform}/${sourceName}/${year}/${month}/${day}` +} + export class SnowflakeExporter { private readonly snowflake: SnowflakeClient From 31867f43771e80e032820d6da034d67aea236fca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Wed, 15 Apr 2026 20:42:29 +0200 Subject: [PATCH 07/19] fix: joanas comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- pnpm-lock.yaml | 28 ++- services/apps/pcc_sync_worker/package.json | 1 + .../src/activities/cleanupActivity.ts | 6 +- .../src/activities/exportActivity.ts | 53 ++--- .../src/consumer/pccProjectConsumer.ts | 16 +- .../pcc_sync_worker/src/parser/rowParser.ts | 219 ++++++++++-------- .../apps/pcc_sync_worker/src/parser/types.ts | 26 ++- .../src/activities/exportActivity.ts | 1 - .../src/consumer/transformerConsumer.ts | 4 +- 9 files changed, 195 insertions(+), 159 deletions(-) diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 9fa35e91d9..4d6df75e59 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -1336,6 +1336,9 @@ importers: '@crowd/temporal': specifier: workspace:* version: link:../../libs/temporal + '@crowd/types': + specifier: workspace:* + version: link:../../libs/types '@temporalio/client': specifier: ~1.11.8 version: 1.11.8 @@ -7445,6 +7448,7 @@ packages: git-raw-commits@4.0.0: resolution: {integrity: sha512-ICsMM1Wk8xSGMowkOmPrzo2Fgmfo4bMHLNX6ytHjajRJUqvHOw/TFapQ+QG75c3X/tTDDhOSRPGC52dDbNM8FQ==} engines: {node: '>=16'} + deprecated: This package is no longer maintained. For the JavaScript API, please use @conventional-changelog/git-client instead. 
hasBin: true glob-parent@5.1.2: @@ -7466,11 +7470,11 @@ packages: glob@6.0.4: resolution: {integrity: sha512-MKZeRNyYZAVVVG1oZeLaWie1uweH40m9AZwIwxyPbTSX4hHrVYSzLg0Ro5Z5R7XKkIX+Cc6oD1rqeDJnwsB8/A==} - deprecated: Glob versions prior to v9 are no longer supported + deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me glob@7.2.3: resolution: {integrity: sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==} - deprecated: Glob versions prior to v9 are no longer supported + deprecated: Old versions of glob are not supported, and contain widely publicized security vulnerabilities, which have been fixed in the current version. Please update. Support for old versions may be purchased (at exorbitant rates) by contacting i@izs.me global-directory@4.0.1: resolution: {integrity: sha512-wHTUcDUoZ1H5/0iVqEudYW4/kAlN5cZ3j/bXn0Dpbizl9iaUVeWSHqiOjsgk6OW2bkLclbBjzewBz6weQ1zA2Q==} @@ -10975,8 +10979,8 @@ snapshots: dependencies: '@aws-crypto/sha256-browser': 3.0.0 '@aws-crypto/sha256-js': 3.0.0 - '@aws-sdk/client-sso-oidc': 3.572.0 - '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) + '@aws-sdk/client-sso-oidc': 3.572.0(@aws-sdk/client-sts@3.572.0) + '@aws-sdk/client-sts': 3.572.0 '@aws-sdk/core': 3.572.0 '@aws-sdk/credential-provider-node': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0) '@aws-sdk/middleware-host-header': 3.567.0 @@ -11170,11 +11174,11 @@ snapshots: transitivePeerDependencies: - aws-crt - '@aws-sdk/client-sso-oidc@3.572.0': + '@aws-sdk/client-sso-oidc@3.572.0(@aws-sdk/client-sts@3.572.0)': dependencies: '@aws-crypto/sha256-browser': 3.0.0 '@aws-crypto/sha256-js': 3.0.0 - '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) + '@aws-sdk/client-sts': 3.572.0 '@aws-sdk/core': 3.572.0 '@aws-sdk/credential-provider-node': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0) '@aws-sdk/middleware-host-header': 3.567.0 @@ -11213,6 +11217,7 @@ snapshots: '@smithy/util-utf8': 2.3.0 tslib: 2.6.2 transitivePeerDependencies: + - '@aws-sdk/client-sts' - aws-crt '@aws-sdk/client-sso@3.556.0': @@ -11388,11 +11393,11 @@ snapshots: transitivePeerDependencies: - aws-crt - '@aws-sdk/client-sts@3.572.0(@aws-sdk/client-sso-oidc@3.572.0)': + '@aws-sdk/client-sts@3.572.0': dependencies: '@aws-crypto/sha256-browser': 3.0.0 '@aws-crypto/sha256-js': 3.0.0 - '@aws-sdk/client-sso-oidc': 3.572.0 + '@aws-sdk/client-sso-oidc': 3.572.0(@aws-sdk/client-sts@3.572.0) '@aws-sdk/core': 3.572.0 '@aws-sdk/credential-provider-node': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0) '@aws-sdk/middleware-host-header': 3.567.0 @@ -11431,7 +11436,6 @@ snapshots: '@smithy/util-utf8': 2.3.0 tslib: 2.6.2 transitivePeerDependencies: - - '@aws-sdk/client-sso-oidc' - aws-crt '@aws-sdk/client-sts@3.985.0': @@ -11597,7 +11601,7 @@ snapshots: '@aws-sdk/credential-provider-ini@3.572.0(@aws-sdk/client-sso-oidc@3.572.0)(@aws-sdk/client-sts@3.572.0)': dependencies: - '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) + '@aws-sdk/client-sts': 3.572.0 '@aws-sdk/credential-provider-env': 3.568.0 '@aws-sdk/credential-provider-process': 3.572.0 '@aws-sdk/credential-provider-sso': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) @@ -11774,7 +11778,7 @@ snapshots: 
'@aws-sdk/credential-provider-web-identity@3.568.0(@aws-sdk/client-sts@3.572.0)': dependencies: - '@aws-sdk/client-sts': 3.572.0(@aws-sdk/client-sso-oidc@3.572.0) + '@aws-sdk/client-sts': 3.572.0 '@aws-sdk/types': 3.567.0 '@smithy/property-provider': 2.2.0 '@smithy/types': 2.12.0 @@ -12086,7 +12090,7 @@ snapshots: '@aws-sdk/token-providers@3.572.0(@aws-sdk/client-sso-oidc@3.572.0)': dependencies: - '@aws-sdk/client-sso-oidc': 3.572.0 + '@aws-sdk/client-sso-oidc': 3.572.0(@aws-sdk/client-sts@3.572.0) '@aws-sdk/types': 3.567.0 '@smithy/property-provider': 2.2.0 '@smithy/shared-ini-file-loader': 2.4.0 diff --git a/services/apps/pcc_sync_worker/package.json b/services/apps/pcc_sync_worker/package.json index f27943916f..6a421618ae 100644 --- a/services/apps/pcc_sync_worker/package.json +++ b/services/apps/pcc_sync_worker/package.json @@ -20,6 +20,7 @@ "@crowd/archetype-worker": "workspace:*", "@crowd/common": "workspace:*", "@crowd/database": "workspace:*", + "@crowd/types": "workspace:*", "@crowd/logging": "workspace:*", "@crowd/slack": "workspace:*", "@crowd/snowflake": "workspace:*", diff --git a/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts b/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts index a12728442b..f26ae59ff7 100644 --- a/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts +++ b/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts @@ -12,7 +12,11 @@ export async function executeCleanup(intervalHours = 24): Promise { const metadataStore = new MetadataStore(db) const s3Service = new S3Service() - const jobs = await metadataStore.getCleanableJobS3Paths(intervalHours, buildPlatformFilter([PLATFORM]), false) + const jobs = await metadataStore.getCleanableJobS3Paths( + intervalHours, + buildPlatformFilter([PLATFORM]), + false, + ) log.info({ jobCount: jobs.length, intervalHours }, 'Found cleanable PCC jobs') for (const job of jobs) { diff --git a/services/apps/pcc_sync_worker/src/activities/exportActivity.ts b/services/apps/pcc_sync_worker/src/activities/exportActivity.ts index a293636d1c..7d2725c30a 100644 --- a/services/apps/pcc_sync_worker/src/activities/exportActivity.ts +++ b/services/apps/pcc_sync_worker/src/activities/exportActivity.ts @@ -1,7 +1,8 @@ /** - * Export activity: Execute PCC recursive CTE COPY INTO + write metadata. + * Export activity: Execute PCC COPY INTO + write metadata. * - * Full daily export of ANALYTICS.SILVER_DIM.PROJECTS via recursive CTE. + * Full daily export of leaf projects from ANALYTICS.SILVER_DIM.PROJECTS joined + * with PROJECT_SPINE to produce one row per (leaf, hierarchy_level) pair. * No incremental logic — at ~1,538 leaf rows, a full daily export is simpler * and more reliable than incremental (a parent name change would require * re-exporting all descendants). 
@@ -17,37 +18,29 @@ const SOURCE_NAME = 'project-hierarchy' function buildSourceQuery(): string { return ` - WITH RECURSIVE project_hierarchy AS ( - SELECT project_id, name, description, project_logo, project_status, - project_maturity_level, repository_url, slug, parent_id, - 1 AS depth, - name AS depth_1, NULL::VARCHAR AS depth_2, NULL::VARCHAR AS depth_3, - NULL::VARCHAR AS depth_4, NULL::VARCHAR AS depth_5 - FROM ANALYTICS.SILVER_DIM.PROJECTS - WHERE parent_id IS NULL - UNION ALL - SELECT p.project_id, p.name, p.description, p.project_logo, p.project_status, - p.project_maturity_level, p.repository_url, p.slug, p.parent_id, - h.depth + 1, - h.depth_1, - CASE WHEN h.depth + 1 = 2 THEN p.name ELSE h.depth_2 END, - CASE WHEN h.depth + 1 = 3 THEN p.name ELSE h.depth_3 END, - CASE WHEN h.depth + 1 = 4 THEN p.name ELSE h.depth_4 END, - CASE WHEN h.depth + 1 = 5 THEN p.name ELSE h.depth_5 END - FROM ANALYTICS.SILVER_DIM.PROJECTS p - INNER JOIN project_hierarchy h ON p.parent_id = h.project_id - ) - SELECT ph.project_id, ph.name, ph.slug, ph.description, ph.project_logo, ph.repository_url, - ph.project_status, ph.project_maturity_level, ph.depth, - ph.depth_1, ph.depth_2, ph.depth_3, ph.depth_4, ph.depth_5, - s.segment_id - FROM project_hierarchy ph + SELECT + p.project_id, + p.name, + p.description, + p.project_logo, + p.project_status, + p.project_maturity_level, + ps.mapped_project_id, + ps.mapped_project_name, + ps.mapped_project_slug, + ps.hierarchy_level, + s.segment_id + FROM ANALYTICS.SILVER_DIM.PROJECTS p + LEFT JOIN ANALYTICS.SILVER_DIM.PROJECT_SPINE ps ON ps.base_project_id = p.project_id LEFT JOIN ANALYTICS.SILVER_DIM.ACTIVE_SEGMENTS s - ON s.source_id = ph.project_id AND s.project_type = 'subproject' - WHERE ph.project_id NOT IN ( - SELECT DISTINCT parent_id FROM ANALYTICS.SILVER_DIM.PROJECTS + ON s.source_id = p.project_id + AND s.project_type = 'subproject' + WHERE p.project_id NOT IN ( + SELECT DISTINCT parent_id + FROM ANALYTICS.SILVER_DIM.PROJECTS WHERE parent_id IS NOT NULL ) + ORDER BY p.name, ps.hierarchy_level ASC ` } diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index a3c163ef24..cf81da64d3 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -88,9 +88,21 @@ export class PccProjectConsumer { let schemaMismatchMatchedCount = 0 // SCHEMA_MISMATCH rows that still have a CDP segment match try { + // Stream all rows and group by PROJECT_ID before processing. + // The export emits one row per (leaf, hierarchy_level) from the PROJECT_SPINE + // JOIN, so each leaf project produces N rows (one per ancestor level). + const groups = new Map[]>() + for await (const raw of this.s3Service.streamParquetRows(job.s3Path)) { + const projectId = String((raw as Record).PROJECT_ID ?? 
'') + if (!projectId) continue + if (!groups.has(projectId)) groups.set(projectId, []) + const group = groups.get(projectId) + if (group) group.push(raw) + } + await this.db.tx(async (tx) => { - for await (const raw of this.s3Service.streamParquetRows(job.s3Path)) { - const parsed = parsePccRow(raw) + for (const [, rows] of groups) { + const parsed = parsePccRow(rows) totalCount++ diff --git a/services/apps/pcc_sync_worker/src/parser/rowParser.ts b/services/apps/pcc_sync_worker/src/parser/rowParser.ts index f22977e7d4..8df0ff1f05 100644 --- a/services/apps/pcc_sync_worker/src/parser/rowParser.ts +++ b/services/apps/pcc_sync_worker/src/parser/rowParser.ts @@ -1,43 +1,37 @@ /** * PCC project row parser and hierarchy mapper. * - * Transforms a raw Parquet row from the PCC Snowflake export into a - * structured ParsedPccProject, applying the CDP hierarchy mapping rules. + * Transforms a group of raw Parquet rows (one per hierarchy level) for a single + * PCC leaf project into a structured ParsedPccProject, applying the CDP hierarchy + * mapping rules. * - * Pure function — no DB access, no I/O. Fully unit-testable against - * CDP__PCC Integration - proposal-for-migration.csv (project root). + * Pure function — no DB access, no I/O. * - * Mapping rules (effective_depth = raw DEPTH - 1, stripping TLF root): - * Rule 1 (eff=1): group=D2, project=D2, subproject=D2 - * Rule 2 (eff=2): group=D2, project=D3, subproject=D3 - * Rule 3 (eff=3): group=D2, project=D3, subproject=D4 - * Rule 4 (eff=4): group=D3, project=D4, subproject=D5 (drops D2 intermediate) + * Mapping rules (effective_depth = max HIERARCHY_LEVEL - 1, stripping TLF root): + * Rule 1 (eff=1): group=level[1], project=level[1], subproject=level[1] + * Rule 2 (eff=2): group=level[2], project=level[1], subproject=level[1] + * Rule 3 (eff=3): group=level[3], project=level[2], subproject=level[1] + * Rule 4 (eff=4): group=level[3], project=level[2], subproject=level[1] (level[4] intermediate dropped) * - * Depth > 4 (raw > 5): SCHEMA_MISMATCH — surfaced to pcc_projects_sync_errors. + * Effective depth > 4: SCHEMA_MISMATCH — surfaced to pcc_projects_sync_errors. */ +import { SegmentStatus } from '@crowd/types' + import type { MappingRule, ParseResult, PccParquetRow } from './types' /** - * PCC PROJECT_STATUS → CDP segmentsStatus_type enum. - * CDP enum values: active | archived | formation | prospect + * PCC PROJECT_STATUS → CDP SegmentStatus enum. */ -const STATUS_MAP: Record = { - Active: 'active', - Archived: 'archived', - 'Formation - Disengaged': 'formation', - 'Formation - Engaged': 'formation', - 'Formation - Exploratory': 'formation', - 'Formation - On Hold': 'formation', - Prospect: 'prospect', +const STATUS_MAP: Record = { + Active: SegmentStatus.ACTIVE, + Archived: SegmentStatus.ARCHIVED, + 'Formation - Disengaged': SegmentStatus.FORMATION, + 'Formation - Engaged': SegmentStatus.FORMATION, + 'Formation - Exploratory': SegmentStatus.FORMATION, + 'Formation - On Hold': SegmentStatus.FORMATION, + Prospect: SegmentStatus.PROSPECT, } -/** - * Intermediate PCC nodes that are transparent in the CDP hierarchy. - * When D2 equals one of these, it is skipped and D1 ("The Linux Foundation") - * is used as the CDP project group instead. - */ -const TRANSPARENT_INTERMEDIATES = new Set(['LF Projects, LLC']) - /** * Parquet serializes Snowflake NUMBER columns as fixed-width big-endian Buffers. * Handle both the plain number case and the Buffer case. 
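 * For example, a NUMBER value of 300 arrives as the two-byte big-endian Buffer
 * [0x01, 0x2c], which the byte loop decodes as 1 * 256 + 44 = 300.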
@@ -56,43 +50,66 @@ function parseParquetInt(value: unknown): number { } /** - * Parse and validate a raw Parquet row from the PCC export. - * Returns ok=false with SCHEMA_MISMATCH if the row is malformed or - * has an unsupported depth (> 5 raw / > 4 effective). + * Parse and validate all raw Parquet rows for a single PCC leaf project. + * + * Each call receives all rows that share the same PROJECT_ID (one row per + * hierarchy level from the PROJECT_SPINE JOIN). Returns ok=false with + * SCHEMA_MISMATCH if the group is malformed or has an unsupported depth (> 4). */ -export function parsePccRow(raw: Record): ParseResult { - const row = raw as Partial +export function parsePccRow(rawRows: Record[]): ParseResult { + if (rawRows.length === 0) { + return { + ok: false, + errorType: 'SCHEMA_MISMATCH', + details: { reason: 'empty row group' }, + } + } - const projectId = row.PROJECT_ID - const name = row.NAME - const depth = parseParquetInt(row.DEPTH) + // All rows share the same leaf-level fields — use the first row for them. + const firstRaw = rawRows[0] as Partial + const projectId = firstRaw.PROJECT_ID + const name = firstRaw.NAME - if (!projectId || !name || !Number.isFinite(depth)) { + if (!projectId || !name) { return { ok: false, errorType: 'SCHEMA_MISMATCH', details: { reason: 'missing required fields', - missingFields: [ - ...(!projectId ? ['PROJECT_ID'] : []), - ...(!name ? ['NAME'] : []), - ...(!Number.isFinite(depth) ? ['DEPTH'] : []), - ], + missingFields: [...(!projectId ? ['PROJECT_ID'] : []), ...(!name ? ['NAME'] : [])], }, } } - const effectiveDepth = depth - 1 - - if (effectiveDepth < 1 || effectiveDepth > 4) { + // Parse HIERARCHY_LEVEL for each row and sort ascending (level=1 is the leaf). + const levelRows = rawRows + .map((r) => { + const row = r as Partial + return { + level: parseParquetInt(row.HIERARCHY_LEVEL), + name: (row.MAPPED_PROJECT_NAME ?? null) as string | null, + slug: (row.MAPPED_PROJECT_SLUG ?? null) as string | null, + } + }) + .sort((a, b) => a.level - b.level) + + const maxLevel = levelRows[levelRows.length - 1].level + const effectiveDepth = maxLevel - 1 + // Slug of the leaf project itself (hierarchy_level=1 row). + const leafSlug = levelRows[0]?.slug ?? null + + if (!Number.isFinite(effectiveDepth) || effectiveDepth < 1 || effectiveDepth > 4) { return { ok: false, errorType: 'SCHEMA_MISMATCH', pccProjectId: String(projectId), - pccSlug: (row.SLUG ?? null) as string | null, + pccSlug: leafSlug, details: { - reason: effectiveDepth < 1 ? 'unexpected root node (depth=1)' : 'unsupported depth > 5', - rawDepth: depth, + reason: + effectiveDepth < 1 + ? 'unexpected root node (maxHierarchyLevel≤1)' + : 'unsupported depth > 4', + maxLevel, effectiveDepth, projectId, name, @@ -100,38 +117,37 @@ export function parsePccRow(raw: Record): ParseResult { } } - const d1 = row.DEPTH_1 ?? null - const d2 = row.DEPTH_2 ?? null - const d3 = row.DEPTH_3 ?? null - const d4 = row.DEPTH_4 ?? null - const d5 = row.DEPTH_5 ?? null + // Build hierarchy_level → MAPPED_PROJECT_NAME lookup. + const nameAt: Record = {} + for (const row of levelRows) { + nameAt[row.level] = row.name + } - const cdpTargetResult = buildCdpTarget(effectiveDepth as 1 | 2 | 3 | 4, name, d1, d2, d3, d4, d5) + const cdpTargetResult = buildCdpTarget(effectiveDepth as MappingRule, nameAt) if (cdpTargetResult.ok === false) { return { ok: false, errorType: 'SCHEMA_MISMATCH', pccProjectId: String(projectId), - pccSlug: (row.SLUG ?? 
null) as string | null, - details: { reason: cdpTargetResult.reason, rawDepth: depth, effectiveDepth, projectId, name }, + pccSlug: leafSlug, + details: { reason: cdpTargetResult.reason, maxLevel, effectiveDepth, projectId, name }, } } - const rawStatus = row.PROJECT_STATUS ?? null - const mappedStatus = rawStatus ? (STATUS_MAP[rawStatus] ?? null) : null + const rawStatus = firstRaw.PROJECT_STATUS ?? null + const mappedStatus = rawStatus ? (STATUS_MAP[String(rawStatus)] ?? null) : null return { ok: true, project: { - pccProjectId: projectId, - pccSlug: row.SLUG ?? null, - name, + pccProjectId: String(projectId), + pccSlug: leafSlug, + name: String(name), status: mappedStatus, - maturity: row.PROJECT_MATURITY_LEVEL ?? null, - description: row.DESCRIPTION ?? null, - logoUrl: row.PROJECT_LOGO ?? null, - repositoryUrl: row.REPOSITORY_URL ?? null, - segmentIdFromSnowflake: row.SEGMENT_ID ?? null, + maturity: (firstRaw.PROJECT_MATURITY_LEVEL ?? null) as string | null, + description: (firstRaw.DESCRIPTION ?? null) as string | null, + logoUrl: (firstRaw.PROJECT_LOGO ?? null) as string | null, + segmentIdFromSnowflake: (firstRaw.SEGMENT_ID ?? null) as string | null, effectiveDepth, mappingRule: effectiveDepth as MappingRule, cdpTarget: cdpTargetResult.target, @@ -140,46 +156,45 @@ export function parsePccRow(raw: Record): ParseResult { } function buildCdpTarget( - effectiveDepth: 1 | 2 | 3 | 4, - _leafName: string, - d1: string | null, - d2: string | null, - d3: string | null, - d4: string | null, - d5: string | null, + effectiveDepth: MappingRule, + nameAt: Record, ): | { ok: true; target: { group: string; project: string; subproject: string } } | { ok: false; reason: string } { - // When D2 is a transparent intermediate (e.g. "LF Projects, LLC"), skip it - // and promote D1 ("The Linux Foundation") to be the CDP project group. - const d2IsTransparent = !!d2 && TRANSPARENT_INTERMEDIATES.has(d2) - const group2 = d2IsTransparent ? d1 : d2 - switch (effectiveDepth) { - case 1: - // D1=TLF (stripped), leaf=D2 → all three CDP levels are the same node. - // Apply transparency: if D2 is a transparent intermediate, promote D1 as the group. 
- if (!group2) return { ok: false, reason: 'missing DEPTH_2 for effective_depth=1' } - return { ok: true, target: { group: group2, project: group2, subproject: group2 } } - - case 2: - // D1=TLF, D2=group (or transparent→D1), leaf=D3 - if (!group2) return { ok: false, reason: 'missing DEPTH_2 for effective_depth=2' } - if (!d3) return { ok: false, reason: 'missing DEPTH_3 for effective_depth=2' } - return { ok: true, target: { group: group2, project: d3, subproject: d3 } } - - case 3: - // D1=TLF, D2=group (or transparent→D1), D3=project, leaf=D4 - if (!group2) return { ok: false, reason: 'missing DEPTH_2 for effective_depth=3' } - if (!d3) return { ok: false, reason: 'missing DEPTH_3 for effective_depth=3' } - if (!d4) return { ok: false, reason: 'missing DEPTH_4 for effective_depth=3' } - return { ok: true, target: { group: group2, project: d3, subproject: d4 } } - - case 4: - // D1=TLF, D2=intermediate (always dropped at this depth), D3=group, D4=project, leaf=D5 - if (!d3) return { ok: false, reason: 'missing DEPTH_3 for effective_depth=4' } - if (!d4) return { ok: false, reason: 'missing DEPTH_4 for effective_depth=4' } - if (!d5) return { ok: false, reason: 'missing DEPTH_5 for effective_depth=4' } - return { ok: true, target: { group: d3, project: d4, subproject: d5 } } + case 1: { + // TLF at level 2 (stripped), leaf at level 1 → all three CDP levels share the leaf. + const n1 = nameAt[1] + if (!n1) return { ok: false, reason: 'missing MAPPED_PROJECT_NAME at hierarchy_level=1' } + return { ok: true, target: { group: n1, project: n1, subproject: n1 } } + } + case 2: { + // TLF at level 3, group at level 2, leaf at level 1. + const n2 = nameAt[2] + const n1 = nameAt[1] + if (!n2) return { ok: false, reason: 'missing MAPPED_PROJECT_NAME at hierarchy_level=2' } + if (!n1) return { ok: false, reason: 'missing MAPPED_PROJECT_NAME at hierarchy_level=1' } + return { ok: true, target: { group: n2, project: n1, subproject: n1 } } + } + case 3: { + // TLF at level 4, group at level 3, project at level 2, leaf at level 1. + const n3 = nameAt[3] + const n2 = nameAt[2] + const n1 = nameAt[1] + if (!n3) return { ok: false, reason: 'missing MAPPED_PROJECT_NAME at hierarchy_level=3' } + if (!n2) return { ok: false, reason: 'missing MAPPED_PROJECT_NAME at hierarchy_level=2' } + if (!n1) return { ok: false, reason: 'missing MAPPED_PROJECT_NAME at hierarchy_level=1' } + return { ok: true, target: { group: n3, project: n2, subproject: n1 } } + } + case 4: { + // TLF at level 5, intermediate at level 4 (dropped), group at level 3, project at level 2, leaf at level 1. + const n3 = nameAt[3] + const n2 = nameAt[2] + const n1 = nameAt[1] + if (!n3) return { ok: false, reason: 'missing MAPPED_PROJECT_NAME at hierarchy_level=3' } + if (!n2) return { ok: false, reason: 'missing MAPPED_PROJECT_NAME at hierarchy_level=2' } + if (!n1) return { ok: false, reason: 'missing MAPPED_PROJECT_NAME at hierarchy_level=1' } + return { ok: true, target: { group: n3, project: n2, subproject: n1 } } + } } } diff --git a/services/apps/pcc_sync_worker/src/parser/types.ts b/services/apps/pcc_sync_worker/src/parser/types.ts index 9e42f97580..d6bb6d2559 100644 --- a/services/apps/pcc_sync_worker/src/parser/types.ts +++ b/services/apps/pcc_sync_worker/src/parser/types.ts @@ -5,22 +5,29 @@ * so all column names are uppercase. */ -/** Raw Parquet row from the PCC recursive CTE export. */ +/** + * Raw Parquet row from the PCC PROJECT_SPINE export. + * + * One row is emitted per (leaf project, hierarchy level). 
A leaf project at + * depth N produces N rows: hierarchy_level=1 is the leaf itself, + * hierarchy_level=N is the topmost ancestor. All rows for the same leaf share + * the same PROJECT_ID, NAME, PROJECT_STATUS, etc. + */ export interface PccParquetRow { PROJECT_ID: string NAME: string - SLUG: string | null DESCRIPTION: string | null PROJECT_LOGO: string | null - REPOSITORY_URL: string | null PROJECT_STATUS: string | null PROJECT_MATURITY_LEVEL: string | null - DEPTH: number - DEPTH_1: string | null - DEPTH_2: string | null - DEPTH_3: string | null - DEPTH_4: string | null - DEPTH_5: string | null + /** ID of the ancestor at this hierarchy level (hierarchy_level=1 → leaf itself). */ + MAPPED_PROJECT_ID: string | null + /** Name of the ancestor at this hierarchy level. */ + MAPPED_PROJECT_NAME: string | null + /** Slug of the ancestor at this hierarchy level. */ + MAPPED_PROJECT_SLUG: string | null + /** 1 = leaf, N = topmost ancestor. */ + HIERARCHY_LEVEL: number SEGMENT_ID: string | null } @@ -49,7 +56,6 @@ export interface ParsedPccProject { maturity: string | null description: string | null logoUrl: string | null - repositoryUrl: string | null /** segment_id from Snowflake ACTIVE_SEGMENTS JOIN — used for step-1 matching. */ segmentIdFromSnowflake: string | null effectiveDepth: number diff --git a/services/apps/snowflake_connectors/src/activities/exportActivity.ts b/services/apps/snowflake_connectors/src/activities/exportActivity.ts index e61fdf7604..75991d9397 100644 --- a/services/apps/snowflake_connectors/src/activities/exportActivity.ts +++ b/services/apps/snowflake_connectors/src/activities/exportActivity.ts @@ -26,7 +26,6 @@ export async function getDataSourceNamesForPlatform(platform: PlatformType): Pro const log = getServiceChildLogger('exportActivity') - export async function executeExport( platform: PlatformType, sourceName: DataSourceName, diff --git a/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts b/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts index f091e89678..879c956485 100644 --- a/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts +++ b/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts @@ -40,7 +40,9 @@ export class TransformerConsumer { while (this.running) { try { - const job = await this.metadataStore.claimOldestPendingJob(buildPlatformFilter(this.enabledPlatforms)) + const job = await this.metadataStore.claimOldestPendingJob( + buildPlatformFilter(this.enabledPlatforms), + ) log.info('Claiming job from metadata store', { job }) if (job) { From f5c90fbec23c6550f403c93ae2942b7530b32ba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Wed, 15 Apr 2026 21:16:47 +0200 Subject: [PATCH 08/19] fix: comment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index cf81da64d3..3e111a0862 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -373,7 +373,7 @@ async function upsertSegment( await db.none( `UPDATE segments SET name = $(name), - status = $(status)::"segmentsStatus_type", + status = COALESCE($(status)::"segmentsStatus_type", status), 
maturity = $(maturity), description = $(description), "updatedAt" = NOW() @@ -381,7 +381,7 @@ async function upsertSegment( { sourceId, name: project.name, - status: project.status ?? 'active', + status: project.status, maturity: project.maturity, description: project.description, tenantId: DEFAULT_TENANT_ID, From 8b6800a821e2b3b738ea0221da4a39717e6b382f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Wed, 15 Apr 2026 22:24:01 +0200 Subject: [PATCH 09/19] fix: bugfixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- services/apps/pcc_sync_worker/src/parser/rowParser.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/apps/pcc_sync_worker/src/parser/rowParser.ts b/services/apps/pcc_sync_worker/src/parser/rowParser.ts index 8df0ff1f05..bf8af6336f 100644 --- a/services/apps/pcc_sync_worker/src/parser/rowParser.ts +++ b/services/apps/pcc_sync_worker/src/parser/rowParser.ts @@ -98,7 +98,7 @@ export function parsePccRow(rawRows: Record[]): ParseResult { // Slug of the leaf project itself (hierarchy_level=1 row). const leafSlug = levelRows[0]?.slug ?? null - if (!Number.isFinite(effectiveDepth) || effectiveDepth < 1 || effectiveDepth > 4) { + if (!Number.isFinite(effectiveDepth) || !Number.isInteger(effectiveDepth) || effectiveDepth < 1 || effectiveDepth > 4) { return { ok: false, errorType: 'SCHEMA_MISMATCH', From c0a80b02d38b2cc2a7531d66a4c18e03b6c7daa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Thu, 16 Apr 2026 11:06:28 +0200 Subject: [PATCH 10/19] fix: bugfixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- services/apps/pcc_sync_worker/src/parser/rowParser.ts | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/parser/rowParser.ts b/services/apps/pcc_sync_worker/src/parser/rowParser.ts index bf8af6336f..7112cd9dd0 100644 --- a/services/apps/pcc_sync_worker/src/parser/rowParser.ts +++ b/services/apps/pcc_sync_worker/src/parser/rowParser.ts @@ -99,16 +99,18 @@ export function parsePccRow(rawRows: Record[]): ParseResult { const leafSlug = levelRows[0]?.slug ?? null if (!Number.isFinite(effectiveDepth) || !Number.isInteger(effectiveDepth) || effectiveDepth < 1 || effectiveDepth > 4) { + const depthReason = !Number.isFinite(effectiveDepth) || !Number.isInteger(effectiveDepth) + ? 'invalid hierarchy level (non-finite or fractional depth)' + : effectiveDepth < 1 + ? 'unexpected root node (maxHierarchyLevel≤1)' + : 'unsupported depth > 4' return { ok: false, errorType: 'SCHEMA_MISMATCH', pccProjectId: String(projectId), pccSlug: leafSlug, details: { - reason: - effectiveDepth < 1 - ? 
'unexpected root node (maxHierarchyLevel≤1)' - : 'unsupported depth > 4', + reason: depthReason, maxLevel, effectiveDepth, projectId, From 2c9ab8d162bc15abb35ec039ac7e3343ffea4270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Thu, 16 Apr 2026 11:43:28 +0200 Subject: [PATCH 11/19] fix: bugfixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../pcc_sync_worker/src/parser/rowParser.ts | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/parser/rowParser.ts b/services/apps/pcc_sync_worker/src/parser/rowParser.ts index 7112cd9dd0..3fc96e315e 100644 --- a/services/apps/pcc_sync_worker/src/parser/rowParser.ts +++ b/services/apps/pcc_sync_worker/src/parser/rowParser.ts @@ -82,6 +82,8 @@ export function parsePccRow(rawRows: Record[]): ParseResult { } // Parse HIERARCHY_LEVEL for each row and sort ascending (level=1 is the leaf). + // Filter out rows with non-finite levels before sorting to avoid NaN in the comparator, + // which would violate the sort contract and produce unpredictable ordering. const levelRows = rawRows .map((r) => { const row = r as Partial @@ -91,6 +93,7 @@ export function parsePccRow(rawRows: Record[]): ParseResult { slug: (row.MAPPED_PROJECT_SLUG ?? null) as string | null, } }) + .filter((r) => Number.isFinite(r.level) && Number.isInteger(r.level)) .sort((a, b) => a.level - b.level) const maxLevel = levelRows[levelRows.length - 1].level @@ -98,12 +101,18 @@ export function parsePccRow(rawRows: Record[]): ParseResult { // Slug of the leaf project itself (hierarchy_level=1 row). const leafSlug = levelRows[0]?.slug ?? null - if (!Number.isFinite(effectiveDepth) || !Number.isInteger(effectiveDepth) || effectiveDepth < 1 || effectiveDepth > 4) { - const depthReason = !Number.isFinite(effectiveDepth) || !Number.isInteger(effectiveDepth) - ? 'invalid hierarchy level (non-finite or fractional depth)' - : effectiveDepth < 1 - ? 'unexpected root node (maxHierarchyLevel≤1)' - : 'unsupported depth > 4' + if ( + !Number.isFinite(effectiveDepth) || + !Number.isInteger(effectiveDepth) || + effectiveDepth < 1 || + effectiveDepth > 4 + ) { + const depthReason = + !Number.isFinite(effectiveDepth) || !Number.isInteger(effectiveDepth) + ? 'invalid hierarchy level (non-finite or fractional depth)' + : effectiveDepth < 1 + ? 
'unexpected root node (maxHierarchyLevel≤1)' + : 'unsupported depth > 4' return { ok: false, errorType: 'SCHEMA_MISMATCH', From 08be68848db7a80547a05d7545900d62956557d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Thu, 16 Apr 2026 12:11:27 +0200 Subject: [PATCH 12/19] fix: guard against empty levelRows after HIERARCHY_LEVEL filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- services/apps/pcc_sync_worker/src/parser/rowParser.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/services/apps/pcc_sync_worker/src/parser/rowParser.ts b/services/apps/pcc_sync_worker/src/parser/rowParser.ts index 3fc96e315e..3a0e1de3a5 100644 --- a/services/apps/pcc_sync_worker/src/parser/rowParser.ts +++ b/services/apps/pcc_sync_worker/src/parser/rowParser.ts @@ -96,6 +96,16 @@ export function parsePccRow(rawRows: Record[]): ParseResult { .filter((r) => Number.isFinite(r.level) && Number.isInteger(r.level)) .sort((a, b) => a.level - b.level) + if (levelRows.length === 0) { + return { + ok: false, + errorType: 'SCHEMA_MISMATCH', + pccProjectId: String(projectId), + pccSlug: null, + details: { reason: 'no rows with valid HIERARCHY_LEVEL', projectId, name }, + } + } + const maxLevel = levelRows[levelRows.length - 1].level const effectiveDepth = maxLevel - 1 // Slug of the leaf project itself (hierarchy_level=1 row). From a26acf6dac9549d6dd626181251c14bf3608b3b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Mon, 20 Apr 2026 12:16:18 +0200 Subject: [PATCH 13/19] fix: comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../src/consumer/pccProjectConsumer.ts | 47 ++++++++++--------- 1 file changed, 25 insertions(+), 22 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index 3e111a0862..174f270c4a 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -149,22 +149,11 @@ export class PccProjectConsumer { switch (result.action) { case 'UPSERTED': upsertedCount++ + if (result.hierarchyMismatch) mismatchCount++ break case 'SKIPPED': skippedCount++ break - case 'MISMATCH': - mismatchCount++ - if (!this.dryRun) { - await insertSyncError( - tx, - project.pccProjectId, - project.pccSlug, - 'HIERARCHY_MISMATCH', - result.details, - ) - } - break } } }) @@ -189,7 +178,7 @@ export class PccProjectConsumer { if (!this.dryRun) { await this.metadataStore.markCompleted(job.id, { transformedCount: upsertedCount, - skippedCount: skippedCount + mismatchCount + schemaMismatchCount, + skippedCount: skippedCount + schemaMismatchCount, processingDurationMs: durationMs, }) } @@ -217,9 +206,8 @@ export class PccProjectConsumer { tx: DbConnOrTx, project: ParsedPccProject, ): Promise< - | { action: 'UPSERTED' } + | { action: 'UPSERTED'; hierarchyMismatch: boolean } | { action: 'SKIPPED' } - | { action: 'MISMATCH'; details: Record } > { // Step 1: segment_id from Snowflake ACTIVE_SEGMENTS JOIN let segment = project.segmentIdFromSnowflake @@ -236,12 +224,26 @@ export class PccProjectConsumer { return { action: 'SKIPPED' } } - // Hierarchy mismatch check: segment was matched but parent/group differs + // Hierarchy mismatch detection: segment matched but parent/group differs. 
+ // Phase 1 does NOT re-parent segments — hierarchy fields (parent/grandparent id, + // name, slug) are never written. We record the mismatch for manual review but + // still sync the metadata fields (name, status, maturity, description, logo). const mismatchFields = detectHierarchyMismatch(segment, project.cdpTarget) - if (mismatchFields.length > 0) { - return { - action: 'MISMATCH', - details: { + const hasHierarchyMismatch = mismatchFields.length > 0 + + if (hasHierarchyMismatch) { + log.warn( + { + segmentId: segment.id, + segmentName: segment.name, + pccProjectId: project.pccProjectId, + mismatchFields, + cdpTarget: project.cdpTarget, + }, + 'Hierarchy mismatch — recorded for manual review, metadata still synced (Phase 1 scope)', + ) + if (!this.dryRun) { + await insertSyncError(tx, project.pccProjectId, project.pccSlug, 'HIERARCHY_MISMATCH', { segmentId: segment.id, segmentName: segment.name, pccProjectId: project.pccProjectId, @@ -252,7 +254,7 @@ export class PccProjectConsumer { project: segment.parentName ?? segment.name, subproject: segment.name, }, - }, + }) } } @@ -300,12 +302,13 @@ export class PccProjectConsumer { name: project.name, status: project.status, maturity: project.maturity, + hierarchyMismatch: hasHierarchyMismatch, }, '[dry-run] Would upsert segment', ) } - return { action: 'UPSERTED' } + return { action: 'UPSERTED', hierarchyMismatch: hasHierarchyMismatch } } private sleep(ms: number): Promise { From 68d71c7295bd385dc6d5118a8a3b4a099778eb3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Mon, 20 Apr 2026 13:14:47 +0200 Subject: [PATCH 14/19] fix: address copilot and cursor review feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - index.ts: wrap startup IIFE in try/catch to surface init failures cleanly - rowParser.ts: select leafSlug by hierarchy_level=1 instead of array position - pccProjectConsumer.ts: write sync errors on the outer connection so they survive a tx rollback and preserve diagnostics for failed jobs Signed-off-by: Uroš Marolt --- .../src/consumer/pccProjectConsumer.ts | 96 +++++++++++-------- services/apps/pcc_sync_worker/src/index.ts | 45 +++++---- .../pcc_sync_worker/src/parser/rowParser.ts | 6 +- 3 files changed, 85 insertions(+), 62 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index 174f270c4a..80fffabc54 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -3,8 +3,9 @@ * streams each Parquet file, runs the matching cascade, and writes to DB. * * One DB transaction per job — all segment + insightsProject writes roll back - * together on any failure. Errors that can't be auto-resolved are written to - * pcc_projects_sync_errors for manual review. + * together on any failure. Sync error records are written on a separate + * connection (via `this.db`, not `tx`) so they survive a tx rollback — otherwise + * a single failing row would lose all diagnostic breadcrumbs for the batch. */ import { DEFAULT_TENANT_ID } from '@crowd/common' import { DbConnOrTx, DbConnection, WRITE_DB_CONFIG, getDbConnection } from '@crowd/database' @@ -131,15 +132,12 @@ export class PccProjectConsumer { }, 'Schema mismatch in PCC row', ) - if (!this.dryRun) { - await insertSyncError( - tx, - parsed.pccProjectId ?? null, - parsed.pccSlug ?? 
null, - 'SCHEMA_MISMATCH', - errorDetails, - ) - } + await this.recordSyncError( + parsed.pccProjectId ?? null, + parsed.pccSlug ?? null, + 'SCHEMA_MISMATCH', + errorDetails, + ) continue } @@ -205,10 +203,7 @@ export class PccProjectConsumer { private async processRow( tx: DbConnOrTx, project: ParsedPccProject, - ): Promise< - | { action: 'UPSERTED'; hierarchyMismatch: boolean } - | { action: 'SKIPPED' } - > { + ): Promise<{ action: 'UPSERTED'; hierarchyMismatch: boolean } | { action: 'SKIPPED' }> { // Step 1: segment_id from Snowflake ACTIVE_SEGMENTS JOIN let segment = project.segmentIdFromSnowflake ? await findSegmentById(tx, project.segmentIdFromSnowflake) @@ -242,20 +237,18 @@ export class PccProjectConsumer { }, 'Hierarchy mismatch — recorded for manual review, metadata still synced (Phase 1 scope)', ) - if (!this.dryRun) { - await insertSyncError(tx, project.pccProjectId, project.pccSlug, 'HIERARCHY_MISMATCH', { - segmentId: segment.id, - segmentName: segment.name, - pccProjectId: project.pccProjectId, - mismatchFields, - cdpTarget: project.cdpTarget, - currentHierarchy: { - group: segment.grandparentName ?? segment.parentName ?? segment.name, - project: segment.parentName ?? segment.name, - subproject: segment.name, - }, - }) - } + await this.recordSyncError(project.pccProjectId, project.pccSlug, 'HIERARCHY_MISMATCH', { + segmentId: segment.id, + segmentName: segment.name, + pccProjectId: project.pccProjectId, + mismatchFields, + cdpTarget: project.cdpTarget, + currentHierarchy: { + group: segment.grandparentName ?? segment.parentName ?? segment.name, + project: segment.parentName ?? segment.name, + subproject: segment.name, + }, + }) } // Slug drift detection: log when PCC slug differs from the CDP segment slug. @@ -267,13 +260,11 @@ export class PccProjectConsumer { { segmentId: segment.id, pccSlug: project.pccSlug, cdpSlug: segment.slug }, 'Slug drift detected — PCC slug differs from CDP segment slug', ) - if (!this.dryRun) { - await insertSyncError(tx, project.pccProjectId, project.pccSlug, 'SLUG_CHANGED', { - segmentId: segment.id, - pccSlug: project.pccSlug, - cdpSlug: segment.slug, - }) - } + await this.recordSyncError(project.pccProjectId, project.pccSlug, 'SLUG_CHANGED', { + segmentId: segment.id, + pccSlug: project.pccSlug, + cdpSlug: segment.slug, + }) } if (!this.dryRun) { @@ -289,10 +280,15 @@ export class PccProjectConsumer { { segmentId: segment.id, name: project.name }, 'insightsProject name conflict — segment synced, insights project skipped', ) - await insertSyncError(tx, project.pccProjectId, project.pccSlug, 'INSIGHTS_NAME_CONFLICT', { - segmentId: segment.id, - name: project.name, - }) + await this.recordSyncError( + project.pccProjectId, + project.pccSlug, + 'INSIGHTS_NAME_CONFLICT', + { + segmentId: segment.id, + name: project.name, + }, + ) } } else { log.info( @@ -314,6 +310,26 @@ export class PccProjectConsumer { private sleep(ms: number): Promise { return new Promise((resolve) => setTimeout(resolve, ms)) } + + // Records a sync-error row on a separate connection (`this.db`, not the job's + // tx) so the diagnostic survives a tx rollback. Wrapped in try/catch so a + // write failure here never cascades into the enclosing tx. 
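A standalone illustration of the rollback behavior the comment above relies on, using a hypothetical table name (demo_log) rather than anything in this codebase: rows written through the job transaction disappear when the transaction throws, while a write on the base connection commits independently.

```ts
// Illustrative only: why diagnostics go through `this.db` instead of `tx`.
async function demo(db: DbConnection): Promise<void> {
  try {
    await db.tx(async (tx) => {
      await tx.none(`INSERT INTO demo_log (msg) VALUES ('written inside tx')`)
      throw new Error('simulated row failure') // rolls back the INSERT above
    })
  } catch {
    // The tx insert is gone; this write commits on its own connection.
    await db.none(`INSERT INTO demo_log (msg) VALUES ('survives the rollback')`)
  }
}
```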
+ private async recordSyncError( + externalProjectId: string | null, + externalProjectSlug: string | null, + errorType: string, + details: Record, + ): Promise { + if (this.dryRun) return + try { + await insertSyncError(this.db, externalProjectId, externalProjectSlug, errorType, details) + } catch (err) { + log.error( + { err, externalProjectId, externalProjectSlug, errorType }, + 'Failed to record sync error (best-effort)', + ) + } + } } // ───────────────────────────────────────────────────────────────────────────── diff --git a/services/apps/pcc_sync_worker/src/index.ts b/services/apps/pcc_sync_worker/src/index.ts index 31a0e09b35..8bdf367386 100644 --- a/services/apps/pcc_sync_worker/src/index.ts +++ b/services/apps/pcc_sync_worker/src/index.ts @@ -12,31 +12,36 @@ const log = getServiceChildLogger('main') const DRY_RUN = process.env.PCC_DRY_RUN === 'true' setImmediate(async () => { - await svc.init() + try { + await svc.init() - await schedulePccS3Export() - await schedulePccS3Cleanup() + await schedulePccS3Export() + await schedulePccS3Cleanup() - const consumer = await createPccProjectConsumer(DRY_RUN) - consumer.start().catch((err) => { - log.error({ err }, 'Consumer loop crashed') - process.exit(1) - }) + const consumer = await createPccProjectConsumer(DRY_RUN) + consumer.start().catch((err) => { + log.error({ err }, 'Consumer loop crashed') + process.exit(1) + }) - const HARD_TIMEOUT_MS = 2 * 60 * 60 * 1000 + const HARD_TIMEOUT_MS = 2 * 60 * 60 * 1000 - const shutdown = () => { - log.info('Shutdown signal received, stopping consumer...') - consumer.stop() + const shutdown = () => { + log.info('Shutdown signal received, stopping consumer...') + consumer.stop() - setTimeout(() => { - log.warn('Graceful shutdown timed out after 2 hours, forcing exit') - process.exit(1) - }, HARD_TIMEOUT_MS).unref() - } + setTimeout(() => { + log.warn('Graceful shutdown timed out after 2 hours, forcing exit') + process.exit(1) + }, HARD_TIMEOUT_MS).unref() + } - process.on('SIGINT', shutdown) - process.on('SIGTERM', shutdown) + process.on('SIGINT', shutdown) + process.on('SIGTERM', shutdown) - await svc.start() + await svc.start() + } catch (err) { + log.error({ err }, 'PCC worker startup failed') + process.exit(1) + } }) diff --git a/services/apps/pcc_sync_worker/src/parser/rowParser.ts b/services/apps/pcc_sync_worker/src/parser/rowParser.ts index 3a0e1de3a5..0c214df503 100644 --- a/services/apps/pcc_sync_worker/src/parser/rowParser.ts +++ b/services/apps/pcc_sync_worker/src/parser/rowParser.ts @@ -108,8 +108,10 @@ export function parsePccRow(rawRows: Record[]): ParseResult { const maxLevel = levelRows[levelRows.length - 1].level const effectiveDepth = maxLevel - 1 - // Slug of the leaf project itself (hierarchy_level=1 row). - const leafSlug = levelRows[0]?.slug ?? null + // Slug of the leaf project itself (hierarchy_level=1 row). Select by level rather + // than array position — if the level-1 row was filtered out, we want null, not an + // ancestor's slug. + const leafSlug = levelRows.find((r) => r.level === 1)?.slug ?? 
null if ( !Number.isFinite(effectiveDepth) || From d9037e577bb8d011e1bde9a80a8c10a281daa8c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Mon, 20 Apr 2026 13:26:29 +0200 Subject: [PATCH 15/19] fix: address copilot shutdown and diagnostics feedback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - index.ts: register shutdown handler before svc.init() so the consumer drains before the archetype tears down shared infra (DB, Temporal) - consumer: make sleep abortable via AbortController so stop() interrupts the polling backoff immediately instead of waiting up to 30 min - consumer: record a SCHEMA_MISMATCH sync error for Parquet rows with missing PROJECT_ID instead of dropping them silently Signed-off-by: Uroš Marolt --- .../src/consumer/pccProjectConsumer.ts | 42 ++++++++++++++++++- services/apps/pcc_sync_worker/src/index.ts | 24 ++++++----- 2 files changed, 54 insertions(+), 12 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index 80fffabc54..d7b260276c 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -27,6 +27,7 @@ const MAX_POLLING_INTERVAL_MS = 30 * 60 * 1000 // 30 minutes export class PccProjectConsumer { private running = false private currentPollingIntervalMs: number + private readonly shutdownAbort = new AbortController() constructor( private readonly metadataStore: MetadataStore, @@ -71,6 +72,9 @@ export class PccProjectConsumer { stop(): void { this.running = false + // Interrupt any in-flight backoff sleep so shutdown isn't delayed by + // the current polling interval (up to MAX_POLLING_INTERVAL_MS). + this.shutdownAbort.abort() } // ───────────────────────────────────────────────────────────────────────── @@ -87,6 +91,7 @@ export class PccProjectConsumer { let mismatchCount = 0 let schemaMismatchCount = 0 let schemaMismatchMatchedCount = 0 // SCHEMA_MISMATCH rows that still have a CDP segment match + let missingProjectIdCount = 0 try { // Stream all rows and group by PROJECT_ID before processing. @@ -95,12 +100,30 @@ export class PccProjectConsumer { const groups = new Map[]>() for await (const raw of this.s3Service.streamParquetRows(job.s3Path)) { const projectId = String((raw as Record).PROJECT_ID ?? '') - if (!projectId) continue + if (!projectId) { + missingProjectIdCount++ + continue + } if (!groups.has(projectId)) groups.set(projectId, []) const group = groups.get(projectId) if (group) group.push(raw) } + // Record a single SCHEMA_MISMATCH row aggregating all rows dropped for + // missing PROJECT_ID — unidentifiable rows dedup on (error_type, reason) + // so repeated daily exports don't accumulate duplicates. 
+ if (missingProjectIdCount > 0) { + schemaMismatchCount += missingProjectIdCount + log.warn( + { jobId: job.id, count: missingProjectIdCount }, + 'Dropped Parquet rows with missing PROJECT_ID', + ) + await this.recordSyncError(null, null, 'SCHEMA_MISMATCH', { + reason: 'missing PROJECT_ID', + count: missingProjectIdCount, + }) + } + await this.db.tx(async (tx) => { for (const [, rows] of groups) { const parsed = parsePccRow(rows) @@ -169,6 +192,7 @@ export class PccProjectConsumer { schemaMismatch: schemaMismatchCount, schemaMismatchWithCdpMatch: schemaMismatchMatchedCount, schemaMismatchNoCdpMatch: schemaMismatchCount - schemaMismatchMatchedCount, + missingProjectId: missingProjectIdCount, }, 'PCC job completed', ) @@ -308,7 +332,21 @@ export class PccProjectConsumer { } private sleep(ms: number): Promise { - return new Promise((resolve) => setTimeout(resolve, ms)) + return new Promise((resolve) => { + if (this.shutdownAbort.signal.aborted) { + resolve() + return + } + const timer = setTimeout(resolve, ms) + this.shutdownAbort.signal.addEventListener( + 'abort', + () => { + clearTimeout(timer) + resolve() + }, + { once: true }, + ) + }) } // Records a sync-error row on a separate connection (`this.db`, not the job's diff --git a/services/apps/pcc_sync_worker/src/index.ts b/services/apps/pcc_sync_worker/src/index.ts index 8bdf367386..985fc65bc7 100644 --- a/services/apps/pcc_sync_worker/src/index.ts +++ b/services/apps/pcc_sync_worker/src/index.ts @@ -13,19 +13,13 @@ const DRY_RUN = process.env.PCC_DRY_RUN === 'true' setImmediate(async () => { try { - await svc.init() - - await schedulePccS3Export() - await schedulePccS3Cleanup() - + // Create the consumer (and its DB connection) before svc.init() so we can + // register the shutdown handler first. Node fires SIGINT/SIGTERM listeners + // in registration order; registering ours before the archetype's lets the + // consumer drain before shared infra (Postgres, Temporal) is torn down. 
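The registration-order claim in the comment above can be checked with a tiny standalone snippet (illustrative only, not part of the worker): Node invokes the listeners for a given signal synchronously, in the order they were registered.

```ts
// The handler registered first ("drain consumer") runs before the one registered later.
process.on('SIGTERM', () => console.log('first: drain consumer'))
process.on('SIGTERM', () => console.log('second: tear down shared infra'))
// kill -TERM <pid>  ->  prints "first: ..." then "second: ..."
```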
const consumer = await createPccProjectConsumer(DRY_RUN) - consumer.start().catch((err) => { - log.error({ err }, 'Consumer loop crashed') - process.exit(1) - }) const HARD_TIMEOUT_MS = 2 * 60 * 60 * 1000 - const shutdown = () => { log.info('Shutdown signal received, stopping consumer...') consumer.stop() @@ -39,6 +33,16 @@ setImmediate(async () => { process.on('SIGINT', shutdown) process.on('SIGTERM', shutdown) + await svc.init() + + await schedulePccS3Export() + await schedulePccS3Cleanup() + + consumer.start().catch((err) => { + log.error({ err }, 'Consumer loop crashed') + process.exit(1) + }) + await svc.start() } catch (err) { log.error({ err }, 'PCC worker startup failed') From 286765cd781e7e44247f10196232a241626a821a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Mon, 20 Apr 2026 14:19:48 +0200 Subject: [PATCH 16/19] fix: comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../src/consumer/pccProjectConsumer.ts | 21 +++++++++++++++++-- services/libs/snowflake/src/metadataStore.ts | 14 +++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index d7b260276c..3d0f236af5 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -197,7 +197,12 @@ export class PccProjectConsumer { 'PCC job completed', ) - if (!this.dryRun) { + if (this.dryRun) { + // Dry-run must leave no trace: the job was claimed (processingStartedAt + // set by claimOldestPendingJob), so release it so a real run can pick + // it up later. Otherwise dry-run jobs are permanently stuck. + await this.releaseClaimBestEffort(job.id) + } else { await this.metadataStore.markCompleted(job.id, { transformedCount: upsertedCount, skippedCount: skippedCount + schemaMismatchCount, @@ -208,7 +213,11 @@ export class PccProjectConsumer { const errorMessage = err instanceof Error ? err.message : String(err) log.error({ jobId: job.id, err }, 'PCC job failed') - if (!this.dryRun) { + if (this.dryRun) { + // Same rationale as the success path — release the claim so the job + // can be retried on a real run. + await this.releaseClaimBestEffort(job.id) + } else { try { await this.metadataStore.markFailed(job.id, errorMessage, { processingDurationMs: Date.now() - startTime, @@ -220,6 +229,14 @@ export class PccProjectConsumer { } } + private async releaseClaimBestEffort(jobId: number): Promise { + try { + await this.metadataStore.releaseClaim(jobId) + } catch (err) { + log.error({ jobId, err }, 'Failed to release dry-run claim') + } + } + // ───────────────────────────────────────────────────────────────────────── // Per-row matching cascade + writes // ───────────────────────────────────────────────────────────────────────── diff --git a/services/libs/snowflake/src/metadataStore.ts b/services/libs/snowflake/src/metadataStore.ts index eb5fb8bf19..282dd6e0a3 100644 --- a/services/libs/snowflake/src/metadataStore.ts +++ b/services/libs/snowflake/src/metadataStore.ts @@ -152,6 +152,20 @@ export class MetadataStore { ) } + /** + * Release a previously claimed job by clearing processingStartedAt so it can + * be re-claimed. Intended for dry-run or cancellation paths where the claim + * should leave no trace; do not use for failures — use markFailed instead. 
+ */ + async releaseClaim(jobId: number): Promise { + await this.db.none( + `UPDATE integration."snowflakeExportJobs" + SET "processingStartedAt" = NULL, "updatedAt" = NOW() + WHERE id = $(jobId)`, + { jobId }, + ) + } + async markCompleted(jobId: number, metrics?: Partial): Promise { await this.db.none( `UPDATE integration."snowflakeExportJobs" From f3fd2b22004a66726b488bacc3c3d296e61cc13b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Mon, 20 Apr 2026 14:54:29 +0200 Subject: [PATCH 17/19] fix: trim everything MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../src/consumer/pccProjectConsumer.ts | 12 +++--- .../pcc_sync_worker/src/parser/rowParser.ts | 37 +++++++++++++------ 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index 3d0f236af5..03e80de566 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -495,11 +495,13 @@ async function upsertInsightsProject( // shares the PCC sourceId (group, project, subproject levels). // Slug is intentionally not updated — it is a stable identifier referenced by FK from // securityInsightsEvaluations and related tables. + // logoUrl won't be updated in InsightsProject until we confirm that the format is + // compatible with the Insights Squared standard. await db.none( `UPDATE "insightsProjects" ip SET name = $(name), description = $(description), - "logoUrl" = $(logoUrl), + -- "logoUrl" = $(logoUrl), "updatedAt" = NOW() FROM segments s WHERE ip."segmentId" = s.id @@ -511,7 +513,6 @@ async function upsertInsightsProject( tenantId: DEFAULT_TENANT_ID, name: project.name, description: project.description, - logoUrl: project.logoUrl, }, ) @@ -524,11 +525,12 @@ async function upsertInsightsProject( ) if (exists) return false + // logoUrl intentionally omitted from the INSERT column list — see note above. const inserted = await db.result( - `INSERT INTO "insightsProjects" (name, slug, description, "segmentId", "logoUrl", "isLF") - VALUES ($(name), generate_slug('insightsProjects', $(name)), $(description), $(segmentId), $(logoUrl), TRUE) + `INSERT INTO "insightsProjects" (name, slug, description, "segmentId", "isLF") + VALUES ($(name), generate_slug('insightsProjects', $(name)), $(description), $(segmentId), TRUE) ON CONFLICT (name) WHERE "deletedAt" IS NULL DO NOTHING`, - { name: project.name, description: project.description, segmentId, logoUrl: project.logoUrl }, + { name: project.name, description: project.description, segmentId }, ) if (inserted.rowCount === 0) return true diff --git a/services/apps/pcc_sync_worker/src/parser/rowParser.ts b/services/apps/pcc_sync_worker/src/parser/rowParser.ts index 0c214df503..89eda3e8df 100644 --- a/services/apps/pcc_sync_worker/src/parser/rowParser.ts +++ b/services/apps/pcc_sync_worker/src/parser/rowParser.ts @@ -49,6 +49,19 @@ function parseParquetInt(value: unknown): number { return Number(value) } +/** + * Normalize a string field from PCC: coerce to string, trim surrounding + * whitespace, and collapse empty results to null. PCC source data sometimes + * carries accidental leading/trailing whitespace — trimming here prevents + * spurious hierarchy mismatches, failed STATUS_MAP lookups, and padded names + * being persisted to segments. 
+ */ +function trimOrNull(value: unknown): string | null { + if (value == null) return null + const s = String(value).trim() + return s === '' ? null : s +} + /** * Parse and validate all raw Parquet rows for a single PCC leaf project. * @@ -67,8 +80,8 @@ export function parsePccRow(rawRows: Record[]): ParseResult { // All rows share the same leaf-level fields — use the first row for them. const firstRaw = rawRows[0] as Partial - const projectId = firstRaw.PROJECT_ID - const name = firstRaw.NAME + const projectId = trimOrNull(firstRaw.PROJECT_ID) + const name = trimOrNull(firstRaw.NAME) if (!projectId || !name) { return { @@ -89,8 +102,8 @@ export function parsePccRow(rawRows: Record[]): ParseResult { const row = r as Partial return { level: parseParquetInt(row.HIERARCHY_LEVEL), - name: (row.MAPPED_PROJECT_NAME ?? null) as string | null, - slug: (row.MAPPED_PROJECT_SLUG ?? null) as string | null, + name: trimOrNull(row.MAPPED_PROJECT_NAME), + slug: trimOrNull(row.MAPPED_PROJECT_SLUG), } }) .filter((r) => Number.isFinite(r.level) && Number.isInteger(r.level)) @@ -157,20 +170,20 @@ export function parsePccRow(rawRows: Record[]): ParseResult { } } - const rawStatus = firstRaw.PROJECT_STATUS ?? null - const mappedStatus = rawStatus ? (STATUS_MAP[String(rawStatus)] ?? null) : null + const rawStatus = trimOrNull(firstRaw.PROJECT_STATUS) + const mappedStatus = rawStatus ? (STATUS_MAP[rawStatus] ?? null) : null return { ok: true, project: { - pccProjectId: String(projectId), + pccProjectId: projectId, pccSlug: leafSlug, - name: String(name), + name, status: mappedStatus, - maturity: (firstRaw.PROJECT_MATURITY_LEVEL ?? null) as string | null, - description: (firstRaw.DESCRIPTION ?? null) as string | null, - logoUrl: (firstRaw.PROJECT_LOGO ?? null) as string | null, - segmentIdFromSnowflake: (firstRaw.SEGMENT_ID ?? null) as string | null, + maturity: trimOrNull(firstRaw.PROJECT_MATURITY_LEVEL), + description: trimOrNull(firstRaw.DESCRIPTION), + logoUrl: trimOrNull(firstRaw.PROJECT_LOGO), + segmentIdFromSnowflake: trimOrNull(firstRaw.SEGMENT_ID), effectiveDepth, mappingRule: effectiveDepth as MappingRule, cdpTarget: cdpTargetResult.target, From c6b6350b6b6ff1b3c8827d95a9acbd91df07548d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Mon, 20 Apr 2026 15:00:44 +0200 Subject: [PATCH 18/19] fix: query MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index 03e80de566..1afe14598b 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -496,12 +496,13 @@ async function upsertInsightsProject( // Slug is intentionally not updated — it is a stable identifier referenced by FK from // securityInsightsEvaluations and related tables. // logoUrl won't be updated in InsightsProject until we confirm that the format is - // compatible with the Insights Squared standard. + // compatible with the Insights Squared standard. Do NOT reintroduce it as a + // `--`-commented SQL line: pg-promise scans placeholders textually and would still + // require the `logoUrl` param, triggering "Property 'logoUrl' doesn't exist". 
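The pg-promise behavior described in the comment above is easy to reproduce in isolation. A minimal standalone sketch using pgp.as.format directly (the import style and query text are illustrative, not taken from the worker's own connection setup):

```ts
import pgPromise from 'pg-promise'

const pgp = pgPromise()

// Named-parameter substitution is purely textual: the $(logoUrl) placeholder is
// still detected even though the SQL line is commented out, so formatting throws
// Error: Property 'logoUrl' doesn't exist.
pgp.as.format(
  `UPDATE "insightsProjects" SET name = $(name) -- , "logoUrl" = $(logoUrl)`,
  { name: 'example' },
)
```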
await db.none( `UPDATE "insightsProjects" ip SET name = $(name), description = $(description), - -- "logoUrl" = $(logoUrl), "updatedAt" = NOW() FROM segments s WHERE ip."segmentId" = s.id From 15bb77c4cf947f6698f2ee1d7d411ef967560ce3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uro=C5=A1=20Marolt?= Date: Mon, 20 Apr 2026 15:20:25 +0200 Subject: [PATCH 19/19] fix: comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uroš Marolt --- .../src/activities/cleanupActivity.ts | 2 +- .../src/consumer/pccProjectConsumer.ts | 59 ++++++++++++++++--- .../src/consumer/transformerConsumer.ts | 2 +- 3 files changed, 52 insertions(+), 11 deletions(-) diff --git a/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts b/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts index f26ae59ff7..b7dca73cf4 100644 --- a/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts +++ b/services/apps/pcc_sync_worker/src/activities/cleanupActivity.ts @@ -30,7 +30,7 @@ export async function executeCleanup(intervalHours = 24): Promise { SlackChannel.CDP_INTEGRATIONS_ALERTS, SlackPersona.ERROR_REPORTER, 'PCC S3 Cleanup Failed', - `Failed to clean job \`${job.id}\` at \`${job.s3Path}\`.\n\n*Error:* ${err instanceof Error ? err.message : err}`, + `Failed to clean job \`${job.id}\` at \`${job.s3Path}\`.\n\n*Error:* ${err instanceof Error ? err.message : String(err)}`, ) } } diff --git a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts index 1afe14598b..31993d0ae9 100644 --- a/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts +++ b/services/apps/pcc_sync_worker/src/consumer/pccProjectConsumer.ts @@ -28,6 +28,10 @@ export class PccProjectConsumer { private running = false private currentPollingIntervalMs: number private readonly shutdownAbort = new AbortController() + // Jobs already processed in this dry-run lifetime. Dry-run releases the + // claim so nothing is persisted, which means the same "oldest pending" job + // would otherwise be re-claimed on every loop iteration → endless reprocessing. + private readonly dryRunProcessedJobIds = new Set() constructor( private readonly metadataStore: MetadataStore, @@ -48,10 +52,18 @@ export class PccProjectConsumer { const job = await this.metadataStore.claimOldestPendingJob(buildPlatformFilter([PLATFORM])) if (job) { - this.currentPollingIntervalMs = this.pollingIntervalMs - await this.processJob(job) - await new Promise((resolve) => setImmediate(resolve)) - continue + if (this.dryRun && this.dryRunProcessedJobIds.has(job.id)) { + // Already processed in this dry-run lifetime — the claim is about to be + // released again; fall through to the "no pending jobs" path so we back + // off instead of churning the same job forever. + await this.releaseClaimBestEffort(job.id) + } else { + this.currentPollingIntervalMs = this.pollingIntervalMs + await this.processJob(job) + if (this.dryRun) this.dryRunProcessedJobIds.add(job.id) + await new Promise((resolve) => setImmediate(resolve)) + continue + } } } catch (err) { log.error({ err }, 'Error in consumer loop') @@ -97,9 +109,13 @@ export class PccProjectConsumer { // Stream all rows and group by PROJECT_ID before processing. // The export emits one row per (leaf, hierarchy_level) from the PROJECT_SPINE // JOIN, so each leaf project produces N rows (one per ancestor level). 
+ // PROJECT_ID is trimmed at the group-key boundary (PCC source data occasionally + // carries surrounding whitespace) so the same logical project never splits + // into multiple groups. const groups = new Map[]>() for await (const raw of this.s3Service.streamParquetRows(job.s3Path)) { - const projectId = String((raw as Record).PROJECT_ID ?? '') + const rawId = (raw as Record).PROJECT_ID + const projectId = rawId == null ? '' : String(rawId).trim() if (!projectId) { missingProjectIdCount++ continue @@ -111,9 +127,10 @@ export class PccProjectConsumer { // Record a single SCHEMA_MISMATCH row aggregating all rows dropped for // missing PROJECT_ID — unidentifiable rows dedup on (error_type, reason) - // so repeated daily exports don't accumulate duplicates. + // so repeated daily exports don't accumulate duplicates. Kept as a + // separate counter (not folded into schemaMismatchCount) because the + // two track different granularities: rows vs project groups. if (missingProjectIdCount > 0) { - schemaMismatchCount += missingProjectIdCount log.warn( { jobId: job.id, count: missingProjectIdCount }, 'Dropped Parquet rows with missing PROJECT_ID', @@ -205,7 +222,10 @@ export class PccProjectConsumer { } else { await this.metadataStore.markCompleted(job.id, { transformedCount: upsertedCount, - skippedCount: skippedCount + schemaMismatchCount, + // schemaMismatchCount counts project groups; missingProjectIdCount + // counts raw rows dropped before grouping — both are "not synced" + // and belong in skippedCount. + skippedCount: skippedCount + schemaMismatchCount + missingProjectIdCount, processingDurationMs: durationMs, }) } @@ -533,7 +553,28 @@ async function upsertInsightsProject( ON CONFLICT (name) WHERE "deletedAt" IS NULL DO NOTHING`, { name: project.name, description: project.description, segmentId }, ) - if (inserted.rowCount === 0) return true + + if (inserted.rowCount === 0) { + // INSERT was a no-op on the partial unique index (name) WHERE "deletedAt" IS NULL. + // The pre-check above already ruled out cross-sourceId conflicts, so the row holding + // the name must be a same-sourceId sibling — shallow hierarchies (eff=1/2) where + // group/project/subproject share both name and sourceId. Verify before concluding + // it's not a conflict (guards against a hypothetical race with another writer). + const holder = await db.oneOrNone<{ sameFamily: boolean }>( + `SELECT s."sourceId" = $(sourceId) AS "sameFamily" + FROM "insightsProjects" ip + JOIN segments s ON s.id = ip."segmentId" + WHERE ip.name = $(name) + AND ip."deletedAt" IS NULL + AND s."tenantId" = $(tenantId) + LIMIT 1`, + { name: project.name, sourceId, tenantId: DEFAULT_TENANT_ID }, + ) + // Same-family holder (or holder vanished between INSERT and re-check) → not a real + // conflict; the project family is already represented via the sibling row. 
+ if (!holder || holder.sameFamily) return false + return true + } return false } diff --git a/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts b/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts index 879c956485..a2d643d80c 100644 --- a/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts +++ b/services/apps/snowflake_connectors/src/consumer/transformerConsumer.ts @@ -159,7 +159,7 @@ export async function createTransformerConsumer(): Promise const pollingIntervalMs = 10_000 // 10 seconds - const enabledPlatforms = getEnabledPlatforms() as string[] + const enabledPlatforms = getEnabledPlatforms() return new TransformerConsumer( metadataStore,