diff --git a/backend/src/osspckgs/migrations/V1779710880__initial_schema.sql b/backend/src/osspckgs/migrations/V1779710880__initial_schema.sql index e756d54e4d..396012d6ee 100644 --- a/backend/src/osspckgs/migrations/V1779710880__initial_schema.sql +++ b/backend/src/osspckgs/migrations/V1779710880__initial_schema.sql @@ -7,7 +7,11 @@ CREATE TABLE packages_universe ( ecosystem text NOT NULL, namespace text, name text NOT NULL, - downloads_30d bigint, + -- Cached latest 30-day window count. Written by the same weekly ranking worker that upserts rows into + -- the downloads_last_30d table (keyed by purl/end_date). This column is the denormalized latest value + -- used directly by rank_packages_universe() to avoid a join; the downloads_last_30d table holds the + -- full rolling-window timeline. + downloads_last_30d bigint, dependent_packages_count int, dependent_repos_count int, criticality_score numeric(10, 4), @@ -54,7 +58,6 @@ CREATE TABLE packages ( latest_release_at timestamptz, dependent_packages_count int, dependent_repos_count int, - downloads_last_month bigint, -- has_critical_vulnerability bool NOT NULL DEFAULT FALSE, -- Deferred: semantics undecided between (a) any advisory with no fixed_version vs -- (b) latest_version falls inside an affected semver range. Lateral join against @@ -79,10 +82,6 @@ CREATE INDEX ON packages (ecosystem, name); CREATE INDEX ON packages USING gin (keywords); -CREATE INDEX ON packages (downloads_last_month DESC) -WHERE - status = 'active'; - -- INDEX on has_critical_vulnerability removed — column is commented out above. -- Uncomment both when semantics are decided. @@ -126,7 +125,6 @@ CREATE TABLE versions ( is_yanked bool, is_prerelease bool NOT NULL DEFAULT FALSE, license text, -- SPDX where available; can differ per version - download_count bigint, -- per-version where available (npm, crates) last_synced_at timestamptz NOT NULL DEFAULT NOW(), PRIMARY KEY (id, package_id), UNIQUE (package_id, number) @@ -646,9 +644,27 @@ CREATE TABLE package_maintainers ( ); -- ============================================================ --- DOWNLOADS (time-series, partitioned by month via pg_partman) +-- DOWNLOADS +-- +-- Two tables track download volume at different tiers and granularities: +-- +-- downloads_daily (tier 2 — packages) +-- Source of truth for daily download counts. One row per package per day. +-- No denormalized rollup on the packages table — consumers SUM over this +-- table when they need a window (e.g. last 30 days). +-- +-- downloads_last_30d (tier 3 — packages_universe) +-- Rolling 30-day download timeline keyed by purl. Each row represents one +-- 30-day window (start_date..end_date). Keyed by purl so rows survive the +-- weekly truncation of packages_universe. The latest window's count is also +-- cached in packages_universe.downloads_last_30d for fast access by the +-- criticality-ranking function (no join needed). +-- +-- ============================================================ +-- DOWNLOADS DAILY (tier 2 — packages, daily granularity) -- --- pg_partman MUST be enabled in OCI config before this migration runs: +-- Partitioned by month via pg_partman. pg_partman MUST be enabled in OCI +-- config before this migration runs: -- OCI Console → Database → Configuration → Extensions → enable pg_partman -- -- After enabling, run the setup below (once, outside Flyway or in a @@ -676,7 +692,7 @@ CREATE TABLE package_maintainers ( -- ============================================================ CREATE TABLE downloads_daily ( id bigserial, - package_id bigint NOT NULL, + package_id bigint NOT NULL REFERENCES packages (id), date date NOT NULL, count bigint NOT NULL, PRIMARY KEY (id, date), @@ -684,6 +700,57 @@ CREATE TABLE downloads_daily ( ) PARTITION BY RANGE (date); +-- ============================================================ +-- DOWNLOADS LAST 30D (tier 3 — packages_universe, rolling 30-day granularity) +-- +-- Historical timeline of rolling 30-day download counts, keyed by purl. +-- Each row captures one window: downloads from start_date to end_date (inclusive). +-- Keyed by purl (not packages_universe.id) so rows survive the weekly +-- truncation of packages_universe. The latest window is also written +-- to packages_universe.downloads_last_30d column for fast access by the ranking function. +-- +-- Writers should upsert: INSERT ... ON CONFLICT (purl, end_date) DO UPDATE SET count = EXCLUDED.count, start_date = EXCLUDED.start_date +-- PK includes end_date because Postgres requires the partition key to be +-- part of the primary key on range-partitioned tables. +-- +-- Partitioned by month via pg_partman. pg_partman MUST be enabled in OCI +-- config before this migration runs: +-- OCI Console → Database → Configuration → Extensions → enable pg_partman +-- +-- After enabling, run the setup below (once, outside Flyway or in a +-- separate migration) to register pg_partman and create initial partitions: +-- +-- CREATE EXTENSION IF NOT EXISTS pg_partman SCHEMA partman; +-- +-- SELECT partman.create_parent( +-- p_parent_table => 'public.downloads_last_30d', +-- p_control => 'end_date', +-- p_interval => '1 month', +-- p_premake => 3 -- pre-creates 3 future monthly partitions +-- ); +-- +-- -- pg_cron job to maintain partitions (also needs pg_cron enabled in OCI): +-- SELECT cron.schedule('partman-maintain-30d', '0 2 * * *', +-- $$CALL partman.run_maintenance_proc()$$); +-- +-- Without this setup, inserts into downloads_last_30d will fail with +-- "no partition found for row". The table structure below is correct; +-- only the partition management setup is deferred. +-- +-- ============================================================ +CREATE TABLE downloads_last_30d ( + id bigserial, + purl text NOT NULL, + start_date date NOT NULL, + end_date date NOT NULL, + count bigint NOT NULL, + PRIMARY KEY (id, end_date), + UNIQUE (purl, end_date) +) +PARTITION BY RANGE (end_date); + +CREATE INDEX ON downloads_last_30d (purl, end_date DESC); + -- ============================================================ -- CRITICALITY RANKING FUNCTION -- ============================================================ @@ -707,7 +774,7 @@ BEGIN -- and to compress the gap between small and large values (e.g. LN(1)=0 -- vs LN(2)≈0.69 gives a gentler floor than LN(0)=-∞). Typically 1.0. -- - -- Until the npm-registry / Maven downloads enricher runs, downloads_30d + -- Until the npm-registry / Maven downloads enricher runs, downloads_last_30d -- is NULL on every row. weight_downloads contributes 0 to the score; -- ranking effectively reduces to: -- LN(1 + dependent_repos_count) * weight_dependent_repos @@ -717,7 +784,7 @@ BEGIN WITH new_scores AS ( SELECT id, - ( LN(log_smoothing + COALESCE(downloads_30d, 0)) * weight_downloads + ( LN(log_smoothing + COALESCE(downloads_last_30d, 0)) * weight_downloads + LN(log_smoothing + COALESCE(dependent_repos_count, 0)) * weight_dependent_repos + LN(log_smoothing + COALESCE(dependent_packages_count, 0)) * weight_dependent_packages )::numeric(10, 4) AS new_score