Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 79 additions & 12 deletions backend/src/osspckgs/migrations/V1779710880__initial_schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ CREATE TABLE packages_universe (
ecosystem text NOT NULL,
namespace text,
name text NOT NULL,
downloads_30d bigint,
-- Cached latest 30-day window count. Written by the same weekly ranking worker that upserts rows into
-- the downloads_last_30d table (keyed by purl/end_date). This column is the denormalized latest value
-- used directly by rank_packages_universe() to avoid a join; the downloads_last_30d table holds the
-- full rolling-window timeline.
downloads_last_30d bigint,
dependent_packages_count int,
dependent_repos_count int,
criticality_score numeric(10, 4),
Expand Down Expand Up @@ -54,7 +58,6 @@ CREATE TABLE packages (
latest_release_at timestamptz,
dependent_packages_count int,
dependent_repos_count int,
downloads_last_month bigint,
-- has_critical_vulnerability bool NOT NULL DEFAULT FALSE,
-- Deferred: semantics undecided between (a) any advisory with no fixed_version vs
-- (b) latest_version falls inside an affected semver range. Lateral join against
Expand All @@ -79,10 +82,6 @@ CREATE INDEX ON packages (ecosystem, name);

CREATE INDEX ON packages USING gin (keywords);

CREATE INDEX ON packages (downloads_last_month DESC)
WHERE
status = 'active';

-- INDEX on has_critical_vulnerability removed — column is commented out above.
-- Uncomment both when semantics are decided.

Expand Down Expand Up @@ -126,7 +125,6 @@ CREATE TABLE versions (
is_yanked bool,
is_prerelease bool NOT NULL DEFAULT FALSE,
license text, -- SPDX where available; can differ per version
download_count bigint, -- per-version where available (npm, crates)
last_synced_at timestamptz NOT NULL DEFAULT NOW(),
PRIMARY KEY (id, package_id),
UNIQUE (package_id, number)
Expand Down Expand Up @@ -646,9 +644,27 @@ CREATE TABLE package_maintainers (
);

-- ============================================================
-- DOWNLOADS (time-series, partitioned by month via pg_partman)
-- DOWNLOADS
--
-- Two tables track download volume at different tiers and granularities:
--
-- downloads_daily (tier 2 — packages)
-- Source of truth for daily download counts. One row per package per day.
-- No denormalized rollup on the packages table — consumers SUM over this
-- table when they need a window (e.g. last 30 days).
--
-- downloads_last_30d (tier 3 — packages_universe)
-- Rolling 30-day download timeline keyed by purl. Each row represents one
-- 30-day window (start_date..end_date). Keyed by purl so rows survive the
-- weekly truncation of packages_universe. The latest window's count is also
-- cached in packages_universe.downloads_last_30d for fast access by the
-- criticality-ranking function (no join needed).
--
-- ============================================================
-- DOWNLOADS DAILY (tier 2 — packages, daily granularity)
--
-- pg_partman MUST be enabled in OCI config before this migration runs:
-- Partitioned by month via pg_partman. pg_partman MUST be enabled in OCI
-- config before this migration runs:
-- OCI Console → Database → Configuration → Extensions → enable pg_partman
--
-- After enabling, run the setup below (once, outside Flyway or in a
Expand Down Expand Up @@ -676,14 +692,65 @@ CREATE TABLE package_maintainers (
-- ============================================================
CREATE TABLE downloads_daily (
id bigserial,
package_id bigint NOT NULL,
package_id bigint NOT NULL REFERENCES packages (id),
date date NOT NULL,
count bigint NOT NULL,
PRIMARY KEY (id, date),
UNIQUE (package_id, date)
)
PARTITION BY RANGE (date);

-- ============================================================
-- DOWNLOADS LAST 30D (tier 3 — packages_universe, rolling 30-day granularity)
--
-- Historical timeline of rolling 30-day download counts, keyed by purl.
-- Each row captures one window: downloads from start_date to end_date (inclusive).
-- Keyed by purl (not packages_universe.id) so rows survive the weekly
-- truncation of packages_universe. The latest window is also written
-- to packages_universe.downloads_last_30d column for fast access by the ranking function.
--
-- Writers should upsert: INSERT ... ON CONFLICT (purl, end_date) DO UPDATE SET count = EXCLUDED.count, start_date = EXCLUDED.start_date
-- PK includes end_date because Postgres requires the partition key to be
-- part of the primary key on range-partitioned tables.
--
-- Partitioned by month via pg_partman. pg_partman MUST be enabled in OCI
-- config before this migration runs:
-- OCI Console → Database → Configuration → Extensions → enable pg_partman
--
-- After enabling, run the setup below (once, outside Flyway or in a
-- separate migration) to register pg_partman and create initial partitions:
--
-- CREATE EXTENSION IF NOT EXISTS pg_partman SCHEMA partman;
--
-- SELECT partman.create_parent(
-- p_parent_table => 'public.downloads_last_30d',
-- p_control => 'end_date',
-- p_interval => '1 month',
-- p_premake => 3 -- pre-creates 3 future monthly partitions
-- );
--
-- -- pg_cron job to maintain partitions (also needs pg_cron enabled in OCI):
-- SELECT cron.schedule('partman-maintain-30d', '0 2 * * *',
-- $$CALL partman.run_maintenance_proc()$$);
--
-- Without this setup, inserts into downloads_last_30d will fail with
-- "no partition found for row". The table structure below is correct;
-- only the partition management setup is deferred.
--
-- ============================================================
CREATE TABLE downloads_last_30d (
id bigserial,
purl text NOT NULL,
start_date date NOT NULL,
end_date date NOT NULL,
count bigint NOT NULL,
PRIMARY KEY (id, end_date),
UNIQUE (purl, end_date)
)
PARTITION BY RANGE (end_date);

Comment thread
joanagmaia marked this conversation as resolved.
CREATE INDEX ON downloads_last_30d (purl, end_date DESC);
Comment thread
joanagmaia marked this conversation as resolved.

Comment thread
joanagmaia marked this conversation as resolved.
-- ============================================================
-- CRITICALITY RANKING FUNCTION
-- ============================================================
Expand All @@ -707,7 +774,7 @@ BEGIN
-- and to compress the gap between small and large values (e.g. LN(1)=0
-- vs LN(2)≈0.69 gives a gentler floor than LN(0)=-∞). Typically 1.0.
--
-- Until the npm-registry / Maven downloads enricher runs, downloads_30d
-- Until the npm-registry / Maven downloads enricher runs, downloads_last_30d
-- is NULL on every row. weight_downloads contributes 0 to the score;
-- ranking effectively reduces to:
-- LN(1 + dependent_repos_count) * weight_dependent_repos
Expand All @@ -717,7 +784,7 @@ BEGIN
WITH new_scores AS (
SELECT
id,
( LN(log_smoothing + COALESCE(downloads_30d, 0)) * weight_downloads
( LN(log_smoothing + COALESCE(downloads_last_30d, 0)) * weight_downloads
+ LN(log_smoothing + COALESCE(dependent_repos_count, 0)) * weight_dependent_repos
+ LN(log_smoothing + COALESCE(dependent_packages_count, 0)) * weight_dependent_packages
)::numeric(10, 4) AS new_score
Expand Down
Loading