diff --git a/scripts/check_oc_pqg_drift.py b/scripts/check_oc_pqg_drift.py new file mode 100755 index 0000000..627fdb8 --- /dev/null +++ b/scripts/check_oc_pqg_drift.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +"""Check whether Eric Kansa's OC PQG files on GCS have drifted from our R2 mirror. + +Reads our latest.json + the per-file manifests from data.isamples.org/oc_pqg/, +HEADs the GCS source, and reports whether upstream has a newer version. + +Exit codes: + 0 — in sync, no drift + 1 — drift detected (GCS has a different etag from what we've mirrored) + 2 — probe failure (network error, malformed response, etc.) + +Run manually for now: + python scripts/check_oc_pqg_drift.py + +Later: wire to GitHub Actions cron. +""" +import json +import sys +import urllib.request + +LATEST_URL = "https://data.isamples.org/oc_pqg/latest.json" +GCS_BASE = "https://storage.googleapis.com/opencontext-parquet/" +GCS_FILES = { + "narrow": "oc_isamples_pqg.parquet", + "wide": "oc_isamples_pqg_wide.parquet", +} + + +def fetch_json(url, timeout=20): + req = urllib.request.Request(url, headers={"User-Agent": "isamples-oc-drift-check/1.0"}) + with urllib.request.urlopen(req, timeout=timeout) as r: + return json.loads(r.read()) + + +def head(url, timeout=20): + req = urllib.request.Request(url, method="HEAD", + headers={"User-Agent": "isamples-oc-drift-check/1.0"}) + with urllib.request.urlopen(req, timeout=timeout) as r: + return dict(r.headers) + + +def main() -> int: + try: + latest = fetch_json(LATEST_URL) + except Exception as e: + print(f"ERROR: could not fetch {LATEST_URL}: {e}", file=sys.stderr) + return 2 + + drift_any = False + for flavor, gcs_name in GCS_FILES.items(): + flavor_ptr = latest.get(flavor) + if not flavor_ptr: + print(f"ERROR: latest.json has no entry for {flavor!r}", file=sys.stderr) + return 2 + + try: + manifest = fetch_json(f"https://data.isamples.org/{flavor_ptr['manifest']}") + except Exception as e: + print(f"ERROR: could not fetch manifest for {flavor}: {e}", file=sys.stderr) + return 2 + + try: + gcs_headers = head(f"{GCS_BASE}{gcs_name}") + except Exception as e: + print(f"ERROR: HEAD {GCS_BASE}{gcs_name}: {e}", file=sys.stderr) + return 2 + + gcs_etag = gcs_headers.get("ETag", "").strip('"') + gcs_last_modified = gcs_headers.get("Last-Modified", "") + our_etag = manifest.get("source_etag", "") + our_updated = manifest.get("source_updated", "") + + in_sync = gcs_etag == our_etag + state = "in sync" if in_sync else "DRIFT" + print(f"[{flavor}] {state}") + print(f" mirrored: etag={our_etag} updated={our_updated}") + print(f" gcs: etag={gcs_etag} last-modified={gcs_last_modified}") + if not in_sync: + drift_any = True + + return 1 if drift_any else 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/workers/data-isamples-org/src/index.js b/workers/data-isamples-org/src/index.js index e1f5088..82e4e0d 100644 --- a/workers/data-isamples-org/src/index.js +++ b/workers/data-isamples-org/src/index.js @@ -18,7 +18,15 @@ * working. */ -const IMMUTABLE_PATTERN = /^isamples_\d{6}_.*\.parquet$/; +// Immutable-by-filename patterns. Match files whose path fully determines +// their contents (filename includes a version / date stamp). +// - isamples_YYYYMM_*.parquet (monthly iSamples snapshots) +// - oc_pqg/oc_isamples_pqg*_YYYYMMDD.parquet (mirror of Eric Kansa's +// OpenContext PQG files — versioned by the upstream GCS updated-date) +const IMMUTABLE_PATTERNS = [ + /^isamples_\d{6}_.*\.parquet$/, + /^oc_pqg\/oc_isamples_pqg.*_\d{8}\.parquet$/, +]; const IMMUTABLE_MAX_AGE = 60 * 60 * 24 * 365; // 1 year const FALLBACK_MAX_AGE = 300; // 5 minutes @@ -72,7 +80,7 @@ export default { for (const [k, v] of Object.entries(CORS_HEADERS)) headers.set(k, v); // Cache-Control: this is the optimization. - if (IMMUTABLE_PATTERN.test(key)) { + if (IMMUTABLE_PATTERNS.some(p => p.test(key))) { headers.set('Cache-Control', `public, max-age=${IMMUTABLE_MAX_AGE}, immutable`); } else { headers.set('Cache-Control', `public, max-age=${FALLBACK_MAX_AGE}`);