Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions scripts/check_oc_pqg_drift.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""Check whether Eric Kansa's OC PQG files on GCS have drifted from our R2 mirror.

Reads our latest.json + the per-file manifests from data.isamples.org/oc_pqg/,
HEADs the GCS source, and reports whether upstream has a newer version.

Exit codes:
0 — in sync, no drift
1 — drift detected (GCS has a different etag from what we've mirrored)
2 — probe failure (network error, malformed response, etc.)

Run manually for now:
python scripts/check_oc_pqg_drift.py

Later: wire to GitHub Actions cron.
"""
import json
import sys
import urllib.request

# JSON pointer on our mirror; main() looks up one entry per flavor in it.
LATEST_URL = "https://data.isamples.org/oc_pqg/latest.json"

# Upstream bucket prefix; a GCS_FILES value is appended to form the HEAD URL.
GCS_BASE = "https://storage.googleapis.com/opencontext-parquet/"

# Flavor key (the same key main() reads from latest.json) -> upstream object
# name under GCS_BASE.
GCS_FILES = dict(
    narrow="oc_isamples_pqg.parquet",
    wide="oc_isamples_pqg_wide.parquet",
)


def fetch_json(url, timeout=20):
    """Fetch *url* and return its body parsed as JSON.

    The request identifies itself with this script's User-Agent. ``timeout``
    is handed to ``urlopen`` unchanged. Nothing is caught here: errors raised
    by ``urlopen`` or by the JSON decoder propagate to the caller.
    """
    request = urllib.request.Request(url)
    request.add_header("User-Agent", "isamples-oc-drift-check/1.0")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        payload = response.read()
    return json.loads(payload)


def head(url, timeout=20):
    """Issue a HEAD request for *url* and return the response headers.

    The headers are copied into a plain ``dict``, so lookups on the result
    are ordinary case-sensitive dict lookups. ``timeout`` is handed to
    ``urlopen`` unchanged; errors raised by ``urlopen`` propagate.
    """
    request = urllib.request.Request(url, method="HEAD")
    request.add_header("User-Agent", "isamples-oc-drift-check/1.0")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        response_headers = response.headers
        return dict(response_headers)


def _normalize_etag(value):
    """Return *value* as a bare ETag: no whitespace, ``W/`` prefix or quotes.

    Anything that is not a string yields ``""`` so callers can treat it as
    "no etag available".
    """
    if not isinstance(value, str):
        return ""
    tag = value.strip()
    if tag.startswith("W/"):
        tag = tag[2:]
    return tag.strip('"')


def main() -> int:
    """Compare the mirrored etag of every flavor with the live GCS etag.

    For each entry in ``GCS_FILES`` this fetches the manifest that
    ``latest.json`` points at, HEADs the matching object under ``GCS_BASE``,
    and prints a short report of both sides.

    Returns:
        0 if every flavor's etag matches, 1 if at least one differs, and
        2 if anything needed for the comparison could not be obtained
        (fetch/HEAD failure, a response that is not a JSON object, or an
        etag missing on either side). A probe failure returns immediately
        without checking the remaining flavors.
    """
    try:
        latest = fetch_json(LATEST_URL)
    except Exception as e:
        print(f"ERROR: could not fetch {LATEST_URL}: {e}", file=sys.stderr)
        return 2
    # A non-object body would otherwise raise on .get() below and exit with
    # Python's default status 1, which this script reserves for "drift".
    if not isinstance(latest, dict):
        print(f"ERROR: {LATEST_URL} is not a JSON object", file=sys.stderr)
        return 2

    drift_any = False
    for flavor, gcs_name in GCS_FILES.items():
        flavor_ptr = latest.get(flavor)
        if not flavor_ptr:
            print(f"ERROR: latest.json has no entry for {flavor!r}", file=sys.stderr)
            return 2

        # The subscript stays inside the try so a malformed pointer entry is
        # reported as a probe failure rather than raised.
        try:
            manifest = fetch_json(f"https://data.isamples.org/{flavor_ptr['manifest']}")
        except Exception as e:
            print(f"ERROR: could not fetch manifest for {flavor}: {e}", file=sys.stderr)
            return 2
        if not isinstance(manifest, dict):
            print(f"ERROR: manifest for {flavor} is not a JSON object", file=sys.stderr)
            return 2

        try:
            gcs_headers = head(f"{GCS_BASE}{gcs_name}")
        except Exception as e:
            print(f"ERROR: HEAD {GCS_BASE}{gcs_name}: {e}", file=sys.stderr)
            return 2

        # HTTP field names are case-insensitive, but head() returns a plain
        # dict; fold the names so "etag" and "ETag" are found alike.
        headers_ci = {str(name).lower(): value for name, value in gcs_headers.items()}
        gcs_etag = _normalize_etag(headers_ci.get("etag", ""))
        gcs_last_modified = headers_ci.get("last-modified", "")
        our_etag = _normalize_etag(manifest.get("source_etag", ""))
        our_updated = manifest.get("source_updated", "")

        # Two missing etags would compare equal and be reported as "in sync";
        # without both values there is nothing to compare.
        if not gcs_etag or not our_etag:
            missing = "GCS response" if not gcs_etag else "mirrored manifest"
            print(f"ERROR: no etag in {missing} for {flavor}; cannot compare",
                  file=sys.stderr)
            return 2

        in_sync = gcs_etag == our_etag
        state = "in sync" if in_sync else "DRIFT"
        print(f"[{flavor}] {state}")
        print(f" mirrored: etag={our_etag} updated={our_updated}")
        print(f" gcs: etag={gcs_etag} last-modified={gcs_last_modified}")
        if not in_sync:
            drift_any = True

    return 1 if drift_any else 0


if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the process exit status.
    raise SystemExit(main())
12 changes: 10 additions & 2 deletions workers/data-isamples-org/src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,15 @@
* working.
*/

const IMMUTABLE_PATTERN = /^isamples_\d{6}_.*\.parquet$/;
// Immutable-by-filename patterns. Match files whose path fully determines
// their contents (filename includes a version / date stamp).
// - isamples_YYYYMM_*.parquet (monthly iSamples snapshots)
// - oc_pqg/oc_isamples_pqg*_YYYYMMDD.parquet (mirror of Eric Kansa's
// OpenContext PQG files — versioned by the upstream GCS updated-date)
// Tested against the object key when choosing Cache-Control: a key matching
// any entry is served with the long-lived `immutable` policy.
const IMMUTABLE_PATTERNS = [
// `isamples_` + six digits + `_` + anything + `.parquet`, anchored to the whole key.
/^isamples_\d{6}_.*\.parquet$/,
// `oc_pqg/oc_isamples_pqg` + anything + `_` + eight digits + `.parquet`, anchored likewise.
/^oc_pqg\/oc_isamples_pqg.*_\d{8}\.parquet$/,
];
// max-age in seconds, sent together with `immutable` for keys that match a pattern.
const IMMUTABLE_MAX_AGE = 60 * 60 * 24 * 365; // 1 year
// max-age in seconds for every key that matches no pattern.
const FALLBACK_MAX_AGE = 300; // 5 minutes

Expand Down Expand Up @@ -72,7 +80,7 @@ export default {
for (const [k, v] of Object.entries(CORS_HEADERS)) headers.set(k, v);

// Cache-Control: this is the optimization.
if (IMMUTABLE_PATTERN.test(key)) {
if (IMMUTABLE_PATTERNS.some(p => p.test(key))) {
headers.set('Cache-Control', `public, max-age=${IMMUTABLE_MAX_AGE}, immutable`);
} else {
headers.set('Cache-Control', `public, max-age=${FALLBACK_MAX_AGE}`);
Expand Down
Loading