# In [None]:
#!/usr/bin/env python3
import os, time, json, gzip, datetime, requests, pandas as pd
from pathlib import Path
from ratelimit import limits, sleep_and_retry  # ratelimit sleeps automatically when the quota for the current second is full

# ──────────────────── user‑configurable parameters ────────────────────
# Scopus API key, read from the environment (export SCOPUS_API_KEY=...).
API_KEY = os.environ.get("SCOPUS_API_KEY")
# Upper bound on requests issued in one run (original note: 20_000 on Thu, etc.).
MAX_REQS_PER_RUN = 80_000
# Requests per output chunk: 2 000 req x 25 docs per request = 50 000 rows.
CHUNK_SIZE_REQS = 2_000
# Value sent as the "date" search parameter.
DATE_RANGE = "2000-2024"
# Directory receiving the gzip-compressed raw JSONL chunks.
OUT_RAW_DIR = Path("Data") / "raw"
# JSON file holding the resume cursor and the chunk counter between runs.
CURSOR_STATE_FILE = Path("cursor_state.json")
# ───────────────────────────────────────────────────────────────────────

BASE_URL = "https://api.elsevier.com/content/search/scopus"
HEADERS = {
    "X-ELS-APIKey": API_KEY,
    "Accept": "application/json",
}

# ---------- load previous cursor & counter if they exist ----------
# Without a state file this is a first run: start from the initial cursor "*"
# with no chunks written. Otherwise resume from the saved cursor and counter.
if not CURSOR_STATE_FILE.exists():
    cursor_value, chunk_counter = "*", 0
else:
    state = json.loads(CURSOR_STATE_FILE.read_text())
    cursor_value, chunk_counter = state["cursor"], state["chunk_counter"]

# ---------- helper: write one chunk ----------
def flush_chunk(recs, counter):
    """Write one chunk of records to a gzip-compressed JSON Lines file.

    The file is created in ``OUT_RAW_DIR`` and named
    ``scopus_raw_<counter, 6 digits>_<YYYYMMDD>.jsonl.gz``, where the date is
    the local date at the time of the call.

    Args:
        recs: Iterable with a length (e.g. a list) of JSON-serialisable
            records; each record becomes one line in the output file.
        counter: Integer chunk number used in the file name.
    """
    # gzip.open does not create missing parent directories, so make sure the
    # output directory exists before the first chunk is written.
    OUT_RAW_DIR.mkdir(parents=True, exist_ok=True)

    today = datetime.datetime.now().strftime("%Y%m%d")

    # zipped raw JSONL
    raw_path = OUT_RAW_DIR / f"scopus_raw_{counter:06d}_{today}.jsonl.gz"
    with gzip.open(raw_path, "wt", encoding="utf-8") as gz:
        # ensure_ascii=False keeps non-ASCII text readable in the UTF-8 file.
        gz.writelines(json.dumps(r, ensure_ascii=False) + "\n" for r in recs)
    print(f"✓ raw  {len(recs):,} docs → {raw_path.name}")

# ---------- requests: never exceed 9 requests per second ----------
@sleep_and_retry
@limits(calls=9, period=1)   # at most 9 calls within any 1-second period
def safe_get(params):
    """Send a rate-limited GET request to BASE_URL and return the response.

    Args:
        params: Mapping of query-string parameters for the request.

    Returns:
        The response object returned by ``session.get``.
    """
    response = session.get(
        BASE_URL,
        params=params,
        headers=HEADERS,
        timeout=30,
    )
    return response

# ---------- cover date of the last entry in a chunk ----------
def last_entry_date(entries):
    """Return the "prism:coverDate" value of the last entry.

    Args:
        entries: Sequence of dict-like entries.

    Returns:
        The value stored under "prism:coverDate" in the final entry, or
        ``None`` when ``entries`` is empty or the final entry lacks that key.
    """
    # Guard against an empty sequence, where entries[-1] would raise IndexError.
    if not entries:
        return None
    return entries[-1].get("prism:coverDate")

# ---------- main loop ----------
def _save_state(cursor, counter):
    """Write the resume cursor and chunk counter to CURSOR_STATE_FILE as JSON."""
    CURSOR_STATE_FILE.write_text(json.dumps({"cursor": cursor,
                                             "chunk_counter": counter}))


# Search parameters sent with every request; only "cursor" changes between pages.
query = {
    "query" : "DOCTYPE(ar)",
    "date"  : DATE_RANGE,
    "sort"  : "-coverDate",
    "count" : 25,
    "cursor": cursor_value,
    "view"  : "COMPLETE",
}

max_attempts  = 5          # network attempts per page before the run is aborted
chunk_records = []         # entries collected since the last flush
reqs_made     = 0          # requests that returned a response in this run
session       = requests.Session()   # used by safe_get
t0 = time.perf_counter()

try:
    while True:
        # simple retry/back-off: wait 2, 3, 4, ... seconds between attempts
        for attempt in range(max_attempts):
            try:
                resp = safe_get(query)
                break
            except requests.exceptions.RequestException as e:
                print(f"⚠ {e}  retry {attempt+1}/{max_attempts}")
                time.sleep(2+attempt)
        else:
            # for/else: reached only when no attempt succeeded
            print(f"🚫 network failed {max_attempts} times; aborting run.")
            break

        reqs_made += 1

        if resp.status_code != 200:
            print(f"🚫 HTTP {resp.status_code} – {resp.text[:200]}")
            break

        data    = resp.json()
        results = data.get("search-results", {})
        entries = results.get("entry", [])
        if not entries:
            print("• no more entries – completed dataset.")
            break

        chunk_records.extend(entries)

        next_cur = results.get("cursor", {}).get("@next")
        if not next_cur:
            print("• reached end – no next cursor.")
            break
        query["cursor"] = next_cur

        # ---- flush full chunk ----
        if reqs_made % CHUNK_SIZE_REQS == 0:
            # Bump the counter only after the write succeeded, then persist
            # the state so a later crash resumes right after this chunk.
            flush_chunk(chunk_records, chunk_counter + 1)
            chunk_counter += 1
            chunk_records = []  # clear the in-memory list for the next chunk
            _save_state(query["cursor"], chunk_counter)

        # ---- stop for this run if quota reached ----
        if reqs_made >= MAX_REQS_PER_RUN:
            print(f"• hit per‑run limit {MAX_REQS_PER_RUN}; pausing until next run.")
            print(f"Cover date of last entry: {last_entry_date(entries)}")
            break
finally:
    # Runs on normal exit and on unexpected errors / Ctrl-C alike, so the
    # collected records and the resume point are not lost.
    try:
        # ---------- flush leftovers ----------
        if chunk_records:
            flush_chunk(chunk_records, chunk_counter + 1)
            chunk_counter += 1
            chunk_records = []
    finally:
        # ---------- persist cursor & counter for next run ----------
        _save_state(query["cursor"], chunk_counter)
        session.close()

print(f"Done. Requests made in this run: {reqs_made:,}")
print(f"Duration: {time.perf_counter() - t0}")