# Summarize downloads of the Zenodo community: nfdi4bioimage

This notebook fetches all records from the Zenodo community `nfdi4bioimage`, extracts each record URL and its download count, and saves the result to a CSV file.

## Setup: imports and configuration

We use only standard-library networking (`urllib`) to avoid extra HTTP dependencies; `pandas` is the only third-party package, used to build and save the table.

In [1]:
import json
import time
from typing import Any, Dict, List, Optional
from urllib.parse import urlencode
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError
import pandas as pd

# Search endpoint that the paginated record queries are built on.
BASE_URL = "https://zenodo.org/api/records"
# Community identifier used by both query strategies in fetch_all_records().
COMMUNITY = "nfdi4bioimage"
# Sent as the `size` query parameter (records requested per page).
# NOTE(review): the server may enforce a lower maximum page size - confirm.
PAGE_SIZE = 200
# Passed as `timeout` to urlopen() for every request (seconds).
REQUEST_TIMEOUT = 30
# Pause passed to time.sleep() between consecutive page requests (seconds).
SLEEP_BETWEEN_PAGES = 0.2
# HTTP headers attached to every request made by get_json().
HEADERS = {"User-Agent": "git-bob (github-actions bot)", "Accept": "application/json"}
# File that receives the unmodified list of fetched records.
RAW_JSON_PATH = "zenodo_nfdi4bioimage_records.json"
# File that receives the per-record download summary.
CSV_PATH = "zenodo_nfdi4bioimage_downloads.csv"

## Helper: small JSON GET utility

Fetch a JSON response from a URL with headers and timeout. Returns `None` on error to keep the workflow robust.

In [2]:
def get_json(url: str) -> Optional[Dict[str, Any]]:
    """Fetch *url* and return the decoded JSON object, or ``None`` on failure.

    The request is sent with ``HEADERS`` and aborted after
    ``REQUEST_TIMEOUT`` seconds. The body is decoded as UTF-8, with
    undecodable bytes replaced rather than raising.

    Args:
        url: Absolute URL to request.

    Returns:
        The parsed JSON object as a dict, or ``None`` when the request
        fails, the response is truncated or malformed, the body is not
        valid JSON, or the top-level JSON value is not an object.
    """
    # Function-scope import keeps this cell independent of the import cell.
    from http.client import HTTPException

    try:
        req = Request(url, headers=HEADERS)
        with urlopen(req, timeout=REQUEST_TIMEOUT) as resp:
            data = resp.read().decode("utf-8", errors="replace")
        payload = json.loads(data)
    except (OSError, HTTPException, ValueError):
        # OSError: URLError/HTTPError, socket timeouts, connection resets.
        # HTTPException: truncated or malformed HTTP responses, which
        #   urllib does not wrap in URLError.
        # ValueError: json.JSONDecodeError and malformed URLs.
        return None
    # Callers use dict methods on the result, so reject lists and scalars.
    return payload if isinstance(payload, dict) else None

## Fetch all records via pagination

We first try filtering by `communities` and follow `links.next`. If that returns no records, we fall back to a query-string search `q=communities:"nfdi4bioimage"` to be safe.

In [3]:
def fetch_all_records() -> List[Dict[str, Any]]:
    """Collect every record of ``COMMUNITY`` from the records search API.

    Two query strategies are tried in order:

    1. the ``communities`` filter parameter;
    2. only if the first yields nothing, a ``q=communities:"<name>"`` search.

    Each strategy starts at page 1 and follows ``links.next`` from every
    response until the results run out.

    Returns:
        The raw record dicts in the order the API returned them; an empty
        list if both strategies produce no records.
    """
    max_pages = 200  # hard stop so a misbehaving `next` chain cannot loop forever

    def build_url(selector: Dict[str, Any]) -> str:
        """Return the first-page URL for *selector* plus the shared paging options."""
        params = {
            **selector,
            "size": PAGE_SIZE,
            "page": 1,
            "all_versions": 1,
            "sort": "mostrecent",
        }
        return f"{BASE_URL}?{urlencode(params)}"

    def paginate(url: Optional[str]) -> List[Dict[str, Any]]:
        """Follow ``links.next`` from *url*, accumulating every page's hits."""
        out: List[Dict[str, Any]] = []
        visited = set()
        while url and url not in visited and len(visited) < max_pages:
            visited.add(url)
            data = get_json(url)
            if not data or not isinstance(data, dict):
                if out:
                    # A failure after the first page would otherwise
                    # silently produce a truncated result.
                    import warnings

                    warnings.warn(
                        f"Pagination stopped early at {url}; "
                        f"{len(out)} records collected so far."
                    )
                break
            hits = (data.get("hits") or {}).get("hits") or []
            if not hits:
                # An empty page means the result set is exhausted.
                break
            out.extend(hits)
            url = (data.get("links") or {}).get("next")
            if url:
                time.sleep(SLEEP_BETWEEN_PAGES)
        return out

    # Strategy 1: communities filter
    records = paginate(build_url({"communities": COMMUNITY}))

    # Strategy 2: query fallback if needed
    if not records:
        records = paginate(build_url({"q": f'communities:"{COMMUNITY}"'}))

    return records

# Download every community record, then keep an unmodified snapshot on disk
# so the summary can be re-derived without querying the API again.
records = fetch_all_records()
raw_dump = json.dumps(records, ensure_ascii=False, indent=2)
with open(RAW_JSON_PATH, mode="w", encoding="utf-8") as raw_file:
    raw_file.write(raw_dump)

## Transform: extract Zenodo URL and downloads

We extract the record page URL and total downloads (from `stats.downloads`). Missing or non-numeric download counts are treated as 0. We keep a few extra columns for context (`id`, `doi`, `title`), drop rows without a URL, remove duplicate URLs, and sort by downloads (descending).

In [4]:
def rec_to_row(r: Dict[str, Any]) -> Dict[str, Any]:
    """Flatten one raw record dict into a row for the summary table.

    Args:
        r: A record dict as returned by the records search API.

    Returns:
        A dict with the keys ``zenodo_url``, ``downloads``, ``id``, ``doi``
        and ``title``. ``zenodo_url`` is ``links.html`` when present,
        otherwise a URL built from the record id, otherwise ``None``.
        ``downloads`` is ``stats.downloads`` as an int, or 0 when it is
        missing or cannot be converted. ``doi`` falls back to
        ``conceptdoi``.
    """

    def section(key: str) -> Dict[str, Any]:
        # Treat a missing or malformed (non-dict) sub-object as empty so a
        # single odd record cannot abort the whole transformation.
        value = r.get(key)
        return value if isinstance(value, dict) else {}

    rec_id = r.get("id")
    url = section("links").get("html") or (
        f"https://zenodo.org/records/{rec_id}" if rec_id else None
    )

    raw_downloads = section("stats").get("downloads")
    try:
        downloads = int(raw_downloads) if raw_downloads is not None else 0
    except (TypeError, ValueError, OverflowError):
        # Wrong type, non-numeric string or NaN, and infinity respectively.
        downloads = 0

    return {
        "zenodo_url": url,
        "downloads": downloads,
        "id": rec_id,
        "doi": r.get("doi") or r.get("conceptdoi"),
        "title": section("metadata").get("title"),
    }

# One summary row per fetched record.
rows = list(map(rec_to_row, records))
df = pd.DataFrame(rows)
if not df.empty:
    # Discard rows without a URL, keep the first row per URL, then rank by
    # download count with a fresh 0..n-1 index.
    df = df.dropna(subset=["zenodo_url"])
    df = df.drop_duplicates(subset=["zenodo_url"])
    df = df.sort_values("downloads", ascending=False)
    df = df.reset_index(drop=True)

## Save the summary as CSV

The CSV contains at least the Zenodo URL and the download count per record.

In [5]:
# Column order of the exported CSV.
cols = ["zenodo_url", "downloads", "id", "doi", "title"]
if df.empty:
    # No rows: still write a CSV that carries the expected header.
    summary = pd.DataFrame(columns=cols)
else:
    summary = df[cols]
summary.to_csv(CSV_PATH, index=False)