# Count pages in all PDFs of a Zenodo record

This notebook downloads all PDF files from a given Zenodo record and counts the pages in each. It then saves a CSV with the per-file page counts and writes the total page count to separate summary files (`summary.txt` and `summary.json`).

Notes:
- If there is no internet connection or required libraries are missing, the notebook will gracefully skip steps and still write an informative CSV.
- No outputs are pre-executed in this notebook; run it top-to-bottom to produce results.

## Imports and optional dependency setup

We try to import lightweight libraries. If missing, we attempt to install them. If installation fails (e.g., offline), we keep going with limited functionality and write informative statuses to the results CSV.

In [1]:
import os
import sys
import json
import re
from pathlib import Path
from typing import List, Dict, Optional
import csv

def _try_import_or_pip(package_name: str, import_name: Optional[str] = None):
    """Import a module, installing it with pip first if the import fails.

    Args:
        package_name: Distribution name passed to ``pip install``.
        import_name: Module name to import when it differs from
            ``package_name``; defaults to ``package_name``.

    Returns:
        The imported module, or ``None`` when the module can neither be
        imported nor installed (for example when offline). Errors from the
        import or from pip are swallowed so callers can degrade gracefully.
    """
    import importlib

    name = import_name or package_name
    try:
        return importlib.import_module(name)
    except Exception:
        # Broad on purpose: a broken install can fail with more than
        # ImportError, and the caller only needs "available or not".
        pass

    try:
        import subprocess

        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--quiet", package_name]
        )
        # The import system caches directory listings; refresh them so a
        # package installed after interpreter start-up is actually found.
        importlib.invalidate_caches()
        return importlib.import_module(name)
    except Exception:
        return None

# Optional third-party dependencies; each name is None when unavailable.
requests = _try_import_or_pip("requests")

# PDF backend: try pypdf first, then fall back to the PyPDF2 distribution.
pypdf_mod = None
for _candidate in ("pypdf", "PyPDF2"):
    pypdf_mod = _try_import_or_pip(_candidate, import_name=_candidate)
    if pypdf_mod is not None:
        break

# Reader class exposed by whichever backend loaded (None if neither did;
# getattr on None simply yields the default).
PdfReader = getattr(pypdf_mod, "PdfReader", None)

# pandas is optional; the CSV is written with the stdlib csv module.
# NOTE(review): pd is not referenced in the cells visible here.
pd = _try_import_or_pip("pandas")

# Output locations, all under one per-record directory
BASE_DIR = Path("data", "zenodo_15858127")
BASE_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_CSV = BASE_DIR.joinpath("pdf_page_counts.csv")
SUMMARY_TXT = BASE_DIR.joinpath("summary.txt")

## Configure the Zenodo record

We will query the Zenodo API for record metadata and list its files. Record ID: 15858127.

In [2]:
# Numeric identifier of the Zenodo record whose PDFs are counted
ZENODO_RECORD_ID = 15858127
# Zenodo API endpoint for this record
ZENODO_API_URL = "https://zenodo.org/api/records/" + str(ZENODO_RECORD_ID)

# Anything other than ASCII letters, digits, dot, underscore and hyphen
_UNSAFE_FILENAME_CHARS = re.compile(r"[^A-Za-z0-9._-]")


def safe_filename(name: str) -> str:
    """Return *name* with each path separator or other unsafe character replaced by ``_``."""
    return _UNSAFE_FILENAME_CHARS.sub("_", name)

def get_zenodo_files(record_id: int) -> List[Dict]:
    """Fetch the file entries listed in a Zenodo record's metadata.

    Args:
        record_id: Identifier of the Zenodo record to query.

    Returns:
        The value of the ``files`` field of the record JSON. An empty list
        is returned when ``requests`` is unavailable, the HTTP status is
        not 200, the field is missing or empty, or any exception occurs.
    """
    if requests is None:
        return []
    try:
        response = requests.get(
            f"https://zenodo.org/api/records/{record_id}", timeout=30
        )
        if response.status_code != 200:
            return []
        payload = response.json()
        listed = payload.get("files", [])
        # Normalise a null/empty field to a fresh empty list
        return listed if listed else []
    except Exception:
        return []

# Fetch the record's file listing and keep a raw copy on disk (best effort:
# a failed write must not stop the notebook).
zenodo_files = get_zenodo_files(ZENODO_RECORD_ID)
zenodo_files_meta_path = BASE_DIR / "zenodo_files_metadata.json"
try:
    with zenodo_files_meta_path.open("w", encoding="utf-8") as meta_fh:
        json.dump(zenodo_files, meta_fh, indent=2)
except Exception:
    pass

# Condensed per-file view (key, size, checksum, download link) for inspection
preview_records = []
for file_meta in zenodo_files:
    file_links = file_meta.get("links") or {}
    preview_row = {
        "key": file_meta.get("key"),
        "size": file_meta.get("size"),
        "checksum": file_meta.get("checksum"),
        "download_link": file_links.get("download"),
    }
    preview_records.append(preview_row)

preview_json_path = BASE_DIR / "zenodo_files_preview.json"
try:
    with preview_json_path.open("w", encoding="utf-8") as preview_fh:
        json.dump(preview_records, preview_fh, indent=2)
except Exception:
    pass

## Filter to PDF files

We keep only the entries whose Zenodo file key ends in `.pdf` (case-insensitive); all other files are skipped.

In [3]:
# Percent-encoding helper for the fallback download URL below
from urllib.parse import quote

# Collect every record file whose key ends in ".pdf" (case-insensitive)
pdf_entries = []
for f in zenodo_files:
    key = f.get("key") or ""
    if not (isinstance(key, str) and key.lower().endswith(".pdf")):
        continue
    url = (f.get("links") or {}).get("download")
    if not url:
        # Fallback URL pattern. The key is percent-encoded so characters
        # such as "#", "?" or "%" in a filename are not misread as URL
        # syntax; ordinary names are left unchanged by quote().
        url = f"https://zenodo.org/records/{ZENODO_RECORD_ID}/files/{quote(key)}?download=1"
    pdf_entries.append({
        "filename": key,
        "url": url,
        "size": f.get("size"),
    })

# Persist the filtered list for inspection (best effort)
pdf_list_json_path = BASE_DIR / "pdf_file_list.json"
try:
    with open(pdf_list_json_path, "w", encoding="utf-8") as f:
        json.dump(pdf_entries, f, indent=2)
except Exception:
    pass

## Download PDFs (if possible)

Each PDF is downloaded to a local folder. If download is not possible, we record the status and move on. Already existing files are reused (no re-download).

In [4]:
def download_file(url: str, dst: Path, chunk_size: int = 1 << 20) -> bool:
    """Stream *url* to *dst*, leaving *dst* untouched unless the download completes.

    The body is written to a sibling ``<name>.part`` file and moved into
    place only after the whole response has been read. An interrupted
    transfer therefore never leaves a truncated file at *dst* that a later
    run would mistake for an already-downloaded PDF.

    Args:
        url: Address to fetch.
        dst: Final location of the downloaded file.
        chunk_size: Number of bytes requested per streamed chunk.

    Returns:
        ``True`` if the file was fully downloaded and moved to *dst*;
        ``False`` if ``requests`` is unavailable, the HTTP status is not
        200, or any error occurs.
    """
    if requests is None:
        return False
    dst = Path(dst)
    partial = dst.with_name(dst.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=60) as r:
            if r.status_code != 200:
                return False
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        f.write(chunk)
        # Publish the finished file under its final name in a single step
        os.replace(partial, dst)
        return True
    except Exception:
        return False
    finally:
        # Remove a leftover partial file; after a successful move (or when
        # nothing was written) it no longer exists and the error is ignored.
        try:
            partial.unlink()
        except OSError:
            pass

# Fetch each PDF once; a non-empty local copy from an earlier run is reused.
download_results = []
for pdf in pdf_entries:
    raw_name = pdf.get("filename")
    source_url = pdf.get("url")
    clean_name = safe_filename(pdf["filename"]) if raw_name else None

    # Without both a name and a URL there is nothing to download
    if not (clean_name and source_url):
        download_results.append({
            "filename": clean_name or "",
            "status": "skip_missing_info",
            "local_path": "",
            "url": source_url or "",
        })
        continue

    target = BASE_DIR / clean_name
    if target.exists() and target.stat().st_size > 0:
        outcome = "exists"
    elif download_file(source_url, target):
        outcome = "downloaded"
    else:
        outcome = "download_failed"

    download_results.append({
        "filename": clean_name,
        "status": outcome,
        "local_path": str(target),
        "url": source_url,
    })

# Keep a record of what happened to each file (best effort)
download_results_json = BASE_DIR / "download_results.json"
try:
    with download_results_json.open("w", encoding="utf-8") as log_fh:
        json.dump(download_results, log_fh, indent=2)
except Exception:
    pass

## Count pages in each PDF

We use pypdf or PyPDF2 to count pages. For encrypted PDFs we first try to decrypt with an empty password; if the pages still cannot be read, the file is recorded with status `page_count_failed` and excluded from the total. Per-file results are saved as CSV, and the sum of pages across all successfully read PDFs is written to `summary.txt` and `summary.json`.

In [5]:
# One row per entry in download_results, plus a running sum of pages
results: List[Dict] = []
total_pages = 0

for record in download_results:
    filename = record.get("filename") or ""
    local_path = record.get("local_path") or ""
    url = record.get("url") or ""
    status = record.get("status") or ""
    pages = None
    note = ""

    file_available = status in ("downloaded", "exists")
    if not file_available:
        # Nothing to parse; carry the download status over as the note
        note = status
    elif PdfReader is None:
        status = "pdf_lib_missing"
        note = "pypdf/PyPDF2 not available"
    elif local_path:
        try:
            reader = PdfReader(local_path)
            if getattr(reader, "is_encrypted", False):
                # Try an empty password; if that fails, reading the pages
                # below raises and the file is reported as failed.
                try:
                    reader.decrypt("")
                except Exception:
                    pass
            pages = len(reader.pages)
        except Exception as exc:
            note = f"error: {type(exc).__name__}"
            status = "page_count_failed"
        else:
            status = "ok"
            total_pages += int(pages)

    results.append({
        "filename": filename,
        "pages": "" if pages is None else pages,
        "status": status,
        "note": note,
        "url": url,
        "local_path": local_path,
    })

# Per-file table, written with the stdlib csv module (pandas not required)
fieldnames = ["filename", "pages", "status", "note", "url", "local_path"]
with open(RESULTS_CSV, "w", newline="", encoding="utf-8") as csv_fh:
    writer = csv.DictWriter(csv_fh, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results)

# One-line text summary with the total (best effort)
try:
    SUMMARY_TXT.write_text(
        f"Total pages (sum over successfully read PDFs): {total_pages}\n",
        encoding="utf-8",
    )
except Exception:
    pass

# Total plus per-file rows as JSON (best effort)
summary_json = {
    "record_id": ZENODO_RECORD_ID,
    "total_pages": total_pages,
    "files": results,
}
summary_json_path = BASE_DIR / "summary.json"
try:
    with summary_json_path.open("w", encoding="utf-8") as summary_fh:
        json.dump(summary_json, summary_fh, indent=2)
except Exception:
    pass

## Where to find the results

- Per-file counts: data/zenodo_15858127/pdf_page_counts.csv
- Total and details: data/zenodo_15858127/summary.json and summary.txt

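Run the notebook to populate these files. Download and parsing failures are recorded in the `status` and `note` columns rather than raised as errors; if the record metadata itself cannot be fetched (for example, when offline), the outputs are still written, but with an empty file list.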