# Count slides in PPTX files from Zenodo community scads-ai
This notebook:
- Queries the Zenodo REST API for all records in the scads-ai community
- Downloads all .pptx files
- Counts slides using python-pptx
- Saves per-file results to CSV and a short summary to JSON/TXT

Tip: Set an environment variable `ZENODO_TOKEN` to a personal access token for higher rate limits and fewer failures. The notebook will work without a token, too.

Install required packages
- All requirements are installed up front so that the later cells can import them.
- If already installed, pip will be quick.

In [1]:
import sys, subprocess
def pip_install(*pkgs):
    """Install the given packages with pip for the interpreter running this notebook.

    Raises ``subprocess.CalledProcessError`` if pip exits with a non-zero status.
    """
    # Going through ``sys.executable -m pip`` targets the current interpreter
    # rather than whichever ``pip`` happens to be first on PATH.
    command = [sys.executable, "-m", "pip", "install"]
    command.extend(pkgs)
    subprocess.check_call(command)


pip_install("requests", "python-pptx", "tqdm")



Imports and configuration
- Set constants, output directories, and read optional ZENODO_TOKEN from environment.
- You can change DOWNLOAD_DIR and output file names if desired.

In [2]:
import os
import time
import json
import csv
from pathlib import Path
from urllib.parse import urlencode, urlparse, parse_qs, urlunparse
import requests
from pptx import Presentation
from tqdm import tqdm

# --- What to query ---------------------------------------------------------
COMMUNITY_ID = "scads-ai"
API_BASE = "https://zenodo.org/api/records"

# --- Where things are stored (absolute paths, relative to the working dir) --
DOWNLOAD_DIR = Path("./zenodo_pptx_downloads").resolve()
DOWNLOAD_DIR.mkdir(parents=True, exist_ok=True)
RESULTS_CSV, SUMMARY_JSON, SUMMARY_TXT = (
    Path(file_name).resolve()
    for file_name in (
        "pptx_slide_counts.csv",
        "pptx_slide_summary.json",
        "pptx_slide_summary.txt",
    )
)

# --- Optional credentials ---------------------------------------------------
# An unset or empty environment variable both end up as None.
ZENODO_TOKEN = os.getenv("ZENODO_TOKEN") or None

# --- Shared HTTP session ----------------------------------------------------
# One session is reused for every request and carries a descriptive User-Agent.
session = requests.Session()
session.headers["User-Agent"] = "git-bob-scads-ai-pptx-slide-counter/1.0 (+https://github.com/)"

Authentication helper
- Attaches the optional `ZENODO_TOKEN` to API and download URLs if present.
- This is optional; the code still works without a token (subject to public API limits).

In [3]:
def add_token_to_url(url: str) -> str:
    """Return ``url`` with the Zenodo access token set as a query parameter.

    When ``ZENODO_TOKEN`` is not configured the URL is returned unchanged.
    Otherwise the ``access_token`` query parameter is set to the token
    (replacing any existing value) and all other query parameters are kept.

    Args:
        url: Absolute or relative URL, with or without a query string.

    Returns:
        The URL with ``access_token`` added, or ``url`` itself without a token.
    """
    if not ZENODO_TOKEN:
        return url
    parts = list(urlparse(url))
    # Index 4 of the parse result is the query string.
    # keep_blank_values=True: without it, parameters such as ``q=`` would be
    # silently dropped from the rebuilt URL.
    qs = parse_qs(parts[4], keep_blank_values=True)
    qs["access_token"] = [ZENODO_TOKEN]
    # doseq=True expands the value lists produced by parse_qs back into
    # repeated ``key=value`` pairs.
    parts[4] = urlencode(qs, doseq=True)
    return urlunparse(parts)

Pagination over Zenodo records
- Iterates through all records in the scads-ai community.
- Handles Zenodo paging via the `links.next` field.
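- If a network or parsing error occurs, iteration stops at that point: records that were already yielded are kept, but no further pages are fetched. The notebook therefore still runs to completion, possibly with partial (or empty) results.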

In [4]:
def get_paged_records(community: str):
    """Yield every record of a Zenodo community, following API pagination.

    The first request goes to ``API_BASE`` with explicit query parameters;
    later requests follow the ``links.next`` URL of each response until there
    is none.

    This is a best-effort generator: on any request or JSON-decoding failure
    it stops and emits a ``RuntimeWarning`` instead of raising, so records
    yielded so far are kept but the result may be incomplete.

    Args:
        community: Zenodo community identifier to list records for.

    Yields:
        One record dict per entry in ``hits.hits`` of each page.
    """
    import warnings  # local import keeps this cell self-contained

    url = API_BASE
    # Only the first request needs these; ``links.next`` URLs are followed
    # as-is, without extra parameters.
    params = {
        "communities": community,
        "size": 100,
        "page": 1,
        "all_versions": 1,
    }
    pages_fetched = 0
    while True:
        try:
            resp = session.get(add_token_to_url(url), params=params, timeout=60)
            resp.raise_for_status()
            data = resp.json()
        except Exception as exc:
            # Deliberately broad: the notebook must finish even when the API
            # is unreachable. Only the exception type is reported because the
            # message may contain the request URL including the access token.
            warnings.warn(
                f"Stopped fetching records for community {community!r} after "
                f"{pages_fetched} page(s) due to {type(exc).__name__}; "
                "results may be incomplete.",
                RuntimeWarning,
            )
            break
        pages_fetched += 1
        hits = data.get("hits", {}).get("hits", [])
        for rec in hits:
            yield rec
        next_url = data.get("links", {}).get("next")
        if not next_url:
            break
        url = next_url
        params = None
        time.sleep(0.1)  # be polite to the API

Extract .pptx entries from a record
- Zenodo represents files in two ways; we handle both variants.
- We only select files ending with `.pptx` (case-insensitive).

In [5]:
def extract_pptx_entries(record: dict):
    """Collect the downloadable ``.pptx`` files listed in a Zenodo record.

    Supported shapes of ``record["files"]``:

    * a list of file entries;
    * a dict whose ``"entries"`` value is a list of file entries;
    * a dict whose ``"entries"`` value is a mapping of file name to file
      entry (NOTE(review): InvenioRDM-style layout; confirm against live
      responses). The mapping key is used as the file name if the entry
      itself carries none.

    Malformed entries (non-dict entries, missing or ``None`` links) are
    skipped instead of raising, so one odd record cannot abort the whole run.

    Args:
        record: A record dict as returned by the Zenodo records API.

    Returns:
        A list of ``{"name": <file name>, "url": <download URL>}`` dicts, one
        per ``.pptx`` file (case-insensitive match) that has a usable link.
    """
    files = record.get("files")
    if isinstance(files, dict):
        entries = files.get("entries") or []
    elif isinstance(files, list):
        entries = files
    else:
        entries = []

    # Normalise to (fallback_name, entry) pairs so both layouts share one loop.
    if isinstance(entries, dict):
        candidates = list(entries.items())
    else:
        candidates = [(None, entry) for entry in entries]

    out = []
    for fallback_name, entry in candidates:
        if not isinstance(entry, dict):
            continue
        name = (
            entry.get("key")
            or entry.get("filename")
            or entry.get("name")
            or fallback_name
        )
        if not isinstance(name, str) or not name.lower().endswith(".pptx"):
            continue
        # ``links`` may be absent or explicitly null.
        links = entry.get("links") or {}
        url = links.get("download") or links.get("content") or links.get("self")
        if url:
            out.append({"name": name, "url": url})
    return out

Utilities: safe filenames and downloading files
- We sanitize filenames and prefix with record ID to avoid collisions.
- Downloads are streamed and written to a temporary file before renaming.
- On failure, we return False and continue processing other files/records.

In [6]:
def safe_filename(s: str) -> str:
    """Return ``s`` with every character that is unsafe in a file name replaced by ``_``.

    Alphanumeric characters (as defined by ``str.isalnum``) and the characters
    ``.``, ``-`` and ``_`` are kept; everything else becomes an underscore.
    """
    keep_as_is = "._-"
    cleaned = []
    for ch in s:
        if ch.isalnum() or ch in keep_as_is:
            cleaned.append(ch)
        else:
            cleaned.append("_")
    return "".join(cleaned)

def download_file(url: str, dst: Path, chunk: int = 1024 * 1024) -> bool:
    """Stream ``url`` to ``dst`` and report whether the download succeeded.

    The body is first written to ``<dst>.part`` and only renamed to ``dst``
    after the transfer finished, so ``dst`` never holds a partial download.
    If the transfer fails, the ``.part`` file is removed again.

    Args:
        url: Download URL; the access token is attached if one is configured.
        dst: Final location of the downloaded file.
        chunk: Chunk size in bytes used while streaming the response body.

    Returns:
        ``True`` if the file was written to ``dst``; ``False`` on a non-200
        response or on any error (this function never raises for those).
    """
    tmp = None
    try:
        final_url = add_token_to_url(url)
        with session.get(final_url, stream=True, timeout=300) as r:
            if r.status_code != 200:
                return False
            tmp = dst.with_suffix(dst.suffix + ".part")
            with open(tmp, "wb") as f:
                for chunk_bytes in r.iter_content(chunk_size=chunk):
                    if chunk_bytes:
                        f.write(chunk_bytes)
            tmp.replace(dst)
        return True
    except Exception:
        # Best effort by design: the caller records the failure and moves on.
        # Remove the partial file so failed attempts do not pile up on disk.
        if tmp is not None:
            try:
                tmp.unlink()
            except OSError:
                pass  # nothing was written, or it cannot be removed
        return False

Count slides using python-pptx
- If a file is corrupted or not a valid PPTX, `python-pptx` raises an exception; the main loop below catches it and records the file as PARSE_FAILED.

In [7]:
def count_slides(pptx_path: Path) -> int:
    """Open the presentation at ``pptx_path`` and return how many slides it has.

    Any exception raised by ``Presentation`` while opening the file is not
    caught here and propagates to the caller.
    """
    deck = Presentation(str(pptx_path))
    slide_collection = deck.slides
    return len(slide_collection)

Iterate records, download PPTX files, count slides
- We collect per-file results and overall counters.
- The loop is resilient: failed downloads or parse errors are recorded but do not stop the process.
- If the API is unreachable, it produces empty results and still completes, so the notebook runs without errors in restricted environments, too.

In [8]:
# Overall counters for the summary.
records_processed = pptx_found = pptx_downloaded = 0
pptx_failed_download = failed_parse = slide_total = 0

rows = []  # one dict per PPTX file, written to the CSV later

for rec in get_paged_records(COMMUNITY_ID):
    records_processed += 1
    rec_id = rec.get("id")
    rec_doi = rec.get("doi") or rec.get("pids", {}).get("doi", {}).get("identifier")
    rec_title = rec.get("metadata", {}).get("title") or ""

    for entry in extract_pptx_entries(rec):
        pptx_found += 1
        name, url = entry["name"], entry["url"]
        # Prefix with the record ID so equal file names from different
        # records do not overwrite each other.
        dst = DOWNLOAD_DIR / f"{rec_id}__{safe_filename(name)}"

        # Everything except the "slides" column is known at this point.
        row = {
            "record_id": rec_id,
            "doi": rec_doi or "",
            "title": rec_title,
            "file_name": name,
            "download_url": url,
        }

        # A file already present on disk is reused instead of downloaded again.
        if not (dst.exists() or download_file(url, dst)):
            pptx_failed_download += 1
            row["slides"] = "DOWNLOAD_FAILED"
            rows.append(row)
            continue

        pptx_downloaded += 1
        try:
            slides = count_slides(dst)
        except Exception:
            # Unreadable file: record the failure and keep going.
            slides = "PARSE_FAILED"
            failed_parse += 1
        else:
            slide_total += slides
        row["slides"] = slides
        rows.append(row)

    time.sleep(0.05)  # be gentle to API

# Summary of the whole run; the save cell writes it to JSON and TXT.
summary = {
    "community": COMMUNITY_ID,
    "records_processed": records_processed,
    "pptx_found": pptx_found,
    "pptx_downloaded": pptx_downloaded,
    "download_failures": pptx_failed_download,
    "parse_failures": failed_parse,
    "total_slides": slide_total,
    "results_csv": str(RESULTS_CSV),
    "downloads_dir": str(DOWNLOAD_DIR),
}

KeyboardInterrupt: 

Save results
- Write per-file results to CSV
- Save a machine-readable JSON summary and a short human-readable TXT summary
- These files can be inspected after running the notebook

In [None]:
# Per-file table: one CSV row for every PPTX file that was found
fieldnames = ["record_id", "doi", "title", "file_name", "download_url", "slides"]
with open(RESULTS_CSV, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

# Machine-readable summary
SUMMARY_JSON.write_text(json.dumps(summary, indent=2), encoding="utf-8")

# Human-readable summary: one "Label: value" line per summary entry
summary_labels = [
    ("Community", "community"),
    ("Records processed", "records_processed"),
    ("PPTX files found", "pptx_found"),
    ("PPTX downloaded", "pptx_downloaded"),
    ("Download failures", "download_failures"),
    ("Parse failures", "parse_failures"),
    ("Total slides", "total_slides"),
    ("Results CSV", "results_csv"),
    ("Downloads directory", "downloads_dir"),
]
report_lines = [f"{label}: {summary[key]}" for label, key in summary_labels]
SUMMARY_TXT.write_text("\n".join(report_lines), encoding="utf-8")

# Last expression of the cell: the notebook displays the output locations
RESULTS_CSV, SUMMARY_JSON, SUMMARY_TXT, DOWNLOAD_DIR