In [None]:
import csv
import time
from pathlib import Path

import pandas as pd
from Bio import Entrez

# REQUIRED: set your email and optionally an NCBI API key.
# write_pubmed_to_csv refuses to run while the email is empty or still
# contains "example.com".
Entrez.email = "your_email@example.com"  # replace with your email
Entrez.api_key = ""  # optional but recommended (raises the allowed request rate)

# Query to target Cochrane Database of Systematic Reviews records that have an abstract
QUERY = '("Cochrane Database Syst Rev"[Journal]) AND hasabstract[text]'
# Output file, created in the directory the notebook is run from
OUT_CSV = Path.cwd() / "cochrane_pubmed_abstracts.csv"

# Tuning
BATCH_SIZE = 200        # efetch batch size (200 is safe)
# SLEEP is the pause, in seconds, after each efetch batch. 0.34 s is ~3 requests/sec,
# which is NCBI's documented limit WITHOUT an API key; with a key NCBI allows
# up to 10 requests/sec, so the pause can be lowered to ~0.1.
SLEEP = 0.34            # ~3 req/sec (safe without an API key)
MAX_RECORDS = 200       # set to None to pull everything; keep small for a test run

In [None]:
def esearch_all(query: str):
    """Run a PubMed ESearch for ``query`` and keep the result set on the NCBI history server.

    The search is issued with ``retmax=0`` and ``usehistory="y"``, so the
    response is used only for the total hit count and the
    ``WebEnv``/``QueryKey`` pair that later efetch calls use to page through
    the stored result set.

    Args:
        query: PubMed search expression.

    Returns:
        A ``(count, webenv, query_key)`` tuple, where ``count`` is the total
        number of matching records as an ``int``.
    """
    handle = Entrez.esearch(db="pubmed", term=query, usehistory="y", retmax=0)
    try:
        record = Entrez.read(handle)
    finally:
        # Always release the HTTP response, even if parsing the XML fails.
        handle.close()
    return int(record["Count"]), record["WebEnv"], record["QueryKey"]


def efetch_medline_batches(count: int, webenv: str, query_key: str, batch_size: int, max_records=None):
    """Yield MEDLINE-format text for a stored PubMed result set, one batch at a time.

    Args:
        count: Total number of records in the stored result set.
        webenv: ``WebEnv`` token returned by the earlier esearch call.
        query_key: ``QueryKey`` returned by the earlier esearch call.
        batch_size: Maximum number of records requested per efetch call;
            must be positive.
        max_records: Optional cap on the total number of records fetched.
            ``None`` means fetch all ``count`` records.

    Yields:
        The body of each efetch response (requested as ``rettype="medline"``,
        ``retmode="text"``), as returned by ``handle.read()``.

    Raises:
        ValueError: If ``batch_size`` is not positive (raised when iteration starts).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    limit = count if max_records is None else min(count, max_records)
    for start in range(0, limit, batch_size):
        handle = Entrez.efetch(
            db="pubmed",
            rettype="medline",
            retmode="text",
            webenv=webenv,
            query_key=query_key,
            retstart=start,
            # Shrink the final request so we never fetch past ``limit``.
            retmax=min(batch_size, limit - start),
        )
        try:
            text = handle.read()
        finally:
            # Release the HTTP response before handing control to the consumer.
            handle.close()
        yield text
        # Throttle between requests only; no need to wait after the last batch.
        if start + batch_size < limit:
            time.sleep(SLEEP)


def medline_to_rows(medline_text: str):
    """Parse MEDLINE-format text and yield one flat dict per record.

    Each yielded dict has the keys ``pmid``, ``title``, ``abstract``,
    ``journal``, ``year`` and ``authors``, read from the MEDLINE tags
    ``PMID``, ``TI``, ``AB``, ``JT``, ``DP`` and ``AU`` respectively.
    Missing tags become empty strings. ``year`` is the part of ``DP`` before
    its first space, and ``authors`` is the ``AU`` list joined with ``"; "``.
    """
    from io import StringIO
    from Bio import Medline

    parsed_records = Medline.parse(StringIO(medline_text))
    for entry in parsed_records:
        # Keep only the leading token of the publication date.
        year, _, _ = entry.get("DP", "").partition(" ")
        author_names = entry.get("AU", [])
        row = {
            "pmid": entry.get("PMID", ""),
            "title": entry.get("TI", ""),
            "abstract": entry.get("AB", ""),
            "journal": entry.get("JT", ""),
            "year": year,
            "authors": "; ".join(author_names),
        }
        yield row


def write_pubmed_to_csv(query: str, out_path: Path, batch_size: int, max_records=None):
    """Search PubMed for ``query`` and write the matching records to a CSV file.

    Args:
        query: PubMed search expression passed to ``esearch_all``.
        out_path: Destination CSV path; parent directories are created and an
            existing file is overwritten.
        batch_size: Number of records requested per efetch batch.
        max_records: Optional cap on the number of records fetched
            (``None`` fetches every hit).

    Raises:
        ValueError: If ``Entrez.email`` is unset or still contains "example.com".
    """
    if not Entrez.email or "example.com" in Entrez.email:
        raise ValueError("Set Entrez.email to your email before running.")

    count, webenv, query_key = esearch_all(query)
    if max_records is None:
        limit = count
    else:
        limit = min(count, max_records)
    print(f"Found {count} records; fetching {limit}...")

    destination = Path(out_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    columns = ["pmid", "title", "abstract", "journal", "year", "authors"]
    with destination.open("w", newline="", encoding="utf-8") as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        writer.writeheader()

        # Stream batch by batch so the full result set is never held in memory.
        batches = efetch_medline_batches(limit, webenv, query_key, batch_size, max_records=limit)
        for medline_chunk in batches:
            writer.writerows(medline_to_rows(medline_chunk))

    print(f"Saved to {destination.resolve()}")

In [None]:
# Run the download; set MAX_RECORDS=None to pull everything.
# This issues live NCBI requests and overwrites OUT_CSV if it already exists.
write_pubmed_to_csv(QUERY, OUT_CSV, BATCH_SIZE, max_records=MAX_RECORDS)

# Quick peek at the first few rows (pandas is imported with the other imports at the top)
pd.read_csv(OUT_CSV).head()