# Fetch Abstracts for Referenced Papers

**Objective:** Download abstracts for papers cited in Cochrane reviews. These represent "included" papers that passed screening.

We have ~491K unique papers with PMIDs to fetch. This will take several hours due to PubMed rate limits.

In [1]:
import os
import csv
import time
from pathlib import Path
from dotenv import load_dotenv
from Bio import Entrez, Medline
from io import StringIO
from urllib.error import HTTPError
import pandas as pd

# Load credentials
notebook_dir = Path.cwd()
project_root = notebook_dir if (notebook_dir / ".env").exists() else notebook_dir.parent
env_path = project_root / ".env"
load_dotenv(env_path, override=True)

Entrez.email = os.getenv("NCBI_EMAIL", "")
Entrez.api_key = os.getenv("NCBI_API_KEY", "")

print(f"NCBI_EMAIL present: {'yes' if Entrez.email else 'no'}")
print(f"API key present: {'yes' if Entrez.api_key else 'no'}")

# Paths
DATA_DIR = project_root / "Data"
REFERENCES_CSV = DATA_DIR / "cochrane_pubmed_references.csv"
OUTPUT_CSV = DATA_DIR / "referenced_paper_abstracts.csv"

# Rate limiting
BATCH_SIZE = 200  # PubMed allows up to 200 IDs per request
SLEEP = 0.35 if Entrez.api_key else 0.9  # 3 req/sec with key, ~1/sec without

NCBI_EMAIL present: yes
API key present: yes


In [2]:
# Load references and extract unique PMIDs
refs = pd.read_csv(REFERENCES_CSV, dtype={"citing_pmid": str, "ref_pmid": str})
print(f"Total reference edges: {len(refs):,}")

# Get unique PMIDs (non-null, non-empty)
ref_pmids = refs["ref_pmid"].dropna()
ref_pmids = ref_pmids[ref_pmids != ""]
unique_pmids = ref_pmids.unique().tolist()
print(f"Unique referenced PMIDs to fetch: {len(unique_pmids):,}")

Total reference edges: 1,182,678
Unique referenced PMIDs to fetch: 491,531


In [7]:
# Check for already fetched PMIDs (resume capability)
already_fetched = set()
if OUTPUT_CSV.exists():
    existing = pd.read_csv(OUTPUT_CSV, dtype={"pmid": str}, usecols=["pmid"])
    already_fetched = set(existing["pmid"].dropna().unique())
    print(f"Already fetched: {len(already_fetched):,} PMIDs")

pmids_to_fetch = [p for p in unique_pmids if p not in already_fetched]
print(f"Remaining to fetch: {len(pmids_to_fetch):,} PMIDs")

Already fetched: 2,000 PMIDs
Remaining to fetch: 489,531 PMIDs


In [6]:
from urllib.error import URLError
from http.client import RemoteDisconnected

def efetch_medline_batch(pmids: list[str], max_retries: int = 5) -> str:
    """Fetch MEDLINE records for a batch of PMIDs with retry logic."""
    for attempt in range(max_retries):
        try:
            handle = Entrez.efetch(
                db="pubmed",
                id=",".join(pmids),
                rettype="medline",
                retmode="text",
            )
            return handle.read()
        except (HTTPError, URLError, RemoteDisconnected, ConnectionError, TimeoutError) as e:
            if attempt == max_retries - 1:
                print(f"Failed batch after {max_retries} attempts: {e}")
                return ""
            backoff = 2 ** attempt  # Exponential backoff: 1, 2, 4, 8, 16 seconds
            print(f"Error: {type(e).__name__}, retrying in {backoff}s...")
            time.sleep(backoff)
        except Exception as e:
            print(f"Unexpected error: {type(e).__name__}: {e}")
            return ""
    return ""


def parse_medline_records(medline_text: str):
    """Parse MEDLINE text into dictionaries."""
    for record in Medline.parse(StringIO(medline_text)):
        yield {
            "pmid": record.get("PMID", ""),
            "title": record.get("TI", ""),
            "abstract": record.get("AB", ""),
            "journal": record.get("JT", ""),
            "year": record.get("DP", "").split(" ")[0],
            "authors": "; ".join(record.get("AU", [])),
        }

In [8]:
# Fetch abstracts in batches with progress tracking
from datetime import datetime

total_batches = (len(pmids_to_fetch) + BATCH_SIZE - 1) // BATCH_SIZE
print(f"Starting fetch: {len(pmids_to_fetch):,} PMIDs in {total_batches:,} batches")
print(f"Estimated time: {total_batches * SLEEP / 60:.1f} minutes (plus network latency)")
print(f"Output: {OUTPUT_CSV}")
print("-" * 60)

# Open file in append mode
file_exists = OUTPUT_CSV.exists()
with OUTPUT_CSV.open("a", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["pmid", "title", "abstract", "journal", "year", "authors"])
    if not file_exists:
        writer.writeheader()
    
    fetched_count = 0
    start_time = datetime.now()
    
    for i in range(0, len(pmids_to_fetch), BATCH_SIZE):
        batch = pmids_to_fetch[i:i + BATCH_SIZE]
        batch_num = i // BATCH_SIZE + 1
        
        medline_text = efetch_medline_batch(batch)
        if medline_text:
            for record in parse_medline_records(medline_text):
                writer.writerow(record)
                fetched_count += 1
        
        # Progress update every 50 batches
        if batch_num % 50 == 0 or batch_num == total_batches:
            elapsed = (datetime.now() - start_time).total_seconds()
            rate = fetched_count / elapsed if elapsed > 0 else 0
            remaining = (len(pmids_to_fetch) - fetched_count) / rate / 60 if rate > 0 else 0
            print(f"Batch {batch_num:,}/{total_batches:,} | Fetched: {fetched_count:,} | Rate: {rate:.1f}/sec | ETA: {remaining:.1f} min")
            f.flush()  # Ensure data is written
        
        time.sleep(SLEEP)

print("-" * 60)
print(f"Done! Total fetched this run: {fetched_count:,}")

Starting fetch: 489,531 PMIDs in 2,448 batches
Estimated time: 14.3 minutes (plus network latency)
Output: c:\Users\juanx\Documents\LSE-UKHSA Project\Data\referenced_paper_abstracts.csv
------------------------------------------------------------
Batch 50/2,448 | Fetched: 9,999 | Rate: 10.4/sec | ETA: 771.7 min
Batch 100/2,448 | Fetched: 19,999 | Rate: 14.0/sec | ETA: 560.6 min
Batch 150/2,448 | Fetched: 29,998 | Rate: 18.5/sec | ETA: 414.7 min
Batch 200/2,448 | Fetched: 39,998 | Rate: 22.2/sec | ETA: 337.1 min
Batch 250/2,448 | Fetched: 49,998 | Rate: 25.1/sec | ETA: 292.1 min
Batch 300/2,448 | Fetched: 59,998 | Rate: 27.6/sec | ETA: 259.4 min
Batch 350/2,448 | Fetched: 69,998 | Rate: 23.7/sec | ETA: 295.3 min
Unexpected error: IncompleteRead: IncompleteRead(450149 bytes read)
Batch 400/2,448 | Fetched: 79,798 | Rate: 24.6/sec | ETA: 277.2 min
Batch 450/2,448 | Fetched: 89,798 | Rate: 25.2/sec | ETA: 264.4 min
Batch 500/2,448 | Fetched: 99,798 | Rate: 23.2/sec | ETA: 280.1 min
Batch 5

In [9]:
# Verify the output
if OUTPUT_CSV.exists():
    result = pd.read_csv(OUTPUT_CSV, dtype={"pmid": str})
    print(f"Total records in output file: {len(result):,}")
    print(f"Records with abstracts: {(result['abstract'].notna() & (result['abstract'] != '')).sum():,}")
    print(f"\nSample:")
    display(result.head())

Total records in output file: 490,929
Records with abstracts: 443,418

Sample:


Unnamed: 0,pmid,title,abstract,journal,year,authors
0,2314794,The use of modified Martius graft as an adjunc...,"The use of the Martius graft, a labial fibro-f...",Obstetrics and gynecology,1990,Elkins TE; DeLancey JO; McGuire EJ
1,21905761,Quality of life following successful repair of...,INTRODUCTION: The impact of obstetric vesicova...,Rural and remote health,2011,Umoiyoho AJ; Inyang-Etoh EC; Abah GM; Abasiatt...
2,32459344,Association of Low Socioeconomic Status With P...,IMPORTANCE: Individuals with low socioeconomic...,JAMA cardiology,2020,Hamad R; Penko J; Kazi DS; Coxson P; Guzman D;...
3,10547403,Long-term benefit of primary angioplasty as co...,BACKGROUND: As compared with thrombolytic ther...,The New England journal of medicine,1999,Zijlstra F; Hoorntje JC; de Boer MJ; Reiffers ...
4,12241831,Interventional versus conservative treatment f...,"BACKGROUND: Current guidelines suggest that, f...","Lancet (London, England)",2002,Fox KA; Poole-Wilson PA; Henderson RA; Clayton...
