# 02: Fetch Cochrane PDFs via Wiley TDM API

## Summary
This notebook downloads Cochrane review PDFs using the Wiley Text and Data Mining (TDM) API. The PDFs contain structured reference sections that properly categorize included vs. excluded studies - information not available in PubMed XML.

**Pipeline Position:** Third notebook - downloads source PDFs for extracting categorized references.

**What this notebook does:**
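1. Loads Cochrane review metadata from `Data/cochrane_pubmed_abstracts.csv` and looks up each review's DOI from PubMed by PMID (lookups are cached in `Data/cochrane_dois.csv`)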
2. Downloads PDFs for each review via Wiley TDM API
3. Implements rate limiting and error handling for batch downloads
4. Saves PDFs to local storage for text extraction

**Input:** `Data/cochrane_pubmed_abstracts.csv`

**Output:** `Data/cochrane_pdfs/*.pdf` (plus the PMID-to-DOI lookup cache `Data/cochrane_dois.csv`)

**Requirements:**
- Wiley TDM API token in `.env` file (WILEY_TEXT_AND_DATA_MINING_TOKEN)
- Institutional IP access to Cochrane Library content

**Important:** PDFs contain proprietary content and must NOT be uploaded to GitHub. They are excluded via .gitignore.

In [None]:
# Install required packages for PDF download and processing:
#   wiley-tdm      - provides the TDMClient used to download PDFs
#   python-dotenv  - loads credentials from the project's .env file
#   pandas         - reads the abstracts CSV and the DOI cache
#   biopython      - provides Bio.Entrez for the PubMed DOI lookups
%pip install -q wiley-tdm python-dotenv pandas biopython

In [None]:
# Set up environment and load credentials
import os
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd

# Locate the project root: the working directory itself when it holds the
# .env file, otherwise its parent directory.
notebook_dir = Path.cwd()
if (notebook_dir / ".env").exists():
    project_root = notebook_dir
else:
    project_root = notebook_dir.parent
env_path = project_root / ".env"

# override=True lets values from .env replace variables already set in the
# process environment.
load_dotenv(env_path, override=True)

# Read the Wiley token and re-export it as TDM_API_TOKEN (an empty string
# when it is missing).
# NOTE(review): TDM_API_TOKEN is presumably the variable the wiley-tdm
# client reads - confirm against the package documentation.
WILEY_TDM_TOKEN = os.getenv("WILEY_TEXT_AND_DATA_MINING_TOKEN")
os.environ['TDM_API_TOKEN'] = WILEY_TDM_TOKEN if WILEY_TDM_TOKEN else ""

# Locations of the input CSV and the PDF output folder under Data/
DATA_DIR = project_root / "Data"
ABSTRACTS_CSV = DATA_DIR / "cochrane_pubmed_abstracts.csv"
PDF_DIR = DATA_DIR / "cochrane_pdfs"

# Report the configuration first, then stop if the token is missing
print(f"Wiley TDM Token loaded: {'✓' if WILEY_TDM_TOKEN else '✗'}")
print(f"PDF output directory: {PDF_DIR}")

if not WILEY_TDM_TOKEN:
    raise ValueError("WILEY_TEXT_AND_DATA_MINING_TOKEN not set in .env file")

In [None]:
# Load Cochrane reviews and fetch DOIs from PubMed
from Bio import Entrez
import time
import re

# Identify this client to NCBI. Unset or blank variables become None (the
# Bio.Entrez default) rather than "", so a blank value is never configured
# as if it were a real e-mail address or API key.
Entrez.email = os.getenv("NCBI_EMAIL") or None
Entrez.api_key = os.getenv("NCBI_API_KEY") or None

# Read PMIDs as strings so the identifiers are not coerced to numbers.
abstracts = pd.read_csv(ABSTRACTS_CSV, dtype={"pmid": str})
print(f"Loaded {len(abstracts):,} Cochrane reviews")

def get_doi_from_pmid(pmid):
    """Fetch the DOI for a PubMed ID.

    Args:
        pmid: PubMed identifier, passed to ``Entrez.efetch`` as ``id``.

    Returns:
        The text of the first ``<ArticleId IdType="doi">`` element found in
        the record ahead of any ``<ReferenceList>``, or ``None`` when no such
        element exists or the lookup fails.
    """
    try:
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="xml", retmode="xml")
        try:
            xml = handle.read()
        finally:
            # Release the connection even if reading fails part-way.
            handle.close()
        if isinstance(xml, bytes):
            xml = xml.decode('utf-8')
    except Exception as e:
        # Best-effort lookup: keep returning None so batch callers carry on,
        # but report the reason instead of silently discarding it.
        print(f"  DOI lookup failed for PMID {pmid}: {type(e).__name__}: {e}")
        return None

    # Cited references can carry their own <ArticleId IdType="doi"> entries
    # inside <ReferenceList>. Searching only the text before that section
    # avoids returning a reference's DOI when the article itself has none.
    # NOTE(review): relies on the article's own ArticleIdList preceding
    # ReferenceList, as in the PubMed DTD - confirm.
    article_xml = xml.split("<ReferenceList", 1)[0]
    doi_match = re.search(r'<ArticleId IdType="doi">([^<]+)</ArticleId>', article_xml)
    return doi_match.group(1) if doi_match else None

# Test with a sample: resolve the first review's PMID to a DOI.
# An empty abstracts table is reported explicitly instead of failing with an
# IndexError; sample_doi is then None, like any other failed lookup.
if abstracts.empty:
    sample_pmid = None
    sample_doi = None
    print("No reviews loaded - skipping sample DOI lookup")
else:
    sample_pmid = abstracts['pmid'].iloc[0]
    sample_doi = get_doi_from_pmid(sample_pmid)
    print(f"Sample: PMID {sample_pmid} -> DOI {sample_doi}")

In [None]:
# Initialize Wiley TDM client and test download
from wiley_tdm import TDMClient

# Make sure the output folder exists, then point the client at it
PDF_DIR.mkdir(parents=True, exist_ok=True)
tdm = TDMClient(download_dir=str(PDF_DIR))

# Single-download smoke test; only possible when the sample DOI was resolved
if not sample_doi:
    print("Could not get DOI for test download")
else:
    print(f"Testing download for DOI: {sample_doi}")
    result = tdm.download_pdf(sample_doi)
    print(f"Result: {result}")

In [None]:
# Batch download PDFs with progress tracking
import csv

# Track DOIs and download status
# CSV cache of PMID -> DOI lookups; it is loaded instead of querying PubMed
# again whenever the file already exists.
DOI_CACHE_FILE = DATA_DIR / "cochrane_dois.csv"
# Path reserved for a per-download log.
# NOTE(review): none of the visible cells write to this file - confirm
# whether it is still needed.
DOWNLOAD_LOG = DATA_DIR / "pdf_download_log.csv"

def batch_get_dois(pmids, batch_size=50, sleep_time=0.5):
    """Resolve each PMID to a DOI, pausing between lookups.

    Args:
        pmids: Sliceable sequence of PubMed IDs.
        batch_size: Number of PMIDs per progress-reporting batch.
        sleep_time: Seconds to sleep after every individual lookup.

    Returns:
        Dict mapping each PMID to whatever ``get_doi_from_pmid`` returned
        for it.
    """
    total = len(pmids)
    doi_by_pmid = {}
    for batch_no, start in enumerate(range(0, total, batch_size)):
        chunk = pmids[start:start + batch_size]
        for pmid in chunk:
            doi_by_pmid[pmid] = get_doi_from_pmid(pmid)
            time.sleep(sleep_time)
        # Report after the first batch and after every tenth one thereafter
        if batch_no % 10 == 0:
            print(f"  DOI progress: {start + len(chunk)}/{total}")
    return doi_by_pmid

# Load or create DOI cache
if DOI_CACHE_FILE.exists():
    doi_df = pd.read_csv(DOI_CACHE_FILE, dtype={"pmid": str})
    # pandas reads an empty CSV cell back as NaN (a float), not None.
    # Normalise it so cached results look like freshly fetched ones.
    dois = {
        pmid: (doi if isinstance(doi, str) else None)
        for pmid, doi in zip(doi_df['pmid'], doi_df['doi'])
    }
    print(f"Loaded {len(dois):,} DOIs from cache")
else:
    print("Fetching DOIs from PubMed (this will take a while)...")
    pmids = abstracts['pmid'].tolist()
    dois = batch_get_dois(pmids)
    doi_df = pd.DataFrame(list(dois.items()), columns=['pmid', 'doi'])
    doi_df.to_csv(DOI_CACHE_FILE, index=False)
    print(f"Saved {len(dois):,} DOIs to cache")

# Keep only real, non-empty DOI strings; later cells call str methods on them.
valid_dois = {k: v for k, v in dois.items() if isinstance(v, str) and v}
# Guard the percentage against an empty table.
valid_pct = 100 * len(valid_dois) / len(dois) if dois else 0.0
print(f"\nReviews with valid DOIs: {len(valid_dois):,} ({valid_pct:.1f}%)")

In [None]:
# Download PDFs in batches (run this cell multiple times if interrupted)
import time

# Check which PDFs already exist
existing_pdfs = {p.stem for p in PDF_DIR.glob("*.pdf")}
print(f"Already downloaded: {len(existing_pdfs)} PDFs")

# Filter to DOIs we haven't downloaded yet.
# NOTE(review): assumes saved files are named after the DOI with "/" replaced
# by "-" or "%2F" - confirm against the client's actual file naming.
# The isinstance check skips missing DOIs that came back from the CSV cache
# as NaN instead of crashing on .replace().
to_download = [
    (pmid, doi)
    for pmid, doi in valid_dois.items()
    if isinstance(doi, str)
    and doi.replace("/", "-") not in existing_pdfs
    and doi.replace("/", "%2F") not in existing_pdfs
]

print(f"Remaining to download: {len(to_download)}")

# Download with rate limiting (adjust MAX_DOWNLOADS for testing)
MAX_DOWNLOADS = 100  # Set to None for all
downloads = to_download[:MAX_DOWNLOADS] if MAX_DOWNLOADS else to_download

success_count = 0
fail_count = 0
failures = []  # (pmid, doi, reason) for every download that did not succeed

for i, (pmid, doi) in enumerate(downloads):
    try:
        result = tdm.download_pdf(doi)
    except Exception as e:
        # Keep the batch going, but remember why this download failed.
        fail_count += 1
        failures.append((pmid, doi, f"{type(e).__name__}: {e}"))
    else:
        # NOTE(review): only the exact value "Success" counts as success -
        # confirm this matches what download_pdf returns.
        if result == "Success":
            success_count += 1
        else:
            fail_count += 1
            failures.append((pmid, doi, str(result)))

    if (i + 1) % 10 == 0:
        print(f"Progress: {i+1}/{len(downloads)} | Success: {success_count} | Failed: {fail_count}")

    # Rate limiting between requests; no need to wait after the last one
    if i + 1 < len(downloads):
        time.sleep(0.5)

print(f"\nDownload complete: {success_count} succeeded, {fail_count} failed")

# Surface a few failure reasons so systematic problems are visible.
if failures:
    print("First failures:")
    for pmid, doi, reason in failures[:5]:
        print(f"  PMID {pmid} | DOI {doi} | {reason}")

In [None]:
# Summary of downloaded PDFs
# Collect every PDF in the output folder and total their sizes in MB
pdf_files = list(PDF_DIR.glob("*.pdf"))
total_size_mb = sum(p.stat().st_size for p in pdf_files) / (1024 * 1024)

banner = "=" * 60
print(banner)
print("DOWNLOAD SUMMARY")
print(banner)
print(f"Total PDFs downloaded: {len(pdf_files):,}")
print(f"Total storage used: {total_size_mb:.1f} MB")
# The average is only defined when at least one PDF exists
if pdf_files:
    print(f"Average PDF size: {total_size_mb/len(pdf_files)*1024:.0f} KB")
else:
    print("N/A")
print("\nNext step: Run notebook 03 to extract categorized references from PDFs")