# 03: Extract Categorized References from Cochrane PDFs

## Summary
This notebook extracts categorized study references from Cochrane review PDFs. Unlike PubMed XML, which mixes all references together, the PDFs contain clearly labeled sections for:
- **Included studies** - Studies that passed screening criteria (positives)
- **Excluded studies** - Studies that were reviewed but rejected (hard negatives)
- **Awaiting classification** - Studies pending assessment (excluded from training)

**Pipeline Position:** Fourth notebook - extracts structured reference data from PDFs.

**What this notebook does:**
1. Reads PDF text using pdfplumber
2. Identifies reference section boundaries using regex patterns
3. Parses individual study identifiers (Author Year format)
4. Saves categorized references to CSV

**Input:** `Data/cochrane_pdfs/*.pdf`

**Output:** `Data/categorized_references.csv`

In [None]:
# Install required packages for PDF text extraction
%pip install -q pdfplumber pandas

In [None]:
# Set up paths and load required libraries
import os
from pathlib import Path
import pandas as pd
import pdfplumber
import re
from typing import Dict, List, Tuple

# Resolve the project root: use the current directory when it contains a
# "Data" folder, otherwise fall back to its parent directory.
notebook_dir = Path.cwd()
project_root = notebook_dir if (notebook_dir / "Data").exists() else notebook_dir.parent
DATA_DIR = project_root / "Data"
PDF_DIR = DATA_DIR / "cochrane_pdfs"
OUTPUT_CSV = DATA_DIR / "categorized_references.csv"

# glob() yields entries in filesystem order, which differs between machines;
# sorting keeps the sample PDF and the output row order reproducible.
pdf_files = sorted(PDF_DIR.glob("*.pdf"))
print(f"Found {len(pdf_files)} PDFs to process")

In [None]:
# Define functions to extract text and parse reference sections

def extract_pdf_text(pdf_path: Path) -> str:
    """Return the text of every page of a PDF as one string.

    Each page's text is followed by a blank line ("\\n\\n"). Pages for which
    ``extract_text()`` returns nothing are skipped.

    Args:
        pdf_path: Location of the PDF file to open with pdfplumber.

    Returns:
        The concatenated page texts; an empty string when no page yields text.
    """
    chunks = []
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            content = page.extract_text()
            if not content:
                # Nothing extractable on this page.
                continue
            chunks.append(content + "\n\n")
    return "".join(chunks)

def extract_reference_sections(text: str) -> Dict[str, str]:
    """Slice the reference list of a review into its labelled sub-sections.

    For each of the three headings, the first occurrence is located
    (case-insensitively) and the text after it is captured up to the nearest
    following heading named in that pattern's lookahead, or to the end of
    the text.

    Args:
        text: Full text of a review.

    Returns:
        A dict with any of the keys ``'included'``, ``'excluded'`` and
        ``'awaiting'``; a key is present only when its heading was found.
    """
    # DOTALL lets a section body span multiple lines.
    flags = re.IGNORECASE | re.DOTALL

    # (result key, pattern) pairs, checked in this order. Each pattern's
    # lookahead lists the headings that may terminate that section.
    section_patterns = (
        (
            'included',
            r'References to studies included in this review(.*?)(?=References to studies excluded|References to studies awaiting|Additional references|ADDITIONAL TABLES|DATA AND ANALYSES|CHARACTERISTICS OF STUDIES|$)',
        ),
        (
            'excluded',
            r'References to studies excluded from this review(.*?)(?=References to studies awaiting|Additional references|ADDITIONAL TABLES|DATA AND ANALYSES|CHARACTERISTICS OF STUDIES|$)',
        ),
        (
            'awaiting',
            r'References to studies awaiting (?:classification|assessment)(.*?)(?=Additional references|ADDITIONAL TABLES|DATA AND ANALYSES|CHARACTERISTICS OF STUDIES|$)',
        ),
    )

    found = {}
    for key, pattern in section_patterns:
        hit = re.search(pattern, text, flags)
        if hit is not None:
            found[key] = hit.group(1)
    return found

# Study-ID matcher: "<Name> [et al.] <Year>[letter]" followed by
# "{published" / "{unpublished".
# The name token starts at a word boundary with any Unicode letter and may
# contain further word characters, apostrophes (ASCII or U+2019) and hyphens,
# so IDs such as "McDonald 2001", "Al-Khatib 2008", "O'Brien 2005" and
# "ADVANCE 2008" are captured whole rather than truncated or dropped.
_STUDY_ID_RE = re.compile(
    r"\b([^\W\d_][\w'\u2019\-]*(?:\s+et\s+al\.?)?\s+\d{4}[a-z]?)"
    r"\s*\{(?:published|unpublished)"
)


def parse_study_ids(section_text: str) -> List[str]:
    """Extract study identifiers (Author Year format) from section text.

    An identifier is recognised only when it is directly followed by a
    "{published ..." or "{unpublished ..." marker.

    Args:
        section_text: Text of one reference section.

    Returns:
        The identifiers in order of appearance (duplicates are kept), each
        stripped of surrounding whitespace. Empty list when none are found.

    Note:
        Only the single word immediately before the year is captured, so a
        multi-word surname such as "van der Meer 2005" yields "Meer 2005".
    """
    return [match.strip() for match in _STUDY_ID_RE.findall(section_text)]

def process_pdf(pdf_path: Path) -> List[Dict]:
    """Collect the categorized study references found in one review PDF.

    Any exception raised while reading or parsing is reported with ``print``
    and processing of this PDF stops; rows gathered before the failure are
    still returned.

    Args:
        pdf_path: Path of the PDF; its file stem is used to derive the DOI.

    Returns:
        One dict per extracted study with the keys ``'review_doi'``,
        ``'study_id'`` and ``'category'``.
    """
    results = []
    try:
        # The filename encodes the DOI with "-" in place of "/"
        # (format: 10.1002-14651858.CDxxxxxx.pubN.pdf).
        doi = pdf_path.stem.replace("-", "/")

        pdf_text = extract_pdf_text(pdf_path)
        for category, section_text in extract_reference_sections(pdf_text).items():
            results.extend(
                {'review_doi': doi, 'study_id': study_id, 'category': category}
                for study_id in parse_study_ids(section_text)
            )
    except Exception as e:
        print(f"Error processing {pdf_path.name}: {e}")

    return results

In [None]:
# Smoke-test the extraction on the first PDF in the list (skipped when no PDFs were found)
if pdf_files:
    sample_pdf = pdf_files[0]
    print(f"Testing on: {sample_pdf.name}")

    results = process_pdf(sample_pdf)
    print(f"\nExtracted {len(results)} references:")

    # Report a count for every category, including those with no hits.
    extracted_categories = [r['category'] for r in results]
    for cat in ['included', 'excluded', 'awaiting']:
        count = extracted_categories.count(cat)
        print(f"  {cat}: {count}")

    # Show at most the first five references.
    print("\nSample references:")
    for r in results[:5]:
        print(f"  [{r['category']}] {r['study_id']}")

In [None]:
# Run the extraction over every PDF, accumulating all rows in one list
from tqdm.notebook import tqdm

print(f"Processing {len(pdf_files)} PDFs...")

all_results = []
for pdf_path in tqdm(pdf_files, desc="Extracting references"):
    # process_pdf returns a (possibly empty) list of rows for this PDF.
    all_results.extend(process_pdf(pdf_path))

print(f"\nTotal references extracted: {len(all_results):,}")

In [None]:
# Create DataFrame and analyze results.
# The columns are named explicitly so the frame still has them when
# all_results is empty; otherwise the column lookups below raise KeyError.
refs_df = pd.DataFrame(all_results, columns=['review_doi', 'study_id', 'category'])

print("Reference counts by category:")
print(refs_df['category'].value_counts())

print(f"\nUnique reviews processed: {refs_df['review_doi'].nunique()}")
print(f"Unique study IDs: {refs_df['study_id'].nunique()}")

# Preview
refs_df.head(10)

In [None]:
# Write the categorized references to disk, then print a summary
refs_df.to_csv(OUTPUT_CSV, index=False)
total_refs = len(refs_df)
print(f"Saved {total_refs:,} categorized references to {OUTPUT_CSV}")

banner = "="*60
print("\n" + banner)
print("EXTRACTION SUMMARY")
print(banner)
print(f"Total references: {total_refs:,}")

# One line per category: (label shown in the summary, value in the 'category' column).
category_column = refs_df['category']
for label, key in (
    ("Included (positives)", 'included'),
    ("Excluded (negatives)", 'excluded'),
    ("Awaiting (excluded from training)", 'awaiting'),
):
    print(f"  {label}: {(category_column == key).sum():,}")

print(f"\nNext step: Run notebook 04 to build ground truth dataset")