# 03: Extract Metadata and References from Cochrane PDFs

## Summary
This notebook extracts metadata and references from Cochrane review PDFs using **PyMuPDF** (fitz) - a fast PDF library.

### Extracted Data:

**Metadata:**
- `doi`: DOI from filename
- `title`: Review title
- `authors`: Author list (from abstract section)
- `abstract`: Full abstract text
- `review_type`: review, protocol, or withdrawn
- `cochrane_group`: Cochrane review group

**References:**
- `category`: included, excluded, awaiting, ongoing
- `study_id`: Author Year format
- `authors`: Full author list
- `title`: Reference title
- `year`: Publication year
- `ref_doi`: DOI of reference
- `pmid`: PubMed ID
- `full_citation`: Complete citation text

### Performance:
- **~15-20 minutes** for 16,588 PDFs (using PyMuPDF)
- ~50x faster than pdfplumber

### Output:
- `Data/review_metadata.csv`
- `Data/categorized_references.csv`

In [8]:
%pip install -q pymupdf pandas tqdm

Note: you may need to restart the kernel to use updated packages.


In [9]:
import os
from pathlib import Path
import pandas as pd
import fitz  # PyMuPDF - fast PDF library
import re
import time
from collections import Counter
from tqdm.notebook import tqdm
from typing import Dict, List, Tuple

# Setup paths
notebook_dir = Path.cwd()
project_root = notebook_dir if (notebook_dir / "Data").exists() else notebook_dir.parent
DATA_DIR = project_root / "Data"
PDF_DIR = DATA_DIR / "cochrane_pdfs"

METADATA_CSV = DATA_DIR / "review_metadata.csv"
REFERENCES_CSV = DATA_DIR / "categorized_references.csv"

pdf_files = list(PDF_DIR.glob("*.pdf"))
print(f"Project root: {project_root}")
print(f"PDF directory: {PDF_DIR}")
print(f"Found {len(pdf_files):,} PDFs")

Project root: c:\Users\juanx\Documents\LSE-UKHSA Project
PDF directory: c:\Users\juanx\Documents\LSE-UKHSA Project\Data\cochrane_pdfs
Found 16,588 PDFs


In [10]:
# =============================================================================
# PyMuPDF Helper Functions
# =============================================================================
# PyMuPDF (fitz) is 10-50x faster than pdfplumber for text extraction
# =============================================================================

def extract_text_fast(pdf_path: Path, start_page: int = 0, end_page: int = None) -> str:
    """Extract text from PDF pages using PyMuPDF (fast)."""
    doc = fitz.open(pdf_path)
    if end_page is None:
        end_page = len(doc)
    
    text = ""
    for i in range(start_page, min(end_page, len(doc))):
        text += doc[i].get_text() + "\n\n"
    doc.close()
    return text


def get_pdf_page_count(pdf_path: Path) -> int:
    """Get page count quickly."""
    doc = fitz.open(pdf_path)
    count = len(doc)
    doc.close()
    return count


# Quick speed test
test_pdf = pdf_files[0]
start = time.time()
text = extract_text_fast(test_pdf)
elapsed = time.time() - start
print(f"✓ PyMuPDF extracted {len(text):,} chars in {elapsed:.3f} seconds")
print(f"  Estimated time for all {len(pdf_files):,} PDFs: {elapsed * len(pdf_files) / 60:.1f} minutes")

✓ PyMuPDF extracted 31,979 chars in 0.035 seconds
  Estimated time for all 16,588 PDFs: 9.6 minutes


In [23]:
# =============================================================================
# Metadata Extraction (using PyMuPDF)
# =============================================================================

def extract_metadata(pdf_path: Path) -> Dict:
    """Extract review metadata from first pages of PDF."""
    doi = pdf_path.stem.replace("-", "/")
    result = {
        'doi': doi, 'title': '', 'authors': '', 'abstract': '',
        'review_type': '', 'cochrane_group': '',
    }
    try:
        # Extract first 5 pages for metadata
        text = extract_text_fast(pdf_path, 0, 5)
        
        # --- TITLE ---
        title_patterns = [
            r'Cochrane Database of Systematic Reviews\s*\n+\s*([A-Z][^\.]{10,200})',
            r'CochraneDatabaseofSystematicReviews\s*\n*\s*([A-Z][^\.]{10,200})',
            r'Review\s*\n+([A-Z][A-Za-z\s\-\,\:]{10,200})',
        ]
        for pattern in title_patterns:
            title_match = re.search(pattern, text)
            if title_match:
                title = re.sub(r'\s+', ' ', title_match.group(1)).strip()
                if len(title) > 10 and not title.startswith('Copyright'):
                    result['title'] = title[:500]
                    break
        
        # --- AUTHORS ---
        authors_match = re.search(
            r'^([A-Z][a-z]+(?:\s+[A-Z]\.?)+(?:,\s*[A-Z][a-z]+(?:\s+[A-Z]\.?)+)*)',
            text[200:2000], re.MULTILINE
        )
        if authors_match:
            result['authors'] = re.sub(r'\s+', ' ', authors_match.group(1)).strip()[:500]
        
        # --- ABSTRACT ---
        # Look for "Background" section content (not the table of contents with dots)
        # Key: Find "Background" followed by actual paragraph text, not "..." dots
        abstract_patterns = [
            # Pattern 1: Background section with real content (no dots)
            r'Background\s*\n+([A-Z][^\.]{20,}(?:\.\s+[A-Z][^\.]+)*\.)',
            # Pattern 2: Objectives section
            r'Objectives\s*\n+([A-Z][^\.]{20,}(?:\.\s+[A-Z][^\.]+)*\.)',
            # Pattern 3: Summary section
            r'Summary\s*\n+([A-Z][^\.]{20,}(?:\.\s+[A-Z][^\.]+)*\.)',
        ]
        for pattern in abstract_patterns:
            abs_match = re.search(pattern, text)
            if abs_match:
                abstract = abs_match.group(1).strip()
                # Skip if it's mostly dots (table of contents)
                if '...' not in abstract and len(abstract) > 50:
                    result['abstract'] = re.sub(r'\s+', ' ', abstract)[:3000]
                    break
        
        # Fallback: Try to find any substantive text after "Background"
        if not result['abstract']:
            bg_match = re.search(r'Background[:\s]+(.{100,1500}?)(?=\n\s*(?:Objectives|Methods|Search|Selection))', 
                                 text, re.IGNORECASE | re.DOTALL)
            if bg_match:
                abstract = bg_match.group(1).strip()
                abstract = re.sub(r'\.+\s*\d+', '', abstract)  # Remove "... 2" patterns
                abstract = re.sub(r'\s+', ' ', abstract)
                if len(abstract) > 50 and '...' not in abstract:
                    result['abstract'] = abstract[:3000]
        
        # --- REVIEW TYPE ---
        text_lower = text.lower()[:3000]
        if 'protocol' in text_lower:
            result['review_type'] = 'protocol'
        elif 'withdrawn' in text_lower:
            result['review_type'] = 'withdrawn'
        else:
            result['review_type'] = 'review'
        
        # --- COCHRANE GROUP ---
        group_patterns = [
            r'Cochrane\s+([A-Za-z\s&]+?)\s+Group',
            r'Cochrane\s+([A-Za-z\s&]+?)\s+Review Group',
        ]
        for pattern in group_patterns:
            group_match = re.search(pattern, text)
            if group_match:
                result['cochrane_group'] = group_match.group(1).strip()
                break
                
    except Exception as e:
        result['error'] = str(e)
    return result


# Test on a few PDFs
print("Testing metadata extraction...")
for i in range(5):
    meta = extract_metadata(pdf_files[i])
    print(f"\n{meta['doi']}")
    print(f"  Title: {meta['title'][:60]}..." if meta['title'] else "  Title: (none)")
    print(f"  Type: {meta['review_type']}")
    print(f"  Abstract: {meta['abstract'][:80]}..." if meta['abstract'] else "  Abstract: (none)")

Testing metadata extraction...

10.1002/14651858.CD000004
  Title: Abdominal decompression for suspected fetal compromise/pre-e...
  Type: review
  Abstract: Abdominal decompression was developed as a means of pain relief during labour. I...

10.1002/14651858.CD000004.pub2
  Title: Abdominal decompression for suspected fetal compromise/pre-e...
  Type: review
  Abstract: Abdominal decompression was developed as a means of pain relief during labour. I...

10.1002/14651858.CD000005
  Title: Absorbable staples for uterine incision at caesarean section...
  Type: review
  Abstract: Staples can be placed during the making of an incision, with the aim of decreasi...

10.1002/14651858.CD000005.pub2
  Title: Absorbable staples for uterine incision at caesarean section...
  Type: protocol
  Abstract: (none)

10.1002/14651858.CD000006
  Title: Absorbable synthetic versus catgut suture material for perin...
  Type: review
  Abstract: Approximately 70% of women will experience some degree of perin

In [21]:
# =============================================================================
# Reference Extraction (using PyMuPDF)
# =============================================================================

def extract_references_from_pdf(pdf_path: Path) -> Tuple[List[Dict], str]:
    """Extract categorized references from PDF."""
    doi = pdf_path.stem.replace("-", "/")
    
    try:
        doc = fitz.open(pdf_path)
        total_pages = len(doc)
        
        # Check first page for protocol/withdrawn
        first_text = doc[0].get_text().lower() if total_pages > 0 else ""
        if 'protocol' in first_text[:1500]:
            doc.close()
            return [], 'protocol'
        if 'withdrawn' in first_text[:1500]:
            doc.close()
            return [], 'withdrawn'
        
        # Find reference pages - search from page 2 onwards
        ref_text = ""
        in_refs = False
        
        for i in range(2, total_pages):
            page_text = doc[i].get_text()
            page_lower = page_text.lower()
            
            # Start capturing when we find reference section markers
            if not in_refs:
                if any(marker in page_lower for marker in [
                    'references to studies included',
                    'references to studies excluded',
                    '{published data only}',
                    '{unpublished data only}'
                ]):
                    in_refs = True
            
            if in_refs:
                ref_text += page_text + "\n"
                # Stop if we hit characteristics section
                if 'characteristics of included' in page_lower:
                    break
                if 'characteristics of excluded' in page_lower:
                    break
        
        doc.close()
        
        if not ref_text:
            return [], 'no_refs'
        
        # Parse references
        references = parse_references(ref_text, doi)
        return references, 'review' if references else 'no_refs'
        
    except Exception as e:
        return [], f'error: {str(e)[:30]}'


def parse_references(text: str, review_doi: str) -> List[Dict]:
    """Parse references with structure: AuthorName Year {datatype} followed by citation."""
    references = []
    
    # Define section markers
    sections = [
        ('included', r'references\s*to\s*studies\s*included'),
        ('excluded', r'references\s*to\s*studies\s*excluded'),
        ('awaiting', r'references\s*to\s*studies\s*awaiting'),
        ('ongoing', r'references\s*to\s*ongoing\s*studies'),
    ]
    
    for category, pattern in sections:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            start = match.end()
            
            # Find end of section
            end = len(text)
            end_patterns = [
                r'references\s*to\s*studies\s*(included|excluded|awaiting)',
                r'references\s*to\s*ongoing',
                r'additional\s+references',
                r'characteristics\s*of',
                r'\*\s*indicates',
            ]
            for ep in end_patterns:
                end_match = re.search(ep, text[start:], re.IGNORECASE)
                if end_match and end_match.start() > 10:  # Avoid matching at very start
                    end = min(end, start + end_match.start())
            
            section_text = text[start:end]
            
            # Pattern: "AuthorName Year {published/unpublished data only}"
            # Can have whitespace/newlines between parts
            study_pattern = re.compile(
                r'([A-Z][A-Za-z\'\-]+(?:\s+et\s+al)?)\s+(\d{4}[a-z]?)\s*\{(published|unpublished)',
                re.IGNORECASE
            )
            
            for m in study_pattern.finditer(section_text):
                author_id = m.group(1).strip()
                year = m.group(2)
                
                # Get citation text after "data only}"
                cite_start = m.end()
                data_only_end = section_text.find('}', cite_start)
                if data_only_end > 0:
                    cite_start = data_only_end + 1
                
                # Find end of this citation (next study ID or section end)
                next_study = study_pattern.search(section_text[cite_start:])
                cite_end = cite_start + (next_study.start() if next_study else 2000)
                
                raw_citation = section_text[cite_start:cite_end].strip()
                
                # Clean up citation
                citation = re.sub(r'\n+', ' ', raw_citation)
                citation = re.sub(r'\s+', ' ', citation).strip()
                
                # Parse fields from citation
                # Format: "Authors. Title. Journal Year;Vol:Pages."
                parts = citation.split('. ')
                authors = parts[0].strip() if parts else ""
                title = parts[1].strip() if len(parts) > 1 else ""
                
                # Extract DOI
                doi_match = re.search(r'(10\.\d{4,}/[^\s\]\)]+)', citation)
                ref_doi = doi_match.group(1).rstrip('.,;])') if doi_match else ""
                
                # Extract PMID
                pmid_match = re.search(r'(?:PMID[:\s]*|PubMed[:\s]*|\[PM[:\s]*)(\d{6,9})', citation, re.IGNORECASE)
                pmid = pmid_match.group(1) if pmid_match else ""
                
                references.append({
                    'review_doi': review_doi,
                    'category': category,
                    'study_id': f"{author_id} {year}",
                    'year': year,
                    'authors': authors[:500],
                    'title': title[:500],
                    'ref_doi': ref_doi[:100],
                    'pmid': pmid,
                    'full_citation': citation[:1000],
                })
    
    return references


# Test
print("Testing reference extraction on first 5 PDFs...")
for i, pdf in enumerate(pdf_files[:5]):
    refs, rtype = extract_references_from_pdf(pdf)
    print(f"{pdf.name}: {len(refs)} refs, type: {rtype}")
    if refs:
        r = refs[0]
        print(f"  [{r['category']}] {r['study_id']}")
        print(f"  Authors: {r['authors'][:50]}")
        print(f"  Title: {r['title'][:50]}")

Testing reference extraction on first 5 PDFs...
10.1002-14651858.CD000004.pdf: 4 refs, type: review
  [included] Blecher 1967
  Authors: Blecher JA
  Title: Aspects of the physiology of decompression and its
10.1002-14651858.CD000004.pub2.pdf: 4 refs, type: review
  [included] Blecher 1967
  Authors: Blecher JA
  Title: Aspects of the physiology of decompression and its
10.1002-14651858.CD000005.pdf: 4 refs, type: review
  [included] Dargent 1990
  Authors: Dargent D, Audra G, Noblot G
  Title: Utilization de la pince POLY CS 57 pour l’operatio
10.1002-14651858.CD000005.pub2.pdf: 0 refs, type: no_refs
10.1002-14651858.CD000006.pdf: 11 refs, type: review
  [included] Banninger 1978
  Authors: Banninger U, Buhrig H, Schreiner WE
  Title: A comparison between chromic catgut and polyglycol


In [24]:
# =============================================================================
# COMPREHENSIVE VALIDATION
# =============================================================================
# Testing on 100 PDFs to verify:
# 1. Both old format (no spaces) and new format (with spaces) work
# 2. All metadata fields are extracted properly
# 3. All reference fields are extracted properly
# 4. All reference categories are captured (included, excluded, awaiting, ongoing)
# =============================================================================

print("=" * 70)
print("COMPREHENSIVE VALIDATION")
print("=" * 70)

# Test on 100 PDFs
test_pdfs = pdf_files[:100]
start = time.time()

all_test_meta = []
all_test_refs = []
stats = {'review': 0, 'protocol': 0, 'withdrawn': 0, 'no_refs': 0, 'error': 0}
category_counts = {'included': 0, 'excluded': 0, 'awaiting': 0, 'ongoing': 0}

for pdf_path in test_pdfs:
    # Metadata
    meta = extract_metadata(pdf_path)
    all_test_meta.append(meta)
    
    # References
    refs, rtype = extract_references_from_pdf(pdf_path)
    all_test_refs.extend(refs)
    stats[rtype.split(':')[0]] = stats.get(rtype.split(':')[0], 0) + 1
    
    for ref in refs:
        category_counts[ref['category']] = category_counts.get(ref['category'], 0) + 1

elapsed = time.time() - start
print(f"\n✓ Processed 100 PDFs in {elapsed:.1f} seconds ({elapsed/100*1000:.0f}ms per PDF)")

# --- METADATA VALIDATION ---
print("\n" + "=" * 70)
print("METADATA EXTRACTION VALIDATION")
print("=" * 70)

meta_df = pd.DataFrame(all_test_meta)
print(f"\nFields extracted: {list(meta_df.columns)}")

for col in meta_df.columns:
    non_empty = (meta_df[col].astype(str).str.len() > 0).sum()
    print(f"  {col}: {non_empty}/100 non-empty ({non_empty}%)")

print("\nReview types distribution:")
print(meta_df['review_type'].value_counts().to_string())

# Show sample metadata
print("\n--- Sample Metadata (first PDF with title) ---")
for _, row in meta_df.iterrows():
    if row['title']:
        print(f"DOI: {row['doi']}")
        print(f"Title: {row['title'][:80]}...")
        print(f"Type: {row['review_type']}")
        print(f"Cochrane Group: {row['cochrane_group']}")
        print(f"Abstract: {row['abstract'][:150]}...")
        break

# --- REFERENCE VALIDATION ---
print("\n" + "=" * 70)
print("REFERENCE EXTRACTION VALIDATION")
print("=" * 70)

print(f"\nExtraction results: {stats}")
print(f"Total references extracted: {len(all_test_refs)}")
print(f"\nReferences by category: {category_counts}")

if all_test_refs:
    refs_df = pd.DataFrame(all_test_refs)
    print(f"\nReference fields: {list(refs_df.columns)}")
    
    for col in refs_df.columns:
        non_empty = (refs_df[col].astype(str).str.len() > 0).sum()
        pct = non_empty / len(refs_df) * 100
        print(f"  {col}: {non_empty}/{len(refs_df)} non-empty ({pct:.0f}%)")
    
    # Show sample from each category
    print("\n--- Sample References by Category ---")
    for cat in ['included', 'excluded', 'awaiting', 'ongoing']:
        cat_refs = [r for r in all_test_refs if r['category'] == cat]
        if cat_refs:
            r = cat_refs[0]
            print(f"\n[{cat.upper()}] {r['study_id']}")
            print(f"  Authors: {r['authors'][:60]}...")
            print(f"  Title: {r['title'][:60]}...")
            print(f"  DOI: {r['ref_doi']}")
            print(f"  PMID: {r['pmid']}")

# --- FORMAT VALIDATION ---
print("\n" + "=" * 70)
print("PDF FORMAT VALIDATION (Old vs New)")
print("=" * 70)

# Check for old format (no spaces: "CochraneDatabaseofSystematicReviews")
old_format = 0
new_format = 0
for pdf_path in test_pdfs[:20]:
    text = extract_text_fast(pdf_path, 0, 1)
    if 'CochraneDatabaseofSystematicReviews' in text:
        old_format += 1
    elif 'Cochrane Database of Systematic Reviews' in text:
        new_format += 1

print(f"\nFormat check (first 20 PDFs):")
print(f"  Old format (no spaces): {old_format}")
print(f"  New format (with spaces): {new_format}")

# --- TIME ESTIMATE ---
print("\n" + "=" * 70)
print("TIME ESTIMATE FOR FULL EXTRACTION")
print("=" * 70)
total_pdfs = len(pdf_files)
estimated_time = elapsed / 100 * total_pdfs
print(f"\nTotal PDFs: {total_pdfs:,}")
print(f"Estimated time: {estimated_time/60:.1f} minutes")
print(f"\n✓ VALIDATION COMPLETE - Ready for full extraction")
print("=" * 70)

COMPREHENSIVE VALIDATION

✓ Processed 100 PDFs in 5.7 seconds (57ms per PDF)

METADATA EXTRACTION VALIDATION

Fields extracted: ['doi', 'title', 'authors', 'abstract', 'review_type', 'cochrane_group']
  doi: 100/100 non-empty (100%)
  title: 100/100 non-empty (100%)
  authors: 97/100 non-empty (97%)
  abstract: 87/100 non-empty (87%)
  review_type: 100/100 non-empty (100%)
  cochrane_group: 93/100 non-empty (93%)

Review types distribution:
review_type
review       89
protocol      8
withdrawn     3

--- Sample Metadata (first PDF with title) ---
DOI: 10.1002/14651858.CD000004
Title: Abdominal decompression for suspected fetal compromise/pre-eclampsia (Review) Ho...
Type: review
Cochrane Group: Pregnancy and Childbirth
Abstract: Abdominal decompression was developed as a means of pain relief during labour. It has also been used for complications of pregnancy, and in healthy pr...

REFERENCE EXTRACTION VALIDATION

Extraction results: {'review': 86, 'protocol': 0, 'withdrawn': 0, 'no_ref

In [25]:
# =============================================================================
# MAIN EXTRACTION - All 16,588 PDFs
# =============================================================================
# Estimated time: ~15-20 minutes
# =============================================================================

print(f"Extracting from {len(pdf_files):,} PDFs...")
print("=" * 70)

start_time = time.time()
all_metadata = []
all_references = []
stats = Counter()
category_counts = Counter()

for i, pdf_path in enumerate(tqdm(pdf_files, desc="Extracting")):
    # Metadata
    meta = extract_metadata(pdf_path)
    all_metadata.append(meta)
    
    # References
    refs, rtype = extract_references_from_pdf(pdf_path)
    all_references.extend(refs)
    stats[rtype] += 1
    
    for ref in refs:
        category_counts[ref['category']] += 1
    
    # Progress every 2000
    if (i + 1) % 2000 == 0:
        elapsed = time.time() - start_time
        rate = (i + 1) / elapsed
        remaining = (len(pdf_files) - i - 1) / rate
        print(f"  {i+1:,} done | {rate:.1f}/sec | {remaining/60:.1f}min left | refs: {len(all_references):,}")

elapsed = time.time() - start_time
print(f"\n" + "=" * 70)
print(f"EXTRACTION COMPLETE")
print(f"=" * 70)
print(f"Time: {elapsed/60:.1f} minutes ({elapsed/len(pdf_files)*1000:.0f}ms per PDF)")
print(f"\nDocument types: {dict(stats)}")
print(f"Reference categories: {dict(category_counts)}")
print(f"Total references: {len(all_references):,}")

Extracting from 16,588 PDFs...


Extracting:   0%|          | 0/16588 [00:00<?, ?it/s]

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

  2,000 done | 14.7/sec | 16.5min left | refs: 72,723
MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

MuPDF error: syntax error: invalid key in dict

  4,000 done | 13.5/sec | 15.5min left | refs: 149,836
  6,000 done | 12.8/sec | 13.8min left | refs: 228,029
  8,000 done | 12.3/sec | 11.6min left | refs: 304,302
  10,000 done | 12.0/sec | 9.1min left | refs: 375,397
  12,000 done | 11.7/sec | 6.6min left | refs: 445,722
MuPDF error: syntax error: invalid key in dict

  14,000 done | 11.2/sec | 3.8min left | refs: 

In [26]:
# SAVE RESULTS
meta_df = pd.DataFrame(all_metadata)
refs_df = pd.DataFrame(all_references)

meta_df.to_csv(METADATA_CSV, index=False)
refs_df.to_csv(REFERENCES_CSV, index=False)

print(f"Saved {len(meta_df):,} metadata → {METADATA_CSV.name}")
print(f"Saved {len(refs_df):,} references → {REFERENCES_CSV.name}")

# Summary
print(f"\nReferences by category:")
if len(refs_df) > 0:
    print(refs_df['category'].value_counts())

Saved 16,588 metadata → review_metadata.csv
Saved 629,561 references → categorized_references.csv

References by category:
category
excluded    387426
included    210887
awaiting     22630
ongoing       8618
Name: count, dtype: int64
