# 04: Build Ground Truth Validation Dataset

## Summary
This notebook builds the ground truth validation dataset for LLM evaluation. It uses the categorized references extracted from Cochrane PDFs to create properly labeled examples:
- **Included studies** → Positive examples (label=1)
- **Excluded studies** → Negative examples (label=0)

Unlike the previous flawed approach using PubMed references, this dataset has verified labels based on actual Cochrane reviewer decisions.

**Pipeline Position:** Fifth notebook - creates the evaluation dataset.

**What this notebook does:**
1. Loads categorized references from previous extraction
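2. Matches study IDs to PubMed IDs for abstract retrieval (heuristic: the first PubMed hit for the first word of the study ID as author plus the four-digit year)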
3. Fetches abstracts for matched studies
4. Creates a balanced validation set by undersampling the majority class
5. Saves ground truth CSV with labels

**Input:** `Data/categorized_references.csv`, `Data/cochrane_pubmed_abstracts.csv`

**Output:** `Data/ground_truth_validation_set.csv`

In [None]:
# Install required packages
%pip install -q biopython pandas python-dotenv

In [None]:
# Set up environment and load data
import os
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd
from Bio import Entrez
import time
import re

# Locate the project root: the working directory if it holds a .env file,
# otherwise its parent directory.
notebook_dir = Path.cwd()
project_root = notebook_dir if (notebook_dir / ".env").exists() else notebook_dir.parent
env_path = project_root / ".env"
load_dotenv(env_path, override=True)

# NCBI credentials come from the environment. Unset or blank values are stored
# as None rather than "" so that a missing address is visible instead of being
# sent to NCBI as an empty `email` parameter.
Entrez.email = os.getenv("NCBI_EMAIL") or None
Entrez.api_key = os.getenv("NCBI_API_KEY") or None
if Entrez.email is None:
    print("Warning: NCBI_EMAIL is not set; add it to .env before querying PubMed")

# Input and output locations, all relative to the project's Data directory
DATA_DIR = project_root / "Data"
REFS_CSV = DATA_DIR / "categorized_references.csv"
ABSTRACTS_CSV = DATA_DIR / "cochrane_pubmed_abstracts.csv"
OUTPUT_CSV = DATA_DIR / "ground_truth_validation_set.csv"

# Load the categorized references and show how many fall into each category
print("Loading categorized references...")
refs = pd.read_csv(REFS_CSV)
print(f"Total references: {len(refs):,}")
print(refs['category'].value_counts())

In [None]:
# Define function to search PubMed for study by author and year
def search_pubmed_for_study(study_id: str) -> list:
    """Search PubMed for a study identified by an "Author Year" style ID.

    The first whitespace-separated token of ``study_id`` is used as the author
    and the first run of four digits anywhere in the ID as the publication year.

    Args:
        study_id: Study identifier such as ``"Smith 2005"``.

    Returns:
        Up to five PMIDs as plain strings, in the order PubMed returned them.
        An empty list is returned when ``study_id`` is not a string (for
        example a missing value), has fewer than two tokens, contains no
        four-digit number, or when the PubMed request fails.
    """
    # Missing values read from a CSV arrive as float NaN, which has no .split()
    if not isinstance(study_id, str):
        return []

    parts = study_id.split()
    if len(parts) < 2:
        return []

    author = parts[0]
    year_match = re.search(r'\d{4}', study_id)
    if not year_match:
        return []
    year = year_match.group()

    query = f"{author}[Author] AND {year}[Date - Publication]"

    try:
        handle = Entrez.esearch(db="pubmed", term=query, retmax=5)
        try:
            results = Entrez.read(handle)
        finally:
            # Release the connection even when parsing the response fails
            handle.close()
    except Exception as exc:
        # Best effort: a failed lookup counts as "no match", but it is reported
        # so that network or rate-limit problems are not mistaken for misses.
        print(f"PubMed search failed for {study_id!r}: {exc}")
        return []

    # Convert to plain strings so callers get ordinary str values
    return [str(pmid) for pmid in results.get('IdList', [])]

# Smoke test: look up the first study ID in the reference table
sample_study = refs['study_id'].iat[0]
print(f"Testing search for: {sample_study}")
pmids = search_pubmed_for_study(sample_study)
print(f"Found PMIDs: {pmids}")

In [None]:
# Match study IDs to PMIDs (sample for efficiency)
import numpy as np
from tqdm.auto import tqdm  # notebook widget when available, plain text bar otherwise

# Filter to included and excluded only (skip awaiting)
refs_for_matching = refs[refs['category'].isin(['included', 'excluded'])].copy()
# Drop missing IDs: a NaN study_id cannot be turned into an author/year query
unique_studies = refs_for_matching['study_id'].dropna().unique()

print(f"Unique studies to match: {len(unique_studies):,}")

# Sample for manageable processing (adjust as needed)
MAX_STUDIES = 2000
if len(unique_studies) > MAX_STUDIES:
    # Fixed seed so the same subset is drawn on every run
    np.random.seed(42)
    unique_studies = np.random.choice(unique_studies, MAX_STUDIES, replace=False)
    print(f"Sampled to {MAX_STUDIES} studies")

# Search PubMed for each study
study_to_pmid = {}
for study_id in tqdm(unique_studies, desc="Matching to PubMed"):
    pmids = search_pubmed_for_study(study_id)
    if pmids:
        # Heuristic: keep only the first hit returned for the author/year query
        study_to_pmid[study_id] = pmids[0]
    time.sleep(0.4)  # Rate limiting

# Guard the percentage against an empty study list
match_rate = 100 * len(study_to_pmid) / len(unique_studies) if len(unique_studies) else 0.0
print(f"\nMatched {len(study_to_pmid):,} studies to PMIDs ({match_rate:.1f}%)")

In [None]:
# Fetch abstracts for matched studies
from io import StringIO
from Bio import Medline

def fetch_abstracts(pmids: list, batch_size: int = 50) -> dict:
    """Fetch title, abstract and publication year from PubMed for a list of PMIDs.

    Records are requested in MEDLINE text format, ``batch_size`` PMIDs per
    request, with a half-second pause after every request.

    Args:
        pmids: PMIDs as strings. Duplicates are requested only once.
        batch_size: Number of PMIDs sent per efetch request.

    Returns:
        Dict mapping each returned PMID to a dict with the keys ``'title'``,
        ``'abstract'`` and ``'year'``. Fields missing from a record are empty
        strings; ``'year'`` is the first whitespace-separated token of the
        record's ``DP`` field. A batch whose request fails is reported with a
        printed message and skipped.
    """
    abstracts = {}

    # The same PMID can be passed more than once; the result is keyed by PMID,
    # so each one only needs to be requested a single time.
    unique_pmids = list(dict.fromkeys(pmids))

    for i in range(0, len(unique_pmids), batch_size):
        batch = unique_pmids[i:i+batch_size]
        try:
            handle = Entrez.efetch(db="pubmed", id=",".join(batch), rettype="medline", retmode="text")
            try:
                text = handle.read()
            finally:
                # Release the connection even when reading the response fails
                handle.close()

            for record in Medline.parse(StringIO(text)):
                pmid = record.get("PMID", "")
                if not pmid:
                    # A record without a PMID cannot be matched back to a study
                    continue
                # A blank or whitespace-only DP yields no tokens; indexing it
                # directly would raise and abort the rest of the batch.
                date_tokens = (record.get("DP") or "").split()
                abstracts[pmid] = {
                    'title': record.get("TI", ""),
                    'abstract': record.get("AB", ""),
                    'year': date_tokens[0] if date_tokens else ""
                }
        except Exception as e:
            print(f"Error fetching batch: {e}")

        time.sleep(0.5)

    return abstracts

# Download the PubMed record of every PMID matched in the previous step
pmids_to_fetch = [*study_to_pmid.values()]
print(f"Fetching abstracts for {len(pmids_to_fetch)} studies...")
abstract_data = fetch_abstracts(pmids_to_fetch)
print(f"Retrieved {len(abstract_data)} abstracts")

In [None]:
# Build ground truth dataset with labels
GROUND_TRUTH_COLUMNS = [
    'study_pmid', 'study_id', 'review_doi', 'title',
    'abstract', 'year', 'category', 'label',
]
LABEL_BY_CATEGORY = {'included': 1, 'excluded': 0}

# Keep the first reference row of every study ID so that the category and the
# review DOI of a study come from the same row. Building the lookups once
# replaces a full scan of the reference table for every matched study.
first_refs = refs_for_matching.drop_duplicates(subset='study_id').set_index('study_id')
study_categories = first_refs['category'].to_dict()
study_reviews = first_refs['review_doi'].to_dict()

ground_truth_rows = []
for study_id, pmid in study_to_pmid.items():
    info = abstract_data.get(pmid)
    if not info or not info.get('abstract'):  # Skip if no record or no abstract
        continue

    # A study ID that is no longer in the reference table has no verified
    # category; skip it rather than labelling it as excluded.
    category = study_categories.get(study_id, 'unknown')
    if category not in LABEL_BY_CATEGORY:
        continue

    ground_truth_rows.append({
        'study_pmid': pmid,
        'study_id': study_id,
        'review_doi': study_reviews.get(study_id),  # review DOI for context
        'title': info['title'],
        'abstract': info['abstract'],
        'year': info['year'],
        'category': category,
        'label': LABEL_BY_CATEGORY[category]
    })

# Explicit columns keep the frame well-formed even when no row qualified
ground_truth = pd.DataFrame(ground_truth_rows, columns=GROUND_TRUTH_COLUMNS)
print(f"Ground truth dataset: {len(ground_truth):,} examples")
print(ground_truth['category'].value_counts())

In [None]:
# Split the examples by label so the two class sizes can be compared
included = ground_truth.loc[ground_truth['label'] == 1]
excluded = ground_truth.loc[ground_truth['label'] == 0]

print(f"Before balancing: {len(included)} included, {len(excluded)} excluded")

# Each class is cut down to the size of the smaller one
min_count = min(len(included), len(excluded))
if len(included) == len(excluded):
    # Already balanced: keep the dataset as it is (no resampling, no shuffle)
    ground_truth_balanced = ground_truth
else:
    included_sample = included.sample(n=min_count, random_state=42)
    excluded_sample = excluded.sample(n=min_count, random_state=42)
    # Concatenate both classes, then shuffle the rows with a fixed seed
    ground_truth_balanced = pd.concat([included_sample, excluded_sample]).sample(frac=1, random_state=42)

print(f"After balancing: {len(ground_truth_balanced)} total examples")
print(ground_truth_balanced['category'].value_counts())

In [None]:
# Write the balanced dataset to disk without the DataFrame index
ground_truth_balanced.to_csv(OUTPUT_CSV, index=False)
print(f"Saved ground truth to {OUTPUT_CSV}")

# Summary: total size, per-label counts and number of distinct reviews
print()
print("=" * 60)
print("GROUND TRUTH SUMMARY")
print("=" * 60)
print(f"Total examples: {len(ground_truth_balanced):,}")
print(f"  Included (label=1): {ground_truth_balanced['label'].eq(1).sum():,}")
print(f"  Excluded (label=0): {ground_truth_balanced['label'].eq(0).sum():,}")
print(f"Unique reviews: {ground_truth_balanced['review_doi'].nunique()}")
print("\nNext step: Run notebook 05 to evaluate local LLMs")