# Build Ground Truth Dataset for LLM Screening Evaluation

**Objective:** Create an annotated validation dataset with:
- **Positive examples (included):** Papers that appear in a Cochrane review's references
- **Negative examples (excluded):** Papers that could have been considered but were NOT included

## Strategy for Negative Sampling
For each Cochrane review, we'll use papers that were included in *other* reviews on similar topics, but NOT in this review. Two reviews count as "similar" when they share at least one referenced paper.

This simulates realistic "near-miss" papers that reviewers would have considered.

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import defaultdict
import re

# Paths
# Use the "Data" folder inside the working directory when it exists;
# otherwise fall back to the "Data" folder one level above it.
if (Path.cwd() / "Data").exists():
    DATA_DIR = Path.cwd() / "Data"
else:
    DATA_DIR = Path.cwd().parent / "Data"

# Input tables and the output file all live directly under DATA_DIR.
ABSTRACTS_CSV = DATA_DIR.joinpath("cochrane_pubmed_abstracts.csv")
REFERENCES_CSV = DATA_DIR.joinpath("cochrane_pubmed_references.csv")
REF_ABSTRACTS_CSV = DATA_DIR.joinpath("referenced_paper_abstracts.csv")
OUTPUT_CSV = DATA_DIR.joinpath("ground_truth_validation_set.csv")

print(f"Data directory: {DATA_DIR}")

Data directory: c:\Users\juanx\Documents\LSE-UKHSA Project\Data


In [2]:
# Load all datasets
# The two paper-level tables share one dtype spec; the edge table has its own.
# Every listed column is read as str.
_paper_dtypes = {"pmid": str, "year": str}
_edge_dtypes = {"citing_pmid": str, "ref_pmid": str}

print("Loading datasets...")
cochrane = pd.read_csv(ABSTRACTS_CSV, dtype=_paper_dtypes)
refs = pd.read_csv(REFERENCES_CSV, dtype=_edge_dtypes)
ref_abstracts = pd.read_csv(REF_ABSTRACTS_CSV, dtype=_paper_dtypes)

print(
    f"Cochrane reviews: {len(cochrane):,}",
    f"Reference edges: {len(refs):,}",
    f"Referenced paper abstracts: {len(ref_abstracts):,}",
    sep="\n",
)

Loading datasets...
Cochrane reviews: 17,092
Reference edges: 1,182,678
Referenced paper abstracts: 490,929


In [3]:
# Build review -> included papers mapping
# An edge is usable only when its cited-paper PMID is neither missing nor "".
_has_ref_pmid = refs["ref_pmid"].notna() & refs["ref_pmid"].ne("")
refs_valid = refs.loc[_has_ref_pmid].copy()
print(f"Valid reference edges (with PMID): {len(refs_valid):,}")

# Forward index: review PMID -> set of PMIDs of the papers it references
review_to_included = (
    refs_valid
    .groupby("citing_pmid")["ref_pmid"]
    .apply(set)
    .to_dict()
)
print(f"Reviews with valid references: {len(review_to_included):,}")

# Reverse index: paper PMID -> set of PMIDs of the reviews referencing it
paper_to_reviews = defaultdict(set)
for review, papers in review_to_included.items():
    for paper in papers:
        paper_to_reviews[paper].add(review)

print(f"Unique papers in reference graph: {len(paper_to_reviews):,}")

Valid reference edges (with PMID): 848,607
Reviews with valid references: 10,077
Unique papers in reference graph: 491,531


In [4]:
# Filter to papers that have abstracts available
# "Available" means the abstract cell is neither missing nor an empty string.
_abstract_text = ref_abstracts["abstract"]
_has_abstract = _abstract_text.notna() & _abstract_text.ne("")
papers_with_abstracts = set(ref_abstracts.loc[_has_abstract, "pmid"])
print(f"Papers with abstracts: {len(papers_with_abstracts):,}")

# Restrict every review's paper set to the papers that have an abstract
review_to_included_filtered = {}
for review, papers in review_to_included.items():
    review_to_included_filtered[review] = papers & papers_with_abstracts

# A review is eligible once at least MIN_INCLUDED of its papers survive
MIN_INCLUDED = 5
eligible_reviews = {
    pmid: kept
    for pmid, kept in review_to_included_filtered.items()
    if len(kept) >= MIN_INCLUDED
}
print(f"Reviews with >= {MIN_INCLUDED} included papers (with abstracts): {len(eligible_reviews):,}")

Papers with abstracts: 443,418
Reviews with >= 5 included papers (with abstracts): 10,012


## Extract Inclusion/Exclusion Criteria from Review Abstracts

Cochrane abstracts often follow a structured format with sections like:
- OBJECTIVES / RATIONALE
- SELECTION CRITERIA / TYPES OF STUDIES / PARTICIPANTS / INTERVENTIONS
- SEARCH METHODS
- DATA COLLECTION AND ANALYSIS
- MAIN RESULTS
- AUTHORS' CONCLUSIONS

We'll extract the criteria-related sections.

In [5]:
def extract_criteria_from_abstract(abstract: str) -> dict:
    """
    Extract the objectives and selection-criteria sections of a Cochrane abstract.

    Section headers are located case-insensitively, directly in the original
    text, so the returned snippets keep the abstract's own casing and the
    match offsets always line up with the string being sliced.

    Args:
        abstract: Abstract text. Any value that is not a non-empty string
            (None, NaN, pd.NA, "") produces an all-empty result.

    Returns:
        dict with three string values:
        - 'objectives': text from the first OBJECTIVE(S) / RATIONALE /
          BACKGROUND header up to the next criteria/search/data-collection
          header (or the end of the abstract); "" when no header is found.
        - 'criteria': text from the first SELECTION CRITERIA / ELIGIBILITY
          CRITERIA / TYPES OF STUDIES / INCLUSION CRITERIA header up to the
          next SEARCH METHODS / DATA COLLECTION / MAIN RESULTS / AUTHORS
          marker (or the end of the abstract); "" when no header is found.
        - 'full_context': the unmodified abstract.
    """
    # Missing values arrive as None, float NaN or pd.NA; none of them is a str.
    if not isinstance(abstract, str) or not abstract:
        return {"objectives": "", "criteria": "", "full_context": ""}

    result = {"objectives": "", "criteria": "", "full_context": abstract}

    # Match on the original string with IGNORECASE rather than on
    # abstract.upper(): upper-casing can change the string length
    # (e.g. "ß" -> "SS"), which would shift the offsets used for slicing.
    flags = re.DOTALL | re.IGNORECASE

    # Find SELECTION CRITERIA or similar
    criteria_match = re.search(
        r"(SELECTION CRITERIA|ELIGIBILITY CRITERIA|TYPES OF STUDIES|INCLUSION CRITERIA)[:\s]*(.*?)(?=(SEARCH METHODS|DATA COLLECTION|MAIN RESULTS|AUTHORS|$))",
        abstract, flags
    )
    if criteria_match:
        result["criteria"] = criteria_match.group(0).strip()

    # Find OBJECTIVES/BACKGROUND
    obj_match = re.search(
        r"(OBJECTIVE[S]?|RATIONALE|BACKGROUND)[:\s]*(.*?)(?=(SELECTION CRITERIA|SEARCH METHODS|ELIGIBILITY|TYPES OF|DATA COLLECTION|$))",
        abstract, flags
    )
    if obj_match:
        result["objectives"] = obj_match.group(0).strip()

    return result


# Test on a sample
# Take the abstract of the row at position 4 and show what the extractor
# returns for it.
sample_abstract = cochrane["abstract"].iloc[4]
print("Sample abstract:", sample_abstract[:500], "\n" + "="*60, sep="\n")

extracted = extract_criteria_from_abstract(sample_abstract)
print(
    f"\nExtracted objectives: {extracted['objectives'][:300]}...",
    f"\nExtracted criteria: {extracted['criteria'][:300]}...",
    sep="\n",
)

Sample abstract:
RATIONALE: Depression is a common cause of morbidity and mortality worldwide. Depression is often treated with antidepressants or psychological therapy, or both, but some people may prefer alternative approaches such as exercise. This review updates one first published in 2008 and last updated in 2013. OBJECTIVES: To determine the effectiveness of exercise in the treatment of depression in adults compared with no intervention, waiting list control or placebo, or where exercise is used as an adju


Extracted objectives: RATIONALE: Depression is a common cause of morbidity and mortality worldwide. Depression is often treated with antidepressants or psychological therapy, or both, but some people may prefer alternative approaches such as exercise. This review updates one first published in 2008 and last updated in 20...

Extracted criteria: ELIGIBILITY CRITERIA: We included randomised controlled trials (RCTs) in which exercise was compared to no treatment, inactive treatm

In [6]:
# Extract criteria for all eligible reviews
# Row lookup keyed by review PMID: pmid -> {column name: value}
cochrane_lookup = cochrane.set_index("pmid").to_dict("index")

review_criteria = {}
for review_pmid in eligible_reviews:
    # Eligible reviews without a row in the Cochrane table are skipped
    if review_pmid not in cochrane_lookup:
        continue
    review_data = cochrane_lookup[review_pmid]
    criteria = extract_criteria_from_abstract(review_data.get("abstract", ""))
    review_criteria[review_pmid] = {
        "title": review_data.get("title", ""),
        "objectives": criteria["objectives"],
        "criteria": criteria["criteria"],
        "full_abstract": review_data.get("abstract", ""),
    }

print(f"Extracted criteria for {len(review_criteria):,} reviews")

# Check how many have parseable criteria
# (a non-empty "criteria" string means the extractor found a section)
has_criteria = sum(bool(entry["criteria"]) for entry in review_criteria.values())
print(f"Reviews with parseable criteria section: {has_criteria:,} ({100*has_criteria/len(review_criteria):.1f}%)")

Extracted criteria for 10,012 reviews
Reviews with parseable criteria section: 9,659 (96.5%)


## Generate Validation Set with Positive and Negative Examples

In [7]:
import random
# Seed both random number generators
random.seed(42)
np.random.seed(42)

# Parameters
N_REVIEWS = 100  # Number of reviews to sample
POSITIVES_PER_REVIEW = 5  # Included papers per review
NEGATIVES_PER_REVIEW = 5  # Excluded papers per review

# Keep eligible reviews for which a non-empty criteria section was extracted
reviews_with_criteria = [
    pmid
    for pmid in eligible_reviews
    if pmid in review_criteria and review_criteria[pmid]["criteria"]
]
print(f"Eligible reviews with criteria: {len(reviews_with_criteria):,}")

# Of those, keep reviews with enough included papers to draw the positives
reviews_with_enough = [
    pmid
    for pmid in reviews_with_criteria
    if len(eligible_reviews[pmid]) >= POSITIVES_PER_REVIEW
]
print(f"Reviews with >= {POSITIVES_PER_REVIEW} included papers: {len(reviews_with_enough):,}")

# Draw at most N_REVIEWS reviews (all of them when fewer are available)
_n_to_draw = min(N_REVIEWS, len(reviews_with_enough))
sampled_reviews = random.sample(reviews_with_enough, _n_to_draw)
print(f"Sampled {len(sampled_reviews)} reviews for validation set")

Eligible reviews with criteria: 9,659
Reviews with >= 5 included papers: 9,659
Sampled 100 reviews for validation set


In [8]:
# Build paper abstract lookup
# Maps each PMID to a dict of that paper's remaining columns.
paper_lookup = ref_abstracts.set_index("pmid").to_dict(orient="index")

def get_negative_candidates(review_pmid: str, included_papers: set, all_papers: set) -> list:
    """
    Get candidate negative papers for a review.

    Strategy: papers included in OTHER reviews but not in this one.
    These are realistic "near-miss" papers. A review counts as related when
    it shares at least one paper with `included_papers`.

    Reads the module-level `paper_to_reviews` (paper -> reviews) and
    `review_to_included_filtered` (review -> papers) mappings.

    Args:
        review_pmid: PMID of the review the negatives are generated for.
        included_papers: PMIDs of the papers included in that review.
        all_papers: PMIDs a candidate must belong to. The callers in this
            notebook pass the set of papers that have an abstract.

    Returns:
        Sorted list of candidate PMIDs. Sorting makes the order stable
        across interpreter sessions, since set iteration order for strings
        is not, so that seeded sampling from the result is reproducible.
    """
    included_papers = set(included_papers)

    # Collect each related review once, instead of re-merging its paper set
    # for every paper it shares with this review.
    related_reviews = set()
    for paper in included_papers:
        related_reviews |= paper_to_reviews.get(paper, set())
    related_reviews.discard(review_pmid)

    # Pool the papers of all related reviews
    related_papers = set()
    for other_review in related_reviews:
        related_papers |= review_to_included_filtered.get(other_review, set())

    # Drop this review's own papers, then keep only the permitted papers
    negative_candidates = (related_papers - included_papers).intersection(all_papers)

    return sorted(negative_candidates)

# Test
# Smoke-check the negative sampler on the first sampled review.
test_review = sampled_reviews[0]
test_included = eligible_reviews[test_review]
test_negatives = get_negative_candidates(test_review, test_included, papers_with_abstracts)
print(
    f"Test review {test_review}:",
    f"  Included papers: {len(test_included)}",
    f"  Negative candidates: {len(test_negatives)}",
    sep="\n",
)

Test review 21678351:
  Included papers: 78
  Negative candidates: 57924


In [9]:
# Generate the validation set
def _make_validation_record(review_pmid, criteria_data, paper_pmid, paper, label):
    """Build one validation row pairing a review with a candidate paper."""
    return {
        "review_pmid": review_pmid,
        "review_title": criteria_data["title"],
        "review_objectives": criteria_data["objectives"],
        "review_criteria": criteria_data["criteria"],
        "paper_pmid": paper_pmid,
        "paper_title": paper.get("title", ""),
        "paper_abstract": paper.get("abstract", ""),
        "label": label,
    }


validation_records = []

for review_pmid in sampled_reviews:
    # Sort before sampling: iteration order of a set of strings differs
    # between interpreter sessions (hash randomization), so sampling from
    # list(set) is not reproducible even with a fixed random seed.
    included = sorted(eligible_reviews[review_pmid])
    criteria_data = review_criteria[review_pmid]

    # Sample positive examples
    pos_sample = random.sample(included, min(POSITIVES_PER_REVIEW, len(included)))

    # Get and sample negative examples (sorted for the same reason as above)
    neg_candidates = sorted(
        get_negative_candidates(review_pmid, set(included), papers_with_abstracts)
    )
    neg_sample = random.sample(neg_candidates, min(NEGATIVES_PER_REVIEW, len(neg_candidates)))

    # Create records: label 1 = included, label 0 = excluded.
    # Papers without a row in paper_lookup are skipped.
    for label, sample in ((1, pos_sample), (0, neg_sample)):
        for paper_pmid in sample:
            if paper_pmid not in paper_lookup:
                continue
            paper = paper_lookup[paper_pmid]
            validation_records.append(
                _make_validation_record(review_pmid, criteria_data, paper_pmid, paper, label)
            )

print(f"Generated {len(validation_records):,} validation records")

Generated 1,000 validation records


In [10]:
# Convert to DataFrame and analyze
validation_df = pd.DataFrame(validation_records)

# Headline counts
print(
    "Validation Set Summary:",
    f"  Total records: {len(validation_df):,}",
    f"  Unique reviews: {validation_df['review_pmid'].nunique()}",
    f"  Unique papers: {validation_df['paper_pmid'].nunique()}",
    f"\nLabel distribution:",
    sep="\n",
)

# Per-label counts, shown with readable names instead of 1 / 0
_label_names = {1: "Included", 0: "Excluded"}
print(validation_df["label"].value_counts().rename(_label_names))

# The mean of the 0/1 label column is the share of positives
print(f"\nBalance: {validation_df['label'].mean():.1%} positive")

Validation Set Summary:
  Total records: 1,000
  Unique reviews: 100
  Unique papers: 993

Label distribution:
label
Included    500
Excluded    500
Name: count, dtype: int64

Balance: 50.0% positive


In [11]:
# Save to CSV
# Write the table without the index column, then report where it went and
# how large the resulting file is.
validation_df.to_csv(OUTPUT_CSV, index=False)
print(f"Saved validation set to: {OUTPUT_CSV}")

_size_mb = OUTPUT_CSV.stat().st_size / 1024 / 1024
print(f"File size: {_size_mb:.2f} MB")

Saved validation set to: c:\Users\juanx\Documents\LSE-UKHSA Project\Data\ground_truth_validation_set.csv
File size: 2.89 MB


In [12]:
# Preview sample records
def _print_record_preview(record):
    """Print truncated review/paper fields of one validation row."""
    print(f"Review: {record['review_title'][:80]}...")
    print(f"Criteria: {record['review_criteria'][:200]}...")
    print(f"Paper: {record['paper_title']}")
    print(f"Abstract: {record['paper_abstract'][:200]}...")


# First row labelled 1 (included)
print("Sample INCLUDED record:")
sample_pos = validation_df.loc[validation_df["label"].eq(1)].iloc[0]
_print_record_preview(sample_pos)

# First row labelled 0 (excluded)
print("\n" + "="*60)
print("\nSample EXCLUDED record:")
sample_neg = validation_df.loc[validation_df["label"].eq(0)].iloc[0]
_print_record_preview(sample_neg)

Sample INCLUDED record:
Review: Evaluation of follow-up strategies for patients with epithelial ovarian cancer f...
Criteria: SELECTION CRITERIA: All relevant randomised controlled trials (RCTs) that evaluated follow-up strategies for patients with epithelial ovarian cancer following completion of primary treatment....
Paper: A critical evaluation of current protocols for the follow-up of women treated for gynecological malignancies: a pilot study.
Abstract: This retrospective review was undertaken to determine the efficacy of routine follow-up in the detection and management of recurrent cancer. The case notes of all women attending a regional cancer cen...


Sample EXCLUDED record:
Review: Evaluation of follow-up strategies for patients with epithelial ovarian cancer f...
Criteria: SELECTION CRITERIA: All relevant randomised controlled trials (RCTs) that evaluated follow-up strategies for patients with epithelial ovarian cancer following completion of primary treatment....
Paper: Est