# 1. Search and Screen Papers

This notebook guides you through:
1. Defining your research question and inclusion criteria
2. Searching PubMed for relevant papers
3. AI-assisted abstract screening
4. Tracking decisions for PRISMA reporting

In [None]:
# Setup - add parent directory to path
import sys
sys.path.insert(0, '..')

import pandas as pd
from pathlib import Path

## Step 1: Define Your Research Question

Use the PICO framework:
- **P**opulation: Who are you studying?
- **I**ntervention: What exposure/treatment?
- **C**omparison: What is the control condition?
- **O**utcome: What are you measuring?

In [None]:
# The question this review is trying to answer
RESEARCH_QUESTION = "What brain regions are activated during spatial decision-making in T-maze tasks?"

# PICO breakdown of the question, one entry per component
PICO = {
    "Population": "Healthy adult humans",
    "Intervention": "T-maze or spatial decision-making task",
    "Comparison": "Control condition or baseline",
    "Outcome": "Brain activation (fMRI coordinates)",
}

# Echo the question, then each PICO component on its own indented line
print("Research Question:", f"  {RESEARCH_QUESTION}", "\nPICO:", sep="\n")
for component, description in PICO.items():
    print(f"  {component}: {description}")

## Step 2: Define Inclusion/Exclusion Criteria

In [None]:
# A paper must satisfy every one of these to be included
INCLUSION_CRITERIA = [
    "Reports original fMRI or PET neuroimaging data",
    "Uses T-maze, spatial navigation, or decision-making task",
    "Reports activation coordinates in MNI or Talairach space",
    "Published in peer-reviewed journal",
    "Human participants",
    "Written in English",
]

# Any one of these is enough to exclude a paper
EXCLUSION_CRITERIA = [
    "Review articles or meta-analyses without original data",
    "Case studies with n < 5",
    "Only ROI analysis (no whole-brain coordinates)",
    "Clinical populations only (unless healthy control group)",
    "Animal studies",
]


def _print_numbered(heading, items):
    """Print ``heading`` followed by ``items`` as an indented list numbered from 1."""
    print(heading)
    for number, item in enumerate(items, start=1):
        print(f"  {number}. {item}")


_print_numbered("Inclusion Criteria:", INCLUSION_CRITERIA)
_print_numbered("\nExclusion Criteria:", EXCLUSION_CRITERIA)

## Step 3: Search PubMed

Build your search query and retrieve abstracts.

In [None]:
# PubMed query: one parenthesised OR-group per concept, AND-ed together below.
# NOTE(review): INCLUSION_CRITERIA also allows PET studies, but this query only
# matches fMRI terms — confirm that is intended.
SEARCH_TERMS = [
    # task / paradigm
    '("T-maze" OR "T maze" OR "spatial decision" OR "spatial navigation")',
    # imaging modality
    '(fMRI OR "functional MRI" OR "functional magnetic resonance")',
    # outcome measure
    '(activation OR BOLD OR "brain activity")',
]

SEARCH_QUERY = " AND ".join(SEARCH_TERMS)
print("Search Query:", SEARCH_QUERY, sep="\n")

In [None]:
# Search PubMed using Biopython
from Bio import Entrez

# NCBI requires a contact email for Entrez requests.
# The address below is only a placeholder — replace it with your own before
# running the search cells.
Entrez.email = "your-email@example.com"  # CHANGE THIS

def _first_year(text):
    """Return the first 4-digit token found in ``text`` as an int, or None."""
    # Splitting on "-" as well as whitespace handles ranges such as
    # "1998 Dec-1999 Jan".
    for token in str(text).replace("-", " ").split():
        if len(token) == 4 and token.isdecimal():
            return int(token)
    return None


def _publication_year(medline, article_data):
    """Best-effort publication year for one parsed PubMed record.

    Tries, in order: the journal issue's PubDate (Year, then the free-text
    MedlineDate), any ArticleDate entry, and finally DateCompleted. The last
    one is the date the citation was processed rather than published, so it
    is only used when nothing better is available.
    """
    pub_date = (
        article_data.get("Journal", {})
        .get("JournalIssue", {})
        .get("PubDate", {})
    )
    for field in ("Year", "MedlineDate"):
        if field in pub_date:
            year = _first_year(pub_date[field])
            if year is not None:
                return year

    for article_date in article_data.get("ArticleDate", []):
        year = _first_year(article_date.get("Year", ""))
        if year is not None:
            return year

    if "DateCompleted" in medline:
        return _first_year(medline["DateCompleted"].get("Year", ""))
    return None


def _parse_pubmed_article(article):
    """Flatten one ``PubmedArticle`` record into the dict used by this notebook."""
    medline = article["MedlineCitation"]
    article_data = medline["Article"]

    # Authors as "LastName Initials"; entries without a LastName are skipped.
    authors = []
    for author in article_data.get("AuthorList", []):
        if "LastName" in author:
            name = author["LastName"]
            if "Initials" in author:
                name += " " + author["Initials"]
            authors.append(name)

    # An abstract may be split into several AbstractText sections; join them.
    abstract = ""
    if "Abstract" in article_data:
        abstract_texts = article_data["Abstract"]["AbstractText"]
        if isinstance(abstract_texts, list):
            abstract = " ".join(str(t) for t in abstract_texts)
        else:
            abstract = str(abstract_texts)

    return {
        "pmid": str(medline["PMID"]),
        "title": str(article_data["ArticleTitle"]),
        # Show at most three authors, then "et al."
        "authors": "; ".join(authors[:3]) + (" et al." if len(authors) > 3 else ""),
        "year": _publication_year(medline, article_data),
        "abstract": abstract,
        "journal": str(article_data["Journal"]["Title"]),
    }


def search_pubmed(query, max_results=100):
    """Search PubMed and return paper metadata.

    Args:
        query: PubMed search string passed to ``Entrez.esearch``.
        max_results: Maximum number of PMIDs to request (``retmax``).

    Returns:
        A list of dicts with the keys ``pmid``, ``title``, ``authors``,
        ``year`` (int or None), ``abstract`` (empty string when the record
        has none) and ``journal``. The list is empty when nothing matches.
        Records that cannot be parsed are skipped and reported on stdout.
    """
    # Search for matching PMIDs; close the handle even if parsing fails.
    handle = Entrez.esearch(
        db="pubmed",
        term=query,
        retmax=max_results,
        sort="relevance"
    )
    try:
        results = Entrez.read(handle)
    finally:
        handle.close()

    pmids = results["IdList"]
    print(f"Found {len(pmids)} papers")

    if not pmids:
        return []

    # Fetch full records. For PubMed the XML format is selected with retmode,
    # not rettype.
    handle = Entrez.efetch(
        db="pubmed",
        id=",".join(pmids),
        retmode="xml"
    )
    try:
        records = Entrez.read(handle)
    finally:
        handle.close()

    papers = []
    for article in records["PubmedArticle"]:
        try:
            papers.append(_parse_pubmed_article(article))
        except (LookupError, TypeError, ValueError, AttributeError) as exc:
            # Keep the best-effort behaviour, but make dropped records visible
            # so the retrieved count can be reconciled with the found count.
            print(f"Warning: skipping PubMed record that could not be parsed: {exc!r}")

    return papers

# Run search
papers = search_pubmed(SEARCH_QUERY, max_results=50)

# Records without an abstract are still returned (with an empty "abstract"),
# so report them separately instead of claiming every paper has one.
papers_with_abstract = sum(1 for paper in papers if paper["abstract"])
print(f"\nRetrieved {len(papers)} papers ({papers_with_abstract} with abstracts)")

In [None]:
# View first few papers
# Name the columns explicitly so the frame keeps its schema even when the
# search returned no papers; an empty list would otherwise produce a frame
# with no columns and make the selection below raise KeyError.
PAPER_COLUMNS = ["pmid", "title", "authors", "year", "abstract", "journal"]
papers_df = pd.DataFrame(papers, columns=PAPER_COLUMNS)
papers_df[["pmid", "title", "year", "authors"]].head(10)

## Step 4: AI-Assisted Screening

Use Claude to screen abstracts against your criteria.

In [None]:
import os
from extraction.extractors.base_extractor import LLMProvider

# Initialize the LLM client used by the screening cells below.
# Make sure ANTHROPIC_API_KEY is set in the environment before running this
# cell (presumably LLMProvider reads it — confirm in
# extraction/extractors/base_extractor.py).
llm = LLMProvider(provider="anthropic")

# Prompt template for abstract screening, filled in with str.format().
# Placeholders: {question}, {inclusion}, {exclusion}, {title}, {year}, {abstract}.
# The doubled braces around the JSON example are str.format() escapes and
# render as single literal braces in the final prompt.
SCREENING_PROMPT = """You are screening abstracts for a meta-analysis.

Research Question: {question}

INCLUSION CRITERIA:
{inclusion}

EXCLUSION CRITERIA:
{exclusion}

Abstract to screen:
Title: {title}
Year: {year}
Abstract: {abstract}

Based on the abstract, determine:
1. INCLUDE, EXCLUDE, or UNCERTAIN
2. Which criteria are met/not met
3. Brief reasoning

Return JSON:
{{
    "decision": "INCLUDE" | "EXCLUDE" | "UNCERTAIN",
    "confidence": 0.0-1.0,
    "reasoning": "brief explanation",
    "criteria_met": ["list of met inclusion criteria"],
    "exclusion_reasons": ["list of exclusion reasons, if any"]
}}
"""

def screen_abstract(paper):
    """Screen a single abstract against the inclusion/exclusion criteria.

    Args:
        paper: Mapping with "title", "year" and "abstract" keys, such as the
            records returned by ``search_pubmed``.

    Returns:
        Whatever ``llm.extract`` returns for the filled-in screening prompt.
        When the paper has no abstract text, no LLM call is made and a dict
        with the same keys the prompt asks for is returned, with the decision
        set to "UNCERTAIN" so the paper is routed to manual review.
    """
    abstract = paper["abstract"]

    # PubMed records without an abstract reach this point with an empty
    # string. There is nothing to screen, so skip the paid API call and flag
    # the paper for manual review instead.
    if not str(abstract or "").strip():
        return {
            "decision": "UNCERTAIN",
            "confidence": 0.0,
            "reasoning": "No abstract available; manual review required",
            "criteria_met": [],
            "exclusion_reasons": [],
        }

    prompt = SCREENING_PROMPT.format(
        question=RESEARCH_QUESTION,
        inclusion="\n".join(f"- {c}" for c in INCLUSION_CRITERIA),
        exclusion="\n".join(f"- {c}" for c in EXCLUSION_CRITERIA),
        title=paper["title"],
        year=paper["year"],
        abstract=abstract
    )

    return llm.extract(abstract, prompt)

In [None]:
# Each screened paper triggers a paid API call.
# The demo therefore only looks at the first 5 search results.
from tqdm import tqdm

screening_results = []

for candidate in tqdm(papers[:5], desc="Screening"):
    try:
        verdict = screen_abstract(candidate)
        # Tag the verdict so it can be matched back to its paper later
        verdict["pmid"] = candidate["pmid"]
        verdict["title"] = candidate["title"]
    except Exception as err:
        # One failed paper should not abort the whole screening run
        print(f"Error screening {candidate['pmid']}: {err}")
    else:
        screening_results.append(verdict)

screening_df = pd.DataFrame(screening_results)
screening_df

## Step 5: Review and Export

Review screening decisions and export included papers.

In [None]:
# Tally the decisions, but only when at least one paper was screened
if screening_results:
    print("Screening Summary:")
    decision_counts = screening_df["decision"].value_counts()
    print(decision_counts)

    # Keep only the rows the screener marked for inclusion
    is_included = screening_df["decision"] == "INCLUDE"
    included = screening_df[is_included]
    print(f"\nIncluded: {len(included)} papers")

In [None]:
# Write the search results (and the screening results, if any) as CSV files
output_dir = Path("../data")
output_dir.mkdir(exist_ok=True)

exports = [("search_results.csv", papers_df)]
if screening_results:
    # Screening output exists only when at least one paper was screened
    exports.append(("screening_results.csv", screening_df))

for filename, frame in exports:
    frame.to_csv(output_dir / filename, index=False)

print(f"Saved to {output_dir}")

## Next Steps

1. Review UNCERTAIN papers manually
2. Get full-text PDFs for papers marked INCLUDE
3. Proceed to notebook 02 for data extraction