# 1. Search and Screen Papers

This notebook guides you through:
1. Defining your research question and inclusion criteria
2. **Searching multiple databases** (PubMed, Google Scholar, Semantic Scholar, Scopus)
3. AI-assisted abstract screening
4. Tracking decisions for PRISMA reporting

## Supported Search Engines
- **PubMed** - Free, requires email only
- **Google Scholar** - Via SerpAPI (requires API key) or scholarly (free, rate-limited)
- **Semantic Scholar** - Free API with optional key for higher limits
- **Scopus** - Requires institutional API key
- **Web of Science** - Requires institutional access (listed for reference; this notebook does not implement a Web of Science engine)

In [None]:
# Setup - add parent directory to path
import sys
sys.path.insert(0, '..')

import os
import json
import time
import pandas as pd
from pathlib import Path
from abc import ABC, abstractmethod
from typing import List, Dict, Optional, Any
from dataclasses import dataclass, field, asdict
from datetime import datetime

## Configuration: API Keys & Credentials

Set your credentials in the cell below, or load them from environment variables or a JSON file. Never commit real credentials to version control.

In [None]:
@dataclass
class SearchCredentials:
    """Store credentials for various search engines.

    Every field has a default, so an instance can be created with only the
    values you actually have; each engine checks the fields it needs.
    """

    # PubMed (required: email)
    pubmed_email: str = ""
    pubmed_api_key: Optional[str] = None  # Optional, increases rate limit

    # Google Scholar via SerpAPI
    serpapi_key: Optional[str] = None

    # Semantic Scholar
    semantic_scholar_key: Optional[str] = None

    # Scopus (Elsevier)
    scopus_api_key: Optional[str] = None
    scopus_inst_token: Optional[str] = None

    # Institutional proxy (e.g., Rutgers)
    proxy_url: Optional[str] = None
    proxy_username: Optional[str] = None
    proxy_password: Optional[str] = None

    @classmethod
    def from_env(cls):
        """Load credentials from environment variables.

        Unset variables leave the corresponding field at ``None``
        (``pubmed_email`` falls back to the empty string).
        """
        return cls(
            pubmed_email=os.getenv("PUBMED_EMAIL", ""),
            pubmed_api_key=os.getenv("PUBMED_API_KEY"),
            serpapi_key=os.getenv("SERPAPI_KEY"),
            semantic_scholar_key=os.getenv("SEMANTIC_SCHOLAR_KEY"),
            scopus_api_key=os.getenv("SCOPUS_API_KEY"),
            scopus_inst_token=os.getenv("SCOPUS_INST_TOKEN"),
            proxy_url=os.getenv("LIBRARY_PROXY_URL"),
            proxy_username=os.getenv("LIBRARY_PROXY_USER"),
            proxy_password=os.getenv("LIBRARY_PROXY_PASS")
        )

    @classmethod
    def from_file(cls, path: str):
        """Load credentials from a JSON file.

        Args:
            path: Location of the JSON file. A leading ``~`` is expanded to
                the user's home directory (``open`` does not do this itself).

        Raises:
            OSError: The file cannot be opened.
            json.JSONDecodeError: The file is not valid JSON.
            TypeError: The JSON contains a key that is not a credential field.
        """
        with open(os.path.expanduser(path), encoding="utf-8") as f:
            data = json.load(f)
        return cls(**data)

    def save(self, path: str):
        """Save credentials to JSON file (be careful with secrets!).

        Args:
            path: Destination file; a leading ``~`` is expanded so the same
                path string works with both ``save`` and ``from_file``.
        """
        with open(os.path.expanduser(path), 'w', encoding="utf-8") as f:
            json.dump(asdict(self), f, indent=2)


# Load credentials - EDIT THIS SECTION
# Option 1: Set directly (placeholders only - never commit real account names or keys)
credentials = SearchCredentials(
    pubmed_email="your-email@example.com",  # REQUIRED for PubMed
    # pubmed_api_key="your-key",            # Optional: higher rate limits
    # serpapi_key="your-serpapi-key",       # For Google Scholar
    # semantic_scholar_key="your-s2-key",   # Optional: higher limits

    # Rutgers proxy example
    proxy_url="https://www.libraries.rutgers.edu/proxy",
    # Your NetID: read from the environment (same variable from_env uses) so
    # no personal account name is stored in the notebook itself.
    proxy_username=os.getenv("LIBRARY_PROXY_USER"),
)

# Option 2: Load from environment
# credentials = SearchCredentials.from_env()

# Option 3: Load from file (don't commit this file!)
# credentials = SearchCredentials.from_file(os.path.expanduser("~/.meta_analysis_credentials.json"))

# Report only whether each credential is present - never print the values themselves
print("Credentials configured:")
print(f"  PubMed email: {'✓' if credentials.pubmed_email else '✗'}")
print(f"  PubMed API key: {'✓' if credentials.pubmed_api_key else '✗ (optional)'}")
print(f"  SerpAPI (Google Scholar): {'✓' if credentials.serpapi_key else '✗'}")
print(f"  Semantic Scholar key: {'✓' if credentials.semantic_scholar_key else '✗ (optional)'}")
print(f"  Scopus API: {'✓' if credentials.scopus_api_key else '✗'}")
print(f"  Library proxy: {'✓' if credentials.proxy_url else '✗'}")

## Step 1: Define Your Research Question

Use the PICO framework:
- **P**opulation: Who are you studying?
- **I**ntervention: What exposure/treatment?
- **C**omparison: What is the control condition?
- **O**utcome: What are you measuring?

In [None]:
# The question that the search, the screening prompt and the saved metadata all refer to
RESEARCH_QUESTION = (
    "What brain regions are activated during spatial decision-making in T-maze tasks?"
)

# PICO breakdown of the question (insertion order is the display order)
PICO = dict(
    Population="Healthy adult humans",
    Intervention="T-maze or spatial decision-making task",
    Comparison="Control condition or baseline",
    Outcome="Brain activation (fMRI coordinates)",
)

print("Research Question:", f"  {RESEARCH_QUESTION}", sep="\n")
print("\nPICO:")
for key, value in PICO.items():
    print("  " + key + ": " + value)

## Step 2: Define Inclusion/Exclusion Criteria

In [None]:
# A paper must satisfy these to be included
INCLUSION_CRITERIA = [
    "Reports original fMRI or PET neuroimaging data",
    "Uses T-maze, spatial navigation, or decision-making task",
    "Reports activation coordinates in MNI or Talairach space",
    "Published in peer-reviewed journal",
    "Human participants",
    "Written in English",
]

# Any one of these is a reason to exclude
EXCLUSION_CRITERIA = [
    "Review articles or meta-analyses without original data",
    "Case studies with n < 5",
    "Only ROI analysis (no whole-brain coordinates)",
    "Clinical populations only (unless healthy control group)",
    "Animal studies",
]

# Print both lists with 1-based numbering; the second heading carries the blank line
for heading, criteria in (
    ("Inclusion Criteria:", INCLUSION_CRITERIA),
    ("\nExclusion Criteria:", EXCLUSION_CRITERIA),
):
    print(heading)
    for i, c in enumerate(criteria, start=1):
        print("  %d. %s" % (i, c))

## Step 3: Search Engine Framework

Unified interface for multiple search engines.

In [None]:
@dataclass
class Paper:
    """One search hit, normalised to a common shape regardless of engine.

    The first five fields are required; identifiers and metadata default to
    ``None`` because not every engine supplies them.
    """
    title: str
    authors: List[str]
    year: Optional[int]
    abstract: str
    source: str  # Name of the search engine that produced this record

    # Identifiers (availability depends on the source)
    doi: Optional[str] = None
    pmid: Optional[str] = None
    arxiv_id: Optional[str] = None
    semantic_scholar_id: Optional[str] = None

    # Metadata
    journal: Optional[str] = None
    url: Optional[str] = None
    pdf_url: Optional[str] = None
    citation_count: Optional[int] = None

    def to_dict(self) -> dict:
        """Return every field as a plain ``dict`` (via ``dataclasses.asdict``)."""
        return asdict(self)

    @property
    def citation(self) -> str:
        """Short in-text citation such as ``"Smith et al. (2020)"``.

        Uses ``"Unknown"`` when there are no authors and ``"n.d."`` when the
        year is missing.
        """
        names = self.authors
        lead = names[0] if names else "Unknown"
        suffix = " et al." if len(names) > 1 else ""
        shown_year = self.year or 'n.d.'
        return f"{lead}{suffix} ({shown_year})"


class SearchEngine(ABC):
    """Interface that every search backend in this notebook implements.

    Subclasses override ``name`` and provide ``search`` and ``is_available``.
    """

    # Display name of the engine; each concrete engine overrides it
    name: str = "BaseEngine"

    def __init__(self, credentials: SearchCredentials):
        """Keep a reference to the shared credentials object."""
        self.credentials = credentials

    @abstractmethod
    def search(self, query: str, max_results: int = 100) -> List[Paper]:
        """Return up to ``max_results`` papers matching ``query``."""

    @abstractmethod
    def is_available(self) -> bool:
        """Return True when the engine's dependencies/credentials are in place."""


print("Search engine framework loaded")

### PubMed Search Engine

In [None]:
class PubMedEngine(SearchEngine):
    """Search PubMed via the Entrez API (Biopython's ``Bio.Entrez``)."""

    name = "PubMed"

    def is_available(self) -> bool:
        """Return True when Biopython is importable and a PubMed email is set."""
        try:
            from Bio import Entrez  # noqa: F401 - the import itself is the probe
        except ImportError:
            return False
        return bool(self.credentials.pubmed_email)

    def search(self, query: str, max_results: int = 100) -> List[Paper]:
        """Run an ESearch for ``query`` and fetch the matching records.

        Args:
            query: PubMed search expression.
            max_results: Maximum number of PMIDs requested (``retmax``).

        Returns:
            Parsed papers, in the relevance order returned by ESearch. Records
            that cannot be parsed are skipped with a warning.
        """
        from Bio import Entrez

        Entrez.email = self.credentials.pubmed_email
        # Entrez settings are module-level globals: assign unconditionally so a
        # key from an earlier credentials object is not silently reused.
        Entrez.api_key = self.credentials.pubmed_api_key or None

        # Search
        handle = Entrez.esearch(
            db="pubmed",
            term=query,
            retmax=max_results,
            sort="relevance"
        )
        try:
            results = Entrez.read(handle)
        finally:
            handle.close()  # close even when the response cannot be parsed

        pmids = list(results["IdList"])
        print(f"[PubMed] Found {len(pmids)} papers")

        if not pmids:
            return []

        # Fetch details
        handle = Entrez.efetch(
            db="pubmed",
            id=",".join(pmids),
            rettype="xml",
            retmode="xml"
        )
        try:
            records = Entrez.read(handle)
        finally:
            handle.close()

        papers = []
        for article in records["PubmedArticle"]:
            try:
                paper = self._parse_article(article)
            except Exception as e:
                # Best effort: one malformed record must not abort the search,
                # but the drop is reported so PRISMA counts can be explained.
                print(f"  Warning: could not parse a PubMed record: {e}")
                continue
            if paper:
                papers.append(paper)

        return papers

    @staticmethod
    def _extract_year(medline, article_data) -> Optional[int]:
        """Return the publication year of a record, or None if none is found.

        Candidates are tried in this order: the journal issue's ``PubDate``
        (``Year``, then the free-text ``MedlineDate``), any ``ArticleDate``,
        and finally ``DateCompleted``/``DateRevised``. The last two are record
        processing dates rather than publication dates, so they are only a
        fallback.
        """
        import re

        pub_date = article_data.get("Journal", {}).get("JournalIssue", {}).get("PubDate", {})
        candidates = [pub_date.get("Year"), pub_date.get("MedlineDate")]
        candidates.extend(d.get("Year") for d in article_data.get("ArticleDate", []))
        candidates.extend(
            medline[key].get("Year")
            for key in ("DateCompleted", "DateRevised")
            if key in medline
        )

        for value in candidates:
            if not value:
                continue
            # MedlineDate is free text (e.g. "2019 Nov-Dec"); take the first 4-digit run
            match = re.search(r"\d{4}", str(value))
            if match:
                return int(match.group())
        return None

    def _parse_article(self, article) -> Optional[Paper]:
        """Parse one ``PubmedArticle`` record into a ``Paper``.

        Raises:
            KeyError: A mandatory element (``MedlineCitation``, ``Article``,
                ``ArticleTitle``, ``PMID``) is missing from the record.
        """
        medline = article["MedlineCitation"]
        article_data = medline["Article"]

        # Authors: "ForeName LastName", falling back to initials; entries
        # without a LastName are skipped
        authors = []
        for author in article_data.get("AuthorList", []):
            if "LastName" not in author:
                continue
            name = author["LastName"]
            if "ForeName" in author:
                name = f"{author['ForeName']} {name}"
            elif "Initials" in author:
                name = f"{author['Initials']} {name}"
            authors.append(name)

        # Abstract: sections are joined with a single space
        abstract = ""
        if "Abstract" in article_data:
            abstract_texts = article_data["Abstract"]["AbstractText"]
            if isinstance(abstract_texts, list):
                abstract = " ".join(str(t) for t in abstract_texts)
            else:
                abstract = str(abstract_texts)

        year = self._extract_year(medline, article_data)

        # DOI: ELocationID first, then the ArticleIdList under PubmedData
        doi = None
        for eid in article_data.get("ELocationID", []):
            if str(eid.attributes.get("EIdType", "")) == "doi":
                doi = str(eid)
        if doi is None:
            for aid in article.get("PubmedData", {}).get("ArticleIdList", []):
                if str(getattr(aid, "attributes", {}).get("IdType", "")) == "doi":
                    doi = str(aid)
                    break

        return Paper(
            title=str(article_data["ArticleTitle"]),
            authors=authors,
            year=year,
            abstract=abstract,
            source="PubMed",
            pmid=str(medline["PMID"]),
            doi=doi,
            journal=str(article_data["Journal"]["Title"]) if "Journal" in article_data else None,
            url=f"https://pubmed.ncbi.nlm.nih.gov/{medline['PMID']}/"
        )


print("PubMed engine loaded")

### Google Scholar Search Engine

Two options:
1. **SerpAPI** (paid, reliable) - requires API key
2. **scholarly** (free, rate-limited) - may get blocked

In [None]:
class GoogleScholarEngine(SearchEngine):
    """Search Google Scholar via SerpAPI or the ``scholarly`` library.

    SerpAPI is used when ``use_serpapi`` is true and a ``serpapi_key`` is
    configured; otherwise searches go through ``scholarly``.
    """

    name = "Google Scholar"
    SERPAPI_PAGE_SIZE = 20  # results requested per SerpAPI call

    def __init__(self, credentials: SearchCredentials, use_serpapi: bool = True):
        """
        Args:
            credentials: Shared credentials; only ``serpapi_key`` is read here.
            use_serpapi: Prefer SerpAPI. Ignored when no key is configured.
        """
        super().__init__(credentials)
        self.use_serpapi = use_serpapi and bool(credentials.serpapi_key)

    def is_available(self) -> bool:
        """Return True when either backend can be used.

        Side effect: when SerpAPI was selected but its package is not
        installed, the engine switches itself to ``scholarly``.
        """
        if self.use_serpapi:
            try:
                from serpapi import GoogleSearch  # noqa: F401 - import is the probe
                return bool(self.credentials.serpapi_key)
            except ImportError:
                pass

        # Fall back to scholarly
        try:
            import scholarly  # noqa: F401
        except ImportError:
            return False
        self.use_serpapi = False
        return True

    def search(self, query: str, max_results: int = 100) -> List[Paper]:
        """Search with whichever backend is currently selected."""
        if self.use_serpapi:
            return self._search_serpapi(query, max_results)
        return self._search_scholarly(query, max_results)

    @staticmethod
    def _clean_authors(names) -> List[str]:
        """Strip whitespace/ellipses from author names and drop empty entries."""
        cleaned = []
        for raw in names or []:
            # Truncated author lists end in an ellipsis; it is not part of the name
            text = str(raw or "").replace("…", "").replace("...", "").strip()
            if text:
                cleaned.append(text)
        return cleaned

    def _search_serpapi(self, query: str, max_results: int) -> List[Paper]:
        """Search using SerpAPI (paid, reliable), paging until enough results."""
        from serpapi import GoogleSearch

        page_size = self.SERPAPI_PAGE_SIZE
        papers = []
        start = 0

        while len(papers) < max_results:
            params = {
                "engine": "google_scholar",
                "q": query,
                "api_key": self.credentials.serpapi_key,
                "start": start,
                "num": min(page_size, max_results - len(papers))
            }

            results = GoogleSearch(params).get_dict()

            if "organic_results" not in results:
                # Report an error payload instead of ending the search silently
                if results.get("error"):
                    print(f"  Warning: SerpAPI error: {results['error']}")
                break

            for result in results["organic_results"]:
                paper = self._parse_serpapi_result(result)
                if paper:
                    papers.append(paper)

            start += page_size
            if len(results["organic_results"]) < page_size:
                break  # short page: nothing further to fetch

        print(f"[Google Scholar/SerpAPI] Found {len(papers)} papers")
        return papers[:max_results]

    def _parse_serpapi_result(self, result: dict) -> Optional[Paper]:
        """Parse one SerpAPI ``organic_results`` entry into a Paper."""
        import re

        pub_info = result.get("publication_info") or {}
        summary = pub_info.get("summary") or ""

        # Authors: structured list when present, otherwise the part of the
        # summary before the first " - " separator
        if "authors" in pub_info:
            authors = self._clean_authors(a.get("name", "") for a in pub_info["authors"])
        elif summary:
            authors = self._clean_authors(summary.split(" - ")[0].split(","))
        else:
            authors = []

        # Year: first 19xx/20xx token in the summary
        year = None
        year_match = re.search(r'\b(19|20)\d{2}\b', summary)
        if year_match:
            year = int(year_match.group())

        cited_by = (result.get("inline_links") or {}).get("cited_by") or {}

        return Paper(
            title=result.get("title") or "",
            authors=authors,
            year=year,
            abstract=result.get("snippet") or "",
            source="Google Scholar",
            url=result.get("link"),
            citation_count=cited_by.get("total")
        )

    def _search_scholarly(self, query: str, max_results: int) -> List[Paper]:
        """Search using scholarly library (free, rate-limited).

        If the result iterator fails part-way (for example when requests get
        blocked), the papers collected so far are returned with a warning.
        """
        from scholarly import scholarly

        papers = []
        try:
            for i, result in enumerate(scholarly.search_pubs(query)):
                if i >= max_results:
                    break

                try:
                    paper = self._parse_scholarly_result(result)
                except Exception as e:
                    print(f"  Warning: Could not parse result {i}: {e}")
                    paper = None
                if paper:
                    papers.append(paper)

                # Rate limiting to avoid blocks
                if i > 0 and i % 10 == 0:
                    time.sleep(2)
        except Exception as e:
            print(f"  Warning: Google Scholar search stopped early: {e}")

        print(f"[Google Scholar/scholarly] Found {len(papers)} papers")
        return papers

    def _parse_scholarly_result(self, result) -> Optional[Paper]:
        """Parse one ``scholarly`` publication dict into a Paper."""
        bib = result.get("bib") or {}

        # Authors: either a list or a single " and "-separated string
        authors = bib.get("author") or []
        if isinstance(authors, str):
            authors = authors.split(" and ")
        authors = self._clean_authors(authors)

        # Year: non-numeric or missing values leave it as None
        year = None
        if "pub_year" in bib:
            try:
                year = int(bib["pub_year"])
            except (ValueError, TypeError):
                pass

        return Paper(
            title=bib.get("title") or "",
            authors=authors,
            year=year,
            abstract=bib.get("abstract") or "",
            source="Google Scholar",
            url=result.get("pub_url") or result.get("eprint_url"),
            citation_count=result.get("num_citations")
        )


print("Google Scholar engine loaded")

### Semantic Scholar Search Engine

In [None]:
class SemanticScholarEngine(SearchEngine):
    """Search the Semantic Scholar Graph API (``/paper/search``)."""

    name = "Semantic Scholar"
    BASE_URL = "https://api.semanticscholar.org/graph/v1"
    PAGE_SIZE = 100        # results requested per call
    REQUEST_TIMEOUT = 30   # seconds; without it a stalled connection blocks forever

    def is_available(self) -> bool:
        """Return True when the ``requests`` package is importable (no key needed)."""
        try:
            import requests  # noqa: F401 - import is the probe
            return True
        except ImportError:
            return False

    def search(self, query: str, max_results: int = 100) -> List[Paper]:
        """Page through search results until ``max_results`` papers are collected.

        Network errors, non-200 responses and unparsable bodies end the search
        with a warning; papers from earlier pages are still returned.
        """
        import requests

        headers = {}
        if self.credentials.semantic_scholar_key:
            headers["x-api-key"] = self.credentials.semantic_scholar_key

        papers = []
        offset = 0

        while len(papers) < max_results:
            params = {
                "query": query,
                "offset": offset,
                "limit": min(self.PAGE_SIZE, max_results - len(papers)),
                "fields": "paperId,title,abstract,authors,year,citationCount,externalIds,url,venue"
            }

            try:
                response = requests.get(
                    f"{self.BASE_URL}/paper/search",
                    params=params,
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT
                )
            except requests.RequestException as e:
                print(f"  Warning: Semantic Scholar request failed: {e}")
                break

            if response.status_code != 200:
                print(f"  Warning: Semantic Scholar API error {response.status_code}")
                break

            try:
                data = response.json()
            except ValueError as e:
                print(f"  Warning: Semantic Scholar returned invalid JSON: {e}")
                break

            items = data.get("data") or []
            if not items:
                break

            for item in items:
                paper = self._parse_result(item)
                if paper:
                    papers.append(paper)

            offset += len(items)

            if len(items) < self.PAGE_SIZE:
                break  # short page: no further results

            # Rate limiting between consecutive requests
            time.sleep(0.5)

        print(f"[Semantic Scholar] Found {len(papers)} papers")
        return papers[:max_results]

    def _parse_result(self, item: dict) -> Optional[Paper]:
        """Parse one search hit into a Paper.

        Fields may be present but ``null`` in the response, so each one is
        read with an explicit fallback instead of relying on ``dict.get``
        defaults (which only apply to missing keys).
        """
        authors = [a["name"] for a in (item.get("authors") or []) if a and a.get("name")]

        # External IDs
        ext_ids = item.get("externalIds") or {}
        pmid = ext_ids.get("PubMed")

        return Paper(
            title=item.get("title") or "",
            authors=authors,
            year=item.get("year"),
            abstract=item.get("abstract") or "",
            source="Semantic Scholar",
            semantic_scholar_id=item.get("paperId"),
            doi=ext_ids.get("DOI"),
            pmid=str(pmid) if pmid is not None else None,
            arxiv_id=ext_ids.get("ArXiv"),
            url=item.get("url"),
            journal=item.get("venue"),
            citation_count=item.get("citationCount")
        )


print("Semantic Scholar engine loaded")

### Scopus Search Engine (Institutional Access)

In [None]:
class ScopusEngine(SearchEngine):
    """Search Scopus via Elsevier API (requires institutional key)."""

    name = "Scopus"
    BASE_URL = "https://api.elsevier.com/content/search/scopus"
    PAGE_SIZE = 25         # results requested per call
    REQUEST_TIMEOUT = 30   # seconds; without it a stalled connection blocks forever

    def is_available(self) -> bool:
        """Return True when ``requests`` is importable and a Scopus key is set."""
        try:
            import requests  # noqa: F401 - import is the probe
        except ImportError:
            return False
        return bool(self.credentials.scopus_api_key)

    def search(self, query: str, max_results: int = 100) -> List[Paper]:
        """Page through Scopus results until ``max_results`` papers are collected.

        Network errors, non-200 responses and unparsable bodies end the search
        with a warning; papers from earlier pages are still returned.
        """
        import requests

        headers = {
            "X-ELS-APIKey": self.credentials.scopus_api_key,
            "Accept": "application/json"
        }

        if self.credentials.scopus_inst_token:
            headers["X-ELS-Insttoken"] = self.credentials.scopus_inst_token

        papers = []
        start = 0

        while len(papers) < max_results:
            params = {
                "query": query,
                "start": start,
                "count": min(self.PAGE_SIZE, max_results - len(papers)),
                "view": "COMPLETE"
            }

            try:
                response = requests.get(
                    self.BASE_URL,
                    params=params,
                    headers=headers,
                    timeout=self.REQUEST_TIMEOUT
                )
            except requests.RequestException as e:
                print(f"  Warning: Scopus request failed: {e}")
                break

            if response.status_code != 200:
                print(f"  Warning: Scopus API error {response.status_code}")
                break

            try:
                data = response.json()
            except ValueError as e:
                print(f"  Warning: Scopus returned invalid JSON: {e}")
                break

            entries = (data.get("search-results") or {}).get("entry") or []
            if not entries:
                break

            for item in entries:
                paper = self._parse_result(item)
                if paper:
                    papers.append(paper)

            start += len(entries)

            if len(entries) < self.PAGE_SIZE:
                break  # short page: no further results

        print(f"[Scopus] Found {len(papers)} papers")
        return papers[:max_results]

    def _parse_result(self, item: dict) -> Optional[Paper]:
        """Parse one Scopus ``entry`` into a Paper.

        Returns None for placeholder entries that carry an ``error`` key
        (how an empty result set is reported), so they do not become blank
        papers.
        """
        if "error" in item:
            return None

        # Authors
        authors = [
            author["authname"]
            for author in (item.get("author") or [])
            if author.get("authname")
        ]

        # Year from cover date (first four characters)
        year = None
        cover_date = item.get("prism:coverDate") or ""
        if cover_date:
            try:
                year = int(cover_date[:4])
            except ValueError:
                pass

        # URL: prefer the link tagged as the Scopus record page; otherwise
        # keep the first link
        links = item.get("link") or []
        url = None
        if links:
            record_links = [link for link in links if link.get("@ref") == "scopus"]
            url = (record_links or links)[0].get("@href")

        # Citation count arrives as a string; leave None when absent or malformed
        citation_count = None
        if item.get("citedby-count"):
            try:
                citation_count = int(item["citedby-count"])
            except (TypeError, ValueError):
                pass

        return Paper(
            title=item.get("dc:title") or "",
            authors=authors,
            year=year,
            abstract=item.get("dc:description") or "",
            source="Scopus",
            doi=item.get("prism:doi"),
            url=url,
            journal=item.get("prism:publicationName"),
            citation_count=citation_count
        )


print("Scopus engine loaded")

## Step 4: Multi-Engine Search Manager

In [None]:
class SearchManager:
    """Manage searches across multiple engines."""

    def __init__(self, credentials: "SearchCredentials"):
        """Store the credentials and register every engine that reports itself available."""
        self.credentials = credentials
        self.engines: Dict[str, "SearchEngine"] = {}
        self._register_engines()

    def _register_engines(self):
        """Instantiate each known engine and keep those whose ``is_available()`` is true."""
        engine_classes = [
            PubMedEngine,
            GoogleScholarEngine,
            SemanticScholarEngine,
            ScopusEngine
        ]

        for engine_class in engine_classes:
            try:
                engine = engine_class(self.credentials)
                if engine.is_available():
                    self.engines[engine.name] = engine
            except Exception as e:
                # One broken engine must not prevent the others from registering
                print(f"  Could not initialize {engine_class.name}: {e}")

    def list_engines(self) -> List[str]:
        """List the names of the registered (available) search engines."""
        return list(self.engines)

    def search(
        self,
        query: str,
        engines: Optional[List[str]] = None,
        max_results_per_engine: int = 50,
        deduplicate: bool = True
    ) -> List["Paper"]:
        """
        Search across multiple engines.

        Args:
            query: Search query
            engines: List of engine names to use (None = all available)
            max_results_per_engine: Max results from each engine
            deduplicate: Remove duplicate papers (by DOI, PMID or title)

        Returns:
            Combined list of papers, in engine order. An engine that raises is
            reported and skipped; unknown engine names are reported and skipped.
        """
        if engines is None:
            engines = list(self.engines)

        all_papers = []

        for engine_name in engines:
            if engine_name not in self.engines:
                print(f"  Warning: {engine_name} not available, skipping")
                continue

            try:
                all_papers.extend(
                    self.engines[engine_name].search(query, max_results=max_results_per_engine)
                )
            except Exception as e:
                print(f"  Error searching {engine_name}: {e}")

        if deduplicate:
            all_papers = self._deduplicate(all_papers)

        # Only call the total "unique" when duplicates were actually removed
        label = "unique papers" if deduplicate else "papers (not deduplicated)"
        print(f"\nTotal {label}: {len(all_papers)}")
        return all_papers

    @staticmethod
    def _normalize_doi(doi) -> str:
        """Lower-case a DOI and strip a resolver/``doi:`` prefix; '' when missing."""
        value = (doi or "").strip().lower()
        for prefix in (
            "https://doi.org/",
            "http://doi.org/",
            "https://dx.doi.org/",
            "http://dx.doi.org/",
            "doi:",
        ):
            if value.startswith(prefix):
                value = value[len(prefix):]
                break
        return value.strip()

    @staticmethod
    def _normalize_title(title) -> str:
        """Reduce a title to its first 100 lower-case alphanumeric characters.

        Dropping punctuation and whitespace lets titles that differ only in a
        trailing period or spacing compare equal. Returns '' when the title is
        missing.
        """
        return "".join(ch for ch in (title or "").lower() if ch.isalnum())[:100]

    def _deduplicate(self, papers: List["Paper"]) -> List["Paper"]:
        """Remove duplicate papers, keeping the first occurrence of each.

        Two records are duplicates when they share a normalised DOI, a PMID,
        or a normalised title. A record with no usable key at all (no DOI, no
        PMID, empty title) is always kept, since there is nothing to match on.
        """
        seen_dois = set()
        seen_pmids = set()
        seen_titles = set()
        unique = []

        for paper in papers:
            doi = self._normalize_doi(paper.doi)
            pmid = str(paper.pmid).strip() if paper.pmid else ""
            title_key = self._normalize_title(paper.title)

            is_duplicate = bool(
                (doi and doi in seen_dois)
                or (pmid and pmid in seen_pmids)
                or (title_key and title_key in seen_titles)
            )

            # Register the keys of duplicates too: a later record may match
            # this one on an identifier the first occurrence lacked.
            if doi:
                seen_dois.add(doi)
            if pmid:
                seen_pmids.add(pmid)
            if title_key:
                seen_titles.add(title_key)

            if not is_duplicate:
                unique.append(paper)

        return unique


# Build the manager once; it registers only the engines that report themselves available
search_manager = SearchManager(credentials)

print("\nAvailable search engines:")
for engine in search_manager.list_engines():
    print("  ✓ " + engine)

## Step 5: Run Search

In [None]:
# Build search query
SEARCH_TERMS = [
    '("T-maze" OR "T maze" OR "spatial decision" OR "spatial navigation")',
    '(fMRI OR "functional MRI" OR "functional magnetic resonance")',
    '(activation OR BOLD OR "brain activity")'
]

SEARCH_QUERY = " AND ".join(SEARCH_TERMS)
print("Search Query:")
print(SEARCH_QUERY)

In [None]:
# Run multi-engine search
# Option 1: every registered engine, at most 30 hits per engine, duplicates removed
papers = search_manager.search(SEARCH_QUERY, max_results_per_engine=30, deduplicate=True)

# Option 2: Search specific engines only
# papers = search_manager.search(
#     query=SEARCH_QUERY,
#     engines=["PubMed", "Semantic Scholar"],
#     max_results_per_engine=50
# )

In [None]:
# Convert to DataFrame for viewing.
# Column names come from the Paper dataclass so the frame has the expected
# columns even when the search returned nothing; without them,
# papers_df["source"] raises KeyError on an empty result set.
papers_df = pd.DataFrame(
    [p.to_dict() for p in papers],
    columns=list(Paper.__dataclass_fields__),
)

print("\nResults by source:")
print(papers_df["source"].value_counts())

papers_df[["title", "year", "source", "citation_count"]].head(15)

## Step 6: AI-Assisted Screening

In [None]:
# Initialize LLM (requires ANTHROPIC_API_KEY environment variable).
# Both names are defined up front and the import sits inside the try block, so
# a missing `extraction` package disables screening instead of aborting the
# cell and leaving `llm_available` undefined for the cells below.
llm = None
llm_available = False

try:
    from extraction.extractors.base_extractor import LLMProvider

    llm = LLMProvider(provider="anthropic")
    llm_available = True
    print("LLM initialized for screening")
except ImportError as e:
    print(f"LLM not available: {e}")
    print("Make sure the project's `extraction` package is importable (this notebook adds '..' to sys.path)")
except Exception as e:
    print(f"LLM not available: {e}")
    print("Set ANTHROPIC_API_KEY environment variable to enable AI screening")

In [None]:
# Prompt template for abstract screening, filled in with str.format().
# Placeholders: question, inclusion, exclusion, title, year, abstract.
# The doubled braces ({{ and }}) around the JSON example are str.format escapes
# that render as literal { and } in the final prompt.
SCREENING_PROMPT = """You are screening abstracts for a meta-analysis.

Research Question: {question}

INCLUSION CRITERIA:
{inclusion}

EXCLUSION CRITERIA:
{exclusion}

Paper to screen:
Title: {title}
Year: {year}
Abstract: {abstract}

Based on the abstract, determine:
1. INCLUDE, EXCLUDE, or UNCERTAIN
2. Which criteria are met/not met
3. Brief reasoning

Return JSON:
{{
    "decision": "INCLUDE" | "EXCLUDE" | "UNCERTAIN",
    "confidence": 0.0-1.0,
    "reasoning": "brief explanation",
    "criteria_met": ["list of met inclusion criteria"],
    "exclusion_reasons": ["list of exclusion reasons, if any"]
}}
"""

def screen_paper(paper: Paper) -> dict:
    """Ask the LLM for a screening decision on one paper.

    Fills ``SCREENING_PROMPT`` with the research question, both criteria lists
    (one ``- `` bullet per criterion) and the paper's title, year and abstract,
    then passes the abstract and the prompt to ``llm.extract``.

    Returns:
        The object returned by ``llm.extract`` with ``title``, ``source``,
        ``doi`` and ``pmid`` copied onto it from the paper. Presumably a dict
        parsed from the JSON the prompt asks for - TODO confirm against
        ``LLMProvider.extract``.
    """
    def bulleted(criteria):
        return "\n".join(f"- {c}" for c in criteria)

    prompt = SCREENING_PROMPT.format(
        question=RESEARCH_QUESTION,
        inclusion=bulleted(INCLUSION_CRITERIA),
        exclusion=bulleted(EXCLUSION_CRITERIA),
        title=paper.title,
        year=paper.year,
        abstract=paper.abstract,
    )

    result = llm.extract(paper.abstract, prompt)

    # Tag the decision with the paper's identity so rows can be traced back later
    for attr in ("title", "source", "doi", "pmid"):
        result[attr] = getattr(paper, attr)
    return result

In [None]:
# Screen papers (costs money per API call!)
# Limit to first N papers for demo (None = screen every paper that has an abstract)
N_TO_SCREEN = 5

screening_results = []

if llm_available and papers:
    # tqdm only draws the progress bar; fall back to plain iteration when it is
    # not installed so a missing optional package does not abort screening.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable, **kwargs):
            return iterable

    # Filter to papers with abstracts
    papers_with_abstracts = [p for p in papers if p.abstract]

    for paper in tqdm(papers_with_abstracts[:N_TO_SCREEN], desc="Screening"):
        try:
            result = screen_paper(paper)
            screening_results.append(result)
        except Exception as e:
            # Best effort: report the failure and carry on with the next paper
            print(f"Error screening '{paper.title[:50]}...': {e}")

    screening_df = pd.DataFrame(screening_results)
    print(f"\nScreened {len(screening_results)} papers")
else:
    print("Skipping AI screening (LLM not available or no papers)")
    screening_df = pd.DataFrame()

In [None]:
# View screening results
if len(screening_df) > 0:
    print("Screening Summary:")
    print(screening_df["decision"].value_counts())
    print()

# Jupyter auto-displays only a cell's last *top-level* expression. The table
# used to sit inside the `if` body, so it was never rendered; as a top-level
# conditional expression it is shown when there are results and nothing is
# shown otherwise.
screening_df[["title", "decision", "confidence", "reasoning"]] if len(screening_df) > 0 else None

## Step 7: Save Results

In [None]:
# Save results under ../data (the directory is created if it does not exist yet)
output_dir = Path("../data")
output_dir.mkdir(exist_ok=True)

# 1) Every paper returned by the search
papers_df.to_csv(output_dir / "search_results.csv", index=False)
print(f"Saved {len(papers_df)} papers to {output_dir / 'search_results.csv'}")

# 2) Screening decisions - written only when screening produced rows
if len(screening_df):
    screening_df.to_csv(output_dir / "screening_results.csv", index=False)
    print(f"Saved screening results to {output_dir / 'screening_results.csv'}")

# 3) Search metadata for PRISMA.
# `engines_used` lists the registered engines and `total_results` is the
# length of `papers` as returned by the search cell.
search_metadata = dict(
    search_date=datetime.now().isoformat(),
    query=SEARCH_QUERY,
    research_question=RESEARCH_QUESTION,
    pico=PICO,
    inclusion_criteria=INCLUSION_CRITERIA,
    exclusion_criteria=EXCLUSION_CRITERIA,
    engines_used=search_manager.list_engines(),
    total_results=len(papers),
    results_by_source=(
        papers_df["source"].value_counts().to_dict() if len(papers_df) else {}
    ),
)

with open(output_dir / "search_metadata.json", "w") as f:
    f.write(json.dumps(search_metadata, indent=2))
print(f"Saved search metadata to {output_dir / 'search_metadata.json'}")

## PRISMA Flow Diagram Data

In [None]:
# Generate PRISMA numbers
# Tally the screening decisions once. Labels come from free-text LLM output,
# so they are trimmed and upper-cased before counting; a missing "decision"
# column (e.g. every response lacked it) counts as zero instead of raising.
if len(screening_df) > 0 and "decision" in screening_df.columns:
    decision_counts = (
        screening_df["decision"].astype(str).str.strip().str.upper().value_counts()
    )
else:
    decision_counts = pd.Series(dtype="int64")


def _n_decisions(label: str) -> int:
    """Number of screened papers with this decision, as a plain (JSON-serialisable) int."""
    return int(decision_counts.get(label, 0))


prisma = {
    "identification": {
        # NOTE(review): `papers` is the list returned by the search cell; with
        # deduplicate=True this is the count after duplicate removal.
        "records_identified": len(papers),
        "by_source": papers_df["source"].value_counts().to_dict() if len(papers_df) > 0 else {}
    },
    "screening": {
        "records_screened": len(screening_df),
        "records_excluded": _n_decisions("EXCLUDE"),
        # Reported separately so screened = excluded + uncertain + included
        # reconciles; these papers still need manual review.
        "records_uncertain": _n_decisions("UNCERTAIN")
    },
    "eligibility": {
        "full_text_assessed": _n_decisions("INCLUDE"),
    }
}

print("PRISMA Flow Data:")
print(json.dumps(prisma, indent=2))

## Next Steps

1. **Review UNCERTAIN papers** manually
2. **Get full-text PDFs** for INCLUDED papers
3. **Proceed to notebook 02** for data extraction

### Installing Additional Dependencies

```bash
# For PubMed
pip install biopython

# For Google Scholar (free option)
pip install scholarly

# For Google Scholar (paid, reliable)
pip install google-search-results  # SerpAPI

# For Semantic Scholar / Scopus
pip install requests

# Used by the notebook itself (DataFrames, screening progress bar)
pip install pandas tqdm
```