# Deep Agent Validation Framework

This notebook provides a comprehensive framework for evaluating deep research agents.



**Approach:** Sequential testing (one system at a time), no ground truth required.

**Evaluation Methods:**
1. LLM-as-Judge (automated quality scoring)
2. Citation Verification (URL liveness + content support)
3. DeepEval Metrics (faithfulness, relevancy)
4. Manual Verification (freshness/recency checks)
5. Cross-System Comparison (after both systems are tested)

In [None]:
# Cell 2: Install dependencies
# Uncomment and run if not already installed

# !pip install deepeval ragas openai anthropic requests beautifulsoup4 plotly pandas python-dotenv

In [None]:
# Cell 3: Imports and environment setup
import sys
import os
import json
import re
import time
import asyncio
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional

import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from IPython.display import display, HTML, Markdown
import requests
from bs4 import BeautifulSoup

# Treat the kernel's working directory as the project root and put it at the
# front of sys.path so project-local packages can be imported by later cells.
PROJECT_ROOT = Path.cwd()
sys.path.insert(0, str(PROJECT_ROOT))

# Read settings (API keys, provider/model overrides) from the project's .env file.
from dotenv import load_dotenv

_dotenv_file = PROJECT_ROOT / ".env"
load_dotenv(dotenv_path=_dotenv_file)

# Echo what was configured so the reader can confirm the cell ran from the right place.
for _status_line in (
    f"Project root: {PROJECT_ROOT}",
    "Python path configured",
    f"Environment loaded from: {_dotenv_file}",
):
    print(_status_line)

In [None]:
# Cell 4: Configuration

# Run-wide settings read by every later cell. Change values here rather than
# inline so that a whole run can be reproduced from this one cell.
CONFIG = {
    # Which system are you testing right now?
    # Options: "sf_edr" or "open_deep_research"
    "system_under_test": "sf_edr",

    # sf EDR settings (provider/model come from LLM_PROVIDER / LLM_MODEL when
    # those environment variables are set, otherwise the defaults below)
    "edr_provider": os.environ.get("LLM_PROVIDER", "google"),
    "edr_model": os.environ.get("LLM_MODEL", "gemini-2.5-pro"),
    "edr_max_loops": 3,

    # LLM Judge settings (use a different LLM than the system being tested)
    "judge_provider": "openai",  # or "anthropic"
    "judge_model": "gpt-4o",     # or "claude-sonnet-4-5-20250929"

    # Output settings
    "output_dir": str(PROJECT_ROOT / "validation_results"),
    # Result and judge files are keyed by run_id. A fresh timestamp is used by
    # default, which means a restarted kernel can never find the files of an
    # interrupted run. Set VALIDATION_RUN_ID to that run's id to resume it.
    "run_id": os.environ.get("VALIDATION_RUN_ID") or datetime.now().strftime("%Y%m%d_%H%M%S"),
}

# Create output directory
os.makedirs(CONFIG["output_dir"], exist_ok=True)

print("Configuration:")
for k, v in CONFIG.items():
    print(f"  {k}: {v}")

---
## Section 2: Test Query Bank

All test queries organized by category. Each query has:
- `id`: Unique identifier
- `query`: The research question
- `category`: High-level category (factual, stress, manual_verification, etc.)
- `subcategory`: Specific test type within the category

In [None]:
# Cell 6: Define all test queries

# The query bank: a flat list of dicts, one per test query. Every entry carries
# the same four string keys:
#   id          - unique identifier (section letter + number, optional suffix)
#   query       - the research question sent to the system under test
#   category    - high-level grouping used for per-category reporting
#   subcategory - the specific test type within the category
# NOTE(review): the id sequence has gaps (no D6, D8 or G4 entries) — presumably
# those groups were dropped on purpose; confirm before renumbering.
TEST_QUERIES = [
    # === A. Factual / Verifiable Queries ===
    {"id": "A1", "query": "What were the top 5 causes of the 2008 financial crisis, with supporting data?", "category": "factual", "subcategory": "verifiable"},
    {"id": "A2", "query": "List all Nobel Prize winners in Physics from 2020-2024 and their contributions.", "category": "factual", "subcategory": "verifiable"},
    {"id": "A3", "query": "What is the current market share of cloud providers (AWS, Azure, GCP) as of 2024?", "category": "factual", "subcategory": "verifiable"},
    {"id": "A4", "query": "Summarize the key provisions of the EU AI Act.", "category": "factual", "subcategory": "verifiable"},
    {"id": "A5", "query": "What are the FDA-approved treatments for Type 2 diabetes as of 2024?", "category": "factual", "subcategory": "verifiable"},
    
    # === B. Multi-Hop Reasoning Queries ===
    {"id": "B1", "query": "How did the semiconductor chip shortage of 2020-2023 affect both the automotive and gaming industries differently?", "category": "multi_hop", "subcategory": "synthesis"},
    {"id": "B2", "query": "Compare the economic policies of the US and EU regarding AI regulation and their downstream effects on startup funding.", "category": "multi_hop", "subcategory": "synthesis"},
    {"id": "B3", "query": "Trace the supply chain of lithium from mining to EV batteries - what are the geopolitical risks at each stage?", "category": "multi_hop", "subcategory": "synthesis"},
    
    # === C. Ambiguous / Open-Ended Queries ===
    {"id": "C1", "query": "What's the best programming language?", "category": "ambiguous", "subcategory": "vague"},
    {"id": "C2", "query": "Is AI dangerous?", "category": "ambiguous", "subcategory": "vague"},
    {"id": "C3", "query": "Tell me about the Apple situation", "category": "ambiguous", "subcategory": "ambiguous_entity"},
    {"id": "C4", "query": "What happened recently in tech?", "category": "ambiguous", "subcategory": "vague"},
    
    # === D. Stress Tests ===
    # D1: Contradictory sources
    {"id": "D1a", "query": "Is coffee good or bad for health? Provide evidence for both sides.", "category": "stress", "subcategory": "contradictory"},
    {"id": "D1b", "query": "Is remote work more productive than in-office? What does the research say?", "category": "stress", "subcategory": "contradictory"},
    {"id": "D1c", "query": "Are electric vehicles truly better for the environment when considering full lifecycle?", "category": "stress", "subcategory": "contradictory"},
    
    # D2: Obscure topics
    {"id": "D2a", "query": "What is the history of the Voynich Manuscript's ownership chain?", "category": "stress", "subcategory": "obscure"},
    {"id": "D2b", "query": "Describe the political structure of the Principality of Sealand.", "category": "stress", "subcategory": "obscure"},
    {"id": "D2c", "query": "What are the known side effects of the drug Zuranolone in postpartum depression?", "category": "stress", "subcategory": "obscure"},
    {"id": "D2d", "query": "Summarize the contributions of Srinivasa Ramanujan's lost notebook to number theory.", "category": "stress", "subcategory": "obscure"},
    
    # D3: Very recent events
    {"id": "D3a", "query": "What were the most significant tech industry events in the past 48 hours?", "category": "stress", "subcategory": "freshness"},
    {"id": "D3b", "query": "What is the latest stock price movement for NVIDIA and why?", "category": "stress", "subcategory": "freshness"},
    
    # D4: Highly technical
    {"id": "D4a", "query": "Explain the differences between LoRA, QLoRA, and DoRA fine-tuning methods with benchmark comparisons.", "category": "stress", "subcategory": "technical"},
    {"id": "D4b", "query": "Compare the architectures of Mamba, RWKV, and Transformer models for sequence modeling.", "category": "stress", "subcategory": "technical"},
    {"id": "D4c", "query": "What is the current state of topological quantum computing at Microsoft and IBM?", "category": "stress", "subcategory": "technical"},
    
    # D5: Long-form output
    {"id": "D5a", "query": "Write a comprehensive 5000-word research report on quantum computing's impact on cryptography.", "category": "stress", "subcategory": "long_form"},
    
    # D7: Edge cases
    {"id": "D7a", "query": "research", "category": "stress", "subcategory": "edge_case"},
    {"id": "D7b", "query": "asdfghjkl qwerty zxcvbnm research this", "category": "stress", "subcategory": "edge_case"},
    {"id": "D7c", "query": "Write a short but comprehensive 10,000-word summary", "category": "stress", "subcategory": "edge_case"},
    
    # D9: Multi-language
    {"id": "D9a", "query": "Summarize the key findings of recent Chinese AI research papers on large language models.", "category": "stress", "subcategory": "multi_language"},
    {"id": "D9b", "query": "What are the latest German automotive industry reports on EV adoption?", "category": "stress", "subcategory": "multi_language"},
    
    # D10: Numerical accuracy
    {"id": "D10a", "query": "What were the exact GDP growth rates for G7 countries in 2024 Q1-Q4?", "category": "stress", "subcategory": "numerical"},
    {"id": "D10b", "query": "List the top 10 most funded AI startups in 2024 with their exact funding amounts.", "category": "stress", "subcategory": "numerical"},
    {"id": "D10c", "query": "What are the current interest rates set by the Fed, ECB, and Bank of Japan?", "category": "stress", "subcategory": "numerical"},
    
    # === E. Domain-Specific Queries ===
    {"id": "E1", "query": "What are the key differences between GDPR and CCPA?", "category": "domain_specific", "subcategory": "legal"},
    {"id": "E2", "query": "What is the current evidence on intermittent fasting for cardiovascular health?", "category": "domain_specific", "subcategory": "medical"},
    {"id": "E3", "query": "Analyze Tesla's Q3 2024 earnings - what are the key takeaways?", "category": "domain_specific", "subcategory": "financial"},
    {"id": "E4", "query": "What is the current state of nuclear fusion research?", "category": "domain_specific", "subcategory": "scientific"},
    
    # === F. Recent Research Generation ===
    {"id": "F1", "query": "Find and summarize the 5 most recent research papers on LLM hallucination mitigation published in 2025-2026.", "category": "recent_research", "subcategory": "papers"},
    {"id": "F2", "query": "What are the latest breakthroughs in solid-state batteries from the past 6 months?", "category": "recent_research", "subcategory": "breakthroughs"},
    {"id": "F3", "query": "Summarize recent clinical trial results for GLP-1 receptor agonists in 2025.", "category": "recent_research", "subcategory": "clinical_trials"},
    
    # === G. Manual Verification Queries (Freshness Check) ===
    # G1: Live/real-time data
    {"id": "G1a", "query": "What is today's price of Bitcoin?", "category": "manual_verification", "subcategory": "realtime"},
    {"id": "G1b", "query": "What is the current USD to INR exchange rate?", "category": "manual_verification", "subcategory": "realtime"},
    {"id": "G1c", "query": "What is NVIDIA's stock price right now?", "category": "manual_verification", "subcategory": "realtime"},
    
    # G2: Recent events
    {"id": "G2a", "query": "What were the top tech news stories this week?", "category": "manual_verification", "subcategory": "recent_events"},
    
    # G3: Recently changed facts
    {"id": "G3a", "query": "What is the latest version of Python?", "category": "manual_verification", "subcategory": "changed_facts"},
    {"id": "G3b", "query": "What is the current US federal interest rate?", "category": "manual_verification", "subcategory": "changed_facts"},
    
    # G5: Trick questions
    {"id": "G5a", "query": "What is the latest iPhone model?", "category": "manual_verification", "subcategory": "trick"},
    {"id": "G5b", "query": "What is the most recent SpaceX Starship launch result?", "category": "manual_verification", "subcategory": "trick"},
]

# Summary
# Tabulate the query bank and report how many queries fall under each grouping.
df_queries = pd.DataFrame(TEST_QUERIES)
print(f"Total test queries: {len(TEST_QUERIES)}")
for _grouping in ("category", "subcategory"):
    _counts = df_queries[_grouping].value_counts()
    print(f"\nQueries by {_grouping}:")
    print(_counts.to_string())

---
## Section 3: Research Runner (Adapter Pattern)

One standardized interface, multiple backends. Each adapter returns:
```python
{
    "query": str,
    "response_text": str,
    "sources": List[str],
    "timing_seconds": float,
    "word_count": int,
    "metadata": dict
}
```

In [None]:
# Cell 8: SF enterprise EDR Adapter

def _edr_output_name(query: str, run_id: str, max_len: int = 30) -> str:
    """Build a filesystem-safe, collision-resistant raw-output filename for a query."""
    import hashlib

    # Keep only characters that are valid in filenames on every platform; raw
    # queries contain '?', "'" and may contain '/' which would break the path.
    slug = re.sub(r"[^A-Za-z0-9_-]+", "_", query[:max_len]).strip("_") or "query"
    # Queries that share their first `max_len` characters would otherwise
    # overwrite each other's raw output, so append a short digest of the full text.
    digest = hashlib.sha1(query.encode("utf-8")).hexdigest()[:8]
    return f"edr_raw_{run_id}_{slug}_{digest}.json"


def _edr_failure(query: str, config: dict, elapsed: float, message: str) -> dict:
    """Return the standardized result dict for a run that produced no usable output."""
    return {
        "query": query,
        "response_text": "",
        "sources": [],
        "timing_seconds": elapsed,
        "word_count": 0,
        "error": message,
        "metadata": {"system": "sf_edr", "provider": config.get("edr_provider"), "model": config.get("edr_model")}
    }


async def run_sf_edr(query: str, config: dict) -> dict:
    """Run a query through SF Enterprise Deep Research.

    Wraps the existing run_research_sync() from benchmarks/run_research.py and
    normalizes its output into the standardized adapter dict.

    Args:
        query: The research question.
        config: Run configuration; reads "output_dir", "run_id" and the
            optional "edr_max_loops", "edr_provider", "edr_model" keys.

    Returns:
        A dict with "query", "response_text", "sources", "timing_seconds",
        "word_count" and "metadata". An "error" key is added when the backend
        raised or returned None; the function itself does not raise for
        backend failures (an ImportError for the backend module does propagate).
    """
    from benchmarks.run_research import run_research_sync

    output_file = os.path.join(
        config["output_dir"],
        _edr_output_name(query, config["run_id"]),
    )

    # perf_counter is monotonic, so the fallback latency cannot go negative or
    # jump if the wall clock is adjusted mid-run.
    start_time = time.perf_counter()
    try:
        result = await run_research_sync(
            query=query,
            max_web_search_loops=config.get("edr_max_loops", 3),
            visualization_disabled=True,
            provider=config.get("edr_provider"),
            model=config.get("edr_model"),
            output_file=output_file,
        )
        elapsed = time.perf_counter() - start_time

        if result is None:
            return _edr_failure(query, config, elapsed, "run_research_sync returned None")

        # Prefer the full article; fall back to the summary when the article is
        # missing, None or empty so word_count never operates on a non-string.
        response_text = result.get("article") or result.get("summary") or ""

        debug_info = result.get("debug_info") or {}
        gathered = debug_info.get("sources_gathered", [])
        # sources_gathered is sometimes a list of sources and sometimes a count.
        if isinstance(gathered, list):
            sources, sources_count = gathered, len(gathered)
        elif isinstance(gathered, int):
            sources, sources_count = [], gathered
        else:
            sources, sources_count = [], 0

        timing = result.get("timing") or {}

        return {
            "query": query,
            "response_text": response_text,
            "sources": sources,
            "timing_seconds": timing.get("total_duration_seconds", elapsed),
            "word_count": len(response_text.split()),
            "metadata": {
                "system": "sf_edr",
                "provider": config.get("edr_provider"),
                "model": config.get("edr_model"),
                "research_loops": debug_info.get("research_loops", 0),
                "sources_count": sources_count,
                "raw_output_file": output_file,
            }
        }
    except Exception as e:
        # Best-effort: a failed query is recorded, not fatal to the whole run.
        elapsed = time.perf_counter() - start_time
        return _edr_failure(query, config, elapsed, str(e))

In [None]:
# Cell 9: Open Deep Research Adapter (Placeholder)

async def run_open_deep_research(query: str, config: dict) -> dict:
    """Run a query through LangChain Open Deep Research.

    TODO: Replace this placeholder with your actual Open Deep Research API call.
    The function should return the same standardized dict format as the other
    adapters ("query", "response_text", "sources", "timing_seconds",
    "word_count", "metadata").

    Raises:
        NotImplementedError: always, until the placeholder block is replaced.
    """
    start_time = time.time()

    try:
        # ============================================================
        # TODO: Replace this block with your Open Deep Research call
        # Example:
        #   from open_deep_research import research
        #   result = await research(query)
        #   response_text = result.report
        #   sources = result.sources
        # ============================================================

        raise NotImplementedError(
            "Open Deep Research adapter not yet configured. "
            "Please implement the API call in this cell."
        )

        # Template for the implemented adapter: unreachable until the raise
        # above is replaced by code that defines `response_text` and `sources`.
        elapsed = time.time() - start_time
        return {
            "query": query,
            "response_text": response_text,
            "sources": sources,
            "timing_seconds": elapsed,
            "word_count": len(response_text.split()),
            "metadata": {"system": "open_deep_research"}
        }
    except NotImplementedError:
        # Surface the "not configured" state to the caller instead of
        # recording it as an ordinary query failure.
        raise
    except Exception as e:
        elapsed = time.time() - start_time
        return {
            "query": query,
            "response_text": "",
            "sources": [],
            "timing_seconds": elapsed,
            "word_count": 0,
            "error": str(e),
            "metadata": {"system": "open_deep_research"}
        }

In [None]:
# Cell 10: Dispatcher

async def run_research(query: str, system: str, config: dict) -> dict:
    """Dispatch a research query to the adapter registered under `system`.

    Raises:
        ValueError: if `system` is not one of the registered adapter names.
    """
    registry = {
        "sf_edr": run_sf_edr,
        "open_deep_research": run_open_deep_research,
    }

    if system not in registry:
        raise ValueError(f"Unknown system: {system}. Choose from: {list(registry)}")

    handler = registry[system]
    return await handler(query, config)

_active_system = CONFIG['system_under_test']
print(f"Dispatcher ready. System under test: {_active_system}")

---
## Section 4: Execute Research Queries

Run all test queries through the selected system. Results are saved incrementally to JSON after each query, so no data is lost if the process is interrupted.

In [None]:
# Cell 12: Run all queries

RESULTS_FILE = os.path.join(
    CONFIG["output_dir"],
    f"results_{CONFIG['system_under_test']}_{CONFIG['run_id']}.json"
)

# Load existing results if resuming a partial run
if os.path.exists(RESULTS_FILE):
    with open(RESULTS_FILE, "r", encoding="utf-8") as f:
        results = json.load(f)
    completed_ids = {r["query_id"] for r in results}
    print(f"Resuming: {len(completed_ids)} queries already completed")
else:
    results = []
    completed_ids = set()

# Filter to queries not yet completed
pending_queries = [q for q in TEST_QUERIES if q["id"] not in completed_ids]
print(f"Queries to run: {len(pending_queries)} / {len(TEST_QUERIES)}")

for i, query_info in enumerate(pending_queries):
    print(f"\n[{i+1}/{len(pending_queries)}] Running: {query_info['id']} - {query_info['query'][:60]}...")
    
    try:
        result = await run_research(
            query=query_info["query"],
            system=CONFIG["system_under_test"],
            config=CONFIG
        )
        
        # Attach query metadata
        result["query_id"] = query_info["id"]
        result["category"] = query_info["category"]
        result["subcategory"] = query_info["subcategory"]
        result["system"] = CONFIG["system_under_test"]
        result["timestamp"] = datetime.now().isoformat()
        
        results.append(result)
        
        # Incremental save
        with open(RESULTS_FILE, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False, default=str)
        
        status = "ERROR" if result.get("error") else "OK"
        print(f"  [{status}] {result['word_count']} words, {result['timing_seconds']:.1f}s")
        
    except Exception as e:
        print(f"  [FAILED] {e}")
        results.append({
            "query_id": query_info["id"],
            "query": query_info["query"],
            "category": query_info["category"],
            "subcategory": query_info["subcategory"],
            "system": CONFIG["system_under_test"],
            "response_text": "",
            "sources": [],
            "timing_seconds": 0,
            "word_count": 0,
            "error": str(e),
            "timestamp": datetime.now().isoformat(),
        })
        with open(RESULTS_FILE, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False, default=str)

print(f"\nAll queries complete. Results saved to: {RESULTS_FILE}")

In [None]:
# Cell 13: Quick summary of results

df_results = pd.DataFrame(results)

total = len(df_results)

# A row counts as failed only when it carries a non-empty error message.
# Successful rows have no "error" key, which pandas stores as NaN; NaN is
# truthy, so a plain astype(bool) would count every success as an error.
if "error" in df_results.columns:
    _error_col = df_results["error"]
    has_error = _error_col.notna() & (_error_col.astype(str).str.strip() != "")
else:
    # No query reported an error, so the column was never created.
    has_error = pd.Series(False, index=df_results.index)

errors = int(has_error.sum())
success = total - errors

print(f"=== Execution Summary ===")
print(f"Total queries run:   {total}")
print(f"Successful:          {success}")
print(f"Failed:              {errors}")
if total:
    print(f"Average latency:     {df_results['timing_seconds'].mean():.1f}s")
    print(f"Median latency:      {df_results['timing_seconds'].median():.1f}s")
    print(f"Average word count:  {df_results['word_count'].mean():.0f}")
    print(f"\nLatency by category:")
    print(df_results.groupby('category')['timing_seconds'].agg(['mean', 'median', 'max']).round(1).to_string())
else:
    # Nothing has been run yet, so the timing/word-count columns do not exist.
    print("No results to summarize yet.")

---
## Section 5: Automated Evaluation - LLM-as-Judge

Uses a strong LLM (GPT-4 / Claude) to automatically score each research output on multiple dimensions. **No ground truth required** - the judge evaluates standalone quality.

In [None]:
# Cell 15: LLM-as-Judge scoring function

def get_judge_client(config: dict):
    """Build the judge LLM client and pick the model name.

    Returns a ``(client, model_name)`` tuple for the provider named by
    ``config["judge_provider"]`` (default "openai"). Raises ValueError for any
    provider other than "openai" or "anthropic".
    """
    provider = config.get("judge_provider", "openai")

    # Reject unknown providers up front so the supported branches stay flat.
    if provider != "openai" and provider != "anthropic":
        raise ValueError(f"Unsupported judge provider: {provider}")

    if provider == "openai":
        from openai import OpenAI
        client = OpenAI()
        return client, config.get("judge_model", "gpt-4o")

    from anthropic import Anthropic
    client = Anthropic()
    return client, config.get("judge_model", "claude-sonnet-4-5-20250929")


_JUDGE_SCORE_KEYS = ("relevancy", "depth", "source_quality", "coherence", "confidence_calibration")


def _parse_judge_reply(raw: str) -> dict:
    """Extract and validate the score object from a judge reply.

    Returns the parsed dict, or an ``{"error": ...}`` dict when no JSON object
    is found or a score dimension is missing / non-numeric.
    """
    # Judges often wrap the JSON in a markdown fence or a sentence of prose;
    # take the outermost {...} span instead of requiring a bare object.
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end <= start:
        return {"error": "Judge reply contained no JSON object", "raw_reply": raw[:500]}
    try:
        parsed = json.loads(raw[start:end + 1])
    except json.JSONDecodeError as e:
        return {"error": f"Judge reply was not valid JSON: {e}", "raw_reply": raw[:500]}

    invalid = []
    for key in _JUDGE_SCORE_KEYS:
        value = parsed.get(key)
        if isinstance(value, str):
            # Tolerate scores returned as numeric strings, e.g. "4".
            try:
                value = float(value)
            except ValueError:
                value = None
        if isinstance(value, bool) or not isinstance(value, (int, float)):
            invalid.append(key)
        else:
            parsed[key] = value
    if invalid:
        # Callers average the five dimensions, so an incomplete reply must be
        # reported as an error rather than passed through.
        return {"error": f"Judge reply missing numeric scores for: {', '.join(invalid)}", "raw_reply": raw[:500]}
    return parsed


def llm_judge_score(query: str, response: str, config: dict, max_response_chars: int = 8000) -> dict:
    """Score a research response using an LLM judge. No ground truth needed.

    Args:
        query: The research question that was asked.
        response: The system's answer; only the first `max_response_chars`
            characters are sent to the judge.
        config: Reads "judge_provider" (default "openai") and "judge_model".
        max_response_chars: Truncation limit for the response in the prompt.

    Returns:
        On success, a dict with numeric "relevancy", "depth", "source_quality",
        "coherence" and "confidence_calibration" scores plus whatever else the
        judge returned (normally "reasoning"). On any failure (unsupported
        provider, API error, unparseable or incomplete reply) a dict with an
        "error" key. This function does not raise.
    """
    # A missing response is scored as empty text instead of raising before
    # the error handling below can catch it.
    response = response or ""

    judge_prompt = f"""You are an expert research quality evaluator. Score the following research output.
There is NO ground truth - evaluate the response on its own merits.

QUERY: {query}

RESPONSE:
{response[:max_response_chars]}

Score each dimension on a 1-5 scale (5 is best):

1. relevancy (1-5): Does the response directly address the query?
2. depth (1-5): How thorough and comprehensive is the coverage?
3. source_quality (1-5): Are citations from reputable, relevant sources? Are sources properly referenced?
4. coherence (1-5): Is the response well-structured, logical, and readable?
5. confidence_calibration (1-5): Does it appropriately express uncertainty where warranted? (5 = good calibration)

Return ONLY valid JSON (no markdown, no explanation outside the JSON):
{{"relevancy": X, "depth": X, "source_quality": X, "coherence": X, "confidence_calibration": X, "reasoning": "brief 1-2 sentence explanation"}}"""

    provider = config.get("judge_provider", "openai")

    try:
        if provider == "openai":
            from openai import OpenAI
            client = OpenAI()
            resp = client.chat.completions.create(
                model=config.get("judge_model", "gpt-4o"),
                messages=[{"role": "user", "content": judge_prompt}],
                temperature=0.1,
            )
            raw = resp.choices[0].message.content.strip()
        elif provider == "anthropic":
            from anthropic import Anthropic
            client = Anthropic()
            resp = client.messages.create(
                model=config.get("judge_model", "claude-sonnet-4-5-20250929"),
                max_tokens=500,
                # Same low temperature as the OpenAI path so scores are
                # comparably stable whichever judge is configured.
                temperature=0.1,
                messages=[{"role": "user", "content": judge_prompt}],
            )
            raw = resp.content[0].text.strip()
        else:
            return {"error": f"Unsupported judge provider: {provider}"}

        return _parse_judge_reply(raw)

    except Exception as e:
        # Best-effort: one failed judgement must not abort the scoring loop.
        return {"error": str(e)}

print("LLM-as-Judge function ready.")

In [None]:
# Cell 16: Run LLM-as-Judge on all results

JUDGE_RESULTS_FILE = os.path.join(
    CONFIG["output_dir"],
    f"judge_scores_{CONFIG['system_under_test']}_{CONFIG['run_id']}.json"
)

# The five dimensions the judge prompt asks for; all must be numeric to average.
_JUDGE_DIMENSIONS = ("relevancy", "depth", "source_quality", "coherence", "confidence_calibration")


def _save_judge_scores(scores: list, path: str) -> None:
    """Write `scores` to `path` via a temp file so an interrupted write cannot
    corrupt the file the resume logic reads back."""
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(scores, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, path)


# Load existing judge scores if resuming
if os.path.exists(JUDGE_RESULTS_FILE):
    with open(JUDGE_RESULTS_FILE, "r", encoding="utf-8") as f:
        judge_scores = json.load(f)
    judged_ids = {s["query_id"] for s in judge_scores}
    print(f"Resuming: {len(judged_ids)} already judged")
else:
    judge_scores = []
    judged_ids = set()

# Only results that actually produced text are worth judging.
pending = [r for r in results if r["query_id"] not in judged_ids and r.get("response_text")]
print(f"Queries to judge: {len(pending)}")

for i, result in enumerate(pending):
    print(f"  [{i+1}/{len(pending)}] Judging {result['query_id']}...", end=" ")

    score = llm_judge_score(result["query"], result["response_text"], CONFIG)

    # The judge's JSON is untrusted: it may be a non-object, or lack / garble a
    # dimension. Turn those cases into an error record instead of crashing the
    # loop on the average below.
    avg = None
    if not isinstance(score, dict):
        score = {"error": f"Judge returned non-object JSON: {score!r}"}
    elif "error" not in score:
        try:
            avg = sum(float(score[dim]) for dim in _JUDGE_DIMENSIONS) / len(_JUDGE_DIMENSIONS)
        except (KeyError, TypeError, ValueError) as exc:
            score["error"] = f"Malformed judge scores: {exc!r}"

    score["query_id"] = result["query_id"]
    score["category"] = result["category"]
    score["subcategory"] = result["subcategory"]

    judge_scores.append(score)

    # Incremental save
    _save_judge_scores(judge_scores, JUDGE_RESULTS_FILE)

    if "error" in score:
        print(f"ERROR: {score['error']}")
    else:
        print(f"avg={avg:.1f}")

print(f"\nJudge scores saved to: {JUDGE_RESULTS_FILE}")

In [None]:
# Cell 17: Display judge scores

# Keep only judgements that completed without an error marker.
valid_scores = [entry for entry in judge_scores if "error" not in entry]
df_scores = pd.DataFrame(valid_scores)

score_cols = ["relevancy", "depth", "source_quality", "coherence", "confidence_calibration"]

if df_scores.empty:
    print("No valid judge scores yet.")
else:
    # Overall averages
    overall_means = df_scores[score_cols].mean().round(2)
    print("=== Overall LLM-as-Judge Scores ===")
    print(overall_means.to_string())

    # By category
    category_means = df_scores.groupby("category")[score_cols].mean().round(2)
    print("\n=== Scores by Category ===")
    display(category_means)

    # Full table
    table_columns = ["query_id", "category"] + score_cols + ["reasoning"]
    print("\n=== All Scores ===")
    display(df_scores[table_columns].sort_values("category"))

---
## Section 6: Automated Evaluation - Citation Verification

Checks whether cited URLs are alive and whether the source content actually supports the claims made in the research output.

In [None]:
# Cell 19: Extract URLs from response text

def extract_urls(text: str) -> list:
    """Return the distinct http(s) URLs found in `text`, in first-seen order.

    Trailing sentence punctuation (. , ; : ! ?) is stripped from each match
    before de-duplication.
    """
    raw_matches = re.findall(r'https?://[^\s\)\]\"\'>]+', text)
    # dict.fromkeys keeps the first occurrence of each key in insertion order.
    trimmed = (match.rstrip('.,;:!?') for match in raw_matches)
    return list(dict.fromkeys(trimmed))

# Quick test
test_text = "See https://example.com/article and also https://test.org/paper.pdf for details."
_found = extract_urls(test_text)
print(f"Test URL extraction: {_found}")

In [None]:
# Cell 20: Check if URLs are alive

def check_url_alive(url: str, timeout: int = 10) -> dict:
    """Check whether a URL is reachable (final HTTP status below 400).

    A HEAD request is tried first because it is cheap. Some servers reject or
    mishandle HEAD (403/405 and similar) for pages that load fine in a
    browser, so any HEAD status >= 400 is re-checked with a streamed GET
    before the URL is declared dead.

    Args:
        url: The URL to probe.
        timeout: Per-request timeout in seconds.

    Returns:
        {"url", "alive", "status_code"}; "status_code" is the HTTP status on a
        completed request, "timeout" on a timeout, or the exception text for
        any other failure. This function does not raise.
    """
    headers = {"User-Agent": "Mozilla/5.0 (research-validator)"}
    try:
        r = requests.head(url, timeout=timeout, allow_redirects=True, headers=headers)
        status_code = r.status_code
        if status_code >= 400:
            # stream=True fetches only the headers; the body is never
            # downloaded and the connection is released on exiting the block.
            with requests.get(url, timeout=timeout, allow_redirects=True,
                              headers=headers, stream=True) as get_resp:
                status_code = get_resp.status_code
        return {"url": url, "alive": status_code < 400, "status_code": status_code}
    except requests.exceptions.Timeout:
        return {"url": url, "alive": False, "status_code": "timeout"}
    except Exception as e:
        return {"url": url, "alive": False, "status_code": str(e)}

print("URL liveness checker ready.")

In [None]:
# Cell 21: LLM-based citation content verification

def fetch_page_text(url: str, max_chars: int = 3000) -> Optional[str]:
    """Download a page and return up to `max_chars` characters of its visible text.

    Script, style, nav, footer and header elements are dropped before the text
    is extracted. Returns None if the request, the HTTP status check or the
    parsing fails for any reason.
    """
    request_headers = {"User-Agent": "Mozilla/5.0 (research-validator)"}
    boilerplate_tags = ["script", "style", "nav", "footer", "header"]
    try:
        page = requests.get(url, timeout=15, headers=request_headers)
        page.raise_for_status()
        document = BeautifulSoup(page.text, "html.parser")
        # Strip non-content elements so they do not pollute the extracted text.
        for element in document(boilerplate_tags):
            element.decompose()
        visible_text = document.get_text(separator=" ", strip=True)
        return visible_text[:max_chars]
    except Exception:
        return None


def _parse_citation_verdict(raw: str) -> dict:
    """Extract the JSON object from a judge reply, ignoring code fences and surrounding prose."""
    start, end = raw.find("{"), raw.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No JSON object in judge reply: {raw[:200]!r}")
    # Slicing from the first "{" to the last "}" guarantees a JSON *object*,
    # so callers can rely on getting a dict back.
    return json.loads(raw[start:end + 1])


def verify_citation_with_llm(response_text: str, url: str, source_content: str, config: dict) -> dict:
    """Ask the judge LLM whether a cited source supports the text around its citation.

    Args:
        response_text: Full research output containing the citation.
        url: The cited URL; used to locate the surrounding claim text.
        source_content: Text extracted from the cited page (first 2000 chars are sent).
        config: Reads ``judge_provider`` ("openai" or "anthropic") and ``judge_model``.

    Returns:
        The judge's JSON object (``verdict``, ``confidence``, ``reasoning``).
        Any failure (unsupported provider, API error, unparseable reply) is
        reported as ``{"verdict": "error", "confidence": 0, "reasoning": ...}``
        rather than raised.
    """

    # Extract text near the URL reference
    url_pos = response_text.find(url)
    if url_pos == -1:
        # Try partial URL match (host, then first path segment)
        for part in url.split("/")[2:4]:
            if part in response_text:
                url_pos = response_text.find(part)
                break

    # Up to 500 chars either side of the citation; if it cannot be located,
    # fall back to the first 1000 chars of the response.
    context_start = max(0, url_pos - 500) if url_pos >= 0 else 0
    context_end = min(len(response_text), url_pos + 500) if url_pos >= 0 else 1000
    claim_context = response_text[context_start:context_end]

    prompt = f"""Does the source content support the claims made in the research text near this citation?

RESEARCH TEXT (near citation):
{claim_context}

SOURCE CONTENT:
{source_content[:2000]}

Return ONLY valid JSON:
{{"verdict": "supported" or "partially_supported" or "unsupported" or "unrelated", "confidence": 0.0 to 1.0, "reasoning": "brief explanation"}}"""

    provider = config.get("judge_provider", "openai")
    try:
        if provider == "openai":
            from openai import OpenAI
            client = OpenAI()
            resp = client.chat.completions.create(
                model=config.get("judge_model", "gpt-4o"),
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
            )
            raw = resp.choices[0].message.content.strip()
        elif provider == "anthropic":
            from anthropic import Anthropic
            client = Anthropic()
            resp = client.messages.create(
                model=config.get("judge_model", "claude-sonnet-4-5-20250929"),
                max_tokens=300,
                messages=[{"role": "user", "content": prompt}],
            )
            raw = resp.content[0].text.strip()
        else:
            return {"verdict": "error", "confidence": 0, "reasoning": f"Unsupported provider: {provider}"}

        return _parse_citation_verdict(raw)
    except Exception as e:
        return {"verdict": "error", "confidence": 0, "reasoning": str(e)}

print("Citation verification functions ready.")

In [None]:
# Cell 22: Run full citation verification pipeline

# Destination for the per-query citation verification report
CITATION_RESULTS_FILE = os.path.join(
    CONFIG["output_dir"],
    f"citation_verification_{CONFIG['system_under_test']}_{CONFIG['run_id']}.json"
)

# Cap on URLs verified per query, to bound run time and judge-LLM cost
MAX_URLS_PER_QUERY = 10

citation_results = []

for result in results:
    if not result.get("response_text"):
        continue

    urls = extract_urls(result["response_text"])
    if not urls:
        # Record an all-zero row so the query still shows up in the summary
        citation_results.append({
            "query_id": result["query_id"],
            "total_citations": 0,
            "alive": 0,
            "dead": 0,
            "supported": 0,
            "unsupported": 0,
            "details": [],
        })
        continue

    print(f"Verifying {len(urls)} citations for {result['query_id']}...", end=" ")

    query_citations = []
    for url in urls[:MAX_URLS_PER_QUERY]:
        alive_check = check_url_alive(url)

        # Stays "not_checked" when the URL is dead or its text cannot be fetched
        verification = {"verdict": "not_checked"}
        if alive_check["alive"]:
            page_text = fetch_page_text(url)
            if page_text:
                verification = verify_citation_with_llm(
                    result["response_text"], url, page_text, CONFIG
                )
                if not isinstance(verification, dict):
                    # The judge replied with valid JSON that is not an object;
                    # record it as an error instead of crashing the whole run
                    # before anything has been saved.
                    verification = {"verdict": "error", "confidence": 0}

        query_citations.append({
            "url": url,
            "alive": alive_check["alive"],
            "status_code": alive_check["status_code"],
            "verdict": verification.get("verdict", "not_checked"),
            "confidence": verification.get("confidence", 0),
        })

    alive_count = sum(1 for c in query_citations if c["alive"])
    # Partial support counts as supported
    supported_count = sum(1 for c in query_citations if c["verdict"] in ["supported", "partially_supported"])

    citation_results.append({
        "query_id": result["query_id"],
        "total_citations": len(query_citations),
        "alive": alive_count,
        "dead": len(query_citations) - alive_count,
        "supported": supported_count,
        "unsupported": sum(1 for c in query_citations if c["verdict"] == "unsupported"),
        "details": query_citations,
    })

    print(f"alive={alive_count}/{len(query_citations)}, supported={supported_count}")

# Save (default=str because status_code may hold non-JSON values)
with open(CITATION_RESULTS_FILE, "w", encoding="utf-8") as f:
    json.dump(citation_results, f, indent=2, ensure_ascii=False, default=str)

print(f"\nCitation results saved to: {CITATION_RESULTS_FILE}")

In [None]:
# Cell 23: Display citation verification results

# One summary row per query, derived from the citation verification results
df_citations = pd.DataFrame([{
    "query_id": c["query_id"],
    "total_urls": c["total_citations"],
    "alive": c["alive"],
    "dead": c["dead"],
    "supported": c["supported"],
    "unsupported": c["unsupported"],
    "alive_rate": c["alive"] / c["total_citations"] if c["total_citations"] > 0 else 0,
    # With no live URL there is nothing to support, so the rate is undefined.
    # NaN (rather than 0) keeps such queries out of the average below, and
    # matches the aggregation cell, which leaves the rate empty in this case.
    "support_rate": c["supported"] / c["alive"] if c["alive"] > 0 else float("nan"),
} for c in citation_results])

if not df_citations.empty:
    print("=== Citation Verification Summary ===")
    with_citations = df_citations[df_citations["total_urls"] > 0]

    if not with_citations.empty:
        print(f"Queries with citations: {len(with_citations)} / {len(df_citations)}")
        print(f"Average URLs per query: {with_citations['total_urls'].mean():.1f}")
        print(f"Average alive rate:     {with_citations['alive_rate'].mean():.1%}")
        print(f"Average support rate:   {with_citations['support_rate'].mean():.1%}")
        print(f"\nPer-query breakdown:")
        display(with_citations)
    else:
        print("No citations found in any responses.")
else:
    print("No citation results yet.")

---
## Section 7: Automated Evaluation - DeepEval Metrics (Optional)

Uses the DeepEval library for faithfulness and answer relevancy metrics. These metrics work **without ground truth**:
- **FaithfulnessMetric**: Are claims grounded in the cited sources?
- **AnswerRelevancyMetric**: Does the response address the question?

In [None]:
# Cell 25: Run DeepEval metrics

# DeepEval is optional: when it is missing the section is skipped, not failed
try:
    from deepeval import evaluate
    from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
    from deepeval.test_case import LLMTestCase

    DEEPEVAL_AVAILABLE = True
    print("DeepEval loaded successfully.")
except ImportError:
    DEEPEVAL_AVAILABLE = False
    print("DeepEval not installed. Run: pip install deepeval")
    print("Skipping this section.")

deepeval_scores = []

if DEEPEVAL_AVAILABLE:
    relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
    faithfulness_metric = FaithfulnessMetric(threshold=0.5)

    for result in results:
        if not result.get("response_text"):
            continue

        print(f"  DeepEval scoring {result['query_id']}...", end=" ")

        # Extract source content as retrieval context (at most 5 entries).
        # Entries that are not plain strings are serialised to JSON text so
        # the test case always receives a list of strings.
        sources = result.get("sources", [])
        if isinstance(sources, list) and sources:
            retrieval_context = [
                s if isinstance(s, str) else json.dumps(s, ensure_ascii=False, default=str)
                for s in sources[:5]
            ]
        else:
            # NOTE(review): with no sources the response is judged against its
            # own first 2000 chars, so faithfulness is self-referential here.
            retrieval_context = [result["response_text"][:2000]]

        try:
            # Built inside the try so one malformed result is recorded as an
            # error entry instead of aborting the remaining queries.
            test_case = LLMTestCase(
                input=result["query"],
                actual_output=result["response_text"][:5000],
                retrieval_context=retrieval_context,
            )

            relevancy_metric.measure(test_case)
            faithfulness_metric.measure(test_case)

            deepeval_scores.append({
                "query_id": result["query_id"],
                "relevancy": relevancy_metric.score,
                "faithfulness": faithfulness_metric.score,
                "relevancy_reason": relevancy_metric.reason,
                "faithfulness_reason": faithfulness_metric.reason,
            })
            print(f"rel={relevancy_metric.score:.2f}, faith={faithfulness_metric.score:.2f}")
        except Exception as e:
            print(f"ERROR: {e}")
            deepeval_scores.append({
                "query_id": result["query_id"],
                "error": str(e),
            })

    # Save (default=str mirrors the citation report and guards against
    # non-JSON-native score values)
    deepeval_file = os.path.join(CONFIG["output_dir"], f"deepeval_{CONFIG['system_under_test']}_{CONFIG['run_id']}.json")
    with open(deepeval_file, "w", encoding="utf-8") as f:
        json.dump(deepeval_scores, f, indent=2, ensure_ascii=False, default=str)
    print(f"\nDeepEval scores saved to: {deepeval_file}")

In [None]:
# Cell 26: Display DeepEval scores

# Summarise the DeepEval run; entries that recorded an error are left out.
if not deepeval_scores:
    print("No DeepEval scores available. Install deepeval and re-run Section 7.")
else:
    valid_de = list(filter(lambda s: "error" not in s, deepeval_scores))
    df_de = pd.DataFrame(valid_de)

    if df_de.empty:
        print("No valid DeepEval scores.")
    else:
        print("=== DeepEval Scores ===")
        print("Average Relevancy:    {:.2f}".format(df_de['relevancy'].mean()))
        print("Average Faithfulness: {:.2f}".format(df_de['faithfulness'].mean()))
        print()
        display(df_de[["query_id", "relevancy", "faithfulness"]])

---
## Section 8: Manual Verification - Freshness Check

For queries in the `manual_verification` category, you can personally check if the returned data is current. Review each response and assign a freshness score:
- **Fresh**: Data is current (within last 7 days)
- **Slightly Stale**: Data is 1-4 weeks old
- **Stale**: Data is 1-6 months old
- **Outdated**: Data is 6+ months old or incorrect

In [None]:
# Cell 28: Display manual verification queries with responses

# Local import: html.escape is only needed for this rendering cell
from html import escape as html_escape

# Results whose freshness has to be judged by a human reviewer
manual_results = [r for r in results if r.get("category") == "manual_verification"]

if manual_results:
    print(f"=== Manual Verification Queries ({len(manual_results)} queries) ===")
    print("Review each response below and assign a freshness score.\n")

    for r in manual_results:
        # Queries and model output are untrusted text: escape them so that
        # "<", "&" or stray tags cannot break (or inject into) the card markup.
        safe_id = html_escape(str(r['query_id']))
        safe_query = html_escape(str(r['query']))
        safe_subcategory = html_escape(str(r.get('subcategory', 'N/A')))
        # "or" also covers a response_text that is present but None/empty;
        # truncate first so an escaped entity is never cut in half
        safe_preview = html_escape(str(r.get('response_text') or 'No response')[:500])

        display(HTML(f"""
        <div style='border: 1px solid #ccc; padding: 12px; margin: 8px 0; border-radius: 4px;'>
            <h4>[{safe_id}] {safe_query}</h4>
            <p><b>Subcategory:</b> {safe_subcategory}</p>
            <p><b>Response (first 500 chars):</b></p>
            <pre style='white-space: pre-wrap; background: #f5f5f5; padding: 8px;'>{safe_preview}</pre>
        </div>
        """))
else:
    print("No manual verification results yet. Run Section 4 first.")

In [None]:
# Cell 29: Enter manual freshness scores
# Update the scores dict below after reviewing the responses above.

# Options: "fresh", "slightly_stale", "stale", "outdated"
# Reviewer-entered labels, keyed by query_id. Uncomment / add entries after
# reading the responses shown by the previous cell.
MANUAL_FRESHNESS_SCORES = {
    # "G1a": "fresh",
    # "G1b": "slightly_stale",
    # "G1c": "stale",
    # "G2a": "fresh",
    # "G3a": "fresh",
    # "G3b": "outdated",
    # "G5a": "fresh",
    # "G5b": "slightly_stale",
}

# Numeric scale used for aggregation (higher = fresher); unknown labels map to 0
freshness_to_score = dict(fresh=4, slightly_stale=3, stale=2, outdated=1)

if not MANUAL_FRESHNESS_SCORES:
    print("No freshness scores entered yet. Fill in MANUAL_FRESHNESS_SCORES above and re-run.")
else:
    freshness_data = [
        {
            "query_id": qid,
            "freshness": label,
            "freshness_score": freshness_to_score.get(label, 0),
        }
        for qid, label in MANUAL_FRESHNESS_SCORES.items()
    ]

    df_freshness = pd.DataFrame(freshness_data)
    mean_freshness = df_freshness['freshness_score'].mean()
    print("=== Manual Freshness Scores ===")
    print(f"Average freshness score: {mean_freshness:.2f} / 4.0")
    display(df_freshness)

    # Persist the labelled rows next to the other run artefacts
    freshness_file = os.path.join(CONFIG["output_dir"], f"freshness_{CONFIG['system_under_test']}_{CONFIG['run_id']}.json")
    with open(freshness_file, "w", encoding="utf-8") as f:
        json.dump(freshness_data, f, indent=2)
    print(f"Saved to: {freshness_file}")

---
## Section 9: Results Aggregation & Visualization

Combine all evaluation scores and generate visual reports.

In [None]:
# Cell 31: Aggregate all scores into a single DataFrame

# Start with basic result info
# Index every score list by query_id once, instead of rescanning each list for
# every result. Iterating in reverse makes the FIRST entry for a query_id win,
# which is what a first-match scan would return. Errored judge / DeepEval
# entries are excluded.
judge_by_id = {s.get("query_id"): s for s in reversed(judge_scores) if "error" not in s}
citation_by_id = {c["query_id"]: c for c in reversed(citation_results)}
deepeval_by_id = {s.get("query_id"): s for s in reversed(deepeval_scores) if "error" not in s}

# Start with basic result info
agg_data = []
for r in results:
    row = {
        "query_id": r["query_id"],
        "category": r.get("category", ""),
        "subcategory": r.get("subcategory", ""),
        "timing_seconds": r.get("timing_seconds", 0),
        "word_count": r.get("word_count", 0),
        "has_error": bool(r.get("error")),
    }

    # Merge judge scores
    judge = judge_by_id.get(r["query_id"])
    if judge:
        for col in ["relevancy", "depth", "source_quality", "coherence", "confidence_calibration"]:
            row[f"judge_{col}"] = judge.get(col)

    # Merge citation scores (rates stay empty when their denominator is zero)
    citation = citation_by_id.get(r["query_id"])
    if citation:
        row["citation_count"] = citation["total_citations"]
        row["citation_alive_rate"] = citation["alive"] / citation["total_citations"] if citation["total_citations"] > 0 else None
        row["citation_support_rate"] = citation["supported"] / citation["alive"] if citation["alive"] > 0 else None

    # Merge DeepEval scores
    de = deepeval_by_id.get(r["query_id"])
    if de:
        row["deepeval_relevancy"] = de.get("relevancy")
        row["deepeval_faithfulness"] = de.get("faithfulness")

    # Merge freshness scores
    if r["query_id"] in MANUAL_FRESHNESS_SCORES:
        row["freshness"] = MANUAL_FRESHNESS_SCORES[r["query_id"]]
        row["freshness_score"] = freshness_to_score.get(MANUAL_FRESHNESS_SCORES[r["query_id"]], 0)

    agg_data.append(row)

df_agg = pd.DataFrame(agg_data)

print(f"=== Aggregated Results ({len(df_agg)} queries) ===")
display(df_agg.describe().round(2))

In [None]:
# Cell 32: Radar chart - overall system performance

# Per-dimension judge columns. "judge_avg" is a derived column that the bar
# chart cell adds to df_agg; it is excluded so that re-running this cell
# afterwards does not plot it as an extra dimension (or feed it back into
# the per-row average).
judge_cols = [c for c in df_agg.columns if c.startswith("judge_") and c != "judge_avg"]

if judge_cols:
    avg_scores = df_agg[judge_cols].mean()
    categories_radar = [c.replace("judge_", "").replace("_", " ").title() for c in judge_cols]
    values = avg_scores.values.tolist()
    values.append(values[0])  # Close the polygon
    categories_radar.append(categories_radar[0])

    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=categories_radar,
        fill='toself',
        name=CONFIG["system_under_test"],
        line_color='rgb(31, 119, 180)',
    ))

    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 5])),
        showlegend=True,
        title=f"System Performance Radar - {CONFIG['system_under_test']}",
    )
    fig.show()
else:
    print("No judge scores available for radar chart. Run Section 5 first.")

In [None]:
# Cell 33: Bar charts - scores by category

if judge_cols and not df_agg.empty:
    # Average judge score per query, then per category.
    # This cell writes "judge_avg" into df_agg. If judge_cols is recomputed
    # after a previous run it would contain that column, so drop it here to
    # keep the average from including itself.
    score_cols = [c for c in judge_cols if c != "judge_avg"]
    df_agg["judge_avg"] = df_agg[score_cols].mean(axis=1)

    cat_avg = df_agg.groupby("category")["judge_avg"].mean().sort_values(ascending=True)

    fig = px.bar(
        x=cat_avg.values,
        y=cat_avg.index,
        orientation='h',
        title=f"Average Quality Score by Category - {CONFIG['system_under_test']}",
        labels={"x": "Average Score (1-5)", "y": "Category"},
    )
    fig.update_layout(xaxis_range=[0, 5])
    fig.show()
else:
    print("No data for bar chart.")

In [None]:
# Cell 34: Latency analysis

# Latency overview: histogram, summary statistics, then a per-category box plot.
if df_agg.empty or not (df_agg["timing_seconds"].sum() > 0):
    print("No timing data available.")
else:
    system_name = CONFIG['system_under_test']

    fig = px.histogram(
        df_agg,
        x="timing_seconds",
        nbins=20,
        title=f"Response Time Distribution - {system_name}",
        labels={"timing_seconds": "Response Time (seconds)"},
    )
    fig.show()

    timing = df_agg['timing_seconds']
    print("Latency Statistics:")
    print(f"  Mean:   {timing.mean():.1f}s")
    print(f"  Median: {timing.median():.1f}s")
    print(f"  P95:    {timing.quantile(0.95):.1f}s")
    print(f"  Max:    {timing.max():.1f}s")

    # Latency by category
    fig2 = px.box(
        df_agg,
        x="category",
        y="timing_seconds",
        title=f"Response Time by Category - {system_name}",
        labels={"timing_seconds": "Response Time (seconds)", "category": "Category"},
    )
    fig2.show()

In [None]:
# Cell 35: Save final aggregated results

# Both exports share one path stem and differ only in their extension.
final_stem = os.path.join(
    CONFIG["output_dir"],
    f"final_aggregated_{CONFIG['system_under_test']}_{CONFIG['run_id']}",
)
final_json = final_stem + ".json"
final_csv = final_stem + ".csv"

df_agg.to_json(final_json, orient="records", indent=2)
df_agg.to_csv(final_csv, index=False)

print("Final results saved:")
print(f"  JSON: {final_json}")
print(f"  CSV:  {final_csv}")

# List everything the run has written so far, with file sizes
print("\nOutput directory contents:")
for f in sorted(os.listdir(CONFIG["output_dir"])):
    size = os.path.getsize(os.path.join(CONFIG["output_dir"], f))
    print("  {} ({:,} bytes)".format(f, size))

---
## Section 10: Cross-System Comparison

**Run this section only after you have tested BOTH systems.**

Load results from both the sf EDR and Open Deep Research runs, then compare them head-to-head with pairwise LLM judging and visual overlays.

In [None]:
# Cell 37: Load results from both systems

# Update these paths to point to your saved result files
# Update these paths to point to your saved result files
sf_RESULTS_FILE = ""  # e.g. "validation_results/results_sf_edr_20260220_143000.json"
OPEN_DR_RESULTS_FILE = ""     # e.g. "validation_results/results_open_deep_research_20260221_100000.json"

# The comparison names (sf_lookup, odr_lookup, common_ids) are only defined
# once both paths are set; the pairwise cell checks for common_ids first.
if sf_RESULTS_FILE and OPEN_DR_RESULTS_FILE:
    with open(sf_RESULTS_FILE, "r", encoding="utf-8") as f:
        sf_results = json.load(f)
    with open(OPEN_DR_RESULTS_FILE, "r", encoding="utf-8") as f:
        odr_results = json.load(f)

    print(f"sf EDR results: {len(sf_results)} queries")
    print(f"Open Deep Research results: {len(odr_results)} queries")

    # Build lookup by query_id; only queries answered by both systems are compared
    sf_lookup = {r["query_id"]: r for r in sf_results}
    odr_lookup = {r["query_id"]: r for r in odr_results}
    common_ids = set(sf_lookup.keys()) & set(odr_lookup.keys())
    print(f"Common queries: {len(common_ids)}")
else:
    print("Please set sf_RESULTS_FILE and OPEN_DR_RESULTS_FILE paths above.")
    print("Look in your validation_results/ directory for the JSON files.")

In [None]:
# Cell 38: Pairwise LLM comparison

def pairwise_compare(query: str, response_a: str, response_b: str, config: dict) -> dict:
    """Ask the judge LLM to compare two responses head-to-head.

    Args:
        query: The research question both responses answer.
        response_a: First response (only the first 4000 chars are sent).
        response_b: Second response (only the first 4000 chars are sent).
        config: Reads ``judge_provider`` ("openai" or "anthropic") and ``judge_model``.

    Returns:
        The judge's JSON object with "A" / "B" / "Tie" for ``relevancy``,
        ``depth``, ``source_quality``, ``coherence`` and ``overall``, plus
        ``reasoning``. Any failure (unsupported provider, API error,
        unparseable reply) is returned as ``{"error": ...}`` rather than raised.
    """

    prompt = f"""You are an expert research evaluator. Compare two research outputs for the same query.

QUERY: {query}

RESPONSE A:
{response_a[:4000]}

RESPONSE B:
{response_b[:4000]}

For each dimension, indicate which response is better (A, B, or Tie):
1. relevancy: Which better addresses the query?
2. depth: Which is more thorough?
3. source_quality: Which has better citations?
4. coherence: Which is better structured?
5. overall: Which is the better research output overall?

Return ONLY valid JSON:
{{"relevancy": "A" or "B" or "Tie", "depth": "A" or "B" or "Tie", "source_quality": "A" or "B" or "Tie", "coherence": "A" or "B" or "Tie", "overall": "A" or "B" or "Tie", "reasoning": "brief explanation"}}"""

    provider = config.get("judge_provider", "openai")
    try:
        if provider == "openai":
            from openai import OpenAI
            client = OpenAI()
            resp = client.chat.completions.create(
                model=config.get("judge_model", "gpt-4o"),
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
            )
            raw = resp.choices[0].message.content.strip()
        elif provider == "anthropic":
            from anthropic import Anthropic
            client = Anthropic()
            resp = client.messages.create(
                model=config.get("judge_model", "claude-sonnet-4-5-20250929"),
                max_tokens=500,
                messages=[{"role": "user", "content": prompt}],
            )
            raw = resp.content[0].text.strip()
        else:
            return {"error": f"Unsupported provider: {provider}"}

        # Parse only the outermost {...} span: this tolerates code fences and
        # prose around the JSON, and guarantees the result is an object (dict),
        # which callers rely on when they add keys to it.
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end < start:
            return {"error": f"No JSON object in judge reply: {raw[:200]}"}
        return json.loads(raw[start:end + 1])
    except Exception as e:
        return {"error": str(e)}


# Run pairwise comparison
pairwise_results = []

# common_ids only exists after both result files have been loaded
if 'common_ids' in dir() and common_ids:
    for qid in sorted(common_ids):
        sf_resp = sf_lookup[qid].get("response_text", "")
        odr_resp = odr_lookup[qid].get("response_text", "")

        # A head-to-head needs an answer from both systems
        if not sf_resp or not odr_resp:
            continue

        query_text = sf_lookup[qid]["query"]
        print(f"Comparing {qid}...", end=" ")

        # Response A is always sf EDR, response B is always Open Deep Research
        comparison = pairwise_compare(query_text, sf_resp, odr_resp, CONFIG)
        if not isinstance(comparison, dict):
            # Keep the run alive if the judge returned JSON that is not an object
            comparison = {"error": f"Unexpected judge output: {comparison!r}"}
        comparison["query_id"] = qid
        pairwise_results.append(comparison)

        overall = comparison.get("overall", "?")
        winner = {"A": "sf", "B": "Open DR", "Tie": "Tie"}.get(overall)
        if winner is None:
            # A failed or malformed comparison is not a tie; say so explicitly
            problem = comparison.get("error", f"unexpected verdict {overall!r}")
            print(f"No verdict ({problem})")
        else:
            print(f"Winner: {winner}")

    # Save
    pairwise_file = os.path.join(CONFIG["output_dir"], f"pairwise_comparison_{CONFIG['run_id']}.json")
    with open(pairwise_file, "w", encoding="utf-8") as f:
        json.dump(pairwise_results, f, indent=2, ensure_ascii=False)
    print(f"\nPairwise results saved to: {pairwise_file}")
else:
    print("Load both system results first (Cell 37).")

In [None]:
# Cell 39: Side-by-side comparison table

if pairwise_results:
    df_pw = pd.DataFrame(pairwise_results)

    print("=== Pairwise Comparison Summary ===")
    print(f"A = sf EDR, B = Open Deep Research\n")

    # Win / tie counts per judged dimension
    for dim in ["relevancy", "depth", "source_quality", "coherence", "overall"]:
        if dim in df_pw.columns:
            counts = df_pw[dim].value_counts()
            a_wins = counts.get("A", 0)
            b_wins = counts.get("B", 0)
            ties = counts.get("Tie", 0)
            print(f"  {dim:25s}: sf={a_wins}  Open DR={b_wins}  Tie={ties}")

    print(f"\nDetailed comparison:")
    # Show only the columns that exist: when every comparison errored, the
    # verdict columns are absent and a fixed column list would raise KeyError.
    detail_cols = [
        col for col in ["query_id", "relevancy", "depth", "source_quality", "coherence", "overall", "reasoning", "error"]
        if col in df_pw.columns
    ]
    display(df_pw[detail_cols])
else:
    print("No pairwise results yet. Run Cell 38 first.")

In [None]:
# Cell 40: Overlay radar chart - both systems

# Load judge scores for both systems
# Load judge scores for both systems
sf_JUDGE_FILE = ""  # e.g. "validation_results/judge_scores_sf_edr_20260220_143000.json"
OPEN_DR_JUDGE_FILE = ""     # e.g. "validation_results/judge_scores_open_deep_research_20260221_100000.json"

if sf_JUDGE_FILE and OPEN_DR_JUDGE_FILE:
    # Explicit UTF-8, as in every other cell that reads or writes result files
    with open(sf_JUDGE_FILE, "r", encoding="utf-8") as f:
        sf_judge = json.load(f)
    with open(OPEN_DR_JUDGE_FILE, "r", encoding="utf-8") as f:
        odr_judge = json.load(f)

    dims = ["relevancy", "depth", "source_quality", "coherence", "confidence_calibration"]
    dims_display = [d.replace("_", " ").title() for d in dims]

    # Entries that recorded a judging error carry no scores
    sf_valid = [s for s in sf_judge if "error" not in s]
    odr_valid = [s for s in odr_judge if "error" not in s]

    # Build each frame once rather than once per dimension
    sf_scores = pd.DataFrame(sf_valid)
    odr_scores = pd.DataFrame(odr_valid)
    sf_avgs = [sf_scores[d].mean() for d in dims]
    odr_avgs = [odr_scores[d].mean() for d in dims]

    # Close polygons
    sf_avgs.append(sf_avgs[0])
    odr_avgs.append(odr_avgs[0])
    dims_display.append(dims_display[0])

    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(r=sf_avgs, theta=dims_display, fill='toself', name='sf EDR', line_color='blue'))
    fig.add_trace(go.Scatterpolar(r=odr_avgs, theta=dims_display, fill='toself', name='Open Deep Research', line_color='red', opacity=0.6))

    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 5])),
        title="Head-to-Head: sf EDR vs Open Deep Research",
    )
    fig.show()
else:
    print("Set sf_JUDGE_FILE and OPEN_DR_JUDGE_FILE paths to generate the overlay radar chart.")

In [None]:
# Cell 41: Final winner determination

if pairwise_results:
    dims = ["relevancy", "depth", "source_quality", "coherence", "overall"]
    # Relative importance of each dimension (sums to 1.0)
    weights = {"relevancy": 0.2, "depth": 0.2, "source_quality": 0.2, "coherence": 0.15, "overall": 0.25}

    sf_score = 0
    odr_score = 0
    ignored = 0  # verdicts that are missing or not one of A / B / Tie

    for dim in dims:
        w = weights[dim]
        for pw in pairwise_results:
            verdict = pw.get(dim)
            if verdict == "A":
                sf_score += w
            elif verdict == "B":
                odr_score += w
            elif verdict == "Tie":
                sf_score += w * 0.5
                odr_score += w * 0.5
            else:
                # Errored comparisons and malformed verdicts carry no signal;
                # scoring them as ties would pull both systems towards 50%.
                ignored += 1

    total = sf_score + odr_score
    sf_pct = sf_score / total * 100 if total > 0 else 50
    odr_pct = odr_score / total * 100 if total > 0 else 50

    print("=" * 60)
    print("FINAL RESULTS")
    print("=" * 60)
    print(f"\n  sf EDR:       {sf_pct:.1f}%")
    print(f"  Open Deep Research:   {odr_pct:.1f}%")
    if ignored:
        print(f"  (ignored {ignored} missing or invalid verdicts)")
    print()

    # A lead of 5 percentage points or less is treated as inconclusive
    if sf_pct > odr_pct + 5:
        print(f"  WINNER: sf EDR")
    elif odr_pct > sf_pct + 5:
        print(f"  WINNER: Open Deep Research")
    else:
        print(f"  RESULT: Too close to call (within 5% margin)")

    print("=" * 60)
else:
    print("Run pairwise comparison first (Cell 38).")

## Report Convention check code.

In [None]:
# Cell 1: Setup - imports and env vars
import sys, os, logging
from dotenv import load_dotenv

# Ensure project root is on path
# Ensure project root is on path
PROJECT_ROOT = os.path.abspath(".")
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Load secrets from .env (override=True: values in .env replace ones already set)
load_dotenv(os.path.join(PROJECT_ROOT, ".env"), override=True)

# Show what we loaded. Only report WHETHER the API key is present: cell output
# is saved with the notebook, so printing even a prefix of the key leaks it.
print(f"LLM_PROVIDER = {os.getenv('LLM_PROVIDER')}")
print(f"GEMINI_API_KEY = {'set' if os.getenv('GEMINI_API_KEY') else 'NOT SET'}")
print(f"Project root: {PROJECT_ROOT}")

# Enable logging so we see pipeline steps
logging.basicConfig(level=logging.INFO, format="%(message)s")

In [None]:
# Cell 2: Fake research report (hardcoded - zero LLM cost)
# Contains: a markdown table, numbers in prose, comparisons, time-series data

# Hardcoded markdown fixture that a later cell passes to
# DataAnalysisAgent.run_pipeline, so the analysis pipeline can be exercised
# without running a research query first. It deliberately mixes data layouts:
# a markdown table, figures embedded in prose, bullet lists with percentages,
# and multi-year series.
FAKE_RESEARCH_REPORT = """
# Global Electric Vehicle Market Analysis 2024

## Market Overview

The global electric vehicle (EV) market reached $388.1 billion in 2024, growing at a CAGR of 17.8%.
China dominated with 59% market share, followed by Europe at 25% and North America at 12%.

## Sales by Region (2024)

| Region | Units Sold (millions) | Market Share (%) | YoY Growth (%) |
|--------|----------------------|------------------|----------------|
| China | 8.9 | 59 | 22.4 |
| Europe | 3.8 | 25 | 14.2 |
| North America | 1.8 | 12 | 31.5 |
| Rest of World | 0.6 | 4 | 45.1 |

## Top Manufacturers by Market Cap

Tesla leads with a market capitalization of $785 billion, followed by BYD at $98 billion,
Rivian at $14 billion, and NIO at $9.2 billion. Li Auto reported $21 billion market cap
while XPeng stood at $8.5 billion.

## Battery Technology Trends

Average battery pack cost declined from $153/kWh in 2022 to $139/kWh in 2023 and $128/kWh in 2024.
Lithium iron phosphate (LFP) batteries now account for 40% of the market, up from 30% in 2023.
Energy density improved from 250 Wh/kg in 2022 to 275 Wh/kg in 2023 and 295 Wh/kg in 2024.

## Charging Infrastructure

Global public charging points reached 3.9 million in 2024:
- China: 2.7 million (69%)
- Europe: 0.63 million (16%)
- North America: 0.35 million (9%)
- Rest of World: 0.22 million (6%)

## Price Comparison (Average Selling Price USD)

Tesla Model 3: $38,990. BYD Seal: $25,500. Hyundai Ioniq 5: $41,800.
Volkswagen ID.4: $39,735. Nissan Leaf: $28,040. Chevrolet Equinox EV: $33,900.
BMW iX3: $52,200. Mercedes EQA: $49,950. Kia EV6: $42,600.

## Forecast

The EV market is projected to reach $906.7 billion by 2028, with annual unit sales
expected to exceed 25 million vehicles globally.
"""

# Sanity check that the fixture was defined
print(f"Fake report ready: {len(FAKE_RESEARCH_REPORT)} chars")

In [None]:
# Run research using the LangGraph pipeline directly (no server needed)
# This invokes the same graph that conduct_research() uses internally

import uuid
from src.graph import create_graph
from src.state import SummaryState

# --- Configuration ---
# The query is edited by hand; provider and model come from the environment.
# NOTE(review): the provider default here is "gemini", while the CONFIG cell at
# the top of the notebook defaults LLM_PROVIDER to "google" - confirm which
# value the pipeline expects.
RESEARCH_QUERY = "Top 5 programming languages by popularity in 2026"  # <-- CHANGE THIS
PROVIDER = os.getenv("LLM_PROVIDER", "gemini")
MODEL = os.getenv("LLM_MODEL", None)

print(f"Query:    {RESEARCH_QUERY}")
print(f"Provider: {PROVIDER}")
print(f"Model:    {MODEL}")
print("=" * 60)

# Create graph and initial state (mirrors conduct_research logic)
graph = create_graph()

# Every field is set explicitly. The topic doubles as the first search query,
# and the optional features (steering, parallel search, uploads, database,
# analysis) are all switched off for this run.
initial_state = SummaryState(
    research_topic=RESEARCH_QUERY,
    search_query=RESEARCH_QUERY,
    running_summary="",
    research_complete=False,
    knowledge_gap="",
    research_loop_count=0,
    sources_gathered=[],
    web_research_results=[],
    search_results_empty=False,
    selected_search_tool="general_search",
    source_citations={},
    subtopic_queries=[],
    subtopics_metadata=[],
    extra_effort=False,
    minimum_effort=True,  # Keep it cheap
    benchmark_mode=False,
    llm_provider=PROVIDER,
    llm_model=MODEL,
    uploaded_knowledge=None,
    uploaded_files=[],
    steering_enabled=False,
    parallel_search_enabled=False,
    parallel_search_max_concurrency=2,
    database_info=None,
    analysis_required=False,  # We'll run analysis separately in the next cell
)

# Per-run graph configuration: fresh random thread/stream ids on every
# execution of this cell, plus the same provider/model/query as the state.
graph_config = {
    "configurable": {
        "thread_id": str(uuid.uuid4()),
        "stream_id": str(uuid.uuid4()),
        "llm_provider": PROVIDER,
        "llm_model": MODEL,
        "user_prompt": RESEARCH_QUERY,
        "database_info": None,
    },
    "recursion_limit": 100,
}

print("Starting research (this may take a few minutes)...")
import time
research_start = time.time()

# ainvoke returns the final state as a dict
# (top-level await: this cell relies on the notebook's running event loop)
final_state = await graph.ainvoke(initial_state, graph_config)

research_time = time.time() - research_start
print(f"\nResearch completed in {research_time:.1f}s")

# Extract the research report: prefer the markdown report, fall back to the
# running summary, and end up with "" when neither is present
research_report = final_state.get("markdown_report") or final_state.get("running_summary") or ""
print(f"Report length: {len(research_report)} chars")
print(f"Sources: {len(final_state.get('sources_gathered', []))}")
print("=" * 60)

In [None]:
# Preview the research report

from IPython.display import Markdown, display

# Render the head of the report; an empty report only produces a notice.
if not research_report:
    print("No research report generated.")
else:
    report_chars = len(research_report)
    print(f"Report: {report_chars} chars\n")
    # Show first 2000 chars as rendered markdown
    display(Markdown(research_report[:2000]))
    if report_chars > 2000:
        print(f"\n... (truncated, full report is {report_chars} chars)")

In [None]:
# Pass research report to the analysis pipeline (~3 LLM calls)

from extensions.agents.data_analysis_agent import DataAnalysisAgent

# Run the analysis pipeline on the research report, provided it has enough
# content (at least 100 characters) to be worth analysing.
if research_report and len(research_report) >= 100:
    divider = "=" * 60
    print(f"Passing {len(research_report)} chars of research to analysis pipeline...")
    print(divider)

    agent = DataAnalysisAgent(provider=PROVIDER, model=MODEL)
    analysis_result = agent.run_pipeline(research_report)

    # Summary of the run
    print("\n" + divider)
    print(f"Status:          {analysis_result['status']}")
    print(f"Error:           {analysis_result['error']}")
    print(f"Execution time:  {analysis_result['execution_time']:.1f}s")
    print(f"Charts created:  {len(analysis_result['charts'])}")
    print(f"Explanations:    {len(analysis_result['chart_explanations'])}")
    outlier_flag = "Yes" if analysis_result.get('outlier_analysis') else "No"
    print(f"Outlier analysis: {outlier_flag}")
    print(f"Output:          {analysis_result['output']}")
    print(divider)

    # Show extracted data
    extracted = analysis_result.get("extracted_data", "")
    if extracted:
        print("\n=== EXTRACTED DATA (first 1500 chars) ===")
        print(extracted[:1500])
else:
    print("Research report is too short or empty. Skipping analysis.")

In [None]:
# Cell 3: Build the analysis agent from environment settings and run the
# pipeline on the fake research report (~3 LLM calls)

from extensions.agents.data_analysis_agent import DataAnalysisAgent

# Provider falls back to "gemini"; model stays None when LLM_MODEL is unset.
provider = os.environ.get("LLM_PROVIDER", "gemini")
model = os.environ.get("LLM_MODEL")
print(f"Initializing agent with provider={provider}, model={model}")

agent = DataAnalysisAgent(provider=provider, model=model)
print("Agent initialized. Running pipeline...\n")

# `result` is read by the inspection, chart and summary cells that follow.
result = agent.run_pipeline(FAKE_RESEARCH_REPORT)

# Outcome summary, one field per line between two separator rules.
print(f"\n{'='*60}")
print(f"Status:          {result['status']}")
print(f"Error:           {result['error']}")
print(f"Execution time:  {result['execution_time']:.1f}s")
print(f"Charts created:  {len(result['charts'])}")
print(f"Explanations:    {len(result['chart_explanations'])}")
print(f"Outlier analysis:{' Yes' if result.get('outlier_analysis') else ' No'}")
print(f"Output:          {result['output']}")
print(f"{'='*60}")

In [None]:
# Cell 4: Show the extracted data and the data profile from the pipeline result

# Extracted data: at most the first 2000 characters, or a placeholder when empty.
print("=== EXTRACTED DATA ===")
extracted = result.get("extracted_data", "")
print(extracted[:2000] if extracted else "(no data extracted)")

# Data profile: at most the first 1000 characters, or a placeholder when empty.
print("\n=== DATA PROFILE (first 1000 chars) ===")
profile = result.get("data_profile", "")
print(profile[:1000] if profile else "(no profile)")

In [None]:
# Cell 5: Validate chart files exist on disk
#
# Lists every chart path with its size on disk (or MISSING), followed by a
# one-line preview of each chart explanation.

# `or []` also covers a result whose "charts" entry is present but None.
charts = result.get("charts") or []
print(f"Charts: {len(charts)}")
for i, path in enumerate(charts, 1):
    exists = os.path.exists(path)
    size = os.path.getsize(path) if exists else 0
    status = f"{size:,} bytes" if exists else "MISSING"
    print(f"  {i}. {path} [{status}]")

print("\nChart Explanations:")
for path, info in (result.get("chart_explanations") or {}).items():
    # Tolerate partial entries (missing title/explanation) instead of raising,
    # matching the chart display cell, which treats the title as optional.
    info = info or {}
    title = info.get("title", path)
    explanation = info.get("explanation") or ""
    # Only mark the preview as truncated when text was actually cut off.
    suffix = "..." if len(explanation) > 120 else ""
    print(f"  - {title}: {explanation[:120]}{suffix}")

In [None]:
# Cell 6: Display charts inline (Plotly HTML renders in Jupyter)
#
# Each chart file that exists on disk is read and embedded in its own iframe;
# chart paths that do not exist are skipped.

from html import escape as html_escape

from IPython.display import HTML, display

charts = result.get("charts", [])
if not charts:
    print("No charts to display.")
else:
    for i, path in enumerate(charts, 1):
        if os.path.exists(path):
            # Fall back to a numbered title when no explanation entry exists for this path.
            title = result.get("chart_explanations", {}).get(path, {}).get("title", f"Chart {i}")
            print(f"\n--- {title} ---")
            with open(path, "r", encoding="utf-8") as f:
                html_content = f.read()
            # Wrap in iframe to isolate Plotly JS.
            # The srcdoc value is an HTML attribute, so the browser decodes
            # character references in it once before parsing the document.
            # '&' must therefore be escaped along with '"'; escaping only the
            # quote would turn any '&amp;' / '&lt;' / '&quot;' already present
            # in the chart HTML or its scripts into different text.
            srcdoc = html_escape(html_content, quote=True)
            iframe = f'<iframe srcdoc="{srcdoc}" width="100%" height="500" frameborder="0"></iframe>'
            display(HTML(iframe))

In [None]:
# Cell 7: Test HTML report builder (zero LLM calls - just builds HTML)
#
# Builds the HTML report from the pipeline result and prints its location.
# Skipped when the pipeline produced no charts.

charts = result.get("charts", [])
if charts:
    from extensions.utils.report_builder import build_html_report

    report_path = build_html_report(
        display_text=FAKE_RESEARCH_REPORT,
        analysis_output=result["output"],
        figures=result["charts"],
        chart_explanations=result["chart_explanations"],
        sources=[],
        query="EV Market Analysis Test",
        sub_queries=[],
        extracted_data_summary=result.get("extracted_data", ""),
        data_profile_summary=result.get("data_profile", ""),
    )
    print(f"HTML report generated: {report_path}")
    print(f"File size: {os.path.getsize(report_path):,} bytes")
    # Let pathlib build the file URL: prefixing "file:///" to an absolute POSIX
    # path yields four slashes, and spaces in the path were left unencoded.
    report_url = Path(os.path.abspath(report_path)).as_uri()
    print(f"\nOpen in browser: {report_url}")
else:
    print("Skipping report builder - no charts were generated.")

In [None]:
# Cell 8: Test intent detection (~4 LLM calls)
# Verifies that analysis_required=True for analysis queries, False for research-only

import asyncio
from services.intent_detector import detect_intent

test_queries = [
    ("Research AI market trends", False),
    ("Research AI market trends and analyze with charts", True),
    ("Tell me about quantum computing", False),
    ("Pollution data - analyze and visualize", True),
    ("Research on Environment Pollution  - Generate visualizations", True),

]

print("Intent Detection Tests:")
print("-" * 70)
for query, expected in test_queries:
    result_intent = await detect_intent(query, provider=provider, model=model)
    actual = result_intent["analysis_required"]
    match = "PASS" if actual == expected else "FAIL"
    print(f"  [{match}] '{query[:50]}' -> {actual} (expected {expected})")

In [None]:
# Cell 9: Summary - pass/fail checklist
#
# Every check is evaluated defensively: a field that is missing or None counts
# as a FAIL instead of raising, so the checklist still prints after a failed
# pipeline run.

print("\n" + "=" * 60)
print("PIPELINE TEST SUMMARY")
print("=" * 60)

# `or <empty>` covers keys that are present with a None value, which a
# `.get(key, default)` default alone would not.
chart_paths = result.get("charts") or []

checks = [
    ("Pipeline completed without error", result.get("status") == "completed"),
    ("Data was extracted", bool((result.get("extracted_data") or "").strip())),
    ("Data was profiled", bool((result.get("data_profile") or "").strip())),
    ("At least 1 chart created", len(chart_paths) >= 1),
    # Vacuously true for an empty chart list; the previous check covers that case.
    ("All chart files exist on disk", all(os.path.exists(p) for p in chart_paths)),
    ("Chart explanations populated", len(result.get("chart_explanations") or {}) >= 1),
    ("Execution time recorded", (result.get("execution_time") or 0) > 0),
]

for desc, passed in checks:
    status = "PASS" if passed else "FAIL"
    print(f"  [{status}] {desc}")

all_pass = all(passed for _, passed in checks)

print("\n" + ("ALL CHECKS PASSED" if all_pass else "SOME CHECKS FAILED"))
print("=" * 60)