# Open Deep Research Validation Framework

This notebook provides a comprehensive framework for evaluating the Open Deep Research agent application.

**System Under Test:**
- Open Deep Research agent (driven through `MasterAgent`)

**Approach:** Sequential testing with incremental result saving. No ground truth required.

**Evaluation Methods:**
1. LLM-as-Judge (automated quality scoring)
2. Citation Verification (URL liveness + content support)
3. DeepEval Metrics (faithfulness, relevancy)
4. Manual Verification (freshness/recency checks)
5. Cross-System Comparison (compare two validation runs head-to-head)

In [None]:
# Cell 2: Install dependencies
# Uncomment and run if not already installed

# %pip install deepeval ragas openai anthropic requests beautifulsoup4 plotly pandas python-dotenv

In [None]:
# Cell 3: Imports and environment setup
import sys
import os
import json
import re
import time
import asyncio
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional

import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from IPython.display import display, HTML, Markdown
import requests
from bs4 import BeautifulSoup

# Project paths
# The checkout location is machine-specific. Set the ODR_PROJECT_ROOT environment
# variable to point at your checkout instead of editing this cell; the literal
# below is only the fallback.
PROJECT_ROOT = Path(
    os.environ.get("ODR_PROJECT_ROOT", r"xx\open_deep_research-main\open_deep_research-main")
)
SRC_DIR = PROJECT_ROOT / "src"

# Fail early with an actionable message rather than the bare OS error from os.chdir.
if not PROJECT_ROOT.is_dir():
    raise FileNotFoundError(
        f"PROJECT_ROOT does not exist: {PROJECT_ROOT}. "
        "Set the ODR_PROJECT_ROOT environment variable or edit PROJECT_ROOT in this cell."
    )

# NOTE: changes the kernel's working directory for every later cell.
os.chdir(PROJECT_ROOT)
if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

# Load environment variables
from dotenv import load_dotenv
load_dotenv(dotenv_path=PROJECT_ROOT / ".env")

# Import MasterAgent and utilities
# These imports come after the sys.path insertion above; presumably the
# `extensions` package lives under SRC_DIR -- confirm against the repository layout.
from extensions.agents.master_agent import MasterAgent
from extensions.utils.report_builder import build_html_report

print(f"Project root: {PROJECT_ROOT}")
print(f"Source dir:   {SRC_DIR}")
print(f"Environment loaded from: {PROJECT_ROOT / '.env'}")
print("MasterAgent imported successfully")

In [None]:
# Cell 4: Configuration

# Central configuration read by the adapters, the judge and the output writers.
CONFIG = {
    # System under test
    "system_under_test": "open_deep_research",

    # Open Deep Research settings
    "odr_use_enhanced_research": False,   # Set True for 3-4x more comprehensive (slower)
    "odr_provider": os.environ.get("LLM_PROVIDER", "google"),
    "odr_model": os.environ.get("LLM_MODEL", "gemini-2.5-pro"),

    # LLM Judge settings (use a different LLM than the system being tested)
    "judge_provider": "openai",   # or "anthropic"
    "judge_model": "gpt-4o",      # or "claude-sonnet-4-5-20250929"

    # Output settings
    "output_dir": str(PROJECT_ROOT / "validation_results"),
    # Output file names embed run_id. A fresh timestamp means a restarted kernel
    # never finds the previous results file, so set VALIDATION_RUN_ID to the
    # earlier run's id when you want to resume it.
    "run_id": os.environ.get("VALIDATION_RUN_ID") or datetime.now().strftime("%Y%m%d_%H%M%S"),
}

# Create output directory
os.makedirs(CONFIG["output_dir"], exist_ok=True)

print("Configuration:")
for k, v in CONFIG.items():
    print(f"  {k}: {v}")

---
## Section 2: Test Query Bank

All test queries organized by category. Each query has:
- `id`: Unique identifier
- `query`: The research question
- `category`: High-level category (factual, stress, manual_verification, etc.)
- `subcategory`: Specific test type within the category

In [None]:
# Cell 6: Define all test queries

# Query bank: a flat list of dicts, one per test query. Every entry carries the
# same four string keys: "id", "query", "category" and "subcategory".
# IDs are prefixed by section letter (A-G). The numbering has gaps: there are no
# D6, D8 or G4 entries in this list.
TEST_QUERIES = [
    # === A. Factual / Verifiable Queries ===
    {"id": "A1", "query": "What were the top 5 causes of the 2008 financial crisis, with supporting data?", "category": "factual", "subcategory": "verifiable"},
    {"id": "A2", "query": "List all Nobel Prize winners in Physics from 2020-2024 and their contributions.", "category": "factual", "subcategory": "verifiable"},
    {"id": "A3", "query": "What is the current market share of cloud providers (AWS, Azure, GCP) as of 2024?", "category": "factual", "subcategory": "verifiable"},
    {"id": "A4", "query": "Summarize the key provisions of the EU AI Act.", "category": "factual", "subcategory": "verifiable"},
    {"id": "A5", "query": "What are the FDA-approved treatments for Type 2 diabetes as of 2024?", "category": "factual", "subcategory": "verifiable"},
    
    # === B. Multi-Hop Reasoning Queries ===
    {"id": "B1", "query": "How did the semiconductor chip shortage of 2020-2023 affect both the automotive and gaming industries differently?", "category": "multi_hop", "subcategory": "synthesis"},
    {"id": "B2", "query": "Compare the economic policies of the US and EU regarding AI regulation and their downstream effects on startup funding.", "category": "multi_hop", "subcategory": "synthesis"},
    {"id": "B3", "query": "Trace the supply chain of lithium from mining to EV batteries - what are the geopolitical risks at each stage?", "category": "multi_hop", "subcategory": "synthesis"},
    
    # === C. Ambiguous / Open-Ended Queries ===
    {"id": "C1", "query": "What's the best programming language?", "category": "ambiguous", "subcategory": "vague"},
    {"id": "C2", "query": "Is AI dangerous?", "category": "ambiguous", "subcategory": "vague"},
    {"id": "C3", "query": "Tell me about the Apple situation", "category": "ambiguous", "subcategory": "ambiguous_entity"},
    {"id": "C4", "query": "What happened recently in tech?", "category": "ambiguous", "subcategory": "vague"},
    
    # === D. Stress Tests ===
    # D1: Contradictory sources
    {"id": "D1a", "query": "Is coffee good or bad for health? Provide evidence for both sides.", "category": "stress", "subcategory": "contradictory"},
    {"id": "D1b", "query": "Is remote work more productive than in-office? What does the research say?", "category": "stress", "subcategory": "contradictory"},
    {"id": "D1c", "query": "Are electric vehicles truly better for the environment when considering full lifecycle?", "category": "stress", "subcategory": "contradictory"},
    
    # D2: Obscure topics
    {"id": "D2a", "query": "What is the history of the Voynich Manuscript's ownership chain?", "category": "stress", "subcategory": "obscure"},
    {"id": "D2b", "query": "Describe the political structure of the Principality of Sealand.", "category": "stress", "subcategory": "obscure"},
    {"id": "D2c", "query": "What are the known side effects of the drug Zuranolone in postpartum depression?", "category": "stress", "subcategory": "obscure"},
    {"id": "D2d", "query": "Summarize the contributions of Srinivasa Ramanujan's lost notebook to number theory.", "category": "stress", "subcategory": "obscure"},
    
    # D3: Very recent events
    {"id": "D3a", "query": "What were the most significant tech industry events in the past 48 hours?", "category": "stress", "subcategory": "freshness"},
    {"id": "D3b", "query": "What is the latest stock price movement for NVIDIA and why?", "category": "stress", "subcategory": "freshness"},
    
    # D4: Highly technical
    {"id": "D4a", "query": "Explain the differences between LoRA, QLoRA, and DoRA fine-tuning methods with benchmark comparisons.", "category": "stress", "subcategory": "technical"},
    {"id": "D4b", "query": "Compare the architectures of Mamba, RWKV, and Transformer models for sequence modeling.", "category": "stress", "subcategory": "technical"},
    {"id": "D4c", "query": "What is the current state of topological quantum computing at Microsoft and IBM?", "category": "stress", "subcategory": "technical"},
    
    # D5: Long-form output
    {"id": "D5a", "query": "Write a comprehensive 5000-word research report on quantum computing's impact on cryptography.", "category": "stress", "subcategory": "long_form"},
    
    # D7: Edge cases
    {"id": "D7a", "query": "research", "category": "stress", "subcategory": "edge_case"},
    {"id": "D7b", "query": "asdfghjkl qwerty zxcvbnm research this", "category": "stress", "subcategory": "edge_case"},
    {"id": "D7c", "query": "Write a short but comprehensive 10,000-word summary", "category": "stress", "subcategory": "edge_case"},
    
    # D9: Multi-language
    {"id": "D9a", "query": "Summarize the key findings of recent Chinese AI research papers on large language models.", "category": "stress", "subcategory": "multi_language"},
    {"id": "D9b", "query": "What are the latest German automotive industry reports on EV adoption?", "category": "stress", "subcategory": "multi_language"},
    
    # D10: Numerical accuracy
    {"id": "D10a", "query": "What were the exact GDP growth rates for G7 countries in 2024 Q1-Q4?", "category": "stress", "subcategory": "numerical"},
    {"id": "D10b", "query": "List the top 10 most funded AI startups in 2024 with their exact funding amounts.", "category": "stress", "subcategory": "numerical"},
    {"id": "D10c", "query": "What are the current interest rates set by the Fed, ECB, and Bank of Japan?", "category": "stress", "subcategory": "numerical"},
    
    # === E. Domain-Specific Queries ===
    {"id": "E1", "query": "What are the key differences between GDPR and CCPA?", "category": "domain_specific", "subcategory": "legal"},
    {"id": "E2", "query": "What is the current evidence on intermittent fasting for cardiovascular health?", "category": "domain_specific", "subcategory": "medical"},
    {"id": "E3", "query": "Analyze Tesla's Q3 2024 earnings - what are the key takeaways?", "category": "domain_specific", "subcategory": "financial"},
    {"id": "E4", "query": "What is the current state of nuclear fusion research?", "category": "domain_specific", "subcategory": "scientific"},
    
    # === F. Recent Research Generation ===
    {"id": "F1", "query": "Find and summarize the 5 most recent research papers on LLM hallucination mitigation published in 2025-2026.", "category": "recent_research", "subcategory": "papers"},
    {"id": "F2", "query": "What are the latest breakthroughs in solid-state batteries from the past 6 months?", "category": "recent_research", "subcategory": "breakthroughs"},
    {"id": "F3", "query": "Summarize recent clinical trial results for GLP-1 receptor agonists in 2025.", "category": "recent_research", "subcategory": "clinical_trials"},
    
    # === G. Manual Verification Queries (Freshness Check) ===
    # G1: Live/real-time data
    {"id": "G1a", "query": "What is today's price of Bitcoin?", "category": "manual_verification", "subcategory": "realtime"},
    {"id": "G1b", "query": "What is the current USD to INR exchange rate?", "category": "manual_verification", "subcategory": "realtime"},
    {"id": "G1c", "query": "What is NVIDIA's stock price right now?", "category": "manual_verification", "subcategory": "realtime"},
    
    # G2: Recent events
    {"id": "G2a", "query": "What were the top tech news stories this week?", "category": "manual_verification", "subcategory": "recent_events"},
    
    # G3: Recently changed facts
    {"id": "G3a", "query": "What is the latest version of Python?", "category": "manual_verification", "subcategory": "changed_facts"},
    {"id": "G3b", "query": "What is the current US federal interest rate?", "category": "manual_verification", "subcategory": "changed_facts"},
    
    # G5: Trick questions
    {"id": "G5a", "query": "What is the latest iPhone model?", "category": "manual_verification", "subcategory": "trick"},
    {"id": "G5b", "query": "What is the most recent SpaceX Starship launch result?", "category": "manual_verification", "subcategory": "trick"},
]

# Summary
df_queries = pd.DataFrame(TEST_QUERIES)

# Later cells use "id" as the resume key, so a duplicated id would make one of
# the colliding queries look already completed and silently skip it.
duplicate_ids = df_queries.loc[df_queries["id"].duplicated(), "id"].tolist()
assert not duplicate_ids, f"Duplicate query ids in TEST_QUERIES: {duplicate_ids}"

print(f"Total test queries: {len(TEST_QUERIES)}")
print("\nQueries by category:")
print(df_queries['category'].value_counts().to_string())
print("\nQueries by subcategory:")
print(df_queries['subcategory'].value_counts().to_string())

---
## Section 3: Research Runner (Adapter Pattern)

One standardized interface backed by MasterAgent. The adapter returns the dict below; when a run fails, an additional `"error": str` key describes the failure:
```python
{
    "query": str,
    "response_text": str,
    "sources": List[str],
    "timing_seconds": float,
    "word_count": int,
    "metadata": dict
}
```

In [None]:
# Cell 8: SF Enterprise EDR Adapter (DISABLED)
# Set SF_EDR_AVAILABLE = True and uncomment the import if you have the SF EDR system
# and want to run cross-system comparison.

SF_EDR_AVAILABLE = False

async def run_sf_edr(query: str, config: dict) -> dict:
    """Run a query through SF Enterprise Deep Research.

    Requires benchmarks/run_research.py which is not part of this repository.
    Enable SF_EDR_AVAILABLE and configure the import if available.

    Args:
        query: The research question.
        config: Notebook configuration dict (unused by this placeholder).

    Returns:
        A dict in the standardized adapter format. Until a real implementation
        is wired in, it always carries an "error" key, so callers never receive
        ``None`` -- including when SF_EDR_AVAILABLE has been switched on.
    """
    if not SF_EDR_AVAILABLE:
        reason = "SF EDR adapter not available in this repository. Set SF_EDR_AVAILABLE = True and configure the import."
    else:
        # Uncomment below if SF EDR is available:
        # from benchmarks.run_research import run_research_sync
        # ... original implementation ...
        reason = "SF EDR adapter is enabled but its implementation has not been configured in this notebook."

    return {
        "query": query,
        "response_text": "",
        "sources": [],
        "timing_seconds": 0,
        "word_count": 0,
        "error": reason,
        "metadata": {"system": "sf_edr"}
    }
    
print("SF EDR adapter: DISABLED (set SF_EDR_AVAILABLE = True to enable)")

In [None]:
# Cell 9: Open Deep Research Adapter (MasterAgent)

# Singleton MasterAgent instance -- reused across all test queries to avoid re-initialization overhead
_master_agent = None

def _get_master_agent(config: dict) -> MasterAgent:
    """Return the shared MasterAgent, building it on the first call.

    The instance is cached in the module-level ``_master_agent``. Once it
    exists, ``config`` is not consulted again, so later config changes have no
    effect on the cached agent.
    """
    global _master_agent
    if _master_agent is not None:
        return _master_agent

    enhanced = config.get("odr_use_enhanced_research", False)
    _master_agent = MasterAgent(
        use_enhanced_research=enhanced,
        provider=config.get("odr_provider"),
        model=config.get("odr_model"),
        # No DB persistence needed during validation
        enable_state_persistence=False,
    )
    print(f"[Adapter] MasterAgent initialized (enhanced={enhanced})")
    return _master_agent


def _normalize_sources(sources_raw) -> list:
    """Flatten a mixed list of URL strings / source dicts into a list of strings."""
    sources = []
    for s in sources_raw or []:
        if isinstance(s, str):
            sources.append(s)
        elif isinstance(s, dict):
            # Prefer the URL, then the title; fall back to the dict's repr so an
            # entry is never None.
            sources.append(str(s.get("url") or s.get("title") or s))
    return sources


async def run_open_deep_research(query: str, config: dict) -> dict:
    """Run a query through Open Deep Research via MasterAgent.

    Returns the standardized adapter format consumed by all downstream evaluation sections.

    Args:
        query: The research question.
        config: Notebook configuration; "odr_provider" and "odr_model" are
            copied into the metadata and the whole dict is passed to
            ``_get_master_agent``.

    Returns:
        Dict with "query", "response_text", "sources", "timing_seconds",
        "word_count" and "metadata". An "error" key is added when the agent
        reports status "error" or when any exception is raised. Exceptions are
        never propagated to the caller.
    """
    started = time.perf_counter()
    try:
        agent = _get_master_agent(config)
        result = await agent.run_async(query)

        # `or {}` also covers a "state" key that is present but None.
        state = result.get("state") or {}
        status = result.get("status", "error")

        # Fall back to wall-clock time when the agent does not report a numeric
        # duration, so callers can always format timing_seconds as a number.
        elapsed = result.get("execution_time")
        if not isinstance(elapsed, (int, float)):
            elapsed = time.perf_counter() - started

        response_text = state.get("final_report") or ""
        sources = _normalize_sources(state.get("sources"))

        base_metadata = {
            "system": "open_deep_research",
            "provider": config.get("odr_provider"),
            "model": config.get("odr_model"),
            "conversation_id": result.get("conversation_id", ""),
            "agents_used": result.get("agents_used", []),
        }

        common = {
            "query": query,
            "response_text": response_text,
            "sources": sources,
            "timing_seconds": elapsed,
            "word_count": len(response_text.split()) if response_text else 0,
        }

        if status == "error":
            # Always give a non-empty message so callers that test
            # `result.get("error")` see the failure.
            return {
                **common,
                "error": result.get("error") or "Unknown error",
                "metadata": base_metadata,
            }

        # Include analysis artifacts in metadata for rich validation
        base_metadata.update({
            "analysis_output": state.get("analysis_output", ""),
            "charts": state.get("charts", []),
            "chart_explanations": state.get("chart_explanations", {}),
            "extracted_data": state.get("extracted_data", ""),
            "data_profile": state.get("data_profile", ""),
            "sub_queries": state.get("sub_queries", []),
        })

        return {**common, "metadata": base_metadata}
    except Exception as e:
        return {
            "query": query,
            "response_text": "",
            "sources": [],
            # Record how long the attempt ran before failing instead of 0.
            "timing_seconds": time.perf_counter() - started,
            "word_count": 0,
            "error": str(e),
            "metadata": {
                "system": "open_deep_research",
                "provider": config.get("odr_provider"),
                "model": config.get("odr_model"),
            }
        }

print("Open Deep Research adapter ready (MasterAgent).")

In [None]:
# Cell 10: Dispatcher

async def run_research(query: str, system: str, config: dict) -> dict:
    """Dispatch ``query`` to the adapter registered under ``system``.

    Raises:
        ValueError: if ``system`` is not one of the registered adapters.
    """
    available = {"open_deep_research": run_open_deep_research}

    # SF EDR is only registered when the notebook flag enables it
    if SF_EDR_AVAILABLE:
        available["sf_edr"] = run_sf_edr

    if system not in available:
        raise ValueError(f"Unknown system: {system}. Choose from: {list(available.keys())}")

    return await available[system](query, config)

print(f"Dispatcher ready. System under test: {CONFIG['system_under_test']}")

---
## Section 4: Execute Research Queries

Run all test queries through the selected system. Results are saved incrementally to JSON after each query, so no data is lost if the process is interrupted.

In [None]:
# Cell 12: Run all queries

RESULTS_FILE = os.path.join(
    CONFIG["output_dir"],
    f"results_{CONFIG['system_under_test']}_{CONFIG['run_id']}.json"
)

# Load existing results if resuming a partial run
if os.path.exists(RESULTS_FILE):
    with open(RESULTS_FILE, "r", encoding="utf-8") as f:
        results = json.load(f)
    completed_ids = {r["query_id"] for r in results}
    print(f"Resuming: {len(completed_ids)} queries already completed")
else:
    results = []
    completed_ids = set()

# Filter to queries not yet completed
pending_queries = [q for q in TEST_QUERIES if q["id"] not in completed_ids]
print(f"Queries to run: {len(pending_queries)} / {len(TEST_QUERIES)}")

for i, query_info in enumerate(pending_queries):
    print(f"\n[{i+1}/{len(pending_queries)}] Running: {query_info['id']} - {query_info['query'][:60]}...")
    
    try:
        result = await run_research(
            query=query_info["query"],
            system=CONFIG["system_under_test"],
            config=CONFIG
        )
        
        # Attach query metadata
        result["query_id"] = query_info["id"]
        result["category"] = query_info["category"]
        result["subcategory"] = query_info["subcategory"]
        result["system"] = CONFIG["system_under_test"]
        result["timestamp"] = datetime.now().isoformat()
        
        results.append(result)
        
        # Incremental save
        with open(RESULTS_FILE, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False, default=str)
        
        status = "ERROR" if result.get("error") else "OK"
        print(f"  [{status}] {result['word_count']} words, {result['timing_seconds']:.1f}s")
        
    except Exception as e:
        print(f"  [FAILED] {e}")
        results.append({
            "query_id": query_info["id"],
            "query": query_info["query"],
            "category": query_info["category"],
            "subcategory": query_info["subcategory"],
            "system": CONFIG["system_under_test"],
            "response_text": "",
            "sources": [],
            "timing_seconds": 0,
            "word_count": 0,
            "error": str(e),
            "timestamp": datetime.now().isoformat(),
        })
        with open(RESULTS_FILE, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False, default=str)

print(f"\nAll queries complete. Results saved to: {RESULTS_FILE}")

In [None]:
# Cell 13: Quick summary of results

# Tabulate the collected results and print headline counts and latency stats.
df_results = pd.DataFrame(results)

total = len(df_results)

if total == 0:
    # An empty frame has no columns, so the column lookups below would raise KeyError.
    print("No results to summarize yet - run the query execution cell first.")
else:
    # Safe error counting - handle missing 'error' column
    if 'error' in df_results.columns:
        error_mask = df_results['error'].notna() & (df_results['error'] != '')
        error_count = int(error_mask.sum())
    else:
        error_count = 0

    success_count = total - error_count

    # Coerce to numbers so a stray None or string cannot break the aggregates.
    latency = pd.to_numeric(df_results['timing_seconds'], errors='coerce')
    word_counts = pd.to_numeric(df_results['word_count'], errors='coerce')

    print("=== Execution Summary ===")
    print(f"Total queries run:   {total}")
    print(f"Successful:          {success_count}")
    print(f"Errors:              {error_count}")
    # These statistics cover every record, including failed queries.
    print(f"Average latency:     {latency.mean():.1f}s")
    print(f"Median latency:      {latency.median():.1f}s")
    print(f"Average word count:  {word_counts.mean():.0f}")
    print("\nLatency by category:")
    print(latency.groupby(df_results['category']).agg(['mean', 'median', 'max']).round(1).to_string())

---
## Section 5: Automated Evaluation - LLM-as-Judge

Uses a strong LLM (GPT-4o or Claude, selected via `judge_provider` / `judge_model` in `CONFIG`) to automatically score each research output on multiple dimensions. **No ground truth required** - the judge evaluates standalone quality.

In [None]:
# Cell 15: LLM-as-Judge scoring function

def get_judge_client(config: dict):
    """Build the judge LLM client selected by ``config["judge_provider"]``.

    Returns:
        A ``(client, model_name)`` tuple. The provider defaults to "openai" and
        the model name falls back to a per-provider default when
        ``config["judge_model"]`` is absent.

    Raises:
        ValueError: if the provider is neither "openai" nor "anthropic".
    """
    provider = config.get("judge_provider", "openai")

    if provider not in ("openai", "anthropic"):
        raise ValueError(f"Unsupported judge provider: {provider}")

    if provider == "anthropic":
        from anthropic import Anthropic
        return Anthropic(), config.get("judge_model", "claude-sonnet-4-5-20250929")

    from openai import OpenAI
    return OpenAI(), config.get("judge_model", "gpt-4o")


def _parse_judge_json(raw: str) -> dict:
    """Extract the JSON object from a judge reply, tolerating code fences and stray prose."""
    text = (raw or "").strip()

    # Parse JSON from response (handle markdown code blocks)
    if text.startswith("```"):
        text = text.split("```")[1]
        if text.startswith("json"):
            text = text[4:]

    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # The judge sometimes wraps the object in explanatory text; retry on the
        # outermost {...} span before giving up.
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise
        parsed = json.loads(text[start:end + 1])

    if not isinstance(parsed, dict):
        raise ValueError(f"Judge returned {type(parsed).__name__}, expected a JSON object")
    return parsed


def llm_judge_score(query: str, response: str, config: dict) -> dict:
    """Score a research response using an LLM judge. No ground truth needed.

    Args:
        query: The original research question.
        response: The research output to score; only the first 8000 characters
            are sent to the judge. ``None`` is treated as an empty string.
        config: Uses "judge_provider" (default "openai") and "judge_model".

    Returns:
        On success, a dict with numeric "relevancy", "depth", "source_quality",
        "coherence" and "confidence_calibration" scores plus a "reasoning"
        string. On any failure (unsupported provider, API error, malformed or
        incomplete judge output) a dict with a single "error" key. This
        function does not raise.
    """
    score_keys = ("relevancy", "depth", "source_quality", "coherence", "confidence_calibration")
    answer_text = response or ""

    judge_prompt = f"""You are an expert research quality evaluator. Score the following research output.
There is NO ground truth - evaluate the response on its own merits.

QUERY: {query}

RESPONSE:
{answer_text[:8000]}

Score each dimension on a 1-5 scale (5 is best):

1. relevancy (1-5): Does the response directly address the query?
2. depth (1-5): How thorough and comprehensive is the coverage?
3. source_quality (1-5): Are citations from reputable, relevant sources? Are sources properly referenced?
4. coherence (1-5): Is the response well-structured, logical, and readable?
5. confidence_calibration (1-5): Does it appropriately express uncertainty where warranted? (5 = good calibration)

Return ONLY valid JSON (no markdown, no explanation outside the JSON):
{{"relevancy": X, "depth": X, "source_quality": X, "coherence": X, "confidence_calibration": X, "reasoning": "brief 1-2 sentence explanation"}}"""

    provider = config.get("judge_provider", "openai")

    try:
        if provider == "openai":
            from openai import OpenAI
            client = OpenAI()
            resp = client.chat.completions.create(
                model=config.get("judge_model", "gpt-4o"),
                messages=[{"role": "user", "content": judge_prompt}],
                temperature=0.1,
            )
            # `content` can be None; normalise before stripping.
            raw = (resp.choices[0].message.content or "").strip()
        elif provider == "anthropic":
            from anthropic import Anthropic
            client = Anthropic()
            resp = client.messages.create(
                model=config.get("judge_model", "claude-sonnet-4-5-20250929"),
                max_tokens=500,
                messages=[{"role": "user", "content": judge_prompt}],
            )
            raw = resp.content[0].text.strip()
        else:
            return {"error": f"Unsupported judge provider: {provider}"}

        scores = _parse_judge_json(raw)

        # Callers index every score key and sum the values, so reject incomplete
        # or non-numeric output here rather than letting it fail downstream.
        missing = [k for k in score_keys if k not in scores]
        if missing:
            return {"error": f"Judge response missing score fields: {missing}"}
        for key in score_keys:
            value = scores[key]
            if isinstance(value, bool) or not isinstance(value, (int, float)):
                scores[key] = float(value)  # raises on non-numeric -> reported below
        scores.setdefault("reasoning", "")

        return scores

    except Exception as e:
        return {"error": str(e)}

print("LLM-as-Judge function ready.")

In [None]:
# Cell 16: Run LLM-as-Judge on all results

JUDGE_RESULTS_FILE = os.path.join(
    CONFIG["output_dir"],
    f"judge_scores_{CONFIG['system_under_test']}_{CONFIG['run_id']}.json"
)

# Dimensions every successful judge score is expected to carry.
_judge_score_keys = ["relevancy", "depth", "source_quality", "coherence", "confidence_calibration"]

# Load existing judge scores if resuming
if os.path.exists(JUDGE_RESULTS_FILE):
    with open(JUDGE_RESULTS_FILE, "r", encoding="utf-8") as f:
        previous_scores = json.load(f)
    # Drop earlier failed attempts so those queries are judged again; otherwise
    # one transient API error would mark a query as judged for good.
    judge_scores = [s for s in previous_scores if "error" not in s]
    judged_ids = {s["query_id"] for s in judge_scores}
    print(f"Resuming: {len(judged_ids)} already judged")
else:
    judge_scores = []
    judged_ids = set()

# Only results that produced text and have no stored score are judged.
pending = [r for r in results if r["query_id"] not in judged_ids and r.get("response_text")]
print(f"Queries to judge: {len(pending)}")

for i, result in enumerate(pending):
    print(f"  [{i+1}/{len(pending)}] Judging {result['query_id']}...", end=" ")

    score = llm_judge_score(result["query"], result["response_text"], CONFIG)
    if not isinstance(score, dict):
        # Guard against a judge reply that parsed to a list or scalar.
        score = {"error": f"Judge returned {type(score).__name__}, expected a JSON object"}
    score["query_id"] = result["query_id"]
    score["category"] = result["category"]
    score["subcategory"] = result["subcategory"]

    judge_scores.append(score)

    # Incremental save via temp file + replace, so an interruption mid-write
    # cannot corrupt the file used for resuming.
    tmp_path = f"{JUDGE_RESULTS_FILE}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(judge_scores, f, indent=2, ensure_ascii=False)
    os.replace(tmp_path, JUDGE_RESULTS_FILE)

    if "error" in score:
        print(f"ERROR: {score['error']}")
    else:
        # Average only when every dimension is present and numeric, so a
        # malformed score cannot abort the loop.
        values = [score.get(k) for k in _judge_score_keys]
        if all(isinstance(v, (int, float)) for v in values):
            avg = sum(values) / len(values)
            print(f"avg={avg:.1f}")
        else:
            print("INCOMPLETE: one or more score fields missing or non-numeric")

print(f"\nJudge scores saved to: {JUDGE_RESULTS_FILE}")

In [None]:
# Cell 17: Display judge scores

# Tabulate the successful judge scores (entries carrying an "error" key are excluded).
valid_scores = [s for s in judge_scores if "error" not in s]
df_scores = pd.DataFrame(valid_scores)

score_cols = ["relevancy", "depth", "source_quality", "coherence", "confidence_calibration"]

if not df_scores.empty:
    # The judge is free-form, so a field may be absent from every score; add
    # missing columns as empty instead of failing with KeyError below.
    for col in score_cols + ["reasoning"]:
        if col not in df_scores.columns:
            df_scores[col] = None
    # Make sure the aggregates work even if a score arrived as a string.
    df_scores[score_cols] = df_scores[score_cols].apply(pd.to_numeric, errors="coerce")

    # Overall averages
    print("=== Overall LLM-as-Judge Scores ===")
    print(df_scores[score_cols].mean().round(2).to_string())

    # By category
    print("\n=== Scores by Category ===")
    display(df_scores.groupby("category")[score_cols].mean().round(2))

    # Full table
    print("\n=== All Scores ===")
    display(df_scores[["query_id", "category"] + score_cols + ["reasoning"]].sort_values("category"))
else:
    print("No valid judge scores yet.")

---
## Section 6: Automated Evaluation - Citation Verification

Checks whether cited URLs are alive and whether the source content actually supports the claims made in the research output.

In [None]:
# Cell 19: Extract URLs from response text

def extract_urls(text: str) -> list:
    """Extract all unique http(s) URLs from text, in order of first appearance.

    Trailing sentence punctuation (``. , ; : ! ?``) is stripped from each match.
    Empty or ``None`` input yields an empty list. A match reduced to a bare
    scheme (e.g. from "https://...") is discarded.
    """
    if not text:
        return []

    url_pattern = r'https?://[^\s\)\]\"\'>]+'
    seen = set()
    cleaned = []
    for url in re.findall(url_pattern, text):
        # Clean trailing punctuation
        url = url.rstrip('.,;:!?')
        # Skip duplicates, and matches where stripping left only "http(s)://".
        if url.endswith("://") or url in seen:
            continue
        seen.add(url)
        cleaned.append(url)
    return cleaned

# Quick test
test_text = "See https://example.com/article and also https://test.org/paper.pdf for details."
print(f"Test URL extraction: {extract_urls(test_text)}")

In [None]:
# Cell 20: Check if URLs are alive

def check_url_alive(url: str, timeout: int = 10) -> dict:
    """Check whether a URL responds with a non-error HTTP status (< 400).

    A HEAD request is tried first. Some servers reject HEAD while serving GET
    normally, so an error status from HEAD is re-checked with a streamed GET
    (headers only, the body is not downloaded) before the URL is reported dead.

    Returns:
        Dict with "url", "alive" (bool) and "status_code": the final HTTP
        status, "timeout", or the exception text for other failures.
    """
    headers = {"User-Agent": "Mozilla/5.0 (research-validator)"}
    try:
        r = requests.head(url, timeout=timeout, allow_redirects=True, headers=headers)
        status_code = r.status_code
        if status_code >= 400:
            fallback = requests.get(url, timeout=timeout, allow_redirects=True,
                                    headers=headers, stream=True)
            try:
                status_code = fallback.status_code
            finally:
                # stream=True keeps the connection open until closed explicitly.
                fallback.close()
        return {"url": url, "alive": status_code < 400, "status_code": status_code}
    except requests.exceptions.Timeout:
        return {"url": url, "alive": False, "status_code": "timeout"}
    except Exception as e:
        return {"url": url, "alive": False, "status_code": str(e)}

print("URL liveness checker ready.")

In [None]:
# Cell 21: LLM-based citation content verification

def fetch_page_text(url: str, max_chars: int = 3000) -> Optional[str]:
    """Download ``url`` and return up to ``max_chars`` characters of its text.

    The response body is parsed as HTML; script, style, nav, footer and header
    elements are removed before the text is extracted. Returns ``None`` if
    anything fails (network error, HTTP error status, parsing problem).
    """
    request_headers = {"User-Agent": "Mozilla/5.0 (research-validator)"}
    boilerplate_tags = ["script", "style", "nav", "footer", "header"]
    try:
        page = requests.get(url, timeout=15, headers=request_headers)
        page.raise_for_status()
        document = BeautifulSoup(page.text, "html.parser")
        # Drop non-content elements before extracting text
        for element in document(boilerplate_tags):
            element.decompose()
        visible_text = document.get_text(separator=" ", strip=True)
        return visible_text[:max_chars]
    except Exception:
        # Best-effort fetch: callers get None for any failure
        return None


def verify_citation_with_llm(response_text: str, url: str, source_content: str, config: dict) -> dict:
    """Ask the judge LLM whether a source supports the claims made near its citation.

    Args:
        response_text: Full research output containing the citation.
        url: The cited URL.
        source_content: Text extracted from the cited page (first 2000
            characters are sent to the judge).
        config: Reads ``judge_provider`` ("openai" or "anthropic", default
            "openai") and ``judge_model``.

    Returns:
        The judge's JSON object (``verdict``, ``confidence``, ``reasoning``).
        On any failure (unsupported provider, API error, unparsable or
        non-object reply) a dict with ``verdict == "error"`` is returned
        instead of raising.
    """

    # Extract text near the URL reference
    url_pos = response_text.find(url)
    if url_pos == -1:
        # Try partial URL match on the host, then the first path segment.
        # Empty segments (e.g. from a trailing slash) are skipped: an empty
        # string is "found" at position 0 and would anchor the context wrongly.
        for part in url.split("/")[2:4]:
            if part and part in response_text:
                url_pos = response_text.find(part)
                break

    # Up to 500 characters either side of the citation; with no anchor at
    # all, fall back to the first 1000 characters of the response.
    context_start = max(0, url_pos - 500) if url_pos >= 0 else 0
    context_end = min(len(response_text), url_pos + 500) if url_pos >= 0 else 1000
    claim_context = response_text[context_start:context_end]

    prompt = f"""Does the source content support the claims made in the research text near this citation?

RESEARCH TEXT (near citation):
{claim_context}

SOURCE CONTENT:
{source_content[:2000]}

Return ONLY valid JSON:
{{"verdict": "supported" or "partially_supported" or "unsupported" or "unrelated", "confidence": 0.0 to 1.0, "reasoning": "brief explanation"}}"""

    provider = config.get("judge_provider", "openai")
    try:
        if provider == "openai":
            from openai import OpenAI
            client = OpenAI()
            resp = client.chat.completions.create(
                model=config.get("judge_model", "gpt-4o"),
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
            )
            raw = resp.choices[0].message.content.strip()
        elif provider == "anthropic":
            from anthropic import Anthropic
            client = Anthropic()
            resp = client.messages.create(
                model=config.get("judge_model", "claude-sonnet-4-5-20250929"),
                max_tokens=300,
                messages=[{"role": "user", "content": prompt}],
            )
            raw = resp.content[0].text.strip()
        else:
            return {"verdict": "error", "confidence": 0, "reasoning": f"Unsupported provider: {provider}"}

        # Strip a Markdown code fence (``` or ```json) wrapped around the JSON.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        parsed = json.loads(raw)
        if not isinstance(parsed, dict):
            # Callers use .get() on the result, so anything but an object is an error.
            return {"verdict": "error", "confidence": 0,
                    "reasoning": f"Judge returned non-object JSON: {raw[:200]}"}
        return parsed
    except Exception as e:
        return {"verdict": "error", "confidence": 0, "reasoning": str(e)}

print("Citation verification functions ready.")

In [None]:
# Cell 22: Run full citation verification pipeline

# Output path for this run's citation verification results.
CITATION_RESULTS_FILE = os.path.join(
    CONFIG["output_dir"],
    f"citation_verification_{CONFIG['system_under_test']}_{CONFIG['run_id']}.json"
)

# Cap on URLs verified per query to save time/cost. Defaults to 10 and can be
# overridden by adding "max_urls_per_query" to CONFIG.
MAX_URLS_PER_QUERY = CONFIG.get("max_urls_per_query", 10)

citation_results = []

for result in results:
    # Queries that produced no response have nothing to verify.
    if not result.get("response_text"):
        continue

    urls = extract_urls(result["response_text"])
    if not urls:
        # Record a zeroed entry so every answered query appears in the output.
        citation_results.append({
            "query_id": result["query_id"],
            "total_citations": 0,
            "alive": 0,
            "dead": 0,
            "supported": 0,
            "unsupported": 0,
            "details": [],
        })
        continue

    urls_to_check = urls[:MAX_URLS_PER_QUERY]
    # Report how many URLs are actually checked, not just how many were found.
    print(f"Verifying {len(urls_to_check)} of {len(urls)} citations for {result['query_id']}...", end=" ")

    query_citations = []
    for url in urls_to_check:
        alive_check = check_url_alive(url)

        # Content is only verified for live URLs whose text could be fetched;
        # everything else keeps the "not_checked" verdict.
        verification = {"verdict": "not_checked"}
        if alive_check["alive"]:
            page_text = fetch_page_text(url)
            if page_text:
                verification = verify_citation_with_llm(
                    result["response_text"], url, page_text, CONFIG
                )

        query_citations.append({
            "url": url,
            "alive": alive_check["alive"],
            "status_code": alive_check["status_code"],
            "verdict": verification.get("verdict", "not_checked"),
            "confidence": verification.get("confidence", 0),
        })

    alive_count = sum(1 for c in query_citations if c["alive"])
    # "partially_supported" counts as supported.
    supported_count = sum(1 for c in query_citations if c["verdict"] in ["supported", "partially_supported"])

    citation_results.append({
        "query_id": result["query_id"],
        "total_citations": len(query_citations),
        "alive": alive_count,
        "dead": len(query_citations) - alive_count,
        "supported": supported_count,
        "unsupported": sum(1 for c in query_citations if c["verdict"] == "unsupported"),
        "details": query_citations,
    })

    print(f"alive={alive_count}/{len(query_citations)}, supported={supported_count}")

# Save
with open(CITATION_RESULTS_FILE, "w", encoding="utf-8") as f:
    json.dump(citation_results, f, indent=2, ensure_ascii=False, default=str)

print(f"\nCitation results saved to: {CITATION_RESULTS_FILE}")

In [None]:
# Cell 23: Display citation verification results

# A rate with a zero denominator is undefined. It is stored as NaN so that
# pandas' mean() skips it rather than averaging in a misleading 0
# (the aggregation cell treats the same case as missing, using None).
_UNDEFINED_RATE = float("nan")

df_citations = pd.DataFrame([{
    "query_id": c["query_id"],
    "total_urls": c["total_citations"],
    "alive": c["alive"],
    "dead": c["dead"],
    "supported": c["supported"],
    "unsupported": c["unsupported"],
    "alive_rate": c["alive"] / c["total_citations"] if c["total_citations"] > 0 else _UNDEFINED_RATE,
    "support_rate": c["supported"] / c["alive"] if c["alive"] > 0 else _UNDEFINED_RATE,
} for c in citation_results])

if not df_citations.empty:
    print("=== Citation Verification Summary ===")
    # Summary statistics only cover queries that cited at least one URL.
    with_citations = df_citations[df_citations["total_urls"] > 0]

    if not with_citations.empty:
        print(f"Queries with citations: {len(with_citations)} / {len(df_citations)}")
        print(f"Average URLs per query: {with_citations['total_urls'].mean():.1f}")
        print(f"Average alive rate:     {with_citations['alive_rate'].mean():.1%}")
        print(f"Average support rate:   {with_citations['support_rate'].mean():.1%}")
        print(f"\nPer-query breakdown:")
        display(with_citations)
    else:
        print("No citations found in any responses.")
else:
    print("No citation results yet.")

---
## Section 7: Automated Evaluation - DeepEval Metrics (Optional)

Uses the DeepEval library for faithfulness and answer relevancy metrics. These metrics work **without ground truth**:
- **FaithfulnessMetric**: Are claims grounded in the cited sources?
- **AnswerRelevancyMetric**: Does the response address the question?

In [None]:
# Cell 25: Run DeepEval metrics

# DeepEval is optional: if it is not installed this section is skipped and
# deepeval_scores stays empty.
try:
    from deepeval import evaluate
    from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric
    from deepeval.test_case import LLMTestCase

    DEEPEVAL_AVAILABLE = True
    print("DeepEval loaded successfully.")
except ImportError:
    DEEPEVAL_AVAILABLE = False
    print("DeepEval not installed. Run: pip install deepeval")
    print("Skipping this section.")

deepeval_scores = []

if DEEPEVAL_AVAILABLE:
    relevancy_metric = AnswerRelevancyMetric(threshold=0.5)
    faithfulness_metric = FaithfulnessMetric(threshold=0.5)

    for result in results:
        if not result.get("response_text"):
            continue

        print(f"  DeepEval scoring {result['query_id']}...", end=" ")

        # Extract source content as retrieval context (at most 5 entries).
        # Entries are coerced to str because retrieval_context is expected to
        # be a list of strings.
        # NOTE(review): the element type of result["sources"] is not visible
        # in this cell; confirm that str() is a meaningful rendering.
        sources = result.get("sources", [])
        if isinstance(sources, list) and sources:
            retrieval_context = [str(s) for s in sources[:5]]
        else:
            # NOTE(review): with no sources the response is scored against
            # its own text, so faithfulness is self-referential here.
            retrieval_context = [result["response_text"][:2000]]

        test_case = LLMTestCase(
            input=result["query"],
            actual_output=result["response_text"][:5000],
            retrieval_context=retrieval_context,
        )

        try:
            relevancy_metric.measure(test_case)
            faithfulness_metric.measure(test_case)

            # Print before appending: if formatting fails (e.g. a score is
            # None) the query gets one error record, not a score record plus
            # an error record.
            print(f"rel={relevancy_metric.score:.2f}, faith={faithfulness_metric.score:.2f}")
            deepeval_scores.append({
                "query_id": result["query_id"],
                "relevancy": relevancy_metric.score,
                "faithfulness": faithfulness_metric.score,
                "relevancy_reason": relevancy_metric.reason,
                "faithfulness_reason": faithfulness_metric.reason,
            })
        except Exception as e:
            print(f"ERROR: {e}")
            deepeval_scores.append({
                "query_id": result["query_id"],
                "error": str(e),
            })

    # Save (default=str matches the citation results writer and keeps any
    # non-JSON-native value from aborting the dump)
    deepeval_file = os.path.join(CONFIG["output_dir"], f"deepeval_{CONFIG['system_under_test']}_{CONFIG['run_id']}.json")
    with open(deepeval_file, "w", encoding="utf-8") as f:
        json.dump(deepeval_scores, f, indent=2, ensure_ascii=False, default=str)
    print(f"\nDeepEval scores saved to: {deepeval_file}")

In [None]:
# Cell 26: Display DeepEval scores

# Summarise DeepEval results, ignoring queries whose scoring raised an error.
if not deepeval_scores:
    print("No DeepEval scores available. Install deepeval and re-run Section 7.")
else:
    valid_de = [entry for entry in deepeval_scores if "error" not in entry]
    df_de = pd.DataFrame(valid_de)

    if df_de.empty:
        print("No valid DeepEval scores.")
    else:
        mean_relevancy = df_de['relevancy'].mean()
        mean_faithfulness = df_de['faithfulness'].mean()
        print("=== DeepEval Scores ===")
        print(f"Average Relevancy:    {mean_relevancy:.2f}")
        print(f"Average Faithfulness: {mean_faithfulness:.2f}")
        print()
        display(df_de[["query_id", "relevancy", "faithfulness"]])

---
## Section 8: Manual Verification - Freshness Check

For queries in the `manual_verification` category, you can personally check if the returned data is current. Review each response and assign a freshness score:
- **Fresh**: Data is current (within last 7 days)
- **Slightly Stale**: Data is 1-4 weeks old
- **Stale**: Data is 1-6 months old
- **Outdated**: Data is 6+ months old or incorrect

In [None]:
# Cell 28: Display manual verification queries with responses

from html import escape as _html_escape

# Only queries tagged for manual review are shown.
manual_results = [r for r in results if r.get("category") == "manual_verification"]

if manual_results:
    print(f"=== Manual Verification Queries ({len(manual_results)} queries) ===")
    print("Review each response below and assign a freshness score.\n")

    for r in manual_results:
        # response_text can be missing or None; either way show a placeholder.
        response_preview = str(r.get('response_text') or 'No response')[:500]
        # Every interpolated value is HTML-escaped: responses are generated
        # text and may contain '<', '&' or markup that would break (or inject
        # into) the rendered card.
        display(HTML(f"""
        <div style='border: 1px solid #ccc; padding: 12px; margin: 8px 0; border-radius: 4px;'>
            <h4>[{_html_escape(str(r['query_id']))}] {_html_escape(str(r['query']))}</h4>
            <p><b>Subcategory:</b> {_html_escape(str(r.get('subcategory', 'N/A')))}</p>
            <p><b>Response (first 500 chars):</b></p>
            <pre style='white-space: pre-wrap; background: #f5f5f5; padding: 8px;'>{_html_escape(response_preview)}</pre>
        </div>
        """))
else:
    print("No manual verification results yet. Run Section 4 first.")

In [None]:
# Cell 29: Enter manual freshness scores
# Update the scores dict below after reviewing the responses above.

# Options: "fresh", "slightly_stale", "stale", "outdated"
# Reviewer-assigned freshness label per query_id; uncomment and edit entries.
MANUAL_FRESHNESS_SCORES = {
    # "G1a": "fresh",
    # "G1b": "slightly_stale",
    # "G1c": "stale",
    # "G2a": "fresh",
    # "G3a": "fresh",
    # "G3b": "outdated",
    # "G5a": "fresh",
    # "G5b": "slightly_stale",
}

# Numeric scale used for aggregation (higher is fresher).
freshness_to_score = dict(fresh=4, slightly_stale=3, stale=2, outdated=1)

if not MANUAL_FRESHNESS_SCORES:
    print("No freshness scores entered yet. Fill in MANUAL_FRESHNESS_SCORES above and re-run.")
else:
    # One record per reviewed query; labels outside the scale score 0.
    freshness_data = [
        {
            "query_id": qid,
            "freshness": freshness,
            "freshness_score": freshness_to_score.get(freshness, 0),
        }
        for qid, freshness in MANUAL_FRESHNESS_SCORES.items()
    ]

    df_freshness = pd.DataFrame(freshness_data)
    average_freshness = df_freshness['freshness_score'].mean()
    print("=== Manual Freshness Scores ===")
    print(f"Average freshness score: {average_freshness:.2f} / 4.0")
    display(df_freshness)

    # Persist the scores next to the other outputs of this run.
    freshness_file = os.path.join(CONFIG["output_dir"], f"freshness_{CONFIG['system_under_test']}_{CONFIG['run_id']}.json")
    with open(freshness_file, "w", encoding="utf-8") as f:
        json.dump(freshness_data, f, indent=2)
    print(f"Saved to: {freshness_file}")

---
## Section 9: Results Aggregation & Visualization

Combine all evaluation scores and generate visual reports.

In [None]:
# Cell 31: Aggregate all scores into a single DataFrame

# Start with basic result info
# Build one row per query, merging whichever evaluation scores exist for it.
agg_data = []
for r in results:
    query_id = r["query_id"]

    # Basic result info
    row = dict(
        query_id=query_id,
        category=r.get("category", ""),
        subcategory=r.get("subcategory", ""),
        timing_seconds=r.get("timing_seconds", 0),
        word_count=r.get("word_count", 0),
        has_error=bool(r.get("error")),
    )

    # LLM-judge scores: first non-error entry for this query
    judge = next(
        (s for s in judge_scores if s.get("query_id") == query_id and "error" not in s),
        None,
    )
    if judge:
        row.update({
            f"judge_{col}": judge.get(col)
            for col in ["relevancy", "depth", "source_quality", "coherence", "confidence_calibration"]
        })

    # Citation verification: rates are None when their denominator is zero
    citation = next((c for c in citation_results if c["query_id"] == query_id), None)
    if citation:
        n_cited = citation["total_citations"]
        n_alive = citation["alive"]
        row["citation_count"] = n_cited
        row["citation_alive_rate"] = n_alive / n_cited if n_cited > 0 else None
        row["citation_support_rate"] = citation["supported"] / n_alive if n_alive > 0 else None

    # DeepEval scores: first non-error entry for this query
    de = next(
        (s for s in deepeval_scores if s.get("query_id") == query_id and "error" not in s),
        None,
    )
    if de:
        row["deepeval_relevancy"] = de.get("relevancy")
        row["deepeval_faithfulness"] = de.get("faithfulness")

    # Manual freshness label and its numeric score
    if query_id in MANUAL_FRESHNESS_SCORES:
        freshness_label = MANUAL_FRESHNESS_SCORES[query_id]
        row["freshness"] = freshness_label
        row["freshness_score"] = freshness_to_score.get(freshness_label, 0)

    agg_data.append(row)

df_agg = pd.DataFrame(agg_data)

print(f"=== Aggregated Results ({len(df_agg)} queries) ===")
display(df_agg.describe().round(2))

In [None]:
# Cell 32: Radar chart - overall system performance

# Per-dimension judge score columns. The bar-chart cell adds a derived
# "judge_avg" column to df_agg; it is excluded here so that re-running this
# cell afterwards does not add a bogus "Avg" axis to the radar (or let the
# average be averaged into itself downstream).
judge_cols = [c for c in df_agg.columns if c.startswith("judge_") and c != "judge_avg"]

if judge_cols:
    avg_scores = df_agg[judge_cols].mean()
    categories_radar = [c.replace("judge_", "").replace("_", " ").title() for c in judge_cols]
    values = avg_scores.values.tolist()
    values.append(values[0])  # Close the polygon
    categories_radar.append(categories_radar[0])

    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(
        r=values,
        theta=categories_radar,
        fill='toself',
        name=CONFIG["system_under_test"],
        line_color='rgb(31, 119, 180)',
    ))

    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 5])),
        showlegend=True,
        title=f"System Performance Radar - {CONFIG['system_under_test']}",
    )
    fig.show()
else:
    print("No judge scores available for radar chart. Run Section 5 first.")

In [None]:
# Cell 33: Bar charts - scores by category

if judge_cols and not df_agg.empty:
    # Average judge score per query, computed from the per-dimension columns
    # only. judge_cols may already contain "judge_avg" if it was rebuilt from
    # df_agg after this cell ran once; leaving it in would average the
    # previous average into itself.
    score_cols = [c for c in judge_cols if c != "judge_avg"]
    df_agg["judge_avg"] = df_agg[score_cols].mean(axis=1)

    # Average of the per-query averages within each category, lowest first.
    cat_avg = df_agg.groupby("category")["judge_avg"].mean().sort_values(ascending=True)

    fig = px.bar(
        x=cat_avg.values,
        y=cat_avg.index,
        orientation='h',
        title=f"Average Quality Score by Category - {CONFIG['system_under_test']}",
        labels={"x": "Average Score (1-5)", "y": "Category"},
    )
    fig.update_layout(xaxis_range=[0, 5])
    fig.show()
else:
    print("No data for bar chart.")

In [None]:
# Cell 34: Latency analysis

# Timing is considered available only if at least one query recorded a
# non-zero duration (the emptiness check must come first: an empty frame has
# no timing column).
has_timing = (not df_agg.empty) and df_agg["timing_seconds"].sum() > 0

if not has_timing:
    print("No timing data available.")
else:
    system_name = CONFIG['system_under_test']

    # Overall distribution
    fig = px.histogram(
        df_agg,
        x="timing_seconds",
        nbins=20,
        title=f"Response Time Distribution - {system_name}",
        labels={"timing_seconds": "Response Time (seconds)"},
    )
    fig.show()

    timings = df_agg["timing_seconds"]
    latency_stats = (
        ("Mean:", timings.mean()),
        ("Median:", timings.median()),
        ("P95:", timings.quantile(0.95)),
        ("Max:", timings.max()),
    )
    print("Latency Statistics:")
    for stat_label, stat_value in latency_stats:
        # Labels are left-aligned in an 8-character column.
        print(f"  {stat_label:<8}{stat_value:.1f}s")

    # Latency by category
    fig2 = px.box(
        df_agg,
        x="category",
        y="timing_seconds",
        title=f"Response Time by Category - {system_name}",
        labels={"timing_seconds": "Response Time (seconds)", "category": "Category"},
    )
    fig2.show()

In [None]:
# Cell 35: Save final aggregated results

# Both exports share one file stem and differ only in extension.
_final_stem = f"final_aggregated_{CONFIG['system_under_test']}_{CONFIG['run_id']}"
final_json = os.path.join(CONFIG["output_dir"], _final_stem + ".json")
final_csv = os.path.join(CONFIG["output_dir"], _final_stem + ".csv")

df_agg.to_json(final_json, orient="records", indent=2)
df_agg.to_csv(final_csv, index=False)

print("Final results saved:")
print(f"  JSON: {final_json}")
print(f"  CSV:  {final_csv}")

# List everything written to the output directory, with file sizes.
print("\nOutput directory contents:")
for filename in sorted(os.listdir(CONFIG["output_dir"])):
    n_bytes = os.path.getsize(os.path.join(CONFIG["output_dir"], filename))
    print(f"  {filename} ({n_bytes:,} bytes)")

---
## Section 10: Cross-System Comparison

**Run this section only after you have two validation runs to compare.**

Load results from two runs (e.g., standard vs enhanced research, or different providers), then compare them head-to-head with pairwise LLM judging and visual overlays.

In [None]:
# Cell 37: Load results from both systems

# Update these paths to point to your saved result files from two different runs.
# Paths to the saved result files of the two runs being compared.
SYSTEM_A_RESULTS_FILE = ""  # e.g. "validation_results/results_open_deep_research_20260220_143000.json"
SYSTEM_B_RESULTS_FILE = ""  # e.g. "validation_results/results_open_deep_research_20260221_100000.json"

# Labels for display in charts and tables
SYSTEM_A_LABEL = "System A"  # e.g. "ODR (standard)" or "ODR (gemini)"
SYSTEM_B_LABEL = "System B"  # e.g. "ODR (enhanced)" or "ODR (openai)"

if not (SYSTEM_A_RESULTS_FILE and SYSTEM_B_RESULTS_FILE):
    print("Please set SYSTEM_A_RESULTS_FILE and SYSTEM_B_RESULTS_FILE paths above.")
    print("Look in your validation_results/ directory for the JSON files.")
else:
    system_a_results = json.loads(Path(SYSTEM_A_RESULTS_FILE).read_text(encoding="utf-8"))
    system_b_results = json.loads(Path(SYSTEM_B_RESULTS_FILE).read_text(encoding="utf-8"))

    print(f"{SYSTEM_A_LABEL} results: {len(system_a_results)} queries")
    print(f"{SYSTEM_B_LABEL} results: {len(system_b_results)} queries")

    # Index each run by query_id; only queries present in both are compared.
    system_a_lookup = {entry["query_id"]: entry for entry in system_a_results}
    system_b_lookup = {entry["query_id"]: entry for entry in system_b_results}
    common_ids = system_a_lookup.keys() & system_b_lookup.keys()
    print(f"Common queries: {len(common_ids)}")

In [None]:
# Cell 38: Pairwise LLM comparison

def pairwise_compare(query: str, response_a: str, response_b: str, config: dict) -> dict:
    """Ask the judge LLM to compare two responses head-to-head.

    Args:
        query: The research query both responses answer.
        response_a: Output of system A (first 4000 characters are judged).
        response_b: Output of system B (first 4000 characters are judged).
        config: Reads ``judge_provider`` ("openai" or "anthropic", default
            "openai") and ``judge_model``.

    Returns:
        The judge's JSON object with "A"/"B"/"Tie" verdicts for
        ``relevancy``, ``depth``, ``source_quality``, ``coherence`` and
        ``overall`` plus ``reasoning``. On any failure (unsupported provider,
        API error, unparsable or non-object reply) returns ``{"error": ...}``
        instead of raising.
    """

    prompt = f"""You are an expert research evaluator. Compare two research outputs for the same query.

QUERY: {query}

RESPONSE A:
{response_a[:4000]}

RESPONSE B:
{response_b[:4000]}

For each dimension, indicate which response is better (A, B, or Tie):
1. relevancy: Which better addresses the query?
2. depth: Which is more thorough?
3. source_quality: Which has better citations?
4. coherence: Which is better structured?
5. overall: Which is the better research output overall?

Return ONLY valid JSON:
{{"relevancy": "A" or "B" or "Tie", "depth": "A" or "B" or "Tie", "source_quality": "A" or "B" or "Tie", "coherence": "A" or "B" or "Tie", "overall": "A" or "B" or "Tie", "reasoning": "brief explanation"}}"""

    provider = config.get("judge_provider", "openai")
    try:
        if provider == "openai":
            from openai import OpenAI
            client = OpenAI()
            resp = client.chat.completions.create(
                model=config.get("judge_model", "gpt-4o"),
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
            )
            raw = resp.choices[0].message.content.strip()
        elif provider == "anthropic":
            from anthropic import Anthropic
            client = Anthropic()
            resp = client.messages.create(
                model=config.get("judge_model", "claude-sonnet-4-5-20250929"),
                max_tokens=500,
                messages=[{"role": "user", "content": prompt}],
            )
            raw = resp.content[0].text.strip()
        else:
            return {"error": f"Unsupported provider: {provider}"}

        # Strip a Markdown code fence (``` or ```json) wrapped around the JSON.
        if raw.startswith("```"):
            raw = raw.split("```")[1]
            if raw.startswith("json"):
                raw = raw[4:]
        parsed = json.loads(raw)
        if not isinstance(parsed, dict):
            # Callers add keys to the result, so anything but an object is an error.
            return {"error": f"Judge returned non-object JSON: {raw[:200]}"}
        return parsed
    except Exception as e:
        return {"error": str(e)}


# Run pairwise comparison
pairwise_results = []

# common_ids only exists once both result files have been loaded (Cell 37).
_common_ids = globals().get("common_ids")

if _common_ids:
    for qid in sorted(_common_ids):
        a_resp = system_a_lookup[qid].get("response_text", "")
        b_resp = system_b_lookup[qid].get("response_text", "")

        # A comparison needs a response from both systems.
        if not a_resp or not b_resp:
            continue

        query_text = system_a_lookup[qid]["query"]
        print(f"Comparing {qid}...", end=" ")

        comparison = pairwise_compare(query_text, a_resp, b_resp, CONFIG)
        comparison["query_id"] = qid
        pairwise_results.append(comparison)

        if "error" in comparison:
            # A failed judge call is not a tie; report it as what it is.
            print(f"ERROR: {comparison['error']}")
        else:
            overall = comparison.get("overall", "?")
            if overall == "A":
                winner = SYSTEM_A_LABEL
            elif overall == "B":
                winner = SYSTEM_B_LABEL
            elif overall == "Tie":
                winner = "Tie"
            else:
                winner = f"Unknown ({overall})"
            print(f"Winner: {winner}")

    # Save
    pairwise_file = os.path.join(CONFIG["output_dir"], f"pairwise_comparison_{CONFIG['run_id']}.json")
    with open(pairwise_file, "w", encoding="utf-8") as f:
        json.dump(pairwise_results, f, indent=2, ensure_ascii=False)
    print(f"\nPairwise results saved to: {pairwise_file}")
else:
    print("Load both system results first (Cell 37).")

In [None]:
# Cell 39: Side-by-side comparison table

if pairwise_results:
    df_pw = pd.DataFrame(pairwise_results)

    print("=== Pairwise Comparison Summary ===")
    print(f"A = {SYSTEM_A_LABEL}, B = {SYSTEM_B_LABEL}\n")

    # Win/tie counts per dimension; dimensions no comparison returned are skipped.
    for dim in ["relevancy", "depth", "source_quality", "coherence", "overall"]:
        if dim in df_pw.columns:
            counts = df_pw[dim].value_counts()
            a_wins = counts.get("A", 0)
            b_wins = counts.get("B", 0)
            ties = counts.get("Tie", 0)
            print(f"  {dim:25s}: {SYSTEM_A_LABEL}={a_wins}  {SYSTEM_B_LABEL}={b_wins}  Tie={ties}")

    print(f"\nDetailed comparison:")
    detail_cols = ["query_id", "relevancy", "depth", "source_quality", "coherence", "overall", "reasoning"]
    if "error" in df_pw.columns:
        # Surface failed comparisons instead of hiding them.
        detail_cols.append("error")
    # reindex() fills columns the judge never returned with NaN, where plain
    # column selection would raise KeyError (e.g. when every comparison errored).
    display(df_pw.reindex(columns=detail_cols))
else:
    print("No pairwise results yet. Run Cell 38 first.")

In [None]:
# Cell 40: Overlay radar chart - both systems

# Update these paths to point to your saved judge score files
# Paths to the saved judge score files of the two runs being compared.
SYSTEM_A_JUDGE_FILE = ""  # e.g. "validation_results/judge_scores_open_deep_research_20260220_143000.json"
SYSTEM_B_JUDGE_FILE = ""  # e.g. "validation_results/judge_scores_open_deep_research_20260221_100000.json"

if SYSTEM_A_JUDGE_FILE and SYSTEM_B_JUDGE_FILE:
    # Read as UTF-8 explicitly, like every other JSON file in this notebook,
    # rather than relying on the platform default encoding.
    with open(SYSTEM_A_JUDGE_FILE, "r", encoding="utf-8") as f:
        system_a_judge = json.load(f)
    with open(SYSTEM_B_JUDGE_FILE, "r", encoding="utf-8") as f:
        system_b_judge = json.load(f)

    dims = ["relevancy", "depth", "source_quality", "coherence", "confidence_calibration"]
    dims_display = [d.replace("_", " ").title() for d in dims]

    # Entries whose judging failed carry an "error" key and no scores.
    a_valid = [s for s in system_a_judge if "error" not in s]
    b_valid = [s for s in system_b_judge if "error" not in s]

    # Build each frame once. reindex() guarantees every dimension column
    # exists (all-NaN if absent), so an empty or partial score file yields
    # NaN averages instead of a KeyError.
    df_a_judge = pd.DataFrame(a_valid).reindex(columns=dims)
    df_b_judge = pd.DataFrame(b_valid).reindex(columns=dims)
    a_avgs = [df_a_judge[d].mean() for d in dims]
    b_avgs = [df_b_judge[d].mean() for d in dims]

    # Close polygons
    a_avgs.append(a_avgs[0])
    b_avgs.append(b_avgs[0])
    dims_display.append(dims_display[0])

    fig = go.Figure()
    fig.add_trace(go.Scatterpolar(r=a_avgs, theta=dims_display, fill='toself', name=SYSTEM_A_LABEL, line_color='blue'))
    fig.add_trace(go.Scatterpolar(r=b_avgs, theta=dims_display, fill='toself', name=SYSTEM_B_LABEL, line_color='red', opacity=0.6))

    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0, 5])),
        title=f"Head-to-Head: {SYSTEM_A_LABEL} vs {SYSTEM_B_LABEL}",
    )
    fig.show()
else:
    print("Set SYSTEM_A_JUDGE_FILE and SYSTEM_B_JUDGE_FILE paths to generate the overlay radar chart.")

In [None]:
# Cell 41: Final winner determination

if pairwise_results:
    dims = ["relevancy", "depth", "source_quality", "coherence", "overall"]
    # Weight of each dimension in the final score (sums to 1.0).
    weights = {"relevancy": 0.2, "depth": 0.2, "source_quality": 0.2, "coherence": 0.15, "overall": 0.25}

    system_a_score = 0
    system_b_score = 0

    # Comparisons whose judge call failed carry no verdicts; counting them as
    # ties would pull both systems towards 50%.
    scored_comparisons = [pw for pw in pairwise_results if "error" not in pw]

    for dim in dims:
        w = weights[dim]
        for pw in scored_comparisons:
            # Case/whitespace-insensitive so "tie" or " A " is still counted.
            verdict = str(pw.get(dim, "")).strip().upper()
            if verdict == "A":
                system_a_score += w
            elif verdict == "B":
                system_b_score += w
            elif verdict == "TIE":
                system_a_score += w * 0.5
                system_b_score += w * 0.5
            # Any other value (missing or unrecognised) is ignored.

    total = system_a_score + system_b_score
    a_pct = system_a_score / total * 100 if total > 0 else 50
    b_pct = system_b_score / total * 100 if total > 0 else 50

    print("=" * 60)
    print("FINAL RESULTS")
    print("=" * 60)
    print(f"\n  {SYSTEM_A_LABEL}:  {a_pct:.1f}%")
    print(f"  {SYSTEM_B_LABEL}:  {b_pct:.1f}%")
    print(f"  Comparisons scored: {len(scored_comparisons)} / {len(pairwise_results)}")
    print()

    # A winner is declared only with a lead of more than 5 percentage points.
    if a_pct > b_pct + 5:
        print(f"  WINNER: {SYSTEM_A_LABEL}")
    elif b_pct > a_pct + 5:
        print(f"  WINNER: {SYSTEM_B_LABEL}")
    else:
        print(f"  RESULT: Too close to call (within 5% margin)")

    print("=" * 60)
else:
    print("Run pairwise comparison first (Cell 38).")

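## Report Convention Check

Standalone cells for checking HTML report generation. Run Cell 1 (setup) and Cell 2 (mock data), then either Cell 2A (full MasterAgent pipeline) or Cell 2B (DataAnalysisAgent on the mock data), followed by Cell 3 (build the report) and Cell 4 (preview the extracted data and charts).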

In [None]:
# Cell 1: Setup - Environment & Imports
import os
import sys
import logging
from pathlib import Path
from dotenv import load_dotenv

# Resolve project root reliably (works in VS Code + Jupyter)
# `__vsc_ipynb_file__` is presumably a name injected by VS Code's notebook
# kernel (TODO confirm); where it is undefined the NameError fallback uses
# the current working directory instead.
try:
    PROJECT_ROOT = Path(__vsc_ipynb_file__).resolve().parent
except NameError:
    PROJECT_ROOT = Path.cwd()

# Fail fast if the resolved directory does not contain pyproject.toml, which
# is used here as the marker of the project root.
if not (PROJECT_ROOT / "pyproject.toml").exists():
    raise RuntimeError(
        f"Project root not found at {PROJECT_ROOT}. "
        "Open VS Code from the project folder, or set PROJECT_ROOT manually."
    )

SRC_DIR = PROJECT_ROOT / "src"
# Order matters below: change directory and extend sys.path before the
# project imports further down are executed.
os.chdir(PROJECT_ROOT)

if str(SRC_DIR) not in sys.path:
    sys.path.insert(0, str(SRC_DIR))

# override=True lets values from the .env file replace variables that are
# already set in the process environment.
load_dotenv(override=True)
logging.basicConfig(level=logging.INFO, format="%(message)s")

# Import both agents + report builder
from extensions.agents.master_agent import MasterAgent
from extensions.agents.data_analysis_agent import DataAnalysisAgent
from extensions.utils.report_builder import build_html_report

# Echo the resolved configuration so a misconfigured environment is visible.
print(f"Project root: {PROJECT_ROOT}")
print(f"LLM_PROVIDER: {os.getenv('LLM_PROVIDER')}")
print(f"LLM_MODEL: {os.getenv('LLM_MODEL')}")
print("Environment loaded and imports ready")

In [None]:
# Cell 2: Mock research data (no API calls are made in this cell)
# Defines the query, a mock research report and mock sources so that the
# analysis pipeline can be run without deep_researcher (saves ~25 API calls).
# The pipeline itself is run on this data in Cell 2B.

QUERY = "environment pollution 2026 and generate plots and graphs"

# Hand-written stand-in for a research report: Markdown prose plus two tables
# (city air quality, ocean plastic by year) and per-sector emission figures
# given in running text. Passed to DataAnalysisAgent.run_pipeline in Cell 2B.
MOCK_RESEARCH = """
# Environment Pollution: Global Status Report 2026

## Executive Summary
Environmental pollution remains one of the most pressing global challenges in 2026.
Air pollution alone causes approximately 7 million premature deaths annually according
to the World Health Organization. Water pollution affects over 2 billion people worldwide.

## Air Quality Index by Major Cities (2026)

| City | Country | AQI | PM2.5 (ug/m3) | Population (millions) |
|------|---------|-----|----------------|----------------------|
| Delhi | India | 285 | 120.5 | 32.9 |
| Beijing | China | 175 | 78.3 | 21.5 |
| Lagos | Nigeria | 162 | 71.0 | 16.0 |
| Cairo | Egypt | 155 | 68.2 | 21.3 |
| Mumbai | India | 148 | 65.5 | 21.7 |
| Jakarta | Indonesia | 135 | 58.9 | 10.6 |
| Sao Paulo | Brazil | 82 | 35.4 | 12.3 |
| London | UK | 45 | 18.2 | 9.0 |
| New York | USA | 42 | 16.8 | 8.3 |
| Stockholm | Sweden | 22 | 8.5 | 1.0 |

## Ocean Plastic Pollution (Million Metric Tons per Year)

| Year | Plastic Waste Generated | Plastic Entering Oceans | Recycling Rate Percent |
|------|------------------------|------------------------|----------------------|
| 2015 | 322 | 8.0 | 9.0 |
| 2018 | 359 | 9.1 | 10.5 |
| 2020 | 367 | 11.0 | 12.0 |
| 2022 | 390 | 10.5 | 15.0 |
| 2024 | 410 | 9.8 | 18.5 |
| 2026 | 435 | 9.2 | 22.0 |

## Carbon Emissions by Sector (2026, Billion Tons CO2)
The energy sector remains the largest contributor at 15.2 billion tons CO2,
followed by transportation at 7.8 billion tons, industry at 6.4 billion tons,
agriculture at 5.2 billion tons, and buildings at 3.1 billion tons.

## Key Findings
1. Air pollution levels in South Asian cities remain critically high
2. Ocean plastic pollution is stabilizing due to improved recycling rates
3. Carbon emissions from the energy sector continue to dominate
"""

# Source list that accompanies MOCK_RESEARCH; assigned to SOURCES in Cell 2B
# and passed to the report builder in Cell 3.
# NOTE(review): presumably placeholder URLs, not verified to resolve.
MOCK_SOURCES = [
    "https://www.who.int/air-pollution/2026-report",
    "https://www.unep.org/ocean-plastics-2026",
    "https://www.iea.org/co2-emissions-2026",
    "https://www.worldbank.org/water-quality-2026",
]

In [None]:
# Cell 2A: FULL PIPELINE - MasterAgent (Research + Data Analysis + Charts)
# Uses ~25-35 API calls: web research -> data extraction -> profiling -> charts -> outliers
# Run this OR Cell 2B (mock data), not both

QUERY = "environment pollution 2026 and if possible generate some plots and graphs"

# Initialize MasterAgent
# use_enhanced_research=True for 3-4x deeper research (more API calls)
agent = MasterAgent(
    use_enhanced_research=False,  # standard mode
    provider=os.getenv("LLM_PROVIDER"),
    model=os.getenv("LLM_MODEL"),
)
print(f"LLM: {type(agent.llm).__name__}")
print(f"Tools: {[t.name for t in agent.tools]}")

# Run full pipeline: research -> analysis -> charts
print(f"\nRunning MasterAgent with query: {QUERY}")
print("(This may take a few minutes...)\n")
result = await agent.run_async(QUERY)

state = result["state"]
print(f"\nStatus: {result['status']}")
print(f"Time: {result['execution_time']:.1f}s")
print(f"Agents used: {result['agents_used']}")
print(f"Report length: {len(state.get('final_report', ''))} chars")
print(f"Sources: {len(state.get('sources', []))}")
print(f"Charts: {len(state.get('charts', []))}")
print(f"Extracted data: {len(str(state.get('extracted_data', '')))} chars")

# Prepare variables for report generation (Cell 3)
RESEARCH_TEXT = state.get("final_report", "")
SOURCES = state.get("sources", [])
pipeline_result = {
    "output": state.get("analysis_output", ""),
    "charts": state.get("charts", []),
    "chart_explanations": state.get("chart_explanations", {}),
    "extracted_data": state.get("extracted_data", ""),
    "data_profile": state.get("data_profile", ""),
}
SUB_QUERIES = state.get("sub_queries", [])
CONVERSATION_ID = result.get("conversation_id", "")

In [None]:
# Cell 2B: LIGHTWEIGHT - DataAnalysisAgent only with mock data (~4-5 API calls)
# Skips web research, uses mock research text directly
# Run this OR Cell 2A (full pipeline), not both

QUERY = "environment pollution 2026 and generate plots and graphs"

# Build the DataAnalysisAgent from the provider/model configured in the environment.
da_agent = DataAnalysisAgent(
    provider=os.getenv("LLM_PROVIDER"),
    model=os.getenv("LLM_MODEL"),
)
has_extraction_llm = da_agent.extraction_llm is not None
has_explanation_llm = da_agent.explanation_llm is not None
print(f"LLM: {type(da_agent.llm).__name__}")
print(f"Structured output: extraction={has_extraction_llm}, explanation={has_explanation_llm}")

# Analyse the mock research text instead of doing web research.
print("\nRunning analysis pipeline (~4-5 API calls)...")
pipeline_result = da_agent.run_pipeline(MOCK_RESEARCH)

# Run summary
elapsed_seconds = pipeline_result.get('execution_time', 0)
print(f"\nStatus: {pipeline_result['status']}")
print(f"Time: {elapsed_seconds:.1f}s")
print(f"Extracted data: {len(pipeline_result.get('extracted_data', ''))} chars")
print(f"Data profile: {len(pipeline_result.get('data_profile', ''))} chars")
print(f"Charts: {len(pipeline_result.get('charts', []))}")
print(f"Chart explanations: {len(pipeline_result.get('chart_explanations', {}))}")

# Variables consumed by the report generation cell (Cell 3)
RESEARCH_TEXT, SOURCES = MOCK_RESEARCH, MOCK_SOURCES
SUB_QUERIES = []
CONVERSATION_ID = "mock-test-001"

In [None]:
# Cell 3: Generate HTML Report (works with either Cell 2A or 2B)
import webbrowser

# Inputs come from whichever pipeline cell was run last (Cell 2A or 2B).
charts = pipeline_result.get("charts", [])
chart_explanations = pipeline_result.get("chart_explanations", {})

report_path = build_html_report(
    display_text=RESEARCH_TEXT,
    analysis_output=pipeline_result.get("output", ""),
    figures=charts,
    chart_explanations=chart_explanations,
    sources=SOURCES,
    query=QUERY,
    sub_queries=SUB_QUERIES,
    conversation_id=CONVERSATION_ID,
    src_dir=SRC_DIR,
    extracted_data_summary=pipeline_result.get("extracted_data", ""),
    data_profile_summary=pipeline_result.get("data_profile", ""),
)

print(f"HTML report saved to: {report_path}")
print(f"Report size: {os.path.getsize(report_path):,} bytes")
print(f"Charts embedded: {len(charts)}")

# Path.as_uri() builds a well-formed, percent-encoded file:// URI on every
# platform. Gluing "file:///" onto an absolute path yields four slashes on
# POSIX and leaves backslashes and spaces unescaped on Windows.
webbrowser.open(Path(report_path).resolve().as_uri())

In [None]:
# Cell 4: Preview extracted data and chart details
# Reads `pipeline_result`, which both Cell 2A and Cell 2B define. The name
# `result` is not usable here: after Cell 2A it is the raw MasterAgent result
# (the data sits under result["state"]), and after Cell 2B it is either
# undefined or a stale value left over from an earlier cell.

print("=== EXTRACTED DATA (first 500 chars) ===")
# str() because the extracted data is not guaranteed to be a string here
# (Cell 2A itself wraps it in str() before measuring its length).
print(str(pipeline_result.get("extracted_data") or "None")[:500])

print("\n=== DATA PROFILE (first 500 chars) ===")
print(str(pipeline_result.get("data_profile") or "None")[:500])

print("\n=== CHARTS ===")
for path in pipeline_result.get("charts", []):
    # chart_explanations is keyed by chart path.
    info = pipeline_result.get("chart_explanations", {}).get(path, {})
    print(f"  {os.path.basename(path)}: {info.get('title', 'N/A')}")
    print(f"    {info.get('explanation', 'N/A')[:100]}")

print(f"\n=== PIPELINE OUTPUT ===")
print(pipeline_result.get("output", "No output"))