# Evaluation Framework Validation Notebook

This notebook validates each step of the ART Voice Agent evaluation framework as outlined in the documentation.

## What This Notebook Covers

1. **Import & Environment Setup** - Verify all components are importable
2. **Event Recording** - Test EventRecorder functionality
3. **Event Loading & Inspection** - Load and analyze recorded events
4. **Metrics Scoring** - Score individual turns and generate summaries
5. **Scenario Execution** - Run YAML-based scenarios
6. **A/B Comparison** - Compare model configurations
7. **Azure AI Foundry Export** - Export to cloud evaluation format
8. **CLI Validation** - Verify CLI commands work correctly

> **Note**: This package should **never** be imported in production code. Import guards prevent usage when `ENV=production`.

## 1. Import Required Libraries and Set Up Environment

Import all evaluation framework components and verify the environment is correctly configured.

In [None]:
import sys
import os
from pathlib import Path

# Set up the project root for imports.
# The notebook is expected to run from samples/labs/dev/, so the project root
# is three directory levels above the current working directory.
PROJECT_ROOT = Path.cwd().parent.parent.parent
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Verify we're not in production.
# An explicit raise is used instead of `assert` because assertions are removed
# when Python runs with -O, which would silently disable this safety guard.
env = os.getenv("ENV", "development")
print(f"‚úÖ Environment: {env}")
if env == "production":
    raise RuntimeError("Cannot run evaluation in production!")

print(f"‚úÖ Project root: {PROJECT_ROOT}")
print(f"‚úÖ Python path configured")

In [None]:
# Load environment configuration (same pattern as the app)
from dotenv import load_dotenv

# Load .env files in order of precedence: only the first file found is loaded.
env_local_path = PROJECT_ROOT / ".env.local"
env_path = PROJECT_ROOT / ".env"

config_source = "system environment"
if env_local_path.exists():
    print(f"‚úÖ Loading .env.local")
    load_dotenv(env_local_path, override=True)
    config_source = ".env.local"
elif env_path.exists():
    print(f"‚úÖ Loading .env")
    load_dotenv(env_path, override=True)
    config_source = ".env"
else:
    print("‚ö†Ô∏è  No .env file found. Using system environment variables.")

# Try to load Azure App Configuration (preferred)
try:
    from config.appconfig_provider import bootstrap_appconfig, get_provider_status

    appconfig_loaded = bootstrap_appconfig()
    if appconfig_loaded:
        status = get_provider_status()
        appconfig_endpoint = status.get("endpoint")
        # Reduce e.g. "https://<name>.<domain>" to "<name>" for a compact log line
        endpoint_name = appconfig_endpoint.split("//")[-1].split(".")[0] if appconfig_endpoint else "unknown"
        print(f"‚úÖ Loaded configuration from Azure App Config ({endpoint_name})")
        config_source = f"Azure App Config ({endpoint_name})"
except Exception as e:
    # Best-effort: App Configuration is optional, so any failure (missing module,
    # missing credentials, network error) falls back to the source chosen above.
    # The reason is reported so a misconfiguration is not silently hidden.
    print(f"‚ÑπÔ∏è  App Configuration not available, using {config_source}")
    print(f"   Reason: {type(e).__name__}: {e}")

# Verify Azure OpenAI is configured
endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')
# Falls back to 'gpt-4o' when the variable is unset or empty, so `deployment` is never falsy
deployment = os.getenv('AZURE_OPENAI_CHAT_DEPLOYMENT_ID') or 'gpt-4o'

print(f"\nüìã Configuration source: {config_source}")
if endpoint:
    print(f"‚úÖ Azure OpenAI endpoint: {endpoint}")
else:
    print("‚ö†Ô∏è  AZURE_OPENAI_ENDPOINT not set (some features may be limited)")

print(f"‚úÖ Default deployment: {deployment}")

In [None]:
# Import evaluation framework components
from tests.evaluation import (
    EventRecorder,
    EvaluationOrchestratorWrapper,
    MetricsScorer,
)
from tests.evaluation.schemas import (
    TurnEvent,
    ToolCall,
    EvidenceBlob,
    EvalModelConfig,
)
from tests.evaluation.foundry_exporter import FoundryExporter
from tests.evaluation.mocks import MockMemoManager

# Echo the list of evaluation components that the imports above made available
import_report = (
    "‚úÖ Core components imported:",
    "   - EventRecorder",
    "   - EvaluationOrchestratorWrapper",
    "   - MetricsScorer",
    "   - TurnEvent, ToolCall, EvidenceBlob schemas",
    "   - FoundryExporter",
    "   - MockMemoManager",
)
for report_line in import_report:
    print(report_line)

In [None]:
# Additional imports
import json
import subprocess
import tempfile
import shutil
import hashlib
from datetime import datetime
from uuid import uuid4

def make_hash(data: dict | str) -> str:
    """Create SHA256 hash of data for ToolCall/EvidenceBlob."""
    if isinstance(data, dict):
        data = json.dumps(data, sort_keys=True)
    return hashlib.sha256(data.encode()).hexdigest()

def make_excerpt(data: dict | str, max_len: int = 200) -> str:
    """Create excerpt of data (first 200 chars)."""
    if isinstance(data, dict):
        data = json.dumps(data)
    return data[:max_len]

# All artifacts written by this notebook go under <project root>/runs/notebook_validation
NOTEBOOK_OUTPUT_DIR = PROJECT_ROOT.joinpath("runs", "notebook_validation")
NOTEBOOK_OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

print(f"‚úÖ Output directory: {NOTEBOOK_OUTPUT_DIR}")
print(f"‚úÖ Helper functions: make_hash(), make_excerpt()")

## 2. Create EventRecorder and Record Events

The `EventRecorder` captures orchestration events to JSONL format for later analysis.

In [None]:
# Build a timestamped run ID and an EventRecorder that writes into the notebook output directory
run_id = "notebook_validation_" + datetime.now().strftime("%Y%m%d_%H%M%S")
recorder = EventRecorder(run_id=run_id, output_dir=NOTEBOOK_OUTPUT_DIR)

print(f"‚úÖ EventRecorder created")
print(f"   Run ID: {run_id}")
print(f"   Output file: {recorder.output_path}")

In [None]:
# Manually record a sample conversation with multiple turns
# This simulates what the EvaluationOrchestratorWrapper does automatically

# NOTE(review): session_id is not referenced again in the visible cells — confirm whether it is still needed
session_id = str(uuid4())
base_ts = datetime.now()

# --- Turn 1: User reports potential fraud ---
turn1_id = "turn_001"
# All timestamps in this conversation are float seconds: base_ts.timestamp() plus fixed offsets
turn1_start = base_ts.timestamp()

recorder.record_turn_start(
    turn_id=turn1_id,
    agent="FraudDetectionAgent",
    user_text="I think someone stole my credit card. I see charges I didn't make.",
    timestamp=turn1_start
)
print(f"‚úÖ Turn 1 started: {turn1_id}")

# Record tool call: check_recent_transactions
# The tool starts 0.1 s into the turn
tool1_start = turn1_start + 0.1
recorder.record_tool_start(
    tool_name="check_recent_transactions",
    arguments={"account_id": "ACC-12345", "days": 7},
    timestamp=tool1_start
)

# Canned tool output; the amounts and merchants are repeated in the turn's response_text below
tool1_result = {
    "transactions": [
        {"id": "TXN-001", "amount": 499.99, "merchant": "Unknown Store", "date": "2026-01-20"},
        {"id": "TXN-002", "amount": 150.00, "merchant": "Gas Station", "date": "2026-01-19"},
    ]
}
# The tool ends 0.5 s after it started, matching the "500ms" in the log line below
tool1_end = tool1_start + 0.5
recorder.record_tool_end(
    tool_name="check_recent_transactions",
    result=tool1_result,
    end_ts=tool1_end,
    start_ts=tool1_start
)
print(f"   ‚úÖ Tool call recorded: check_recent_transactions (500ms)")

# End turn 1 - tool_calls and evidence_blobs are built automatically from record_tool_* calls
# The turn ends 1.2 s after it started, consistent with e2e_ms=1200
turn1_end = turn1_start + 1.2

recorder.record_turn_end(
    turn_id=turn1_id,
    agent="FraudDetectionAgent",
    response_text="I found 2 recent transactions on your account. There's a charge of $499.99 at 'Unknown Store' on January 20th, and $150.00 at 'Gas Station' on January 19th. Can you confirm if either of these was unauthorized?",
    e2e_ms=1200,
    timestamp=turn1_end,
    ttft_ms=150,
    model_config=EvalModelConfig(
        model_name="gpt-4o",
        endpoint_used="chat",
        temperature=0.7,
        max_tokens=500
    ),
    input_tokens=250,
    response_tokens=85
)
print(f"‚úÖ Turn 1 completed (1200ms E2E)")

In [None]:
# --- Turn 2: User confirms fraud and requests action ---
turn2_id = "turn_002"
# Turn 2 starts 2.0 s after turn 1 ended (turn1_end comes from the previous cell)
turn2_start = turn1_end + 2.0

recorder.record_turn_start(
    turn_id=turn2_id,
    agent="FraudDetectionAgent",
    user_text="Yes, the $499.99 charge at Unknown Store is fraudulent. I didn't make that purchase.",
    timestamp=turn2_start
)
print(f"‚úÖ Turn 2 started: {turn2_id}")

# Record tool calls: flag_transaction and create_dispute
# First tool: starts 0.1 s into the turn and runs for 0.3 s
tool2a_start = turn2_start + 0.1
tool2a_end = tool2a_start + 0.3
tool2a_result = {"status": "flagged", "case_id": "CASE-789"}

recorder.record_tool_start(
    tool_name="flag_fraudulent_transaction",
    arguments={"transaction_id": "TXN-001", "reason": "unauthorized"},
    timestamp=tool2a_start
)
recorder.record_tool_end(
    tool_name="flag_fraudulent_transaction",
    result=tool2a_result,
    end_ts=tool2a_end,
    start_ts=tool2a_start
)
print(f"   ‚úÖ Tool call recorded: flag_fraudulent_transaction (300ms)")

# Second tool: starts 0.4 s after the first tool started (0.1 s after it ended) and runs for 0.4 s
tool2b_start = tool2a_start + 0.4
tool2b_end = tool2b_start + 0.4
tool2b_result = {"dispute_id": "DISP-456", "status": "pending", "estimated_resolution": "5-7 business days"}

recorder.record_tool_start(
    tool_name="create_dispute",
    arguments={"transaction_id": "TXN-001", "case_id": "CASE-789"},
    timestamp=tool2b_start
)
recorder.record_tool_end(
    tool_name="create_dispute",
    result=tool2b_result,
    end_ts=tool2b_end,
    start_ts=tool2b_start
)
print(f"   ‚úÖ Tool call recorded: create_dispute (400ms)")

# End turn 2 - tool_calls and evidence_blobs are built automatically
# The turn ends 1.5 s after it started, consistent with e2e_ms=1500
turn2_end = turn2_start + 1.5

recorder.record_turn_end(
    turn_id=turn2_id,
    agent="FraudDetectionAgent",
    response_text="I've flagged the $499.99 transaction as fraudulent and created dispute DISP-456. Your case ID is CASE-789. The dispute should be resolved within 5-7 business days. A provisional credit will be applied to your account within 24 hours.",
    e2e_ms=1500,
    timestamp=turn2_end,
    ttft_ms=180,
    model_config=EvalModelConfig(
        model_name="gpt-4o",
        endpoint_used="chat",
        temperature=0.7,
        max_tokens=500
    ),
    input_tokens=320,
    response_tokens=95
)
print(f"‚úÖ Turn 2 completed (1500ms E2E)")

In [None]:
# --- Turn 3: Handoff to specialist ---
turn3_id = "turn_003"
# Turn 3 starts 1.5 s after turn 2 ended (turn2_end comes from the previous cell)
turn3_start = turn2_end + 1.5

recorder.record_turn_start(
    turn_id=turn3_id,
    agent="FraudDetectionAgent",
    user_text="I want to speak with a supervisor about this.",
    timestamp=turn3_start
)
print(f"‚úÖ Turn 3 started: {turn3_id}")

# Record handoff
# The handoff is stamped 0.2 s into the turn; no tool calls are recorded for this turn
recorder.record_handoff(
    source_agent="FraudDetectionAgent",
    target_agent="FraudSpecialistAgent",
    timestamp=turn3_start + 0.2
)
print(f"   ‚úÖ Handoff recorded: FraudDetectionAgent ‚Üí FraudSpecialistAgent")

# End turn 3
# NOTE(review): the turn is started as FraudDetectionAgent but ended as FraudSpecialistAgent
# (the handoff target) — confirm this is the intended attribution
turn3_end = turn3_start + 0.8
recorder.record_turn_end(
    turn_id=turn3_id,
    agent="FraudSpecialistAgent",
    response_text="I'm transferring you to our Fraud Specialist team. They will have full access to your case CASE-789 and can provide additional assistance. Please hold for a moment.",
    e2e_ms=800,
    timestamp=turn3_end,
    ttft_ms=120,
    model_config=EvalModelConfig(
        model_name="gpt-4o",
        endpoint_used="chat",
        temperature=0.7,
        max_tokens=500
    ),
    input_tokens=180,
    response_tokens=55
)
print(f"‚úÖ Turn 3 completed with handoff (800ms E2E)")

# Access the recorded events file path
print(f"\n‚úÖ All events written to: {recorder.output_path}")

## 3. Load and Inspect Recorded Events

Load the events from the JSONL file and inspect their structure.

In [None]:
# Read the recorded turns back from the JSONL file through MetricsScorer
scorer = MetricsScorer()
events = scorer.load_events(recorder.output_path)

print(f"‚úÖ Loaded {len(events)} events from JSONL file")
print(f"\nEvent types:")
for i, event in enumerate(events, 1):
    # A missing or empty tool_calls value counts as zero tools
    tool_count = len(event.tool_calls) if event.tool_calls else 0
    # Only turns that carry a handoff get the arrow suffix
    handoff = ""
    if event.handoff:
        handoff = f" ‚Üí {event.handoff.target_agent}"
    print(f"   Turn {i}: {event.agent_name} ({tool_count} tools){handoff}")

In [None]:
# Print the first recorded turn field by field
event = events[0]
print("üìã First Event Details:")
print(f"   Turn ID:        {event.turn_id}")
print(f"   Agent:          {event.agent_name}")
# User text and response are truncated to 60 characters for display
print(f"   User Text:      {event.user_text[:60]}...")
print(f"   Response:       {event.response_text[:60]}...")
print(f"   E2E Latency:    {event.e2e_ms}ms")
print(f"   TTFT:           {event.ttft_ms}ms")
print(f"   Input Tokens:   {event.input_tokens}")
print(f"   Response Tokens:{event.response_tokens}")

# Tool calls section is printed only when the turn has any
if event.tool_calls:
    print(f"\nüîß Tool Calls:")
    for tc in event.tool_calls:
        print(f"   - {tc.name}: {tc.duration_ms}ms (status={tc.status})")

# Evidence blobs section is printed only when the turn has any; excerpts are cut to 50 characters
if event.evidence_blobs:
    print(f"\nüìÑ Evidence Blobs:")
    for eb in event.evidence_blobs:
        print(f"   - [{eb.source}]: {eb.content_excerpt[:50]}...")

## 4. Score Individual Turns

Use `MetricsScorer.score_turn()` to compute metrics for each turn.

In [None]:
# Compute and print the metrics of every recorded turn, one section per turn
divider = "-" * 70
print("üìä Per-Turn Scores:\n")
print(divider)

for event in events:
    score = scorer.score_turn(event)

    print(f"Turn {event.turn_id}:")
    print(f"  üîß Tool Precision: {score.tool_precision:.2%}")
    print(f"  üîß Tool Recall:    {score.tool_recall:.2%}")
    print(f"  üîß Tool Efficiency:{score.tool_efficiency:.2%}")
    print(f"  ‚úì  Grounded Ratio: {score.grounded_span_ratio:.2%}")
    print(f"  ‚è±Ô∏è  E2E Latency:    {score.e2e_ms:.1f}ms")
    print(f"  üìù Verbosity Score:{score.verbosity_score:.2f}")
    print(f"  üìù Verbosity Tokens:{score.verbosity_tokens} / {score.verbosity_budget}")
    print(divider)

## 5. Generate Summary Metrics

Aggregate metrics across all turns using `MetricsScorer.generate_summary()`.

In [None]:
# Aggregate every recorded turn into one summary object and print it section by section.
# Each metric is read with .get(..., 0) so a missing key prints as zero.
summary = scorer.generate_summary(events, scenario_name="fraud_detection_validation")

banner = "=" * 70
print(banner)
print("üìä EVALUATION SUMMARY: fraud_detection_validation")
print(banner)

print(f"\nüîß Tool Metrics:")
print(f"   Total Calls:  {summary.tool_metrics.get('total_calls', 0)}")
print(f"   Precision:    {summary.tool_metrics.get('precision', 0):.2%}")
print(f"   Recall:       {summary.tool_metrics.get('recall', 0):.2%}")
print(f"   Efficiency:   {summary.tool_metrics.get('efficiency', 0):.2%}")

print(f"\n‚è±Ô∏è  Latency Metrics:")
print(f"   E2E P50:      {summary.latency_metrics.get('e2e_p50_ms', 0):.1f}ms")
print(f"   E2E P95:      {summary.latency_metrics.get('e2e_p95_ms', 0):.1f}ms")
print(f"   TTFT P50:     {summary.latency_metrics.get('ttft_p50_ms', 0):.1f}ms")

print(f"\n‚úì Groundedness Metrics:")
print(f"   Grounded Ratio:     {summary.groundedness_metrics.get('avg_grounded_ratio', 0):.2%}")
print(f"   Unsupported Claims: {summary.groundedness_metrics.get('avg_unsupported_claims', 0):.1f}")

print(f"\nüìù Verbosity Metrics:")
print(f"   Avg Response Tokens: {summary.verbosity_metrics.get('avg_response_tokens', 0):.0f}")
print(f"   Budget Violations:   {summary.verbosity_metrics.get('budget_violations', 0)}")

print(f"\nüí∞ Cost Analysis:")
print(f"   Total Input Tokens:  {summary.cost_analysis.get('total_input_tokens', 0)}")
print(f"   Total Output Tokens: {summary.cost_analysis.get('total_output_tokens', 0)}")
print(f"   Estimated Cost:      ${summary.cost_analysis.get('estimated_cost_usd', 0):.4f}")

print(banner)

In [None]:
# Save summary to JSON file.
# The file is written explicitly as UTF-8: without an encoding argument the
# platform default is used, which can fail on, or mis-encode, non-ASCII text
# in the serialized summary.
summary_file = NOTEBOOK_OUTPUT_DIR / "summary.json"
summary_file.write_text(summary.model_dump_json(indent=2), encoding="utf-8")

print(f"‚úÖ Summary saved to: {summary_file}")

## 6. Export to Azure AI Foundry Format

Use `FoundryExporter` to convert events to Azure AI Foundry compatible format for cloud-based evaluation.

In [None]:
# Export events to Foundry format
exporter = FoundryExporter()
foundry_output = NOTEBOOK_OUTPUT_DIR / "foundry_eval.jsonl"

# export_events takes a list of TurnEvents, not a path
exporter.export_events(
    events=events,
    output_path=foundry_output,
)

print(f"‚úÖ Exported to Foundry format: {foundry_output}")

# Inspect the exported data.
# The file is read explicitly as UTF-8 rather than with the platform default
# encoding, so non-ASCII text in queries/responses decodes the same everywhere.
# NOTE(review): assumes the exporter writes UTF-8 — confirm in FoundryExporter.
print(f"\nüìã Foundry Dataset Sample:")
with open(foundry_output, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= 2:  # Show first 2 records
            break
        record = json.loads(line)
        print(f"\nRecord {i+1}:")
        # Each field is truncated to 50 characters for display
        print(f"   Query:    {record.get('query', '')[:50]}...")
        print(f"   Response: {record.get('response', '')[:50]}...")
        if record.get('context'):
            print(f"   Context:  {record.get('context', '')[:50]}...")

## 7. Validate CLI Commands

Test the CLI commands to ensure they work correctly.

In [None]:
# Test CLI --help command.
# sys.executable is used instead of a bare "python" so the CLI runs under the
# same interpreter (and virtual environment) as this notebook kernel; "python"
# on PATH may be missing or point at a different installation.
result = subprocess.run(
    [sys.executable, "-m", "tests.evaluation.cli", "--help"],
    capture_output=True,
    text=True,
    cwd=str(PROJECT_ROOT)
)

print("üìã CLI Help Output:")
print("-" * 70)
print(result.stdout)
# Surface any error output (truncated), same as the score cell does
if result.stderr:
    print("STDERR:", result.stderr[:500])
print("-" * 70)
print(f"‚úÖ Exit code: {result.returncode}")

In [None]:
# Test CLI score command with our recorded events
cli_output_dir = NOTEBOOK_OUTPUT_DIR / "cli_output"
cli_output_dir.mkdir(exist_ok=True)

# sys.executable is used instead of a bare "python" so the CLI runs under the
# same interpreter (and virtual environment) as this notebook kernel.
result = subprocess.run(
    [
        sys.executable, "-m", "tests.evaluation.cli", "score",
        "--input", str(recorder.output_path),
        "--output", str(cli_output_dir)
    ],
    capture_output=True,
    text=True,
    cwd=str(PROJECT_ROOT)
)

print("üìã CLI Score Command Output:")
print("-" * 70)
print(result.stdout)
if result.stderr:
    # Only the first 500 characters of stderr are shown
    print("STDERR:", result.stderr[:500])
print("-" * 70)
print(f"‚úÖ Exit code: {result.returncode}")

# Check output files
# NOTE: summary_file is rebound here to the CLI's summary.json inside cli_output_dir
scores_file = cli_output_dir / "scores.jsonl"
summary_file = cli_output_dir / "summary.json"

print(f"\nüìÅ Output Files:")
print(f"   scores.jsonl exists: {scores_file.exists()}")
print(f"   summary.json exists: {summary_file.exists()}")

## 8. Score Existing Events (Real Data)

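If there are existing evaluation runs in the `runs/` directory, list their `*events.jsonl` files here; each one can then be scored with the CLI `score` command from the previous section.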

In [None]:
# Collect every file under runs/ (recursively) whose name ends in "events.jsonl"
runs_dir = PROJECT_ROOT / "runs"
events_files = list(runs_dir.glob("**/*events.jsonl"))

print(f"üìÅ Found {len(events_files)} events files in runs/")
# Only the first five matches are listed, with path relative to runs/ and size in KB
preview = events_files[:5]
for f in preview:
    relative_path = f.relative_to(runs_dir)
    size_kb = f.stat().st_size / 1024
    print(f"   - {relative_path} ({size_kb:.1f} KB)")

## 9. Submit to Azure AI Foundry

Submit evaluation data to Azure AI Foundry for cloud-based evaluation with built-in evaluators (relevance, coherence, groundedness, safety, etc.).

**Prerequisites:**
- Azure AI Foundry project with endpoint URL
- `AZURE_AI_FOUNDRY_PROJECT_ENDPOINT` set in `.env.local` or passed directly
- Storage account connected to Foundry project (for studio_url generation)

In [None]:
# Look for foundry_eval.jsonl exports produced by this notebook (recursive search)
foundry_files = list(NOTEBOOK_OUTPUT_DIR.rglob("foundry_eval.jsonl"))
print(f"üìÅ Found {len(foundry_files)} foundry_eval.jsonl files")

for f in foundry_files:
    relative_path = f.relative_to(NOTEBOOK_OUTPUT_DIR)
    size_kb = f.stat().st_size / 1024
    print(f"   - {relative_path} ({size_kb:.1f} KB)")

# Repeat the search across the whole runs directory; a missing directory yields an empty list
runs_foundry_files = []
if runs_dir.exists():
    runs_foundry_files = list(runs_dir.rglob("foundry_eval.jsonl"))
print(f"\nüìÅ Found {len(runs_foundry_files)} foundry_eval.jsonl files in runs/")
# Only the first five matches are listed
for f in runs_foundry_files[:5]:
    relative_path = f.relative_to(runs_dir)
    size_kb = f.stat().st_size / 1024
    print(f"   - {relative_path} ({size_kb:.1f} KB)")

In [None]:
# Export our test events to Foundry format first (if not already exported)
from tests.evaluation.foundry_exporter import FoundryExporter
from tests.evaluation.schemas import FoundryExportConfig

# Create Foundry export from our test data
foundry_export_path = NOTEBOOK_OUTPUT_DIR / "foundry_eval.jsonl"
evaluator_config_path = NOTEBOOK_OUTPUT_DIR / "foundry_evaluators.json"

# Configure export with common evaluators
export_config = FoundryExportConfig(
    enabled=True,
    evaluators=["relevance", "coherence", "groundedness", "fluency"],
    include_metadata=True,
    context_source="evidence",
)
exporter = FoundryExporter(export_config)

# Step 1: make sure the dataset exists (an existing export is never overwritten)
if foundry_export_path.exists():
    print(f"‚ÑπÔ∏è  Foundry export already exists: {foundry_export_path}")
elif events:
    exporter.export_events(events, foundry_export_path)
    print(f"‚úÖ Exported {len(events)} events to {foundry_export_path}")
else:
    print("‚ö†Ô∏è  No events to export. Run the test data generation cells first.")

# Step 2: make sure the evaluator config exists.
# This is decided independently of step 1: the earlier export cell already writes
# foundry_eval.jsonl, so tying config generation to "export was missing" meant
# foundry_evaluators.json was never created even though the submit cell passes its path.
# NOTE(review): assumes generate_evaluator_config() only needs the exporter's config,
# not a prior export_events() call on the same instance — confirm in FoundryExporter.
if foundry_export_path.exists() and not evaluator_config_path.exists():
    exporter.generate_evaluator_config(evaluator_config_path)
    print(f"‚úÖ Generated evaluator config: {evaluator_config_path}")

In [None]:
# Submit to Azure AI Foundry
# NOTE: Requires AZURE_AI_FOUNDRY_PROJECT_ENDPOINT to be set

from tests.evaluation.foundry_exporter import submit_to_foundry
import os

# Read the Foundry project endpoint from the process environment; empty string when unset
foundry_endpoint = os.environ.get("AZURE_AI_FOUNDRY_PROJECT_ENDPOINT", "")

if not foundry_endpoint:
    print("‚ö†Ô∏è  AZURE_AI_FOUNDRY_PROJECT_ENDPOINT not set")
    print("   Set it in .env.local or App Config to enable Foundry submission")
    print("   Format: https://<resource>.services.ai.azure.com/api/projects/<project>")
else:
    # Only the first 50 characters of the endpoint are echoed
    print(f"‚úÖ Foundry endpoint configured: {foundry_endpoint[:50]}...")

In [None]:
# Run Foundry evaluation (only if endpoint is configured)
# This will:
#   1. Run evaluators locally (relevance, coherence, groundedness, etc.)
#   2. Log results to Azure AI Foundry portal
#   3. Return a studio_url to view results in the portal

if foundry_endpoint and foundry_export_path.exists():
    print("üöÄ Submitting to Azure AI Foundry...")
    print(f"   Data: {foundry_export_path}")
    
    try:
        # Submit evaluation
        foundry_result = await submit_to_foundry(
            data_path=foundry_export_path,
            evaluators_config_path=NOTEBOOK_OUTPUT_DIR / "foundry_evaluators.json",
            project_endpoint=foundry_endpoint,
            evaluation_name=f"notebook_validation_{run_id}",
            model_deployment_name="gpt-4o",  # For AI-based evaluators
        )
        
        print("\n‚úÖ Foundry evaluation complete!")
        print(f"   Evaluation: {foundry_result.get('evaluation_name')}")
        print(f"   Status: {foundry_result.get('status')}")
        print(f"   Rows: {foundry_result.get('rows_evaluated')}")
        
        # Show metrics
        metrics = foundry_result.get("metrics", {})
        if metrics:
            print("\nüìä Evaluation Metrics:")
            for name, value in metrics.items():
                if isinstance(value, float):
                    print(f"   {name}: {value:.3f}")
                else:
                    print(f"   {name}: {value}")
        
        # Show studio URL (most important!)
        studio_url = foundry_result.get("studio_url")
        if studio_url:
            print(f"\nüîó View in Azure AI Foundry:")
            print(f"   {studio_url}")
        else:
            print("\n‚ö†Ô∏è  No studio_url returned - check that storage account is connected to Foundry project")
            
    except ImportError as e:
        print(f"‚ùå Missing dependency: {e}")
        print("   Install with: pip install azure-ai-evaluation")
    except ValueError as e:
        print(f"‚ùå Configuration error: {e}")
    except Exception as e:
        print(f"‚ùå Foundry submission failed: {e}")
else:
    if not foundry_endpoint:
        print("‚è≠Ô∏è  Skipping Foundry submission (endpoint not configured)")
    else:
        print(f"‚è≠Ô∏è  Skipping Foundry submission (no data at {foundry_export_path})")

In [None]:
# Alternative: Use CLI to submit to Foundry
# This is useful for batch submissions or CI/CD pipelines
import shlex

print("üìã CLI Submit Command:")
print("-" * 70)

if foundry_export_path.exists():
    # Quote the path so the printed command stays copy-pasteable when the path
    # contains spaces or shell metacharacters; safe paths are left unchanged.
    quoted_input = shlex.quote(str(foundry_export_path))
    cli_cmd = f"""python -m tests.evaluation.cli submit \\
    --input {quoted_input} \\
    --endpoint "$AZURE_AI_FOUNDRY_PROJECT_ENDPOINT" \\
    --name "notebook_validation" \\
    --model "gpt-4o" """
    print(cli_cmd)
else:
    # No export yet: print a template command with a placeholder path
    print("# First generate foundry_eval.jsonl, then run:")
    print("""python -m tests.evaluation.cli submit \\
    --input path/to/foundry_eval.jsonl \\
    --endpoint "$AZURE_AI_FOUNDRY_PROJECT_ENDPOINT" \\
    --name "my_evaluation" \\
    --model "gpt-4o" """)

print("-" * 70)
print("\nüí° Tip: Run 'python -m tests.evaluation.cli submit --help' for all options")