# Baseline Evaluation: Model-Agnostic Evaluation System

This notebook evaluates baseline performance for multiple models:
- **Gemini 2.0 Flash** (2-step pipeline)
- **Gemini 2.5 Flash Lite** (2-step pipeline)
- **Qwen3-4B-Instruct-2507** (base model, untuned)

Metrics tracked:
- Syntactic validity (passes RuleBasedValidator)
- Execution success (runs on Neo4j)
- Semantic accuracy (exact result match with gold Cypher)
- Latency (ms)
- Token usage (input/output tokens)


In [None]:
# Setup & Configuration

import json
import os
import sys
from collections import defaultdict
from pathlib import Path
from typing import Any

import pandas as pd
from dotenv import load_dotenv

# Add project root to path.
# Candidates are the resolved working directory followed by each of its
# ancestors (nearest first); the first one that contains both a
# pyproject.toml and a src/ directory is taken as the project root.
_start_dir = Path(os.getcwd()).resolve()
ROOT = next(
    (
        candidate
        for candidate in (_start_dir, *_start_dir.parents)
        if (candidate / "pyproject.toml").exists() and (candidate / "src").is_dir()
    ),
    None,
)
found_root = ROOT is not None

if not found_root:
    raise FileNotFoundError(
        f"Could not find project root (directory with both pyproject.toml and src/). "
        f"Current directory: {os.getcwd()}. "
        f"Please set ROOT manually above."
    )

# Put the root first on sys.path so the `src` package below resolves,
# without adding a duplicate entry when the cell is re-run.
_root_entry = str(ROOT)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

print(f"Project root: {ROOT}")
print(f"Current working directory: {os.getcwd()}")
print(f"src module verified at: {ROOT / 'src'}")

from src.pipeline.executor import Neo4jExecutor
from src.pipeline.types import PipelineConfig
from src.pipeline.validator import RuleBasedValidator

# Load variables from a .env file (if any) before reading os.environ below.
load_dotenv()

# Configuration
# Models to evaluate (supports single model in list or multiple models)
MODELS_TO_RUN = [
    # "gemini-2.0-flash",
    # "gemini-2.5-flash-lite",
    "qwen3-4b-it-2507-base",
]
TEST_SUBSET_SIZE = 80  # Number of test records to evaluate (fits rate limits)
RATE_LIMIT_RPM = 15  # Gemini rate limit: requests per minute (common for all Gemini models)

# Path configuration
BASE_DIR = ROOT
DATA_DIR = BASE_DIR / "finetuning" / "data" / "processed" / "splits"
EVAL_DIR = BASE_DIR / "finetuning" / "evaluation" / "results"
EVAL_DIR.mkdir(parents=True, exist_ok=True)

# Environment variables
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USER = os.environ.get("NEO4J_USER", "neo4j")
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "")
# NOTE: the Gemini key is read from the GOOGLE_API_KEY environment variable.
GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY", "")

if not NEO4J_PASSWORD:
    raise ValueError("NEO4J_PASSWORD environment variable not set")
# The API key is only handed to adapters for "gemini*" models, so only demand
# it when such a model is selected; a Qwen-only run must not fail here.
if not GEMINI_API_KEY and any(name.startswith("gemini") for name in MODELS_TO_RUN):
    raise ValueError(
        "GOOGLE_API_KEY environment variable not set (required for Gemini models)"
    )

print(f"Base directory: {BASE_DIR}")
print(f"Data directory: {DATA_DIR}")
print(f"Evaluation directory: {EVAL_DIR}")
print(f"Test subset size: {TEST_SUBSET_SIZE}")
print(f"Models to evaluate: {MODELS_TO_RUN}")

In [None]:
# Helper Functions

from collections import Counter


def load_test_set(n: int) -> list[dict[str, Any]]:
    """Load the test split and return at most ``n`` records, stratified by template.

    Records are read from ``DATA_DIR / "test_sample.jsonl"`` (one JSON object
    per line; blank lines are ignored). When the file holds ``n`` records or
    fewer, all of them are returned in file order. Otherwise each
    ``template_id`` contributes its first ``max(1, int(share))`` records, where
    ``share`` is its proportional part of ``n``; any shortfall is topped up
    with a reproducible random draw from the records not yet selected.

    Because every template contributes at least one record, the proportional
    pass can collect more than ``n`` records; the result is then truncated to
    the first ``n``.

    Args:
        n: Maximum number of records to return. Values <= 0 yield an empty
            list unless the file itself is empty.

    Returns:
        The selected records as dictionaries.

    Raises:
        FileNotFoundError: If the test file does not exist.
    """
    import random  # only needed for the top-up draw

    test_file = DATA_DIR / "test_sample.jsonl"
    if not test_file.exists():
        raise FileNotFoundError(f"Test file not found: {test_file}")

    with test_file.open("r", encoding="utf-8") as f:
        # Skip blank lines (e.g. a trailing newline) instead of letting
        # json.loads fail on them.
        records = [json.loads(line) for line in f if line.strip()]

    if n >= len(records):
        return records
    if n <= 0:
        return []

    # Stratified sampling by template_id
    records_by_template = defaultdict(list)
    for record in records:
        records_by_template[record["template_id"]].append(record)

    fraction = n / len(records)
    sampled = []
    for template_records in records_by_template.values():
        quota = max(1, int(len(template_records) * fraction))
        sampled.extend(template_records[: min(quota, len(template_records))])

    # Top up from the unselected records if the proportional pass fell short.
    shortfall = n - len(sampled)
    if shortfall > 0:
        # Track selection by object identity: O(1) per lookup, and equal-valued
        # duplicate records are not mistaken for already-selected ones.
        taken = {id(record) for record in sampled}
        remaining = [record for record in records if id(record) not in taken]
        # A private generator keeps the draw reproducible (seed 42) without
        # reseeding the process-wide random state.
        sampled.extend(random.Random(42).sample(remaining, shortfall))

    return sampled[:n]


def extract_partial_matches(results: list[dict[str, Any]], model_name: str) -> list[dict[str, Any]]:
    """Return trimmed copies of the results that ran but produced the wrong rows.

    A result qualifies when it is syntactically valid and executed
    successfully, yet its result set did not match the gold one. Each returned
    dictionary keeps only the fields needed for manual inspection; missing
    ``gold_rows`` / ``generated_rows`` default to an empty list and a missing
    ``error`` to ``None``.

    Args:
        results: Per-record evaluation results.
        model_name: Accepted for call-site symmetry; not used by this function.

    Returns:
        One inspection dictionary per qualifying result, in input order.
    """

    def _ran_but_mismatched(entry: dict[str, Any]) -> bool:
        return bool(
            entry["syntactic_valid"]
            and entry["execution_success"]
            and not entry["result_match"]
        )

    return [
        {
            "id": entry["id"],
            "question": entry["question"],
            "gold_cypher": entry["gold_cypher"],
            "generated_cypher": entry["generated_cypher"],
            "gold_rows": entry.get("gold_rows", []),
            "generated_rows": entry.get("generated_rows", []),
            "error": entry.get("error"),
        }
        for entry in results
        if _ran_but_mismatched(entry)
    ]


def create_inspection_summary(results: list[dict[str, Any]], model_name: str) -> pd.DataFrame:
    """Create a readable one-row-per-result summary DataFrame.

    The three pass/fail flags are rendered as check/cross marks, the row lists
    are reduced to their lengths, and missing or null ``question`` / ``error``
    values become empty strings.

    Args:
        results: Per-record evaluation results.
        model_name: Accepted for call-site symmetry; not used by this function.

    Returns:
        A DataFrame with columns ``id``, ``question``, ``syntactic_valid``,
        ``execution_success``, ``result_match``, ``gold_rows``, ``gen_rows``
        and ``error``. The columns are present even when ``results`` is empty.
    """
    columns = [
        "id",
        "question",
        "syntactic_valid",
        "execution_success",
        "result_match",
        "gold_rows",
        "gen_rows",
        "error",
    ]

    def _mark(flag: Any) -> str:
        # Real check/cross characters (the previous literals were mojibake).
        return "✓" if flag else "✗"

    summary_data = [
        {
            "id": r.get("id", ""),
            "question": r.get("question") or "",
            "syntactic_valid": _mark(r.get("syntactic_valid")),
            "execution_success": _mark(r.get("execution_success")),
            "result_match": _mark(r.get("result_match")),
            # `or []` also covers keys that are present but null, for which
            # len() would otherwise raise TypeError.
            "gold_rows": len(r.get("gold_rows") or []),
            "gen_rows": len(r.get("generated_rows") or []),
            "error": r.get("error") or "",
        }
        for r in results
    ]
    return pd.DataFrame(summary_data, columns=columns)


_RULE_HEAVY = "=" * 100
_RULE_LIGHT = "-" * 100


def _print_cypher_section(title: str, cypher: str) -> None:
    """Print a banner with ``title`` followed by ``cypher`` in a fenced block."""
    print("\n" + _RULE_HEAVY)
    print(title)
    print(_RULE_HEAVY)
    if cypher:
        print("```cypher")
        print(cypher)
        print("```")
    else:
        print("  (empty)")


def _print_rows_section(label: str, rows: list[dict[str, Any]], max_rows_display: int) -> None:
    """Print the row count for ``label`` and up to ``max_rows_display`` rows."""
    print(f"\n📊 {label} RESULTS: {len(rows)} row(s)")
    print(_RULE_LIGHT)
    if not rows:
        print("  (no results)")
        return
    for i, row in enumerate(rows[:max_rows_display], 1):
        row_str = ", ".join(f"{k}={v!r}" for k, v in row.items())
        print(f"  [{i:3d}] {row_str}")
    hidden = len(rows) - max_rows_display
    if hidden > 0:
        print(f"  ... ({hidden} more rows)")


def inspect_record(record: dict[str, Any], max_rows_display: int = 20) -> None:
    """Pretty-print a record for detailed inspection.

    Prints the record's id and question, its error (if any), then the gold
    Cypher with its rows followed by the generated Cypher with its rows.

    Args:
        record: An evaluation result. Missing or null ``gold_cypher`` /
            ``generated_cypher`` are shown as "(empty)"; missing or null
            ``gold_rows`` / ``generated_rows`` are shown as "(no results)".
        max_rows_display: Maximum number of rows printed per result set; the
            number of omitted rows is reported afterwards.
    """
    print(_RULE_HEAVY)
    print(f"ID: {record.get('id', 'N/A')}")
    print(f"Question: {record.get('question', 'N/A')}")
    print("\n" + _RULE_LIGHT)

    error = record.get("error")
    if error:
        print(f"ERROR: {error}")

    # `or` fallbacks cover keys that are present but null, which would
    # otherwise crash len() on the row lists.
    _print_cypher_section(
        "📋 GOLD CYPHER (Expected Reference)", record.get("gold_cypher") or ""
    )
    _print_rows_section("GOLD", record.get("gold_rows") or [], max_rows_display)

    _print_cypher_section(
        "🔧 GENERATED CYPHER (Model Output)", record.get("generated_cypher") or ""
    )
    _print_rows_section("GENERATED", record.get("generated_rows") or [], max_rows_display)

    print("\n")


print("Helper functions loaded successfully")

In [None]:
# Import Evaluation Harness and Model Adapters

from finetuning.evaluation import (
    Evaluator,
    GeminiModelAdapter,
    ModelAdapter,
    QwenModelAdapter,
    run_evaluation,
)

print("Evaluation harness and model adapters imported successfully")

In [None]:
# Load Test Set

# Load the evaluation subset, then report how many records each template
# contributes (templates listed in sorted order).
test_records = load_test_set(TEST_SUBSET_SIZE)
print(f"Loaded {len(test_records)} test records")

template_counts = Counter(record["template_id"] for record in test_records)
print("Template distribution:")
for template_id in sorted(template_counts):
    count = template_counts[template_id]
    print(f"  {template_id}: {count}")

In [None]:
# Initialize Shared Components

# A single PipelineConfig instance is handed to both the validator and the
# executor below.
config = PipelineConfig()
validator = RuleBasedValidator(config=config)

# Neo4j connection settings come from the environment variables read during
# setup.
neo4j_settings = {
    "uri": NEO4J_URI,
    "user": NEO4J_USER,
    "password": NEO4J_PASSWORD,
    "config": config,
}
executor = Neo4jExecutor(**neo4j_settings)

print("Shared components initialized (validator, executor)")

In [None]:
# Run Evaluations for All Models


def create_model_adapter(model_id: str) -> ModelAdapter:
    """Build the model adapter that matches the prefix of ``model_id``.

    Args:
        model_id: Model identifier. IDs starting with ``"gemini"`` get a
            ``GeminiModelAdapter`` configured with the notebook's API key,
            a temperature of 0.1 and the shared rate limit; IDs starting with
            ``"qwen"`` get a default-constructed ``QwenModelAdapter``.

    Returns:
        The adapter instance for the model family.

    Raises:
        ValueError: If ``model_id`` starts with neither prefix.
    """
    if model_id.startswith("qwen"):
        return QwenModelAdapter()

    if not model_id.startswith("gemini"):
        raise ValueError(f"Unknown model: {model_id}")

    gemini_options = {
        "model": model_id,
        "api_key": GEMINI_API_KEY,
        "temperature": 0.1,
        "rate_limit_rpm": RATE_LIMIT_RPM,
    }
    return GeminiModelAdapter(**gemini_options)


# Per-model outputs, keyed by model id. Both dicts are reset every time this
# cell runs.
all_results: dict[str, list[dict[str, Any]]] = {}
all_metrics: dict[str, dict[str, Any]] = {}

# Run evaluation for each model
for model_id in MODELS_TO_RUN:
    print(f"\n{'='*80}")
    print(f"Evaluating: {model_id}")
    print(f"{'='*80}")

    # The validator and executor are shared; only the adapter is model-specific.
    adapter = create_model_adapter(model_id)
    evaluator = Evaluator(validator, executor, adapter)

    # One checkpoint file per model, also read back by the reload cell.
    checkpoint_file = EVAL_DIR / f"{model_id}_checkpoint.jsonl"

    # Evaluate the full loaded subset. A leftover `[:2]` debug slice previously
    # limited every run to two records while the saved summary still reported
    # `len(test_records)` as the test set size.
    results = run_evaluation(
        model_adapter=adapter,
        test_records=test_records,
        checkpoint_file=checkpoint_file,
        evaluator=evaluator,
        checkpoint_interval=5,
    )

    # Store results and compute metrics
    all_results[model_id] = results
    all_metrics[model_id] = evaluator.aggregate_metrics(results)

    # Print metrics (floats rounded to two decimals, everything else as-is)
    print(f"\n{model_id} Metrics:")
    for key, value in all_metrics[model_id].items():
        if isinstance(value, float):
            print(f"  {key}: {value:.2f}")
        else:
            print(f"  {key}: {value}")

print(f"\n{'='*80}")
print("All evaluations complete!")
print(f"{'='*80}")

In [None]:
# Load Results from Checkpoints (if notebook was restarted)

# If all_results is not defined or empty, rebuild it (and all_metrics) from
# the per-model checkpoint files.
if "all_results" not in globals() or not all_results:
    print("Loading results from checkpoint files...")
    all_results = {}
    all_metrics = {}

    for model_id in MODELS_TO_RUN:
        checkpoint_file = EVAL_DIR / f"{model_id}_checkpoint.jsonl"
        if not checkpoint_file.exists():
            print(f"  No checkpoint found for {model_id}")
            continue

        # Read everything first so the file is closed before the adapter and
        # evaluator are built. Blank lines are skipped rather than passed to
        # json.loads, which would raise on them.
        with checkpoint_file.open("r", encoding="utf-8") as f:
            results = [json.loads(line) for line in f if line.strip()]
        all_results[model_id] = results

        # An Evaluator is constructed only to reuse aggregate_metrics(); no
        # evaluation is run here.
        adapter = create_model_adapter(model_id)
        evaluator = Evaluator(validator, executor, adapter)
        all_metrics[model_id] = evaluator.aggregate_metrics(results)
        print(f"  Loaded {len(results)} results for {model_id}")

print(f"\nAvailable model results: {list(all_results.keys())}")

In [None]:
# Results Analysis & Comparison

if not all_results:
    raise ValueError("No results available. Run Cell 6 to evaluate models first.")

# Each table row pairs a display label with the key it is read from in a
# model's aggregated metrics; metrics a model lacks are shown as 0.
metric_rows = [
    ("Syntactic Validity (%)", "syntactic_validity_pct"),
    ("Execution Success (%)", "execution_success_pct"),
    ("Semantic Accuracy (%)", "semantic_accuracy_pct"),
    ("Avg Latency (ms)", "avg_latency_ms"),
    ("Avg Input Tokens", "avg_input_tokens"),
    ("Avg Output Tokens", "avg_output_tokens"),
    ("Total Input Tokens", "total_input_tokens"),
    ("Total Output Tokens", "total_output_tokens"),
]

# First column holds the labels; one further column is added per model.
comparison_data = {"Metric": [label for label, _ in metric_rows]}
for model_id, metrics in all_metrics.items():
    comparison_data[model_id] = [metrics.get(key, 0) for _, key in metric_rows]

df_comparison = pd.DataFrame(comparison_data)
banner = "=" * 80
print("\n" + banner)
print("BASELINE EVALUATION COMPARISON")
print(banner)
print(df_comparison.to_string(index=False))
print("\n")

# Write one per-record summary CSV per model
for model_id, results in all_results.items():
    summary_file = EVAL_DIR / f"{model_id}_summary.csv"
    summary = create_inspection_summary(results, model_id)
    summary.to_csv(summary_file, index=False)
    print(f"  {model_id} summary saved to: {summary_file}")

In [None]:
# Identify Partial Matches for All Models

# Partial matches per model: records that validated and executed but whose
# rows differ from the gold result.
all_partial_matches = {}

for model_id, results in all_results.items():
    partial = extract_partial_matches(results, model_id)
    all_partial_matches[model_id] = partial

    # Persist them as JSONL (one record per line) for later inspection
    partial_file = EVAL_DIR / f"partial_matches_{model_id}.jsonl"
    with partial_file.open("w", encoding="utf-8") as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in partial
        )

    print(f"  {model_id}: {len(partial)} partial matches")

print("\nPartial matches saved:")
for model_id in all_partial_matches:
    saved_path = EVAL_DIR / f"partial_matches_{model_id}.jsonl"
    print(f"  {saved_path}")

In [None]:
# Display Detailed Inspection for All Models

# Show at most the first ten partial matches of every model that has any;
# models without partial matches are skipped silently.
for model_id, partial_matches in all_partial_matches.items():
    if not partial_matches:
        continue

    rule = "=" * 100
    print("\n" + rule)
    print(f"DETAILED INSPECTION: {model_id.upper()} PARTIAL MATCHES (syntax + exec pass, result mismatch)")
    print(rule)

    shown = partial_matches[:10]
    for i, record in enumerate(shown):
        print(f"\n--- Mismatch #{i} ({model_id}) ---")
        inspect_record(record)

In [None]:
# Save Final Results Summary

from datetime import datetime

# Assemble the run summary: when it was produced, how many test records were
# loaded, and per model its aggregated metrics plus the number of results.
results_summary = {
    "timestamp": datetime.now().isoformat(),
    "test_set_size": len(test_records),
    "models": {
        model_id: {
            "metrics": metrics,
            "result_count": len(all_results.get(model_id, [])),
        }
        for model_id, metrics in all_metrics.items()
    },
}

results_file = EVAL_DIR / "evaluation_results.json"
with results_file.open("w", encoding="utf-8") as f:
    json.dump(results_summary, f, indent=2, ensure_ascii=False)

print(f"\nFinal results summary saved to: {results_file}")
print("\nEvaluation complete!")

In [None]:
# Interactive Inspection Tool


def inspect_by_id(record_id: str, model_id: str | None = None):
    """Inspect a specific record by ID across all evaluated models.

    Args:
        record_id: The record ID to search for.
        model_id: Optional model ID to search in. If None, searches all models.
    """
    if model_id:
        # Search in specific model
        if model_id in all_results:
            for record in all_results[model_id]:
                if record.get("id") == record_id:
                    print(f"\nFound in {model_id}:")
                    inspect_record(record, max_rows_display=50)
                    return
            print(f"Record with ID '{record_id}' not found in {model_id}.")
        else:
            print(f"Model '{model_id}' not found. Available models: {list(all_results.keys())}")
    else:
        # Search across all models
        found = False
        for mid, results in all_results.items():
            for record in results:
                if record.get("id") == record_id:
                    print(f"\nFound in {mid}:")
                    inspect_record(record, max_rows_display=50)
                    found = True
                    break
            if found:
                break
        if not found:
            print(f"Record with ID '{record_id}' not found in any model.")


print("\nUse inspect_by_id('RECORD_ID') to inspect a specific record in detail.")
print("Use inspect_by_id('RECORD_ID', 'model_id') to search in a specific model.")
print("Example: inspect_by_id('F5.2-000167-S1-P2')")
print("Example: inspect_by_id('F5.2-000167-S1-P2', 'gemini-2.0-flash')")