# iofold Eval Experimentation Notebook

This notebook provides a sandbox for experimenting with different eval strategies on trace data from the iofold database.

## Setup

1. Make sure the backend is running (`pnpm run dev` in project root)
2. Install dependencies: `pip install -r requirements.txt`
3. Run cells in order

## What you can do here:
- Explore trace data structure
- Write custom eval functions in Python
- Test eval functions against labeled traces
- Compare eval strategies with statistical metrics

## 1. Setup & Imports

In [None]:
# Core imports
import json
import sqlite3
from pathlib import Path
from typing import Optional, Any

# Data analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistics
from scipy import stats
from sklearn.metrics import confusion_matrix, cohen_kappa_score, f1_score, accuracy_score

# LLM clients (optional, for LLM-as-judge evals)
try:
    from openai import OpenAI
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False

try:
    import anthropic
    ANTHROPIC_AVAILABLE = True
except ImportError:
    ANTHROPIC_AVAILABLE = False

# Local utilities
import db_utils

# Display settings: show every DataFrame column, cap cell text at 100 chars
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 100),
):
    pd.set_option(option_name, option_value)
plt.style.use('seaborn-v0_8-whitegrid')

# Report which optional LLM client libraries were importable
print("Setup complete!")
print(f"OpenAI available: {OPENAI_AVAILABLE}")
print(f"Anthropic available: {ANTHROPIC_AVAILABLE}")

## 2. Database Connection & Exploration

In [None]:
# Print the name of every table db_utils reports for the database
tables = db_utils.list_tables()
print("Database Tables:")
table_lines = [f"  - {table}" for table in tables]
if table_lines:
    print("\n".join(table_lines))

In [None]:
# Get trace statistics
# Bound to `trace_stats` (not `stats`) so the `scipy.stats` module imported as
# `stats` in the setup cell is not overwritten by this dict.
trace_stats = db_utils.get_trace_statistics()
print("\nTrace Statistics:")
for key, value in trace_stats.items():
    print(f"  {key}: {value}")

In [None]:
# Show the column layout of the `traces` table
print("\nTraces Table Schema:")
traces_schema = db_utils.get_table_schema('traces')
display(traces_schema)

## 3. Exploring Traces

In [None]:
# Fetch the 20 most recently imported traces, with any feedback joined on
RECENT_TRACES_SQL = """
    SELECT 
        t.id,
        t.trace_id,
        t.source,
        t.timestamp,
        t.step_count,
        t.has_errors,
        t.input_preview,
        t.output_preview,
        f.rating as feedback_rating,
        f.rating_detail
    FROM traces t
    LEFT JOIN feedback f ON f.trace_id = t.id
    ORDER BY t.imported_at DESC
    LIMIT 20
"""
traces_df = db_utils.query(RECENT_TRACES_SQL, as_df=True)

trace_count = len(traces_df)
print(f"Found {trace_count} traces")
display(traces_df)

In [None]:
# Inspect the first fetched trace: summary plus a peek at its leading steps
if len(traces_df):
    sample_trace_id = traces_df.iloc[0]['id']
    trace = db_utils.get_trace_by_id(sample_trace_id)

    if trace:
        print("Sample Trace Details:")
        db_utils.print_trace_summary(trace)

        print("\n--- Steps Structure ---")
        leading_steps = trace.steps[:3]  # only the first 3 steps are shown
        for i, step in enumerate(leading_steps):
            print(f"\nStep {i}:")
            message_count = len(step.get('messages_added', []))
            print(f"  Messages: {message_count}")
            tool_call_count = len(step.get('tool_calls', []))
            print(f"  Tool Calls: {tool_call_count}")
            step_error = step.get('error')
            if step_error:
                print(f"  Error: {step_error}")

## 4. Working with Labeled Traces

Labeled traces have human feedback (positive/negative rating) that we can use to train and test eval functions.

In [None]:
# Load up to 100 traces that carry human feedback and summarize the labels
labeled_traces = db_utils.get_labeled_traces(limit=100)
labeled_count = len(labeled_traces)
print(f"Found {labeled_count} labeled traces")

if labeled_traces:
    labeled_df = pd.DataFrame(labeled_traces)

    print("\nRating distribution:")
    rating_counts = labeled_df['rating'].value_counts()
    print(rating_counts)

    print("\nHuman score distribution:")
    human_score_summary = labeled_df['human_score'].describe()
    print(human_score_summary)

## 5. Writing Custom Eval Functions

Eval functions follow this signature:
```python
def eval_function(task: dict, task_metadata: dict, trace: dict, ctx) -> tuple[float, str]:
    # Returns: (score between 0.0 and 1.0, feedback string)
    return (score, feedback)
```

For notebook experimentation, we keep the same signature but pass a lightweight mock context (`MockEvalContext`, defined below) as `ctx`.

In [None]:
# Mock context for eval functions (simplified version)
class MockEvalContext:
    """Mock context for testing eval functions in notebooks.

    Wraps optional OpenAI / Anthropic clients behind a single ``call_llm``
    method with a per-instance response cache.

    Args:
        openai_client: Client used for models whose name contains "gpt".
        anthropic_client: Client used for models whose name contains "claude".
        budget: Total budget reported by ``get_remaining_budget``.
    """

    def __init__(self, openai_client=None, anthropic_client=None,
                 budget: float = 0.05):
        self.openai = openai_client
        self.anthropic = anthropic_client
        self._budget = budget
        # NOTE: nothing in this mock updates `_cost`; it stays 0.0 unless
        # set from outside the class.
        self._cost = 0.0
        self._cache = {}

    def call_llm(self, prompt: str, model: str = "gpt-4o-mini",
                 temperature: float = 0.0, max_tokens: int = 500,
                 cache_key: Optional[str] = None) -> str:
        """Call LLM for semantic evaluation (optional).

        Args:
            prompt: Sent as a single user message.
            model: Routed to OpenAI when the name contains "gpt", to
                Anthropic when it contains "claude" (case-insensitive).
            temperature: Sampling temperature forwarded to the provider.
            max_tokens: Completion token limit forwarded to the provider.
            cache_key: When truthy, the response is stored under this key and
                returned from the cache on later calls with the same key.

        Returns:
            The text of the model's reply.

        Raises:
            ValueError: If no configured client matches ``model``.
        """
        # Check cache first
        if cache_key and cache_key in self._cache:
            return self._cache[cache_key]

        model_name = model.lower()
        if self.openai and 'gpt' in model_name:
            response = self.openai.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                max_tokens=max_tokens
            )
            result = response.choices[0].message.content
        elif self.anthropic and 'claude' in model_name:
            # Forward temperature here too so both providers honor the
            # caller's setting (it was previously dropped on this branch).
            response = self.anthropic.messages.create(
                model=model,
                max_tokens=max_tokens,
                temperature=temperature,
                messages=[{"role": "user", "content": prompt}]
            )
            result = response.content[0].text
        else:
            raise ValueError(f"No client available for model: {model}")

        # Cache result
        if cache_key:
            self._cache[cache_key] = result

        return result

    def get_cost_so_far(self) -> float:
        """Return the accumulated cost tracked by this context."""
        return self._cost

    def get_remaining_budget(self) -> float:
        """Return the configured budget minus the accumulated cost."""
        return self._budget - self._cost

# Initialize context (optional - add your API keys if needed)
ctx = MockEvalContext()
print("Mock context created. Add API clients if you need LLM-based evals.")

In [None]:
# Example Eval Function 1: Simple heuristic-based eval
def eval_response_length(task: dict, task_metadata: dict, trace: dict, ctx) -> tuple[float, str]:
    """
    Score a trace purely by the character length of the assistant response.

    Scores step up with length (0.2, 0.5, 0.8, 1.0 at the 50 / 200 / 500
    character boundaries); a missing response scores 0.0.
    """
    response = db_utils.extract_assistant_response(trace)
    if not response:
        return (0.0, "No assistant response found")

    length = len(response)

    # (exclusive upper bound, score, label), checked in ascending order
    brackets = (
        (50, 0.2, "Response too short"),
        (200, 0.5, "Response is brief"),
        (500, 0.8, "Response is well-developed"),
    )
    # Anything at or beyond the last bound falls through to the top bracket
    score, feedback = 1.0, "Response is comprehensive"
    for upper_bound, bracket_score, bracket_label in brackets:
        if length < upper_bound:
            score, feedback = bracket_score, bracket_label
            break

    return (score, f"{feedback} ({length} chars)")

print("Eval function 'eval_response_length' defined.")

In [None]:
# Example Eval Function 2: Error detection
def eval_no_errors(task: dict, task_metadata: dict, trace: dict, ctx) -> tuple[float, str]:
    """
    Eval: Penalize traces with errors.

    Counts one error for every step with a truthy ``error`` value and one for
    every tool call with a truthy ``error`` value.

    Args:
        task: Unused.
        task_metadata: Unused.
        trace: Dict whose ``steps`` entry is a list of step dicts or a JSON
            string encoding such a list; a missing or null value means no steps.
        ctx: Unused.

    Returns:
        (1.0, ...) when no errors are found, (0.5, ...) for one or two,
        (0.0, ...) for three or more. The feedback quotes up to the first
        50 characters of the leading error messages.
    """
    # `or []` covers both a missing key and an explicit None / empty string
    steps = trace.get('steps') or []
    if isinstance(steps, str):
        steps = json.loads(steps) or []

    error_messages = []
    for step in steps:
        if step.get('error'):
            # str() so structured (non-string) error payloads can be truncated
            error_messages.append(str(step['error'])[:50])

        # tool_calls may be present but null
        for tc in step.get('tool_calls') or []:
            if tc.get('error'):
                error_messages.append(str(tc['error'])[:50])

    error_count = len(error_messages)
    if error_count == 0:
        return (1.0, "No errors detected")
    if error_count <= 2:
        return (0.5, f"Minor errors: {'; '.join(error_messages[:2])}")
    return (0.0, f"Multiple errors ({error_count}): {'; '.join(error_messages[:3])}")

print("Eval function 'eval_no_errors' defined.")

In [None]:
# Example Eval Function 3: Tool usage efficiency
def eval_tool_efficiency(task: dict, task_metadata: dict, trace: dict, ctx) -> tuple[float, str]:
    """
    Score a trace by how many tool calls it made.

    1-3 calls score highest; zero calls and larger counts score lower.
    """
    num_tools = len(db_utils.extract_tool_calls(trace))

    # Checked from the largest bracket down; each guard returns immediately
    if num_tools > 7:
        return (0.4, f"Excessive tool usage ({num_tools} calls) - may indicate inefficiency")
    if num_tools > 3:
        return (0.7, f"Moderate tool usage ({num_tools} calls)")
    if num_tools > 0:
        return (1.0, f"Efficient tool usage ({num_tools} calls)")

    # No tools - might be okay for simple tasks
    return (0.6, "No tool calls - may be appropriate for simple queries")

print("Eval function 'eval_tool_efficiency' defined.")

## 6. Testing Eval Functions Against Labeled Data

In [None]:
def test_eval_function(eval_fn, labeled_traces: list[dict]) -> dict:
    """
    Test an eval function against labeled traces and compute metrics.

    Args:
        eval_fn: Callable ``(task, task_metadata, trace, ctx) -> (score, feedback)``.
            It is called with the notebook-global ``ctx``.
        labeled_traces: Rows providing at least ``id``, ``human_score`` and
            ``rating``, in the form accepted by ``db_utils.trace_to_dict_for_eval``.

    Returns:
        ``{"error": "No predictions made"}`` when ``labeled_traces`` is empty,
        otherwise a dict with:
        - predictions: list of dicts with trace_id, predicted_score,
          human_score, feedback, human_rating
        - pearson_r / pearson_p: Pearson correlation and its p-value
          (0.0 / 1.0 when either score series is constant)
        - accuracy: Binary accuracy at 0.5 threshold
        - cohen_kappa: Cohen's kappa statistic
        - f1_score: F1 score
        - confusion_matrix: dict with tp, tn, fp, fn
    """
    # Imported locally so the correlation does not depend on the
    # notebook-global name `stats`, which is easy to rebind in another cell.
    from scipy.stats import pearsonr

    predictions = []

    for trace_row in labeled_traces:
        trace_dict = db_utils.trace_to_dict_for_eval(trace_row)
        task = {"user_message": db_utils.extract_user_message(trace_dict) or ""}
        task_metadata = {}  # Empty task_metadata for now

        try:
            score, feedback = eval_fn(task, task_metadata, trace_dict, ctx)
        except Exception as e:
            # A failing eval must not abort the run: record a neutral score
            # and keep the error text as the feedback.
            print(f"Error evaluating trace {trace_row['id']}: {e}")
            score, feedback = 0.5, f"Error: {e}"

        predictions.append({
            "trace_id": trace_row["id"],
            "predicted_score": score,
            "human_score": trace_row["human_score"],
            "feedback": feedback,
            "human_rating": trace_row["rating"]
        })

    if not predictions:
        return {"error": "No predictions made"}

    # Calculate metrics
    pred_scores = np.array([p["predicted_score"] for p in predictions])
    human_scores = np.array([p["human_score"] for p in predictions])

    # Pearson correlation is undefined for a constant series, so fall back
    # to "no correlation" in that case.
    if len(set(pred_scores)) > 1 and len(set(human_scores)) > 1:
        pearson_r, pearson_p = pearsonr(pred_scores, human_scores)
    else:
        pearson_r, pearson_p = 0.0, 1.0

    # Binary classification at 0.5 threshold
    pred_binary = (pred_scores >= 0.5).astype(int)
    human_binary = (human_scores >= 0.5).astype(int)

    acc = accuracy_score(human_binary, pred_binary)
    kappa = cohen_kappa_score(human_binary, pred_binary)
    f1 = f1_score(human_binary, pred_binary, zero_division=0)

    # labels=[0, 1] forces a 2x2 matrix even if one class is absent
    cm = confusion_matrix(human_binary, pred_binary, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    return {
        "predictions": predictions,
        "pearson_r": pearson_r,
        "pearson_p": pearson_p,
        "accuracy": acc,
        "cohen_kappa": kappa,
        "f1_score": f1,
        "confusion_matrix": {"tp": int(tp), "tn": int(tn), "fp": int(fp), "fn": int(fn)}
    }

print("Test function defined.")

In [None]:
# Run every example eval against the labeled traces and print its metrics
if not labeled_traces:
    print("No labeled traces available. Add feedback to traces first.")
else:
    eval_functions = [
        ("Response Length", eval_response_length),
        ("No Errors", eval_no_errors),
        ("Tool Efficiency", eval_tool_efficiency),
    ]

    results = []
    for name, eval_fn in eval_functions:
        print(f"\nTesting: {name}")
        result = test_eval_function(eval_fn, labeled_traces)
        # Tag the metrics with the display name used by the comparison cell
        result["name"] = name
        results.append(result)

        print(f"  Pearson r: {result['pearson_r']:.3f} (p={result['pearson_p']:.3f})")
        print(f"  Accuracy:  {result['accuracy']:.3f}")
        print(f"  Cohen's k: {result['cohen_kappa']:.3f}")
        print(f"  F1 Score:  {result['f1_score']:.3f}")
        print(f"  Confusion: {result['confusion_matrix']}")

## 7. Comparing Eval Strategies

In [None]:
# Create comparison DataFrame
# `'results' in dir()` guards against this cell running before the eval tests
if 'results' in dir() and results:
    comparison_df = pd.DataFrame([
        {
            "Eval Function": r["name"],
            "Pearson r": r["pearson_r"],
            "Accuracy": r["accuracy"],
            "Cohen's Kappa": r["cohen_kappa"],
            "F1 Score": r["f1_score"],
            "TP": r["confusion_matrix"]["tp"],
            "TN": r["confusion_matrix"]["tn"],
            "FP": r["confusion_matrix"]["fp"],
            "FN": r["confusion_matrix"]["fn"],
        }
        for r in results
    ])

    display(comparison_df)

    # Plot comparison: one bar chart per metric, side by side
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))

    metrics = ["Accuracy", "Cohen's Kappa", "F1 Score"]
    colors = ['steelblue', 'coral', 'forestgreen']

    for ax, metric, color in zip(axes, metrics, colors):
        comparison_df.plot(kind='bar', x='Eval Function', y=metric, ax=ax,
                          color=color, legend=False)
        ax.set_title(metric)
        ax.set_ylabel(metric)

        # Accuracy and F1 lie in [0, 1], but Cohen's kappa ranges over
        # [-1, 1]. Widen the axis when a value is negative so those bars are
        # not clipped out of view, and mark the zero baseline.
        has_negative = bool((comparison_df[metric] < 0).any())
        ax.set_ylim(-1 if has_negative else 0, 1)
        if has_negative:
            ax.axhline(y=0, color='black', linewidth=0.8)

        ax.axhline(y=0.5, color='gray', linestyle='--', alpha=0.5)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

    plt.tight_layout()
    plt.show()
else:
    print("No results to compare. Run the eval tests first.")

## 8. Write Your Own Eval Function

Use this template to experiment with your own eval strategies:

In [None]:
# Template for custom eval function
def my_custom_eval(task: dict, task_metadata: dict, trace: dict, ctx) -> tuple[float, str]:
    """
    Starting point for your own eval; returns a fixed placeholder result.

    Args:
        task: {"user_message": "the user's request"}
        task_metadata: Additional context (expected_output, success_criteria, etc.)
        trace: The agent's execution trace with steps, tool_calls, etc.
        ctx: EvalContext with LLM access (ctx.call_llm()) and utilities

    Returns:
        (score, feedback) where score is 0.0-1.0 and feedback explains the score
    """
    # Inputs most evals need, extracted up front for use in your logic below
    user_message = task.get("user_message", "")
    assistant_response = db_utils.extract_assistant_response(trace)
    tool_calls = db_utils.extract_tool_calls(trace)

    # Placeholder result; replace with your evaluation logic
    score, feedback = 0.5, "Default evaluation"

    # TODO: Implement your custom logic
    # For example:
    # - Check response relevance to user message
    # - Verify tool usage is appropriate
    # - Check for specific patterns or content
    # - Use ctx.call_llm() for semantic evaluation

    return score, feedback

# Smoke-test the custom eval on a small sample
if labeled_traces:
    print("Testing custom eval...")
    sample_traces = labeled_traces[:10]  # Test on first 10
    result = test_eval_function(my_custom_eval, sample_traces)
    print(f"Accuracy: {result['accuracy']:.3f}")
    print(f"Cohen's Kappa: {result['cohen_kappa']:.3f}")

## 9. LLM-as-Judge Eval (Advanced)

If you have API keys set up, you can use LLMs for semantic evaluation.

In [None]:
# LLM-as-Judge eval function (requires API keys)
def eval_llm_judge(task: dict, task_metadata: dict, trace: dict, ctx) -> tuple[float, str]:
    """
    Use an LLM to evaluate the trace quality.

    Args:
        task: Dict whose ``user_message`` is quoted (first 500 chars) in the prompt.
        task_metadata: Unused.
        trace: Trace passed to ``db_utils.extract_assistant_response``; the
            first 1000 chars of the response are quoted in the prompt.
        ctx: Context providing ``call_llm(prompt, model=..., temperature=...)``.

    Returns:
        (score, reasoning) with score normalized from the judge's 0-10 rating
        and clamped to [0.0, 1.0]; (0.0, ...) when there is no response to
        judge; (0.5, ...) when the LLM call or the parsing of its reply fails.
    """
    user_message = task.get("user_message", "")
    assistant_response = db_utils.extract_assistant_response(trace)

    if not assistant_response:
        return (0.0, "No response to evaluate")

    prompt = f"""Evaluate this AI assistant response on a scale of 0-10.

User Request: {user_message[:500]}

Assistant Response: {assistant_response[:1000]}

Rate the response based on:
1. Relevance: Does it address the user's request?
2. Completeness: Is it thorough?
3. Accuracy: Is the information correct?
4. Clarity: Is it well-written?

Respond with ONLY a JSON object:
{{"score": <0-10>, "reasoning": "<brief explanation>"}}"""

    try:
        response = ctx.call_llm(prompt, model="gpt-4o-mini", temperature=0)

        # Models often wrap the JSON in prose or ``` fences despite the
        # instruction; keep only the outermost {...} span before parsing.
        start, end = response.find("{"), response.rfind("}")
        payload = response[start:end + 1] if 0 <= start < end else response
        result = json.loads(payload)

        # Normalize 0-10 to 0-1, then clamp: the judge may answer outside the
        # requested range, and eval scores must stay within [0.0, 1.0].
        score = float(result.get("score", 5)) / 10.0
        score = min(1.0, max(0.0, score))
        reasoning = result.get("reasoning", "No reasoning provided")

        return (score, reasoning)
    except Exception as e:
        # Best-effort by design: any LLM or parsing failure yields a neutral score
        return (0.5, f"LLM evaluation failed: {e}")

# Test LLM judge (only if API is available)
if OPENAI_AVAILABLE and labeled_traces:
    # Initialize OpenAI client (add your API key)
    # ctx.openai = OpenAI(api_key="your-api-key")

    print("LLM-as-Judge eval defined. Set ctx.openai to test.")
else:
    print("OpenAI not available or no labeled traces. Skipping LLM eval test.")

## 10. Export Results

Save your best eval function for use in the main system.

In [None]:
# Export an eval function to a Python file
def export_eval_function(eval_fn, filepath: str):
    """Export an eval function to a standalone Python file.

    The file contains a short header (docstring plus ``json`` / ``typing``
    imports), the function's source, and an ``eval_function`` alias bound to it.

    Args:
        eval_fn: Function whose source ``inspect.getsource`` can retrieve.
        filepath: Destination path; an existing file is overwritten.
    """
    import inspect
    import textwrap

    # Dedent so a function defined inside another scope (nested function,
    # method) is written at module level instead of keeping its indentation.
    source = textwrap.dedent(inspect.getsource(eval_fn))

    template = '''"""Auto-generated eval function from notebook experimentation."""
import json
from typing import Any, Optional

{source}

# Main function that will be called by the eval runner
eval_function = {fn_name}
'''.format(source=source, fn_name=eval_fn.__name__)

    # Explicit UTF-8: the source may contain non-ASCII text (e.g. in prompts)
    # and the platform default encoding is not guaranteed to handle it.
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(template)

    print(f"Exported to {filepath}")

# Example: export_eval_function(eval_response_length, "my_eval.py")

---

## Next Steps

1. **Add more labeled traces**: Label traces in the iofold UI to get more training data
2. **Experiment with different strategies**: Try combining multiple signals
3. **Test LLM-as-judge**: Set up API keys and try semantic evaluation
4. **Export successful evals**: Use the export function to save good eval functions
5. **Integrate with main system**: Use the exported eval in the iofold UI