## Section 1: Install Dependencies

Install required packages for the AI Workflow Evaluator.

In [None]:
import subprocess
import sys

# Requirement specifiers for the notebook's dependencies.
packages = ["pyspark>=3.3.0", "mlflow>=2.0.0", "pandas>=1.5.0", "numpy>=1.23.0"]

# Install everything in ONE pip invocation: pip then resolves all version
# constraints together (instead of each requirement in isolation) and the
# interpreter/pip start-up cost is paid once. `sys.executable -m pip` targets
# the environment of the running kernel; check_call raises CalledProcessError
# if pip exits with a non-zero status.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", *packages])

## Section 2: Import Required Libraries

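Import MLflow, Pandas, NumPy, and the local `evaluator` module. (PySpark is installed in Section 1 but is not imported here; Section 6 only simulates Spark execution.)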

In [None]:
import sys
sys.path.insert(0, '../')  # Add parent directory to path to import evaluator module

import mlflow
import mlflow.entities
import pandas as pd
import numpy as np
from typing import Dict, Any, List, Tuple
from datetime import datetime
import json

# Import from local evaluator module
from evaluator import Episode, EpisodeEvaluator, MetricsTracker, validate_episode
from evaluator.invariants import assert_idempotent, assert_low_drift

print("✓ All libraries imported successfully")

## Section 3: Initialize MLflow Tracking

Configure MLflow for local tracking (can be changed to Databricks tracking URI on a Databricks cluster).

In [None]:
# Set up MLflow tracking.
# For Databricks: mlflow.set_tracking_uri("databricks")
# For local MLflow server: mlflow.set_tracking_uri("http://localhost:5000")
# For local file system (default):
# NOTE(review): passing None is expected to leave the URI unset so MLflow falls
# back to its default store (local ./mlruns) - confirm for the installed version.
mlflow.set_tracking_uri(None)  # Uses local ./mlruns directory

# Select the experiment for episode evaluation.
# mlflow.set_experiment() creates the experiment when it does not exist yet and
# returns the Experiment entity, so no create/lookup fallback is needed. This
# also avoids a bare `except:` that would treat ANY failure (bad URI, backend
# unreachable, KeyboardInterrupt) as "experiment already exists".
experiment_name = "episode_evaluation_demo"
experiment = mlflow.set_experiment(experiment_name)
experiment_id = experiment.experiment_id

print(f"✓ MLflow configured - Experiment: {experiment_name}")
print(f"  Tracking URI: {mlflow.get_tracking_uri()}")

## Section 4: Create Sample Episodes with Unique IDs

Define multiple episodes (membranes) with unique IDs, inputs, expected outputs, and metadata.

In [None]:
# Sample episodes: each one describes a single AI workflow "membrane" with its
# inputs, the outputs it is expected to produce, the prompt, the model name and
# the token usage.
episodes = [
    # Geography question
    Episode(
        episode_id="ep-001-geography",
        inputs=dict(
            question="What is the capital of France?",
            context="European capitals",
            temperature=0.7,
        ),
        expected_outputs=dict(
            answer="Paris",
            confidence=0.95,
            reasoning="Paris is the most well-known capital of France",
        ),
        prompt="Answer the geography question based on context. Provide reasoning.",
        model_name="gpt-4",
        token_counts=dict(input_tokens=45, output_tokens=32),
    ),
    # Arithmetic operation
    Episode(
        episode_id="ep-002-math",
        inputs=dict(operation="multiply", a=7, b=8),
        expected_outputs=dict(result=56, confidence=1.0),
        prompt="Perform the requested mathematical operation",
        model_name="gpt-4",
        token_counts=dict(input_tokens=25, output_tokens=15),
    ),
    # Sentiment classification
    Episode(
        episode_id="ep-003-sentiment",
        inputs=dict(
            text="I absolutely love this product! It works perfectly.",
            model="sentiment-analyzer",
        ),
        expected_outputs=dict(sentiment="positive", score=0.92),
        prompt="Analyze the sentiment of the given text",
        model_name="bert-sentiment",
        token_counts=dict(input_tokens=35, output_tokens=12),
    ),
]

# Keep the individual names available alongside the list.
episode_1, episode_2, episode_3 = episodes

print("✓ Created 3 sample episodes (membranes)")
for ep in episodes:
    print(f"  - {ep.episode_id}: {ep.prompt}")

## Section 5: Create DataFrame of Episodes

Convert episodes into a Pandas DataFrame for batch processing.

In [None]:
# Collect one row per episode into column lists, then hand them to pandas.
episode_data = {
    "episode_id": [],
    "model_name": [],
    "prompt": [],
    "token_count": [],
}
for ep in episodes:
    usage = ep.token_counts
    episode_data["episode_id"].append(ep.episode_id)
    episode_data["model_name"].append(ep.model_name)
    episode_data["prompt"].append(ep.prompt)
    # Total tokens = prompt side + completion side
    episode_data["token_count"].append(usage["input_tokens"] + usage["output_tokens"])

df_episodes = pd.DataFrame(episode_data)

print("✓ Episode DataFrame created:")
print(df_episodes)
print(f"\nDataFrame shape: {df_episodes.shape}")
column_names = list(df_episodes.columns)
print(f"Columns: {column_names}")

## Section 6: Execute Episodes as Spark Jobs

Simulate Spark job execution by executing episodes and capturing outputs (in a real scenario, these would be actual Spark RDD/DataFrame operations).

In [None]:
def execute_episode_job(episode: Episode) -> Dict[str, Any]:
    """
    Return the canned output for an episode, standing in for a Spark job.

    Only ``episode.episode_id`` is read. The three demo IDs map to fixed
    results; any other ID yields an empty dict. In production this would be
    replaced by actual Spark RDD/DataFrame operations.
    """
    # The table is rebuilt on every call so each caller gets fresh dicts.
    simulated_outputs = {
        # Perfect match scenario
        "ep-001-geography": {
            "answer": "Paris",
            "confidence": 0.95,
            "reasoning": "Paris is the most well-known capital of France",
        },
        # Perfect match scenario
        "ep-002-math": {
            "result": 56,
            "confidence": 1.0,
        },
        # Slight drift scenario - score differs from the expected 0.92
        "ep-003-sentiment": {
            "sentiment": "positive",
            "score": 0.89,
        },
    }
    return simulated_outputs.get(episode.episode_id, {})


# Run every episode through the simulated job and keep (episode, output) pairs
# for the evaluation cells below.
print("Executing episodes as Spark jobs...\n")
episode_outputs = [(ep, execute_episode_job(ep)) for ep in episodes]

# Show expected vs. actual output side by side for each episode.
for episode, actual_output in episode_outputs:
    print(f"Episode: {episode.episode_id}")
    print(f"  Expected: {episode.expected_outputs}")
    print(f"  Actual:   {actual_output}")
    print()

print("✓ All episodes executed successfully")

## Section 7: Evaluate Idempotence Results

Compare actual outputs against expected outputs to determine idempotency status.

In [None]:
def _status_label(score):
    """Translate a match score into the status text shown in the report."""
    if score == 1.0:
        return "✓ MATCH (Idempotent)"
    if score == 0.0:
        return "✗ MISMATCH"
    return "? UNDETERMINED (Partial)"


# Evaluator instance reused by the later cells as well.
evaluator = EpisodeEvaluator()

rule = "=" * 60
print(rule)
print("EVALUATING EPISODE IDEMPOTENCY")
print(rule)
print()

evaluation_results = []

for episode, actual_output in episode_outputs:
    # Validation is reported but does not stop the evaluation of the episode.
    try:
        validate_episode(episode)
    except Exception as err:
        validation_status = f"✗ Invalid: {err}"
    else:
        validation_status = "✓ Valid"

    # Compare the captured output against the episode's expectations.
    match_result, metrics = evaluator.evaluate_episode(episode, actual_output)
    status = _status_label(match_result)
    drift_score = metrics["drift"]
    coherence_score = metrics["coherence"]

    row = {
        "episode_id": episode.episode_id,
        "match_result": match_result,
        "status": status,
        "drift": drift_score,
        "coherence": coherence_score,
    }
    evaluation_results.append(row)

    print(f"Episode: {episode.episode_id}")
    print(f"  Validation: {validation_status}")
    print(f"  Status: {status}")
    print(f"  Match Score: {match_result:.2f}")
    print(f"  Drift Score: {drift_score:.4f}")
    print(f"  Coherence Score: {coherence_score:.4f}")
    print()

# One row per evaluated episode.
df_results = pd.DataFrame(evaluation_results)
print("\n✓ Evaluation Results Summary:")
print(df_results.to_string(index=False))

## Section 8: Track Metrics with MLflow

Each episode evaluation automatically logs to MLflow. View the metrics in the MLflow UI.

In [None]:
# Query MLflow for the runs recorded under this experiment and print them.
print("=" * 60)
print("MLFLOW TRACKING - Logged Runs")
print("=" * 60)
print()

experiment = mlflow.get_experiment_by_name(experiment_name)
# With the default output format, search_runs returns a pandas DataFrame with
# one row per run.
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

print(f"Total runs logged: {len(runs)}\n")

# Number runs by position rather than by index label, and read every optional
# column with .get(): a "params.*" / "metrics.*" column is absent when no run
# logged that key, and plain indexing would raise KeyError in that case.
for position, (_, run) in enumerate(runs.iterrows(), start=1):
    run_id = run["run_id"]

    print(f"Run {position}: {run_id[:8]}...")
    print(f"  Episode ID: {run.get('params.episode_id', 'N/A')}")
    print(f"  Idempotency Score: {run.get('metrics.idempotency_score', 'N/A')}")
    print(f"  Drift Score: {run.get('metrics.drift_score', 'N/A')}")
    print(f"  Coherence Score: {run.get('metrics.coherence_score', 'N/A')}")
    print()

# Only claim success when something was actually logged.
if runs.empty:
    print("⚠ No MLflow runs found for this experiment - nothing has been logged yet")
else:
    print("✓ MLflow tracking active and metrics logged")

## Section 9: Accumulate Metrics Across Runs

Show how metrics accumulate across multiple executions of the same episodes.

In [None]:
divider = "=" * 60
print(divider)
print("ACCUMULATING METRICS ACROSS RUNS")
print(divider)
print()

# (label, summary key, format spec) for the aggregates printed once an episode
# has at least one recorded execution.
aggregate_rows = (
    ("Match Rate", "match_rate", ".2%"),
    ("Avg Drift", "avg_drift", ".4f"),
    ("Avg Coherence", "avg_coherence", ".4f"),
    ("Drift Stdev", "drift_stdev", ".4f"),
    ("Coherence Stdev", "coherence_stdev", ".4f"),
)
# (label, key into episode.metrics) for the raw per-run histories.
raw_rows = (("Match", "match"), ("Drift", "drift"), ("Coherence", "coherence"))

# Report what each episode has accumulated so far.
for episode in episodes:
    summary = episode.get_metrics_summary()
    print(f"Episode: {episode.episode_id}")
    print(f"  Execution Count: {summary['execution_count']}")
    print(f"  Last Execution: {summary.get('last_execution_at', 'N/A')}")

    has_history = summary['execution_count'] > 0
    if has_history:
        for label, key, spec in aggregate_rows:
            print(f"  {label}: {summary.get(key, 0):{spec}}")
        print("  Raw Metrics:")
        for label, key in raw_rows:
            print(f"    {label}: {episode.metrics[key]}")
    print()

print("✓ Metrics accumulated successfully")

# Evaluate the same captured outputs a second time so that each episode
# accumulates another data point.
divider = "=" * 60
print(f"\n{divider}")
print("RE-EXECUTING EPISODES FOR SECOND RUN")
print(divider)
print()

for episode, actual_output in episode_outputs:
    match_result, metrics = evaluator.evaluate_episode(episode, actual_output)
    drift_text = f"{metrics['drift']:.4f}"
    print(f"Episode {episode.episode_id}: Match={match_result}, Drift={drift_text}")

print("\nMetrics after second run:")
for episode in episodes:
    summary = episode.get_metrics_summary()
    executions = summary['execution_count']
    match_rate = summary.get('match_rate', 0)
    print(f"{episode.episode_id}: Executions={executions}, Match Rate={match_rate:.2%}")

## Section 10: Reset Metrics Function

Demonstrate resetting accumulated metrics for a fresh evaluation cycle, then run a batch evaluation over all episodes and print its aggregate summary.

In [None]:
divider = "=" * 60
print(divider)
print("RESETTING METRICS")
print(divider)
print()

# Execution counts as they stand before the reset.
print("Before reset:")
for episode in episodes:
    summary = episode.get_metrics_summary()
    executions = summary['execution_count']
    print(f"  {episode.episode_id}: Executions={executions}")

# Clear the accumulated history on every episode.
print("\nResetting all episode metrics...")
for episode in episodes:
    episode.reset_metrics()

# Counts and raw histories after the reset.
print("\nAfter reset:")
for episode in episodes:
    summary = episode.get_metrics_summary()
    executions = summary['execution_count']
    print(f"  {episode.episode_id}: Executions={executions}")
    for label, key in (("Match", "match"), ("Drift", "drift")):
        print(f"    {label} history: {episode.metrics[key]}")

print("\n✓ Metrics reset successfully - ready for fresh evaluation cycle")

# Evaluate all captured (episode, output) pairs as one batch and print the
# aggregate figures returned by the evaluator.
divider = "=" * 60
print(f"\n{divider}")
print("BATCH EVALUATION SUMMARY")
print(divider)
print()

batch_result = evaluator.evaluate_batch(episode_outputs, batch_id="demo_batch_001")
print(f"Batch ID: {batch_result['batch_id']}")
print(f"Episodes Processed: {batch_result['episodes_count']}")
print("\nAggregate Metrics:")

# (label, key into batch_result['summary'], format spec); an empty spec prints
# the value as-is.
summary_rows = (
    ("Idempotency Rate", "idempotency_rate", ".2%"),
    ("Average Drift", "avg_drift", ".4f"),
    ("Average Coherence", "avg_coherence", ".4f"),
    ("Total Tokens", "total_tokens", ""),
    ("Drift StDev", "drift_stdev", ".4f"),
    ("Coherence StDev", "coherence_stdev", ".4f"),
)
for label, key, spec in summary_rows:
    print(f"  {label}: {batch_result['summary'][key]:{spec}}")

print("\n✓ Demo completed successfully!")