# Flourish: Virtue Evaluation Suite

This notebook runs behavioral evaluations across multiple LLMs to measure four virtuous traits:

1. **Empathy in Action** - Does the model show genuine empathy when encountering distress?
2. **Intellectual Humility** - Does the model admit uncertainty and update beliefs?
3. **Honesty Under Pressure** - Does the model tell the truth even when uncomfortable?
4. **Caretaking** - Does the model proactively look out for the user's wellbeing?

## Setup

Make sure you have API keys set as environment variables:
- `ANTHROPIC_API_KEY`
- `OPENAI_API_KEY`
- `GOOGLE_API_KEY`

In [None]:
import sys
sys.path.insert(0, '../src')

from flourish import VirtueEvaluator
from flourish.evaluator import run_full_evaluation, aggregate_results
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Configure plotting
# The matplotlib style sheet is applied first and the seaborn palette second.
# NOTE(review): a style sheet can reset the colour cycle, so confirm before
# reordering these two calls. The 'seaborn-v0_8-*' style names require
# matplotlib >= 3.6.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
# IPython magic (not Python syntax): selects the inline backend so figures are
# rendered in the notebook's cell output.
%matplotlib inline

## Configuration

In [None]:
# Models whose responses are evaluated
MODELS = ['claude-sonnet-4', 'gpt-4o', 'gemini-2.0-flash']

# Scenario files: one YAML file per virtue, all located under ../evals/
EVAL_FILES = [
    f'../evals/{virtue_file}.yaml'
    for virtue_file in (
        'empathy_in_action',
        'intellectual_humility',
        'honesty_under_pressure',
        'caretaking',
    )
]

# Model that scores the responses.
# NOTE(review): this identifier also appears in MODELS, so that model is
# scored by itself — confirm this is intended.
JUDGE_MODEL = 'claude-sonnet-4'

# Directory that receives the CSV exports and figures
OUTPUT_DIR = Path('../results')

## Run Evaluations

This runs every scenario against every model in `MODELS` (60 scenarios per model, i.e. 180 evaluations in total for the 3 models configured above).

In [None]:
# Run the whole suite with the settings from the configuration cell.
evaluation_settings = {
    'models': MODELS,
    'eval_files': EVAL_FILES,
    'judge_model': JUDGE_MODEL,
    'output_dir': OUTPUT_DIR,
    'verbose': True,
}
results = run_full_evaluation(**evaluation_settings)

# Later cells treat `results` as a DataFrame (one row per evaluation,
# presumably — verify against run_full_evaluation).
print(f"\nTotal evaluations: {len(results)}")

## Results Summary

In [None]:
# Aggregate the per-evaluation results into the summary table.
# Later cells read it with models on the index and virtues (plus an optional
# 'average') as columns.
summary = aggregate_results(results)

summary_heading = "\nAverage Scores by Model and Virtue (0-2 scale):"
print(summary_heading)
display(summary)

In [None]:
# Save summary and detailed results to CSV
# to_csv does not create missing parent directories, so make sure the target
# exists first. NOTE(review): run_full_evaluation may already create it; the
# call below is a no-op in that case.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# The summary keeps its index in the file; the detailed results drop theirs.
summary.to_csv(OUTPUT_DIR / 'summary.csv')
results.to_csv(OUTPUT_DIR / 'detailed_results.csv', index=False)
print(f"Results saved to {OUTPUT_DIR}")

## Visualizations

In [None]:
# Heatmap of the summary table: one cell per (model, virtue) pair.
# The 'average' column is dropped when present so only per-virtue columns are drawn.
heatmap_data = summary.drop(columns=['average'], errors='ignore')

# Colour scale is pinned to the 0-2 scoring range so figures are comparable
# between runs.
heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'RdYlGn',
    'vmin': 0,
    'vmax': 2,
    'cbar_kws': {'label': 'Score (0-2)'},
}

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(heatmap_data, ax=ax, **heatmap_options)

ax.set_title('Virtue Scores by Model', fontsize=14, fontweight='bold')
ax.set_xlabel('Virtue', fontsize=12)
ax.set_ylabel('Model', fontsize=12)

fig.tight_layout()
fig.savefig(OUTPUT_DIR / 'heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Grouped bar chart: one group per model, one bar per virtue.
# Mean score per (model, virtue), then pivot virtues into columns.
mean_scores = results.groupby(['model', 'virtue'])['score'].mean()
plot_data = mean_scores.unstack()

fig, ax = plt.subplots(figsize=(12, 6))
plot_data.plot.bar(ax=ax, width=0.8, edgecolor='white', linewidth=0.5)

ax.set_title('Model Performance by Virtue', fontsize=14, fontweight='bold')
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Average Score (0-2)', fontsize=12)
# Headroom above the maximum score of 2 leaves space for the value labels.
ax.set_ylim(0, 2.2)
# Legend is anchored outside the axes on the right.
ax.legend(title='Virtue', bbox_to_anchor=(1.02, 1), loc='upper left')
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

# Annotate every bar with its value.
for bars in ax.containers:
    ax.bar_label(bars, fmt='%.2f', fontsize=8)

fig.tight_layout()
fig.savefig(OUTPUT_DIR / 'bar_chart.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Score distribution per virtue: one stacked bar chart per virtue, showing how
# many scenarios each model scored 0, 1 and 2 on.
score_levels = [0, 1, 2]                           # the 0-2 scoring scale
score_colors = ['#e74c3c', '#f39c12', '#27ae60']   # red, amber, green

virtues = results['virtue'].unique()
n_virtues = len(virtues)

# squeeze=False always returns a 2-D array of Axes. Without it, a run with a
# single virtue yields a bare Axes object and the zip() below raises.
fig, axes_grid = plt.subplots(1, n_virtues, figsize=(5 * n_virtues, 5), squeeze=False)
axes = axes_grid.ravel()

for ax, virtue in zip(axes, virtues):
    virtue_data = results[results['virtue'] == virtue]

    # Count scores by model. unstack() only creates columns for scores that
    # actually occur, so reindex to the full scale: otherwise a missing level
    # shifts the colours and legend labels onto the wrong scores.
    # Scores outside score_levels are not plotted.
    score_counts = (
        virtue_data.groupby(['model', 'score'])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=score_levels, fill_value=0)
    )

    score_counts.plot(
        kind='bar',
        stacked=True,
        ax=ax,
        color=score_colors,
        edgecolor='white',
    )

    ax.set_title(virtue, fontsize=12, fontweight='bold')
    ax.set_xlabel('Model', fontsize=10)
    ax.set_ylabel('Count', fontsize=10)
    ax.legend(title='Score', labels=[str(level) for level in score_levels])
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

plt.suptitle('Score Distribution by Virtue', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig(OUTPUT_DIR / 'distribution.png', dpi=150, bbox_inches='tight')
plt.show()

## Detailed Analysis

In [None]:
# List every evaluation that received the minimum score of 0.
print("\nLowest Scoring Scenarios (score=0):")
scored_zero = results['score'] == 0
low_scores = results.loc[scored_zero, ['model', 'virtue', 'scenario_id', 'reasoning']]
if low_scores.empty:
    print("No scenarios scored 0!")
else:
    display(low_scores)

In [None]:
# Count the evaluations that received the top score of 2 and preview the first ten.
print("\nHighest Scoring Scenarios (score=2):")
scored_two = results['score'] == 2
high_scores = results.loc[scored_two, ['model', 'virtue', 'scenario_id', 'reasoning']]
n_high_scores = len(high_scores)
print(f"Total: {n_high_scores} scenarios scored 2/2")
display(high_scores.head(10))

In [None]:
# Per-model descriptive statistics of the score column.
print("\nStatistics by Model:")
column_titles = {
    'mean': 'Mean',
    'std': 'Std Dev',
    'min': 'Min',
    'max': 'Max',
    'count': 'Count',
}
stats = (
    results.groupby('model')['score']
    .agg(list(column_titles))
    .rename(columns=column_titles)
)
display(stats.round(2))

## Sample Responses

Let's examine some example responses to understand the scoring.

In [None]:
# Show sample responses with different scores
def show_sample(df, model, virtue, score, max_chars=500):
    """Print the first result row matching a model, virtue and score.

    Nothing is printed when no row matches.

    Args:
        df: DataFrame with 'model', 'virtue', 'score', 'scenario_id',
            'prompt', 'response' and 'reasoning' columns.
        model: Value to match in the 'model' column.
        virtue: Value to match in the 'virtue' column.
        score: Value to match in the 'score' column.
        max_chars: Maximum number of prompt/response characters to print.
            Longer text is cut and marked with a trailing '...'.
    """
    def _clip(text):
        # str() keeps non-string values (e.g. NaN for a missing response)
        # printable instead of failing on the slice. The ellipsis is added only
        # when text was actually cut, so short text is not shown as truncated.
        text = str(text)
        if len(text) <= max_chars:
            return text
        return text[:max_chars] + '...'

    sample = df[(df['model'] == model) & (df['virtue'] == virtue) & (df['score'] == score)]
    if len(sample) == 0:
        return

    row = sample.iloc[0]
    print(f"\n{'='*60}")
    print(f"Model: {row['model']} | Virtue: {row['virtue']} | Score: {row['score']}")
    print(f"Scenario: {row['scenario_id']}")
    print(f"{'='*60}")
    print(f"\nPrompt:\n{_clip(row['prompt'])}")
    print(f"\nResponse:\n{_clip(row['response'])}")
    print(f"\nJudge Reasoning: {row['reasoning']}")

# Print one example per score level for the model/virtue pair of the first
# result row (skipped entirely when there are no results).
if len(results):
    first_row = results.iloc[0]
    model, virtue = first_row['model'], first_row['virtue']
    for score in (0, 1, 2):
        show_sample(results, model, virtue, score)

## Export Results

In [None]:
# Final summary for README: print the summary as a Markdown table.
print("\n" + "="*60)
print("FINAL RESULTS SUMMARY")
print("="*60)
print("\nTable for README:")

# Build the header from the summary's own columns so each score is printed
# under the virtue it belongs to, whatever order the columns arrive in.
# A fixed header would silently mislabel values if the order differed.
virtue_columns = [col for col in summary.columns if col != 'average']
has_average = 'average' in summary.columns

header_cells = ['Model']
header_cells += [str(col).replace('_', ' ').title() for col in virtue_columns]
header_cells.append('Average')

print("\n| " + " | ".join(header_cells) + " |")
print("|" + "|".join('-' * (len(cell) + 2) for cell in header_cells) + "|")

# Distinct loop names keep the notebook-level `model` and `virtues` variables
# from being rebound by this cell.
for model_name, score_row in summary.iterrows():
    scores = [f"{score_row[col]:.2f}" for col in virtue_columns]
    avg = f"{score_row['average']:.2f}" if has_average else 'N/A'
    print(f"| {model_name} | {' | '.join(scores)} | {avg} |")