## Setup

In [3]:
import glob
import os
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import colorsys
from pathlib import Path
from matplotlib.lines import Line2D

import numpy as np
from adjustText import adjust_text

from moral_lens.models import load_model_config


# Macro-level category -> the micro-level labels grouped under it.
# Insertion order matters: TAXONOMY_MICRO below is built by walking it in order.
TAXONOMY_MACRO = dict(
    Consequentialism=[
        "MaxDependents",
        "MaxFutureContribution",
        "MaxHope",
        "MaxLifeLength",
        "MaxNumOfLives",
        "SaveTheStrong",
        "MaxInspiration",
    ],
    Deontology=[
        "SaveTheUnderprivileged",
        "Egalitarianism",
        "SaveTheVulnerable",
        "AnimalRights",
        "PickRandomly",
    ],
    Contractualism=[
        "AppealToLaw",
        "MaxPastContribution",
        "RetributiveJustice",
        "FavorHumans",
    ],
    Other=["Other"],
    Refusal=["Refusal"],
)

# Flat list of every micro label, in macro order then within-macro order.
# sum(..., []) concatenates into a fresh list and leaves the per-macro lists untouched.
TAXONOMY_MICRO = sum(TAXONOMY_MACRO.values(), [])

[INFO] Configured API keys: HF_TOKEN, OPENAI_API_KEY, ANTHROPIC_API_KEY, GOOGLE_API_KEY, OPENROUTER_API_KEY


## Query models for results

In [40]:
from moral_lens.dilemma import DilemmaRunner
from moral_lens.ranker import RankerRunner
from moral_lens.judge import JudgeRunner
from moral_lens.config import PathConfig
from moral_lens.utils import mydisplay

In [52]:
# Model ids whose dilemma decisions are to be generated; the (commented-out)
# DilemmaRunner loop below iterates over this list. Uncomment entries to
# include more models.
decision_model_ids = [
    # "openrouter/quasar-alpha",
    # "openrouter/optimus-alpha",
    # "gemini-2.0-flash-lite-001",
    "gemini-2.0-flash-001",
    # "gpt-3.5-turbo-0125",
    # "gpt-4o-2024-08-06",
    # "gpt-4o-mini-2024-07-18",
    # "o3-mini-2025-01-31:low",
    # "meta-llama/llama-4-scout",
    # "meta-llama/llama-4-maverick",
    # "meta-llama/llama-3.1-8b-instruct",
    # "meta-llama/llama-3.3-70b-instruct",
    # "deepseek/deepseek-chat-v3-0324",
    # "qwen/qwq-32b",
    # "qwen/qwen-plus",
    # "microsoft/phi-4",
]

# Model ids used as the ranker; each is passed to RankerRunner as
# `ranker_model_id` in the cells below.
ranker_model_ids = [
    # "gemini-2.0-flash-001",
    # "meta-llama/llama-4-scout",
    "gpt-4o-mini-2024-07-18",
    # "deepseek/deepseek-chat-v3-0324",
]

# Root results directory handed to the runners as `results_dir`.
# NOTE(review): a later analysis cell reassigns this same name — re-run this
# cell before re-running any runner cell after that analysis cell.
results_dir = "data/ranker_consistency"

In [None]:
# for decision_model_id in decision_model_ids:
#     for experiment in ["s1", "s2", "s3"]:
#         dr = DilemmaRunner(
#             model_id=decision_model_id,
#             decision_run_name=experiment,
#             results_dir=results_dir,
#             # choices_filename="choices_10pct.csv",
#             override_decision_temperature=0.7,
#         )
#         await dr.run()


In [53]:
# Single ranker pass using the first configured ranker model. No run name and
# no temperature override are passed (both are left commented out below).
rr = RankerRunner(
    ranker_model_id=ranker_model_ids[0],
    # ranker_run_name="s1",
    results_dir=results_dir,
    ranker_cot=True,  # presumably enables chain-of-thought ranking — TODO confirm in moral_lens.ranker
    # override_ranker_temperature=0.7,
)
# Top-level await: only valid inside the notebook/IPython kernel.
await rr.run()

[INFO] OpenAI model gpt-4o-mini-2024-07-18 loaded.


Valid responses received: 100%|##########| 1000/1000 [04:57<00:00,  3.36it/s]


[INFO] Ranker output saved to data/ranker_consistency/ranker/gpt-4o-mini-2024-07-18.csv


In [None]:
for ranker_model_id in ranker_model_ids:
    for experiment in ["s1", "s2", "s3"]:
        rr = RankerRunner(
            ranker_model_id=ranker_model_id,
            ranker_run_name=experiment,
            results_dir=results_dir,
            ranker_cot=True,
            override_ranker_temperature=0.7,
        )
        await jr.run()


## Plots

In [58]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict


def run_elo_analysis(
    input_file,
    output_dir="results",
    initial_rating=1000,
    k_factor=32
):
    """
    Run a simple sequential Elo analysis on the ranker output file.

    Rows are processed in file order. Each row is one pairwise comparison
    between `model_a` and `model_b`; `ranker_decision` is normalised
    (stringified, stripped, lower-cased) and must be one of 'a', 'b', 'tie'
    or 'both' ('tie' and 'both' both score 0.5 for each side). Rows with any
    other decision are skipped entirely.

    Side effects: writes `model_elo_ratings.csv`, `elo_match_history.csv` and
    `category_performance.csv` into `output_dir`, saves plots under
    `output_dir/plots`, and prints a textual summary.

    Args:
        input_file: Path to the CSV file with ranker results. Must contain the
            columns 'model_a', 'model_b', 'ranker_decision' and
            'phenomenon_category'.
        output_dir: Directory to save results and visualizations
        initial_rating: Initial Elo rating for all models
        k_factor: K-factor for Elo calculations

    Returns:
        DataFrame with columns 'model' and 'elo_rating', sorted by rating in
        descending order. Models with equal ratings keep the order in which
        they first appear in the file. The frame is empty (but still has both
        columns) when the input has no rows.

    Raises:
        KeyError: if a required column is missing from the input file.
    """
    valid_decisions = ('a', 'b', 'tie', 'both')
    required_columns = ['model_a', 'model_b', 'ranker_decision', 'phenomenon_category']

    # Create output directory (creating the plots sub-directory creates both)
    plots_dir = os.path.join(output_dir, "plots")
    os.makedirs(plots_dir, exist_ok=True)

    # Load data. keep_default_na=False keeps empty cells as '' instead of NaN,
    # so the string handling below never sees a float.
    print(f"Loading data from {input_file}...")
    df = pd.read_csv(input_file, keep_default_na=False)

    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"{input_file} is missing required column(s): {missing_columns}")

    # Initialize ratings. dict.fromkeys de-duplicates while preserving
    # first-appearance order, which makes the output order reproducible
    # (a set would order string names differently from run to run).
    unique_models = list(dict.fromkeys([*df['model_a'], *df['model_b']]))
    ratings = {model: initial_rating for model in unique_models}
    print(f"Found {len(unique_models)} unique models")

    # Initialize category tracking: category -> model -> win/loss/tie counters
    categories = df['phenomenon_category'].unique()
    category_performance = {
        category: defaultdict(lambda: {'wins': 0, 'losses': 0, 'ties': 0})
        for category in categories
    }

    # Track match history
    match_history = []

    # Process each comparison
    print("Computing Elo ratings...")
    comparisons = df[required_columns].itertuples(index=False, name=None)
    for model_a, model_b, raw_decision, phenomenon_category in comparisons:
        decision = str(raw_decision).strip().lower()

        # Skip if invalid decision
        if decision not in valid_decisions:
            continue

        # Get current ratings
        rating_a = ratings[model_a]
        rating_b = ratings[model_b]

        # Expected scores (standard logistic Elo expectation, scale 400)
        expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
        expected_b = 1 / (1 + 10 ** ((rating_a - rating_b) / 400))

        perf_a = category_performance[phenomenon_category][model_a]
        perf_b = category_performance[phenomenon_category][model_b]

        # Actual scores, plus per-category bookkeeping
        if decision == 'a':
            actual_a, actual_b = 1.0, 0.0
            perf_a['wins'] += 1
            perf_b['losses'] += 1
        elif decision == 'b':
            actual_a, actual_b = 0.0, 1.0
            perf_a['losses'] += 1
            perf_b['wins'] += 1
        else:  # tie or both
            actual_a, actual_b = 0.5, 0.5
            perf_a['ties'] += 1
            perf_b['ties'] += 1

        # Calculate and store new ratings
        new_rating_a = rating_a + k_factor * (actual_a - expected_a)
        new_rating_b = rating_b + k_factor * (actual_b - expected_b)
        ratings[model_a] = new_rating_a
        ratings[model_b] = new_rating_b

        # Store match history
        match_history.append({
            'model_a': model_a,
            'model_b': model_b,
            'rating_a_before': rating_a,
            'rating_b_before': rating_b,
            'rating_a_after': new_rating_a,
            'rating_b_after': new_rating_b,
            'decision': decision,
            'phenomenon_category': phenomenon_category
        })

    # Prepare results. Explicit columns keep the frames well-formed (and the
    # later column lookups valid) even when there is nothing to report.
    # mergesort is stable, so tied ratings keep first-appearance order.
    ratings_df = pd.DataFrame(
        list(ratings.items()), columns=['model', 'elo_rating']
    ).sort_values('elo_rating', ascending=False, kind='mergesort').reset_index(drop=True)

    # Save ratings
    ratings_file = os.path.join(output_dir, 'model_elo_ratings.csv')
    ratings_df.to_csv(ratings_file, index=False)
    print(f"Saved model ratings to {ratings_file}")

    # Save match history
    history_df = pd.DataFrame(match_history, columns=[
        'model_a', 'model_b',
        'rating_a_before', 'rating_b_before',
        'rating_a_after', 'rating_b_after',
        'decision', 'phenomenon_category',
    ])
    history_file = os.path.join(output_dir, 'elo_match_history.csv')
    history_df.to_csv(history_file, index=False)
    print(f"Saved match history to {history_file}")

    # Generate category performance data (ties count as half a win)
    category_records = []
    for category, model_data in category_performance.items():
        for model, results in model_data.items():
            total_matches = results['wins'] + results['losses'] + results['ties']
            if total_matches > 0:
                win_rate = (results['wins'] + 0.5 * results['ties']) / total_matches
                category_records.append({
                    'phenomenon_category': category,
                    'model': model,
                    'wins': results['wins'],
                    'losses': results['losses'],
                    'ties': results['ties'],
                    'total_matches': total_matches,
                    'win_rate': win_rate
                })

    category_df = pd.DataFrame(category_records, columns=[
        'phenomenon_category', 'model', 'wins', 'losses', 'ties',
        'total_matches', 'win_rate',
    ])
    category_file = os.path.join(output_dir, 'category_performance.csv')
    category_df.to_csv(category_file, index=False)
    print(f"Saved category performance to {category_file}")

    # Create basic visualizations
    print("Generating visualizations...")

    # 1. Bar chart of model ratings (skipped when there are no models to plot)
    if not ratings_df.empty:
        plt.figure(figsize=(12, 8))
        sns.barplot(x='model', y='elo_rating', data=ratings_df)
        plt.title('Model Elo Ratings')
        plt.xticks(rotation=45, ha='right')
        plt.axhline(y=initial_rating, color='r', linestyle='--')
        plt.tight_layout()
        plt.savefig(os.path.join(plots_dir, 'model_elo_ratings.png'))
        plt.close()

    # 2. Heatmap of category performance
    if len(category_df) > 0:
        pivot_df = category_df.pivot(index='model', columns='phenomenon_category', values='win_rate')
        plt.figure(figsize=(max(12, len(categories) * 1.5), max(8, len(unique_models) * 0.5)))
        sns.heatmap(pivot_df, annot=True, fmt=".2f", cmap="YlGnBu", vmin=0, vmax=1)
        plt.title('Model Performance by Category (Win Rate)')
        plt.tight_layout()
        plt.savefig(os.path.join(plots_dir, 'category_performance.png'))
        plt.close()

    # Print summary of results
    print("\nTop Models by Elo Rating:")
    print(ratings_df.head(10).to_string(index=False))

    print("\nPhenomenon Category Performance Summary:")
    # For each category, show top 3 models
    for category in category_df['phenomenon_category'].unique():
        cat_data = category_df[category_df['phenomenon_category'] == category].sort_values('win_rate', ascending=False)
        print(f"\n{category}:")
        print(cat_data.head(3)[['model', 'win_rate', 'total_matches']].to_string(index=False))

    print(f"\nAll results and visualizations saved to {output_dir}")
    return ratings_df

In [None]:
def analyze_model_consistency(
    ranker_output_file,
    results_dir="results"
):
    """
    Analyze the consistency of decisions for each individual model rather than model pairs.

    Within each scenario, every comparison (model_a, model_b) is matched with
    the first *other* comparison of the same scenario that has the two models
    in swapped order (its "reflection"). The pair is consistent when both
    rows name the same winning model, or when either row is a tie. Every
    comparison that has a reflection is counted once from its own point of
    view, so a mirrored pair contributes two comparisons to each model.

    Decisions are normalised (stringified, stripped, lower-cased). Rows whose
    decision is not one of 'a', 'b', 'tie', 'both' are ignored entirely, the
    same rule `run_elo_analysis` applies: they are neither counted as a loss
    nor as a tie, and they cannot serve as a reflection.

    Side effects: writes `model_consistency.csv` into `results_dir`, saves
    three plots under `results_dir/plots`, and prints a textual summary
    including a position-bias check (share of 'a' vs 'b' decisions).

    Args:
        ranker_output_file: Path to the CSV file with ranker results. Needs
            the columns 'model_a', 'model_b', 'ranker_decision' and one
            scenario identifier column ('dilemma_prompt', 'scenario_text',
            'id', 'scenario_id' or 'dilemma_id', tried in that order).
        results_dir: Directory to save results and visualizations

    Returns:
        DataFrame with one row per model that has at least one decisive
        (non-tie) mirrored comparison, sorted by consistency rate descending;
        None when no scenario column exists or no such comparison is found.
    """
    valid_decisions = ('a', 'b', 'tie', 'both')

    print(f"Analyzing model consistency from {ranker_output_file}...")

    # Create output directory (creating the plots sub-directory creates both)
    plots_dir = os.path.join(results_dir, "plots")
    os.makedirs(plots_dir, exist_ok=True)

    # Load data. keep_default_na=False keeps empty cells as '' rather than NaN,
    # so an empty decision is handled as "invalid" instead of crashing .lower().
    df = pd.read_csv(ranker_output_file, keep_default_na=False)

    # Pick the column that identifies a unique scenario, in priority order
    candidate_cols = ('dilemma_prompt', 'scenario_text', 'id', 'scenario_id', 'dilemma_id')
    scenario_id_col = next((col for col in candidate_cols if col in df.columns), None)
    if scenario_id_col is None:
        print("Could not find a column to identify unique scenarios. Please specify one.")
        return

    print(f"Using '{scenario_id_col}' to identify unique scenarios")

    # Initialize consistency tracking per model
    model_consistency = defaultdict(lambda: {
        'wins': 0,
        'losses': 0,
        'consistent_wins': 0,
        'consistent_losses': 0,
        'inconsistent': 0,
        'total_comparisons': 0
    })

    # Track position bias
    position_bias = {
        'model_a_wins': 0,
        'model_b_wins': 0,
        'ties': 0,
        'total': 0
    }

    # For each scenario
    for _, group in df.groupby(scenario_id_col):
        # Valid comparisons within this scenario, as (model_a, model_b, decision)
        scenario_comparisons = []

        for model_a, model_b, raw_decision in zip(
            group['model_a'], group['model_b'], group['ranker_decision']
        ):
            decision = str(raw_decision).strip().lower()
            if decision not in valid_decisions:
                # Unparseable / missing judgement: carries no information.
                continue

            scenario_comparisons.append((model_a, model_b, decision))

            # Update position bias metrics
            position_bias['total'] += 1
            if decision == 'a':
                position_bias['model_a_wins'] += 1
            elif decision == 'b':
                position_bias['model_b_wins'] += 1
            else:  # tie or both
                position_bias['ties'] += 1

        # Index rows by their ordered model pair so a reflection is a lookup
        # instead of a scan over every other row of the scenario.
        rows_by_pair = defaultdict(list)
        for idx, (model_a, model_b, _) in enumerate(scenario_comparisons):
            rows_by_pair[(model_a, model_b)].append(idx)

        # Now check for reflected comparisons within this scenario
        for i, (model_a1, model_b1, decision1) in enumerate(scenario_comparisons):
            # First other row with the model order swapped, if any
            j = next(
                (k for k in rows_by_pair.get((model_b1, model_a1), ()) if k != i),
                None,
            )
            if j is None:
                continue

            model_a2, model_b2, decision2 = scenario_comparisons[j]

            # Winning model of each row; None stands for a tie ('tie'/'both')
            winner1 = {'a': model_a1, 'b': model_b1}.get(decision1)
            winner2 = {'a': model_a2, 'b': model_b2}.get(decision2)

            # A tie on either side is never treated as a contradiction
            is_consistent = winner1 is None or winner2 is None or winner1 == winner2

            # Update model consistency data
            for model in (model_a1, model_b1):
                stats = model_consistency[model]
                stats['total_comparisons'] += 1

                if winner1 is None:
                    # Don't count ties as wins or losses
                    continue

                outcome = 'wins' if model == winner1 else 'losses'
                stats[outcome] += 1
                if is_consistent:
                    stats['consistent_' + outcome] += 1
                else:
                    stats['inconsistent'] += 1

    # Prepare summary data (only models with at least one decisive comparison)
    model_summary = []
    for model, data in model_consistency.items():
        total_decisive = data['wins'] + data['losses']
        if total_decisive > 0:
            model_summary.append({
                'model': model,
                'consistency_rate': (data['consistent_wins'] + data['consistent_losses']) / total_decisive,
                'win_rate': data['wins'] / total_decisive,
                'wins': data['wins'],
                'losses': data['losses'],
                'consistent_wins': data['consistent_wins'],
                'consistent_losses': data['consistent_losses'],
                'inconsistent': data['inconsistent'],
                'total_comparisons': data['total_comparisons']
            })

    if not model_summary:
        print("No suitable comparisons found to analyze model consistency.")
        return None

    # Create DataFrame, most consistent model first
    model_df = pd.DataFrame(model_summary).sort_values('consistency_rate', ascending=False)

    # Save to CSV
    model_file = os.path.join(results_dir, 'model_consistency.csv')
    model_df.to_csv(model_file, index=False)
    print(f"Saved model consistency to {model_file}")

    # Create visualizations

    # 1. Model Consistency Bar Chart
    plt.figure(figsize=(12, 8))
    ax = sns.barplot(x='consistency_rate', y='model', data=model_df)

    # Add value labels to the bars
    for i, row in enumerate(model_df.itertuples()):
        ax.text(row.consistency_rate + 0.01, i, f"{row.consistency_rate:.2f}",
                va='center', fontsize=10)

    plt.title('Model Consistency Rate')
    plt.xlabel('Consistency Rate')
    plt.ylabel('Model')
    plt.xlim(0, 1.1)  # Leave room for the text labels
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, 'model_consistency.png'))
    plt.close()

    # 2. Win Rate vs Consistency Rate
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x='consistency_rate', y='win_rate',
                    size='total_comparisons', sizes=(100, 500),
                    alpha=0.7, data=model_df)

    # Add model labels to the points
    for _, row in model_df.iterrows():
        plt.text(row['consistency_rate'] + 0.01, row['win_rate'],
                 row['model'], fontsize=9)

    plt.title('Model Win Rate vs Consistency Rate')
    plt.xlabel('Consistency Rate')
    plt.ylabel('Win Rate')
    plt.xlim(0, 1.1)
    plt.ylim(0, 1)
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, 'win_vs_consistency.png'))
    plt.close()

    # Calculate position bias (share of all valid decisions)
    total_decisions = position_bias['total']
    model_a_win_rate = position_bias['model_a_wins'] / total_decisions if total_decisions > 0 else 0
    model_b_win_rate = position_bias['model_b_wins'] / total_decisions if total_decisions > 0 else 0
    tie_rate = position_bias['ties'] / total_decisions if total_decisions > 0 else 0

    # 3. Position Bias Pie Chart
    labels = ['Model A Wins', 'Model B Wins', 'Ties']
    sizes = [model_a_win_rate, model_b_win_rate, tie_rate]
    colors = ['#ff9999', '#66b3ff', '#99ff99']

    plt.figure(figsize=(8, 8))
    plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
    plt.title('Position Bias in Ranker Decisions')
    plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
    plt.tight_layout()
    plt.savefig(os.path.join(plots_dir, 'position_bias.png'))
    plt.close()

    # Print summary
    print("\nModel Consistency Summary:")
    print(model_df[['model', 'consistency_rate', 'win_rate', 'total_comparisons']].to_string(index=False))

    print("\nPosition Bias Analysis:")
    print(f"  Model A wins: {position_bias['model_a_wins']} ({model_a_win_rate:.2f})")
    print(f"  Model B wins: {position_bias['model_b_wins']} ({model_b_win_rate:.2f})")
    print(f"  Ties: {position_bias['ties']} ({tie_rate:.2f})")

    if abs(model_a_win_rate - model_b_win_rate) > 0.1:  # More than 10% difference
        print(f"  POTENTIAL POSITION BIAS DETECTED")
    else:
        print(f"  No significant position bias detected")

    return model_df

In [None]:
# Load the ranker output produced by the first configured ranker model.
# The path is built from names defined in earlier cells (`results_dir`,
# `ranker_model_ids`) instead of `file`, which is only assigned in a later
# cell and therefore undefined here on a fresh Restart & Run All.
# NOTE(review): assumes the runner writes <results_dir>/ranker/<model id>.csv,
# as its log line above shows for gpt-4o-mini — confirm this is the intended file.
a = pd.read_csv(
    os.path.join(results_dir, "ranker", f"{ranker_model_ids[0]}.csv"),
    keep_default_na=False,  # keep empty decisions visible as '' rather than NaN
)

In [50]:
# Frequency of each ranker verdict in the loaded ranker output.
a["ranker_decision"].value_counts()

ranker_decision
b      549
a      427
tie     24
Name: count, dtype: int64

In [66]:

# Ranker output to analyse (swap with the commented line to analyse the other ranker).
file = "data/ranker_consistency/ranker/gemini-2.0-flash-001.csv"
# file = "data/ranker_consistency/ranker/gpt-4o-mini-2024-07-18.csv"

# Per-file output directory: the CSV's own directory plus its name without ".csv".
# Deliberately NOT called `results_dir`: that name holds the runners' root
# directory, and overwriting it here would misdirect any RankerRunner cell
# re-run after this one.
analysis_dir = os.path.join(
    os.path.dirname(file),
    os.path.splitext(os.path.basename(file))[0],
)

run_elo_analysis(
    input_file=file,
    initial_rating=1000,
    k_factor=32,
    output_dir=analysis_dir,
)

analyze_model_consistency(
    ranker_output_file=file,
    results_dir=analysis_dir,
)

Loading data from data/ranker_consistency/ranker/gemini-2.0-flash-001.csv...
Found 16 unique models
Computing Elo ratings...
Saved model ratings to data/ranker_consistency/ranker/gemini-2.0-flash-001/model_elo_ratings.csv
Saved match history to data/ranker_consistency/ranker/gemini-2.0-flash-001/elo_match_history.csv
Saved category performance to data/ranker_consistency/ranker/gemini-2.0-flash-001/category_performance.csv
Generating visualizations...

Top Models by Elo Rating:
                 model  elo_rating
                 phi-4 1335.992863
 deepseek-chat-v3-0324 1176.496335
      llama-4-maverick 1144.777196
gpt-4o-mini-2024-07-18 1123.229275
             qwen-plus 1074.817800
     gpt-4o-2024-08-06 1059.214993
               qwq-32b 1038.815224
llama-3.3-70b-instruct 1011.068646
         optimus-alpha  999.394741
 llama-3.1-8b-instruct  990.572250

Phenomenon Category Performance Summary:

Utilitarianism:
                model  win_rate  total_matches
                phi-4  0.83

Unnamed: 0,model,consistency_rate,win_rate,wins,losses,consistent_wins,consistent_losses,inconsistent,total_comparisons
7,llama-4-maverick,0.884298,0.719008,87,34,80,27,14,124
8,llama-3.3-70b-instruct,0.83871,0.532258,66,58,56,48,20,126
5,gemini-2.0-flash-lite-001,0.836066,0.147541,18,104,8,94,20,124
13,gemini-2.0-flash-001,0.828125,0.242188,31,97,20,86,22,128
12,phi-4,0.806452,0.854839,106,18,94,6,24,124
0,gpt-4o-2024-08-06,0.789474,0.570175,65,49,53,37,24,126
11,gpt-3.5-turbo-0125,0.779528,0.204724,26,101,12,87,28,128
14,o3-mini-2025-01-31:low,0.764706,0.260504,31,88,17,74,28,124
4,qwen-plus,0.761905,0.611111,77,49,62,34,30,126
6,llama-3.1-8b-instruct,0.725806,0.508065,63,61,46,44,34,124
