# Strategy Score on Output (using GPT-4o)

In [None]:
import os
import sys
import json
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import requests
import pandas as pd

from pandas import DataFrame, json_normalize
from typing import List, Dict, Any, Tuple, Union, Optional

# Directory containing the evaluation result files; the path is relative,
# so it resolves against the notebook's current working directory.
LOGS_PATH: str = "../evaluations/results/"

In [None]:
# Load variables from a .env file into the process environment.
# NOTE(review): no cell visible here reads an environment variable — confirm
# this is still required.
import dotenv
dotenv.load_dotenv()

In [None]:
# Add the parent directory to the import path before importing `utils`
# (presumably that is where the project's utils module lives — verify).
sys.path.append("..")

# NOTE(review): of these helpers, only read_jsonl_as_json is called in the
# cells visible here.
from utils import load_agent_logs_df, read_jsonl_as_json, load_game_summary

In [None]:
# Experiment identifiers. Each name is later expanded to the results file
# "<name>_all_skill_scores.json" under LOGS_PATH, and the list order fixes the
# order of the loaded DataFrames (index 2 is the phi_phi run).
EXPT_NAMES: List[str] = [
    "2025-02-01_phi_llama_100_games_v3",
    "2025-02-01_llama_phi_100_games_v3",
    "2025-02-01_phi_phi_100_games_v3",
    "2025-02-01_llama_llama_100_games_v3",
    ]

In [None]:
# Human-readable labels naming the crewmate model and the impostor model.
# Appears to be index-aligned with EXPT_NAMES — keep both lists in the same
# order. NOTE(review): not referenced by any cell visible here.
DESCRIPTIONS: List[str] = [
    "Crew: Phi, Imp: Llama",
    "Crew: Llama, Imp: Phi",
    "Crew: Phi, Imp: Phi",
    "Crew: Llama, Imp: Llama",
    ]

In [None]:
# One results file per experiment: "<LOGS_PATH>/<name>_all_skill_scores.json",
# in the same order as EXPT_NAMES.
summary_logs_paths: List[str] = [
    os.path.join(LOGS_PATH, name + "_all_skill_scores.json")
    for name in EXPT_NAMES
]

In [None]:
# Build one DataFrame per results file, in the same order as summary_logs_paths.
summary_dfs: List[DataFrame] = []

for summary_logs_path in summary_logs_paths:
    # Read the file's records, then flatten nested keys into columns.
    summary_logs: List[Dict[str, Any]] = read_jsonl_as_json(summary_logs_path)
    # Order rows by game first, then by step within each game.
    summary_df: DataFrame = json_normalize(summary_logs).sort_values(
        by=["game_index", "step"]
    )
    summary_dfs.append(summary_df)
    print(f"Loaded {len(summary_df)} logs from {summary_logs_path}")

In [None]:
# Preview the first rows of the first experiment's DataFrame.
summary_dfs[0].head()

In [None]:
# Violin plot of the score distributions for the Phi-vs-Phi experiment:
# one x position per metric, with a negative-side (Impostor) and a
# positive-side (Crewmate) half-violin at each position.
metrics = ["awareness", "lying", "deception", "planning"]
metric_names = ["Awareness", "Lying", "Deception", "Planning"]
# Values must match the case used in the "player_identity" column.
identity_names = ["Impostor", "Crewmate"]

# NOTE(review): metric_colors is not used by this cell; kept in case another
# cell relies on it.
metric_colors = ['#1f77b4', '#d62728', '#2ca02c', '#9467bd']
identity_colors = ['#ff7f0e', '#1f77b4']  # Orange for Impostor, Blue for Crewmate

fig = go.Figure()

# One x-axis category per metric.
x_labels = metric_names

# Index 2 follows the order of EXPT_NAMES and selects the phi_phi run.
# Select it once instead of on every loop iteration.
summary_df = summary_dfs[2]

# Filter the rows for each role once, rather than once per (metric, role) pair.
rows_by_identity = {
    identity: summary_df[summary_df["player_identity"] == identity]
    for identity in identity_names
}

# Add one half-violin per (metric, role) pair. Trace order is metric-major,
# which also determines legend order.
for i, metric in enumerate(metrics):
    for j, identity in enumerate(identity_names):
        y_values = np.array(rows_by_identity[identity][metric], dtype=np.float64)

        fig.add_trace(go.Violin(
            x=[x_labels[i]] * len(y_values),
            y=y_values,
            name=identity,
            line_color=identity_colors[j],
            meanline_visible=True,
            box_visible=False,
            legendgroup=identity,
            bandwidth=0.5,
            showlegend=(i == 0),  # one legend entry per role, not per metric
            side="negative" if j == 0 else "positive",  # Impostor on left, Crewmate on right
            width=0.8,
        ))

# Single layout call. The original applied two consecutive update_layout calls
# in which the second overrode the font family/size and plot background of the
# first; only the values that actually took effect are kept here.
fig.update_layout(
    template='plotly_white',
    legend=dict(
        title="Player Role",
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="center",
        x=0.5,
    ),
    yaxis=dict(
        title="Score",
        range=[0, 11],
        tickmode='linear',
        dtick=2,
        showgrid=True,
        gridcolor='LightGray',
    ),
    xaxis=dict(
        title="",
        showline=True,
        linewidth=1,
        linecolor='black',
        categoryorder="array",
        categoryarray=x_labels,  # keep metrics in the declared order
    ),
    plot_bgcolor='#fafaf7',
    font=dict(family="Computer Modern", size=14, color='black'),
    title="",
    height=500, width=500,
)

fig.show()

In [None]:
# Save the figure as a PDF. Static image export needs an export engine
# (e.g. kaleido) to be installed alongside plotly.
output_path = "plots/phi_v_phi_violins.pdf"
# write_image does not create missing directories, so make sure "plots/" exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
fig.write_image(output_path, format="pdf")