# 1. Load the Paprika Environments

In [1]:
import sys
import pandas as pd
sys.path.append('../../paprika/') # Remove this when verl and paprika are installed in the same env
from llm_exploration.paprika_config_helper import PaprikaConfigHelper
from verl.interactions.paprika_interaction import PaprikaInteraction
from pprint import pprint as pp
# Paprika environments exercised in this notebook.
# Currently excluded: 'battleship', 'minesweeper', 'bandit_bai_fixed_budget'
paprika_games = [
    'twenty_questions',
    'guess_my_city',
    'murder_mystery',
    'customer_service',
    'wordle',
    'cellular_automata',
    'mastermind',
]

Could not import JerichoInferenceEngine, so cannot use it!
Could not import VLLMInferenceEngine, so cannot use it!


In [2]:
import dotenv
# Load variables from the repository-level .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials used by llm_call — confirm.
dotenv.load_dotenv('../../.env')

True

In [3]:
for env_name in paprika_games:
    config = PaprikaConfigHelper.create_config(env_name)
    config['belief_config']['style'] = 'none'

    interaction = PaprikaInteraction(config={})

    import builtins
    _original_print = builtins.print
    builtins.print = lambda *a, **k: None
    try:
        instance_id = await interaction.start_interaction(
            instance_id=None,
            scenario_id=None, # start a random scenario
            **config,
        )
    finally:
        builtins.print = _original_print
    num_scenarios = interaction.game_scenarios.__len__()
    print(f'{env_name} => {num_scenarios}')

twenty_questions => 367
guess_my_city => 185
murder_mystery => 50
customer_service => 200


wordle => 800
cellular_automata => 500
mastermind => 500


Every environment has at least 50 scenarios, so let us run 50 games per environment with frontier LLMs: one reasoning model and one non-reasoning model.

# 2. Setup LLM Call

In [4]:
sys.path.append('../../src/optimal_explorer')
from llm_utils import llm_call
from pprint import pprint as pp

In [8]:
# Smoke test for llm_call: send a short multi-turn chat to a reasoning model.
# get_everything=True is requested so the full response (indexed below via
# out['choices'][0]['message']) is returned rather than only the text.
out = await llm_call(
    model='deepseek/deepseek-r1',
    get_everything=True,
    reasoning_effort='high',
    messages = [
            {"role": "system", "content": 'You are a useless assistant that gives humorous answers.'},
            {"role": "user", "content": 'Why is the sky blue?'},
            {"role": "assistant", "content": 'It is because of the great king of Pokemons.'},
            {"role": "user", "content": 'How so?'},
        ]
)

In [9]:
pp(out['choices'][0]['message']['content'], width=130)

('Ah, you’re right to question me—I clearly flunked out of Pokémon Academy’s physics class. Let me *ash*-k you to imagine this '
 'instead: When sunlight (which is white) enters Earth’s atmosphere, tiny molecules scatter the light. Blue light waves are '
 'short and feisty, like a hyper Pikachu dodging a Poké Ball, so they bounce wildly in all directions. This makes the sky look '
 "blue! Meanwhile, red light waves are chill, like Snorlax napping—they just cruise straight through. So, the sky's blueness is "
 'basically Earth’s atmosphere throwing a rave for blue light. Science, but make it ✨ *electric* ✨.')


In [14]:
pp(out['choices'][0]['message']['reasoning'], width=130)

('Okay, the user asked "How so?" after I joked about the sky being blue because of the Pokémon king. They probably want a real '
 'explanation but are playing along with the humor. I should maintain the funny tone but also slip in the actual science. Let me '
 "explain Rayleigh scattering in a whimsical way using Pokémon references. Make sure it's clear that the real reason is "
 'scientific, but keep it light-hearted. Mention Pikachu and maybe some other Pokémon to keep the theme. Avoid getting too '
 "technical but still convey the right information. Check if the analogy makes sense and the user isn't confused. Balance "
 'between humor and accuracy.\n')


# 3. Belief Update

In [15]:
async def update_belief(
        curr_belief: str,
        action: str,
        response: str,
        model_name: str,
    ):
    """Ask the LLM to fold the latest (action, response) pair into the belief.

    Args:
        curr_belief: Belief text the agent held before acting.
        action: Action the agent just took.
        response: Environment's reply to that action.
        model_name: Model identifier forwarded to ``llm_call``.

    Returns:
        ``(belief, reasoning)``. ``belief`` is the text between the
        ``<BELIEF>`` tags, or the whole stripped reply when the tags are
        missing. ``reasoning`` is the model's reasoning trace, or ``''`` when
        the reply carries none (e.g. non-reasoning models).
    """
    import re

    # Sentences are joined with explicit spaces/newlines; the previous
    # backslash continuations glued them together ("belief.Compress ...").
    user_content = (
        "Look at the current belief and the agent's action and environment response on that belief. "
        "Compress the context, remove redundant information, and maintain important information about the game state "
        "needed to take optimal future actions.\n"
        f"Current belief: {curr_belief}\n"
        f"Agent's action: {action}\n"
        f"Environment's response: {response}\n"
        "Output the updated belief state inside <BELIEF> and </BELIEF> tags. "
        "Understand that only the generated belief is fed to the agent, so be sure to include all necessary "
        "information about game mechanics."
    )

    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": user_content},
    ]

    out = await llm_call(
        model=model_name,
        get_everything=True,
        reasoning_effort='high',
        messages=messages
    )

    message = out['choices'][0]['message']
    content = message.get('content') or ''  # content may be null in the reply

    # Same tolerant tag matching as take_action (case-insensitive, optional spaces).
    match = re.search(r"<\s*belief\s*>(.*?)<\s*/\s*belief\s*>", content, re.DOTALL | re.IGNORECASE)
    # fallback: return the whole content if tags not found
    belief = match.group(1).strip() if match else content.strip()

    # Reasoning is optional: prefer the structured details, then the flat
    # 'reasoning' field, then an empty string.
    details = message.get('reasoning_details') or []
    reasoning = ''
    if details and isinstance(details[0], dict):
        reasoning = details[0].get('text') or ''
    if not reasoning:
        reasoning = message.get('reasoning') or ''

    return belief, reasoning

In [16]:
async def take_action(
        belief: str,
        model_name: str,
    ):
    """Ask the LLM for the next action given only the current belief.

    Args:
        belief: Current belief text (the only context the model receives).
        model_name: Model identifier forwarded to ``llm_call``.

    Returns:
        ``(action, reasoning)``. ``action`` is the text between the
        ``<ACTION>`` tags, or the whole stripped reply when the tags are
        missing. ``reasoning`` is the model's reasoning trace, or ``''`` when
        the reply carries none (e.g. non-reasoning models).
    """
    import re

    # Sentences are joined with explicit spaces/newlines; the previous
    # backslash continuations glued them together ("belief.Take an action ...").
    user_content = (
        "Look at the current belief and take the next action based on the belief. "
        "Take an action that leads to optimal exploration.\n"
        f"Belief: {belief}\n"
        "Output the action inside <ACTION> and </ACTION> tags."
    )

    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": user_content},
    ]

    out = await llm_call(
        model=model_name,
        get_everything=True,
        reasoning_effort='high',
        messages=messages
    )

    message = out['choices'][0]['message']
    content = message.get('content') or ''  # content may be null in the reply

    match = re.search(r"<\s*action\s*>(.*?)<\s*/\s*action\s*>", content, re.DOTALL | re.IGNORECASE)
    # fallback: return the whole content if tags not found
    action = match.group(1).strip() if match else content.strip()

    # Reasoning is optional: prefer the structured details, then the flat
    # 'reasoning' field, then an empty string.
    details = message.get('reasoning_details') or []
    reasoning = ''
    if details and isinstance(details[0], dict):
        reasoning = details[0].get('text') or ''
    if not reasoning:
        reasoning = message.get('reasoning') or ''

    return action, reasoning

# 4. Paprika Rollout with Belief

In [17]:
import asyncio

async def run_one_iteration_with_belief_llm(
        env_name: str,
        model_name: str,
        game_id: int,
    ):
    """Play one game of ``env_name`` in which the agent only ever sees its belief.

    Each turn: pick an action from the belief, send it to the environment,
    then rewrite the belief from (belief, action, response). The loop ends
    when the response contains "Goal reached" or the turn budget is used up.

    Returns:
        A list with one dict per turn (model, game id, env, attempt number,
        action, response, score, done flag, extra data, updated belief and
        both reasoning traces).
    """
    import builtins

    env_config = PaprikaConfigHelper.create_config(env_name)
    env_config['belief_config']['style'] = 'none'
    interaction = PaprikaInteraction(config={})

    # Mute print() while the interaction starts; always restore it afterwards.
    # NOTE(review): this swaps a process-wide builtin, so games started
    # concurrently could interleave the save/restore — confirm that
    # start_interaction never yields between the two.
    saved_print = builtins.print
    builtins.print = lambda *a, **k: None
    try:
        instance_id = await interaction.start_interaction(instance_id=None, scenario_id=None, **env_config)
    finally:
        builtins.print = saved_print

    # The first agent-facing message is used as the game rules seeding the belief.
    rules = interaction.agent_conv.messages[0][1]
    belief = f'This is the start of the game. The only available information right now are the game rules:\n{rules}'
    turn_budget = interaction._instance_dict[instance_id]['max_turns']

    game_history = []
    turn = 0
    while turn < turn_budget:
        turn += 1

        action, action_reasoning = await take_action(belief, model_name)

        # The environment receives the action wrapped in <action> tags as an assistant turn.
        exchange = [
            {"role": "user", "content": "Output the next action."},
            {"role": "assistant", "content": f"<action>{action}</action>"}
        ]
        done, response, score, additional_data = await interaction.generate_response(
            instance_id=instance_id,
            messages=exchange,
        )

        belief, belief_reasoning = await update_belief(belief, action, response, model_name)

        record = {
            "model": model_name,
            "game_id": str(game_id),
            "env": env_name,
            "attempt": turn,
            "guess": action,
            "response": response,
            "score": score,
            "done": done,
            "data": additional_data,
            "belief": belief,
            "action_reasoning": action_reasoning,
            "belief_reasoning": belief_reasoning,
        }
        game_history.append(record)

        if "Goal reached" in response:
            break

    # One progress dot per finished game.
    print('.', end='', flush=True)

    return game_history

async def run_multiple_iterations_multiple_games(
        num_games: int,
        list_envs,
        models,
        logs_file='./logs/paprika_frontier.jsonl',
    ):
    """Run ``num_games`` games for every (model, env) pair concurrently and log them.

    Args:
        num_games: Games to play per (model, environment) pair.
        list_envs: Environment names.
        models: Model identifiers.
        logs_file: JSON Lines file that turn records are appended to. Its
            parent directory is created if it does not exist.

    Every turn record of every finished game is appended as one JSON line.
    A game that raises does not discard the others: its error is reported on
    stderr after the successful games have been written.
    """
    import json
    import os
    import sys

    # Create the log directory up front so a missing folder cannot throw away
    # the results after all games have already been played.
    log_dir = os.path.dirname(logs_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    jobs = [
        (model, env_name, game_id)
        for model in models
        for env_name in list_envs
        for game_id in range(num_games)
    ]

    # return_exceptions=True keeps one failing game from losing every other result.
    results = await asyncio.gather(
        *(run_one_iteration_with_belief_llm(env_name, model, game_id) for model, env_name, game_id in jobs),
        return_exceptions=True,
    )

    failures = []
    # Flatten results and write to file
    with open(logs_file, "a") as f:
        for job, game_history in zip(jobs, results):
            if isinstance(game_history, BaseException):
                failures.append((job, game_history))
                continue
            for entry in game_history:
                # default=str keeps a non-JSON value in the record from aborting the dump
                f.write(json.dumps(entry, default=str) + "\n")

    for (model, env_name, game_id), error in failures:
        print(f'\n[failed] model={model} env={env_name} game_id={game_id}: {error!r}', file=sys.stderr)

In [26]:
# Play 10 mastermind games with gpt-oss-120b; logs_file is left at the function's default.
await run_multiple_iterations_multiple_games(
    num_games=10,
    list_envs=['mastermind'],
    models=['openai/gpt-oss-120b'],
)

..........

# 5. Metrics

In [1]:
import pandas as pd
from pprint import pprint as pp
import json

# Turn-level records in JSON Lines form (one JSON object per line).
# NOTE(review): this is a different file from the rollout cell's default
# './logs/paprika_frontier.jsonl' — confirm which run produced it.
logs_file = './logs/paprika_frontier_v6.jsonl'
with open(logs_file, 'r', encoding='utf-8') as f:
    # Skip blank lines (e.g. a trailing newline) instead of failing in json.loads.
    data = [json.loads(line) for line in f if line.strip()]
df = pd.DataFrame(data)

In [2]:
print(df.columns)

Index(['model', 'game_id', 'env', 'attempt', 'info', 'raw_guess', 'guess',
       'response', 'word_limit', 'score', 'done', 'data', 'belief',
       'action_reasoning', 'belief_reasoning'],
      dtype='object')


In [3]:
# Label missing word limits as the string 'None' so those games are not dropped
# when 'word_limit' is later used as a groupby key (groupby skips missing keys).
# Filled per row: rows that do carry a limit keep it, and neither an empty
# frame nor a NaN in the first row changes the outcome.
df['word_limit'] = df['word_limit'].astype(object).where(df['word_limit'].notna(), 'None')

In [4]:
df['info'].value_counts()

info
belief     8764
both       8491
history    8162
Name: count, dtype: int64

In [5]:
df['model_info'] = df['model'].astype(str) + ' (' + df['info'].astype(str) + ')'

In [6]:
# Use the combined "<model> (<info>)" label as the model name from here on.
# NOTE: not idempotent together with the previous cell — re-running that cell
# after this one appends the " (<info>)" suffix a second time.
df['model'] = df['model_info']

## 1. Success Rates (belief v. history v. both)

In [22]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np

def summarize_game_outcomes(df):
    """Collapse turn-level logs into one row per game with a boolean ``won``.

    A game counts as won when any of its responses contains "goal reached"
    (case-insensitive). Missing or non-string responses count as not won.

    Args:
        df: Turn-level frame with columns 'model', 'game_id', 'env',
            'word_limit' and 'response'. It is not modified.

    Returns:
        DataFrame with columns 'model', 'game_id', 'env', 'word_limit', 'won'.
    """
    keys = ['model', 'game_id', 'env', 'word_limit']
    # Work on a derived Series so the caller's 'response' column keeps its case.
    reached_goal = (
        df['response'].astype(str).str.lower().str.contains('goal reached', regex=False)
    )
    return (
        df[keys]
        .assign(won=reached_goal)
        .groupby(keys)['won']
        .any()
        .reset_index()
    )

def plot_win_rates(summary_df, info_order=('belief', 'history', 'both')):
    """Plot success rates as bars with SEM error bars, one subplot per env.

    Args:
        summary_df: One row per game with columns 'env', 'model',
            'word_limit' and a boolean 'won' (see ``summarize_game_outcomes``).
        info_order: Order in which the "(<info>)" suffixes of a model label
            are shown within each model family. Labels without a recognised
            suffix are placed after the recognised ones.

    Returns:
        A plotly figure with one subplot row per word limit and one column
        per environment.

    Raises:
        ValueError: If ``summary_df`` contains no games.
    """
    # Compute mean and std of win rates for each group (cast to float so the
    # aggregations never depend on boolean handling).
    stats = (
        summary_df.assign(won=summary_df['won'].astype(float))
        .groupby(['env', 'model', 'word_limit'])['won']
        .agg(['mean', 'std', 'count'])
        .reset_index()
    )
    if stats.empty:
        raise ValueError('summary_df contains no games to plot')

    stats['success_rate'] = stats['mean'] * 100
    # Standard error of the mean; std is NaN for single-game groups -> 0.
    stats['sem'] = (stats['std'] / np.sqrt(stats['count'])).fillna(0)
    stats['success_rate_sem'] = stats['sem'] * 100

    def _model_sort_key(label):
        # "family (info)" -> (family, rank of info); works for any number of models.
        base, sep, info = str(label).rpartition(' (')
        if not sep:
            return (str(label), len(info_order), '')
        info = info.rstrip(')')
        rank = info_order.index(info) if info in info_order else len(info_order)
        return (base, rank, info)

    envs = stats['env'].unique()
    models = sorted(stats['model'].unique(), key=_model_sort_key)
    # Numbers sort before strings so mixed limits (e.g. 50 and 'None') stay sortable.
    word_limits = sorted(stats['word_limit'].unique(), key=lambda v: (isinstance(v, str), v))

    colors = [
        "#ADD8E6",  # light blue
        "#4682B4",  # medium blue
        "#003366",  # dark blue
        "#FFB6C1",  # light red
        "#FF6347",  # medium red
        "#8B0000",  # dark red
        "#90EE90",  # light green
        "#32CD32",  # medium green
        "#006400",  # dark green
    ]
    color_map = {model: colors[i % len(colors)] for i, model in enumerate(models)}

    # Create subplots: rows = word_limit values, cols = envs
    fig = make_subplots(
        rows=len(word_limits), cols=len(envs),
        subplot_titles=[f"{env}" for env in envs],
        shared_yaxes=True,
        vertical_spacing=0.2 / len(word_limits),
        horizontal_spacing=0.03
    )

    legend_models = set()  # each model gets exactly one legend entry
    for r, wl in enumerate(word_limits, start=1):
        for c, env in enumerate(envs, start=1):
            for model in models:
                subset = stats[
                    (stats['env'] == env) &
                    (stats['model'] == model) &
                    (stats['word_limit'] == wl)
                ]
                if subset.empty:
                    continue
                # Plot bar with error bar (standard error of mean)
                fig.add_trace(
                    go.Bar(
                        x=[model],
                        y=subset['success_rate'],
                        name=model,
                        legendgroup=model,
                        marker_color=color_map[model],
                        width=0.8,
                        showlegend=model not in legend_models,
                        error_y=dict(
                            type='data',
                            array=subset['success_rate_sem'],
                            visible=True,
                            color='black',
                            thickness=1,
                            width=4,
                        ),
                        hovertemplate=(
                            f"Env: {env}<br>"
                            f"Word Limit: {wl}<br>"
                            f"Model: {model}<br>"
                            # plotly substitutes %{y:...}; a bare {y:...} is shown literally
                            f"Success Rate: %{{y:.2f}}%<br>"
                            f"SEM: {subset['success_rate_sem'].values[0]:.2f}%<br>"
                            f"N: {subset['count'].values[0]}"
                        ),
                    ),
                    row=r, col=c
                )
                legend_models.add(model)

    # Same y-range and grid on every subplot; hide x tick labels (the legend names the bars).
    fig.update_yaxes(range=[0, 100], showgrid=True, gridcolor='lightgray')
    fig.update_xaxes(showticklabels=False)

    # Fixed subplot size
    fig_width = 180 * len(envs)
    fig_height = 200 * len(word_limits)

    # Layout with horizontal legend
    fig.update_layout(
        height=fig_height + 150,
        width=fig_width,
        template='simple_white',
        font=dict(family='Computer Modern, serif', size=16),
        barmode='group',
        showlegend=True,
        legend=dict(
            title='Models',
            orientation='h',
            yanchor='bottom',
            y=1.18,
            xanchor='center',
            x=0.5,
            bgcolor='rgba(255,255,255,0.9)',
            bordercolor='black',
            borderwidth=1
        ),
        margin=dict(t=100, b=50, l=80, r=20),
        plot_bgcolor='white'
    )

    fig.update_yaxes(title_text="Avg. Success Rate", row=1, col=1)

    return fig

In [23]:
# Independent copies of the belief-only and history-only turns.
df_belief = df.loc[df['info'] == 'belief'].copy()
df_history = df.loc[df['info'] == 'history'].copy()
# One won/lost row per game: for all turns, and for each slice separately.
# NOTE(review): only summary_df is plotted in the cells shown below; the two
# per-slice summaries appear unused — confirm before removing.
summary_df = summarize_game_outcomes(df)
summary_df_belief = summarize_game_outcomes(df_belief)
summary_df_history = summarize_game_outcomes(df_history)

In [24]:
fig = plot_win_rates(summary_df)
fig.show()

## 2. Qualitative Look at Samples

In [39]:
df.columns

Index(['model', 'game_id', 'env', 'attempt', 'raw_guess', 'guess', 'response',
       'word_limit', 'score', 'done', 'data', 'belief', 'action_reasoning',
       'belief_reasoning'],
      dtype='object')

In [166]:
# All turns of game 0 of twenty_questions for gemini-2.5-pro in the belief-only setting.
# NOTE: despite its name, `wordle_sample` holds twenty_questions turns; the name
# is kept because the following cells refer to it. The 'info' filter is
# redundant with the "(belief)" suffix already present in the model label.
wordle_sample = df[
    (df['model'] == 'google/gemini-2.5-pro (belief)') & \
        (df['env'] == 'twenty_questions') & \
            (df['game_id'] == str(0)) & \
            (df['info'] == 'belief')
            ]
print(len(wordle_sample))

17


In [167]:
wordle_sample

Unnamed: 0,model,game_id,env,attempt,info,raw_guess,guess,response,word_limit,score,done,data,belief,action_reasoning,belief_reasoning,model_info
860,google/gemini-2.5-pro (belief),0,twenty_questions,1,belief,<Think>\nThe user wants me to ask the first qu...,Is the book fiction?,yes,,1.0,True,"{'agent_game_scenario': 'book', 'env_game_scen...",I am trying to guess the name of a book.\nI ha...,**Beginning Inquiry: Books**\n\nI'm starting t...,**Assessing the Task**\n\nI'm now breaking dow...,google/gemini-2.5-pro (belief)
861,google/gemini-2.5-pro (belief),0,twenty_questions,2,belief,<Think>\nThe first question established that t...,Is it considered a children's or young adult b...,no,,2.0,True,"{'agent_game_scenario': 'book', 'env_game_scen...",I am playing a game of 20 Questions. My goal i...,**Questioning Genre's Limits**\n\n\n\n\n\nI'm ...,**Analyzing Game Status**\n\nI've identified t...,google/gemini-2.5-pro (belief)
862,google/gemini-2.5-pro (belief),0,twenty_questions,3,belief,<Think>\n1. **Analyze the current state:** I ...,Is it a science fiction book?,no,,3.0,True,"{'agent_game_scenario': 'book', 'env_game_scen...",I am playing a game of 20 Questions. My goal i...,**Questioning the Source**\n\nI'm now focusing...,**Assessing Game Progress**\n\nI'm currently a...,google/gemini-2.5-pro (belief)
863,google/gemini-2.5-pro (belief),0,twenty_questions,4,belief,<Think>\n1. I have established that the book ...,Is it a fantasy book?,no,,4.0,True,"{'agent_game_scenario': 'book', 'env_game_scen...",I am playing a game of 20 Questions. My goal i...,**Assessing the Constraints**\n\nI'm analyzing...,**Considering Game State**\n\nI'm currently fo...,google/gemini-2.5-pro (belief)
864,google/gemini-2.5-pro (belief),0,twenty_questions,5,belief,<Think>\nMy current knowledge is that the book...,"Is it a mystery, thriller, or crime novel?",no,,5.0,True,"{'agent_game_scenario': 'book', 'env_game_scen...",I am playing a game of 20 Questions. My goal i...,**Assessing the Clues**\n\nI'm still navigatin...,**Analyzing the Current State**\n\nI'm working...,google/gemini-2.5-pro (belief)
865,google/gemini-2.5-pro (belief),0,twenty_questions,6,belief,<Think>\nThe user wants me to ask the next que...,Is it a work of historical fiction?,yes,,6.0,True,"{'agent_game_scenario': 'book', 'env_game_scen...",I am playing a game of 20 Questions. My goal i...,**Analyzing the Clues**\n\nI've processed the ...,**Defining the Objective**\n\nI'm starting to ...,google/gemini-2.5-pro (belief)
866,google/gemini-2.5-pro (belief),0,twenty_questions,7,belief,<Think>\n1. **Analyze the current state:** I ...,Is the book set in the 20th century?,yes,,7.0,True,"{'agent_game_scenario': 'book', 'env_game_scen...",I am playing a game of 20 Questions. My goal i...,**Probing the Subject**\n\nI'm now ready to as...,**Defining the Goal**\n\nI'm working on refini...,google/gemini-2.5-pro (belief)
867,google/gemini-2.5-pro (belief),0,twenty_questions,8,belief,<Think>\nThe current information points to a h...,Is the book set during World War II?,no,,8.0,True,"{'agent_game_scenario': 'book', 'env_game_scen...",I am playing a game of 20 Questions. My goal i...,**Assessing the Current State**\n\nI'm still t...,**Defining the Parameters**\n\nI've just defin...,google/gemini-2.5-pro (belief)
868,google/gemini-2.5-pro (belief),0,twenty_questions,9,belief,<Think>\nThe user wants me to ask the 9th ques...,Is the book set in North America?,no,,9.0,True,"{'agent_game_scenario': 'book', 'env_game_scen...","I am playing a game of 20 Questions, and my go...",**Questioning a Fiction**\n\nI'm still trying ...,**Defining The Task**\n\nI'm defining the core...,google/gemini-2.5-pro (belief)
869,google/gemini-2.5-pro (belief),0,twenty_questions,10,belief,<Think>\nThe current information has narrowed ...,Is the book set in Europe?,yes,,10.0,True,"{'agent_game_scenario': 'book', 'env_game_scen...","I am playing a game of 20 Questions, and my go...",**Questioning The Book**\n\nI'm now at the 10t...,**Assessing Game Progress**\n\nI'm currently t...,google/gemini-2.5-pro (belief)


In [169]:
pp(wordle_sample.iloc[10]['belief'])

('I am playing a game of 20 Questions, and my goal is to guess the name of a '
 'book. I have now asked 11 questions out of a total of 20.\n'
 '\n'
 'Here is what I know so far:\n'
 '- The book is a work of fiction.\n'
 '- It is for an adult audience.\n'
 '- The genre is historical fiction.\n'
 '- The setting is in Europe.\n'
 '- The book is set during World War I (1914-1918).\n'
 '- The book is NOT science fiction or fantasy.\n'
 '- The book is NOT a mystery, thriller, or crime novel.\n'
 '- The book is NOT set during World War II.\n'
 '- The book is NOT set in North America.\n'
 '\n'
 "My next step is to pinpoint the specific location or the main character's "
 'role. Now that I know the setting is Europe during WWI, I can ask about a '
 'specific country involved, like Germany, France, or England. Alternatively, '
 'I could ask about the protagonist, for example, if they are a soldier. This '
 'will help me narrow down the possibilities among famous WWI novels like "All '
 'Quiet on

## 3. Sizes of Belief States

In [26]:
df_belief = df[df['info'] == 'belief']

In [29]:
print(df_belief.head(2))

                              model game_id            env  attempt    info  \
18  deepseek/deepseek-chat (belief)       0  guess_my_city        1  belief   
19  deepseek/deepseek-chat (belief)       0  guess_my_city        2  belief   

                                            raw_guess  \
18  <Think> Since this is the start of the game, t...   
19  <Think> Since the city is in South America, it...   

                                                guess  \
18               Is your home city located in Europe?   
19  Is your home city located near the coastline o...   

                                             response word_limit  score  done  \
18  no, my home city is not located in europe. it'...       None    1.0  True   
19  my city is located inland and not near the coa...       None    2.0  True   

                                                 data  \
18  {'agent_game_scenario': 'city', 'env_game_scen...   
19  {'agent_game_scenario': 'city', 'env_game_scen...   

  

In [46]:
def plot_belief_length(df):
    """Plot the mean belief length (in characters) per step, one subplot per env.

    Args:
        df: Turn-level frame with columns 'env', 'model', 'attempt' and
            'belief'. A missing belief counts as length 0.

    Returns:
        A plotly figure with one line per model in each environment subplot;
        error bars show the standard error of the mean (0 for single samples).

    Raises:
        ValueError: If ``df`` has no rows.
    """
    # Import the submodules explicitly: a bare `import plotly` does not
    # guarantee that plotly.subplots / plotly.colors are loaded.
    import plotly.colors
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    envs = list(df['env'].astype(str).unique())
    if not envs:
        raise ValueError('df contains no rows to plot')

    # Treat missing beliefs as empty text rather than the strings 'None'/'nan'.
    beliefs = df['belief'].astype(object).where(df['belief'].notna(), '')
    with_len = df.assign(_belief_len=beliefs.astype(str).str.len())
    agg = (
        with_len.groupby(['env', 'model', 'attempt'])['_belief_len']
        .agg(['mean', 'std', 'count'])
        .reset_index()
    )
    # std is NaN for single-sample groups; show a zero-length error bar instead.
    agg['sem'] = (agg['std'] / agg['count'] ** 0.5).fillna(0)

    palette = getattr(plotly.colors.qualitative, 'Dark24', None) or plotly.colors.qualitative.Plotly

    fig = make_subplots(rows=1, cols=len(envs), shared_yaxes=False, subplot_titles=envs)

    model_color_map = {}  # model -> colour; also marks models already in the legend

    for col_idx, env in enumerate(envs, start=1):
        sub = agg[agg['env'] == env]
        for model in sub['model'].unique():
            mdf = sub[sub['model'] == model].sort_values('attempt')
            show_legend = model not in model_color_map
            if show_legend:
                model_color_map[model] = palette[len(model_color_map) % len(palette)]

            fig.add_trace(
                go.Scatter(
                    x=mdf['attempt'],
                    y=mdf['mean'],
                    error_y=dict(
                        type='data',
                        array=mdf['sem'],
                        visible=True,
                        thickness=1.5,   # softer line
                        color='rgba(0,0,0,0.4)'  # semi-transparent black
                    ),
                    mode='lines+markers',
                    name=str(model.split('(')[0]),
                    legendgroup=str(model),
                    showlegend=show_legend,
                    line=dict(color=model_color_map[model], width=2)
                ),
                row=1, col=col_idx
            )

        fig.update_xaxes(
            title_text='Step',
            color='black',  # black axis
            showline=True,
            linecolor='black',
            linewidth=1,
            showgrid=False,
            row=1, col=col_idx
        )

        fig.update_yaxes(
            title_text='Avg. belief length (chars)' if col_idx == 1 else None,
            showline=True,
            linecolor='black',
            linewidth=1,
            tickfont=dict(color='black'),
            showgrid=True,
            gridcolor='lightgray',
            row=1, col=col_idx
        )

    fig.update_layout(
        width=1200,
        height=400,
        paper_bgcolor='white',
        plot_bgcolor='white',
        font=dict(family='Times New Roman, serif', size=14),
        legend=dict(
            orientation='h',   # horizontal
            yanchor='bottom',
            y=1.1,
            xanchor='center',
            x=0.5,
            bordercolor='black',
            borderwidth=1,
            bgcolor='rgba(255,255,255,0.9)'
        )
    )

    return fig


In [47]:
fig = plot_belief_length(df_belief)

In [48]:
fig.show()

## 4. Greediness Metrics

In [49]:
df_mastermind = df[df['env'] == 'mastermind']

In [56]:
print(df_mastermind[['game_id', 'attempt', 'model_info', 'guess']].head(2))

    game_id  attempt                        model_info    guess
102       0        1  deepseek/deepseek-chat (history)  1 2 3 4
103       0        2  deepseek/deepseek-chat (history)  5 6 7 8


In [160]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

def plot_greediness(df, info_order=('belief', 'history', 'both')):
    """
    Create a bar chart showing greediness for each model.

    Greediness at a turn position is ``1 - unique_guesses / total_guesses``
    over all games of a model that reached that position (positions reached
    by a single game are ignored). A game's greediness is the mean over its
    positions, in percent; the bar is the mean over games and the error bar
    the standard error of that mean.

    Parameters:
    df (pd.DataFrame): DataFrame with columns 'game_id', 'attempt', 'model_info', 'guess'
    info_order (sequence of str): order of the "(<info>)" suffixes within each
        model family; labels without a recognised suffix come last.

    Returns:
    plotly.graph_objects.Figure: Bar chart figure
    """

    # Fixed colors for different models
    colors = [
        "#ADD8E6",  # light blue
        "#4682B4",  # medium blue
        "#003366",  # dark blue
        "#FFB6C1",  # light red
        "#FF6347",  # medium red
        "#8B0000",  # dark red
        "#90EE90",  # light green
        "#32CD32",  # medium green
        "#006400",  # dark green
    ]

    def _display_key(label):
        # "family (info)" -> (family, rank of info); works for any number of models.
        base, sep, info = str(label).rpartition(' (')
        if not sep:
            return (str(label), len(info_order), '')
        info = info.rstrip(')')
        rank = info_order.index(info) if info in info_order else len(info_order)
        return (base, rank, info)

    greediness_results = []

    # Group by model_info
    for model, model_data in df.groupby('model_info'):
        # One guess sequence per game, ordered by attempt
        game_sequences = [
            game_data.sort_values('attempt')['guess'].tolist()
            for _, game_data in model_data.groupby('game_id')
        ]

        # Repetition at each turn position is shared by all games, so compute it once.
        longest = max((len(seq) for seq in game_sequences), default=0)
        position_greediness = []
        for pos in range(longest):
            guesses_at_position = [seq[pos] for seq in game_sequences if pos < len(seq)]
            if len(guesses_at_position) > 1:
                # Greediness = 1 - (unique_guesses / total_guesses)
                position_greediness.append(1 - len(set(guesses_at_position)) / len(guesses_at_position))
            else:
                position_greediness.append(None)  # nothing to compare against

        # Per-game average over the positions that game reached (gives the variance)
        game_greediness_values = []
        for seq in game_sequences:
            scores = [g for g in position_greediness[:len(seq)] if g is not None]
            if scores:
                game_greediness_values.append(sum(scores) / len(scores) * 100)

        # Calculate mean and standard error
        if game_greediness_values:
            avg_greediness = np.mean(game_greediness_values)
            if len(game_greediness_values) > 1:
                std_error = np.std(game_greediness_values, ddof=1) / np.sqrt(len(game_greediness_values))
            else:
                std_error = 0
        else:
            avg_greediness = 0
            std_error = 0

        greediness_results.append({
            'model': model,
            'greediness': avg_greediness,
            'std_error': std_error,
        })

    # Order bars as family -> info_order, then colour by displayed position so
    # the palette lines up with plot_win_rates.
    greediness_results.sort(key=lambda result: _display_key(result['model']))
    for position, result in enumerate(greediness_results):
        result['color'] = colors[position % len(colors)]

    # Create the bar chart
    fig = go.Figure()

    # Add bars
    models = [result['model'] for result in greediness_results]
    greediness_values = [result['greediness'] for result in greediness_results]
    error_values = [result['std_error'] for result in greediness_results]
    bar_colors = [result['color'] for result in greediness_results]

    fig.add_trace(go.Bar(
        x=models,
        y=greediness_values,
        error_y=dict(
            type='data',
            array=error_values,
            visible=True,
            color='black',
            thickness=1.5,
            width=3
        ),
        marker_color=bar_colors,
        marker_line_color='black',
        marker_line_width=1,
        width=0.6
    ))

    # Leave 10% headroom above the tallest bar; fall back to 0-100 when there
    # is nothing to show so the axis never collapses to [0, 0].
    y_top = max((g + e for g, e in zip(greediness_values, error_values)), default=0) * 1.1
    if not y_top > 0:
        y_top = 100

    # Update layout for professional appearance
    fig.update_layout(
        title={
            'text': '',
            'x': 0.5,
            'font': {'family': 'Computer Modern, serif', 'size': 16}
        },
        xaxis={
            'title': 'Model + Info (Mastermind)',
            'title_font': {'family': 'Computer Modern, serif', 'size': 14},
            'tickfont': {'family': 'Computer Modern, serif', 'size': 14},
            'showgrid': False,
            'showline': True,
            'linewidth': 1,
            'linecolor': 'black',
            'mirror': True
        },
        yaxis={
            'title': 'Greediness (% of repeats)',
            'title_font': {'family': 'Computer Modern, serif', 'size': 14},
            'tickfont': {'family': 'Computer Modern, serif', 'size': 14},
            'showgrid': False,
            'showline': True,
            'linewidth': 1,
            'linecolor': 'black',
            'mirror': True,
            'range': [0, y_top]
        },
        plot_bgcolor='white',
        paper_bgcolor='white',
        font={'family': 'Computer Modern, serif'},
        width=800,
        height=500,
        margin=dict(l=80, r=50, t=80, b=80)
    )

    # Add grid lines manually for a cleaner look
    fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor='lightgray')

    return fig

In [161]:
# Intended display order: belief, history, both within each model family.
# NOTE(review): this list is not passed to plot_greediness in the cells shown — confirm whether it is still needed.
models_list = [
    'deepseek/deepseek-chat (belief)',
    'deepseek/deepseek-chat (history)',
    'deepseek/deepseek-chat (both)',
    'deepseek/deepseek-r1 (belief)',  # comma was missing: this entry was concatenated with the next one
    'deepseek/deepseek-r1 (history)',
    'deepseek/deepseek-r1 (both)',
    'google/gemini-2.5-pro (belief)',
    'google/gemini-2.5-pro (history)',
    'google/gemini-2.5-pro (both)',
    ]

fig = plot_greediness(df_mastermind)

In [162]:
fig.show()

In [116]:
list(df.model_info.unique())

['deepseek/deepseek-chat (history)',
 'deepseek/deepseek-chat (both)',
 'google/gemini-2.5-pro (history)',
 'deepseek/deepseek-chat (belief)',
 'deepseek/deepseek-r1 (history)',
 'google/gemini-2.5-pro (belief)',
 'google/gemini-2.5-pro (both)',
 'deepseek/deepseek-r1 (both)',
 'deepseek/deepseek-r1 (belief)']