# 1. Load the Paprika Environments

In [153]:
import sys
import pandas as pd
sys.path.append('../../paprika/') # Remove this when verl and paprika are installed in the same env
from llm_exploration.paprika_config_helper import PaprikaConfigHelper
from verl.interactions.paprika_interaction import PaprikaInteraction
from pprint import pprint as pp
paprika_games = ['twenty_questions', 'guess_my_city', 'murder_mystery', 'customer_service', 'wordle', 'cellular_automata', \
    'mastermind'] # 'battleship', 'minesweeper', 'bandit_bai_fixed_budget' 

Could not import JerichoInferenceEngine, so cannot use it!
Could not import VLLMInferenceEngine, so cannot use it!


In [154]:
import dotenv
dotenv.load_dotenv('../../.env')

True

In [3]:
for env_name in paprika_games:
    config = PaprikaConfigHelper.create_config(env_name)
    config['belief_config']['style'] = 'none'

    interaction = PaprikaInteraction(config={})

    import builtins
    _original_print = builtins.print
    builtins.print = lambda *a, **k: None
    try:
        instance_id = await interaction.start_interaction(
            instance_id=None,
            scenario_id=None, # start a random scenario
            **config,
        )
    finally:
        builtins.print = _original_print
    num_scenarios = interaction.game_scenarios.__len__()
    print(f'{env_name} => {num_scenarios}')

twenty_questions => 367
guess_my_city => 185
murder_mystery => 50
customer_service => 200


wordle => 800
cellular_automata => 500
mastermind => 500


So let us do 50 games for each with frontier LLMs - one with reasoning and one without.

# 2. Setup LLM Call

In [152]:
sys.path.append('../../src/optimal_explorer')
from llm_utils import llm_call
from pprint import pprint as pp

NameError: name 'sys' is not defined

In [8]:
out = await llm_call(
    model='deepseek/deepseek-r1',
    get_everything=True,
    reasoning_effort='high',
    messages = [
            {"role": "system", "content": 'You are a useless assistant that gives humorous answers.'},
            {"role": "user", "content": 'Why is the sky blue?'},
            {"role": "assistant", "content": 'It is because of the great king of Pokemons.'},
            {"role": "user", "content": 'How so?'},
        ]
)

In [9]:
pp(out['choices'][0]['message']['content'], width=130)

('Ah, you’re right to question me—I clearly flunked out of Pokémon Academy’s physics class. Let me *ash*-k you to imagine this '
 'instead: When sunlight (which is white) enters Earth’s atmosphere, tiny molecules scatter the light. Blue light waves are '
 'short and feisty, like a hyper Pikachu dodging a Poké Ball, so they bounce wildly in all directions. This makes the sky look '
 "blue! Meanwhile, red light waves are chill, like Snorlax napping—they just cruise straight through. So, the sky's blueness is "
 'basically Earth’s atmosphere throwing a rave for blue light. Science, but make it ✨ *electric* ✨.')


In [14]:
pp(out['choices'][0]['message']['reasoning'], width=130)

('Okay, the user asked "How so?" after I joked about the sky being blue because of the Pokémon king. They probably want a real '
 'explanation but are playing along with the humor. I should maintain the funny tone but also slip in the actual science. Let me '
 "explain Rayleigh scattering in a whimsical way using Pokémon references. Make sure it's clear that the real reason is "
 'scientific, but keep it light-hearted. Mention Pikachu and maybe some other Pokémon to keep the theme. Avoid getting too '
 "technical but still convey the right information. Check if the analogy makes sense and the user isn't confused. Balance "
 'between humor and accuracy.\n')


# 3. Belief Update

In [15]:
async def update_belief(
        curr_belief: str,
        action: str,
        response: str,
        model_name: str,
    ):


    user_content = f'''\
Look at the current belief and the agent's action and environment response on that belief.\
Compress the context, remove redundant information, and maintain important information about the game state \
needed to take optimal future actions.\
Current belief: {curr_belief}
Agent's action: {action}
Environment's response: {response}
Output the updated belief state inside <BELIEF> and </BELIEF> tags.\
Understand that only the generated belief is fed to the agent, so be sure to include all necessary information about game mechanics.'''

    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": user_content},
    ]

    out = await llm_call(
        model=model_name,
        get_everything=True,
        reasoning_effort='high',
        messages=messages
    )

    import re
    content = out['choices'][0]['message']['content']
    match = re.search(r"<BELIEF>(.*?)</BELIEF>", content, re.DOTALL | re.IGNORECASE)
    if match:
        belief = match.group(1).strip()
    else:
        # fallback: return the whole content if tags not found
        belief = content.strip()
    
    reasoning = out['choices'][0]['message']['reasoning_details'][0]['text']

    return belief, reasoning

In [16]:
async def take_action(
        belief: str,
        model_name: str,
    ):


    user_content = f'''\
Look at the current belief take the next action based on the belief.\
Take an action that leads to optimal exploration.\
Belief: {belief}
Output the action inside <ACTION> and </ACTION> tags.'''

    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": user_content},
    ]

    out = await llm_call(
        model=model_name,
        get_everything=True,
        reasoning_effort='high',
        messages=messages
    )

    import re
    content = out['choices'][0]['message']['content']
    match = re.search(r"<\s*action\s*>(.*?)<\s*/\s*action\s*>", content, re.DOTALL | re.IGNORECASE)
    if match:
        action = match.group(1).strip()
    else:
        # fallback: return the whole content if tags not found
        action = content.strip()
    
    reasoning = out['choices'][0]['message']['reasoning_details'][0]['text']

    return action, reasoning

# 4. Paprika Rollout with Belief

In [17]:
import asyncio

async def run_one_iteration_with_belief_llm(
        env_name: str,
        model_name: str,
        game_id: int,
    ):
    config = PaprikaConfigHelper.create_config(env_name)
    config['belief_config']['style'] = 'none'
    interaction = PaprikaInteraction(config={})

    import builtins
    _original_print = builtins.print
    builtins.print = lambda *a, **k: None
    try:
        instance_id = await interaction.start_interaction(instance_id=None, scenario_id=None, **config)
    finally:
        builtins.print = _original_print

    first_user_message = interaction.agent_conv.messages[0][1]
    attempts = 0
    game_history = []
    belief = f'This is the start of the game. The only available information right now are the game rules:\n{first_user_message}'
    max_attempts = interaction._instance_dict[instance_id]['max_turns']

    while attempts < max_attempts:
        
        attempts += 1

        action, action_reasoning = await take_action(belief, model_name)

        message = [
            {"role": "user", "content": f"Output the next action."},
            {"role": "assistant", "content": f"<action>{action}</action>"}
        ]
        done, response, score, additional_data = await interaction.generate_response(instance_id=instance_id, messages=message)
        
        belief, belief_reasoning = await update_belief(belief, action, response, model_name)

        game_history.append({
            "model": model_name,
            "game_id": str(game_id),
            "env": env_name,
            "attempt": attempts,
            "guess": action,
            "response": response,
            "score": score,
            "done": done,
            "data": additional_data,
            "belief": belief,
            "action_reasoning": action_reasoning,
            "belief_reasoning": belief_reasoning,
        })

        if "Goal reached" in response:
            break
    
    print(f'.', end='', flush=True)
    
    return game_history

async def run_multiple_iterations_multiple_games(
        num_games: int,
        list_envs,
        models,
        logs_file='./logs/paprika_frontier.jsonl',
    ):
    import json

    tasks = []
    for model in models:
        for env_name in list_envs:
            for game_id in range(num_games):
                tasks.append(run_one_iteration_with_belief_llm(env_name, model, game_id))

    results = await asyncio.gather(*tasks)

    # Flatten results and write to file
    with open(logs_file, "a") as f:
        for game_history in results:
            for entry in game_history:
                f.write(json.dumps(entry) + "\n")

In [26]:
await run_multiple_iterations_multiple_games(
    num_games=10,
    list_envs=['mastermind'],
    models=['openai/gpt-oss-120b'],
)

..........

# 5. Metrics

In [1]:
import pandas as pd
from pprint import pprint as pp
import json
from math import sqrt
import plotly
from plotly import subplots

logs_file = './logs/paprika_frontier_v6_corrected.jsonl'
with open(logs_file, 'r') as f:
    data = [json.loads(line) for line in f]
df = pd.DataFrame(data)



In [2]:
print(df.columns)

Index(['model', 'game_id', 'env', 'attempt', 'info', 'raw_guess', 'guess',
       'response', 'word_limit', 'score', 'done', 'data', 'belief',
       'action_reasoning', 'belief_reasoning'],
      dtype='object')


In [3]:
if not df.iloc[0]['word_limit']:
    df['word_limit'] = 'None'

In [4]:
df['info'].value_counts()

info
belief     9604
both       8795
history    8707
Name: count, dtype: int64

In [7]:
rename = {
    'deepseek/deepseek-chat': 'Deepseek V3',
    'deepseek/deepseek-r1': 'Deepseek R1',
    'google/gemini-2.5-pro': 'Gemini 2.5 Pro',
    'history': '',
    'belief': ' (ABBEL)',
    'both': ' (belief prompting)',
}

env_rename = {
    'wordle': 'Wordle',
    'mastermind': 'Mastermind',
    'customer_service': 'Customer Service',
    'twenty_questions': 'Twenty Questions',
    'murder_mystery': 'Murder Mystery',
    'guess_my_city': 'Guess my City',
}

def update_model_info_inplace(df):
    for idx, row in df.iterrows():
        # Temporarily rename for model_info only
        model_disp = rename.get(row['model'], row['model'])
        info_disp = rename.get(row['info'], row['info'])
        df.at[idx, 'model_info'] = f"{model_disp}{info_disp}"
        # Also rename env if mapping exists
        if row['env'] in env_rename:
            df.at[idx, 'env'] = env_rename[row['env']]

update_model_info_inplace(df)

envs_ordered = ['Murder Mystery','Customer Service','Twenty Questions', 'Guess my City','Wordle','Mastermind',]

In [9]:
df['model'] = df['model_info']

In [8]:
df.sample(n=5)

Unnamed: 0,model,game_id,env,attempt,info,raw_guess,guess,response,word_limit,score,done,data,belief,action_reasoning,belief_reasoning,model_info
17279,google/gemini-2.5-pro,26,Murder Mystery,16,history,"Answer:Re-interrogate Massimo, presenting him ...","Answer:Re-interrogate Massimo, presenting him ...",Massimo's composure shatters as you present th...,,17.0,True,{'agent_game_scenario': 'You are a detective i...,,**Defining Next Steps**\n\nI'm focused on the ...,,Gemini 2.5 Pro
23661,deepseek/deepseek-r1,35,Customer Service,2,history,<Answer> \n10. Can you check the vibration se...,10. Can you check the vibration sensor’s calib...,10. I cannot check the calibration logs or run...,,3.0,True,{'agent_game_scenario': 'The operator reports ...,,"Okay, let's see. The customer is dealing with ...",,Deepseek R1
4855,deepseek/deepseek-r1,6,Murder Mystery,7,history,<Answer>Examine the bloodied wrench found unde...,Examine the bloodied wrench found under the bo...,"As you examine the bloodied wrench closely, yo...",,8.0,True,{'agent_game_scenario': 'You are a detective c...,,"Okay, let's see. I need to figure out the next...",,Deepseek R1
9256,deepseek/deepseek-chat,15,Murder Mystery,3,history,<Answer>Interview nurse Ingrid to gather infor...,Interview nurse Ingrid to gather information a...,Nurse Ingrid appears somewhat reluctant at fir...,,4.0,True,{'agent_game_scenario': 'You are a detective i...,,,,Deepseek V3
628,google/gemini-2.5-pro,0,Customer Service,8,belief,<Answer> I see. Thank you for trying that for ...,I see. Thank you for trying that for me. Since...,"I found the serial number, and it's 123456789.",,9.0,True,{'agent_game_scenario': 'The customer's electr...,The customer's electronic door lock is complet...,**Initiating Customer Interaction**\n\nI'm sta...,**Defining the Scenario**\n\nI'm zeroing in on...,Gemini 2.5 Pro (ABBEL)


## 1. Success Rates (belief v. history v. both)

In [14]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np

def summarize_game_outcomes(df):
    df['response'] = df['response'].str.lower()
    grouped = df.groupby(['model', 'game_id', 'env', 'word_limit'])['response'].apply(
        lambda responses: any('goal reached' in resp for resp in responses)
    ).reset_index(name='won')
    return grouped

def plot_win_rates(summary_df):
    # Compute mean and std of win rates for each group
    stats = summary_df.groupby(['env', 'model', 'word_limit'])['won'].agg(['mean', 'std', 'count']).reset_index()
    stats['success_rate'] = stats['mean'] * 100
    # Standard error of the mean (SEM)
    stats['sem'] = stats['std'] / np.sqrt(stats['count'])
    stats['sem'] = stats['sem'].fillna(0)
    stats['success_rate_sem'] = stats['sem'] * 100

    # envs = stats['env'].unique()[[0, 5, 1, 4, 2, 3]]
    envs = envs_ordered
    models = stats['model'].unique()[[
            3, 5, 4, 
            0, 2, 1,
            6, 8, 7
        ]]
    word_limits = sorted(stats['word_limit'].unique())

    colors = [
        "#ADD8E6",  # light blue
        "#4682B4",  # medium blue
        "#003366",  # dark blue
        "#FFB6C1",  # light red
        "#FF6347",  # medium red
        "#8B0000",  # dark red
        "#90EE90",  # light green
        "#32CD32",  # medium green
        "#006400",  # dark green
    ]
    color_map = {model: colors[i % len(colors)] for i, model in enumerate(models)}

    # Create subplots: rows = word_limit values, cols = envs
    fig = make_subplots(
        rows=len(word_limits), cols=len(envs),
        subplot_titles=[f"{env}" for env in envs],
        shared_yaxes=True,
        vertical_spacing=0.2 / len(word_limits),
        horizontal_spacing=0.03
    )

    for r, wl in enumerate(word_limits, start=1):
        for c, env in enumerate(envs, start=1):
            for model in models:
                subset = stats[
                    (stats['env'] == env) &
                    (stats['model'] == model) &
                    (stats['word_limit'] == wl)
                ]
                if not subset.empty:
                    # Plot bar with error bar (standard error of mean)
                    fig.add_trace(
                        go.Bar(
                            x=[model],
                            y=subset['success_rate'],
                            name=model,
                            marker_color=color_map[model],
                            width=0.8,
                            showlegend=(r == 1 and c == 1),
                            error_y=dict(
                                type='data',
                                array=subset['success_rate_sem'],
                                visible=True,
                                color='black',
                                thickness=1,
                                width=4,
                            ),
                            hovertemplate=(
                                f"Env: {env}<br>"
                                f"Word Limit: {wl}<br>"
                                f"Model: {model}<br>"
                                f"Success Rate: {{y:.2f}}%<br>"
                                f"SEM: {subset['success_rate_sem'].values[0]:.2f}%<br>"
                                f"N: {subset['count'].values[0]}"
                            ),
                        ),
                        row=r, col=c
                    )
            # # Add row label for word_limit
            # if c == 1:
            #     fig.add_annotation(
            #         text=f"Word Limit: {wl}",
            #         xref="paper",
            #         yref="paper",
            #         x=0.1,
            #         y=0.95 - ((r - 1) / len(word_limits)),
            #         showarrow=False,
            #         font=dict(size=14)
            #     )

    # Update y-axis
    for r in range(1, len(word_limits) + 1):
        for c in range(1, len(envs) + 1):
            fig.update_yaxes(
                range=[0, 100],
                showgrid=True,
                gridcolor='lightgray',
                row=r, col=c
            )

    # Update x-axis to remove tick labels
    for r in range(1, len(word_limits) + 1):
        for c in range(1, len(envs) + 1):
            fig.update_xaxes(
                showticklabels=False,
                row=r, col=c
            )

    # Fixed subplot size
    fig_width = 180 * len(envs)
    fig_height = 200 * len(word_limits)

    # Layout with legend on the right
    fig.update_layout(
        height=fig_height + 150,
        width=fig_width,
        template='simple_white',
        font=dict(family='Computer Modern, serif', size=16),
        barmode='group',
        showlegend=True,
        legend=dict(
            # title={'text': ' Model (info)'},
            orientation='v',
            yanchor='middle',
            y=0.5,
            xanchor='left',
            x=1.02,
            bgcolor='rgba(255,255,255,0.9)',
            bordercolor='black',
            borderwidth=1
        ),
        margin=dict(t=100, b=50, l=80, r=80),
        plot_bgcolor='white'
    )

    fig.update_yaxes(title_text="Avg. Success Rate", row=1, col=1)

    return fig

In [15]:
df_belief = df.loc[df['info'] == 'belief'].copy()
df_history = df.loc[df['info'] == 'history'].copy()
summary_df = summarize_game_outcomes(df)
summary_df_belief = summarize_game_outcomes(df_belief)
summary_df_history = summarize_game_outcomes(df_history)

In [16]:
fig = plot_win_rates(summary_df)
fig.show()

In [22]:
fig.write_image("figures/frontier_success_rate_right_legend.pdf", width=1200, height=370)

## 2. Qualitative Look at Samples

In [13]:
wordle_sample = df[
    (df['model'] == 'Deepseek R1 (ABBEL)') & \
        (df['env'] == 'Wordle') & \
            (df['game_id'] == str(0))
            ]
wordle_sample

Unnamed: 0,model,game_id,env,attempt,info,raw_guess,guess,response,word_limit,score,done,data,belief,action_reasoning,belief_reasoning,model_info
889,Deepseek R1 (ABBEL),0,Wordle,1,belief,<Think> Choose a second guess that maximizes n...,CRONY,"First letter, c, is not in the target word \nS...",,1.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...","Excluded letters: C, O, N, Y \nIncluded lette...","Okay, so I'm on my second attempt in Wordle. T...","Alright, let me try to figure out the next ste...",Deepseek R1 (ABBEL)
890,Deepseek R1 (ABBEL),0,Wordle,2,belief,<Think>Goal: Confirm R's position and test com...,STARE,"First letter, s, is not in the target word \nS...",,2.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...","Excluded letters: C, O, N, Y, S, T, E \nInclu...","Okay, so I'm on my third attempt in Wordle. Le...","Okay, so let me try to figure out the next ste...",Deepseek R1 (ABBEL)
891,Deepseek R1 (ABBEL),0,Wordle,3,belief,<Think> The target word must have A in positio...,GUARD,Goal reached,,3.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...","Excluded letters: C, O, N, Y, S, T, E \nInclu...","Okay, let's see. I need to figure out the next...","Okay, let me figure out how to update the beli...",Deepseek R1 (ABBEL)


In [17]:
wordle_sample.iloc[0]['data']['env_game_scenario']

'guard'

In [14]:
wordle_sample.iloc[0]['belief']

'Excluded letters: C, O, N, Y  \nIncluded letters: R (must be in position 1, 3, 4, or 5)  \nTarget word contains R and excludes C, O, N, Y.'

In [16]:
wordle_sample.iloc[1]['response']

'First letter, s, is not in the target word \nSecond letter, t, is not in the target word \nThird letter, a, is correct and in the correct position in the target word \nFourth letter, r, is correct and in the correct position in the target word \nFifth letter, e, is not in the target word'

In [15]:
wordle_sample.iloc[1]['belief']

'Excluded letters: C, O, N, Y, S, T, E  \nIncluded letters: A (position 3), R (position 4)  \nTarget word contains A and R, excludes the listed letters, and has A in position 3 and R in position 4.'

## Compare two models evaluated on the same scenario

In [86]:
GP = set([v['env_game_scenario'] for v in df[
    (df['model'] == 'Gemini 2.5 Pro (ABBEL)') & \
        (df['env'] == 'Twenty Questions') & \
            (df['info'] == 'belief') & \
            (df['attempt'] == 1)
            ]['data'].values])
DR = set([v['env_game_scenario'] for v in df[
    (df['model'] == 'Deepseek R1 (ABBEL)') & \
        (df['env'] == 'Twenty Questions') & \
            (df['info'] == 'belief') & \
            (df['attempt'] == 1)
            ]['data'].values])
GP.intersection(DR)

In [97]:
step = 15
game_sample = df[
    (df['model'] == 'Gemini 2.5 Pro (ABBEL)') & \
        (df['env'] == 'Twenty Questions') & \
            (df['info'] == 'belief') & \
            (df['attempt'] == step) & \
            (df['data'].apply(lambda x: x.get('env_game_scenario') == 'Waistcoat'))
            ]
print(len(game_sample))
from pprint import pprint
pprint(game_sample['belief'].values[0])

game_sample = df[
    (df['model'] == 'Deepseek R1 (ABBEL)') & \
        (df['env'] == 'Twenty Questions') & \
            (df['info'] == 'belief') & \
            (df['attempt'] == step) & \
            (df['data'].apply(lambda x: x.get('env_game_scenario') == 'Waistcoat'))
            ]
print(len(game_sample))
from pprint import pprint
print(game_sample['belief'].values[0])

1
('I am playing a game of 20 Questions to identify a piece of clothing. I have '
 'now used 15 out of 20 questions.\n'
 '\n'
 "Here's what I know so far:\n"
 '*   **Fact 1:** The item is worn on the upper body.\n'
 '*   **Fact 2:** It is NOT typically worn as outerwear.\n'
 '*   **Fact 3:** It typically has a collar.\n'
 '*   **Fact 4:** It typically has buttons all the way down the front.\n'
 '*   **Fact 5:** It is typically worn by men.\n'
 '*   **Fact 6:** It is typically worn for formal or business occasions.\n'
 '*   **Fact 7:** It has short sleeves.\n'
 '*   **Fact 8:** It features decorative vertical pleats or embroidery on the '
 'front.\n'
 '*   **Fact 9:** It is NOT a Guayabera shirt.\n'
 '*   **Fact 10:** It is NOT considered a form of national dress for a '
 'specific country.\n'
 '*   **Fact 11:** It is specifically worn with a tuxedo.\n'
 '*   **Fact 12:** It is NOT called a "short-sleeved tuxedo shirt".\n'
 '*   **Fact 13:** Its decorative front does NOT feature ruffles

In [76]:
pp(wordle_sample.iloc[2].to_dict())

{'action_reasoning': '**Analyzing the Clues**\n'
                     '\n'
                     "I'm on attempt 4 and starting to feel the pressure! The "
                     'word is five letters, begins with "S," and includes "T" '
                     'and "E." I\'ve ruled out "T" in the second and fourth '
                     'positions. I am close to a breakthrough, maybe in the '
                     'next guess?\n'
                     '\n'
                     '\n'
                     '**Considering the Options**\n'
                     '\n'
                     "I've narrowed down the possibilities further. Attempt 5 "
                     'is here. I now have "S _ T _ _" or "S _ _ T _" or "_ _ _ '
                     '_". I am focusing on the possible placements of "E" now, '
                     'with the goal of limiting the letter options in the '
                     'process, as well as trying to eliminate the letters I '
                     'already know cannot wor

In [15]:
# can't do with belief but can do with history -> look at the full trajectory with reasoning and try to see why it can't do it with belief

## 3. Sizes of Belief States

In [10]:
df_belief = df[df['info'] == 'belief']

In [8]:
for idx, row in df_belief.iterrows():
    if row.env == 'Mastermind':
        d = row['data']
        print(str(d.get('conversation', [])[2:]))
        print(len(str(d.get('conversation', [])[2:])))
        break

[{'role': 'assistant', 'content': '0 1 2 3'}, {'role': 'user', 'content': "Your last guess has 0 exact matches with the secret code. In other words, exactly 0 digit(s) in your last guess, 0 1 2 3, are in the correct position in the secret code. (We won't reveal the particular digits within your guess that are exact matches, they can be any digit within your guess) Your last guess also has 1 partial matches. In other words, 1 digits in your guess, 0 1 2 3, are in the secret code, but in the wrong position. (We won't reveal which digits within your guess are partial matches, they can be any, you must deduce them with reasoning and further guesses and feedbacks.)\n\n\nNow make your next guess about the secret code. Please format your response as: <Think> Any step-by-step, short and concise thinking to determine what the next guess should be </Think>\n <Answer> your guess on the 4 digit code </Answer>"}]
913


In [9]:
history_length = df_belief['data'].apply(lambda d: len(str(d.get('conversation', [])[2:])))

In [10]:
df_belief['history_length'] = history_length

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_belief['history_length'] = history_length


In [14]:
df_belief.iloc[0]['data']

{'agent_game_scenario': 'word',
 'env_game_scenario': 'witty',
 'goal_reached': False,
 'judge_label': True,
 'num_turns': 1,
 'max_turns': 6,
 'env_first_message': 'witty',
 'conversation': [{'role': 'system',
   'content': 'You are a helpful assistant.'},
  {'role': 'user',
   'content': 'You are playing a game of Wordle.\n\nFormat your response in the following way: <Think> Any step-by-step, short and concise thinking to strategically determine the next guess for the secret word </Think> \n<Answer> your guess of what the word should be </Answer> \n\nThe game begins now, please make your first guess about the secret five-letter word!'},
  {'role': 'assistant', 'content': 'crane'},
  {'role': 'user',
   'content': 'First letter, c, is not in the target word \nSecond letter, r, is not in the target word \nThird letter, a, is not in the target word \nFourth letter, n, is not in the target word \nFifth letter, e, is not in the target word\n\n\nMake your next guess about the hidden word. 

In [15]:
df_belief.iloc[142]['data']['conversation'][2:]

[{'role': 'assistant',
  'content': "i'm sorry to hear you're having trouble with your rice cooker. to start, when you plug it in and press the cook button, do any of the indicator lights on the unit turn on?"},
 {'role': 'user',
  'content': 'No, the indicator lights don’t turn on when I press the cook button.'},
 {'role': 'assistant',
  'content': 'i see. thanks for that information. could you please double-check that the power cord is plugged in securely, both into the wall outlet and into the back of the rice cooker?'},
 {'role': 'user',
  'content': 'I double-checked, and the power cord is plugged in securely.'},
 {'role': 'assistant',
  'content': 'thank you for confirming that. to help figure out if the issue is with the wall outlet, could you please try plugging a different small appliance, like a lamp or a phone charger, into that same outlet to see if it gets power?'},
 {'role': 'user',
  'content': 'I tried plugging in a lamp, and it works fine, so the outlet is good.'},
 {'

In [28]:
def plot_belief_length(df):
    envs = envs_ordered
    go = plotly.graph_objects
    make_subplots = plotly.subplots.make_subplots

    length = df['belief'].astype(str).str.len()
    history_length = df_belief['data'].apply(lambda d: len(str(d.get('conversation', [])[2:])))
    df2 = df.assign(_belief_len=length, _history_len=history_length)
    agg = df2.groupby(['env','model','attempt'])._belief_len.agg(['mean','std','count']).reset_index()
    agg['sem'] = agg['std'] / agg['count'].apply(lambda n: sqrt(n) if n > 0 else 1)

    colors = [
        "#FF6347",  # red
        "#4682B4",  # blue
        "#32CD32",  # green
    ]

    fig = make_subplots(rows=1, cols=len(envs), shared_yaxes=False, subplot_titles=envs)
    
    # Set y-axis limits for each subplot
    ylims = [[0,6000], [0,2500],[0,3000],[0,5000],[0,1000],[0,4000]]
    for col_idx in range(1, len(envs) + 1):
        fig.update_yaxes(range=ylims[col_idx-1], row=1, col=col_idx)

    model_color_map = {
        'Deepseek V3 (ABBEL)': '#4682B4',
        'Deepseek R1 (ABBEL)': '#FF6347',
        'Gemini 2.5 Pro (ABBEL)': '#32CD32',
    }

    models_seen = set()
    color_idx = 0
    for col_idx, env in enumerate(envs, start=1):

        # Plot the average history length for each env in gray (across all models and games, per step)
        sub_hist = df2[df2['env'] == env].groupby('attempt')['_history_len'].agg(['mean', 'std', 'count']).reset_index()
        sub_hist['sem'] = sub_hist['std'] / sub_hist['count'].apply(lambda n: sqrt(n) if n > 0 else 1)

         # Add shaded region for error bounds
        error_lower = sub_hist['mean'] - sub_hist['sem']
        error_upper = sub_hist['mean'] + sub_hist['sem']
        x = list(sub_hist['attempt'])
        # Add the lower error bound trace
        fig.add_trace(go.Scatter(
            x=x,
            y=error_lower,
            mode='lines',
            line=dict(width=0), # Hide the line for the lower bound
            showlegend=False,
            hoverinfo="skip",
            legendgroup='history'
        ),row=1, col=col_idx)

        # Add the upper error bound trace and fill to the lower bound
        fig.add_trace(go.Scatter(
            x=x,
            y=error_upper,
            mode='lines',
            fill='tonexty', # Fill the area between this trace and the previous one
            fillcolor='rgba(100,100,100,0.2)',
            line=dict(width=0), # Hide the line for the upper bound
            hoverinfo="skip",
            showlegend=False,
            legendgroup='history'
        ),
        row=1, col=col_idx)
        
        # Add main history line
        fig.add_trace(
            go.Scatter(
                x=sub_hist['attempt'],
                y=sub_hist['mean'],
                mode='lines+markers',
                name='Mean history length',
                legendgroup='history',
                showlegend=(col_idx == 1),  # only show legend once
                line=dict(color='gray', width=2, dash='dot'),
                marker=dict(color='gray')
            ),
            row=1, col=col_idx
        )

        sub = agg[agg['env'] == env]
        for model in sub['model'].unique():
            mdf = sub[sub['model'] == model].sort_values('attempt')
            if model not in model_color_map:
                model_color_map[model] = colors[color_idx % len(colors)]
                color_idx += 1
            show_legend = model not in models_seen
            models_seen.add(model)
        
            # Add shaded region for error bounds
            error_lower = mdf['mean'] - mdf['sem']
            error_upper = mdf['mean'] + mdf['sem']
            x = list(mdf['attempt'])
            # Add the lower error bound trace
            fig.add_trace(go.Scatter(
                x=x,
                y=error_lower,
                mode='lines',
                line=dict(width=0), # Hide the line for the lower bound
                showlegend=False,
                hoverinfo="skip",
                legendgroup=str(model)
            ),row=1, col=col_idx)

            # Add the upper error bound trace and fill to the lower bound
            fig.add_trace(go.Scatter(
                x=x,
                y=error_upper,
                mode='lines',
                fill='tonexty', # Fill the area between this trace and the previous one
                fillcolor=f"rgba({int(model_color_map[model][1:3], 16)}, {int(model_color_map[model][3:5], 16)}, {int(model_color_map[model][5:7], 16)}, 0.2)",
                line=dict(width=0), # Hide the line for the upper bound
                hoverinfo="skip",
                showlegend=False,
                legendgroup=str(model)
            ),
            row=1, col=col_idx)
            
            # Add main line
            fig.add_trace(
                go.Scatter(
                    x=mdf['attempt'],
                    y=mdf['mean'],
                    mode='lines+markers',
                    name=str(model.split('(')[0]),
                    legendgroup=str(model),
                    showlegend=show_legend,
                    line=dict(color=model_color_map[model], width=2)
                ),
                row=1, col=col_idx
            )

        fig.update_xaxes(
            title_text='Step',
            title_standoff=0,
            color='black',  # black axis
            showline=True,
            linecolor='black',
            linewidth=1,
            showgrid=False,
            row=1, col=col_idx
        )

        fig.update_yaxes(
            title_text='Mean Belief Length (chars)' if col_idx == 1 else None,
            showline=True,
            linecolor='black',
            linewidth=1,
            tickfont=dict(color='black'),
            showgrid=True,
            gridcolor='lightgray',
            row=1, col=col_idx
        )

    fig.update_layout(
        width=1200,
        height=400,
        paper_bgcolor='white',
        plot_bgcolor='white',
        font=dict(family='Times New Roman, serif', size=14),
        legend=dict(
            orientation='v',   # vertical
            yanchor='middle',
            y=0.5,
            xanchor='left',
            x=1.02,
            bordercolor='black',
            borderwidth=1,
            bgcolor='rgba(255,255,255,0.9)'
        )
    )

    return fig


In [29]:
fig = plot_belief_length(df_belief)

In [30]:
fig.show()

In [32]:
fig.write_image('figures/frontier_belief_length_v2.pdf', width=1200, height=350)

## 4. Greediness Metrics

In [147]:
df_mastermind = df[df['env'] == 'mastermind']
df_wordle = df[df['env'] == 'wordle']

In [148]:
df_wordle.loc[:, 'guess'] = df_wordle['guess'].apply(lambda x: ' '.join(list(str(x))))

In [None]:
df_wordle

Unnamed: 0,model,game_id,env,attempt,info,raw_guess,guess,response,word_limit,score,done,data,belief,action_reasoning,belief_reasoning,model_info
2,deepseek/deepseek-chat (both),1,wordle,1,both,"<Think> Since this is the first guess, the goa...",C R A N E,"First letter, c, is not in the target word \nS...",,1.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...",The target word does not contain the letters C...,,,deepseek/deepseek-chat (both)
3,deepseek/deepseek-chat (both),1,wordle,2,both,"<Think> Since C, R, A, and N are not in the wo...",S L O P E,"First letter, s, is correct and in the correct...",,2.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...",The target word starts with 'S'. It contains t...,,,deepseek/deepseek-chat (both)
4,deepseek/deepseek-chat (both),1,wordle,3,both,<Think> Based on the history and belief state:...,S P I E D,Goal reached,,3.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...","The target word is ""SPIED"".",,,deepseek/deepseek-chat (both)
5,deepseek/deepseek-chat (history),0,wordle,1,history,"<Think> Since this is the first guess, I need ...",C R A N E,"First letter, c, exists in the target word but...",,1.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...",,,,deepseek/deepseek-chat (history)
6,deepseek/deepseek-chat (history),0,wordle,2,history,"<Think> \nFrom the first guess ""CRANE"":\n- Th...",C L O S E,"First letter, c, exists in the target word but...",,2.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...",,,,deepseek/deepseek-chat (history)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24910,deepseek/deepseek-r1 (belief),39,wordle,2,belief,"<Think> The excluded letters are S, P, L, I, T...",C R A N E,"First letter, c, is not in the target word \nS...",,2.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...","Excluded letters: S, P, L, I, T, C, N, E. \nCo...","Okay, so I'm on my third attempt in Wordle, an...","Okay, let's see. The current belief says that ...",deepseek/deepseek-r1 (belief)
24911,deepseek/deepseek-r1 (belief),39,wordle,3,belief,"<Think> The secret word contains R and A, not ...",R U M B A,"First letter, r, exists in the target word but...",,3.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...","Excluded letters: S, P, L, I, T, C, N, E, U, B...","Okay, let's see. So it's attempt 4 out of 6, a...","Okay, let's update the belief based on the lat...",deepseek/deepseek-r1 (belief)
24912,deepseek/deepseek-r1 (belief),39,wordle,4,belief,"<Think>\nThe secret word must contain R, A, M,...",M A J O R,"First letter, m, is correct and in the correct...",,4.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...","Excluded letters: S, P, L, I, T, C, N, E, U, B...","Okay, let's see. I need to figure out the next...","Okay, let's break down the problem. The user i...",deepseek/deepseek-r1 (belief)
24913,deepseek/deepseek-r1 (belief),39,wordle,5,belief,"<Think> Since this is the final attempt, prior...",M A R V Y,"First letter, m, is correct and in the correct...",,5.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...","Excluded letters: S, P, L, I, T, C, N, E, U, B...","Okay, let's try to figure out the best guess h...","Okay, let me try to update the belief based on...",deepseek/deepseek-r1 (belief)


In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

def plot_action_repetition(df):
    """
    Create a bar chart showing greediness for each model.
    
    Greediness is defined as the fraction of times a model repeats
    a particular guess at a particular position across games.
    
    Parameters:
    df (pd.DataFrame): DataFrame with columns 'game_id', 'attempt', 'model_info', 'guess'
    
    Returns:
    plotly.graph_objects.Figure: Bar chart figure
    """
    
    # Fixed colors for different models
    colors = [
        "#ADD8E6",  # light blue
        "#4682B4",  # medium blue
        "#003366",  # dark blue
        "#FFB6C1",  # light red
        "#FF6347",  # medium red
        "#8B0000",  # dark red
        "#90EE90",  # light green
        "#32CD32",  # medium green
        "#006400",  # dark green
    ]
    
    greediness_results = []
    
    # Group by model_info
    for model_idx, (model, model_data) in enumerate(df.groupby('model_info')):
        # Group by game_id to get sequences of guesses
        game_sequences = []
        for game_id, game_data in model_data.groupby('game_id'):
            # Sort by attempt to get the correct sequence
            sequence = game_data.sort_values('attempt')['guess'].tolist()
            game_sequences.append(sequence)
        
        # Calculate greediness for each game separately to get variance
        game_greediness_values = []
        
        # For each game, calculate its greediness
        for seq in game_sequences:
            game_total_greediness = 0
            game_total_positions = 0
            
            # For each position in this game's sequence
            for pos in range(len(seq)):
                # Get all guesses at this position across ALL games for comparison
                guesses_at_position = []
                for other_seq in game_sequences:
                    if pos < len(other_seq):
                        guesses_at_position.append(other_seq[pos])
                
                if len(guesses_at_position) > 1:
                    # Count repeats
                    unique_guesses = len(set(guesses_at_position))
                    total_guesses = len(guesses_at_position)
                    
                    # Greediness = 1 - (unique_guesses / total_guesses)
                    position_greediness = 1 - (unique_guesses / total_guesses)
                    
                    game_total_greediness += position_greediness
                    game_total_positions += 1
            
            # Average greediness for this game
            if game_total_positions > 0:
                game_avg_greediness = (game_total_greediness / game_total_positions * 100)
                game_greediness_values.append(game_avg_greediness)
        
        # Calculate mean and standard error
        if game_greediness_values:
            avg_greediness = np.mean(game_greediness_values)
            std_error = np.std(game_greediness_values, ddof=1) / np.sqrt(len(game_greediness_values)) if len(game_greediness_values) > 1 else 0
        else:
            avg_greediness = 0
            std_error = 0
        
        greediness_results.append({
            'model': model,
            'greediness': avg_greediness,
            'std_error': std_error,
            'color': colors[model_idx % len(colors)]
        })

    desired_order = [0, 2, 1, 3, 5, 4, 6, 8, 7]
    greediness_results = [greediness_results[i] for i in desired_order]
    
    # Create the bar chart
    fig = go.Figure()
    
    # Add bars
    models = [result['model'] for result in greediness_results]
    greediness_values = [result['greediness'] for result in greediness_results]
    error_values = [result['std_error'] for result in greediness_results]
    bar_colors = [result['color'] for result in greediness_results]
    
    fig.add_trace(go.Bar(
        x=models,
        y=greediness_values,
        error_y=dict(
            type='data',
            array=error_values,
            visible=True,
            color='black',
            thickness=1.5,
            width=3
        ),
        marker_color=bar_colors,
        marker_line_color='black',
        marker_line_width=1,
        width=0.6
    ))
    
    # Update layout for professional appearance
    fig.update_layout(
        title={
            'text': '',
            'x': 0.5,
            'font': {'family': 'Computer Modern, serif', 'size': 16}
        },
        xaxis={
            'title': 'Model + Info (Wordle)',
            'title_font': {'family': 'Computer Modern, serif', 'size': 14},
            'tickfont': {'family': 'Computer Modern, serif', 'size': 14},
            'showgrid': False,
            'showline': True,
            'linewidth': 1,
            'linecolor': 'black',
            'mirror': True
        },
        yaxis={
            'title': 'Action Repetition',
            'title_font': {'family': 'Computer Modern, serif', 'size': 14},
            'tickfont': {'family': 'Computer Modern, serif', 'size': 14},
            'showgrid': False,
            'showline': True,
            'linewidth': 1,
            'linecolor': 'black',
            'mirror': True,
            'range': [0, max([g + e for g, e in zip(greediness_values, error_values)]) * 1.1 if greediness_values else 100]
        },
        plot_bgcolor='white',
        paper_bgcolor='white',
        font={'family': 'Computer Modern, serif'},
        width=800,
        height=500,
        margin=dict(l=80, r=50, t=80, b=80)
    )
    
    # Add grid lines manually for a cleaner look
    fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor='lightgray')
    
    return fig

In [None]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

def plot_greediness(df):
    """
    Create a bar chart showing greediness for each model.

    Greediness is defined as the fraction of times the correct guess is made
    for any position (average across rows and positions).

    Parameters:
    df (pd.DataFrame): DataFrame with columns 'game_id', 'attempt', 'model_info', 'guess'

    Returns:
    plotly.graph_objects.Figure: Bar chart figure
    """

    # Fixed colors for different models
    colors = [
        "#ADD8E6",  # light blue
        "#4682B4",  # medium blue
        "#003366",  # dark blue
        "#FFB6C1",  # light red
        "#FF6347",  # medium red
        "#8B0000",  # dark red
        "#90EE90",  # light green
        "#32CD32",  # medium green
        "#006400",  # dark green
    ]

    greediness_results = []

    # Group by model_info
    for model_idx, (model, model_data) in enumerate(df.groupby('model_info')):
        per_row_fractions = []
        for row_id in range(len(model_data)):
            # Get target and guess
            # Fix: check if 'data' is a dict and contains 'env_game_scenario'
            data_field = model_data.iloc[row_id]['data']
            if isinstance(data_field, dict) and 'env_game_scenario' in data_field:
                target = str(data_field['env_game_scenario'])
            else:
                # fallback: skip this row if not present
                continue
            guess_str = str(model_data.iloc[row_id]['guess'])
            guess = guess_str.split(' ')
            # Only compare up to the length of the shorter of guess/target
            n_positions = min(len(target), len(guess))
            if n_positions == 0:
                continue
            correct = 0
            for idx in range(n_positions):
                if str(guess[idx]) == str(target[idx]):
                    correct += 1
            per_row_fractions.append(correct / n_positions)
        if len(per_row_fractions) == 0:
            avg_greediness = 0.0
            std_error = 0.0
        else:
            avg_greediness = np.mean(per_row_fractions)
            std_error = np.std(per_row_fractions, ddof=1) / np.sqrt(len(per_row_fractions)) if len(per_row_fractions) > 1 else 0.0
        greediness_results.append({
            'model': model,
            'greediness': avg_greediness,
            'std_error': std_error,
            'color': colors[model_idx % len(colors)]
        })

    # Optionally reorder for display
    desired_order = [0, 2, 1, 3, 5, 4, 6, 8, 7]
    if len(greediness_results) == len(desired_order):
        greediness_results = [greediness_results[i] for i in desired_order]

    # Create the bar chart
    fig = go.Figure()

    # Add bars
    models = [result['model'] for result in greediness_results]
    greediness_values = [result['greediness'] for result in greediness_results]
    error_values = [result['std_error'] for result in greediness_results]
    bar_colors = [result['color'] for result in greediness_results]

    fig.add_trace(go.Bar(
        x=models,
        y=greediness_values,
        error_y=dict(
            type='data',
            array=error_values,
            visible=True,
            color='black',
            thickness=1.5,
            width=3
        ),
        marker_color=bar_colors,
        marker_line_color='black',
        marker_line_width=1,
        width=0.6
    ))

    # Update layout for professional appearance
    fig.update_layout(
        title={
            'text': '',
            'x': 0.5,
            'font': {'family': 'Computer Modern, serif', 'size': 16}
        },
        xaxis={
            'title': 'Model + Info (Wordle)',
            'title_font': {'family': 'Computer Modern, serif', 'size': 14},
            'tickfont': {'family': 'Computer Modern, serif', 'size': 14},
            'showgrid': False,
            'showline': True,
            'linewidth': 1,
            'linecolor': 'black',
            'mirror': True
        },
        yaxis={
            'title': 'Greediness (Correct)',
            'title_font': {'family': 'Computer Modern, serif', 'size': 14},
            'tickfont': {'family': 'Computer Modern, serif', 'size': 14},
            'showgrid': False,
            'showline': True,
            'linewidth': 1,
            'linecolor': 'black',
            'mirror': True,
            'range': [0, max([g + e for g, e in zip(greediness_values, error_values)]) * 1.1 if greediness_values else 100]
        },
        plot_bgcolor='white',
        paper_bgcolor='white',
        font={'family': 'Computer Modern, serif'},
        width=800,
        height=500,
        margin=dict(l=80, r=50, t=80, b=80)
    )

    # Add grid lines manually for a cleaner look
    fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor='lightgray')

    return fig

In [142]:
models_list = [
    'deepseek/deepseek-chat (belief)',
    'deepseek/deepseek-chat (history)',
    'deepseek/deepseek-chat (both)',
    'deepseek/deepseek-r1 (belief)'
    'deepseek/deepseek-r1 (history)',
    'deepseek/deepseek-r1 (both)',
    'google/gemini-2.5-pro (belief)',
    'google/gemini-2.5-pro (history)',
    'google/gemini-2.5-pro (both)',
    ]

In [143]:
fig_ar_mm = plot_action_repetition(df_mastermind)
fig_gr_mm = plot_greediness(df_mastermind)
fig_ar_wd = plot_action_repetition(df_wordle)
fig_gr_wd = plot_greediness(df_wordle)

In [146]:
fig_gr_wd.show()

In [114]:
list(df.model_info.unique())

['deepseek/deepseek-chat (history)',
 'deepseek/deepseek-chat (both)',
 'google/gemini-2.5-pro (history)',
 'deepseek/deepseek-chat (belief)',
 'deepseek/deepseek-r1 (history)',
 'google/gemini-2.5-pro (belief)',
 'google/gemini-2.5-pro (both)',
 'deepseek/deepseek-r1 (both)',
 'deepseek/deepseek-r1 (belief)']

## 5. Sizes of Reasoning Traces

In [35]:
# df_reasoning = df[(~df['model'].str.contains('deepseek/deepseek-chat')) & (df['info'] == 'belief')]
# df_reasoning = df[
#     (~df['model'].str.contains('deepseek/deepseek-chat')) &
#     (df['info'].isin(['belief', 'history']))
# ]
df_reasoning = df[~df['model'].str.contains('V3')]
len(df_reasoning), len(df)

(17283, 27106)

In [36]:
def plot_belief_length(df, reasoning_types = ['Belief Reasoning', 'Action Reasoning']):
    envs = envs_ordered
    go = plotly.graph_objects
    make_subplots = plotly.subplots.make_subplots

    # Compute lengths for both reasoning fields
    belief_reasoning_length = df['belief_reasoning'].astype(str).str.len()
    action_reasoning_length = df['action_reasoning'].astype(str).str.len()

    # Prepare dataframes for aggregation
    df_belief = df.assign(_reasoning_len=belief_reasoning_length, _reasoning_type='Belief Reasoning')
    df_action = df.assign(_reasoning_len=action_reasoning_length, _reasoning_type='Action Reasoning')
    df_total = df.assign(_reasoning_len=belief_reasoning_length + action_reasoning_length, _reasoning_type='Total Reasoning')
    df2 = pd.concat([df_belief, df_action, df_total], ignore_index=True)

    agg = (
        df2.groupby(['env', 'model', 'attempt', '_reasoning_type'])
        ._reasoning_len.agg(['mean', 'std', 'count'])
        .reset_index()
    )
    agg['sem'] = agg['std'] / agg['count'].apply(lambda n: sqrt(n) if n > 0 else 1)

    colors = [
        "#8B0000",  # dark red
        "#FF6347",  # medium red
        "#FFB6C1",  # light red
        "#90EE90",  # light green
        "#006400",  # dark green
        "#32CD32",  # medium green
        "#ADD8E6",  # light blue
        "#4682B4",  # medium blue
        "#003366",  # dark blue
    ]

    # Two rows: 1 for belief_reasoning, 2 for action_reasoning, but reduce vertical space and share axes
    fig = make_subplots(
        rows=2,
        cols=len(envs),
        shared_yaxes=False,
        shared_xaxes=False,
        subplot_titles=[f"{env}" for env in envs],
        row_titles=["", ""],
        vertical_spacing=0.1 
    )

    # model_color_map = {}

    model_color_map = {
        'Deepseek V3 (ABBEL)': '#4682B4',
        'Deepseek R1 (ABBEL)': '#8B0000',
        'Gemini 2.5 Pro (ABBEL)': '#006400',
        'Deepseek V3': '#4682B4',
        'Deepseek R1': '#FFB6C1',
        'Gemini 2.5 Pro': '#90EE90',
        'Deepseek V3 (belief prompting)': '#4682B4',
        'Deepseek R1 (belief prompting)': '#FF6347',
        'Gemini 2.5 Pro (belief prompting)': '#32CD32',
    }

    models_seen = set()
    color_idx = 0

    for col_idx, env in enumerate(envs, start=1):
        for row_idx, reasoning_type in enumerate(reasoning_types, start=1):
            sub = agg[(agg['env'] == env) & (agg['_reasoning_type'] == reasoning_type)]
            for model in sub['model'].unique():
                mdf = sub[sub['model'] == model].sort_values('attempt')
                if model not in model_color_map:
                    model_color_map[model] = colors[color_idx % len(colors)]
                    color_idx += 1

                # if ('V3' not in model) and not ('Belief' in reasoning_type and '(' not in model):
                if not ('Belief' in reasoning_type and '(' not in model):
                    show_legend = model not in models_seen
                    models_seen.add(model)

                    # Add shaded region for error bounds
                    error_lower = mdf['mean'] - mdf['sem']
                    error_upper = mdf['mean'] + mdf['sem']
                    x = list(mdf['attempt'])
                    # Add the lower error bound trace
                    fig.add_trace(go.Scatter(
                        x=x,
                        y=error_lower,
                        mode='lines',
                        line=dict(width=0), # Hide the line for the lower bound
                        showlegend=False,
                        hoverinfo="skip",
                        legendgroup=str(model)
                    ),row=row_idx, col=col_idx)

                    # Add the upper error bound trace and fill to the lower bound
                    fig.add_trace(go.Scatter(
                        x=x,
                        y=error_upper,
                        mode='lines',
                        fill='tonexty', # Fill the area between this trace and the previous one
                        fillcolor=f"rgba{tuple(list(plotly.colors.hex_to_rgb(model_color_map[model])) + [0.2])}",
                        line=dict(width=0), # Hide the line for the upper bound
                        hoverinfo="skip",
                        showlegend=False,
                        legendgroup=str(model)
                    ),
                    row=row_idx, col=col_idx)
                    
                    # Add main line
                    fig.add_trace(
                        go.Scatter(
                            x=mdf['attempt'],
                            y=mdf['mean'],
                            mode='lines+markers',
                            name=str(model),
                            legendgroup=str(model),
                            showlegend=show_legend,
                            line=dict(color=model_color_map[model], width=2)
                        ),
                        row=row_idx, col=col_idx
                    )

            fig.update_xaxes(
                title_text='Step' if row_idx == 2 else '',
                color='black',
                showline=True,
                linecolor='black',
                linewidth=1,
                showgrid=False,
                row=row_idx, col=col_idx
            )

            fig.update_yaxes(
                title_text=f'{reasoning_type} (chars)' if col_idx == 1 else None,
                showline=True,
                linecolor='black',
                linewidth=1,
                tickfont=dict(color='black'),
                showgrid=True,
                gridcolor='lightgray',
                row=row_idx, col=col_idx
            )

        # Layout with horizontal legend
    fig.update_layout(
        height=700,
        # width=fig_width,
        template='simple_white',
        font=dict(family='Computer Modern, serif', size=16),
        barmode='group',
        showlegend=True,
        legend=dict(
            # title={'text': ' Model (info)'},
            orientation='v',
            yanchor='middle',
            y=0.5,
            xanchor='left',
            x=1.02,
            bgcolor='rgba(255,255,255,0.9)',
            bordercolor='black',
            borderwidth=1
        ),
        margin=dict(t=100, b=50, l=80, r=80),
        plot_bgcolor='white'
    )

    return fig


In [39]:
fig = plot_belief_length(df_reasoning, reasoning_types = ['Belief Reasoning', 'Total Reasoning'])

In [40]:
fig.show()

In [41]:
fig.write_image('figures/frontier_belief_total_reasoning_length_v2.pdf', width=1500, height=600)

In [55]:
def plot_belief_length_separate_figs(df):
    envs = envs_ordered
    go = plotly.graph_objects
    make_subplots = plotly.subplots.make_subplots

    # Compute lengths for both reasoning fields
    belief_reasoning_length = df['belief_reasoning'].astype(str).str.len()
    action_reasoning_length = df['action_reasoning'].astype(str).str.len()

    # Prepare dataframes for aggregation
    df_belief = df.assign(_reasoning_len=belief_reasoning_length, _reasoning_type='Belief Reasoning')
    df_action = df.assign(_reasoning_len=action_reasoning_length, _reasoning_type='Action Reasoning')
    df2 = pd.concat([df_belief, df_action], ignore_index=True)

    agg = (
        df2.groupby(['env', 'model', 'attempt', '_reasoning_type'])
        ._reasoning_len.agg(['mean', 'std', 'count'])
        .reset_index()
    )
    agg['sem'] = agg['std'] / agg['count'].apply(lambda n: sqrt(n) if n > 0 else 1)

    colors = [
        "#8B0000",  # dark red
        "#FF6347",  # medium red
        "#FFB6C1",  # light red
        "#90EE90",  # light green
        "#006400",  # dark green
        "#32CD32",  # medium green
        "#ADD8E6",  # light blue
        "#4682B4",  # medium blue
        "#003366",  # dark blue
    ]

    model_color_map = {
        'Deepseek V3 (ABBEL)': '#4682B4',
        'Deepseek R1 (ABBEL)': '#8B0000',
        'Gemini 2.5 Pro (ABBEL)': '#006400',
        'Deepseek V3': '#4682B4',
        'Deepseek R1': '#FFB6C1',
        'Gemini 2.5 Pro': '#90EE90',
        'Deepseek V3 (belief prompting)': '#4682B4',
        'Deepseek R1 (belief prompting)': '#FF6347',
        'Gemini 2.5 Pro (belief prompting)': '#32CD32',
    }

    figures = {}
    
    # Create separate figure for Belief Reasoning
    fig_belief = make_subplots(
        rows=1,
        cols=len(envs),
        shared_yaxes=False,
        shared_xaxes=False,
        subplot_titles=[f"{env}" for env in envs],
        horizontal_spacing=0.03
    )

    models_seen_belief = set()
    color_idx = 0

    for col_idx, env in enumerate(envs, start=1):
        sub = agg[(agg['env'] == env) & (agg['_reasoning_type'] == 'Belief Reasoning')]
        for model in sub['model'].unique():
            mdf = sub[sub['model'] == model].sort_values('attempt')
            if model not in model_color_map:
                model_color_map[model] = colors[color_idx % len(colors)]
                color_idx += 1

            if '(' in model:
                show_legend = model not in models_seen_belief
                models_seen_belief.add(model)
                # Add shaded region for error bounds
                error_lower = mdf['mean'] - mdf['sem']
                error_upper = mdf['mean'] + mdf['sem']
                x = list(mdf['attempt'])
                # Add the lower error bound trace
                fig_belief.add_trace(go.Scatter(
                    x=x,
                    y=error_lower,
                    mode='lines',
                    line=dict(width=0), # Hide the line for the lower bound
                    showlegend=False,
                    hoverinfo="skip",
                    legendgroup=str(model)
                ),row=1, col=col_idx)

                # Add the upper error bound trace and fill to the lower bound
                fig_belief.add_trace(go.Scatter(
                    x=x,
                    y=error_upper,
                    mode='lines',
                    fill='tonexty', # Fill the area between this trace and the previous one
                    fillcolor=f"rgba{tuple(list(plotly.colors.hex_to_rgb(model_color_map[model])) + [0.2])}",
                    line=dict(width=0), # Hide the line for the upper bound
                    hoverinfo="skip",
                    showlegend=False,
                    legendgroup=str(model)
                ),
                row=1, col=col_idx)
                
                
                # Add main line
                fig_belief.add_trace(
                    go.Scatter(
                        x=mdf['attempt'],
                        y=mdf['mean'],
                        mode='lines+markers',
                        name=str(model),
                        legendgroup=str(model),
                        showlegend=show_legend,
                        line=dict(color=model_color_map[model], width=2)
                    ),
                    row=1, col=col_idx
                )

        fig_belief.update_xaxes(
            title_text='Step',
            title_standoff=0,
            color='black',
            showline=True,
            linecolor='black',
            linewidth=1,
            showgrid=False,
            row=1, col=col_idx
        )

        fig_belief.update_yaxes(
            title_text='Belief Reasoning Length' if col_idx == 1 else None,
            showline=True,
            linecolor='black',
            linewidth=1,
            tickfont=dict(color='black'),
            showgrid=True,
            gridcolor='lightgray',
            row=1, col=col_idx
        )

    fig_belief.update_layout(
        height=350,
        template='simple_white',
        font=dict(family='Computer Modern, serif', size=16),
        barmode='group',
        showlegend=True,
        legend=dict(
            orientation='v',
            yanchor='middle',
            y=0.5,
            xanchor='left',
            x=1.02,
            bgcolor='rgba(255,255,255,0.9)',
            bordercolor='black',
            borderwidth=1
        ),
        margin=dict(t=100, b=50, l=80, r=80),
        plot_bgcolor='white'
    )

    # Create separate figure for Action Reasoning
    fig_action = make_subplots(
        rows=1,
        cols=len(envs),
        shared_yaxes=False,
        shared_xaxes=False,
        subplot_titles=[f"{env}" for env in envs],
        horizontal_spacing=0.03
    )

    models_seen_action = set()

    for col_idx, env in enumerate(envs, start=1):
        sub = agg[(agg['env'] == env) & (agg['_reasoning_type'] == 'Action Reasoning')]
        for model in sub['model'].unique():
            mdf = sub[sub['model'] == model].sort_values('attempt')
            if model not in model_color_map:
                model_color_map[model] = colors[color_idx % len(colors)]
                color_idx += 1

            show_legend = model not in models_seen_action
            models_seen_action.add(model)
            # Add shaded region for error bounds
            # fig_action.add_trace(
            #     go.Scatter(
            #         x=list(mdf['attempt']) + list(mdf['attempt'][::-1]),
            #         y=list(mdf['mean'] + mdf['sem']) + list((mdf['mean'] - mdf['sem'])[::-1]),
            #         fill='toself',
            #         fillcolor=f"rgba{tuple(list(plotly.colors.hex_to_rgb(model_color_map[model])) + [0.2])}",
            #         line=dict(color='rgba(255,255,255,0)'),
            #         showlegend=False,
            #         hoverinfo='skip',
            #         legendgroup=str(model)
            #     ),
            #     row=1, col=col_idx
            # )

            error_lower = mdf['mean'] - mdf['sem']
            error_upper = mdf['mean'] + mdf['sem']
            x = list(mdf['attempt'])
            #import pdb; pdb.set_trace()
            # Add the lower error bound trace
            fig_action.add_trace(go.Scatter(
                x=x,
                y=error_lower,
                mode='lines',
                line=dict(width=0), # Hide the line for the lower bound
                showlegend=False,
                hoverinfo="skip",
                legendgroup=str(model)
            ),row=1, col=col_idx)

            # Add the upper error bound trace and fill to the lower bound
            fig_action.add_trace(go.Scatter(
                x=x,
                y=error_upper,
                mode='lines',
                fill='tonexty', # Fill the area between this trace and the previous one
                fillcolor=f"rgba{tuple(list(plotly.colors.hex_to_rgb(model_color_map[model])) + [0.2])}",
                line=dict(width=0), # Hide the line for the upper bound
                hoverinfo="skip",
                showlegend=False,
                legendgroup=str(model)
            ),
            row=1, col=col_idx)
            
            # Add main line
            fig_action.add_trace(
                go.Scatter(
                    x=mdf['attempt'],
                    y=mdf['mean'],
                    mode='lines+markers',
                    name=str(model),
                    legendgroup=str(model),
                    showlegend=show_legend,
                    line=dict(color=model_color_map[model], width=2)
                ),
                row=1, col=col_idx
            )

        fig_action.update_xaxes(
            title_text='Step',
            title_standoff=0,
            color='black',
            showline=True,
            linecolor='black',
            linewidth=1,
            showgrid=False,
            row=1, col=col_idx
        )

        fig_action.update_yaxes(
            title_text='Action Reasoning Length' if col_idx == 1 else None,
            showline=True,
            linecolor='black',
            linewidth=1,
            tickfont=dict(color='black'),
            showgrid=True,
            gridcolor='lightgray',
            row=1, col=col_idx
        )

    fig_action.update_layout(
        height=350,
        template='simple_white',
        font=dict(family='Computer Modern, serif', size=16),
        barmode='group',
        showlegend=True,
        legend=dict(
            orientation='v',
            yanchor='middle',
            y=0.5,
            xanchor='left',
            x=1.02,
            bgcolor='rgba(255,255,255,0.9)',
            bordercolor='black',
            borderwidth=1
        ),
        #margin=dict(t=100, b=50, l=80, r=80),
        plot_bgcolor='white'
    )

    figures['belief'] = fig_belief
    figures['action'] = fig_action
    
    return figures


In [57]:
figures = plot_belief_length_separate_figs(df_reasoning)
figures['action'].show()

In [59]:
figures['action'].write_image('figures/frontier_action_reasoning_length_v2.pdf', width=1500, height=390)

In [133]:
figures['belief'].write_image('figures/frontier_belief_reasoning_length.pdf', width=1400, height=370)