# 1. Load the Paprika Environments

In [1]:
import sys
import pandas as pd
sys.path.append('../../paprika/') # Remove this when verl and paprika are installed in the same env
from llm_exploration.paprika_config_helper import PaprikaConfigHelper
from verl.interactions.paprika_interaction import PaprikaInteraction
from pprint import pprint as pp
paprika_games = ['twenty_questions', 'guess_my_city', 'murder_mystery', 'customer_service', 'wordle', 'cellular_automata', \
    'mastermind'] # 'battleship', 'minesweeper', 'bandit_bai_fixed_budget' 

Could not import JerichoInferenceEngine, so cannot use it!
Could not import VLLMInferenceEngine, so cannot use it!


In [2]:
import dotenv
dotenv.load_dotenv('../../.env')

True

In [3]:
for env_name in paprika_games:
    config = PaprikaConfigHelper.create_config(env_name)
    config['belief_config']['style'] = 'none'

    interaction = PaprikaInteraction(config={})

    import builtins
    _original_print = builtins.print
    builtins.print = lambda *a, **k: None
    try:
        instance_id = await interaction.start_interaction(
            instance_id=None,
            scenario_id=None, # start a random scenario
            **config,
        )
    finally:
        builtins.print = _original_print
    num_scenarios = interaction.game_scenarios.__len__()
    print(f'{env_name} => {num_scenarios}')

twenty_questions => 367
guess_my_city => 185
murder_mystery => 50
customer_service => 200


wordle => 800
cellular_automata => 500
mastermind => 500


So let us do 50 games for each with frontier LLMs - one with reasoning and one without.

# 2. Setup LLM Call

In [4]:
sys.path.append('../../src/optimal_explorer')
from llm_utils import llm_call
from pprint import pprint as pp

In [8]:
out = await llm_call(
    model='deepseek/deepseek-r1',
    get_everything=True,
    reasoning_effort='high',
    messages = [
            {"role": "system", "content": 'You are a useless assistant that gives humorous answers.'},
            {"role": "user", "content": 'Why is the sky blue?'},
            {"role": "assistant", "content": 'It is because of the great king of Pokemons.'},
            {"role": "user", "content": 'How so?'},
        ]
)

In [9]:
pp(out['choices'][0]['message']['content'], width=130)

('Ah, you’re right to question me—I clearly flunked out of Pokémon Academy’s physics class. Let me *ash*-k you to imagine this '
 'instead: When sunlight (which is white) enters Earth’s atmosphere, tiny molecules scatter the light. Blue light waves are '
 'short and feisty, like a hyper Pikachu dodging a Poké Ball, so they bounce wildly in all directions. This makes the sky look '
 "blue! Meanwhile, red light waves are chill, like Snorlax napping—they just cruise straight through. So, the sky's blueness is "
 'basically Earth’s atmosphere throwing a rave for blue light. Science, but make it ✨ *electric* ✨.')


In [14]:
pp(out['choices'][0]['message']['reasoning'], width=130)

('Okay, the user asked "How so?" after I joked about the sky being blue because of the Pokémon king. They probably want a real '
 'explanation but are playing along with the humor. I should maintain the funny tone but also slip in the actual science. Let me '
 "explain Rayleigh scattering in a whimsical way using Pokémon references. Make sure it's clear that the real reason is "
 'scientific, but keep it light-hearted. Mention Pikachu and maybe some other Pokémon to keep the theme. Avoid getting too '
 "technical but still convey the right information. Check if the analogy makes sense and the user isn't confused. Balance "
 'between humor and accuracy.\n')


# 3. Belief Update

In [15]:
async def update_belief(
        curr_belief: str,
        action: str,
        response: str,
        model_name: str,
    ):


    user_content = f'''\
Look at the current belief and the agent's action and environment response on that belief.\
Compress the context, remove redundant information, and maintain important information about the game state \
needed to take optimal future actions.\
Current belief: {curr_belief}
Agent's action: {action}
Environment's response: {response}
Output the updated belief state inside <BELIEF> and </BELIEF> tags.\
Understand that only the generated belief is fed to the agent, so be sure to include all necessary information about game mechanics.'''

    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": user_content},
    ]

    out = await llm_call(
        model=model_name,
        get_everything=True,
        reasoning_effort='high',
        messages=messages
    )

    import re
    content = out['choices'][0]['message']['content']
    match = re.search(r"<BELIEF>(.*?)</BELIEF>", content, re.DOTALL | re.IGNORECASE)
    if match:
        belief = match.group(1).strip()
    else:
        # fallback: return the whole content if tags not found
        belief = content.strip()
    
    reasoning = out['choices'][0]['message']['reasoning_details'][0]['text']

    return belief, reasoning

In [16]:
async def take_action(
        belief: str,
        model_name: str,
    ):


    user_content = f'''\
Look at the current belief take the next action based on the belief.\
Take an action that leads to optimal exploration.\
Belief: {belief}
Output the action inside <ACTION> and </ACTION> tags.'''

    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": user_content},
    ]

    out = await llm_call(
        model=model_name,
        get_everything=True,
        reasoning_effort='high',
        messages=messages
    )

    import re
    content = out['choices'][0]['message']['content']
    match = re.search(r"<\s*action\s*>(.*?)<\s*/\s*action\s*>", content, re.DOTALL | re.IGNORECASE)
    if match:
        action = match.group(1).strip()
    else:
        # fallback: return the whole content if tags not found
        action = content.strip()
    
    reasoning = out['choices'][0]['message']['reasoning_details'][0]['text']

    return action, reasoning

# 4. Paprika Rollout with Belief

In [17]:
import asyncio

async def run_one_iteration_with_belief_llm(
        env_name: str,
        model_name: str,
        game_id: int,
    ):
    config = PaprikaConfigHelper.create_config(env_name)
    config['belief_config']['style'] = 'none'
    interaction = PaprikaInteraction(config={})

    import builtins
    _original_print = builtins.print
    builtins.print = lambda *a, **k: None
    try:
        instance_id = await interaction.start_interaction(instance_id=None, scenario_id=None, **config)
    finally:
        builtins.print = _original_print

    first_user_message = interaction.agent_conv.messages[0][1]
    attempts = 0
    game_history = []
    belief = f'This is the start of the game. The only available information right now are the game rules:\n{first_user_message}'
    max_attempts = interaction._instance_dict[instance_id]['max_turns']

    while attempts < max_attempts:
        
        attempts += 1

        action, action_reasoning = await take_action(belief, model_name)

        message = [
            {"role": "user", "content": f"Output the next action."},
            {"role": "assistant", "content": f"<action>{action}</action>"}
        ]
        done, response, score, additional_data = await interaction.generate_response(instance_id=instance_id, messages=message)
        
        belief, belief_reasoning = await update_belief(belief, action, response, model_name)

        game_history.append({
            "model": model_name,
            "game_id": str(game_id),
            "env": env_name,
            "attempt": attempts,
            "guess": action,
            "response": response,
            "score": score,
            "done": done,
            "data": additional_data,
            "belief": belief,
            "action_reasoning": action_reasoning,
            "belief_reasoning": belief_reasoning,
        })

        if "Goal reached" in response:
            break
    
    print(f'.', end='', flush=True)
    
    return game_history

async def run_multiple_iterations_multiple_games(
        num_games: int,
        list_envs,
        models,
        logs_file='./logs/paprika_frontier.jsonl',
    ):
    import json

    tasks = []
    for model in models:
        for env_name in list_envs:
            for game_id in range(num_games):
                tasks.append(run_one_iteration_with_belief_llm(env_name, model, game_id))

    results = await asyncio.gather(*tasks)

    # Flatten results and write to file
    with open(logs_file, "a") as f:
        for game_history in results:
            for entry in game_history:
                f.write(json.dumps(entry) + "\n")

In [26]:
await run_multiple_iterations_multiple_games(
    num_games=10,
    list_envs=['mastermind'],
    models=['openai/gpt-oss-120b'],
)

..........

# 4. Metrics

In [1]:
import pandas as pd
from pprint import pprint as pp
import json

logs_file = './logs/paprika_frontier_v6.jsonl'
with open(logs_file, 'r') as f:
    data = [json.loads(line) for line in f]
df = pd.DataFrame(data)

In [2]:
print(df.columns)

Index(['model', 'game_id', 'env', 'attempt', 'info', 'raw_guess', 'guess',
       'response', 'word_limit', 'score', 'done', 'data', 'belief',
       'action_reasoning', 'belief_reasoning'],
      dtype='object')


In [3]:
if not df.iloc[0]['word_limit']:
    df['word_limit'] = 'None'

In [4]:
df['info'].value_counts()

info
belief     8764
both       8491
history    8162
Name: count, dtype: int64

In [5]:
df['model_info'] = df['model'].astype(str) + ' (' + df['info'].astype(str) + ')'

In [6]:
df['model'] = df['model_info']

In [7]:
df

Unnamed: 0,model,game_id,env,attempt,info,raw_guess,guess,response,word_limit,score,done,data,belief,action_reasoning,belief_reasoning,model_info
0,deepseek/deepseek-chat (history),0,customer_service,1,history,"<Think> To diagnose the issue effectively, I n...",Can you please check if the refrigerator door ...,"The door is not fully closed, and the temperat...",,0.0,True,{'agent_game_scenario': 'The customer's refrig...,,,,deepseek/deepseek-chat (history)
1,deepseek/deepseek-chat (history),0,customer_service,2,history,"<Think> Since the door is not fully closed, th...",Please make sure the refrigerator door is full...,Goal reached.,,0.0,True,{'agent_game_scenario': 'The customer's refrig...,,,,deepseek/deepseek-chat (history)
2,deepseek/deepseek-chat (both),1,wordle,1,both,"<Think> Since this is the first guess, the goa...",CRANE,"First letter, c, is not in the target word \nS...",,1.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...",The target word does not contain the letters C...,,,deepseek/deepseek-chat (both)
3,deepseek/deepseek-chat (both),1,wordle,2,both,"<Think> Since C, R, A, and N are not in the wo...",SLOPE,"First letter, s, is correct and in the correct...",,2.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...",The target word starts with 'S'. It contains t...,,,deepseek/deepseek-chat (both)
4,deepseek/deepseek-chat (both),1,wordle,3,both,<Think> Based on the history and belief state:...,SPIED,Goal reached,,3.0,True,"{'agent_game_scenario': 'word', 'env_game_scen...","The target word is ""SPIED"".",,,deepseek/deepseek-chat (both)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25412,deepseek/deepseek-r1 (both),36,mastermind,8,both,<Think>\nGiven the feedback history and curren...,4 5 3 6,Your last guess has 0 exact matches with the s...,,8.0,True,"{'agent_game_scenario': 'secret code', 'env_ga...","**Updated Belief**:\n- **Excluded digits**: 7,...","Okay, let's see. So I'm on attempt 9 out of 12...","Okay, let's try to figure this out. The curren...",deepseek/deepseek-r1 (both)
25413,deepseek/deepseek-r1 (both),36,mastermind,9,both,<Think> \nGiven the belief that the code has o...,0 2 5 1,Your last guess has 1 exact matches with the s...,,9.0,True,"{'agent_game_scenario': 'secret code', 'env_ga...","**Updated Belief**:\n- **Excluded digits**: 7,...","Okay, let's try to figure out what the next gu...","Okay, let's break this down. The current belie...",deepseek/deepseek-r1 (both)
25414,deepseek/deepseek-r1 (both),36,mastermind,10,both,<Think>\nGiven the feedback from previous gues...,1 0 2 5,Your last guess has 0 exact matches with the s...,,10.0,True,"{'agent_game_scenario': 'secret code', 'env_ga...","- **Excluded digits**: 7, 8, 9. Possible digit...","Okay, let's see. We're on attempt 11 out of 12...","Okay, let me try to process the new feedback. ...",deepseek/deepseek-r1 (both)
25415,deepseek/deepseek-r1 (both),36,mastermind,11,both,"<Think> \nGiven the belief state, pos4 is conf...",2 4 0 1,Your last guess has 0 exact matches with the s...,,11.0,True,"{'agent_game_scenario': 'secret code', 'env_ga...","- **Excluded digits**: 7, 8, 9. Possible digit...","Okay, let's see. We're on the 12th attempt, an...","Okay, let's start by looking at the current be...",deepseek/deepseek-r1 (both)


## 1. Success Rates (belief v. history v. both)

In [22]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import numpy as np

def summarize_game_outcomes(df):
    df['response'] = df['response'].str.lower()
    grouped = df.groupby(['model', 'game_id', 'env', 'word_limit'])['response'].apply(
        lambda responses: any('goal reached' in resp for resp in responses)
    ).reset_index(name='won')
    return grouped

def plot_win_rates(summary_df):
    # Compute mean and std of win rates for each group
    stats = summary_df.groupby(['env', 'model', 'word_limit'])['won'].agg(['mean', 'std', 'count']).reset_index()
    stats['success_rate'] = stats['mean'] * 100
    # Standard error of the mean (SEM)
    stats['sem'] = stats['std'] / np.sqrt(stats['count'])
    stats['sem'] = stats['sem'].fillna(0)
    stats['success_rate_sem'] = stats['sem'] * 100

    envs = stats['env'].unique()
    models = stats['model'].unique()[[0, 2, 1, 3, 5, 4, 6, 8, 7]]
    word_limits = sorted(stats['word_limit'].unique())

    colors = [
        "#ADD8E6",  # light blue
        "#4682B4",  # medium blue
        "#003366",  # dark blue
        "#FFB6C1",  # light red
        "#FF6347",  # medium red
        "#8B0000",  # dark red
        "#90EE90",  # light green
        "#32CD32",  # medium green
        "#006400",  # dark green
    ]
    color_map = {model: colors[i % len(colors)] for i, model in enumerate(models)}

    # Create subplots: rows = word_limit values, cols = envs
    fig = make_subplots(
        rows=len(word_limits), cols=len(envs),
        subplot_titles=[f"{env}" for env in envs],
        shared_yaxes=True,
        vertical_spacing=0.2 / len(word_limits),
        horizontal_spacing=0.03
    )

    for r, wl in enumerate(word_limits, start=1):
        for c, env in enumerate(envs, start=1):
            for model in models:
                subset = stats[
                    (stats['env'] == env) &
                    (stats['model'] == model) &
                    (stats['word_limit'] == wl)
                ]
                if not subset.empty:
                    # Plot bar with error bar (standard error of mean)
                    fig.add_trace(
                        go.Bar(
                            x=[model],
                            y=subset['success_rate'],
                            name=model,
                            marker_color=color_map[model],
                            width=0.8,
                            showlegend=(r == 1 and c == 1),
                            error_y=dict(
                                type='data',
                                array=subset['success_rate_sem'],
                                visible=True,
                                color='black',
                                thickness=1,
                                width=4,
                            ),
                            hovertemplate=(
                                f"Env: {env}<br>"
                                f"Word Limit: {wl}<br>"
                                f"Model: {model}<br>"
                                f"Success Rate: {{y:.2f}}%<br>"
                                f"SEM: {subset['success_rate_sem'].values[0]:.2f}%<br>"
                                f"N: {subset['count'].values[0]}"
                            ),
                        ),
                        row=r, col=c
                    )
            # # Add row label for word_limit
            # if c == 1:
            #     fig.add_annotation(
            #         text=f"Word Limit: {wl}",
            #         xref="paper",
            #         yref="paper",
            #         x=0.1,
            #         y=0.95 - ((r - 1) / len(word_limits)),
            #         showarrow=False,
            #         font=dict(size=14)
            #     )

    # Update y-axis
    for r in range(1, len(word_limits) + 1):
        for c in range(1, len(envs) + 1):
            fig.update_yaxes(
                range=[0, 100],
                showgrid=True,
                gridcolor='lightgray',
                row=r, col=c
            )

    # Update x-axis to remove tick labels
    for r in range(1, len(word_limits) + 1):
        for c in range(1, len(envs) + 1):
            fig.update_xaxes(
                showticklabels=False,
                row=r, col=c
            )

    # Fixed subplot size
    fig_width = 180 * len(envs)
    fig_height = 200 * len(word_limits)

    # Layout with horizontal legend
    fig.update_layout(
        height=fig_height + 150,
        width=fig_width,
        template='simple_white',
        font=dict(family='Computer Modern, serif', size=16),
        barmode='group',
        showlegend=True,
        legend=dict(
            title='Models',
            orientation='h',
            yanchor='bottom',
            y=1.18,
            xanchor='center',
            x=0.5,
            bgcolor='rgba(255,255,255,0.9)',
            bordercolor='black',
            borderwidth=1
        ),
        margin=dict(t=100, b=50, l=80, r=20),
        plot_bgcolor='white'
    )

    fig.update_yaxes(title_text="Avg. Success Rate", row=1, col=1)

    return fig

In [23]:
df_belief = df.loc[df['info'] == 'belief'].copy()
df_history = df.loc[df['info'] == 'history'].copy()
summary_df = summarize_game_outcomes(df)
summary_df_belief = summarize_game_outcomes(df_belief)
summary_df_history = summarize_game_outcomes(df_history)

In [24]:
fig = plot_win_rates(summary_df)
fig.show()

## 2. Qualitative Look at Samples

In [None]:
df.columns

Index(['model', 'game_id', 'env', 'attempt', 'raw_guess', 'guess', 'response',
       'word_limit', 'score', 'done', 'data', 'belief', 'action_reasoning',
       'belief_reasoning'],
      dtype='object')

In [13]:
wordle_sample = df[
    (df['model'] == 'deepseek/deepseek-r1 (belief)') & \
        (df['env'] == 'murder_mystery') & \
            (df['game_id'] == str(0)) & \
            (df['info'] == 'belief')
            ]
print(len(wordle_sample))

20


In [14]:
wordle_sample

Unnamed: 0,model,game_id,env,attempt,info,raw_guess,guess,response,word_limit,score,done,data,belief,action_reasoning,belief_reasoning,model_info
3499,deepseek/deepseek-r1 (belief),0,murder_mystery,1,belief,<Think> Start by determining the time of death...,Question the station medic: What was the estim...,"The station medic looks grave as they respond,...",,1.0,True,{'agent_game_scenario': 'You are a detective i...,**Key Facts**: \n- Dr. Lila Moore died at ~2 ...,"Okay, so I'm starting this murder mystery game...","Okay, let's start by understanding the scenari...",deepseek/deepseek-r1 (belief)
3500,deepseek/deepseek-r1 (belief),0,murder_mystery,2,belief,<Think> Verifying alibis for the 2 PM timefram...,"Ask: What were the alibis of Captain Drake, Je...","The medic replies, ""I only have partial knowle...",,2.0,True,{'agent_game_scenario': 'You are a detective i...,**Key Facts**: \n- Dr. Lila Moore died at ~2 ...,"Okay, let's see where to start. The key facts ...","Okay, so the agent asked about the alibis of t...",deepseek/deepseek-r1 (belief)
3501,deepseek/deepseek-r1 (belief),0,murder_mystery,3,belief,<Think>Verifying alibis via system logs direct...,"**Action**: Examine system logs (control room,...",You access the system logs. \n\n- **Control Ro...,,3.0,True,{'agent_game_scenario': 'You are a detective i...,**Key Facts**: \n- Dr. Lila Moore died at ~2 ...,"Okay, so I need to figure out the next best st...","Okay, let me update the belief based on the ag...",deepseek/deepseek-r1 (belief)
3502,deepseek/deepseek-r1 (belief),0,murder_mystery,4,belief,"<Think> \nTo optimize information gathering, ...",**Action:** Interview the engineer witness to ...,The engineer looks a bit nervous as you questi...,,4.0,True,{'agent_game_scenario': 'You are a detective i...,**Key Facts**: \n- Dr. Lila Moore died at ~2 ...,"Okay, let's try to figure out the next best st...","Okay, so the agent decided to interview the en...",deepseek/deepseek-r1 (belief)
3503,deepseek/deepseek-r1 (belief),0,murder_mystery,5,belief,<Answer> \n**Action**: Interview the station ...,<Answer> \n**Action**: Interview the station ...,"The engineer nods seriously, taking a moment t...",,5.0,True,{'agent_game_scenario': 'You are a detective i...,**Key Facts**: \n- Dr. Lila Moore died at ~2 ...,"Okay, let's review the information again. Dr. ...","Okay, let me process the engineer's response t...",deepseek/deepseek-r1 (belief)
3504,deepseek/deepseek-r1 (belief),0,murder_mystery,6,belief,<Think>\nThe critical unknown is the exact tim...,**Action**: Review decompression chamber’s sys...,You access the decompression chamber’s system ...,,6.0,True,{'agent_game_scenario': 'You are a detective i...,**Key Facts**: \n- Dr. Lila Moore died at ~2 ...,"Okay, let me look at the current information. ...","Okay, let's see. The agent just checked the de...",deepseek/deepseek-r1 (belief)
3505,deepseek/deepseek-r1 (belief),0,murder_mystery,7,belief,"<Think> The sabotage occurred at 1:58 PM, and ...",**Action**: Cross-check control room logs for ...,You delve into the control room logs and find ...,,7.0,True,{'agent_game_scenario': 'You are a detective i...,**Key Facts**: \n- Dr. Lila Moore died at ~2 ...,"Okay, let me think through this step by step. ...","Okay, let me process this step by step. The ag...",deepseek/deepseek-r1 (belief)
3506,deepseek/deepseek-r1 (belief),0,murder_mystery,8,belief,<Think>Captain Drake's confirmed actions at th...,Interrogate Captain Drake: Justify his 1:58 PM...,Captain Drake appears defensive as you questio...,,8.0,True,{'agent_game_scenario': 'You are a detective i...,**Key Facts**: \n- Dr. Lila Moore died at ~2 ...,"Okay, let me try to figure out the best next s...","Okay, let's process the new information. The a...",deepseek/deepseek-r1 (belief)
3507,deepseek/deepseek-r1 (belief),0,murder_mystery,9,belief,<Think> \nThe most critical unknowns are whet...,**Action**: Audit maintenance schedules to ver...,You review the maintenance schedules and find ...,,9.0,True,{'agent_game_scenario': 'You are a detective i...,**Key Facts**: \n- Dr. Lila Moore died at ~2 ...,"Okay, let's see. The current belief state poin...","Okay, let's start by updating the belief based...",deepseek/deepseek-r1 (belief)
3508,deepseek/deepseek-r1 (belief),0,murder_mystery,10,belief,<Think> \nThe most critical unknown is whethe...,**Review System Telemetry**: Check historical ...,You analyze the historical telemetry data for ...,,10.0,True,{'agent_game_scenario': 'You are a detective i...,**Key Facts**: \n- Dr. Lila Moore died at ~2 ...,"Okay, let me try to figure out the best next s...","Okay, let me go through this step by step. The...",deepseek/deepseek-r1 (belief)


In [15]:
pp(wordle_sample.iloc[5]['belief'])

('**Key Facts**:  \n'
 '- Dr. Lila Moore died at ~2 PM via asphyxiation due to decompression chamber '
 'sabotage.  \n'
 '- **Sabotage Timing Confirmed**: System logs show pressure controls were '
 'modified at **1:58 PM** (pre-2 PM), causing immediate malfunction. '
 'Delayed-trigger hypothesis ruled out.  \n'
 '- **Sabotage Method**: Direct manipulation at 1:58 PM (engineer’s "delicate '
 'system" remark aligns with rapid failure).  \n'
 '\n'
 '**Alibi Reassessment**:  \n'
 '- **Captain Drake**: Control room access **1:50–2:10 PM**. **Critical '
 'overlap** with sabotage time (1:58 PM). If control room logs confirm he '
 'operated pressure systems during this window, direct guilt likely.  \n'
 '- **Jenna Parks**: Habitat module log ends at 2:05 PM. Since sabotage '
 'occurred at 1:58 PM, her alibi **does not exclude her**—she could have '
 'tampered with systems before 2:05 PM. **Log integrity** (e.g., falsified '
 'timestamps) now pivotal.  \n'
 '- **Victor Kane**: Lab login starts 

In [None]:
# can't do with belief but can do with history -> look at the full trajectory with reasoning and try to see why it can't do it with belief

## 3. Sizes of Belief States

In [26]:
df_belief = df[df['info'] == 'belief']

In [29]:
print(df_belief.head(2))

                              model game_id            env  attempt    info  \
18  deepseek/deepseek-chat (belief)       0  guess_my_city        1  belief   
19  deepseek/deepseek-chat (belief)       0  guess_my_city        2  belief   

                                            raw_guess  \
18  <Think> Since this is the start of the game, t...   
19  <Think> Since the city is in South America, it...   

                                                guess  \
18               Is your home city located in Europe?   
19  Is your home city located near the coastline o...   

                                             response word_limit  score  done  \
18  no, my home city is not located in europe. it'...       None    1.0  True   
19  my city is located inland and not near the coa...       None    2.0  True   

                                                 data  \
18  {'agent_game_scenario': 'city', 'env_game_scen...   
19  {'agent_game_scenario': 'city', 'env_game_scen...   

  

In [46]:
def plot_belief_length(df):
    envs = list(df['env'].astype(str).unique())
    from math import sqrt
    import plotly
    go = plotly.graph_objects
    make_subplots = plotly.subplots.make_subplots

    length = df['belief'].astype(str).str.len()
    df2 = df.assign(_belief_len=length)
    agg = df2.groupby(['env','model','attempt'])._belief_len.agg(['mean','std','count']).reset_index()
    agg['sem'] = agg['std'] / agg['count'].apply(lambda n: sqrt(n) if n > 0 else 1)

    colors = getattr(plotly.colors.qualitative, 'Dark24', None) or \
             getattr(plotly.colors.qualitative, 'Plotly', None) or \
             list(plotly.colors.qualitative.__dict__.values())[0]

    fig = make_subplots(rows=1, cols=len(envs), shared_yaxes=False, subplot_titles=envs)

    model_color_map = {}
    models_seen = set()
    color_idx = 0

    for col_idx, env in enumerate(envs, start=1):
        sub = agg[agg['env'] == env]
        for model in sub['model'].unique():
            mdf = sub[sub['model'] == model].sort_values('attempt')
            if model not in model_color_map:
                model_color_map[model] = colors[color_idx % len(colors)]
                color_idx += 1
            show_legend = model not in models_seen
            models_seen.add(model)

            fig.add_trace(
                go.Scatter(
                    x=mdf['attempt'],
                    y=mdf['mean'],
                    error_y=dict(
                        type='data',
                        array=mdf['sem'],
                        visible=True,
                        thickness=1.5,   # softer line
                        color='rgba(0,0,0,0.4)'  # semi-transparent black
                    ),
                    mode='lines+markers',
                    name=str(model.split('(')[0]),
                    legendgroup=str(model),
                    showlegend=show_legend,
                    line=dict(color=model_color_map[model], width=2)
                ),
                row=1, col=col_idx
            )

        fig.update_xaxes(
            title_text='Step',
            color='black',  # black axis
            showline=True,
            linecolor='black',
            linewidth=1,
            showgrid=False,
            row=1, col=col_idx
        )

        fig.update_yaxes(
            title_text='Avg. belief length (chars)' if col_idx == 1 else None,
            showline=True,
            linecolor='black',
            linewidth=1,
            tickfont=dict(color='black'),
            showgrid=True,
            gridcolor='lightgray',
            row=1, col=col_idx
        )

    fig.update_layout(
        width=1200,
        height=400,
        paper_bgcolor='white',
        plot_bgcolor='white',
        font=dict(family='Times New Roman, serif', size=14),
        legend=dict(
            orientation='h',   # horizontal
            yanchor='bottom',
            y=1.1,
            xanchor='center',
            x=0.5,
            bordercolor='black',
            borderwidth=1,
            bgcolor='rgba(255,255,255,0.9)'
        )
    )

    return fig


In [47]:
fig = plot_belief_length(df_belief)

In [48]:
fig.show()

## 4. Greediness Metrics

In [49]:
df_mastermind = df[df['env'] == 'mastermind']

In [56]:
print(df_mastermind[['game_id', 'attempt', 'model_info', 'guess']].head(2))

    game_id  attempt                        model_info    guess
102       0        1  deepseek/deepseek-chat (history)  1 2 3 4
103       0        2  deepseek/deepseek-chat (history)  5 6 7 8


In [160]:
import plotly.graph_objects as go
import pandas as pd
import numpy as np

def plot_greediness(df):
    """
    Create a bar chart showing greediness for each model.
    
    Greediness is defined as the fraction of times a model repeats
    a particular guess at a particular position across games.
    
    Parameters:
    df (pd.DataFrame): DataFrame with columns 'game_id', 'attempt', 'model_info', 'guess'
    
    Returns:
    plotly.graph_objects.Figure: Bar chart figure
    """
    
    # Fixed colors for different models
    colors = [
        "#ADD8E6",  # light blue
        "#4682B4",  # medium blue
        "#003366",  # dark blue
        "#FFB6C1",  # light red
        "#FF6347",  # medium red
        "#8B0000",  # dark red
        "#90EE90",  # light green
        "#32CD32",  # medium green
        "#006400",  # dark green
    ]
    
    greediness_results = []
    
    # Group by model_info
    for model_idx, (model, model_data) in enumerate(df.groupby('model_info')):
        # Group by game_id to get sequences of guesses
        game_sequences = []
        for game_id, game_data in model_data.groupby('game_id'):
            # Sort by attempt to get the correct sequence
            sequence = game_data.sort_values('attempt')['guess'].tolist()
            game_sequences.append(sequence)
        
        # Calculate greediness for each game separately to get variance
        game_greediness_values = []
        
        # For each game, calculate its greediness
        for seq in game_sequences:
            game_total_greediness = 0
            game_total_positions = 0
            
            # For each position in this game's sequence
            for pos in range(len(seq)):
                # Get all guesses at this position across ALL games for comparison
                guesses_at_position = []
                for other_seq in game_sequences:
                    if pos < len(other_seq):
                        guesses_at_position.append(other_seq[pos])
                
                if len(guesses_at_position) > 1:
                    # Count repeats
                    unique_guesses = len(set(guesses_at_position))
                    total_guesses = len(guesses_at_position)
                    
                    # Greediness = 1 - (unique_guesses / total_guesses)
                    position_greediness = 1 - (unique_guesses / total_guesses)
                    
                    game_total_greediness += position_greediness
                    game_total_positions += 1
            
            # Average greediness for this game
            if game_total_positions > 0:
                game_avg_greediness = (game_total_greediness / game_total_positions * 100)
                game_greediness_values.append(game_avg_greediness)
        
        # Calculate mean and standard error
        if game_greediness_values:
            avg_greediness = np.mean(game_greediness_values)
            std_error = np.std(game_greediness_values, ddof=1) / np.sqrt(len(game_greediness_values)) if len(game_greediness_values) > 1 else 0
        else:
            avg_greediness = 0
            std_error = 0
        
        greediness_results.append({
            'model': model,
            'greediness': avg_greediness,
            'std_error': std_error,
            'color': colors[model_idx % len(colors)]
        })

    desired_order = [0, 2, 1, 3, 5, 4, 6, 8, 7]
    greediness_results = [greediness_results[i] for i in desired_order]
    
    # Create the bar chart
    fig = go.Figure()
    
    # Add bars
    models = [result['model'] for result in greediness_results]
    greediness_values = [result['greediness'] for result in greediness_results]
    error_values = [result['std_error'] for result in greediness_results]
    bar_colors = [result['color'] for result in greediness_results]
    
    fig.add_trace(go.Bar(
        x=models,
        y=greediness_values,
        error_y=dict(
            type='data',
            array=error_values,
            visible=True,
            color='black',
            thickness=1.5,
            width=3
        ),
        marker_color=bar_colors,
        marker_line_color='black',
        marker_line_width=1,
        width=0.6
    ))
    
    # Update layout for professional appearance
    fig.update_layout(
        title={
            'text': '',
            'x': 0.5,
            'font': {'family': 'Computer Modern, serif', 'size': 16}
        },
        xaxis={
            'title': 'Model + Info (Mastermind)',
            'title_font': {'family': 'Computer Modern, serif', 'size': 14},
            'tickfont': {'family': 'Computer Modern, serif', 'size': 14},
            'showgrid': False,
            'showline': True,
            'linewidth': 1,
            'linecolor': 'black',
            'mirror': True
        },
        yaxis={
            'title': 'Greediness (% of repeats)',
            'title_font': {'family': 'Computer Modern, serif', 'size': 14},
            'tickfont': {'family': 'Computer Modern, serif', 'size': 14},
            'showgrid': False,
            'showline': True,
            'linewidth': 1,
            'linecolor': 'black',
            'mirror': True,
            'range': [0, max([g + e for g, e in zip(greediness_values, error_values)]) * 1.1 if greediness_values else 100]
        },
        plot_bgcolor='white',
        paper_bgcolor='white',
        font={'family': 'Computer Modern, serif'},
        width=800,
        height=500,
        margin=dict(l=80, r=50, t=80, b=80)
    )
    
    # Add grid lines manually for a cleaner look
    fig.update_yaxes(showgrid=True, gridwidth=0.5, gridcolor='lightgray')
    
    return fig

In [161]:
models_list = [
    'deepseek/deepseek-chat (belief)',
    'deepseek/deepseek-chat (history)',
    'deepseek/deepseek-chat (both)',
    'deepseek/deepseek-r1 (belief)'
    'deepseek/deepseek-r1 (history)',
    'deepseek/deepseek-r1 (both)',
    'google/gemini-2.5-pro (belief)',
    'google/gemini-2.5-pro (history)',
    'google/gemini-2.5-pro (both)',
    ]

fig = plot_greediness(df_mastermind)

In [162]:
fig.show()

In [116]:
list(df.model_info.unique())

['deepseek/deepseek-chat (history)',
 'deepseek/deepseek-chat (both)',
 'google/gemini-2.5-pro (history)',
 'deepseek/deepseek-chat (belief)',
 'deepseek/deepseek-r1 (history)',
 'google/gemini-2.5-pro (belief)',
 'google/gemini-2.5-pro (both)',
 'deepseek/deepseek-r1 (both)',
 'deepseek/deepseek-r1 (belief)']