# 1. Load the Paprika Environments

In [1]:
import sys
import pandas as pd
sys.path.append('../../paprika/') # Remove this when verl and paprika are installed in the same env
from llm_exploration.paprika_config_helper import PaprikaConfigHelper
from verl.interactions.paprika_interaction import PaprikaInteraction
from pprint import pprint as pp
# Environments evaluated in this notebook.
# Not included in this run: 'battleship', 'minesweeper', 'bandit_bai_fixed_budget'
paprika_games = [
    'twenty_questions',
    'guess_my_city',
    'murder_mystery',
    'customer_service',
    'wordle',
    'cellular_automata',
    'mastermind',
]

Could not import JerichoInferenceEngine, so cannot use it!
Could not import VLLMInferenceEngine, so cannot use it!


In [2]:
import dotenv
# Load variables from the .env file two directories up into the process
# environment; the True echoed below is load_dotenv's return value.
dotenv.load_dotenv('../../.env')

True

In [3]:
for env_name in paprika_games:
    config = PaprikaConfigHelper.create_config(env_name)
    config['belief_config']['style'] = 'none'

    interaction = PaprikaInteraction(config={})

    import builtins
    _original_print = builtins.print
    builtins.print = lambda *a, **k: None
    try:
        instance_id = await interaction.start_interaction(
            instance_id=None,
            scenario_id=None, # start a random scenario
            **config,
        )
    finally:
        builtins.print = _original_print
    num_scenarios = interaction.game_scenarios.__len__()
    print(f'{env_name} => {num_scenarios}')

twenty_questions => 367
guess_my_city => 185
murder_mystery => 50
customer_service => 200


wordle => 800
cellular_automata => 500
mastermind => 500


So let us run 50 games per environment with frontier LLMs: one with reasoning and one without.

# 2. Setup LLM Call

In [4]:
sys.path.append('../../src/optimal_explorer')
from llm_utils import llm_call
from pprint import pprint as pp

In [8]:
out = await llm_call(
    model='deepseek/deepseek-r1',
    get_everything=True,
    reasoning_effort='high',
    messages = [
            {"role": "system", "content": 'You are a useless assistant that gives humorous answers.'},
            {"role": "user", "content": 'Why is the sky blue?'},
            {"role": "assistant", "content": 'It is because of the great king of Pokemons.'},
            {"role": "user", "content": 'How so?'},
        ]
)

In [9]:
# Pretty-print the reply text of the first choice, wrapped at 130 columns.
pp(out['choices'][0]['message']['content'], width=130)

('Ah, you’re right to question me—I clearly flunked out of Pokémon Academy’s physics class. Let me *ash*-k you to imagine this '
 'instead: When sunlight (which is white) enters Earth’s atmosphere, tiny molecules scatter the light. Blue light waves are '
 'short and feisty, like a hyper Pikachu dodging a Poké Ball, so they bounce wildly in all directions. This makes the sky look '
 "blue! Meanwhile, red light waves are chill, like Snorlax napping—they just cruise straight through. So, the sky's blueness is "
 'basically Earth’s atmosphere throwing a rave for blue light. Science, but make it ✨ *electric* ✨.')


In [14]:
# Pretty-print the reasoning field of the same message, wrapped at 130 columns.
pp(out['choices'][0]['message']['reasoning'], width=130)

('Okay, the user asked "How so?" after I joked about the sky being blue because of the Pokémon king. They probably want a real '
 'explanation but are playing along with the humor. I should maintain the funny tone but also slip in the actual science. Let me '
 "explain Rayleigh scattering in a whimsical way using Pokémon references. Make sure it's clear that the real reason is "
 'scientific, but keep it light-hearted. Mention Pikachu and maybe some other Pokémon to keep the theme. Avoid getting too '
 "technical but still convey the right information. Check if the analogy makes sense and the user isn't confused. Balance "
 'between humor and accuracy.\n')


# 3. Belief Update

In [15]:
async def update_belief(
        curr_belief: str,
        action: str,
        response: str,
        model_name: str,
    ):
    """Ask the LLM to fold the latest action/response pair into the belief.

    Args:
        curr_belief: Belief text before this turn.
        action: Action the agent just took.
        response: Environment response to that action.
        model_name: Model identifier forwarded to ``llm_call``.

    Returns:
        ``(belief, reasoning)``. ``belief`` is the text between the
        ``<BELIEF>`` tags, or the whole stripped reply when the tags are
        missing. ``reasoning`` is the model's reasoning text, or ``''`` when
        the response carries none.
    """
    import re

    # NOTE(review): the trailing backslashes below join adjacent lines with no
    # space or newline (e.g. "...on that belief.Compress the context"). The
    # prompt is left untouched so new runs stay comparable with logged ones;
    # confirm whether the missing separators are intended.
    user_content = f'''\
Look at the current belief and the agent's action and environment response on that belief.\
Compress the context, remove redundant information, and maintain important information about the game state \
needed to take optimal future actions.\
Current belief: {curr_belief}
Agent's action: {action}
Environment's response: {response}
Output the updated belief state inside <BELIEF> and </BELIEF> tags.\
Understand that only the generated belief is fed to the agent, so be sure to include all necessary information about game mechanics.'''

    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": user_content},
    ]

    out = await llm_call(
        model=model_name,
        get_everything=True,
        reasoning_effort='high',
        messages=messages
    )

    message = out['choices'][0]['message']

    # A null content would otherwise crash re.search / .strip().
    content = message['content'] or ''
    match = re.search(r"<BELIEF>(.*?)</BELIEF>", content, re.DOTALL | re.IGNORECASE)
    if match:
        belief = match.group(1).strip()
    else:
        # fallback: return the whole content if tags not found
        belief = content.strip()

    # Prefer the structured reasoning details; fall back to the flat
    # 'reasoning' field, and finally to '' so a reply without reasoning
    # cannot abort the rollout.
    try:
        reasoning = message['reasoning_details'][0]['text']
    except (KeyError, IndexError, TypeError):
        try:
            reasoning = message['reasoning'] or ''
        except (KeyError, TypeError):
            reasoning = ''

    return belief, reasoning

In [16]:
async def take_action(
        belief: str,
        model_name: str,
    ):
    """Ask the LLM for the next action given only the current belief.

    Args:
        belief: Belief text shown to the model.
        model_name: Model identifier forwarded to ``llm_call``.

    Returns:
        ``(action, reasoning)``. ``action`` is the text between the
        ``<ACTION>`` tags (case-insensitive, whitespace-tolerant), or the
        whole stripped reply when the tags are missing. ``reasoning`` is the
        model's reasoning text, or ``''`` when the response carries none.
    """
    import re

    # NOTE(review): the trailing backslashes below join adjacent lines with no
    # space or newline (e.g. "...based on the belief.Take an action"). The
    # prompt is left untouched so new runs stay comparable with logged ones;
    # confirm whether the missing separators are intended.
    user_content = f'''\
Look at the current belief take the next action based on the belief.\
Take an action that leads to optimal exploration.\
Belief: {belief}
Output the action inside <ACTION> and </ACTION> tags.'''

    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": user_content},
    ]

    out = await llm_call(
        model=model_name,
        get_everything=True,
        reasoning_effort='high',
        messages=messages
    )

    message = out['choices'][0]['message']

    # A null content would otherwise crash re.search / .strip().
    content = message['content'] or ''
    match = re.search(r"<\s*action\s*>(.*?)<\s*/\s*action\s*>", content, re.DOTALL | re.IGNORECASE)
    if match:
        action = match.group(1).strip()
    else:
        # fallback: return the whole content if tags not found
        action = content.strip()

    # Prefer the structured reasoning details; fall back to the flat
    # 'reasoning' field, and finally to '' so a reply without reasoning
    # cannot abort the rollout.
    try:
        reasoning = message['reasoning_details'][0]['text']
    except (KeyError, IndexError, TypeError):
        try:
            reasoning = message['reasoning'] or ''
        except (KeyError, TypeError):
            reasoning = ''

    return action, reasoning

# 4. Paprika Rollout with Belief

In [17]:
import asyncio

async def run_one_iteration_with_belief_llm(
        env_name: str,
        model_name: str,
        game_id: int,
    ):
    """Play one game in which the agent sees only an LLM-maintained belief.

    Each turn: ``take_action`` picks an action from the belief, the
    interaction produces a response, and ``update_belief`` folds that
    action/response pair back into the belief. The loop ends when the
    response contains "Goal reached" or the instance's ``max_turns`` is used up.

    Args:
        env_name: Environment name passed to ``PaprikaConfigHelper.create_config``.
        model_name: Model used for both action selection and belief updates.
        game_id: Identifier stored (as a string) in every log record.

    Returns:
        A list with one dict per turn played.
    """
    import builtins

    config = PaprikaConfigHelper.create_config(env_name)
    config['belief_config']['style'] = 'none'
    interaction = PaprikaInteraction(config={})

    # Mute print() while the scenario starts and always put it back.
    # NOTE(review): this patches a process-wide builtin; if start_interaction
    # suspends while several games run concurrently, save/restore pairs could
    # interleave and leave print() muted. Confirm whether it ever yields.
    saved_print = builtins.print
    builtins.print = lambda *a, **k: None
    try:
        instance_id = await interaction.start_interaction(instance_id=None, scenario_id=None, **config)
    finally:
        builtins.print = saved_print

    # The first agent message seeds the initial belief.
    rules_text = interaction.agent_conv.messages[0][1]
    belief = f'This is the start of the game. The only available information right now are the game rules:\n{rules_text}'
    turn_budget = interaction._instance_dict[instance_id]['max_turns']

    game_history = []
    turn = 0
    while turn < turn_budget:
        turn += 1

        action, action_reasoning = await take_action(belief, model_name)

        done, response, score, additional_data = await interaction.generate_response(
            instance_id=instance_id,
            messages=[
                {"role": "user", "content": "Output the next action."},
                {"role": "assistant", "content": f"<action>{action}</action>"},
            ],
        )

        # The belief stored in the record is the one *after* this turn's update.
        belief, belief_reasoning = await update_belief(belief, action, response, model_name)

        game_history.append(dict(
            model=model_name,
            game_id=str(game_id),
            env=env_name,
            attempt=turn,
            guess=action,
            response=response,
            score=score,
            done=done,
            data=additional_data,
            belief=belief,
            action_reasoning=action_reasoning,
            belief_reasoning=belief_reasoning,
        ))

        if "Goal reached" in response:
            break

    # One dot per finished game as a lightweight progress indicator.
    print('.', end='', flush=True)

    return game_history

async def run_multiple_iterations_multiple_games(
        num_games: int,
        list_envs,
        models,
        logs_file='./logs/paprika_frontier.jsonl',
    ):
    """Run ``num_games`` games per (model, env) pair concurrently and log them.

    Args:
        num_games: Games to play for each model/environment combination.
        list_envs: Iterable of environment names.
        models: Iterable of model identifiers.
        logs_file: JSONL file that receives one line per turn (append mode).
            Its parent directory is created if it does not exist.

    Raises:
        BaseException: The first exception raised by any game, re-raised only
            after the records of all games that finished have been written.
    """
    import json
    import os

    # Create the log directory before spending any LLM calls, so a missing
    # folder cannot throw away a finished run.
    log_dir = os.path.dirname(logs_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    tasks = [
        run_one_iteration_with_belief_llm(env_name, model, game_id)
        for model in models
        for env_name in list_envs
        for game_id in range(num_games)
    ]

    # Collect exceptions instead of aborting, so one failed game does not
    # discard the results of every other game.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    failures = [outcome for outcome in results if isinstance(outcome, BaseException)]

    # Flatten the successful games and append them to the log file.
    with open(logs_file, "a") as f:
        for game_history in results:
            if isinstance(game_history, BaseException):
                continue
            for entry in game_history:
                f.write(json.dumps(entry) + "\n")

    if failures:
        print(f'\n{len(failures)} of {len(results)} games failed; finished games were saved to {logs_file}')
        # Still surface the error to the caller, as the original did.
        raise failures[0]

In [26]:
# Play 10 mastermind games with openai/gpt-oss-120b; logs_file is not passed,
# so records are appended to the function's default log path.
await run_multiple_iterations_multiple_games(
    num_games=10,
    list_envs=['mastermind'],
    models=['openai/gpt-oss-120b'],
)

..........

# 5. Metrics

In [1]:
import pandas as pd
from pprint import pprint as pp
import json

# Read the JSONL rollout log (one JSON object per line) into a DataFrame.
logs_file = './logs/paprika_frontier_v6.jsonl'
with open(logs_file, 'r') as f:
    data = list(map(json.loads, f))
df = pd.DataFrame(data)

In [2]:
# List the columns present in the loaded log.
print(df.columns)

Index(['model', 'game_id', 'env', 'attempt', 'info', 'raw_guess', 'guess',
       'response', 'word_limit', 'score', 'done', 'data', 'belief',
       'action_reasoning', 'belief_reasoning'],
      dtype='object')


In [3]:
# Label missing word limits row by row. The previous check looked only at the
# first row and then overwrote the whole column, which would erase real limits
# in other rows. A placeholder is needed because groupby() drops NaN/None keys
# by default, so games without a limit would vanish from the summaries.
df['word_limit'] = df['word_limit'].fillna('None')

In [4]:
# Number of logged turns per value of the 'info' column.
df['info'].value_counts()

info
belief     8764
both       8479
history    8162
Name: count, dtype: int64

In [5]:
# Combined label "<model> (<info>)" so each model/info pairing is its own series.
df['model_info'] = df['model'].astype(str) + ' (' + df['info'].astype(str) + ')'

In [6]:
# Replace 'model' with the combined label so groupbys on 'model' split by info.
# NOTE: this makes the model_info cell non-idempotent; re-running it after this
# line appends the " (<info>)" suffix a second time. Reload df before re-running.
df['model'] = df['model_info']

In [44]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

def summarize_game_outcomes(df):
    """Collapse per-turn records into one win/loss row per game.

    A game counts as won when any of its responses contains "goal reached"
    (case-insensitive). Missing responses count as not won. The input frame
    is left unmodified.

    Args:
        df: Per-turn records with columns 'model', 'game_id', 'env',
            'word_limit' and 'response'.

    Returns:
        DataFrame with columns 'model', 'game_id', 'env', 'word_limit', 'won'
        (boolean), one row per group.
    """
    # Work on a derived Series instead of lowercasing the caller's column in
    # place; na=False keeps null responses from raising.
    goal_hit = df['response'].str.lower().str.contains('goal reached', regex=False, na=False)
    grouped = (
        df.assign(won=goal_hit)
        .groupby(['model', 'game_id', 'env', 'word_limit'])['won']
        .any()
        .reset_index(name='won')
    )
    return grouped

def plot_win_rates(summary_df):
    """Plot success rates as a grid of bar charts.

    Rows of the grid are word-limit values, columns are environments, and
    every bar is one model's success rate (percentage of games won).

    Args:
        summary_df: One row per game with columns 'env', 'model',
            'word_limit' and a boolean 'won'.

    Returns:
        The assembled plotly figure.

    Raises:
        ValueError: If ``summary_df`` contains no games to plot.
    """
    # Include word_limit in grouping
    win_rates = summary_df.groupby(['env', 'model', 'word_limit'])['won'].mean().reset_index()
    win_rates['success_rate'] = win_rates['won'] * 100

    if win_rates.empty:
        # Previously surfaced as an opaque ZeroDivisionError further down.
        raise ValueError('summary_df contains no games to plot')

    envs = win_rates['env'].unique()
    models = win_rates['model'].unique()
    raw_limits = list(win_rates['word_limit'].unique())
    try:
        word_limits = sorted(raw_limits)
    except TypeError:
        # Mixed labels (e.g. numbers next to the 'None' placeholder) cannot be
        # compared directly; fall back to ordering by their text form.
        word_limits = sorted(raw_limits, key=str)

    colors = [
        "#ADD8E6",  # light blue
        "#4682B4",  # medium blue
        "#003366",  # dark blue
        "#FFB6C1",  # light red
        "#FF6347",  # medium red
        "#8B0000",  # dark red
        "#90EE90",  # light green
        "#32CD32",  # medium green
        "#006400",  # dark green
    ]
    color_map = {model: colors[i % len(colors)] for i, model in enumerate(models)}

    # Create subplots: rows = word_limit values, cols = envs
    fig = make_subplots(
        rows=len(word_limits), cols=len(envs),
        subplot_titles=[f"{env}" for env in envs],
        shared_yaxes=True,
        vertical_spacing=0.2 / len(word_limits),
        horizontal_spacing=0.03
    )

    # Models that already own a legend entry. Showing the legend on the first
    # trace of each model (rather than only in subplot (1, 1)) keeps models
    # that have no data in the first subplot from missing in the legend.
    in_legend = set()

    for r, wl in enumerate(word_limits, start=1):
        for c, env in enumerate(envs, start=1):
            for model in models:
                subset = win_rates[
                    (win_rates['env'] == env) &
                    (win_rates['model'] == model) &
                    (win_rates['word_limit'] == wl)
                ]
                if subset.empty:
                    continue
                fig.add_trace(
                    go.Bar(
                        x=[model],
                        y=subset['success_rate'],
                        name=model,
                        legendgroup=model,
                        marker_color=color_map[model],
                        width=0.8,
                        showlegend=model not in in_legend,
                        # Plotly substitutes values only for %{...}; the
                        # doubled braces escape the f-string.
                        hovertemplate=(
                            f"Env: {env}<br>"
                            f"Word Limit: {wl}<br>"
                            f"Model: {model}<br>"
                            f"Success Rate: %{{y:.2f}}%"
                        ),
                    ),
                    row=r, col=c
                )
                in_legend.add(model)
            # Add row label for word_limit
            if c == 1:
                fig.add_annotation(
                    text=f"Word Limit: {wl}",
                    xref="paper",
                    yref="paper",
                    x=0.1,
                    y=0.95 - ((r - 1) / len(word_limits)),
                    showarrow=False,
                    font=dict(size=14)
                )

    # Without row/col these calls apply to every subplot axis.
    fig.update_yaxes(range=[0, 100], showgrid=True, gridcolor='lightgray')
    # Hide x tick labels; the legend already identifies the models.
    fig.update_xaxes(showticklabels=False)

    # Fixed subplot size
    fig_width = 180 * len(envs)
    fig_height = 200 * len(word_limits)

    # Layout with horizontal legend
    fig.update_layout(
        height=fig_height + 150,
        width=fig_width,
        template='simple_white',
        font=dict(family='Computer Modern, serif', size=16),
        barmode='group',
        showlegend=True,
        legend=dict(
            title='Models',
            orientation='h',
            yanchor='bottom',
            y=1.18,
            xanchor='center',
            x=0.5,
            bgcolor='rgba(255,255,255,0.9)',
            bordercolor='black',
            borderwidth=1
        ),
        margin=dict(t=100, b=50, l=80, r=20),
        plot_bgcolor='white'
    )

    return fig

In [45]:
# Independent copies of the belief-only and history-only turns, followed by
# per-game outcome summaries for the full log and for each subset.
df_belief = df[df['info'].eq('belief')].copy()
df_history = df[df['info'].eq('history')].copy()
summary_df = summarize_game_outcomes(df)
summary_df_belief = summarize_game_outcomes(df_belief)
summary_df_history = summarize_game_outcomes(df_history)

In [46]:
# Success rates over the full log (every 'info' variant).
fig = plot_win_rates(summary_df)
fig.show()

In [56]:
# Success rates restricted to the rows whose 'info' is 'history'.
fig = plot_win_rates(summary_df_history)
fig.show()

In [39]:
# NOTE(review): the saved output below has no 'info' column although later
# cells filter on it; it appears to come from an earlier execution of this
# cell. Re-run the notebook top to bottom to refresh it.
df.columns

Index(['model', 'game_id', 'env', 'attempt', 'raw_guess', 'guess', 'response',
       'word_limit', 'score', 'done', 'data', 'belief', 'action_reasoning',
       'belief_reasoning'],
      dtype='object')

In [78]:
# Turns of game '0' played by deepseek-r1 in belief mode.
# NOTE: despite the variable name, the filter selects twenty_questions.
wordle_sample = df[
    df['model'].eq('deepseek/deepseek-r1 (belief)')
    & df['env'].eq('twenty_questions')
    & df['game_id'].eq('0')
    & df['info'].eq('belief')
]
print(len(wordle_sample))

20


In [79]:
# Display the sampled game's turns.
wordle_sample

Unnamed: 0,model,game_id,env,attempt,info,raw_guess,guess,response,word_limit,score,done,data,belief,action_reasoning,belief_reasoning,model_info
554,deepseek/deepseek-r1 (belief),0,twenty_questions,1,belief,<Think> Start by distinguishing between common...,Is the vegetable typically grown underground?,yes.,,1.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...",The vegetable is a root vegetable typically gr...,"Okay, so I'm trying to guess a vegetable in 20...","Okay, so I need to update the belief state aft...",deepseek/deepseek-r1 (belief)
555,deepseek/deepseek-r1 (belief),0,twenty_questions,2,belief,<Think> Asking whether the vegetable is a bulb...,"Is the vegetable a bulb, such as onion or garlic?",no.,,2.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...",The vegetable is a root vegetable typically gr...,"Okay, so the user is playing 20 Questions and ...","Okay, let's see. The user is playing 20 Questi...",deepseek/deepseek-r1 (belief)
556,deepseek/deepseek-r1 (belief),0,twenty_questions,3,belief,<Think> Asking whether the vegetable is a tube...,"Is the vegetable classified as a tuber (e.g., ...",yes.,,3.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...",The vegetable is confirmed to be a tuber (e.g....,"Okay, let's see. The belief state says it's a ...","Okay, let's break this down. The user provided...",deepseek/deepseek-r1 (belief)
557,deepseek/deepseek-r1 (belief),0,twenty_questions,4,belief,<Think>Since the belief state confirms the veg...,Is the vegetable a potato?,no.,,4.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...","The vegetable is confirmed to be a tuber, but ...","Okay, so the belief state says the vegetable i...","Okay, let's break down how to update the belie...",deepseek/deepseek-r1 (belief)
558,deepseek/deepseek-r1 (belief),0,twenty_questions,5,belief,<Think>\nTo differentiate between the remainin...,Is the vegetable typically smaller than a pota...,yes.,,5.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...","The vegetable is a tuber, confirmed not to be ...","Okay, let's see. The user is trying to figure ...","Okay, let me process this step by step. The cu...",deepseek/deepseek-r1 (belief)
559,deepseek/deepseek-r1 (belief),0,twenty_questions,6,belief,<Think> The belief strongly points to Jerusale...,"Does the vegetable have thin, pale brown skin?",yes.,,6.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...",The vegetable is a tuber smaller than a potato...,"Okay, let's see. The belief state says the veg...","Okay, let me see. The user is playing 20 Quest...",deepseek/deepseek-r1 (belief)
560,deepseek/deepseek-r1 (belief),0,twenty_questions,7,belief,<Think> The vegetable is likely a sunchoke bas...,Is the vegetable known for having a slightly n...,yes.,,7.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...",The vegetable is confirmed to have a slightly ...,"Okay, so based on the belief state, the curren...","Okay, the user is playing 20 Questions trying ...",deepseek/deepseek-r1 (belief)
561,deepseek/deepseek-r1 (belief),0,twenty_questions,8,belief,<Think>The belief state strongly indicates Jer...,Is the vegetable a Jerusalem artichoke (suncho...,no.,,8.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...","The vegetable is a small, knobby tuber with th...","Okay, so based on the belief state, the assist...","Okay, so the user was pretty confident that th...",deepseek/deepseek-r1 (belief)
562,deepseek/deepseek-r1 (belief),0,twenty_questions,9,belief,<Think>To distinguish between Yacon (eaten raw...,Is the vegetable commonly eaten raw?,no.,,9.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...","The vegetable is a small, knobby tuber with th...","Okay, so I'm trying to figure out which vegeta...","Okay, let me try to update the belief state ba...",deepseek/deepseek-r1 (belief)
563,deepseek/deepseek-r1 (belief),0,twenty_questions,10,belief,<Think> To distinguish between Crosne and Chin...,Is the vegetable typically known for having a ...,yes.,,10.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...",The vegetable is identified as **Crosne (Chine...,"Okay, let's see. The belief state says the veg...","Okay, let me process this. The user is playing...",deepseek/deepseek-r1 (belief)


In [85]:
# Belief stored on row index 19, i.e. the last of the 20 sampled turns.
pp(wordle_sample.iloc[19]['belief'])

('The vegetable is **not** Burdock root, Crosne, potato, carrot, beet, Chinese '
 'yam, water chestnut, jicama, Jerusalem artichoke, lotus root, water caltrop, '
 'galangal, **gourd (including ivy gourd)**, Hosui pear-shaped yam, **or '
 'winged yam (Dioscorea alata)**.  \n'
 '**Confirmed traits**:  \n'
 '- Crunchy when cooked.  \n'
 '- Small, knobby, non-round/non-smooth appearance.  \n'
 '- Pale brown skin with nutty/sweet flavor.  \n'
 '- Asian origin (common in Asian cuisine).  \n'
 '- Primarily used as a culinary vegetable (**not** a spice/herb).  \n'
 '- Commonly stir-fried in Asian dishes.  \n'
 '**Narrowed possibilities**:  \n'
 '- Excludes winged/lobed shapes. Focus on knobby, irregularly shaped '
 'tubers/roots.  \n'
 '- Candidates: Lesser-known Southeast Asian tubers (e.g., *greater yam '
 'variants*, *Taro cultivars* with knobby texture) or fibrous-textured roots '
 'used in Thai/Malay/Indonesian stir-fries.  \n'
 '- Key questions remaining: Distinguish between starchy vs. 