# 1. Load the Paprika Environments

In [1]:
%load_ext autoreload
%autoreload 2
import sys
import pandas as pd
sys.path.append('../../paprika/') # Remove this when verl and paprika are installed in the same env
from llm_exploration.paprika_config_helper import PaprikaConfigHelper
from verl.interactions.paprika_interaction import PaprikaInteraction
from pprint import pprint as pp
paprika_games = ['twenty_questions', 'guess_my_city', 'murder_mystery', 'customer_service', 'wordle', 'cellular_automata', \
    'mastermind'] # 'battleship', 'minesweeper', 'bandit_bai_fixed_budget' 
import dotenv
dotenv.load_dotenv('../../.env')

Could not import JerichoInferenceEngine, so cannot use it!
Could not import VLLMInferenceEngine, so cannot use it!


True

In [3]:
for env_name in paprika_games:
    config = PaprikaConfigHelper.create_config(env_name)
    config['belief_config']['style'] = 'none'

    interaction = PaprikaInteraction(config={})

    import builtins
    _original_print = builtins.print
    builtins.print = lambda *a, **k: None
    try:
        instance_id = await interaction.start_interaction(
            instance_id=None,
            scenario_id=None, # start a random scenario
            **config,
        )
    finally:
        builtins.print = _original_print
    num_scenarios = interaction.game_scenarios.__len__()
    print(f'{env_name} => {num_scenarios}')

twenty_questions => 367
guess_my_city => 185
murder_mystery => 50
customer_service => 200
wordle => 800
cellular_automata => 500
mastermind => 500


So let us do 50 games for each with frontier LLMs - one with reasoning and one without.

# 2. Setup LLM Call

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
from sglang.test.test_utils import is_in_ci
from sglang.utils import wait_for_server, print_highlight, terminate_process

if is_in_ci():
    from patch import launch_server_cmd
else:
    from sglang.utils import launch_server_cmd

# This is equivalent to running the following command in your terminal

# python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b --host 0.0.0.0
local_model_name = "Qwen/Qwen2.5-7B-instruct" # "qwen/qwen3-4b" # "meta-llama/Llama-3.3-70B-Instruct" # "unsloth/Llama-3.3-70B-Instruct-bnb-4bit"-- assert issue #  "Qwen/Qwen2.5-3B-Instruct" # "Qwen/QwQ-32B"
#
tp_size = (len(os.environ["CUDA_VISIBLE_DEVICES"]) + 1) // 2
server_process, port = launch_server_cmd(
    f"""
python3 -m sglang.launch_server --model-path {local_model_name} --host 0.0.0.0 --tp {tp_size}
"""# --tp {tp_size} mem_fraction_static=0.4 
)
wait_for_server(f"http://localhost:{port}")

[2025-08-12 20:27:57] server_args=ServerArgs(model_path='Qwen/Qwen2.5-7B-instruct', tokenizer_path='Qwen/Qwen2.5-7B-instruct', tokenizer_mode='auto', skip_tokenizer_init=False, load_format='auto', trust_remote_code=False, dtype='auto', kv_cache_dtype='auto', quantization=None, quantization_param_path=None, context_length=None, device='cuda', served_model_name='Qwen/Qwen2.5-7B-instruct', chat_template=None, completion_template=None, is_embedding=False, enable_multimodal=None, revision=None, host='0.0.0.0', port=35252, mem_fraction_static=0.88, max_running_requests=None, max_total_tokens=None, chunked_prefill_size=8192, max_prefill_tokens=16384, schedule_policy='fcfs', schedule_conservativeness=1.0, cpu_offload_gb=0, page_size=1, tp_size=1, pp_size=1, max_micro_batch_size=None, stream_interval=1, stream_output=False, random_seed=1060122301, constrained_json_whitespace_pattern=None, watchdog_timeout=300, dist_timeout=None, download_dir=None, base_gpu_id=0, gpu_id_step=1, log_level='info',

In [7]:
sys.path.append('../../src/optimal_explorer')
from llm_utils import llm_call
from pprint import pprint as pp

In [3]:
print(port)

35252


In [27]:
out = await llm_call(
    model=local_model_name,
    temperature=1,
    url = f"http://localhost:{port}/v1/chat/completions",
    get_everything=True,
    reasoning_effort='high',
    messages = [
            {"role": "system", "content": 'You are a useless assistant that gives humorous answers.'},
            {"role": "user", "content": 'Why is the sky blue?'},
            {"role": "assistant", "content": 'It is because of the great king of Pokemons.'},
            {"role": "user", "content": 'How so?'},
        ]
)

[2025-08-12 18:18:56] Prefill batch. #new-seq: 1, #new-token: 53, #cached-token: 0, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-08-12 18:18:56] Decode batch. #running-req: 1, #token: 86, token usage: 0.00, cuda graph: True, gen throughput (token/s): 5.60, #queue-req: 0
[2025-08-12 18:18:57] Decode batch. #running-req: 1, #token: 126, token usage: 0.00, cuda graph: True, gen throughput (token/s): 44.52, #queue-req: 0
[2025-08-12 18:18:58] INFO:     127.0.0.1:58998 - "POST /v1/chat/completions HTTP/1.1" 200 OK


In [52]:
port

38873

In [None]:
terminate_process(server_process)

[2025-08-12 18:18:14] Child process unexpectedly failed with an exit code 15. pid=1452046


In [28]:
pp(out['choices'][0]['message']['content'], width=130)

('Well, you see, the great king of Pokémons, Pikachu, has a special power to zap the sky with his thunderbolts. When he does '
 'this, the zaps bounce off the tiny little particles in the air, and because Pikachu is so blue (he charges up with blue '
 "electricity), the sky turns blue too! Plus, it's his throne, and in any royal's domain, the color is chosen by them. That's "
 'why the sky is blue and not, say, purple like his Poké Ball.')


# 3. Belief Update

In [37]:
async def update_belief(
        curr_belief: str,
        action: str,
        response: str,
        model_name: str,
    ):


    user_content = f'''\
Look at the current belief and the agent's action and environment response on that belief.\
Compress the context, remove redundant information, and maintain important information about the game state \
needed to take optimal future actions.\
Current belief: {curr_belief}
Agent's action: {action}
Environment's response: {response}
Output the updated belief state inside <BELIEF> and </BELIEF> tags.\
Understand that only the generated belief is fed to the agent, so be sure to include all necessary information about game mechanics.'''

    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": user_content},
    ]

    if 'qwen' in model_name.lower():
        url = f"http://localhost:{port}/v1/chat/completions"
    else:
        url = None

    out = await llm_call(
        model=model_name,
        get_everything=True,
        reasoning_effort='high',
        messages=messages,
        url=url
    )

    import re
    content = out['choices'][0]['message']['content']
    match = re.search(r"<BELIEF>(.*?)</BELIEF>", content, re.DOTALL | re.IGNORECASE)
    if match:
        belief = match.group(1).strip()
    else:
        # fallback: return the whole content if tags not found
        belief = content.strip()
    
    if 'reasoning_details' in out['choices'][0]['message']:
        reasoning = out['choices'][0]['message']['reasoning_details'][0]['text']
    else:
        reasoning = None

    return belief, reasoning

In [38]:
async def take_action(
        belief: str,
        model_name: str,
    ):


    user_content = f'''\
Look at the current belief take the next action based on the belief.\
Take an action that leads to optimal exploration.\
Belief: {belief}
Output the action inside <ACTION> and </ACTION> tags.'''

    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": user_content},
    ]

    if 'qwen' in model_name.lower():
        url = f"http://localhost:{port}/v1/chat/completions"
    else:
        url = None

    out = await llm_call(
        model=model_name,
        url=url,
        get_everything=True,
        reasoning_effort='high',
        messages=messages
    )

    import re
    content = out['choices'][0]['message']['content']
    match = re.search(r"<\s*action\s*>(.*?)<\s*/\s*action\s*>", content, re.DOTALL | re.IGNORECASE)
    if match:
        action = match.group(1).strip()
    else:
        # fallback: return the whole content if tags not found
        action = content.strip()
    
    if 'reasoning_details' in out['choices'][0]['message']:
        reasoning = out['choices'][0]['message']['reasoning_details'][0]['text']
    else:
        reasoning = None

    return action, reasoning

# 4. Paprika Rollout with Belief

In [43]:
import asyncio

async def run_one_iteration_with_belief_llm(
        env_name: str,
        model_name: str,
        game_id: int,
    ):
    config = PaprikaConfigHelper.create_config(env_name)
    config['belief_config']['style'] = 'none'
    interaction = PaprikaInteraction(config={})

    import builtins
    _original_print = builtins.print
    builtins.print = lambda *a, **k: None
    try:
        instance_id = await interaction.start_interaction(instance_id=None, scenario_id=None, **config)
    finally:
        builtins.print = _original_print

    first_user_message = interaction.agent_conv.messages[0][1]
    attempts = 0
    game_history = []
    belief = f'This is the start of the game. The only available information right now are the game rules:\n{first_user_message}'
    max_attempts = interaction._instance_dict[instance_id]['max_turns']

    while attempts < max_attempts:
        
        attempts += 1

        action, action_reasoning = await take_action(belief, model_name)

        message = [
            {"role": "user", "content": f"Output the next action."},
            {"role": "assistant", "content": f"<action>{action}</action>"}
        ]
        done, response, score, additional_data = await interaction.generate_response(instance_id=instance_id, messages=message)
        
        belief, belief_reasoning = await update_belief(belief, action, response, model_name)

        game_history.append({
            "model": model_name,
            "game_id": str(game_id),
            "env": env_name,
            "attempt": attempts,
            "guess": action,
            "response": response,
            "score": score,
            "done": done,
            "data": additional_data,
            "belief": belief,
            "action_reasoning": action_reasoning,
            "belief_reasoning": belief_reasoning,
        })

        if "Goal reached" in response:
            break
    
    print(f'.', end='', flush=True)
    
    return game_history

async def run_multiple_iterations_multiple_games(
        num_games: int,
        list_envs,
        models,
        logs_file='./logs/paprika_local.jsonl',
    ):
    import json

    tasks = []
    for model in models:
        for env_name in list_envs:
            for game_id in range(num_games):
                tasks.append(run_one_iteration_with_belief_llm(env_name, model, game_id))

    results = await asyncio.gather(*tasks)

    # Flatten results and write to file
    with open(logs_file, "a") as f:
        for game_history in results:
            for entry in game_history:
                f.write(json.dumps(entry) + "\n")

In [None]:
from run_baselines_paprika_frontier import run_multiple_iterations_multiple_games

await run_multiple_iterations_multiple_games(
    num_games=10,
    list_envs=['wordle'],
    models=[local_model_name],
    word_limits=[None],
    logs_file='./logs/paprika_local.jsonl',
    infos=['belief', 'history', 'both'],
)

Start running.
[2025-08-12 20:20:12] Prefill batch. #new-seq: 1, #new-token: 1, #cached-token: 215, token usage: 0.00, #running-req: 0, #queue-req: 0
[2025-08-12 20:20:12] Prefill batch. #new-seq: 29, #new-token: 29, #cached-token: 6555, token usage: 0.00, #running-req: 1, #queue-req: 0
[2025-08-12 20:20:12] Decode batch. #running-req: 30, #token: 1012, token usage: 0.00, cuda graph: True, gen throughput (token/s): 48.52, #queue-req: 0
[2025-08-12 20:20:13] Decode batch. #running-req: 30, #token: 2212, token usage: 0.00, cuda graph: True, gen throughput (token/s): 1311.82, #queue-req: 0
[2025-08-12 20:20:14] INFO:     127.0.0.1:45808 - "POST /v1/chat/completions HTTP/1.1" 200 OK
[2025-08-12 20:20:14] Prefill batch. #new-seq: 1, #new-token: 95, #cached-token: 201, token usage: 0.01, #running-req: 29, #queue-req: 0
[2025-08-12 20:20:14] INFO:     127.0.0.1:45760 - "POST /v1/chat/completions HTTP/1.1" 200 OK
[2025-08-12 20:20:14] INFO:     127.0.0.1:45790 - "POST /v1/chat/completions HTTP

# 4. Metrics

In [47]:
import pandas as pd
from pprint import pprint as pp
import json

logs_file = './logs/paprika_local.jsonl'
with open(logs_file, 'r') as f:
    data = [json.loads(line) for line in f]
df = pd.DataFrame(data)

In [48]:
print(df.columns)

Index(['model', 'game_id', 'env', 'attempt', 'guess', 'response', 'score',
       'done', 'data', 'belief', 'action_reasoning', 'belief_reasoning'],
      dtype='object')


In [50]:
if 'wordl_limit' not in df.iloc[0].keys():
    df['word_limit'] = 'None'

In [51]:
df['info'].value_counts()

KeyError: 'info'

In [5]:
df['model_info'] = df['model'].astype(str) + ' (' + df['info'].astype(str) + ')'

In [6]:
df['model'] = df['model_info']

In [44]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

def summarize_game_outcomes(df):
    df['response'] = df['response'].str.lower()
    grouped = df.groupby(['model', 'game_id', 'env', 'word_limit'])['response'].apply(
        lambda responses: any('goal reached' in resp for resp in responses)
    ).reset_index(name='won')
    return grouped

def plot_win_rates(summary_df):
    # Include word_limit in grouping
    win_rates = summary_df.groupby(['env', 'model', 'word_limit'])['won'].mean().reset_index()
    win_rates['success_rate'] = win_rates['won'] * 100

    envs = win_rates['env'].unique()
    models = win_rates['model'].unique()
    word_limits = sorted(win_rates['word_limit'].unique())

    colors = [
        "#ADD8E6",  # light blue
        "#4682B4",  # medium blue
        "#003366",  # dark blue
        "#FFB6C1",  # light red
        "#FF6347",  # medium red
        "#8B0000",  # dark red
        "#90EE90",  # light green
        "#32CD32",  # medium green
        "#006400",  # dark green
    ]
    color_map = {model: colors[i % len(colors)] for i, model in enumerate(models)}

    # Create subplots: rows = word_limit values, cols = envs
    fig = make_subplots(
        rows=len(word_limits), cols=len(envs),
        subplot_titles=[f"{env}" for env in envs],
        shared_yaxes=True,
        vertical_spacing=0.2 / len(word_limits),
        horizontal_spacing=0.03
    )

    for r, wl in enumerate(word_limits, start=1):
        for c, env in enumerate(envs, start=1):
            for model in models:
                subset = win_rates[
                    (win_rates['env'] == env) &
                    (win_rates['model'] == model) &
                    (win_rates['word_limit'] == wl)
                ]
                if not subset.empty:
                    fig.add_trace(
                        go.Bar(
                            x=[model],
                            y=subset['success_rate'],
                            name=model,
                            marker_color=color_map[model],
                            width=0.8,
                            showlegend=(r == 1 and c == 1),
                            hovertemplate=(
                                f"Env: {env}<br>"
                                f"Word Limit: {wl}<br>"
                                f"Model: {model}<br>"
                                f"Success Rate: {{y:.2f}}%"
                            ),
                        ),
                        row=r, col=c
                    )
            # Add row label for word_limit
            if c == 1:
                fig.add_annotation(
                    text=f"Word Limit: {wl}",
                    xref="paper",
                    yref="paper",
                    x=0.1,
                    y=0.95 - ((r - 1) / len(word_limits)),
                    showarrow=False,
                    font=dict(size=14)
                )

    # Update y-axis
    for r in range(1, len(word_limits) + 1):
        for c in range(1, len(envs) + 1):
            fig.update_yaxes(
                range=[0, 100],
                showgrid=True,
                gridcolor='lightgray',
                row=r, col=c
            )

    # Update x-axis to remove tick labels
    for r in range(1, len(word_limits) + 1):
        for c in range(1, len(envs) + 1):
            fig.update_xaxes(
                showticklabels=False,
                row=r, col=c
            )

    # Fixed subplot size
    fig_width = 180 * len(envs)
    fig_height = 200 * len(word_limits)

    # Layout with horizontal legend
    fig.update_layout(
        height=fig_height + 150,
        width=fig_width,
        template='simple_white',
        font=dict(family='Computer Modern, serif', size=16),
        barmode='group',
        showlegend=True,
        legend=dict(
            title='Models',
            orientation='h',
            yanchor='bottom',
            y=1.18,
            xanchor='center',
            x=0.5,
            bgcolor='rgba(255,255,255,0.9)',
            bordercolor='black',
            borderwidth=1
        ),
        margin=dict(t=100, b=50, l=80, r=20),
        plot_bgcolor='white'
    )

    return fig

In [45]:
df_belief = df.loc[df['info'] == 'belief'].copy()
df_history = df.loc[df['info'] == 'history'].copy()
summary_df = summarize_game_outcomes(df)
summary_df_belief = summarize_game_outcomes(df_belief)
summary_df_history = summarize_game_outcomes(df_history)

In [46]:
fig = plot_win_rates(summary_df)
fig.show()

In [56]:
fig = plot_win_rates(summary_df_history)
fig.show()

In [39]:
df.columns

Index(['model', 'game_id', 'env', 'attempt', 'raw_guess', 'guess', 'response',
       'word_limit', 'score', 'done', 'data', 'belief', 'action_reasoning',
       'belief_reasoning'],
      dtype='object')

In [78]:
wordle_sample = df[
    (df['model'] == 'deepseek/deepseek-r1 (belief)') & \
        (df['env'] == 'twenty_questions') & \
            (df['game_id'] == str(0)) & \
            (df['info'] == 'belief')
            ]
print(len(wordle_sample))

20


In [79]:
wordle_sample

Unnamed: 0,model,game_id,env,attempt,info,raw_guess,guess,response,word_limit,score,done,data,belief,action_reasoning,belief_reasoning,model_info
554,deepseek/deepseek-r1 (belief),0,twenty_questions,1,belief,<Think> Start by distinguishing between common...,Is the vegetable typically grown underground?,yes.,,1.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...",The vegetable is a root vegetable typically gr...,"Okay, so I'm trying to guess a vegetable in 20...","Okay, so I need to update the belief state aft...",deepseek/deepseek-r1 (belief)
555,deepseek/deepseek-r1 (belief),0,twenty_questions,2,belief,<Think> Asking whether the vegetable is a bulb...,"Is the vegetable a bulb, such as onion or garlic?",no.,,2.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...",The vegetable is a root vegetable typically gr...,"Okay, so the user is playing 20 Questions and ...","Okay, let's see. The user is playing 20 Questi...",deepseek/deepseek-r1 (belief)
556,deepseek/deepseek-r1 (belief),0,twenty_questions,3,belief,<Think> Asking whether the vegetable is a tube...,"Is the vegetable classified as a tuber (e.g., ...",yes.,,3.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...",The vegetable is confirmed to be a tuber (e.g....,"Okay, let's see. The belief state says it's a ...","Okay, let's break this down. The user provided...",deepseek/deepseek-r1 (belief)
557,deepseek/deepseek-r1 (belief),0,twenty_questions,4,belief,<Think>Since the belief state confirms the veg...,Is the vegetable a potato?,no.,,4.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...","The vegetable is confirmed to be a tuber, but ...","Okay, so the belief state says the vegetable i...","Okay, let's break down how to update the belie...",deepseek/deepseek-r1 (belief)
558,deepseek/deepseek-r1 (belief),0,twenty_questions,5,belief,<Think>\nTo differentiate between the remainin...,Is the vegetable typically smaller than a pota...,yes.,,5.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...","The vegetable is a tuber, confirmed not to be ...","Okay, let's see. The user is trying to figure ...","Okay, let me process this step by step. The cu...",deepseek/deepseek-r1 (belief)
559,deepseek/deepseek-r1 (belief),0,twenty_questions,6,belief,<Think> The belief strongly points to Jerusale...,"Does the vegetable have thin, pale brown skin?",yes.,,6.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...",The vegetable is a tuber smaller than a potato...,"Okay, let's see. The belief state says the veg...","Okay, let me see. The user is playing 20 Quest...",deepseek/deepseek-r1 (belief)
560,deepseek/deepseek-r1 (belief),0,twenty_questions,7,belief,<Think> The vegetable is likely a sunchoke bas...,Is the vegetable known for having a slightly n...,yes.,,7.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...",The vegetable is confirmed to have a slightly ...,"Okay, so based on the belief state, the curren...","Okay, the user is playing 20 Questions trying ...",deepseek/deepseek-r1 (belief)
561,deepseek/deepseek-r1 (belief),0,twenty_questions,8,belief,<Think>The belief state strongly indicates Jer...,Is the vegetable a Jerusalem artichoke (suncho...,no.,,8.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...","The vegetable is a small, knobby tuber with th...","Okay, so based on the belief state, the assist...","Okay, so the user was pretty confident that th...",deepseek/deepseek-r1 (belief)
562,deepseek/deepseek-r1 (belief),0,twenty_questions,9,belief,<Think>To distinguish between Yacon (eaten raw...,Is the vegetable commonly eaten raw?,no.,,9.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...","The vegetable is a small, knobby tuber with th...","Okay, so I'm trying to figure out which vegeta...","Okay, let me try to update the belief state ba...",deepseek/deepseek-r1 (belief)
563,deepseek/deepseek-r1 (belief),0,twenty_questions,10,belief,<Think> To distinguish between Crosne and Chin...,Is the vegetable typically known for having a ...,yes.,,10.0,True,"{'agent_game_scenario': 'vegetable', 'env_game...",The vegetable is identified as **Crosne (Chine...,"Okay, let's see. The belief state says the veg...","Okay, let me process this. The user is playing...",deepseek/deepseek-r1 (belief)


In [85]:
pp(wordle_sample.iloc[19]['belief'])

('The vegetable is **not** Burdock root, Crosne, potato, carrot, beet, Chinese '
 'yam, water chestnut, jicama, Jerusalem artichoke, lotus root, water caltrop, '
 'galangal, **gourd (including ivy gourd)**, Hosui pear-shaped yam, **or '
 'winged yam (Dioscorea alata)**.  \n'
 '**Confirmed traits**:  \n'
 '- Crunchy when cooked.  \n'
 '- Small, knobby, non-round/non-smooth appearance.  \n'
 '- Pale brown skin with nutty/sweet flavor.  \n'
 '- Asian origin (common in Asian cuisine).  \n'
 '- Primarily used as a culinary vegetable (**not** a spice/herb).  \n'
 '- Commonly stir-fried in Asian dishes.  \n'
 '**Narrowed possibilities**:  \n'
 '- Excludes winged/lobed shapes. Focus on knobby, irregularly shaped '
 'tubers/roots.  \n'
 '- Candidates: Lesser-known Southeast Asian tubers (e.g., *greater yam '
 'variants*, *Taro cultivars* with knobby texture) or fibrous-textured roots '
 'used in Thai/Malay/Indonesian stir-fries.  \n'
 '- Key questions remaining: Distinguish between starchy vs. 