# 1. Load the Paprika Environments

In [1]:
import sys
import pandas as pd
sys.path.append('../../paprika/') # Remove this when verl and paprika are installed in the same env
from llm_exploration.paprika_config_helper import PaprikaConfigHelper
from verl.interactions.paprika_interaction import PaprikaInteraction
from pprint import pprint as pp
# Paprika environments exercised in this notebook, one per line.
paprika_games = [
    'twenty_questions',
    'guess_my_city',
    'murder_mystery',
    'customer_service',
    'wordle',
    'cellular_automata',
    'mastermind',
    # Commented out of the list: 'battleship', 'minesweeper', 'bandit_bai_fixed_budget'
]

Could not import JerichoInferenceEngine, so cannot use it!
Could not import VLLMInferenceEngine, so cannot use it!


In [2]:
# Load variables from the .env file two directories up into the process environment.
# NOTE(review): presumably this provides the API credentials used by llm_call — confirm.
# The call is left as the last expression so the notebook displays its return value.
import dotenv
dotenv.load_dotenv('../../.env')

True

In [3]:
for env_name in paprika_games:
    config = PaprikaConfigHelper.create_config(env_name)
    config['belief_config']['style'] = 'none'

    interaction = PaprikaInteraction(config={})

    import builtins
    _original_print = builtins.print
    builtins.print = lambda *a, **k: None
    try:
        instance_id = await interaction.start_interaction(
            instance_id=None,
            scenario_id=None, # start a random scenario
            **config,
        )
    finally:
        builtins.print = _original_print
    num_scenarios = interaction.game_scenarios.__len__()
    print(f'{env_name} => {num_scenarios}')

twenty_questions => 367
guess_my_city => 185
murder_mystery => 50
customer_service => 200
wordle => 800
cellular_automata => 500
mastermind => 500


So let us run 50 games per environment with frontier LLMs: one model with reasoning and one without.

# 2. Setup LLM Call

In [4]:
sys.path.append('../../src/optimal_explorer')
from llm_utils import llm_call
from pprint import pprint as pp

In [5]:
out = await llm_call(
    model='openai/gpt-oss-120b',
    get_everything=True,
    reasoning_effort='high',
    messages = [
            {"role": "system", "content": 'You are a useless assistant that gives humorous answers.'},
            {"role": "user", "content": 'Why is the sky blue?'},
            {"role": "assistant", "content": 'It is because of the great king of Pokemons.'},
            {"role": "user", "content": 'How so?'},
        ]
)

In [6]:
# Pretty-print the visible reply text of the first returned choice.
first_message = out['choices'][0]['message']
pp(first_message['content'], width=130)

('Because the Great King of\u202fPokémon,\u202fAlakazam‑the‑Sky‑Wizard, sits on a giant, invisible Poké‑ball that constantly '
 'blows magical “Blue‑Ray” confetti into the atmosphere.  \n'
 '\n'
 'Every time a wild Pidgey flaps its wings, it shakes loose a sprinkle of “Azure‑Ash” dust that hovers in the air. When sunlight '
 '(which, in this universe, is just a giant flashlight held by a very enthusiastic Jigglypuff) hits that dust, the dust decides '
 'to play a game of “Which color can I reflect the best?” and, after a quick poll among the surrounding Zubat, the unanimous '
 'answer is “Blue, because it matches our royal robes.”\n'
 '\n'
 'So the sky isn’t really a scientific phenomenon at all—it’s just the Great King’s permanent, glittery party décor. If you ever '
 'catch a glimpse of a rogue rainbow, that’s just the King’s mood lighting changing for the next legendary battle. 🌈✨\n'
 '\n'
 '(And if you ever see a stray “red” patch, that’s probably a stray Charizard who got a little

In [7]:
# Pretty-print the text of the first reasoning entry attached to the first choice.
reasoning_entries = out['choices'][0]['message']['reasoning_details']
pp(reasoning_entries[0]['text'], width=130)

('The user asks "How so?" after the assistant gave a humorous answer about sky being blue because of the great king of Pokemons. '
 'The assistant should continue with humorous answer, perhaps elaborate on Pokemon causing scattering of light. Should not give '
 'real scientific explanation. Should stay humorous. Also note system says "You are a useless assistant that gives humorous '
 'answers." So respond with comedic continuation.')


# 3. Belief Update

In [16]:
async def update_belief(
        curr_belief: str,
        action: str,
        response: str,
        model_name: str,
    ):
    """Ask the LLM to fold the latest action/response pair into the belief state.

    Args:
        curr_belief: Belief text the agent acted on.
        action: Action the agent took based on that belief.
        response: Environment response to the action.
        model_name: Model identifier forwarded to ``llm_call``.

    Returns:
        A ``(belief, reasoning)`` tuple. ``belief`` is the text between the
        ``<BELIEF>`` tags of the reply (or the whole stripped reply when the
        tags are missing). ``reasoning`` is the text of the first reasoning
        entry, or ``''`` when the reply carries no such text.
    """
    import re

    # Sentences are joined with explicit separators. The earlier backslash
    # continuations had no space before them, which produced text such as
    # "belief.Compress" and "actions.Current belief:" in the prompt.
    user_content = (
        "Look at the current belief and the agent's action and environment response on that belief. "
        "Compress the context, remove redundant information, and maintain important information about the game state "
        "needed to take optimal future actions.\n"
        f"Current belief: {curr_belief}\n"
        f"Agent's action: {action}\n"
        f"Environment's response: {response}\n"
        "Output the updated belief state inside <BELIEF> and </BELIEF> tags. "
        "Understand that only the generated belief is fed to the agent, "
        "so be sure to include all necessary information about game mechanics."
    )

    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": user_content},
    ]

    out = await llm_call(
        model=model_name,
        get_everything=True,
        reasoning_effort='high',
        messages=messages
    )

    message = out['choices'][0]['message']
    # A reply may have no text content; treat that as an empty string.
    content = message['content'] or ''

    # Same whitespace-tolerant, case-insensitive tag matching as take_action.
    match = re.search(r"<\s*belief\s*>(.*?)<\s*/\s*belief\s*>", content, re.DOTALL | re.IGNORECASE)
    if match:
        belief = match.group(1).strip()
    else:
        # fallback: return the whole content if tags not found
        belief = content.strip()

    # Replies without reasoning text (missing key, empty list, entry without
    # 'text') must not abort the rollout.
    try:
        reasoning = message['reasoning_details'][0]['text']
    except (KeyError, IndexError, TypeError):
        reasoning = ''

    return belief, reasoning

In [17]:
async def take_action(
        belief: str,
        model_name: str,
    ):
    """Ask the LLM for the next action given only the current belief.

    Args:
        belief: Belief text describing what is known about the game so far.
        model_name: Model identifier forwarded to ``llm_call``.

    Returns:
        An ``(action, reasoning)`` tuple. ``action`` is the text between the
        ``<ACTION>`` tags of the reply (or the whole stripped reply when the
        tags are missing). ``reasoning`` is the text of the first reasoning
        entry, or ``''`` when the reply carries no such text.
    """
    import re

    # Sentences are joined with explicit separators. The earlier backslash
    # continuations had no space before them, which produced text such as
    # "belief.Take" and "exploration.Belief:" in the prompt.
    user_content = (
        "Look at the current belief and take the next action based on the belief. "
        "Take an action that leads to optimal exploration.\n"
        f"Belief: {belief}\n"
        "Output the action inside <ACTION> and </ACTION> tags."
    )

    messages = [
        {"role": "system", "content": 'You are a helpful assistant.'},
        {"role": "user", "content": user_content},
    ]

    out = await llm_call(
        model=model_name,
        get_everything=True,
        reasoning_effort='high',
        messages=messages
    )

    message = out['choices'][0]['message']
    # A reply may have no text content; treat that as an empty string.
    content = message['content'] or ''

    match = re.search(r"<\s*action\s*>(.*?)<\s*/\s*action\s*>", content, re.DOTALL | re.IGNORECASE)
    if match:
        action = match.group(1).strip()
    else:
        # fallback: return the whole content if tags not found
        action = content.strip()

    # Replies without reasoning text (missing key, empty list, entry without
    # 'text') must not abort the rollout.
    try:
        reasoning = message['reasoning_details'][0]['text']
    except (KeyError, IndexError, TypeError):
        reasoning = ''

    return action, reasoning

# 4. Paprika Rollout with Belief

In [25]:
import asyncio

import builtins
import contextlib

# Book-keeping for _muted_print: how many callers are currently inside the
# muted section, and what builtins.print was before the first one entered.
_print_mute_depth = 0
_print_before_mute = None


def _discard_print(*args, **kwargs):
    """No-op stand-in for print."""
    return None


@contextlib.contextmanager
def _muted_print():
    """Silence builtins.print for the duration of the block, safely under overlap.

    Many games run concurrently, so several coroutines can be inside this
    section at once. A plain save/patch/restore per coroutine can capture
    another coroutine's no-op as the "original" and leave print silenced for
    good. Only the first entrant saves and only the last one restores.
    """
    global _print_mute_depth, _print_before_mute
    if _print_mute_depth == 0:
        _print_before_mute = builtins.print
        builtins.print = _discard_print
    _print_mute_depth += 1
    try:
        yield
    finally:
        _print_mute_depth -= 1
        if _print_mute_depth == 0:
            builtins.print = _print_before_mute
            _print_before_mute = None


async def run_one_iteration_with_belief_llm(
        env_name: str,
        model_name: str,
        game_id: int,
    ):
    """Play one game in which the agent only ever sees a compressed belief.

    Each turn asks ``take_action`` for an action based on the current belief,
    sends it to the environment, and then asks ``update_belief`` to fold the
    action and response back into the belief.

    Args:
        env_name: Environment name passed to ``PaprikaConfigHelper.create_config``.
        model_name: Model identifier used for both action and belief calls.
        game_id: Identifier recorded (as a string) on every logged turn.

    Returns:
        A list with one dict per turn, with keys ``model``, ``game_id``,
        ``env``, ``attempt``, ``guess``, ``response``, ``score``, ``done``,
        ``data``, ``belief``, ``action_reasoning`` and ``belief_reasoning``.
    """
    config = PaprikaConfigHelper.create_config(env_name)
    config['belief_config']['style'] = 'none'
    interaction = PaprikaInteraction(config={})

    with _muted_print():
        instance_id = await interaction.start_interaction(instance_id=None, scenario_id=None, **config)

    first_user_message = interaction.agent_conv.messages[0][1]
    attempts = 0
    game_history = []
    belief = f'This is the start of the game. The only available information right now are the game rules:\n{first_user_message}'
    max_attempts = interaction._instance_dict[instance_id]['max_turns']

    while attempts < max_attempts:

        attempts += 1

        action, action_reasoning = await take_action(belief, model_name)

        message = [
            {"role": "user", "content": "Output the next action."},
            {"role": "assistant", "content": f"<action>{action}</action>"}
        ]
        done, response, score, additional_data = await interaction.generate_response(instance_id=instance_id, messages=message)

        belief, belief_reasoning = await update_belief(belief, action, response, model_name)

        game_history.append({
            "model": model_name,
            "game_id": str(game_id),
            "env": env_name,
            "attempt": attempts,
            "guess": action,
            "response": response,
            "score": score,
            "done": done,
            "data": additional_data,
            "belief": belief,
            "action_reasoning": action_reasoning,
            "belief_reasoning": belief_reasoning,
        })

        # Match the success marker case-insensitively, as the metrics code does.
        # NOTE(review): `done` is assumed to be the environment's own
        # "stop this episode" flag — confirm against
        # PaprikaInteraction.generate_response.
        if done or "goal reached" in str(response).lower():
            break

    # One dot per finished game as a lightweight progress indicator.
    print('.', end='', flush=True)

    return game_history

async def run_multiple_iterations_multiple_games(
        num_games: int,
        list_envs,
        models,
        logs_file='./logs/paprika_frontier.jsonl',
    ):
    """Run ``num_games`` games for every (model, environment) pair and log them.

    All games are scheduled concurrently. Every turn of every completed game
    is appended to ``logs_file`` as one JSON object per line.

    Args:
        num_games: Number of games per (model, environment) pair; game ids
            are ``0 .. num_games - 1``.
        list_envs: Iterable of environment names.
        models: Iterable of model identifiers.
        logs_file: JSONL file to append to; its directory is created if needed.

    Raises:
        The first exception (in scheduling order) raised by any game. It is
        re-raised only after the games that did complete have been written.

    Note:
        The file is opened in append mode and game ids restart at 0 on every
        call, so repeated runs into the same file produce rows that share
        (model, game_id, env) with earlier runs.
    """
    import json
    import os

    # Create the log directory up front so a missing folder cannot discard
    # the results after all the LLM calls have already been made.
    log_dir = os.path.dirname(logs_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    tasks = [
        run_one_iteration_with_belief_llm(env_name, model, game_id)
        for model in models
        for env_name in list_envs
        for game_id in range(num_games)
    ]

    # return_exceptions=True: one failing game must not throw away the others.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    failures = [outcome for outcome in results if isinstance(outcome, BaseException)]

    # Flatten results and write to file
    with open(logs_file, "a") as f:
        for game_history in results:
            if isinstance(game_history, BaseException):
                continue
            for entry in game_history:
                f.write(json.dumps(entry) + "\n")

    if failures:
        print(f'\n{len(failures)} of {len(results)} games failed; completed games were still logged.')
        raise failures[0]

In [26]:
await run_multiple_iterations_multiple_games(
    num_games=10,
    list_envs=['mastermind'],
    models=['openai/gpt-oss-120b'],
)

..........

# 5. Metrics

In [39]:
import pandas as pd
import json

logs_file = './logs/paprika_frontier.jsonl'

# Every line of the log is one JSON record; collect them and build the frame once.
data = []
with open(logs_file, 'r') as f:
    for line in f:
        data.append(json.loads(line))

df = pd.DataFrame(data)

In [40]:
# List the columns of the loaded log frame.
print(df.columns)

Index(['model', 'game_id', 'env', 'attempt', 'guess', 'response', 'score',
       'done', 'data', 'belief', 'action_reasoning', 'belief_reasoning'],
      dtype='object')


In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

def summarize_game_outcomes(df):
    """Collapse per-turn log rows into one win/loss row per game.

    A game counts as won when any of its responses contains "goal reached"
    (case-insensitive). Missing or non-string responses count as "not reached".

    Args:
        df: Per-turn frame with at least the columns ``model``, ``game_id``,
            ``env`` and ``response``. It is not modified.

    Returns:
        A frame with columns ``model``, ``game_id``, ``env`` and a boolean
        ``won``, one row per (model, game_id, env) group.
    """
    # Work on a derived Series so the caller's 'response' column keeps its
    # original casing; re-running the cell then sees the same input.
    reached = df['response'].str.lower().str.contains('goal reached', regex=False, na=False)

    per_turn = df[['model', 'game_id', 'env']].assign(won=reached)
    grouped = per_turn.groupby(['model', 'game_id', 'env'])['won'].any().reset_index()
    return grouped

def plot_win_rates(summary_df):
    """Plot per-model success rates as bars, one subplot per environment.

    Args:
        summary_df: Frame with columns ``env``, ``model`` and a boolean
            ``won``, one row per game (as returned by
            ``summarize_game_outcomes``).

    Returns:
        A plotly figure with a single row of subplots sharing the y axis,
        which is fixed to 0-100 %.
    """
    win_rates = summary_df.groupby(['env', 'model'])['won'].mean().reset_index()
    win_rates['success_rate'] = win_rates['won'] * 100

    envs = win_rates['env'].unique()
    models = win_rates['model'].unique()

    colors = px.colors.qualitative.Dark24
    color_map = {model: colors[i % len(colors)] for i, model in enumerate(models)}

    # Create subplots - one for each environment (in a row)
    fig = make_subplots(
        rows=1, cols=len(envs),
        subplot_titles=[f"Env: {env}" for env in envs],
        shared_yaxes=True
    )

    # Models that already own a legend entry. Tying the entry to the first
    # subplot would drop any model that has no games in that environment.
    models_in_legend = set()

    for i, env in enumerate(envs, start=1):
        for model in models:
            subset = win_rates[(win_rates['env'] == env) & (win_rates['model'] == model)]
            if subset.empty:
                continue

            fig.add_trace(
                go.Bar(
                    x=[model],
                    y=subset['success_rate'],
                    name=model,
                    marker_color=color_map[model],
                    width=0.3,  # Thinner bars
                    legendgroup=model,  # one legend click toggles the model everywhere
                    showlegend=model not in models_in_legend,
                    # plotly only substitutes values written as %{...}; the
                    # doubled braces are f-string escapes for literal { and }.
                    hovertemplate=(
                        f"{env}<br>"
                        f"Model: {model}<br>"
                        f"Success Rate: %{{y:.2f}}%"
                    ),
                ),
                row=1, col=i
            )
            models_in_legend.add(model)

    # Update layout with horizontal grid lines and y-axis config
    for i in range(1, len(envs) + 1):
        fig.update_yaxes(
            range=[0, 100],
            showgrid=True,
            gridcolor='lightgray',
            row=1,
            col=i
        )

    fig.update_layout(
        yaxis=dict(title='Avg. Success Rate (%)'),
        template='simple_white',
        font=dict(family='Computer Modern, serif', size=16),
        barmode='group',
        showlegend=True,
        legend=dict(
            title='Model',
            bgcolor='rgba(255,255,255,0.9)',
            bordercolor='black',
            borderwidth=1,
            x=1.0,
            y=1.0,
            xanchor='right',
            yanchor='top'
        )
    )

    return fig


In [46]:
# Reduce the per-turn log to one win/loss row per game, then render the win-rate chart.
summary_df = summarize_game_outcomes(df)
fig = plot_win_rates(summary_df)
fig.show()