# LLM vs Greedy playground

Run end-to-end games between the LLM agent (via OpenRouter) and a baseline Greedy agent.

Requirements:
- Set `OPENROUTER_API_KEY` in your environment.
- Network access must be enabled, because the LLM agent is reached via OpenRouter.

You can adjust seeds, budgets, and model names as needed.

In [2]:
import os
from azul_engine import GameEngine, LLMAgent, GreedyFillAgent, play_game


# Fail fast: verify the API key before any agent is constructed. An explicit
# raise is used instead of `assert` because assertions are removed when Python
# runs with -O, which would silently skip the check.
if not os.getenv("OPENROUTER_API_KEY"):
    raise RuntimeError("Set OPENROUTER_API_KEY before running")

# Configure agents
# Two LLM agents that differ only in model name; both are given the same
# provider_priority tuple.
llm_agent = LLMAgent(model="openai/gpt-oss-120b", provider_priority=("fireworks", "together", "novita/fp4"))
llm_agent2 = LLMAgent(model="openai/gpt-oss-20b", provider_priority=("fireworks", "together", "novita/fp4"))
# Baseline agent that does not take a model argument.
greedy_agent = GreedyFillAgent()

In [3]:
def play_llm_vs_greedy(seed: int = 0, agents=None):
    """Play one complete game and return the terminal state.

    A fresh ``GameEngine`` is created with ``seed`` and reset. Until the state
    reports ``is_terminal()``, the agent at index ``state.current_player`` is
    asked for an action via ``select_action(state)`` and the engine is advanced
    with ``engine.step(action)``. After every move the new state, the players'
    scores, the mover (index and agent) and the chosen action are printed.

    Args:
        seed: Seed passed to ``GameEngine``.
        agents: Optional sequence of agents indexed by player number. When
            omitted, the notebook's existing pairing ``[llm_agent, llm_agent2]``
            is used. NOTE: despite the function name, that default pits two
            LLM agents against each other; pass ``[llm_agent, greedy_agent]``
            to play against the greedy baseline.

    Returns:
        The state returned by the last ``engine.step`` call, or by
        ``engine.reset()`` if that state is already terminal.
    """
    if agents is None:
        # Keep the pairing this notebook has always used as the default so
        # existing calls behave exactly as before.
        agents = [llm_agent, llm_agent2]

    engine = GameEngine(seed=seed)
    state = engine.reset()
    while not state.is_terminal():
        # Capture the mover before stepping: the state returned by step() is a
        # new object whose current_player may differ.
        current = state.current_player
        agent = agents[current]
        action = agent.select_action(state)
        state = engine.step(action)
        # Per-move trace: resulting state, scores, who moved, and the move.
        print(state)
        print([p.score for p in state.players])
        print(current, agent)
        print(action)
    return state

# Play one game with seed 0 and collect each player's score from the
# returned state.
state = play_llm_vs_greedy(seed=0)
scores = [player.score for player in state.players]
# Bare expression on the last line so the notebook displays the score list.
scores

GameState(players=[PlayerBoard(pattern_lines=[[], [<TileColor.RED: 'red'>, <TileColor.RED: 'red'>], [], [], []], wall=[[False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False]], floor_line=[], has_first_player_token=False, score=0), PlayerBoard(pattern_lines=[[], [], [], [], []], wall=[[False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False], [False, False, False, False, False]], floor_line=[], has_first_player_token=False, score=0)], current_player=1, phase=<GamePhase.DRAFTING: 'drafting'>, supply=Supply(bag=[<TileColor.YELLOW: 'yellow'>, <TileColor.BLUE: 'blue'>, <TileColor.BLUE: 'blue'>, <TileColor.BLUE: 'blue'>, <TileColor.RED: 'red'>, <TileColor.BLUE: 'blue'>, <TileColor.BLUE: 'blue'>, <TileColor.WHITE: 'white'>, <TileColor.WHITE: 'white'>, <TileColor.RED: 'red'>, <Ti

[22, 9]

In [7]:
# Inspect the last LLM reasoning/raw output after a game.
# The bare tuple expression is displayed by the notebook. In the recorded run
# last_reasoning was None and last_raw was a JSON string carrying "action_id"
# and "rationale" keys.
llm_agent.last_reasoning, llm_agent.last_raw


(None,
 '{\n  "action_id": 0,\n  "rationale": "Taking the single red tile adds only one more penalty (-2) to the floor, minimizing point loss and removes the only red from the center, forcing the opponent to take the two whites."\n}')

In [4]:
# Display the last_error attribute of llm_agent. The recorded run shows no
# output for this cell — presumably the value was None; confirm against
# LLMAgent.
llm_agent.last_error

In [None]:
from evals.arena import run_arena
import os 
# Run the arena over four models and write results under "runs_final".
# The recorded progress bar reports 96 games in total, which is consistent
# with 16 games for each of the 6 possible model pairings.
run_arena(
    [
        "openai/gpt-oss-120b",
        "google/gemini-3-flash-preview",
        "anthropic/claude-haiku-4.5",
        "openai/gpt-5-mini",
    ],
    games_per_pair=16,
    parallel=24,
    out_dir="runs_final",
    # Presumably one entry per model, in the same order as the list above,
    # with None meaning "no provider preference" — confirm against run_arena.
    providers=[["fireworks", "together"], None, None, None],
    progress=True,
    swap_sides=True,
)



games:   0%|                                                                                                                                               | 0/96 [00:00<?, ?it/s]
turns: 0it [00:06, ?it/s, g0 google/gemini-3-flash... vs openai/gpt-5-mini | 0-0][A
turns: 1it [00:06,  6.33s/it, g0 google/gemini-3-flash... vs openai/gpt-5-mini | 0-0][A
turns: 1it [00:06,  6.33s/it, g0 google/gemini-3-flash... vs openai/gpt-oss-120b | 0-0][A
turns: 2it [00:06,  2.96s/it, g0 google/gemini-3-flash... vs openai/gpt-oss-120b | 0-0][A
turns: 2it [00:07,  2.96s/it, g0 google/gemini-3-flash... vs openai/gpt-oss-120b | 0-0][A
turns: 3it [00:07,  1.86s/it, g0 google/gemini-3-flash... vs openai/gpt-oss-120b | 0-0][A
turns: 3it [00:10,  1.86s/it, g0 openai/gpt-oss-120b vs google/gemini-3-flash... | 0-0][A
turns: 4it [00:10,  2.13s/it, g0 openai/gpt-oss-120b vs google/gemini-3-flash... | 0-0][A
turns: 4it [00:14,  2.13s/it, g0 google/gemini-3-flash... vs openai/gpt-oss-120b | 0-0][A
turns: 5i

In [1]:
#!uv pip install tqdm
from tqdm import tqdm

In [6]:
  # Summarize the results stored under "runs_test" (presumably a run output
  # directory — confirm against analysis.summarize) and keep both returned
  # values for later inspection.
  # NOTE(review): the arena cell in this notebook writes to
  # out_dir="runs_final", so this cell reads a different location — confirm
  # that is intended.
  from analysis import summarize
  results, matchups = summarize("runs_test")



model_a                        model_b              total  wins_a  wins_b  draws  wr_a  wr_b
google/gemini-3-flash-preview  openai/gpt-oss-120b  6      4       2       0      0.67  0.33
google/gemini-3-flash-preview  openai/gpt-oss-20b   6      6       0       0      1.00  0.00
openai/gpt-oss-120b            openai/gpt-oss-20b   6      6       0       0      1.00  0.00
