# Among Us RL Pipeline — Real Games

Run **5 real games** with the vLLM-served Qwen model, then apply the Actor, Critic, and Reward modules **post-hoc** on the game logs to produce RL training metrics.

**Pipeline:**
1. **Setup** — paths, imports, vLLM health check
2. **Run Games** — 5 games via `AmongUs.run_game()`
3. **RL Analysis** — replay `activity_log` through Actor/Critic/Reward
4. **Visualize** — Plotly charts for beliefs, values, and rewards


In [1]:
import os, sys, json, re, asyncio, datetime, random, time, socket
import numpy as np
import pandas as pd
import aiohttp
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from dotenv import load_dotenv
load_dotenv()

# ── Project paths ──────────────────────────────────────────────
# Both paths are derived from the current working directory, so they depend on
# where the notebook kernel was started (presumably the directory that holds
# the `amongagents` package — confirm).
ROOT_PATH = os.path.abspath(os.path.join(os.getcwd(), ".."))  # parent of the cwd
AMONG_AGENTS_PATH = os.getcwd()

# Prepend both directories to the import path. ROOT_PATH is inserted last and
# therefore ends up first on sys.path.
sys.path.insert(0, AMONG_AGENTS_PATH)
sys.path.insert(0, ROOT_PATH)

from amongagents.envs.game import AmongUs
from amongagents.envs.configs.game_config import FIVE_MEMBER_GAME
from amongagents.agent.actor import ActorModule
from amongagents.agent.critic import CriticModule
from amongagents.agent.rewards import RewardEngine

print("All imports successful.")


All imports successful.


In [2]:
# ── GPU selection ─────────────────────────────────────────────
# Set in this process's environment; the vLLM subprocess started later is
# launched without an explicit env and therefore inherits this value.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# ── Local model path ──────────────────────────────────────────
# Filesystem path handed to vLLM via --model.
LOCAL_MODEL_PATH = "/data/kmirakho/verl/models/Qwen3-4B-Instruct-2507"

# ── vLLM server settings ─────────────────────────────────────
VLLM_PORT = 8234
# OpenAI-compatible chat-completions endpoint used by the patched LLMAgent.
VLLM_API_URL = f"http://localhost:{VLLM_PORT}/v1/chat/completions"
# The model id sent in requests is the model path itself (the health check
# output lists the served model under this same path).
QWEN_MODEL = LOCAL_MODEL_PATH

# ── Game settings ─────────────────────────────────────────────
NUM_GAMES = 5
GAME_CONFIG = FIVE_MEMBER_GAME
# Passed as the semaphore size to run_games_with_state, i.e. the maximum
# number of games allowed to run at the same time.
RATE_LIMIT = 20

# Both roles are played by LLM agents, each drawing from a single-model list.
AGENT_CONFIG = {
    "Impostor": "LLM",
    "Crewmate": "LLM",
    "CREWMATE_LLM_CHOICES": [QWEN_MODEL],
    "IMPOSTOR_LLM_CHOICES": [QWEN_MODEL],
}

print(f"Model:  {LOCAL_MODEL_PATH}")
print(f"Games:  {NUM_GAMES}")
print(f"Config: {GAME_CONFIG}")


Model:  /data/kmirakho/verl/models/Qwen3-4B-Instruct-2507
Games:  5
Config: {'num_players': 5, 'num_impostors': 1, 'num_common_tasks': 1, 'num_short_tasks': 1, 'num_long_tasks': 0, 'discussion_rounds': 3, 'max_num_buttons': 2, 'kill_cooldown': 3, 'max_timesteps': 20}


In [3]:
# ── Launch vLLM server as background process ─────────────────
# If already running on this port, skip. Otherwise start it.

import subprocess, threading

def is_port_in_use(port: int) -> bool:
    """Return True when a TCP connection to ``localhost:port`` succeeds.

    Uses ``connect_ex`` so a refused connection is reported as a non-zero
    status instead of raising.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    with probe:
        status = probe.connect_ex(("localhost", port))
    return status == 0

# Handle of the server process started by this cell; stays None when an
# already-running server on VLLM_PORT is reused.
vllm_process = None
# Every line the server writes to stdout/stderr, kept so the tail can be
# printed if the server crashes or never becomes ready.
vllm_log_lines = []

if is_port_in_use(VLLM_PORT):
    print(f"Port {VLLM_PORT} already in use — assuming vLLM server is running.")
else:
    vllm_cmd = [
        sys.executable, "-m", "vllm.entrypoints.openai.api_server",
        "--model", LOCAL_MODEL_PATH,
        "--host", "0.0.0.0",
        "--port", str(VLLM_PORT),
        "--dtype", "auto",
        "--max-model-len", "16384",
        "--gpu-memory-utilization", "0.55",
    ]
    print(f"Command: {' '.join(vllm_cmd)}\n")
    print(f"Starting vLLM server for {LOCAL_MODEL_PATH} on port {VLLM_PORT}...")

    # start_new_session=True makes the child call setsid() itself. It replaces
    # preexec_fn=os.setsid, which the subprocess docs flag as unsafe once the
    # parent has threads (this cell starts a reader thread right below).
    vllm_process = subprocess.Popen(
        vllm_cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # merge stderr into the single captured stream
        start_new_session=True,
        text=True,
        bufsize=1,  # line-buffered so log lines show up as they are written
    )

    def _read_vllm_output(proc, log_list):
        """Copy the server's output line by line into log_list and echo it."""
        for line in iter(proc.stdout.readline, ""):
            log_list.append(line)
            print(f"  [vLLM] {line}", end="", flush=True)
        proc.stdout.close()

    # Daemon thread: drains the pipe in the background and never blocks
    # interpreter shutdown.
    log_thread = threading.Thread(
        target=_read_vllm_output, args=(vllm_process, vllm_log_lines), daemon=True
    )
    log_thread.start()

    print(f"vLLM server launched (PID: {vllm_process.pid}). Waiting for it to be ready...\n")

    # Poll until ready (CUDA graph compilation can take 5-10 min on first launch).
    # "Ready" here only means the TCP port accepts connections; the HTTP API
    # itself is exercised by the health check later in this cell.
    server_ready = False
    for i in range(300):  # up to 10 minutes
        time.sleep(2)
        if vllm_process.poll() is not None:
            # The process ended before the port opened: show the log tail and abort.
            exit_code = vllm_process.returncode
            print(f"\n*** vLLM process exited with code {exit_code} ***")
            print("Last output lines:")
            for line in vllm_log_lines[-20:]:
                print(f"  {line}", end="")
            raise RuntimeError(
                f"vLLM server crashed (exit code {exit_code}). See output above."
            )
        if is_port_in_use(VLLM_PORT):
            server_ready = True
            print(f"\nvLLM server is ready on port {VLLM_PORT}!")
            break

    if not server_ready:
        # The process is left running; its PID is included in the error so it
        # can be inspected or stopped manually.
        print("\n*** Timed out waiting for vLLM server ***")
        print("Last output lines:")
        for line in vllm_log_lines[-30:]:
            print(f"  {line}", end="")
        raise RuntimeError(
            f"vLLM server did not start within 600s. Check GPU memory / model path.\n"
            f"PID: {vllm_process.pid}"
        )

# ── Monkey-patch LLMAgent to use local vLLM ─────────────────
from amongagents.agent.agent import LLMAgent

async def _local_send_request(self, messages):
    """Route all agent LLM calls through the local vLLM server.

    Posts an OpenAI-style chat-completions payload to ``VLLM_API_URL`` and
    returns the content of the first choice.  A non-200 status, a response
    without choices, or any exception raised by the request is retried with
    exponential backoff (0.5 s, 1 s, 2 s, ...), up to 10 attempts in total.

    Args:
        messages: Chat messages forwarded unchanged as the payload's
            ``messages`` field.

    Returns:
        The generated message content, or the placeholder ``"SPEAK: ..."``
        when every attempt failed.
    """
    max_attempts = 10
    payload = {
        "model": self.model,
        "messages": messages,
        "temperature": self.temperature,
        "max_tokens": 512,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "chat_template_kwargs": {"enable_thinking": False},
    }
    async with aiohttp.ClientSession() as session:
        for attempt in range(max_attempts):
            try:
                async with session.post(
                    VLLM_API_URL, headers={}, json=payload
                ) as response:
                    if response.status == 200:
                        data = await response.json()
                        if "choices" in data and data["choices"]:
                            return data["choices"][0]["message"]["content"]
                    print(f"  [vLLM] status {response.status}, retry {attempt+1}/{max_attempts}")
            except Exception as e:
                # Deliberately broad: any request failure is treated as
                # retryable so one bad call does not abort a running game.
                print(f"  [vLLM] error: {e}, retry {attempt+1}/{max_attempts}")
            # Back off only when another attempt follows; sleeping after the
            # last failure would just delay the fallback by 256 s.
            if attempt < max_attempts - 1:
                await asyncio.sleep(0.5 * 2 ** attempt)
    return "SPEAK: ..."

# Assigned on the class, so every LLMAgent instance that does not define its
# own send_request attribute now calls the local vLLM route.
LLMAgent.send_request = _local_send_request

# ── Quick health check ────────────────────────────────────────
async def check_vllm_health():
    """Fetch ``/v1/models`` from the local server, print and return the model ids."""
    models_url = f"http://localhost:{VLLM_PORT}/v1/models"
    async with aiohttp.ClientSession() as session, session.get(models_url) as resp:
        listing = await resp.json()
    # A response without a "data" entry yields an empty list instead of an error.
    models = [entry["id"] for entry in listing.get("data", [])]
    print(f"Serving models: {models}")
    return models

# Run the check now: check_vllm_health has no error handling of its own, so a
# failing request raises here and stops the cell before any game is started.
await check_vllm_health()
print(f"\nLLMAgent patched → {VLLM_API_URL}")


Port 8234 already in use — assuming vLLM server is running.
Serving models: ['/data/kmirakho/verl/models/Qwen3-4B-Instruct-2507']

LLMAgent patched → http://localhost:8234/v1/chat/completions


---
## Run 5 Games

Each run returns the winner code together with the full `AmongUs` object, so we can extract `activity_log`, player state, and task completion for the RL analysis.


In [None]:
# ── Experiment directory (required by game engine) ─────────────
# All experiment output lives under <ROOT_PATH>/expt-logs.
LOGS_PATH = os.path.join(ROOT_PATH, "expt-logs")
os.makedirs(LOGS_PATH, exist_ok=True)

DATE = datetime.datetime.now().strftime('%Y-%m-%d')
# ROOT_PATH was already put on sys.path in the first cell; it is inserted
# again here so that `utils` resolves even if this cell is run on its own.
sys.path.insert(0, ROOT_PATH)
from utils import setup_experiment

# Arguments recorded with the experiment via DEFAULT_ARGS below.
# NOTE(review): the games themselves are constructed in run_games_with_state
# with UI=None, while this dict records "UI": False — confirm that the
# difference is intentional.
GAME_ARGS = {
    "game_config": GAME_CONFIG,
    "include_human": False,
    "test": False,
    "personality": False,
    "agent_config": AGENT_CONFIG,
    "UI": False,
    "Streamlit": False,
}

try:
    COMMIT_HASH = "unknown"
    EXPERIMENT_NAME = setup_experiment(
        experiment_name="rl_pipeline_eval",
        LOGS_PATH=LOGS_PATH,
        DATE=DATE,
        COMMIT_HASH=COMMIT_HASH,
        DEFAULT_ARGS=GAME_ARGS,
    )
    print(f"Experiment: {EXPERIMENT_NAME}")
except Exception as e:
    # Best-effort fallback: create a dated directory and export the two
    # environment variables by hand (presumably the ones setup_experiment
    # sets on success — confirm). EXPERIMENT_NAME stays undefined on this path.
    print(f"Warning: setup_experiment failed ({e}), creating fallback dir")
    exp_path = os.path.join(LOGS_PATH, f"{DATE}_rl_pipeline")
    os.makedirs(exp_path, exist_ok=True)
    os.environ["EXPERIMENT_PATH"] = exp_path
    os.environ["STREAMLIT"] = "False"

# ── Run 5 games IN PARALLEL (retain full game objects) ────────
async def run_games_with_state(num_games: int, rate_limit: int = 20):
    """Play ``num_games`` games concurrently and keep each game object.

    At most ``rate_limit`` games run at the same time.  Games are numbered
    from 1, and the returned list follows that numbering.

    Returns:
        A list of ``(winner, game)`` tuples, one per game.
    """
    gate = asyncio.Semaphore(rate_limit)

    async def play_one(game_index: int):
        # The semaphore slot is held for the whole duration of the game.
        async with gate:
            print(f"  Starting game {game_index}...")
            game = AmongUs(
                game_config=GAME_CONFIG,
                include_human=False,
                test=False,
                personality=False,
                agent_config=AGENT_CONFIG,
                UI=None,
                game_index=game_index,
            )
            winner = await game.run_game()
            print(f"  Game {game_index} finished — winner code: {winner}")
            return (winner, game)

    # gather() returns results in the order the coroutines were passed in.
    outcomes = await asyncio.gather(
        *(play_one(index) for index in range(1, num_games + 1))
    )
    return list(outcomes)

# Start all games and wait until every one of them has finished.
print(f"Running {NUM_GAMES} games in parallel...\n")
game_results = await run_games_with_state(NUM_GAMES, rate_limit=RATE_LIMIT)
print(f"\nAll {NUM_GAMES} games completed.")

# Human-readable labels for the winner codes returned by run_game().
# NOTE(review): the RL-analysis cell redefines WINNER_MAP with different
# labels (e.g. code 4 is "Impostor (timeout)" here but "Impostor (sabotage)"
# there) — confirm against the game engine which meaning is correct.
WINNER_MAP = {1: 'Impostor (outnumber)', 2: 'Crewmate (eject)',
              3: 'Crewmate (tasks)', 4: 'Impostor (timeout)'}
# game_results is in game order, so i matches the game_index used above.
for i, (w, g) in enumerate(game_results, 1):
    print(f"  Game {i}: {WINNER_MAP.get(w, w)} — {g.timestep} timesteps")


Experiment: 2026-02-18_exp_0
Running 5 games in parallel...

  Starting game 1...
0 Initializing player Player 1: red with identity Crewmate and LLM choice /data/kmirakho/verl/models/Qwen3-4B-Instruct-2507
1 Initializing player Player 2: black with identity Crewmate and LLM choice /data/kmirakho/verl/models/Qwen3-4B-Instruct-2507
2 Initializing player Player 3: lime with identity Crewmate and LLM choice /data/kmirakho/verl/models/Qwen3-4B-Instruct-2507
3 Initializing player Player 4: green with identity Impostor and LLM choice /data/kmirakho/verl/models/Qwen3-4B-Instruct-2507
4 Initializing player Player 5: blue with identity Crewmate and LLM choice /data/kmirakho/verl/models/Qwen3-4B-Instruct-2507
  Starting game 2...
0 Initializing player Player 1: cyan with identity Crewmate and LLM choice /data/kmirakho/verl/models/Qwen3-4B-Instruct-2507
1 Initializing player Player 2: yellow with identity Crewmate and LLM choice /data/kmirakho/verl/models/Qwen3-4B-Instruct-2507
2 Initializing play

---
## RL Analysis — Actor / Critic / Reward

Replay each game's `activity_log` through the RL modules to extract:
- **Actor**: Suspicion beliefs over time
- **Critic**: V(Crewmate) and V(Impostor) state values per timestep
- **Reward**: Per-action scalar rewards for each player


In [None]:
# ── RL Analysis (FIXED: uses embedded state snapshots) ────────
from amongagents.agent.actor import ActorModule
from amongagents.agent.rewards import RewardEngine, CriticModule
from collections import defaultdict
import pandas as pd

# Labels for the winner codes; codes 2 and 3 are treated as Crewmate wins
# further down in this notebook.
# NOTE(review): this overrides the WINNER_MAP from the game-running cell,
# which labels the same codes differently (1 "outnumber", 2 "eject",
# 4 "timeout") — confirm against the game engine which set is correct.
WINNER_MAP = {1: "Impostor (kills)", 2: "Crewmate (voting)",
              3: "Crewmate (tasks)", 4: "Impostor (sabotage)"}

def normalize_action(raw):
    """Map a raw action description to a canonical UPPER_SNAKE action name.

    The input is stringified, upper-cased and stripped.  The first known
    prefix it starts with wins (spaces become underscores); otherwise the
    first whitespace-separated word is returned, or "UNKNOWN" for empty text.
    """
    # Checked in this order; "COMPLETE FAKE TASK" must precede "COMPLETE TASK".
    known_prefixes = (
        "COMPLETE FAKE TASK", "COMPLETE TASK", "MOVE",
        "CALL MEETING", "KILL", "VENT", "SABOTAGE",
        "REPORT DEAD BODY", "VOTE", "SPEAK", "FIX SABOTAGE",
        "VIEW MONITOR", "SKIP",
    )
    text = str(raw).upper().strip()
    matched = next((p for p in known_prefixes if text.startswith(p)), None)
    if matched is not None:
        return matched.replace(" ", "_")
    if not text:
        return "UNKNOWN"
    return text.split()[0].replace(" ", "_")


def build_actor_observations(activity_log):
    """Turn the activity log into per-timestep observation dicts for ActorModule.

    Each record contributes one dict with the acting player's name
    ("subject"), the action's ``name`` attribute (or ``str(action)`` when it
    has none) and a location.  The location comes from the record's state
    snapshot when present, else from the player's ``location`` attribute,
    else "Unknown".

    Returns:
        A ``defaultdict(list)`` keyed by timestep, in log order per timestep.
    """
    grouped = defaultdict(list)
    for entry in activity_log:
        step = entry["timestep"]
        actor = entry["player"]
        act = entry["action"]
        snapshot = entry.get("state", {})
        observation = {
            "subject": actor.name,
            "action": getattr(act, "name", str(act)),
            "location": snapshot.get(
                "player_location", getattr(actor, "location", "Unknown")
            ),
        }
        grouped[step].append(observation)
    return grouped


# ── Analyze all games ─────────────────────────────────────────
# Replays every finished game's activity_log through the Critic, Actor and
# Reward modules and collects two flat lists of dicts:
#   all_rows             – one row per logged action (reward, critic values, state)
#   all_belief_snapshots – one row per (acting player, belief target) pair


class _Agent:
    """Minimal stand-in handed to RewardEngine.calculate_step_reward.

    Carries only role, team and alive.  Defined once here instead of being
    re-created for every logged action.
    """

    def __init__(self, role, alive, team):
        self.role = role
        self.team = team
        self.alive = alive


critic = CriticModule()
reward_engine = RewardEngine()

all_rows = []
all_belief_snapshots = []

for game_idx, (winner_code, game) in enumerate(game_results, 1):
    print(f"\n═══ Analyzing Game {game_idx} ═══")

    # One ActorModule per player, keyed by player name
    actors = {
        p.name: ActorModule(player=p, all_players=game.players)
        for p in game.players
    }

    obs_by_t = build_actor_observations(game.activity_log)

    # Group the log by timestep once (log order is kept inside each group)
    # instead of re-scanning the whole log for every timestep.
    records_by_t = defaultdict(list)
    for r in game.activity_log:
        records_by_t[r["timestep"]].append(r)
    timesteps = sorted(records_by_t)

    # NOTE(review): nothing in this cell ever appends to ejected_roles, so
    # every state carries an empty list — confirm whether ejections should
    # be reconstructed from the log.
    ejected_roles = []
    prev_state = None

    # Initialised up front so the summary line below is well defined even
    # when a game produced an empty activity log.
    actions_at_t = []
    rows_before = len(all_rows)

    for t in timesteps:
        actions_at_t = records_by_t[t]

        # ── Use EMBEDDED snapshot from the first record at this timestep ──
        # Each default is the value computed from the live (post-game)
        # objects and only applies when the snapshot lacks the key.
        snap = actions_at_t[0].get("state", {})
        living_crew = snap.get("living_crew",
                     sum(1 for p in game.players
                         if p.identity == "Crewmate" and p.is_alive))
        living_imps = snap.get("living_imps",
                     sum(1 for p in game.players
                         if p.identity == "Impostor" and p.is_alive))
        task_pct = snap.get("task_pct",
                   game.task_assignment.check_task_completion() * 100.0)
        sabotage = snap.get("sabotage_active",
                   bool(getattr(game, 'active_sabotages', {})))

        curr_state = {
            "living_crewmates": living_crew,
            "living_impostors": living_imps,
            "task_completion_pct": task_pct,
            "sabotage_active": sabotage,
            "ejected_roles": list(ejected_roles),
            "winner": None,
        }

        # On final timestep, set winner (codes 2 and 3 are Crewmate wins)
        if t == timesteps[-1]:
            winner_team = 'Crewmate' if winner_code in (2, 3) else 'Impostor'
            curr_state["winner"] = winner_team

        # The first timestep has no predecessor: compare the state with itself.
        if prev_state is None:
            prev_state = curr_state.copy()

        # Critic evaluation (once per timestep, shared by all rows below)
        v_crew = critic.evaluate_state_value(curr_state, "Crewmate")
        v_imp = critic.evaluate_state_value(curr_state, "Impostor")

        # Update actor beliefs: every actor receives all observations of
        # this timestep before any action row is recorded.
        observations = obs_by_t.get(t, [])
        for actor in actors.values():
            actor.update_beliefs(observations)

        # Process each action
        for record in actions_at_t:
            player = record["player"]
            raw_action = getattr(record["action"], "name", str(record["action"]))
            norm_action = normalize_action(raw_action)

            # ── Use snapshot for agent alive & location ──
            rec_snap = record.get("state", {})

            agent_proxy = _Agent(
                role=player.identity,
                alive=rec_snap.get("player_alive", player.is_alive),
                team=player.identity,
            )

            # Determine witnesses.
            # NOTE(review): game.map and p.is_alive are read from the finished
            # game object, so this presumably reflects end-of-game positions
            # rather than positions at this timestep — confirm.
            witnesses = []
            if hasattr(game, "map") and "player_location" in rec_snap:
                try:
                    room_players = game.map.get_players_in_room(
                        rec_snap["player_location"])
                    witnesses = [p.name for p in room_players
                                 if p != player and p.is_alive]
                except Exception:
                    # Best effort: on any lookup failure the action is
                    # scored as if nobody witnessed it.
                    pass

            reward = reward_engine.calculate_step_reward(
                agent_proxy,
                prev_state,
                curr_state,
                action_log={"action": norm_action, "witnesses": witnesses},
            )

            # Get beliefs: Crewmates expose their suspicion matrix,
            # Impostors their second-order beliefs.
            actor = actors.get(player.name)
            if actor:
                if player.identity == 'Crewmate':
                    beliefs = dict(actor.suspicion_matrix)
                else:
                    beliefs = dict(actor.second_order_beliefs)
            else:
                beliefs = {}

            all_rows.append({
                "game": game_idx,
                "timestep": t,
                "player": player.name,
                "role": player.identity,
                "action": raw_action,
                "norm_action": norm_action,
                "reward": reward,
                "v_crew": v_crew,
                "v_imp": v_imp,
                "location": rec_snap.get("player_location",
                            getattr(player, "location", "?")),
                "alive": rec_snap.get("player_alive", player.is_alive),
                "living_crew": living_crew,
                "living_imps": living_imps,
                "task_pct": task_pct,
            })

            # Belief snapshots for heatmap
            for target, suspicion in beliefs.items():
                all_belief_snapshots.append({
                    "game": game_idx,
                    "timestep": t,
                    "observer": player.name,
                    "observer_role": player.identity,
                    "target": target,
                    "suspicion": suspicion,
                })

        prev_state = curr_state.copy()

    # Rows appended since this game started == rows belonging to this game.
    print(f"  {len(actions_at_t)} actions at final timestep, "
          f"{len(all_rows) - rows_before} total rows")

# Flatten the collected rows into DataFrames used by all later cells.
beliefs_df = pd.DataFrame(all_belief_snapshots)
df = pd.DataFrame(all_rows)

print("\n═══ Analysis Complete ═══")
print(f"Total rows: {len(df)}  Belief snapshots: {len(beliefs_df)}")
print("\nReward summary by role:")
for role in ('Crewmate', 'Impostor'):
    rdf = df[df['role'] == role]
    if rdf.empty:
        # Roles without any logged action are left out of the summary.
        continue
    role_rewards = rdf['reward']
    print(f"  {role}: Σr={role_rewards.sum():+.2f}  mean={role_rewards.mean():+.3f}  actions={len(rdf)}")

---
## Complete Game Rollout

Detailed per-timestep view of every action, enriched with **RL metrics** (reward, suspicion, critic values) from the analysis above.


In [None]:
# ── Complete Game Rollout with RL Metrics ──────────────────────
# Prepares everything the per-timestep printout needs for one selected game:
# identity/task lookups, the game's slice of the RL tables, and the activity
# log grouped by timestep.

INSPECT_GAME = 1  # Change this to inspect a different game (1-5)

winner_code, game = game_results[INSPECT_GAME - 1]

# Per-player lookups, all keyed by player name
identity_map = {}
impostor_names = set()
task_objects = {}
for p in game.players:
    identity_map[p.name] = p.identity
    if p.identity == 'Impostor':
        impostor_names.add(p.name)
    # Keep the task objects themselves so completion can be queried later
    has_tasks = hasattr(p, 'tasks') and len(p.tasks) > 0
    task_objects[p.name] = list(p.tasks) if has_tasks else []

# String form of each player's tasks, for the header
task_assignments = {}
for name, tasks in task_objects.items():
    task_assignments[name] = [str(t) for t in tasks]

# RL data restricted to the inspected game
game_df = df[df['game'] == INSPECT_GAME].copy()
if len(beliefs_df) > 0:
    game_beliefs = beliefs_df[beliefs_df['game'] == INSPECT_GAME].copy()
else:
    game_beliefs = pd.DataFrame()

# Header
print(f'Game: {INSPECT_GAME}')
print('Players:')
for name, identity in identity_map.items():
    tasks = task_assignments.get(name, [])
    tag = ' [IMP]' if identity == 'Impostor' else ''
    print(f'  {name:25s} {identity}{tag}   Tasks: {tasks}')
print(f'Total logged actions: {len(game.activity_log)}')
print(f'RL analysis rows: {len(game_df)}')
print('=' * 100)

# Activity log grouped by timestep, preserving log order inside each group
log_by_t = defaultdict(list)
for record in game.activity_log:
    log_by_t[record['timestep']].append(record)

timesteps = sorted(log_by_t)

# Print one section per timestep.  A timestep is shown as a MEETING when its
# first record is tagged phase == 'meeting' or when any of its actions
# contains SPEAK or VOTE; otherwise it is shown as a TASK phase.
for t in timesteps:
    records = log_by_t[t]
    phase = records[0].get('phase', 'task')

    # Detect meeting sub-phase
    actions_in_step = [str(r['action']) for r in records]
    has_speak = any('SPEAK' in a for a in actions_in_step)
    has_vote = any('VOTE' in a for a in actions_in_step)

    if phase == 'meeting' or has_speak or has_vote:
        phase_label = 'MEETING'
    else:
        phase_label = 'TASK'

    # Get RL metrics from DataFrame (state values are identical on every row
    # of a timestep, so the first row is used)
    t_df = game_df[game_df['timestep'] == t]
    if len(t_df) > 0:
        v_crew = t_df.iloc[0]['v_crew']
        v_imp = t_df.iloc[0]['v_imp']
        task_pct = t_df.iloc[0]['task_pct']
        living_crew = int(t_df.iloc[0]['living_crew'])
        living_imps = int(t_df.iloc[0]['living_imps'])
    else:
        v_crew = v_imp = task_pct = 0.0
        living_crew = living_imps = 0

    print(f"\n{'━' * 100}")
    print(f'  TIMESTEP {t}  [{phase_label} PHASE]  │  '
          f'V(Crew)={v_crew:.3f}  V(Imp)={v_imp:.3f}  │  '
          f'Alive: {living_crew}C/{living_imps}I  │  '
          f'Tasks: {task_pct:.0f}%')
    print(f"{'━' * 100}")

    if phase_label == 'MEETING':
        # Separate speak, vote, and other actions
        speak_records = [r for r in records if 'SPEAK' in str(r['action'])]
        vote_records = [r for r in records if 'VOTE' in str(r['action'])]
        other_records = [r for r in records
                         if r not in speak_records and r not in vote_records]

        for record in other_records:
            player = record['player']
            name = player.name
            identity = identity_map.get(name, '?')
            action = str(record['action'])
            tag = ' [IMP]' if identity == 'Impostor' else ''

            rl_row = t_df[t_df['player'] == name]
            reward_str = ''
            if len(rl_row) > 0:
                reward_str = f'  r={rl_row.iloc[0]["reward"]:+.2f}'
            print(f'    {name}{tag}:  {action}{reward_str}')

        if speak_records:
            # A round is sized by the number of distinct speakers in this
            # meeting, not by the total player count: players who do not
            # speak (e.g. dead ones) would otherwise misalign the rounds and
            # floor division would silently drop the trailing speeches.
            speakers = {r['player'].name for r in speak_records}
            round_size = max(1, len(speakers))
            # Ceiling division so a partial last round is still printed
            num_rounds = -(-len(speak_records) // round_size)

            for rnd in range(num_rounds):
                print(f"\n  >>> Discussion Round {rnd + 1} {'─' * 50}")
                start = rnd * round_size
                end = start + round_size
                for record in speak_records[start:end]:
                    player = record['player']
                    name = player.name
                    identity = identity_map.get(name, '?')
                    action = str(record['action'])
                    tag = ' [IMP]' if identity == 'Impostor' else ''

                    rl_row = t_df[t_df['player'] == name]
                    reward_str = ''
                    sus_str = ''
                    if len(rl_row) > 0:
                        reward_str = f'r={rl_row.iloc[0]["reward"]:+.2f}'

                    if identity == 'Crewmate' and len(game_beliefs) > 0:
                        sus_rows = game_beliefs[
                            (game_beliefs['timestep'] == t) &
                            (game_beliefs['observer'] == name) &
                            (game_beliefs['target'].isin(impostor_names))
                        ]
                        if len(sus_rows) > 0:
                            sus_val = sus_rows.iloc[0]['suspicion']
                            sus_str = f'sus(imp)={sus_val:.2f}'

                    metrics = '  '.join(filter(None, [reward_str, sus_str]))
                    metrics_display = f'  [{metrics}]' if metrics else ''

                    print(f'    {name}{tag}:{metrics_display}')
                    # Strip the leading "SPEAK:" / "SPEAK " marker (6 chars)
                    speech = action
                    if speech.startswith('SPEAK:'):
                        speech = speech[6:].strip()
                    elif speech.startswith('SPEAK '):
                        speech = speech[6:].strip()
                    print(f'      "{speech}"')

        if vote_records:
            print(f"\n  >>> Voting {'─' * 55}")
            vote_counts = {}
            skip_count = 0
            seen_voters = set()
            for record in vote_records:
                player = record['player']
                name = player.name
                # Only the first vote record of each player is counted
                if name in seen_voters:
                    continue
                seen_voters.add(name)
                identity = identity_map.get(name, '?')
                action = str(record['action'])
                tag = ' [IMP]' if identity == 'Impostor' else ''
                print(f'    {name}{tag}:  {action}')

                if 'SKIP' in action.upper():
                    skip_count += 1
                elif 'VOTE' in action.upper():
                    # Whatever remains after removing "VOTE" is the target
                    target = action.replace('VOTE', '').strip()
                    if target:
                        vote_counts[target] = vote_counts.get(target, 0) + 1

            print(f"\n  >>> Vote Result {'─' * 49}")
            if vote_counts:
                max_v = max(vote_counts.values())
                top_targets = [cand for cand, n_votes in vote_counts.items()
                               if n_votes == max_v]
                for target, count in sorted(vote_counts.items(), key=lambda x: x[1], reverse=True):
                    s = 's' if count != 1 else ''
                    print(f'    {target}: {count} vote{s}')
                if skip_count > 0:
                    s = 's' if skip_count != 1 else ''
                    print(f'    SKIP: {skip_count} vote{s}')
                print(f'    ' + '-' * 34)
                # Ejection needs a single top target with strictly more
                # votes than SKIP; this is reconstructed from the log and
                # presumably mirrors the engine's rule — confirm.
                if len(top_targets) == 1 and max_v > skip_count:
                    ejected = top_targets[0]
                    ident = identity_map.get(ejected, '?')
                    reveal = 'an Impostor' if ident == 'Impostor' else 'NOT an Impostor'
                    print(f'    Result: {ejected} was EJECTED ({max_v} votes). They were {reveal}.')
                elif skip_count >= max_v:
                    print(f'    Result: No one was ejected (SKIP won).')
                else:
                    print(f'    Result: TIE — no one was ejected.')
            elif skip_count > 0:
                print(f'    Result: No one was ejected (all {skip_count} votes were SKIP).')

    else:
        # TASK PHASE
        for record in records:
            player = record['player']
            name = player.name
            identity = identity_map.get(name, '?')
            action = str(record['action'])
            tag = ' [IMP]' if identity == 'Impostor' else ''

            # Use snapshot location instead of post-game location
            rec_snap = record.get('state', {})
            location = rec_snap.get('player_location', getattr(player, 'location', '?'))

            rl_row = t_df[t_df['player'] == name]
            reward_str = ''
            sus_str = ''
            if len(rl_row) > 0:
                r = rl_row.iloc[0]
                reward_str = f'r={r["reward"]:+.2f}'

            # Crewmates: suspicion of the impostor; Impostors: highest value
            # among their own recorded beliefs.
            if identity == 'Crewmate' and len(game_beliefs) > 0:
                sus_rows = game_beliefs[
                    (game_beliefs['timestep'] == t) &
                    (game_beliefs['observer'] == name) &
                    (game_beliefs['target'].isin(impostor_names))
                ]
                if len(sus_rows) > 0:
                    sus_val = sus_rows.iloc[0]['suspicion']
                    sus_str = f'sus(imp)={sus_val:.2f}'
            elif identity == 'Impostor' and len(game_beliefs) > 0:
                threat_rows = game_beliefs[
                    (game_beliefs['timestep'] == t) &
                    (game_beliefs['observer'] == name)
                ]
                if len(threat_rows) > 0:
                    max_threat = threat_rows['suspicion'].max()
                    sus_str = f'max_threat={max_threat:.2f}'

            metrics = '  '.join(filter(None, [reward_str, sus_str]))
            metrics_display = f'  [{metrics}]' if metrics else ''

            print(f"\n  {name}{tag}  @{location}{metrics_display}")
            print(f'    Action:  {action}')

        # Task status summary (using actual task objects).
        # NOTE(review): check_completion() is queried on the task objects of
        # the finished game, so this presumably prints the end-of-game status
        # after every timestep rather than the status at timestep t — confirm.
        print(f"\n  {'·' * 100}")
        print(f'  📋 Task Status after Timestep {t}:')
        for pname in identity_map:
            tasks = task_objects.get(pname, [])
            identity = identity_map[pname]
            tag = ' [IMP]' if identity == 'Impostor' else ''
            done_list = [str(tk) for tk in tasks if tk.check_completion()]
            remain_list = [str(tk) for tk in tasks if not tk.check_completion()]
            done_str = ', '.join(done_list) if done_list else '—'
            remain_str = ', '.join(remain_list) if remain_list else '— (all done!)'
            print(f'    {pname}{tag}:  ✅ {done_str}  |  ⏳ {remain_str}')
        print(f"  {'·' * 100}")

# ── Final game result ──
# Winner codes 2 and 3 count as Crewmate wins; every other code is an
# Impostor win.
if winner_code in (2, 3):
    winner_team = 'Crewmate'
else:
    winner_team = 'Impostor'

print(f"\n{'═' * 100}")
print(f'  GAME RESULT: {WINNER_MAP.get(winner_code, winner_code)}')
print(f'  Winner: {winner_team}  │  Timesteps: {game.timestep}')

# ── RL Summary for this game ──
# Total and mean reward per role; roles without rows are skipped.
g_rl = game_df
if len(g_rl) > 0:
    print("\n  RL Metrics Summary:")
    for role in ('Crewmate', 'Impostor'):
        rdf = g_rl[g_rl['role'] == role]
        if len(rdf) == 0:
            continue
        role_rewards = rdf['reward']
        print(f'    {role}: Σr={role_rewards.sum():+.2f}  '
              f'mean_r={role_rewards.mean():+.3f}  '
              f'actions={len(rdf)}')
print(f"{'═' * 100}")

---
## Visualizations


In [None]:
# ── 1. Critic Value Trajectories ─────────────────────────────
# One stacked subplot per game with two lines: V(Crew) and V(Imp) per timestep.

fig = make_subplots(
    rows=NUM_GAMES, cols=1,
    subplot_titles=[f'Game {n}' for n in range(1, NUM_GAMES + 1)],
    shared_xaxes=False,
    vertical_spacing=0.06,
)

colors = {'Crewmate': '#3498db', 'Impostor': '#e74c3c'}

# (DataFrame column, legend label, colour key) — drawn in this order
value_series = (
    ('v_crew', 'V(Crew)', 'Crewmate'),
    ('v_imp', 'V(Imp)', 'Impostor'),
)

for game_idx in range(1, NUM_GAMES + 1):
    # Critic values are per timestep, so one row per timestep is enough
    gdf = df[df['game'] == game_idx].drop_duplicates(subset=['timestep'])
    gdf = gdf.sort_values('timestep')

    for column, label, role_key in value_series:
        trace = go.Scatter(
            x=gdf['timestep'], y=gdf[column],
            mode='lines+markers',
            name=f'{label} G{game_idx}',
            line=dict(color=colors[role_key], width=2),
            marker=dict(size=4),
            # Only the first game's traces get a legend entry
            showlegend=(game_idx == 1),
        )
        fig.add_trace(trace, row=game_idx, col=1)

fig.update_layout(
    height=250 * NUM_GAMES,
    title_text='Critic: Win Probability V(s) — Zero-Sum Trajectories',
    template='plotly_dark',
)
for i in range(1, NUM_GAMES + 1):
    fig.update_yaxes(title_text='V(s)', range=[0, 1], row=i, col=1)
    fig.update_xaxes(title_text='Timestep', row=i, col=1)
fig.show()


In [None]:
# ── 2. Reward Distribution by Role ──────────────────────────
# Overlaid histograms of per-action rewards, one colour per role.

role_palette = {'Crewmate': '#3498db', 'Impostor': '#e74c3c'}

fig_reward = px.histogram(
    df,
    x='reward',
    color='role',
    nbins=40,
    barmode='overlay',
    opacity=0.7,
    color_discrete_map=role_palette,
    labels={'reward': 'Reward r(t)', 'role': 'Role'},
    title='Reward Distribution by Role (All Games)',
    template='plotly_dark',
)
fig_reward.show()

# ── Cumulative Reward per Game ──────────────────────────────
# Sum of all rewards per (game, role), drawn as grouped bars.
cum_rewards = df.groupby(['game', 'role'], as_index=False)['reward'].sum()
fig_cum = px.bar(
    cum_rewards,
    x='game',
    y='reward',
    color='role',
    barmode='group',
    color_discrete_map=role_palette,
    labels={'reward': 'Total Reward', 'game': 'Game'},
    title='Cumulative Reward per Game',
    template='plotly_dark',
)
fig_cum.show()


In [None]:
# ── 3. Actor Beliefs — Suspicion over Time ──────────────────
# One line chart per game: each Crewmate's suspicion of the Impostor.
# Games with no belief rows, or no Crewmate rows that target an Impostor,
# are skipped.

for game_idx in range(1, NUM_GAMES + 1):
    gdf = beliefs_df[beliefs_df['game'] == game_idx]
    if gdf.empty:
        continue

    # Impostors are identified as the observers whose recorded role is Impostor
    is_impostor_observer = gdf['observer_role'] == 'Impostor'
    impostors = gdf.loc[is_impostor_observer, 'observer'].unique()
    impostor_names = set(impostors)

    # Keep only Crewmate beliefs whose target is one of those impostors
    is_crew_observer = gdf['observer_role'] == 'Crewmate'
    targets_impostor = gdf['target'].isin(impostor_names)
    crew_beliefs = gdf[is_crew_observer & targets_impostor]

    if crew_beliefs.empty:
        continue

    fig_b = px.line(
        crew_beliefs,
        x='timestep',
        y='suspicion',
        color='observer',
        labels={'suspicion': 'Suspicion (0=Safe, 1=Accused)', 'timestep': 'Timestep'},
        title=f'Game {game_idx}: Crewmate Suspicion of Impostor Over Time',
        template='plotly_dark',
    )
    fig_b.update_yaxes(range=[0, 1])
    fig_b.show()


In [None]:
# ── 4. Average Reward by Action Type ─────────────────────────
# Mean reward and sample count per (normalized action, role) as grouped bars.

reward_stats = df.groupby(['norm_action', 'role']).agg(
    mean_reward=('reward', 'mean'),
    count=('reward', 'count'),
)
action_rewards = reward_stats.reset_index()

# Drop (action, role) pairs seen fewer than twice
enough_samples = action_rewards['count'] >= 2
action_rewards = action_rewards[enough_samples]

fig_act = px.bar(
    action_rewards,
    x='norm_action',
    y='mean_reward',
    color='role',
    text='count',
    barmode='group',
    color_discrete_map={'Crewmate': '#3498db', 'Impostor': '#e74c3c'},
    labels={'mean_reward': 'Mean Reward', 'norm_action': 'Action'},
    title='Mean Reward by Action Type',
    template='plotly_dark',
)
# Show the sample count above each bar
fig_act.update_traces(texttemplate='n=%{text}', textposition='outside')
fig_act.show()


---
## Summary Statistics


In [None]:
# ── Game Summary Table ─────────────────────────────────────────
# One row per game: winner label, length, KILL / COMPLETE_TASK action counts
# and the summed reward per role.

summary_rows = []
for game_idx, (winner_code, game) in enumerate(game_results, 1):
    gdf = df[df['game'] == game_idx]
    kills = int((gdf['norm_action'] == 'KILL').sum())
    tasks = int((gdf['norm_action'] == 'COMPLETE_TASK').sum())
    imp_reward = gdf.loc[gdf['role'] == 'Impostor', 'reward'].sum()
    crew_reward = gdf.loc[gdf['role'] == 'Crewmate', 'reward'].sum()

    summary_rows.append({
        'Game': game_idx,
        'Winner': WINNER_MAP.get(winner_code, str(winner_code)),
        'Timesteps': game.timestep,
        'Kills': kills,
        'Tasks Done': tasks,
        'Imp Σr': round(imp_reward, 1),
        'Crew Σr': round(crew_reward, 1),
    })

summary_df = pd.DataFrame(summary_rows)
rule = '=' * 65
print('\n' + rule)
print('                    GAME SUMMARY')
print(rule)
display(summary_df)

# ── Aggregate stats ─────────────────────────────────────────
# Winner codes 2/3 are Crewmate wins, 1/4 are Impostor wins.
winner_codes = [code for code, _game in game_results]
crew_wins = sum(1 for code in winner_codes if code in (2, 3))
imp_wins = sum(1 for code in winner_codes if code in (1, 4))

print('\nWin Rates:')
print(f'  Crewmate: {crew_wins}/{NUM_GAMES} ({100*crew_wins/NUM_GAMES:.0f}%)')
print(f'  Impostor: {imp_wins}/{NUM_GAMES} ({100*imp_wins/NUM_GAMES:.0f}%)')

print(f'\nAverage Game Length: {summary_df["Timesteps"].mean():.1f} timesteps')
print(f'Average Kills per Game: {summary_df["Kills"].mean():.1f}')
print(f'Average Tasks per Game: {summary_df["Tasks Done"].mean():.1f}')

print('\nMean Reward per Action:')
for role in ('Crewmate', 'Impostor'):
    rdf = df[df['role'] == role]
    print(f'  {role}: {rdf["reward"].mean():.2f}')
