In [None]:
# -*- coding: utf-8 -*-
"""
Evaluation script for 2048 LLM agents.

Plays multiple games using specified LLM configurations (local vLLM or cloud OpenAI API)
and logs performance statistics (max tile, score, win rate) to a CSV file.
"""

In [2]:
# --- Add these imports for visualization ---
from IPython.display import display, clear_output, HTML, Image
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import matplotlib.font_manager as font_manager
import io
import base64

In [None]:
import numpy as np
import random
import json
import os
import time
import csv
from datetime import datetime
from tqdm.auto import tqdm
import re
from typing import List, Dict, Tuple, Optional, Any
import traceback

import torch

# VLLM Imports (only if vllm is installed and used)
try:
    from vllm import LLM, SamplingParams
    VLLM_AVAILABLE = True
except ImportError:
    VLLM_AVAILABLE = False
    print("Warning: vLLM not found. Local model evaluation will be unavailable.")

# OpenAI Imports (only if openai library is installed and used)
try:
    from openai import OpenAI, APIError
    OPENAI_AVAILABLE = True
except ImportError:
    OPENAI_AVAILABLE = False
    print("Warning: OpenAI library not found. Cloud model evaluation will be unavailable.")

print(f"vLLM Available: {VLLM_AVAILABLE}")
print(f"OpenAI Available: {OPENAI_AVAILABLE}")

In [4]:
# --- Configuration ---

# 1. Model configurations: one dict per model to evaluate.
#    "type" selects the backend: "vllm" entries carry a "path" that is handed to
#    vLLM as the model location; the commented "openai" examples show the keys
#    used for OpenAI-compatible endpoints.
MODEL_CONFIGS = [
    # Local vLLM models
    {
        "name": "Qwen/Qwen2.5-7B-Instruct_2048_game_grpo_merged_lora_9_simple_density_reward_2048_game_difficulty_1_2_3_4_5_ds2000_model_Qwen2.5-7B-Instruct_lr_4e-05_lora_rank_16/",
        "type": "vllm",
        "path": "Qwen/Qwen2.5-7B-Instruct_2048_game_grpo_merged_lora_9_simple_density_reward_2048_game_difficulty_1_2_3_4_5_ds2000_model_Qwen2.5-7B-Instruct_lr_4e-05_lora_rank_16/",
    },
    {
        "name": "Qwen/Qwen2.5-7B-Instruct_2048_game_grpo_merged_lora_10_simple_density_reward_2048_game_difficulty_1_2_3_4_5_ds8000_model_Qwen2.5-7B-Instruct_lr_4e-05_lora_rank_16",
        "type": "vllm",
        "path": "Qwen/Qwen2.5-7B-Instruct_2048_game_grpo_merged_lora_10_simple_density_reward_2048_game_difficulty_1_2_3_4_5_ds8000_model_Qwen2.5-7B-Instruct_lr_4e-05_lora_rank_16",
    },
    # Example OpenAI-Compatible API Model (e.g., Together.ai, Anyscale, local Llama.cpp server)
    # {
    #     "name": "Mistral-7B-Instruct-v0.2-Cloud",
    #     "type": "openai",
    #     "api_model_name": "mistralai/Mistral-7B-Instruct-v0.2", # Model name used by the API
    #     "base_url": "https://api.together.xyz/v1",             # *** CHANGE THIS *** API endpoint URL
    #     "api_key": "YOUR_TOGETHER_API_KEY",                   # *** CHANGE THIS *** Or set as ENV variable OPENAI_API_KEY
    # },
    # Example Official OpenAI Model
    # {
    #     "name": "GPT-4o-Mini",
    #     "type": "openai",
    #     "api_model_name": "gpt-4o-mini",
    #     "base_url": "https://api.openai.com/v1",              # Official OpenAI endpoint
    #     "api_key": "YOUR_OPENAI_API_KEY",                     # *** CHANGE THIS *** Or set as ENV variable OPENAI_API_KEY
    # },
]





In [5]:
# 2. Generation configuration passed to the inference backends.
#    Further sampling options (top_p, top_k, min_p, ...) can be added as extra
#    entries; two examples are left commented out below.
GENERATION_CONFIG = dict(
    temperature=1.0,
    max_new_tokens=1024,  # cap on response tokens (thinking + answer)
    # top_p=0.9,
    # min_p=0.9,
)

In [None]:
# --- Style Configurations (Copied from your training code) ---
# Each style is a dict with:
#   bg_color   - axes background colour
#   cell_color - tile colours, indexed by power of two (2, 4, 8, ...)
#   text_color - colour used for the digits on low-value tiles
#   grid_color - cell border colour
#   font       - font family name looked up through matplotlib's font manager
VLM_STYLES = [
    {   # Style 1: Classic 2048
        'bg_color': '#faf8ef',
        'cell_color': ['#eee4da', '#ede0c8', '#f2b179', '#f59563', '#f67c5f', '#f65e3b', '#edcf72', '#edcc61', '#edc850', '#edc53f', '#edc22e'], # Extended colors for higher tiles
        'text_color': '#776e65',
        'grid_color': '#bbada0', # Adjusted grid color slightly
        'font': 'Clear Sans' # Make sure this font is available or use a default like 'DejaVu Sans'
    },
    {   # Style 2: Dark Mode
        'bg_color': '#1a1a1a', # Darker background
        'cell_color': ['#3c3a32', '#5e5a51', '#7c726a', '#9b8a7e', '#b9a291', '#d7ba9e', '#f5d3ab', '#f8e0b4', '#fceecb', '#fff6e2', '#fffaf0'], # Gradient from dark to light beige/gold
        'text_color': '#f9f6f2', # Off-white text
        'grid_color': '#504b43', # Dark grid lines
        'font': 'DejaVu Sans' # Common default font
    },
    # Add your other styles (Neon, Pastel) here if desired
]

# --- Font Check ---
# By default `findfont` quietly substitutes matplotlib's default font when the
# requested family is missing, so the fallback branch below would never run.
# Passing fallback_to_default=False makes a missing font raise instead.
default_font_name = 'Clear Sans' # Or your preferred default
try:
    font_manager.findfont(default_font_name, fallback_to_default=False)
    print(f"Default font '{default_font_name}' found.")
except Exception:  # font lookup failed; keep the notebook running with a known-good font
    print(f"Warning: Default font '{default_font_name}' not found. Falling back to 'DejaVu Sans'.")
    default_font_name = 'DejaVu Sans'
    VLM_STYLES[0]['font'] = default_font_name # Update style 1 if needed

# --- Visualization Config ---
VISUALIZE_GAME = True # Set to True to visualize the first game of the first model
VISUALIZATION_DELAY_SECONDS = 0.75 # Pause between visualized steps
VIZ_STYLE = VLM_STYLES[0] # Choose a style (e.g., the first one)
# -----------------------------

In [7]:
# Evaluation run limits.
# NOTE: MAX_GAME_MOVES is assigned again, with the same value, in the
# "3. Evaluation Run Configuration" cell further down.
MAX_GAME_MOVES = 2000 # Safety break for games that might run too long

# --- Add Max Restarts Config ---
# play_game loops while its restart counter is <= this value, i.e. a run gets at
# most MAX_RESTARTS_PER_RUN + 1 attempts.
MAX_RESTARTS_PER_RUN = 3 # Max times to restart a single game run if agent gets stuck
# ------------------------------

In [None]:
def format_matrix_vlm_display(board, style=None):
    """Render the board as a styled PNG and return it as a base64 data URI.

    Args:
        board: 2-D grid of tile values indexable as ``board[r][c]``; 0 marks an
            empty cell. Any rectangular size is accepted (the game uses 4x4).
        style: Style dict with 'bg_color', 'cell_color' and 'grid_color' keys and
            optional 'font' / 'text_color'. When omitted (or falsy) a random
            entry of VLM_STYLES is used. The dict is never modified.

    Returns:
        A ``"data:image/png;base64,..."`` string containing the rendered image.
    """
    style = style or random.choice(VLM_STYLES)
    font_name = style.get('font', 'DejaVu Sans') # Use default if font missing

    # findfont only raises for a missing family when its built-in fallback is
    # disabled; without that flag the warning below could never be printed.
    try:
        font_path = font_manager.findfont(font_name, fallback_to_default=False)
        font_prop = font_manager.FontProperties(fname=font_path)
    except Exception:
        print(f"Warning: Font '{font_name}' not found. Using 'DejaVu Sans'.")
        font_prop = font_manager.FontProperties(fname=font_manager.findfont('DejaVu Sans'))

    n_rows = len(board)
    n_cols = len(board[0]) if n_rows else 0

    # --- Map tile values (powers of two) to colours ---
    max_power = 12 # Up to 4096 (2^12), adjust if needed
    tile_values = [2**i for i in range(1, max_power + 1)]
    # Work on a copy: padding the palette in place would grow the list stored in
    # the shared style dict (e.g. an entry of VLM_STYLES).
    style_colors = list(style['cell_color'])
    if len(style_colors) < len(tile_values):
        # Repeat the last colour so every tile value has an entry.
        style_colors.extend([style_colors[-1]] * (len(tile_values) - len(style_colors)))
    color_map = dict(zip(tile_values, style_colors))
    default_cell_color = style_colors[0] # Color for empty or unexpected values

    # --- Text colours: dark on the two lowest tiles, light on everything else ---
    light_text_color = '#f9f6f2'
    dark_text_color = style.get('text_color', '#776e65')

    fig, ax = plt.subplots(figsize=(4, 4)) # Slightly smaller figure for display
    try:
        ax.set_facecolor(style['bg_color'])

        for r in range(n_rows):
            for c in range(n_cols):
                value = board[r][c]
                cell_color = color_map.get(value, default_cell_color)

                # Row 0 is drawn at the top, hence the flipped y coordinate.
                rect = plt.Rectangle((c, n_rows - 1 - r), 1, 1,
                                     facecolor=cell_color,
                                     edgecolor=style['grid_color'],
                                     linewidth=2)
                ax.add_patch(rect)

                if value > 0:
                    text_color = dark_text_color if value <= 4 else light_text_color
                    ax.text(c + 0.5, n_rows - 0.5 - r, str(value),
                            color=text_color,
                            fontproperties=font_prop,
                            ha='center', va='center',
                            fontsize=16 if value < 1000 else 14, # Smaller digits for 4-digit tiles
                            weight='bold')

        # Configure axes
        ax.set_xlim(0, n_cols)
        ax.set_ylim(0, n_rows)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_aspect('equal')
        fig.tight_layout(pad=0.1) # Reduce padding

        # Render into memory; use the figure's own methods rather than pyplot's
        # "current figure" so an unrelated open figure can never be saved by mistake.
        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight', dpi=90) # Lower DPI for faster display
    finally:
        plt.close(fig) # Always release the figure, even if drawing failed

    img_str = base64.b64encode(buf.getvalue()).decode('utf-8')
    return f"data:image/png;base64,{img_str}"

print("Visualization function 'format_matrix_vlm_display' defined.")

In [9]:
# System prompt text for the LLM agent: asks for reasoning and the final answer
# in separate tags.
# NOTE(review): the first sentence mentions <thinking> tags while the format
# example uses <think>; the string is left untouched because it is runtime text —
# confirm which tag the evaluated models expect.
# NOTE: the same SYSTEM_PROMPT is assigned again in a later cell (together with
# llm_prompt_template).
SYSTEM_PROMPT = """You are a deep thinker. You analyze problems and provide answers as asked of you. When answering you provide your reasoning in <thinking> tags and final answer in <answer> tags. <think>. Write your detailed analysis and step-by-step reasoning here </think> <answer>Write your final response here </answer>. You do not say anything outside of the tags. You follow the format exactly as asked of you.

Your ideal response should be in the following format:
<think>
[Detailed thinking]
</think>
<answer>
Final answer
</answer>"""

In [11]:
# User prompt template; `{board}` is the format placeholder for the board text.
# NOTE(review): a later cell defines `llm_prompt_template` with the same wording
# but a `{board_text}` placeholder — confirm which of the two the evaluation loop
# actually formats, and drop the other.
llm_prompt = """You are an expert 2048 game player. Given a board state, select the best next move.

The 2048 game board is a 4x4 grid where tiles with powers of 2 can be merged by swiping.
When identical tiles collide, they combine into one tile with double the value.

Current board state:
{board}

Analyze this board carefully. Consider:
1. Available empty cells
2. Potential tile merges
3. High-value tile positions
4. Avoiding game-ending situations

Based on your analysis, what is the BEST move? Choose only one direction:
- up
- down
- left
- right

Format your response as follows:
<think>
Analyze the current board configuration thoroughly. Consider what happens with each possible move (up, down, left, right). Think about immediate merges, resulting empty cells, and how the board position affects future possibilities. Consider how each move impacts your ability to create higher-value tiles and maximize score. Think several moves ahead if possible. Then make your decision to move up, down, left, or right.
</think>
<answer>
[SINGLE WORD RESPONSE: up/down/left/right]
</answer>
"""

In [12]:
# 3. Evaluation Run Configuration
NUM_RUNS = 20 # Number of games to play per model configuration
RESULTS_FILE = "2048_evaluation_results.csv" # Persistent CSV log file
# NOTE: MAX_GAME_MOVES is also assigned (same value) in an earlier config cell.
MAX_GAME_MOVES = 2000 # Safety break for games that might run too long

# 4. vLLM Specific Configuration (if using vLLM)
# NOTE: VLLM_GPU_MEMORY_UTILIZATION and VLLM_TENSOR_PARALLEL_SIZE are reassigned
# in a later cell; the values in effect are whichever cell ran last.
VLLM_GPU_MEMORY_UTILIZATION = 0.90 # Passed to vLLM as gpu_memory_utilization
VLLM_TENSOR_PARALLEL_SIZE = 1 # Adjust if using multiple GPUs
# A "cuda:N" value makes VLLMInference expose only GPU N (through
# CUDA_VISIBLE_DEVICES) while the engine is created and forces
# tensor_parallel_size to 1.
VLLM_DEVICE = "cuda:1"

In [None]:
# --- Part 1: Core Game Logic (Stable Version - Mostly Unchanged) ---

def initialize_board(size=4):
    """Return an empty (all-zero) ``size`` x ``size`` integer board."""
    shape = (size, size)
    return np.zeros(shape, dtype=int)

# *** MODIFIED add_new_tile ***
def add_new_tile(board, rng_random):
    """Spawn one tile on a random empty cell of ``board`` (modified in place).

    Two draws are taken from ``rng_random``, in this order: the index of the
    empty cell, then the tile value (4 with probability 0.1, otherwise 2).

    Returns:
        True if a tile was placed, False if the board had no empty cell.
    """
    free_positions = np.argwhere(board == 0)
    free_count = len(free_positions)
    if free_count == 0:
        return False

    # randint is inclusive on both ends, hence the -1.
    pick = rng_random.randint(0, free_count - 1)
    row_idx, col_idx = free_positions[pick]

    if rng_random.random() < 0.1:
        tile = 4
    else:
        tile = 2
    board[row_idx, col_idx] = tile
    return True

def _slide_row_left(row):
    # (Implementation remains the same as previous correct version)
    new_row = np.zeros_like(row); write_idx = 0; score_delta = 0
    merged_in_row = [False] * len(row)
    compacted_row = row[row != 0]; read_idx = 0
    while read_idx < len(compacted_row):
        val = compacted_row[read_idx]
        if read_idx + 1 < len(compacted_row) and val == compacted_row[read_idx + 1] and not merged_in_row[write_idx]:
             merged_val = val * 2; new_row[write_idx] = merged_val
             score_delta += merged_val; merged_in_row[write_idx] = True
             write_idx += 1; read_idx += 2
        else:
             new_row[write_idx] = val; write_idx += 1; read_idx += 1
    return new_row, score_delta


def move(board, direction):
    """Apply one move and return ``(new_board, changed, score_delta)``.

    The board is rotated so that the requested direction becomes "left", every
    row is slid with ``_slide_row_left``, and the result is rotated back.
    ``board`` itself is not written to.

    Args:
        board: Square 2-D numpy array of tile values.
        direction: 'up', 'right', 'down' or 'left'. Any other value is treated
            as 'left' (no error is raised).

    Returns:
        new_board: The board after the move.
        changed: True if ``new_board`` differs from ``board``.
        score_delta: Sum of the values of all tiles created by merges.
    """
    quarter_turns = {'up': 1, 'right': 2, 'down': 3, 'left': 0}.get(direction, 0)
    needs_rotation = quarter_turns > 0

    working = np.rot90(board, k=quarter_turns) if needs_rotation else board

    slid = np.zeros_like(working)
    gained = 0
    for idx in range(board.shape[0]):
        slid[idx, :], row_gain = _slide_row_left(working[idx, :])
        gained += row_gain

    # Undo the rotation so the result is in the original orientation.
    result = np.rot90(slid, k=-quarter_turns) if needs_rotation else slid
    return result, not np.array_equal(board, result), gained

def get_valid_moves(board):
    """Return the directions that would change the board.

    Directions are tried, and reported, in the order up, down, left, right.
    """
    return [
        candidate
        for candidate in ('up', 'down', 'left', 'right')
        if move(board, candidate)[1]
    ]

def is_game_over(board):
    """Return True when the board has no empty cell and no move changes it."""
    has_empty_cell = np.any(board == 0)
    if has_empty_cell:
        # An empty cell is treated as "not over" without trying any move.
        return False
    remaining_moves = get_valid_moves(board)
    return len(remaining_moves) == 0

print("Game logic functions defined.")

In [14]:
# System prompt text for the LLM agent (same text as the SYSTEM_PROMPT assigned
# in an earlier cell).
# NOTE(review): the first sentence mentions <thinking> tags while the format
# example uses <think>; left untouched because it is runtime text — confirm
# which tag the evaluated models expect.
SYSTEM_PROMPT = """You are a deep thinker. You analyze problems and provide answers as asked of you. When answering you provide your reasoning in <thinking> tags and final answer in <answer> tags. <think>. Write your detailed analysis and step-by-step reasoning here </think> <answer>Write your final response here </answer>. You do not say anything outside of the tags. You follow the format exactly as asked of you.

Your ideal response should be in the following format:
<think>
[Detailed thinking]
</think>
<answer>
Final answer
</answer>"""


# User prompt template; `{board_text}` is the format placeholder for the board
# text. An earlier cell defines `llm_prompt` with the same wording but a
# `{board}` placeholder.
llm_prompt_template = """You are an expert 2048 game player. Given a board state, select the best next move.

The 2048 game board is a 4x4 grid where tiles with powers of 2 can be merged by swiping.
When identical tiles collide, they combine into one tile with double the value.

Current board state:
{board_text}

Analyze this board carefully. Consider:
1. Available empty cells
2. Potential tile merges
3. High-value tile positions
4. Avoiding game-ending situations

Based on your analysis, what is the BEST move? Choose only one direction:
- up
- down
- left
- right

Format your response as follows:
<think>
Analyze the current board configuration thoroughly. Consider what happens with each possible move (up, down, left, right). Think about immediate merges, resulting empty cells, and how the board position affects future possibilities. Consider how each move impacts your ability to create higher-value tiles and maximize score. Think several moves ahead if possible. Then make your decision to move up, down, left, or right.
</think>
<answer>
[SINGLE WORD RESPONSE: up/down/left/right]
</answer>
"""

In [15]:
def format_matrix_llm(board, zero_char='.'):
    """Render the board as plain text for the LLM prompt.

    Each row becomes one line of right-aligned cells joined by " | ". The cell
    width is the number of digits of the largest tile (1 when the board holds no
    positive value); empty cells are shown as ``zero_char``.
    """
    width = len(str(np.max(board))) if np.any(board > 0) else 1

    def render(cell):
        # Empty cells get the placeholder, everything else its decimal form.
        text = zero_char if cell == 0 else str(cell)
        return text.rjust(width)

    return "\n".join(" | ".join(render(cell) for cell in row) for row in board)

In [None]:
def parse_action(completion: str) -> Optional[str]:
    """Pull the chosen move out of an LLM completion.

    The text inside the first ``<answer>...</answer>`` pair (case-insensitive,
    may span lines) is stripped and lower-cased. If no such pair exists, the
    whole completion is accepted only when it is exactly one action word.

    Returns:
        'up', 'down', 'left' or 'right', or None when the completion is empty,
        has no usable answer, or names something that is not a valid action.
    """
    valid_actions = {"up", "down", "left", "right"}

    if not completion:
        return None

    found = re.search(r"<answer>\s*(.*?)\s*</answer>", completion, re.IGNORECASE | re.DOTALL)
    if found is not None:
        candidate = found.group(1).strip().lower()
        return candidate if candidate in valid_actions else None

    # No tag pair: tolerate a reply that consists of nothing but the action.
    action_simple = completion.strip().lower()
    if action_simple in valid_actions:
        print(f"Warning: Could not parse <answer> tag, but found valid action '{action_simple}' as full response.")
        return action_simple
    return None

print("Prompting and parsing functions defined.")

In [17]:
def create_sampling_params(generation_config: Dict) -> "SamplingParams":
    """Build vLLM ``SamplingParams`` from a generation-config dictionary.

    Only keys present in ``generation_config`` are forwarded, so vLLM's own
    defaults apply to everything else. ``max_new_tokens`` is renamed to vLLM's
    ``max_tokens``; the other recognised keys keep their names: temperature,
    top_p, top_k, min_p, presence_penalty, frequency_penalty,
    repetition_penalty and stop. Unrecognised keys are ignored.

    The return annotation is a string on purpose: ``SamplingParams`` is only
    imported when vLLM is installed, and an unquoted annotation would raise
    NameError as soon as this function is defined in an OpenAI-only setup.

    Raises:
        RuntimeError: if vLLM (and therefore ``SamplingParams``) is unavailable.
    """
    try:
        params_cls = SamplingParams
    except NameError as exc:
        raise RuntimeError("vLLM is not installed, cannot create SamplingParams.") from exc

    # generation_config key -> SamplingParams keyword
    key_to_kwarg = {
        "temperature": "temperature",
        "max_new_tokens": "max_tokens",
        "top_p": "top_p",
        "top_k": "top_k",
        "min_p": "min_p",
        "presence_penalty": "presence_penalty",
        "frequency_penalty": "frequency_penalty",
        "repetition_penalty": "repetition_penalty",
        "stop": "stop",
    }
    sampling_kwargs = {
        kwarg: generation_config[key]
        for key, kwarg in key_to_kwarg.items()
        if key in generation_config
    }
    return params_cls(**sampling_kwargs)

In [18]:
# --- Part 3: LLM Inference ---

class LLMInference:
    """Common interface shared by the inference backends.

    Subclasses implement ``generate`` and may override ``cleanup``.
    """

    def __init__(self, model_config: Dict):
        # Keep the whole config and expose the display name separately.
        self.model_config = model_config
        self.model_name = model_config["name"]

    def generate(self, prompt_messages: List[Dict], generation_config: Dict) -> Tuple[Optional[str], Optional[str]]:
        """Produce a completion for ``prompt_messages``.

        Returns:
            ``(completion, error_message)``.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError

    def cleanup(self):
        """Release backend resources (e.g., GPU memory); a no-op in the base class."""
        return None

# Import torch needed for cleanup cache clearing
# Ensure 'torch', 'os', 'traceback' are imported earlier in the notebook
# import torch
# import os
# import traceback

class VLLMInference(LLMInference):
    """Inference using local vLLM."""

    @staticmethod
    def _resolve_device_settings():
        """Translate the VLLM_DEVICE setting into ``(device_ids, tensor_parallel_size)``.

        A value of the form "cuda:N" yields ``([N], 1)``: a single explicit
        device implies no tensor parallelism, so VLLM_TENSOR_PARALLEL_SIZE is
        overridden (with a warning when it is not already 1). Any other value,
        or an ID that is not an integer (e.g. "cuda:1,2"), yields
        ``(None, VLLM_TENSOR_PARALLEL_SIZE)`` so the existing CUDA visibility
        is left alone.
        """
        try:
            if isinstance(VLLM_DEVICE, str) and VLLM_DEVICE.startswith("cuda:"):
                device_id_str = VLLM_DEVICE.split(":")[-1]
                # This specific ID will be made visible via CUDA_VISIBLE_DEVICES
                target_device_id_list = [int(device_id_str)]
                if VLLM_TENSOR_PARALLEL_SIZE != 1:
                    print(f"Warning: VLLM_DEVICE ('{VLLM_DEVICE}') implies a single device, overriding VLLM_TENSOR_PARALLEL_SIZE to 1.")
                return target_device_id_list, 1

            # Unexpected or unset format: fall back to default visibility.
            print(f"Warning: VLLM_DEVICE ('{VLLM_DEVICE}') format not recognized or not set. Relying on default CUDA visibility and VLLM_TENSOR_PARALLEL_SIZE={VLLM_TENSOR_PARALLEL_SIZE}.")
            return None, VLLM_TENSOR_PARALLEL_SIZE
        except ValueError:
            # Raised by int() when the part after "cuda:" is not a single integer.
            print(f"Error parsing VLLM_DEVICE ('{VLLM_DEVICE}'). Using default visibility.")
            return None, VLLM_TENSOR_PARALLEL_SIZE

    def __init__(self, model_config: Dict):
        """Create the vLLM engine and tokenizer for ``model_config["path"]``.

        While the engine is being created, CUDA_VISIBLE_DEVICES is temporarily
        set to the device parsed from VLLM_DEVICE (if any); the previous value is
        restored afterwards whether or not initialization succeeded.

        Raises:
            RuntimeError: if vLLM is not installed.
            Exception: whatever vLLM raises during initialization (logged, then re-raised).
        """
        super().__init__(model_config)
        if not VLLM_AVAILABLE:
            raise RuntimeError("vLLM is not installed, cannot use VLLMInference.")

        target_device_id_list, tensor_parallel_size = self._resolve_device_settings()

        print(f"Initializing vLLM for model: {model_config['path']}...")
        print(f"  Attempting target devices: {target_device_id_list}, tensor_parallel_size: {tensor_parallel_size}")

        original_cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES")
        if target_device_id_list is not None:
            # Attempt to set CUDA_VISIBLE_DEVICES for this specific initialization
            visible_devices = ",".join(map(str, target_device_id_list))
            print(f"  Temporarily setting CUDA_VISIBLE_DEVICES='{visible_devices}'")
            os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
        else:
            # If no specific device ID was parsed, don't modify the environment var
            print("  Using existing CUDA_VISIBLE_DEVICES environment setting (if any).")

        try:
            self.llm = LLM(
                model=model_config["path"],
                tensor_parallel_size=tensor_parallel_size, # Use determined size
                gpu_memory_utilization=VLLM_GPU_MEMORY_UTILIZATION,
                trust_remote_code=True, # Allow custom model code shipped with the checkpoint
            )
            self.tokenizer = self.llm.get_tokenizer()
            print("vLLM initialized successfully.")
        except Exception as e:
            print(f"Error initializing vLLM: {e}")
            traceback.print_exc()
            raise # Re-raise the exception after printing
        finally:
            # --- Restore original CUDA_VISIBLE_DEVICES ---
            if original_cuda_visible is None:
                # It did not exist before, so make sure it does not exist now.
                os.environ.pop('CUDA_VISIBLE_DEVICES', None)
            else:
                os.environ['CUDA_VISIBLE_DEVICES'] = original_cuda_visible

    def generate(self, prompt_messages: List[Dict], generation_config: Dict) -> Tuple[Optional[str], Optional[str]]:
        """Generate a response for ``prompt_messages`` using vLLM.

        The messages are rendered into a single prompt string with the
        tokenizer's chat template (with the generation prompt appended) before
        being passed to the engine.

        Returns:
            ``(completion, None)`` on success, ``(None, error_message)`` when
            vLLM returns no output or raises; exceptions are not propagated.
        """
        try:
            sampling_params = create_sampling_params(generation_config)
            prompt_str = self.tokenizer.apply_chat_template(
                prompt_messages,
                tokenize=False,
                add_generation_prompt=True # Ensures the model knows to generate the next turn
            )

            outputs = self.llm.generate(prompt_str, sampling_params, use_tqdm=False)
            # vLLM returns a list of RequestOutput objects
            if outputs and outputs[0].outputs:
                completion = outputs[0].outputs[0].text
                return completion, None
            else:
                return None, "vLLM generation returned no output."
        except Exception as e:
            print(f"Error during vLLM generation: {e}")
            # traceback.print_exc() # Uncomment for detailed stack trace if needed
            return None, str(e)

    def cleanup(self):
        """Release vLLM resources so another model can be loaded afterwards.

        Safe to call more than once, and on an instance that never finished
        initializing: attributes that are absent are simply skipped.
        """
        if not hasattr(self, 'llm'):
            return

        import gc  # local import: only needed here

        print(f"Releasing vLLM resources for {self.model_name}...")
        del self.llm
        if hasattr(self, 'tokenizer'):
            del self.tokenizer
        # Collect unreachable objects first so that memory held only by
        # reference cycles is actually freed before the CUDA cache is emptied.
        gc.collect()
        # Attempt to clear CUDA cache if torch is available
        if 'torch' in globals() and torch.cuda.is_available():
            torch.cuda.empty_cache()
        print("vLLM resources released.")


In [19]:
# Overrides of the vLLM settings assigned in the "Evaluation Run Configuration" cell.
# NOTE(review): VLLMInference forces tensor_parallel_size to 1 (and prints a
# warning) whenever VLLM_DEVICE has the single-device form "cuda:N", so the value
# 2 below only takes effect if VLLM_DEVICE is changed as well.
VLLM_GPU_MEMORY_UTILIZATION = 0.90
# VLLM_TENSOR_PARALLEL_SIZE = 1 # Adjust if using multiple GPUs <-- CHANGE THIS
VLLM_TENSOR_PARALLEL_SIZE = 2 # Use 2 GPUs

In [20]:
# # --- Test Initialization Cell ---

# # Set VLLM config for GPUs 1 and 2
# VLLM_TENSOR_PARALLEL_SIZE = 2
# VLLM_DEVICE = "cuda:1,2" # Relies on external CUDA_VISIBLE_DEVICES=1,2

# print("--- Testing VLLMInference Initialization ---")
# test_llm_player = None
# try:
#     test_model_config = MODEL_CONFIGS[0]
#     if test_model_config["type"] == "vllm" and VLLM_AVAILABLE:
#         print(f"Attempting to initialize {test_model_config['name']}...")
#         # Assumes CUDA_VISIBLE_DEVICES=1,2 is set externally before launch
#         test_llm_player = VLLMInference(test_model_config)  # __init__ takes only the model config
#         print("--- VLLMInference Initialization Test Successful ---")
#     else:
#         print(f"Skipping test: Model type is '{test_model_config.get('type', 'N/A')}' or vLLM available is {VLLM_AVAILABLE}")

# except Exception as e:
#     print("\n--- VLLMInference Initialization Test FAILED ---")
#     print(f"Error: {type(e).__name__} - {e}")
#     traceback.print_exc()
# finally:
#     if test_llm_player is not None:
#         print("Cleaning up test player...")
#         test_llm_player.cleanup()

# print("\n--- Initialization Test Finished ---")

In [22]:
# --- Part 4: Evaluation Loop ---

# (Keep format_matrix_vlm_display function as defined previously)

def _show_board_snapshot(
    board,
    viz_style: Optional[Dict],
    heading: str,
    caption: str,
    alt_text: str,
    footer_html: str = "",
    pause_seconds: Optional[float] = None,
) -> None:
    """Replace the current cell output with a single-board status panel.

    Emits ``<h2>heading</h2>``, ``<p>caption</p>``, the image returned by
    ``format_matrix_vlm_display(board, style=viz_style)`` and finally
    ``footer_html``. When ``pause_seconds`` is given, sleeps that long
    afterwards so the panel stays on screen.
    """
    clear_output(wait=True)
    board_img_data = format_matrix_vlm_display(board, style=viz_style)
    html_content = f"<h2>{heading}</h2>"
    html_content += f"<p>{caption}</p>"
    html_content += f"<img src='{board_img_data}' alt='{alt_text}' style='width: 250px; height: auto;'/>"
    html_content += footer_html
    display(HTML(html_content))
    if pause_seconds is not None:
        time.sleep(pause_seconds)


def play_game(
    llm_inference: LLMInference,
    generation_config: Dict,
    game_seed: Optional[int] = None,
    visualize: bool = False,
    viz_delay: float = 1.0,
    viz_style: Optional[Dict] = None,
    move_no_matter_what: bool = False,
    verbose: bool = True,
    max_stuck_moves: int = 5,
) -> Dict[str, Any]:
    """
    Plays one run of 2048 with the given LLM, restarting when the agent gets stuck.

    An attempt ends when the board is game over, no valid move is left, the LLM
    reports an error, MAX_GAME_MOVES successful moves were made, or the agent
    produced ``max_stuck_moves`` consecutive no-op / unparseable actions
    ("stuck"). Only a stuck attempt triggers a restart, and at most
    MAX_RESTARTS_PER_RUN restarts are performed.

    Args:
        llm_inference: Backend whose ``generate(messages, generation_config)``
            returns a ``(response, error_msg)`` pair; ``model_name`` and
            ``model_config["type"]`` are copied into the result.
        generation_config: Sampling settings forwarded to ``generate`` and
            echoed into the result (temperature, max_new_tokens, top_p, min_p).
        game_seed: Base seed; attempt ``k`` (0-based) uses ``game_seed + k``.
            A random seed is drawn when omitted.
        visualize: Render each step as HTML in the notebook output.
        viz_delay: Base pause in seconds between visualized steps.
        viz_style: Style dict handed to ``format_matrix_vlm_display``.
        move_no_matter_what: Also spawn a tile after a no-op or unparseable
            action (the stuck counter still advances).
        verbose: Print every prompt and raw LLM response (the historical
            behaviour). Pass False to keep long evaluation runs readable.
        max_stuck_moves: Consecutive failed actions after which the attempt is
            declared stuck; must be >= 1.

    Returns:
        Dict with stats of the LAST attempt (``final_score``,
        ``max_tile_achieved``, ``num_moves``, ``win``, ``duration_seconds``,
        ``error``), the run-wide bests (``overall_max_tile_across_restarts``,
        ``overall_max_score_across_restarts``), ``restarts_needed``, the
        generation settings, model identification and a timestamp.
        Note: ``restarts_needed`` counts stuck attempts, so it equals
        MAX_RESTARTS_PER_RUN + 1 when the final attempt also ended stuck.
    """
    if game_seed is None:
        game_seed = random.randint(0, 2**32 - 1)

    total_restarts_done = 0
    overall_max_tile_across_restarts = 0
    overall_max_score_across_restarts = 0

    # --- Loop for restarts ---
    while total_restarts_done <= MAX_RESTARTS_PER_RUN:
        attempt_seed = game_seed + total_restarts_done  # Vary seed for each attempt
        game_rng = random.Random(attempt_seed)
        attempt_no = total_restarts_done + 1  # 1-based, constant for the whole attempt

        if visualize and total_restarts_done > 0:
            clear_output(wait=True)
            display(HTML(f"<h2>Attempt {total_restarts_done + 1} / {MAX_RESTARTS_PER_RUN + 1} (Seed: {attempt_seed})</h2>"))
            time.sleep(viz_delay * 1.5)

        # --- Initialize state for this attempt ---
        board = initialize_board()
        add_new_tile(board, game_rng)
        add_new_tile(board, game_rng)
        score = 0
        moves_made = 0
        max_tile_this_attempt = np.max(board)
        last_error_this_attempt = None  # Error specific to this attempt
        stuck_counter = 0
        start_time_attempt = time.time()  # Reported duration covers the last attempt only

        # --- Inner game loop for this attempt ---
        while moves_made < MAX_GAME_MOVES:
            pre_move_board = board.copy()
            current_max = np.max(board)
            if current_max > max_tile_this_attempt:
                max_tile_this_attempt = current_max

            # --- Check Game Over ---
            if is_game_over(board):
                last_error_this_attempt = None  # Natural end
                if visualize:
                    _show_board_snapshot(
                        board,
                        viz_style,
                        heading=f"Game Over! (Attempt {attempt_no})",
                        caption=f"Move: {moves_made}, Score: {score}, Max Tile: {current_max}",
                        alt_text="Final Board State",
                        pause_seconds=viz_delay * 2,
                    )
                break

            valid_moves = get_valid_moves(board)
            if not valid_moves:  # Should already be caught by is_game_over
                last_error_this_attempt = "No valid moves left"
                if visualize:
                    _show_board_snapshot(
                        board,
                        viz_style,
                        heading=f"No Valid Moves! (Attempt {attempt_no})",
                        caption=f"Move: {moves_made}, Score: {score}, Max Tile: {current_max}",
                        alt_text="Final Board State",
                        pause_seconds=viz_delay * 2,
                    )
                break

            # --- Prepare Prompt ---
            board_text_representation = format_matrix_llm(board)
            user_prompt = llm_prompt_template.format(board_text=board_text_representation)
            messages = [{"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": user_prompt}]

            # --- Display state before LLM call ---
            if visualize:
                _show_board_snapshot(
                    pre_move_board,
                    viz_style,
                    heading=f"Attempt {attempt_no} / Move: {moves_made + 1} / Score: {score}",
                    caption="Board state before move:",
                    alt_text="Board State",
                    footer_html=f"<p><i>Waiting for LLM ({llm_inference.model_name})...</i></p>",
                )

            # --- Get LLM Move ---
            llm_response, error_msg = llm_inference.generate(messages, generation_config)

            if verbose:
                print(f"llm prompt: {user_prompt}")
                print(f"llm response: {llm_response}")
            if error_msg:  # Critical LLM error: ends the run, no restart
                print(f"  LLM Generation Error: {error_msg}. Ending game run.")
                last_error_this_attempt = f"LLM Error: {error_msg}"
                if visualize:
                    _show_board_snapshot(
                        pre_move_board,
                        viz_style,
                        heading=f"Attempt {attempt_no} / Move: {moves_made + 1} / Score: {score}",
                        caption="Board state (before error):",
                        alt_text="Board State Before Error",
                        footer_html=f"<p style='color:red;'><b>LLM Error:</b> {error_msg}</p>",
                        pause_seconds=viz_delay * 2,
                    )
                break

            action = parse_action(llm_response)

            # --- Execute Move ---
            final_board = None
            viz_message = ""
            action_taken_viz = "N/A"

            if action:
                action_taken_viz = action.upper()
                new_board, changed, score_delta = move(board, action)
                if changed:
                    board = new_board
                    add_new_tile(board, game_rng)
                    score += score_delta
                    moves_made += 1
                    stuck_counter = 0
                    final_board = board.copy()
                    viz_message = f"Action: <b>{action_taken_viz}</b> -> New Tile Added. Score +{score_delta}"
                else:
                    stuck_counter += 1
                    final_board = pre_move_board
                    viz_message = f"Action: <b>{action_taken_viz}</b> -> <span style='color:orange;'>No change!</span> (Stuck: {stuck_counter})"
                    if move_no_matter_what:
                        print("DEBUG: Forcing tile add on no-change move.")
                        add_new_tile(board, game_rng)
                        final_board = board.copy()
                        viz_message += " <span style='color:red;'>(Forced Tile Add)</span>"
            else:
                stuck_counter += 1
                action_taken_viz = "Invalid/Unparsed"
                final_board = pre_move_board
                viz_message = f"Action: <span style='color:red;'>Invalid/Unparsed!</span> (Stuck: {stuck_counter})"
                if move_no_matter_what:
                    print("DEBUG: Forcing tile add on invalid/unparsed action.")
                    add_new_tile(board, game_rng)
                    final_board = board.copy()
                    viz_message += " <span style='color:red;'>(Forced Tile Add)</span>"

            # --- Visualization: before/after panel for this step ---
            if visualize:
                clear_output(wait=True)
                pre_move_img_data = format_matrix_vlm_display(pre_move_board, style=viz_style)
                post_move_img_data = format_matrix_vlm_display(final_board, style=viz_style)
                html_content = f"<h2>Attempt {attempt_no} / Move: {moves_made} / Score: {score}</h2>"
                html_content += f"<p>{viz_message}</p>"
                html_content += "<div style='display: flex; align-items: flex-start; gap: 20px;'>"
                html_content += f"<div>Before:<br/><img src='{pre_move_img_data}' alt='Board Before' style='width: 200px; height: auto;'/></div>"
                html_content += f"<div>After:<br/><img src='{post_move_img_data}' alt='Board After' style='width: 200px; height: auto;'/></div>"
                html_content += "</div>"
                display(HTML(html_content))
                time.sleep(viz_delay)

            # --- Check if stuck ---
            if stuck_counter >= max_stuck_moves:
                # The literal word "Stuck" is what the restart decision below keys on.
                last_error_this_attempt = f"Stuck after {stuck_counter} invalid moves"
                if visualize:
                    _show_board_snapshot(
                        final_board,
                        viz_style,
                        heading=f"Attempt {attempt_no} STUCK!",
                        caption="Board state (when stuck):",
                        alt_text="Board State When Stuck",
                        footer_html=f"<p style='color:red;'><b>LLM Stuck after {stuck_counter} invalid/no-op moves.</b></p>",
                        pause_seconds=viz_delay * 1.5,
                    )
                break
        # --- End of inner game loop for this attempt ---

        # The per-attempt max is only refreshed at the top of the move loop, so
        # the tile created by the very last move (e.g. when MAX_GAME_MOVES is
        # reached) would otherwise be missing from the run-wide best.
        max_tile_this_attempt = max(max_tile_this_attempt, np.max(board))

        # Update overall bests after the attempt finishes (stuck or game over)
        overall_max_tile_across_restarts = max(overall_max_tile_across_restarts, max_tile_this_attempt)
        overall_max_score_across_restarts = max(overall_max_score_across_restarts, score)

        # --- Decide whether to restart or end the run ---
        if last_error_this_attempt and "Stuck" in last_error_this_attempt:
            total_restarts_done += 1
            if total_restarts_done <= MAX_RESTARTS_PER_RUN:
                print(f"  Game Run (Seed {game_seed}): Attempt {total_restarts_done} failed (Stuck). Restarting attempt {total_restarts_done + 1}/{MAX_RESTARTS_PER_RUN + 1}...")
                continue
            print(f"  Game Run (Seed {game_seed}): Stuck on final attempt ({total_restarts_done}). Ending run.")
            break
        # Game finished without being stuck (game over, no valid moves, or LLM error)
        break
    # --- End of outer restart loop ---

    final_max_tile_last_attempt = np.max(board)  # Max tile on the very last board
    # Plain bool rather than numpy.bool_, so the result dict holds built-in types.
    win_last_attempt = bool(final_max_tile_last_attempt >= 2048)

    # --- Final Visualization Display (for the last attempt) ---
    ended_stuck = bool(last_error_this_attempt and "Stuck" in last_error_this_attempt)
    if visualize and not ended_stuck:  # The 'Stuck' panel was already shown
        if last_error_this_attempt is None or "No valid moves" in last_error_this_attempt:
            result_text = "WON!" if win_last_attempt else "Game Over"
            _show_board_snapshot(
                board,
                viz_style,
                heading=f"{result_text} (Attempt {total_restarts_done+1})",
                caption=f"Final Score: {score}, Max Tile: {final_max_tile_last_attempt}, Moves: {moves_made}",
                alt_text="Final Board State",
                pause_seconds=viz_delay * 2,
            )

    # Return stats from the LAST attempt, plus overall bests and restart count
    return {
        "model_name": llm_inference.model_name,
        "inference_type": llm_inference.model_config["type"],
        "game_seed": game_seed,  # Original seed for the run
        "final_score": score,  # Score from the last attempt
        "max_tile_achieved": int(final_max_tile_last_attempt),
        "num_moves": moves_made,  # Moves in the last attempt
        "win": win_last_attempt,
        "duration_seconds": round(time.time() - start_time_attempt, 2),  # Last attempt only
        "temperature": generation_config.get("temperature"),
        "max_new_tokens": generation_config.get("max_new_tokens"),
        "top_p": generation_config.get("top_p"),
        "min_p": generation_config.get("min_p"),
        "overall_max_tile_across_restarts": int(overall_max_tile_across_restarts),
        "overall_max_score_across_restarts": overall_max_score_across_restarts,
        "restarts_needed": total_restarts_done,
        # None after a natural end, otherwise the last attempt's error text
        "error": last_error_this_attempt or None,
        "timestamp": datetime.now().isoformat(),
    }

In [23]:
# --- Part 5: Data Storage ---

def log_results(results: Dict[str, Any], filename: str) -> None:
    """Appends one evaluation result as a row of a CSV file.

    Only the columns listed in ``fieldnames`` are written, in that order; keys
    missing from ``results`` become empty cells and extra keys are ignored.
    A header row is written first when the file is new or empty, and the
    parent directory is created when it does not exist yet.

    Args:
        results: Result mapping for a single run. It is read, never modified.
        filename: Path of the CSV file to append to.

    Errors while writing are reported on stdout instead of being raised, so a
    logging failure does not abort an evaluation run.
    """
    # Column order of the CSV file.
    fieldnames = [
        "timestamp", "model_name", "inference_type", "run_id", "game_seed",
        "max_tile_achieved", # From last attempt
        "final_score",       # From last attempt
        "num_moves",         # From last attempt
        "win",               # From last attempt
        "overall_max_tile_across_restarts",
        "overall_max_score_across_restarts",
        "restarts_needed",
        "duration_seconds",  # From last attempt
        "temperature", "max_new_tokens", "top_p", "min_p",
        "error" # Error from last attempt (can indicate ended stuck)
    ]

    try:
        # Project onto the known columns in a fresh dict: callers keep using
        # `results` afterwards, so it must not gain placeholder keys here.
        row = {field: results.get(field) for field in fieldnames}

        parent_dir = os.path.dirname(filename)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        needs_header = not os.path.isfile(filename) or os.path.getsize(filename) == 0

        with open(filename, 'a', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
            if needs_header:
                writer.writeheader()
            writer.writerow(row)
    except IOError as e:
        print(f"Error writing results to {filename}: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during result logging: {e}")
        traceback.print_exc()

# print("Data storage function updated.") # Optional

In [25]:
# Pick the first entry of MODEL_CONFIGS as the model for the interactive
# cells that follow (it is handed to VLLMInference further down).
target_config = MODEL_CONFIGS[0]

In [None]:
# Echo the selected config as the cell output to confirm which model will be loaded.
target_config

In [None]:
# Build the vLLM-backed player for the selected model; later cells reuse `llm_player`.
# Nothing in this call selects GPUs: per the notes elsewhere in this notebook,
# device targeting relies on CUDA_VISIBLE_DEVICES being set before launch.
# NOTE(review): the other VLLMInference call sites in this notebook also pass
# GENERATION_CONFIG as a second argument - confirm that omitting it here is intended.
llm_player = VLLMInference(target_config)

In [28]:
# Two-turn chat used as a quick smoke test of the loaded model:
# the shared system prompt followed by a single free-form user request.
messages = [
    dict(role="system", content=SYSTEM_PROMPT),
    dict(
        role="user",
        content='Introduce yourself as a 2048 expert and give a brief overview of the game.',
    ),
]

In [29]:
# Sampling settings for the smoke-test generation call.
# NOTE(review): min_p=0.9 looks unusually restrictive if it is forwarded to a
# min-p sampler (tokens below 90% of the top token's probability would be
# dropped) - confirm this value is intended.
generation_config = dict(
    temperature=0.7,
    max_new_tokens=1024,
    top_p=0.9,
    min_p=0.9,
)

In [None]:
# Smoke test: run one generation with the prompt and sampling settings defined
# above; the returned value is shown as the cell output. (play_game unpacks the
# result of generate() as a (response, error_msg) pair.)
llm_player.generate(messages, generation_config)

In [31]:
from time import sleep

In [None]:
print("\n=== 2048 LLM Evaluation ===\n")

# Single-model evaluation: plays the games with the `llm_player` created in an
# earlier cell, which must have been built from the same first model config.
model_config = MODEL_CONFIGS[0]
model_name = model_config["name"]

try:
    # Use the `time` module imported at the top of the notebook so this cell
    # does not depend on an ad-hoc `from time import sleep` cell having run.
    time.sleep(1)

    # Run ONE visualized game first (not logged, not part of the statistics)
    if VISUALIZE_GAME:
        print(f"\n--- Running visualized game for {model_name} ---")
        game_result_viz = play_game(
            llm_player,
            GENERATION_CONFIG,
            visualize=True,
            viz_delay=VISUALIZATION_DELAY_SECONDS,
            viz_style=VIZ_STYLE,
        )

        print("\n--- Visualization Game Summary ---")
        print(f"Score: {game_result_viz.get('final_score', 'N/A')}")
        print(f"Max Tile: {game_result_viz.get('max_tile_achieved', 'N/A')}")
        print(f"Moves: {game_result_viz.get('num_moves', 'N/A')}")
        if game_result_viz.get('error'):
            print(f"Error: {game_result_viz['error']}")

    # Run standard evaluation games; every run is appended to RESULTS_FILE
    run_results = []
    print(f"\nStarting {NUM_RUNS} evaluation runs...")
    for run_id in tqdm(range(1, NUM_RUNS + 1), desc="Playing games", unit="game"):
        game_result = play_game(llm_player, GENERATION_CONFIG, visualize=False)
        game_result["run_id"] = run_id
        log_results(game_result, RESULTS_FILE)
        run_results.append(game_result)

    # Calculate stats over the runs that ended without any recorded error
    if run_results:
        successful_runs = [r for r in run_results if not r.get('error')]
        if successful_runs:
            avg_score = np.mean([r['final_score'] for r in successful_runs])
            avg_max_tile = np.mean([r['max_tile_achieved'] for r in successful_runs])
            win_rate = np.mean([r['win'] for r in successful_runs]) * 100

            print(f"\nResults for {model_name}:")
            print(f"Avg Score: {avg_score:.2f}")
            print(f"Avg Max Tile: {avg_max_tile:.2f}")
            print(f"Win Rate: {win_rate:.1f}%")
        else:
            # Say so explicitly instead of ending without any summary.
            print(f"\nNo error-free runs for {model_name}; summary statistics skipped.")

except Exception as e:
    print(f"Error: {e}")
    traceback.print_exc()

# finally:
#     if 'llm_player' in locals():
#         llm_player.cleanup()

print(f"\nResults logged to: {RESULTS_FILE}")

In [None]:
# --- Part 6: Main Execution Block ---
# For every entry of MODEL_CONFIGS: build the matching inference backend,
# optionally show one visualized (unlogged) game for the first model, play
# NUM_RUNS logged games, print summary statistics, then release the backend.

print("\n" + "="*30)
print("=== 2048 LLM Evaluation ===")
print("="*30 + "\n")

# Track overall statistics per model
model_summary_stats = {}

# --- Visualization Flag ---
# Ensure VISUALIZE_GAME, VISUALIZATION_DELAY_SECONDS, VIZ_STYLE are defined in a previous cell
run_visualization = VISUALIZE_GAME
# --------------------------

for config_idx, model_config in enumerate(MODEL_CONFIGS):
    model_name = model_config["name"]
    inference_type = model_config["type"]
    print(f"\n--- Evaluating Model {config_idx+1}/{len(MODEL_CONFIGS)}: {model_name} ({inference_type}) ---")

    llm_player = None
    try:
        # --- Initialize LLM ---
        # Ensure VLLM settings like VLLM_TENSOR_PARALLEL_SIZE and VLLM_DEVICE
        # are defined in a previous cell according to your target setup.
        # **Crucially, set CUDA_VISIBLE_DEVICES environment variable *before* launching
        # Jupyter/Python for specific device targeting.**
        # Example for GPUs 1 & 2: export CUDA_VISIBLE_DEVICES=1,2
        # Note: `continue` inside this try still runs the `finally` clause below.
        if inference_type == "vllm":
            if not VLLM_AVAILABLE:
                print("  Skipping vLLM model - vLLM not installed.")
                continue
            llm_player = VLLMInference(model_config, GENERATION_CONFIG)
        elif inference_type == "openai":
            if not OPENAI_AVAILABLE:
                print("  Skipping OpenAI model - OpenAI library not installed.")
                continue
            llm_player = OpenAIInference(model_config, GENERATION_CONFIG)
        else:
            print(f"  Skipping model - Unknown inference type: {inference_type}")
            continue
        # --- LLM Initialization Complete ---

        # --- Optional: Run ONE visualized game first ---
        if run_visualization and config_idx == 0:  # Only visualize the very first model's first game
            print(f"\n--- Running ONE visualized game for {model_name} (Style: {VIZ_STYLE.get('font', 'Default')}) ---")
            try:
                game_result_viz = play_game(
                    llm_player,
                    GENERATION_CONFIG,
                    # Must be True here: this branch exists only to show a game.
                    visualize=True,
                    viz_delay=VISUALIZATION_DELAY_SECONDS,
                    viz_style=VIZ_STYLE  # Pass the chosen style dictionary
                )

                print("\n--- Visualization Game Summary ---")
                print(f"  Seed: {game_result_viz.get('game_seed', 'N/A')}")
                print(f"  Won: {game_result_viz.get('win', 'N/A')}")
                print(f"  Score: {game_result_viz.get('final_score', 'N/A')}")
                print(f"  Max Tile: {game_result_viz.get('max_tile_achieved', 'N/A')}")
                print(f"  Moves: {game_result_viz.get('num_moves', 'N/A')}")
                if game_result_viz.get('error'):
                    print(f"  Error: {game_result_viz['error']}")
                print("-" * 30)
                # Note: This visualized run is NOT logged by default to avoid skewing stats

            except Exception as viz_err:
                # A failed demo game must not prevent the logged evaluation runs.
                print(f"\n  Error during visualized game run for model {model_name}: {viz_err}")
                traceback.print_exc()
            # input("Press Enter to continue with standard evaluation runs...") # Optional pause
        # ----------------------------------------------


        # --- Run Standard Evaluation Games ---
        run_results = []
        print(f"\n  Starting {NUM_RUNS} standard evaluation runs (no visualization)...")
        for run_id in tqdm(range(1, NUM_RUNS + 1), desc=f"Playing ({model_name})", unit="game"):
            try:
                # Pass visualize=False to the main evaluation runs
                game_result = play_game(llm_player, GENERATION_CONFIG, visualize=False)  # Use default random seed per game
                game_result["run_id"] = run_id  # Add run_id for tracking
                # Log generation parameters as held by the player for this run
                game_result["temperature"] = llm_player.generation_config.get("temperature")
                game_result["max_new_tokens"] = llm_player.generation_config.get("max_new_tokens")
                game_result["top_p"] = llm_player.generation_config.get("top_p")
                game_result["min_p"] = llm_player.generation_config.get("min_p")
                log_results(game_result, RESULTS_FILE)
                run_results.append(game_result)
            except Exception as game_err:
                print(f"\n  Error during game run {run_id} for model {model_name}: {game_err}")
                traceback.print_exc()
                # Log a placeholder row so the failed run still shows up in the CSV
                error_result = {
                    "timestamp": datetime.now().isoformat(),
                    "model_name": model_name,
                    "inference_type": inference_type,
                    "run_id": run_id,
                    "game_seed": None,
                    "max_tile_achieved": 0,
                    "final_score": 0,
                    "num_moves": 0,
                    "win": False,
                    "duration_seconds": 0,
                    "temperature": GENERATION_CONFIG.get("temperature"),
                    "max_new_tokens": GENERATION_CONFIG.get("max_new_tokens"),
                    "top_p": GENERATION_CONFIG.get("top_p"),
                    "min_p": GENERATION_CONFIG.get("min_p"),
                    "error": f"Game execution error: {str(game_err)}"
                }
                log_results(error_result, RESULTS_FILE)
                run_results.append(error_result)  # Add error result to list
        # --------------------------------------

        # --- Calculate Summary Statistics for this Model ---
        if run_results:
            # Only runs that crashed ("Game execution error") are excluded; runs that
            # ended stuck or with an LLM error still count towards the statistics.
            successful_runs = [r for r in run_results if r.get("error") is None or "Game execution error" not in str(r.get("error", ""))]
            num_successful = len(successful_runs)
            if num_successful > 0:
                avg_score = np.mean([r['final_score'] for r in successful_runs])
                std_score = np.std([r['final_score'] for r in successful_runs])
                avg_max_tile = np.mean([r['max_tile_achieved'] for r in successful_runs])
                std_max_tile = np.std([r['max_tile_achieved'] for r in successful_runs])
                win_rate = np.mean([r['win'] for r in successful_runs]) * 100
                avg_moves = np.mean([r['num_moves'] for r in successful_runs])

                model_summary_stats[model_name] = {
                    "avg_score": avg_score,
                    "std_score": std_score,
                    "avg_max_tile": avg_max_tile,
                    "std_max_tile": std_max_tile,
                    "win_rate_percent": win_rate,
                    "avg_moves": avg_moves,
                    "successful_runs": num_successful,
                    "total_runs": NUM_RUNS
                }
                print(f"\n  Summary for {model_name} ({num_successful}/{NUM_RUNS} successful runs):")
                print(f"    Avg Score: {avg_score:.2f} ± {std_score:.2f}")
                print(f"    Avg Max Tile: {avg_max_tile:.2f} ± {std_max_tile:.2f}")
                print(f"    Win Rate (>=2048): {win_rate:.1f}%")
                print(f"    Avg Moves: {avg_moves:.1f}")
            else:
                print(f"\n  No successful standard runs completed for {model_name} to calculate stats.")
                model_summary_stats[model_name] = {"successful_runs": 0, "total_runs": NUM_RUNS, "error": "No successful runs"}
        else:
            print(f"\n  No results recorded for standard runs of {model_name}.")
            model_summary_stats[model_name] = {"successful_runs": 0, "total_runs": NUM_RUNS, "error": "No standard runs recorded"}
        # ----------------------------------------------------

    except Exception as model_init_err:
        print(f"\n  FATAL ERROR initializing or running model {model_name}: {model_init_err}")
        traceback.print_exc()
        model_summary_stats[model_name] = {"successful_runs": 0, "total_runs": NUM_RUNS, "error": f"Initialization/Fatal Error: {str(model_init_err)}"}

    finally:
        # --- Cleanup LLM Resources ---
        if llm_player is not None:
            print(f"Cleaning up resources for model: {model_name}")
            llm_player.cleanup()
            llm_player = None  # Ensure it's cleared before next loop iteration
        print("-" * 40)  # Separator after model cleanup

print("\n" + "="*30)
print("=== Evaluation Complete ===")
print("="*30 + "\n")

# --- Final Summary ---
print("Overall Model Summary:")
for name, stats in model_summary_stats.items():
    print(f"\nModel: {name}")
    if stats.get("error") and stats.get("successful_runs", -1) == 0:  # Error recorded and no successful runs
        print(f"  Status: Failed ({stats.get('error', 'Unknown error')})")
    elif "successful_runs" in stats:
        print(f"  Successful Runs: {stats.get('successful_runs', 'N/A')}/{stats.get('total_runs', 'N/A')}")
        # Numeric formatting is only safe when the stats were actually calculated
        if "avg_score" in stats:
            print(f"    Avg Score: {stats.get('avg_score', 'N/A'):.2f} ± {stats.get('std_score', 'N/A'):.2f}")
            print(f"    Avg Max Tile: {stats.get('avg_max_tile', 'N/A'):.2f} ± {stats.get('std_max_tile', 'N/A'):.2f}")
            print(f"    Win Rate (%): {stats.get('win_rate_percent', 'N/A'):.1f}")
            print(f"    Avg Moves: {stats.get('avg_moves', 'N/A'):.1f}")
        else:
            print("    (No stats calculated - likely no successful runs)")
    else:
        print("  Status: Unknown (No stats dictionary populated correctly)")


print(f"\nDetailed results logged to: {RESULTS_FILE}")

In [None]:
# Re-print the per-model summary collected in `model_summary_stats`
# (handy for viewing the results again without re-running the evaluation).
print("Overall Model Summary:")
for name, stats in model_summary_stats.items():
    print(f"\nModel: {name}")
    if "error" in stats and stats.get("successful_runs", 0) == 0:
        print(f"  Status: Failed ({stats.get('error', 'Unknown error')})")
        continue

    print(f"  Successful Runs: {stats.get('successful_runs', 'N/A')}/{stats.get('total_runs', 'N/A')}")
    if "avg_score" not in stats:
        # Applying a numeric format spec to an 'N/A' placeholder would raise
        # ValueError, so report the missing statistics instead.
        print("    (No stats calculated - likely no successful runs)")
        continue

    print(f"    Avg Score: {stats['avg_score']:.2f}")
    print(f"    Avg Max Tile: {stats['avg_max_tile']:.2f}")
    print(f"    Win Rate (%): {stats['win_rate_percent']:.1f}")
    print(f"    Avg Moves: {stats['avg_moves']:.1f}")


print(f"\nDetailed results logged to: {RESULTS_FILE}")

In [None]:




# --- Optional: Analysis with Pandas ---
# import pandas as pd
# try:
#     df = pd.read_csv(RESULTS_FILE)
#     print("\n--- Results DataFrame Head ---")
#     print(df.head())
#     print("\n--- Basic Stats per Model ---")
#     print(df.groupby('model_name')[['max_tile_achieved', 'final_score', 'win', 'duration_seconds']].agg(['mean', 'std', 'count', 'max']))
# except FileNotFoundError:
#     print(f"\nResults file {RESULTS_FILE} not found for analysis.")
# except Exception as pd_err:
#      print(f"\nError analyzing results with pandas: {pd_err}")
