In [1]:
from dotenv import load_dotenv
from huggingface_hub import login
import os

import importlib
import benchmark
import llm_agent
from llm_providers import create_llm
from persona_loader import list_persona_ids

# Re-import the local project modules so that edits made on disk are picked up
# without restarting the kernel.
for _module in (benchmark, llm_agent):
    importlib.reload(_module)

# Authenticate with the Hugging Face Hub. The token is read from the
# environment after load_dotenv() has run; do not paste it into the notebook.
load_dotenv()
HF_TOKEN = os.getenv("HF_TOKEN")
login(HF_TOKEN)


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [2]:
# ===== MODEL CONFIGURATION =====
# MODEL_CHOICE selects which entry of MODEL_CONFIGS the experiment uses.
# Valid keys: "llama31", "mistral", "gemma2"

MODEL_CHOICE = "gemma2"  # Edit this value to switch models

# One settings dict per supported model, keyed by the short name used in
# MODEL_CHOICE. Every entry is a locally loaded Hugging Face model.
MODEL_CONFIGS = dict(
    llama31=dict(
        type="local_hf",
        model_name="meta-llama/Llama-3.1-8B-Instruct",
        temperature=0.6,
        max_tokens=1024,
        # load_in_4bit=True
    ),
    mistral=dict(
        type="local_hf",
        model_name="mistralai/Mistral-7B-Instruct-v0.2",
        temperature=0.7,
        # NOTE(review): unlike the other entries this one sets no max_tokens;
        # confirm that relying on the loader's default is intended.
        # load_in_4bit=True
    ),
    gemma2=dict(
        type="local_hf",
        model_name="google/gemma-2-9b-it",
        temperature=0.7,
        max_tokens=2048,
    ),
)

## Full Persona Experiment

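The cell below runs the complete experiment: all 441 persona combinations (21 x 21, where each role is either `None` or one of personas 1-20), playing `GAMES_PER_COMBINATION` games per combination (5 for the full run described here).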

**⚠️ WARNING**: At 5 games per combination this will run **2,205 games** in total (441 x 5) and may take **many hours** depending on your model and hardware. The total scales with `GAMES_PER_COMBINATION`.

**Features:**
- Runs all combinations: (None, "1", ..., "20") x (None, "1", ..., "20")
- 5 games per combination for statistical significance
- **Saves intermediate results** after each combination (every 5 games)
- Progress tracking with time estimates
- Crash-resistant: can resume from intermediate results
- All results saved to `experiment_results/{experiment_name}/`

**Before running:**
1. Make sure you have enough disk space (~500MB-1GB for all results)
2. Consider starting with a smaller test run first
3. Monitor the progress output to estimate total time

In [None]:
# Import the experiment runner
from run_full_experiment import run_full_persona_experiment

# ===== EXPERIMENT CONFIGURATION =====
config = MODEL_CONFIGS[MODEL_CHOICE]

# Number of games per persona combination. The full experiment described in
# the markdown above uses 5; 1 gives a much quicker single pass.
GAMES_PER_COMBINATION = 1

# Whether to enable persona sharing (agents know each other's personas)
SHARED_PERSONA = True  # Set to False to disable

# Optional: Give this experiment a custom name
EXPERIMENT_NAME = None

# Directory that receives all results. Defined once so the printed paths can
# never drift from the value actually passed to the runner.
RESULTS_DIR = "experiment_results"

# Each role is either None or one of personas 1-20, i.e. 21 options per role.
# This is only used for the printed estimate below.
# NOTE(review): the runner presumably builds its own combination list; keep
# this in sync with the persona set it uses.
NUM_PERSONA_OPTIONS = 21
TOTAL_COMBINATIONS = NUM_PERSONA_OPTIONS ** 2  # 441

# ===== RUN THE FULL EXPERIMENT =====

print("⚠️  STARTING FULL EXPERIMENT - This will take a long time!")
print("Configuration:")
print(f"  - Model: {config['model_name']}")
print(f"  - Games per combination: {GAMES_PER_COMBINATION}")
print(f"  - Total games: {TOTAL_COMBINATIONS * GAMES_PER_COMBINATION}")
print(f"  - Persona sharing: {'Enabled' if SHARED_PERSONA else 'Disabled'}")
print(f"\nResults will be saved to: {RESULTS_DIR}/")
print("Intermediate results saved after each combination.\n")

# Build the benchmark and a single LLM instance that is handed to the runner.
bnch = benchmark.CodeNamesBenchmark()
shared_llm_instance = create_llm(config)

# Run the experiment
full_results = run_full_persona_experiment(
    bnch=bnch,
    config=config,
    shared_llm_instance=shared_llm_instance,
    num_games_per_combination=GAMES_PER_COMBINATION,
    experiment_name=EXPERIMENT_NAME,
    shared_persona=SHARED_PERSONA,
    results_dir=RESULTS_DIR
)

# The runner returns the final experiment name and the total duration in
# seconds; report the duration in hours.
experiment_name = full_results['experiment_metadata']['experiment_name']
duration_hours = full_results['summary_statistics']['total_duration_seconds'] / 3600

print("\n✅ EXPERIMENT COMPLETE!")
print(f"Results saved to: {RESULTS_DIR}/{experiment_name}/")
print(f"Total duration: {duration_hours:.2f} hours")

⚠️  STARTING FULL EXPERIMENT - This will take a long time!
Configuration:
  - Model: google/gemma-2-9b-it
  - Games per combination: 1
  - Total games: 441
  - Persona sharing: Enabled

Results will be saved to: experiment_results/
Intermediate results saved after each combination.



Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]




STARTING FULL PERSONA EXPERIMENT: full_persona_exp_20251118_213336
Model: google/gemma-2-9b-it
Persona IDs: 21 (None + 1-20)
Combinations: 441 (21 x 21)
Games per combination: 1
Total games: 441
Persona sharing: Enabled
Results directory: experiment_results/full_persona_exp_20251118_213336


--------------------------------------------------------------------------------
Combination 1/441: Codemaster=None, Guesser=None
--------------------------------------------------------------------------------

Starting collaborative game 0

Board State:
CZECH | DRILL | TRAIN | ANTARCTICA | UNICORN
TAIL | OCTOPUS | COURT | ANGEL | QUEEN
DATE | FORCE | BAT | CRICKET | CONCERT
NIGHT | PHOENIX | SWING | CAST | KETCHUP
BOTTLE | CRASH | LION | FACE | FIGURE


=== Turn 1 ===
Remaining words to guess: COURT, ANTARCTICA, ANGEL, FACE, TAIL, DATE, CONCERT, DRILL, BAT


## Load and Analyze Experiment Results

Use these cells to load results from a completed (or in-progress) experiment for analysis.

In [None]:
# Load results from a specific experiment
from run_full_experiment import load_experiment_results
import os

def list_experiment_dirs(results_dir):
    """Return the experiment sub-directory names in ``results_dir``, oldest first.

    ``os.listdir`` returns entries in arbitrary order, so the names are sorted
    by directory modification time (with the name as a tie-breaker). That makes
    the last element the most recently modified experiment.

    Args:
        results_dir: Path of the directory that holds one sub-directory per
            experiment.

    Returns:
        A list of sub-directory names. Plain files are ignored. The list is
        empty when ``results_dir`` is not an existing directory.
    """
    if not os.path.isdir(results_dir):
        return []

    def sort_key(name):
        return (os.path.getmtime(os.path.join(results_dir, name)), name)

    names = [
        name for name in os.listdir(results_dir)
        if os.path.isdir(os.path.join(results_dir, name))
    ]
    return sorted(names, key=sort_key)


# List available experiments (oldest first, most recent last)
results_dir = "experiment_results"
if os.path.exists(results_dir):
    experiments = list_experiment_dirs(results_dir)
    print("Available experiments:")
    for i, exp in enumerate(experiments, 1):
        print(f"  {i}. {exp}")
else:
    print("No experiments found yet.")
    experiments = []

# Load a specific experiment (change the experiment name)
if experiments:
    # Load the most recent experiment: the list is sorted oldest-first, so
    # the last entry is the one modified most recently.
    experiment_to_load = experiments[-1]
    print(f"\nLoading: {experiment_to_load}")

    loaded_results = load_experiment_results(experiment_to_load, results_dir=results_dir)
    metadata = loaded_results['experiment_metadata']

    # Display summary. Optional fields fall back to a placeholder because an
    # in-progress experiment may not have written them yet.
    print(f"\nExperiment: {metadata['experiment_name']}")
    print(f"Status: {loaded_results.get('status', 'complete')}")
    print(f"Combinations completed: {metadata.get('combinations_completed', 'N/A')}")
    print(f"Games completed: {metadata.get('games_completed', 'N/A')}")

    if 'progress' in metadata:
        progress = metadata['progress']
        print(f"Progress: {progress['percent_complete']:.1f}%")

    print(f"\nTotal combinations loaded: {len(loaded_results['all_combinations'])}")

In [None]:
# Quick analysis: Extract win rates for all combinations
import pandas as pd

def summarize_combinations(combinations):
    """Build a per-combination summary table from raw experiment results.

    Args:
        combinations: Iterable of dicts, one per persona combination.
            Entries that contain an ``'error'`` key are treated as failed and
            skipped. Missing numeric fields default to 0 and a missing or
            ``None`` persona id is shown as the string ``'None'``.

    Returns:
        A ``pandas.DataFrame`` with one row per successful combination. The
        columns are always present, even when there are no rows, so callers
        can index them without a ``KeyError``.
    """
    columns = [
        'codemaster_persona',
        'guesser_persona',
        'win_rate',
        'avg_turns',
        'avg_words_per_clue',
        'games_played',
        'total_correct_guesses',
        'total_incorrect_guesses',
    ]
    rows = [
        {
            'codemaster_persona': combo.get('codemaster_persona_id') or 'None',
            'guesser_persona': combo.get('guesser_persona_id') or 'None',
            'win_rate': combo.get('win_rate', 0),
            'avg_turns': combo.get('average_turns', 0),
            'avg_words_per_clue': combo.get('average_words_per_clue', 0),
            'games_played': combo.get('games_played', 0),
            'total_correct_guesses': combo.get('total_correct_guesses', 0),
            'total_incorrect_guesses': combo.get('total_incorrect_guesses', 0),
        }
        for combo in combinations
        if 'error' not in combo  # Skip failed combinations
    ]
    return pd.DataFrame(rows, columns=columns)


# Look the inputs up defensively so that running this cell before the loading
# cell prints the friendly message below instead of raising NameError.
available_experiments = globals().get('experiments')
loaded = globals().get('loaded_results')

if available_experiments and loaded is not None and 'all_combinations' in loaded:
    df_results = summarize_combinations(loaded['all_combinations'])

    print(f"Loaded {len(df_results)} combinations")

    if df_results.empty:
        # Every combination failed (or none were recorded): there is nothing
        # to average or rank.
        print("No successful combinations to analyze yet.")
    else:
        print("\nOverall Statistics:")
        print(f"  Mean win rate: {df_results['win_rate'].mean():.1%}")
        print(f"  Mean turns per game: {df_results['avg_turns'].mean():.1f}")
        print(f"  Mean words per clue: {df_results['avg_words_per_clue'].mean():.2f}")

        ranking_columns = ['codemaster_persona', 'guesser_persona', 'win_rate', 'avg_turns']

        print("\nTop 10 combinations by win rate:")
        top_10 = df_results.nlargest(10, 'win_rate')[ranking_columns]
        print(top_10.to_string(index=False))

        print("\nBottom 10 combinations by win rate:")
        bottom_10 = df_results.nsmallest(10, 'win_rate')[ranking_columns]
        print(bottom_10.to_string(index=False))

        # df_results stays in the kernel namespace for use in later cells.
        print("\n✓ Results DataFrame saved as 'df_results'")
else:
    print("No experiment results loaded yet. Run the experiment first!")