# Snake AI Hyperparameter Experimentation

This notebook provides tools for:
- Interactive hyperparameter exploration
- Running small-scale experiments
- Visualizing parameter sensitivity
- Prototyping new reward functions

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import itertools
from pathlib import Path
import sys
import torch
import torch.nn as nn
import torch.optim as optim
from typing import Dict, List, Tuple
import time
import json

# Add the src directory to Python path
sys.path.append('../src')

# Import our Snake AI modules
try:
    from snake_ai import DQN, SnakeAI
    from snake_game import SnakeGame
    from utils import get_state
    print("✅ Snake AI modules imported successfully!")
except ImportError as e:
    print(f"❌ Error importing modules: {e}")
    print("Make sure you're running this from the correct directory.")

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Hyperparameter experimentation notebook ready!")

❌ Error importing modules: cannot import name 'get_state' from 'utils' (c:\Users\jross\Source\ai-snake\notebooks\../src\utils.py)
Make sure you're running this from the correct directory.
Hyperparameter experimentation notebook ready!


## 1. Define Experimental Framework

In [None]:
class SimpleTrainer:
    """Simplified trainer for quick hyperparameter experiments."""
    
    def __init__(self, input_size=11, output_size=3, board_size=10):
        self.input_size = input_size
        self.output_size = output_size
        self.board_size = board_size
        
    def quick_train(self, hyperparams: Dict, episodes: int = 200) -> Dict:
        """Run a quick training session with given hyperparameters."""
        
        # Create model and optimizer
        model = DQN(self.input_size, self.output_size)
        optimizer = optim.Adam(model.parameters(), lr=hyperparams.get('learning_rate', 0.001))
        
        # Training parameters
        epsilon = hyperparams.get('epsilon_start', 1.0)
        epsilon_end = hyperparams.get('epsilon_end', 0.01)
        epsilon_decay = hyperparams.get('epsilon_decay', 0.995)
        
        # Reward weights
        food_reward = hyperparams.get('food_reward', 100)
        survival_reward = hyperparams.get('survival_reward', 0.02)
        collision_penalty = hyperparams.get('collision_penalty', -10)
        
        # Training metrics
        scores = []
        rewards = []
        episode_lengths = []
        
        for episode in range(episodes):
            game = SnakeGame(self.board_size, self.board_size)
            game.reset()
            
            total_reward = 0
            steps = 0
            
            while not game.game_over and steps < 200:  # Limit steps
                # Get state
                state = get_state(game)
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                
                # Choose action (epsilon-greedy)
                if np.random.random() < epsilon:
                    action = np.random.randint(0, 3)
                else:
                    with torch.no_grad():
                        q_values = model(state_tensor)
                        action = q_values.argmax().item()
                
                # Take action
                old_score = game.score
                game.move(action)
                new_score = game.score
                
                # Calculate reward
                reward = 0
                if new_score > old_score:  # Ate food
                    reward += food_reward
                elif game.game_over:  # Collision
                    reward += collision_penalty
                else:  # Survival
                    reward += survival_reward
                
                total_reward += reward
                steps += 1
                
                # Simple training step (without replay buffer for speed)
                if not game.game_over:
                    next_state = get_state(game)
                    next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
                    
                    with torch.no_grad():
                        next_q_values = model(next_state_tensor)
                        target = reward + 0.99 * next_q_values.max()
                else:
                    target = reward
                
                # Update model
                q_values = model(state_tensor)
                current_q = q_values[0][action]
                loss = nn.MSELoss()(current_q, torch.tensor(target, dtype=torch.float32))
                
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            # Update epsilon
            epsilon = max(epsilon_end, epsilon * epsilon_decay)
            
            # Record metrics
            scores.append(game.score)
            rewards.append(total_reward)
            episode_lengths.append(steps)
        
        # Calculate performance metrics
        avg_score = np.mean(scores[-50:])  # Average of last 50 episodes
        max_score = np.max(scores)
        success_rate = np.mean([s > 0 for s in scores[-50:]])
        convergence_speed = next((i for i, s in enumerate(scores) if s >= 5), episodes)
        
        return {
            'avg_score': avg_score,
            'max_score': max_score,
            'success_rate': success_rate,
            'convergence_speed': convergence_speed,
            'final_epsilon': epsilon,
            'scores': scores,
            'rewards': rewards,
            'episode_lengths': episode_lengths
        }

# Create trainer instance
trainer = SimpleTrainer()
print("Experimental framework ready!")

## 2. Single Parameter Experiments

In [None]:
# Experiment with learning rates
def experiment_learning_rates():
    """Test different learning rates."""
    learning_rates = [0.0001, 0.001, 0.005, 0.01, 0.05]
    results = []
    
    print("Experimenting with learning rates...")
    
    for lr in learning_rates:
        print(f"  Testing learning rate: {lr}")
        
        hyperparams = {
            'learning_rate': lr,
            'epsilon_start': 1.0,
            'epsilon_end': 0.01,
            'epsilon_decay': 0.995,
            'food_reward': 100,
            'survival_reward': 0.02,
            'collision_penalty': -10
        }
        
        result = trainer.quick_train(hyperparams, episodes=150)
        result['learning_rate'] = lr
        results.append(result)
    
    return pd.DataFrame(results)

# Run learning rate experiment
lr_results = experiment_learning_rates()
print("\nLearning Rate Experiment Results:")
display(lr_results[['learning_rate', 'avg_score', 'max_score', 'success_rate', 'convergence_speed']].round(3))

In [None]:
# Visualize learning rate results
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Average score vs learning rate
axes[0,0].plot(lr_results['learning_rate'], lr_results['avg_score'], 'bo-', linewidth=2, markersize=8)
axes[0,0].set_title('Average Score vs Learning Rate')
axes[0,0].set_xlabel('Learning Rate')
axes[0,0].set_ylabel('Average Score (Last 50 Episodes)')
axes[0,0].set_xscale('log')
axes[0,0].grid(True, alpha=0.3)

# Success rate vs learning rate
axes[0,1].plot(lr_results['learning_rate'], lr_results['success_rate'], 'ro-', linewidth=2, markersize=8)
axes[0,1].set_title('Success Rate vs Learning Rate')
axes[0,1].set_xlabel('Learning Rate')
axes[0,1].set_ylabel('Success Rate')
axes[0,1].set_xscale('log')
axes[0,1].grid(True, alpha=0.3)

# Convergence speed vs learning rate
axes[1,0].plot(lr_results['learning_rate'], lr_results['convergence_speed'], 'go-', linewidth=2, markersize=8)
axes[1,0].set_title('Convergence Speed vs Learning Rate')
axes[1,0].set_xlabel('Learning Rate')
axes[1,0].set_ylabel('Episodes to First Score ≥ 5')
axes[1,0].set_xscale('log')
axes[1,0].grid(True, alpha=0.3)

# Best learning rate summary
best_lr_idx = lr_results['avg_score'].idxmax()
best_lr = lr_results.loc[best_lr_idx, 'learning_rate']
best_score = lr_results.loc[best_lr_idx, 'avg_score']

axes[1,1].bar(['Best LR'], [best_score], color='skyblue')
axes[1,1].set_title(f'Best Learning Rate: {best_lr}')
axes[1,1].set_ylabel('Average Score')

plt.tight_layout()
plt.show()

print(f"\n🏆 Best learning rate: {best_lr} (avg score: {best_score:.2f})")

## 3. Reward Weight Exploration

In [None]:
# Experiment with reward weights
def experiment_reward_weights():
    """Test different reward weight combinations."""
    
    # Define parameter ranges
    food_rewards = [50, 100, 200]
    survival_rewards = [0.01, 0.02, 0.05]
    collision_penalties = [-5, -10, -20]
    
    results = []
    
    print("Experimenting with reward weights...")
    total_combinations = len(food_rewards) * len(survival_rewards) * len(collision_penalties)
    current = 0
    
    for food_r in food_rewards:
        for survival_r in survival_rewards:
            for collision_p in collision_penalties:
                current += 1
                print(f"  Testing combination {current}/{total_combinations}: F={food_r}, S={survival_r}, C={collision_p}")
                
                hyperparams = {
                    'learning_rate': best_lr,  # Use best learning rate from previous experiment
                    'epsilon_start': 1.0,
                    'epsilon_end': 0.01,
                    'epsilon_decay': 0.995,
                    'food_reward': food_r,
                    'survival_reward': survival_r,
                    'collision_penalty': collision_p
                }
                
                result = trainer.quick_train(hyperparams, episodes=150)
                result.update({
                    'food_reward': food_r,
                    'survival_reward': survival_r,
                    'collision_penalty': collision_p
                })
                results.append(result)
    
    return pd.DataFrame(results)

# Run reward weight experiment
reward_results = experiment_reward_weights()
print("\nReward Weight Experiment Results:")
display(reward_results[['food_reward', 'survival_reward', 'collision_penalty', 
                      'avg_score', 'max_score', 'success_rate']].round(3))

In [None]:
# Create interactive 3D visualization of reward weight exploration
fig = go.Figure()

# Create 3D scatter plot
fig.add_trace(go.Scatter3d(
    x=reward_results['food_reward'],
    y=reward_results['survival_reward'],
    z=reward_results['collision_penalty'],
    mode='markers',
    marker=dict(
        size=reward_results['avg_score'] * 2,  # Size based on performance
        color=reward_results['avg_score'],
        colorscale='Viridis',
        showscale=True,
        colorbar=dict(title="Average Score")
    ),
    text=[f"Food: {f}<br>Survival: {s}<br>Collision: {c}<br>Avg Score: {score:.2f}" 
          for f, s, c, score in zip(reward_results['food_reward'], 
                                   reward_results['survival_reward'],
                                   reward_results['collision_penalty'],
                                   reward_results['avg_score'])],
    hovertemplate='%{text}<extra></extra>'
))

fig.update_layout(
    title='Reward Weight Exploration (3D)',
    scene=dict(
        xaxis_title='Food Reward',
        yaxis_title='Survival Reward',
        zaxis_title='Collision Penalty'
    ),
    height=600
)

fig.show()

# Find best configuration
best_config_idx = reward_results['avg_score'].idxmax()
best_config = reward_results.loc[best_config_idx]

print(f"\n🏆 Best reward configuration:")
print(f"   Food Reward: {best_config['food_reward']}")
print(f"   Survival Reward: {best_config['survival_reward']}")
print(f"   Collision Penalty: {best_config['collision_penalty']}")
print(f"   Average Score: {best_config['avg_score']:.2f}")
print(f"   Success Rate: {best_config['success_rate']:.2f}")

## 4. Epsilon Decay Strategy Exploration

In [None]:
# Experiment with epsilon decay strategies
def experiment_epsilon_strategies():
    """Test different epsilon decay strategies."""
    
    strategies = [
        {'name': 'Slow Decay', 'epsilon_start': 1.0, 'epsilon_end': 0.01, 'epsilon_decay': 0.999},
        {'name': 'Medium Decay', 'epsilon_start': 1.0, 'epsilon_end': 0.01, 'epsilon_decay': 0.995},
        {'name': 'Fast Decay', 'epsilon_start': 1.0, 'epsilon_end': 0.01, 'epsilon_decay': 0.990},
        {'name': 'Very Fast Decay', 'epsilon_start': 1.0, 'epsilon_end': 0.01, 'epsilon_decay': 0.980},
        {'name': 'Linear Decay', 'epsilon_start': 1.0, 'epsilon_end': 0.01, 'epsilon_decay': 0.995}  # We'll handle this specially
    ]
    
    results = []
    
    print("Experimenting with epsilon decay strategies...")
    
    for strategy in strategies:
        print(f"  Testing strategy: {strategy['name']}")
        
        hyperparams = {
            'learning_rate': best_lr,
            'epsilon_start': strategy['epsilon_start'],
            'epsilon_end': strategy['epsilon_end'],
            'epsilon_decay': strategy['epsilon_decay'],
            'food_reward': best_config['food_reward'],
            'survival_reward': best_config['survival_reward'],
            'collision_penalty': best_config['collision_penalty']
        }
        
        result = trainer.quick_train(hyperparams, episodes=200)
        result['strategy_name'] = strategy['name']
        result['epsilon_decay'] = strategy['epsilon_decay']
        results.append(result)
    
    return pd.DataFrame(results)

# Run epsilon strategy experiment
epsilon_results = experiment_epsilon_strategies()
print("\nEpsilon Strategy Experiment Results:")
display(epsilon_results[['strategy_name', 'epsilon_decay', 'avg_score', 
                        'max_score', 'success_rate', 'convergence_speed']].round(3))

In [None]:
# Visualize epsilon strategy results
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Performance by strategy
strategy_names = epsilon_results['strategy_name']
avg_scores = epsilon_results['avg_score']

axes[0,0].bar(strategy_names, avg_scores, color='lightblue')
axes[0,0].set_title('Average Score by Epsilon Strategy')
axes[0,0].set_ylabel('Average Score')
axes[0,0].tick_params(axis='x', rotation=45)

# Convergence speed by strategy
axes[0,1].bar(strategy_names, epsilon_results['convergence_speed'], color='lightgreen')
axes[0,1].set_title('Convergence Speed by Epsilon Strategy')
axes[0,1].set_ylabel('Episodes to Score ≥ 5')
axes[0,1].tick_params(axis='x', rotation=45)

# Success rate by strategy
axes[1,0].bar(strategy_names, epsilon_results['success_rate'], color='lightcoral')
axes[1,0].set_title('Success Rate by Epsilon Strategy')
axes[1,0].set_ylabel('Success Rate')
axes[1,0].tick_params(axis='x', rotation=45)

# Learning curves for top strategies
top_strategies = epsilon_results.nlargest(3, 'avg_score')
for _, strategy in top_strategies.iterrows():
    scores = strategy['scores']
    rolling_avg = pd.Series(scores).rolling(20, min_periods=1).mean()
    axes[1,1].plot(rolling_avg, label=strategy['strategy_name'], alpha=0.8, linewidth=2)

axes[1,1].set_title('Learning Curves (Top 3 Strategies)')
axes[1,1].set_xlabel('Episode')
axes[1,1].set_ylabel('Rolling Average Score (20 episodes)')
axes[1,1].legend()
axes[1,1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Find best epsilon strategy
best_epsilon_idx = epsilon_results['avg_score'].idxmax()
best_epsilon_strategy = epsilon_results.loc[best_epsilon_idx]

print(f"\n🏆 Best epsilon strategy: {best_epsilon_strategy['strategy_name']}")
print(f"   Epsilon decay: {best_epsilon_strategy['epsilon_decay']}")
print(f"   Average score: {best_epsilon_strategy['avg_score']:.2f}")
print(f"   Convergence speed: {best_epsilon_strategy['convergence_speed']} episodes")

## 5. Custom Reward Function Prototyping

In [None]:
class CustomRewardTrainer(SimpleTrainer):
    """Trainer with custom reward functions for experimentation."""
    
    def distance_based_reward(self, game, old_score, action, distance_to_food_before, distance_to_food_after):
        """Reward function that considers distance to food."""
        reward = 0
        
        # Food reward
        if game.score > old_score:
            reward += 100
        
        # Distance reward/penalty
        if distance_to_food_after < distance_to_food_before:
            reward += 1  # Moving closer to food
        elif distance_to_food_after > distance_to_food_before:
            reward -= 0.5  # Moving away from food
        
        # Collision penalty
        if game.game_over:
            reward -= 10
        else:
            reward += 0.01  # Survival bonus
        
        return reward
    
    def length_based_reward(self, game, old_score, action, snake_length):
        """Reward function that scales with snake length."""
        reward = 0
        
        # Food reward (scales with length)
        if game.score > old_score:
            reward += 50 + (snake_length * 10)  # More reward as snake gets longer
        
        # Collision penalty (also scales with length)
        if game.game_over:
            reward -= (5 + snake_length * 2)  # Bigger penalty for longer snakes
        else:
            reward += 0.02  # Survival bonus
        
        return reward
    
    def efficiency_reward(self, game, old_score, action, steps_since_food):
        """Reward function that encourages efficiency."""
        reward = 0
        
        # Food reward (bonus for quick finding)
        if game.score > old_score:
            efficiency_bonus = max(0, 50 - steps_since_food)  # Bonus decreases with time
            reward += 100 + efficiency_bonus
        
        # Penalty for taking too long
        if steps_since_food > 30:
            reward -= 0.1
        
        # Collision penalty
        if game.game_over:
            reward -= 15
        else:
            reward += 0.01
        
        return reward
    
    def test_custom_reward(self, reward_function_name, episodes=150):
        """Test a custom reward function."""
        model = DQN(self.input_size, self.output_size)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        
        epsilon = 1.0
        epsilon_decay = 0.995
        epsilon_end = 0.01
        
        scores = []
        rewards = []
        
        for episode in range(episodes):
            game = SnakeGame(self.board_size, self.board_size)
            game.reset()
            
            total_reward = 0
            steps = 0
            steps_since_food = 0
            
            while not game.game_over and steps < 200:
                # Get state
                state = get_state(game)
                state_tensor = torch.FloatTensor(state).unsqueeze(0)
                
                # Calculate distance to food (for distance-based reward)
                snake_head = (game.snake[0][0], game.snake[0][1])
                food_pos = (game.food[0], game.food[1])
                distance_before = abs(snake_head[0] - food_pos[0]) + abs(snake_head[1] - food_pos[1])
                
                # Choose action
                if np.random.random() < epsilon:
                    action = np.random.randint(0, 3)
                else:
                    with torch.no_grad():
                        q_values = model(state_tensor)
                        action = q_values.argmax().item()
                
                # Take action
                old_score = game.score
                snake_length = len(game.snake)
                game.move(action)
                
                # Calculate distance after move
                if not game.game_over:
                    snake_head_after = (game.snake[0][0], game.snake[0][1])
                    distance_after = abs(snake_head_after[0] - food_pos[0]) + abs(snake_head_after[1] - food_pos[1])
                else:
                    distance_after = distance_before
                
                # Calculate custom reward
                if reward_function_name == 'distance_based':
                    reward = self.distance_based_reward(game, old_score, action, distance_before, distance_after)
                elif reward_function_name == 'length_based':
                    reward = self.length_based_reward(game, old_score, action, snake_length)
                elif reward_function_name == 'efficiency':
                    reward = self.efficiency_reward(game, old_score, action, steps_since_food)
                else:
                    # Default reward
                    reward = 100 if game.score > old_score else (-10 if game.game_over else 0.02)
                
                total_reward += reward
                steps += 1
                
                # Update steps since food
                if game.score > old_score:
                    steps_since_food = 0
                else:
                    steps_since_food += 1
                
                # Training step (simplified)
                if not game.game_over:
                    next_state = get_state(game)
                    next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0)
                    with torch.no_grad():
                        next_q_values = model(next_state_tensor)
                        target = reward + 0.99 * next_q_values.max()
                else:
                    target = reward
                
                q_values = model(state_tensor)
                current_q = q_values[0][action]
                loss = nn.MSELoss()(current_q, torch.tensor(target, dtype=torch.float32))
                
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
            epsilon = max(epsilon_end, epsilon * epsilon_decay)
            scores.append(game.score)
            rewards.append(total_reward)
        
        return {
            'reward_function': reward_function_name,
            'avg_score': np.mean(scores[-50:]),
            'max_score': np.max(scores),
            'success_rate': np.mean([s > 0 for s in scores[-50:]]),
            'scores': scores,
            'rewards': rewards
        }

# Test custom reward functions
custom_trainer = CustomRewardTrainer()
reward_functions = ['distance_based', 'length_based', 'efficiency', 'default']

custom_results = []
print("Testing custom reward functions...")

for rf in reward_functions:
    print(f"  Testing {rf} reward function...")
    result = custom_trainer.test_custom_reward(rf, episodes=150)
    custom_results.append(result)

custom_df = pd.DataFrame(custom_results)
print("\nCustom Reward Function Results:")
display(custom_df[['reward_function', 'avg_score', 'max_score', 'success_rate']].round(3))

In [None]:
# Visualize custom reward function performance
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Performance comparison
reward_names = custom_df['reward_function']
avg_scores = custom_df['avg_score']

colors = ['skyblue', 'lightgreen', 'lightcoral', 'gold']
axes[0,0].bar(reward_names, avg_scores, color=colors)
axes[0,0].set_title('Average Score by Reward Function')
axes[0,0].set_ylabel('Average Score')
axes[0,0].tick_params(axis='x', rotation=45)

# Success rate comparison
axes[0,1].bar(reward_names, custom_df['success_rate'], color=colors)
axes[0,1].set_title('Success Rate by Reward Function')
axes[0,1].set_ylabel('Success Rate')
axes[0,1].tick_params(axis='x', rotation=45)

# Learning curves
for i, result in enumerate(custom_results):
    scores = result['scores']
    rolling_avg = pd.Series(scores).rolling(20, min_periods=1).mean()
    axes[1,0].plot(rolling_avg, label=result['reward_function'], 
                  color=colors[i], alpha=0.8, linewidth=2)

axes[1,0].set_title('Learning Curves Comparison')
axes[1,0].set_xlabel('Episode')
axes[1,0].set_ylabel('Rolling Average Score (20 episodes)')
axes[1,0].legend()
axes[1,0].grid(True, alpha=0.3)

# Max score comparison
axes[1,1].bar(reward_names, custom_df['max_score'], color=colors)
axes[1,1].set_title('Maximum Score by Reward Function')
axes[1,1].set_ylabel('Maximum Score')
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Find best custom reward function
best_custom_idx = custom_df['avg_score'].idxmax()
best_custom = custom_df.loc[best_custom_idx]

print(f"\n🏆 Best custom reward function: {best_custom['reward_function']}")
print(f"   Average score: {best_custom['avg_score']:.2f}")
print(f"   Max score: {best_custom['max_score']:.0f}")
print(f"   Success rate: {best_custom['success_rate']:.2f}")

## 6. Summary and Recommendations

In [None]:
# Create comprehensive summary
print("\n" + "="*60)
print("        HYPERPARAMETER EXPERIMENTATION SUMMARY")
print("="*60)

print(f"\n🔬 EXPERIMENTS CONDUCTED:")
print(f"   • Learning Rate Optimization ({len(lr_results)} configurations)")
print(f"   • Reward Weight Exploration ({len(reward_results)} combinations)")
print(f"   • Epsilon Decay Strategies ({len(epsilon_results)} strategies)")
print(f"   • Custom Reward Functions ({len(custom_results)} functions)")

print(f"\n🏆 BEST CONFIGURATIONS FOUND:")
print(f"   Learning Rate: {best_lr}")
print(f"   Reward Weights:")
print(f"     - Food: {best_config['food_reward']}")
print(f"     - Survival: {best_config['survival_reward']}")
print(f"     - Collision: {best_config['collision_penalty']}")
print(f"   Epsilon Strategy: {best_epsilon_strategy['strategy_name']}")
print(f"   Custom Reward: {best_custom['reward_function']}")

print(f"\n📊 PERFORMANCE SUMMARY:")
print(f"   Best Average Score: {max(lr_results['avg_score'].max(), reward_results['avg_score'].max(), epsilon_results['avg_score'].max(), custom_df['avg_score'].max()):.2f}")
print(f"   Best Max Score: {max(lr_results['max_score'].max(), reward_results['max_score'].max(), epsilon_results['max_score'].max(), custom_df['max_score'].max()):.0f}")
print(f"   Best Success Rate: {max(lr_results['success_rate'].max(), reward_results['success_rate'].max(), epsilon_results['success_rate'].max(), custom_df['success_rate'].max()):.2f}")

print(f"\n💡 RECOMMENDATIONS:")
print(f"   1. Use learning rate: {best_lr} for optimal convergence")
print(f"   2. Apply reward weights: Food={best_config['food_reward']}, Survival={best_config['survival_reward']}, Collision={best_config['collision_penalty']}")
print(f"   3. Implement {best_epsilon_strategy['strategy_name'].lower()} epsilon decay")
print(f"   4. Consider {best_custom['reward_function']} reward function for improved performance")
print(f"   5. Run longer experiments (500+ episodes) with these optimal settings")

print(f"\n🔄 NEXT STEPS:")
print(f"   • Integrate best hyperparameters into main training script")
print(f"   • Test on different board sizes and configurations")
print(f"   • Experiment with network architecture changes")
print(f"   • Implement replay buffer and target networks")

print("="*60)

## 7. Export Results

In [None]:
# Save all experimental results
output_dir = Path('../hyperparameter_experiments')
output_dir.mkdir(exist_ok=True)

timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')

# Save individual experiment results
lr_results.to_csv(output_dir / f'learning_rate_experiment_{timestamp}.csv', index=False)
reward_results.to_csv(output_dir / f'reward_weights_experiment_{timestamp}.csv', index=False)
epsilon_results.to_csv(output_dir / f'epsilon_strategies_experiment_{timestamp}.csv', index=False)
custom_df.to_csv(output_dir / f'custom_rewards_experiment_{timestamp}.csv', index=False)

# Create optimal configuration file
optimal_config = {
    'learning_rate': float(best_lr),
    'reward_weights': {
        'food_reward': int(best_config['food_reward']),
        'survival_reward': float(best_config['survival_reward']),
        'collision_penalty': int(best_config['collision_penalty'])
    },
    'epsilon_strategy': {
        'name': best_epsilon_strategy['strategy_name'],
        'epsilon_start': 1.0,
        'epsilon_end': 0.01,
        'epsilon_decay': float(best_epsilon_strategy['epsilon_decay'])
    },
    'best_custom_reward': best_custom['reward_function'],
    'experiment_date': timestamp,
    'performance_summary': {
        'best_avg_score': float(max(lr_results['avg_score'].max(), reward_results['avg_score'].max(), 
                                   epsilon_results['avg_score'].max(), custom_df['avg_score'].max())),
        'best_max_score': int(max(lr_results['max_score'].max(), reward_results['max_score'].max(), 
                                 epsilon_results['max_score'].max(), custom_df['max_score'].max())),
        'best_success_rate': float(max(lr_results['success_rate'].max(), reward_results['success_rate'].max(), 
                                      epsilon_results['success_rate'].max(), custom_df['success_rate'].max()))
    }
}

# Save optimal configuration as JSON
with open(output_dir / f'optimal_hyperparameters_{timestamp}.json', 'w') as f:
    json.dump(optimal_config, f, indent=2)

print(f"✅ All experimental results saved to: {output_dir}")
print(f"\n📁 Files created:")
print(f"   • learning_rate_experiment_{timestamp}.csv")
print(f"   • reward_weights_experiment_{timestamp}.csv")
print(f"   • epsilon_strategies_experiment_{timestamp}.csv")
print(f"   • custom_rewards_experiment_{timestamp}.csv")
print(f"   • optimal_hyperparameters_{timestamp}.json")

print(f"\n🎯 Hyperparameter experimentation complete!")
print(f"Use the optimal configuration file to improve your main training setup.")