# Reinforcement Learning Training Notebook
## Arena Game AI Training with PPO and DQN

This notebook provides an interactive environment for training and evaluating RL agents for the Arena game.

### Features:
- 🎮 Train PPO and DQN agents
- 📊 Real-time training visualization
- 🔧 Configurable hyperparameters
- 📈 Performance metrics and analysis
- 💾 Model checkpointing and loading
- 🎯 Transfer learning support

## 1. Setup and Imports

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from IPython.display import display, clear_output, Image
import time

# Stable Baselines3
from stable_baselines3 import PPO, DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import (
    BaseCallback, CheckpointCallback, CallbackList, EvalCallback
)
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.results_plotter import load_results, ts2xy

# Local imports
from arena.wrapper.arena_env import ArenaEnv
from arena.core.callbacks import PlottingCallback, StopTrainingOnMaxEpisodes

# Plotting configuration
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
%matplotlib inline

print("‚úÖ All imports successful!")

## 2. Configuration

In [None]:
# ==================== TRAINING CONFIGURATION ====================

# Keyword arguments unpacked into the PPO constructor when a new PPO model
# is created.
ppo_hyperparams = {
    'learning_rate': 3e-4,
    'n_steps': 2048,
    'batch_size': 64,
    'n_epochs': 10,
    'gamma': 0.99,
    'gae_lambda': 0.95,
    'clip_range': 0.2,
    'ent_coef': 0.01,  # Entropy coefficient for exploration
    'vf_coef': 0.5,
}

# Keyword arguments unpacked into the DQN constructor when a new DQN model
# is created.
dqn_hyperparams = {
    'learning_rate': 1e-4,
    'buffer_size': 100_000,
    'batch_size': 32,
    'gamma': 0.99,
    'tau': 1.0,
    'exploration_fraction': 0.1,
    'exploration_initial_eps': 1.0,
    'exploration_final_eps': 0.05,
    'target_update_interval': 1000,
    'train_freq': 4,
}

# Single source of truth for the whole notebook; later cells read from it.
CONFIG = {
    # Algorithm Selection: 'ppo' or 'dqn'
    'algorithm': 'ppo',

    # Control Style: 1=Rot/Thrust, 2=Direct (Up/Down/Left/Right)
    'control_style': 2,

    # Training Parameters
    'total_timesteps': 1_000_000,  # Total training steps
    'max_episodes': None,          # Stop after X episodes (None = unlimited)
    'n_envs': 8,                   # Number of parallel environments

    # Checkpoint Settings
    'checkpoint_freq': 50_000,  # Save model every X steps
    'eval_freq': 10_000,        # Evaluate model every X steps
    'n_eval_episodes': 10,      # Number of episodes for evaluation

    # Directories
    'model_dir': 'models',
    'log_dir': 'logs',
    'tensorboard_log': 'logs/tensorboard',

    # Transfer Learning: path to pretrained model (None = train from scratch)
    'load_model': None,

    # Per-algorithm hyperparameters, keyed by the algorithm name
    'ppo': ppo_hyperparams,
    'dqn': dqn_hyperparams,
}

# Make sure every output directory exists before anything writes to it.
for directory_key in ('model_dir', 'log_dir', 'tensorboard_log'):
    os.makedirs(CONFIG[directory_key], exist_ok=True)

# Show the flat settings first; nested hyperparameter dicts are skipped here
# and only the selected algorithm's dict is printed afterwards.
rule = "=" * 50
print("üéØ Training Configuration:")
print(rule)
flat_settings = {
    name: setting
    for name, setting in CONFIG.items()
    if not isinstance(setting, dict)
}
for name, setting in flat_settings.items():
    print(f"{name:25s}: {setting}")
print()

selected_algorithm = CONFIG['algorithm']
print(f"üì¶ {selected_algorithm.upper()} Hyperparameters:")
print(rule)
for name, setting in CONFIG[selected_algorithm].items():
    print(f"{name:25s}: {setting}")

## 3. Custom Callback for Jupyter Visualization

In [None]:
class JupyterPlottingCallback(BaseCallback):
    """
    Custom callback for real-time plotting in Jupyter notebooks.

    Whenever ``n_calls`` is a multiple of ``plot_freq`` the Monitor logs under
    ``log_dir`` are reloaded and a dashboard replaces the previous cell
    output: episode rewards, episode lengths, the recent reward distribution,
    a rolling success rate and a text summary.

    The dashboard also reads the notebook-level ``CONFIG`` dict
    (``total_timesteps``, ``algorithm``, ``control_style``) for its progress
    line and its title.

    Args:
        log_dir: Directory passed to ``load_results`` to read episode logs.
        plot_freq: Redraw whenever ``n_calls`` is a multiple of this value.
        verbose: Verbosity level forwarded to ``BaseCallback``.
        success_threshold: An episode counts as a success when its reward is
            strictly greater than this value. Used by both the success-rate
            plot and the text summary.
    """
    def __init__(self, log_dir: str, plot_freq: int = 5000, verbose=1,
                 success_threshold: float = 0):
        super().__init__(verbose)
        self.log_dir = log_dir
        self.plot_freq = plot_freq
        self.success_threshold = success_threshold
        # Kept for backward compatibility; nothing in this class fills them.
        self.episode_rewards = []
        self.episode_lengths = []
        self.timesteps = []

    def _on_step(self) -> bool:
        """Redraw the dashboard on schedule; always returns True (never stops training)."""
        if self.n_calls % self.plot_freq == 0:
            self._plot_progress()
        return True

    def _plot_progress(self):
        """
        Draw the training dashboard from the current Monitor logs.

        Plotting is best effort: any exception is reported with a printed
        message and training carries on. The figure is always closed
        afterwards so a failure halfway through does not leave a partially
        drawn figure open.
        """
        fig = None
        try:
            # Load monitor results
            df = load_results(self.log_dir)
            if len(df) < 2:
                return

            clear_output(wait=True)

            # Create comprehensive visualization
            fig = plt.figure(figsize=(16, 10))
            gs = fig.add_gridspec(3, 2, hspace=0.3, wspace=0.3)

            # Calculate rolling statistics
            window = min(100, max(1, len(df) // 20))
            rewards = df['r'].values
            lengths = df['l'].values
            episodes = np.arange(len(df))
            # Slices shared by the "recent performance" summary lines.
            last_100_rewards = rewards[-100:]
            last_100_lengths = lengths[-100:]

            rewards_mean = pd.Series(rewards).rolling(window=window, min_periods=1).mean()
            lengths_mean = pd.Series(lengths).rolling(window=window, min_periods=1).mean()

            # Plot 1: Episode Rewards
            ax1 = fig.add_subplot(gs[0, 0])
            ax1.plot(episodes, rewards, alpha=0.2, color='steelblue', linewidth=0.5)
            ax1.plot(episodes, rewards_mean, color='darkblue', linewidth=2, label=f'Rolling Mean ({window})')
            ax1.fill_between(episodes, rewards, rewards_mean, alpha=0.1, color='steelblue')
            ax1.set_xlabel('Episode', fontsize=10)
            ax1.set_ylabel('Reward', fontsize=10)
            ax1.set_title('üìä Episode Rewards', fontsize=12, fontweight='bold')
            ax1.legend()
            ax1.grid(True, alpha=0.3)

            # Plot 2: Episode Lengths
            ax2 = fig.add_subplot(gs[0, 1])
            ax2.plot(episodes, lengths, alpha=0.2, color='forestgreen', linewidth=0.5)
            ax2.plot(episodes, lengths_mean, color='darkgreen', linewidth=2, label=f'Rolling Mean ({window})')
            ax2.fill_between(episodes, lengths, lengths_mean, alpha=0.1, color='forestgreen')
            ax2.set_xlabel('Episode', fontsize=10)
            ax2.set_ylabel('Length (steps)', fontsize=10)
            ax2.set_title('‚è±Ô∏è Episode Lengths', fontsize=12, fontweight='bold')
            ax2.legend()
            ax2.grid(True, alpha=0.3)

            # Plot 3: Reward Distribution
            ax3 = fig.add_subplot(gs[1, 0])
            recent_rewards = rewards[-min(500, len(rewards)):]
            ax3.hist(recent_rewards, bins=30, color='coral', alpha=0.7, edgecolor='black')
            ax3.axvline(np.mean(recent_rewards), color='red', linestyle='--', linewidth=2, label=f'Mean: {np.mean(recent_rewards):.2f}')
            ax3.axvline(np.median(recent_rewards), color='blue', linestyle='--', linewidth=2, label=f'Median: {np.median(recent_rewards):.2f}')
            ax3.set_xlabel('Reward', fontsize=10)
            ax3.set_ylabel('Frequency', fontsize=10)
            ax3.set_title('üìà Recent Reward Distribution (Last 500 Episodes)', fontsize=12, fontweight='bold')
            ax3.legend()
            ax3.grid(True, alpha=0.3, axis='y')

            # Plot 4: Success Rate Over Time
            ax4 = fig.add_subplot(gs[1, 1])
            success_threshold = self.success_threshold
            success = (rewards > success_threshold).astype(int)
            success_rate = pd.Series(success).rolling(window=window, min_periods=1).mean() * 100
            ax4.plot(episodes, success_rate, color='purple', linewidth=2)
            ax4.fill_between(episodes, 0, success_rate, alpha=0.3, color='purple')
            ax4.set_xlabel('Episode', fontsize=10)
            ax4.set_ylabel('Success Rate (%)', fontsize=10)
            ax4.set_title(f'üéØ Success Rate (Reward > {success_threshold})', fontsize=12, fontweight='bold')
            ax4.set_ylim([0, 105])
            ax4.grid(True, alpha=0.3)

            # Plot 5: Statistics Summary
            ax5 = fig.add_subplot(gs[2, :])
            ax5.axis('off')

            stats_text = f"""
            üìä TRAINING STATISTICS
            {'=' * 80}
            
            Total Episodes: {len(df):,} | Total Timesteps: {self.num_timesteps:,} / {CONFIG['total_timesteps']:,} ({100*self.num_timesteps/CONFIG['total_timesteps']:.1f}%)
            
            Recent Performance (Last {min(100, len(df))} Episodes):
              ‚Ä¢ Mean Reward: {np.mean(last_100_rewards):.2f} ¬± {np.std(last_100_rewards):.2f}
              ‚Ä¢ Best Reward: {np.max(last_100_rewards):.2f}
              ‚Ä¢ Mean Length: {np.mean(last_100_lengths):.1f} steps
              ‚Ä¢ Success Rate: {100*np.mean(last_100_rewards > success_threshold):.1f}%
            
            Overall Performance:
              ‚Ä¢ Mean Reward: {np.mean(rewards):.2f} ¬± {np.std(rewards):.2f}
              ‚Ä¢ Best Reward: {np.max(rewards):.2f}
              ‚Ä¢ Worst Reward: {np.min(rewards):.2f}
              ‚Ä¢ Mean Length: {np.mean(lengths):.1f} steps
            """

            ax5.text(0.1, 0.5, stats_text, fontsize=11, family='monospace',
                    verticalalignment='center', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.3))

            plt.suptitle(f'üéÆ RL Training Progress - {CONFIG["algorithm"].upper()} | Style {CONFIG["control_style"]}',
                        fontsize=14, fontweight='bold', y=0.995)

            plt.show()

        except Exception as e:
            # Deliberately broad: a plotting problem must never abort training.
            print(f"‚ö†Ô∏è Error plotting: {e}")
        finally:
            # Release the figure whether or not drawing succeeded.
            if fig is not None:
                plt.close(fig)

print("‚úÖ Custom callback defined!")

## 4. Environment Setup

In [None]:
# Create vectorized environment
print(f"üéÆ Creating {CONFIG['n_envs']} parallel environments...")


def make_arena_env():
    """Build one ArenaEnv using the control style selected in CONFIG."""
    return ArenaEnv(control_style=CONFIG['control_style'])


# make_vec_env calls the factory once per parallel environment; episode logs
# are written under the directory given as monitor_dir.
env = make_vec_env(
    make_arena_env,
    n_envs=CONFIG['n_envs'],
    monitor_dir=CONFIG['log_dir'],
)

control_label = 'Rot/Thrust' if CONFIG['control_style'] == 1 else 'Direct'
print("‚úÖ Environment created!")
print(f"   - Observation Space: {env.observation_space}")
print(f"   - Action Space: {env.action_space}")
print(f"   - Control Style: {control_label}")

## 5. Model Initialization

In [None]:
# Base name shared by checkpoints, TensorBoard runs and exported files.
model_name = f"{CONFIG['algorithm']}_style{CONFIG['control_style']}"

# Any algorithm value other than 'ppo' selects DQN.
algorithm_cls = PPO if CONFIG['algorithm'] == 'ppo' else DQN

if not CONFIG['load_model']:
    # Create new model
    print(f"üÜï Creating new {CONFIG['algorithm'].upper()} model...")

    hyperparams = CONFIG['ppo'] if algorithm_cls is PPO else CONFIG['dqn']
    model = algorithm_cls(
        "MlpPolicy",
        env,
        verbose=1,
        tensorboard_log=CONFIG['tensorboard_log'],
        **hyperparams,
    )

    # A fresh model starts its timestep counter from zero.
    reset_timesteps = True
    print("‚úÖ Model created successfully!")

else:
    # Load pretrained model; a trailing ".zip" is stripped from the path.
    load_path = CONFIG['load_model']
    if load_path.endswith('.zip'):
        load_path = load_path[:-4]

    print(f"üì• Loading model from {load_path}...")

    model = algorithm_cls.load(
        load_path,
        env=env,
        verbose=1,
        tensorboard_log=CONFIG['tensorboard_log'],
    )

    # Continue counting timesteps from the loaded model.
    reset_timesteps = False
    print("‚úÖ Model loaded successfully!")

# Display model architecture
print("\nüìê Model Architecture:")
print(model.policy)

## 6. Setup Callbacks

In [None]:
# Checkpoint callback
# CONFIG['checkpoint_freq'] is expressed in environment timesteps, while the
# callback counts calls and each call advances all n_envs environments, hence
# the division. Clamp to 1 so a checkpoint_freq smaller than n_envs cannot
# produce a save frequency of 0.
checkpoint_callback = CheckpointCallback(
    save_freq=max(CONFIG['checkpoint_freq'] // CONFIG['n_envs'], 1),
    save_path=CONFIG['model_dir'],
    name_prefix=model_name
)

# Jupyter plotting callback
jupyter_plotting_callback = JupyterPlottingCallback(
    log_dir=CONFIG['log_dir'],
    # Counted in callback calls, not timesteps: with n_envs parallel
    # environments this is roughly every 1000 * n_envs timesteps.
    plot_freq=1000
)

# File plotting callback (for saved graphs)
file_plotting_callback = PlottingCallback(
    log_dir=CONFIG['log_dir'],
    plot_freq=5000
)

callbacks_list = [checkpoint_callback, jupyter_plotting_callback, file_plotting_callback]

# Optional: Stop on max episodes
if CONFIG['max_episodes'] is not None:
    stop_callback = StopTrainingOnMaxEpisodes(
        max_episodes=CONFIG['max_episodes'],
        verbose=1
    )
    callbacks_list.append(stop_callback)
    print(f"‚è±Ô∏è Training will stop after {CONFIG['max_episodes']} episodes")

callbacks = CallbackList(callbacks_list)

print("‚úÖ Callbacks configured!")

## 7. Train the Model

🎯 **Training will begin when you run this cell.**

Progress is shown in this cell's output area, where the real-time plots are redrawn periodically (the interval is the `plot_freq` value set in Section 6).

In [None]:
print(f"üöÄ Starting training: {model_name}")
print(f"   Total Timesteps: {CONFIG['total_timesteps']:,}")
print(f"   Parallel Envs: {CONFIG['n_envs']}")
# Same test as the callback setup cell, so 0 is treated as a real limit.
if CONFIG['max_episodes'] is not None:
    print(f"   Max Episodes: {CONFIG['max_episodes']}")
print("\n" + "="*60)

# Remember where the step counter starts so throughput reflects the steps
# actually run in this session. Training can end before the configured budget
# (e.g. the max-episodes callback), and a resumed model does not restart its
# counter from zero (reset_num_timesteps=False).
start_timesteps = 0 if reset_timesteps else model.num_timesteps
start_time = time.time()

try:
    model.learn(
        total_timesteps=CONFIG['total_timesteps'],
        callback=callbacks,
        reset_num_timesteps=reset_timesteps,
        tb_log_name=model_name
    )

    training_time = time.time() - start_time
    steps_done = model.num_timesteps - start_timesteps

    print("\n" + "="*60)
    print(f"‚úÖ Training completed successfully!")
    print(f"‚è±Ô∏è Total training time: {training_time/60:.2f} minutes")
    # max() guards against a zero elapsed time on an immediate return.
    print(f"‚ö° Average: {steps_done/max(training_time, 1e-9):.0f} steps/second")

except KeyboardInterrupt:
    # Interrupting is an expected way to stop early; the model keeps the
    # weights learned so far and can still be saved by the next cell.
    print("\n‚ö†Ô∏è Training interrupted by user")
    training_time = time.time() - start_time
    print(f"‚è±Ô∏è Training time: {training_time/60:.2f} minutes")

## 8. Save Final Model

In [None]:
# Save the final model twice: once under a timestamped name that is never
# overwritten, and once under a fixed name that is easy to load later.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
final_stem = f"{model_name}_final"
final_model_path = os.path.join(CONFIG['model_dir'], f"{final_stem}_{timestamp}")
simple_path = os.path.join(CONFIG['model_dir'], final_stem)

save_targets = (
    (final_model_path, f"üíæ Final model saved to: {final_model_path}.zip"),
    (simple_path, f"üíæ Also saved as: {simple_path}.zip (for easy loading)"),
)
for target_path, confirmation in save_targets:
    model.save(target_path)
    print(confirmation)

## 9. Model Evaluation

In [None]:
print("üéØ Evaluating trained model...")

# Create evaluation environment (single env), wrapped in Monitor.
eval_env = Monitor(ArenaEnv(control_style=CONFIG['control_style']))

# Evaluate
# NOTE(review): CONFIG['n_eval_episodes'] exists but this cell uses its own
# episode count; kept as-is to preserve the reported numbers.
n_eval_episodes = 20
try:
    mean_reward, std_reward = evaluate_policy(
        model,
        eval_env,
        n_eval_episodes=n_eval_episodes,
        deterministic=True
    )
finally:
    # Always release the environment, even if evaluation raises.
    eval_env.close()

print(f"\nüìä Evaluation Results ({n_eval_episodes} episodes):")
print(f"   Mean Reward: {mean_reward:.2f} ¬± {std_reward:.2f}")

## 10. Training Analysis & Visualization

In [None]:
# Load the per-episode Monitor logs written during training.
df = load_results(CONFIG['log_dir'])
reward_column = df['r']
length_column = df['l']

print("üìà Training Summary:")
print(f"   Total Episodes: {len(df):,}")
print(f"   Mean Reward: {reward_column.mean():.2f} ¬± {reward_column.std():.2f}")
print(f"   Best Reward: {reward_column.max():.2f}")
print(f"   Worst Reward: {reward_column.min():.2f}")
print(f"   Mean Episode Length: {length_column.mean():.1f} steps")

# Show the training graph image from the log directory, if one was saved.
graph_path = os.path.join(CONFIG['log_dir'], 'training_graph.png')
if not os.path.exists(graph_path):
    print("‚ö†Ô∏è Training graph not found")
else:
    print("\nüìä Training Graph:")
    display(Image(filename=graph_path))

## 11. Detailed Performance Analysis

In [None]:
# Create comprehensive analysis: a 2x3 grid of diagnostic plots built from
# the Monitor DataFrame `df` loaded in the previous section.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle(f'Comprehensive Training Analysis - {model_name}', fontsize=16, fontweight='bold')

# Calculate statistics
window = 100
rewards = df['r'].values
lengths = df['l'].values
episodes = np.arange(len(df))

rewards_rolling = pd.Series(rewards).rolling(window=window, min_periods=1).mean()
lengths_rolling = pd.Series(lengths).rolling(window=window, min_periods=1).mean()

# 1. Rewards over time
axes[0, 0].plot(episodes, rewards, alpha=0.3, label='Raw')
axes[0, 0].plot(episodes, rewards_rolling, linewidth=2, label=f'Rolling Mean ({window})')
axes[0, 0].set_title('Episode Rewards')
axes[0, 0].set_xlabel('Episode')
axes[0, 0].set_ylabel('Reward')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Episode lengths over time
axes[0, 1].plot(episodes, lengths, alpha=0.3, label='Raw')
axes[0, 1].plot(episodes, lengths_rolling, linewidth=2, label=f'Rolling Mean ({window})')
axes[0, 1].set_title('Episode Lengths')
axes[0, 1].set_xlabel('Episode')
axes[0, 1].set_ylabel('Steps')
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# 3. Reward distribution
axes[0, 2].hist(rewards, bins=50, edgecolor='black', alpha=0.7)
axes[0, 2].axvline(rewards.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {rewards.mean():.2f}')
axes[0, 2].set_title('Reward Distribution')
axes[0, 2].set_xlabel('Reward')
axes[0, 2].set_ylabel('Frequency')
axes[0, 2].legend()
axes[0, 2].grid(True, alpha=0.3, axis='y')

# 4. Learning curve (cumulative average)
cumulative_avg = np.cumsum(rewards) / (episodes + 1)
axes[1, 0].plot(episodes, cumulative_avg, linewidth=2)
axes[1, 0].set_title('Cumulative Average Reward')
axes[1, 0].set_xlabel('Episode')
axes[1, 0].set_ylabel('Average Reward')
axes[1, 0].grid(True, alpha=0.3)

# 5. Reward vs Episode Length
axes[1, 1].scatter(lengths, rewards, alpha=0.3, s=10)
axes[1, 1].set_title('Reward vs Episode Length')
axes[1, 1].set_xlabel('Episode Length')
axes[1, 1].set_ylabel('Reward')
axes[1, 1].grid(True, alpha=0.3)

# 6. Performance improvement
# Split the run into up to n_chunks consecutive, near-equal chunks and show
# the mean reward per chunk. array_split keeps every episode (no remainder is
# dropped); empty chunks, which occur when there are fewer episodes than
# chunks, are discarded so no NaN bars are drawn.
n_chunks = 10
reward_chunks = [chunk for chunk in np.array_split(rewards, n_chunks) if chunk.size > 0]
chunk_means = [chunk.mean() for chunk in reward_chunks]
chunk_stds = [chunk.std() for chunk in reward_chunks]
chunk_positions = range(len(reward_chunks))

axes[1, 2].bar(chunk_positions, chunk_means, yerr=chunk_stds, capsize=5, alpha=0.7)
axes[1, 2].set_title('Performance by Training Chunk')
axes[1, 2].set_xlabel('Episode Range')
axes[1, 2].set_ylabel('Mean Reward')
axes[1, 2].set_xticks(chunk_positions)
axes[1, 2].set_xticklabels([f"#{i+1}" for i in chunk_positions])
axes[1, 2].grid(True, alpha=0.3, axis='y')

plt.tight_layout()

# Save analysis before showing it, so the file is written while the figure is
# still open regardless of how the backend treats figures after show().
analysis_path = os.path.join(CONFIG['log_dir'], f'{model_name}_analysis.png')
fig.savefig(analysis_path, dpi=150, bbox_inches='tight')

plt.show()
print(f"\nüíæ Analysis saved to: {analysis_path}")

## 12. Test Trained Agent (Visual)

In [None]:
# Note: This will open a pygame window
# You may need to run this in a regular Python environment rather than Jupyter for best results

print("üéÆ Testing agent with visual rendering...")
print("   (Close the pygame window to stop)\n")

test_env = ArenaEnv(control_style=CONFIG['control_style'], render_mode='human')

obs, _ = test_env.reset()
episode_reward = 0
episode_length = 0
done = False

try:
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = test_env.step(action)
        # step() returns two end-of-episode flags; the episode is over when
        # either is set. Checking only the first would keep stepping an
        # environment that has already hit a time limit.
        done = terminated or truncated
        episode_reward += reward
        episode_length += 1
        test_env.render()

    print(f"\nüìä Test Episode Results:")
    print(f"   Reward: {episode_reward:.2f}")
    print(f"   Length: {episode_length} steps")

except KeyboardInterrupt:
    print("\n‚ö†Ô∏è Test interrupted")

finally:
    # Always close the environment, including after an interrupt.
    test_env.close()

## 13. Compare with Baseline (Optional)

If you have multiple trained models, you can compare them here.

In [None]:
# Example: Compare PPO vs DQN, or Style 1 vs Style 2

# (label, path) pairs. The algorithm and control style are inferred from the
# path text: 'ppo' selects PPO (anything else DQN), 'style1' selects control
# style 1 (anything else style 2).
models_to_compare = [
    # Add model paths here, e.g.:
    # ('PPO Style 1', 'models/ppo_style1_final'),
    # ('PPO Style 2', 'models/ppo_style2_final'),
    # ('DQN Style 2', 'models/dqn_style2_final'),
]

if not models_to_compare:
    print("‚ÑπÔ∏è No models specified for comparison")
    print("   Add model paths to 'models_to_compare' list above to compare models")
else:
    print("üîÑ Comparing models...\n")

    comparison_results = []

    for model_label, model_path in models_to_compare:
        path_lower = model_path.lower()

        # Pick the loader class and control style from the path text.
        loader_cls = PPO if 'ppo' in path_lower else DQN
        style = 1 if 'style1' in path_lower else 2

        test_model = loader_cls.load(model_path)

        # Evaluate on a fresh single environment wrapped in Monitor.
        test_env = Monitor(ArenaEnv(control_style=style))
        mean_r, std_r = evaluate_policy(test_model, test_env, n_eval_episodes=20)

        row = {
            'Model': model_label,
            'Mean Reward': mean_r,
            'Std Reward': std_r,
        }
        comparison_results.append(row)

        test_env.close()
        print(f"‚úÖ {model_label}: {mean_r:.2f} ¬± {std_r:.2f}")

    # Visualize comparison as a bar chart with std-dev error bars.
    df_comparison = pd.DataFrame(comparison_results)
    labels = df_comparison['Model']
    means = df_comparison['Mean Reward']
    spreads = df_comparison['Std Reward']

    plt.figure(figsize=(10, 6))
    plt.bar(labels, means, yerr=spreads, capsize=10, alpha=0.7)
    plt.xlabel('Model', fontsize=12)
    plt.ylabel('Mean Reward', fontsize=12)
    plt.title('Model Comparison', fontsize=14, fontweight='bold')
    plt.xticks(rotation=45, ha='right')
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.show()

    print("\nüìä Comparison DataFrame:")
    display(df_comparison)

## 14. Export Training Data

In [None]:
# Write the Monitor DataFrame to CSV in the log directory for further analysis.
export_filename = f'{model_name}_training_data.csv'
export_path = os.path.join(CONFIG['log_dir'], export_filename)
df.to_csv(export_path, index=False)
print(f"üíæ Training data exported to: {export_path}")

# Show the first ten rows, then the overall shape and column names.
preview_rows = df.head(10)
print("\nüìä Training Data Preview:")
display(preview_rows)

column_names = list(df.columns)
print(f"\nüìà Data Shape: {df.shape}")
print(f"   Columns: {column_names}")

## 15. Summary & Next Steps

In [None]:
# Final recap: what was trained, how it scored, and where the artifacts are.
# Relies on names set by earlier cells (model_name, mean_reward, std_reward,
# df, simple_path, graph_path, export_path, analysis_path).
banner = "=" * 70
control_label = 'Rot/Thrust' if CONFIG['control_style'] == 1 else 'Direct'

print(banner)
print("üéâ TRAINING COMPLETE!")
print(banner)

print(f"\nüì¶ Model: {model_name}")
print(f"üéÆ Algorithm: {CONFIG['algorithm'].upper()}")
print(f"üïπÔ∏è Control Style: {CONFIG['control_style']} ({control_label})")

print("\nüìä Final Performance:")
print(f"   Evaluation Reward: {mean_reward:.2f} ¬± {std_reward:.2f}")
print(f"   Training Episodes: {len(df):,}")
print(f"   Total Timesteps: {CONFIG['total_timesteps']:,}")

print("\nüíæ Saved Files:")
saved_files = (
    ("Model", f"{simple_path}.zip"),
    ("Training Graph", graph_path),
    ("Training Data", export_path),
    ("Analysis", analysis_path),
)
for file_label, file_location in saved_files:
    print(f"   - {file_label}: {file_location}")

print("\nüöÄ Next Steps:")
next_steps = (
    "   1. Review training graphs and metrics above",
    "   2. Test the agent visually (Section 12)",
    "   3. Try different hyperparameters or control styles",
    "   4. Use transfer learning by setting 'load_model' in config",
    "   5. Compare different models (Section 13)",
    f"   6. Launch TensorBoard: tensorboard --logdir={CONFIG['tensorboard_log']}",
)
for step_line in next_steps:
    print(step_line)

print("\n" + banner)