# Imports

In [1]:
import os
import gym
import slimevolleygym
from slimevolleygym import BaselinePolicy
import numpy as np

from Models.DDQN.DDQN_Agent import DDQN_Agent
from Models.DDQN.PRB import PrioritizedReplayBuffer

import torch
from utils import convert_to_vector, convert_to_value, LinearSchedule
import types

from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from IPython.display import clear_output

# Setup environment

In [2]:
# Build a throwaway environment only to inspect its action/observation spaces.
temp_env = gym.make("SlimeVolley-v0")
try:
    print(f"Action space: {temp_env.action_space.n}")
    print(f"Observation space: {temp_env.observation_space.shape}")
finally:
    # Always release the environment, even if inspecting the spaces fails.
    temp_env.close()

Action space: 3
Observation space: (12,)


# Device

In [3]:
# Pick the compute backend in priority order: CUDA, then MPS, then CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')

# Print the device as a check
print("Device used: ", DEVICE)

Device used:  mps


# Hyperparameters setup

In [4]:
seed = 42                                   # Seed passed to torch and to the environment

# Agent / optimisation parameters
hidden_layer_shape = 256                    # Hidden layer size handed to the agents
lr_init = 3e-4                              # Initial learning rate of the Q-network optimizer
lr_end = 6e-5                               # Learning rate reached at the end of the schedule
lr_decay_steps = 20_000_000                 # Number of steps over which the learning rate is annealed
gamma = 0.99                                # Discount factor applied to future rewards
batch_size = 256                            # Number of samples in each training batch
epsilon_init = 0.6                          # Exploration rate at the start of training
epsilon_final = 0.01                        # Exploration rate at the end of the schedule
epsilon_decay_steps = 20_000_000            # Number of steps over which epsilon is annealed

# Prioritized replay buffer parameters
buffer_size = 10_000                        # Capacity of the replay buffer
alpha = 0.6                                 # Prioritization exponent of the replay buffer
beta_init = 0.4                             # Starting value of beta
beta_final = 1.0                            # Value of beta at the end of the schedule
beta_gain_steps = 20_000_000                # Number of steps over which beta is annealed
replacement = True                          # NOTE(review): not a parameter of train() as defined in this notebook

# Training-loop parameters
max_num_steps = 20_000_000                  # Total number of environment steps to train for

num_episodes_before_test_runs = 500         # Interval between evaluation rounds
num_episodes_before_save = 200              # Interval between checkpoints

num_test_runs_vs_baseline = 50              # Evaluation episodes played against the baseline policy
num_test_runs_vs_random = 10                # Evaluation episodes played against the random policy

num_steps_before_update = 50                # Environment steps between two training phases
num_updates_per_train_step = 40             # Gradient updates performed in each training phase
num_warmup_steps = 3000                     # Steps collected before any update (train() reads this as a global)

In [5]:
# If agent2 = "random", then the agent is playing against a random policy
def evaluate(env, agent1, num_eval_episodes, agent2):
    """Play evaluation episodes and return agent1's average episode return.

    agent1 is switched to evaluation mode for the duration of the call and is
    always switched back to training mode afterwards, even if an episode raises
    or the run is interrupted. Both agents act greedily.

    Args:
        env: Environment exposing reset(), step(action, otherAction=...) and
            action_space.sample(). step() must report the opponent's
            observation under info['otherObs'].
        agent1: Agent under evaluation. Must provide evaluation_mode(),
            training_mode() and select_action(state, greedy=...).
        num_eval_episodes: Number of episodes to play; must be positive.
        agent2: Opponent. Either the string "random" (actions drawn from
            env.action_space.sample()) or an agent providing evaluation_mode()
            and select_action(state, greedy=...).

    Returns:
        The sum of agent1's rewards over all episodes divided by
        num_eval_episodes.

    Raises:
        ValueError: If num_eval_episodes is not positive.
    """
    # Fail fast with a clear message instead of a ZeroDivisionError at the end
    if num_eval_episodes <= 0:
        raise ValueError(f"num_eval_episodes must be positive, got {num_eval_episodes}")

    opponent_is_random = isinstance(agent2, str) and agent2 == "random"

    # Set the model in evaluation mode
    agent1.evaluation_mode()

    # Only set the opponent in evaluation mode if it's not a random agent
    if not opponent_is_random:
        agent2.evaluation_mode()

    # Run num_eval_episodes episodes and accumulate agent1's rewards
    total_return = 0
    try:
        for _ in range(num_eval_episodes):

            # Both players start from the observation returned by reset()
            state1 = env.reset()
            state2 = state1
            done = False
            while not done:

                with torch.no_grad():

                    # Select the actions for each agent
                    action1, _ = agent1.select_action(state1, greedy=True)

                    if opponent_is_random:
                        action2 = convert_to_value(env.action_space.sample())
                    else:
                        # The opponent acts greedily too, matching the training loop
                        action2, _ = agent2.select_action(state2, greedy=True)

                # Step the environment forward
                next_state1, reward, done, info = env.step(convert_to_vector(action1), otherAction=convert_to_vector(action2))

                # Accumulate the reward returned by the environment for agent1
                total_return += reward

                # Update the states (the opponent's view comes from the info dict)
                state1 = next_state1
                state2 = info['otherObs']
    finally:
        # Set the model back in training mode, even on errors or interrupts
        agent1.training_mode()

    # Return the average return
    return total_return / num_eval_episodes

In [6]:
# To retrieve a checkpoint, just pass the number of steps at which the checkpoint occurs in the logging dir
def train(  hidden_layer_shape: int, device, gamma, batch_size, epsilon_init, epsilon_final, epsilon_decay_steps, lr_init, lr_end, lr_decay_steps,
            buffer_size, alpha, beta_init, beta_final, beta_gain_steps,
            seed, max_num_steps, num_test_runs_vs_baseline, num_test_runs_vs_random, num_episodes_before_test_runs, num_episodes_before_save, 
            threshold_test_return_to_update_opponents, num_test_runs_vs_other_agent,
            num_steps_before_update, num_updates_per_train_step, writer, logging_dir, HP_string, n_steps_checkpoint=None): 
    """Train a DDQN agent on SlimeVolley through self-play against a frozen opponent.

    agent1 is the learner and agent2 the opponent, which is kept in evaluation
    mode with gradients disabled. Every num_episodes_before_test_runs episodes,
    agent1 is evaluated against the baseline policy, a random policy and agent2.
    When the self-play return exceeds threshold_test_return_to_update_opponents,
    agent1's models are copied into agent2 and the generation counter advances.
    Metrics are written to `writer`.

    Args:
        hidden_layer_shape: Hidden layer size passed to both agents.
        device: Torch device passed to the agents and the replay buffers.
        gamma: Discount factor.
        batch_size: Batch size passed to the agents.
        epsilon_init, epsilon_final, epsilon_decay_steps: Linear schedule for
            agent1.epsilon.
        lr_init, lr_end, lr_decay_steps: Linear schedule for the learning rate
            of agent1.q_net_optimizer.
        buffer_size, alpha, beta_init: Replay buffer construction parameters.
        beta_final, beta_gain_steps: With beta_init, linear schedule for
            buffer1.beta.
        seed: Seed for torch and the environment.
        max_num_steps: Training stops once this many environment steps have been
            taken (checked at episode boundaries).
        num_test_runs_vs_baseline, num_test_runs_vs_random,
        num_test_runs_vs_other_agent: Evaluation episodes per opponent type.
        num_episodes_before_test_runs: Episodes between evaluation rounds.
        num_episodes_before_save: Episodes between checkpoints.
        threshold_test_return_to_update_opponents: Self-play return above which
            the opponent is replaced by a copy of the learner.
        num_steps_before_update: Environment steps between training phases.
        num_updates_per_train_step: agent1.train() calls per training phase.
        writer: Object exposing add_scalar() and flush().
        logging_dir: Directory used for checkpoints.
        HP_string: Prefix of the progress line printed each episode.
        n_steps_checkpoint: If given, agents and buffers are loaded from the
            checkpoint saved at that step count and training resumes from there.

    Note:
        `num_warmup_steps` is not a parameter; it is read from the enclosing
        (notebook-global) scope.
    """

    # Create the environment
    env = slimevolleygym.SlimeVolleyEnv()
    torch.manual_seed(seed)
    env.seed(seed)

    state_dim = env.observation_space.shape[0]

    # Create the player agents (6 possible actions cause we don't consider pressing forward and backward at the same time)
    agent1 = DDQN_Agent(
        state_dim = state_dim, 
        action_dim = 6, 
        hidden_layer_shape = hidden_layer_shape, 
        device = device, 
        lr = lr_init, 
        gamma = gamma, 
        batch_size = batch_size, 
        epsilon = epsilon_init
    )

    # Create the opponent policy
    agent2 = DDQN_Agent(
        state_dim = state_dim,
        action_dim = 6,
        hidden_layer_shape = hidden_layer_shape,
        device = device,
        lr = lr_init,
        gamma = gamma,
        batch_size = batch_size,
        epsilon = epsilon_init
    )

    # Set the opponent to be in evaluation mode
    agent2.evaluation_mode()
    agent2.disable_gradients()

    buffer1 = PrioritizedReplayBuffer(
        buffer_size = buffer_size, 
        state_dim = state_dim, 
        alpha = alpha, 
        beta_init = beta_init, 
        device = device
    )
    
    # buffer2 is only passed to agent2.save()/load(); no transitions are added to it here
    buffer2 = PrioritizedReplayBuffer(
        buffer_size = buffer_size,
        state_dim = state_dim,
        alpha = alpha,
        beta_init = beta_init,
        device = device
    )

    # Initialize the baseline policy for evaluation (Add function wrappers for unified API)
    baseline_agent = BaselinePolicy()
    def select_action(self, state, greedy=False):
        action = self.predict(state)
        return convert_to_value(action), None
    def evaluation_mode(self):
        pass
    baseline_agent.select_action = types.MethodType(select_action, baseline_agent)
    baseline_agent.evaluation_mode = types.MethodType(evaluation_mode, baseline_agent)

    exp_noise_scheduler = LinearSchedule(epsilon_decay_steps, epsilon_init, epsilon_final)
    beta_scheduler = LinearSchedule(beta_gain_steps, beta_init, beta_final)
    lr_scheduler = LinearSchedule(lr_decay_steps, lr_init, lr_end)

    # Retrieve any checkpoints if necessary
    if n_steps_checkpoint is not None:
        agent1.load(logging_dir, 1, n_steps_checkpoint, buffer1)
        agent2.load(logging_dir, 2, n_steps_checkpoint, buffer2)
        n_steps = n_steps_checkpoint-1
    else:
        n_steps = 0

    # Episodes completed in this call. The save/evaluation intervals are expressed in
    # episodes, so they are tested against this counter rather than against n_steps
    # (which is an arbitrary value at episode boundaries).
    n_episodes = 0

    generation_number = 0 # Generation number
    writer.add_scalar("Training step - Generation number", 0, 0)
    writer.flush()

    try:
        # Train the agent
        while n_steps < max_num_steps:

            # Print the progress
            clear_output(wait=True)
            print(f"{HP_string} Training step {n_steps}/{max_num_steps} ({n_steps/max_num_steps*100:.2f}%)")

            # Check if it's time to save the models
            if n_episodes > 0 and n_episodes % num_episodes_before_save == 0:
                agent1.save(logging_dir, 1, n_steps+1, buffer1)
                agent2.save(logging_dir, 2, n_steps+1, buffer2)
            
            if n_episodes % num_episodes_before_test_runs == 0:
                
                # Evaluate the agent against the baseline policy
                average_test_return_baseline = evaluate(env, agent1, num_test_runs_vs_baseline, baseline_agent)
                writer.add_scalar("Average baseline test return - Training step", average_test_return_baseline, n_steps)

                # Evaluate the agent against a random policy
                average_test_return_random = evaluate(env, agent1, num_test_runs_vs_random, "random")
                writer.add_scalar("Average random test return - Training step", average_test_return_random, n_steps)
                
                # Evaluate the agent against the current opponent
                average_test_return_selfplay = evaluate(env, agent1, num_test_runs_vs_other_agent, agent2)
                writer.add_scalar("Average self-play test return - Training step", average_test_return_selfplay, n_steps)

                # Flush all results
                writer.flush()

                # Check if the average test return is above the threshold
                if average_test_return_selfplay > threshold_test_return_to_update_opponents:

                    # Copy the player agent into the opponent agent
                    agent2.copy_models(agent1)
                    agent2.disable_gradients()
                    agent2.evaluation_mode()

                    # Increment the generation number
                    generation_number += 1

                    # Store the number of steps it took for each generation
                    writer.add_scalar("Training step - Generation number", n_steps, generation_number)

                    # Store the score against the baseline policy for the generation
                    writer.add_scalar("Average baseline test return - Generation number", average_test_return_baseline, generation_number)

                    # Store the score against the random policy for the generation
                    writer.add_scalar("Average random test return - Generation number", average_test_return_random, generation_number)

                    # Flush both sets of results
                    writer.flush()

            # Reset the environment
            s1 = env.reset()
            s2 = s1
            # Select the first action
            # NOTE(review): evaluate() and the baseline wrapper use the keyword `greedy`, while the
            # calls below use `deterministic` - confirm which one DDQN_Agent.select_action accepts.
            action1, q_a = agent1.select_action(s1, deterministic=False)
            action2, _ = agent2.select_action(s2, deterministic=True) # The opponent agent is always greedy
            done = False

            # Initialize the episode return and the original step for logging
            episode_return = 0
            original_step = n_steps
            losses = []

            while not done:
                # Step the environment forward
                s1_next, r, done, info = env.step(convert_to_vector(action1), otherAction=convert_to_vector(action2))
                s2_next = info['otherObs']
                
                action1_next, q_a_next = agent1.select_action(s1_next, deterministic=False)
                action2_next, _ = agent2.select_action(s2_next, deterministic=True)

                # Update the episode return
                episode_return += r

                # [s; a, q_a; r, dw, tr, s_next; a_next, q_a_next] have been all collected.
                # `~done` on a Python bool evaluates to -1/-2, not 1/0, so mask the bootstrap term explicitly.
                not_terminal = 0.0 if done else 1.0
                priority = r + not_terminal*gamma*q_a_next - q_a #scalar
                buffer1.add(s1, action1, r, done, priority) 

                # Update the states and actions so the next transition is stored with the state it was taken from
                s1 = s1_next
                s2 = s2_next
                action1 = action1_next
                action2 = action2_next
                q_a = q_a_next

                # Update
                if n_steps >= num_warmup_steps and n_steps % num_steps_before_update == 0:
                    for _ in range(num_updates_per_train_step):
                        loss = agent1.train(buffer1)
                        losses.append(loss)
                    # parameter annealing
                    agent1.epsilon = exp_noise_scheduler.value(n_steps)
                    buffer1.beta = beta_scheduler.value(n_steps)
                    for p in agent1.q_net_optimizer.param_groups: p['lr'] = lr_scheduler.value(n_steps)

                n_steps += 1

            # Log metrics
            writer.add_scalar("Average baseline train return - Training step", episode_return, n_steps)
            writer.add_scalar("Episode length - Training step", n_steps - original_step, n_steps)
            if len(losses) > 0:
                writer.add_scalar("Q-Loss - Training step", np.mean(losses), n_steps)
            writer.flush()

            n_episodes += 1
    finally:
        # Release the environment even if training is interrupted
        env.close()
        
    # Save the final version of the models
    # NOTE(review): periodic checkpoints use agent.save(..., buffer) while this uses save_models(...) - confirm both exist.
    agent1.save_models(logging_dir, 1, n_steps+1)


In [7]:
# Create a string representing the HP combination
HP_string = f"{0}) Alpha: {lr_init} ==>"

# Create the writer
logging_dir = f"./Logging/DDQN-SELFPLAY/{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr-{lr_init}"
# Create directories for logging
os.makedirs(logging_dir, exist_ok=True)

writer = SummaryWriter(logging_dir)

# Self-play settings required by train() that the hyperparameter cell does not define.
# NOTE(review): placeholder values - tune them and consider moving them to the hyperparameter cell.
threshold_test_return_to_update_opponents = 0.5    # Self-play return above which the opponent is replaced
num_test_runs_vs_other_agent = 10                   # Evaluation episodes played against the current opponent

# The keyword list mirrors train()'s signature: `replacement` is not one of its parameters and is
# therefore not passed, while the two self-play arguments above are required.
try:
    train(hidden_layer_shape = hidden_layer_shape,
          device = DEVICE, 
          gamma = gamma, 
          batch_size = batch_size, 
          epsilon_init = epsilon_init, 
          epsilon_final = epsilon_final, 
          epsilon_decay_steps = epsilon_decay_steps,
          lr_init = lr_init, 
          lr_end = lr_end,
          lr_decay_steps = lr_decay_steps,
          buffer_size = buffer_size, 
          alpha = alpha, 
          beta_init = beta_init, 
          beta_final = beta_final, 
          beta_gain_steps = beta_gain_steps, 
          seed = seed, 
          max_num_steps = max_num_steps, 
          num_test_runs_vs_baseline = num_test_runs_vs_baseline, 
          num_test_runs_vs_random = num_test_runs_vs_random, 
          num_episodes_before_test_runs = num_episodes_before_test_runs,
          num_episodes_before_save = num_episodes_before_save,
          threshold_test_return_to_update_opponents = threshold_test_return_to_update_opponents,
          num_test_runs_vs_other_agent = num_test_runs_vs_other_agent,
          num_steps_before_update = num_steps_before_update, 
          num_updates_per_train_step=num_updates_per_train_step,
          writer = writer, 
          logging_dir = logging_dir, 
          HP_string = HP_string
          )
finally:
    # Make sure pending events reach disk even if training is interrupted
    writer.close()

0) Alpha: 0.0003 ==> Training step 7586/20000000 (0.04%)
Q loss: 0.007910395041108131
Q loss: 0.009002700448036194
Q loss: 0.009542327374219894
Q loss: 0.007244942709803581
Q loss: 0.007031168323010206
Q loss: 0.0074569545686244965
Q loss: 0.008994963951408863
Q loss: 0.008954710327088833
Q loss: 0.008156625553965569
Q loss: 0.007345702964812517
Q loss: 0.010154238902032375
Q loss: 0.009489575400948524
Q loss: 0.008789246901869774
Q loss: 0.009555034339427948
Q loss: 0.009505237452685833
Q loss: 0.010761070996522903
Q loss: 0.00801114458590746
Q loss: 0.007348907645791769
Q loss: 0.008181791752576828
Q loss: 0.011953238397836685
Q loss: 0.009751993231475353
Q loss: 0.007144710049033165
Q loss: 0.006466282997280359
Q loss: 0.008209082297980785
Q loss: 0.006812895182520151
Q loss: 0.009639308787882328
Q loss: 0.008379010483622551
Q loss: 0.007502835243940353
Q loss: 0.007907502353191376
Q loss: 0.011216980405151844
Q loss: 0.006829795427620411
Q loss: 0.007776965852826834
Q loss: 0.00913

KeyboardInterrupt: 