# Imports

In [8]:
import slimevolleygym
from Models.PPO.PPO_Agent import PPO_Agent
import torch
from tqdm.auto import tqdm
from torch.utils.tensorboard import SummaryWriter
from datetime import datetime
from slimevolleygym import BaselinePolicy
import numpy as np
from utils import convert_to_vector, convert_to_value
import types
from IPython.display import clear_output

# Setup environment

In [9]:
# Build a throwaway environment only to report the sizes of its spaces,
# then release it (training creates its own environment later on).
env = slimevolleygym.SlimeVolleyEnv()
print(
    f"Action space: {env.action_space.n}",
    f"Observation space: {env.observation_space.shape}",
    sep="\n",
)
env.close()

Action space: 3
Observation space: (12,)


# Device

In [10]:
# Backend preference order: first CUDA GPU, then Apple MPS, then plain CPU
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')

# Report the selected device as a sanity check
print("Device used: ", DEVICE)

Device used:  cuda:0


# Hyperparameters setup

In [11]:
# ---------------------------------------------------------------------------
# PPO hyperparameters (forwarded to PPO_Agent by train())
# ---------------------------------------------------------------------------

# Data collection / update schedule
timesteps_per_batch = 4096      # timesteps to run per batch
n_updates_per_iteration = 10    # actor/critic updates performed per iteration
num_minibatches = 6             # mini-batches used for each update

# Optimizer
lr_choices = [3e-4]             # learning rates to sweep (shared by actor and critic optimizers)
eps = 1e-5                      # Adam optimizer epsilon
max_grad_norm = 0.5             # gradient clipping threshold

# Objective
gamma = 0.99                    # discount factor for the rewards-to-go
lam = 0.95                      # lambda parameter for GAE
clip = 0.2                      # ratio clipping threshold (0.2 is the recommended value)
ent_coef = 0.1                  # entropy regularization coefficient
target_kl = 0.03                # KL divergence threshold

# Network / rendering
mlp_layers = [64, 64]           # neurons in each layer of the MLP
render = False                  # whether to render the environment

# ---------------------------------------------------------------------------
# Custom run parameters
# ---------------------------------------------------------------------------
seed = 42
max_num_steps = 20_000_000
num_test_runs_vs_baseline = 50
num_test_runs_vs_random = 10
num_iterations_before_test_runs = 150
num_iterations_before_save = 100

In [12]:
# If agent2 = "random", then the agent is playing against a random policy
def evaluate(env, agent1, num_eval_episodes, agent2="random"):
    """Play evaluation episodes and return agent1's average episode return.

    agent1 is put in evaluation mode and acts greedily for the duration of the
    call. It is always switched back to training mode before this function
    exits, including when an episode raises or the kernel is interrupted, so a
    failed evaluation cannot leave the agent stuck in evaluation mode.

    Args:
        env: Environment providing reset(), step(action, otherAction=...) and
            action_space.sample(). The info dict returned by step() must
            contain the opponent's observation under the key 'otherObs'.
        agent1: Agent being evaluated. Must provide evaluation_mode(),
            training_mode() and select_action(state, greedy=True) returning a
            pair whose first element is the action.
        num_eval_episodes: Number of episodes to play.
        agent2: Opponent. The string "random" samples actions from
            env.action_space; any other value must provide
            select_action(state) returning a pair whose first element is the
            action.

    Returns:
        The sum of the rewards returned by env.step() over all episodes,
        divided by num_eval_episodes.

    Raises:
        ZeroDivisionError: If num_eval_episodes is 0.
    """
    # The opponent type never changes during the call, so resolve it once
    random_opponent = agent2 == "random"

    # Set the model in evaluation mode
    agent1.evaluation_mode()

    total_return = 0
    try:
        for _ in range(num_eval_episodes):

            # Both players start from the observation returned by reset()
            state1 = env.reset()
            state2 = state1
            done = False

            while not done:

                # Action selection needs no gradients
                with torch.no_grad():
                    action1, _ = agent1.select_action(state1, greedy=True)

                    if random_opponent:
                        action2 = convert_to_value(env.action_space.sample())
                    else:
                        action2, _ = agent2.select_action(state2)

                # Step the environment forward with both players' actions
                next_state1, reward, done, info = env.step(
                    convert_to_vector(action1),
                    otherAction=convert_to_vector(action2),
                )

                # Accumulate the reward returned by env.step()
                # (presumably from agent1's point of view -- confirm against the env)
                total_return += reward

                # Advance both players' observations
                state1 = next_state1
                state2 = info['otherObs']
    finally:
        # Restore training mode no matter how the evaluation ended
        agent1.training_mode()

    # Return the average return
    return total_return / num_eval_episodes

In [13]:
# To retrieve a checkpoint, just pass the number of steps at which the checkpoint occurs in the logging dir
def train(timesteps_per_batch, n_updates_per_iteration, lr, eps, gamma, clip, lam,
          num_minibatches, ent_coef, target_kl, max_grad_norm, seed, max_num_steps,
          num_test_runs_vs_baseline, num_iterations_before_test_runs, num_iterations_before_save,
          num_test_runs_vs_random, writer, logging_dir, mlp_layers, render, HP_string,
          n_steps_checkpoint=None):
    """Train a PPO agent against the SlimeVolley baseline policy.

    Each iteration gathers one batch of experience against the baseline, runs
    one learning update on it and logs training metrics to `writer`. The agent
    is periodically evaluated against the baseline and a random opponent, and
    its models are periodically saved to `logging_dir`.

    The environment created here is always closed before the function exits,
    whether training finishes normally or raises.

    Args:
        timesteps_per_batch, n_updates_per_iteration, lr, eps, gamma, clip,
        lam, num_minibatches, ent_coef, target_kl, max_grad_norm, mlp_layers,
        render: Forwarded unchanged to the PPO_Agent constructor.
        seed: Seed passed to torch.manual_seed() and env.seed().
        max_num_steps: Training stops once this many timesteps were gathered.
        num_test_runs_vs_baseline: Evaluation episodes against the baseline.
        num_test_runs_vs_random: Evaluation episodes against a random opponent.
        num_iterations_before_test_runs: Evaluate every this many iterations
            (iteration 0 included).
        num_iterations_before_save: Save the models every this many
            iterations.
        writer: Object providing add_scalar() and flush() (a TensorBoard
            SummaryWriter in this notebook).
        logging_dir: Directory passed to the agent's save_models/load_models.
        HP_string: Prefix for the printed progress line.
        n_steps_checkpoint: If given, load the models saved under this step
            count from logging_dir and resume counting from there.
    """

    # Create the environment
    env = slimevolleygym.SlimeVolleyEnv()

    try:
        torch.manual_seed(seed)
        env.seed(seed)

        # Create the player agents (6 possible actions cause we don't consider pressing forward and backward at the same time)
        agent1 = PPO_Agent(obs_dim=12,
                           act_dim=6,
                           DEVICE=DEVICE,
                           timesteps_per_batch=timesteps_per_batch,
                           n_updates_per_iteration=n_updates_per_iteration,
                           lr=lr,
                           eps=eps,
                           gamma=gamma,
                           clip=clip,
                           lam=lam,
                           num_minibatches=num_minibatches,
                           ent_coef=ent_coef,
                           target_kl=target_kl,
                           max_grad_norm=max_grad_norm,
                           mlp_layers=mlp_layers,
                           render=render)

        # Create the baseline policy (Returns a value not a vector for the action to ensure uniformity)
        agent2 = BaselinePolicy()

        def select_action(self, state, greedy=False):
            # Adapter giving the baseline the same (action, extra) interface as the PPO agent
            action = self.predict(state)
            return convert_to_value(action), None

        agent2.select_action = types.MethodType(select_action, agent2)

        # Retrieve any checkpoints if necessary
        # NOTE(review): the literal 1 passed to load_models/save_models is presumably
        # an agent index -- confirm against PPO_Agent.
        if n_steps_checkpoint is not None:
            agent1.load_models(logging_dir, 1, n_steps_checkpoint)
            # Checkpoints are saved under n_steps + 1, so undo that offset here
            n_steps = n_steps_checkpoint-1
        else:
            n_steps = 0

        i = 0 # Iteration number

        # Train the agent
        while n_steps < max_num_steps:

            # Print the progress
            clear_output(wait=True)
            print(f"{HP_string} Training step {n_steps}/{max_num_steps} ({n_steps/max_num_steps*100:.2f}%)")

            # Check if it's time to save the models
            if i > 0 and (i+1) % num_iterations_before_save == 0:
                agent1.save_models(logging_dir, 1, n_steps+1)

            if i % num_iterations_before_test_runs == 0:

                # Evaluate the agent against the baseline policy
                average_test_return_baseline = evaluate(env, agent1, num_test_runs_vs_baseline, agent2)
                writer.add_scalar("Average baseline test return - Training step", average_test_return_baseline, n_steps)

                # Evaluate the agent against a random policy
                average_test_return_random = evaluate(env, agent1, num_test_runs_vs_random, "random")
                writer.add_scalar("Average random test return - Training step", average_test_return_random, n_steps)

                # Flush both results
                writer.flush()

            # Gather a batch of experiences
            batch_obs, batch_acts, batch_log_probs, batch_rews, batch_lens, batch_vals, batch_dones = agent1.gather_data(env, agent2)

            # Increment the number of steps
            n_steps += sum(batch_lens)

            # Run a training iteration on that batch
            agent1.learn(batch_acts=batch_acts,
                         batch_obs=batch_obs,
                         batch_log_probs=batch_log_probs,
                         batch_rews=batch_rews,
                         batch_vals=batch_vals,
                         batch_dones=batch_dones,
                         n_steps_so_far=n_steps,
                         total_n_steps=max_num_steps,
                         writer=writer)

            # Log metrics
            writer.add_scalar("Average episode length - Training step", np.mean(batch_lens), n_steps)
            writer.add_scalar("Average self-play train return - Training step", np.mean([np.sum(ep_rews) for ep_rews in batch_rews]), n_steps)
            writer.flush()

            # Increment the iteration number
            i += 1

        # Save the final version of the models
        agent1.save_models(logging_dir, 1, n_steps+1)

    finally:
        # Release the environment on both the normal and the error path
        env.close()

In [14]:
# Run one full training per candidate learning rate. `i` is the index of the
# hyperparameter combination and prefixes the printed progress line.
for i, lr in enumerate(lr_choices):

    # Create a string representing the HP combination
    HP_string = f"{i}) Alpha: {lr} ==>"

    # Create the writer (one timestamped TensorBoard directory per run)
    logging_dir = f"./Logging/PPO-BASELINE/{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr-{lr}-entcoef-{ent_coef}-mlp-{mlp_layers[0]}-kl-{target_kl}"
    writer = SummaryWriter(logging_dir)

    try:
        # Train the agent
        train(timesteps_per_batch=timesteps_per_batch,
              n_updates_per_iteration=n_updates_per_iteration,
              lr=lr,
              eps=eps,
              gamma=gamma,
              clip=clip,
              lam=lam,
              num_minibatches=num_minibatches,
              ent_coef=ent_coef,
              target_kl=target_kl,
              max_grad_norm=max_grad_norm,
              seed=seed,
              max_num_steps=max_num_steps,
              num_test_runs_vs_baseline=num_test_runs_vs_baseline,
              num_iterations_before_test_runs=num_iterations_before_test_runs,
              num_iterations_before_save=num_iterations_before_save,
              num_test_runs_vs_random=num_test_runs_vs_random,
              writer=writer,
              logging_dir=logging_dir,
              mlp_layers=mlp_layers,
              render=render,
              HP_string=HP_string)
    finally:
        # Close the writer even if training fails or is interrupted
        writer.close()

0) Alpha: 0.0003 ==> Training step 19997440/20000000 (99.99%)
