In [16]:
# Insipired from: https://github.com/hardmaru/slimevolleygym/blob/master/training_scripts/train_ppo_selfplay.py

import os
import slimevolleygym
from datetime import datetime
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
from shutil import copyfile # keep track of generations
import torch
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy
from slimevolleygym import BaselinePolicy
from torch.utils.tensorboard import SummaryWriter

In [17]:
SEED = 17
NUM_TIMESTEPS = int(5e7)
EVAL_EPISODES_SELFPLAY = 100
EVAL_EPISODES_BASELINE = 50
BEST_THRESHOLD = 0.5 # must achieve a mean score above this to replace prev best self
RENDER_MODE = False # set this to false if you plan on running for full 1000 trials.
n_cpu = 50
EVAL_FREQ = 250000 // n_cpu
learning_rate=0.0007
n_steps=5
gamma=0.99
gae_lambda=1.0
ent_coef=0.1
vf_coef=0.5
max_grad_norm=0.5
rms_prop_eps=1e-05
use_rms_prop=True
use_sde=False
sde_sample_freq=-1
rollout_buffer_class=None
rollout_buffer_kwargs=None
normalize_advantage=False
stats_window_size=100
policy_kwargs=None
verbose=0
device=torch.device('cuda:0' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')
_init_setup_model=True

# Log dir
LOGDIR = f"./Logging/A2C-SELFPLAY-LIBRARY/{datetime.now().strftime('%Y%m%d-%H%M%S')}-lr-{learning_rate}-entcoef-{ent_coef}"
os.mkdir(LOGDIR) 

In [18]:
# wrapper over the normal single player env, but loads the best self play model
class SlimeVolleySelfPlayEnv(slimevolleygym.SlimeVolleyEnv):
  
  def __init__(self):
    super(SlimeVolleySelfPlayEnv, self).__init__()
    self.policy = self
    self.best_model = None
    self.best_model_filename = None

  def predict(self, obs): # the policy
    if self.best_model is None:
      return self.action_space.sample() # return a random action
    else:
      action, _ = self.best_model.predict(obs)
      return action

  # load model if it's there
  def reset(self):
    modellist = [f for f in os.listdir(LOGDIR) if f.startswith("history")]
    modellist.sort()
    if len(modellist) > 0:
      filename = os.path.join(LOGDIR, modellist[-1]) # the latest generation model
      if filename != self.best_model_filename:
        self.best_model_filename = filename
        if self.best_model is not None:
          del self.best_model
        # Note: Due to a bug in sb3, we had to modify the load_from_zip_file() function in common/save_utils.py of the sb3 library
        # Set weights_only=False in the load line of the library code
        self.best_model = A2C.load(filename, env=self, weights_only=False)
    return super(SlimeVolleySelfPlayEnv, self).reset()

# hacked it to only save new version of best model if beats prev self by BEST_THRESHOLD score
# after saving model, resets the best score to be BEST_THRESHOLD
class SelfPlayCallback(EvalCallback):

  def __init__(self, vec_env_selfplay, best_model_save_path, log_path, eval_freq, n_eval_episodes_selfplay, n_eval_episodes_baseline, n_cpu, model):

    # Call the superclass constructor with all arguments except the model
    super(SelfPlayCallback, self).__init__(vec_env_selfplay, 
                                           best_model_save_path=best_model_save_path, 
                                           log_path=log_path, 
                                           eval_freq=eval_freq, 
                                           n_eval_episodes=n_eval_episodes_selfplay)
    self.generation = 0
    self.stored_model = model # Store a reference to the model being trained in order to save across generations
    self.n_eval_episodes_baseline = n_eval_episodes_baseline
    self.baseline = BaselinePolicy()
    self.n_cpu = n_cpu

    # Make the regular environment with the opponent being the baseline instead of selfplay for this callback
    self.vec_env_baseline = make_vec_env(slimevolleygym.SlimeVolleyEnv, n_envs=n_cpu, seed=SEED)

    # Create a summarywriter at the logdir
    self.writer = SummaryWriter(log_dir=f"{LOGDIR}")
      
  def _on_step(self) -> bool:

    # Updates self.last_mean_reward to be the mean reward of the selfplay evaluation (For n_eval_episodes)
    result = super(SelfPlayCallback, self)._on_step()

    # Check if it's time to run the evaluation callback for the baseline
    mean_baseline_reward = -100
    std_baseline_reward = -100
    if result and self.n_calls > 0 and self.n_calls % self.eval_freq == 0:
        
        # Evaluate the model in the vectorized environment
        mean_baseline_reward, std_baseline_reward = evaluate_policy(self.stored_model, self.vec_env_baseline, n_eval_episodes=self.n_eval_episodes_baseline)

        # Log the mean and std reward
        self.writer.add_scalar("Average baseline test return - Training step", mean_baseline_reward, self.n_calls * self.n_cpu)
        self.writer.add_scalar("Baseline test return standard deviation - Training step", std_baseline_reward, self.n_calls * self.n_cpu)

        # Only update the generation if we perform better than the threshold set
        if mean_baseline_reward != -100 and self.last_mean_reward > BEST_THRESHOLD:
          self.generation += 1
          save_model_name = os.path.join(LOGDIR, "history_"+str(self.generation).zfill(8)+".zip")
          self.stored_model.save(save_model_name)
           
          # Log the mean and std baseline reward as a function of the generation number
          self.writer.add_scalar("Average baseline test return - Generation", mean_baseline_reward, self.generation)
          self.writer.add_scalar("Baseline test return standard deviation - Generation", std_baseline_reward, self.generation)

    # Return the success / failure flag of the parent _on_step call
    return result
    
def train():

  vec_env = make_vec_env(SlimeVolleySelfPlayEnv, n_envs=n_cpu, seed=SEED)

  model = A2C("MlpPolicy", 
              vec_env, 
              learning_rate=learning_rate, 
              n_steps=n_steps, 
              gamma=gamma, 
              gae_lambda=gae_lambda, 
              ent_coef=ent_coef, 
              vf_coef=vf_coef, 
              max_grad_norm=max_grad_norm,
              rms_prop_eps=rms_prop_eps, 
              use_rms_prop=use_rms_prop, 
              use_sde=use_sde, 
              sde_sample_freq=sde_sample_freq, 
              rollout_buffer_class=rollout_buffer_class, 
              rollout_buffer_kwargs=rollout_buffer_kwargs,
              normalize_advantage=normalize_advantage, 
              stats_window_size=stats_window_size, 
              tensorboard_log=LOGDIR, 
              policy_kwargs=policy_kwargs, 
              verbose=verbose, 
              seed=SEED, 
              device=device,
              _init_setup_model=_init_setup_model)

  selfplay_eval_callback = SelfPlayCallback(vec_env_selfplay=vec_env,
                                              best_model_save_path=LOGDIR,
                                              log_path=LOGDIR,
                                              eval_freq=EVAL_FREQ,
                                              n_eval_episodes_selfplay=EVAL_EPISODES_SELFPLAY,
                                              model=model,
                                              n_eval_episodes_baseline=EVAL_EPISODES_BASELINE,
                                              n_cpu=n_cpu)

  model.learn(total_timesteps=NUM_TIMESTEPS, callback=selfplay_eval_callback)
  model.save(os.path.join(LOGDIR, "final_model"))

if __name__=="__main__":
  train()



Eval num_timesteps=250000, episode_reward=0.20 +/- 2.70
Episode length: 649.78 +/- 118.20
New best mean reward!
Eval num_timesteps=500000, episode_reward=-0.10 +/- 2.33
Episode length: 666.51 +/- 98.35
Eval num_timesteps=750000, episode_reward=0.16 +/- 2.59
Episode length: 675.76 +/- 114.76
Eval num_timesteps=1000000, episode_reward=-0.51 +/- 2.95
Episode length: 592.26 +/- 108.96
Eval num_timesteps=1250000, episode_reward=-0.47 +/- 2.41
Episode length: 661.00 +/- 95.45
Eval num_timesteps=1500000, episode_reward=-0.35 +/- 2.76
Episode length: 642.42 +/- 104.32
Eval num_timesteps=1750000, episode_reward=0.43 +/- 2.82
Episode length: 621.52 +/- 103.42
New best mean reward!
Eval num_timesteps=2000000, episode_reward=0.63 +/- 2.70
Episode length: 623.88 +/- 112.57
New best mean reward!
Eval num_timesteps=2250000, episode_reward=0.25 +/- 2.94
Episode length: 611.68 +/- 109.64
Eval num_timesteps=2500000, episode_reward=-0.23 +/- 2.84
Episode length: 649.18 +/- 128.77
Eval num_timesteps=27500