In [26]:
import os
import gym
import slimevolleygym

from mpi4py import MPI
from stable_baselines.common import set_global_seeds
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import bench, logger, PPO1
from stable_baselines.common.callbacks import EvalCallback, BaseCallback

<module 'tensorflow._api.v2.version' from '/Users/stephen/anaconda3/envs/slime-rl/lib/python3.7/site-packages/tensorflow/_api/v2/version/__init__.py'>

In [2]:
NUM_TIMESTEPS = int(2e8)
SEED = 831
EVAL_FREQ = 200000
EVAL_EPISODES = 1000
LOGDIR = "ppo1_mpi"

In [3]:
def make_env(seed):
    env = gym.make("SlimeVolley-v0")
    env.seed(seed)
    return env


In [6]:
def train():
    """
    Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs.
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure(folder=LOGDIR)

    else:
        logger.configure(format_strs=[])
    workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_env(workerseed)

    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
               optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear',
               verbose=1)

    eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    env.close()
    del env
    if rank == 0:
        model.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.

## Train Script for Self-Play

In [23]:
import os
import gym
import slimevolleygym
import numpy as np

from stable_baselines.ppo1 import PPO1
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import logger
from stable_baselines.common.callbacks import EvalCallback

from shutil import copyfile # keep track of generations

# Settings
SEED = 17
NUM_TIMESTEPS = int(1e9)
EVAL_FREQ = int(1e5)
EVAL_EPISODES = int(1e2)
BEST_THRESHOLD = 0.5 # must achieve a mean score above this to replace prev best self

RENDER_MODE = False # set this to false if you plan on running for full 1000 trials.

LOGDIR = "ppo1_selfplay"

class SlimeVolleySelfPlayEnv(slimevolleygym.SlimeVolleyEnv):
  # wrapper over the normal single player env, but loads the best self play model
    def __init__(self):
        super(SlimeVolleySelfPlayEnv, self).__init__()
        self.policy = self
        self.best_model = None
        self.best_model_filename = None
    def predict(self, obs): # the policy
        if self.best_model is None:
            return self.action_space.sample() # return a random action
        else:
            action, _ = self.best_model.predict(obs)
            return action
    def reset(self):
        # load model if it's there
        modellist = [f for f in os.listdir(LOGDIR) if f.startswith("history")]
        modellist.sort()
        if len(modellist) > 0:
            filename = os.path.join(LOGDIR, modellist[-1]) # the latest best model
            if filename != self.best_model_filename:
                print("loading model: ", filename)
                self.best_model_filename = filename
                if self.best_model is not None:
                    del self.best_model
                self.best_model = PPO1.load(filename, env=self)
        return super(SlimeVolleySelfPlayEnv, self).reset()
LOGDIR1 = "adversarial/ppo1"
LOGDIR2 = "adversarial/ppo2"
class SlimeVolleyAdversarialEnv(slimevolleygym.SlimeVolleyEnv):
    def __init__(self):
        super(SlimeVolleyAdversarialEnv, self).__init__()
        self.policy = self
        self.best_model1 = None
        self.best_model1_filename = None
        self.best_model2 = None
        self.best_model2_filename = None
    def predict(self, obs): #policy
        if self.best_model1 is None:
            return self.action_space.sample(), self.action_space.sample()
        else:
            action1, _ = self.best_model1.predict(obs)
            action2, _ = self.best_model2.predict(obs)
            return action1, action2
    
    def reset(self):
        # loads a model if it is there 
        modellist1 = [f for f in os.listdir(LOGDIR1) if f.startswith("history")]
        modellist2 = [f for f in os.listdir(LOGDIR2) if f.startswith("history")]
        modellist1.sort()
        modellist2.sort()
        if len(modellist1) > 0:
            filename1 = os.path.join(LOGDIR1, modellist1[-1]) # the latest best model
            if filename1 != self.best_model1_filename:
                print("loading model: ", filename1)
                self.best_model1_filename = filename1
                if self.best_model is not None:
                    del self.best_model1
                self.best_model1 = PPO1.load(filename1, env=self)
        if len(modellist2) > 0:
            filename2 = os.path.join(LOGDIR2, modellist2[-1]) # the latest best model
            if filename2 != self.best_model1_filename:
                print("loading model: ", filename2)
                self.best_model2_filename = filename2
                if self.best_model2 is not None:
                    del self.best_model2
                self.best_model12 = PPO1.load(filename1, env=self)
        return super(SlimeVolleySelfPlayEnv, self).reset()
        

In [19]:
BEST_THRESHOLD1 = 0.5
BEST_THRESHOLD2 = 0.5
class AdversarialCallback(EvalCallback):
    # hacked it to only save new version of best model if beats prev self by BEST_THRESHOLD score
    # after saving model, resets the best score to be BEST_THRESHOLD
    def __init__(self, *args, **kwargs):
        super(AdversarialCallback, self).__init__(*args, **kwargs)
        self.best_mean_reward1 = BEST_THRESHOLD1
        self.best_mean_reward2 = BEST_THRESHOLD2
        self.generation = 0
    def _on_step(self)-> bool:
        result = super(SelfPlayCallback, self)._on_step()
        if result and (self.best_mean_reward1 > BEST_THRESHOLD1 or self.best_mean_reward2 > BEST_THRESHOLD2):
            self.generation += 1
            if self.best_mean_reward1 > BEST_THRESHOLD1:
                print("MODEL1: mean_reward achieved:", self.best_mean_reward1)
                print("MODEL1: new best model, bumping up generation to", self.generation)
                source_file = os.path.join(LOGDIR1, "best_model.zip")
                backup_file = os.path.join(LOGDIR1, "history_"+str(self.generation).zfill(8)+".zip")
                copyfile(source_file, backup_file)
                self.best_mean_reward1 = BEST_THRESHOLD1
            if self.best_mean_reward2 > BEST_THRESHOLD2:
                print("MODEL2: mean_reward achieved:", self.best_mean_reward2)
                print("MODEL2: new best model, bumping up generation to", self.generation)
                source_file = os.path.join(LOGDIR2, "best_model.zip")
                backup_file = os.path.join(LOGDIR2, "history_"+str(self.generation).zfill(8)+".zip")
                copyfile(source_file, backup_file)
                self.best_mean_reward2 = BEST_THRESHOLD2
        return result

class SelfPlayCallback(EvalCallback):
    # hacked it to only save new version of best model if beats prev self by BEST_THRESHOLD score
    # after saving model, resets the best score to be BEST_THRESHOLD
    def __init__(self, *args, **kwargs):
        super(SelfPlayCallback, self).__init__(*args, **kwargs)
        self.best_mean_reward = BEST_THRESHOLD
        self.generation = 0
    def _on_step(self) -> bool:
        result = super(SelfPlayCallback, self)._on_step()
        if result and self.best_mean_reward > BEST_THRESHOLD:
            self.generation += 1
            print("SELFPLAY: mean_reward achieved:", self.best_mean_reward)
            print("SELFPLAY: new best model, bumping up generation to", self.generation)
            source_file = os.path.join(LOGDIR, "best_model.zip")
            backup_file = os.path.join(LOGDIR, "history_"+str(self.generation).zfill(8)+".zip")
            copyfile(source_file, backup_file)
            self.best_mean_reward = BEST_THRESHOLD
        return result

In [6]:
# def rollout(env, policy):
#     """ play one agent vs the other in modified gym-style loop. """
#     obs = env.reset()

#     done = False
#     total_reward = 0

#     while not done:

#         action, _states = policy.predict(obs)
#         obs, reward, done, _ = env.step(action)

#         total_reward += reward

#         if RENDER_MODE:
#             env.render()

#     return total_reward
def rollout(env, policy1, policy2):
    """ play one agent vs the other in modified gym-style loop. """
    obs1 = env.reset()
    obs2 = obs1 # same observation for other agent
    done = False
    total_reward = 0

    while not done:
        
        action1, _ = policy1.predict(obs1)
        action2, _ = policy2.predict(obs2)
        
        obs1, reward, done, _ = env.step(action1, action2)

        obs2 = info['otherObs']

        total_reward += reward

        if RENDER_MODE:
            env.render()

    return total_reward


In [24]:
def train():
    # train selfplay agent
    logger.configure(folder=LOGDIR)

    env = SlimeVolleySelfPlayEnv()
    env.seed(SEED)

    # take mujoco hyperparams (but doubled timesteps_per_actorbatch to cover more steps.)
    model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                   optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=1)

    eval_callback = SelfPlayCallback(env,
        best_model_save_path=LOGDIR,
        log_path=LOGDIR,
        eval_freq=EVAL_FREQ,
        n_eval_episodes=EVAL_EPISODES,
        deterministic=False)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.

    env.close()
def train():
    # train selfplay agent
    logger.configure(folder=LOGDIR)

#     env = SlimeVolleySelfPlayEnv()
    env = SlimeVolleyAdversarialEnv()
    env.seed(SEED)

    # take mujoco hyperparams (but doubled timesteps_per_actorbatch to cover more steps.)
    model1 = PPO1(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
                   optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=1)

    eval_callback = AdversarialCallback(env,
        best_model_save_path1=LOGDIR1,
        best_model_save_path2=LOGDIR2,
        log_path=LOGDIR,
        eval_freq=EVAL_FREQ,
        n_eval_episodes=EVAL_EPISODES,
        deterministic=False)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    model.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.

    env.close()


In [25]:
train()

Logging to ppo1_selfplay


TypeError: __init__() got an unexpected keyword argument 'best_model_save_path1'