### Vectorized Environments

Vectorized environments let us train an agent on multiple environments simultaneously, so experience is collected more quickly.

- `SubprocVecEnv`: runs each environment in a separate process
- `DummyVecEnv`: runs all environments sequentially in the same process

`DummyVecEnv` is usually faster in practice for lightweight environments such as CartPole, because it avoids the inter-process communication delays of `SubprocVecEnv`.

In [4]:
import time
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

import gymnasium as gym

from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.evaluation import evaluate_policy

from stable_baselines3 import PPO, A2C

Define environment function


In [15]:
def make_env(env_id, rank, seed=0):
    """
    Build a factory that creates one seeded environment for a vectorized env.

    :param env_id: (str) ID of the environment to create
    :param rank: (int) index of the subprocess; added to ``seed`` so each
        environment is seeded differently
    :param seed: (int) base random seed
    :return: (callable) zero-argument function that creates, seeds and
        returns the environment
    """
    # Seeding happens here, when the factory is built, not inside the
    # callable that is later invoked to create the environment.
    set_random_seed(seed)

    def _build_env():
        created_env = gym.make(env_id)
        # Offset the base seed by the rank: runs stay reproducible while
        # every environment still generates different experiences.
        created_env.reset(seed=seed + rank)
        return created_env

    return _build_env

Stable-Baselines3 also provides a built-in helper, `make_vec_env`, for creating vectorized environments.

In [16]:
from stable_baselines3.common.env_util import make_vec_env

Setup evaluation environment and constants

In [18]:
# Benchmark settings
ALGO = A2C
TRAIN_STEPS = 5000
EVAL_EPS = 20
NUM_EXPERIMENTS = 3  # repeat each configuration to smooth out unstable results
PROCESSES_TO_TEST = [2 ** exponent for exponent in range(5)]  # 1, 2, 4, 8, 16

# Environment used for training, plus a separate single env for evaluation
env_id = "CartPole-v1"
eval_env = gym.make(env_id)

Test different numbers of processes


In [20]:
# Per-configuration results: one entry per value in PROCESSES_TO_TEST.
reward_averages = []
reward_std = []
training_times = []
# Running total of processes requested so far; used to offset the env
# ranks (and hence their seeds) between configurations.
total_procs = 0

for n_procs in PROCESSES_TO_TEST:

    total_procs += n_procs
    print(f"Running for n_procs = {n_procs}")

    if n_procs == 1: # don't need to multiprocess
        train_env = DummyVecEnv([lambda: gym.make(env_id)])
    else:
        train_env = SubprocVecEnv(
            [make_env(env_id, i + total_procs) for i in range(n_procs)],
            start_method="fork"
        )

    rewards = []
    times = []

    # run and evaluate multiple experiments
    # try/finally ensures the vectorized env (and its subprocesses) is closed
    # even if training raises or the cell is interrupted.
    try:
        for experiment in range(NUM_EXPERIMENTS):
            train_env.reset()
            model = ALGO("MlpPolicy", train_env, verbose=1)
            # perf_counter is monotonic, so the measured interval is not
            # affected by system clock adjustments.
            start = time.perf_counter()
            model.learn(total_timesteps=TRAIN_STEPS)
            times.append(time.perf_counter() - start)
            mean_reward, _ = evaluate_policy(model, eval_env, n_eval_episodes=EVAL_EPS)
            rewards.append(mean_reward)
    finally:
        # close subprocesses
        train_env.close()

    # collect results for this configuration
    reward_averages.append(np.mean(rewards))
    reward_std.append(np.std(rewards))
    training_times.append(np.mean(times))

Running for n_procs = 1
Using cuda device
------------------------------------
| time/                 |          |
|    fps                | 405      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.685   |
|    explained_variance | 0.148    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.28     |
|    value_loss         | 5.62     |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 413      |
|    iterations         | 200      |
|    time_elapsed       | 2        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -0.625   |
|    explained_variance | 0.521    |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | 1.46   



Using cuda device
------------------------------------
| time/                 |          |
|    fps                | 447      |
|    iterations         | 100      |
|    time_elapsed       | 1        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0.693   |
|    explained_variance | 0.114    |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | 1.95     |
|    value_loss         | 9.32     |
------------------------------------
------------------------------------
| time/                 |          |
|    fps                | 438      |
|    iterations         | 200      |
|    time_elapsed       | 2        |
|    total_timesteps    | 1000     |
| train/                |          |
|    entropy_loss       | -0.688   |
|    explained_variance | -0.0793  |
|    learning_rate      | 0.0007   |
|    n_updates          | 199      |
|    policy_loss        | 1.63     |
|    value_loss     

### Plot results

In [None]:
def plot_training_results(training_steps_per_second, reward_averages, reward_std):
    