### Saving and Loading

In [1]:
import os
from stable_baselines3 import PPO
from stable_baselines3 import A2C
import gymnasium as gym

In [2]:
# Directory under which the models trained in this section are written
save_dir = "/tmp/gym/"
# exist_ok=True lets this cell be re-run when the folder is already there
os.makedirs(name=save_dir, exist_ok=True)

In [4]:
# Train a PPO agent on Pendulum-v1 for 8 000 timesteps, then write it to disk
model = PPO(
    policy="MlpPolicy",
    env="Pendulum-v1",
    verbose=1,
).learn(total_timesteps=8_000)
model.save(f"{save_dir}/PPO_tutorial")

Using cuda device
Creating environment from the given name 'Pendulum-v1'
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 200       |
|    ep_rew_mean     | -1.17e+03 |
| time/              |           |
|    fps             | 899       |
|    iterations      | 1         |
|    time_elapsed    | 2         |
|    total_timesteps | 2048      |
----------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 200          |
|    ep_rew_mean          | -1.11e+03    |
| time/                   |              |
|    fps                  | 635          |
|    iterations           | 2            |
|    time_elapsed         | 6            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0014977069 |
|    clip_fraction        |

In [5]:
# Check predictions before and after save/load: the same sampled observation
# is fed to the in-memory model and to the model reloaded from disk, using
# deterministic=True for both calls so the two printed results can be compared.
obs = model.env.observation_space.sample()
prediction = model.predict(obs, deterministic=True)
print("Pre-saved: ", prediction)

# Drop the trained object so the second prediction can only come from the file
del model
loaded_model = PPO.load(f"{save_dir}/PPO_tutorial")
prediction = loaded_model.predict(obs, deterministic=True)
print("Loaded model: ", prediction)

Pre-saved:  (array([0.00597604], dtype=float32), None)
Loaded model:  (array([0.00597604], dtype=float32), None)


In [6]:
# Hyperparameters are stored with the model: train an A2C agent with explicit
# gamma / n_steps values so they can be checked again after save and load
model = A2C(
    policy="MlpPolicy",
    env="Pendulum-v1",
    gamma=0.9,
    n_steps=20,
    verbose=0,
).learn(total_timesteps=8_000)

In [7]:
# Persist the trained A2C model, then delete the in-memory object so that
# whatever is inspected afterwards has to come from the saved file.
model.save(f"{save_dir}/A2C_tutorial")
del model

In [8]:
# Reload the A2C model from disk (verbose is passed again at load time) and
# print the gamma / n_steps it carries, to compare with the values used at training.
loaded_model = A2C.load(f"{save_dir}/A2C_tutorial", verbose=0)
print(f"Loaded: gamma={loaded_model.gamma}, n_steps={loaded_model.n_steps}")

Loaded: gamma=0.9, n_steps=20


In [9]:
# A2C.load() above was called without an environment, so attach a freshly
# created vectorized Pendulum env to the loaded model and continue training.
from stable_baselines3.common.vec_env import DummyVecEnv

loaded_model.set_env(DummyVecEnv([lambda: gym.make("Pendulum-v1")]))
loaded_model.learn(8_000)

<stable_baselines3.a2c.a2c.A2C at 0x7fd340b798d0>

### Gym and VecEnv wrappers

In [10]:
# A wrapper follows the gym interface: it implements reset() and step().
# The wrapped environment stays reachable as self.env, so behavior can be
# added around it without modifying the original environment.

class CustomWrapper(gym.Wrapper):
    """
    Minimal pass-through wrapper: reset() and step() forward to the wrapped
    environment and return its results unchanged. Serves as a template for
    wrappers that need to alter observations, rewards or actions.

    :param env: (gym.Env) Gym environment to wrap
    """

    def __init__(self, env):
        # `super` has to be called to get the parent proxy. The previous
        # `super.__init__(env)` invoked the initializer of the `super` type
        # itself, which raises TypeError and never initializes gym.Wrapper.
        super().__init__(env)

    def reset(self, **kwargs):
        """
        Reset the environment

        :param kwargs: forwarded untouched to the wrapped environment's reset()
        :return: (np.ndarray, dict) observation, info
        """
        obs, info = self.env.reset(**kwargs)
        return obs, info

    def step(self, action):
        """
        :param action: ([float] or int) Action taken by agent
        :return: (np.ndarray, float, bool, bool, dict) observation, reward,
        terminated, truncated, info
        """
        obs, reward, terminated, truncated, info = self.env.step(action)
        return obs, reward, terminated, truncated, info

In [11]:
# Example: limit episode length

class TimeLimitWrapper(gym.Wrapper):
    """
    End every episode (via the truncated flag) once a fixed number of steps
    has been taken since the last reset.

    :param env: (gym.Env) Environment to wrap
    :param max_steps: (int) Max number of steps per episode
    """

    def __init__(self, env, max_steps=100):
        super().__init__(env)
        # Steps taken in the running episode, and the budget they count against
        self.current_step = 0
        self.max_steps = max_steps

    def reset(self, **kwargs):
        """
        Restart the step counter, then reset the wrapped environment
        """
        self.current_step = 0
        return self.env.reset(**kwargs)

    def step(self, action):
        """
        :param action: ([float] or int) Action taken by agent
        :return: (np.ndarray, float, bool, bool, dict)
        observation, reward, terminated, truncated, info
        """
        self.current_step += 1
        obs, reward, terminated, truncated, info = self.env.step(action)

        # Budget exhausted: force truncation; otherwise keep the wrapped env's flag
        truncated = True if self.current_step >= self.max_steps else truncated

        return obs, reward, terminated, truncated, info

In [12]:
# test wrapper
from gymnasium.envs.classic_control.pendulum import PendulumEnv

# Play random actions on a bare PendulumEnv wrapped with a 100-step limit and
# count how many steps are taken before the episode is reported as over.
env = TimeLimitWrapper(PendulumEnv(), max_steps=100)

obs, _ = env.reset()
n_steps = 0
done = False

while not done:
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    n_steps += 1
    done = terminated or truncated

print(n_steps, info)

100 {}


In [14]:
import numpy as np

In [15]:
class NormalizeActionWrapper(gym.Wrapper):
    """
    Expose a [-1, 1] action space to the agent and map every action back to
    the wrapped environment's own [low, high] bounds before stepping it.

    :param env: (gym.Env) Gym environment that will be wrapped
    """

    def __init__(self, env):
        # Retrieve the action space
        action_space = env.action_space
        assert isinstance(
            action_space, gym.spaces.Box
        ), "This wrapper only works with continuous action space (spaces.Box)"

        # Call the parent constructor, so we can access self.env later
        super().__init__(env)

        # Retrieve the max/min values
        self.low, self.high = action_space.low, action_space.high

        # Override the action space on the wrapper itself, so all actions the
        # agent produces lie in [-1, 1]. The wrapped env is left untouched: it
        # keeps advertising the [low, high] range it actually receives in
        # step(), and the caller's env object is not mutated.
        self.action_space = gym.spaces.Box(
            low=-1, high=1, shape=action_space.shape, dtype=np.float32
        )

    def rescale_action(self, scaled_action):
        """
        Rescale the action from [-1, 1] to [low, high]
        (no need for symmetric action space)
        :param scaled_action: (np.ndarray)
        :return: (np.ndarray)
        """
        # Shift [-1, 1] to [0, 1], then stretch onto the original interval
        return self.low + (0.5 * (scaled_action + 1.0) * (self.high - self.low))

    def reset(self, **kwargs):
        """
        Reset the environment
        """
        return self.env.reset(**kwargs)

    def step(self, action):
        """
        :param action: ([float] or int) Action taken by the agent, in [-1, 1]
        :return: (np.ndarray, float, bool, bool, dict)
            observation, reward, terminated, truncated, info
        """
        # Rescale action from [-1, 1] to original [low, high] interval
        rescaled_action = self.rescale_action(action)
        obs, reward, terminated, truncated, info = self.env.step(rescaled_action)
        return obs, reward, terminated, truncated, info

In [16]:
# Test before rescaling: print the environment's own action bounds and a
# handful of random samples drawn from that space

original_env = gym.make("Pendulum-v1")

print(
    "Original action space: "
    f"{original_env.action_space.low} to "
    f"{original_env.action_space.high}"
)

for _ in range(10):
    print(original_env.action_space.sample())

Original action space: [-2.] to [2.]
[-1.2815788]
[-0.90439767]
[-0.19669728]
[-0.3872228]
[-0.2788426]
[-0.5272796]
[-0.26488996]
[-1.1718161]
[-1.5033513]
[-1.1831881]


In [17]:
# Test NormalizeAction wrapper: the bounds reported through the wrapper and
# the samples drawn from its action space

normalized_env = NormalizeActionWrapper(gym.make("Pendulum-v1"))

print(
    "Normalized action space: "
    f"{normalized_env.action_space.low} to "
    f"{normalized_env.action_space.high}"
)

for _ in range(10):
    print(normalized_env.action_space.sample())

Normalized action space: [-1.] to [1.]
[-0.84171295]
[0.87945384]
[0.18446018]
[0.30892766]
[0.06574268]
[-0.7730818]
[0.47157973]
[-0.03944604]
[-0.49572268]
[0.34055308]


#### Test NormalizeAction wrapper with an RL algorithm


In [18]:
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.vec_env import DummyVecEnv

In [19]:
# Build the monitored env inside the factory itself. The previous
# `env = DummyVecEnv([lambda: env])` closed over the very name being rebound,
# and only worked because the factory is called before the assignment happens.
env = DummyVecEnv([lambda: Monitor(gym.make("Pendulum-v1"))])

In [20]:
# Without action wrapper: the agent acts directly in the env's own action range
model = A2C(
    policy="MlpPolicy",
    env=env,
    verbose=1,
).learn(total_timesteps=1_000)

Using cuda device
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 200       |
|    ep_rew_mean        | -1.33e+03 |
| time/                 |           |
|    fps                | 488       |
|    iterations         | 100       |
|    time_elapsed       | 1         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.44     |
|    explained_variance | -0.0605   |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | -41       |
|    std                | 1.02      |
|    value_loss         | 975       |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 200       |
|    ep_rew_mean        | -1.52e+03 |
| time/                 |           |
|    fps                | 504       |
|    iterations         | 200       |
|    time_elapsed       | 1     

In [21]:
# Note that we can use multiple wrappers: Monitor is applied first and
# NormalizeActionWrapper goes on top of it. The env is built inside the
# factory so the lambda does not capture `normalized_env`, the name this
# statement rebinds (the earlier form relied on the factory being called
# before the rebinding took place).
normalized_env = DummyVecEnv(
    [lambda: NormalizeActionWrapper(Monitor(gym.make("Pendulum-v1")))]
)

In [22]:
# Same algorithm and step budget as the unwrapped run, on the normalized env
model_2 = A2C(
    policy="MlpPolicy",
    env=normalized_env,
    verbose=1,
).learn(total_timesteps=1000)

Using cuda device
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 200       |
|    ep_rew_mean        | -1.23e+03 |
| time/                 |           |
|    fps                | 465       |
|    iterations         | 100       |
|    time_elapsed       | 1         |
|    total_timesteps    | 500       |
| train/                |           |
|    entropy_loss       | -1.41     |
|    explained_variance | -0.00151  |
|    learning_rate      | 0.0007    |
|    n_updates          | 99        |
|    policy_loss        | -67.5     |
|    std                | 0.994     |
|    value_loss         | 967       |
-------------------------------------
-------------------------------------
| rollout/              |           |
|    ep_len_mean        | 200       |
|    ep_rew_mean        | -1.54e+03 |
| time/                 |           |
|    fps                | 486       |
|    iterations         | 200       |
|    time_elapsed       | 2     


### VecEnv Wrappers

In [23]:
# VecNormalize: normalizes observations and returns using a running mean and standard deviation
# VecFrameStack: stacks successive observations, useful to give the agent temporal information

from stable_baselines3.common.vec_env import VecNormalize, VecFrameStack

# Put a single Pendulum env behind VecNormalize and print what the wrapper
# returns for ten random actions
env = DummyVecEnv([lambda: gym.make("Pendulum-v1")])
normalized_vec_env = VecNormalize(env)

obs = normalized_vec_env.reset()

for _ in range(10):
    # The vectorized env is stepped with a list holding one action per env
    action = [normalized_vec_env.action_space.sample()]
    obs, reward, *_ = normalized_vec_env.step(action)
    print(obs, reward)

[[-0.9691557 -0.9370531 -0.9997525]] [-10.]
[[-1.2714406 -1.222842  -1.0872023]] [-2.014107]
[[-1.4955976 -1.3838277 -1.4389601]] [-1.393952]
[[-1.6212691 -1.377926  -1.4166437]] [-1.2456408]
[[-1.7315547 -1.1829904 -1.5778718]] [-1.0995727]
[[-1.770638   -0.52692324 -1.4609218 ]] [-1.0668241]
[[-1.7684513   0.93336153 -1.4090502 ]] [-0.95763135]
[[-1.7305524  2.1807544 -1.4293941]] [-0.8785769]
[[-1.6334641  2.4705312 -1.3134333]] [-0.83085084]
[[-1.4775691  2.4769847 -1.2933373]] [-0.7613985]


### Exercise: Code your own Monitor wrapper

In [24]:
class MyMonitorWrapper(gym.Wrapper):
    """
    Track the length and the cumulative reward of the running episode and
    report both through the info dict on the step that ends the episode.

    :param env: (gym.Env) Gym environment that will be wrapped
    """

    def __init__(self, env):
        # Call the parent constructor, so we can access self.env later
        super().__init__(env)
        # === YOUR CODE HERE ===#
        # Per-episode statistics: number of steps taken and summed reward
        self.episode_length, self.episode_reward = 0, 0
        # ====================== #

    def reset(self, **kwargs):
        """
        Reset the wrapped environment and clear the episode statistics

        :return: whatever the wrapped environment's reset() returns
        """
        reset_result = self.env.reset(**kwargs)
        # === YOUR CODE HERE ===#
        # Start counting from zero for the new episode
        self.episode_length, self.episode_reward = 0, 0
        # ====================== #
        return reset_result

    def step(self, action):
        """
        :param action: ([float] or int) Action taken by the agent
        :return: (np.ndarray, float, bool, bool, dict)
            observation, reward, terminated, truncated, info
        """
        obs, reward, terminated, truncated, info = self.env.step(action)
        # === YOUR CODE HERE ===#
        # Account for the step that was just taken
        self.episode_length += 1
        self.episode_reward += reward
        # ====================== #

        if terminated or truncated:
            # === YOUR CODE HERE ===#
            # Episode is over: publish the totals through the info dict
            info.update(
                episode_length=self.episode_length,
                episode_reward=self.episode_reward,
            )
            # ====================== #
        return obs, reward, terminated, truncated, info

In [35]:
# Run one random-action episode and print the info dict of the final step,
# which is where MyMonitorWrapper stores the episode statistics
env = MyMonitorWrapper(gym.make("LunarLander-v2"))
obs, _ = env.reset()

done = False

while not done:
    obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
    done = terminated or truncated

print(info)

{'episode_length': 100, 'episode_reward': -431.7259036607589}
