My installation instructions: https://gitlab.com/-/snippets/2057703

Source: https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/monitor_training.ipynb

See also: https://stable-baselines.readthedocs.io/en/master/guide/examples.html#try-it-online-with-colab-notebooks

# Stable Baselines, a Fork of OpenAI Baselines - Monitor Training and Plotting

Github Repo: [https://github.com/hill-a/stable-baselines](https://github.com/hill-a/stable-baselines)

Medium article: [https://medium.com/@araffin/stable-baselines-a-fork-of-openai-baselines-df87c4b2fc82](https://medium.com/@araffin/stable-baselines-a-fork-of-openai-baselines-df87c4b2fc82)

[RL Baselines Zoo](https://github.com/araffin/rl-baselines-zoo) is a collection of pre-trained Reinforcement Learning agents using Stable-Baselines.

It also provides basic scripts for training, evaluating agents, tuning hyperparameters and recording videos.

Documentation is available online: [https://stable-baselines.readthedocs.io/](https://stable-baselines.readthedocs.io/)

## Install Dependencies and Stable Baselines Using Pip

The full list of dependencies can be found in the [README](https://github.com/hill-a/stable-baselines).

```
sudo apt-get update && sudo apt-get install cmake libopenmpi-dev zlib1g-dev
```

```
pip install stable-baselines[mpi]
```

C.f. https://stackoverflow.com/a/61318224

## Google Colab

## Import Baselines

In [None]:
import stable_baselines
stable_baselines.__version__

## Import Policy, RL agent, ...

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from stable_baselines.bench import Monitor
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy
from stable_baselines import results_plotter

In [None]:
# Apply seaborn's "talk" plotting context to the figures drawn below
sns.set_context("talk")

## Define a Callback Function

In [None]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Callback for saving a model (the check is done every ``check_freq`` steps)
    based on the training reward (in practice, we recommend using ``EvalCallback``).

    :param check_freq: (int) Number of callback calls between two checks.
    :param log_dir: (str) Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: (int) Progress messages are printed when greater than 0.
    """
    def __init__(self, check_freq: int, log_dir: str, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.check_freq = check_freq
        self.log_dir = log_dir
        # Stem of the model file handed to ``model.save`` (not a folder)
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _init_callback(self) -> None:
        # Create the folder that will receive the model file if needed.
        # Do not create a folder at ``save_path`` itself: it is the stem of the
        # model file, and a same-named directory would presumably shadow the
        # saved archive when loading by the extension-less path
        # (confirm against the Stable Baselines load logic).
        if self.log_dir:
            os.makedirs(self.log_dir, exist_ok=True)

    def _on_step(self) -> bool:
        """
        Every ``check_freq`` calls, compare the recent mean training reward
        with the best one seen so far and save the model when it improves.

        :return: (bool) Always True, so this callback never interrupts training.
        """
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve training reward
        x, y = ts2xy(load_results(self.log_dir), 'timesteps')
        if len(x) == 0:
            # No finished episode has been logged yet
            return True

        # Mean training reward over the last 100 episodes
        mean_reward = np.mean(y[-100:])
        if self.verbose > 0:
            print("Num timesteps: {}".format(self.num_timesteps))
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))

        # New best model: remember its score and save the agent
        if mean_reward > self.best_mean_reward:
            self.best_mean_reward = mean_reward
            if self.verbose > 0:
                print("Saving new best model to {}".format(self.save_path))
            self.model.save(self.save_path)

        return True

## Create and wrap the environment

In [None]:
from gym import spaces
import random

In [None]:
# https://stable-baselines.readthedocs.io/en/master/guide/custom_env.html
# https://towardsdatascience.com/creating-a-custom-openai-gym-environment-for-stock-trading-be532be3910e

# Bounds of the position component: used by PointMassEnv for the observation
# space, for sampling the initial position and for the rendering scale.
POSITION_MIN = -1000.
POSITION_MAX =  1000.
# Bounds of the velocity component declared in the observation space.
VELOCITY_MIN = -100.
VELOCITY_MAX =  100.
# Bounds of the force declared in the action space.
ACTION_MIN = -2.
ACTION_MAX =  2.
# Default integration time step of PointMassEnv.
DT = 0.1
# Default mass of the point (acceleration = force / mass).
MASS = 0.1
# Step count used by PointMassEnv.step() to decide when an episode is done.
MAX_STEPS = 1000

class PointMassEnv(gym.Env):
    """
    Custom environment that follows the gym interface: a one-dimensional point
    mass pushed by a bounded force.

    The observation is ``[velocity, position]``. The reward is a Gaussian bump
    centered on position 0, so it is highest when the mass sits at the origin.

    :param mass: (float) mass of the point, used to turn the force into an acceleration
    :param dt: (float) integration time step
    :param max_steps: (int) number of steps after which an episode is done
    """
    # render() also supports 'rgb_array' (used to record GIFs)
    metadata = {'render.modes': ['human', 'rgb_array']}

    def __init__(self, mass=MASS, dt=DT, max_steps=MAX_STEPS):
        super(PointMassEnv, self).__init__()

        self.mass = mass
        self.dt = dt
        self.max_steps = max_steps

        # State variables, initialized by reset()
        self.position = None
        self.velocity = None
        self.current_step = None

        # Rendering window, created lazily by render()
        self.viewer = None

        # Actions: force
        self.action_space = spaces.Box(low=ACTION_MIN, high=ACTION_MAX, shape=(1,), dtype=np.float32)

        # State: (velocity, position) -- same order as the observations
        self.observation_space = spaces.Box(
            low=np.array([VELOCITY_MIN, POSITION_MIN], dtype=np.float32),
            high=np.array([VELOCITY_MAX, POSITION_MAX], dtype=np.float32),
            dtype=np.float32)

    def _get_obs(self):
        """Return the current state as a float32 array ``[velocity, position]``."""
        # float32 matches the dtype declared in ``observation_space``
        return np.array([self.velocity, self.position], dtype=np.float32)

    def _reward(self, position):
        """
        Gaussian reward centered on the origin.

        :param position: (float) position of the point mass
        :return: (float) reward in (0, 1], equal to 1 at position 0
        """
        # https://en.wikipedia.org/wiki/Gaussian_function
        a = 1.      # The height of the curve's peak
        b = 0.      # The position of the center of the peak
        c = 100.    # The width of the "bell"
        x = position
        return float(a * np.exp(-(x - b)**2 / (2. * c**2)))

    def step(self, action):
        """
        Execute one time step within the environment.

        :param action: (array-like) one-element array holding the applied force
        :return: (tuple) observation, reward, done flag and an empty info dict
        """
        self.current_step += 1

        # Kinetics of point mass; the force is kept inside the action space
        # and converted to a plain float so the state stays a Python float
        force = float(np.clip(action[0], ACTION_MIN, ACTION_MAX))
        acceleration = force / self.mass
        self.velocity += acceleration * self.dt
        # The position update uses the velocity that was just updated
        self.position += self.velocity * self.dt

        # NOTE: velocity and position are not clipped, so observations can
        # leave the bounds declared in ``observation_space``.

        # Compute reward and done; ``>=`` makes an episode last exactly
        # ``max_steps`` steps
        reward = self._reward(self.position)
        done = self.current_step >= self.max_steps

        return self._get_obs(), reward, done, {}

    def reset(self):
        """
        Reset the state of the environment to an initial state: zero velocity
        and a position drawn uniformly between the position bounds.

        :return: (numpy array) the initial observation
        """
        self.position = random.uniform(POSITION_MIN, POSITION_MAX)
        self.velocity = 0.
        self.current_step = 0

        return self._get_obs()

    def render(self, mode='human'):
        """
        Draw the point mass as a cart on a horizontal track.

        :param mode: (str) 'human' or 'rgb_array'
        :return: the result of the viewer's ``render`` call (asked for an RGB
          array when ``mode`` is 'rgb_array'), or None before the first reset
        """
        screen_width = 600
        screen_height = 400

        world_width = POSITION_MAX - POSITION_MIN
        scale = screen_width / world_width
        carty = 100  # TOP OF CART
        cartwidth = 50.0
        cartheight = 30.0

        if self.viewer is None:
            # Imported lazily so the environment can be used without a display
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2

            cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
            self.carttrans = rendering.Transform()
            cart.add_attr(self.carttrans)
            self.viewer.add_geom(cart)

            self.track = rendering.Line((0, carty), (screen_width, carty))
            self.track.set_color(0, 0, 0)
            self.viewer.add_geom(self.track)

        if self.position is None:
            return None

        x = self.position
        cartx = x * scale + screen_width / 2.0  # MIDDLE OF CART
        self.carttrans.set_translation(cartx, carty)

        return self.viewer.render(return_rgb_array=mode == 'rgb_array')

    def close(self):
        """Close the rendering window if one was opened."""
        if self.viewer:
            self.viewer.close()
            self.viewer = None

In [None]:
# Instantiate the environment and reset it; reset() returns the initial
# observation, which the notebook displays as the cell output
env = PointMassEnv()
env.reset()

In [None]:
#env = PointMassEnv()
#env.reset()
#a = env.action_space.sample()
#s = env.step(a)
#a, s

In [None]:
##env = gym.make('CartPole-v1')
#env = gym.make('MountainCarContinuous-v0')
#env.reset()
#a = env.action_space.sample()
#s = env.step(a)
#a, s

In [None]:
#env.action_space.sample()

In [None]:
# Apply the same force of 2 at every step and record the resulting positions
T = range(1000)
action = np.array([2.])

y = []
for t in T:
    observation = env.step(action)[0]
    # Observations are ordered (velocity, position)
    y.append(observation[1])

plt.plot(list(T), y);


In [None]:
#x = np.arange(-100, 100, 0.1)
#y = np.array([env._reward(_x) for _x in x])
#plt.plot(x, y);

In [None]:
# Folder receiving the training logs (created when missing)
log_dir = "/tmp/gym/"
os.makedirs(log_dir, exist_ok=True)

# Create the environment (instead of gym.make('CartPole-v1')) and wrap it
# with Monitor: logs will be saved in log_dir/monitor.csv
monitored_env = Monitor(PointMassEnv(), log_dir)

# PPO2 requires a vectorized environment to run
env = DummyVecEnv([lambda: monitored_env])

## Define and train the PPO agent

In [None]:
# Create the callback: check the training reward every 1000 steps, reading
# the Monitor logs from (and saving the best model under) log_dir
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

In [None]:
%%time
# Build a PPO2 agent with an MLP policy on the vectorized environment, then
# train it for 100000 timesteps with the best-model callback attached
model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=100000, callback=callback)

## Plotting helpers

Stable Baselines has some built-in plotting helpers, which you can find in `stable_baselines.results_plotter`. However, to show how to do it yourself, we are going to use custom plotting functions.

In [None]:
# Helper from the library
# NOTE(review): 1e5 presumably is the number of timesteps to plot and matches
# total_timesteps used for training -- confirm against results_plotter.

results_plotter.plot_results([log_dir], 1e5, results_plotter.X_TIMESTEPS, "PPO Point Mass")

In [None]:
def moving_average(values, window):
    """
    Compute the running mean of ``values`` over a sliding window.

    :param values: (numpy array) the series to smooth
    :param window: (int) number of consecutive samples averaged together
    :return: (numpy array) the smoothed series; only positions where the
      window fully overlaps the series are kept ('valid' convolution)
    """
    # Uniform kernel: every sample of the window has the same weight
    kernel = np.ones(window) / window
    return np.convolve(values, kernel, mode='valid')


def plot_results(log_folder, title='Learning Curve', window=None):
    """
    Plot the episode rewards loaded from ``log_folder`` against the timesteps.

    :param log_folder: (str) the save location of the results to plot
    :param title: (str) the title of the task to plot
    :param window: (int or None) size of the moving-average window used to
      smooth the rewards. With None (default), or when fewer than ``window``
      episodes are available, the raw rewards are plotted.
    """
    x, y = ts2xy(load_results(log_folder), 'timesteps')

    smoothed = window is not None and 1 < window <= len(y)
    if smoothed:
        y = moving_average(y, window=window)
        # The 'valid' moving average drops the first ``window - 1`` points:
        # truncate x so both series keep the same length
        x = x[len(x) - len(y):]

    plt.figure(title, figsize=(16, 6))
    plt.plot(x, y)
    plt.xlabel('Number of Timesteps')
    plt.ylabel('Rewards')
    # Only advertise smoothing when it was actually applied
    plt.title((title + " Smoothed") if smoothed else title)
    plt.show()

In [None]:
# Plot the rewards logged in log_dir with the custom helper defined above
plot_results(log_dir)

## Make a GIF of a Trained Agent

C.f. https://stable-baselines.readthedocs.io/en/master/guide/examples.html#bonus-make-a-gif-of-a-trained-agent

In [None]:
import imageio

In [None]:
# Roll out the trained policy for 350 steps, grabbing one RGB frame per step
vec_env = model.env
images = []
obs = vec_env.reset()
img = vec_env.render(mode='rgb_array')
for _step in range(350):
    images.append(img)
    action, _ = model.predict(obs)
    obs, _, _, _ = vec_env.step(action)
    img = vec_env.render(mode='rgb_array')

# Only every other frame is written to the GIF
imageio.mimsave('ppo_point_mass_env.gif', [np.array(frame) for frame in images[::2]], fps=29)

In [None]:
import IPython
from IPython.display import Image

In [None]:
# Read the GIF bytes inside a context manager so the file handle is closed,
# then display them inline.
# https://stackoverflow.com/questions/61110188/how-to-display-a-gif-in-jupyter-notebook-using-google-colab
with open('ppo_point_mass_env.gif', 'rb') as gif_file:
    gif_bytes = gif_file.read()
Image(gif_bytes)

## Evaluate the trained agent

In [None]:
# NOTE(review): `env` is the DummyVecEnv wrapper created above and
# PointMassEnv.step() does not read `_max_episode_steps`, so this assignment
# does not appear to change the episode length -- confirm whether it is needed.
env._max_episode_steps = 1000

In [None]:
# Run the trained agent for NUM_EPISODES episodes and report the mean
# cumulated reward per episode.
NUM_EPISODES = 100

reward_list = []

for episode_index in range(NUM_EPISODES):

    reward_sum = 0.

    obs = env.reset()
    done = False

    while not done:
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        # `env` is a single-environment DummyVecEnv: rewards and done flags
        # come back as length-1 arrays, so reduce them to plain scalars to
        # keep the episode return (and the printed mean) a float
        reward_sum += float(np.sum(reward))
        done = bool(np.all(done))
        #env.render()           # Cannot render on Google Colab

    reward_list.append(reward_sum)

print("Mean reward:", sum(reward_list) / NUM_EPISODES)


In [None]:
# Release the environment resources (PointMassEnv.close() closes the rendering viewer if one was opened)
env.close()