# RL Demonstration using Stable Baselines

In [None]:
import gym

import numpy as np
import matplotlib.pyplot as plt


from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.deepq.policies import MlpPolicy as DQNPolicy
from stable_baselines.deepq.policies import CnnPolicy as DQNCnnPolicy
from stable_baselines.common.policies import MlpPolicy as CommonPolicy
from stable_baselines import DQN, PPO2, SAC

Import all necessary libraries as you would in any other Python script.

In [None]:
def eval_and_plot(env_name, model, num_runs=32, run_length=500, render=False):
    """Roll out ``model`` in a Gym environment and plot per-step rewards.

    For each of ``num_runs`` episodes the environment is reset and stepped
    for at most ``run_length`` steps using the actions returned by
    ``model.predict``. The sum of rewards of every run is printed, then the
    mean reward per step is plotted together with a shaded band between the
    25% and 75% quantiles across runs.

    Args:
        env_name: Environment id handed to ``gym.make``.
        model: Object whose ``predict(obs)`` returns ``(action, state)``.
        num_runs: Number of episodes to roll out.
        run_length: Maximum number of steps per episode.
        render: If true, ``env.render()`` is called after every step.
    """
    env = gym.make(env_name)

    # One column per run. Steps that are never reached because an episode
    # finished early keep their initial reward of 0.
    all_rewards = np.zeros((run_length, num_runs))
    try:
        for i in range(num_runs):
            obs = env.reset()
            for j in range(run_length):
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
                all_rewards[j, i] = reward
                if render:
                    env.render()
                if done:
                    break
    finally:
        # Close exactly once, after all runs, and also when a step fails;
        # the environment is still needed by the next run's reset().
        env.close()

    print(all_rewards.sum(0))
    steps = range(run_length)
    plt.plot(steps, np.mean(all_rewards, 1))
    plt.fill_between(steps,
                     np.quantile(all_rewards, .25, 1),
                     np.quantile(all_rewards, .75, 1),
                     alpha=0.2)
    plt.show()

A quick evaluation utility; the details are explained below.

In [None]:
# Instantiate the CartPole-v1 environment by its registered id; `env` is the
# environment handed to the training cell below.
env = gym.make('CartPole-v1')

Create an environment from the Gym examples. For most of the coding exercises, we will be using gym, since it is easy to install and has a very commonly used interface which you will find in many other simulators too.

In [None]:
# Build a DQN agent with the MLP policy on the environment created above.
model = DQN(DQNPolicy, env, verbose=1)
# Train for 500 environment timesteps.
# NOTE(review): log_interval=1 presumably logs after every episode - confirm
# against the Stable Baselines DQN.learn documentation.
model.learn(total_timesteps=500, log_interval=1)

This is the main training loop. For the purpose of this demonstration, I used a pre-implemented algorithm which hides all the complexity. In the class, coding these functions yourself is of course your main goal ;)

## Evaluation

The following cells load some pretrained models to show what a basic RL evaluation looks like and what you can expect to see when testing the algorithms.

In [None]:
# Load a saved SAC model from disk and evaluate it on Pendulum-v0 using the
# default number of runs and run length of eval_and_plot.
model = SAC.load("models/sac_pendulum")
eval_and_plot("Pendulum-v0", model)

In [None]:
# Load a saved PPO2 model from disk and evaluate it on Acrobot-v1 using the
# default number of runs and run length of eval_and_plot.
model = PPO2.load("models/ppo_acrobot_success")
eval_and_plot("Acrobot-v1", model)

In [None]:
# Load a saved PPO2 model from disk and evaluate it on CartPole-v1 using the
# default number of runs and run length of eval_and_plot.
model = PPO2.load("models/ppo_cartpole_success")
eval_and_plot("CartPole-v1", model)

In [None]:
# Load a saved DQN model from disk and evaluate it on CartPole-v1 using the
# default number of runs and run length of eval_and_plot.
model = DQN.load("models/dqn_cartpole")
eval_and_plot("CartPole-v1", model)

In [None]:
def eval_and_plot(env_name, model, num_runs=32, run_length=500, render=False):
    """Roll out ``model`` in a Gym environment and plot per-step rewards.

    For each of ``num_runs`` episodes the environment is reset and stepped
    for at most ``run_length`` steps using the actions returned by
    ``model.predict``. The sum of rewards of every run is printed, then the
    mean reward per step is plotted together with a shaded band of one
    standard deviation above and below the mean across runs.

    Args:
        env_name: Environment id handed to ``gym.make``.
        model: Object whose ``predict(obs)`` returns ``(action, state)``.
        num_runs: Number of episodes to roll out.
        run_length: Maximum number of steps per episode.
        render: If true, ``env.render()`` is called after every step.
    """
    env = gym.make(env_name)

    # One column per run. Steps that are never reached because an episode
    # finished early keep their initial reward of 0.
    all_rewards = np.zeros((run_length, num_runs))
    try:
        for i in range(num_runs):
            obs = env.reset()
            for j in range(run_length):
                action, _states = model.predict(obs)
                obs, reward, done, info = env.step(action)
                all_rewards[j, i] = reward
                if render:
                    env.render()
                if done:
                    break
    finally:
        # Close exactly once, after all runs, and also when a step fails;
        # the environment is still needed by the next run's reset().
        env.close()

    print(all_rewards.sum(0))

    # Per-step statistics across runs, computed once and reused below.
    steps = range(run_length)
    mean_reward = np.mean(all_rewards, 1)
    std_reward = np.std(all_rewards, 1)
    plt.plot(steps, mean_reward)
    plt.fill_between(steps,
                     mean_reward + std_reward,
                     mean_reward - std_reward,
                     alpha=0.2)
    plt.show()

## A closer look at the env code

Below you can find a commented version of the evaluation script.

In [None]:
# Set up the environment for the manual, step-by-step walkthrough.
env = gym.make("Pendulum-v0")
# NOTE(review): this loads a CartPole-named checkpoint although the
# environment is Pendulum-v0. The model is not used in the visible cells
# (the loop below samples random actions) - confirm whether it is needed.
model = PPO2.load("models/ppo_cartpole_success")
# NOTE(review): num_runs is not referenced in the visible cells.
num_runs = 32
# Upper bound on the number of steps taken by the loop in the next cell.
run_length = 200

# env.reset initializes the environment and returns the first state observation
obs = env.reset()
# Bare expression so the notebook displays the initial observation.
obs

In [None]:
# In the core loop, the environment is advanced one step at a time.
# The try/finally guarantees env.close() runs even when a step or render()
# raises (rendering in particular can fail inside a Jupyter notebook).
try:
    for step in range(run_length):
        # in case you need a random baseline, environments allow you to
        # generate random actions directly from the action space
        action = env.action_space.sample()
        print(step)

        # the step() function takes the action as an argument and returns a
        # tuple of the next observation, the direct reward, a boolean
        # indicating whether a final state was reached, and a dictionary
        # containing additional information
        obs, rewards, dones, info = env.step(action)

        # you can render the environment on your screen with this function
        # note that the render function might have trouble drawing from a
        # jupyter notebook
        env.render()

        # the framework does not reset the environment automatically once a
        # "done" signal is observed; here we simply stop the rollout (to keep
        # going you would call env.reset() instead)
        if dones:
            break
finally:
    # closing the env cleans up any initialized threads, especially renderings
    env.close()