In [15]:
import numpy as np
import gym, stable_baselines, tensorflow
# Report the installed version of each library this notebook depends on.
print(*(lib.__version__ for lib in (gym, stable_baselines, tensorflow)))

0.13.0 2.6.0 1.13.0-rc2


| **Name\action space**   | **Refactored** to `BaseRLModel` | **Recurrent**      | `Box`          | `Discrete`     | `MultiDiscrete` | `MultiBinary`  | **Multi Processing**              |
| ---------- | ---------------------------- | ------------------ | ------------------ | ------------------ | ------------------- | ------------------ | --------------------------------- |
| A2C        | Yes | Yes | Yes | Yes | Yes  | Yes | Yes                |
| ACER       | Yes           | Yes | No | Yes | No                 | No                | Yes                |
| ACKTR      | Yes           | Yes | No | Yes | No                 | No                | Yes                |
| DDPG       | Yes           | No                | Yes | No                | No                 | No                | Yes (MPI)|
| DQN        | Yes           | No                | No                | Yes | No                 | No                | No                               |
| GAIL (only for TRPO)  | Yes           | No                | Yes |Yes| No                 | No                | Yes (MPI) |
| HER        | Yes | No                | Yes | Yes | No                 | Yes| No                               |
| PPO1       | Yes           | No                | Yes | Yes | Yes  | Yes | Yes (MPI) |
| PPO2       | Yes           | Yes | Yes | Yes | Yes  | Yes | Yes                |
| SAC        | Yes           | No                | Yes | No                | No                 | No                | No                               |
| TRPO       | Yes           | No                | Yes | Yes | Yes  | Yes | Yes (MPI) |

# Common interface

* `model = DQN(policy_name, env, learning_rate, verbose=1, tensorboard_log="fn", ..)`; `env` could be a string if it is registered
* `model.learn(total_timesteps)`: training
* `action, _states = model.predict(observed_state)`: predict an action; the second return value, `_states`, is only used by recurrent policies
* `model.save(filename)`,
  `model = DQN.load("dqn_lunar")`: save and load model
  
  The `load` function re-creates the model from scratch. If you need to evaluate the same model with several different sets of parameters, consider using `load_parameters` instead.

## multiprocessing

In [None]:
from stable_baselines import ACKTR
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines.common import set_global_seeds

def make_env(env_id, thread_id, seed=0):
    """Build a zero-argument factory that creates one seeded environment.

    Args:
        env_id: Environment id handed to ``gym.make``.
        thread_id: Index of the worker; it is added to ``seed`` so that each
            environment is seeded with a different value.
        seed: Base seed. It is also passed to ``set_global_seeds`` when
            ``make_env`` itself is called, not when the returned factory runs.

    Returns:
        A callable taking no arguments that creates the environment, seeds it
        with ``seed + thread_id`` and returns it.
    """
    set_global_seeds(seed)

    def _init():
        worker_env = gym.make(env_id)
        worker_env.seed(seed + thread_id)
        return worker_env

    return _init

# Number of parallel environments; one factory is built per worker index.
# NOTE(review): presumably SubprocVecEnv runs each factory in its own
# subprocess — confirm against the stable-baselines vec_env docs.
cpus = 4
env = SubprocVecEnv([make_env("CartPole-v1", i) for i in range(cpus)])

# Train ACKTR with the built-in 'MlpPolicy' on the vectorized environment.
model = ACKTR('MlpPolicy', env, verbose=1)
model.learn(total_timesteps=50000)

## callbacks/Tensorboard
https://stable-baselines.readthedocs.io/en/master/guide/examples.html#using-callback-monitoring-training

## [Atari Game interface](https://stable-baselines.readthedocs.io/en/master/guide/examples.html#id1)

In [None]:
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines import ACER
# Four Pong environments created by make_atari_env, then wrapped in
# VecFrameStack with n_stack=4.
env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
env = VecFrameStack(env, n_stack=4)
model = ACER('CnnPolicy', env, verbose=0)
model.learn(total_timesteps=25000)

# Roll the trained model out for a fixed number of steps. The loop is bounded
# (rather than `while True`) so the cell terminates on its own and
# "Restart & Run All" can reach the cells below; raise n_eval_steps to watch longer.
n_eval_steps = 1000
obs = env.reset()
for step in range(n_eval_steps):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    # '\r' keeps the output on a single, continuously overwritten line.
    print(rewards,end='\r')

## Normalize input (env wrapper)

In [None]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines import PPO2

# Reacher-v2 wrapped in a single-environment DummyVecEnv, then in VecNormalize
# with norm_obs=True, norm_reward=False and clip_obs=10.
env = DummyVecEnv([lambda: gym.make("Reacher-v2")])
env = VecNormalize(env, norm_obs=True, norm_reward=False, clip_obs=10.)

# Short PPO2 training run on the normalized environment.
model = PPO2(MlpPolicy, env)
model.learn(total_timesteps=2000)
# The model and the wrapper's running averages are saved separately.
# NOTE(review): 'path/to/dir/b' is a placeholder — replace it with a real
# location before running (presumably the directory must already exist).
model.save('model')
env.save_running_average('path/to/dir/b')

## [Custom "policy" network](https://stable-baselines.readthedocs.io/en/master/guide/custom_policy.html)

In [None]:
# Custom MLP policy of two layers of size 32 each with tanh activation function.
# This notebook imports TensorFlow as `tensorflow` (no `tf` alias is ever bound),
# so the activation is referenced through the full module name to avoid a NameError.
policy_kwargs = dict(act_fun=tensorflow.nn.tanh, net_arch=[32, 32])
model = PPO2("MlpPolicy", "CartPole-v1", policy_kwargs=policy_kwargs, verbose=1)

In [50]:
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import A2C

from stable_baselines.common.policies import FeedForwardPolicy, register_policy
# MLP A2C policy of three layers of size 128 each (with two branches out for actor & critic)
class CustomPolicyA2C(FeedForwardPolicy):
    """Feed-forward actor-critic policy with fixed hidden-layer sizes.

    The ``pi`` (actor) and ``vf`` (critic) branches each get three layers of
    128 units. Every positional and keyword argument is forwarded unchanged to
    ``FeedForwardPolicy``.
    """

    def __init__(self, *args, **kwargs):
        # One independent list per branch of the actor-critic network.
        branches = {"pi": [128] * 3, "vf": [128] * 3}
        super().__init__(*args, **kwargs,
                         net_arch=[branches],
                         feature_extraction="mlp")

# NOTE(review): `env` is not created in this cell; it must already be defined
# by an earlier cell for this line to run on a fresh kernel.
model = A2C(CustomPolicyA2C, env)
# Presumably equivalent, going through the policy registry (TODO confirm against the docs):
# register_policy('CustomPolicyA2C', CustomPolicyA2C); model = A2C(policy='CustomPolicyA2C', env=env)

In [52]:
from stable_baselines.deepq.policies import FeedForwardPolicy
# MLP DQN policy of two layers of size 32 each
class CustomPolicyDQN(FeedForwardPolicy):
    """DQN feed-forward policy with two hidden layers of 32 units each.

    Every positional and keyword argument is forwarded unchanged to
    ``FeedForwardPolicy``; layer normalization is switched off.
    """

    def __init__(self, *args, **kwargs):
        # Size of the neural network for the policy (if None, default to [64, 64]).
        hidden_layers = [32, 32]
        super().__init__(*args, **kwargs,
                         layers=hidden_layers,
                         layer_norm=False,
                         feature_extraction="mlp")

# [Proximal Policy Optimization](https://stable-baselines.readthedocs.io/en/master/modules/ppo2.html)

* PPO2 is the implementation of OpenAI made for GPU.
* For multiprocessing, it uses vectorized environments, whereas PPO1 uses MPI.

In [21]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

################# EITHER ###################
# Build the environment explicitly, wrap it, then train.
env = gym.make('CartPole-v0')
# NOTE(review): the lambda looks up the global `env` only when it is called, and
# `env` is rebound on this very line. This relies on DummyVecEnv calling the
# factory during construction, before the assignment happens — confirm.
env = DummyVecEnv([lambda: env])  # requires a vectorized environment
model = PPO2(MlpPolicy, env, verbose=0)
model.learn(total_timesteps=10000)
################### OR #####################
# One-liner that passes the policy and the environment by name.
# Note that it names 'CartPole-v1', whereas the block above uses 'CartPole-v0'.
# model = PPO2('MlpPolicy', 'CartPole-v1').learn(10000)

In [24]:
# Evaluate the trained `model` on a fresh, non-vectorized CartPole-v0
# environment and print running statistics of the episode returns ten times
# over the run.
env = gym.make('CartPole-v0')
scores = []
episodes = 200

for e in range(1, episodes + 1):
    state = env.reset()
    done = False
    R = 0  # return accumulated over the current episode
    while not done:
        # <- use of trained model
        action, _states = model.predict(state)
        state, reward, done, _ = env.step(action)
        R += reward
    scores.append(R)
    if e % (episodes // 10) != 0:
        continue
    print(f'Episode {e:4d} | Average R {np.mean(scores):6.4g} | Median R {np.median(scores)}')

Episode   20 | Average R  132.4 | Median R 144.0
Episode   40 | Average R  140.7 | Median R 161.5
Episode   60 | Average R  137.3 | Median R 159.5
Episode   80 | Average R  139.6 | Median R 154.0
Episode  100 | Average R    140 | Median R 152.0
Episode  120 | Average R  141.9 | Median R 157.5
Episode  140 | Average R  142.3 | Median R 152.0
Episode  160 | Average R  142.1 | Median R 152.0
Episode  180 | Average R  142.5 | Median R 154.0
Episode  200 | Average R  142.3 | Median R 153.5


# [Deep Q Network](https://stable-baselines.readthedocs.io/en/master/modules/dqn.html)

* "policy" network is actually the action-value network?

In [55]:
from stable_baselines import DQN
from stable_baselines.deepq.policies import FeedForwardPolicy

# CartPole-v0 wrapped in a single-environment DummyVecEnv.
# NOTE(review): DummyVecEnv is not imported in this cell; it relies on the
# import made by an earlier cell.
env = gym.make('CartPole-v0')
env = DummyVecEnv([lambda: env])

class CustomDQNet(FeedForwardPolicy): # MLP value network
    """DQN feed-forward network with hidden layers of 96 and 48 units.

    Every positional and keyword argument is forwarded unchanged to
    ``FeedForwardPolicy``; layer normalization is switched off.
    """

    def __init__(self, *args, **kwargs):
        # Size of the neural network for the policy (if None, default to [64, 64]).
        hidden_layers = [96, 48]
        super().__init__(*args, **kwargs,
                         layers=hidden_layers,
                         layer_norm=False,
                         feature_extraction="mlp")
# Train a DQN that uses the custom network, with prioritized replay enabled
# and learning_starts=0.
model = DQN(CustomDQNet,env, learning_rate=1e-3, prioritized_replay=True,
                             learning_starts=0, verbose=0)
model.learn(total_timesteps=1000)

scores=[]
episodes=200
# Print a progress line ten times over the run. `show_every` used to be read
# without ever being defined in this notebook, which raises NameError on a
# fresh kernel; max(1, ...) keeps the modulo valid when episodes < 10.
show_every = max(1, episodes//10)

# NOTE(review): `env` here is the DummyVecEnv built earlier in this cell, so
# `reward` and `done` are presumably length-1 arrays rather than scalars — confirm.
for e in range(1,episodes+1):
    state,done = env.reset(),False
    R = 0  # return accumulated over the current episode
    while not done:
        action, _states = model.predict(state)        # <- use of trained model
        state, reward, done, _ = env.step(action)
        R += reward
    scores+=R,
    if e%show_every == 0:
        print(f'Episode {e:4d} | Average R {np.mean(scores):6.4g} | Median R {np.median(scores)}')

Episode   20 | Average R   9.35 | Median R 9.0
Episode   40 | Average R   9.45 | Median R 9.0
Episode   60 | Average R  9.483 | Median R 9.5
Episode   80 | Average R  9.438 | Median R 9.0
Episode  100 | Average R   9.37 | Median R 9.0
Episode  120 | Average R  9.333 | Median R 9.0
Episode  140 | Average R    9.3 | Median R 9.0
Episode  160 | Average R  9.331 | Median R 9.0
Episode  180 | Average R  9.356 | Median R 9.0
Episode  200 | Average R  9.365 | Median R 9.0
