# PPO

In [1]:
import gym
import numpy as np
import yaml
import os
import random
from collections import OrderedDict
import tensorflow as tf
from stable_baselines.a2c.utils import conv, linear, conv_to_fc
from stable_baselines.common.policies import FeedForwardPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.policies import register_policy
from stable_baselines import PPO1

## Create Custom CNN Policy

In [11]:
def nature_cnn(scaled_images, **kwargs):
    """
    Convolutional feature extractor with three conv layers followed by one dense layer.

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keyword parameters forwarded to every convolutional layer
        (they are not forwarded to the final dense layer)
    :return: (TensorFlow Tensor) ReLU-activated output of the 512-unit dense layer
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)
    # One entry per convolution: (scope name, number of filters, filter size, stride)
    conv_specs = (
        ('c1', 32, 8, 4),
        ('c2', 64, 4, 2),
        ('c3', 64, 3, 1),
    )

    features = scaled_images
    for scope, n_filters, filter_size, stride in conv_specs:
        features = relu(conv(features, scope, n_filters=n_filters, filter_size=filter_size,
                             stride=stride, init_scale=gain, **kwargs))

    # Flatten the last feature map before feeding the fully connected layer
    flattened = conv_to_fc(features)
    return relu(linear(flattened, 'fc1', n_hidden=512, init_scale=gain))

class CustomPolicyCnn(FeedForwardPolicy):
    """Feed-forward policy whose features are extracted by ``nature_cnn``."""

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        """
        Forward every argument to ``FeedForwardPolicy`` unchanged, additionally
        selecting ``nature_cnn`` as the CNN extractor and "cnn" feature extraction.
        """
        super().__init__(
            sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
            cnn_extractor=nature_cnn,
            feature_extraction="cnn",
            **_kwargs
        )

In [12]:
# Register the custom policy under the string name that is later passed as `policy`.
# NOTE(review): re-running this cell in the same kernel may raise because the name is
# already registered -- confirm against the installed stable_baselines version.
register_policy('CustomPolicyCnn', CustomPolicyCnn)

## Arguments

In [26]:
# File name (inside the final-model directory) under which the trained agent is saved
agent_name= 'CarRacing_ppo_test'
# Gym environment id passed to gym.make
env_id = 'CarRacing-v0'
# Seed applied to the environment
seed = 0
# Name of the registered policy handed to PPO1
policy = 'CustomPolicyCnn'
# Forwarded to model.learn as log_interval; any value <= -1 leaves it unset
log_interval = 1
# Number of environment steps to train for
total_timesteps = 10000
# stage == 0: create a new agent
# stage > 0: load a previously saved agent
stage = 0

## Paths:
Don't forget to create the directories `results/logs`, `checkout` and `agents` inside `no_backup` in your home directory,
e.g. `mkdir -p ~/no_backup/results/logs`, etc.

In [21]:
# Locations relative to the user's home directory.
tensorboard_log = "no_backup/results/logs"
checkpoint = "no_backup/checkout"
final_model = "no_backup/agents"

# expanduser("~") resolves the home directory even when the HOME environment variable
# is not set, whereas os.getenv("HOME") would return None and break os.path.join.
home_dir = os.path.expanduser("~")

# Directory TensorBoard logs are written to
path_to_tensorboard_log = os.path.join(home_dir, tensorboard_log)
# The path where we saved the model (model.load)
path_to_checkpoint = os.path.join(home_dir, checkpoint)
# path to save the model (model.save)
path_to_final_model = os.path.join(home_dir, final_model)

## Environment:

In [14]:
import gym
# Create and seed the raw environment
base_env = gym.make(env_id)
base_env.seed(seed)
# Any additional gym wrappers should be applied to base_env here, before vectorising.

# Wrap the single environment in a vectorised env. The factory closes over `base_env`
# rather than `env`, so it keeps returning the raw environment even after `env` is
# rebound to the DummyVecEnv on this line.
env = DummyVecEnv([lambda: base_env])

## Define a  Model:

### Initialization 
Define a new model (used when `stage == 0`).

In [22]:
from stable_baselines.common.policies import CnnPolicy
# PPO1 keyword arguments that override the library defaults; also reused when loading.
hyperparams = {'clip_param': 0.25}
if stage == 0:
    # Fresh agent: build a new PPO1 model that logs to the configured TensorBoard directory
    model = PPO1(env=env,
                 policy=policy,
                 tensorboard_log=path_to_tensorboard_log,
                 verbose=1,
                 **hyperparams)

### Loading:

In [23]:
if stage > 0:
    # Name of the saved agent file inside the checkpoint directory.
    # NOTE(review): defaults to agent_name; set a different file name here if the
    # checkpoint to resume from was saved under another name.
    pretrained_model = agent_name
    # Resume from the stored agent, re-attaching the current environment and settings
    model = PPO1.load(os.path.join(path_to_checkpoint, pretrained_model.strip()),
                      env=env,
                      tensorboard_log=path_to_tensorboard_log,
                      verbose=1,
                      **hyperparams)

## Training

In [24]:
# Forward log_interval to learn() only when it is enabled (any value above -1)
kwargs = {'log_interval': log_interval} if log_interval > -1 else {}

model.learn(total_timesteps=total_timesteps, **kwargs)

********** Iteration 0 ************
Track generation: 1143..1442 -> 299-tiles track
Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
      0.03224 |      -0.04253 |      34.79788 |       0.04758 |       4.25327
     -0.00015 |      -0.04252 |       3.11181 |       0.00029 |       4.25192
     -0.00144 |      -0.04253 |       0.92935 |       0.00246 |       4.25253
     -0.00845 |      -0.04254 |       0.89755 |       0.00566 |       4.25385
Evaluating losses...
     -0.00756 |      -0.04254 |       0.76782 |       0.00921 |       4.25442
----------------------------------
| EpThisIter      | 0            |
| EpisodesSoFar   | 0            |
| TimeElapsed     | 7.18         |
| TimestepsSoFar  | 256          |
| ev_tdlam_before | -0.0208      |
| loss_ent        | 4.2544184    |
| loss_kl         | 0.009210022  |
| loss_pol_entpen | -0.042544182 |
| loss_pol_surr   | -0.007557297 |
| loss_vf_loss    | 0.7678181    |
----------------------------

********** Iteration 8 ************
Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
      0.00092 |      -0.04262 |       0.89849 |       0.00064 |       4.26218
      0.00179 |      -0.04263 |       1.15301 |       0.00656 |       4.26313
     -0.00137 |      -0.04264 |       0.99480 |       0.00746 |       4.26429
     -0.00342 |      -0.04265 |       0.94390 |       0.00705 |       4.26500
Evaluating losses...
     -0.00308 |      -0.04265 |       0.91573 |       0.01050 |       4.26539
-----------------------------------
| EpLenMean       | 1e+03         |
| EpRewMean       | -62.1         |
| EpThisIter      | 0             |
| EpisodesSoFar   | 2             |
| TimeElapsed     | 37.6          |
| TimestepsSoFar  | 2304          |
| ev_tdlam_before | 0.013         |
| loss_ent        | 4.265394      |
| loss_kl         | 0.010495705   |
| loss_pol_entpen | -0.04265394   |
| loss_pol_surr   | -0.0030768812 |
| loss_vf_loss    | 0.9157291

********** Iteration 16 ************
Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
      0.00342 |      -0.04269 |       0.84157 |       0.00035 |       4.26852
      0.00071 |      -0.04268 |       0.85789 |       0.00160 |       4.26773
      0.00266 |      -0.04267 |       0.84742 |       0.00874 |       4.26746
     -0.00158 |      -0.04268 |       0.84996 |       0.00256 |       4.26765
Evaluating losses...
     -0.00250 |      -0.04268 |       0.79142 |       0.00442 |       4.26773
-----------------------------------
| EpLenMean       | 1e+03         |
| EpRewMean       | -59.2         |
| EpThisIter      | 0             |
| EpisodesSoFar   | 4             |
| TimeElapsed     | 68.6          |
| TimestepsSoFar  | 4352          |
| ev_tdlam_before | -4.28e-05     |
| loss_ent        | 4.2677326     |
| loss_kl         | 0.004418513   |
| loss_pol_entpen | -0.042677324  |
| loss_pol_surr   | -0.0025016665 |
| loss_vf_loss    | 0.791418

********** Iteration 24 ************
Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
     3.76e-05 |      -0.04244 |       0.86883 |       0.00026 |       4.24387
      0.00034 |      -0.04245 |       0.81381 |       0.00176 |       4.24483
      0.00018 |      -0.04246 |       0.76885 |       0.00258 |       4.24605
      0.00247 |      -0.04247 |       0.73551 |       0.00158 |       4.24729
Evaluating losses...
      0.00269 |      -0.04248 |       0.72973 |       0.00095 |       4.24816
----------------------------------
| EpLenMean       | 1e+03        |
| EpRewMean       | -60.1        |
| EpThisIter      | 0            |
| EpisodesSoFar   | 6            |
| TimeElapsed     | 100          |
| TimestepsSoFar  | 6400         |
| ev_tdlam_before | 0.231        |
| loss_ent        | 4.248163     |
| loss_kl         | 0.0009533877 |
| loss_pol_entpen | -0.04248163  |
| loss_pol_surr   | 0.0026863236 |
| loss_vf_loss    | 0.7297298    |
-----

********** Iteration 32 ************
Optimizing...
     pol_surr |    pol_entpen |       vf_loss |            kl |           ent
      0.00240 |      -0.04251 |       1.19740 |       0.00086 |       4.25113
      0.00856 |      -0.04251 |       1.18582 |       0.00356 |       4.25112
      0.00093 |      -0.04251 |       1.13987 |       0.00126 |       4.25130
      0.00322 |      -0.04252 |       1.09742 |       0.00129 |       4.25164
Evaluating losses...
      0.00270 |      -0.04252 |       1.07652 |       0.00140 |       4.25186
----------------------------------
| EpLenMean       | 1e+03        |
| EpRewMean       | -62.1        |
| EpThisIter      | 0            |
| EpisodesSoFar   | 8            |
| TimeElapsed     | 132          |
| TimestepsSoFar  | 8448         |
| ev_tdlam_before | 0.332        |
| loss_ent        | 4.2518554    |
| loss_kl         | 0.0014049986 |
| loss_pol_entpen | -0.042518552 |
| loss_pol_surr   | 0.002702698  |
| loss_vf_loss    | 1.0765173    |
-----

<stable_baselines.ppo1.pposgd_simple.PPO1 at 0x7f088870c208>

## Saving

In [28]:
# Save first and announce afterwards, so the message only appears when saving succeeded
model.save(os.path.join(path_to_final_model, agent_name))
print("The trained model is successfully saved.")

The trained model is successfully saved.


## Evaluation

In [None]:
# Roll out the trained policy for a few rendered episodes.
n_eval_episodes = 4
max_steps_per_episode = 1000

try:
    for _episode in range(n_eval_episodes):
        obs = env.reset()
        for _step in range(max_steps_per_episode):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            # `dones` comes from the vectorised env as an array; np.any gives an
            # unambiguous truth value where a bare `if dones:` would not for
            # arrays with more than one element.
            if np.any(dones):
                break
finally:
    # Always release the environment, even if the rollout is interrupted or raises
    env.close()

Track generation: 1320..1654 -> 334-tiles track
