# PPO

In [None]:
import gym
import numpy as np
import yaml
import os
import random
from collections import OrderedDict
import tensorflow as tf
from stable_baselines.a2c.utils import conv, linear, conv_to_fc
from stable_baselines.common.policies import FeedForwardPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.policies import register_policy
from stable_baselines import PPO1

## Create Custom CNN Policy

In [None]:
def nature_cnn(scaled_images, **kwargs):
    """
    Build the CNN feature extractor from the Nature paper.

    Three ReLU-activated convolutions ('c1', 'c2', 'c3') are applied in sequence,
    the result is flattened, and a ReLU-activated dense layer ('fc1') with 512
    hidden units produces the output.

    :param scaled_images: (TensorFlow Tensor) Image input placeholder
    :param kwargs: (dict) Extra keyword parameters forwarded to each convolutional layer
    :return: (TensorFlow Tensor) The CNN output layer
    """
    init_gain = np.sqrt(2)
    # One row per convolution: (scope name, number of filters, filter size, stride).
    conv_specs = (
        ('c1', 32, 8, 4),
        ('c2', 64, 4, 2),
        ('c3', 64, 3, 1),
    )

    features = scaled_images
    for scope, n_filters, filter_size, stride in conv_specs:
        features = tf.nn.relu(conv(features, scope, n_filters=n_filters, filter_size=filter_size,
                                   stride=stride, init_scale=init_gain, **kwargs))

    flattened = conv_to_fc(features)
    # The dense layer is built without the extra convolution kwargs.
    return tf.nn.relu(linear(flattened, 'fc1', n_hidden=512, init_scale=init_gain))

class CustomPolicyCnn(FeedForwardPolicy):
    """Feed-forward policy that uses ``nature_cnn`` as its CNN feature extractor."""

    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse=False, **_kwargs):
        """
        Forward every argument to ``FeedForwardPolicy`` and select CNN feature extraction.

        All positional arguments, ``reuse`` and ``_kwargs`` are passed through
        unchanged; only ``cnn_extractor`` and ``feature_extraction`` are fixed here.
        """
        super().__init__(
            sess, ob_space, ac_space, n_env, n_steps, n_batch, reuse,
            cnn_extractor=nature_cnn,
            feature_extraction="cnn",
            **_kwargs
        )

In [None]:
# Register the custom policy under the string name that the `policy` setting
# refers to, so it can be handed to PPO1 by name.
# NOTE(review): re-running this cell in the same kernel may raise because the
# name is already registered -- confirm against stable_baselines' register_policy.
register_policy('CustomPolicyCnn', CustomPolicyCnn)

## Arguments

In [None]:
# File name used when the trained model is saved.
agent_name= 'CarRacing_ppo_test'
# Gym environment id passed to gym.make.
env_id = 'CarRacing-v0'
# Seed passed to env.seed.
seed = 0
# Name of the policy handed to PPO1 (the custom policy registered above).
policy = 'CustomPolicyCnn'
# Forwarded to model.learn as log_interval; a value of -1 or lower leaves it unset.
log_interval = 1
# Number of timesteps passed to model.learn.
total_timesteps = 10000
# stage == 0: create a new agent
# stage > 0: load a previously saved agent
stage = 0

## Paths:
Don't forget to create the directories `results/logs`, `checkout` and `agents` inside `no_backup` in your home directory,
e.g. `mkdir -p ~/no_backup/results/logs`, etc.

In [None]:
# Locations, relative to the user's home directory, for logs and saved models.
tensorboard_log = "no_backup/results/logs"
checkpoint = "no_backup/checkout"
final_model = "no_backup/agents"

# Resolve the home directory with expanduser instead of os.getenv("HOME"):
# getenv returns None when HOME is unset, which makes os.path.join raise a TypeError.
home_dir = os.path.expanduser("~")

# Directory handed to the model as its tensorboard log location.
path_to_tensorboard_log = os.path.join(home_dir, tensorboard_log)
# The path where we saved the model (model.load)
path_to_checkpoint = os.path.join(home_dir, checkpoint)
# path to save the model (model.save)
path_to_final_model = os.path.join(home_dir, final_model)

## Environment:

In [None]:
import gym
# Create the raw Gym environment and seed it.
raw_env = gym.make(env_id)
raw_env.seed(seed)
# Optional hook for an extra wrapper:
#if env_wrapper is not None:
#    raw_env = env_wrapper(raw_env)
# Wrap the single environment in a vectorized env. The raw env is kept under its
# own name and bound as a default argument, so the factory never depends on the
# global `env`, which this statement rebinds to the DummyVecEnv itself.
env = DummyVecEnv([lambda raw_env=raw_env: raw_env])

## Define a Model:

### Initialization
Define a new model.

In [None]:
from stable_baselines.common.policies import CnnPolicy
# NOTE(review): this rebinds `tensorboard_log`, but the new value is not read in
# the visible code; the model is given `path_to_tensorboard_log` instead.
tensorboard_log = os.path.join('/logs', env_id)
# Extra keyword arguments for PPO1.
hyperparams = dict(clip_param=0.25)
if stage == 0:
    # Stage 0: build a brand-new agent.
    model = PPO1(
        policy=policy,
        env=env,
        verbose=1,
        tensorboard_log='%s' % path_to_tensorboard_log,
        **hyperparams
    )

### Loading:

In [None]:
if stage > 0:
    # File name of the saved agent inside `path_to_checkpoint`. No `args` object
    # exists in this notebook, so the name is set explicitly here; it defaults to
    # `agent_name`.
    # NOTE(review): the saving cell writes to `path_to_final_model`, so copy the
    # file into the checkpoint directory or adjust this name/path as needed.
    pretrained_model = agent_name
    model = PPO1.load(os.path.join(path_to_checkpoint, pretrained_model.strip()),
                      env=env,
                      tensorboard_log=path_to_tensorboard_log,
                      verbose=1,
                      **hyperparams)

## Training

In [None]:
# Pass `log_interval` to learn() only when it is greater than -1;
# otherwise learn() is called without it.
kwargs = {'log_interval': log_interval} if log_interval > -1 else {}

model.learn(total_timesteps=total_timesteps, **kwargs)

## Saving

In [None]:
# Save first and report afterwards, so the success message is only printed
# when model.save did not raise.
model.save("%s/%s" % (path_to_final_model, agent_name))
print("The trained model is successfully saved.")

## Evaluation

In [None]:
# Roll out the trained model for 4 episodes of at most 1000 steps each,
# rendering after every step.
for _episode in range(4):
    obs = env.reset()
    for _step in range(1000):
        # Only the action is needed; the second return value is ignored.
        action, _ = model.predict(obs)
        obs, _, dones, _ = env.step(action)
        env.render()
        # Stop this episode once the environment reports it is done.
        # NOTE(review): relies on `dones` being truth-testable, i.e. presumably
        # a single wrapped environment -- confirm if more environments are added.
        if dones:
            break
env.close()