### What are Wrappers?

Very frequently, you will want to extend the environment’s functionality in some generic way.

**Example 1:** An environment gives you some observations, but you want to accumulate them in a buffer and provide the agent with the last N observations. This is a common scenario for dynamic computer games, where a single frame is not enough to get full information about the game state.

**Example 2:** You want to crop or preprocess an image’s pixels to make them more convenient for the agent to digest, or you want to normalize the reward scores somehow.

Therefore you’d like to “wrap” the existing environment and add some extra logic on top of it. Gym provides the following wrapper base classes for this:

* **ObservationWrapper:** You need to redefine its observation(obs) method. Argument obs is an observation from the wrapped environment, and this method should return the observation which will be given to the agent.

* **RewardWrapper:** Exposes the method reward(rew), which could modify the reward value given to the agent.

* **ActionWrapper:** You need to override the method action(act), which can tweak the action passed by the agent before it is given to the wrapped environment.

In [None]:
import gym
import numpy as np
from gym.spaces.box import Box
import matplotlib.pyplot as plt
# Create the CarRacing environment used by the following cells.
env = gym.make("CarRacing-v0")
# NOTE: this is a bare tuple, not a print call -- it is only displayed when it
# is the last expression of a notebook cell.
("The shape of observation space",env.observation_space)

In [None]:
class PreProcessObservation(gym.ObservationWrapper):
    """Observation wrapper that crops, grey-scales and down-samples frames.

    Every observation coming from the wrapped environment is reduced to its
    first 80 rows, converted from RGB to a single grey channel, sub-sampled
    by a factor of 2 along both image axes and returned as a float32 array
    with a trailing channel axis.

    The declared observation space has shape (40, 48, 1), i.e.
    (height, width, channel). This matches the output only when the wrapped
    environment produces frames that are at least 80 rows high and 96 columns
    wide (assumed to be the case for CarRacing-v0 -- confirm if the wrapper
    is reused with another environment).
    """

    def __init__(self, env):
        """Wrap ``env`` and replace its observation space.

        Args:
            env: the environment to wrap; its observation space must expose
                3-D ``low``/``high`` arrays, whose first element is reused
                as the scalar bounds of the new space.
        """
        super().__init__(env)
        wrapped_space = self.observation_space
        # Define a new Box with the same scalar bounds as the wrapped space.
        self.observation_space = Box(
            wrapped_space.low[0, 0, 0],
            wrapped_space.high[0, 0, 0],
            [40, 48, 1],  # Height, Width, Channel
        )

    def observation(self, observation):
        """Return the processed version of one frame.

        Args:
            observation: array indexed as (row, column, channel) with at
                least three channels, interpreted as R, G, B.

        Returns:
            float32 array of shape (rows, columns, 1) holding the cropped,
            grey-scaled and down-sampled frame.
        """
        frame = observation[0:80]  # crop: keep the first 80 rows
        # Weighted sum of the R, G and B channels. The weights sum to 0.9999,
        # so the grey value never exceeds the largest channel value and stays
        # inside the bounds declared in ``observation_space``.
        grey = (
            0.2989 * frame[:, :, 0]
            + 0.5870 * frame[:, :, 1]
            + 0.1140 * frame[:, :, 2]
        )
        grey = grey[::2, ::2]  # down sample by factor of 2

        return grey.astype(np.float32)[..., np.newaxis]

In [None]:
# Wrap the environment so that its observations go through PreProcessObservation.
env = PreProcessObservation(env)

In [None]:
# Play at most 100 random steps in the wrapped environment and sum the rewards.
total_reward = 0.0
obs = env.reset()

for step in range(100):
    env.render()
    # Draw a random action from the action space
    action = env.action_space.sample()
    obs, reward, done, _ = env.step(action)
    total_reward = total_reward + reward
    # Stop early when the episode is over
    if done:
        break

print("Reward got: %.2f" % total_reward)
env.close()

In [None]:
# Shape of the last observation returned by env.step() in the rollout cell.
print("The new shape of the observation", np.shape(obs))

In [None]:
# Display the single channel of the last observation as a grey-scale image.
plt.imshow(obs[:,:,0], cmap='gray')

## How to use the new environment with OpenAI Baselines:

In [None]:
# First import the required packages.
import tensorflow as tf
import os
from baselines import logger
# Import the algorithm
from baselines.ppo2 import ppo2 
# Import the vectorized env
from baselines.common.vec_env.dummy_vec_env import DummyVecEnv
# Import the policy
# from baselines.ppo1.cnn_policy import CnnPolicy
# TensorFlow session helper
from baselines.common.tf_util import make_session
# Monitor wrapper reports lengths and rewards of each episode in the info dict 
from baselines.bench import Monitor 

In [None]:
# Location handed to the Monitor wrapper in make_env().
SAVE_PATH = "./"


def make_env():
    """Build one CarRacing environment ready for training.

    The environment is seeded with 0, wrapped in PreProcessObservation and
    finally wrapped in a Monitor that is given SAVE_PATH.

    Returns:
        The Monitor-wrapped environment.
    """
    car_env = gym.make("CarRacing-v0")
    car_env.seed(0)
    return Monitor(PreProcessObservation(car_env), SAVE_PATH)

In [None]:
# Network type used to build the policy.
# NOTE(review): the ppo2.learn calls visible in this notebook pass 'cnn'
# literally instead of reading this variable.
network='cnn'
# The number of steps to run for each environment per update 
nsteps=640
# Number of training minibatches per update
nminibatches = 8
# The total number of environment timesteps to train for
total_timesteps = 1000
# The model requires a vectorized environment; DummyVecEnv receives a list of
# environment factories (here a single one, so one environment is created).
env = DummyVecEnv([make_env])

In [None]:
#config = tf.ConfigProto()
#config.gpu_options.allow_growth = True # pylint: disable=E1101
# Create a session on a fresh TensorFlow graph and make it the default session
# (make_default=True), so the model built below lives in that graph.
sess_test = make_session(make_default=True, graph=tf.Graph())

# Train PPO2 on the vectorized environment using the settings defined in the
# previous cell; learn() returns the model, kept here as `model`.
model = ppo2.learn(network='cnn',
           env=env,
           nsteps=nsteps,
           nminibatches=nminibatches,
           lam=0.95,
           gamma=0.99,
           noptepochs=3,
           log_interval=1,
           ent_coef=0.01,
           lr=lambda _: 2e-4,  # constant learning rate: the callable ignores its argument
           cliprange=lambda _: 0.1,  # constant clip range: the callable ignores its argument
           total_timesteps=int(total_timesteps),
           save_interval=10)

# Release the environment once training has finished.
env.close()
#tf.InteractiveSession.close()

* eplenmean: mean episode length
* ep_rewmean: mean reward per episode
* fps: frames per second (step per second)
* nupdates: number of update iterations completed so far (each iteration runs `noptepochs` × `nminibatches` gradient steps)
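* serial_timesteps: number of timesteps collected per environment (updates × `nsteps`); it equals total_timesteps when a single environment is used, as it is here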

In [None]:
# Close the vectorized environment (the training cell above also calls env.close()).
env.close()

## Train the Environment:
One way to train your algorithm faster is to run multiple workers in parallel to get many transitions faster.

In [None]:
# Vectorized Environments are a method for stacking multiple independent environments into a single environment.
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv
from baselines.common.vec_env.vec_frame_stack import VecFrameStack
#from stable_baselines.common.vec_env import SubprocVecEnv

Creates a multiprocess vectorized wrapper for multiple environments, distributing each environment to its own process, allowing significant speed up when the environment is computationally complex.

In [None]:
def make_vec_env():
    """Build one CarRacing environment for use inside a vectorized env.

    The environment is seeded with 0, wrapped in a Monitor that allows early
    resets and then wrapped in PreProcessObservation.

    Returns:
        The PreProcessObservation-wrapped environment.
    """
    car_env = gym.make("CarRacing-v0")
    car_env.seed(0)
    log_dir = logger.get_dir()
    # A falsy logger directory is handed to Monitor unchanged; otherwise the
    # monitor path is the sub-path "0" inside the logger directory.
    monitor_path = os.path.join(log_dir, str(0)) if log_dir else log_dir
    monitored_env = Monitor(car_env, monitor_path, allow_early_resets=True)
    return PreProcessObservation(monitored_env)

In [None]:
# Start from a fresh TensorFlow graph and default session for this training run.
make_session(make_default=True, graph=tf.Graph())
# Build the multiprocess vectorized environment. Only one factory is passed,
# so a single environment is created; add more factories to the list to
# collect transitions from several environments.
# NOTE(review): this env is never closed in the visible part of the notebook;
# presumably env.close() is needed to stop the worker process -- confirm.
env = SubprocVecEnv([make_vec_env])

# Train PPO2 with the same settings as the DummyVecEnv run
# (nsteps, nminibatches and total_timesteps come from the earlier cell).
model = ppo2.learn(network='cnn',
           env=env,
           nsteps=nsteps,
           nminibatches=nminibatches,
           lam=0.95,
           gamma=0.99,
           noptepochs=3,
           log_interval=1,
           ent_coef=0.01,
           lr=lambda _: 2e-4,  # constant learning rate: the callable ignores its argument
           cliprange=lambda _: 0.1,  # constant clip range: the callable ignores its argument
           total_timesteps=int(total_timesteps),
           save_interval=10)