<a href="https://colab.research.google.com/github/iskra3138/stable-baselines/blob/main/CartPole_Dynamic_Change_A2C_Stable_Baselines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup


In [1]:
# Stable Baselines only supports tensorflow 1.x for now
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [2]:
!apt-get install ffmpeg freeglut3-dev xvfb  # For visualization
!pip install stable-baselines[mpi]==2.10.0

Reading package lists... Done
Building dependency tree       
Reading state information... Done
freeglut3-dev is already the newest version (2.8.1-3).
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
xvfb is already the newest version (2:1.19.6-1ubuntu4.7).
0 upgraded, 0 newly installed, 0 to remove and 14 not upgraded.


In [3]:
import numpy as np
import math
import gym

# Custom CartPole Class 정의
```
    ### to replace deterministic action with stochastic action 
    #force = self.force_mag if action == 1 else -self.force_mag
    stochastic_force = self.force_mag + np.random.normal(self.mu, self.sigma)
    force = stochastic_force if action == 1 else -stochastic_force
```

In [4]:
from gym.envs.classic_control import cartpole

class CartPoleEnv_stochastic(cartpole.CartPoleEnv):
  """CartPole whose push force is perturbed by Gaussian noise on every step.

  The dynamics, termination test and reward are the same as in the ``step``
  this class overrides, except that the force magnitude becomes
  ``force_mag + N(mu, sigma)`` before the sign for the chosen action is applied.

  :param mu: mean of the Gaussian noise added to the force magnitude
  :param sigma: standard deviation of that noise (0 gives a deterministic force
      of ``force_mag + mu``)
  """

  def __init__(self, mu, sigma):
    super(CartPoleEnv_stochastic, self).__init__()
    self.mu = mu
    self.sigma = sigma

  def step(self, action):
    """Advance the simulation by one time step with a noisy force.

    :param action: element of ``self.action_space``; 1 pushes with a positive
        force, anything else with a negative one
    :return: ``(observation, reward, done, info)`` where observation is
        ``np.array((x, x_dot, theta, theta_dot))`` and info is an empty dict
    :raises AssertionError: if ``action`` is not contained in the action space
    """
    err_msg = "%r (%s) invalid" % (action, type(action))
    assert self.action_space.contains(action), err_msg

    x, x_dot, theta, theta_dot = self.state
    ### to replace deterministic action with stochastic action
    #force = self.force_mag if action == 1 else -self.force_mag
    # The noise is drawn from NumPy's global RNG, so it is NOT reset by
    # `self.seed(...)`; only the initial state is. The noise is added to the
    # magnitude before the sign is applied, so a large negative draw can
    # reverse the effective push direction.
    stochastic_force = self.force_mag + np.random.normal(self.mu, self.sigma)
    force = stochastic_force if action == 1 else -stochastic_force

    costheta = math.cos(theta)
    sintheta = math.sin(theta)

    # For the interested reader:
    # https://coneural.org/florian/papers/05_cart_pole.pdf
    temp = (force + self.polemass_length * theta_dot ** 2 * sintheta) / self.total_mass
    thetaacc = (self.gravity * sintheta - costheta * temp) / (self.length * (4.0 / 3.0 - self.masspole * costheta ** 2 / self.total_mass))
    xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass

    if self.kinematics_integrator == 'euler':
        # Explicit Euler: positions are advanced with the OLD velocities.
        x = x + self.tau * x_dot
        x_dot = x_dot + self.tau * xacc
        theta = theta + self.tau * theta_dot
        theta_dot = theta_dot + self.tau * thetaacc
    else:  # semi-implicit euler
        # Velocities are updated first and the NEW velocities advance positions.
        x_dot = x_dot + self.tau * xacc
        x = x + self.tau * x_dot
        theta_dot = theta_dot + self.tau * thetaacc
        theta = theta + self.tau * theta_dot

    self.state = (x, x_dot, theta, theta_dot)

    # The episode ends when the cart or the pole leaves its allowed range.
    done = bool(
        x < -self.x_threshold
        or x > self.x_threshold
        or theta < -self.theta_threshold_radians
        or theta > self.theta_threshold_radians
    )

    if not done:
        reward = 1.0
    elif self.steps_beyond_done is None:
        # Pole just fell!
        self.steps_beyond_done = 0
        reward = 1.0
    else:
        if self.steps_beyond_done == 0:
            # `logger` was never imported in this notebook, so this branch used
            # to raise NameError instead of warning. Import it locally.
            from gym import logger
            logger.warn(
                "You are calling 'step()' even though this "
                "environment has already returned done = True. You "
                "should always call 'reset()' once you receive 'done = "
                "True' -- any further steps are undefined behavior."
            )
        self.steps_beyond_done += 1
        reward = 0.0

    return np.array(self.state), reward, done, {}

### Class 확인

In [5]:
# To check deterministic action: with mu=0 and sigma=0 the force noise is
# always zero. The env is re-seeded before every reset so each iteration starts
# from the same state, and the printed next state should be identical too.
environment = CartPoleEnv_stochastic(mu=0.0, sigma=0.0)
for _ in range(3):
  environment.seed(1)
  step = environment.reset()
  next_step = environment.step(np.array(0, dtype=np.int32))
  print (step, '- >', next_step[0])

[ 0.03073904  0.00145001 -0.03088818 -0.03131252] - > [ 0.03076804 -0.19321569 -0.03151444  0.25146705]
[ 0.03073904  0.00145001 -0.03088818 -0.03131252] - > [ 0.03076804 -0.19321569 -0.03151444  0.25146705]
[ 0.03073904  0.00145001 -0.03088818 -0.03131252] - > [ 0.03076804 -0.19321569 -0.03151444  0.25146705]


In [6]:
# To check stochastic action: same seeded start state as above, but with
# sigma=0.5 the force noise differs between iterations, so the printed next
# state is expected to vary even though the env is re-seeded each time.
environment = CartPoleEnv_stochastic(mu=0.0, sigma=0.5)
for _ in range(3):
  environment.seed(1)
  step = environment.reset()
  next_step = environment.step(np.array(0, dtype=np.int32))
  print (step, '- >', next_step[0])

[ 0.03073904  0.00145001 -0.03088818 -0.03131252] - > [ 0.03076804 -0.18783056 -0.03151444  0.24339321]
[ 0.03073904  0.00145001 -0.03088818 -0.03131252] - > [ 0.03076804 -0.1955017  -0.03151444  0.25489444]
[ 0.03073904  0.00145001 -0.03088818 -0.03131252] - > [ 0.03076804 -0.19513888 -0.03151444  0.25435046]


# Training

In [7]:
def TimeLimit(env, max_episode_steps=500):
  """Wrap `env` in gym's TimeLimit wrapper.

  Usable as a one-argument wrapper factory (the episode cap then defaults to
  500 steps), or with an explicit cap.

  :param env: environment to wrap
  :param max_episode_steps: (int) value forwarded to gym.wrappers.TimeLimit
  :return: the wrapped environment
  """
  return gym.wrappers.TimeLimit(env, max_episode_steps=max_episode_steps)

In [10]:
from stable_baselines.common import make_vec_env

# Training environments: 4 copies of the noisy CartPole (mu=0, sigma=0.5),
# each passed through the TimeLimit helper defined above.
env = make_vec_env(CartPoleEnv_stochastic, 
                   n_envs=4, 
                   seed=1000,
                   wrapper_class=TimeLimit, 
                   env_kwargs={'mu':0., "sigma":0.5})

In [None]:
# Same as before we instantiate the agent along with the environment
from stable_baselines import A2C

# 'MlpPolicy' selects the policy by name; TensorBoard logs are written under ./log.
a2c_model = A2C('MlpPolicy', env, tensorboard_log='./log', verbose=1)

In [16]:
# Train the agent for 1,000,000 timesteps, printing training stats every 100 updates
a2c_model.learn(total_timesteps=1000000, log_interval=100)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
---------------------------------
| ep_len_mean        | 23.2     |
| ep_reward_mean     | 23.2     |
| explained_variance | 0.0288   |
| fps                | 2030     |
| nupdates           | 100      |
| policy_entropy     | 0.693    |
| total_timesteps    | 2000     |
| value_loss         | 9.86     |
---------------------------------
---------------------------------
| ep_len_mean        | 23.6     |
| ep_reward_mean     | 23.6     |
| explained_variance | -0.043   |
| fps                | 2290     |
| nupdates           | 200      |
| policy_entropy     | 0.693    |
| total_timesteps    | 4000     |
| value_loss         | 7.5      |
---------------------------------
---------------------------------
| ep_len_mean        | 22.5     |
| ep_reward_mean     | 22.5     |
| explained_variance | 0.152    |
| fps                | 2410     |
| nupdates           | 300      |
| policy_entropy     | 0.693    |
| total_timesteps

<stable_baselines.a2c.a2c.A2C at 0x7fe73aa35f28>

# Evaluation

In [13]:
def evaluate(model, env, num_episodes=10):
    """
    Run a model for a number of episodes and report the average return.

    :param model: (BaseRLModel object) the RL Agent; must provide
        ``predict(obs, deterministic=True)``
    :param env: environment to roll out in; its ``reset``/``step`` results are
        used as returned (with a vectorized env they are arrays)
    :param num_episodes: (int) number of episodes to evaluate it
    :return: (float) Mean reward over the num_episodes episodes
    """
    # Only works for a single environment: the loop below ends an episode on
    # the truth value of `done`.
    returns_per_episode = []
    for _ in range(num_episodes):
        observation = env.reset()  # e.g. [[ 0.04632042, -0.01863058, -0.03169358,  0.00881829]]
        episode_return = 0
        while True:
            # The second value returned by predict is only useful with LSTM policies
            action, _ = model.predict(observation, deterministic=True)  # e.g.: [0], None
            # With a vectorized env, reward and finished are arrays
            observation, reward, finished, _ = env.step(action)
            episode_return = episode_return + reward
            if finished:
                break
        returns_per_episode.append(episode_return)

    mean_episode_reward = np.mean(returns_per_episode)
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)

    return mean_episode_reward

In [20]:
# Evaluate a2c_model with the custom evaluate() helper on a single-env VecEnv.
# NOTE(review): this cell was labelled "Random Agent, before training", but it
# is placed after the training cell and its recorded output is a mean reward of
# 500.0 -- confirm whether a2c_model is already trained when this runs.
from stable_baselines.common import make_vec_env

eval_env = make_vec_env(CartPoleEnv_stochastic, 
                   n_envs=1, 
                   seed=1000,
                   wrapper_class=TimeLimit, 
                   env_kwargs={'mu':0., "sigma":0.5}) 
mean_reward_before_train = evaluate(a2c_model, eval_env, num_episodes=10)

Mean reward: 500.0 Num episodes: 10


In [21]:
# Repeat the evaluation with the library's evaluate_policy helper, on a newly
# built single-env VecEnv with the same noise settings (mu=0, sigma=0.5).
from stable_baselines.common.evaluation import evaluate_policy
eval_env = make_vec_env(CartPoleEnv_stochastic, 
                   n_envs=1, 
                   seed=1000,
                   wrapper_class=TimeLimit, 
                   env_kwargs={'mu':0., "sigma":0.5})

mean_reward, std_reward = evaluate_policy(a2c_model, eval_env, n_eval_episodes=10)

print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

mean_reward:500.00 +/- 0.00


# Different Dynamics Test

In [22]:
# Keep mu fixed at 0.0 and evaluate a2c_model under several force-noise levels
# (sigma). A new single-env VecEnv is built for every sigma value.
sigmas = [0.0, 1.0, 3.0, 5.0, 10.0]
for sigma in sigmas :
  eval_env = make_vec_env(CartPoleEnv_stochastic, 
                   n_envs=1, 
                   seed=1000,
                   wrapper_class=TimeLimit, 
                   env_kwargs={'mu':0., "sigma":sigma})
  
  mean_reward, std_reward = evaluate_policy(a2c_model, eval_env, n_eval_episodes=10)

  print(f"sigma:{sigma:.1f} mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")

sigma:0.0 mean_reward:500.00 +/- 0.00
sigma:1.0 mean_reward:500.00 +/- 0.00
sigma:3.0 mean_reward:500.00 +/- 0.00
sigma:5.0 mean_reward:500.00 +/- 0.00
sigma:10.0 mean_reward:452.30 +/- 116.43


# Rendering

### Prepare video recording

In [23]:
# Set up fake display; otherwise rendering will fail
import os
# Launch Xvfb in the background (trailing '&') as display :1 with a 1024x768x24 screen.
os.system("Xvfb :1 -screen 0 1024x768x24 &")
# Point this process at that display.
os.environ['DISPLAY'] = ':1'

In [24]:
import base64
from pathlib import Path

from IPython import display as ipythondisplay

def show_videos(video_path='', prefix=''):
  """
  Embed every matching .mp4 file of a folder inline in the notebook output.

  Taken from https://github.com/eleurent/highway-env

  :param video_path: (str) Path to the folder containing videos
  :param prefix: (str) Filter the videos, showing only those whose file name
      starts with this prefix
  """
  video_tag = '''<video alt="{}" autoplay 
                    loop controls style="height: 400px;">
                    <source src="data:video/mp4;base64,{}" type="video/mp4" />
                </video>'''
  pattern = "{}*.mp4".format(prefix)
  # Each file is base64-encoded and embedded as a data: URI in its own <video> tag.
  tags = [
      video_tag.format(clip, base64.b64encode(clip.read_bytes()).decode('ascii'))
      for clip in Path(video_path).glob(pattern)
  ]
  ipythondisplay.display(ipythondisplay.HTML(data="<br>".join(tags)))

We will record a video using the [VecVideoRecorder](https://stable-baselines.readthedocs.io/en/master/guide/vec_envs.html#vecvideorecorder) wrapper. You will learn about these wrappers in the next notebook.

In [25]:
from stable_baselines.common.vec_env import VecVideoRecorder, DummyVecEnv

def record_video(env, model, video_length=500, prefix='', video_folder='videos/'):
  """
  Roll out `model` in `env` for a fixed number of steps while recording a video.

  :param env: environment instance to record; it is wrapped in a DummyVecEnv here
  :param model: (RL model) must provide ``predict(obs, deterministic=True)``
  :param video_length: (int) number of steps to run and record
  :param prefix: (str) name prefix handed to the video recorder
  :param video_folder: (str) output folder handed to the video recorder
  """
  eval_env = DummyVecEnv([lambda: env])
  # Start the video at step=0 and record video_length steps
  eval_env = VecVideoRecorder(eval_env, video_folder=video_folder,
                              record_video_trigger=lambda step: step == 0, video_length=video_length,
                              name_prefix=prefix)

  try:
    obs = eval_env.reset()
    for _ in range(video_length):
      action, _states = model.predict(obs, deterministic=True)
      obs, _reward, _done, _info = eval_env.step(action)
  finally:
    # Close the video recorder even if the rollout raises, so it is not left open.
    eval_env.close()

### Visualize trained agent



In [28]:
# Record a2c_model in a noise-free environment (mu=0, sigma=0), capped at
# 500 steps per episode; the video covers 500 steps.
show_env = CartPoleEnv_stochastic(mu=0.0, sigma=0.0)
show_env = gym.wrappers.TimeLimit(show_env, max_episode_steps=500)

record_video(show_env, a2c_model, video_length=500, prefix='a2c-cartpole')

Saving video to  /content/videos/a2c-cartpole-step-0-to-step-500.mp4


In [29]:
show_videos('videos', prefix='a2c')

In [144]:
!rm /content/videos/a2c-cartpole-step-0-to-step-*