In [1]:
# https://stable-baselines3.readthedocs.io/en/master/guide/rl.html
# https://spinningup.openai.com/en/latest/spinningup/rl_intro2.html#a-taxonomy-of-rl-algorithms

# Import dependencies

In [5]:
import os
import gym 
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Make Environment

In [6]:
environment_name = "CartPole-v1"

In [7]:
env = gym.make(environment_name)

In [5]:
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

# Test env manualy

In [6]:
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    done = False
    score = 0
    
    while not done:
        env.render()
        action = env.action_space.sample()
        next_state, reward, done, info = env.step(action)
        score += reward
    print('Episode={}, Score={}'.format(episode, score))

# env.close()

Episode=1, Score=14.0
Episode=2, Score=36.0
Episode=3, Score=15.0
Episode=4, Score=11.0
Episode=5, Score=31.0


In [7]:
env.close()

# Train an RL Model

In [11]:
log_path = os.path.join('training', 'logs')

env = gym.make(environment_name)
env = DummyVecEnv([lambda: env]) # type: ignore
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

Using cpu device


In [18]:
model.learn(total_timesteps=20000)

Logging to training\logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 2079 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 1441      |
|    iterations           | 2         |
|    time_elapsed         | 2         |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 0.0094273 |
|    clip_fraction        | 0.11      |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.686    |
|    explained_variance   | 0.00184   |
|    learning_rate        | 0.0003    |
|    loss                 | 6.28      |
|    n_updates            | 10        |
|    policy_gradient_loss | -0.0159   |
|    value_loss           | 52        |
---------------------------------------
---------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x1aa32887eb0>

# Save and Reload Model

In [11]:
PPO_path = os.path.join('training', 'saved_models', 'PPO_model')
model.save(PPO_path)



In [12]:
del model

model = PPO.load(PPO_path, env=env)

# Evaluation Model

In [13]:
from stable_baselines3.common.evaluation import evaluate_policy

evaluate_policy(model, env, n_eval_episodes=10, render=True)



(500.0, 0.0)

In [14]:
env.close()

# Test Model

In [15]:
obs = env.reset()
while True:
    action, _ = model.predict(obs) # type: ignore
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        print('info', info)
        break

info [{'TimeLimit.truncated': True, 'terminal_observation': array([-0.64307225, -0.697868  ,  0.01868926,  0.36466566], dtype=float32)}]


In [16]:
env.close()

# Viewing Logs in Tensorboard

In [24]:
training_log_path = os.path.join(log_path, 'PPO_1')

In [25]:
!tensorboard --logdir={training_log_path}

^C


TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.13.0 at http://localhost:6006/ (Press CTRL+C to quit)


# Adding a callback to the training stage

In [3]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [8]:
save_path = os.path.join('training', 'saved_models')

In [9]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
eval_callback = EvalCallback(env, 
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [12]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [13]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to training\logs\PPO_4
-----------------------------
| time/              |      |
|    fps             | 2111 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1466        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008203074 |
|    clip_fraction        | 0.0873      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | -0.0058     |
|    learning_rate        | 0.0003      |
|    loss                 | 6.84        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0132     |
|    value_loss           | 52.5        |
-----------------------------------------
---



Eval num_timesteps=10000, episode_reward=373.20 +/- 109.85
Episode length: 373.20 +/- 109.85
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 373         |
|    mean_reward          | 373         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.010204764 |
|    clip_fraction        | 0.082       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.61       |
|    explained_variance   | 0.282       |
|    learning_rate        | 0.0003      |
|    loss                 | 24.2        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0167     |
|    value_loss           | 71.4        |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 373.20  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x21dc9e784f0>

# Changing Policies

In [14]:
net_arch=[dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])]

In [15]:
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch': net_arch})

Using cpu device




In [16]:
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to training\logs\PPO_4
-----------------------------
| time/              |      |
|    fps             | 1781 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1085        |
|    iterations           | 2           |
|    time_elapsed         | 3           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013655902 |
|    clip_fraction        | 0.183       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.00312    |
|    learning_rate        | 0.0003      |
|    loss                 | 3.42        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0237     |
|    value_loss           | 19.8        |
-----------------------------------------
---



Eval num_timesteps=10000, episode_reward=267.20 +/- 78.73
Episode length: 267.20 +/- 78.73
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 267        |
|    mean_reward          | 267        |
| time/                   |            |
|    total_timesteps      | 10000      |
| train/                  |            |
|    approx_kl            | 0.00894175 |
|    clip_fraction        | 0.109      |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.566     |
|    explained_variance   | 0.639      |
|    learning_rate        | 0.0003     |
|    loss                 | 15.1       |
|    n_updates            | 40         |
|    policy_gradient_loss | -0.017     |
|    value_loss           | 37.5       |
----------------------------------------
------------------------------
| time/              |       |
|    fps             | 842   |
|    iterations      | 5     |
|    time_elapsed    | 12    |
|    total_times

<stable_baselines3.ppo.ppo.PPO at 0x21dc9e7bdf0>

In [19]:
obs = env.reset()
times = 0
score = 0
while True:
    action, _ = model.predict(obs) # type: ignore
    obs, reward, done, info = env.step(action)
    score += reward
    env.render()
    if done:
        print('info', info)
        print('score', score)
        break
    times += 1
    if times > 50000:
        print('time out')
        print('score', score)
        break
env.close()

info [{'TimeLimit.truncated': True, 'terminal_observation': array([-0.55599135, -0.00442678, -0.03060992, -0.36950326], dtype=float32)}]
score [500.]
