In [1]:
# https://stable-baselines3.readthedocs.io/en/master/guide/rl.html
# https://spinningup.openai.com/en/latest/spinningup/rl_intro2.html#a-taxonomy-of-rl-algorithms

# Import dependencies

In [2]:
import os
import gym 
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

# Make Environment

In [3]:
# Gym environment id passed to the gym.make() calls in this notebook.
environment_name = "CartPole-v1"

In [4]:
# Instantiate the plain (not yet vectorized) environment for manual inspection.
env = gym.make(environment_name)

In [5]:
# Display the observation space as the cell output (a 4-element float32 Box here).
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

# Test env manually

In [6]:
# Roll out a few episodes with randomly sampled actions to sanity-check the
# environment before any training.
episodes = 5
for episode in range(1, episodes + 1):
    env.reset()  # the initial observation is not needed: actions are random
    done = False
    score = 0

    while not done:
        env.render()
        action = env.action_space.sample()
        # Only reward and done drive this loop; observation and info are discarded.
        _obs, reward, done, _info = env.step(action)
        score += reward
    print('Episode={}, Score={}'.format(episode, score))

# env.close()

Episode=1, Score=14.0
Episode=2, Score=36.0
Episode=3, Score=15.0
Episode=4, Score=11.0
Episode=5, Score=31.0


In [7]:
# Close the environment that was used for the manual test.
env.close()

# Train an RL Model

In [17]:
# TensorBoard logs for every training run are written below this directory.
log_path = os.path.join('training', 'logs')

# DummyVecEnv takes a list of zero-argument factories. Creating the env inside
# the factory avoids a closure over the name `env`, which is rebound by this
# very statement.
env = DummyVecEnv([lambda: gym.make(environment_name)]) # type: ignore
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

Using cpu device


In [18]:
# Train for 20k environment steps; the run is logged under log_path (see tensorboard_log).
model.learn(total_timesteps=20000)

Logging to training\logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 2079 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 1441      |
|    iterations           | 2         |
|    time_elapsed         | 2         |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 0.0094273 |
|    clip_fraction        | 0.11      |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.686    |
|    explained_variance   | 0.00184   |
|    learning_rate        | 0.0003    |
|    loss                 | 6.28      |
|    n_updates            | 10        |
|    policy_gradient_loss | -0.0159   |
|    value_loss           | 52        |
---------------------------------------
---------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x1aa32887eb0>

# Save and Reload Model

In [11]:
# Persist the trained model inside the training/saved_models directory.
PPO_path = os.path.join('training', 'saved_models', 'PPO_model')
model.save(PPO_path)



In [12]:
# Drop the in-memory model so the load below is what actually provides `model`.
del model

# Reload the saved model from PPO_path and attach the existing vectorized env to it.
model = PPO.load(PPO_path, env=env)

# Evaluate the Model

In [13]:
# evaluate_policy is already imported in the dependencies cell at the top of the
# notebook. The call's result is shown as the cell output; per the SB3 docs it
# is the (mean, std) of the episode reward over the 10 evaluation episodes.
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(500.0, 0.0)

In [14]:
# Close the environment after the rendered evaluation.
# NOTE(review): the next cell calls env.reset() on this same env object after
# close() — confirm that reusing it after closing is intended.
env.close()

# Test Model

In [15]:
# Run the trained policy until the episode ends, then print the final info.
obs = env.reset()
while True:
    action, _ = model.predict(obs) # type: ignore
    # env is a DummyVecEnv, so step() returns batched values (one entry per sub-env).
    obs, reward, done, info = env.step(action)
    env.render()
    # Reduce the batched `done` explicitly instead of relying on the truthiness
    # of a one-element array, which raises once there is more than one sub-env.
    if done.any():
        print('info', info)
        break

info [{'TimeLimit.truncated': True, 'terminal_observation': array([-0.64307225, -0.697868  ,  0.01868926,  0.36466566], dtype=float32)}]


In [16]:
# Close the environment now that the rollout has finished.
env.close()

# Viewing Logs in Tensorboard

In [24]:
# NOTE(review): the run directory name is hard-coded. The first training run in
# this notebook logged to PPO_1, but a later run logs to PPO_3, so this may not
# point at the most recent run — confirm which run should be displayed.
training_log_path = os.path.join(log_path, 'PPO_1')

In [25]:
# Launch TensorBoard on the selected run directory. The recorded output shows the
# server being stopped with an interrupt (^C), so this cell appears to block
# while TensorBoard is serving.
!tensorboard --logdir={training_log_path}

^C


TensorFlow installation not found - running with reduced feature set.
Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.13.0 at http://localhost:6006/ (Press CTRL+C to quit)


# Adding a callback to the training stage

In [26]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [34]:
# Directory handed to EvalCallback as best_model_save_path.
save_path = os.path.join('training', 'saved_models')

In [35]:
# Evaluate on a dedicated environment: running evaluation episodes on the
# training env would reset and step it in the middle of rollout collection.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)]) # type: ignore

# Stops training once the mean evaluation reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
# Periodically (every eval_freq callback steps) evaluates the agent, saves the
# best model under save_path, and invokes stop_callback on each new best result.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [36]:
# Create a fresh PPO agent for the callback-driven run (replaces the loaded model).
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [37]:
# Train with the evaluation callback attached; the callback may end training
# before total_timesteps if the reward threshold is reached.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to training\logs\PPO_3
-----------------------------
| time/              |      |
|    fps             | 1994 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1402        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008570656 |
|    clip_fraction        | 0.113       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00219     |
|    learning_rate        | 0.0003      |
|    loss                 | 6.53        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0172     |
|    value_loss           | 50          |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x1aa32886b90>