This code is from the following youtube video:

https://youtu.be/Mut_u40Sqz4?si=z0aF0rxUl36XG6dc

In [None]:
# https://stable-baselines3.readthedocs.io/en/master/guide/rl.html
# https://spinningup.openai.com/en/latest/spinningup/rl_intro2.html#a-taxonomy-of-rl-algorithms

# 1. Import dependencies

In [6]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

# 2. Load Environment

In [5]:
# Gymnasium environment ID passed to every gym.make(...) call in this notebook.
environment_name = "CartPole-v0"

In [3]:
# Build the environment with render_mode="human" for the random-agent demo below.
env = gym.make(environment_name, render_mode="human")

In [5]:
# Run a few episodes with a purely random agent to see the environment in action.
episodes = 5

for episode in range(1, episodes + 1):
    # Gymnasium's reset() returns (observation, info); only the observation is kept.
    state, _ = env.reset()
    done = False
    score = 0

    while not done:

        # display the environment
        env.render()

        # take a random action from the action space
        action = env.action_space.sample()

        # Gymnasium's step() returns (observation, reward, terminated, truncated, info)
        observation, reward, terminated, truncated, info = env.step(action)

        # The episode is over when the task itself ended (terminated) or when it
        # was cut short, e.g. by a time limit (truncated). Checking only one of
        # the two flags keeps stepping an episode that has already finished.
        done = terminated or truncated

        # update the score
        score += reward

    print('Episode:{} Score:{}'.format(episode, score))

env.close()

  logger.warn(


Episode:1 Score:17.0
Episode:2 Score:21.0
Episode:3 Score:12.0
Episode:4 Score:16.0
Episode:5 Score:25.0


# Understanding The Environment
https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

In [6]:
# Draw one random action from the action space.
# 0 = push the cart to the left, 1 = push the cart to the right
env.action_space.sample()

1

In [7]:
# Draw one random point from the observation space.
# Layout: [cart position, cart velocity, pole angle, pole angular velocity]
# The sample comes from the space itself, not from a played episode, so the
# velocity entries can be extremely large (see the output below).
env.observation_space.sample()

array([-1.7632186e+00,  1.7292971e+38,  3.0424047e-01,  3.1315310e+38],
      dtype=float32)

# 3. Train an RL Model

In [8]:
import os

In [9]:
# Directory handed to the models as tensorboard_log (Training/Logs).
# The original note for this cell: make the directories first.
log_path = os.path.join('Training', 'Logs')

In [10]:
# Wrap the environment in a vectorized environment.
# The factory builds the environment itself instead of closing over the global
# `env`, which this very statement rebinds to the wrapper.
env = DummyVecEnv([lambda: gym.make(environment_name)])

# defining the model
# 'MlpPolicy' selects a fully connected (multi-layer perceptron) actor-critic
# network. According to the Stable-Baselines3 documentation, the default for PPO
# is two hidden layers of 64 units each, not a single hidden layer.
# verbose=1 prints training info; tensorboard_log is where run logs are written.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cpu device


In [11]:
# train the model for 20,000 timesteps
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_3
-----------------------------
| time/              |      |
|    fps             | 7028 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
---------------------------------------
| time/                   |           |
|    fps                  | 4090      |
|    iterations           | 2         |
|    time_elapsed         | 1         |
|    total_timesteps      | 4096      |
| train/                  |           |
|    approx_kl            | 0.0087725 |
|    clip_fraction        | 0.0967    |
|    clip_range           | 0.2       |
|    entropy_loss         | -0.686    |
|    explained_variance   | -0.000872 |
|    learning_rate        | 0.0003    |
|    loss                 | 7.61      |
|    n_updates            | 10        |
|    policy_gradient_loss | -0.0169   |
|    value_loss           | 51.3      |
---------------------------------------
---------------------------------------

<stable_baselines3.ppo.ppo.PPO at 0x312ad7230>

# 4. Save and Reload Model

In [12]:
# Path the trained PPO model is saved to and later reloaded from.
PPO_path = os.path.join('Training', 'Models', 'PPO_model')

In [13]:
# Persist the trained model at PPO_path.
model.save(PPO_path)

In [14]:
# Drop the in-memory model so that the following load really comes from disk.
del model

In [15]:
# Reload the saved model and attach the current environment to it.
model = PPO.load(PPO_path, env = env)

# 5. Evaluation

In [21]:
from stable_baselines3.common.evaluation import evaluate_policy

In [17]:
# in this case the problem is considered solved if the average reward is greater than or equal to 200
# Runs 10 evaluation episodes without rendering; the displayed result is a pair
# (presumably mean reward and its standard deviation - see the SB3 evaluation docs).

evaluate_policy(model, env, n_eval_episodes=10, render=False)



(253.9, 114.7287670987534)

In [None]:
# NOTE(review): the next section calls env.reset() on this same `env` after it
# has been closed - confirm that closing here is intended.
env.close()

# 6. Test Model

In [18]:
# Let the trained model play a few episodes and report the score of each one.
episodes = 5

for episode in range(1, episodes + 1):

    # reset the environment --> get the initial (batched) observation
    obs = env.reset()
    done = False
    score = 0.0

    while not done:
        # second return value of predict() is the recurrent state, unused here
        action, _ = model.predict(obs)

        # `env` is a vectorized environment: rewards and dones hold one entry per
        # wrapped environment. Exactly one environment is wrapped, so take index 0
        # to work with plain scalars instead of length-1 arrays.
        obs, rewards, dones, info = env.step(action)
        score += float(rewards[0])
        done = bool(dones[0])

    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:[410.]
Episode:2 Score:[185.]
Episode:3 Score:[203.]
Episode:4 Score:[165.]
Episode:5 Score:[128.]


In [None]:
env.close()

# 7. Viewing Logs in Tensorboard

In [19]:
# Run directory to open in TensorBoard.
# NOTE(review): the run name is hard-coded to 'PPO_1', while the training output
# above reports logging to Training/Logs/PPO_3 - confirm which run should be shown.
training_log_path = os.path.join(log_path, 'PPO_1')

: 

In [None]:
# Start TensorBoard on the run directory stored in training_log_path.
!tensorboard --logdir={training_log_path}

You can also launch TensorBoard from the terminal.

The simplest way is to change into the directory where the logs are saved and run the following command: 'tensorboard --logdir=.'

# 8. Adding a callback to the training Stage

In [3]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

import os

In [4]:
# save_path: passed to EvalCallback as best_model_save_path.
# log_path: passed to the models as tensorboard_log.
save_path = os.path.join('Training', 'Models')
log_path = os.path.join('Training', 'Logs')

In [7]:
# Build a fresh environment inside the factory and wrap it in a vectorized
# environment; `env` ends up bound to the DummyVecEnv wrapper.
env = DummyVecEnv([lambda: gym.make(environment_name)])

  logger.deprecation(


In [8]:
# Stops training once the evaluated mean reward reaches the threshold (190 here);
# verbose=1 gives some additional logging.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=190, verbose=1)

# Evaluate on a dedicated environment rather than on the training `env`:
# evaluation resets and steps the environment it is given, which would interrupt
# whatever episode the training rollout is currently in the middle of.
eval_env = DummyVecEnv([lambda: gym.make(environment_name)])

# Evaluates the model every 10'000 callback steps, saves the best model so far
# under save_path, and runs stop_callback whenever a new best mean reward is found.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,  # reward-threshold check on each new best
                             eval_freq=10000,  # evaluate every 10'000 steps
                             best_model_save_path=save_path,
                             verbose=1)

In [9]:
# Fresh PPO model for the callback-driven training run; logs go to log_path.
model = PPO('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

Using cpu device


In [10]:
# Train for up to 20000 timesteps with eval_callback attached.
model.learn(total_timesteps=20000, callback = eval_callback)

Logging to Training/Logs/PPO_4
-----------------------------
| time/              |      |
|    fps             | 6581 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 3842        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008059412 |
|    clip_fraction        | 0.0877      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.687      |
|    explained_variance   | 0.00291     |
|    learning_rate        | 0.0003      |
|    loss                 | 7.66        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0133     |
|    value_loss           | 52          |
-----------------------------------------
---



-----------------------------------------
| time/                   |             |
|    fps                  | 3081        |
|    iterations           | 6           |
|    time_elapsed         | 3           |
|    total_timesteps      | 12288       |
| train/                  |             |
|    approx_kl            | 0.006490647 |
|    clip_fraction        | 0.033       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.602      |
|    explained_variance   | 0.232       |
|    learning_rate        | 0.0003      |
|    loss                 | 16.8        |
|    n_updates            | 50          |
|    policy_gradient_loss | -0.00993    |
|    value_loss           | 66          |
-----------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 3039         |
|    iterations           | 7            |
|    time_elapsed         | 4            |
|    total_timesteps      | 1

<stable_baselines3.ppo.ppo.PPO at 0x3087879b0>

In [None]:
# The evaluation callback was configured with best_model_save_path=save_path
# (Training/Models), so the best checkpoint lives there as 'best_model'.
# The previous 'Saved Models' directory is never written by this notebook.
model_path = os.path.join(save_path, 'best_model')
model = PPO.load(model_path, env=env)

In [None]:
# Evaluate the reloaded best model over 10 episodes with rendering requested.
# NOTE(review): `env` was created via gym.make(environment_name) without a
# render_mode - confirm that render=True actually displays anything here.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
# NOTE(review): `env` is passed to new models in the sections below after being
# closed here - confirm that closing at this point is intended.
env.close()

# 9. Changing Policies

In [11]:
# defining the model --> architecture for a neural network
# 4 inputs, 2 outputs, 4 hidden layers with 128 neurons each
# pi = policy network, vf = value network
# The dict is passed directly: wrapping it in a list (net_arch=[dict(...)]) is the
# deprecated form from before Stable-Baselines3 removed shared layers, and it
# triggers a warning in current releases.

net_arch = dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])

In [12]:
# PPO model whose policy/value networks use the custom net_arch defined above.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)

Using cpu device




In [13]:
# Train the custom-architecture model for up to 20000 timesteps.
# NOTE(review): eval_callback is the same instance used for the previous PPO run;
# confirm that any state it carries over (e.g. the best score so far) is intended.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/PPO_5
-----------------------------
| time/              |      |
|    fps             | 4692 |
|    iterations      | 1    |
|    time_elapsed    | 0    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1509        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013984086 |
|    clip_fraction        | 0.199       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.00285    |
|    learning_rate        | 0.0003      |
|    loss                 | 4.2         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0243     |
|    value_loss           | 21.1        |
-----------------------------------------
---



-----------------------------------------
| time/                   |             |
|    fps                  | 1035        |
|    iterations           | 6           |
|    time_elapsed         | 11          |
|    total_timesteps      | 12288       |
| train/                  |             |
|    approx_kl            | 0.012049798 |
|    clip_fraction        | 0.0843      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.522      |
|    explained_variance   | 0.438       |
|    learning_rate        | 0.0003      |
|    loss                 | 12.1        |
|    n_updates            | 50          |
|    policy_gradient_loss | -0.00994    |
|    value_loss           | 36.3        |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 1016        |
|    iterations           | 7           |
|    time_elapsed         | 14          |
|    total_timesteps      | 14336 

<stable_baselines3.ppo.ppo.PPO at 0x318747da0>

# 10. Using an Alternate Algorithm

In [14]:
from stable_baselines3 import DQN

In [15]:
# Same setup as before, but with the DQN algorithm instead of PPO.
model = DQN('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

Using cpu device


In [16]:
# Train the DQN model for up to 20000 timesteps.
# NOTE(review): this reuses the eval_callback instance from the PPO runs, which
# saves its best model under save_path - confirm this sharing is intended.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.952    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 10440    |
|    time_elapsed     | 0        |
|    total_timesteps  | 101      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.914    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 2881     |
|    time_elapsed     | 0        |
|    total_timesteps  | 180      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.597    |
|    n_updates        | 19       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.888    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 

<stable_baselines3.dqn.dqn.DQN at 0x3187a8e90>

In [17]:
# Path the trained DQN model is saved to and later reloaded from.
dqn_path = os.path.join('Training', 'Models', 'DQN_model')

In [18]:
# Persist the trained DQN model at dqn_path.
model.save(dqn_path)

In [19]:
# Reload the saved DQN model and attach the current environment to it.
model = DQN.load(dqn_path, env=env)

In [22]:
# Evaluate the reloaded DQN model over 10 episodes without rendering.
evaluate_policy(model, env, n_eval_episodes=10, render=False)



(17.2, 10.943491216243562)

In [None]:
# Final cleanup: close the environment at the end of the notebook.
env.close()