1) Imoprt Dependencies

pip install stable-baselines3[extra]

https://stable-baselines3.readthedocs.io/en/master/

In [1]:
import os                                                           # operating system library
import gymnasium as gym                                             # for openAI gymnasium (which has replaced openai gym)
from stable_baselines3 import PPO                                   # algorithm
# Action Spaces: Algorithms
#   Discrete Single Process: DQN
#   Discrete Multi Professed: PPO or A2C
#   Continuous Single Process: SAC or TD3
#   Continuous Multi Processed: PPO or A2C

from stable_baselines3.common.vec_env import DummyVecEnv            # vectorizing environment allows train on multiple environments at the same time (this doesn't do that)
from stable_baselines3.common.evaluation import evaluate_policy     # mean and stdev of reward

# CALLBACKS
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

2) Load Environment

In [9]:
# Create environment name
environment_name = 'CartPole-v1'

In [10]:
# create the environment
env = gym.make(environment_name,render_mode = 'human')

print(environment_name)


CartPole-v1


Understanding the Environment

In [6]:
# two different spaces:
# action space (actions we can do)
print(env.action_space)
print(env.action_space.sample())

# and observations (things we observe)
print(env.observation_space)
print(env.observation_space.sample())

state = env.reset()
print(env.step(1))

Discrete(2)
0
Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
[ 2.9300828e+00  1.5114768e+38  1.5634502e-01 -2.3155817e+38]
(array([ 0.04802989,  0.19765113, -0.04387073, -0.3552087 ], dtype=float32), 1.0, False, False, {})


In [7]:
env.close()

In [11]:
# loop for testing environment
episodes = 5
for episode in range(1,episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        # make the render
        env.render()

        # select the next action
        action = env.action_space.sample()

        # take a step using the action and return the new state, reward, is the episode done? x2, info
        n_state, reward, terminated, truncated, info = env.step(action)
        done = truncated or terminated

        

        # aggregate the reward
        score += reward

    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:13.0
Episode:2 Score:16.0
Episode:3 Score:13.0
Episode:4 Score:32.0
Episode:5 Score:56.0


3) Train an RL Model

In [19]:
# where we save our tensorboard log
log_path = os.path.join('Training','Logs')

# recreate the environment
environment_name = 'CartPole-v1'
env = gym.make(environment_name)

# wrap environ in dummy vec environment (wrapper for non-vectorized environment)
env = DummyVecEnv([lambda: env])

# create the model: multilayer neural network policy (the rules teh agent uses to determine its actions)
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

# train model (including number of timesteps to train model for)
model.learn(total_timesteps=20000)

Using cuda device
Logging to Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 929  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 774          |
|    iterations           | 2            |
|    time_elapsed         | 5            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0064420556 |
|    clip_fraction        | 0.083        |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.687       |
|    explained_variance   | -0.00295     |
|    learning_rate        | 0.0003       |
|    loss                 | 7.72         |
|    n_updates            | 10           |
|    policy_gradient_loss | -0.0115      |
|    value_loss           | 54.8         |
----------

<stable_baselines3.ppo.ppo.PPO at 0x1c4e5897f10>

4) Save and Reload Model

In [20]:
# path to save model to
PPO_Path = os.path.join('Training','Saved Models','PPO_Model_Cartpole')

In [21]:
# test saving an deleting the model
model.save(PPO_Path)
del model
model = PPO.load(PPO_Path,env=env)

5) Evaluation

In [13]:
# score of 200 or higher is generally considered solved for PPO

# load model
env = gym.make(environment_name,render_mode = 'human')
model = PPO.load(PPO_Path,env=env)

# returns mean and std of rewards
evaluate_policy(model,env,n_eval_episodes=10,render=True)
env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




6) Test Model

In [22]:
# take observations, pass them to agent, agent will determine action using learned policy
# - in this example the agent is the model object and the environment is the env object
# - the actions and the observations/rewards are things that are returned from the agent and envirnment respectively
# - there are a variety of methods that can be used to return an action or an observation/rewards

# load model
env = gym.make(environment_name,render_mode = 'human')
model = PPO.load(PPO_Path,env=env)

# loop for testing environment
episodes = 5
for episode in range(1,episodes+1):
    obs = env.reset()[0]
    done = False
    score = 0

    while not done:
        # make the render
        env.render()

        # NOW USING MODEL HERE!!
        action, _ = model.predict(obs)

        # take a step using the action and return the new state, reward, is the episode done? x2, info
        obs, reward, terminated, truncated, info = env.step(action)
        done = truncated or terminated

        # aggregate the reward
        score += reward

    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Episode:1 Score:403.0
Episode:2 Score:292.0
Episode:3 Score:500.0
Episode:4 Score:371.0
Episode:5 Score:375.0


7. Viewing Logs in Tensorboard

Ideally want to do this in terminal/cmd line so that we don't bog down the notebook
Steps:
1) specify path to folder with log

    ipynb: training_log_path = os.path.join(log_path,PPO_2)
    
    cmd: ./Training/Logs/PPO_2
2) launch tensorboard

    ipynb: !tensorboard --logdir training_log_path

    cmd: tensorboard --logdir ./Training/Logs/PPO_#





8. Adding a Callback to the Training Stage

    Now we will apply callbacks (we will stop training once we reach a desired performance, etc. and save the model at our desired condition)

In [24]:
save_path = os.path.join('Training','Saved Models')
env = gym.make(environment_name,render_mode = 'human')
log_path = os.path.join('Training','Logs')

# this is the callback that will stop our training once we pass a certain reward threshold (the average reward we want to stop on and verbose to get some additional logging)
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)  
# the callback that will be triggered after each training run
# pass the:
#   - environment
#   - the callback we want to run on the new best model: every time ther eis a new bet model it will run the stop callback and check the reward threshold
#   - how frequently we want to run the eval callback
#   - path to the model
# basically every 10000 runs it will check the reward threshold, if it's above it will stop the training and save the model
eval_callback = EvalCallback(env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

# need to associate these callbacks with our model              
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

# now will pass callback in when we train our model
model.learn(total_timesteps=20000, callback=eval_callback)



Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23       |
|    ep_rew_mean     | 23       |
| time/              |          |
|    fps             | 46       |
|    iterations      | 1        |
|    time_elapsed    | 43       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 29.8        |
|    ep_rew_mean          | 29.8        |
| time/                   |             |
|    fps                  | 45          |
|    iterations           | 2           |
|    time_elapsed         | 89          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009020019 |
|    clip_fraction        | 0.112       |
|    clip_range           | 0.



Eval num_timesteps=10000, episode_reward=432.80 +/- 88.17
Episode length: 432.80 +/- 88.17
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 433         |
|    mean_reward          | 433         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.011458362 |
|    clip_fraction        | 0.103       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.602      |
|    explained_variance   | 0.251       |
|    learning_rate        | 0.0003      |
|    loss                 | 33.1        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0188     |
|    value_loss           | 72          |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 432.80  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x1c4e5872090>

9. Changing Policies

    We can change our model architecture if we want

In [3]:
save_path = os.path.join('Training','Saved Models')
environment_name = 'CartPole-v1'
env = gym.make(environment_name,render_mode = 'human')
log_path = os.path.join('Training','Logs')

# this is the callback that will stop our training once we pass a certain reward threshold (the average reward we want to stop on and verbose to get some additional logging)
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)  
# the callback that will be triggered after each training run
# pass the:
#   - environment
#   - the callback we want to run on the new best model: every time ther eis a new bet model it will run the stop callback and check the reward threshold
#   - how frequently we want to run the eval callback
#   - path to the model
# basically every 10000 runs it will check the reward threshold, if it's above it will stop the training and save the model
eval_callback = EvalCallback(env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

# specify a custom neural network architecture
net_arch = [dict(pi=[128,128,128,128], vf=[128,128,128,128])]

# declare model and train with new model
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path, policy_kwargs={'net_arch':net_arch})
model.learn(total_timesteps=20000, callback=eval_callback)

env.close()

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Logging to Training\Logs\PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.6     |
|    ep_rew_mean     | 22.6     |
| time/              |          |
|    fps             | 42       |
|    iterations      | 1        |
|    time_elapsed    | 47       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 29.4        |
|    ep_rew_mean          | 29.4        |
| time/                   |             |
|    fps                  | 43          |
|    iterations           | 2           |
|    time_elapsed         | 93          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014564334 |
|    clip_fraction        | 0.235       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | 0.00396     |



Eval num_timesteps=10000, episode_reward=459.80 +/- 50.46
Episode length: 459.80 +/- 50.46
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 460         |
|    mean_reward          | 460         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.009463313 |
|    clip_fraction        | 0.139       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.584      |
|    explained_variance   | 0.469       |
|    learning_rate        | 0.0003      |
|    loss                 | 16.3        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.02       |
|    value_loss           | 44.4        |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 459.80  is above the threshold 200


<stable_baselines3.ppo.ppo.PPO at 0x21d85e95690>

10. Using an Alternate Algorithm

In [None]:
# import DQN
from stable_baselines3 import DQN

# create the model
model = DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

# train the model
model.learn(total_timesteps=20000)

# to load the model use DQN.load instead of PPO.load
