IMPORT DEPENDENCIES

In [1]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

LOAD ENV

In [2]:
env_name= "CartPole-v0"
env= gym.make(env_name, render_mode="human")

  logger.deprecation(


In [3]:
episodes = 5 #number of times the whole environment is run
for episode in range(1, episodes + 1):
    state= env.reset() # reset the environment to the initial state.... it will return the initial state value which is the first observation
    done = False # done is a boolean that indicates whether the episode has ended
    score = 0 # initialize the score for the episode
    
    while not done:
        env.render() # render   the environment to visualize the actions taken
        action= env.action_space.sample() # take a random action from the action space
        state, reward, terminated , done, info  = env.step(action)  # apply the action to the environment
        # state is the new state after taking the action
        # reward is the reward received for taking the action
        # terminated is a boolean that indicates whether the episode has ended due to a terminal state
        # done is a boolean that indicates whether the episode has ended
        # info is a dictionary that contains additional information about the environment
        score += reward 

    print(f"Episode {episode} Score: {score}")
env.close() # close the render frame 

  logger.warn(


Episode 1 Score: 17.0
Episode 2 Score: 11.0
Episode 3 Score: 17.0
Episode 4 Score: 44.0
Episode 5 Score: 20.0


In [43]:
env.action_space.sample()

1

In [109]:
env.step(1)

(array([ 1.0918093,  4.37141  , -1.7520808, -8.110683 ], dtype=float32),
 0.0,
 True,
 False,
 {})

UNDERSTANDING ENV

In [136]:
env.action_space.sample()

0

In [120]:
env.action_space

Discrete(2)

In [137]:
env.observation_space

Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)

In [138]:
env.observation_space.sample()

array([-0.45313802, -0.63500464, -0.2454992 , -0.98092854], dtype=float32)

TRAINING THE MODEL

In [3]:
log_path = os.path.join('Training', 'Logs')

In [158]:
log_path

'Training\\Logs'

In [None]:
env = gym.make(env_name, render_mode="human") #render_mode="human" is used to visualize the environment
env = DummyVecEnv([lambda: env])  # Wrap the environment in a DummyVec because Stable Baselines3 requires a vectorized environment for training
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)  # Create the PPO model with MLP policy which is a Multi-Layer Perceptron
# verbose=1 will print the training progress
# tensorboard_log=log_path will log the training progress to the specified path for visualization in TensorBoard

In [160]:
model.learn(total_timesteps=20000)  # Train the model for 20,000 timesteps

Logging to Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 45   |
|    iterations      | 1    |
|    time_elapsed    | 45   |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 45          |
|    iterations           | 2           |
|    time_elapsed         | 90          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008916764 |
|    clip_fraction        | 0.0875      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00335    |
|    learning_rate        | 0.0003      |
|    loss                 | 4.5         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0125     |
|    value_loss           | 50.8        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x2086ee68650>

In [None]:
env.close()  # close the environment

SAVING THE MODEL

In [4]:
PPO_path = os.path.join('Training', 'SavedModels', 'PPO_CartPole_Model')

In [163]:
model.save(PPO_path)  # Save the model to the specified path

In [165]:
del model  # delete the model to free up memory

In [5]:
model = PPO.load(PPO_path, env=env)  # Load the model from the specified path

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


EVALUATION

In [8]:
env = gym.make(env_name, render_mode="human")  # Recreate the environment for evaluation

  logger.deprecation(


In [9]:
evaluate_policy( model, env, n_eval_episodes= 10, render=True)  # Evaluate the model on 10 episodes and render the environment)
# output will be the mean reward and standard deviation of the reward over the 10 episodes



(200.0, 0.0)

In [11]:
env.close()  # close the environment

TESTING

In [12]:
episodes = 5 
env= gym.make(env_name, render_mode="human")  # create the environment with human rendering
env = DummyVecEnv([lambda: env])  

for episode in range(1, episodes + 1): 
    state = env.reset()  # reset the environment to the initial state
    done = False  # done is a boolean that indicates whether the episode has ended
    score = 0  # initialize the score for the episode
    
    while not done:
        env.render()
        action, _states = model.predict(state)  # use the model to predict the action to take based on the current state
        state, reward, done, info = env.step(action)  # apply the action to the environment
        score += reward  # accumulate the score

    print(f"Episode {episode} Score: {score}")  # print the score for the episode

Episode 1 Score: [200.]
Episode 2 Score: [200.]
Episode 3 Score: [200.]
Episode 4 Score: [200.]
Episode 5 Score: [200.]


In [13]:
env.close()

In [187]:
env = gym.make(env_name, render_mode="human")  # create the environment with human rendering
env = DummyVecEnv([lambda: env])  # Wrap the environment in a DummyVec
obs=env.reset()  

In [None]:
model.predict(obs) 

(array([0], dtype=int64), None)

VIEWING LOGS IN TENSORBOARD

In [None]:
# log_path = os.path.join('Training', 'Logs')  # Path to save the training logs
training_log_path = os.path.join(log_path, 'PPO_1')
training_log_path

'Training\\Logs\\PPO_1'

In [16]:
!tensorboard --logdir={training_log_path}  

^C


## Adding a callback to training stage

In [6]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [11]:
save_path = os.path.join('Training', 'SavedModels')  # Path to save the best model

In [None]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=195, verbose=1)  # Stop training when the average reward reaches 195
eval_callback = EvalCallback(env,  
                            callback_on_new_best=stop_callback,  # Stop training when the average reward reaches 195
                            eval_freq=10000,  # Evaluate the model every 1000 steps
                            best_model_save_path = save_path, # Save the best model
                            verbose=1)  # Print evaluation results

In [13]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)  # Create the PPO model with MLP policy which is a Multi-Layer Perceptron
model.learn(total_timesteps=20000, callback=eval_callback)  # Train the model

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_2
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.7     |
|    ep_rew_mean     | 21.7     |
| time/              |          |
|    fps             | 44       |
|    iterations      | 1        |
|    time_elapsed    | 45       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 26.4        |
|    ep_rew_mean          | 26.4        |
| time/                   |             |
|    fps                  | 44          |
|    iterations           | 2           |
|    time_elapsed         | 93          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.009264075 |
|    clip_fraction        | 0.11        |
|    clip_range           | 0.2



Eval num_timesteps=10000, episode_reward=184.00 +/- 20.09
Episode length: 184.00 +/- 20.09
------------------------------------------
| eval/                   |              |
|    mean_ep_length       | 184          |
|    mean_reward          | 184          |
| time/                   |              |
|    total_timesteps      | 10000        |
| train/                  |              |
|    approx_kl            | 0.0070005567 |
|    clip_fraction        | 0.0461       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.62        |
|    explained_variance   | 0.229        |
|    learning_rate        | 0.0003       |
|    loss                 | 16.1         |
|    n_updates            | 40           |
|    policy_gradient_loss | -0.0108      |
|    value_loss           | 60.8         |
------------------------------------------
New best mean reward!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 63.1     |
|    ep_rew

<stable_baselines3.ppo.ppo.PPO at 0x26d0c74b450>

In [14]:
env.close()

CHANGING POLICIES

In [28]:
net_arch= [dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])]  # Define the neural network architecture for the policy and value function
# pi is the policy network and vf is the value function network
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path, policy_kwargs = {'net_arch':net_arch})  # Create the PPO model with MLP policy which is a Multi-Layer Perceptron

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [26]:
env = gym.make(env_name, render_mode="human") 

In [29]:
model.learn(total_timesteps=20000, callback=eval_callback)  # Train the model

Logging to Training\Logs\PPO_3
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.2     |
|    ep_rew_mean     | 21.2     |
| time/              |          |
|    fps             | 44       |
|    iterations      | 1        |
|    time_elapsed    | 45       |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 31          |
|    ep_rew_mean          | 31          |
| time/                   |             |
|    fps                  | 42          |
|    iterations           | 2           |
|    time_elapsed         | 96          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.016894193 |
|    clip_fraction        | 0.253       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.68       |
|    explained_variance   | -0.00718    |



error: display Surface quit

USING ALTERNATE ALGO

In [30]:
from stable_baselines3 import DQN

In [31]:
model = DQN("MlpPolicy", env, verbose=1, tensorboard_log=log_path)  # Create the PPO model with MLP policy which is a Multi-Layer Perceptron

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [32]:
model.learn(total_timesteps=20000)  # Train the model

Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 19.8     |
|    ep_rew_mean      | 19.8     |
|    exploration_rate | 0.962    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 45       |
|    time_elapsed     | 1        |
|    total_timesteps  | 79       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 19.6     |
|    ep_rew_mean      | 19.6     |
|    exploration_rate | 0.925    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 45       |
|    time_elapsed     | 3        |
|    total_timesteps  | 157      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.581    |
|    n_updates        | 14       |
----------------------------------
----------------------------------
| rollout/            | 

<stable_baselines3.dqn.dqn.DQN at 0x26d0fa36c90>

In [34]:
DQN_path = os.path.join('Training', 'SavedModels', 'DQN_CartPole_Model')
model.save(DQN_path)  # Save the model to the specified path
del model  # delete the model to free up memory
# model = DQN.load(DQN_path, env=env)  # Load the model from the specified path