# Learning RL using StableBaseline3

### Importing Necessary libraries

In [20]:
import os
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

Here We are using the inbuild example environment of Cartpole environemnt


In [23]:
#Making the vatiable for the model name
env_name = "CartPole-v1"
#Creating the environment , remnder_mode is set to human for visualisation
env = gym.make(env_name,  render_mode="human")

Understanding the environment class menthods

In [19]:
#Checking what are the outputs from the resetting the environment
print(env.reset())
# Checking the action and observation space of the environment
print(env.action_space)
print(env.observation_space)
#Observing the output of the step function with the action 1
obs, reward, done, info, _ = env.step(1)
print("obs : ", obs,"reward : ", reward,"Done : " ,done,"info : " ,info)
# closing the environment to close the Visualisation window
env.close()

(array([ 0.04225325, -0.03066197, -0.03478266,  0.04284509], dtype=float32), {})
Discrete(2)
Box([-4.8               -inf -0.41887903        -inf], [4.8               inf 0.41887903        inf], (4,), float32)
obs :  [ 0.04164001  0.16494104 -0.03392576 -0.26060602] reward :  1.0 Done :  False info :  False


Now Lets run the episode of balacing the cartpole 
and assuming we are not learning or trying to learn to balance

In [24]:
# Lets define the number of episodes that is
# we will run the simulation for number of episodes time just 
# to see how the environment is working and how the action space is being used
# and how the reward is being calculated
episodes = 10 # Defineing the number of episodes
for episode in range(1, episodes + 1): #for loop to run the simulation for number of episodes
    # Reset the environment for each episode
    obs = env.reset()
    # Initialize done variable to False which is used to raise flage of termination
    done = False
    # Variable to keep track of the score; Score is cumulative reward
    score=0
    # Loop until the episode is done (or not terminated)
    #we could have also used max reward or max time steps to terminate the episode
    while not done:
        # Sample a random action from the action space
        # In a real scenario, you would use a trained model to predict the action
        action= env.action_space.sample()  # Random action for demonstration
        # Take a step in the environment with the sampled action
        # The step function returns the next observation, reward, done flag, info dictionary and the time step
        obs, reward, done, info, _ = env.step(action)
        # render the environment to visualize the action taken
        env.render()
        #keep the score of the episode
        score += reward
        # print(f"Episode: {episode}, Score: {score}, Action: {action}, Reward: {reward}")
    print(f"Episode {episode} finished with score: {score}")

env.close()  # Close the environment after all episodes are done
    

Episode 1 finished with score: 16.0
Episode 2 finished with score: 19.0
Episode 3 finished with score: 23.0
Episode 4 finished with score: 38.0
Episode 5 finished with score: 20.0
Episode 6 finished with score: 17.0
Episode 7 finished with score: 29.0
Episode 8 finished with score: 26.0
Episode 9 finished with score: 53.0
Episode 10 finished with score: 21.0


So in above logs you can see the random score being accumulated


# Training

Training the model to select action based on obs env

Before starting the training, we need to create a directory for logging
the updated weights

In [None]:
#Create directory and make a variable of that directory
# Log path of the taraing
log_path = os.path.join("Training", "Logs")

In [34]:
log_path

'Training\\Logs'

Again recreating the env variable,
Making a dummy vec env for vectorizing a single env
and setting up the model with PPO, env and saving the logs in log path

In [48]:
env=gym.make(env_name, render_mode="human") # redefining the environment
env = DummyVecEnv([lambda: env])  # Wrap the environment in a DummyVecEnv for vectorized training
# Create the PPO model with the specified policy and environment
model= PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)

Using cpu device


Begin learning using the below command and set a total timesteps to terminate

In [36]:
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 1056 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 906         |
|    iterations           | 2           |
|    time_elapsed         | 4           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008801829 |
|    clip_fraction        | 0.11        |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.0057      |
|    learning_rate        | 0.0003      |
|    loss                 | 6.2         |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0165     |
|    value_loss           | 52.5        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x2ab898c6710>

Next task is to save the model,

In [None]:
# Firstly create the directory and variable of the path to the directory
# Save the PPO model
PPO_Path = os.path.join("Training", "Saved Models", "PPO_CartPole")

In [None]:
# Save the model using this command
model.save(PPO_Path)

Overall Consolidated Training a model Looks like this

In [None]:
log_path = os.path.join("Training", "Logs")
PPO_Path = os.path.join("Training", "Saved Models", "PPO_CartPole")
env=gym.make(env_name, render_mode="human") # redefining the environment
env = DummyVecEnv([lambda: env])  # Wrap the environment in a DummyVecEnv for vectorized training
model= PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)# Create the PPO model with the specified policy and environment
model.learn(total_timesteps=20000)
model.save(PPO_Path)


Now you can delete the variable model so that I can demonstrate howe to load model

In [None]:
del model  # Delete the model to demonstrate loading

In [5]:
# Load the model using the same path as above
# Load the PPO model
model = PPO.load(PPO_Path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


# Evaluation

Once the model is loaded now you can evaluate the policy using another method as shown below

This evaluation is done based on given num random episodes and thier scores are averaged out 

In [63]:
# Evaluate policy takes arguments as , model, env. no. of episodes to evaluate, and render mode
evaluate_policy(model, env, n_eval_episodes=10, render=True)

(500.0, 0.0)

# Testing

While Evalute policy doesnot give you enough insites of each episodes, we create Testing 

In [8]:
# Now all the code is same as above but we will use
# trained model to predict the action and then take a step in the environment
episodes = 10
for episode in range(1, episodes + 1):
    obs ,info= env.reset()
    done = False
    score=0

    while not done:
        # olnly change this line to use the trained model to predict the action
        action,_= model.predict(obs)
        # action= env.action_space.sample()  # Random action for demonstration
        obs, reward, done, info,_ = env.step(action)
        env.render()
        score += reward
        # print(f"Episode: {episode}, Score: {score}, Action: {action}, Reward: {reward}")
    print(f"Episode {episode} finished with score: {score}")
env.close()
    

Episode 1 finished with score: 279.0
Episode 2 finished with score: 1615.0
Episode 3 finished with score: 759.0
Episode 4 finished with score: 729.0
Episode 5 finished with score: 96.0
Episode 6 finished with score: 419.0
Episode 7 finished with score: 828.0
Episode 8 finished with score: 176.0
Episode 9 finished with score: 86.0
Episode 10 finished with score: 188.0


# Vectorizing

In [51]:
from stable_baselines3.common.vec_env import VecEnv
from stable_baselines3.common.env_util import make_vec_env


In [52]:
log_path = os.path.join("Training", "Logs")

# env = gym.make_vec(env_name,num_envs=4, render_mode="human", vectorization_mode="vector_entry_point") # redefining the environment
# env= VecEnv(env, num_envs=4) # Wrapping the environment in a VecFrameStack for stacking frames
# env = VecEnv(lambda: gym.make("CartPole-v1"), n_envs=4)
env = make_vec_env(lambda: gym.make("CartPole-v1"), n_envs=4)

# model= PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path) # Create the PPO model with the specified policy and environment

In [53]:
env

<stable_baselines3.common.vec_env.dummy_vec_env.DummyVecEnv at 0x1fdaf94a8d0>

In [56]:
model = PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=100000)

Using cpu device
Logging to Training\Logs\PPO_4


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22.6     |
|    ep_rew_mean     | 22.6     |
| time/              |          |
|    fps             | 2815     |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 8192     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 32.2        |
|    ep_rew_mean          | 32.2        |
| time/                   |             |
|    fps                  | 1059        |
|    iterations           | 2           |
|    time_elapsed         | 15          |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.014833221 |
|    clip_fraction        | 0.218       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.682      |
|    explained_variance   | -0.00338    |
|    learning_rate        | 0.

<stable_baselines3.ppo.ppo.PPO at 0x1fdaf82be90>

### Testing model after Vectorization 

In [59]:
# Firstly create the directory and variable of the path to the directory
# Save the PPO model
PPO_Path = os.path.join("Training", "Saved Models", "PPO_CartPole_TS_1000000")

In [60]:
# Save the model using this command
model.save(PPO_Path)

In [71]:
#Making the vatiable for the model name
env_name = "CartPole-v1"
#Creating the environment , remnder_mode is set to human for visualisation
env = gym.make(env_name,  render_mode="human")

In [72]:
del model  # Delete the model to demonstrate loading
# Load the model using the same path as above
# Load the PPO model
model = PPO.load(PPO_Path, env=env)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [73]:
# Now all the code is same as above but we will use
# trained model to predict the action and then take a step in the environment
episodes = 10
for episode in range(1, episodes + 1):
    obs ,info= env.reset()
    done = False
    score=0

    while (not done and score<60):#adding extra condition to stop the simualtion
        # olnly change this line to use the trained model to predict the action
        action,_= model.predict(obs)
        # action= env.action_space.sample()  # Random action for demonstration
        obs, reward, done, info,_ = env.step(action)
        env.render()
        score += reward
        # print(f"Episode: {episode}, Score: {score}, Action: {action}, Reward: {reward}")
    print(f"Episode {episode} finished with score: {score}")
env.close()

Episode 1 finished with score: 60.0
Episode 2 finished with score: 60.0
Episode 3 finished with score: 60.0
Episode 4 finished with score: 60.0
Episode 5 finished with score: 60.0
Episode 6 finished with score: 60.0
Episode 7 finished with score: 60.0
Episode 8 finished with score: 60.0
Episode 9 finished with score: 60.0
Episode 10 finished with score: 60.0


# Custom Environement

In [2]:
import gymnasium as gym
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict,Tuple, MultiDiscrete, MultiBinary
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
import os
import numpy as np
import random

### Types of spaces

In [77]:
for i in range(5):# Random number between 1 and 100
    print(Discrete(5).sample()) # Discrete action space with 5 actions

4
3
2
0
0


In [78]:
Box(0,1, shape=(3,), dtype=np.float32).sample() # Box action space with 3 actions and range between 0 and 1

array([0.65011007, 0.0919581 , 0.5091919 ], dtype=float32)

In [79]:
Tuple((Discrete(5), Box(0,1, shape=(3,), dtype=np.float32))).sample() # Tuple action space with 5 actions and range between 0 and 1

(0, array([0.07912254, 0.1186677 , 0.78638875], dtype=float32))

In [80]:
Dict({"action": Discrete(5), "observation": Box(0,1, shape=(3,), dtype=np.float32)}).sample() # Dict action space with 5 actions and range between 0 and 1

{'action': 3,
 'observation': array([0.11112826, 0.10480467, 0.94225425], dtype=float32)}

In [81]:
MultiBinary(5).sample() # MultiBinary action space with 5 actions and range between 0 and 1

array([0, 0, 1, 1, 0], dtype=int8)

In [83]:
for i in range(5):# Random number between 1 and 100
    print(MultiDiscrete([5, 5, 5]).sample()) # MultiDiscrete action space with 5 actions and range between 0 and 1

[2 0 3]
[1 4 2]
[4 2 4]
[0 1 2]
[0 4 0]


### Building env
- BUild an agent to give best shower
- random temp
- 37 and 39 degreees

In [3]:
class ShowerEnv(Env):
    def __init__(self):
        super(ShowerEnv, self).__init__()

        self.action_space = Discrete(3)
        # Both are same observation space
        self.observation_space = Box(low=np.array([0]), high=np.array([100]), dtype=np.float32)
        # self.observation_space = Box(low=0, high=0, shape=(1,) ,dtype=np.float32)

        self.state = 38 + random.randint(90, 100)  # Initial temperature (random between 38 and 40)
        self.shower_length = 60  # Length of the shower in seconds
    def step(self, action):
        # Apply action on the state
        self.state += action-1

        # Decrease the shower length by 1 second
        self.shower_length -= 1

        #Reward calculation
        if self.state>=37 and self.state<=39:
            reward = 1
        else:
            reward = -1
        
        #Termination condition
        if self.shower_length <= 0:
            done = True
        else:
            done = False
        info = {}
        truncated = False
        return self.state, reward, done, truncated, info
    
    def seed(self, seed=None):
        np.random.seed(seed)
        random.seed(seed)
        
    def render(self, mode='human'):
        pass


    def reset(self,*, seed=None, options=None):
        super().reset(seed=seed)
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)
        # Reset the state to a random temperature
        self.state =np.array([38+random.randint(-10, 10)]).astype(np.float32)
        self.shower_length = 60
        # Return the initial state
        return self.state , {}
    

In [None]:
observation_space1 = Box(low=np.array([0]), high=np.array([100]), dtype=np.float32)
observation_space2 = Box(low=0, high=100, shape=(1,) ,dtype=np.float32)

print(observation_space1.sample()) # Sample from the observation space
print(observation_space2.sample()) # Sample from the observation space

In [142]:
env = ShowerEnv() # Creating the environment
print(env.observation_space)
print(env.action_space) # Printing the action space of the environment
env.reset() # Resetting the environment
print(env.state)
print(env.step(1)) # Taking a step in the environment with action 1

Box(0.0, 100.0, (1,), float32)
Discrete(3)
[35.]
(array([35.], dtype=float32), -1, False, False, {})


### Test Environment

In [126]:
episode=5
for episode in range(1, episode + 1): #for loop to run the simulation for number of episodes
    # Reset the environment for each episode
    obs, info = env.reset()
    # Initialize done variable to False which is used to raise flage of termination
    done = False
    # Variable to keep track of the score; Score is cumulative reward
    score=0
    # Loop until the episode is done (or not terminated)
    #we could have also used max reward or max time steps to terminate the episode
    while not done:
        # Sample a random action from the action space
        # In a real scenario, you would use a trained model to predict the action
        action= env.action_space.sample()  # Random action for demonstration
        # Take a step in the environment with the sampled action
        # The step function returns the next observation, reward, done flag, info dictionary and the time step
        obs, reward, done, truncated, info = env.step(action)
        # render the environment to visualize the action taken
        env.render()
        #keep the score of the episode
        score += reward
        # print(f"Episode: {episode}, Score: {score}, Action: {action}, Reward: {reward}")
    print(f"Episode {episode} finished with score: {score}")

Episode 1 finished with score: -16
Episode 2 finished with score: 38
Episode 3 finished with score: 6
Episode 4 finished with score: 30
Episode 5 finished with score: -54


### Trainning a model

In [120]:
log_path = os.path.join("Training", "Logs")
env=ShowerEnv() # redefining the environment
# env = DummyVecEnv([lambda: env])  # Wrap the environment in a DummyVecEnv for vectorized training
# Create the PPO model with the specified policy and environment
model= PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=100000)
# model.save(PPO_Path)


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_7
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -30.5    |
| time/              |          |
|    fps             | 3191     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 60          |
|    ep_rew_mean          | -30.6       |
| time/                   |             |
|    fps                  | 1908        |
|    iterations           | 2           |
|    time_elapsed         | 2           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008185738 |
|    clip_fraction        | 0.0448      |
|    clip_range           | 0.2

<stable_baselines3.ppo.ppo.PPO at 0x1fdb099b790>

In [None]:
# Saving the Shower Model
PPO_Path = os.path.join("Training", "Saved Models", "PPO_ShowerEnv_TS_1000000")
model.save(PPO_Path)


In [138]:
del model  # Delete the model to demonstrate loading
model = PPO.load(PPO_Path, env=env) # Load the model using the same path as above
evaluate_policy(model, env, n_eval_episodes=50, render=True) # Evaluate policy takes arguments as , model, env. no. of episodes to evaluate, and render mode

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


(59.48, 0.8772684879784522)

### Testing Model

- It is point to be noted that
- Model is trained really well can see from the log chart as well as evaluate policy
- when you increase the random value generation limits, you can notice your rewards goes down as it cannot achieve to the max reward within 60 time step
- PLay with the rand intitalization value in the self.reset and see different values of the rewards

In [None]:
env=ShowerEnv()
episode=50
for episode in range(1, episode + 1): #for loop to run the simulation for number of episodes
    # Reset the environment for each episode
    obs, info = env.reset()
    # Initialize done variable to False which is used to raise flage of termination
    done = False
    # Variable to keep track of the score; Score is cumulative reward
    score=0
    # Loop until the episode is done (or not terminated)
    #we could have also used max reward or max time steps to terminate the episode
    while not done:
        # Sample a random action from the action space
        # In a real scenario, you would use a trained model to predict the action
        action, _= model.predict(obs)  # Random action for demonstration
        # Take a step in the environment with the sampled action
        # The step function returns the next observation, reward, done flag, info dictionary and the time step
        obs, reward, done, truncated, info = env.step(action)
        # render the environment to visualize the action taken
        env.render()
        #keep the score of the episode
        score += reward
        # print(f"Episode: {episode}, Score: {score}, Action: {action}, Reward: {reward}")
    print(f"Episode {episode} finished with score: {score}")

Episode 1 finished with score: 34
Episode 2 finished with score: 12
Episode 3 finished with score: 48
Episode 4 finished with score: 38
Episode 5 finished with score: 34
Episode 6 finished with score: 6
Episode 7 finished with score: 2
Episode 8 finished with score: 18
Episode 9 finished with score: 12
Episode 10 finished with score: 26
Episode 11 finished with score: 8
Episode 12 finished with score: 32
Episode 13 finished with score: 28
Episode 14 finished with score: 46
Episode 15 finished with score: 14
Episode 16 finished with score: 26
Episode 17 finished with score: 38
Episode 18 finished with score: 54
Episode 19 finished with score: -2
Episode 20 finished with score: 4
Episode 21 finished with score: 42
Episode 22 finished with score: 6
Episode 23 finished with score: 36
Episode 24 finished with score: 36
Episode 25 finished with score: 24
Episode 26 finished with score: 36
Episode 27 finished with score: 20
Episode 28 finished with score: 48
Episode 29 finished with score: 2


: 

### Vectorising the custom Environment

- Setting up the log path
- vectorzoing it 
- defining a model
- training a model

In [4]:
from stable_baselines3.common.env_util import make_vec_env

log_path = os.path.join("Training", "Logs")
env = make_vec_env(lambda: ShowerEnv(), n_envs=4)


In [6]:
model= PPO("MlpPolicy", env, verbose=1, tensorboard_log=log_path)
model.learn(total_timesteps=100000)

Using cpu device
Logging to Training\Logs\PPO_9
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 60       |
|    ep_rew_mean     | -41.6    |
| time/              |          |
|    fps             | 8512     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 8192     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 60           |
|    ep_rew_mean          | -43.3        |
| time/                   |              |
|    fps                  | 3282         |
|    iterations           | 2            |
|    time_elapsed         | 4            |
|    total_timesteps      | 16384        |
| train/                  |              |
|    approx_kl            | 0.0020263607 |
|    clip_fraction        | 0.0298       |
|    clip_range           | 0.2          |
|    entropy_loss         | -1.09        |
|    expla

<stable_baselines3.ppo.ppo.PPO at 0x1ef6bb037d0>

In [7]:
# Saving the Shower Model
PPO_Path = os.path.join("Training", "Saved Models", "PPO_Vec_env_ShowerEnv_TS_1000000")
model.save(PPO_Path)

In [11]:
del model  # Delete the model to demonstrate loading
model = PPO.load(PPO_Path, env=ShowerEnv()) # Load the model using the same path as above
evaluate_policy(model, env, n_eval_episodes=200, render=True) # Evaluate policy takes arguments as , model, env. no. of episodes to evaluate, and render mode


Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


(53.14, 5.379628239943723)