## Importing Dependent modules

In [1]:
!pip install stable-baselines3[extra]




[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os #to pass paths and to save and log out

In [3]:
import gym #allow us to build environments and also work with pre existing environments

In [4]:
from stable_baselines3 import PPO    #an RL algorithm form baseline pakage

In [5]:
from stable_baselines3.common.vec_env import DummyVecEnv  #dummy vectorised environment to train different models at the same time

In [6]:
from stable_baselines3.common.evaluation import evaluate_policy     
#to test how model is performing to get avg reward after certain number of epsords

## Load Environments

In [7]:
!pip install pyglet==1.5.27
#pyglet is a cross-platform windowing and multimedia library for Python, intended for developing games and other visually rich applications




[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
environment_name='CartPole-v0'


In [9]:
env=gym.make(environment_name)

In [10]:
#Trying to test the cartpole evn 5 itmes:

episodes = 5                                                        #loop in 5 times
for episode in range(1, episodes+1):
    state = env.reset()                                             #reset the evironment to initial conditins
    done = False                                                    #setting variables
    score = 0 
    
    while not done:
        env.render()                                                #view the graphical representation of environment 
        action = env.action_space.sample()                          #generate a random action space
        n_state, reward, done, info = env.step(action)              #passing action to environment 
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:19.0
Episode:2 Score:33.0
Episode:3 Score:10.0
Episode:4 Score:31.0
Episode:5 Score:24.0


In [11]:
env.reset()   #these are the observations we get for environment

array([-0.0490357 , -0.03807538, -0.02438258, -0.02018723], dtype=float32)

In [12]:
env.action_space.sample()      #there are two space in this example (0,1) anyone is selected at random

0

In [13]:
env.observation_space.sample()        #gives observation space

array([ 2.4195108e+00,  3.6727641e+37,  4.2447452e-02, -1.5034697e+38],
      dtype=float32)

## Undernstand the algorithm 

In [14]:
env.action_space

Discrete(2)

In [15]:
env.action_space.sample()      
#there are two space in this example (0,1) anyone is selected at random
#i.e box moving left or right

1

In [16]:
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [17]:
env.observation_space.sample()        
#gives observation space , e.e Cart Position,Cart Velocity,Pole Angle,Pole Angular Velocity

array([ 1.7382528e+00, -2.4266128e+38,  4.0516815e-01,  1.0352617e+38],
      dtype=float32)

## Train the RL model

In [18]:
#make directory first
log_path=os.path.join('Training','Logs')   #to save logs to monitor our training

In [19]:
log_path

'Training\\Logs'

In [20]:
env=gym.make(environment_name)                                            #creating the environment
env=DummyVecEnv([lambda: env])                                            #wrapped our environment into dummy env
model = PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)           #think of it as defining out agent
# verbose=1 to seve results
# MlpPolicy = the policy we r using

Using cpu device


In [21]:
model.learn(total_timesteps=20000)

Logging to Training\Logs\PPO_1
-----------------------------
| time/              |      |
|    fps             | 793  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 571         |
|    iterations           | 2           |
|    time_elapsed         | 7           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008128976 |
|    clip_fraction        | 0.103       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | -0.00449    |
|    learning_rate        | 0.0003      |
|    loss                 | 11.1        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0167     |
|    value_loss           | 65.2        |
-----------------------------------------
---

<stable_baselines3.ppo.ppo.PPO at 0x175800d3c10>

## Save and reload model


In [22]:
PPO_PATH=os.path.join('Training','Saved Models','PPO_Model_CartPole')

In [23]:
model.save(PPO_PATH)                                                       #saving the model into PC

In [24]:
del model                                                                 #DEL model in this notebook

In [25]:
PPO_PATH

'Training\\Saved Models\\PPO_Model_CartPole'

In [26]:
model=PPO.load(PPO_PATH,env=env)                                          #loading model form PC

## Evaluation 

In [27]:
evaluate_policy (model,env,n_eval_episodes=10,render=True)



(200.0, 0.0)

In [28]:
env.close()            #to close the environment

## Test model

In [29]:
episodes = 5                                                        #loop in 5 times
for episode in range(1, episodes+1):
    obs = env.reset()                                               #reset the evironment to initial conditins
    done = False                                                    #setting variables
    score = 0 
    
    while not done:
        env.render()                                         #view the graphical representation of environment 
        action,_ = model.predict(obs)                        #now using model here, get two outputs 1.model action, 2.next state
        obs, reward, done, info = env.step(action)           #passing action to environment 
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))
env.close()

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


In [30]:
model.predict(obs)    #return 2 values

(array([0], dtype=int64), None)

## Viewing logs in tenserboard

In [31]:
training_log_path=os.path.join(log_path,'PPO_20')

In [32]:
training_log_path

'Training\\Logs\\PPO_20'

In [33]:
# !tensorboard --logdir={training_log_path}

## Adding callback to the training stage

In [34]:
from stable_baselines3.common.callbacks import EvalCallback , StopTrainingOnRewardThreshold
#EvalCallBack is the callback that runs during the training state
#StopTrainingOnRewardThreshold = to set the training threshold

In [35]:
save_path=os.path.join('Training','Saved Models')           #To save the best moddels

In [36]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200,verbose=1)

In [38]:
eval_callback=EvalCallback(env,callback_on_new_best=stop_callback,eval_freq=1000,best_model_save_path=save_path,verbose=1)
#basically we have set the reward_threshold which will be checked at frequency of 1000 and if our model has surprassed 
# the reward threhsold we will save the model on save_path

In [39]:
model=PPO('MlpPolicy',env,verbose=1,tensorboard_log=log_path)

Using cpu device


In [40]:
model.learn(total_timesteps=20000,callback=eval_callback)

Logging to Training\Logs\PPO_2




Eval num_timesteps=1000, episode_reward=74.80 +/- 26.81
Episode length: 74.80 +/- 26.81
---------------------------------
| eval/              |          |
|    mean_ep_length  | 74.8     |
|    mean_reward     | 74.8     |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
Eval num_timesteps=2000, episode_reward=113.80 +/- 46.23
Episode length: 113.80 +/- 46.23
---------------------------------
| eval/              |          |
|    mean_ep_length  | 114      |
|    mean_reward     | 114      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
New best mean reward!
-----------------------------
| time/              |      |
|    fps             | 508  |
|    iterations      | 1    |
|    time_elapsed    | 4    |
|    total_timesteps | 2048 |
-----------------------------
Eval num_timesteps=3000, episode_reward=193.40 +/- 9.07
Episode length: 193.40 +/- 9.07
----

<stable_baselines3.ppo.ppo.PPO at 0x1758d40a2c0>

## Changing policy

In [46]:
net_arch=[dict(pi=[128, 128, 128, 128], vf=[128, 128, 128, 128])]  #new NN arch

In [47]:
model = PPO('MlpPolicy', env, verbose = 1, policy_kwargs={'net_arch': net_arch})

Using cpu device




In [48]:
 model.learn(total_timesteps=20000, callback=eval_callback)

Eval num_timesteps=1000, episode_reward=9.00 +/- 0.89
Episode length: 9.00 +/- 0.89
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9        |
|    mean_reward     | 9        |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------




Eval num_timesteps=2000, episode_reward=9.20 +/- 0.75
Episode length: 9.20 +/- 0.75
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9.2      |
|    mean_reward     | 9.2      |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
-----------------------------
| time/              |      |
|    fps             | 557  |
|    iterations      | 1    |
|    time_elapsed    | 3    |
|    total_timesteps | 2048 |
-----------------------------
Eval num_timesteps=3000, episode_reward=76.20 +/- 18.00
Episode length: 76.20 +/- 18.00
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 76.2       |
|    mean_reward          | 76.2       |
| time/                   |            |
|    total_timesteps      | 3000       |
| train/                  |            |
|    approx_kl            | 0.01374372 |
|    clip_fraction        | 0.183      |
|    clip_r

Eval num_timesteps=14000, episode_reward=200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 200      |
|    mean_reward     | 200      |
| time/              |          |
|    total_timesteps | 14000    |
---------------------------------
------------------------------
| time/              |       |
|    fps             | 237   |
|    iterations      | 7     |
|    time_elapsed    | 60    |
|    total_timesteps | 14336 |
------------------------------
Eval num_timesteps=15000, episode_reward=200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 200         |
|    mean_reward          | 200         |
| time/                   |             |
|    total_timesteps      | 15000       |
| train/                  |             |
|    approx_kl            | 0.008249769 |
|    clip_fraction        | 0.1

<stable_baselines3.ppo.ppo.PPO at 0x1758d4dca90>

## Using alternate algorithm

In [49]:
#Using DQN instead of PPO


In [52]:
from stable_baselines3 import DQN

In [54]:
model = DQN('MlpPolicy', env, verbose = 1, tensorboard_log=log_path)

Using cpu device


In [55]:
model.learn(total_timesteps=20000)

Logging to Training\Logs\DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.972    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 2529     |
|    time_elapsed     | 0        |
|    total_timesteps  | 60       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.933    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 3487     |
|    time_elapsed     | 0        |
|    total_timesteps  | 141      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.887    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 3939     |
|    time_elapsed     | 0        |
|    total_timesteps  | 237      |
----------------------------------
------------------------

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 108      |
|    fps              | 4265     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2348     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 112      |
|    fps              | 4283     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2436     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 116      |
|    fps              | 4314     |
|    time_elapsed     | 0        |
|    total_timesteps  | 2523     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 216      |
|    fps              | 4241     |
|    time_elapsed     | 1        |
|    total_timesteps  | 4772     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 220      |
|    fps              | 4222     |
|    time_elapsed     | 1        |
|    total_timesteps  | 4844     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 224      |
|    fps              | 4195     |
|    time_elapsed     | 1        |
|    total_timesteps  | 4914     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 324      |
|    fps              | 4354     |
|    time_elapsed     | 1        |
|    total_timesteps  | 7367     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 328      |
|    fps              | 4359     |
|    time_elapsed     | 1        |
|    total_timesteps  | 7438     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 332      |
|    fps              | 4369     |
|    time_elapsed     | 1        |
|    total_timesteps  | 7522     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 432      |
|    fps              | 4366     |
|    time_elapsed     | 2        |
|    total_timesteps  | 9783     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 436      |
|    fps              | 4376     |
|    time_elapsed     | 2        |
|    total_timesteps  | 9888     |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 440      |
|    fps              | 4380     |
|    time_elapsed     | 2        |
|    total_timesteps  | 9990     |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 540      |
|    fps              | 4485     |
|    time_elapsed     | 2        |
|    total_timesteps  | 12228    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 544      |
|    fps              | 4481     |
|    time_elapsed     | 2        |
|    total_timesteps  | 12296    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 548      |
|    fps              | 4483     |
|    time_elapsed     | 2        |
|    total_timesteps  | 12375    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 648      |
|    fps              | 4529     |
|    time_elapsed     | 3        |
|    total_timesteps  | 14657    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 652      |
|    fps              | 4532     |
|    time_elapsed     | 3        |
|    total_timesteps  | 14755    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 656      |
|    fps              | 4534     |
|    time_elapsed     | 3        |
|    total_timesteps  | 14811    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 756      |
|    fps              | 4551     |
|    time_elapsed     | 3        |
|    total_timesteps  | 17119    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 760      |
|    fps              | 4555     |
|    time_elapsed     | 3        |
|    total_timesteps  | 17205    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 764      |
|    fps              | 4556     |
|    time_elapsed     | 3        |
|    total_timesteps  | 17296    |
----------------------------------
----------------------------------
| rollout/          

----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 864      |
|    fps              | 4594     |
|    time_elapsed     | 4        |
|    total_timesteps  | 19300    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 868      |
|    fps              | 4591     |
|    time_elapsed     | 4        |
|    total_timesteps  | 19367    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 872      |
|    fps              | 4590     |
|    time_elapsed     | 4        |
|    total_timesteps  | 19435    |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x1758d4ddd50>