## Reinforcement Learning First Class

In [1]:
##!pip install 'stable-baselines3[extra]'

In [2]:
#!pip install pyglet

In [1]:
import os
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy

### Load Environment

In [3]:
# Gym environment id; later cells reuse this variable when rebuilding the env.
enviroment_name = 'CartPole-v0'
# Instantiate the environment used by the random-action rollout below.
env = gym.make(enviroment_name)

In [4]:
# Run a few episodes with uniformly random actions and print the total
# reward collected in each one.
episodes = 5
try:
    for episode in range(1, episodes+1):
        env.reset()
        done = False
        score = 0

        while not done:
            env.render()
            # No policy yet: pick any valid action at random.
            action = env.action_space.sample()
            # Only the reward and the termination flag are used here.
            _obs, reward, done, _info = env.step(action)
            score += reward
        print('Episode:{} Score:{}'.format(episode, score))
finally:
    # Always close the environment, even if an episode is interrupted
    # (e.g. the kernel is stopped while rendering).
    env.close()

Episode:1 Score:19.0
Episode:2 Score:14.0
Episode:3 Score:24.0
Episode:4 Score:28.0
Episode:5 Score:40.0


### Understanding the Environment

In [5]:
# Draw one random action from the action space; the cell output shows the sampled value.
env.action_space.sample()

0

In [6]:
# Draw one random observation from the observation space; the output is a 4-element float32 array.
env.observation_space.sample()

array([-1.8453034e+00, -1.3501253e+38,  2.3068184e-02, -5.5751136e+37],
      dtype=float32)

### Train RL Model

In [7]:
# Directory that receives the TensorBoard training logs.
log_path = os.path.join('Training','Logs')
# Create the folder up front so the notebook also runs on a fresh checkout;
# exist_ok=True makes re-running this cell harmless.
os.makedirs(log_path, exist_ok=True)

In [8]:
# Display the log directory path built above.
log_path

'Training/Logs'

In [12]:
# Create the gym env inside the factory instead of capturing a module-level
# variable, so the factory stays valid after `env` is rebound to the
# vectorised wrapper on this very line.
env = DummyVecEnv([lambda: gym.make(enviroment_name)])
# PPO agent with an MLP policy; training metrics are logged under log_path.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [15]:
# Train the PPO model for 20,000 timesteps.
model.learn(total_timesteps=20000)

Logging to Training/Logs/PPO_2
-----------------------------
| time/              |      |
|    fps             | 1459 |
|    iterations      | 1    |
|    time_elapsed    | 1    |
|    total_timesteps | 2048 |
-----------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 1066         |
|    iterations           | 2            |
|    time_elapsed         | 3            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0025207438 |
|    clip_fraction        | 0.0139       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.53        |
|    explained_variance   | 0.127        |
|    learning_rate        | 0.0003       |
|    loss                 | 0.881        |
|    n_updates            | 110          |
|    policy_gradient_loss | -0.00143     |
|    value_loss           | 33.9         |
----------------------------

<stable_baselines3.ppo.ppo.PPO at 0x7f58ceafb160>

### Save and Reload Model

In [16]:
# File path (without extension) under which the trained PPO model is stored.
PPO_Path = os.path.join(
    'Training',
    'Saved Models',
    'PPO_Model_Cartpole',
)

In [17]:
# Display the model path built above.
PPO_Path

'Training/Saved Models/PPO_Model_Cartpole'

In [18]:
# Write the trained model to PPO_Path.
model.save(PPO_Path)

In [19]:
# Drop the in-memory model; the next cell reloads it from disk.
del model

In [21]:
# Restore the model from the file saved at PPO_Path.
model = PPO.load(PPO_Path)

### Evaluation

In [22]:
# Run 10 rendered evaluation episodes with the reloaded model.
# The cell output is a 2-tuple; per the SB3 docs this is (mean reward, std of reward).
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(200.0, 0.0)

In [23]:
# Close the environment now that the rendered evaluation is finished.
env.close()

### Test Model

In [28]:
##action, _ = model.predict(obs)

In [29]:
##action

array([1])

In [30]:
# Let the trained model drive the environment for a few rendered episodes
# and print the accumulated reward of each one.
episodes = 5
for episode in range(1, episodes + 1):
    score = 0
    obs = env.reset()

    # Step until the environment reports the episode as finished.
    while True:
        env.render()
        action, _ = model.predict(obs)
        obs, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    print('Episode:{} Score:{}'.format(episode, score))
#env.close()

Episode:1 Score:[200.]
Episode:2 Score:[200.]
Episode:3 Score:[200.]
Episode:4 Score:[200.]
Episode:5 Score:[200.]


In [31]:
# Close the environment after the rendered test episodes.
env.close()

### View logs in Tensorboard

#### It is recommended to run TensorBoard from a command line instead: started from a notebook cell, it blocks the notebook until it is interrupted.

In [33]:
# Folder of one specific training run. 'PPO_2' is hard-coded to match the
# "Logging to Training/Logs/PPO_2" line printed by the first PPO training
# cell; adjust it if your run was written to a different folder.
training_log_path =  os.path.join(log_path, 'PPO_2')

In [34]:
# Display the run-specific log directory.
training_log_path

'Training/Logs/PPO_2'

In [35]:
##!tensorboard --logdir={training_log_path}

TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.10.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


### Adding a callback to a training stage

In [36]:
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [37]:
# Directory handed to EvalCallback as best_model_save_path.
save_path = os.path.join('Training','Saved Models')

In [40]:
# Evaluate on a dedicated environment: running evaluation episodes on the
# training env would reset it in the middle of a training rollout.
eval_env = DummyVecEnv([lambda: gym.make(enviroment_name)])
# Ends training once the evaluated mean reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=200, verbose=1)
# Every 10000 steps: evaluate the model, store the best one under save_path,
# and invoke stop_callback whenever a new best mean reward is found.
eval_callback = EvalCallback(eval_env,
                             callback_on_new_best=stop_callback,
                             eval_freq=10000,
                             best_model_save_path=save_path,
                             verbose=1)

In [41]:
# Fresh PPO model for the training run in this section.
model = PPO('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [52]:
# Attach the evaluation callback so the model is evaluated periodically and
# training can stop early once the reward threshold is reached.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/DQN_2
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.961    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 11603    |
|    time_elapsed     | 0        |
|    total_timesteps  | 83       |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.923    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 12781    |
|    time_elapsed     | 0        |
|    total_timesteps  | 163      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.885    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 13330    |
|    time_elapsed     | 0        |
|    total_timesteps  | 243      |
----------------------------------
------------------------

<stable_baselines3.dqn.dqn.DQN at 0x7f58c03b1f60>

### Changing Policies

In [44]:
# Custom network layout: four hidden layers of 128 units each for both heads.
# In SB3's net_arch convention `pi` is the policy network and `vf` the value network.
net_arch = [dict(pi=[128,128,128,128], vf=[128,128,128,128])]

In [46]:
# PPO model whose policy is built with the custom layer layout in net_arch.
model = PPO(
    'MlpPolicy',
    env,
    verbose=1,
    tensorboard_log=log_path,
    policy_kwargs=dict(net_arch=net_arch),
)

Using cuda device


In [47]:
# Train with the evaluation callback attached (the log shows an evaluation at 10000 timesteps).
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/PPO_4
-----------------------------
| time/              |      |
|    fps             | 982  |
|    iterations      | 1    |
|    time_elapsed    | 2    |
|    total_timesteps | 2048 |
-----------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 785         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.014696758 |
|    clip_fraction        | 0.236       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.681      |
|    explained_variance   | -0.00197    |
|    learning_rate        | 0.0003      |
|    loss                 | 3.95        |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0296     |
|    value_loss           | 18.9        |
-----------------------------------------
---



Eval num_timesteps=10000, episode_reward=200.00 +/- 0.00
Episode length: 200.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 200         |
|    mean_reward          | 200         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.013386311 |
|    clip_fraction        | 0.131       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.569      |
|    explained_variance   | 0.392       |
|    learning_rate        | 0.0003      |
|    loss                 | 17.9        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.02       |
|    value_loss           | 46          |
-----------------------------------------
------------------------------
| time/              |       |
|    fps             | 678   |
|    iterations      | 5     |
|    time_elapsed    | 15    |


<stable_baselines3.ppo.ppo.PPO at 0x7f58c039bfa0>

### Using an alternate algorithm

In [49]:
from stable_baselines3 import DQN

In [50]:
# Same MLP policy and log directory as before, but using DQN instead of PPO.
model =  DQN('MlpPolicy', env, verbose=1, tensorboard_log=log_path)

Using cuda device


In [51]:
# Train the DQN model with the same evaluation callback.
model.learn(total_timesteps=20000, callback=eval_callback)

Logging to Training/Logs/DQN_1
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.924    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 9470     |
|    time_elapsed     | 0        |
|    total_timesteps  | 160      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.889    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 10077    |
|    time_elapsed     | 0        |
|    total_timesteps  | 234      |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.812    |
| time/               |          |
|    episodes         | 12       |
|    fps              | 11347    |
|    time_elapsed     | 0        |
|    total_timesteps  | 395      |
----------------------------------
------------------------



----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 520      |
|    fps              | 12932    |
|    time_elapsed     | 0        |
|    total_timesteps  | 11920    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 524      |
|    fps              | 12926    |
|    time_elapsed     | 0        |
|    total_timesteps  | 12002    |
----------------------------------
----------------------------------
| rollout/            |          |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 528      |
|    fps              | 12923    |
|    time_elapsed     | 0        |
|    total_timesteps  | 12090    |
----------------------------------
----------------------------------
| rollout/          

<stable_baselines3.dqn.dqn.DQN at 0x7f58c03b1f60>

In [53]:
### DQN.load