In [7]:
import os

import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import (
    evaluate_policy,
)  # test the trained model
from stable_baselines3.common.monitor import Monitor

In [12]:
# Create the CartPole-v1 environment used by the cells below.
# Note: no render_mode is passed to gym.make here.
env = gym.make("CartPole-v1")

In [None]:
# Baseline: play a few episodes with uniformly random actions and print each score.
#
# Rendering only works on an environment created with a render_mode, and the
# shared `env` was created without one. A dedicated environment is therefore
# created (and closed) just for this demo, which also leaves `env` open for the
# training cells further down.
RENDER_MODE = "human"  # set to None on a machine without a display
episodes = 15

demo_env = gym.make("CartPole-v1", render_mode=RENDER_MODE)
try:
    for episode in range(1, episodes + 1):
        demo_env.reset()
        terminated = truncated = False
        score = 0.0

        while not (terminated or truncated):
            # In "human" mode the environment draws itself on every step,
            # so no explicit render() call is needed.
            action = demo_env.action_space.sample()
            _, reward, terminated, truncated, _ = demo_env.step(action)
            score += float(reward)

        print("Episode: {} Score: {}".format(episode, score))
finally:
    # Always release the window, even if the loop is interrupted.
    demo_env.close()

In [None]:
# Draw one random action from the environment's action space and display it.
env.action_space.sample()

In [None]:
# Display the environment's observation space.
env.observation_space

# Training the model


In [None]:
# Create the training environment and vectorize it only once.
#
# The vectorized env gets its own name and its own fresh CartPole instance
# instead of rebinding `env`:
#   * re-running this cell no longer wraps an already vectorized env again;
#   * `env` stays a plain Gymnasium env, so later cells that unpack
#     `state, _ = env.reset()` and the 5-value `env.step(...)` keep working
#     (a VecEnv returns differently shaped results from reset/step).
train_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])
model = PPO("MlpPolicy", train_env, verbose=1)

In [None]:
# Train the PPO model. total_timesteps=10 is a tiny budget compared with the
# 20000 used later in this notebook, so treat this run as a smoke test only.
model.learn(total_timesteps=10)

# EVALUATE

In [None]:
# Evaluate `model` on `env` for 10 episodes, with rendering requested.
# NOTE(review): `env` is created without a render_mode, so render=True may not
# actually display anything — confirm.
evaluate_policy(model, env, n_eval_episodes=10, render=True)

In [None]:
# Close the environment after evaluation.
# NOTE(review): later cells use `env` again after this call.
env.close()

# TESTING

In [None]:
# Watch the trained agent: play a few episodes, choosing actions with
# `model.predict`, and print each episode's score.
#
# A dedicated single environment is used instead of the shared `env` because:
#   * `env` was already closed by an earlier cell;
#   * `env` may have been rebound to a vectorized env, whose reset()/step()
#     results do not match the Gymnasium unpacking used in this loop;
#   * rendering requires an environment created with a render_mode.
RENDER_MODE = "human"  # set to None on a machine without a display
episodes = 5

test_env = gym.make("CartPole-v1", render_mode=RENDER_MODE)
try:
    for episode in range(1, episodes + 1):
        state, _ = test_env.reset()
        terminated = truncated = False
        score = 0.0

        while not (terminated or truncated):
            # In "human" mode the environment draws itself on every step,
            # so no explicit render() call is needed.
            action, _ = model.predict(state)
            state, reward, terminated, truncated, _ = test_env.step(action)
            score += float(reward)

        print("Episode: {} Score: {}".format(episode, score))
finally:
    # Always release the window, even if the loop is interrupted.
    test_env.close()

# EARLY STOPPING AND CALLBACKS 

In [13]:
from stable_baselines3.common.callbacks import  EvalCallback, StopTrainingOnRewardThreshold

In [14]:
# Directory where the evaluation callback stores the best model.
# Set the RL_MODEL_DIR environment variable to use another location (for
# example on a different machine); the original path remains the default.
save_path = os.environ.get("RL_MODEL_DIR", r"D:\RL\RL Oreilly\Deep RL\Model")

In [15]:
# Stop training once an evaluation's mean reward reaches the threshold.
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=195, verbose=1)

# Evaluate on a separate environment. Passing the training `env` would make
# every periodic evaluation reset and step the same environment the agent is
# collecting experience from, disturbing the episode in progress.
# Monitor records episode rewards and lengths for the evaluation.
eval_env = Monitor(gym.make("CartPole-v1"))

# Every `eval_freq` steps: evaluate, save the best model so far to `save_path`,
# and run `stop_callback` whenever a new best mean reward is found.
eval_callback = EvalCallback(
    eval_env,
    callback_on_new_best=stop_callback,
    eval_freq=10000,
    best_model_save_path=save_path,
    verbose=1,
)

In [16]:
# Fresh PPO agent with the default MLP policy, to be trained with a callback.
model = PPO(
    policy="MlpPolicy",
    env=env,
    verbose=1,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [17]:
# Train for up to 20000 timesteps; `eval_callback` is invoked during training
# and can end the run early (see the "Stopping training" line in the output).
model.learn(total_timesteps=20000, callback=eval_callback,)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 23.6     |
|    ep_rew_mean     | 23.6     |
| time/              |          |
|    fps             | 339      |
|    iterations      | 1        |
|    time_elapsed    | 6        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 27.3         |
|    ep_rew_mean          | 27.3         |
| time/                   |              |
|    fps                  | 246          |
|    iterations           | 2            |
|    time_elapsed         | 16           |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 0.0077379085 |
|    clip_fraction        | 0.0814       |
|    clip_range           | 0.2          |
|    entropy_loss         | -0.687       |
|    explained_variance   | -0.00608     |
|    learning_r



Eval num_timesteps=10000, episode_reward=335.20 +/- 136.00
Episode length: 335.20 +/- 136.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 335         |
|    mean_reward          | 335         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.007502982 |
|    clip_fraction        | 0.0559      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.616      |
|    explained_variance   | 0.288       |
|    learning_rate        | 0.0003      |
|    loss                 | 26.4        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0152     |
|    value_loss           | 60.6        |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 335.20  is above the threshold 195


<stable_baselines3.ppo.ppo.PPO at 0x210051edde0>

# MLPPOLICY

In [18]:
# Custom network layout: three hidden layers of 64 units each for the policy
# head (`pi`) and for the value head (`vf`).
# The dict is passed directly; wrapping it in a list is the older, deprecated
# Stable-Baselines3 form.
net_arch = dict(pi=[64, 64, 64], vf=[64, 64, 64])

In [19]:
# PPO agent whose policy is built with the custom `net_arch` defined above.
custom_policy_kwargs = dict(net_arch=net_arch)
model = PPO("MlpPolicy", env, verbose=1, policy_kwargs=custom_policy_kwargs)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




In [20]:
# Train the custom-architecture PPO agent with its own evaluation callback.
#
# An EvalCallback remembers the best mean reward it has seen, so reusing the
# callback from the previous model would compare this model against the old
# model's score: "new best" (and with it the stop-on-threshold check and the
# best-model save) would only fire if this run beat the previous one.
# A separate sub-directory keeps this run's best model from overwriting
# another run's.
arch_eval_callback = EvalCallback(
    Monitor(gym.make("CartPole-v1")),  # separate env used only for evaluation
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=195, verbose=1),
    eval_freq=10000,
    best_model_save_path=os.path.join(save_path, "ppo_custom_arch"),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=arch_eval_callback)

---------------------------------
| rollout/           |          |
|    ep_len_mean     | 21.9     |
|    ep_rew_mean     | 21.9     |
| time/              |          |
|    fps             | 346      |
|    iterations      | 1        |
|    time_elapsed    | 5        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 28.3        |
|    ep_rew_mean          | 28.3        |
| time/                   |             |
|    fps                  | 194         |
|    iterations           | 2           |
|    time_elapsed         | 21          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.013647614 |
|    clip_fraction        | 0.163       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.683      |
|    explained_variance   | 0.00289     |
|    learning_rate        | 0.



Eval num_timesteps=10000, episode_reward=452.00 +/- 87.75
Episode length: 452.00 +/- 87.75
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 452         |
|    mean_reward          | 452         |
| time/                   |             |
|    total_timesteps      | 10000       |
| train/                  |             |
|    approx_kl            | 0.008595639 |
|    clip_fraction        | 0.105       |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.599      |
|    explained_variance   | 0.455       |
|    learning_rate        | 0.0003      |
|    loss                 | 22.1        |
|    n_updates            | 40          |
|    policy_gradient_loss | -0.0213     |
|    value_loss           | 60.4        |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 452.00  is above the threshold 195


<stable_baselines3.ppo.ppo.PPO at 0x21005272170>

# Change Algorithm

In [22]:
from stable_baselines3 import DQN

In [24]:
# Swap the algorithm: a DQN agent with an MLP policy on the same environment.
model = DQN(
    policy="MlpPolicy",
    env=env,
    verbose=1,
)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [25]:
# Train the DQN agent with its own evaluation callback.
#
# An EvalCallback remembers the best mean reward it has seen, so reusing the
# callback from the PPO runs would compare DQN against PPO's best score:
# "new best" (and with it the stop-on-threshold check and the best-model save)
# would only fire if DQN beat the earlier PPO result.
# A separate sub-directory keeps DQN's best model from overwriting PPO's.
dqn_eval_callback = EvalCallback(
    Monitor(gym.make("CartPole-v1")),  # separate env used only for evaluation
    callback_on_new_best=StopTrainingOnRewardThreshold(reward_threshold=195, verbose=1),
    eval_freq=10000,
    best_model_save_path=os.path.join(save_path, "dqn"),
    verbose=1,
)
model.learn(total_timesteps=20000, callback=dqn_eval_callback)

----------------------------------
| rollout/            |          |
|    ep_len_mean      | 22       |
|    ep_rew_mean      | 22       |
|    exploration_rate | 0.958    |
| time/               |          |
|    episodes         | 4        |
|    fps              | 6263     |
|    time_elapsed     | 0        |
|    total_timesteps  | 88       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 18.9     |
|    ep_rew_mean      | 18.9     |
|    exploration_rate | 0.928    |
| time/               |          |
|    episodes         | 8        |
|    fps              | 838      |
|    time_elapsed     | 0        |
|    total_timesteps  | 151      |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.471    |
|    n_updates        | 12       |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean    



----------------------------------
| rollout/            |          |
|    ep_len_mean      | 11.2     |
|    ep_rew_mean      | 11.2     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 884      |
|    fps              | 447      |
|    time_elapsed     | 22       |
|    total_timesteps  | 10049    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.22     |
|    n_updates        | 2487     |
----------------------------------
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 11.1     |
|    ep_rew_mean      | 11.1     |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 888      |
|    fps              | 446      |
|    time_elapsed     | 22       |
|    total_timesteps  | 10087    |
| train/              |          |
|    learning_rate    | 0.0001   |
|    loss             | 0.103    |
|    n_updates      

<stable_baselines3.dqn.dqn.DQN at 0x210051efcd0>

In [27]:
# Evaluate the DQN `model` on `env` for 10 episodes, with rendering requested.
# The displayed result is a 2-tuple (see output below); presumably
# (mean reward, reward standard deviation) — confirm against the SB3 docs.
evaluate_policy(model, env, n_eval_episodes=10, render=True)



(9.0, 0.4472135954999579)

In [28]:
# Close the environment now that the experiments are finished.
env.close()

: 

# HOW TO IMPROVE ACCURACY

## TRAIN FOR MORE TIMESTEPS
## HYPERPARAMETER TUNING
## CHANGE ALGORITHM