In [1]:
import gymnasium as gym
import cv2

from stable_baselines3 import DQN, PPO
from stable_baselines3.common.callbacks import (
    EvalCallback,
    StopTrainingOnRewardThreshold,
)

In [2]:
# Create the CartPole-v1 environment
env = gym.make("CartPole-v1", render_mode="rgb_array")

In [3]:
# # Create Deep Q-Network
# model = DQN(
#     "MlpPolicy",
#     env,
#     learning_rate=1e-3, # default: 0.0001 # good: 1e-3
#     buffer_size=50_000, # default: 1_000_000 # good: 50_000
#     learning_starts=1000, # default: 100 # good: 1000
#     batch_size=32, # default: 32
#     tau=1.0, # default: 1.0
#     gamma=0.99, # default: 0.99
#     train_freq=4, # default: 4
#     target_update_interval=1000, # default: 10000 # good: 1000
#     exploration_fraction=0.1, # default: 0.1
#     exploration_final_eps=0.02, # default: 0.05 # good: 0.02
#     seed=42,
#     verbose=1,
#     tensorboard_log="../logs/exercise_1/dqn_tensorboard/",
# )

In [4]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=500, verbose=1)

eval_callback = EvalCallback(
    env,
    best_model_save_path="../logs/exercise_1/",
    log_path="../logs/exercise_1/",
    eval_freq=5_000,
    deterministic=True,
    render=False,
    n_eval_episodes=50,
    callback_on_new_best=stop_callback,
)

In [5]:
# Create PPO model
model = PPO(
    "MlpPolicy",
    env,
    seed=42,
    device="cpu",
    verbose=1,
    tensorboard_log="../logs/exercise_1/ppo_tensorboard/",
)

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [6]:
# Train model on the environment
TRAINING_TIMESTEPS = 1_000_000
model.learn(total_timesteps=TRAINING_TIMESTEPS, callback=[eval_callback])

Logging to ../logs/exercise_1/ppo_tensorboard/PPO_10
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22       |
|    ep_rew_mean     | 22       |
| time/              |          |
|    fps             | 6463     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 24.6        |
|    ep_rew_mean          | 24.6        |
| time/                   |             |
|    fps                  | 4091        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008349772 |
|    clip_fraction        | 0.0965      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_vari



Eval num_timesteps=5000, episode_reward=234.64 +/- 149.23
Episode length: 234.64 +/- 149.23
----------------------------------------
| eval/                   |            |
|    mean_ep_length       | 235        |
|    mean_reward          | 235        |
| time/                   |            |
|    total_timesteps      | 5000       |
| train/                  |            |
|    approx_kl            | 0.01179982 |
|    clip_fraction        | 0.0871     |
|    clip_range           | 0.2        |
|    entropy_loss         | -0.664     |
|    explained_variance   | 0.0738     |
|    learning_rate        | 0.0003     |
|    loss                 | 11.6       |
|    n_updates            | 20         |
|    policy_gradient_loss | -0.0223    |
|    value_loss           | 31.3       |
----------------------------------------
New best mean reward!
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 33.3     |
|    ep_rew_mean     | 33.3     |
| time/     

<stable_baselines3.ppo.ppo.PPO at 0x791d1a5397f0>

In [7]:
def add_overlay(frame_rgb, step, total_reward):
    """
    Adds step count and cumulative reward to the frame.
    """
    # Convert to BGR for OpenCV
    frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)

    # Prepare text
    step_text = f"Step: {step}"
    reward_text = f"Reward: {total_reward:.1f}"

    # Add text to frame
    cv2.putText(
        frame_bgr,
        step_text,
        org=(10, 30),
        fontFace=cv2.FONT_HERSHEY_SIMPLEX,
        fontScale=1,
        color=(0, 255, 0),
        thickness=2,
    )

    # Convert back to RGB
    return cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)

In [8]:
MAX_STEPS = 500
success = False

observation, info = env.reset(seed=42)
frames = []
step_count = 0
cumulative_reward = 0

for _ in range(MAX_STEPS):
    frame = env.render()

    frame_with_overlay = add_overlay(
        frame, step=step_count + 1, total_reward=cumulative_reward
    )
    frames.append(frame_with_overlay)

    action, _ = model.predict(observation, deterministic=True)
    observation, reward, terminated, truncated, info = env.step(action)
    cumulative_reward += reward
    step_count += 1

    if terminated or truncated:
        break

if step_count == MAX_STEPS:
    success = True
    print("Successfully balanced pole for 500 steps.")
else:
    print(f"Failed with {step_count} steps. Retrying...")

env.close()

Successfully balanced pole for 500 steps.


In [10]:
from IPython.display import HTML
from base64 import b64encode
import os

# Create a video from the frames
video_filename = "../videos/cartpole_ppo.mp4"
compressed_path = "../videos/cartpole_ppo_compressed.mp4"
height, width, _ = frames[0].shape

fourcc = cv2.VideoWriter_fourcc(*"mp4v")
video = cv2.VideoWriter(video_filename, fourcc, 30.0, (width, height))

for frame in frames:
    video.write(
        cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
    )
video.release()

print(f"Video guardado como {video_filename}")

os.system(f"rm {compressed_path}")
# Compressed video path
os.system(f"ffmpeg -i {video_filename} -vcodec libx264 {compressed_path}")
os.system(f"rm {video_filename}")
os.system(f"mv {compressed_path} {video_filename}")

# Show video
mp4 = open(video_filename, "rb").read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(
    """
<video width=800 controls>
      <source src="%s" type="video/mp4">
</video>"""
    % data_url
)

Video guardado como ../videos/cartpole_ppo.mp4


rm: cannot remove '../videos/cartpole_ppo_compressed.mp4': No such file or directory
ffmpeg version n7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 14.2.1 (GCC) 20250207
  configuration: --prefix=/usr --disable-debug --disable-static --disable-stripping --enable-amf --enable-avisynth --enable-cuda-llvm --enable-lto --enable-fontconfig --enable-frei0r --enable-gmp --enable-gnutls --enable-gpl --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libdav1d --enable-libdrm --enable-libdvdnav --enable-libdvdread --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgsm --enable-libharfbuzz --enable-libiec61883 --enable-libjack --enable-libjxl --enable-libmodplug --enable-libmp3lame --enable-libopencore_amrnb --enable-libopencore_amrwb --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libplacebo --enable-libpulse --enable-librav1e --enable-librsvg --enable-librubberband --enable-libsnappy --enable-