In [None]:
import gymnasium as gym
import cv2

from stable_baselines3 import DQN, PPO
from stable_baselines3.common.callbacks import EvalCallback, StopTrainingOnRewardThreshold

In [3]:
# Create the CartPole-v1 environment
env = gym.make("CartPole-v1", render_mode="rgb_array")

In [None]:
# # Create Deep Q-Network
# model = DQN(
#     "MlpPolicy",
#     env,
#     learning_rate=1e-3, # default: 0.0001 # good: 1e-3
#     buffer_size=50_000, # default: 1_000_000 # good: 50_000
#     learning_starts=1000, # default: 100 # good: 1000
#     batch_size=32, # default: 32
#     tau=1.0, # default: 1.0
#     gamma=0.99, # default: 0.99
#     train_freq=4, # default: 4
#     target_update_interval=1000, # default: 10000 # good: 1000
#     exploration_fraction=0.1, # default: 0.1
#     exploration_final_eps=0.02, # default: 0.05 # good: 0.02
#     seed=42,
#     verbose=1,
#     tensorboard_log="../logs/exercise_1/dqn_tensorboard/",
# )

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [None]:
stop_callback = StopTrainingOnRewardThreshold(reward_threshold=500, verbose=1)

eval_callback = EvalCallback(
    env,
    best_model_save_path="../logs/exercise_1/",
    log_path="../logs/exercise_1/",
    eval_freq=5_000,
    deterministic=True,
    render=False,
    n_eval_episodes=5,
    callback_on_new_best=stop_callback
)

In [7]:
# Create PPO model
model = PPO("MlpPolicy", env, seed=42, device="cpu", verbose=1, tensorboard_log="../logs/exercise_1/ppo_tensorboard/")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [8]:
# Train model on the environment
TRAINING_TIMESTEPS = 1_000_000
model.learn(total_timesteps=TRAINING_TIMESTEPS, callback=[eval_callback])

Logging to ../logs/exercise_1/ppo_tensorboard/PPO_1
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 22       |
|    ep_rew_mean     | 22       |
| time/              |          |
|    fps             | 6584     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 24.6        |
|    ep_rew_mean          | 24.6        |
| time/                   |             |
|    fps                  | 2941        |
|    iterations           | 2           |
|    time_elapsed         | 1           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.008349772 |
|    clip_fraction        | 0.0965      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_varia



Eval num_timesteps=50000, episode_reward=500.00 +/- 0.00
Episode length: 500.00 +/- 0.00
-----------------------------------------
| eval/                   |             |
|    mean_ep_length       | 500         |
|    mean_reward          | 500         |
| time/                   |             |
|    total_timesteps      | 50000       |
| train/                  |             |
|    approx_kl            | 0.006074584 |
|    clip_fraction        | 0.0473      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.499      |
|    explained_variance   | 0.518       |
|    learning_rate        | 0.0003      |
|    loss                 | 0.00375     |
|    n_updates            | 240         |
|    policy_gradient_loss | -0.00194    |
|    value_loss           | 0.0193      |
-----------------------------------------
New best mean reward!
Stopping training because the mean reward 500.00  is above the threshold 500


<stable_baselines3.ppo.ppo.PPO at 0x7f97181dd010>

In [None]:
def add_overlay(frame_rgb, step, total_reward):
    """
    Adds step count and cumulative reward to the frame.
    """
    # Convert to BGR for OpenCV
    frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)

    # Prepare text
    step_text = f"Step: {step}"
    reward_text = f"Reward: {total_reward:.1f}"

    # Add text to frame
    cv2.putText(frame_bgr, step_text, org=(10, 30),
                fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1,
                color=(0, 255, 0), thickness=2)

    # Convert back to RGB
    return cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)


In [None]:
MAX_STEPS = 500
success = False

observation, info = env.reset(seed=42)
frames = []
step_count = 0
cumulative_reward = 0

for _ in range(MAX_STEPS):
    frame = env.render()

    frame_with_overlay = add_overlay(frame, step=step_count + 1, total_reward=cumulative_reward)
    frames.append(frame_with_overlay)

    action, _ = model.predict(observation, deterministic=True)
    observation, reward, terminated, truncated, info = env.step(action)
    cumulative_reward += reward
    step_count += 1

    if terminated or truncated:
        break

if step_count == MAX_STEPS:
    success = True
    print("Successfully balanced pole for 500 steps.")
else:
    print(f"Failed with {step_count} steps. Retrying...")

env.close()

Successfully balanced pole for 500 steps.


In [11]:
# Crear el video a partir de los cuadros guardados
video_filename = "../videos/cartpole_dqn_policy.mp4"
height, width, _ = frames[0].shape  # Obtener dimensiones de los cuadros

# Configuración de salida para formato MP4
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
video = cv2.VideoWriter(video_filename, fourcc, 30.0, (width, height))

# Escribir cada cuadro en el video
for frame in frames:
    video.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))  # Convertir de RGB a BGR para OpenCV

# Liberar el objeto VideoWriter
video.release()

print(f"Video guardado como {video_filename}")


Video guardado como ../videos/cartpole_dqn_policy.mp4


In [12]:
from IPython.display import HTML
from base64 import b64encode
import os

# Compressed video path
compressed_path = "../videos/cartpole_dqn_policy_compressed.mp4"

os.system(f"rm {compressed_path}")
os.system(f"ffmpeg -i {video_filename} -vcodec libx264 {compressed_path}")
# Show video
mp4 = open(compressed_path,'rb').read()
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=800 controls>
      <source src="%s" type="video/mp4">
</video>""" % data_url)

rm: cannot remove '../videos/cartpole_dqn_policy_compressed.mp4': No such file or directory
ffmpeg version n7.1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 14.2.1 (GCC) 20250207
  configuration: --prefix=/usr --disable-debug --disable-static --disable-stripping --enable-amf --enable-avisynth --enable-cuda-llvm --enable-lto --enable-fontconfig --enable-frei0r --enable-gmp --enable-gnutls --enable-gpl --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libdav1d --enable-libdrm --enable-libdvdnav --enable-libdvdread --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgsm --enable-libharfbuzz --enable-libiec61883 --enable-libjack --enable-libjxl --enable-libmodplug --enable-libmp3lame --enable-libopencore_amrnb --enable-libopencore_amrwb --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libplacebo --enable-libpulse --enable-librav1e --enable-librsvg --enable-librubberband --enable-libsnappy --