In [1]:
import os
import gymnasium as gym
import torch

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder, SubprocVecEnv

In [12]:
# -----------------------------
# Sanity checks
# -----------------------------
# Report the accelerator when one is present, but do not abort without it:
# the PPO model in this notebook is created with device="cpu", so a missing
# GPU is not an error for this workflow.
cuda_available = torch.cuda.is_available()
if cuda_available:
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    print("CUDA not available; continuing on CPU")

Using GPU: Tesla V100-SXM2-16GB


In [6]:
# -----------------------------
# Paths
# -----------------------------
# Output directory for recorded videos, relative to the current working
# directory. Created up front; exist_ok makes re-running this cell harmless.
VIDEO_DIR = "./rl_videos"
os.makedirs(name=VIDEO_DIR, exist_ok=True)

In [13]:
# -----------------------------
# Base environment (for training)
# -----------------------------
def make_env():
    """Create and return a new CartPole-v1 environment with default options.

    Takes no arguments so it can be handed to a vectorized-env constructor
    as a factory; every call builds a separate environment instance.
    """
    cartpole_env = gym.make("CartPole-v1")
    return cartpole_env

n_envs = 32  # number of parallel worker processes; tune to the host's CPU core count
# Fixed seed passed to PPO so repeated runs start from the same random state.
# NOTE: SB3 documents that exact reproducibility is not guaranteed across
# platforms or PyTorch versions.
SEED = 0

env = SubprocVecEnv([make_env for _ in range(n_envs)])

# Everything that happens after the worker processes are spawned sits inside
# try/finally, so the subprocesses are always shut down even when model
# construction, training, or saving raises.
try:
    model = PPO(
        "MlpPolicy",
        env,
        device="cpu",
        # Samples collected per update = n_steps * n_envs = 1024 * 32 = 32768,
        # which batch_size=2048 divides evenly (16 minibatches per epoch).
        n_steps=1024,
        batch_size=2048,
        seed=SEED,
        verbose=1,
    )

    # -----------------------------
    # Train
    # -----------------------------
    model.learn(total_timesteps=200_000)
    model.save("ppo_cartpole")
finally:
    env.close()

Using cpu device
------------------------------
| time/              |       |
|    fps             | 14987 |
|    iterations      | 1     |
|    time_elapsed    | 2     |
|    total_timesteps | 32768 |
------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 12734       |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 65536       |
| train/                  |             |
|    approx_kl            | 0.007761392 |
|    clip_fraction        | 0.0933      |
|    clip_range           | 0.2         |
|    entropy_loss         | -0.686      |
|    explained_variance   | 0.00019     |
|    learning_rate        | 0.0003      |
|    loss                 | 24          |
|    n_updates            | 10          |
|    policy_gradient_loss | -0.0142     |
|    value_loss           | 74.6        |
-----------------------------------------
----------

In [15]:
from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder

def make_eval_env():
    """Create and return a new CartPole-v1 environment with render_mode="rgb_array".

    Takes no arguments so it can be used as an environment factory; the
    render mode is requested so that frames are available for video capture.
    """
    render_env = gym.make("CartPole-v1", render_mode="rgb_array")
    return render_env

def _record_from_first_step(step):
    """Recording trigger: true only for step 0, i.e. start one video at the beginning."""
    return step == 0


# Single-environment vectorized wrapper around the evaluation env.
base_eval_env = DummyVecEnv([make_eval_env])

# Recorder wrapper: writes into VIDEO_DIR, starts when the trigger fires and
# is configured for a video of at most 500 steps.
eval_env = VecVideoRecorder(
    base_eval_env,
    VIDEO_DIR,
    record_video_trigger=_record_from_first_step,
    video_length=500,
    name_prefix="ppo-cartpole",
)

# -----------------------------
# Run one episode and record
# -----------------------------
# The recorder is closed in a finally block so it is released even when
# reset/predict/step raises. In the successful run captured below, the video
# file is written at close time.
try:
    obs = eval_env.reset()

    # Step cap of 500 matches the recorder's video_length; stop early as soon
    # as the (single) environment reports the episode is done.
    for _ in range(500):
        action, _ = model.predict(obs, deterministic=True)
        obs, _, dones, _ = eval_env.step(action)
        if dones.any():
            break
finally:
    eval_env.close()

# Only reached when the rollout finished without raising.
print(f"Video saved to {VIDEO_DIR}")

Saving video to /home/shreyak/rl_videos/ppo-cartpole-step-0-to-step-500.mp4
MoviePy - Building video /home/shreyak/rl_videos/ppo-cartpole-step-0-to-step-500.mp4.
MoviePy - Writing video /home/shreyak/rl_videos/ppo-cartpole-step-0-to-step-500.mp4



                                                                                                                                                                                      

MoviePy - Done !
MoviePy - video ready /home/shreyak/rl_videos/ppo-cartpole-step-0-to-step-500.mp4
Video saved to ./rl_videos


