In [3]:
import os
import shutil
from collections import deque

import gymnasium as gym
import numpy as np
import seaborn as sns
import torch
from gymnasium.wrappers import RecordVideo
from matplotlib import pyplot as plt

from stable_baselines3 import PPO, DQN
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv, VecVideoRecorder, VecNormalize
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.evaluation import evaluate_policy

# Make sure the directory that later cells write videos and checkpoints into
# exists. exist_ok=True makes this a single idempotent call instead of a
# check-then-create sequence, so re-running the cell is always safe.
os.makedirs('output', exist_ok=True)

# VecEnv

In [2]:
class RewardCallback(BaseCallback):
    """
    Custom callback for printing rewards during training.

    Every finished episode reported through the ``infos`` list is printed
    immediately. In addition, each time training crosses a multiple of
    ``print_freq`` timesteps, the mean reward of the episodes collected since
    the previous summary is printed and the buffer is cleared.

    :param print_freq: Number of timesteps between mean-reward summaries.
        Must be strictly positive.
    :param verbose: Verbosity level forwarded to ``BaseCallback``.
    :raises ValueError: If ``print_freq`` is not strictly positive.
    """
    def __init__(self, print_freq=1000, verbose=0):
        super().__init__(verbose)
        if print_freq <= 0:
            raise ValueError(f"print_freq must be positive, got {print_freq}")
        self.print_freq = print_freq
        # Rewards of the episodes finished since the last summary.
        self.episode_rewards = []
        # Index of the last print_freq-sized window that was already handled.
        self._last_summary_window = 0

    def _on_training_start(self) -> None:
        # Re-anchor the window so the callback keeps working when it is reused
        # for another learn() call whose timestep counter restarts or continues.
        self._last_summary_window = self.num_timesteps // self.print_freq

    def _on_step(self) -> bool:
        # Collect (and print) the reward of every episode that just finished.
        # NOTE(review): the 'episode' entry is presumably added by the Monitor
        # wrapper that make_vec_env applies - confirm for custom envs.
        for info in self.locals['infos']:
            if 'episode' in info:
                episode_reward = info['episode']['r']
                self.episode_rewards.append(episode_reward)
                print(f"Step: {self.num_timesteps}, Episode Reward: {episode_reward:.2f}")

        # Print the mean reward whenever a multiple of print_freq is crossed.
        # An exact `num_timesteps % print_freq == 0` test misses boundaries
        # when the counter advances by more than one per call (several
        # parallel environments), so compare window indices instead. With a
        # single environment both checks fire on exactly the same steps.
        current_window = self.num_timesteps // self.print_freq
        if current_window > self._last_summary_window:
            self._last_summary_window = current_window
            if self.episode_rewards:
                mean_reward = np.mean(self.episode_rewards)
                print(f"Step: {self.num_timesteps}, Mean Episode Reward: {mean_reward:.2f}")
                self.episode_rewards = []  # Reset the list

        # Returning True tells the training loop to keep going.
        return True

In [3]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

env_id = "LunarLander-v3"
num_cpu = 1  # Number of processes to use

# Create the vectorized environment.
# Stable Baselines provides the make_vec_env() helper for this.
# You can choose between `DummyVecEnv` (usually faster) and `SubprocVecEnv`.
vec_env = make_vec_env(env_id, n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv)

try:
    model = DQN("MlpPolicy", vec_env, device=device)
    model.learn(
        total_timesteps=5_000,  # integer step count (was the float 5e3)
        progress_bar=True,
        callback=RewardCallback(1000),
    )
    model.save("lunar_dqn")
finally:
    # Always shut the training environment down, even if training fails,
    # so SubprocVecEnv worker processes are not left running.
    vec_env.close()

# load() returns a new model instead of updating the one it is called on, so
# its result must be assigned; otherwise the saved checkpoint is never the
# model that gets evaluated below.
model = DQN.load("lunar_dqn", device=device)

env = make_vec_env(env_id, n_envs=1, seed=0)
# Wrap the env to record a video of the evaluation run.
env = VecVideoRecorder(
    env,
    "output/lunarlander_dqn",
    record_video_trigger=lambda x: x % 1000 == 0,
    video_length=1000,
    name_prefix="dqn-agent")

try:
    mean, std = evaluate_policy(model, env, n_eval_episodes=10)
    print(f"mean reward:{mean:.2f} +/- {std:.2f}")
finally:
    # Close the recorder even if evaluation fails.
    env.close()

cuda


Output()

Saving video to /home/nguyen/Desktop/NLP_RNN/Reinforcement Learning/RL practice/sb3/output/lunarlander_dqn/dqn-agent-step-0-to-step-1000.mp4
MoviePy - Building video /home/nguyen/Desktop/NLP_RNN/Reinforcement Learning/RL practice/sb3/output/lunarlander_dqn/dqn-agent-step-0-to-step-1000.mp4.
MoviePy - Writing video /home/nguyen/Desktop/NLP_RNN/Reinforcement Learning/RL practice/sb3/output/lunarlander_dqn/dqn-agent-step-0-to-step-1000.mp4



                                                                           

MoviePy - Done !
MoviePy - video ready /home/nguyen/Desktop/NLP_RNN/Reinforcement Learning/RL practice/sb3/output/lunarlander_dqn/dqn-agent-step-0-to-step-1000.mp4
mean reward:-136.58 +/- 82.16


# Testing with MuJoCo HalfCheetah-v4

In [4]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
env_id = "HalfCheetah-v4"
num_cpu = 4

# Training environment: 4 parallel copies wrapped in VecNormalize, with
# observation and reward normalization enabled and observations clipped at 10.0.
vec_env = make_vec_env(env_id, n_envs=num_cpu, seed=0)
vec_env = VecNormalize(vec_env, norm_obs=True, norm_reward=True, clip_obs=10.0)

try:
    model = PPO("MlpPolicy", vec_env, device=device)
    # Integer step count (was the float 1e6).
    model.learn(total_timesteps=1_000_000, progress_bar=True, log_interval=10)

    # Save both the policy and the normalization wrapper's state; the
    # evaluation below reloads the two files together.
    model.save("output/halfcheetah_ppo")
    vec_env.save("output/halfcheetah_ppo_env")
finally:
    # Release the training environments even if training or saving fails.
    vec_env.close()

# Load the trained agent and normalized env
vec_env = make_vec_env(env_id, n_envs=1)
vec_env = VecNormalize.load("output/halfcheetah_ppo_env", vec_env)

# Evaluation mode: stop training-mode updates on the wrapper and turn reward
# normalization off for the reported scores.
vec_env.training = False
vec_env.norm_reward = False

try:
    model = PPO.load("output/halfcheetah_ppo", env=vec_env, device=device)

    mean_reward, std_reward = evaluate_policy(model, vec_env, n_eval_episodes=10)
    print(f"mean_reward:{mean_reward:.2f} +/- {std_reward:.2f}")
finally:
    # Close the evaluation environment even if loading or evaluation fails.
    vec_env.close()


  logger.deprecation(


Output()



  logger.deprecation(


mean_reward:1722.30 +/- 17.64
