In [None]:
import os
import shutil
from collections import deque

import gymnasium as gym
import numpy as np
import seaborn as sns
import torch
from gymnasium.wrappers import RecordVideo
from matplotlib import pyplot as plt

from stable_baselines3 import PPO, DQN
from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.utils import set_random_seed
from stable_baselines3.common.evaluation import evaluate_policy

# Make sure the folder that receives recorded videos exists. `exist_ok=True`
# makes this idempotent on re-runs and avoids the check-then-create race of
# testing for the directory first.
os.makedirs('output', exist_ok=True)

In [18]:
def build_env(env_id='LunarLander-v3', record_name=None):
    """Create a Gymnasium environment, optionally wrapped to record videos.

    Args:
        env_id: Gymnasium environment id passed to ``gym.make``.
        record_name: Name of the sub-folder of ``output`` that receives the
            videos. ``None`` or ``""`` disables recording.

    Returns:
        The environment created with ``render_mode="rgb_array"``; when
        ``record_name`` is given it is wrapped in ``RecordVideo``.
    """
    env = gym.make(env_id, render_mode="rgb_array")

    # None and "" both mean "no recording": hand back the bare environment.
    if not record_name:
        return env

    return RecordVideo(
        env,
        video_folder=os.path.join("output", record_name),
        episode_trigger=lambda episode_id: True,  # Record every episode
        name_prefix="training",
        video_length=3000,  # Maximum number of steps to record per episode
    )




In [None]:
# Smoke test: train a DQN agent briefly on a single environment, round-trip it
# through save/load, then evaluate it.
SEED = 42
TRAIN_TIMESTEPS = 1000
ROLLOUT_STEPS = 1000

# Build the environment exactly once, through the shared helper.
env = build_env()

# Passing the seed to the algorithm seeds the policy, the action space and the
# environment together, so no manual `env.reset(seed=...)` is needed.
model = DQN("MlpPolicy", env, seed=SEED)
model.learn(total_timesteps=TRAIN_TIMESTEPS, progress_bar=True)

# `load` is a classmethod that returns a NEW model, hence the re-assignment.
model.save('test_dqn')
model = DQN.load('test_dqn', env=env)

# Stable-Baselines3 wraps the raw env (Monitor + DummyVecEnv); evaluate and
# roll out on that wrapped env so episode statistics are tracked correctly.
vec_env = model.get_env()
try:
    mean_reward, std_reward = evaluate_policy(model, vec_env, n_eval_episodes=10)
    print(f"evaluate_policy: mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

    obs = vec_env.reset()
    total = 0.0
    episodes_finished = 0
    for _ in range(ROLLOUT_STEPS):
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = vec_env.step(action)
        # The VecEnv holds a single env, so index 0 is its reward/done flag.
        # It auto-resets on episode end, so the sum may span several episodes.
        total += float(rewards[0])
        episodes_finished += int(dones[0])

    # Report the accumulated reward, not just the reward of the last step.
    print(
        f"total reward over {ROLLOUT_STEPS} steps: {total:.2f} "
        f"({episodes_finished} episode(s) finished)"
    )
finally:
    # Release the environment's resources even if training/evaluation fails.
    vec_env.close()



Output()

# VecEnv

In [None]:
# Train DQN on several LunarLander environments running in parallel
# processes, save the model, reload it and run a short rollout.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

env_id = "LunarLander-v3"
num_cpu = 4  # Number of processes to use

# Stable Baselines provides the make_vec_env() helper to build the
# vectorized environment in one call.
# You can choose between `DummyVecEnv` (usually faster) and `SubprocVecEnv`
vec_env = make_vec_env(env_id, n_envs=num_cpu, seed=0, vec_env_cls=SubprocVecEnv)

try:
    # Seed the model with the same seed as the environments for reproducibility.
    model = DQN("MlpPolicy", vec_env, seed=0, device=device)
    # total_timesteps is an integer step count, so avoid the float literal 1e6.
    model.learn(total_timesteps=1_000_000, progress_bar=True)

    model.save("lunar_dqn")

    # `load` is a classmethod that returns a NEW model; it does not load in
    # place, so the result must be assigned back to `model`.
    model = DQN.load("lunar_dqn", env=vec_env, device=device)

    obs = vec_env.reset()
    print(obs.shape)
    # One running reward sum per parallel environment.
    total_rewards = np.zeros(obs.shape[0])

    for _ in range(10):
        # Deterministic actions: this rollout evaluates the learned policy.
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = vec_env.step(action)
        print(rewards)
        total_rewards += rewards

    print('np average ', np.average(total_rewards))
finally:
    # Shut down the worker processes; otherwise every re-run of this cell
    # leaves `num_cpu` orphaned subprocesses behind.
    vec_env.close()



Output()