In [1]:
import os
import gymnasium as gym
from stable_baselines3 import PPO, DQN, A2C
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common import results_plotter
from stable_baselines3.common.callbacks import EvalCallback

import time

In [5]:
# Output folders: one for training logs, one for saved model checkpoints.
log_dir, model_dir = "./logs/", "./models/"
# exist_ok=True lets this cell be re-run without raising when the folders exist.
os.makedirs(name=log_dir, exist_ok=True)
os.makedirs(name=model_dir, exist_ok=True)

In [None]:

# Train PPO, DQN and A2C on LunarLander and save one checkpoint per algorithm.
#
# Every algorithm gets its own freshly created, Monitor-wrapped environment.
# Reusing a single Monitor across models breaks: the previous model leaves the
# env mid-episode, and the next model's learn() starts with a reset, which
# Monitor rejects with a RuntimeError unless allow_early_resets=True.
for algorithm in [PPO, DQN, A2C]:
    algo_name = algorithm.__name__
    # One monitor file per algorithm so episode statistics are not mixed together.
    env = Monitor(
        gym.make("LunarLander-v2", render_mode="rgb_array"),
        os.path.join(log_dir, f"{algo_name}.monitor.csv"),
    )
    try:
        model = algorithm("MlpPolicy", env, tensorboard_log=log_dir, verbose=0)
        model.learn(total_timesteps=10000, progress_bar=True)
        model.save(f"{model_dir}{algo_name}_LunarLander")
        # Drop the trained model before building the next one.
        del model
    finally:
        # Release the environment (and flush the monitor log) even if training fails.
        env.close()

In [2]:
def test_model(model):
    env = gym.make("LunarLander-v2", render_mode="human")
    obs, _ = env.reset()

    done = False
    total_reward = 0
    while not done:
        action, _ = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        done = terminated or truncated
        env.render()
    return total_reward

In [7]:
# Restore the PPO checkpoint stored under model_dir.
model = PPO.load(
    path=f"{model_dir}PPO_LunarLander",
)

In [8]:
# Play one episode with the loaded model; the cell displays the episode's total reward.
test_model(model=model)

-1233.403421427631