### Learning stable baselines

In [1]:
import gymnasium as gym
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.vec_env import SubprocVecEnv, VecMonitor
from stable_baselines3.common.env_util import make_vec_env
import torch
import os
import multiprocessing
import numpy as np    
os.environ["MKL_SERVICE_FORCE_INTEL"] = '1'

In [2]:

# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Set number of parallel environments
num_cpu = multiprocessing.cpu_count()
num_envs = max(1, min(4, num_cpu // 2))  # Use at most 4 or half the cores
print(f"Using {num_envs} environments with {num_cpu} CPU cores available")
env_id = "LunarLander-v3"

Using device: cuda
Using 4 environments with 20 CPU cores available


In [13]:
env = gym.vector.AsyncVectorEnv([lambda: gym.make(env_id) for _ in range(num_envs)])
env.reset()
actions = env.action_space.sample()

state, rewards, terminated, truncated, infos = env.step(actions)
print(state.shape)
print(rewards.shape)
print(terminated.shape)
print(truncated.shape)
print(infos)

env.close()

(4, 8)
(4,)
(4,)
(4,)
{}


In [15]:

# Define custom neural network architecture
policy_kwargs = dict(
    net_arch=[256, 256, 128],  # Three hidden layers
    activation_fn=torch.nn.ReLU,
    optimizer_class=torch.optim.Adam,
    optimizer_kwargs=dict(eps=1e-5),
)

# Create and train the model
model = DQN(
    'MlpPolicy', 
    env, 
    policy_kwargs=policy_kwargs,
    learning_rate=1e-4,
    buffer_size=100000,
    learning_starts=1000,
    batch_size=64,
    tau=0.005,
    gamma=0.99,
    train_freq=4,
    gradient_steps=1,
    target_update_interval=1000,
    exploration_fraction=0.1,
    exploration_final_eps=0.05,
    verbose=1,
    device=device,
    tensorboard_log="./dqn_lunar_tensorboard/"
)


Using cuda device


ValueError: The environment is of type <class 'gymnasium.vector.async_vector_env.AsyncVectorEnv'>, not a Gymnasium environment. In this case, we expect OpenAI Gym to be installed and the environment to be an OpenAI Gym environment.

In [None]:

# Train the agent
total_timesteps = int(2e5)  # Adjust as needed
model.learn(total_timesteps=total_timesteps, progress_bar=True)

# Save the trained model
model.save('dqn_lunar_custom')
print("Model saved successfully")

# Test the trained agent
print("Testing trained agent...")
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Clean up
env.close()