In [2]:
import datetime
import json
import logging
import os
import time
from multiprocessing.pool import Pool
from pathlib import Path
import numpy as np
from tensorboardX import SummaryWriter
from gymnasium.wrappers import RecordVideo, RecordEpisodeStatistics, capped_cubic_video_schedule

import rl_agents.trainer.logger
from rl_agents.agents.common.factory import load_environment, load_agent
from rl_agents.agents.common.graphics import AgentGraphics
from rl_agents.agents.common.memory import Transition
from rl_agents.utils import near_split, zip_with_singletons
from rl_agents.configuration import serialize
from rl_agents.trainer.graphics import RewardViewer

logger = logging.getLogger(__name__)

import torch.optim as optim



class Evaluation(object):
    """
        The evaluation of an agent interacting with an environment to maximize its expected reward.
    """

    OUTPUT_FOLDER = 'out'
    SAVED_MODELS_FOLDER = 'saved_models'
    RUN_FOLDER = 'run_{}_{}'
    METADATA_FILE = 'metadata.{}.json'
    LOGGING_FILE = 'logging.{}.log'

    def __init__(self,
                 env,
                 agent,
                 directory=None,
                 run_directory=None,
                 num_episodes=1000,
                 training=True,
                 sim_seed=None,
                 recover=None,
                 display_env=True,
                 display_agent=True,
                 display_rewards=True,
                 close_env=True,
                 step_callback_fn=None):
        """

        :param env: The environment to be solved, possibly wrapping an AbstractEnv environment
        :param AbstractAgent agent: The agent solving the environment
        :param Path directory: Workspace directory path
        :param Path run_directory: Run directory path
        :param int num_episodes: Number of episodes run
        !param training: Whether the agent is being trained or tested
        :param sim_seed: The seed used for the environment/agent randomness source
        :param recover: Recover the agent parameters from a file.
                        - If True, it the default latest save will be used.
                        - If a string, it will be used as a path.
        :param display_env: Render the environment, and have a monitor recording its videos
        :param display_agent: Add the agent graphics to the environment viewer, if supported
        :param display_rewards: Display the performances of the agent through the episodes
        :param close_env: Should the environment be closed when the evaluation is closed
        :param step_callback_fn: A callback function called after every environment step. It takes the following
               arguments: (episode, env, agent, transition, writer).

        """
        self.env = env
        self.agent = agent
        self.num_episodes = num_episodes
        self.training = training
        self.sim_seed = sim_seed if sim_seed is not None else np.random.randint(0, 1e6)
        self.close_env = close_env
        self.display_env = display_env
        self.step_callback_fn = step_callback_fn

        self.directory = Path(directory or self.default_directory)
        self.run_directory = self.directory / (run_directory or self.default_run_directory)
        # self.wrapped_env = RecordVideo(env,
        #                                self.run_directory,
        #                                episode_trigger=(None if self.display_env else lambda e: False))
        # try:
        #     self.wrapped_env.unwrapped.set_record_video_wrapper(self.wrapped_env)
        # except AttributeError:
        #     pass
        # self.wrapped_env = RecordEpisodeStatistics(self.wrapped_env)
        self.wrapped_env = RecordEpisodeStatistics(env)
        self.episode = 0
        self.writer = SummaryWriter(str(self.run_directory))
        self.agent.set_writer(self.writer)
        self.agent.evaluation = self
        self.write_logging()
        self.write_metadata()
        self.filtered_agent_stats = 0
        self.best_agent_stats = -np.inf, 0

        self.recover = recover
        if self.recover:
            self.load_agent_model(self.recover)

        if display_agent:
            try:
                self.env.reset()
                # Render the agent within the environment viewer, if supported
                self.env.render()
                self.env.unwrapped.viewer.directory = self.run_directory
                self.env.unwrapped.viewer.set_agent_display(
                    lambda agent_surface, sim_surface: AgentGraphics.display(self.agent, agent_surface, sim_surface))
                self.env.unwrapped.viewer.directory = self.run_directory
            except AttributeError:
                logger.info("The environment viewer doesn't support agent rendering.")
        self.reward_viewer = None
        if display_rewards:
            self.reward_viewer = RewardViewer()
        self.observation = None

    def train(self):
        self.training = True
        if getattr(self.agent, "batched", False):
            self.run_batched_episodes()
        else:
            self.run_episodes()
        self.close()

    def test(self):
        """
        Test the agent.

        If applicable, the agent model should be loaded before using the recover option.
        """
        self.training = False
        if self.display_env:
            self.wrapped_env.episode_trigger = lambda e: True
        try:
            self.agent.eval()
        except AttributeError:
            pass
        self.run_episodes()
        self.close()

    def run_episodes(self):
        for self.episode in range(self.num_episodes):
            # Run episode
            terminal = False
            self.reset(seed=self.episode)
            rewards = []
            start_time = time.time()
            while not terminal:
                # Step until a terminal step is reached
                reward, terminal = self.step()
                rewards.append(reward)

                # Periodically render
                self.env.render()
                time.sleep(0.01)

                # Catch interruptions
                try:
                    if self.env.unwrapped.done:
                        break
                except AttributeError:
                    pass

            # End of episode
            duration = time.time() - start_time
            self.after_all_episodes(self.episode, rewards, duration)
            self.after_some_episodes(self.episode, rewards)


    def step(self):
        """
            Plan a sequence of actions according to the agent policy, and step the environment accordingly.
        """
        # Query agent for actions sequence
        actions = self.agent.plan(self.observation)
        if not actions:
            raise Exception("The agent did not plan any action")

        # Forward the actions to the environment viewer
        try:
            self.env.unwrapped.viewer.set_agent_action_sequence(actions)
        except AttributeError:
            pass

        # Step the environment
        previous_observation, action = self.observation, actions[0]
        transition = self.wrapped_env.step(action)
        self.observation, reward, done, truncated, info = transition
        terminal = done or truncated

        # Call callback
        if self.step_callback_fn is not None:
            self.step_callback_fn(self.episode, self.wrapped_env, self.agent, transition, self.writer)

        # Record the experience.
        try:
            self.agent.record(previous_observation, action, reward, self.observation, done, info)
        except NotImplementedError:
            pass

        return reward, terminal

    def run_batched_episodes(self):
        """
            Alternatively,
            - run multiple sample-collection jobs in parallel
            - update model
        """
        episode = 0
        episode_duration = 14  # TODO: use a fixed number of samples instead
        batch_sizes = near_split(self.num_episodes * episode_duration, size_bins=self.agent.config["batch_size"])
        self.agent.reset()
        for batch, batch_size in enumerate(batch_sizes):
            logger.info("[BATCH={}/{}]---------------------------------------".format(batch+1, len(batch_sizes)))
            logger.info("[BATCH={}/{}][run_batched_episodes] #samples={}".format(batch+1, len(batch_sizes),
                                                                                 len(self.agent.memory)))
            logger.info("[BATCH={}/{}]---------------------------------------".format(batch+1, len(batch_sizes)))
            # Save current agent
            model_path = self.save_agent_model(identifier=batch)

            # Prepare workers
            env_config, agent_config = serialize(self.env), serialize(self.agent)
            cpu_processes = self.agent.config["processes"] or os.cpu_count()
            workers_sample_counts = near_split(batch_size, cpu_processes)
            workers_starts = list(np.cumsum(np.insert(workers_sample_counts[:-1], 0, 0)) + np.sum(batch_sizes[:batch]))
            base_seed = batch * cpu_processes
            workers_seeds = [base_seed + i for i in range(cpu_processes)]
            workers_params = list(zip_with_singletons(env_config,
                                                      agent_config,
                                                      workers_sample_counts,
                                                      workers_starts,
                                                      workers_seeds,
                                                      model_path,
                                                      batch))

            # Collect trajectories
            logger.info("Collecting {} samples with {} workers...".format(batch_size, cpu_processes))
            if cpu_processes == 1:
                results = [Evaluation.collect_samples(*workers_params[0])]
            else:
                with Pool(processes=cpu_processes) as pool:
                    results = pool.starmap(Evaluation.collect_samples, workers_params)
            trajectories = [trajectory for worker in results for trajectory in worker]

            # Fill memory
            for trajectory in trajectories:
                if trajectory[-1].terminal:  # Check whether the episode was properly finished before logging
                    self.after_all_episodes(episode, [transition.reward for transition in trajectory])
                episode += 1
                [self.agent.record(*transition) for transition in trajectory]

            # Fit model
            self.agent.update()

    @staticmethod
    def collect_samples(environment_config, agent_config, count, start_time, seed, model_path, batch):
        """
            Collect interaction samples of an agent / environment pair.

            Note that the last episode may not terminate, when enough samples have been collected.

        :param dict environment_config: the environment configuration
        :param dict agent_config: the agent configuration
        :param int count: number of samples to collect
        :param start_time: the initial local time of the agent
        :param seed: the env/agent seed
        :param model_path: the path to load the agent model from
        :param batch: index of the current batch
        :return: a list of trajectories, i.e. lists of Transitions
        """
        env = load_environment(environment_config)

        if batch == 0:  # Force pure exploration during first batch
            agent_config["exploration"]["final_temperature"] = 1
        agent_config["device"] = "cpu"
        agent = load_agent(agent_config, env)
        agent.load(model_path)
        agent.seed(seed)
        agent.set_time(start_time)

        state = env.reset(seed=seed)
        episodes = []
        trajectory = []
        for _ in range(count):
            action = agent.act(state)
            next_state, reward, done, info = env.step(action)
            trajectory.append(Transition(state, action, reward, next_state, done, info))
            if done:
                state = env.reset()
                episodes.append(trajectory)
                trajectory = []
            else:
                state = next_state
        if trajectory:  # Unfinished episode
            episodes.append(trajectory)
        env.close()
        return episodes

    def save_agent_model(self, identifier, do_save=True):
        # Create the folder if it doesn't exist
        permanent_folder = self.directory / self.SAVED_MODELS_FOLDER
        os.makedirs(permanent_folder, exist_ok=True)

        episode_path = None
        if do_save:
            episode_path = Path(self.run_directory) / "checkpoint-{}.tar".format(identifier)
            try:
                self.agent.save(filename=permanent_folder / "latest.tar")
                episode_path = self.agent.save(filename=episode_path)
                if episode_path:
                    logger.info("Saved {} model to {}".format(self.agent.__class__.__name__, episode_path))
            except NotImplementedError:
                pass
        return episode_path

    def load_agent_model(self, model_path):
        if model_path is True:
            model_path = self.directory / self.SAVED_MODELS_FOLDER / "latest.tar"
        if isinstance(model_path, str):
            model_path = Path(model_path)
            if not model_path.exists():
                model_path = self.directory / self.SAVED_MODELS_FOLDER / model_path
        try:
            model_path = self.agent.load(filename=model_path)
            if model_path:
                logger.info("Loaded {} model from {}".format(self.agent.__class__.__name__, model_path))
        except FileNotFoundError:
            logger.warning("No pre-trained model found at the desired location.")
        except NotImplementedError:
            pass

    def after_all_episodes(self, episode, rewards, duration):
        rewards = np.array(rewards)
        gamma = self.agent.config.get("gamma", 1)
        self.writer.add_scalar('episode/length', len(rewards), episode)
        self.writer.add_scalar('episode/total_reward', sum(rewards), episode)
        self.writer.add_scalar('episode/return', sum(r*gamma**t for t, r in enumerate(rewards)), episode)
        self.writer.add_scalar('episode/fps', len(rewards) / max(duration, 1e-6), episode)
        self.writer.add_histogram('episode/rewards', rewards, episode)
        logger.info("Episode {} score: {:.1f}".format(episode, sum(rewards)))

    def after_some_episodes(self, episode, rewards,
                            best_increase=1.1,
                            episodes_window=50):
        if capped_cubic_video_schedule(episode):
            # Save the model
            if self.training:
                self.save_agent_model(episode)

        if self.training:
            # Save best model so far, averaged on a window
            best_reward, best_episode = self.best_agent_stats
            self.filtered_agent_stats += 1 / episodes_window * (np.sum(rewards) - self.filtered_agent_stats)
            if self.filtered_agent_stats > best_increase * best_reward \
                    and episode >= best_episode + episodes_window:
                self.best_agent_stats = (self.filtered_agent_stats, episode)
                self.save_agent_model("best")

    @property
    def default_directory(self):
        return Path(self.OUTPUT_FOLDER) / self.env.unwrapped.__class__.__name__ / self.agent.__class__.__name__

    @property
    def default_run_directory(self):
        return self.RUN_FOLDER.format(datetime.datetime.now().strftime('%Y%m%d-%H%M%S'), os.getpid())

    def write_metadata(self):
        metadata = dict(env=serialize(self.env), agent=serialize(self.agent))
        file_infix = '{}.{}'.format(id(self.wrapped_env), os.getpid())
        file = self.run_directory / self.METADATA_FILE.format(file_infix)
        with file.open('w') as f:
            json.dump(metadata, f, sort_keys=True, indent=4)

    def write_logging(self):
        file_infix = '{}.{}'.format(id(self.wrapped_env), os.getpid())
        rl_agents.trainer.logger.configure()
        rl_agents.trainer.logger.add_file_handler(self.run_directory / self.LOGGING_FILE.format(file_infix))

    def reset(self, seed=0):
        seed = self.sim_seed + seed if self.sim_seed is not None else None
        self.observation, info = self.wrapped_env.reset()
        self.agent.seed(seed)  # Seed the agent with the main environment seed
        self.agent.reset()

    def close(self):
        """
            Close the evaluation.
        """
        if self.training:
            self.save_agent_model("final")
        self.wrapped_env.close()
        self.writer.close()
        if self.close_env:
            self.env.close()

In [3]:
import logging
import torch
from gymnasium import spaces

from rl_agents.agents.common.memory import Transition
from rl_agents.agents.common.models import model_factory, size_model_config, trainable_parameters
from rl_agents.agents.common.optimizers import loss_function_factory, optimizer_factory
from rl_agents.agents.common.utils import choose_device
from rl_agents.agents.deep_q_network.abstract import AbstractDQNAgent

logger = logging.getLogger(__name__)


class DQNAgent(AbstractDQNAgent):
    def __init__(self, env, config=None):
        super(DQNAgent, self).__init__(env, config)
        size_model_config(self.env, self.config["model"])
        self.value_net = model_factory(self.config["model"])
        self.target_net = model_factory(self.config["model"])
        self.target_net.load_state_dict(self.value_net.state_dict())
        self.target_net.eval()
        logger.debug("Number of trainable parameters: {}".format(trainable_parameters(self.value_net)))
        self.device = choose_device(self.config["device"])
        self.value_net.to(self.device)
        self.target_net.to(self.device)
        self.loss_function = loss_function_factory(self.config["loss_function"])
        self.optimizer = optimizer_factory(self.config["optimizer"]["type"],
                                           self.value_net.parameters(),
                                           **self.config["optimizer"])
        self.steps = 0

    def step_optimizer(self, loss):
        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.value_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()

    def compute_bellman_residual(self, batch, target_state_action_value=None):
        # Compute concatenate the batch elements
        if not isinstance(batch.state, torch.Tensor):
            # logger.info("Casting the batch to torch.tensor")
            state = torch.cat(tuple(torch.tensor([batch.state], dtype=torch.float))).to(self.device)
            action = torch.tensor(batch.action, dtype=torch.long).to(self.device)
            reward = torch.tensor(batch.reward, dtype=torch.float).to(self.device)
            next_state = torch.cat(tuple(torch.tensor([batch.next_state], dtype=torch.float))).to(self.device)
            terminal = torch.tensor(batch.terminal, dtype=torch.bool).to(self.device)
            batch = Transition(state, action, reward, next_state, terminal, batch.info)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.value_net(batch.state)
        state_action_values = state_action_values.gather(1, batch.action.unsqueeze(1)).squeeze(1)

        if target_state_action_value is None:
            with torch.no_grad():
                # Compute V(s_{t+1}) for all next states.
                next_state_values = torch.zeros(batch.reward.shape).to(self.device)
                if self.config["double"]:
                    # Double Q-learning: pick best actions from policy network
                    _, best_actions = self.value_net(batch.next_state).max(1)
                    # Double Q-learning: estimate action values from target network
                    best_values = self.target_net(batch.next_state).gather(1, best_actions.unsqueeze(1)).squeeze(1)
                else:
                    best_values, _ = self.target_net(batch.next_state).max(1)
                next_state_values[~batch.terminal] = best_values[~batch.terminal]
                # Compute the expected Q values
                target_state_action_value = batch.reward + self.config["gamma"] * next_state_values

        # Compute loss
        loss = self.loss_function(state_action_values, target_state_action_value)
        return loss, target_state_action_value, batch

    def get_batch_state_values(self, states):
        values, actions = self.value_net(torch.tensor(states, dtype=torch.float).to(self.device)).max(1)
        return values.data.cpu().numpy(), actions.data.cpu().numpy()

    def get_batch_state_action_values(self, states):
        return self.value_net(torch.tensor(states, dtype=torch.float).to(self.device)).data.cpu().numpy()

    def save(self, filename):
        state = {'state_dict': self.value_net.state_dict(),
                 'optimizer': self.optimizer.state_dict()}
        torch.save(state, filename)
        return filename

    def load(self, filename):
        checkpoint = torch.load(filename, map_location=self.device)
        self.value_net.load_state_dict(checkpoint['state_dict'])
        self.target_net.load_state_dict(checkpoint['state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
        return filename

    def initialize_model(self):
        self.value_net.reset()

    def set_writer(self, writer):
        super().set_writer(writer)
        obs_shape = self.env.observation_space.shape if isinstance(self.env.observation_space, spaces.Box) else \
            self.env.observation_space.spaces[0].shape
        model_input = torch.zeros((1, *obs_shape), dtype=torch.float, device=self.device)
        self.writer.add_graph(self.value_net, input_to_model=(model_input,)),
        self.writer.add_scalar("agent/trainable_parameters", trainable_parameters(self.value_net), 0)

## 1st Attempt (ddqn1)

In [2]:
import gymnasium
import highway_env
import numpy as np


# Initialize environment
env = gymnasium.make("highway-fast-v0", render_mode='rgb_array')
env.reset()  # Ensure the environment is reset before setting up evaluation

# Define configuration for DQN
config = {
    "model": {
        "type": "MultiLayerPerceptron",
        "layers": [128, 128]  # Smaller architecture for efficiency
    },
    "gamma": 0.9,  # Slightly higher discount factor for better long-term focus
    "n_steps": 1,
    "batch_size": 64,  # Larger batch size for more stable updates
    "memory_capacity": 10000,  # Reduced to save memory
    "learning_starts": 1000,  # Start training after 1000 steps to collect sufficient experience
    "learning_rate": 1e-3,  # Slightly higher learning rate for faster convergence
    "train_frequency": 4,  # Update every 4 steps
    "target_update": 100,  # Update target network less frequently to stabilize training
    "double": True,  # Enable Double DQN to reduce overestimation
    "exploration": {
        "method": "EpsilonGreedy",
        "tau": 10000,  # Slower epsilon decay for prolonged exploration
        "temperature": 1.0,
        "final_temperature": 0.05  # Lower final epsilon for better exploitation
    },
    "device": "mps" if torch.backends.mps.is_available() else "cpu"  # Use MPS on Mac M1 if available
}

# Instantiate the DQN agent with the environment and configuration
agent = DQNAgent(env, config)

# Set up the evaluation instance
evaluation = Evaluation(env, agent, directory="highway_ddqn/", num_episodes=1000, display_env=False)

# Train the agent
evaluation.train()


NameError: name 'torch' is not defined

### Hyperparameter Tuning

In [None]:
import optuna
import gymnasium
import highway_env
import numpy as np
import torch

def objective(trial):
    # Define the hyperparameter search space
    layers = trial.suggest_categorical("layers", [[64, 64], [128, 128], [256, 256]])
    gamma = trial.suggest_float("gamma", 0.7, 0.99)
    batch_size = trial.suggest_int("batch_size", 16, 128)
    memory_capacity = trial.suggest_int("memory_capacity", 5000, 20000)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
    tau = trial.suggest_int("tau", 1000, 10000)
    final_temperature = trial.suggest_float("final_temperature", 0.01, 0.1)

    # Set up environment
    env = gymnasium.make("highway-fast-v0", render_mode='rgb_array')
    env.reset()

    # Configure the agent with the current trial's parameters
    config = {
        "model": {"type": "MultiLayerPerceptron", "layers": layers},
        "gamma": gamma,
        "batch_size": batch_size,
        "memory_capacity": memory_capacity,
        "learning_starts": 500,
        "learning_rate": learning_rate,
        "train_frequency": 4,
        "target_update": 200,
        "double": True,
        "exploration": {
            "method": "EpsilonGreedy",
            "tau": tau,
            "temperature": 1.0,
            "final_temperature": final_temperature,
        },
        "device": "mps" if torch.backends.mps.is_available() else "cpu",
    }

    # Initialize agent and evaluation
    agent = DQNAgent(env, config)
    evaluation = Evaluation(env, agent, directory="highway_ddqn/", num_episodes=100, display_env=False)

    # Train the agent and retrieve the average return
    evaluation.train()
    avg_reward = evaluation.filtered_agent_stats  # or any other final metric you want

    # Close environments and resources
    evaluation.close()
    return avg_reward


In [None]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)  # Runs 50 trials to find the best config
print("Best hyperparameters:", study.best_params)
print("Best average reward:", study.best_value)

### Model Training After Tuning

In [None]:
import gymnasium
import highway_env
import numpy as np


# Initialize environment
env = gymnasium.make("highway-fast-v0", render_mode='rgb_array')
env.reset()  # Ensure the environment is reset before setting up evaluation

# Define optimized configuration for DQN from Optuna tuning
config = {
    "model": {
        "type": "MultiLayerPerceptron",
        "layers": [256, 256]  # Optuna-selected architecture
    },
    "gamma": 0.9859297882341389,  # Optuna-selected gamma for better long-term focus
    "n_steps": 1,
    "batch_size": 38,  # Optuna-selected batch size for more stable updates
    "memory_capacity": 8068,  # Optuna-selected memory capacity for efficient training
    "learning_starts": 1000,  # Start training after 1000 steps to collect sufficient experience
    "learning_rate": 0.0002817553371974497,  # Optuna-selected learning rate for balanced convergence
    "train_frequency": 4,  # Update every 4 steps
    "target_update": 100,  # Update target network less frequently to stabilize training
    "double": True,  # Enable Double DQN to reduce overestimation
    "exploration": {
        "method": "EpsilonGreedy",
        "tau": 1215,  # Optuna-selected epsilon decay rate for prolonged exploration
        "temperature": 1.0,
        "final_temperature": 0.030210359431679448  # Optuna-selected final epsilon for better exploitation
    },
    "device": "mps" if torch.backends.mps.is_available() else "cpu"  # Use MPS on Mac M1 if available
}

# Instantiate the DQN agent with the environment and configuration
agent = DQNAgent(env, config)

# Set up the evaluation instance
evaluation = Evaluation(env, agent, directory="highway_ddqn/", num_episodes=300, display_env=False)

# Train the agent
evaluation.train()


## DDQN Turned & Increase highway density for better navigation

In [None]:
import gymnasium
import highway_env
import numpy as np


# Initialize environment
env = gymnasium.make("highway-fast-v0", render_mode='rgb_array', config= {"vehicles_density": 2})
env.reset()  # Ensure the environment is reset before setting up evaluation

# Define optimized configuration for DQN from Optuna tuning
config = {
    "model": {
        "type": "MultiLayerPerceptron",
        "layers": [256, 256]  # Optuna-selected architecture
    },
    "gamma": 0.9859297882341389,  # Optuna-selected gamma for better long-term focus
    "n_steps": 1,
    "batch_size": 38,  # Optuna-selected batch size for more stable updates
    "memory_capacity": 8068,  # Optuna-selected memory capacity for efficient training
    "learning_starts": 1000,  # Start training after 1000 steps to collect sufficient experience
    "learning_rate": 0.0002817553371974497,  # Optuna-selected learning rate for balanced convergence
    "train_frequency": 4,  # Update every 4 steps
    "target_update": 100,  # Update target network less frequently to stabilize training
    "double": True,  # Enable Double DQN to reduce overestimation
    "exploration": {
        "method": "EpsilonGreedy",
        "tau": 1215,  # Optuna-selected epsilon decay rate for prolonged exploration
        "temperature": 1.0,
        "final_temperature": 0.030210359431679448  # Optuna-selected final epsilon for better exploitation
    },
    "device": "mps" if torch.backends.mps.is_available() else "cpu"  # Use MPS on Mac M1 if available
}

# Instantiate the DQN agent with the environment and configuration
agent = DQNAgent(env, config)

# Set up the evaluation instance
evaluation = Evaluation(env, agent, directory="highway_ddqn/", num_episodes=20000, display_env=False)

# Train the agent
evaluation.train()

### Hyperparameter Tuning with increased highway density

In [4]:
import optuna
import gymnasium
import highway_env
import numpy as np
import torch

def objective(trial):
    # Define the hyperparameter search space
    layers = trial.suggest_categorical("layers", [[64, 64], [128, 128], [256, 256]])
    gamma = trial.suggest_float("gamma", 0.7, 0.99)
    batch_size = trial.suggest_int("batch_size", 16, 128)
    memory_capacity = trial.suggest_int("memory_capacity", 5000, 20000)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
    tau = trial.suggest_int("tau", 1000, 10000)
    final_temperature = trial.suggest_float("final_temperature", 0.01, 0.1)

    # Set up environment
    env = gymnasium.make("highway-fast-v0", render_mode='rgb_array' , config= {"vehicles_density": 2})
    env.reset()

    # Configure the agent with the current trial's parameters
    config = {
        "model": {"type": "MultiLayerPerceptron", "layers": layers},
        "gamma": gamma,
        "batch_size": batch_size,
        "memory_capacity": memory_capacity,
        "learning_starts": 500,
        "learning_rate": learning_rate,
        "train_frequency": 4,
        "target_update": 200,
        "double": True,
        "exploration": {
            "method": "EpsilonGreedy",
            "tau": tau,
            "temperature": 1.0,
            "final_temperature": final_temperature,
        },
        "device": "mps" if torch.backends.mps.is_available() else "cpu",
    }

    # Initialize agent and evaluation
    agent = DQNAgent(env, config)
    evaluation = Evaluation(env, agent, directory="highway_ddqn/", num_episodes=100, display_env=False)

    # Train the agent and retrieve the average return
    evaluation.train()
    avg_reward = evaluation.filtered_agent_stats  # or any other final metric you want

    # Close environments and resources
    evaluation.close()
    return avg_reward


# Set up Optuna study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)  # Runs 50 trials to find the best config

print("Best hyperparameters:", study.best_params)
print("Best average reward:", study.best_value)

  from .autonotebook import tqdm as notebook_tqdm
[I 2024-11-09 01:30:57,539] A new study created in memory with name: no-name-e6839c01-05e4-4216-bff8-9d1e1aad6582
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1e-3)
  logger.warn(
  logger.warn(
2024-11-09 01:30:59.061 python[6837:7802365] +[IMKClient subclass]: chose IMKClient_Modern
  return self.value_net(torch.tensor(states, dtype=torch.float).to(self.device)).data.cpu().numpy()
2024-11-09 01:30:59.061 python[6837:7802365] +[IMKInputSession subclass]: chose IMKInputSession_Modern
[INFO] Episode 0 score: 1.0 
[INFO] Saved DQNAgent model to highway_ddqn/run_20241109-013058_6837/checkpoint-0.tar 
[INFO] Episode 1 score: 1.0 
[INFO] Saved DQNAgent model to highway_ddqn/run_20241109-013058_6837/checkpoint-1.tar 
[INFO] Episode 2 score: 1.6 
[INFO] Episode 3 score: 0.8 
[INFO] Episode 4 score: 1.6 
[INFO] Episode 5 score: 2.0 
[INFO] Episode 6 score: 4.7 
[INFO] Episode 7 score: 6.7 
[INFO] Episode 8 score: 3.5 
[INFO

Best hyperparameters: {'layers': [256, 256], 'gamma': 0.8877699144152787, 'batch_size': 59, 'memory_capacity': 5230, 'learning_rate': 4.519834641166762e-05, 'tau': 4026, 'final_temperature': 0.06878738683620923}
Best average reward: 2.6499330843857716


### Model Training After Tuning

In [None]:
import gymnasium
import highway_env
import numpy as np
import torch


config = {
    "vehicles_density": 1,
    "collision_reward": -10,
}
env = gymnasium.make("highway-fast-v0", render_mode='rgb_array', config=config)
env.reset()



# Define optimized configuration for DQN from Optuna tuning
agent_config = {
    "model": {
        "type": "DuelingNetwork",  # Optionally use a dueling architecture
        "layers": [256, 256]  # Matching the architecture in stable-baselines3 setup
    },
    "gamma": 0.8,  # Discount factor
    "n_steps": 1,
    "batch_size": 32,
    "memory_capacity": 15000,  # Replay buffer size
    "learning_starts": 200,  # Delay learning until enough experience is gathered
    "learning_rate": 5e-4,
    "train_frequency": 1,  # Train at every time step
    "target_update": 50,  # Update target network frequently
    "double": True,  # Enable Double DQN to reduce overestimation
    "gradient_clipping": 10.0,  # Clipping large gradients
    "exploration": {
        "method": "EpsilonGreedy",
        "tau": 15000,  # Epsilon decay for exploration
        "temperature": 1.0,  # Initial epsilon
        "final_temperature": 0.1  # Final epsilon for more exploration
    },
    "memory": {
        "type": "ReplayBuffer",
        "capacity": 15000
    },
    "device": "mps" if torch.backends.mps.is_available() else "cpu"
}


# Initialize the agent
agent = DQNAgent(env, config=agent_config)

# Setup the evaluation instance
evaluation = Evaluation(
    env=env,
    agent=agent,
    directory="highway_ddqn/",  # Directory to save models and logs
    num_episodes=2000,  # Training episodes
    training=True,
    display_env=False,  # Do not render during training to save resources
)

# Run the training
evaluation.train()




  logger.warn(
  logger.warn(
2024-11-09 05:19:55.227 python[11710:8048627] +[IMKClient subclass]: chose IMKClient_Modern
2024-11-09 05:19:55.227 python[11710:8048627] +[IMKInputSession subclass]: chose IMKInputSession_Modern
  return self.value_net(torch.tensor(states, dtype=torch.float).to(self.device)).data.cpu().numpy()
[INFO] Episode 0 score: 7.8 
[INFO] Saved DQNAgent model to highway_ddqn/run_20241109-051954_11710/checkpoint-0.tar 
[INFO] Episode 1 score: 16.4 
[INFO] Saved DQNAgent model to highway_ddqn/run_20241109-051954_11710/checkpoint-1.tar 
[INFO] Episode 2 score: 14.6 
[INFO] Episode 3 score: 8.8 
[INFO] Episode 4 score: 2.0 
[INFO] Episode 5 score: 7.7 
[INFO] Episode 6 score: 3.0 
[INFO] Episode 7 score: 10.8 
[INFO] Episode 8 score: 8.7 
[INFO] Saved DQNAgent model to highway_ddqn/run_20241109-051954_11710/checkpoint-8.tar 
[INFO] Episode 9 score: 6.9 
[INFO] Episode 10 score: 13.6 
[INFO] Episode 11 score: 5.8 
[INFO] Episode 12 score: 9.9 
[INFO] Episode 13 score: 7

: 