In [2]:
from utils.training import train_model
from utils.evaluation import evaluate_model, aggregate_and_normalize_rewards
from gymnasium import register
import gymnasium


# 0. Notebook description
This notebook explores the impact of using a continuous action space on an agent's performance on our predefined performance metrics (the percentage of steps in which a collision occurred, the vehicle accelerated too much, the vehicle kept a safe distance from other vehicles, or the vehicle overtook vehicles to its left). We will evaluate the performance of models trained on the default reward function and on our custom reward function, using the SAC and TD3 algorithms. 

We expect that TD3 might perform better because we are using an uncomplicated version of the highway environment that does not require extensive exploration of actions (which SAC excels at because it encourages diverse action sampling).


# 1. Default reward function with SAC algorithm

Below, we train a model with the default reward function using the SAC algorithm, and inspect its performance on our predefined metrics.


In [2]:
# Make the default-reward highway environment available under the id 'DefaultRewardEnv'.
register(id='DefaultRewardEnv', entry_point='HighwayEnvDefaultReward:HighwayEnvDefaultReward')

# Performance-metric logging is switched off for training
# (set log_performance_metrics_enabled to True or False as required).
log_performance_metrics_enabled=False
log_filename="4_default_reward_log_sac.csv"

# Build the environment with a continuous action space; the logging flag and
# log file name are forwarded to the environment as keyword arguments.
env = gymnasium.make(
    'DefaultRewardEnv',
    render_mode='rgb_array',
    log_performance_metrics_enabled=log_performance_metrics_enabled,
    log_filename=log_filename,
    config={"action": {"type": "ContinuousAction"}},
)

# Train a SAC agent on the default reward function.
train_model(env=env, session_name="4_Group15_RLProject_sac_default", algorithm='SAC')

{'action': {'type': 'ContinuousAction'},
 'centering_position': [0.3, 0.5],
 'collision_reward': -1,
 'controlled_vehicles': 1,
 'duration': 30,
 'ego_spacing': 1.5,
 'high_speed_reward': 0.4,
 'initial_lane_id': None,
 'lane_change_reward': 0,
 'lanes_count': 3,
 'manual_control': False,
 'normalize_reward': True,
 'observation': {'type': 'Kinematics'},
 'offroad_terminal': False,
 'offscreen_rendering': False,
 'other_vehicles_type': 'highway_env.vehicle.behavior.IDMVehicle',
 'policy_frequency': 1,
 'real_time_rendering': False,
 'render_agent': True,
 'reward_speed_range': [20, 30],
 'right_lane_reward': 0.1,
 'scaling': 5.5,
 'screen_height': 150,
 'screen_width': 600,
 'show_trajectories': False,
 'simulation_frequency': 5,
 'vehicles_count': 20,
 'vehicles_density': 1}
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./logs/tensorboard/4_Group15_RLProject_sac_default_SAC/SAC_1
---------------------------------
| rollout/   

In [3]:
# Evaluate the SAC agent trained on the default reward function.
# Performance-metric logging is turned on for the evaluation environment.
log_performance_metrics_enabled=True

# Config override handed to evaluate_model: use a higher frame rate.
config_updates = {"simulation_frequency": 15}

# Same continuous-action environment as in training, now with logging enabled.
env = gymnasium.make(
    'DefaultRewardEnv',
    render_mode='rgb_array',
    log_performance_metrics_enabled=log_performance_metrics_enabled,
    log_filename=log_filename,
    config={"action": {"type": "ContinuousAction"}},
)

evaluate_model(
    env=env,
    config_updates=config_updates,
    model_path="models/4_Group15_RLProject_sac_default",
    algorithm='SAC',
)

{'action': {'type': 'ContinuousAction'},
 'centering_position': [0.3, 0.5],
 'collision_reward': -1,
 'controlled_vehicles': 1,
 'duration': 30,
 'ego_spacing': 1.5,
 'high_speed_reward': 0.4,
 'initial_lane_id': None,
 'lane_change_reward': 0,
 'lanes_count': 3,
 'manual_control': False,
 'normalize_reward': True,
 'observation': {'type': 'Kinematics'},
 'offroad_terminal': False,
 'offscreen_rendering': False,
 'other_vehicles_type': 'highway_env.vehicle.behavior.IDMVehicle',
 'policy_frequency': 1,
 'real_time_rendering': False,
 'render_agent': True,
 'reward_speed_range': [20, 30],
 'right_lane_reward': 0.1,
 'scaling': 5.5,
 'screen_height': 150,
 'screen_width': 600,
 'show_trajectories': False,
 'simulation_frequency': 15,
 'vehicles_count': 20,
 'vehicles_density': 1}
Logging metrics for step 15 and seconds elapsed 1.0
Logging metrics for step 30 and seconds elapsed 2.0
Logging metrics for step 45 and seconds elapsed 3.0
Logging metrics for step 60 and seconds elapsed 4.0
Logg

In [4]:
# Aggregate the metrics logged to `log_filename` during evaluation.
metrics = aggregate_and_normalize_rewards(log_filename)

# Nothing is printed when the aggregation returns a falsy result.
if metrics:
    # Each value is scaled by 100 and shown with four decimals
    # (presumably a per-step fraction -- confirm against utils.evaluation).
    report_lines = ["Performance metric (as percent of all steps):"]
    report_lines += [
        f"{metric_name}: {avg_metric*100:.4f}%"
        for metric_name, avg_metric in metrics.items()
    ]
    print("\n".join(report_lines))

Performance metric (as percent of all steps):
collision_count: 0.0000%
right_lane_count: 47.4000%
on_road_count: 86.8000%
safe_distance_count: 100.0000%
left_vehicle_overtaken_count: 0.0000%
abrupt_accelerations_count: 89.9000%


The agent trained on the default reward function with the SAC algorithm accelerates too quickly in the overwhelming majority of steps. Unlike in previous notebooks, it also runs off the road in a significant number of recorded steps (13%). There were no recorded collisions, likely because the agent was not learning to move forward and maneuver between cars. It seems as though the model was not able to learn at all. 

# 2. Custom reward function with SAC algorithm

Below, we train a model with the custom reward function using the SAC algorithm, and inspect its performance on our predefined metrics. We use our predefined updated weights for our custom reward function, such as applying a heavier penalty for collisions, and defining a higher reward for driving in the right lane, and driving without accelerating too quickly. 

In [5]:
# File name used for the performance-metric log of the SAC custom-reward runs.
log_filename="4_sac_custom_reward_log.csv"

# Weights for the custom reward function; passed to train_model / evaluate_model
# as `config_updates` in the following cells.
config_updates = {
    "safe_distance_reward": 0.1,             # reward weight for keeping a safe distance
    "left_vehicle_overtaken_reward": -0.5,   # negative weight: overtaking a left vehicle
    "collision_reward": -4,                  # heavier collision penalty than the default (-1)
    "smooth_driving_reward" : 0.3,           # reward weight for driving without abrupt acceleration
    "right_lane_reward" : 0.5,               # higher than the default right-lane reward (0.1)
}


In [6]:
# Register the custom environment
register(
    id='CustomRewardEnv',
    entry_point='HighwayEnvCustomReward:HighwayEnvFastCustomReward',
)


log_performance_metrics_enabled=False

env = gymnasium.make('CustomRewardEnv', 
                     render_mode='rgb_array', 
                     log_performance_metrics_enabled=log_performance_metrics_enabled, 
                     log_filename=log_filename, 
                     config={"action": {
                        "type": "ContinuousAction"
                        }}
                    )

train_model(
    env=env,
    config_updates=config_updates,
    session_name='4_Group15_RLProject_sac_custom',
    algorithm='SAC'
)

{'action': {'type': 'ContinuousAction'},
 'centering_position': [0.3, 0.5],
 'collision_reward': -4,
 'controlled_vehicles': 1,
 'duration': 30,
 'ego_spacing': 1.5,
 'high_speed_reward': 0.4,
 'initial_lane_id': None,
 'lane_change_reward': 0,
 'lanes_count': 3,
 'left_vehicle_overtaken_reward': -0.5,
 'manual_control': False,
 'normalize_reward': True,
 'observation': {'type': 'Kinematics'},
 'offroad_terminal': False,
 'offscreen_rendering': False,
 'other_vehicles_type': 'highway_env.vehicle.behavior.IDMVehicle',
 'policy_frequency': 1,
 'real_time_rendering': False,
 'render_agent': True,
 'reward_speed_range': [20, 30],
 'right_lane_reward': 0.5,
 'safe_distance_reward': 0.1,
 'scaling': 5.5,
 'screen_height': 150,
 'screen_width': 600,
 'show_trajectories': False,
 'simulation_frequency': 5,
 'smooth_driving_reward': 0.3,
 'vehicles_count': 20,
 'vehicles_density': 1}
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./logs/

In [7]:
# Evaluate the SAC agent trained on the custom reward function.
# Performance-metric logging is turned on for the evaluation environment.
log_performance_metrics_enabled=True

env = gymnasium.make(
    'CustomRewardEnv',
    render_mode='rgb_array',
    log_performance_metrics_enabled=log_performance_metrics_enabled,
    log_filename=log_filename,
    config={"action": {"type": "ContinuousAction"}},
)

# The custom reward weights are reused, with the simulation frequency raised to 15.
evaluate_model(
    env=env,
    config_updates={**config_updates, "simulation_frequency": 15},
    model_path='models/4_Group15_RLProject_sac_custom',
    algorithm='SAC',
)

{'action': {'type': 'ContinuousAction'},
 'centering_position': [0.3, 0.5],
 'collision_reward': -4,
 'controlled_vehicles': 1,
 'duration': 30,
 'ego_spacing': 1.5,
 'high_speed_reward': 0.4,
 'initial_lane_id': None,
 'lane_change_reward': 0,
 'lanes_count': 3,
 'left_vehicle_overtaken_reward': -0.5,
 'manual_control': False,
 'normalize_reward': True,
 'observation': {'type': 'Kinematics'},
 'offroad_terminal': False,
 'offscreen_rendering': False,
 'other_vehicles_type': 'highway_env.vehicle.behavior.IDMVehicle',
 'policy_frequency': 1,
 'real_time_rendering': False,
 'render_agent': True,
 'reward_speed_range': [20, 30],
 'right_lane_reward': 0.5,
 'safe_distance_reward': 0.1,
 'scaling': 5.5,
 'screen_height': 150,
 'screen_width': 600,
 'show_trajectories': False,
 'simulation_frequency': 15,
 'smooth_driving_reward': 0.3,
 'vehicles_count': 20,
 'vehicles_density': 1}
Logging metrics for step 15 and seconds elapsed 1.0
Logging metrics for step 30 and seconds elapsed 2.0
Logging

In [8]:
# Aggregate the metrics logged to `log_filename` during evaluation.
metrics = aggregate_and_normalize_rewards(log_filename)

# Nothing is printed when the aggregation returns a falsy result.
if metrics:
    # Each value is scaled by 100 and shown with four decimals
    # (presumably a per-step fraction -- confirm against utils.evaluation).
    report_lines = ["Performance metric (as percent of all steps):"]
    report_lines += [
        f"{metric_name}: {avg_metric*100:.4f}%"
        for metric_name, avg_metric in metrics.items()
    ]
    print("\n".join(report_lines))

Performance metric (as percent of all steps):
collision_count: 0.0000%
right_lane_count: 54.2667%
on_road_count: 83.7333%
safe_distance_count: 100.0000%
left_vehicle_overtaken_count: 0.0000%
abrupt_accelerations_count: 93.5667%


# 3. Default reward function with TD3 algorithm

Below, we train a model with the default reward function using the TD3 algorithm, and inspect its performance on our predefined metrics.

In [9]:
# Register the default-reward environment only if it is not in the registry yet.
# An earlier cell already registers 'DefaultRewardEnv'; calling register() again
# with the same id makes gymnasium warn "Overriding environment ... already in
# registry". The guard also keeps this cell runnable on its own.
if 'DefaultRewardEnv' not in gymnasium.registry:
    register(
        id='DefaultRewardEnv',
        entry_point='HighwayEnvDefaultReward:HighwayEnvDefaultReward',
    )

# Set log_performance_metrics_enabled to True or False as per your requirement;
# it is kept off while training.
log_filename="4_default_reward_log_td3.csv"
log_performance_metrics_enabled=False


# Create the environment with a continuous action space; the logging flag and
# log file name are forwarded to the environment as keyword arguments.
env = gymnasium.make('DefaultRewardEnv', 
                     render_mode='rgb_array', 
                     log_performance_metrics_enabled=log_performance_metrics_enabled,
                     log_filename=log_filename, 
                     config={"action": {
                        "type": "ContinuousAction"
                        }}
                     )

# Train a TD3 agent on the default reward function.
train_model(
    env=env,
    session_name="4_Group15_RLProject_td3_default",
    algorithm='TD3'
)

{'action': {'type': 'ContinuousAction'},
 'centering_position': [0.3, 0.5],
 'collision_reward': -1,
 'controlled_vehicles': 1,
 'duration': 30,
 'ego_spacing': 1.5,
 'high_speed_reward': 0.4,
 'initial_lane_id': None,
 'lane_change_reward': 0,
 'lanes_count': 3,
 'manual_control': False,
 'normalize_reward': True,
 'observation': {'type': 'Kinematics'},
 'offroad_terminal': False,
 'offscreen_rendering': False,
 'other_vehicles_type': 'highway_env.vehicle.behavior.IDMVehicle',
 'policy_frequency': 1,
 'real_time_rendering': False,
 'render_agent': True,
 'reward_speed_range': [20, 30],
 'right_lane_reward': 0.1,
 'scaling': 5.5,
 'screen_height': 150,
 'screen_width': 600,
 'show_trajectories': False,
 'simulation_frequency': 5,
 'vehicles_count': 20,
 'vehicles_density': 1}
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./logs/tensorboard/4_Group15_RLProject_td3_default_TD3/TD3_1


  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30       |
|    ep_rew_mean     | 1.4      |
| time/              |          |
|    episodes        | 4        |
|    fps             | 88       |
|    time_elapsed    | 1        |
|    total_timesteps | 120      |
| train/             |          |
|    actor_loss      | -0.00697 |
|    critic_loss     | 0.0471   |
|    learning_rate   | 0.001    |
|    n_updates       | 19       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 26.9     |
|    ep_rew_mean     | 2.12     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 79       |
|    time_elapsed    | 2        |
|    total_timesteps | 215      |
| train/             |          |
|    actor_loss      | -0.128   |
|    critic_loss     | 0.019    |
|    learning_rate   | 0.001    |
|    n_updates       | 114      |
--------------

In [10]:
# Evaluate the TD3 agent trained on the default reward function.
# Performance-metric logging is turned on for the evaluation environment.
log_performance_metrics_enabled=True

# Config override handed to evaluate_model: use a higher frame rate.
config_updates = {"simulation_frequency": 15}

# Same continuous-action environment as in training, now with logging enabled.
env = gymnasium.make(
    'DefaultRewardEnv',
    render_mode='rgb_array',
    log_performance_metrics_enabled=log_performance_metrics_enabled,
    log_filename=log_filename,
    config={"action": {"type": "ContinuousAction"}},
)

evaluate_model(
    env=env,
    config_updates=config_updates,
    model_path="models/4_Group15_RLProject_td3_default",
    algorithm='TD3',
)

{'action': {'type': 'ContinuousAction'},
 'centering_position': [0.3, 0.5],
 'collision_reward': -1,
 'controlled_vehicles': 1,
 'duration': 30,
 'ego_spacing': 1.5,
 'high_speed_reward': 0.4,
 'initial_lane_id': None,
 'lane_change_reward': 0,
 'lanes_count': 3,
 'manual_control': False,
 'normalize_reward': True,
 'observation': {'type': 'Kinematics'},
 'offroad_terminal': False,
 'offscreen_rendering': False,
 'other_vehicles_type': 'highway_env.vehicle.behavior.IDMVehicle',
 'policy_frequency': 1,
 'real_time_rendering': False,
 'render_agent': True,
 'reward_speed_range': [20, 30],
 'right_lane_reward': 0.1,
 'scaling': 5.5,
 'screen_height': 150,
 'screen_width': 600,
 'show_trajectories': False,
 'simulation_frequency': 15,
 'vehicles_count': 20,
 'vehicles_density': 1}
Logging metrics for step 15 and seconds elapsed 1.0
Logging metrics for step 30 and seconds elapsed 2.0
Logging metrics for step 45 and seconds elapsed 3.0
Logging metrics for step 60 and seconds elapsed 4.0
Logg

In [11]:
# Aggregate the metrics logged to `log_filename` during evaluation.
metrics = aggregate_and_normalize_rewards(log_filename)

# Nothing is printed when the aggregation returns a falsy result.
if metrics:
    # Each value is scaled by 100 and shown with four decimals
    # (presumably a per-step fraction -- confirm against utils.evaluation).
    report_lines = ["Performance metric (as percent of all steps):"]
    report_lines += [
        f"{metric_name}: {avg_metric*100:.4f}%"
        for metric_name, avg_metric in metrics.items()
    ]
    print("\n".join(report_lines))

Performance metric (as percent of all steps):
collision_count: 0.0000%
right_lane_count: 80.7333%
on_road_count: 67.5500%
safe_distance_count: 100.0000%
left_vehicle_overtaken_count: 0.0000%
abrupt_accelerations_count: 90.7167%


# 4. Custom reward function with TD3 algorithm

Below, we train a model with the custom reward function using the TD3 algorithm, and inspect its performance on our predefined metrics. 

In [32]:
# File name used for the performance-metric log of the TD3 custom-reward runs.
log_filename="4_td3_custom_reward_log.csv"

# Weights for the custom reward function; passed to train_model / evaluate_model
# as `config_updates` in the following cells.
config_updates = {
    "safe_distance_reward": 0.1,             # reward weight for keeping a safe distance
    "left_vehicle_overtaken_reward": -0.5,   # negative weight: overtaking a left vehicle
    "collision_reward": -4,                  # heavier collision penalty than the default (-1)
    "smooth_driving_reward" : 0.3,           # reward weight for driving without abrupt acceleration
    "right_lane_reward" : 0.5,               # higher than the default right-lane reward (0.1)
}


In [33]:
# Register the custom environment
register(
    id='CustomRewardEnv',
    entry_point='HighwayEnvCustomReward:HighwayEnvFastCustomReward',
)


log_performance_metrics_enabled=False

env = gymnasium.make('CustomRewardEnv', 
                     render_mode='rgb_array', 
                     log_performance_metrics_enabled=log_performance_metrics_enabled, 
                     log_filename=log_filename, 
                     config={"action": {
                        "type": "ContinuousAction"
                        }}
                    )

train_model(
    env=env,
    config_updates=config_updates,
    session_name='4_Group15_RLProject_td3_custom',
    algorithm='TD3'
)

{'action': {'type': 'ContinuousAction'},
 'centering_position': [0.3, 0.5],
 'collision_reward': -4,
 'controlled_vehicles': 1,
 'duration': 30,
 'ego_spacing': 1.5,
 'high_speed_reward': 0.4,
 'initial_lane_id': None,
 'lane_change_reward': 0,
 'lanes_count': 3,
 'left_vehicle_overtaken_reward': -0.5,
 'manual_control': False,
 'normalize_reward': True,
 'observation': {'type': 'Kinematics'},
 'offroad_terminal': False,
 'offscreen_rendering': False,
 'other_vehicles_type': 'highway_env.vehicle.behavior.IDMVehicle',
 'policy_frequency': 1,
 'real_time_rendering': False,
 'render_agent': True,
 'reward_speed_range': [20, 30],
 'right_lane_reward': 0.5,
 'safe_distance_reward': 0.1,
 'scaling': 5.5,
 'screen_height': 150,
 'screen_width': 600,
 'show_trajectories': False,
 'simulation_frequency': 5,
 'smooth_driving_reward': 0.3,
 'vehicles_count': 20,
 'vehicles_density': 1}
Training with policy MlpPolicy
Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a 

  logger.warn(f"Overriding environment {new_spec.id} already in registry.")


---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30       |
|    ep_rew_mean     | 1.26     |
| time/              |          |
|    episodes        | 4        |
|    fps             | 84       |
|    time_elapsed    | 1        |
|    total_timesteps | 120      |
| train/             |          |
|    actor_loss      | -0.0385  |
|    critic_loss     | 0.0497   |
|    learning_rate   | 0.001    |
|    n_updates       | 19       |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 30       |
|    ep_rew_mean     | 1.92     |
| time/              |          |
|    episodes        | 8        |
|    fps             | 73       |
|    time_elapsed    | 3        |
|    total_timesteps | 240      |
| train/             |          |
|    actor_loss      | -0.13    |
|    critic_loss     | 0.0261   |
|    learning_rate   | 0.001    |
|    n_updates       | 139      |
--------------

In [34]:
# Evaluate the TD3 agent trained on the custom reward function.
# Performance-metric logging is turned on for the evaluation environment.
log_performance_metrics_enabled=True

env = gymnasium.make(
    'CustomRewardEnv',
    render_mode='rgb_array',
    log_performance_metrics_enabled=log_performance_metrics_enabled,
    log_filename=log_filename,
    config={"action": {"type": "ContinuousAction"}},
)

# The custom reward weights are reused, with the simulation frequency raised to 15.
evaluate_model(
    env=env,
    config_updates={**config_updates, "simulation_frequency": 15},
    model_path="models/4_Group15_RLProject_td3_custom",
    algorithm='TD3',
)

{'action': {'type': 'ContinuousAction'},
 'centering_position': [0.3, 0.5],
 'collision_reward': -4,
 'controlled_vehicles': 1,
 'duration': 30,
 'ego_spacing': 1.5,
 'high_speed_reward': 0.4,
 'initial_lane_id': None,
 'lane_change_reward': 0,
 'lanes_count': 3,
 'left_vehicle_overtaken_reward': -0.5,
 'manual_control': False,
 'normalize_reward': True,
 'observation': {'type': 'Kinematics'},
 'offroad_terminal': False,
 'offscreen_rendering': False,
 'other_vehicles_type': 'highway_env.vehicle.behavior.IDMVehicle',
 'policy_frequency': 1,
 'real_time_rendering': False,
 'render_agent': True,
 'reward_speed_range': [20, 30],
 'right_lane_reward': 0.5,
 'safe_distance_reward': 0.1,
 'scaling': 5.5,
 'screen_height': 150,
 'screen_width': 600,
 'show_trajectories': False,
 'simulation_frequency': 15,
 'smooth_driving_reward': 0.3,
 'vehicles_count': 20,
 'vehicles_density': 1}
Loading model with path models/4_Group15_RLProject_td3_custom
Logging metrics for step 15 and seconds elapsed 1

In [35]:
# Aggregate the metrics logged to `log_filename` during evaluation.
metrics = aggregate_and_normalize_rewards(log_filename)

# Nothing is printed when the aggregation returns a falsy result.
if metrics:
    # Each value is scaled by 100 and shown with four decimals
    # (presumably a per-step fraction -- confirm against utils.evaluation).
    report_lines = ["Performance metric (as percent of all steps):"]
    report_lines += [
        f"{metric_name}: {avg_metric*100:.4f}%"
        for metric_name, avg_metric in metrics.items()
    ]
    print("\n".join(report_lines))

Performance metric (as percent of all steps):
collision_count: 0.0000%
right_lane_count: 42.3000%
on_road_count: 54.6333%
safe_distance_count: 99.9833%
left_vehicle_overtaken_count: 0.0000%
abrupt_accelerations_count: 93.1500%


# 5. Results

| Metric                          | Default Reward (SAC) | Custom Reward (SAC) | Default Reward (TD3) | Custom Reward (TD3) |
|---------------------------------|-----------------------|----------------------|-----------------------|----------------------|
| collision_count                 | 0.00%                | 0.00%               | 0.00%                | 0.00%               |
| right_lane_count                | 47.40%               | 54.27%              | 80.73%               | 42.30%              |
| on_road_count                   | 86.80%               | 83.73%              | 67.55%               | 54.63%              |
| safe_distance_count             | 100.00%              | 100.00%             | 100.00%              | 99.98%             |
| left_vehicle_overtaken_count    | 0.00%                | 0.00%               | 0.00%                | 0.00%               |
| abrupt_accelerations_count      | 89.90%               | 93.57%              | 90.72%               | 93.15%              |


# 6. Discussion

While metrics like 0 collisions for each agent may seem promising, inspecting the animations of these agents shows that none of them is learning to drive smoothly. Their behavior is erratic, and in every episode they seemed to run off the road or spin around while the other cars in the simulation drove forward and left the ego vehicle behind (which explains the perfect scores for never overtaking a vehicle on the left).

It is possible that the agents perform poorly because the continuous action space is much larger than the discrete meta-action space (which allowed the agent to choose 1 of 5 actions). Therefore, future experiments could allow the agents to train for longer. 

Alternatively, the reward function could be adjusted to this action space, perhaps so that collisions and overtaking a vehicle on the left are not so heavily penalized, encouraging forward driving (instead of spinning to avoid potentially colliding with or overtaking a car on the left). 