# Twin-Delayed DDPG

## Installing the packages

In [None]:
!pip install torch #2.0.1 + cu118
!pip install numpy #1.24.2
!pip install matplotlib #3.6.3
!pip install metadrive-simulator #0.3.0.1, For troubleshooting this command, please visit the Meta-Drive GitHub repository at: https://github.com/metadriverse/metadrive.

## Importing the libraries

In [1]:
import os
import time
import numpy as np
from metadrive import MetaDriveEnv
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle




Successfully registered the following environments: ['MetaDrive-validation-v0', 'MetaDrive-10env-v0', 'MetaDrive-100envs-v0', 'MetaDrive-1000envs-v0', 'SafeMetaDrive-validation-v0', 'SafeMetaDrive-10env-v0', 'SafeMetaDrive-100envs-v0', 'SafeMetaDrive-1000envs-v0', 'MARLTollgate-v0', 'MARLBottleneck-v0', 'MARLRoundabout-v0', 'MARLIntersection-v0', 'MARLParkingLot-v0', 'MARLMetaDrive-v0'].


## Step 1: We initialize the Experience Replay memory

In [2]:
class ReplayBuffer(object):
  """Bounded store of transitions used for off-policy training.

  Each transition is a 5-tuple ``(state, next_state, action, reward, done)``.
  Once ``max_size`` transitions are held, new ones overwrite existing slots in
  ring order, starting from slot 0.
  """

  def __init__(self, max_size=10e6):
    """Create an empty buffer.

    Args:
      max_size: Maximum number of transitions kept. The default is written as
        a float (10e6), so it is converted to an int once here.

    Raises:
      ValueError: If ``max_size`` is smaller than 1.
    """
    self.storage = []
    #Keep the capacity integral so that it can be compared with len() and used in index arithmetic.
    self.max_size = int(max_size)
    if self.max_size < 1:
      raise ValueError("max_size must be at least 1")
    self.ptr = 0

  def add(self, transition):
    """Append a transition, or overwrite the slot at ``ptr`` once the buffer is full."""
    #">=" (not "==") so that a buffer loaded from disk with more entries than max_size stops growing.
    if len(self.storage) >= self.max_size:
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size=100):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Returns:
      Five NumPy arrays: states, next_states, actions, rewards and dones.
      Rewards and dones are reshaped to column vectors of shape (batch_size, 1).

    Raises:
      ValueError: Raised by ``np.random.randint`` when the buffer is empty.
    """
    ind = np.random.randint(0, len(self.storage), size=batch_size)
    batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = [], [], [], [], []
    #Split every sampled transition into its five components.
    for i in ind:
      state, next_state, action, reward, done = self.storage[i]
      #np.asarray avoids a copy when possible and, unlike np.array(..., copy=False), still accepts Python scalars on NumPy 2.
      batch_states.append(np.asarray(state))
      batch_next_states.append(np.asarray(next_state))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return np.array(batch_states), np.array(batch_next_states), np.array(batch_actions), np.array(batch_rewards).reshape(-1, 1), np.array(batch_dones).reshape(-1, 1)

  def save_buffer(self, file_path):
    """Pickle the stored transitions (the list only, not ``ptr``) to ``file_path``."""
    with open(file_path, 'wb') as f:
      pickle.dump(self.storage, f)

  def load_buffer(self, file_path):
    """Replace the stored transitions with the list pickled at ``file_path``.

    SECURITY: pickle can execute arbitrary code while loading. Only load
    buffer files that this program wrote itself.
    """
    with open(file_path, 'rb') as f:
      self.storage = pickle.load(f)
    #The write position is not persisted, so restart the ring at slot 0.
    self.ptr = 0

## Step 2: We build one neural network for the Actor model and one neural network for the Actor target

In [3]:
class Actor(nn.Module):
  """Policy network mapping a state to an action bounded by +/- max_action."""

  def __init__(self, state_dim, action_dim, max_action):
    """Build the three linear layers: state_dim -> 400 -> 300 -> action_dim."""
    super().__init__()
    self.layer_1 = nn.Linear(state_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, action_dim)
    #Scale applied to the tanh output in forward().
    self.max_action = max_action

  def forward(self, x):
    """Return max_action * tanh(layer_3(relu(layer_2(relu(layer_1(x))))))."""
    hidden = torch.relu(self.layer_1(x))
    hidden = torch.relu(self.layer_2(hidden))
    #tanh keeps every component in [-1, 1] before it is scaled.
    squashed = torch.tanh(self.layer_3(hidden))
    return self.max_action * squashed

## Step 3: We build two neural networks for the two Critic models and two neural networks for the two Critic targets

In [4]:
class Critic(nn.Module):
  """Two independent Q-networks ("twin critics") held in a single module.

  Both heads take the concatenation of state and action as input and emit one
  scalar per sample. Layers 1-3 form the first head, layers 4-6 the second.
  """

  def __init__(self, state_dim, action_dim):
    """Create both heads: (state_dim + action_dim) -> 400 -> 300 -> 1."""
    super().__init__()
    joint_dim = state_dim + action_dim

    #First Critic neural network.
    self.layer_1 = nn.Linear(joint_dim, 400)
    self.layer_2 = nn.Linear(400, 300)
    self.layer_3 = nn.Linear(300, 1)

    #Second Critic neural network.
    self.layer_4 = nn.Linear(joint_dim, 400)
    self.layer_5 = nn.Linear(400, 300)
    self.layer_6 = nn.Linear(300, 1)

  def _first_head(self, xu):
    """Run the first Q-network on an already concatenated (state, action) batch."""
    hidden = F.relu(self.layer_1(xu))
    hidden = F.relu(self.layer_2(hidden))
    return self.layer_3(hidden)

  def _second_head(self, xu):
    """Run the second Q-network on an already concatenated (state, action) batch."""
    hidden = F.relu(self.layer_4(xu))
    hidden = F.relu(self.layer_5(hidden))
    return self.layer_6(hidden)

  def forward(self, x, u):
    """Return the pair (Q1, Q2) for state batch ``x`` and action batch ``u``."""
    #State and action are joined along dimension 1.
    xu = torch.cat([x, u], 1)
    q_first = self._first_head(xu)
    q_second = self._second_head(xu)
    return q_first, q_second

  def Q1(self, x, u):
    """Return only the first head's Q-value for state batch ``x`` and action batch ``u``."""
    return self._first_head(torch.cat([x, u], 1))

## Selecting the best hardware to run on

In [5]:
#Use the GPU whenever PyTorch reports a usable CUDA device; otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")
print(device)

cuda


## Steps 4 to 15: Training Process

In [6]:
#Building the whole Training Process into a class.

class TD3(object):
  """Twin-Delayed DDPG agent: an actor, a twin critic, and a target copy of each.

  All networks and tensors are placed on the module-level ``device``.
  """

  def __init__(self, state_dim, action_dim, max_action):
    """Build the online and target networks and one Adam optimizer per online network.

    Args:
      state_dim: Size of the observation vector.
      action_dim: Size of the action vector.
      max_action: Bound used to scale the actor output and to clamp target actions.
    """
    self.actor = Actor(state_dim, action_dim, max_action).to(device)
    self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
    #Targets start as exact copies of the online networks.
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
    self.critic = Critic(state_dim, action_dim).to(device)
    self.critic_target = Critic(state_dim, action_dim).to(device)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
    self.max_action = max_action

  def select_action(self, state):
    """Return the actor's action for one state as a flat NumPy array.

    Args:
      state: NumPy array holding a single observation. It is reshaped to (1, -1).
    """
    state = torch.Tensor(state.reshape(1, -1)).to(device)
    #Acting is pure inference, so no autograd graph is needed.
    with torch.no_grad():
      return self.actor(state).cpu().numpy().flatten()

  def train(self, replay_buffer, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
    """Run ``iterations`` TD3 updates on batches sampled from ``replay_buffer``.

    Args:
      replay_buffer: Object whose ``sample(batch_size)`` returns (states, next_states, actions, rewards, dones).
      iterations: Number of critic updates to perform.
      batch_size: Number of transitions per update.
      discount: Discount factor gamma.
      tau: Polyak averaging rate for the target networks.
      policy_noise: STD of the Gaussian noise added to the target action.
      noise_clip: Absolute bound applied to that noise.
      policy_freq: The actor and both targets are updated once every ``policy_freq`` iterations.
    """
    for it in range(iterations):

      #Step 4: We sample a batch of transitions (s, s', a, r) from the memory.
      batch_states, batch_next_states, batch_actions, batch_rewards, batch_dones = replay_buffer.sample(batch_size)
      state = torch.Tensor(batch_states).to(device)
      next_state = torch.Tensor(batch_next_states).to(device)
      action = torch.Tensor(batch_actions).to(device)
      reward = torch.Tensor(batch_rewards).to(device)
      done = torch.Tensor(batch_dones).to(device)

      #Steps 5 to 9 only build the regression target, so they run without gradient tracking.
      with torch.no_grad():
        #Step 5: From the next state s', the Actor target plays the next action a'.
        next_action = self.actor_target(next_state)

        #Step 6: We add clipped Gaussian noise to this next action a' and we clamp it in a range of values supported by the environment.
        noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
        noise = noise.clamp(-noise_clip, noise_clip)
        next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

        #Step 7: The two Critic targets take each the couple (s', a') as input and return two Q-values Qt1(s',a') and Qt2(s',a') as outputs.
        target_Q1, target_Q2 = self.critic_target(next_state, next_action)

        #Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2).
        target_Q = torch.min(target_Q1, target_Q2)

        #Step 9: We get the final target of the two Critic models: Qt = r + (1 - done) * gamma * min(Qt1, Qt2).
        target_Q = reward + (1 - done) * discount * target_Q

      #Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs.
      current_Q1, current_Q2 = self.critic(state, action)

      #Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt).
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

      #Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with the Adam optimizer.
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      #Step 13: Once every policy_freq iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model.
      if it % policy_freq == 0:
        actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        #Step 14: Still once every policy_freq iterations, we update the weights of the Actor target by polyak averaging.
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

        #Step 15: Still once every policy_freq iterations, we update the weights of the Critic target by polyak averaging.
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

  def save(self, filename, directory):
    """Write the online actor and critic weights to ``<directory>/<filename>_actor.pth`` and ``_critic.pth``.

    Target networks and optimizer state are not saved.
    """
    torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
    torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

  def load(self, filename, directory):
    """Restore the weights written by ``save`` and re-synchronise the target networks."""
    #map_location lets a checkpoint written on a GPU machine be restored on a CPU-only one, and the other way round.
    self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename), map_location=device))
    self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename), map_location=device))
    #Only the online networks are checkpointed. Without this copy the targets would keep their random initial weights and corrupt the first updates after a resume.
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.critic_target.load_state_dict(self.critic.state_dict())

## We make a function that evaluates a static policy by calculating its average reward over 10 episodes

In [7]:
def evaluate_policy(policy, num_episodes=10, initial_evaluation=False):
    """Run ``policy`` without exploration noise and report its average episode reward.

    Uses the module-level ``env`` and closes it when the evaluation is finished.

    Args:
        policy: Object exposing ``select_action(obs)``.
        num_episodes: Number of episodes to run.
        initial_evaluation: When True, each episode is capped at 1000 steps
            instead of 10000, so that a policy that gets stuck cannot stall
            the first evaluation for long.

    Returns:
        The mean of the per-episode reward sums.
    """
    step_cap = 1000 if initial_evaluation else 10000
    reward_sum = 0.0
    step_sum = 0

    for _ in range(num_episodes):
        obs = env.reset()
        episode_reward = 0.0
        steps_taken = 0
        done = False

        # Roll the episode out until it terminates or the step cap is reached.
        while not done and steps_taken < step_cap:
            action = policy.select_action(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            steps_taken += 1

        reward_sum += episode_reward
        step_sum += steps_taken

    env.close()

    avg_reward = reward_sum / num_episodes
    avg_steps = step_sum / num_episodes

    separator = "---------------------------------------"
    print(separator)
    print("Average Reward over the Evaluation Step: %f Average steps over the Evaluation Step: %f " % (avg_reward, avg_steps))
    print(separator)
    return avg_reward


## We set the hyper-parameters for model training 

In [8]:
env_name = "MetaDrive-100envs-v0" # Label used to build the checkpoint/result file names. NOTE(review): the environment itself appears to be built from an explicit config dict via MetaDriveEnv, not from this name - confirm
seed = 0 # Random seed number
start_timesteps = 1e4 # Number of timesteps during which actions are sampled randomly; after that the policy network chooses the action
eval_freq = 1e4 # How often the evaluation step is performed (after how many timesteps)
max_timesteps = 2e6 # Total number of environment timesteps to train for
save_models = True # Whether the trained model is saved to disk
expl_noise = 0.1 # Exploration noise - STD of the Gaussian noise added to the action chosen by the policy while collecting data
batch_size = 500 # Number of transitions sampled from the replay buffer per update
discount = 0.99 # Discount factor gamma, used in the calculation of the total discounted reward
tau = 0.005 # Target network update rate (polyak averaging)
policy_noise = 0.5 # STD of the Gaussian noise added to the target action inside TD3.train (this is not the exploration noise; see expl_noise)
noise_clip = 0.2 # Absolute bound that the policy_noise samples are clamped to. NOTE(review): TD3.train defaults to policy_noise=0.2 / noise_clip=0.5; here the two magnitudes are the other way round, so most samples saturate at the bound - confirm this is intentional
policy_freq = 2 # Number of iterations to wait before the policy network (Actor model) is updated

## We create the replay buffer and load any existing replay buffers

In [None]:
#Every later cell reads the buffer through the name `replay_buffer`.
replay_buffer = ReplayBuffer()
#Resume from the transitions persisted by an earlier run, if there are any.
if os.path.exists("replay_buffer.pkl"):
    replay_buffer.load_buffer("replay_buffer.pkl")
#Old spelling kept as an alias so that existing references keep working.
replayBuffer = replay_buffer

## We create a file name for the two saved models: the Actor and Critic models

In [9]:
#Checkpoints and result files of this run share one stem: algorithm, environment label and seed.
file_name = f"TD3_{env_name}_{seed}"
print("---------------------------------------", "Settings: %s" % (file_name), "---------------------------------------", sep="\n")

---------------------------------------
Settings: TD3_MetaDrive-100envs-v0_0
---------------------------------------


## We create the folders in which the evaluation results and the trained models will be saved

In [10]:
#"./results" is always needed; "./pytorch_models" only when checkpoints are going to be written.
if os.path.exists("./results"):
  pass
else:
  os.makedirs("./results")
if save_models:
  if not os.path.exists("./pytorch_models"):
    os.makedirs("./pytorch_models")

## We create the MetaDrive environment

In [13]:
#Configuration passed to MetaDriveEnv for training. The top-level keys cover rendering, scenario generation,
#traffic and the reward/penalty magnitudes; `vehicle_config` holds the ego vehicle and sensor settings.
#NOTE(review): the meaning of the individual keys is defined by MetaDrive, not by this file - check its documentation before changing values.
config = dict(
    use_render=False,  #Flag to enable rendering.
    start_seed=100,  #Starting seed value for reproducibility.
    num_scenarios=20,  #Number of scenarios.
    traffic_density=0.1,  #Density of traffic.
    random_lane_width=True,  #Flag to randomly set lane widths.
    random_lane_num=True,  #Flag to randomly set the number of lanes.
    map=5,  #Map setting - presumably the number of road blocks per generated map; confirm against the MetaDrive documentation.
    vehicle_config={
        'increment_steering': False,
        'vehicle_model': 'default',
        'show_navi_mark': True,
        'extra_action_dim': 0,
        'enable_reverse': False,
        'random_navi_mark_color': False,
        'show_dest_mark': False,
        'show_line_to_dest': False,
        'show_line_to_navi_mark': False,
        'use_special_color': False,
        'image_source': 'rgb_camera',
        'navigation_module': None,
        'need_navigation': True,
        'spawn_lane_index': ('>', '>>', 0),
        'spawn_longitude': 5.0,
        'spawn_lateral': 0.0,
        'destination': None,
        'spawn_position_heading': None,
        'spawn_velocity': None,
        'spawn_velocity_car_frame': False,
        'overtake_stat': False,
        'random_color': False,
        'random_agent_model': False,
        'width': None,
        'length': None,
        'height': None,
        'mass': None,
        'lidar': {
            'num_lasers': 40, #LiDAR configured with 40 rays (presumably spread evenly around the vehicle - confirm).
            'distance': 50,
            'num_others': 0,
            'gaussian_noise': 0.0,
            'dropout_prob': 0.0,
            'add_others_navi': False
        },
        #Side and lane-line detectors are configured with 0 lasers.
        'side_detector': {
            'num_lasers': 0,
            'distance': 50,
            'gaussian_noise': 0.0,
            'dropout_prob': 0.0
        },
        'lane_line_detector': {
            'num_lasers': 0,
            'distance': 20,
            'gaussian_noise': 0.0,
            'dropout_prob': 0.0
        },
        'show_lidar': True, #Flag to enable rendering of LiDAR rays (presumably only visible when use_render is True - confirm).
        'mini_map': (84, 84, 250),
        'rgb_camera': (84, 84),
        'depth_camera': (84, 84, False),
        'main_camera': (1200, 900),
        'show_side_detector': False,
        'show_lane_line_detector': False,
        'rgb_clip': True,
        'stack_size': 3,
        'rgb_to_grayscale': False,
        'gaussian_noise': 0.0,
        'dropout_prob': 0.0
    },
    max_step_per_agent=25000,  #Maximum number of steps per agent; the training loop reads it back through env.config.
    random_traffic=True,  #Flag to enable random traffic.
    use_lateral_reward=True,  #Flag to use lateral reward.
    speed_reward=0.1,  #Reward value for speed.
    success_reward=10.0,  #Reward value for successful completion.
    out_of_road_penalty=50.0,  #Penalty for going out of road boundaries.
    crash_vehicle_penalty=50.0,  #Penalty for crashing into other vehicles.
    crash_object_penalty=50.0  #Penalty for crashing into objects.
)
#Instantiate the environment from the configuration above.
env =MetaDriveEnv(config)

## We set seeds and we get the necessary information on the states and actions in the chosen environment

In [14]:
env.seed(seed)  #Set the seed for the environment.
torch.manual_seed(seed)  #Set the seed for PyTorch.
np.random.seed(seed)  #Set the seed for NumPy.
state_dim = env.observation_space.shape[0]  #Get the dimensionality of the state space.
action_dim = env.action_space.shape[0]  #Get the dimensionality of the action space.
#A single bound is used for every action component. Index 0 exists for any non-empty action space, whereas index 1 raises IndexError for a one-dimensional one.
max_action = float(env.action_space.high[0])  #Get the maximum action value.



2


## We create the TD3 policy (the Actor and Critic models) and load previously saved weights if they exist

In [15]:
#Build a fresh TD3 agent for the dimensions read from the environment.
policy = TD3(state_dim, action_dim, max_action)
#Checkpoint files that policy.save would have written for this run.
actor_file_path = os.path.join("./pytorch_models", f"{file_name}_actor.pth")
critic_file_path = os.path.join("./pytorch_models", f"{file_name}_critic.pth")
if not os.path.exists(actor_file_path) or not os.path.exists(critic_file_path):
    #At least one of the two files is missing: keep the freshly initialised agent.
    print("A TD3 model was created")
else:
    #Both files are present: continue from the saved weights.
    policy.load(file_name, './pytorch_models/')
    print("A TD3 model already existed and is loaded")

A TD3 model already existed and is loaded


## We create the Experience Replay memory

In [None]:
#Start from an empty buffer, then restore the transitions persisted by an earlier run when the file is present.
replay_buffer = ReplayBuffer()
saved_buffer_path = "replay_buffer.pkl"
if os.path.exists(saved_buffer_path):
    replay_buffer.load_buffer(saved_buffer_path)

## We define a list where all the evaluation results over 10 episodes are stored

In [17]:
#The first entry is the score of the policy before the training loop runs; later evaluation scores are appended to this list.
evaluations = []
evaluations.append(evaluate_policy(policy, initial_evaluation=True))

---------------------------------------
Average Reward over the Evaluation Step: 405.343096 Average steps over the Evaluation Step: 539.300000 
---------------------------------------


## We initialize the variables

In [19]:
#Per-agent step budget, read back from the environment configuration.
max_episode_steps = env.config.max_step_per_agent
#All step and episode counters start at zero.
timesteps_since_eval = total_timesteps = episode_num = 0
#Starting with done=True sends the first iteration of the training loop through its reset branch.
done = True
#Wall-clock time at which this cell was run.
t0 = time.time()
#Sum of all rewards collected during training, across episodes.
total_reward = 0.0

## Training

In [20]:
#We start the main loop, which runs until max_timesteps environment steps have been taken.
#Relies on the earlier cells for env, policy, replay_buffer, evaluations, the counters and the hyper-parameters.
while total_timesteps < max_timesteps:

  #If the episode is done or hit the per-agent step limit.
  #`done` starts as True, so the short-circuit keeps episode_timesteps from being read before the reset branch below first assigns it.
  if done or episode_timesteps == env.config.max_step_per_agent:

    #If we are not at the very beginning and the buffer holds more than one batch, we train for as many iterations as the finished episode had steps.
    if total_timesteps != 0 and len(replay_buffer.storage) > batch_size:
      print("Total Timesteps: {} Episode Num: {} Reward: {}".format(total_timesteps, episode_num, episode_reward))
      policy.train(replay_buffer, episode_timesteps, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)

    #We evaluate the policy and we save it.
    if timesteps_since_eval >= eval_freq:
      timesteps_since_eval %= eval_freq
      evaluations.append(evaluate_policy(policy))
      #"./pytorch_models" is only created when save_models is set, so the periodic checkpoint must honour the flag as well.
      if save_models:
        policy.save(file_name, directory="./pytorch_models")
      np.save("./results/%s" % (file_name), evaluations)

    #When the training step is done, we reset the state of the environment.
    obs = env.reset()

    #Set the Done to False.
    done = False

    #Set rewards and episode timesteps to zero.
    episode_reward = 0
    episode_timesteps = 0
    episode_num += 1

  #Random actions encourage exploration and fill the replay buffer.
  if total_timesteps < start_timesteps:
    action = env.action_space.sample()
    #During the first 5000 timesteps the first action component is forced to 0 (presumably the steering channel - confirm against the MetaDrive action layout).
    if total_timesteps < 5000:
      action[0] = 0
  #After start_timesteps timesteps, we switch to the model.
  else:
    action = policy.select_action(np.array(obs))

    #If the expl_noise parameter is not 0, we add noise to the action and we clip it.
    if expl_noise != 0:
      action = (action + np.random.normal(0, expl_noise, size=env.action_space.shape[0])).clip(env.action_space.low, env.action_space.high)

  #The agent performs the action in the environment, then reaches the next state and receives the reward.
  new_obs, reward, done, info = env.step(action)

  #An episode cut off by the step limit is stored as not done, so the critic still bootstraps from its last state.
  done_bool = 0 if episode_timesteps + 1 == env.config.max_step_per_agent else float(done)

  #We increase the episode reward and the total reward.
  episode_reward += reward
  total_reward += reward

  #We store the new transition into the Experience Replay memory (ReplayBuffer).
  replay_buffer.add((obs, new_obs, action, reward, done_bool))

  #We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy.
  obs = new_obs
  episode_timesteps += 1
  total_timesteps += 1
  timesteps_since_eval += 1

#We add the last policy evaluation to our list of evaluations and we save our model.
evaluations.append(evaluate_policy(policy))
if save_models:
  policy.save("%s" % (file_name), directory="./pytorch_models")
np.save("./results/%s" % (file_name), evaluations)
env.reset()
env.close()
#We save the replay buffer for any future training.
replay_buffer.save_buffer("replay_buffer.pkl")
#Mean reward per started episode; the guard avoids a division by zero when the loop never ran.
average_reward_across_episodes = total_reward / max(episode_num, 1)

NameError: name 'replay_buffer' is not defined

## Testing the adaptive Module

In [None]:
def adaptive_policy_evaluation(policy, average_reward_across_episodes, num_episodes=10):
    """Run the policy for ``num_episodes`` episodes and retrain it when it performs poorly.

    Every transition is added to the module-level ``replay_buffer``. After each
    episode exactly one of the following applies, checked in this order:

    1. More than ``eval_freq`` timesteps have passed since the last scheduled
       retraining: train for 1000 iterations and restart that countdown.
    2. The episode reward is below 20% of the running average, or the episode
       ended with ``info['crash']`` or ``info['out_of_road']`` set: train for
       10 iterations.
    3. The episode reward is below 70% of the running average: count it, and
       on the third such episode in a row train for 30 iterations. Episodes
       handled by rule 1 or 2 leave this count untouched.
    4. Otherwise the low-score count is reset.

    The policy is saved after every retraining. Besides ``replay_buffer`` the
    function reads the module-level ``env``, ``eval_freq``, ``batch_size``,
    ``discount``, ``tau``, ``policy_noise``, ``noise_clip``, ``policy_freq``
    and ``file_name``. It closes ``env`` when all episodes are finished.

    Args:
        policy: Agent exposing ``select_action``, ``train`` and ``save``.
        average_reward_across_episodes: Reference reward the episodes are
            compared against. It is updated after every episode as the mean
            of its previous value and the new episode reward.
        num_episodes: Number of episodes to run.

    Returns:
        The mean episode reward over the ``num_episodes`` episodes.
    """
    total_reward = 0.0
    low_score_streak = 0  # Consecutive episodes that fell below 70% of the running average
    timesteps_since_eval = 0  # Timesteps since the last scheduled retraining

    for _ in range(num_episodes):
        obs = env.reset()  # Reset the environment for a new episode
        done = False
        episode_reward = 0.0
        info = {}  # Overwritten by every env.step; the last one is inspected below
        episode_timesteps = 0  # Count of timesteps within the episode

        while not done:
            action = policy.select_action(obs)  # Select an action using the policy
            new_obs, reward, done, info = env.step(action)  # Take a step in the environment
            # An episode cut off by the step limit is stored as not done.
            done_bool = 0 if episode_timesteps + 1 == env.config.max_step_per_agent else float(done)
            episode_reward += reward  # Accumulate the reward for the episode

            # Store the transition in the replay buffer for training
            replay_buffer.add((obs, new_obs, action, reward, done_bool))

            obs = new_obs  # Update the observation for the next timestep
            episode_timesteps += 1
            timesteps_since_eval += 1

        if timesteps_since_eval > eval_freq:
            print(f"Training model after {eval_freq} timesteps")
            policy.train(replay_buffer, 1000, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)
            policy.save("%s" % (file_name), directory="./pytorch_models")
            # Restart the countdown; otherwise every later episode would trigger the 1000-iteration training again.
            timesteps_since_eval %= eval_freq
        elif episode_reward < average_reward_across_episodes * 0.2 or info['crash'] or info['out_of_road']:
            # The episode reward is far below the average, or the agent crashed / left the road
            print("Training model due to low score")
            policy.train(replay_buffer, 10, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)
            policy.save("%s" % (file_name), directory="./pytorch_models")
        elif episode_reward < average_reward_across_episodes * 0.7:
            low_score_streak += 1
            if low_score_streak >= 3:
                # Third consecutive episode below 70% of the average
                print(f"Training model due to consecutive low scores ({low_score_streak} episodes)")
                policy.train(replay_buffer, 30, batch_size, discount, tau, policy_noise, noise_clip, policy_freq)
                policy.save("%s" % (file_name), directory="./pytorch_models")
                low_score_streak = 0
        else:
            low_score_streak = 0

        # Running reference: mean of the previous reference and this episode's reward
        average_reward_across_episodes = (average_reward_across_episodes + episode_reward) / 2
        total_reward += episode_reward

    # Close the environment once, after the last episode, instead of tearing it down between episodes.
    env.close()

    avg_reward = total_reward / num_episodes
    print("---------------------------------------")
    print("Average Reward over the Evaluation Step: %f" % (avg_reward))
    print("---------------------------------------")
    return avg_reward

# Driver for the adaptive evaluation: rebuilds the file-name stem, creates a rendering
# environment, reloads the saved TD3 weights and runs adaptive_policy_evaluation.
file_name = "%s_%s_%s" % ("TD3", env_name, str(seed))
print ("---------------------------------------")
print ("Settings: %s" % (file_name))
print ("---------------------------------------")
# NOTE(review): eval_episodes is not passed to adaptive_policy_evaluation below, so that call runs with its default num_episodes.
eval_episodes = 10
# Environment configuration for this run. Compared with the training configuration earlier in this file,
# rendering is switched on, map is 4 instead of 5, and the reward/penalty keys are not set.
# NOTE(review): if those omitted keys fall back to different defaults, episode rewards here are not on the
# same scale as average_reward_across_episodes computed during training - confirm.
config=dict(
          use_render= True, 
          start_seed= 100, 
          num_scenarios=20,
          traffic_density=0.1,
          random_lane_width=True,
          random_lane_num=True,
          map=4,
          vehicle_config= {'increment_steering': False, 'vehicle_model': 'default', 'show_navi_mark': True, 'extra_action_dim': 0, 'enable_reverse': False, 'random_navi_mark_color': False, 'show_dest_mark': False, 'show_line_to_dest': False, 'show_line_to_navi_mark': False, 'use_special_color': False, 'image_source': 'rgb_camera', 'navigation_module': None, 'need_navigation': True, 'spawn_lane_index': ('>', '>>', 0), 'spawn_longitude': 5.0, 'spawn_lateral': 0.0, 'destination': None, 'spawn_position_heading': None, 'spawn_velocity': None, 'spawn_velocity_car_frame': False, 'overtake_stat': False, 'random_color': False, 'random_agent_model': False, 'width': None, 'length': None, 'height': None, 'mass': None, 'lidar': {'num_lasers': 40, 'distance': 50, 'num_others': 0, 'gaussian_noise': 0.0, 'dropout_prob': 0.0, 'add_others_navi': False}, 'side_detector': {'num_lasers': 0, 'distance': 50, 'gaussian_noise': 0.0, 'dropout_prob': 0.0}, 'lane_line_detector': {'num_lasers': 0, 'distance': 20, 'gaussian_noise': 0.0, 'dropout_prob': 0.0}, 'show_lidar': True, 'mini_map': (84, 84, 250), 'rgb_camera': (84, 84), 'depth_camera': (84, 84, False), 'main_camera': (1200, 900), 'show_side_detector': False, 'show_lane_line_detector': False, 'rgb_clip': True, 'stack_size': 3, 'rgb_to_grayscale': False, 'gaussian_noise': 0.0, 'dropout_prob': 0.0},
          max_step_per_agent = 25000,
          random_traffic= True
)
# Replace the module-level environment with one built from the configuration above.
env = MetaDriveEnv(config)
# Seed the environment, PyTorch and NumPy with the same seed as the training run.
env.seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)
# Read the network dimensions and the action bound from the new environment.
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]
max_action = float(env.action_space.high[0])
# Build a fresh agent and restore the weights saved under ./pytorch_models for this file_name.
policy = TD3(state_dim, action_dim, max_action)
policy.load(file_name, './pytorch_models/')
# average_reward_across_episodes must already exist at module level (it is assigned at the end of the training cell).
_ = adaptive_policy_evaluation(policy,average_reward_across_episodes=average_reward_across_episodes)