### Libraries

In [None]:
import gymnasium as gym
import highway_env
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
from collections import deque, namedtuple
import matplotlib.pyplot as plt

### Defining environment

In [None]:
# Settings for the highway-env merge scenario: a small kinematic observation
# (relative x/y position and velocity of the 5 tracked vehicles) and a
# continuous action space, as required by DDPG.
# NOTE(review): the top-level "lanes_count" / "vehicles_count" keys look like
# highway-v0 options; confirm that merge-v0 actually reads them.
config = {
    "observation": {
        "type": "Kinematics",
        "vehicles_count": 5,
        "features": ["x", "y", "vx", "vy"],
        "absolute": False,
    },
    "action": {
        "type": "ContinuousAction",
    },
    "lanes_count": 2,
    "vehicles_count": 10,
    "duration": 40,
    "simulation_frequency": 15,
    "policy_frequency": 5,
    "screen_width": 600,
    "screen_height": 150,
    "scaling": 5.5,
    "render_agent": True,
}

env = gym.make("merge-v0", render_mode="rgb_array")
env.unwrapped.configure(config)
# Per the highway-env documentation, a configuration change must be followed
# by reset(): until then observation_space / action_space still describe the
# default environment (discrete actions), so code that sizes networks from
# them would see the wrong shapes.
env.reset()

### Neural Network

In [None]:
class Actor(nn.Module):
    """Deterministic policy network: state -> action squashed into [-1, 1].

    Two hidden layers of 256 ReLU units followed by a tanh output layer of
    size ``action_dim``.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 256
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.out = nn.Linear(hidden_units, action_dim)

    def forward(self, x):
        """Return the tanh-bounded action for the state batch ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.out(hidden).tanh()

class Critic(nn.Module):
    """Q-network: scores a (state, action) pair with a single scalar value.

    The state and action are concatenated along dimension 1 and passed
    through two hidden layers of 256 ReLU units and a linear output unit.
    """

    def __init__(self, state_dim, action_dim):
        super().__init__()
        hidden_units = 256
        self.fc1 = nn.Linear(state_dim + action_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.out = nn.Linear(hidden_units, 1)

    def forward(self, state, action):
        """Return the Q-value estimate for each (state, action) row."""
        joint_input = torch.cat((state, action), dim=1)
        hidden = torch.relu(self.fc1(joint_input))
        hidden = torch.relu(self.fc2(hidden))
        q_value = self.out(hidden)
        return q_value

In [None]:
# One stored environment step.
Transition = namedtuple(
    "Transition",
    ("state", "action", "reward", "next_state", "done"),
)


class ReplayBuffer:
    """Fixed-capacity FIFO store of ``Transition`` records.

    Once ``capacity`` entries are held, each new entry evicts the oldest one.
    """

    def __init__(self, capacity=100000):
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        """Store one transition built from the positional ``Transition`` fields."""
        transition = Transition(*args)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions drawn uniformly at random."""
        picked = random.sample(self.buffer, batch_size)
        return picked

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

### Training function

We could not resolve the problem in the cell below. As a result, we could not verify that the subsequent code works; it was written based on other examples.

In [None]:
def _soft_update(online, target, tau):
    """Move ``target``'s parameters a fraction ``tau`` towards ``online``'s, in place."""
    with torch.no_grad():
        for param, target_param in zip(online.parameters(), target.parameters()):
            # target <- (1 - tau) * target + tau * online
            target_param.mul_(1.0 - tau).add_(param, alpha=tau)


def train_ddpg(env, episodes=200, *, gamma=0.99, tau=0.005, batch_size=64,
               exploration_noise=0.1):
    """Train a DDPG agent on ``env`` and return the return of every episode.

    Args:
        env: Gymnasium environment with a continuous (Box) action space.
            Actions are clipped to [-1, 1] before being sent to the
            environment. Observations may have any shape; they are flattened
            before being fed to the networks.
        episodes: Number of episodes to run.
        gamma: Discount factor used in the bootstrapped critic target.
        tau: Interpolation factor of the soft target-network update.
        batch_size: Number of transitions sampled per gradient step; learning
            starts once the replay buffer holds at least this many.
        exploration_noise: Standard deviation of the Gaussian noise added to
            the actor's action while collecting experience.

    Returns:
        A list with the undiscounted sum of rewards of each episode.

    Raises:
        ValueError: If the action space is not array-shaped (e.g. discrete).
    """
    # Reset once before reading the spaces: highway-env only applies a
    # configure() call on reset, so the spaces may otherwise still describe
    # the default configuration.
    env.reset()

    if not env.action_space.shape:
        raise ValueError(
            "train_ddpg requires a continuous (Box) action space, got "
            f"{env.action_space!r}"
        )
    # Multi-dimensional observations (e.g. vehicles x features) are flattened,
    # so the network input size is the product of all dimensions.
    state_dim = int(np.prod(env.observation_space.shape))
    action_dim = env.action_space.shape[0]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def to_state(obs):
        # Copy into a (1, state_dim) tensor so stored states can be
        # concatenated into a (batch, state_dim) batch later.
        return torch.tensor(obs, dtype=torch.float32, device=device).reshape(1, -1)

    actor = Actor(state_dim, action_dim).to(device)
    target_actor = Actor(state_dim, action_dim).to(device)
    target_actor.load_state_dict(actor.state_dict())

    critic = Critic(state_dim, action_dim).to(device)
    target_critic = Critic(state_dim, action_dim).to(device)
    target_critic.load_state_dict(critic.state_dict())

    actor_optim = torch.optim.Adam(actor.parameters(), lr=1e-4)
    critic_optim = torch.optim.Adam(critic.parameters(), lr=1e-3)
    memory = ReplayBuffer()

    all_rewards = []

    for ep in range(episodes):
        obs = env.reset()[0]
        state = to_state(obs)
        total_reward = 0.0
        done = False

        while not done:
            with torch.no_grad():
                action = actor(state).squeeze(0).cpu().numpy()
            noise = np.random.normal(0, exploration_noise, size=action_dim)
            action = np.clip(action + noise, -1, 1).astype(np.float32)

            next_obs, reward, terminated, truncated, _ = env.step(action)
            next_state = to_state(next_obs)
            action_tensor = torch.tensor(
                action, dtype=torch.float32, device=device
            ).unsqueeze(0)
            # Only a true termination disables bootstrapping; a time-limit
            # truncation ends the episode but the next state still has value.
            memory.push(state, action_tensor, float(reward), next_state,
                        bool(terminated))

            state = next_state
            total_reward += float(reward)
            # The episode is over on either signal; ignoring `truncated`
            # would keep stepping an environment that has already timed out.
            done = bool(terminated or truncated)

            if len(memory) >= batch_size:
                batch = Transition(*zip(*memory.sample(batch_size)))
                states = torch.cat(batch.state)
                actions = torch.cat(batch.action)
                rewards = torch.tensor(
                    batch.reward, dtype=torch.float32, device=device
                ).unsqueeze(1)
                next_states = torch.cat(batch.next_state)
                dones = torch.tensor(
                    batch.done, dtype=torch.float32, device=device
                ).unsqueeze(1)

                # Critic target: y = r + gamma * (1 - done) * Q'(s', mu'(s'))
                with torch.no_grad():
                    target_actions = target_actor(next_states)
                    target_q = target_critic(next_states, target_actions)
                    y = rewards + gamma * (1 - dones) * target_q

                critic_loss = F.mse_loss(critic(states, actions), y)
                critic_optim.zero_grad()
                critic_loss.backward()
                critic_optim.step()

                # Actor update: ascend the critic's estimate of Q(s, mu(s)).
                actor_loss = -critic(states, actor(states)).mean()
                actor_optim.zero_grad()
                actor_loss.backward()
                actor_optim.step()

                # Let the target networks slowly track the online networks.
                _soft_update(actor, target_actor, tau)
                _soft_update(critic, target_critic, tau)

        all_rewards.append(total_reward)
        print(f"Ep {ep} | Reward: {total_reward:.2f}")

    return all_rewards

### Plotting

In [None]:
# Run training with the default settings, then draw the learning curve
# (total reward per episode) on the current axes.
rewards = train_ddpg(env)

ax = plt.gca()
ax.plot(rewards)
ax.set_xlabel("Episode")
ax.set_ylabel("Total reward")
ax.set_title("DDPG Training - Ongoing Actions")
ax.grid(True)
plt.show()

### Visualization

Still to be implemented: a video showing the simulation.