# TD Deep Q-Network


Hédi Hadiji March 2023  
Adapted from Odalric-Ambrym Maillard

In [None]:
# Imports
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
from copy import deepcopy
import gymnasium as gym
from tqdm import tqdm

import time
import os
from IPython.display import clear_output

import matplotlib.pyplot as plt

In [None]:
# Compute device shared by the whole notebook: the GPU when PyTorch can see
# one, the CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Log the interpreter and library versions this notebook was run with, so the
# results below can be reproduced.
print(f"python --version = {sys.version}")
print(f"torch.__version__ = {torch.__version__}")
print(f"np.__version__ = {np.__version__}")
print(f"gym.__version__ = {gym.__version__}")

python --version = 3.10.12 (main, Nov 20 2023, 15:14:05) [GCC 11.4.0]
torch.__version__ = 2.2.1+cu121
np.__version__ = 1.25.2
gym.__version__ = 0.29.1


In [None]:
def run_one_episode(env, agent, display=True):
    """Play one greedy episode on a copy of ``env`` and print its return.

    The agent is queried with ``epsilon = 0`` (no exploration). The episode
    is run on a deep copy so that the caller's environment is left untouched.

    Args:
        env: Gymnasium-style environment; ``reset()`` must return
            ``(obs, info)`` and ``step()`` the 5-tuple
            ``(obs, reward, terminated, truncated, info)``.
        agent: Object exposing ``get_action(state, epsilon)``.
        display: When true, render every frame inline with matplotlib
            (the environment must then support ``render()``).

    Returns:
        None. The sum of rewards collected in the episode is printed.
    """
    display_env = deepcopy(env)
    total_reward = 0

    try:
        state, _ = display_env.reset()
        done = False
        while not done:
            action = agent.get_action(state, 0)
            state, reward, terminated, truncated, _ = display_env.step(action)
            total_reward += reward
            # An episode ends on a terminal state OR on truncation (time
            # limit). Looking at `terminated` only would keep stepping an
            # environment whose episode is already over.
            done = terminated or truncated
            if display:
                clear_output(wait=True)
                plt.imshow(display_env.render())
                plt.show()
    finally:
        # Always release the copy, even if stepping or rendering raised.
        display_env.close()

    # The accumulated quantity is the sum of rewards, not a step count.
    print(f'Episode return {total_reward}')

In [None]:
class ReplayBuffer:
    """Fixed-capacity ring buffer of transitions.

    Once ``capacity`` transitions are stored, each new one overwrites the
    oldest entry.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        # Index of the slot the next pushed transition is written to.
        self.position = 0

    def push(self, state, action, reward, terminated, next_state):
        """Store one transition, evicting the oldest one when full."""
        transition = (state, action, reward, terminated, next_state)
        if len(self.memory) < self.capacity:
            # Still filling up: the write slot is the end of the list.
            self.memory.append(transition)
        else:
            self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn with replacement."""
        return random.choices(self.memory, k=batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

# create instance of replay buffer
#replay_buffer = ReplayBuffer(BUFFER_CAPACITY)

In [None]:
class Net(nn.Module):
    """
    Two-layer perceptron: Linear -> ReLU -> Linear.

    Maps a batch of flat observations of width ``obs_size`` to one output per
    action (``n_actions`` columns).
    """
    def __init__(self, obs_size, hidden_size, n_actions):
        super().__init__()
        layers = [
            nn.Linear(obs_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        # Move the input to the module-level `device`, run the network there,
        # and hand the result back on the CPU.
        on_device = x.to(device)
        q_values = self.net(on_device)
        return q_values.cpu()

In [None]:
class DQN_Skeleton:
    """Shared state and helpers for the DQN agents defined below.

    Stores the hyper-parameters, builds the replay buffer, the online and
    target Q-networks and the optimizer (see ``reset``), and provides the
    Q-value query and the epsilon schedule. ``get_action`` and ``update``
    are placeholders to be overridden by subclasses.

    Args:
        action_space: Discrete action space; ``.n`` and ``.sample()`` are used.
        observation_space: Observation space; only ``.shape`` is used.
        gamma: Discount factor.
        batch_size: Number of transitions sampled per learning step.
        buffer_capacity: Maximum number of transitions kept in the buffer.
        update_target_every: Period (in learning steps) of target-network syncs.
        epsilon_start: Initial exploration probability.
        decrease_epsilon_factor: Time constant (in episodes) of the
            exponential epsilon decay.
        epsilon_min: Asymptotic exploration probability.
        learning_rate: Learning rate of the AdamW optimizer.
    """

    def __init__(self,
                action_space,
                observation_space,
                gamma,
                batch_size,
                buffer_capacity,
                update_target_every,
                epsilon_start,
                decrease_epsilon_factor,
                epsilon_min,
                learning_rate,
                ):
        self.action_space = action_space
        self.observation_space = observation_space
        self.gamma = gamma

        self.batch_size = batch_size
        self.buffer_capacity = buffer_capacity
        self.update_target_every = update_target_every

        self.epsilon_start = epsilon_start
        self.decrease_epsilon_factor = decrease_epsilon_factor # larger -> more exploration
        self.epsilon_min = epsilon_min

        self.learning_rate = learning_rate

        self.reset()

    def get_action(self, state):
        """
        ** TO BE IMPLEMENTED LATER**

        Return action according to an epsilon-greedy exploration policy
        """
        pass

    def update(self, *data):
        """
        ** TO BE IMPLEMENTED LATER **

        Updates the buffer and the network(s)
        """
        pass

    def get_q(self, state):
        """
        Compute the Q-values of every action for a single state.

        Args:
            state: Array-like observation of any shape; it is flattened and
                converted to float32 before being fed to the network.

        Returns:
            NumPy array of shape (n_actions,).
        """
        # Cast to float32 so that float64 or list observations do not hit a
        # dtype mismatch against the float32 network weights.
        state = np.asarray(state, dtype=np.float32).flatten()
        state_tensor = torch.tensor(state).unsqueeze(0)
        with torch.no_grad():
            output = self.q_net.forward(state_tensor) # shape (1,  n_actions)
        return output.numpy()[0]  # shape  (n_actions)

    def decrease_epsilon(self):
        """Set epsilon to its exponentially decayed value for ``self.n_eps``.

        epsilon = epsilon_min
                  + (epsilon_start - epsilon_min) * exp(-n_eps / decrease_epsilon_factor)
        """
        self.epsilon = self.epsilon_min + (self.epsilon_start - self.epsilon_min) * (
                        np.exp(-1. * self.n_eps / self.decrease_epsilon_factor ) )

    def reset(self):
        """(Re)create the buffer, networks, optimizer and counters."""
        hidden_size = 1024
        # Width of a flattened observation = product of all its dimensions.
        obs_size = int(np.prod(self.observation_space.shape))
        n_actions = self.action_space.n

        self.buffer = ReplayBuffer(self.buffer_capacity)
        self.q_net =  Net(obs_size, hidden_size, n_actions).to(device)
        self.target_net = Net(obs_size, hidden_size, n_actions).to(device)
        # Start the target network as an exact copy of the online network;
        # otherwise the TD targets come from unrelated random weights until
        # the first periodic sync.
        self.target_net.load_state_dict(self.q_net.state_dict())

        self.loss_function = nn.MSELoss()
        self.optimizer = optim.AdamW(params=self.q_net.parameters(), lr=self.learning_rate)

        self.epsilon = self.epsilon_start
        self.n_steps = 0
        self.n_eps = 0


In [None]:
class RandomAgent:
    """Baseline agent that ignores observations and never learns."""

    def __init__(self, observation_space, action_space):
        # `observation_space` is accepted but not used.
        self.action_space = action_space

    def get_action(self, state, *args):
        """Return an action sampled from the action space; inputs are ignored."""
        random_action = self.action_space.sample()
        return random_action

    def update(self, *data):
        """No-op: there is nothing to learn."""
        return None

In [None]:
class DQN_SkeletonI(DQN_Skeleton):
    def get_action(self, state, epsilon=None):
        """
            ** Solution **

            Epsilon-greedy action selection.

            With probability ``epsilon`` (defaulting to the agent's current
            ``self.epsilon`` when not given) a random action is sampled from
            the action space; otherwise the action with the highest Q-value
            for ``state`` is returned.
        """
        explore_prob = self.epsilon if epsilon is None else epsilon

        if np.random.rand() < explore_prob:
            return self.action_space.sample()

        q_values = self.get_q(state)
        return np.argmax(q_values)

In [None]:
def eval_agent(agent, env, n_sim=5):
    """
    ** Solution **

    Monte Carlo evaluation of an agent's greedy policy.

    Runs ``n_sim`` full episodes on a deep copy of ``env`` (the caller's
    environment is not stepped), always querying the agent with epsilon = 0,
    and records the sum of rewards of each episode.

    Returns a NumPy array of length ``n_sim`` holding the episode returns.
    """
    sim_env = deepcopy(env)
    episode_rewards = np.zeros(n_sim)
    for episode in range(n_sim):
        state, _ = sim_env.reset()
        episode_return = 0
        while True:
            greedy_action = agent.get_action(state, 0)
            state, reward, terminated, truncated, _ = sim_env.step(greedy_action)
            episode_return += reward
            # The episode is over on a terminal state or on truncation.
            if terminated or truncated:
                break
        episode_rewards[episode] = episode_return
    return episode_rewards

In [None]:
class DQN(DQN_SkeletonI):
    def update(self, state, action, reward, terminated, next_state):
        """
        ** SOLUTION **

        Store one transition and perform one TD learning step.

        The transition is always pushed to the replay buffer. While the
        buffer holds fewer than ``batch_size`` transitions nothing is learned,
        no counter is advanced, and ``np.inf`` is returned. Otherwise a batch
        is sampled, the online network is regressed (MSE) onto
        ``reward + gamma * (1 - terminated) * max_a target_net(next_state)[a]``,
        the target network is synced every ``update_target_every`` learning
        steps, epsilon is refreshed, and the loss is returned as a NumPy value.
        """
        # Store every field as a tensor with a leading batch axis so that a
        # sampled batch can be assembled with torch.cat.
        transition = (
            torch.tensor(state).unsqueeze(0),
            torch.tensor([[action]], dtype=torch.int64),
            torch.tensor([reward]),
            torch.tensor([terminated], dtype=torch.int64),
            torch.tensor(next_state).unsqueeze(0),
        )
        self.buffer.push(*transition)

        # Not enough data yet to form a batch.
        if len(self.buffer) < self.batch_size:
            return np.inf

        batch = self.buffer.sample(self.batch_size)
        states, actions, rewards, terminals, next_states = (
            torch.cat(column) for column in zip(*batch)
        )

        # Q(s, a) of the online network for the actions actually taken.
        q_taken = self.q_net(states).gather(1, actions)

        # TD targets from the target network; no bootstrap on terminal states.
        with torch.no_grad():
            best_next_q = self.target_net(next_states).max(dim=1).values
            bootstrap = (1 - terminals) * best_next_q
            td_targets = bootstrap * self.gamma + rewards

        loss = self.loss_function(q_taken, td_targets.unsqueeze(1).float())

        # Gradient step on the online network (gradient clipping is left
        # disabled, as in the original notebook).
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Periodically copy the online weights into the target network.
        if (self.n_steps + 1) % self.update_target_every == 0:
            self.target_net.load_state_dict(self.q_net.state_dict())

        # Epsilon is recomputed from the episode count *before* that count
        # is advanced below.
        self.decrease_epsilon()

        self.n_steps += 1
        if terminated:
            self.n_eps += 1

        return loss.detach().numpy()

In [None]:
# Create the highway driving environment; "rgb_array" rendering makes
# env.render() return image arrays (used by run_one_episode for display).
env = gym.make("highway-fast-v0", render_mode="rgb_array")

# Environment configuration handed to highway-env below. Key semantics are
# defined by the highway-env library (not visible in this notebook).
config = {
    # Observation: an occupancy grid around the ego vehicle, one layer per
    # listed feature.
    "observation": {
        "type": "OccupancyGrid",
        "vehicles_count": 10,
        "features": ["presence", "x", "y", "vx", "vy", "cos_h", "sin_h"],
        "features_range": {
            "x": [-100, 100],
            "y": [-100, 100],
            "vx": [-20, 20],
            # NOTE(review): "vy" starts at 0 while "vx" is symmetric around 0;
            # confirm that [0, 20] (rather than [-20, 20]) is intended.
            "vy": [ 00, 20],
        },
        "grid_size": [[-20, 20], [-20, 20]],
        "grid_step": [5, 5],
        "absolute": False,
    },
    # Discrete action set (env.action_space.n is 9 with this setting, as
    # printed further down in the notebook).
    "action": {
        "type": "DiscreteAction",
    },
    "lanes_count": 3,
    "vehicles_count": 10,
    "duration": 20,  # [s]
    "initial_spacing": 0,
    "collision_reward": -1,  # The reward received when colliding with a vehicle.
    "right_lane_reward": 0.6,  # The reward received when driving on the right-most lanes, linearly mapped to
    # zero for other lanes.
    "high_speed_reward": 0.6,  # The reward received when driving at full speed, linearly mapped to zero for
    # lower speeds according to config["reward_speed_range"].
    "lane_change_reward": 0,
    "reward_speed_range": [
        10,
        30,
    ],  # [m/s] The reward for high speed is mapped linearly from this range to [0, HighwayEnv.HIGH_SPEED_REWARD].
    "simulation_frequency": 5,  # [Hz]
    "policy_frequency": 1,  # [Hz]
    "other_vehicles_type": "highway_env.vehicle.behavior.IDMVehicle",
    "screen_width": 600,  # [px]
    "screen_height": 150,  # [px]
    "centering_position": [0.3, 0.5],
    "scaling": 5.5,
    "show_trajectories": True,
    "render_agent": True,
    "offscreen_rendering": False,
    # NOTE(review): collision checks are disabled although a collision
    # penalty is configured above -- confirm this is intended for training.
    "disable_collision_checks": True,
}


# Apply the configuration to the underlying (unwrapped) environment.
# Presumably it only takes full effect at the next env.reset() -- confirm
# against the highway-env documentation.
env.unwrapped.configure(config)

In [None]:
# Sanity check: reset the environment and look at the shape of one flattened
# observation once it is turned into a batch of size 1.
state, _ = env.reset()
state = state.flatten()  # Flattening the state at reset for each episode
state_tensor = torch.tensor(state).unsqueeze(0)  # add a leading batch axis
state_tensor.size()  # displayed as the cell output

  and should_run_async(code)


torch.Size([1, 448])

In [None]:
# Number of axes of the (unflattened) observation space.
len(env.observation_space.shape)

3

In [None]:
# Width of a flattened observation (product of the three observation axes)
# and number of discrete actions; both size the exploratory network below.
obs_size = env.observation_space.shape[0] * env.observation_space.shape[1] * env.observation_space.shape[2]
n_actions = env.action_space.n
print(obs_size)
print(n_actions)

448
9


In [None]:
# Exploratory stand-alone network (two hidden layers with dropout). It is
# only used for the dtype check further down; the DQN agent builds its own
# `Net` instances in DQN_Skeleton.reset.
hidden_size = 128
net = nn.Sequential(
    nn.Linear(obs_size, hidden_size),
    nn.ReLU(),
    nn.Dropout(0.5),  # Adding dropout for regularization
    nn.Linear(hidden_size, hidden_size),  # Adding an extra hidden layer
    nn.ReLU(),
    nn.Dropout(0.3),  # Adding dropout for regularization
    nn.Linear(hidden_size, n_actions)
)

In [None]:
# Exploratory: builds a CrossEntropyLoss module and discards it (only its
# repr is shown as the cell output). The agent itself uses nn.MSELoss.
nn.CrossEntropyLoss()

CrossEntropyLoss()

In [None]:
# Check that the network output and the observation tensor share a dtype.
print(net(state_tensor).dtype)
print(state_tensor.dtype)

torch.float32
torch.float32


In [None]:
def train(env, agent, N_episodes, eval_every=10, reward_threshold=300):
    """Train ``agent`` on ``env`` for up to ``N_episodes`` episodes.

    At every environment step the agent picks an action with its current
    epsilon, and ``agent.update`` is called with the flattened float32
    transition. Every ``eval_every`` episodes the agent is evaluated with
    ``eval_agent``; training stops early once the mean evaluation reward
    reaches ``reward_threshold``.

    Args:
        env: Gymnasium-style environment.
        agent: Object exposing ``epsilon``, ``get_action(state, epsilon)``
            and ``update(state, action, reward, terminated, next_state)``.
        N_episodes: Maximum number of training episodes.
        eval_every: Evaluation period, in episodes.
        reward_threshold: Mean evaluation reward that triggers early stopping.

    Returns:
        List with the value returned by ``agent.update`` at every step.
    """
    losses = []
    for ep in range(N_episodes):
        # One reset per episode; observations are flattened and cast to
        # float32 so the start state matches the dtype of later states.
        state, _ = env.reset()
        state = state.flatten().astype(np.float32)
        done = False
        while not done:
            action = agent.get_action(state, agent.epsilon)
            next_state, reward, terminated, truncated, _ = env.step(action)
            next_state = next_state.flatten().astype(np.float32)
            # Only `terminated` is forwarded: truncation ends the episode but
            # must not be treated as a terminal state when bootstrapping.
            loss_val = agent.update(state, action, reward, terminated, next_state)
            losses.append(loss_val)
            state = next_state
            done = terminated or truncated

        if (ep + 1) % eval_every == 0:
            mean_reward = np.mean(eval_agent(agent, env))
            print("episode =", ep+1, ", reward = ", mean_reward, ", loss = ", loss_val)
            if mean_reward >= reward_threshold:
                break

    return losses

In [None]:
action_space = env.action_space
observation_space = env.observation_space

# Hyper-parameters. (An earlier, immediately overwritten set of values for
# batch_size / buffer_capacity / update_target_every was removed: only the
# values below were ever used.)
gamma = 0.99
batch_size = 64
buffer_capacity = 10_000
update_target_every = 512

# Exploration schedule: epsilon decays exponentially from epsilon_start to
# epsilon_min, with decrease_epsilon_factor as the time constant in episodes.
epsilon_start = 0.99
decrease_epsilon_factor = 1000 # 300
epsilon_min = 0.03

learning_rate = 1e-3

# Positional arguments, in the order expected by DQN_Skeleton.__init__.
arguments = (action_space,
            observation_space,
            gamma,
            batch_size,
            buffer_capacity,
            update_target_every,
            epsilon_start,
            decrease_epsilon_factor,
            epsilon_min,
            learning_rate,
        )

N_episodes = int(5e2)

agent = DQN(*arguments)


# Run the training loop
losses = train(env, agent, N_episodes)

# Per-step loss curve (steps taken before the buffer held a full batch are
# recorded as np.inf by DQN.update).
plt.plot(losses)

# Evaluate the final policy
rewards = eval_agent(agent, env, 20)
print("")
print("mean reward after training = ", np.mean(rewards))

episode = 10 , reward =  7.28125 , loss =  0.047213484
episode = 20 , reward =  6.225 , loss =  0.062425558
episode = 30 , reward =  7.1875 , loss =  0.05611602
episode = 40 , reward =  7.030844082092949 , loss =  0.07104853
episode = 50 , reward =  3.775 , loss =  0.073089175
episode = 60 , reward =  3.05 , loss =  0.12214478
episode = 70 , reward =  1.5625 , loss =  0.10989931
episode = 80 , reward =  3.040806403610733 , loss =  0.08195495
episode = 90 , reward =  2.393070905545266 , loss =  0.07984654
episode = 100 , reward =  2.29375 , loss =  0.08992642
episode = 110 , reward =  3.168344082092948 , loss =  0.17696407
episode = 120 , reward =  4.547956968515089 , loss =  0.09148943
episode = 130 , reward =  3.3979015982943084 , loss =  0.10553448
episode = 140 , reward =  3.3160382279329736 , loss =  0.07985937
episode = 150 , reward =  3.6976564036107336 , loss =  0.14551772
episode = 160 , reward =  3.3102515982943084 , loss =  0.085222036
episode = 170 , reward =  2.369862 , los

In [None]:
# Returns of a few greedy evaluation episodes (eval_agent's default n_sim).
print("Rewards after training = ", eval_agent(agent, env))

In [None]:
# Watch the trained agent drive one greedy episode, rendered inline.
run_one_episode(env, agent, display=True)