In [None]:
from collections import namedtuple, deque
import numpy as np

from tqdm import tqdm
import torch
from torch.nn.utils import clip_grad_norm_
import torch.nn as nn
import torch.optim as optim
from IPython.display import clear_output
import matplotlib.pyplot as plt
from copy import deepcopy
import collections


import random
import wandb
import copy

In [None]:
import gymnasium as gym
from gymnasium.wrappers import MaxAndSkipObservation, ResizeObservation, GrayscaleObservation, FrameStackObservation, ReshapeObservation
from stable_baselines3.common.atari_wrappers import NoopResetEnv, MaxAndSkipEnv, FireResetEnv, EpisodicLifeEnv
import ale_py
import cv2
from gymnasium.spaces import Box

### Environment preparation

In [None]:
# Name of the environment to use (the id that make_env hands to gym.make)
ENV_NAME = "ALE/Frogger-v5"

In [None]:
class ScaledFloatFrame(gym.ObservationWrapper):
    """
    Observation wrapper that rescales pixel observations from 0-255 to 0-1.

    Every observation is converted to float32 and divided by 255. The wrapper also
    advertises a matching observation space (float32 values in [0, 1], same shape as
    the wrapped environment), so `env.observation_space` describes what
    `observation` actually returns.
    """
    def __init__(self, env):
        """
        Args:
            env (gym.Env): Environment whose observations are 0-255 pixel arrays
        """
        super().__init__(env)
        # Without this override the space inherited from the wrapped environment still
        # claims uint8 values in [0, 255], which contradicts the scaled observations.
        self.observation_space = gym.spaces.Box(
            low=0.0, high=1.0, shape=env.observation_space.shape, dtype=np.float32
        )

    def observation(self, obs):
        """
        Args:
            obs (array-like): Observation produced by the wrapped environment
        Returns:
            np.ndarray: float32 array holding `obs` divided by 255
        """
        return np.asarray(obs, dtype=np.float32) / 255.0


def make_env(env_name):
    """
    Create the environment `env_name` and wrap it with the preprocessing pipeline.

    The wrappers are applied in this order: MaxAndSkipObservation (skip=4),
    ResizeObservation to 84x84, FrameStackObservation with 4 frames and
    ScaledFloatFrame. The observation shape is printed after each stage.

    Args:
        env_name (str): Id of the environment, forwarded to gym.make
    Returns:
        The fully wrapped environment
    """
    def report(template, wrapped_env):
        # Show the observation shape reached after the latest stage
        print(template.format(wrapped_env.observation_space.shape))

    env = gym.make(env_name, obs_type="grayscale", render_mode="rgb_array")
    report("Original Env.        : {}", env)

    env = MaxAndSkipObservation(env, skip=4)
    report("MaxAndSkipObservation: {}", env)

    env = ResizeObservation(env, (84, 84))
    report("ResizeObservation    : {}", env)

    env = FrameStackObservation(env, stack_size=4)
    report("FrameStackObservation: {}", env)

    env = ScaledFloatFrame(env)
    report("ScaledFloatFrame     : {}", env)

    return env


# Build the wrapped environment and show the spaces the agent will work with
env = make_env(ENV_NAME)
print(
    "\nAction space is {} ".format(env.action_space),
    "Observation space is {} ".format(env.observation_space),
    sep="\n",
)

Standard Env.        : (210, 160)
MaxAndSkipObservation: (210, 160)
ResizeObservation    : (84, 84)
FrameStackObservation: (4, 84, 84)
ScaledFloatFrame     : (4, 84, 84)

Action space is Discrete(5) 
Observation space is Box(0, 255, (4, 84, 84), uint8) 


### Dueling DQN

In [None]:
class DuelingDQN(nn.Module):
    """
    Implementation of the Dueling DQN architecture.

    A convolutional feature extractor produces a 512-dimensional embedding that feeds
    two linear heads: a scalar state value V(s) and one advantage A(s, a) per action.
    The heads are combined as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).
    """
    def __init__(self, input_shape, output_shape):
        """
        Args:
            input_shape (tuple): Shape of the input image as (channels, height, width)
            output_shape (int): Number of actions
        """
        super(DuelingDQN, self).__init__()

        # Convolutional layers
        conv_layers = [
            nn.Conv2d(input_shape[0], 32, kernel_size=5, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]

        # Size of the flattened feature map. It is measured with a dummy forward pass so
        # the network adapts to any image size instead of assuming 84x84 inputs. When
        # only the channel count is given, the 84x84 value (64 * 5 * 5) is kept.
        flat_features = 1600
        if len(input_shape) == 3:
            with torch.no_grad():
                flat_features = nn.Sequential(*conv_layers)(torch.zeros(1, *input_shape)).shape[1]

        # The layers stay in one Sequential, in the original order, so the state_dict
        # keys ("net.0.weight", ..., "net.9.weight") remain loadable from old checkpoints.
        self.net = nn.Sequential(
            *conv_layers,
            nn.Linear(flat_features, 512),
            nn.ReLU()
        )

        # Value and Advantage layers
        self.value_prediction = nn.Linear(512, 1)
        self.advantage_prediction = nn.Linear(512, output_shape)


    def forward(self, x):
        """
        Args:
            x (torch.Tensor): Batch of observations, (batch, channels, height, width)
        Returns:
            torch.Tensor: Q-values with shape (batch, number of actions)
        """
        # Generating the embedding
        embedding = self.net(x)
        # Value and Advantage prediction
        value = self.value_prediction(embedding)
        advantage = self.advantage_prediction(embedding)
        # Subtracting the mean advantage makes the split into value and advantage
        # unique: the mean of the Q-values over the actions equals the value head.
        return value + advantage - advantage.mean(dim=-1, keepdim=True)

### Double DQN Implementation

In [None]:
def compute_loss(model, target_model, states, actions, rewards, dones, next_states, gamma=0.99, criterion=nn.MSELoss(), device="cpu"):
    """
    Compute the loss for the DQN model using the double DQN loss function.

    The online network picks the best next action and the target network evaluates it:
        y = r + gamma * Q_target(s', argmax_a Q(s', a)), with y = r for terminal transitions.

    Args:
        model (nn.Module): DQN model
        target_model (nn.Module): Target DQN model
        states (torch.Tensor): States tensor
        actions (torch.Tensor): Actions tensor (int64, one action index per sample)
        rewards (torch.Tensor): Rewards tensor
        dones (torch.Tensor): Dones tensor (boolean mask of terminal transitions)
        next_states (torch.Tensor): Next states tensor
        gamma (float): Discount factor
        criterion (torch.nn): Loss function
        device (str): Unused; kept for backward compatibility. All computations happen
            on the device of the input tensors.
    Returns:
        torch.Tensor: Whatever `criterion` returns for (predicted, expected) Q-values
    """
    # Q-values of the DQN network for the actions that were taken
    qvals = model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

    # The bootstrap target is a constant for the optimisation, so nothing in this
    # section needs an autograd graph.
    with torch.no_grad():
        # Best action for the next state according to the DQN network
        actions_next = model(next_states).argmax(dim=1, keepdim=True)
        # Value of that action according to the target network. A single gather
        # replaces a per-sample Python loop and keeps the data on its device.
        qvals_next = target_model(next_states).gather(1, actions_next).squeeze(-1)
        # Terminal transitions have no future value
        qvals_next[dones] = 0

        # Calculate the Bellman equation
        expected_qvals = rewards + gamma * qvals_next

    # Calculate the loss
    return criterion(qvals, expected_qvals)

### PrioritizedExperienceReplay

In [None]:
def compute_td_error(model, target_model, states, actions, rewards, dones, next_states, gamma=0.99, device="cpu"):
    """
    Absolute one-step TD-error of a single (unbatched) transition.

    Args:
        model (nn.Module): DQN model
        target_model (nn.Module): Target DQN model
        states: State of the transition (a batch dimension is added internally)
        actions: Index of the action that was taken
        rewards: Reward that was received
        dones: Whether the transition ended the episode
        next_states: State reached after the action
        gamma (float): Discount factor
        device (str): Device to use
    Returns:
        float: |Q(s, a) - (r + gamma * max_a' Q_target(s', a'))|, where the bootstrap
        term is zero when the transition is terminal
    """
    # Turn every component of the transition into a tensor on the requested device
    state, action, reward, done, next_state = (
        torch.tensor(item).to(device)
        for item in (states, actions, rewards, dones, next_states)
    )

    # Priorities are plain numbers, so no gradients are needed
    with torch.no_grad():
        # Q-value the DQN network assigns to the action that was taken
        taken_index = action.unsqueeze(0).unsqueeze(-1)
        q_taken = model(state.unsqueeze(0)).gather(1, taken_index).squeeze(-1)

        # Greedy value of the next state according to the target network,
        # forced to zero when the episode finished with this transition
        next_value = target_model(next_state.unsqueeze(0)).max(1)[0]
        next_value[done] = 0.0

        # One-step bootstrap target
        td_target = reward + gamma * next_value

    return torch.abs(q_taken - td_target).item()

In [None]:
# A single transition as kept in the replay buffer: (state, action, reward, done, new_state)
Experience = collections.namedtuple('Experience', ['state', 'action', 'reward', 'done', 'new_state'])


class PrioritizedExperienceReplay:
    """
    Implementation of the Prioritized Experience Replay.

    Experiences and priorities live in two deques of equal capacity that are always
    appended together, so position i of one matches position i of the other and both
    drop their oldest entry at the same time.

    NOTE: the priority of an experience is its absolute TD-error at insertion time.
    Nothing in this class refreshes it afterwards.
    """
    def __init__(self, capacity, gamma=0.99):
        """
        Args:
            capacity (int): Maximum number of experiences kept in the buffer
            gamma (float): Discount factor used when computing the TD-error priorities.
                Pass the same value that is used for the training loss; the default
                matches the default of compute_td_error.
        """
        # Buffer to store the experiences and priorities
        self.buffer = collections.deque(maxlen=capacity)
        self.priorities = collections.deque(maxlen=capacity)
        self.gamma = gamma

    def __len__(self):
        return len(self.buffer)

    def append(self, experience, model, target_model, device="cpu"):
        """
        Function to append the experience to the buffer, and calculate the TD-error.
        Args:
            experience (tuple): Experience tuple
            model (nn.Module): DQN model
            target_model (nn.Module): Target DQN model
            device (str): Device to use
        """
        # Compute the TD-error with the discount factor configured for this buffer
        td_error = compute_td_error(
            model,
            target_model,
            experience.state,
            experience.action,
            experience.reward,
            experience.done,
            experience.new_state,
            gamma=self.gamma,
            device=device,
        )
        # Append the experience and priority
        self.priorities.append(td_error)
        self.buffer.append(experience)

    def sample(self, BATCH_SIZE, alpha=0.6, beta=0.4, epsilon=0.01):
        """
        Function to sample the experiences from the buffer, based on the priorities.
        Args:
            BATCH_SIZE (int): Batch size
            alpha (float): Alpha value for the priorities
            beta (float): Beta value for the importance sampling
            epsilon (float): Epsilon value to avoid zero probabilities
        Returns:
            tuple: (states, actions, rewards, dones, next_states, weights) as CPU tensors
            with dtypes float32, int64, float32, bool, float32 and float32. `weights`
            holds the importance-sampling weights, normalised to lie in (0, 1].
        """
        # Calculate the probabilities: P(i) = (|td_i| + epsilon)^alpha / sum_k (...)
        priorities = np.array(self.priorities) + epsilon
        probabilities = priorities ** alpha
        probabilities = probabilities / probabilities.sum()

        # Sample the experiences (with replacement) using the probabilities
        buffer_size = len(self.buffer)
        indices = np.random.choice(buffer_size, BATCH_SIZE, p=probabilities)

        # Importance-sampling weights w_i = (N * P(i))^-beta, divided by the largest
        # weight any stored experience could get (the one with the smallest
        # probability). The weights therefore only scale the update downwards.
        weights = (buffer_size * probabilities[indices]) ** (-beta)
        max_weight = (buffer_size * probabilities.min()) ** (-beta)
        weights = weights / max_weight

        # Obtaining the batch of experiences
        states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])

        # Converting to tensors
        states = torch.from_numpy(np.array(states, dtype=np.float32))
        actions = torch.from_numpy(np.array(actions, dtype=np.int64))
        rewards = torch.from_numpy(np.array(rewards, dtype=np.float32))
        dones = torch.from_numpy(np.array(dones, dtype=bool))
        next_states = torch.from_numpy(np.array(next_states, dtype=np.float32))
        weights = torch.from_numpy(np.array(weights, dtype=np.float32))

        return states, actions, rewards, dones, next_states, weights

In [None]:
class Agent:
    """
    Agent class to interact with the environment.

    It keeps the current observation and the reward accumulated in the running
    episode, and stores every transition in the experience replay buffer.
    """
    def __init__(self, env, exp_replay_buffer):
        """
        Args:
            env (gym.Env): Environment
            exp_replay_buffer (PrioritizedExperienceReplay): Experience Replay Buffer
        """
        # Storing the environment and the experience replay buffer
        self.env = env
        self.exp_replay_buffer = exp_replay_buffer

        # Reset the environment and the total reward
        self._reset()

    def _reset(self):
        """
        Start a new episode: reset the environment and the accumulated reward.
        """
        self.current_state = self.env.reset()[0]
        self.total_reward = 0.0

    def step(self, net, target_net, epsilon=0.0, device="cpu"):
        """
        Take one epsilon-greedy step in the environment and store the transition.
        Args:
            net (nn.Module): DQN model
            target_net (nn.Module): Target DQN model
            epsilon (float): Epsilon value
            device (str): Device to use
        Returns:
            float or None: Total reward of the episode when this step finished it,
            otherwise None
        """
        # Epsilon-greedy policy
        done_reward = None
        if np.random.random() < epsilon:
            # Sample from the agent's own environment rather than from a global `env`
            action = self.env.action_space.sample()
        else:
            state_ = np.array([self.current_state])
            state = torch.tensor(state_).to(device)
            # Action selection is pure inference; no autograd graph is needed
            with torch.no_grad():
                q_vals = net(state)
            _, act_ = torch.max(q_vals, dim=1)
            action = int(act_.item())

        new_state, reward, terminated, truncated, _ = self.env.step(action)
        is_done = terminated or truncated
        self.total_reward += reward

        # Store the transition; the buffer uses both networks to compute its priority
        exp = Experience(self.current_state, action, reward, is_done, new_state)
        self.exp_replay_buffer.append(exp, net, target_net, device)
        self.current_state = new_state

        if is_done:
            # Report the episode return and start a new episode
            done_reward = self.total_reward
            self._reset()

        return done_reward

#### Hyperparameters

In [None]:
# Optimized parameters
# Discount factor passed to compute_loss.
# NOTE: this value is not forwarded to the replay buffer, so the TD-error priorities
# are computed with a discount of 0.99.
GAMMA = 0.90
# Learning rate of the Adam optimizer
LEARNING_RATE = 0.0001

# Fixed parameters over the different experiments
# Number of experiences sampled from the buffer for every update
BATCH_SIZE = 64
# Training stops once the mean episode reward exceeds this value
MEAN_REWARD_BOUND = 390.0
# Number of most recent episodes that enter that mean
NUMBER_OF_REWARDS_TO_AVERAGE = 10

# Capacity of the replay buffer; no update is performed until it is full
EXPERIENCE_REPLAY_SIZE = 10000
# Number of frames between two copies of the online weights into the target network
SYNC_TARGET_NETWORK = 1000
# Training is stopped after this many frames
MAX_FRAMES = 1500000

# Found after Experimentation
# Initial epsilon of the epsilon-greedy policy
EPS_START = 1.0
# Factor epsilon is multiplied by on every frame
EPS_DECAY = 0.999987
# Lower bound of epsilon
EPS_MIN = 0.03

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Log in to Weights & Biases before a run is started
wandb.login()



True

In [None]:
# Start the Weights & Biases run of this experiment
wandb.init(project="Frogger", name="Gamma_90_lr_0001_v2")

# Store the hyperparameters of the run in its config
wandb.config.update(
    {
        "gamma": GAMMA,
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "experience_replay_size": EXPERIENCE_REPLAY_SIZE,
        "sync_target_network": SYNC_TARGET_NETWORK,
        "eps_start": EPS_START,
        "eps_decay": EPS_DECAY,
        "eps_min": EPS_MIN,
    }
)

0,1
epsilon,█▇▆▅▄▄▃▂▁
reward,▃▃▆▃▆▃▃▁█
reward_100,▁▁▇▅█▇▆▃▇

0,1
epsilon,0.98602
reward,9.0
reward_100,7.33333


In [None]:
env = make_env(ENV_NAME)

# File the weights of the online network are saved to
CHECKPOINT_PATH = "model_gamma_90_lr_0001_v2.pth"

# Initialize the networks
net = DuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
target_net = DuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
# Start the target network as an exact copy of the online network. Otherwise the
# TD-errors computed while the buffer fills up, and the first update, would be
# measured against an unrelated, randomly initialized network.
target_net.load_state_dict(net.state_dict())

# Initialize the agent and the experience replay buffer
buffer = PrioritizedExperienceReplay(EXPERIENCE_REPLAY_SIZE)
agent = Agent(env, buffer)
epsilon = EPS_START

# Initialize the optimizer and the criterion
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
# Per-sample losses are required so they can be scaled by the importance-sampling weights
criterion = nn.MSELoss(reduction="none")

# Initialize the total rewards
total_rewards = []
frame_number = 0

tbar = tqdm()
while True:
    frame_number += 1
    # Update the epsilon
    epsilon = max(epsilon * EPS_DECAY, EPS_MIN)

    # Take a step in the environment
    reward = agent.step(net, target_net, epsilon, device=device)

    # If the game has finished
    if reward is not None:
        # Append the total reward
        total_rewards.append(reward)

        # Update the progress bar and log the metrics
        # ("reward_100" is the mean over the last NUMBER_OF_REWARDS_TO_AVERAGE episodes)
        mean_reward = np.mean(total_rewards[-NUMBER_OF_REWARDS_TO_AVERAGE:])
        tbar.set_description(f"Frame:{frame_number} | Total games:{len(total_rewards)} | Mean reward: {mean_reward:.3f}  (epsilon used: {epsilon:.2f})")
        wandb.log({"epsilon": epsilon, "reward_100": mean_reward, "reward": reward, "episode": len(total_rewards)}, step=frame_number)

        # If the environment is solved finish the training
        if mean_reward > MEAN_REWARD_BOUND:
            torch.save(net.state_dict(), CHECKPOINT_PATH)
            print(f"SOLVED in {frame_number} frames and {len(total_rewards)} games")
            break

    # If the buffer is not full we do not train
    if len(buffer) < EXPERIENCE_REPLAY_SIZE:
        continue

    # Save the model every 5000 frames
    if frame_number % 5000 == 0:
        torch.save(net.state_dict(), CHECKPOINT_PATH)

    # If the number of frames is greater than the maximum number of frames, we finish the training
    if frame_number > MAX_FRAMES:
        torch.save(net.state_dict(), CHECKPOINT_PATH)
        break


    # Sample the experiences from the buffer
    states, actions, rewards, dones, next_states, weights = buffer.sample(BATCH_SIZE)
    states, actions, rewards, dones, next_states, weights = states.to(device), actions.to(device), rewards.to(device), dones.to(device), next_states.to(device), weights.to(device)

    # Compute the per-sample loss, weight it with the importance-sampling weights and optimize the network
    loss = compute_loss(net, target_net, states, actions, rewards, dones, next_states, gamma=GAMMA, criterion=criterion, device=device)
    loss = (loss * weights).mean()
    wandb.log({"loss": loss.item()}, step=frame_number)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Update the target network every SYNC_TARGET_NETWORK frames
    if frame_number % SYNC_TARGET_NETWORK == 0:
        target_net.load_state_dict(net.state_dict())

In [None]:
# End the Weights & Biases run that was started with wandb.init
wandb.finish()