In [1]:
# Install the ALE package
!pip install ale-py
!pip install gymnasium==1.0.0
!pip install stable_baselines3

Collecting ale-py
  Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.6 kB)
Downloading ale_py-0.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ale-py
Successfully installed ale-py-0.10.1
Collecting gymnasium==1.0.0
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium==1.0.0)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notificatio

In [7]:
import gymnasium as gym
from gymnasium.wrappers import MaxAndSkipObservation, ResizeObservation, GrayscaleObservation, FrameStackObservation, ReshapeObservation
from stable_baselines3.common.atari_wrappers import NoopResetEnv, MaxAndSkipEnv, FireResetEnv, EpisodicLifeEnv
import ale_py
import cv2
from gymnasium.spaces import Box
from torch import nn
import torch
import torch.optim as optim
import numpy as np
import collections
from tqdm import tqdm
import wandb

  and should_run_async(code)


In [3]:
ENV_NAME = "ALE/Frogger-v5"

  and should_run_async(code)


### Enviroment preparation

In [5]:
class ScaledFloatFrame(gym.ObservationWrapper):
    def observation(self, obs):
        return np.array(obs).astype(np.float32) / 255.0


def make_env(env_name):
    env = gym.make(env_name, obs_type="grayscale")
    # env = FireResetEnv(env)
    print("Standard Env.        : {}".format(env.observation_space.shape))
    env = MaxAndSkipObservation(env, skip=4)
    print("MaxAndSkipObservation: {}".format(env.observation_space.shape))
    #env = FireResetEnv(env)
    env = ResizeObservation(env, (84, 84))
    print("ResizeObservation    : {}".format(env.observation_space.shape))
    env = FrameStackObservation(env, stack_size=4)
    print("FrameStackObservation: {}".format(env.observation_space.shape))
    env = ScaledFloatFrame(env)
    print("ScaledFloatFrame     : {}".format(env.observation_space.shape))

    return env


env = make_env(ENV_NAME)
print("\nAction space is {} ".format(env.action_space))
print("Observation space is {} ".format(env.observation_space))

Standard Env.        : (210, 160)
MaxAndSkipObservation: (210, 160)
ResizeObservation    : (84, 84)
FrameStackObservation: (4, 84, 84)
ScaledFloatFrame     : (4, 84, 84)

Action space is Discrete(5) 
Observation space is Box(0, 255, (4, 84, 84), uint8) 


In [None]:
class FrameStackObservation(gym.Wrapper):
    def __init__(self, env, stack_size):
        """Constructor"""
        gym.Wrapper.__init__(self, env)
        self.stack_size = stack_size
        self.frames = deque([], maxlen=stack_size)
        shp = env.observation_space.shape
        self.observation_space = Box(0.0, 1.0, (stack_size, shp[0], shp[1]), dtype=np.float32)

    def reset(self, **kwargs):
        """Reset the environment"""
        observation, info = self.env.reset()
        for _ in range(self.stack_size):
            self.frames.append(observation)
        return self._get_observation(), info

    def step(self, action):
        """Take a step"""
        observation, reward, done, truncated, info = self.env.step(action)
        self.frames.append(observation)
        return self._get_observation(), reward, done, truncated, info

    def _get_observation(self):
        """Get the observation"""
        frames_list = [frame for frame in self.frames]
        return np.stack(frames_list, axis=0)

class ScaleAndReshapeObservation(gym.ObservationWrapper):
    def __init__(self, env):
        """Constructor"""
        gym.ObservationWrapper.__init__(self, env)
        self.observation_space = Box(0.0, 1.0, env.observation_space.shape, dtype=np.float32)

    def observation(self, observation):
        """Observation"""
        if type(observation) == tuple:
            observation, info = observation

            observation = cv2.resize(observation, (84, 84))
            return (np.array(observation).astype(np.float32) / 255.0, info)
        else:
            observation = cv2.resize(observation, (84, 84))
            return np.array(observation).astype(np.float32) / 255.0


def preprocess_env(env_name):

    env = gym.make(env_name, obs_type="grayscale", render_mode='rgb_array')
    env = NoopResetEnv(env, noop_max=4)
    env = MaxAndSkipEnv(env, skip=2)
    env = FireResetEnv(env)
    env = EpisodicLifeEnv(env)
    env = ScaleAndReshapeObservation(env)
    env = FrameStackObservation(env, stack_size=4)

    return env

In [None]:
env = preprocess_env(ENV_NAME)
print("\nAction space is {} ".format(env.action_space))
print("Observation space is {} ".format(env.observation_space))

### DQN Aproach

In [8]:
class DuelingDQN(nn.Module):
    def __init__(self, input_shape, output_shape):
        super(DuelingDQN, self).__init__()

        self.net = nn.Sequential(
            nn.Conv2d(input_shape[0], 32, kernel_size=5, stride=4),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv2d(64, 64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(1600, 512),
            nn.ReLU()
        )

        self.value_prediction = nn.Linear(512, 1)
        self.advantage_prediction = nn.Linear(512, output_shape)


    def forward(self, x):
        embedding = self.net(x)
        value = self.value_prediction(embedding)
        advantage = self.advantage_prediction(embedding)
        q_values = value + advantage - advantage.mean(dim=-1).unsqueeze(-1)
        return q_values

In [9]:
Experience = collections.namedtuple('Experience', field_names=['state', 'action', 'reward', 'done', 'new_state'])

class Agent:
    def __init__(self, env, exp_replay_buffer):
        self.env = env
        self.exp_replay_buffer = exp_replay_buffer
        self._reset()

    def _reset(self):
        self.current_state = self.env.reset()[0]
        self.total_reward = 0.0

    def step(self, net, target_net, epsilon=0.0, device="cpu"):
        done_reward = None
        if np.random.random() < epsilon:
            action = env.action_space.sample()
        else:
            state_ = np.array([self.current_state])
            state = torch.tensor(state_).to(device)
            q_vals = net(state)
            _, act_ = torch.max(q_vals, dim=1)
            action = int(act_.item())

        new_state, reward, terminated, truncated, _ = self.env.step(action)
        is_done = terminated or truncated
        self.total_reward += reward

        exp = Experience(self.current_state, action, reward, is_done, new_state)
        self.exp_replay_buffer.append(exp, net, target_net, device)
        self.current_state = new_state

        if is_done:
            done_reward = self.total_reward
            self._reset()

        return done_reward

In [11]:
# def compute_loss(model, target_model, states, actions, rewards, dones, next_states, gamma=0.99, criterion=nn.MSELoss()):
#     Q_values = model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

#     next_state_values = target_model(next_states).max(1)[0]
#     next_state_values[dones] = 0.0
#     next_state_values = next_state_values.detach()

#     expected_Q_values = next_state_values * gamma + rewards

#     return criterion(Q_values, expected_Q_values)

def compute_loss(model, target_model, states, actions, rewards, dones, next_states, gamma=0.99, criterion=nn.MSELoss(), device="cpu"):
    # Get the qvals of the DQN network
    qvals = model(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

    # Get the best action for the next state given the DQN network
    actions_next = torch.argmax(model(next_states), dim=1)
    # Get the qvals of the target network
    with torch.no_grad():
        qvals_next = target_model(next_states)

    # Get the qvals of the target network for the best action considering the dqn network
    qvals_next = torch.tensor(np.array([qvals_next[i][actions_next[i]].cpu() for i in range(len(actions_next))])).to(device)

    qvals_next[dones] = 0

    # Calculate the Bellman equation
    expected_qvals = rewards + gamma * qvals_next

    # Calculate the loss
    return criterion(qvals, expected_qvals)

In [12]:
def compute_td_error(model, target_model, states, actions, rewards, dones, next_states, gamma=0.99, device="cpu"):
    states = torch.tensor(states).to(device)
    actions = torch.tensor(actions).to(device)
    rewards = torch.tensor(rewards).to(device)
    dones = torch.tensor(dones).to(device)
    next_states = torch.tensor(next_states).to(device)

    with torch.no_grad():
        Q_values = model(states.unsqueeze(0)).gather(1, actions.unsqueeze(0).unsqueeze(-1)).squeeze(-1)

        next_state_values = target_model(next_states.unsqueeze(0)).max(1)[0]
        next_state_values[dones] = 0.0
        next_state_values = next_state_values.detach()

        expected_Q_values = next_state_values * gamma + rewards

    return (Q_values - expected_Q_values).abs().detach().item()

In [13]:
class PrioritizedExperienceReplay:
    """
    D'aquest metode no se si es correcte del tot, en comptes de calcular el
    td_error quan faig el sampling ho he implementat en el moment en que safegeix al buffer.

    D'aquesta forma no augmenta molt tant al numero de claculs extra (Sino shauria de clacular
    per a tot el buffer cada vegada que fem sampling)
    """
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)
        self.priorities = collections.deque(maxlen=capacity)

    def __len__(self):
        return len(self.buffer)

    def append(self, experience, model, target_model, device="cpu"):
        td_error = compute_td_error(model, target_model, experience.state, experience.action, experience.reward, experience.done, experience.new_state, device=device)
        self.priorities.append(td_error)
        self.buffer.append(experience)

    def sample(self, BATCH_SIZE, alpha=0.6, beta=0.4, epsilon=0.01):
        priorities = np.array(self.priorities)
        priorities = priorities + epsilon
        probabilities = priorities ** alpha
        probabilities = probabilities / probabilities.sum()

        indices = np.random.choice(len(self.buffer), BATCH_SIZE, p=probabilities)
        weights = (1/len(self.buffer) * 1/probabilities[indices]) ** beta
        states, actions, rewards, dones, next_states = zip(*[self.buffer[idx] for idx in indices])

        states = torch.from_numpy(np.array(states, dtype=np.float32))
        actions = torch.from_numpy(np.array(actions, dtype=np.int64))
        rewards = torch.from_numpy(np.array(rewards, dtype=np.float32))
        dones = torch.from_numpy(np.array(dones, dtype=bool))
        next_states = torch.from_numpy(np.array(next_states, dtype=np.float32))
        weights = torch.from_numpy(np.array(weights, dtype=np.float32))

        return states, actions, rewards, dones, next_states, weights

#### Hyperparameters

In [25]:
MEAN_REWARD_BOUND = 350.0
NUMBER_OF_REWARDS_TO_AVERAGE = 10

GAMMA = 0.99

BATCH_SIZE = 64
LEARNING_RATE = 0.0001

EXPERIENCE_REPLAY_SIZE = 10000
SYNC_TARGET_NETWORK = 1000

EPS_START = 1.0
EPS_DECAY = 0.999987
EPS_MIN = 0.03

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
wandb.login()



True

In [27]:
wandb.init(project="Frogger", name="DQN")
wandb.config.update({"gamma": GAMMA, "batch_size": BATCH_SIZE, "learning_rate": LEARNING_RATE, "experience_replay_size": EXPERIENCE_REPLAY_SIZE, "sync_target_network": SYNC_TARGET_NETWORK, "eps_start": EPS_START, "eps_decay": EPS_DECAY, "eps_min": EPS_MIN})

VBox(children=(Label(value='0.307 MB of 0.307 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epsilon,█████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁
reward,▃▅▆▄▄▆▃▇▃▃▃█▂▃▆▂▂▃▂▂▃▂▂▄▂▃▁▂▃▆▂▁▅▂▃▆▁▄▂▂
reward_100,▇█▇▇▆▆▆▆▆▆▆▅▄▄▅▃▃▃▂▂▁▄▄▆▆▅▄▅▅▄▃▄▅▄▄▁▃▅▅▄

0,1
epsilon,0.87929
reward,6.0
reward_100,7.5


In [None]:
env = make_env(ENV_NAME)

net = DuelingDQN(env.observation_space.shape, env.action_space.n).to(device)
target_net = DuelingDQN(env.observation_space.shape, env.action_space.n).to(device)

buffer = PrioritizedExperienceReplay(EXPERIENCE_REPLAY_SIZE)
agent = Agent(env, buffer)

epsilon = EPS_START
optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)
criterion = nn.MSELoss(reduction="none")
total_rewards = []
frame_number = 0

tbar = tqdm()
while True:
    frame_number += 1
    epsilon = max(epsilon * EPS_DECAY, EPS_MIN)

    reward = agent.step(net, target_net, epsilon, device=device)
    if reward is not None:
        total_rewards.append(reward)

        mean_reward = np.mean(total_rewards[-NUMBER_OF_REWARDS_TO_AVERAGE:])
        tbar.set_description(f"Frame:{frame_number} | Total games:{len(total_rewards)} | Mean reward: {mean_reward:.3f}  (epsilon used: {epsilon:.2f})")
        wandb.log({"epsilon": epsilon, "reward_100": mean_reward, "reward": reward}, step=frame_number)

        if mean_reward > MEAN_REWARD_BOUND:
            print(f"SOLVED in {frame_number} frames and {len(total_rewards)} games")
            break

    if len(buffer) < EXPERIENCE_REPLAY_SIZE:
        continue

    states, actions, rewards, dones, next_states, weights = buffer.sample(BATCH_SIZE)
    states, actions, rewards, dones, next_states, weights = states.to(device), actions.to(device), rewards.to(device), dones.to(device), next_states.to(device), weights.to(device)

    loss = compute_loss(net, target_net, states, actions, rewards, dones, next_states, gamma=GAMMA, criterion=criterion, device=device)
    loss = (loss * weights).mean()
    wandb.log({"loss": loss.item()}, step=frame_number)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if frame_number % SYNC_TARGET_NETWORK == 0:
        target_net.load_state_dict(net.state_dict())

Standard Env.        : (210, 160)
MaxAndSkipObservation: (210, 160)
ResizeObservation    : (84, 84)
FrameStackObservation: (4, 84, 84)
ScaledFloatFrame     : (4, 84, 84)


  lambda data: self._console_raw_callback("stderr", data),
Frame:406 | Total games:4 | Mean reward: 7.500  (epsilon used: 0.99): : 0it [00:14, ?it/s]
Frame:10861 | Total games:94 | Mean reward: 8.300  (epsilon used: 0.87): : 0it [01:07, ?it/s]