In [19]:
pip install "gymnasium[atari,accept-rom-license]==1.0.0" "ale-py==0.9.1"








In [9]:
import sys, os
import gymnasium as gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torchvision
import numpy as np
import random
from gymnasium.spaces import Box
from collections import deque
import copy
from gymnasium.wrappers import FrameStackObservation
import ale_py

%matplotlib inline


# -------------------------------
# EXTRA ADDITIONS BLOCK
# -------------------------------

# Add TimeLimit wrapper (EXTRA)
from gymnasium.wrappers import TimeLimit

# Add hyperparameter search tools (EXTRA)
from itertools import product

# Save figures at print quality (EXTRA)
plt.rcParams.update({"savefig.dpi": 300})

# Make the ALE/* environment ids known to gymnasium (EXTRA)
gym.register_envs(ale_py)

# Seed every random number generator used in this notebook (EXTRA)
seed = 42
for seed_fn in (np.random.seed, torch.manual_seed, random.seed):
    seed_fn(seed)

# Prefer the GPU when one is visible, otherwise fall back to the CPU (EXTRA)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [13]:
class SkipFrame(gym.Wrapper):
    """Repeat each action for up to ``num_skip`` frames and sum the rewards.

    Args:
        env: The environment to wrap.
        num_skip: Number of consecutive env steps taken per agent action.
            Must be at least 1.

    Raises:
        ValueError: If ``num_skip`` is smaller than 1.
    """

    def __init__(self, env, num_skip):
        # With zero iterations ``step`` would have no observation to return,
        # so reject the configuration up front instead of failing mid-episode.
        if num_skip < 1:
            raise ValueError(f"num_skip must be >= 1, got {num_skip}")
        super().__init__(env)
        self.num_skip = num_skip

    def step(self, action):
        """Apply ``action`` repeatedly and return the last transition.

        Returns:
            The observation, termination flags and info of the last executed
            frame, together with the reward summed over all executed frames.
        """
        total_reward = 0.0
        for _ in range(self.num_skip):
            obs, reward, terminated, truncated, info = self.env.step(action)
            total_reward += reward
            # Stop early: stepping a finished episode is not allowed.
            if terminated or truncated:
                break

        return obs, total_reward, terminated, truncated, info

#ORIGINAL
#class GrayScaleObservation(gym.ObservationWrapper):
 #   def __init__(self, env):
  #      super().__init__(env)
   #     obs_shape = self.observation_space.shape[:2]
    #    self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.float32)

#    def observation(self, observation):
 #       observation = np.transpose(observation, (2, 0, 1))
  #      observation = torch.tensor(observation.copy(), dtype=torch.float)
   #     transform = torchvision.transforms.Grayscale()
    #    observation = transform(observation)
     #   return observation
#ENHANCED PREPROCESSING
class GrayScaleObservation(gym.ObservationWrapper):
    """Optionally convert image observations to grayscale torch tensors.

    Args:
        env: The environment to wrap.
        apply_grayscale: When falsy the wrapper passes observations through
            unchanged and leaves the observation space untouched.
    """

    def __init__(self, env, apply_grayscale=True):  # EXTRA
        super().__init__(env)
        self.apply_grayscale = apply_grayscale  # EXTRA
        # Built once here instead of on every observation.
        self._to_grayscale = torchvision.transforms.Grayscale()
        obs_shape = self.observation_space.shape[:2]
        if self.apply_grayscale:  # EXTRA
            # Declared without a channel axis; the channel axis produced by the
            # transform is expected to be squeezed by a later wrapper.
            self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.float32)

    def observation(self, observation):
        """Return the grayscale version of ``observation`` if enabled.

        The input is assumed to be a channel-last (H, W, C) array; it is moved
        to channel-first layout and converted to a float tensor before the
        grayscale transform is applied.
        """
        if self.apply_grayscale:  # EXTRA
            observation = np.transpose(observation, (2, 0, 1))
            observation = torch.tensor(observation.copy(), dtype=torch.float)
            observation = self._to_grayscale(observation)
        return observation
# ORIGINAL (kept for reference)
#class ResizeObservation(gym.ObservationWrapper):
 #   def __init__(self, env, shape):
  #      super().__init__(env)
   #     self.shape = (shape, shape) if isinstance(shape, int) else tuple(shape)
    #    obs_shape = self.shape + self.observation_space.shape[2:]
     #   self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.float32)

    #def observation(self, observation):
     #   transforms = torchvision.transforms.Compose([torchvision.transforms.Resize(self.shape),
         #torchvision.transforms.Normalize(0, 255)])
      #  return transforms(observation).squeeze(0)

#ENHANCED PREPROCESSING
class ResizeObservation(gym.ObservationWrapper):
    """Resize observations and optionally scale pixel values by 1/255.

    Args:
        env: The environment to wrap.
        shape: Target size; an int ``s`` means ``(s, s)``.
        normalize: When truthy, ``Normalize(0, 255)`` is appended to the
            pipeline, i.e. values are divided by 255.

    The transform pipeline is assembled once at construction time, so changing
    ``self.shape`` or ``self.normalize`` afterwards has no effect on it.
    """

    def __init__(self, env, shape, normalize=True):  # EXTRA
        super().__init__(env)
        self.shape = (shape, shape) if isinstance(shape, int) else tuple(shape)
        self.normalize = normalize  # EXTRA
        obs_shape = self.shape + self.observation_space.shape[2:]
        self.observation_space = Box(low=0, high=255, shape=obs_shape, dtype=np.float32)

        # Build the pipeline once instead of on every environment step.
        steps = [torchvision.transforms.Resize(self.shape)]
        if self.normalize:  # EXTRA
            steps.append(torchvision.transforms.Normalize(0, 255))
        self._transform_pipeline = torchvision.transforms.Compose(steps)

    def observation(self, observation):
        """Resize (and optionally normalize) ``observation``.

        A leading axis of size 1 is removed from the result.
        """
        return self._transform_pipeline(observation).squeeze(0)
        
class ExperienceReplayMemory(object):
    """Bounded FIFO store of transitions with uniform random sampling.

    Once ``capacity`` transitions are stored, adding another one discards the
    oldest entry.
    """

    def __init__(self, capacity):
        self.memory = deque(maxlen=capacity)

    def __len__(self):
        """Number of transitions currently held."""
        return len(self.memory)

    def store(self, state, next_state, action, reward, terminated, truncated):
        """Append one transition; both states are stored via ``__array__()``."""
        transition = (
            state.__array__(),
            next_state.__array__(),
            action,
            reward,
            terminated,
            truncated,
        )
        self.memory.append(transition)

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and batch them as tensors.

        Returns:
            Tuple ``(states, next_states, actions, rewards, terminated,
            truncated)``; actions are int64, every other tensor is float32.
        """
        def as_float(values):
            return torch.tensor(values, dtype=torch.float32)

        batch = random.sample(self.memory, batch_size)
        states, next_states, actions, rewards, terminated, truncated = zip(*batch)

        state_batch = as_float(np.array(states))
        next_state_batch = as_float(np.array(next_states))
        action_batch = torch.tensor(actions, dtype=torch.int64)
        return (
            state_batch,
            next_state_batch,
            action_batch,
            as_float(rewards),
            as_float(terminated),
            as_float(truncated),
        )
# EXTRA: Flexible Preprocessing Pipeline Wrapper
class FlexiblePreprocessingWrapper(gym.Wrapper):
    """
    A flexible wrapper to combine grayscale, resizing, and normalization.
    """
    def __init__(self, env, config):
        """
        Args:
            env: The environment to wrap.
            config: Dictionary with preprocessing options:
                - 'grayscale': Convert to grayscale (defaults to True when absent).
                - 'resize_shape': Resize to this shape (tuple or int); no
                  resizing is applied when the key is absent.
                - 'normalize': Normalize pixel values (defaults to True when absent).
        """
        # Read the flag once so a config without a 'grayscale' key uses the
        # default instead of raising KeyError on a second lookup.
        use_grayscale = config.get("grayscale", True)
        if use_grayscale:  # Apply grayscale if enabled
            env = GrayScaleObservation(env, apply_grayscale=use_grayscale)
        if "resize_shape" in config:  # Resize if specified
            env = ResizeObservation(env, shape=config["resize_shape"],
                                    normalize=config.get("normalize", True))
        # Initialise the base wrapper with the fully assembled chain.
        super().__init__(env)
        self.config = config

    def reset(self, **kwargs):
        """Reset the wrapped preprocessing chain."""
        return self.env.reset(**kwargs)

    def step(self, action):
        """Step the wrapped preprocessing chain."""
        return self.env.step(action)

In [7]:
#ORIGINAL

#seed = 957
#np.random.seed(seed)
#torch.manual_seed(seed)
#if torch.backends.cudnn.enabled:
#    torch.backends.cudnn.benchmark = False
#    torch.backends.cudnn.deterministic = True

#gym.register_envs(ale_py)

#env_rendering = False    # Set to False while training your model on Colab

# Create and preprocess the Atari Breakout environment
#if env_rendering:
#    env = gym.make("ALE/Breakout-v5", full_action_space=False, render_mode="human")
#else:
#    env = gym.make("ALE/Breakout-v5", full_action_space=False)

#env = SkipFrame(env, num_skip=4)
#env = GrayScaleObservation(env)
#env = ResizeObservation(env, shape=84)
#env = FrameStackObservation(env, stack_size=4)

#image_stack, h, w = env.observation_space.shape
#num_actions = env.action_space.n
#print(f'Number of stacked frames: {image_stack}')
#print(f'Resized observation space dimensionality: {h}, {w}')
#print(f'Number of available actions by the agent: {num_actions}')

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#print(f'Using device: {device}')

Number of stacked frames: 4
Resized observation space dimensionality: 84, 84
Number of available actions by the agent: 4
Using device: cpu


In [14]:
# Enhanced preprocessing
# Seed Initialization for Reproducibility
seed = 957
# The epsilon-greedy policy and the replay-buffer sampling both use Python's
# `random` module, so it has to be reseeded together with numpy and torch.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.backends.cudnn.enabled:
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Register Atari Environments
gym.register_envs(ale_py)

env_rendering = False    # Set to False while training your model on Colab

# -------------------------------
# EXTRA: Configuration Setup
# -------------------------------
# Preprocessing and environment configuration
env_config = {
    "num_skip": 4,                # Number of frames to skip
    "grayscale": True,            # Convert frames to grayscale
    "resize_shape": 84,           # Resize frames to 84x84
    "normalize": True,            # Normalize pixel values
    "stack_size": 4,              # Number of frames to stack
}

# -------------------------------
# EXTRA: Create and Preprocess the Environment
# -------------------------------
# Only the render mode differs between the interactive and the training setup.
env = gym.make(
    "ALE/Breakout-v5",
    full_action_space=False,
    render_mode="human" if env_rendering else None,
)

# Apply preprocessing pipeline
env = SkipFrame(env, num_skip=env_config["num_skip"])
env = FlexiblePreprocessingWrapper(env, config=env_config)  # EXTRA
env = FrameStackObservation(env, stack_size=env_config["stack_size"])

# Environment Properties
image_stack, h, w = env.observation_space.shape
num_actions = env.action_space.n
print(f'Number of stacked frames: {image_stack}')
print(f'Resized observation space dimensionality: {h}, {w}')
print(f'Number of available actions by the agent: {num_actions}')

# Device Setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device: {device}')

Number of stacked frames: 4
Resized observation space dimensionality: 84, 84
Number of available actions by the agent: 4
Using device: cpu


In [15]:
class DeepQNet(torch.nn.Module):
    """Convolutional Q-network: three conv layers followed by two linear layers.

    Args:
        h: Height of the input frames.
        w: Width of the input frames.
        image_stack: Number of input channels (stacked frames).
        num_actions: Number of Q-values produced per input.

    Raises:
        ValueError: If ``h`` or ``w`` is too small for the convolution stack.
    """

    # (kernel_size, stride) of conv1, conv2 and conv3, in order.
    _CONV_LAYOUT = ((8, 4), (4, 2), (3, 1))

    def __init__(self, h, w, image_stack, num_actions):
        super(DeepQNet, self).__init__()
        # DONE: create a convolutional neural network
        self.conv1 = torch.nn.Conv2d(image_stack, 32, kernel_size=8, stride=4)
        self.conv2 = torch.nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = torch.nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # Size the first linear layer from the actual frame size rather than
        # assuming 84x84 input (which yields the familiar 64 * 7 * 7).
        out_h = self._conv_output_size(h)
        out_w = self._conv_output_size(w)
        if out_h < 1 or out_w < 1:
            raise ValueError(f"Input of size {h}x{w} is too small for the convolution stack")
        self.fc1 = torch.nn.Linear(64 * out_h * out_w, 512)
        self.fc2 = torch.nn.Linear(512, num_actions)

    @classmethod
    def _conv_output_size(cls, size):
        """Spatial size left after the three unpadded convolutions."""
        for kernel, stride in cls._CONV_LAYOUT:
            size = (size - kernel) // stride + 1
        return size

    def forward(self, x):
        """Return Q-values of shape (batch, num_actions) for a batch of frame stacks."""
        # DONE: forward pass from the neural network
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = torch.relu(self.conv3(x))
        x = torch.flatten(x, start_dim=1)  # Flatten everything but the batch axis
        x = torch.relu(self.fc1(x))
        return self.fc2(x)  # Output Q-values

# DONE: the online network is the one being trained; the target network starts
#       as an exact copy of it and is excluded from gradient computation.
online_dqn = DeepQNet(h, w, image_stack, num_actions)
target_dqn = copy.deepcopy(online_dqn)
# Freeze every parameter of the target network in one call
target_dqn.requires_grad_(False)

online_dqn.to(device)
target_dqn.to(device)

DeepQNet(
  (conv1): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
  (conv2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
  (conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=3136, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=4, bias=True)
)

In [16]:
def convert(x):
    """Copy an array-like exposing ``__array__`` into a new float32 tensor."""
    as_array = x.__array__()
    return torch.tensor(as_array).to(torch.float32)


class AtariAgent:
    """Epsilon-greedy DQN / Double-DQN agent.

    ``policy`` and ``run_episode`` read the notebook-level names ``env``,
    ``seed``, ``num_actions`` and ``convert``; those must be defined before an
    episode is run.

    Args:
        buffer: Replay memory providing ``store`` and ``sample``.
        eps: Initial exploration rate.
        eps_decay: Multiplicative decay applied to ``eps`` once per step after
            the burn-in phase.
        min_eps: ``eps`` stops decaying once it is no longer above this value.
        gamma: Discount factor.
        batch_size: Number of transitions sampled per optimisation step.
        online_dqn: Network being trained.
        target_dqn: Network used to compute bootstrap targets.
        run_as_ddqn: Use the Double-DQN target when truthy.
        optimizer: Optimizer stepping ``online_dqn``'s parameters.
        criterion: Loss applied to predicted and target Q-values.
        device: Device that batches are moved to.
        max_train_frames: Maximum number of agent steps per episode.
        burn_in_phase: Number of agent steps before optimisation starts.
        sync_target: Copy online weights into the target network whenever the
            step counter is a multiple of this value (after burn-in).
    """

    def __init__(self, buffer, eps, eps_decay, min_eps, gamma, batch_size,
                 online_dqn, target_dqn, run_as_ddqn,
                 optimizer, criterion, device,
                 max_train_frames, burn_in_phase, sync_target):

        self.buffer = buffer
        self.eps = eps
        self.eps_decay = eps_decay
        self.min_eps = min_eps
        self.gamma = gamma
        self.batch_size = batch_size

        self.online_dqn = online_dqn
        self.target_dqn = target_dqn
        self.run_as_ddqn = run_as_ddqn
        self.optimizer = optimizer
        self.criterion = criterion
        self.device = device
        self.max_train_frames = max_train_frames
        self.burn_in_phase = burn_in_phase
        self.sync_target = sync_target

        # Total number of agent steps taken across all episodes.
        self.current_step = 0

    def policy(self, state, is_training):
        """Pick an action epsilon-greedily (purely greedy when not training)."""
        # DONE: Implement an epsilon-greedy policy
        if is_training and random.random() < self.eps:
            return random.randint(0, num_actions - 1)  # Random action (exploration)

        # Only the greedy branch needs the state as a batched tensor.
        state = convert(state).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.online_dqn(state)
        return torch.argmax(q_values).item()  # Greedy action (exploitation)

    def compute_loss(self, state, action, reward, next_state, truncated, terminated):
        """Compute the DQN (or Double-DQN) loss for a batch of transitions.

        All arguments are batched tensors with a leading batch axis; ``action``
        holds integer indices, ``terminated`` holds 0/1 floats. Note the
        argument order: ``truncated`` comes before ``terminated``. Only
        ``terminated`` disables bootstrapping from the next state.
        """
        state, action, reward, next_state, truncated, terminated = [x.to(self.device) for x in
                                (state, action, reward, next_state, truncated, terminated)]

        # DONE: Compute the DQN (or DDQN) loss based on self.criterion
        q_values = self.online_dqn(state).gather(1, action.unsqueeze(1)).squeeze(1)

        with torch.no_grad():
            if self.run_as_ddqn:  # EXTRA: Double DQN
                # Double DQN: Use online DQN to select action and target DQN to evaluate Q-value
                best_next_actions = self.online_dqn(next_state).argmax(1)
                max_next_q_values = self.target_dqn(next_state).gather(1, best_next_actions.unsqueeze(1)).squeeze(1)
            else:
                # Standard DQN
                max_next_q_values = self.target_dqn(next_state).max(1)[0]

            target_q_values = reward + self.gamma * max_next_q_values * (1 - terminated)

        return self.criterion(q_values, target_q_values)

    def run_episode(self, is_training):
        """Run one episode and return its total reward and mean per-step loss.

        In training mode every transition is stored in the replay buffer and,
        after the burn-in phase, one optimisation step is taken per agent step.
        In evaluation mode the loss of each single transition is only measured.

        Returns:
            dict with keys ``reward`` (sum of rewards) and ``loss`` (summed
            loss divided by the number of steps taken).
        """
        episode_reward, episode_loss = 0, 0.
        steps_taken = 0
        state, _ = env.reset(seed=seed)

        for _ in range(self.max_train_frames):
            action = self.policy(state, is_training)
            self.current_step += 1
            next_state, reward, terminated, truncated, _ = env.step(action)
            steps_taken += 1

            episode_reward += reward

            if is_training:
                self.buffer.store(state, next_state, action, reward, terminated, truncated)

                if self.current_step > self.burn_in_phase:
                    state_batch, next_state_batch, action_batch, \
                        reward_batch, terminated_batch, truncated_batch = self.buffer.sample(self.batch_size)

                    if self.current_step % self.sync_target == 0:
                        # DONE: Periodically update your target_dqn at each sync_target frames
                        self.target_dqn.load_state_dict(self.online_dqn.state_dict())

                    # Keywords keep the two flags from being swapped:
                    # compute_loss takes truncated before terminated.
                    loss = self.compute_loss(state_batch, action_batch, reward_batch,
                                             next_state_batch,
                                             truncated=truncated_batch,
                                             terminated=terminated_batch)
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    episode_loss += loss.detach().item()
            else:
                with torch.no_grad():
                    # Build a batch of size one; compute_loss moves it to the device.
                    st = convert(state).unsqueeze(0)
                    next_st = convert(next_state).unsqueeze(0)
                    act = torch.tensor([action], dtype=torch.int64)
                    rew = torch.tensor([reward], dtype=torch.float32)
                    # Flags are floats because the target uses (1 - terminated).
                    term = torch.tensor([terminated], dtype=torch.float32)
                    trunc = torch.tensor([truncated], dtype=torch.float32)

                    episode_loss += self.compute_loss(st, act, rew, next_st,
                                                      truncated=trunc,
                                                      terminated=term).item()

            state = next_state

            if self.current_step > self.burn_in_phase and self.eps > self.min_eps:
                self.eps *= self.eps_decay

            if terminated or truncated:
                break

        # Average over the steps actually taken; max() guards an empty episode.
        return dict(reward=episode_reward, loss=episode_loss / max(steps_taken, 1))

    def save_checkpoint(self, train_metrics, save_filename):
        """Save step counter, metrics, epsilon and both networks' weights."""
        save_dict = {'curr_step': self.current_step,
                    'train_metrics': train_metrics,
                    'eps': self.eps,
                    'online_dqn': self.online_dqn.state_dict(),
                    'target_dqn': self.target_dqn.state_dict()}

        torch.save(save_dict, save_filename)

In [18]:
def update_metrics(metrics, episode):
    """Append each value of ``episode`` to the list stored under the same key in ``metrics``."""
    for name in episode:
        metrics[name].append(episode[name])


def print_metrics(it, metrics, is_training, window=100):
    """Print the mean reward and loss over the last ``window`` episodes."""
    recent_mean = {
        name: np.mean(metrics[name][-window:]) for name in ("reward", "loss")
    }
    reward_mean = recent_mean["reward"]
    loss_mean = recent_mean["loss"]
    if is_training:
        mode = "train"
    else:
        mode = "test"
    print(f"Episode {it:4d} | {mode:5s} | reward {reward_mean:5.5f} | loss {loss_mean:5.5f}")

In [None]:
# Hyperparameters (TODO: modify as needed)
#batch_size = 32
#alpha = 0.00025
#gamma = 0.95
#eps, eps_decay, min_eps = 1.0, 0.999, 0.05
#buffer = ExperienceReplayMemory(20_000)
#burn_in_phase = 20_000
#sync_target = 30_000
#max_train_frames = 10_000
#max_train_episodes = 100_000
#max_test_episodes = 1
#run_as_ddqn = False # Set the run_as_ddqn flag to True if you want to run the DDQN algorithm
#save_filename = './saved_model.pt'

# TODO: create the appropriate MSE criterion and Adam optimizer
#optimizer = ...
#criterion = ...

#testing_mode = False # Change to True if you want to load a saved model

#if testing_mode:
    # TODO: Load your saved online_dqn model for testing. 
    #       The target_dqn should be the same as the online_dqn (it isn't needed for testing).
 #   pass

#agent = AtariAgent(buffer=buffer, eps=eps, eps_decay=eps_decay, min_eps=min_eps, gamma=gamma, batch_size=batch_size,
 #                  online_dqn=online_dqn, target_dqn=target_dqn, run_as_ddqn=run_as_ddqn,
  #                 optimizer=optimizer, criterion=criterion, device=device, 
   #                max_train_frames=max_train_frames, burn_in_phase=burn_in_phase, sync_target=sync_target)

#if testing_mode:
 #   test_metrics = dict(reward=[], loss=[])
  #  for it in range(max_test_episodes):
   #     episode_metrics = agent.run_episode(is_training=False)
    #    update_metrics(test_metrics, episode_metrics)
     #   print_metrics(it + 1, test_metrics, is_training=False)
#else:
 #   train_metrics = dict(reward=[], loss=[])
  #  for it in range(max_train_episodes):
   #     episode_metrics = agent.run_episode(is_training=True)
    #    update_metrics(train_metrics, episode_metrics)
     #   if it % 50 == 0:
      #      print_metrics(it, train_metrics, is_training=True)
       #     agent.save_checkpoint(train_metrics, save_filename)

In [None]:
# -------------------------------
# Preprocessing Variations (EXTRA)
# -------------------------------
preprocessing_variations = [
    {"num_skip": 4, "resize_shape": 84, "grayscale": True},   # Default setup
    {"num_skip": 4, "resize_shape": 96, "grayscale": True},   # Larger frame size
    {"num_skip": 2, "resize_shape": 84, "grayscale": False},  # RGB input with fewer skips
]

# -------------------------------
# Hyperparameters and Configuration
# -------------------------------

# Base hyperparameters
base_hyperparams = {
    "batch_size": 32,
    "alpha": 0.00025,
    "gamma": 0.95,
    "eps": 1.0,
    "eps_decay": 0.999,
    "min_eps": 0.05,
    "buffer_size": 20_000,
    "burn_in_phase": 20_000,
    "sync_target": 30_000,
    "max_train_frames": 10_000,
    "max_train_episodes": 100_000,
    "max_test_episodes": 1,
    "run_as_ddqn": False,  # Toggle for Double DQN
    "save_filename": "./saved_model.pt",
}

# Refined Hyperparameter Variations (EXTRA)
hyperparameter_variations = {
    "batch_size": [32, 64],           # Small and large batches for gradient updates
    "alpha": [0.00025, 0.0005],       # Adjusted to focus on faster learning rates
    "eps_decay": [0.995, 0.999],      # Slightly different exploration strategies
    "burn_in_phase": [10_000, 20_000],# Small and large replay buffer warm-up sizes
    "sync_target": [10_000, 30_000],  # Frequent and infrequent target updates
}

# Run mode: False trains over the search grid, True evaluates the checkpoint
# stored at base_hyperparams["save_filename"].
testing_mode = False

# -------------------------------
# Create Replay Buffer, Optimizer, and Loss
# -------------------------------
# These module-level objects are used by the testing-mode agent; every training
# run below builds its own network, buffer and optimizer.
buffer = ExperienceReplayMemory(base_hyperparams["buffer_size"])
optimizer = torch.optim.Adam(online_dqn.parameters(), lr=base_hyperparams["alpha"])  # DONE
criterion = torch.nn.MSELoss()  # DONE

# -------------------------------
# Training Mode with Preprocessing and Hyperparameter Search
# -------------------------------
if not testing_mode:
    results = []  # Store results for multiple configurations

    # Same iteration order as nesting one loop per key: the last key varies fastest.
    search_keys = ("batch_size", "alpha", "eps_decay", "burn_in_phase", "sync_target")
    search_grid = list(product(*(hyperparameter_variations[key] for key in search_keys)))

    # Loop through preprocessing variations
    for preprocess_config in preprocessing_variations:  # EXTRA
        print(f"Using preprocessing config: {preprocess_config}")  # EXTRA: Log preprocessing config

        if not preprocess_config["grayscale"]:
            # Without the grayscale step the stacked observation space has four
            # axes, which the three-way shape unpacking below cannot handle.
            # NOTE(review): RGB input needs a channel-first conversion and a
            # network that accepts stack_size * 3 channels before it can run.
            print("Skipping config: RGB observations are not supported by this pipeline yet")
            continue

        # Create and preprocess environment
        env = gym.make("ALE/Breakout-v5", full_action_space=False)
        env = SkipFrame(env, num_skip=preprocess_config["num_skip"])
        env = GrayScaleObservation(env)
        env = ResizeObservation(env, shape=preprocess_config["resize_shape"])
        env = FrameStackObservation(env, stack_size=4)  # Fixed stack size

        # Extract environment dimensions
        image_stack, h, w = env.observation_space.shape
        num_actions = env.action_space.n

        # Loop through hyperparameter configurations
        for batch_size, alpha, eps_decay, burn_in_phase, sync_target in search_grid:
            print(f"Training with batch_size={batch_size}, alpha={alpha}, eps_decay={eps_decay}, "
                  f"burn_in_phase={burn_in_phase}, sync_target={sync_target}")

            # Every configuration starts from a freshly initialised network and
            # an empty replay buffer so that runs do not influence each other.
            run_online_dqn = DeepQNet(h, w, image_stack, num_actions).to(device)
            run_target_dqn = copy.deepcopy(run_online_dqn)
            for param in run_target_dqn.parameters():
                param.requires_grad = False
            run_buffer = ExperienceReplayMemory(base_hyperparams["buffer_size"])
            run_optimizer = torch.optim.Adam(run_online_dqn.parameters(), lr=alpha)  # EXTRA

            agent = AtariAgent(
                buffer=run_buffer,
                eps=base_hyperparams["eps"],
                eps_decay=eps_decay,
                min_eps=base_hyperparams["min_eps"],
                gamma=base_hyperparams["gamma"],
                batch_size=batch_size,
                online_dqn=run_online_dqn,
                target_dqn=run_target_dqn,
                run_as_ddqn=base_hyperparams["run_as_ddqn"],
                optimizer=run_optimizer,
                criterion=criterion,
                device=device,
                max_train_frames=base_hyperparams["max_train_frames"],
                burn_in_phase=burn_in_phase,
                sync_target=sync_target,
            )

            # Training loop
            # NOTE(review): all configurations write to the same checkpoint
            # file, so only the most recent run's weights survive.
            train_metrics = {"reward": [], "loss": []}
            for it in range(base_hyperparams["max_train_episodes"]):
                episode_metrics = agent.run_episode(is_training=True)
                update_metrics(train_metrics, episode_metrics)
                if it % 50 == 0:
                    print_metrics(it, train_metrics, is_training=True)
                    agent.save_checkpoint(train_metrics, base_hyperparams["save_filename"])

            # Store results
            results.append({
                "preprocess_config": preprocess_config,  # EXTRA
                "batch_size": batch_size,
                "alpha": alpha,
                "eps_decay": eps_decay,
                "burn_in_phase": burn_in_phase,
                "sync_target": sync_target,
                "metrics": train_metrics,
            })

        # Release the emulator before the next preprocessing variation.
        env.close()

# -------------------------------
# Testing Results (if applicable)
# -------------------------------
if testing_mode:
    # Load the saved online network; the target network is not needed for
    # testing and simply mirrors it. The checkpoint must have been produced
    # with the same frame size as the notebook-level `online_dqn` and `env`.
    checkpoint = torch.load(base_hyperparams["save_filename"], map_location=device)
    online_dqn.load_state_dict(checkpoint["online_dqn"])
    target_dqn.load_state_dict(checkpoint["online_dqn"])

    agent = AtariAgent(
        buffer=buffer,
        eps=base_hyperparams["eps"],
        eps_decay=base_hyperparams["eps_decay"],
        min_eps=base_hyperparams["min_eps"],
        gamma=base_hyperparams["gamma"],
        batch_size=base_hyperparams["batch_size"],
        online_dqn=online_dqn,
        target_dqn=target_dqn,
        run_as_ddqn=base_hyperparams["run_as_ddqn"],
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        max_train_frames=base_hyperparams["max_train_frames"],
        burn_in_phase=base_hyperparams["burn_in_phase"],
        sync_target=base_hyperparams["sync_target"],
    )

    test_metrics = {"reward": [], "loss": []}
    for it in range(base_hyperparams["max_test_episodes"]):
        episode_metrics = agent.run_episode(is_training=False)
        update_metrics(test_metrics, episode_metrics)
        print_metrics(it + 1, test_metrics, is_training=False)

In [None]:
import matplotlib.pyplot as plt

# Function to plot metrics
def plot_metrics(metrics, title="Metrics Over Episodes"):
    """
    Draw the per-episode reward curve and, when present, the loss curve.

    Args:
        metrics (dict): Holds a "reward" list and optionally a "loss" list.
        title (str): Text shown above the figure.
    """
    _, ax = plt.subplots(figsize=(12, 6))

    # Reward curve is always drawn
    ax.plot(metrics["reward"], label="Reward", color="blue")

    # Loss curve only when there is something to draw
    has_loss = "loss" in metrics and len(metrics["loss"]) > 0
    if has_loss:
        ax.plot(metrics["loss"], label="Loss", color="red")

    ax.set_title(title)
    ax.set_xlabel("Episodes")
    ax.set_ylabel("Values")
    ax.legend()
    ax.grid()
    plt.show()

# Plot train metrics (only defined when the training branch ran)
if "train_metrics" in globals():
    plot_metrics(train_metrics, title="Training Metrics")

# Plot test metrics if available
if testing_mode and "test_metrics" in globals():
    plot_metrics(test_metrics, title="Testing Metrics")