In [None]:
# Install required libraries (run once)
!pip install -q torch gymnasium numpy matplotlib wandb tqdm

In [None]:
# Imports & device setup
import os
import random
import time
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import matplotlib.pyplot as plt
from tqdm import trange
from gymnasium.wrappers import RecordVideo
import wandb

# Pick the compute device once: CUDA when PyTorch reports it as available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

# If Weights & Biases is used, uncomment the login call below after logging in interactively:
# wandb.login()
# Start the W&B run that the training and evaluation cells log into.
wandb.init(
    project="RL_Assignment2_DQN_DDQN",
    name="DQN_DDQN_Final",
    reinit=True,
)

In [None]:
# Replay Buffer
from collections import deque

class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions."""

    def __init__(self, capacity=100000):
        # deque(maxlen=...) discards the oldest entry once `capacity` is exceeded.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Append one transition, coercing every field to a fixed type."""
        transition = (
            np.array(state, dtype=np.float32),
            int(action),
            float(reward),
            np.array(next_state, dtype=np.float32),
            bool(done),
        )
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Draw `batch_size` distinct transitions.

        Returns five NumPy arrays (states, actions, rewards, next_states, dones),
        each stacked along a leading batch axis.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of 5-tuples into 5 per-field columns.
        columns = [np.array(column) for column in zip(*picked)]
        states, actions, rewards, next_states, dones = columns
        return states, actions, rewards, next_states, dones

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

In [None]:
# Q-Network Definition
class QNetwork(nn.Module):
    """Three-layer fully connected network producing one Q-value per action.

    Args:
        state_dim: Size of the input state vector.
        action_dim: Number of actions, i.e. size of the output vector.
        hidden_units: Width of both hidden layers.
    """

    def __init__(self, state_dim, action_dim, hidden_units=128):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.fc1 = nn.Linear(state_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_dim)

    def forward(self, x):
        """Compute Q-values for a single state or a batch of states.

        Args:
            x: NumPy array or tensor. A 1-D input is treated as a single state,
                anything else as a batch with the state on the last axis.

        Returns:
            Tensor of Q-values; 1-D for a 1-D input, otherwise batched.
        """
        # Follow the device of this module's own parameters rather than a
        # notebook-level global, so the network keeps working if it is moved.
        param_device = self.fc1.weight.device
        if isinstance(x, np.ndarray):
            x = torch.as_tensor(x, dtype=torch.float32, device=param_device)
        else:
            x = x.to(param_device)  # no-op when the tensor is already there

        unbatched = x.dim() == 1
        if unbatched:
            x = x.unsqueeze(0)

        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        out = self.fc3(hidden)
        return out.squeeze(0) if unbatched else out

In [None]:
# DQN and DDQN Agents
class DQNAgent:
    """Epsilon-greedy DQN agent with a target network and a replay buffer.

    Args:
        state_dim: Size of the state vector fed to the Q-network.
        action_dim: Number of discrete action indices.
        config: Dict of hyper-parameters. Recognised keys (default in brackets):
            gamma (0.99), epsilon_start (1.0), epsilon_min (0.01),
            epsilon_decay (0.999), lr (1e-3), batch_size (64),
            grad_clip (10.0), memory_size (100000).
    """

    def __init__(self, state_dim, action_dim, config):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = config.get("gamma", 0.99)
        self.epsilon = config.get("epsilon_start", 1.0)
        self.epsilon_min = config.get("epsilon_min", 0.01)
        self.epsilon_decay = config.get("epsilon_decay", 0.999)
        self.lr = config.get("lr", 1e-3)
        self.batch_size = config.get("batch_size", 64)
        self.grad_clip = config.get("grad_clip", 10.0)

        self.q_net = QNetwork(state_dim, action_dim).to(device)
        self.target_net = QNetwork(state_dim, action_dim).to(device)
        # The target network starts as an exact copy of the online network.
        self.target_net.load_state_dict(self.q_net.state_dict())

        self.optimizer = optim.Adam(self.q_net.parameters(), lr=self.lr)
        self.memory = ReplayBuffer(config.get("memory_size", 100000))
        self.loss_fn = F.smooth_l1_loss

    def _net_device(self):
        """Device the online network's parameters currently live on."""
        return next(self.q_net.parameters()).device

    def act_index(self, state, deterministic=False):
        """Return an action index for `state`.

        With `deterministic=False` a uniformly random index is returned with
        probability `self.epsilon`; otherwise the argmax of the online
        network's Q-values is returned.
        """
        if (not deterministic) and (random.random() < self.epsilon):
            return random.randrange(self.action_dim)
        with torch.no_grad():
            s = torch.as_tensor(state, dtype=torch.float32, device=self._net_device())
            qvals = self.q_net(s)
            return int(torch.argmax(qvals).item())

    def remember(self, s, a, r, ns, done):
        """Store one transition in the replay buffer."""
        self.memory.push(s, a, r, ns, done)

    def _next_q_values(self, next_states):
        """Bootstrap value for each next state, shaped (batch, 1).

        DQN uses the target network's maximum Q-value. Subclasses override
        this hook to change only the target rule.
        """
        return self.target_net(next_states).max(1)[0].unsqueeze(1)

    def replay_step(self):
        """Run one gradient step on a sampled minibatch.

        Returns:
            The scalar loss as a float, or None when the buffer holds fewer
            than `batch_size` transitions (no update and no epsilon decay).
        """
        if len(self.memory) < self.batch_size:
            return None
        states, actions, rewards, next_states, dones = self.memory.sample(self.batch_size)

        dev = self._net_device()
        states = torch.as_tensor(states, dtype=torch.float32, device=dev)
        next_states = torch.as_tensor(next_states, dtype=torch.float32, device=dev)
        # Column vectors so they line up with gather() output.
        actions = torch.as_tensor(actions, dtype=torch.int64, device=dev).unsqueeze(1)
        rewards = torch.as_tensor(rewards, dtype=torch.float32, device=dev).unsqueeze(1)
        dones = torch.as_tensor(np.asarray(dones, dtype=np.float32), device=dev).unsqueeze(1)

        q_values = self.q_net(states).gather(1, actions)
        with torch.no_grad():
            # (1 - dones) removes the bootstrap term for terminal transitions.
            targets = rewards + self.gamma * self._next_q_values(next_states) * (1 - dones)

        loss = self.loss_fn(q_values, targets)
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.q_net.parameters(), self.grad_clip)
        self.optimizer.step()

        # Epsilon decays once per gradient step, floored at epsilon_min.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        return float(loss.item())

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_net.load_state_dict(self.q_net.state_dict())

    def save(self, path):
        """Write the online network's state dict to `path`, creating parent dirs."""
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        torch.save(self.q_net.state_dict(), path)

    def load(self, path, map_location=None):
        """Load online-network weights from `path` and sync the target network."""
        state = torch.load(path, map_location=map_location or self._net_device())
        self.q_net.load_state_dict(state)
        self.update_target()


class DDQNAgent(DQNAgent):
    """Double DQN: identical to DQNAgent except for the bootstrap target."""

    def _next_q_values(self, next_states):
        """Select the next action with the online net, evaluate it with the target net."""
        next_actions = self.q_net(next_states).argmax(1).unsqueeze(1)
        return self.target_net(next_states).gather(1, next_actions)

In [None]:
# Action Discretization Helper
def make_action_list(env, resolution=7, env_name=None):
    """Build the discrete action table used to drive `env` from action indices.

    Returns a `(action_size, actions_list)` pair:
      * Discrete space: the integers `0 .. n-1`.
      * Box space of shape (1,): `resolution` evenly spaced floats between the
        first low and high bound.
      * Any other Box: arrays that sweep the first dimension over the same grid
        and hold every remaining dimension at 0.
    For `env_name == "Pendulum-v1"` the grid has at least 15 points.

    Raises:
        ValueError: if the action space is neither Discrete nor Box.
    """
    space = env.action_space

    if isinstance(space, gym.spaces.Discrete):
        count = space.n
        return count, list(range(count))

    if not isinstance(space, gym.spaces.Box):
        raise ValueError("Unsupported action space")

    lower, upper, dims = space.low, space.high, space.shape
    if env_name == "Pendulum-v1":
        n_points = max(resolution, 15)
    else:
        n_points = resolution
    grid = np.linspace(lower[0], upper[0], n_points)

    if len(dims) == 1 and dims[0] == 1:
        # One-dimensional control: plain Python floats are enough.
        return len(grid), [float(value) for value in grid]

    # Multi-dimensional control: only the first component varies.
    padding = [0] * (dims[0] - 1)
    vectors = [np.array([value] + padding, dtype=space.dtype) for value in grid]
    return len(vectors), vectors

In [None]:
# Train Function (warmup, MountainCar shaping, action formatting)
def _to_env_action(action_space, action):
    """Convert an entry of the discretised action list into what `env.step` accepts."""
    if isinstance(action_space, gym.spaces.Box):
        if np.isscalar(action):
            return np.array([action], dtype=action_space.dtype)
        return np.array(action, dtype=action_space.dtype)
    return int(action)


def _mountaincar_reward(r_env, obs_raw, next_obs_raw, done, pos_bounds, gamma,
                        alpha=10.0, beta=0.05, goal_reward=100.0):
    """Shaped MountainCar reward: env reward + position shaping + speed bonus (+ goal bonus)."""
    pos_min, pos_max = pos_bounds
    pos_cur = float(obs_raw[0])
    pos_next, vel_next = float(next_obs_raw[0]), float(next_obs_raw[1])
    pos_cur_norm = (pos_cur - pos_min) / (pos_max - pos_min)
    pos_next_norm = (pos_next - pos_min) / (pos_max - pos_min)
    # Shaping term of the form gamma * phi(s') - phi(s), with phi = alpha * normalised position.
    shaping = gamma * (alpha * pos_next_norm) - (alpha * pos_cur_norm)
    reward = float(r_env + shaping + beta * abs(vel_next))
    if done and pos_next >= 0.5:
        reward += goal_reward
    return reward


def train_agent(env_name, agent_type="DQN", episodes=150, seed=0, save_model=True,
                warmup_steps=2000, target_update_steps=1000, discretize_res=7):
    """Train a DQN or DDQN agent on a Gymnasium environment.

    Args:
        env_name: Gymnasium environment id.
        agent_type: "DQN" selects DQNAgent; any other value selects DDQNAgent.
        episodes: Number of training episodes. For "MountainCar-v0" a value
            below 1000 is raised to 1000, and the warm-up / target-update
            settings are adjusted with it.
        seed: Seeds `random`, NumPy, PyTorch and the environment resets.
        save_model: When True, best and final checkpoints are written under
            `models/<env_name>/`. When False nothing is written.
        warmup_steps: Random-action steps used to pre-fill the replay buffer.
        target_update_steps: Environment steps between target-network syncs.
        discretize_res: Grid resolution passed to `make_action_list`.

    Returns:
        `(agent, rewards_list, meta)`. `rewards_list` holds the per-episode sum
        of the rewards used for learning (shaped for MountainCar). `meta` has
        keys "env_details", "actions_list" and "model_path" (None when
        `save_model` is False).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    is_mountaincar = env_name == "MountainCar-v0"
    env = gym.make(env_name)
    try:
        env_details = {"obs_space": str(env.observation_space), "act_space": str(env.action_space)}

        if is_mountaincar and episodes < 1000:
            print("Note: overriding episodes -> 1000 for MountainCar")
            episodes = 1000
            warmup_steps = max(warmup_steps, 3000)
            target_update_steps = 500

        action_size, actions_list = make_action_list(env, resolution=discretize_res, env_name=env_name)
        obs_shape = env.observation_space.shape
        state_size = int(np.prod(obs_shape)) if obs_shape else 1

        config = {
            "gamma": 0.99, "epsilon_start": 1.0, "epsilon_min": 0.01,
            "epsilon_decay": 0.999, "lr": 1e-3, "memory_size": 100000,
            "batch_size": 64, "grad_clip": 10.0
        }
        if is_mountaincar:
            config["epsilon_decay"] = 0.9995
            config["lr"] = 5e-4
        if env_name == "CartPole-v1":
            config["epsilon_decay"] = 0.995
        if env_name == "Pendulum-v1":
            config["epsilon_decay"] = 0.999

        if agent_type == "DQN":
            agent = DQNAgent(state_size, action_size, config)
        else:
            agent = DDQNAgent(state_size, action_size, config)

        models_dir = os.path.join("models", env_name)
        if save_model:
            os.makedirs(models_dir, exist_ok=True)

        # Only MountainCar observations are rescaled to [0, 1] using the space bounds.
        normalize = is_mountaincar
        obs_low, obs_high = env.observation_space.low, env.observation_space.high
        pos_bounds = (obs_low[0], obs_high[0])

        def prepare(obs_raw):
            return (obs_raw - obs_low) / (obs_high - obs_low) if normalize else obs_raw

        def learning_reward(r_env, obs_raw, next_obs_raw, done):
            # The same reward definition is used for warm-up and training so the
            # replay buffer never mixes shaped and unshaped rewards.
            if is_mountaincar:
                return _mountaincar_reward(r_env, obs_raw, next_obs_raw, done, pos_bounds, agent.gamma)
            return float(r_env)

        print(f"[Warmup] populating replay buffer with {warmup_steps} random steps...")
        state_raw, _ = env.reset(seed=seed)
        state = prepare(state_raw)
        for _ in range(warmup_steps):
            idx = random.randrange(action_size)
            action_to_env = _to_env_action(env.action_space, actions_list[idx])
            next_state_raw, r_env, terminated, truncated, _info = env.step(action_to_env)
            done = terminated or truncated
            reward = learning_reward(r_env, state_raw, next_state_raw, done)
            next_state = prepare(next_state_raw)
            # Store `terminated`, not `done`: a time-limit truncation is not a
            # terminal state, so the target must still bootstrap from next_state.
            agent.remember(state, idx, reward, next_state, terminated)
            state_raw, state = next_state_raw, next_state
            if done:
                state_raw, _ = env.reset()
                state = prepare(state_raw)

        rewards_list = []
        best_reward = -float("inf")
        step_count = 0
        wandb_warned = False

        for ep in range(1, episodes + 1):
            state_raw, _ = env.reset(seed=seed + ep)
            state = prepare(state_raw)
            total_reward = 0.0
            done = False
            while not done:
                idx = agent.act_index(state)
                action_to_env = _to_env_action(env.action_space, actions_list[idx])
                next_state_raw, r_env, terminated, truncated, _info = env.step(action_to_env)
                done = terminated or truncated
                reward = learning_reward(r_env, state_raw, next_state_raw, done)
                next_state = prepare(next_state_raw)
                # See warm-up loop: only true termination disables bootstrapping.
                agent.remember(state, idx, reward, next_state, terminated)
                agent.replay_step()
                state_raw, state = next_state_raw, next_state
                total_reward += reward
                step_count += 1
                if step_count % target_update_steps == 0:
                    agent.update_target()

            rewards_list.append(total_reward)
            if total_reward > best_reward:
                best_reward = total_reward
                if save_model:
                    ckpt_path = os.path.join(models_dir, f"{env_name}_{agent_type}_ep{ep}_rew{int(total_reward)}.pth")
                    agent.save(ckpt_path)
                    agent.save(os.path.join(models_dir, f"{env_name}_{agent_type}_best.pth"))

            # Logging is best-effort: training continues without W&B, but say so once.
            try:
                wandb.log({f"{agent_type}/{env_name}/reward": float(total_reward), f"{agent_type}/{env_name}/epsilon": float(agent.epsilon)})
            except Exception as exc:
                if not wandb_warned:
                    print(f"W&B logging failed or unavailable ({exc!r}); continuing without it.")
                    wandb_warned = True

            if (ep % 10 == 0) or (ep == 1) or (ep == episodes):
                print(f"{agent_type} | {env_name} | Episode {ep}/{episodes} | Reward: {total_reward:.2f} | Epsilon: {agent.epsilon:.4f}")

        final_path = None
        if save_model:
            final_path = os.path.join(models_dir, f"{env_name}_{agent_type}_final.pth")
            agent.save(final_path)
    finally:
        # Release the environment even when training is interrupted or fails.
        env.close()

    meta = {"env_details": env_details, "actions_list": actions_list, "model_path": final_path}
    return agent, rewards_list, meta

In [None]:
# Evaluation + Video Recording
def evaluate_and_record(agent, env_name, actions_list, agent_type="DQN", episodes=3, seed=42):
    """Run evaluation episodes while recording a video of each one.

    Args:
        agent: Object exposing `act_index(state, deterministic=True)`; agents
            whose `act_index` rejects the keyword are called without it.
        env_name: Gymnasium environment id.
        actions_list: Action table mapping an action index to an env action.
        agent_type: Label used in the video folder and W&B keys.
        episodes: Number of episodes to run; every episode is recorded.
        seed: Episode `i` is reset with `seed + i`.

    Returns:
        `(all_episode_rewards, vids)`: the summed environment reward of each
        episode, and the sorted paths of every `.mp4` found in
        `videos/<env_name>/<agent_type>` afterwards.
    """
    video_folder = f"videos/{env_name}/{agent_type}"
    os.makedirs(video_folder, exist_ok=True)
    try:
        base_env = gym.make(env_name, render_mode="rgb_array")
    except TypeError:
        base_env = gym.make(env_name)
    try:
        env = RecordVideo(base_env, video_folder=video_folder, episode_trigger=lambda e: True)
    except Exception:
        # The wrapper never took ownership, so close the bare env ourselves.
        base_env.close()
        raise

    all_episode_rewards = []
    try:
        details = {"observation_space": str(base_env.observation_space), "action_space": str(base_env.action_space)}
        try:
            wandb.log({f"{agent_type}/{env_name}/env_description": str(details)})
        except Exception:
            print("W&B env description logging failed or unavailable.")

        action_space = base_env.action_space
        # MountainCar observations are rescaled with the observation-space bounds.
        normalize = (env_name == "MountainCar-v0")
        if normalize:
            obs_low, obs_high = base_env.observation_space.low, base_env.observation_space.high

        for ep in range(episodes):
            state_raw, _ = env.reset(seed=seed + ep)
            state = (state_raw - obs_low) / (obs_high - obs_low) if normalize else state_raw
            done = False
            total_reward = 0.0
            steps = 0
            while not done:
                try:
                    action_idx = agent.act_index(state, deterministic=True)
                except TypeError:
                    action_idx = agent.act_index(state)
                action = actions_list[action_idx]
                if isinstance(action_space, gym.spaces.Box):
                    if np.isscalar(action):
                        action_to_env = np.array([action], dtype=action_space.dtype)
                    else:
                        action_to_env = np.array(action, dtype=action_space.dtype)
                else:
                    action_to_env = int(action)
                next_state_raw, r, terminated, truncated, _ = env.step(action_to_env)
                done = terminated or truncated
                state = (next_state_raw - obs_low) / (obs_high - obs_low) if normalize else next_state_raw
                total_reward += float(r)
                steps += 1
            all_episode_rewards.append(total_reward)
            print(f"[VIDEO] {agent_type} | {env_name} | Eval {ep+1}/{episodes} | Reward: {total_reward:.2f} | Steps: {steps}")
    finally:
        # Always close the wrapper, even if an episode raised, so the env and
        # recorder are released before the video files are listed.
        env.close()

    vids = sorted([os.path.join(video_folder, f) for f in os.listdir(video_folder) if f.endswith(".mp4")])
    print(f"Saved {len(vids)} videos to {video_folder}")
    for i, v in enumerate(vids):
        try:
            wandb.log({f"{agent_type}/{env_name}/video_{i}": wandb.Video(v, fps=30, format="mp4")})
        except Exception:
            print("W&B video upload failed or unavailable.")
    return all_episode_rewards, vids

In [None]:
# Deterministic Stability Tests (100 episodes)
def evaluate_agent_tests(agent, env_name, actions_list, n_tests=100, deterministic=True):
    """Roll out `n_tests` seeded episodes and collect their lengths and returns.

    Episode `t` is reset with seed `10000 + t`. Actions come from
    `agent.act_index(state, deterministic=deterministic)`, falling back to
    `agent.act_index(state)` if that call raises TypeError.

    Returns:
        `(durations, rewards)` as NumPy arrays with one entry per episode:
        number of steps taken and summed environment reward.
    """
    env = gym.make(env_name)
    normalize = env_name == "MountainCar-v0"
    if normalize:
        obs_low = env.observation_space.low
        obs_high = env.observation_space.high

    durations = []
    rewards = []
    for test_idx in range(n_tests):
        obs, _ = env.reset(seed=10000 + test_idx)
        state = (obs - obs_low) / (obs_high - obs_low) if normalize else obs
        episode_return = 0.0
        n_steps = 0
        finished = False
        while not finished:
            try:
                idx = agent.act_index(state, deterministic=deterministic)
            except TypeError:
                idx = agent.act_index(state)
            chosen = actions_list[idx]

            # Box spaces need an array of the space's dtype; Discrete needs an int.
            if not isinstance(env.action_space, gym.spaces.Box):
                env_action = int(chosen)
            elif np.isscalar(chosen):
                env_action = np.array([chosen], dtype=env.action_space.dtype)
            else:
                env_action = np.array(chosen, dtype=env.action_space.dtype)

            obs, step_reward, terminated, truncated, _ = env.step(env_action)
            finished = terminated or truncated
            state = (obs - obs_low) / (obs_high - obs_low) if normalize else obs
            episode_return += float(step_reward)
            n_steps += 1
        durations.append(n_steps)
        rewards.append(episode_return)

    env.close()
    return np.array(durations), np.array(rewards)

In [None]:
# Plot Helper (Training Curves)
def plot_rewards(env_name, results_dict):
    """Plot the per-episode training rewards of the DQN and DDQN runs on one figure.

    `results_dict` must contain "DQN" and "DDQN" entries, each with a
    "rewards" sequence; an empty sequence is skipped.
    """
    plt.figure(figsize=(10, 5))
    plt.title(f"Training Rewards: {env_name}")
    for label in ("DQN", "DDQN"):
        curve = results_dict[label]["rewards"]
        if len(curve) > 0:
            plt.plot(curve, label=label)
    plt.xlabel("Episode")
    plt.ylabel("Reward")
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# Orchestration: Train Across Environments
# Environments to benchmark; each one is trained with both DQN and DDQN.
envs = ["CartPole-v1", "Acrobot-v1", "MountainCar-v0", "Pendulum-v1"]
# results[env_name][agent_type] -> dict with the agent, training rewards, meta and eval outputs.
results = {}
for env_name in envs:
    # MountainCar gets 1000 training episodes, every other environment 150.
    episodes = 1000 if env_name == "MountainCar-v0" else 150
    print("\n" + "="*70)
    # --- Training: both agents use the same seed (0) ---
    print(f"TRAINING {env_name} with DQN for {episodes} episodes")
    dqn_agent, dqn_rewards, dqn_meta = train_agent(env_name, agent_type="DQN", episodes=episodes, seed=0)
    print(f"TRAINING {env_name} with DDQN for {episodes} episodes")
    ddqn_agent, ddqn_rewards, ddqn_meta = train_agent(env_name, agent_type="DDQN", episodes=episodes, seed=0)
    results[env_name] = {
        "DQN": {"agent": dqn_agent, "rewards": dqn_rewards, "meta": dqn_meta},
        "DDQN": {"agent": ddqn_agent, "rewards": ddqn_rewards, "meta": ddqn_meta},
    }
    # --- Video evaluation: 3 recorded episodes per agent ---
    print(f"\nRecording evaluation videos for {env_name} with DQN")
    dqn_eval_rewards, dqn_videos = evaluate_and_record(dqn_agent, env_name, dqn_meta["actions_list"], agent_type="DQN", episodes=3)
    print(f"\nRecording evaluation videos for {env_name} with DDQN")
    ddqn_eval_rewards, ddqn_videos = evaluate_and_record(ddqn_agent, env_name, ddqn_meta["actions_list"], agent_type="DDQN", episodes=3)
    results[env_name]["DQN"].update({"eval_rewards": dqn_eval_rewards, "videos": dqn_videos})
    results[env_name]["DDQN"].update({"eval_rewards": ddqn_eval_rewards, "videos": ddqn_videos})
    # --- Stability tests: 100 deterministic episodes per agent, summarised as mean/std ---
    print(f"\nRunning stability tests (100 deterministic) for {env_name} - DQN")
    d_durations, d_rewards = evaluate_agent_tests(dqn_agent, env_name, dqn_meta["actions_list"], n_tests=100, deterministic=True)
    print(f"DQN {env_name} durations mean/std: {d_durations.mean():.2f}/{d_durations.std():.2f} | rewards mean/std: {d_rewards.mean():.2f}/{d_rewards.std():.2f}")
    print(f"Running stability tests (100 deterministic) for {env_name} - DDQN")
    dd_durations, dd_rewards = evaluate_agent_tests(ddqn_agent, env_name, ddqn_meta["actions_list"], n_tests=100, deterministic=True)
    print(f"DDQN {env_name} durations mean/std: {dd_durations.mean():.2f}/{dd_durations.std():.2f} | rewards mean/std: {dd_rewards.mean():.2f}/{dd_rewards.std():.2f}")
    # Log the test summary to W&B in a single call; any logging error is ignored.
    try:
        wandb.log({
            f"DQN/{env_name}/test/dur_mean": float(d_durations.mean()),
            f"DQN/{env_name}/test/dur_std": float(d_durations.std()),
            f"DQN/{env_name}/test/rew_mean": float(d_rewards.mean()),
            f"DQN/{env_name}/test/rew_std": float(d_rewards.std()),
            f"DDQN/{env_name}/test/dur_mean": float(dd_durations.mean()),
            f"DDQN/{env_name}/test/dur_std": float(dd_durations.std()),
            f"DDQN/{env_name}/test/rew_mean": float(dd_rewards.mean()),
            f"DDQN/{env_name}/test/rew_std": float(dd_rewards.std()),
        })
    except Exception:
        pass