<a href="https://colab.research.google.com/github/jimmy93029/NYCU_Artificial_Intelligence_Capstone_Labs/blob/main/Lab2/AI_capstone_HW2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation


In [None]:
!apt-get install -y swig
!pip install box2d-py==2.3.5 --no-build-isolation

In [None]:
!pip install "gymnasium[atari, accept-rom-license]"

In [None]:
!pip install stable-baselines3[extra] torch torchvision

In [None]:
!pip install gymnasium[accept-rom-license,atari]

In [None]:
!pip install Shimmy

In [4]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

# Task 1 : Comparison on Atari

```
# This content is displayed as code
```



In [10]:
import ale_py
# if using gymnasium
import shimmy
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym


In [11]:
# pyplot (not the top-level matplotlib package) provides figure/plot/show.
import matplotlib.pyplot as plt

def plot_rewards(reward_dict):
    """Draw one reward curve per entry of ``reward_dict`` on a shared figure.

    Args:
        reward_dict: Mapping from a legend label to a sequence of total
            rewards, one value per episode. Curves are drawn in the mapping's
            iteration order.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    for label, rewards in reward_dict.items():
        ax.plot(rewards, label=label)
    ax.set_xlabel("Episode")
    ax.set_ylabel("Total Reward")
    ax.set_title("Policy Comparison")
    ax.legend()
    ax.grid(True)
    plt.show()


## train

### train REINFORCE

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import gymnasium as gym
import numpy as np


# Policy: CNN + Softmax
class CNNREINFORCEPolicy(nn.Module):
    """Convolutional softmax policy over a discrete action space.

    Three convolution layers feed a 512-unit hidden layer followed by a
    softmax over ``act_dim`` actions.

    Args:
        act_dim: Number of discrete actions (size of the softmax output).
        obs_shape: ``(height, width, channels)`` of a single channels-last
            observation. The default ``(210, 160, 3)`` yields the same
            ``64 * 22 * 16`` flattened feature size that used to be
            hard-coded, so existing checkpoints keep loading.
    """

    def __init__(self, act_dim, obs_shape=(210, 160, 3)):
        super().__init__()
        height, width, channels = obs_shape
        self.conv = nn.Sequential(
            nn.Conv2d(channels, 32, 8, 4), nn.ReLU(),
            nn.Conv2d(32, 64, 4, 2), nn.ReLU(),
            nn.Conv2d(64, 64, 3, 1), nn.ReLU()
        )
        # Infer the flattened feature count from a dummy pass instead of
        # hard-coding it, so other frame sizes work without manual arithmetic.
        with torch.no_grad():
            n_features = self.conv(torch.zeros(1, channels, height, width)).numel()
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_features, 512), nn.ReLU(),
            nn.Linear(512, act_dim),
            nn.Softmax(dim=-1)
        )

    def forward(self, obs):
        """Return action probabilities for one observation.

        Args:
            obs: A single channels-last observation of shape ``(H, W, C)``,
                either a NumPy array or a tensor. Values are divided by 255.

        Returns:
            1-D tensor of length ``act_dim`` holding the softmax probabilities.
        """
        if isinstance(obs, np.ndarray):
            obs = torch.tensor(obs, dtype=torch.float32)
        # (H, W, C) -> (1, C, H, W): Conv2d expects a channels-first batch.
        obs = obs.permute(2, 0, 1).unsqueeze(0) / 255.0
        return self.fc(self.conv(obs)).squeeze(0)

# Value Network
class CNNValueNet(nn.Module):
    """Convolutional state-value estimator producing one scalar per observation.

    Uses the same convolution stack as the policy network, followed by a
    512-unit hidden layer and a single linear output.

    Args:
        obs_shape: ``(height, width, channels)`` of a single channels-last
            observation. The default ``(210, 160, 3)`` yields the same
            ``64 * 22 * 16`` flattened feature size that used to be hard-coded.
    """

    def __init__(self, obs_shape=(210, 160, 3)):
        super().__init__()
        height, width, channels = obs_shape
        self.conv = nn.Sequential(
            nn.Conv2d(channels, 32, 8, 4), nn.ReLU(),
            nn.Conv2d(32, 64, 4, 2), nn.ReLU(),
            nn.Conv2d(64, 64, 3, 1), nn.ReLU()
        )
        # Infer the flattened feature count from a dummy pass instead of
        # hard-coding it, so other frame sizes work without manual arithmetic.
        with torch.no_grad():
            n_features = self.conv(torch.zeros(1, channels, height, width)).numel()
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(n_features, 512), nn.ReLU(),
            nn.Linear(512, 1)
        )

    def forward(self, obs):
        """Return the value estimate for one observation.

        Args:
            obs: A single channels-last observation of shape ``(H, W, C)``,
                either a NumPy array or a tensor. Values are divided by 255.

        Returns:
            0-dimensional tensor holding the estimated value.
        """
        if isinstance(obs, np.ndarray):
            obs = torch.tensor(obs, dtype=torch.float32)
        # (H, W, C) -> (1, C, H, W): Conv2d expects a channels-first batch.
        obs = obs.permute(2, 0, 1).unsqueeze(0) / 255.0
        return self.fc(self.conv(obs)).squeeze()

In [13]:
# Training Function
def train_reinforce_variant(env_id, variant="original", num_episodes=500, lr=1e-4, gamma=0.99,
                            save_path=None):
    """Train a CNN softmax policy with one of three REINFORCE variants.

    One gradient step is taken per episode, on the sum over time steps of
    ``-log_prob * weight``, where the weight depends on ``variant``:

    * ``"original"``  - the discounted return ``G_t``.
    * ``"baseline"``  - ``G_t`` minus the mean return of the episode.
    * ``"advantage"`` - ``G_t`` minus the prediction of a ``CNNValueNet``
      that is itself fitted to the returns with an MSE loss.

    Any other ``variant`` string behaves like ``"original"``.

    Args:
        env_id: Gymnasium environment id passed to ``gym.make``.
        variant: Which weighting scheme to use (see above).
        num_episodes: Number of training episodes.
        lr: Adam learning rate for the policy (and the value net, if used).
        gamma: Discount factor for the returns.
        save_path: Optional file path; when given, the trained policy's
            ``state_dict`` is written there with ``torch.save``.
    """
    env = gym.make(env_id)
    # try/finally so the environment is released even when training is
    # interrupted (e.g. KeyboardInterrupt from the notebook stop button).
    try:
        policy = CNNREINFORCEPolicy(env.action_space.n)
        optimizer = optim.Adam(policy.parameters(), lr=lr)

        use_critic = variant == "advantage"
        if use_critic:
            value_net = CNNValueNet()
            value_optimizer = optim.Adam(value_net.parameters(), lr=lr)

        for ep in range(num_episodes):
            obs, _ = env.reset()
            log_probs, rewards, states = [], [], []
            done = False

            while not done:
                probs = policy(obs)
                dist = torch.distributions.Categorical(probs)
                action = dist.sample()
                log_probs.append(dist.log_prob(action))
                # Observations are only needed to fit the value network.
                if use_critic:
                    states.append(obs)
                obs, reward, terminated, truncated, _ = env.step(action.item())
                rewards.append(reward)
                # An episode ends on termination OR truncation (time limit);
                # stepping past either is invalid.
                done = terminated or truncated

            # Compute returns G_t by accumulating backwards, then restore
            # chronological order (append + reverse avoids O(n^2) inserts).
            G = 0
            returns = []
            for r in reversed(rewards):
                G = r + gamma * G
                returns.append(G)
            returns.reverse()
            returns = torch.tensor(returns, dtype=torch.float32)

            # Compute the per-step weights for the policy-gradient loss.
            if variant == "baseline":
                baseline = returns.mean()
                advantages = returns - baseline
            elif use_critic:
                values = torch.stack([value_net(s) for s in states])
                # Detach so the policy loss does not backpropagate into the critic.
                advantages = returns - values.detach()

                value_loss = nn.functional.mse_loss(values, returns)
                value_optimizer.zero_grad()
                value_loss.backward()
                value_optimizer.step()
            else:
                advantages = returns

            loss = -torch.sum(torch.stack(log_probs) * advantages)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            print(f"[{variant.upper()}] Ep {ep+1}/{num_episodes} | Reward: {sum(rewards):.1f}")

        if save_path is not None:
            torch.save(policy.state_dict(), save_path)
    finally:
        env.close()


## test

In [14]:
def evaluate_reinforce(env_id, model_path, episodes=10):
    """Run a saved REINFORCE policy greedily and collect episode rewards.

    Args:
        env_id: Gymnasium environment id passed to ``gym.make``.
        model_path: Path to a ``state_dict`` saved from ``CNNREINFORCEPolicy``.
        episodes: Number of evaluation episodes to play.

    Returns:
        List with the total reward of each episode, in play order.
    """
    import gymnasium as gym

    # A single environment serves both the action-space query and the
    # rollouts; it is closed in the finally block.
    env = gym.make(env_id)
    try:
        # The policy constructor takes only the number of actions.
        policy = CNNREINFORCEPolicy(env.action_space.n)
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        policy.load_state_dict(torch.load(model_path, map_location="cpu"))
        policy.eval()

        rewards = []
        for _ in range(episodes):
            obs, _ = env.reset()
            done, total_reward = False, 0
            while not done:
                obs_tensor = torch.tensor(obs, dtype=torch.float32)
                # No gradients are needed for evaluation.
                with torch.no_grad():
                    probs = policy(obs_tensor)
                action = torch.argmax(probs).item()
                obs, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                # Stop on termination OR truncation (time limit).
                done = terminated or truncated
            rewards.append(total_reward)
        return rewards
    finally:
        env.close()


## main

In [15]:
envs_discrete = ["ALE/Assault-v5"]
# envs_continuous = ["CarRacing-v2", "BipedalWalker-v3"]

# Train the three REINFORCE variants back to back on Assault.
for variant in ("original", "baseline", "advantage"):
    train_reinforce_variant("ALE/Assault-v5", variant=variant)

[ORIGINAL] Ep 1/500 | Reward: 168.0
[ORIGINAL] Ep 2/500 | Reward: 273.0
[ORIGINAL] Ep 3/500 | Reward: 189.0
[ORIGINAL] Ep 4/500 | Reward: 336.0
[ORIGINAL] Ep 5/500 | Reward: 210.0
[ORIGINAL] Ep 6/500 | Reward: 273.0
[ORIGINAL] Ep 7/500 | Reward: 189.0
[ORIGINAL] Ep 8/500 | Reward: 336.0
[ORIGINAL] Ep 9/500 | Reward: 189.0
[ORIGINAL] Ep 10/500 | Reward: 168.0


KeyboardInterrupt: 

## Task 2

In [None]:
from stable_baselines3 import PPO, A2C
from stable_baselines3.common.env_util import make_vec_env

In [8]:
def train_sb3(algo, env_id, total_timesteps=100_000):
    """Train a Stable-Baselines3 agent on ``env_id`` and save it to disk.

    Args:
        algo: Stable-Baselines3 algorithm class (e.g. ``PPO`` or ``A2C``).
        env_id: Environment id passed to ``make_vec_env``. Ids containing
            ``"ALE"`` use ``"CnnPolicy"``; all others use ``"MlpPolicy"``.
        total_timesteps: Number of environment steps to train for.

    The model is saved under ``"<last path component of env_id>_<algo name>"``,
    e.g. ``"Assault-v5_PPO"`` for ``ALE/Assault-v5`` with ``PPO``.
    """
    policy = "CnnPolicy" if "ALE" in env_id else "MlpPolicy"
    env = make_vec_env(env_id, n_envs=1)
    # Close the vectorized env even if training fails or is interrupted.
    try:
        model = algo(policy, env, verbose=1)
        model.learn(total_timesteps=total_timesteps)
        model.save(f"{env_id.split('/')[-1]}_{algo.__name__}")
    finally:
        env.close()


In [None]:
def evaluate_sb3(model_class, model_path, env_id, episodes=10):
    """Load a saved Stable-Baselines3 model and collect episode rewards.

    Args:
        model_class: Algorithm class whose ``load`` is used (e.g. ``PPO``).
        model_path: Path the model was saved under.
        env_id: Gymnasium environment id passed to ``gym.make``.
        episodes: Number of evaluation episodes to play.

    Returns:
        List with the total reward of each episode, in play order.
    """
    import gymnasium as gym

    model = model_class.load(model_path)
    env = gym.make(env_id)
    rewards = []

    # Release the environment even if a rollout raises.
    try:
        for _ in range(episodes):
            obs, _ = env.reset()
            done, total_reward = False, 0
            while not done:
                action, _ = model.predict(obs)
                obs, reward, terminated, truncated, _ = env.step(action)
                total_reward += reward
                # Stop on termination OR truncation (time limit).
                done = terminated or truncated
            rewards.append(total_reward)
    finally:
        env.close()
    return rewards

In [9]:
envs_discrete = ["ALE/Assault-v5"]
# envs_continuous = ["CarRacing-v2", "BipedalWalker-v3"]

print("\n=== TRAINING PPO ===")
for env_id in envs_discrete:
    train_sb3(PPO, env_id)

print("\n=== TRAINING A2C ===")
for env_id in envs_discrete:
    train_sb3(A2C, env_id)

print("\n=== TRAINING REINFORCE (Discrete Only) ===")
for env_id in envs_discrete:
    # The REINFORCE trainer defined in this notebook is
    # train_reinforce_variant; called without a variant it runs the plain
    # "original" version.
    train_reinforce_variant(env_id)


=== TRAINING PPO ===
Using cuda device
Wrapping the env in a VecTransposeImage.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 594      |
|    ep_rew_mean     | 259      |
| time/              |          |
|    fps             | 290      |
|    iterations      | 1        |
|    time_elapsed    | 7        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 574         |
|    ep_rew_mean          | 264         |
| time/                   |             |
|    fps                  | 204         |
|    iterations           | 2           |
|    time_elapsed         | 19          |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.011813848 |
|    clip_fraction        | 0.21        |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.93

RuntimeError: mat1 and mat2 shapes cannot be multiplied (33600x3 and 210x128)

In [None]:
# Evaluate PPO vs A2C vs REINFORCE
# Each call returns a list of total rewards, one entry per evaluation episode.
# "Assault-v5_PPO" / "Assault-v5_A2C" are the names train_sb3 saves under for
# ALE/Assault-v5.
# NOTE(review): none of the training calls shown in this notebook writes
# "Assault-v5_REINFORCE.pth" — confirm the checkpoint exists before running.
ppo_rewards = evaluate_sb3(PPO, "Assault-v5_PPO", "ALE/Assault-v5")
a2c_rewards = evaluate_sb3(A2C, "Assault-v5_A2C", "ALE/Assault-v5")
reinforce_rewards = evaluate_reinforce("ALE/Assault-v5", "Assault-v5_REINFORCE.pth")

In [None]:
# Overlay the evaluation rewards of the three agents on a single figure.
reward_curves = dict(PPO=ppo_rewards, A2C=a2c_rewards, REINFORCE=reinforce_rewards)
plot_rewards(reward_curves)