In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import os
os.chdir('/content/drive/MyDrive/finalproject_RL')

In [3]:
!pip install -r requirements.txt

Collecting git+https://github.com/Farama-Foundation/MAgent2 (from -r requirements.txt (line 1))
  Cloning https://github.com/Farama-Foundation/MAgent2 to /tmp/pip-req-build-1ytf0zvd
  Running command git clone --filter=blob:none --quiet https://github.com/Farama-Foundation/MAgent2 /tmp/pip-req-build-1ytf0zvd
  Resolved https://github.com/Farama-Foundation/MAgent2 to commit b2ddd49445368cf85d4d4e1edcddae2e28aa1406
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pettingzoo (from -r requirements.txt (line 4))
  Downloading pettingzoo-1.24.3-py3-none-any.whl.metadata (8.5 kB)
Collecting gymnasium>=0.28.0 (from pettingzoo->-r requirements.txt (line 4))
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium>=0.28.0->pettingzoo->-r requirements.txt (line 4))
  Downloading Farama_Notifications-0.0.

In [12]:
import os
import time
import random
import numpy as np
import torch
from torch import nn
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from magent2.environments import battle_v4
from torch_model import QNetwork

In [13]:
def preprocess_observation(observation):
    """Reshape and normalize observation for CNN input."""
    obs = torch.tensor(observation, dtype=torch.float32).permute(2, 0, 1)  # HWC -> CHW
    return obs.unsqueeze(0)  # Add batch dimension

def detect_enemy_nearby(observation, radius=2):
    """
    Detect if any enemies are within a given radius in the observation.

    Args:
        observation: Raw observation from the environment (HWC format).
        radius: Radius to search for enemies.

    Returns:
        True if an enemy is detected within the radius, False otherwise.
    """
    enemy_value = 2  # Assume enemies are represented by this value
    agent_position = np.argwhere(observation[:, :, 0] == 1)  # Agent position (value 1)

    if agent_position.size == 0:
        return False

    agent_y, agent_x = agent_position[0]

    # Define the search bounds
    y_min = max(0, agent_y - radius)
    y_max = min(observation.shape[0], agent_y + radius + 1)
    x_min = max(0, agent_x - radius)
    x_max = min(observation.shape[1], agent_x + radius + 1)

    # Check for enemy presence within the bounds
    subregion = observation[y_min:y_max, x_min:x_max, 0]
    return enemy_value in subregion

def select_action(state, policy_net, epsilon, action_space, observation, attack_action_id=0):
    """
    Epsilon-greedy action selection with priority for attack if an enemy is nearby.
    """
    # Check for nearby enemies
    enemy_detected = detect_enemy_nearby(observation)

    if enemy_detected:
        return attack_action_id  # Prioritize attack if an enemy is nearby

    # Epsilon-greedy policy for other cases
    if random.random() < epsilon:
        return random.randint(0, action_space - 1)  # Explore
    with torch.no_grad():
        return policy_net(state).argmax().item()  # Exploit

def update_target_network(policy_net, target_net):
    """Copy weights from policy network to target network."""
    target_net.load_state_dict(policy_net.state_dict())

In [14]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = []
        self.capacity = capacity

    def add(self, transition):
        if len(self.buffer) >= self.capacity:
            self.buffer.pop(0)
        self.buffer.append(transition)

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)


In [16]:
def train_dqn(policy_net, target_net, optimizer, buffer, batch_size, gamma):
    if len(buffer) < batch_size:
        return 0

    transitions = buffer.sample(batch_size)
    states, actions, rewards, next_states, dones = zip(*transitions)

    states = torch.cat(states).to(device)
    actions = torch.tensor(actions).unsqueeze(1).to(device)
    rewards = torch.tensor(rewards).unsqueeze(1).to(device)
    next_states = torch.cat(next_states).to(device)
    dones = torch.tensor(dones, dtype=torch.float32).unsqueeze(1).to(device)

    # Compute Q values
    q_values = policy_net(states).gather(1, actions)

    # Compute target Q values
    with torch.no_grad():
        max_next_q_values = target_net(next_states).max(1, keepdim=True)[0]
        target_q_values = rewards + (1 - dones) * gamma * max_next_q_values

    # Compute loss
    loss = nn.MSELoss()(q_values, target_q_values)

    # Optimize model
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss.item()

In [17]:
if __name__ == "__main__":
    # Environment setup
    env = battle_v4.env(map_size=45, minimap_mode=False, step_reward=-0.005,
                         dead_penalty=-1, attack_penalty=-0.1, attack_opponent_reward=1,
                         max_cycles=300, extra_features=False)
    env.reset()

    # Hyperparameters
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    num_episodes = 500
    batch_size = 64
    gamma = 0.99
    epsilon = 1.0
    epsilon_decay = 0.995
    epsilon_min = 0.1
    target_update_freq = 10
    buffer_capacity = 10000
    learning_rate = 0.001

    # Observation and action space
    observation_space = env.observation_space("blue_0").shape
    action_space = env.action_space("blue_0").n

    # Initialize networks and optimizer
    policy_net = QNetwork(observation_space, action_space).to(device)
    target_net = QNetwork(observation_space, action_space).to(device)
    update_target_network(policy_net, target_net)
    target_net.eval()

    optimizer = Adam(policy_net.parameters(), lr=learning_rate)
    replay_buffer = ReplayBuffer(buffer_capacity)

    start_time = time.time()

    for episode in range(1, num_episodes + 1):
        env.reset()
        obs, _, _, _, _ = env.last()
        state = preprocess_observation(obs).to(device)
        total_reward = 0

        for step in range(300):  # Max steps per episode
            # Select action
            action = select_action(state, policy_net, epsilon, action_space, obs)

            # Take action in environment
            env.step(action)
            obs, reward, done, truncation, _ = env.last()
            next_state = preprocess_observation(obs).to(device)

            # Store transition in replay buffer
            replay_buffer.add((state, action, reward, next_state, done or truncation))

            # Train the policy network
            loss = train_dqn(policy_net, target_net, optimizer, replay_buffer, batch_size, gamma)

            state = next_state
            total_reward += reward

            if done or truncation:
                break

        # Decay epsilon
        epsilon = max(epsilon * epsilon_decay, epsilon_min)

        # Update target network
        if episode % target_update_freq == 0:
            update_target_network(policy_net, target_net)

        print(f"Episode {episode}, Total Reward: {total_reward:.2f}, Loss: {loss:.4f}, Epsilon: {epsilon:.3f}")

    env.close()
    print(f"Training completed in {(time.time() - start_time) / 3600:.2f} hours.")

    # Save the trained model
    torch.save(policy_net.state_dict(), "blue.pth")

Episode 1, Total Reward: -6.60, Loss: 0.0001, Epsilon: 0.995
Episode 2, Total Reward: -5.90, Loss: 0.0001, Epsilon: 0.990
Episode 3, Total Reward: -4.80, Loss: 0.0001, Epsilon: 0.985
Episode 4, Total Reward: -4.70, Loss: 0.0001, Epsilon: 0.980
Episode 5, Total Reward: -6.30, Loss: 0.0000, Epsilon: 0.975
Episode 6, Total Reward: -4.90, Loss: 0.0001, Epsilon: 0.970
Episode 7, Total Reward: -5.10, Loss: 0.0002, Epsilon: 0.966
Episode 8, Total Reward: -4.10, Loss: 0.0001, Epsilon: 0.961
Episode 9, Total Reward: -5.40, Loss: 0.0001, Epsilon: 0.956
Episode 10, Total Reward: -6.50, Loss: 0.0002, Epsilon: 0.951
Episode 11, Total Reward: -5.80, Loss: 0.0001, Epsilon: 0.946
Episode 12, Total Reward: -5.70, Loss: 0.0003, Epsilon: 0.942
Episode 13, Total Reward: -6.70, Loss: 0.0000, Epsilon: 0.937
Episode 14, Total Reward: -5.70, Loss: 0.0001, Epsilon: 0.932
Episode 15, Total Reward: -4.60, Loss: 0.0002, Epsilon: 0.928
Episode 16, Total Reward: -4.80, Loss: 0.0001, Epsilon: 0.923
Episode 17, Total