## Create the environment

In [1]:
from agar.Env import AgarEnv
import numpy as np
from agar.Config import Config
import time

render = True
num_agents = 1

class Args:
    """
    Class to hold the arguments for the environment.
    """
    def __init__(self):
        self.num_controlled_agent = num_agents
        self.num_processes = 64
        self.action_repeat = 1
        self.total_step = 1e8
        self.r_alpha = 0.1
        self.r_beta = 0.1
        self.seed = 42
        self.gamma = 0.99
        self.eval = True


class ContEnvWrapper:
    """
    Wrapper class for the continuous environment.
    """
    def __init__(self):
        self.env = AgarEnv(Args())
        self.action_limits = np.array([[-1,1],[-1,1]])
    
    def reset(self):
        """
        Reset the environment and return the initial observation.
        """
        obs = self.env.reset()
        return obs
    
    def render(self):
        """
        Render the environment.
        """
        self.env.render(0,render_player=True)
    
    def close(self):
        """
        Close the environment.
        """
        self.env.close()
    
    def step(self, actions):
        """
        Take a step in the environment given the actions and return the new observation, reward, and done flag.
        """
        actions = np.array(actions).reshape(-1)
        actions[2] = 1 if actions[2] > 0 else 0
        obs, rewards, dones, infos, new_obs = self.env.step(actions)
        return obs['t0'], rewards[0], dones[0]

## Initialize the policy like TransformerGaussianPolicy

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F

class TransformerGaussianPolicy(nn.Module):
    def __init__(self, input_dim, output_dim, nhead, num_encoder_layers):
        super(TransformerGaussianPolicy, self).__init__()
        
        # Embedding for the input, increase the dimension for transformer
        self.embedding = nn.Linear(input_dim, 512)
        
        # Transformer encoder
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=512, nhead=nhead),
            num_layers=num_encoder_layers
        )
        
        # Heads for mean and standard deviation
        self.mu_head = nn.Linear(512, output_dim)
        self.sigma_head = nn.Linear(512, output_dim)

    def forward(self, x):
        """
        Forward pass of the TransformerGaussianPolicy network.
        
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, input_dim).
        
        Returns:
            mu (torch.Tensor): Mean tensor of shape (batch_size, output_dim).
            sigma (torch.Tensor): Standard deviation tensor of shape (batch_size, output_dim).
        """
        x = self.embedding(x)
        
        # Note: Transformer expects input in the format (sequence length, batch size, features)
        # Here, we treat our input as a sequence of length 1.
        x = x.unsqueeze(0)
        x = self.transformer(x)
        x = x.squeeze(0)
        
        mu = torch.tanh(self.mu_head(x))
        sigma = F.softplus(self.sigma_head(x)) + 1e-5
        return mu, sigma


class GaussianBoostedPolicyNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(GaussianPolicyNetwork, self).__init__()
        self.fc = nn.Linear(input_dim, 128)
        self.mu_head = nn.Linear(128, output_dim)
        self.sigma_head = nn.Linear(128, output_dim)
    
    def forward(self, x):
        """
        Forward pass of the GaussianBoostedPolicyNetwork network.
        
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, input_dim).
        
        Returns:
            mu (torch.Tensor): Mean tensor of shape (batch_size, output_dim).
            sigma (torch.Tensor): Standard deviation tensor of shape (batch_size, output_dim).
        """
        x = torch.relu(self.fc(x))
        mu = torch.tanh(self.mu_head(x))  # Mean
        sigma = F.softplus(self.sigma_head(x)) + 1e-5  # Standard deviation
        return mu, sigma
    
    


class GaussianPolicyNetwork(nn.Module):
    """
    Gaussian Policy Network class.
    """
    def __init__(self, input_dim, output_dim):
        """
        Initialize the Gaussian Policy Network.

        Args:
            input_dim (int): Dimensionality of the input.
            output_dim (int): Dimensionality of the output.
        """
        super(GaussianPolicyNetwork, self).__init__()
        self.fc = nn.Linear(input_dim, 128)
        self.mu_head = nn.Linear(128, output_dim)
        self.sigma_head = nn.Linear(128, output_dim)
    
    def forward(self, x):
        """
        Forward pass of the network.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Mean and standard deviation tensors.
        """
        x = torch.relu(self.fc(x))
        mu = torch.tanh(self.mu_head(x))  # Mean
        sigma = F.softplus(self.sigma_head(x)) + 1e-5  # Standard deviation
        return mu, sigma

def select_action(policy, state):
    """
    Select an action based on the given policy and state.

    Args:
        policy (GaussianPolicyNetwork): Policy network.
        state (np.array): Input state.

    Returns:
        torch.Tensor: Action tensor.
        torch.Tensor: Log probability tensor.
    """
    state_tensor = torch.tensor(state, dtype=torch.float32)
    mu, sigma = policy(state_tensor)
    dist = torch.distributions.Normal(mu, sigma)
    action = dist.sample()
    log_prob = dist.log_prob(action).sum(-1)
    return action, log_prob

def train_policy(policy, optimizer, device, episodes=1000):
    """
    Train the policy network.

    Args:
        policy (GaussianPolicyNetwork): Policy network.
        optimizer (torch.optim.Optimizer): Optimizer for training.
        device (str): Device to run the training on.
        episodes (int, optional): Number of episodes to train for. Defaults to 1000.
    """
    policy = policy.to(device)
    
    env = ContEnvWrapper()
    gamma = 0.99
    
    for episode in range(episodes):
        log_probs = []
        rewards = []
        env.reset()
        state, _, _ = env.step(np.array([0,0,0]))
        
        state = torch.tensor(state).to(device)
        
        while True:
            action, log_prob = select_action(policy, state)
            next_state, reward, done = env.step([action.detach().cpu()])
            
            log_probs.append(log_prob)
            rewards.append(reward)
            
            if done:
                break
            
            state = torch.tensor(next_state).to(device)
        
        # Compute discounted rewards
        R = 0
        returns = []
        for r in reversed(rewards):
            R = r + gamma * R
            returns.insert(0, R)
        
        returns = torch.tensor(returns).to(device)
        returns = (returns - returns.mean()) / (returns.std() + 1e-7)  # Normalize
        
        # Update policy
        policy_loss = []
        for log_prob, R in zip(log_probs, returns):
            policy_loss.append(-log_prob * R)
        policy_loss = torch.cat([loss.view(1,-1) for loss in policy_loss]).sum()
        
        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()
        
        print(f"Episode {episode + 1}:\t Mean Reward = {np.mean(rewards)} \t Total Reward = {sum(rewards)} \t Total Steps = {len(rewards)}")

    env.close()

## Initialize the input and output dimensions

In [5]:
# Define input and output dimensions
input_dim = 578   # Pendulum state space
output_dim = 3 # Pendulum action space

# Initialize the policy network
policy = TransformerGaussianPolicy(input_dim, output_dim, nhead=4, num_encoder_layers=4)

# Set the device to CUDA if available, otherwise use CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Initialize the optimizer with the policy network parameters and learning rate
optimizer = optim.Adam(policy.parameters(), lr=1e-4)

# Train the policy network for a specified number of episodes
train_policy(policy, optimizer, device, episodes=10000)

  state_tensor = torch.tensor(state, dtype=torch.float32)


Episode 1:	 Mean Reward = 81.00816272177637 	 Total Reward = 40018.0323845575
Episode 2:	 Mean Reward = 88.04243897207677 	 Total Reward = 52649.37850530193
Episode 3:	 Mean Reward = -240.3504138454763 	 Total Reward = -95419.11429665406
Episode 4:	 Mean Reward = 24.41920174928198 	 Total Reward = 5054.774762101375
Episode 5:	 Mean Reward = 79.74944109981193 	 Total Reward = 34053.01134961967
Episode 6:	 Mean Reward = 77.58061861002169 	 Total Reward = 19162.41279667536
Episode 7:	 Mean Reward = 26.017684684833764 	 Total Reward = 10198.932396454882
Episode 8:	 Mean Reward = 89.29094096666134 	 Total Reward = 20894.080186198757
Episode 9:	 Mean Reward = 100.5287141734367 	 Total Reward = 36994.56681582472
Episode 10:	 Mean Reward = 45.67844967604238 	 Total Reward = 12835.644358967897
Episode 11:	 Mean Reward = 134.53806079865328 	 Total Reward = 65923.6497913401
Episode 12:	 Mean Reward = 95.4437001634494 	 Total Reward = 29396.6596503424
Episode 13:	 Mean Reward = 5.8659853143908 	 T

In [6]:
def test_agent():
    """
    Function to test an agent using the environment.
    """
    # Initialize the environment
    config = Config()
    env = ContEnvWrapper()
    env.reset()

    num_iterations = 1000
    policy = policy.to('cpu')

    obs, reward, done = env.step([0,0,0])
    with torch.no_grad():
        for i in range(num_iterations):
            # Select an action based on the current policy
            action, _ = select_action(policy, obs)

            # Take a step in the environment
            env.step(action)

            # Render the environment
            env.render()

            # Pause for a short time to visualize the agent's actions
            time.sleep(0.02)

    # Close the environment
    env.close()

# Call the test_agent function to execute the code
test_agent()