### 1. Environment Check

In [14]:
from pettingzoo.atari import boxing_v2
import numpy as np
import random
import torch
import torch.nn as nn
import torch.optim as optim


In [15]:
# Initialize the Boxing environment
env = boxing_v2.env(render_mode="rgb_array") 
env.reset(seed = 42)

In [None]:
# Define a simple random policy for demonstration
# def random_policy(observation, action_space):
#     return action_space().sample()

In [17]:
# for agent in env.agent_iter():
#     observation, reward, termination, truncation, info = env.last()

#     if termination or truncation:
#         action = None
#     else:
#         action = env.action_space(agent).sample()
#     env.step(action)
#     env.render()
# env.close()

### 2. Define PPO Algorithm

In [None]:
from history.networks import PPOAgent
from history.buffers import RolloutBuffer
from history.ppo import PPO

In [19]:
# check torch installation
# !python -c "import torch; print(torch.__version__)"

Testing the PPOAgent and Buffer

In [20]:
# # Test setup
# def test_agent_and_buffer():
#     # Initialize environment
#     env = boxing_v2.env(render_mode="rgb_array")
#     env.reset()

#     # Observation shape and action space
#     obs_shape = (3, 210, 160)  # Example shape for Atari observations
#     action_space = env.action_space("first_0")

#     # Initialize agent and buffer
#     agent = PPOAgent(obs_shape, action_space)
#     buffer = RolloutBuffer()

#     # Test environment interaction
#     print("Testing interaction with Boxing environment...")
#     for agent_name in env.agent_iter(10):  # Interact for 10 steps
#         obs, reward, termination, truncation, info = env.last()

#         if obs is not None:
#             # Normalize the observation for the network
#             obs_tensor = torch.tensor(obs / 255.0, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0)  # (C, H, W)
#             action_probs = agent.forward_policy(obs_tensor)
#             action = torch.multinomial(action_probs, 1).item()  # Sample action
#             log_prob = torch.log(action_probs.squeeze(0)[action])  # Compute log-prob

#             # Store data in the buffer
#             buffer.store(obs, action, log_prob.item(), reward, termination or truncation)

#             # Print action probabilities and chosen action
#             print(f"Action Probabilities: {action_probs.detach().numpy()}")
#             print(f"Chosen Action: {action}, Log Prob: {log_prob.item()}")

#         env.step(action if not termination and not truncation else None)

#     # Check buffer contents
#     print("\nTesting RolloutBuffer contents...")
#     print(f"Observations: {len(buffer.observations)}")
#     print(f"Actions: {len(buffer.actions)}")
#     print(f"Log Probs: {len(buffer.log_probs)}")
#     print(f"Rewards: {len(buffer.rewards)}")
#     print(f"Dones: {len(buffer.dones)}")

#     # Test forward pass through the value network
#     print("\nTesting value network...")
#     value = agent.forward_value(obs_tensor)
#     print(f"Value Estimate: {value.item()}")

#     # Clear the buffer
#     buffer.clear()
#     print("\nRolloutBuffer cleared. Current size:", len(buffer.observations))


In [21]:
# test_agent_and_buffer()

In [22]:
# import torch
# import numpy as np
# from pettingzoo.atari import boxing_v2

# # Test configuration
# obs_shape = (3, 210, 160)  # Atari observation shape
# action_space = type('', (), {'n': 6})()  # Mock action space with 6 actions
# num_steps = 10  # Number of steps in the rollout

# # Initialize PPO and buffer
# ppo = PPO(obs_shape, action_space)
# buffer = RolloutBuffer()

# # Initialize environment
# env = boxing_v2.env(render_mode="rgb_array")
# env.reset()

# # Interaction loop
# print("Starting environment interaction...")
# for step in range(num_steps):
#     for agent_name in env.agent_iter():
#         obs, reward, termination, truncation, info = env.last()
#         if obs is not None:
#             # Normalize observation and process it
#             obs_tensor = torch.tensor(obs / 255.0, dtype=torch.float32).permute(2, 0, 1).unsqueeze(0)  # (C, H, W)
#             action_probs = ppo.policy.forward_policy(obs_tensor)  # Get action probabilities
#             action = torch.multinomial(action_probs, 1).item()  # Sample an action
#             log_prob = torch.log(action_probs.squeeze(0)[action])  # Compute log-probability

#             # Store step in the buffer
#             buffer.store(obs, action, log_prob.item(), reward, termination or truncation)

#             # Step in the environment
#             env.step(action if not termination and not truncation else None)

#         if termination or truncation:
#             break  # Break the loop if the environment ends

# # Compute returns and advantages
# print("Computing returns and advantages...")
# buffer.compute_returns_and_advantages(ppo.policy, ppo.gamma, ppo.gae_lambda)

# # Perform an update
# print("Updating the PPO model...")
# ppo.update(buffer)

# # Print buffer statistics
# print("Buffer Summary:")
# print(f"Observations: {len(buffer.observations)}")
# print(f"Actions: {len(buffer.actions)}")
# print(f"Log Probs: {len(buffer.log_probs)}")
# print(f"Rewards: {len(buffer.rewards)}")
# print(f"Returns: {len(buffer.returns)}")
# print(f"Advantages: {len(buffer.advantages)}")

# # Clear the buffer for the next episode
# buffer.clear()
# print("Buffer cleared. Current size:", len(buffer.observations))

# # Test the forward pass after updates
# dummy_obs = torch.randn(1, *obs_shape)  # Random observation
# policy_output = ppo.policy.forward_policy(dummy_obs)
# value_output = ppo.policy.forward_value(dummy_obs)

# print("Policy Output Shape:", policy_output.size())
# print("Value Output Shape:", value_output.size())


### 3. Wrapper for Pettingzoo 

In [23]:
from supersuit import pad_observations_v0, pad_action_space_v0, resize_v1, normalize_obs_v0, frame_stack_v1, dtype_v0
from pettingzoo.utils import aec_to_parallel

In [24]:
env = boxing_v2.env()
env = pad_observations_v0(env)
env = pad_action_space_v0(env)
env = resize_v1(env, 84, 84)  # Resize frames to 84x84
env = dtype_v0(env, dtype="float32")  # Convert observations to float32
env = normalize_obs_v0(env, env_min=0, env_max=1)  # Normalize pixel values
env = frame_stack_v1(env, 4)  # Stack 4 frames
parallel_env = aec_to_parallel(env)  # Convert to parallel format


In [25]:
# Step 2: Initialize PPO and RolloutBuffer
obs_shape = (4, 84, 84)  # Stacked frames
action_space = env.action_space("first_0")  # Example action space for an agent
ppo = PPO(obs_shape, action_space)
buffer = RolloutBuffer()

Shape after conv layers: torch.Size([1, 64, 9, 9])


In [27]:
# Step 3: Training Loop
num_episodes = 1000
max_steps_per_episode = 1000  # Maximum steps to prevent infinite loops

for episode in range(num_episodes):
    # Reset the environment
    observations = parallel_env.reset()

    # Check and extract nested observations (e.g., from Agent 0)
    if isinstance(observations, tuple) and len(observations) > 0:
        agent_observations = observations[0]  # Extract observations from the first tuple element
    else:
        raise ValueError("Unexpected observation structure in parallel environment.")

    # Initialize done flags for each agent
    done = {agent: False for agent in agent_observations.keys()}
    step = 0

    while not all(done.values()) and step < max_steps_per_episode:
        actions = {}
        log_probs = {}

        # Process observations for each agent
        for agent, obs in agent_observations.items():
            # Debug observation shape
            # print(f"Agent: {agent}, Raw Observation Shape: {obs.shape}")

            if obs.shape[-1] == 12:
                obs = obs.reshape(84, 84, 4, 3)  # Reshape to (H, W, Frames, RGB)
                obs = obs.mean(axis=-1)  # Convert to grayscale by averaging RGB channels
                obs = obs.transpose(2, 0, 1)  # Rearrange to (Frames, H, W)

            obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0)  # (C, H, W)
            # print(f"Agent: {agent}, Tensor Shape: {obs_tensor}")

            # Get action probabilities
            action_probs = ppo.policy.forward_policy(obs_tensor)
            action = torch.multinomial(action_probs, 1).item()  # Sample action
            log_probs[agent] = torch.log(action_probs.squeeze(0)[action])  # Log probability
            actions[agent] = action  # Store action

        # Step the environment
        step_output = parallel_env.step(actions)

        if len(step_output) == 5:  # Handle truncations
            next_observations, rewards, dones, truncations, infos = step_output
            dones = {agent: dones[agent] or truncations[agent] for agent in dones}
        else:
            next_observations, rewards, dones, infos = step_output

        # Directly use next_observations as agent_observations
        agent_observations = next_observations

        # Store data in the buffer for each agent
        for agent, obs in agent_observations.items():
            buffer.store(obs, actions[agent], log_probs[agent].item(), rewards[agent], dones[agent])

        # Update done flags
        done = dones
        step += 1

    # Compute Returns and Advantages
    print(f"Episode {episode + 1}: Computing returns and advantages...")
    buffer.compute_returns_and_advantages(ppo.policy, ppo.gamma, ppo.gae_lambda)

    # Update PPO
    print(f"Episode {episode + 1}: Updating PPO model...")
    ppo.update(buffer)

    # Clear buffer for the next episode
    buffer.clear()

    # Log progress
    print(f"Episode {episode + 1}/{num_episodes} completed.")


Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shape in Store: (84, 84, 12)
Observation Shap

  return F.mse_loss(input, target, reduction=self.reduction)


RuntimeError: The size of tensor a (11802) must match the size of tensor b (3934) at non-singleton dimension 0

Obs Shape: (84, 84, 12)


NameError: name 'values' is not defined

In [None]:
# for agent, obs in agent_observations.items():
#     print(f"Agent: {agent}, Observation Type: {type(obs)}, Shape: {obs.shape if isinstance(obs, np.ndarray) else 'Not a NumPy Array'}")


Agent: first_0, Observation Type: <class 'numpy.ndarray'>, Shape: (84, 84, 12)
Agent: second_0, Observation Type: <class 'numpy.ndarray'>, Shape: (84, 84, 12)


In [None]:
# # Debug parallel environment
# from pettingzoo.utils.conversions import aec_to_parallel

# # Convert AEC environment to parallel format
# parallel_env = aec_to_parallel(env)

# # Verify the type
# print(type(parallel_env))


In [None]:
# # Reset and inspect observations
# observations = parallel_env.reset()
# print("Observations type:", type(observations))
# print("Observations:", observations)


In [None]:
# for agent_idx, obs in enumerate(observations):
#     print(f"Agent {agent_idx}, Observation: {obs}")

### 4.Self-Play Setup -

### 5. PPO Training Framework

### 6. Advantage Estimation

### 7. Logging and Monitoring

### 8. Evaluation

In [None]:
# obs_shape = (3, 210, 160)
# action_space = type('', (), {'n': 6})()  # Mock action space with 6 actions

# agent = PPOAgent(obs_shape, action_space)
# dummy_obs = torch.randn(1, *obs_shape)  # Create a random input tensor
# policy_output = agent.forward_policy(dummy_obs)
# value_output = agent.forward_value(dummy_obs)

# print("Policy Output Shape:", policy_output.size())
# print("Value Output Shape:", value_output.size())


In [None]:
# 