## PettinZoo

In [None]:
import pybullet as p
import pybullet_data
import numpy as np
import time
import functools

from gymnasium import spaces
from gymnasium.spaces import Discrete
from pettingzoo import AECEnv
from gymnasium.utils import seeding

from pettingzoo.utils import agent_selector

from utils.physics_helpers import (
    create_wall, create_cube_block, create_point_agent,
    distance_between_bodies
)


class PyBulletPushAECEnv(AECEnv):
    metadata = {"render_modes": ["human"], "name": "pybullet_push_aec_v0", "is_parallelizable": True}

    def __init__(self):
        super().__init__()
        self.numb_agents = 2
        self.agents = [f"agent_{i+1}" for i in range(self.numb_agents)]
        self.possible_agents = self.agents[:]

        self.action_spaces = {
            agent: spaces.Discrete(5)  # [noop, up, down, left, right]
            for agent in self.agents
        }
        self.observation_spaces = {
            agent: spaces.Box(low=-3, high=3, shape=(6,), dtype=np.float32)
            for agent in self.agents
        }

        self.agent_name_mapping = {agent: i for i, agent in enumerate(self.agents)}

        self._agent_selector = agent_selector(self.agents)
        self.agent_selection = None

        self.physics_client = p.connect(p.DIRECT)
        p.setAdditionalSearchPath(pybullet_data.getDataPath())

        self.render_mode = "human"
        self.time_step = 1. / 60.

    def _setup_world(self):
        if p.getConnectionInfo()['isConnected'] == 0:
            p.connect(p.DIRECT)  # Use DIRECT mode for headless training (no GUI)
        p.resetSimulation()
        p.setGravity(0, 0, -9.8)
        p.loadURDF("plane.urdf")

        create_wall(0, 3.0, 3.0, 0.05)
        create_wall(0, -3.0, 3.0, 0.05)
        create_wall(-3.0, 0.0, 0.05, 3.0)
        create_wall(3.0, 0.0, 0.05, 3.0)
        create_wall(0, 1.7, 0.05, 1.3)
        create_wall(0, -1.7, 0.05, 1.3)

        self.cube_id = create_cube_block(-2, 0)
        self.agent_ids = {
            "agent_1": create_point_agent(-2.5, 0.5, color=[1, 0, 0, 1]),
            "agent_2": create_point_agent(-2.5, -0.5, color=[0, 0, 1, 1])
        }

    def reset(self, seed=None, options=None):
        self._setup_world()
        if seed is not None:
            self.np_random, self.np_random_seed = seeding.np_random(seed)

        self.agents = self.possible_agents[:]
        self.rewards = {agent: 0 for agent in self.agents}
        self._cumulative_rewards = {agent: 0 for agent in self.agents}
        self.terminations = {agent: False for agent in self.agents}
        self.truncations = {agent: False for agent in self.agents}
        self.infos = {agent: {} for agent in self.agents}
        self._agent_steps = {agent: 0 for agent in self.agents}

        self._agent_selector = agent_selector(self.agents)
        self.agent_selection = self._agent_selector.reset()

    def step(self, action):
        if self.terminations[self.agent_selection] or self.truncations[self.agent_selection]:
            self._was_dead_step(action)
            return

        agent = self.agent_selection
        agent_id = self.agent_ids[agent]
        # Clears the instant rewards for the current agent from prev iteration before applying the action
        self._clear_rewards()
        success_bonus = 100.0  # Reward bonus for reaching the goal
        vx, vy = 0, 0
        speed = 0.5
        if action == 1: vy = speed
        elif action == 2: vy = -speed
        elif action == 3: vx = -speed
        elif action == 4: vx = speed

        p.resetBaseVelocity(agent_id, [vx, vy, 0])
        p.stepSimulation()
        time.sleep(self.time_step)

        # Reward based on distance to cube
        dist = distance_between_bodies(agent_id, self.cube_id)
        self.rewards[agent] = -dist
        # self._cumulative_rewards[agent] += self.rewards[agent]
        print("AT ", self._agent_steps[agent] , agent,"took action: ", action ,"REW:", self.rewards[agent], "CUM_REW:", self._cumulative_rewards[agent])

         # Termination Check
        goal_position,_ =  p.getBasePositionAndOrientation(self.cube_id)
        goal_threshold = 0.3
        agent_pos, _ = p.getBasePositionAndOrientation(agent_id)
        cube_to_goal_distance = np.linalg.norm(np.array(agent_pos[:2]) - np.array(goal_position[:2]))

        if cube_to_goal_distance < goal_threshold:
            self.rewards[agent] += success_bonus
            self.terminations[agent] = True
            self.truncations[agent] = False
        else:
            self.terminations[agent] = False

        self._accumulate_rewards()
        
        # Truncation Check
        max_steps = 500
        if self._agent_steps[agent] >= max_steps:
            self.truncations[agent] = True
            self.terminations[agent] = False
        else:
            self.truncations[agent] = False

        # Increment step counter
        self._agent_steps[agent] += 1
        
        
        self.agent_selection = self._agent_selector.next()
        
        if self.render_mode == "human":
            self.render()


    @functools.lru_cache(maxsize=None)
    def action_space(self, agent):
        # We can seed the action space to make the environment deterministic.
        return Discrete(5, seed=self.np_random_seed)
    def observe(self, agent):
        agent_ids = self.agent_ids.copy()
        current_agent_pos, _ = p.getBasePositionAndOrientation(agent_ids[agent])
        cube_pos, _ = p.getBasePositionAndOrientation(self.cube_id)

        other_agent_poses = []
        for other_agent in self.agents:
            if other_agent != agent:
                pos, _ = p.getBasePositionAndOrientation(agent_ids[other_agent])
                other_agent_poses.extend(pos[:2])

        obs = np.array(
            list(current_agent_pos[:2]) + list(cube_pos[:2]) + other_agent_poses,
            dtype=np.float32
        )
        return obs

    def render(self):
        pass

    def close(self):
        p.disconnect()


: 

In [7]:
kk= PyBulletPushAECEnv()

In [8]:
print(kk.agents)
print(kk.possible_agents)
print(kk.max_num_agents)
print(kk.action_spaces)
print(kk.observation_spaces)
print(kk.agent_name_mapping)
print(kk._agent_selector)
print(kk.agent_selection)
print(kk.metadata)
print(kk.physics_client) 
print(kk.render_mode)   

['agent_1', 'agent_2']
['agent_1', 'agent_2']
2
{'agent_1': Discrete(5), 'agent_2': Discrete(5)}
{'agent_1': Box(-3.0, 3.0, (6,), float32), 'agent_2': Box(-3.0, 3.0, (6,), float32)}
{'agent_1': 0, 'agent_2': 1}
<pettingzoo.utils.agent_selector.agent_selector object at 0x00000254636ECF10>
None
{'render_modes': ['human'], 'name': 'pybullet_push_aec_v0'}
2
human


In [38]:
kk.reset()
print(kk.agents)
print(kk._agent_selector)
print(kk.agent_selection)

['agent_1', 'agent_2']
<pettingzoo.utils.agent_selector.agent_selector object at 0x000001B5B96A9C90>
agent_1


In [11]:
print(kk._agent_selector.next())
# print(kk.agent_selection)

agent_2


In [7]:
import functools


In [27]:
env = PyBulletPushAECEnv()
env.reset()
for agent in env.agent_iter():
    obs, reward, done, trunc, info = env.last()
    print(f"[DEBUG] {agent=} reward={reward} expected={env.rewards[agent]}")
    env.step(env.action_space(agent).sample())


[DEBUG] agent='agent_1' reward=0 expected=0
REW: -0.7088722243800007 CUM_REW: -0.7088722243800007
[DEBUG] agent='agent_1' reward=-0.7088722243800007 expected=-0.7088722243800007
REW: -0.7102826044689532 CUM_REW: -1.419154828848954
[DEBUG] agent='agent_1' reward=-1.419154828848954 expected=-0.7102826044689532
REW: -0.7103254191257218 CUM_REW: -2.1294802479746755
[DEBUG] agent='agent_1' reward=-2.1294802479746755 expected=-0.7103254191257218
REW: -0.7103375717027383 CUM_REW: -2.839817819677414
[DEBUG] agent='agent_1' reward=-2.839817819677414 expected=-0.7103375717027383
REW: -0.711745575345585 CUM_REW: -3.551563395022999
[DEBUG] agent='agent_1' reward=-3.551563395022999 expected=-0.711745575345585
REW: -0.7103434042015684 CUM_REW: -4.261906799224567
[DEBUG] agent='agent_1' reward=-4.261906799224567 expected=-0.7103434042015684
REW: -0.7117506988387946 CUM_REW: -4.973657498063361
[DEBUG] agent='agent_1' reward=-4.973657498063361 expected=-0.7117506988387946
REW: -0.7117821938643286 CUM_R

KeyboardInterrupt: 

In [6]:
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv, VecMonitor
from pettingzoo.utils.conversions import aec_to_parallel
from supersuit import pettingzoo_env_to_vec_env_v1, concat_vec_envs_v1
# from envs.pybullet_push_env import PyBulletPushAECEnv

def make_env():
    # 1️⃣ Create and convert PettingZoo env to parallel
    parallel_env = aec_to_parallel(PyBulletPushAECEnv())

    # 2️⃣ Convert to vectorized env
    vec_env = pettingzoo_env_to_vec_env_v1(parallel_env)
    vec_env = concat_vec_envs_v1(vec_env, num_vec_envs=4, num_cpus=1, base_class='stable_baselines3')

    # 3️⃣ Add VecMonitor for logging
    vec_env = VecMonitor(vec_env)
    # env = PyBulletPushAECEnv()
    # env = aec_to_parallel(env)
    # env = pettingzoo_parallel_to_gym_v1(env)
    return vec_env


def train():
    env = make_env()

    model = PPO(
        policy="MlpPolicy",
        env=env,
        verbose=1,
        tensorboard_log="./ppo_push_tensorboard/"
    )

    model.learn(total_timesteps=200000)
    model.save("ppo_pybullet_push")
    env.close()

if __name__ == "__main__":
    train()


Using cpu device
Logging to ./ppo_push_tensorboard/PPO_1
AT  0 agent_1 took action:  4 REW: -0.7074610233354057 CUM_REW: 0
AT  0 agent_2 took action:  1 REW: -0.7074642862423216 CUM_REW: 0
AT  0 agent_1 took action:  4 REW: -0.7047139693605335 CUM_REW: 0
AT  0 agent_2 took action:  3 REW: -0.7075247751636603 CUM_REW: 0
AT  0 agent_1 took action:  0 REW: -0.7033148887028186 CUM_REW: 0
AT  0 agent_2 took action:  4 REW: -0.7074340158386568 CUM_REW: 0
AT  0 agent_1 took action:  4 REW: -0.7017996083581028 CUM_REW: 0
AT  0 agent_2 took action:  1 REW: -0.7046592602989967 CUM_REW: 0
AT  1 agent_1 took action:  3 REW: -0.701859165376428 CUM_REW: -0.7074610233354057
AT  1 agent_2 took action:  3 REW: -0.704725437655998 CUM_REW: -0.7074642862423216
AT  1 agent_1 took action:  3 REW: -0.7045943004977925 CUM_REW: -0.7047139693605335
AT  1 agent_2 took action:  4 REW: -0.7046179681153023 CUM_REW: -0.7075247751636603
AT  1 agent_1 took action:  2 REW: -0.7045224640502681 CUM_REW: -0.70331488870281

ValueError: when an agent is dead, the only valid action is None

In [None]:
# from envs.pybullet_push_env import PyBulletPushAECEnv

def train_random_policy():
    env = PyBulletPushAECEnv(render_mode="human")
    env.reset(seed=42)

    episode_rewards = {agent: 0 for agent in env.possible_agents}

    for agent in env.agent_iter():
        obs, rew, terminated, truncated, info = env.last()

        # Logging cumulative reward
        episode_rewards[agent] += rew

        if terminated or truncated:
            action = None
        else:
            # Sample random action (placeholder for future policy)
            action = env.action_space(agent).sample()

        env.step(action)

    env.close()
    print("Episode reward summary:")
    for agent, r in episode_rewards.items():
        print(f"{agent}: {r:.2f}")

if __name__ == "__main__":
    train_random_policy()
