<a href="https://colab.research.google.com/github/kuds/rl-atari-tennis/blob/main/%5BAtari%20Tennis%5D%20Reinforcement%20Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Atari Tennis

In [None]:
!pip install swig

In [None]:
!pip install gymnasium gymnasium[atari] pettingzoo multi-agent-ale-py autorom

In [None]:
!AutoROM --accept-license

In [None]:
!pip install ray[rllib] pymunk

In [None]:
!pip install supersuit stable-baselines3

In [None]:
import platform
import ray
import supersuit
import torch
import numpy
import gymnasium as gym
from pettingzoo.atari import tennis_v3
from stable_baselines3 import PPO
from stable_baselines3.common.atari_wrappers import (
    NoopResetEnv, MaxAndSkipEnv, EpisodicLifeEnv,
    FireResetEnv, WarpFrame, ClipRewardEnv
)
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv
from importlib.metadata import version
import time

In [None]:
# Environment report: each entry pairs the printed label with a zero-argument
# probe, so values are looked up one at a time in the order they are printed.
runtime_report = (
    ("Python Version", platform.python_version),
    ("Torch Version", lambda: version('torch')),
    ("Is Cuda Available", torch.cuda.is_available),
    ("Cuda Version", lambda: torch.version.cuda),
    ("Gymnasium Version", lambda: version('gymnasium')),
    ("Numpy Version", lambda: version('numpy')),
    ("Stable Baselines3 Version", lambda: version('stable_baselines3')),
    ("Supersuit Version", lambda: version('supersuit')),
    ("PettingZoo Version", lambda: version('pettingzoo')),
    ("Ray Version", lambda: version('ray')),
)
for label, probe in runtime_report:
    print(f"{label}: {probe()}")

In [None]:
def make_env(env_id):
    """Build one single-agent Atari environment with SB3's Atari preprocessing.

    Args:
        env_id: Gymnasium environment id, e.g. ``"ALE/Tennis-v5"``.

    Returns:
        The wrapped environment. Every call creates a new instance, so this
        function can be used as a factory for a vectorized environment.
        Seeding is left to the caller.
    """
    # The Stable-Baselines3 Atari wrappers are Gymnasium wrappers and call
    # env.unwrapped.get_action_meanings(). PettingZoo's multi-agent tennis_v3
    # does not provide that method (see the AttributeError recorded in this
    # notebook), so the environment is created through Gymnasium from env_id.
    env = gym.make(env_id, render_mode="rgb_array")
    # NOTE(review): ALE "v5" ids may already skip frames internally; if so,
    # MaxAndSkipEnv(skip=4) compounds with that — confirm the intended skip.
    env = NoopResetEnv(env, noop_max=30)
    env = MaxAndSkipEnv(env, skip=4)
    env = EpisodicLifeEnv(env)
    # Only games that expose a FIRE action need the fire-on-reset wrapper.
    if 'FIRE' in env.unwrapped.get_action_meanings():
        env = FireResetEnv(env)
    env = WarpFrame(env)
    env = ClipRewardEnv(env)
    return env

def evaluate_agent(num_episodes=3, render_mode="human"):
    """Run the saved PPO agent on Atari Tennis for a fixed number of episodes.

    Args:
        num_episodes: Number of finished episodes to play before returning.
        render_mode: Render mode passed to ``gym.make``. ``"human"`` needs a
            local display; use ``"rgb_array"`` or ``None`` on headless hosts.

    Returns:
        A list with one entry per finished episode: the sum of the rewards
        seen by the agent (after ``ClipRewardEnv``).
    """
    env_id = "ALE/Tennis-v5"

    def _make_eval_env():
        # Created inside the factory so the vectorized env owns its instance.
        single_env = gym.make(env_id, render_mode=render_mode)
        single_env = MaxAndSkipEnv(single_env, skip=4)
        single_env = EpisodicLifeEnv(single_env)
        if 'FIRE' in single_env.unwrapped.get_action_meanings():
            single_env = FireResetEnv(single_env)
        single_env = WarpFrame(single_env)
        single_env = ClipRewardEnv(single_env)
        return single_env

    # Stack frames
    env = DummyVecEnv([_make_eval_env])
    env = VecFrameStack(env, n_stack=4)

    episode_returns = []
    try:
        # Load the trained model
        model = PPO.load("ppo_atari_tennis")

        obs = env.reset()
        running_return = 0.0
        while len(episode_returns) < num_episodes:
            action, _ = model.predict(obs)
            obs, rewards, dones, infos = env.step(action)
            running_return += float(rewards[0])
            # The vectorized env resets itself when an episode ends, so `obs`
            # already belongs to the next episode; no manual reset is needed.
            if dones[0]:
                episode_returns.append(running_return)
                running_return = 0.0
    finally:
        # Always release the emulator/window, even on error or interrupt.
        env.close()

    return episode_returns

In [None]:
# Environment ID for Atari Tennis
env_id = "ALE/Tennis-v5"

# Number of parallel environments (increase for faster training)
num_envs = 8  # You can adjust this number

# Create the vectorized environment.
# Each factory must build its OWN environment: returning one shared instance
# from every lambda would make all sub-environments step the same emulator.
env = DummyVecEnv([lambda: make_env(env_id) for _ in range(num_envs)])

# Stack frames (for temporal information)
env = VecFrameStack(env, n_stack=4)

# Create the PPO agent with CNN policy (since observations are images)
model = PPO("CnnPolicy", env, verbose=1)

# Train the agent
total_timesteps = 10_000_000  # Adjust as needed
model.learn(total_timesteps=total_timesteps)

# Save the model
model.save("ppo_atari_tennis")

# Close the environment
env.close()

# Evaluate the trained agent
evaluate_agent()

AttributeError: 'ParallelAtariEnv' object has no attribute 'get_action_meanings'

In [None]:
from pettingzoo.atari import tennis_v3

#Environments can be interacted with in a manner very similar to Gymnasium:

# Build the environment in this cell instead of relying on whatever `env`
# an earlier cell left behind (Restart & Run All safe).
env = tennis_v3.env()
env.reset(seed=42)
for agent in env.agent_iter():
    observation, reward, termination, truncation, info = env.last()
    # A finished agent must be stepped with None.
    action = None if termination or truncation else env.action_space(agent).sample()  # this is where you would insert your policy
    env.step(action)
env.close()

In [None]:
# NOTE(review): NoopResetEnv comes from stable_baselines3.common.atari_wrappers,
# while tennis_v3.env() comes from PettingZoo. The AttributeError recorded in
# this notebook's output ("'ParallelAtariEnv' object has no attribute
# 'get_action_meanings'") presumably comes from exactly this pairing — confirm,
# then drop this cell or switch to SuperSuit preprocessing.
env = tennis_v3.env()
env = NoopResetEnv(env, noop_max=30)

In [None]:
"""Uses Stable-Baselines3 to train agents in the Knights-Archers-Zombies environment using SuperSuit vector envs.

This environment requires using SuperSuit's Black Death wrapper, to handle agent death.

For more information, see https://stable-baselines3.readthedocs.io/en/master/modules/ppo.html

Author: Elliot (https://github.com/elliottower)
"""
from __future__ import annotations

import glob
import os
import time

import supersuit as ss
from stable_baselines3 import PPO
from stable_baselines3.ppo import CnnPolicy, MlpPolicy

from pettingzoo.butterfly import knights_archers_zombies_v10


def train(env_fn, steps: int = 10_000, seed: int | None = 0, **env_kwargs):
    """Train one PPO model that acts for every agent of a PettingZoo environment.

    Args:
        env_fn: Environment module exposing ``parallel_env(**env_kwargs)``.
        steps: Total number of timesteps passed to ``model.learn``.
        seed: Seed used for the environment reset.
        **env_kwargs: Forwarded to ``env_fn.parallel_env``.

    The model is saved under ``<environment name>_<timestamp>``.
    """
    parallel_env = env_fn.parallel_env(**env_kwargs)

    # Black death keeps the agent count constant; MarkovVectorEnv does not
    # support a varying number of active agents without it.
    parallel_env = ss.black_death_v3(parallel_env)

    # SuperSuit preprocessing, only needed for image observations:
    # one colour channel, 84x84 frames, 3 stacked frames.
    visual_observation = not parallel_env.unwrapped.vector_state
    if visual_observation:
        parallel_env = ss.color_reduction_v0(parallel_env, mode="B")
        parallel_env = ss.resize_v1(parallel_env, x_size=84, y_size=84)
        parallel_env = ss.frame_stack_v1(parallel_env, 3)

    parallel_env.reset(seed=seed)

    print(f"Starting training on {str(parallel_env.metadata['name'])}.")

    # Turn the multi-agent env into an SB3-compatible vector env (8 copies).
    vec_env = ss.pettingzoo_env_to_vec_env_v1(parallel_env)
    vec_env = ss.concat_vec_envs_v1(
        vec_env, 8, num_cpus=1, base_class="stable_baselines3"
    )

    # CNN policy for image observations, MLP policy for vector observations.
    policy_cls = CnnPolicy if visual_observation else MlpPolicy
    model = PPO(policy_cls, vec_env, verbose=3, batch_size=256)

    model.learn(total_timesteps=steps)

    save_path = (
        f"{vec_env.unwrapped.metadata.get('name')}_{time.strftime('%Y%m%d-%H%M%S')}"
    )
    model.save(save_path)

    print("Model has been saved.")

    print(f"Finished training on {str(vec_env.unwrapped.metadata['name'])}.")

    vec_env.close()


def eval(env_fn, num_games: int = 100, render_mode: str | None = None, **env_kwargs):
    """Evaluate the most recently saved policy against a random agent.

    The first entry of ``env.possible_agents`` acts randomly; every other
    agent acts with the loaded PPO model.

    Args:
        env_fn: Environment module exposing ``env(render_mode=..., **env_kwargs)``.
        num_games: Number of games to play.
        render_mode: Render mode forwarded to the environment.
        **env_kwargs: Forwarded to ``env_fn.env``.

    Returns:
        The accumulated reward summed over agents, divided by the number of agents.

    Raises:
        SystemExit: With code 0 when no saved policy ``<env name>*.zip`` exists.
    """
    env = env_fn.env(render_mode=render_mode, **env_kwargs)

    # Pre-process using SuperSuit
    visual_observation = not env.unwrapped.vector_state
    if visual_observation:
        # If the observation space is visual, reduce the color channels, resize from 512px to 84px, and apply frame stacking
        env = ss.color_reduction_v0(env, mode="B")
        env = ss.resize_v1(env, x_size=84, y_size=84)
        env = ss.frame_stack_v1(env, 3)

    print(
        f"\nStarting evaluation on {str(env.metadata['name'])} (num_games={num_games}, render_mode={render_mode})"
    )

    try:
        # Newest checkpoint wins; max() raises ValueError on an empty glob.
        latest_policy = max(
            glob.glob(f"{env.metadata['name']}*.zip"), key=os.path.getctime
        )
    except ValueError:
        print("Policy not found.")
        # Raise explicitly instead of calling the interactive `exit()` helper:
        # under an IPython kernel that helper only requests a shutdown and
        # returns, so execution would continue with `latest_policy` unbound.
        raise SystemExit(0)

    model = PPO.load(latest_policy)

    rewards = {agent: 0 for agent in env.possible_agents}

    # Note: we evaluate here using an AEC environments, to allow for easy A/B testing against random policies
    # For example, we can see here that using a random agent for archer_0 results in less points than the trained agent
    for i in range(num_games):
        env.reset(seed=i)
        env.action_space(env.possible_agents[0]).seed(i)

        for agent in env.agent_iter():
            obs, reward, termination, truncation, info = env.last()

            for a in env.agents:
                rewards[a] += env.rewards[a]

            if termination or truncation:
                break
            else:
                if agent == env.possible_agents[0]:
                    # Baseline: the first agent samples random actions.
                    act = env.action_space(agent).sample()
                else:
                    act = model.predict(obs, deterministic=True)[0]
            env.step(act)
    env.close()

    avg_reward = sum(rewards.values()) / len(rewards.values())
    avg_reward_per_agent = {
        agent: rewards[agent] / num_games for agent in env.possible_agents
    }
    print(f"Avg reward: {avg_reward}")
    print("Avg reward per agent, per game: ", avg_reward_per_agent)
    print("Full rewards: ", rewards)
    return avg_reward


if __name__ == "__main__":
    env_fn = knights_archers_zombies_v10

    # vector_state=False switches to visual observations (significantly longer training time)
    env_kwargs = {"max_cycles": 100, "max_zombies": 4, "vector_state": True}

    # Train a model (takes ~5 minutes on a laptop CPU)
    train(env_fn, steps=81_920, seed=0, **env_kwargs)

    # First evaluate 10 games without rendering, then watch 2 games
    # (each pass takes ~10 seconds on a laptop CPU)
    for num_games, render_mode in ((10, None), (2, "human")):
        eval(env_fn, num_games=num_games, render_mode=render_mode, **env_kwargs)

In [None]:
import supersuit as ss
from pettingzoo.atari import tennis_v3
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import VecMonitor
from pettingzoo.utils.conversions import aec_to_parallel


def preprocess_tennis(env):
    """Apply the SuperSuit preprocessing shared by training and evaluation.

    Used on both the parallel (training) and the AEC (evaluation) flavour of
    the environment so the policy sees identically shaped observations.
    """
    env = ss.max_observation_v0(env, 2)          # max over the last 2 frames
    env = ss.pad_action_space_v0(env)            # same action space for every agent
    env = ss.color_reduction_v0(env, mode="B")   # keep a single colour channel
    env = ss.resize_v1(env, x_size=84, y_size=84)
    env = ss.frame_stack_v1(env, 4)              # stack frames for temporal information
    return env


# Create the PettingZoo environment (parallel API, needed for vectorization)
env = preprocess_tennis(tennis_v3.parallel_env())

# Stable Baselines3 trains on vectorized single-agent environments, not on
# PettingZoo environments: expose every agent as one sub-environment so both
# players are driven by the same policy (self-play).
env = ss.pettingzoo_env_to_vec_env_v1(env)
env = ss.concat_vec_envs_v1(env, 4, num_cpus=1, base_class="stable_baselines3")
env = VecMonitor(env)  # Monitor to keep track of rewards and other info

# Create the model using the CNN policy for processing image observations
model = PPO('CnnPolicy', env, verbose=1)

# Train the model
model.learn(total_timesteps=500000)

# Save the model
model.save("ppo_tennis_selfplay")
env.close()

# Load the trained model
model = PPO.load("ppo_tennis_selfplay")

# Evaluation loop (AEC API, same preprocessing as training).
# No render call: the environment is created without a render mode.
env = preprocess_tennis(tennis_v3.env())
env.reset()
episode_returns = {agent: 0.0 for agent in env.possible_agents}

for agent in env.agent_iter():
    # The AEC API returns five values; termination and truncation are separate.
    observation, reward, termination, truncation, info = env.last()
    episode_returns[agent] += reward
    if termination or truncation:
        action = None
    else:
        # Use the trained model to predict actions
        action, _ = model.predict(observation, deterministic=True)
        action = int(action)
    env.step(action)

env.close()
print("Episode returns: ", episode_returns)

In [None]:
import ray
import supersuit as ss
from ray import tune
from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv
from pettingzoo.atari import tennis_v3
from ray.tune.registry import register_env
# `ray.rllib.agents` is the legacy package path; the algorithm class lives in
# `ray.rllib.algorithms.ppo`. It is aliased so the notebook-wide
# Stable-Baselines3 `PPO` name is not shadowed.
from ray.rllib.algorithms.ppo import PPO as PPOTrainer

# Initialize Ray (tolerate an instance left running by an earlier cell)
ray.init(ignore_reinit_error=True)

# Environment creator function
def env_creator(config):
    """Build the preprocessed two-player Tennis AEC environment."""
    env = tennis_v3.env()
    # NOTE(review): RLlib's default vision network presumably has no preset for
    # raw 210x160 Atari frames; 84x84 inputs are used instead — verify against
    # the installed Ray version.
    env = ss.color_reduction_v0(env, mode="B")
    env = ss.resize_v1(env, x_size=84, y_size=84)
    env = ss.frame_stack_v1(env, 4)
    return env

# Register the environment with RLlib
register_env("tennis_v3", lambda config: PettingZooEnv(env_creator(config)))

# Read the per-agent spaces from the PettingZoo environment itself; what the
# RLlib wrapper exposes as `observation_space` differs between Ray versions.
temp_env = env_creator({})
first_agent = temp_env.possible_agents[0]
obs_space = temp_env.observation_space(first_agent)
act_space = temp_env.action_space(first_agent)
temp_env.close()

# Define the policies
policies = {
    "shared_policy": (None, obs_space, act_space, {})
}

# Policy mapping function
def policy_mapping_fn(agent_id, episode, **kwargs):
    """Map every agent to the single shared policy (self-play)."""
    return "shared_policy"

# RLlib configuration
config = {
    "env": "tennis_v3",
    "env_config": {},
    "framework": "torch",  # Use "tf" if you prefer TensorFlow
    "num_workers": 1,      # Increase if you have more CPUs
    "num_gpus": 0,         # Set to 1 if you have a GPU
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    },
    "lr": 1e-4,
    "train_batch_size": 4000,
    "rollout_fragment_length": 200,
    "sgd_minibatch_size": 128,
    "num_sgd_iter": 10,
    "clip_param": 0.1,
}

# Start training
results = tune.run(
    "PPO",
    config=config,
    stop={"timesteps_total": 500000},
    checkpoint_at_end=True,
)

# Get the best checkpoint of the best trial
best_trial = results.get_best_trial("episode_reward_mean", mode="max")
checkpoint = results.get_best_checkpoint(
    best_trial, metric="episode_reward_mean", mode="max"
)

# Load the trained agent
agent = PPOTrainer(config=config)
agent.restore(checkpoint)

# Evaluation loop: iterate the PettingZoo AEC environment directly, since
# agent_iter()/last() belong to the AEC API rather than the RLlib wrapper.
# No render call: the environment is created without a render mode.
env = env_creator({})
env.reset()
episode_returns = {agent_id: 0.0 for agent_id in env.possible_agents}

for agent_id in env.agent_iter():
    # The AEC API returns five values; termination and truncation are separate.
    observation, reward, termination, truncation, info = env.last()
    episode_returns[agent_id] += reward
    if termination or truncation:
        action = None
    else:
        action = agent.compute_single_action(observation, policy_id="shared_policy")
    env.step(action)

env.close()
print("Episode returns: ", episode_returns)

# Shutdown Ray
ray.shutdown()


In [None]:
import ray
import supersuit as ss
from ray import tune
from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv
from pettingzoo.atari import tennis_v3
from ray.tune.registry import register_env
# Fixed module path (was misspelled "algoirthms"). The algorithm class is
# aliased so the notebook-wide Stable-Baselines3 `PPO` name is not shadowed.
from ray.rllib.algorithms.ppo import PPO as PPOTrainer

# Initialize Ray (tolerate an instance left running by an earlier cell)
ray.init(ignore_reinit_error=True)

# Environment creator function
def env_creator(config):
    """Build the preprocessed two-player Tennis AEC environment."""
    env = tennis_v3.env()
    # NOTE(review): RLlib's default vision network presumably has no preset for
    # raw 210x160 Atari frames; 84x84 inputs are used instead — verify against
    # the installed Ray version.
    env = ss.color_reduction_v0(env, mode="B")
    env = ss.resize_v1(env, x_size=84, y_size=84)
    env = ss.frame_stack_v1(env, 4)
    return env

# Register the environment with RLlib
register_env("tennis_v3", lambda config: PettingZooEnv(env_creator(config)))

# Read the per-agent spaces from the PettingZoo environment itself; what the
# RLlib wrapper exposes as `observation_space` differs between Ray versions.
temp_env = env_creator({})
first_agent = temp_env.possible_agents[0]
obs_space = temp_env.observation_space(first_agent)
act_space = temp_env.action_space(first_agent)
temp_env.close()

# Define the policies
policies = {
    "shared_policy": (None, obs_space, act_space, {})
}

# Policy mapping function
def policy_mapping_fn(agent_id, episode, **kwargs):
    """Map every agent to the single shared policy (self-play)."""
    return "shared_policy"

# RLlib configuration
config = {
    "env": "tennis_v3",
    "env_config": {},
    "framework": "torch",  # Use "tf" if you prefer TensorFlow
    "num_workers": 1,      # Increase if you have more CPUs
    "num_gpus": 0,         # Set to 1 if you have a GPU
    "multiagent": {
        "policies": policies,
        "policy_mapping_fn": policy_mapping_fn,
    },
    "lr": 1e-4,
    "train_batch_size": 4000,
    "rollout_fragment_length": 200,
    "sgd_minibatch_size": 128,
    "num_sgd_iter": 10,
    "clip_param": 0.1,
}

# Start training
results = tune.run(
    "PPO",
    config=config,
    stop={"timesteps_total": 500000},
    checkpoint_at_end=True,
)

# Get the best checkpoint of the best trial
best_trial = results.get_best_trial("episode_reward_mean", mode="max")
checkpoint = results.get_best_checkpoint(
    best_trial, metric="episode_reward_mean", mode="max"
)

# Load the trained agent
agent = PPOTrainer(config=config)
agent.restore(checkpoint)

# Evaluation loop: iterate the PettingZoo AEC environment directly, since
# agent_iter()/last() belong to the AEC API rather than the RLlib wrapper.
# No render call: the environment is created without a render mode.
env = env_creator({})
env.reset()
episode_returns = {agent_id: 0.0 for agent_id in env.possible_agents}

for agent_id in env.agent_iter():
    # The AEC API returns five values; termination and truncation are separate.
    observation, reward, termination, truncation, info = env.last()
    episode_returns[agent_id] += reward
    if termination or truncation:
        action = None
    else:
        action = agent.compute_single_action(observation, policy_id="shared_policy")
    env.step(action)

env.close()
print("Episode returns: ", episode_returns)

# Shutdown Ray
ray.shutdown()


In [None]:
"""Uses Ray's RLlib to train agents to play Pistonball.

Author: Rohan (https://github.com/Rohan138)
"""

import os

import ray
import supersuit as ss
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv
from ray.rllib.models import ModelCatalog
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.tune.registry import register_env
from torch import nn

from pettingzoo.butterfly import pistonball_v6


class CNNModelV2(TorchModelV2, nn.Module):
    """Convolutional policy/value network registered as a custom RLlib model.

    Three convolution layers feed a 512-unit hidden layer, followed by one
    linear head for the policy outputs and one for the state value.
    """

    def __init__(self, obs_space, act_space, num_outputs, *args, **kwargs):
        """Create the shared feature extractor and the two output heads."""
        TorchModelV2.__init__(self, obs_space, act_space, num_outputs, *args, **kwargs)
        nn.Module.__init__(self)

        # 3 input channels; the 3136 input features of the hidden layer equal
        # 64 * 7 * 7, i.e. what these convolutions produce for 84x84 inputs.
        feature_layers = [
            nn.Conv2d(3, 32, (8, 8), stride=(4, 4)),
            nn.ReLU(),
            nn.Conv2d(32, 64, (4, 4), stride=(2, 2)),
            nn.ReLU(),
            nn.Conv2d(64, 64, (3, 3), stride=(1, 1)),
            nn.ReLU(),
            nn.Flatten(),
            nn.Linear(3136, 512),
            nn.ReLU(),
        ]
        self.model = nn.Sequential(*feature_layers)
        self.policy_fn = nn.Linear(512, num_outputs)
        self.value_fn = nn.Linear(512, 1)

    def forward(self, input_dict, state, seq_lens):
        """Return the policy outputs and the unchanged state; caches the value."""
        # Move the last axis (channels) in front of the spatial axes for Conv2d.
        channels_first = input_dict["obs"].permute(0, 3, 1, 2)
        features = self.model(channels_first)
        # Cached so value_function() can return it after this forward pass.
        self._value_out = self.value_fn(features)
        policy_out = self.policy_fn(features)
        return policy_out, state

    def value_function(self):
        """Return the value estimates of the last forward pass as a 1-D tensor."""
        return self._value_out.flatten()


def env_creator(args):
    """Create the preprocessed parallel Pistonball environment.

    ``args`` is accepted for the environment-creator signature but not used.
    """
    pistonball_settings = {
        "n_pistons": 20,
        "time_penalty": -0.1,
        "continuous": True,
        "random_drop": True,
        "random_rotate": True,
        "ball_mass": 0.75,
        "ball_friction": 0.3,
        "ball_elasticity": 1.5,
        "max_cycles": 125,
    }
    wrapped = pistonball_v6.parallel_env(**pistonball_settings)

    # SuperSuit preprocessing, applied in this exact order: single colour
    # channel, float32, 84x84 frames, normalization, 3 stacked frames.
    wrapped = ss.color_reduction_v0(wrapped, mode="B")
    wrapped = ss.dtype_v0(wrapped, "float32")
    wrapped = ss.resize_v1(wrapped, x_size=84, y_size=84)
    wrapped = ss.normalize_obs_v0(wrapped, env_min=0, env_max=1)
    wrapped = ss.frame_stack_v1(wrapped, 3)
    return wrapped


# if __name__ == "__main__":
#     ray.init()

#     env_name = "pistonball_v6"

#     register_env(env_name, lambda config: ParallelPettingZooEnv(env_creator(config)))
#     ModelCatalog.register_custom_model("CNNModelV2", CNNModelV2)

#     config = (
#         PPOConfig()
#         .environment(env=env_name, clip_actions=True)
#         .rollouts(num_rollout_workers=4, rollout_fragment_length=128)
#         .training(
#             train_batch_size=512,
#             lr=2e-5,
#             gamma=0.99,
#             lambda_=0.9,
#             use_gae=True,
#             clip_param=0.4,
#             grad_clip=None,
#             entropy_coeff=0.1,
#             vf_loss_coeff=0.25,
#             sgd_minibatch_size=64,
#             num_sgd_iter=10,
#         )
#         .debugging(log_level="ERROR")
#         .framework(framework="torch")
#         .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")))
#     )

#     tune.run(
#         "PPO",
#         name="PPO",
#         stop={"timesteps_total": 5000000 if not os.environ.get("CI") else 50000},
#         checkpoint_freq=10,
#         storage_path="~/ray_results/" + env_name,
#         config=config.to_dict(),
#     )

  if (distutils.version.LooseVersion(tf.__version__) <


In [None]:
# Tolerate a Ray instance left running by an earlier cell of this notebook.
ray.init(ignore_reinit_error=True)

env_name = "pistonball_v6"

register_env(env_name, lambda config: ParallelPettingZooEnv(env_creator(config)))
ModelCatalog.register_custom_model("CNNModelV2", CNNModelV2)

# NOTE(review): 4 rollout workers plus the driver presumably need 5 CPUs; on a
# smaller machine the trial may stay pending — confirm before lowering, and
# keep train_batch_size consistent with workers * rollout_fragment_length.
config = (
    PPOConfig()
    .environment(env=env_name, clip_actions=True)
    .rollouts(num_rollout_workers=4, rollout_fragment_length=128)
    .training(
        train_batch_size=512,
        lr=2e-5,
        gamma=0.99,
        lambda_=0.9,
        use_gae=True,
        clip_param=0.4,
        grad_clip=None,
        entropy_coeff=0.1,
        vf_loss_coeff=0.25,
        sgd_minibatch_size=64,
        num_sgd_iter=10,
    )
    .debugging(log_level="ERROR")
    .framework(framework="torch")
    .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", "0")))
)

tune.run(
    "PPO",
    name="PPO",
    # Short run when the CI environment variable is set.
    stop={"timesteps_total": 5000000 if not os.environ.get("CI") else 50000},
    checkpoint_freq=10,
    # Expand "~" explicitly so an absolute path is passed to Tune.
    storage_path=os.path.expanduser("~/ray_results/" + env_name),
    config=config.to_dict(),
)

In [None]:
# https://github.com/ray-project/ray/issues/16425