In [None]:
import supersuit as ss
from pettingzoo.atari import pong_v3
from pettingzoo.utils import agent_selector
from stable_baselines3 import PPO
import numpy as np
from PIL import Image

### Multi-Agent Single Match Video Generation

In [11]:
# Plays one recorded Pong match between the two trained PPO agents.
# Leaves `rewards` (cumulative reward per agent) and `images` (rendered RGB
# frames) in the notebook namespace for the cells that follow.
MATCH_SEED = 1234
# Safety cap on the recording: the loop stops once more than this many frames are stored
MAX_FRAMES = 100000

# Create the environment
env = pong_v3.env(num_players=2, render_mode="rgb_array")

# Everything after creation runs inside try/finally so the environment is
# closed even if loading a checkpoint, predicting or rendering raises.
try:
    # Pre-process using SuperSuit
    env = ss.color_reduction_v0(env, mode="B")
    env = ss.resize_v1(env, x_size=84, y_size=84)
    env = ss.frame_stack_v1(env, 4, stack_dim=0)
    env = ss.dtype_v0(env, dtype=np.float32)
    env = ss.normalize_obs_v0(env, env_min=0, env_max=1)

    # Load the agents
    model1 = PPO.load("/home/joan/Desktop/Runs/Pong_baseline_adversarial_v5/Pong_baseline_adversarial_v5_0.zip")
    model2 = PPO.load("/home/joan/Desktop/Runs/Pong_baseline_adversarial_v5/Pong_baseline_adversarial_v5_1.zip")

    # model1 controls the first listed agent, model2 controls every other agent
    first_agent = env.possible_agents[0]

    rewards = {agent: 0 for agent in env.possible_agents}

    # The match is played through the AEC (agent-by-agent) interface
    env.reset(seed=MATCH_SEED)
    env.action_space(first_agent).seed(0)

    # List of images to create a gif
    images = []
    for agent in env.agent_iter():
        # Observation and end-of-game flags for the agent about to act
        obs, _last_reward, termination, truncation, _last_info = env.last()

        # Accumulate the rewards currently exposed by the environment.
        # This runs before the termination check so the rewards visible on
        # the final iteration are still counted.
        for a in env.agents:
            rewards[a] += env.rewards[a]

        # If the game is over, stop playing
        if termination or truncation:
            break

        # Select and perform the action with the policy that owns this agent
        policy = model1 if agent == first_agent else model2
        act = policy.predict(obs)[0]
        env.step(act)

        # Store an image of the current state of the environment
        images.append(env.render())

        # Stop recording once the frame cap has been exceeded
        if len(images) > MAX_FRAMES:
            break
finally:
    # Close the environment
    env.close()

In [12]:
# Cumulative reward collected by each agent during the recorded match
print(repr(rewards))

{'first_0': -9, 'second_0': 9}


In [13]:
# Export the recorded match as an animated GIF.
GIF_PATH = 'output.gif'
FRAME_DURATION_MS = 20  # display time of each GIF frame, in milliseconds

# Fail with a clear message instead of an IndexError when nothing was recorded
if not images:
    raise ValueError("No frames were recorded; run the match cell before exporting the GIF.")

# Convert the rendered arrays to PIL images under a separate name, so that
# `images` keeps holding the raw frames and this cell can be re-run safely
frames = [Image.fromarray(image) for image in images]

# Save as GIF: the first frame is the base image and the rest are appended;
# loop=0 makes the animation repeat forever
frames[0].save(GIF_PATH, save_all=True, append_images=frames[1:], duration=FRAME_DURATION_MS, loop=0)

### Multi-Agent Performance Evaluation

In [30]:
# Plays `test_episodes` Pong matches between the two trained PPO agents and
# aggregates, per agent:
#   total_diff_rewards - sum of the cumulative rewards over all episodes
#   total_rewards      - total points scored over all episodes
#   total_wins         - number of episodes won
test_episodes = 100
EVAL_SEED = 1234

# Create the environment
env = pong_v3.env(num_players=2, render_mode="rgb_array")

# Everything after creation runs inside try/finally so the environment is
# closed even if loading a checkpoint or an episode raises.
try:
    # Pre-process using SuperSuit
    env = ss.color_reduction_v0(env, mode="B")
    env = ss.resize_v1(env, x_size=84, y_size=84)
    env = ss.frame_stack_v1(env, 4, stack_dim=0)
    env = ss.dtype_v0(env, dtype=np.float32)
    env = ss.normalize_obs_v0(env, env_min=0, env_max=1)

    # Load the agents
    # NOTE(review): these paths differ from the ones used by the single-match
    # cell (no "Runs/Pong_baseline_adversarial_v5/" folder) - confirm that both
    # locations hold the same checkpoints.
    model1 = PPO.load("/home/joan/Desktop/Pong_baseline_adversarial_v5_0.zip")
    model2 = PPO.load("/home/joan/Desktop/Pong_baseline_adversarial_v5_1.zip")

    # Use the static agent list so the bookkeeping does not depend on which
    # agents are still listed as alive when an episode ends
    first_agent, second_agent = env.possible_agents

    # Initialize the variables to store the results
    total_diff_rewards = {agent: 0 for agent in env.possible_agents}
    total_rewards = {agent: 0 for agent in env.possible_agents}
    total_wins = {agent: 0 for agent in env.possible_agents}

    # The matches are played through the AEC (agent-by-agent) interface
    env.reset(seed=EVAL_SEED)
    env.action_space(first_agent).seed(0)

    # Iterate over the test episodes
    for episode in range(test_episodes):
        # The first episode reuses the seeded reset above; later episodes
        # start from a fresh, unseeded reset
        if episode > 0:
            env.reset()

        # Per-episode accumulators. They are named differently from the
        # `rewards` dict of the single-match cell so that result is not
        # overwritten by running the evaluation.
        episode_rewards = {agent: 0 for agent in env.possible_agents}
        episode_points = {agent: 0 for agent in env.possible_agents}

        # Iterate over the agents
        for agent in env.agent_iter():
            # Observation and end-of-game flags for the agent about to act
            obs, _last_reward, termination, truncation, _last_info = env.last()

            # Accumulate the rewards currently exposed by the environment.
            # This runs before the termination check so the rewards visible
            # on the final iteration are still counted.
            for a in env.agents:
                step_reward = env.rewards[a]
                episode_rewards[a] += step_reward
                # Assumes pong_v3 gives the scoring agent a positive reward on
                # the step a point is won - confirm against the PettingZoo
                # Pong documentation. Counting points directly avoids assuming
                # that the winner always reached 21, which does not hold for
                # truncated games.
                if step_reward > 0:
                    episode_points[a] += step_reward

            # If the game is over, leave the episode loop
            if termination or truncation:
                break

            # Select and perform the action with the policy that owns this agent
            policy = model1 if agent == first_agent else model2
            env.step(policy.predict(obs)[0])

        # Update the accumulated rewards and the total points scored
        for a in env.possible_agents:
            total_diff_rewards[a] += episode_rewards[a]
            total_rewards[a] += episode_points[a]

        # Update the wins; a tied (e.g. truncated) game is a win for nobody
        if episode_points[first_agent] > episode_points[second_agent]:
            total_wins[first_agent] += 1
        elif episode_points[second_agent] > episode_points[first_agent]:
            total_wins[second_agent] += 1
finally:
    env.close()

In [31]:
# Report the evaluation aggregates, one dictionary per line, in this order:
#   1. points made by each agent
#   2. accumulated rewards of each agent over all the evaluated episodes
#   3. number of wins of each agent
for summary in (total_rewards, total_diff_rewards, total_wins):
    print(summary)

{'first_0': 1492, 'second_0': 2050}
{'first_0': -558, 'second_0': 558}
{'first_0': 10, 'second_0': 90}
