In [0]:
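# Comment out the lines in this cell if you are not running on Google Colaboratory.
# NOTE(review): the install is unpinned, while the cells below import the legacy
# `ray.rllib.agents` API - consider pinning a Ray version that still provides it.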
!pip install -U ray[rllib]
!pip install gputil

In [0]:
#
# Define Rock Paper Scissors (taken from Ray example)
#

import ray
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from gym.spaces import Tuple, Discrete

ROCK = 0
PAPER = 1
SCISSORS = 2

class RockPaperScissorsEnv(MultiAgentEnv):
    """Two-player environment for rock paper scissors.

    The observation is simply the last opponent action.

    Both agents ("agent_1" and "agent_2") move simultaneously on every step.
    The game is zero-sum: the winner of a round receives +1, the loser -1,
    and a tie gives 0 to both. An episode ends after ``max_moves`` steps.
    """

    # Episode length used when the env config does not provide "max_moves".
    DEFAULT_MAX_MOVES = 10

    # (agent_1 move, agent_2 move) -> (agent_1 reward, agent_2 reward).
    # Built once at class level instead of on every step() call.
    _PAYOFFS = {
        (ROCK, ROCK): (0, 0),
        (ROCK, PAPER): (-1, 1),
        (ROCK, SCISSORS): (1, -1),
        (PAPER, ROCK): (1, -1),
        (PAPER, PAPER): (0, 0),
        (PAPER, SCISSORS): (-1, 1),
        (SCISSORS, ROCK): (-1, 1),
        (SCISSORS, PAPER): (1, -1),
        (SCISSORS, SCISSORS): (0, 0),
    }

    def __init__(self, _=None):
        """Create the environment.

        Args:
            _: Optional env config mapping (the parameter keeps its original
                name for backward compatibility). The only key read is
                "max_moves", the number of steps per episode. ``None`` or a
                mapping without that key selects the default of 10.
        """
        env_config = _ or {}
        self.max_moves = env_config.get("max_moves", self.DEFAULT_MAX_MOVES)
        self.action_space = Discrete(3)
        self.observation_space = Discrete(3)
        self.player1 = "agent_1"
        self.player2 = "agent_2"
        self.last_move = None
        self.num_moves = 0

    def _observations(self):
        """Return each agent's observation: the opponent's last move."""
        move1, move2 = self.last_move
        return {
            self.player1: move2,
            self.player2: move1,
        }

    def reset(self):
        """Start a new episode and return the initial observations.

        The episode begins from a fixed placeholder "previous" move pair
        (PAPER for agent_1, SCISSORS for agent_2) so that both agents have
        something to observe before the first real move.
        """
        self.last_move = (PAPER, SCISSORS)
        self.num_moves = 0
        return self._observations()

    def step(self, action_dict):
        """Play one round.

        Args:
            action_dict: Mapping from agent id to the chosen move
                (ROCK, PAPER or SCISSORS) for both agents.

        Returns:
            A tuple ``(obs, rew, done, info)`` of dicts: ``obs`` and ``rew``
            are keyed by agent id, ``done`` holds only the "__all__" flag,
            and ``info`` is empty.

        Raises:
            KeyError: If an agent id is missing from ``action_dict`` or a
                move is not one of the three valid actions.
        """
        move1 = action_dict[self.player1]
        move2 = action_dict[self.player2]
        self.last_move = (move1, move2)
        obs = self._observations()
        r1, r2 = self._PAYOFFS[move1, move2]
        rew = {
            self.player1: r1,
            self.player2: r2,
        }
        self.num_moves += 1
        done = {
            "__all__": self.num_moves >= self.max_moves,
        }
        return obs, rew, done, {}

    def render(self):
        """Print the most recent (agent_1, agent_2) move pair."""
        print(self.last_move)
        

In [0]:
#
# Test with IQL
#

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gym
from gym import spaces
import numpy as np

import ray
import ray.rllib.agents.ppo as ppo
import ray.rllib.agents.dqn as dqn
import ray.rllib.agents.dqn.apex as apex
from ray.tune.logger import pretty_print

from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.agents.dqn.dqn import DQNTrainer as DQNAgent
from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy as DQNPolicyGraph
from ray.tune.registry import register_env

import matplotlib.pyplot as plt
from gym.spaces import Tuple, Discrete

# Shut down any Ray instance that is already running, then start a fresh one.
ray.shutdown()
ray.init(webui_host="127.0.0.1", ignore_reinit_error=True, log_to_driver=False)

# A local environment instance; its spaces are handed to the policy specs below.
game = RockPaperScissorsEnv({})
act_space = game.action_space
obs_space = game.observation_space


def _policy_for_agent(agent_id):
    """Map "agent_1" to "dqn_policy1" and any higher-numbered agent to "dqn_policy2"."""
    agent_number = int(agent_id.split('_')[1])
    if agent_number <= 1:
        return "dqn_policy1"
    return "dqn_policy2"


# One independent DQN policy per agent; each spec gets its own override dict.
policy_specs = {
    policy_id: (DQNPolicyGraph, obs_space, act_space, {"gamma": 0.95})
    for policy_id in ("dqn_policy1", "dqn_policy2")
}

trainer_config = {
    "gamma": 0.95,
    "lr": 0.0001,
    "schedule_max_timesteps": 20000,
    "timesteps_per_iteration": 1000,
    "exploration_fraction": 0.95,
    "exploration_final_eps": 0.02,
    "model": {
        "fcnet_activation": "relu",
        "fcnet_hiddens": [32, 16],
    },
    "multiagent": {
        "policies": policy_specs,
        # Both policies are trained.
        "policies_to_train": list(policy_specs),
        "policy_mapping_fn": _policy_for_agent,
    },
}

trainer = DQNAgent(env=RockPaperScissorsEnv, config=trainer_config)


In [0]:
# Train
# Run N training iterations and keep a per-iteration history of the stats
# so they can be inspected or plotted after training.
N = 20
mean_reward = np.zeros(N)
mean_length = np.zeros(N)
for i in range(N):
    # Improve the DQN policy
    stats = trainer.train()
    # These arrays were previously allocated but never written.
    # "policy_reward_mean" is indexed per policy id; the history keeps the
    # value for "dqn_policy1" (NaN if it is not reported for this iteration).
    # NOTE(review): assumes tracking policy 1 is the intended metric - confirm.
    mean_reward[i] = stats["policy_reward_mean"].get("dqn_policy1", np.nan)
    mean_length[i] = stats["episode_len_mean"]
    print("== Iteration", i, "== AvgReward:",stats["policy_reward_mean"],"== AvgLength:",stats["episode_len_mean"])
  

In [0]:
# Test
# Play N evaluation games with the trained policies and report each agent's
# average total reward per game.
agent_names = ["agent_%d"%(i+1) for i in range(2)]
policy_map = trainer.config["multiagent"]["policy_mapping_fn"]

N = 100
# Sum of per-game returns over all games. It must be created outside the game
# loop: resetting it per game would leave only the last game's return in the
# final average.
avgR = np.zeros(len(agent_names))
for i in range(N):
    obs = game.reset()
    dones = {}

    # Return of each agent in the current game (index 0 -> agent_1, 1 -> agent_2).
    R = np.zeros(len(agent_names))
    while not dones.get("__all__", False):
        # Get actions from neural network
        action_dict = {
            x: trainer.compute_action(obs[x], policy_id=policy_map(x))
            for x in agent_names
        }

        # Make move
        obs, rewards, dones, _ = game.step(action_dict)

        # Add rewards up; agent ids look like "agent_<k>" with k starting at 1.
        for q in rewards:
            index = int(q.split('_')[1]) - 1
            R[index] += rewards[q]

    avgR = avgR + R

print('average reward:',avgR/N)
