In [7]:
import  ray, gym
from ray import tune
import numpy as np
from gym import spaces
from ray.rllib.agents.ppo import PPOTrainer

In [8]:
class EnlishAuction(gym.Env):
    def __init__(self, env_config):
        self.action_space = []
        self.observation_space = []
        self.reward_range = (-200,200)
        self._state = np.zeros(shape=(1,7), dtype=np.int32)
        self.agents = env_config["agents"]
        for _ in self.agents:
            self.action_space.append(spaces.Discrete(3))
            self.observation_space.append(spaces.Box(low=0,high=100,shape=(1,5), dtype=np.int16))
        self.reset()

    def reset(self):
        self._state = np.zeros(shape=(1,7), dtype=np.int32)
        self._state[0,1:3] = 2
        valuations = np.zeros((4,))
        for x in range(2):
            valu = np.random.rand(1,2)
            valuations[0,2*x] = np.amax(valu) * 100
            valuations[0,1+2*x] = np.amin(valu) * 100

        self._state[:,3:7] = valuations

        obs_n = []
        for i, agent in enumerate(self.agents):
            obs_n.append(self._observation(i))
        return obs_n

    def step(self, action_n):
        obs_n = []
        reward_n = []
        done_n = []

        for i, agent in enumerate(self.agents):
            obs_n.append(self._observation(i))
            reward_n.append(self._reward(i,action_n))
            done_n.append(np.sum(action_n) <= 2)
        return obs_n, reward_n, done_n

    def _observation(self, i):
        my_index = i
        enemy_index = (i + 1) % 2

        price = self._state[0]
        my_demand = self._state[my_index + 1]
        enemy_demand = self._state[enemy_index + 1]
        my_valuations = self._state[3 + my_index * 2: 5 + my_index * 2]
        res = []
        res.extend([price,my_demand,enemy_demand])
        res.extend(my_valuations)
        return res

    def _final_reward_i(self,i, bid):
        if bid == 0:
            return 0
        res = self._state[3 + i * 2]
        if bid == 2:
            res += self._state[4 + i * 2]
        res -= self._state[0] * bid
        return res


    def _reward(self, i, action_n):
        if np.sum(action_n) <= 2:
            return self._final_reward_i(i, action_n[i])
        return 0

    def render(self, mode='human'):
        pass

In [9]:
def get_rllib_config(seeds, debug=False, stop_iters=200):
    stop_config = {
        "training_iteration": 2 if debug else stop_iters,
    }
    env_config = {
        "agents": [0,1]
    }
    mock = EnlishAuction(env_config)
    rllib_config = {
        "env": EnlishAuction,
        "env_config": env_config,
        "multiagent": {
            "policies": {
                "ppo_policy": (None, mock.observation_space, mock.action_space, {})
            },
            "policy_mapping_fn": lambda agent_id: agent_id,
        },
        "seed": tune.grid_search(seeds),
        "num_gpus": 0,
        "framework": "tf",
        "lr": 5e-3,
        "train_batch_size": 64
    }

    return rllib_config, stop_config, env_config

In [10]:
def main():
    train_n_replicas = 1
    seeds = list(range(train_n_replicas))
    ray.init()
    rllib_config, stop_config, env_config = get_rllib_config(seeds)
    tune_analysis = tune.run(PPOTrainer,
                             config=rllib_config,
                             stop=stop_config,
                             checkpoint_freq=0,
                             checkpoint_at_end=True,
                             name="PPO_English")
    ray.shutdown()
    return tune_analysis

In [11]:
main()

RuntimeError: Maybe you called ray.init twice by accident? This error can be suppressed by passing in 'ignore_reinit_error=True' or by calling 'ray.shutdown()' prior to 'ray.init()'.