In [4]:
from ray.rllib.env import MultiAgentEnv
from ray.tune.registry import register_env

from ray.rllib.agents.ppo import PPOTrainer
import ray
from ray import tune
import gym
import numpy as np

In [5]:
class IrrigationEnv(MultiAgentEnv):
    def __init__(self, return_agent_actions = False, part=False):
        self.num_agents = 5
        self.observation_space = gym.spaces.Box(low=200, high=800, shape=(1,))
        self.action_space = gym.spaces.Box(low=0, high=1, shape=(1,))

    def reset(self):
        obs = {}
        self.water = np.random.uniform(200,800)
        for i in range(self.num_agents):
            obs[i] = np.array([self.water])
        return obs

    def cal_rewards(self, action_dict):
        self.curr_water = self.water
        reward = 0
        for i in range(self.num_agents):
            water_demanded = self.water*action_dict[i][0]
            if self.curr_water == 0:
                # No water is left in stream
                reward -= water_demanded*100 # Penalty
            elif self.curr_water - water_demanded<0:
                # Water in stream is less than water demanded, withdraw all left
                water_needed = water_demanded - self.curr_water
                water_withdrawn = self.curr_water
                self.curr_water = 0
                reward += -water_withdrawn**2 + 200*water_withdrawn
                reward -= water_needed*100 # Penalty
            else:
                # Water in stream is more than water demanded, withdraw water demanded
                self.curr_water -= water_demanded
                water_withdrawn = water_demanded
                reward += -water_withdrawn**2 + 200*water_withdrawn

        return reward

    def step(self, action_dict):
        obs, rew, done, info = {}, {}, {}, {}

        reward = self.cal_rewards(action_dict)

        for i in range(self.num_agents):

            obs[i], rew[i], done[i], info[i] = np.array([self.curr_water]), reward, True, {}

        done["__all__"] = True
        return obs, rew, done, info

In [6]:
def env_creator(_):
    return IrrigationEnv()

single_env = IrrigationEnv()
env_name = "IrrigationEnv"
register_env(env_name, env_creator)

# Get environment obs, action spaces and number of agents
obs_space = single_env.observation_space
act_space = single_env.action_space
num_agents = single_env.num_agents

# Create a policy mapping
def gen_policy():
    return (None, obs_space, act_space, {})

policy_graphs = {}
for i in range(num_agents):
    policy_graphs['agent-' + str(i)] = gen_policy()

#Use `policy_mapping_fn(agent_id, episode, worker, **kwargs)` instead
def policy_mapping_fn(agent_id, episode, worker):
    return 'agent-' + str(agent_id)

config={
            "log_level": "WARN",
            "num_workers": 1,
            "num_cpus_for_driver": 1,
            "num_cpus_per_worker": 1,
            "num_sgd_iter": 10,
            "train_batch_size": 128,
            "lr": 5e-3,
            "model":{"fcnet_hiddens": [8, 8]},
            "rollout_fragment_length": 128,
            "multiagent": {
                "policies": policy_graphs,
                "policy_mapping_fn": policy_mapping_fn,
            },
            "env": "IrrigationEnv"}
    
trainer = PPOTrainer(config=config)

for i in range(15):
    results = trainer.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

2022-01-04 14:39:53,709	INFO trainable.py:124 -- Trainable.setup took 21.946 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


Iter: 0; avg. reward=-779603.5833117398
Iter: 1; avg. reward=-515774.1297232858
Iter: 2; avg. reward=-318073.5645450128
Iter: 3; avg. reward=-217518.62759570783
Iter: 4; avg. reward=-80784.4438319313
Iter: 5; avg. reward=-18121.005286384003
Iter: 6; avg. reward=-2725.425218754961
Iter: 7; avg. reward=45630.70374515159
Iter: 8; avg. reward=57204.84265795139
Iter: 9; avg. reward=69085.41056414637
Iter: 10; avg. reward=74416.85820282507
Iter: 11; avg. reward=98381.41331161988
Iter: 12; avg. reward=118006.33517910565
Iter: 13; avg. reward=136276.57466463174
Iter: 14; avg. reward=147511.4776076984


In [7]:
env = IrrigationEnv()
obs = env.reset()
done = False
total_reward = 0.0

In [8]:
while not done:
   
    action = {}
    for agent_id, agent_obs in obs.items():
        policy_id = config['multiagent']['policy_mapping_fn'](agent_id, None, None)
        action[agent_id] = trainer.compute_single_action(agent_obs, policy_id=policy_id)
        
    obs, reward, done, info = env.step(action)
    done = done['__all__']
    
    total_reward += sum(reward.values())

print(f"total-reward={total_reward}")

total-reward=-5394731.978702811
