## Environment 정의

In [1]:
import os

import random


class Discrete:
    def __init__(self, num_actions: int):
        """ Discrete action space for num_actions.
        Discrete(4) can be used as encoding moving in one of the cardinal directions.
        """
        self.n = num_actions

    def sample(self):
        return random.randint(0, self.n - 1)


class Environment:

    seeker, goal = (0, 0), (4, 4)
    info = {'seeker': seeker, 'goal': goal}

    def __init__(self,  *args, **kwargs):
        self.action_space = Discrete(4)
        self.observation_space = Discrete(5*5)

    def reset(self):
        """Reset seeker and goal positions, return observations."""
        self.seeker = (0, 0)
        self.goal = (4, 4)

        return self.get_observation()

    def get_observation(self):
        """Encode the seeker position as integer"""
        return 5 * self.seeker[0] + self.seeker[1]

    def get_reward(self):
        """Reward finding the goal"""
        return 1 if self.seeker == self.goal else 0

    def is_done(self):
        """We're done if we found the goal"""
        return self.seeker == self.goal

    def step(self, action):
        """Take a step in a direction and return all available information."""
        if action == 0:  # move down
            self.seeker = (min(self.seeker[0] + 1, 4), self.seeker[1])
        elif action == 1:  # move left
            self.seeker = (self.seeker[0], max(self.seeker[1] - 1, 0))
        elif action == 2:  # move up
            self.seeker = (max(self.seeker[0] - 1, 0), self.seeker[1])
        elif action == 3:  # move right
            self.seeker = (self.seeker[0], min(self.seeker[1] + 1, 4))
        else:
            raise ValueError("Invalid action")

        return self.get_observation(), self.get_reward(), self.is_done(), self.info

    def render(self, *args, **kwargs):
        """Render the environment, e.g. by printing its representation."""
        os.system('cls' if os.name == 'nt' else 'clear')
        try:
            from IPython.display import clear_output
            clear_output(wait=True)
        except Exception:
            pass
        grid = [['| ' for _ in range(5)] + ["|\n"] for _ in range(5)]
        grid[self.goal[0]][self.goal[1]] = '|G'
        grid[self.seeker[0]][self.seeker[1]] = '|S'
        print(''.join([''.join(grid_row) for grid_row in grid]))


import gym
from gym.spaces import Discrete


class GymEnvironment(Environment, gym.Env):
    def __init__(self, *args, **kwargs):
        """Make our original `Environment` a gym `Env`."""
        super().__init__(*args, **kwargs)


gym_env = GymEnvironment()

## Rllib Python 으로 학습하기

In [2]:
from ray.tune.logger import pretty_print
from ray.rllib.algorithms.dqn import DQNConfig

config = (DQNConfig().environment(GymEnvironment)
          .rollouts(num_rollout_workers=2, create_env_on_local_worker=True))

pretty_print(config.to_dict())

algo = config.build()

for i in range(10):
    result = algo.train()

print(pretty_print(result))

  VALID_NP_HPARAMS = (np.bool8, np.float32, np.float64, np.int32, np.int64)
2024-03-09 13:31:11,779	INFO worker.py:1529 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
[2m[36m(pid=42461)[0m   from pkg_resources import packaging
[2m[36m(pid=42461)[0m   VALID_NP_HPARAMS = (np.bool8, np.float32, np.float64, np.int32, np.int64)
[2m[36m(pid=42462)[0m   from pkg_resources import packaging
[2m[36m(pid=42462)[0m   VALID_NP_HPARAMS = (np.bool8, np.float32, np.float64, np.int32, np.int64)
[2m[36m(RolloutWorker pid=42461)[0m   prep = cls(observation_space, options)
[2m[36m(RolloutWorker pid=42461)[0m   super(DistributionalQTFModel, self).__init__(
[2m[36m(RolloutWorker pid=42462)[0m   prep = cls(observation_space, options)
[2m[36m(RolloutWorker pid=42462)[0m   super(DistributionalQTFModel, self).__init__(
  prep = cls(observation_space, options)
  super(DistributionalQTFModel, self).__init__(
2024-03-09 13:31:20,495	INFO trainable.

agent_timesteps_total: 10000
counters:
  last_target_update_ts: 9502
  num_agent_steps_sampled: 10000
  num_agent_steps_trained: 144000
  num_env_steps_sampled: 10000
  num_env_steps_trained: 144000
  num_target_updates: 18
custom_metrics: {}
date: 2024-03-09_13-32-36
done: false
episode_len_mean: 11.05
episode_media: {}
episode_reward_max: 1.0
episode_reward_mean: 1.0
episode_reward_min: 1.0
episodes_this_iter: 95
episodes_total: 354
experiment_id: 986d8e86711c49c389c43a7e65b5cff7
hostname: hoondori-ML
info:
  last_target_update_ts: 9502
  learner:
    default_policy:
      custom_metrics: {}
      diff_num_grad_updates_vs_sampler_policy: 4499.0
      learner_stats:
        cur_lr: 0.0005000000237487257
        max_q: 0.9902322888374329
        mean_q: 0.9334491491317749
        mean_td_error: -0.0003340337425470352
        min_q: 0.8533341884613037
        model: {}
      num_agent_steps_trained: 32.0
      num_grad_updates_lifetime: 4500.0
      td_error: [-0.00018143653869628906, -

## 모델의 저장과 로드, 평가

In [9]:
from ray.rllib.algorithms import Algorithm

ckpt = algo.save()
print(ckpt)

eval = algo.evaluate()
print(pretty_print(eval))

#algo.stop()
#restored_algo = Algorithm.from_checkpoint(ckpt)

/home/hoondori/ray_results/DQN_GymEnvironment_2024-03-09_11-04-28d_cn6mha/checkpoint_000010
evaluation:
  custom_metrics: {}
  episode_len_mean: .nan
  episode_media: {}
  episode_reward_max: .nan
  episode_reward_mean: .nan
  episode_reward_min: .nan
  episodes_this_iter: 0
  hist_stats:
    episode_lengths: []
    episode_reward: []
  num_agent_steps_sampled_this_iter: 10
  num_env_steps_sampled_this_iter: 10
  num_faulty_episodes: 0
  policy_reward_max: {}
  policy_reward_mean: {}
  policy_reward_min: {}
  sampler_perf: {}
  timesteps_this_iter: 10



## 액션 계산하기

In [10]:
env = GymEnvironment()
done = False
total_reward = 0
observations = env.reset()
while not done:
    action = algo.compute_single_action(observations)
    observations, reward, done, info = env.step(action)
    total_reward += reward
print(total_reward)    

1


In [11]:
action = algo.compute_actions({"obs_1":observations, "obs_2":observations})
print(action)

{'obs_1': 3, 'obs_2': 3}


## 정책과 모델 상태에 접근하기

In [3]:
policy = algo.get_policy()
weights = policy.get_weights()
model = policy.model
model.base_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 observations (InputLayer)   [(None, 25)]                 0         []                            
                                                                                                  
 fc_1 (Dense)                (None, 256)                  6656      ['observations[0][0]']        
                                                                                                  
 fc_out (Dense)              (None, 256)                  65792     ['fc_1[0][0]']                
                                                                                                  
 value_out (Dense)           (None, 1)                    257       ['fc_1[0][0]']                
                                                                                              

## worker 별 모델 확인

In [16]:
workers = algo.workers
weight_list = workers.foreach_worker(
    lambda remote_trainer: remote_trainer.get_policy().get_weights()
)
print(len(weight_list))

3


# 실험 구성

In [22]:
# 리소스 구성
from ray.rllib.algorithms.dqn import DQNConfig

config = DQNConfig().resources(num_gpus=1, num_cpus_per_worker=2, num_gpus_per_worker=0)

In [24]:
# 롤아웃 워커 구성

from ray.rllib.algorithms.dqn import DQNConfig

config = DQNConfig().rollouts(
    num_rollout_workers=4,
    num_envs_per_worker=1,
    create_env_on_local_worker=True
)

In [26]:
# 환경 구성

from ray.rllib.algorithms.dqn import DQNConfig

config = DQNConfig().environment(
    env="CartPole-v1",
    env_config={"my_config": "value"},
    observation_space=None,
    action_space=None,
    render_env=True
)

## 다중 에이전트

In [1]:
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from gym.spaces import Discrete
import os

class MultiAgentMaze(MultiAgentEnv):

    def __init__(self,  *args, **kwargs):
        self.action_space = Discrete(4)
        self.observation_space = Discrete(5*5)
        self.agents = {1: (4, 0), 2: (0, 4)}
        self.goal = (4, 4)
        self.info = {1: {'obs': self.agents[1]}, 2: {'obs': self.agents[2]}}

    def reset(self):
        self.agents = {1: (4, 0), 2: (0, 4)}

        return {1: self.get_observation(1), 2: self.get_observation(2)}

    def get_observation(self, agent_id):
        seeker = self.agents[agent_id]
        return 5 * seeker[0] + seeker[1]

    def get_reward(self, agent_id):
        return 1 if self.agents[agent_id] == self.goal else 0

    def is_done(self, agent_id):
        return self.agents[agent_id] == self.goal

    def step(self, action):
        agent_ids = action.keys()

        for agent_id in agent_ids:
            seeker = self.agents[agent_id]
            if action[agent_id] == 0:  # move down
                seeker = (min(seeker[0] + 1, 4), seeker[1])
            elif action[agent_id] == 1:  # move left
                seeker = (seeker[0], max(seeker[1] - 1, 0))
            elif action[agent_id] == 2:  # move up
                seeker = (max(seeker[0] - 1, 0), seeker[1])
            elif action[agent_id] == 3:  # move right
                seeker = (seeker[0], min(seeker[1] + 1, 4))
            else:
                raise ValueError("Invalid action")
            self.agents[agent_id] = seeker

        observations = {i: self.get_observation(i) for i in agent_ids}
        rewards = {i: self.get_reward(i) for i in agent_ids}
        done = {i: self.is_done(i) for i in agent_ids}

        done["__all__"] = all(done.values())

        return observations, rewards, done, self.info

    def render(self, *args, **kwargs):
        """We override this method here so clear the output in Jupyter notebooks.
        The previous implementation works well in the terminal, but does not clear
        the screen in interactive environments.
        """
        os.system('cls' if os.name == 'nt' else 'clear')
        try:
            from IPython.display import clear_output
            clear_output(wait=True)
        except Exception:
            pass
        grid = [['| ' for _ in range(5)] + ["|\n"] for _ in range(5)]
        grid[self.goal[0]][self.goal[1]] = '|G'
        grid[self.agents[1][0]][self.agents[1][1]] = '|1'
        grid[self.agents[2][0]][self.agents[2][1]] = '|2'
        grid[self.agents[2][0]][self.agents[2][1]] = '|2'
        print(''.join([''.join(grid_row) for grid_row in grid]))

In [2]:
import time

env = MultiAgentMaze()

while True:
    obs, rew, done, info = env.step(
        {1: env.action_space.sample(), 2: env.action_space.sample()}
    )
    time.sleep(0.1)
    env.render()
    if any(done.values()):
        break

| | | | | |
| | | | | |
| | |2| | |
| | | | | |
| | | | |1|



In [None]:
#!pip install numpy==1.23.5  <-- 에러 발생시 설치

In [4]:
# 각 agent가 각자의 정책을 가지도록 한 후에 학습

from ray.rllib.algorithms.dqn import DQNConfig

algo = DQNConfig()\
    .environment(env=MultiAgentMaze)\
    .multi_agent(
        policies = {
            "policy_1": (
                None, env.observation_space, env.action_space, {"gamma": 0.80}
            ),
            "policy_2": (
                None, env.observation_space, env.action_space, {"gamma": 0.95}
            ),            
        },
        policy_mapping_fn = lambda agent_id: f"policy_{agent_id}",
    ).build()
algo.train()



{'custom_metrics': {},
 'episode_media': {},
 'info': {'learner': {},
  'num_env_steps_sampled': 1000,
  'num_env_steps_trained': 0,
  'num_agent_steps_sampled': 2000,
  'num_agent_steps_trained': 0},
 'sampler_results': {'episode_reward_max': 2.0,
  'episode_reward_min': 2.0,
  'episode_reward_mean': 2.0,
  'episode_len_mean': 134.0,
  'episode_media': {},
  'episodes_this_iter': 7,
  'policy_reward_min': {'policy_1': 1.0, 'policy_2': 1.0},
  'policy_reward_max': {'policy_1': 1.0, 'policy_2': 1.0},
  'policy_reward_mean': {'policy_1': 1.0, 'policy_2': 1.0},
  'custom_metrics': {},
  'hist_stats': {'episode_reward': [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0],
   'episode_lengths': [245, 33, 43, 209, 24, 33, 351],
   'policy_policy_1_reward': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
   'policy_policy_2_reward': [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]},
  'sampler_perf': {'mean_raw_obs_processing_ms': 0.7600655684342514,
   'mean_inference_ms': 3.307951556576359,
   'mean_action_processing_ms': 0.05153