In [1]:
import torch
from ray.rllib.algorithms.ppo import PPOConfig
from pprint import pprint

In [3]:
# Configure PPO for CartPole-v1 on the torch framework, switching on both
# underscore-prefixed flags: `_enable_rl_module_api` and `_enable_rl_trainer_api`.
config = (
    PPOConfig()
    .framework("torch")
    .environment("CartPole-v1")
    .rl_module(_enable_rl_module_api=True)
    .training(_enable_rl_trainer_api=True)
)

# Build the algorithm object from the config.
algorithm = config.build()

# Run two training iterations and pretty-print the result dict of each one.
for _ in range(2):
    result = algorithm.train()
    pprint(result)

2023-02-13 17:06:34,141	INFO worker.py:1242 -- Using address localhost:9031 set in the environment variable RAY_ADDRESS
2023-02-13 17:06:37,197	INFO worker.py:1364 -- Connecting to existing Ray cluster at address: 172.31.180.61:9031...
2023-02-13 17:06:37,207	INFO worker.py:1544 -- Connected to Ray cluster. View the dashboard at [1m[32mhttps://console.anyscale-staging.com/api/v2/sessions/ses_cZgx1Ldp4pEKfiMmuqujiuNB/services?redirect_to=dashboard [39m[22m
2023-02-13 17:06:37,965	INFO packaging.py:503 -- Creating a file package for local directory '/tmp/ray_tmp_module/ray'.
2023-02-13 17:06:39,130	INFO packaging.py:330 -- Pushing file package 'gcs://_ray_pkg_3708b7cef17a6742.zip' (148.79MiB) to Ray cluster...
2023-02-13 17:06:41,639	INFO packaging.py:343 -- Successfully pushed file package 'gcs://_ray_pkg_3708b7cef17a6742.zip'.
2023-02-13 17:06:42,185	INFO packaging.py:330 -- Pushing file package 'gcs://_ray_pkg_71e339f49232ad6dbefb414ba636e492.zip' (150.93MiB) to Ray cluster...
202

{'agent_timesteps_total': 4000,
 'config': {'_disable_action_flattening': False,
            '_disable_execution_plan_api': True,
            '_disable_preprocessor_api': False,
            '_enable_rl_module_api': True,
            '_enable_rl_trainer_api': True,
            '_fake_gpus': False,
            '_rl_trainer_hps': PPORLTrainerHPs(kl_coeff=0.2,
                                               kl_target=0.01,
                                               use_critic=True,
                                               clip_param=0.3,
                                               vf_clip_param=10.0,
                                               entropy_coeff=0.0,
                                               vf_loss_coeff=1.0,
                                               lr_schedule=None,
                                               entropy_coeff_schedule=None),
            '_tf_policy_handles_more_than_one_loss': False,
            'action_space': None,
            'act

In [14]:
import gymnasium as gym
from typing import Any, Mapping, Union
from dataclasses import dataclass

from ray.rllib.core.rl_module import RLModule
from ray.rllib.core.rl_module.torch.torch_rl_module import TorchRLModule
from ray.rllib.models.specs.specs_torch import TorchTensorSpec
from ray.rllib.models.specs.typing import SpecType
from ray.rllib.utils.annotations import override
from ray.rllib.utils.nested_dict import NestedDict

import torch
import torch.nn as nn


In [11]:
class DiscreteBCTorchModule(TorchRLModule):
    """Small discrete-action policy module used for behavioral cloning.

    The policy is a two-layer MLP (``Linear -> ReLU -> Linear``) that maps
    ``batch["obs"]`` to action logits. Every forward pass returns a dict whose
    ``"action_dist"`` entry is a ``torch.distributions.Categorical`` built
    from those logits.
    """

    def __init__(
        self,
        input_dim: int,
        hidden_dim: int,
        output_dim: int,
    ) -> None:
        """Create the policy network.

        Args:
            input_dim: Number of input features of the first linear layer.
            hidden_dim: Width of the single hidden layer.
            output_dim: Number of logits produced by the last linear layer.
        """
        super().__init__()
        # The module is used like a regular nn.Module: sub-networks are simply
        # assigned as attributes.
        self.policy = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        )

    def _compute_action_dist(self, batch: NestedDict) -> Mapping[str, Any]:
        """Run the policy on ``batch["obs"]`` and wrap the logits.

        Shared by all three forward passes so the output format is defined in
        exactly one place.

        Returns:
            A dict with the single key ``"action_dist"`` holding a
            ``Categorical`` distribution parameterized by the policy logits.
        """
        logits = self.policy(batch["obs"])
        return {"action_dist": torch.distributions.Categorical(logits=logits)}

    @override(RLModule)
    def _forward_inference(self, batch: NestedDict) -> Mapping[str, Any]:
        """Forward pass for inference; executed without gradient tracking."""
        # There is no need to keep gradients during inference or exploration.
        with torch.no_grad():
            return self._compute_action_dist(batch)

    @override(RLModule)
    def _forward_exploration(self, batch: NestedDict) -> Mapping[str, Any]:
        """Forward pass for exploration; executed without gradient tracking."""
        with torch.no_grad():
            return self._compute_action_dist(batch)

    @override(RLModule)
    def _forward_train(self, batch: NestedDict) -> Mapping[str, Any]:
        """Forward pass for training; gradients are tracked."""
        return self._compute_action_dist(batch)

    @override(RLModule)
    def input_specs_exploration(self) -> SpecType:
        """Keys required in the batch for the exploration pass."""
        return ["obs"]

    @override(RLModule)
    def input_specs_inference(self) -> SpecType:
        """Keys required in the batch for the inference pass."""
        return ["obs"]

    @override(RLModule)
    def input_specs_train(self) -> SpecType:
        """Keys required in the batch for the training pass."""
        return ["obs"]

    @override(RLModule)
    def output_specs_exploration(self) -> SpecType:
        """Keys present in the output of the exploration pass."""
        return ["action_dist"]

    @override(RLModule)
    def output_specs_inference(self) -> SpecType:
        """Keys present in the output of the inference pass."""
        return ["action_dist"]

    @override(RLModule)
    def output_specs_train(self) -> SpecType:
        """Keys present in the output of the training pass."""
        return ["action_dist"]

    @classmethod
    @override(RLModule)
    def from_model_config(
        cls,
        observation_space: "gym.Space",
        action_space: "gym.Space",
        *,
        model_config: Mapping[str, Any],
    ) -> "DiscreteBCTorchModule":
        """Build the module from spaces and a model config.

        Args:
            observation_space: Space whose ``shape[0]`` gives the input size.
            action_space: Space whose ``n`` gives the number of logits.
            model_config: Mapping with an ``"fcnet_hiddens"`` sequence; only
                its first entry is used, as the hidden-layer width.

        Returns:
            A new ``DiscreteBCTorchModule`` instance.
        """
        config = {
            "input_dim": observation_space.shape[0],
            "hidden_dim": model_config["fcnet_hiddens"][0],
            "output_dim": action_space.n,
        }

        return cls(**config)


In [12]:
from ray.rllib.core.rl_module.rl_module import SingleAgentRLModuleSpec

# Create a CartPole-v1 environment; in this cell it is used to supply the
# observation and action spaces for the module spec.
env = gym.make("CartPole-v1")
# Describe a single-agent module: which class to instantiate, the spaces it
# operates on, and a model config with one hidden layer of width 64.
spec = SingleAgentRLModuleSpec(
    module_class=DiscreteBCTorchModule,
    observation_space=env.observation_space,
    action_space=env.action_space,
    model_config={"fcnet_hiddens": [64]},
)

# Build the module from the spec and print its layer structure.
module = spec.build()
print(module)


DiscreteBCTorchModule(
  (policy): Sequential(
    (0): Linear(in_features=4, out_features=64, bias=True)
    (1): ReLU()
    (2): Linear(in_features=64, out_features=2, bias=True)
  )
)


In [25]:
@dataclass
class BCModuleSpec(SingleAgentRLModuleSpec):
    """Module spec that derives the module's constructor arguments itself."""

    def build(self) -> "RLModule":
        """Instantiate ``module_class`` from the spec's spaces and model config.

        The input size comes from ``observation_space.shape[0]``, the hidden
        width from the first entry of ``model_config["fcnet_hiddens"]`` and
        the output size from ``action_space.n``. The resulting keyword
        arguments are printed before the module is constructed.
        """
        obs_dim = self.observation_space.shape[0]
        first_hidden = self.model_config["fcnet_hiddens"][0]
        num_actions = self.action_space.n

        module_kwargs = {
            "input_dim": obs_dim,
            "hidden_dim": first_hidden,
            "output_dim": num_actions,
        }
        # Show the derived constructor arguments in the cell output.
        print(module_kwargs)
        return self.module_class(**module_kwargs)

In [26]:
# let's test our spec
spec = BCModuleSpec(
    module_class=DiscreteBCTorchModule,
    observation_space=gym.spaces.Box(low=-1, high=1, shape=(10,)),
    action_space=gym.spaces.Discrete(2),
    model_config={"fcnet_hiddens": [32]},
)

# build() returns a DiscreteBCTorchModule instance.
module = spec.build()
# Wrap the single module in a multi-agent module; the printed result stores
# it under the "default_policy" key.
marl_module = module.as_multi_agent()
print(marl_module)

{'input_dim': 10, 'hidden_dim': 32, 'output_dim': 2}
MARL({'default_policy': DiscreteBCTorchModule(
  (policy): Sequential(
    (0): Linear(in_features=10, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=2, bias=True)
  )
)})


In [29]:
from ray.rllib.core.rl_module.marl_module import MultiAgentRLModuleSpec

# Two independently configured modules, keyed by module id. They differ in
# observation size (10 vs. 5) and hidden width (32 vs. 16).
spec = MultiAgentRLModuleSpec(
    module_specs={
        "module_1": SingleAgentRLModuleSpec(
            module_class=DiscreteBCTorchModule,
            observation_space=gym.spaces.Box(low=-1, high=1, shape=(10,)),
            action_space=gym.spaces.Discrete(2),
            model_config={"fcnet_hiddens": [32]},
        ),
        "module_2": SingleAgentRLModuleSpec(
            module_class=DiscreteBCTorchModule,
            observation_space=gym.spaces.Box(low=-1, high=1, shape=(5,)),
            action_space=gym.spaces.Discrete(2),
            model_config={"fcnet_hiddens": [16]},
        ),
    },
)

# Building the spec yields a multi-agent module containing both RLModules.
marl_module = spec.build()
# Calling as_multi_agent() on a module that is already multi-agent is
# essentially a no-op; the printed structure is unchanged.
marl_module = marl_module.as_multi_agent()
print(marl_module)

MARL({'module_1': DiscreteBCTorchModule(
  (policy): Sequential(
    (0): Linear(in_features=10, out_features=32, bias=True)
    (1): ReLU()
    (2): Linear(in_features=32, out_features=2, bias=True)
  )
),
 'module_2': DiscreteBCTorchModule(
  (policy): Sequential(
    (0): Linear(in_features=5, out_features=16, bias=True)
    (1): ReLU()
    (2): Linear(in_features=16, out_features=2, bias=True)
  )
)})


In [35]:
from ray.rllib.core.rl_trainer.torch.torch_rl_trainer import TorchRLTrainer

# Simple behavioral cloning Learner
class BCTorchLeaner(TorchRLTrainer):
    """Behavioral-cloning trainer that fits the policy to the batch actions."""

    def compute_loss_per_module(
        self, module_id: str, batch, fwd_out
    ) -> Mapping[str, Any]:
        """Compute the loss for one RLModule of the underlying multi-agent module.

        The loss is the mean negative log-likelihood of ``batch["actions"]``
        under the distribution found in ``fwd_out["action_dist"]``.

        Returns:
            A dict holding the loss under ``self.TOTAL_LOSS_KEY``, the special
            key the learner needs in order to perform gradient updates.
        """
        log_probs = fwd_out["action_dist"].log_prob(batch["actions"])
        nll = -torch.mean(log_probs)
        return {self.TOTAL_LOSS_KEY: nll}

In [44]:
from ray.rllib.core.rl_trainer.rl_trainer import RLTrainerSpec

# Learners, like RLModules, have specs from which they can be instantiated:
spec = RLTrainerSpec(
    rl_trainer_class=BCTorchLeaner,
    module_spec=SingleAgentRLModuleSpec(
        module_class=DiscreteBCTorchModule,
        observation_space=gym.spaces.Box(low=-1, high=1, shape=(10,)),
        action_space=gym.spaces.Discrete(2),
        model_config={"fcnet_hiddens": [32]},
    ),
    optimizer_config={"lr": 1e-3},
)

# Instantiate the learner described by the spec and show its repr.
learner = spec.build()
print(learner)

<__main__.BCTorchLeaner object at 0x7fd74c6af100>


In [46]:
# Call build() on the learner created from the spec, then print it again; the
# recorded output shows the same default object repr as before the call.
# NOTE(review): build() presumably sets up the module and optimizer — confirm
# against the RLTrainer documentation.
learner.build()
print(learner)

<__main__.BCTorchLeaner object at 0x7fd74c6af100>
