In [None]:
import os
import warnings
warnings.filterwarnings("ignore")
import ray
import torch

# Root of the verl checkout. Defaults to the original author's path; set the
# VERL_ROOT environment variable to run this notebook on another machine.
VERL_ROOT = os.environ.get("VERL_ROOT", "/data2/zzd/rl_llm/verl")

# Initialise Ray. The working directory is packaged and uploaded to the cluster.
ray.init(
    runtime_env={
        "working_dir": VERL_ROOT,
    }
)

2025-04-29 07:47:34,768	INFO worker.py:1832 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2025-04-29 07:47:35,134	INFO packaging.py:575 -- Creating a file package for local module '/data2/zzd/rl_llm/verl'.
2025-04-29 07:47:35,565	INFO packaging.py:367 -- Pushing file package 'gcs://_ray_pkg_8c5fcbfc40086328.zip' (13.71MiB) to Ray cluster...
2025-04-29 07:47:35,637	INFO packaging.py:380 -- Successfully pushed file package 'gcs://_ray_pkg_8c5fcbfc40086328.zip'.


0,1
Python version:,3.10.16
Ray version:,2.43.0
Dashboard:,http://127.0.0.1:8265


[36m(GPUAccumulator pid=2179640)[0m rank 0, value: tensor([1.], device='cuda:0')
[36m(GPUAccumulator pid=2179952)[0m rank 2, value: tensor([3.], device='cuda:0')
[36m(GPUAccumulator pid=2208868)[0m rank 0, value: tensor([2.], device='cuda:0')
[36m(GPUAccumulator pid=2209046)[0m rank 0, value: tensor([3.], device='cuda:0')[32m [repeated 4x across cluster][0m
[36m(GPUAccumulator pid=2209609)[0m rank 2, value: tensor([5.], device='cuda:0')
[36m(GPUAccumulatorDecorator pid=2222385)[0m 10
[36m(GPUAccumulatorDecorator pid=2222615)[0m 10
[36m(GPUAccumulatorDecorator pid=2222616)[0m 10
[36m(GPUAccumulatorDecorator pid=2222617)[0m 10
[36m(GPUAccumulatorDecorator pid=2222619)[0m 10
[36m(GPUAccumulatorDecorator pid=2222620)[0m 10
[36m(GPUAccumulatorDecorator pid=2222622)[0m 10
[36m(GPUAccumulatorDecorator pid=2222623)[0m 10
[36m(GPUAccumulatorDecorator pid=2222385)[0m rank 0, value: tensor([10.], device='cuda:0')
[36m(GPUAccumulatorDecorator pid=2226747)[0m 10
[36

In [2]:
@ray.remote
class Accumulator:
    """Minimal Ray actor that keeps a running total in ``self.value``."""

    def __init__(self):
        # The running total starts at zero.
        self.value = 0

    def add(self, x):
        """Add ``x`` to the running total (nothing is returned)."""
        self.value += x

    def get_value(self):
        """Return the current running total."""
        return self.value

# Instantiate an accumulator. Accumulator can be viewed as a process, acting as an RPC service.
accumulator = Accumulator.remote()

In [3]:
# Ask the actor for its current value. `.remote()` returns a reference immediately;
# it does not wait for the remote execution to complete.
value_ref = accumulator.get_value.remote()
# Resolve the reference into the actual value.
value = ray.get(value_ref)
print(value)

0


In [4]:
# Accumulate, then check the result.
# Like `get_value.remote()`, `add.remote()` returns immediately; its returned
# reference is not needed here and is discarded.
accumulator.add.remote(10)
# Fetch the value after the add; the printed result is 10.
new_value = ray.get(accumulator.get_value.remote())
print(new_value)

10


上文的例子是单进程的，在下边的例子里，首先实现一个使用 GPU 的 Worker 类，然后使用 Ray 创建一个 WorkerGroup，并启动多个 Worker 进程，每个进程都使用一个 GPU。

In [5]:
import os
import sys

# Make the verl checkout importable. Defaults to the original author's path;
# set the VERL_ROOT environment variable to point at a different checkout.
sys.path.append(os.environ.get("VERL_ROOT", "/data2/zzd/rl_llm/verl"))

from verl.single_controller.base import Worker
from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup, merge_resource_pool

In [6]:
@ray.remote
class GPUAccumulator(Worker):
    """Worker that keeps a one-element accumulator tensor on a CUDA device.

    ``self.rank`` is not set in this class; presumably it is provided by the
    ``Worker`` base class (confirm against verl).
    """

    def __init__(self) -> None:
        super().__init__()
        # The initial value of each rank is the same as the rank
        self.value = torch.zeros(size=(1,), device="cuda") + self.rank

    def add(self, x):
        """Add ``x`` to the accumulator in place, log it, and return a CPU copy."""
        self.value += x
        print(f"rank {self.rank}, value: {self.value}")
        # Move the result to the CPU before returning it to the caller.
        return self.value.cpu()

初始化 RayWorkerGroup 并在给定的资源池中执行分布式计算

参数传递原理：输入参数是一个长度为 world_size 的列表，列表中的每个元素分别分配给 RayWorkerGroup 中的每个 worker。返回值也是一个列表，对应于每个 worker 的返回值。

In [7]:
# GPU-backed resource pool; `[4]` yields 4 workers here (see the rank list printed in a later cell).
resource_pool = RayResourcePool([4], use_gpu=True)

# Each worker's initial value is its rank, and then each rank's value is incremented by 1, so the values obtained on each rank are [1, 2, 3, 4]
class_with_args = RayClassWithInitArgs(cls=GPUAccumulator)
worker_group = RayWorkerGroup(resource_pool, class_with_args)
# `x` is a list with one entry per worker; the result is a list with one return value per worker.
print(worker_group.execute_all_sync("add", x=[1, 1, 1, 1]))




[tensor([1.]), tensor([2.]), tensor([3.]), tensor([4.])]


In [11]:
# Inspect the pool: one local rank and one local world size entry per worker slot.
print(resource_pool.local_rank_list())
print(resource_pool.local_world_size_list())

[0, 1, 2, 3]
[4, 4, 4, 4]


In [13]:
# Show the underlying Ray actor handles and the names assigned to them.
print(worker_group.workers)
print(worker_group.worker_names)

[Actor(GPUAccumulator, 0df4b19eab489dacf1bcf96b01000000), Actor(GPUAccumulator, 82effa1deec127d7d4b0294601000000), Actor(GPUAccumulator, 6fd2a5355d97ce6cf0216de901000000), Actor(GPUAccumulator, e26494bdc64670b38035d10b01000000)]
['TB0KhKGPUAccumulator_0:0', 'TB0KhKGPUAccumulator_0:1', 'TB0KhKGPUAccumulator_0:2', 'TB0KhKGPUAccumulator_0:3']


GPU 资源共享

映射到同一资源池的 RayWorkerGroups 共享 GPU。在下边例子中，我们另起一个占用 4 个 GPU 的资源池，并与前文资源池合并，最后得到一个占用所有 8 个 GPU 的资源池。

In [None]:
# Create a new resource pool and then merge the newly created resource pool with the previous one.
resource_pool_new = RayResourcePool([4], use_gpu=True, name_prefix="a")
resource_pool_merge = merge_resource_pool(resource_pool, resource_pool_new)
# The merged pool lists the local ranks of both pools back to back (8 entries).
print(resource_pool_new.local_rank_list())
print(resource_pool_merge.local_rank_list())


[0, 1, 2, 3]
[0, 1, 2, 3, 0, 1, 2, 3]


In [15]:
# Establish a RayWorkerGroup on the newly created resource pool.
worker_group_new = RayWorkerGroup(resource_pool_new, class_with_args)
# And a second RayWorkerGroup on the merged pool, built from the same class_with_args.
worker_group_merge = RayWorkerGroup(resource_pool_merge, class_with_args)



In [16]:
# Run 'add' on the second set of 4 GPUs. Each worker starts at its rank (0..3)
# and adds 2, so the result should be [2, 3, 4, 5].
output_new = worker_group_new.execute_all_sync("add", x=[2, 2, 2, 2])
print(output_new)

[tensor([2.]), tensor([3.]), tensor([4.]), tensor([5.])]


In [17]:
# Run 'add' on the merged set of 8 GPUs. Each worker starts at its rank (0..7)
# and adds 3, so the result should be [3, 4, 5, 6, 7, 8, 9, 10].
output_merge = worker_group_merge.execute_all_sync("add", x=[3, 3, 3, 3, 3, 3, 3, 3])
print(output_merge)

[tensor([3.]), tensor([4.]), tensor([5.]), tensor([6.]), tensor([7.]), tensor([8.]), tensor([9.]), tensor([10.])]


数据分发(dispatch)、执行(execution)和收集(collection)

在上述示例中，我们使用了 RayWorkerGroup 中的 execute_all_sync 函数来从驱动程序向每个工作器分发数据。这对编码来说非常不方便。在本章中，我们使用函数装饰器形式，允许 RayWorkerGroup 直接调用在 Worker 中编写的函数，从而大大简化参数传递。

HybridFlow中的传输协议

| 传输协议         | 分发功能 dispatch                                                | 收集功能 collection                                                    | 使用场景                                                                 |
|------------------|--------------------------------------------------------------------------|--------------------------------------------------------------------------|--------------------------------------------------------------------------|
| ONE_TO_ALL       | 将数据广播到所有节点。                                                   | 从所有节点收集数据。                                                     | 所有工作节点具有相同的输入并运行相同的代码，例如模型初始化。             |
| 3D_PROTO         | 拆分数据，分散到所有数据并行（DP）节点，并在组内广播。                    | 从所有DP组中 \( p=-1 \)、\( t=0 \) 的节点收集并拼接数据。                | 模型在每个数据并行组内分片存储，输出仅存在于最后一个流水线阶段，并在数据并行组间复制。典型应用于Megatron-LM、Deepspeed等的3D并行训练场景。 |
| 3D_ALL_MICRO_DP  | 按微DP大小拆分数据，分散到所有微DP组，并在组内广播。                      | 从所有微DP组中 local_rank=0 的节点收集并拼接数据。                       | 与HybridEngine配合使用，用于处理策略模型在训练和推理切换时的3D并行方案。  |
| 3D_PP_ONLY       | 将数据广播到所有节点。                                                   | 从所有流水线并行（PP）组中 \( t=0 \)、\( d=0 \) 的节点收集并拼接数据。   | 用于检查权重名称（因为在张量并行（TP）和数据并行（DP）组中权重名称相同）。|
| DP_PROTO         | 将数据按批次拆分并分散到所有数据并行（DP）节点。                          | 从所有DP节点收集并拼接数据。                                             | 数据并行模式下的模型训练。                                               |
| ALL_TO_ALL       | 无操作。                                                                 | 从所有节点收集数据。                                                     | 调试时使用。用户可以手动定义每个工作节点的输入并分别检查其输出。         |


In [22]:
from verl.single_controller.base.decorator import Dispatch, Execute, register

@ray.remote
class GPUAccumulatorDecorator(Worker):
    """GPU accumulator whose ``add`` is registered for ONE_TO_ALL dispatch.

    ``self.rank`` is not set in this class; presumably it is provided by the
    ``Worker`` base class (confirm against verl).
    """

    def __init__(self) -> None:
        super().__init__()
        # The initial value of each rank is the same as the rank
        self.value = torch.zeros(size=(1,), device="cuda") + self.rank

    # Registered with ONE_TO_ALL: a single input passed to the worker group is sent to every worker.
    @register(Dispatch.ONE_TO_ALL)
    def add(self, x):
        """Add ``x`` to the accumulator, log it, and return a CPU copy."""
        # Echo the received input so the dispatch is visible in the worker logs.
        print(x)
        self.value = self.value + x
        print(f"rank {self.rank}, value: {self.value}")
        return self.value.cpu()

In [23]:
# Rebinds class_with_args to the decorated class and starts its workers on the merged 8-GPU pool.
class_with_args = RayClassWithInitArgs(cls=GPUAccumulatorDecorator)
gpu_accumulator_decorator = RayWorkerGroup(resource_pool_merge, class_with_args)



In [24]:
# As we can see, 10 is automatically dispatched to each Worker in this RayWorkerGroup.
# The registered method is called directly on the worker group; each worker adds 10
# to its rank-initialised value, and one result per worker is returned in a list.
print(gpu_accumulator_decorator.add(x=10))

[tensor([10.]), tensor([11.]), tensor([12.]), tensor([13.]), tensor([14.]), tensor([15.]), tensor([16.]), tensor([17.])]


用户还可以自定义 dispatch 和 collection 函数，只需要自己编写 dispatch_fn 和 collect_fn 函数。我们还支持仅在 rank_zero 上执行 RPC，下面提供了具体示例。

In [25]:
from verl.single_controller.base.decorator import Dispatch, collect_all_to_all, register

def two_to_all_dispatch_fn(worker_group, *args, **kwargs):
    """
    Expand every length-2 input to one value per worker by interleaving.

    Each positional and keyword argument must be a sequence of exactly two
    elements ``[a, b]``. It is expanded to ``[a, b, a, b, ...]`` by adding
    ``worker_group.world_size - 2`` further elements, so the result has
    ``world_size`` entries whenever ``world_size >= 2``.

    The caller's sequences are never modified: fresh lists are built instead
    of appending in place, so the same input object can be passed to several
    calls without tripping the length check on the second one.

    Args:
        worker_group: object exposing an integer ``world_size`` attribute.
        *args: length-2 sequences to expand.
        **kwargs: length-2 sequences to expand, keyed by parameter name.

    Returns:
        ``(args, kwargs)`` where ``args`` is a tuple of expanded lists and
        ``kwargs`` is a dict mapping each key to its expanded list.

    Raises:
        AssertionError: if any input does not have exactly two elements.
    """

    def _interleave(pair):
        # Copy first so the caller's object is left untouched, then extend
        # with pair[0], pair[1], pair[0], ... up to the worker count.
        assert len(pair) == 2
        extra = worker_group.world_size - 2
        return list(pair) + [pair[i % 2] for i in range(extra)]

    expanded_args = tuple(_interleave(arg) for arg in args)
    expanded_kwargs = {key: _interleave(val) for key, val in kwargs.items()}
    return expanded_args, expanded_kwargs


@ray.remote
class TestActor(Worker):
    """Worker used to demonstrate custom dispatch/collect and rank-zero execution.

    NOTE: ``Execute`` is not imported in this cell; it comes from the import in
    the earlier ``GPUAccumulatorDecorator`` cell.
    """

    # TODO: pass *args and **kwargs is bug prone and not very convincing
    def __init__(self, x) -> None:
        super().__init__()
        # Per-actor constant that every method below adds to its inputs.
        self._x = x

    def foo(self, y):
        """Return ``self._x + y`` (not registered with any dispatch mode)."""
        return self._x + y

    # ALL_TO_ALL dispatch with RANK_ZERO execution; the usage cell shows a single
    # value (not a list) coming back from this call.
    @register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.RANK_ZERO)
    def foo_rank_zero(self, x, y):
        """Return ``self._x + y + x``."""
        return self._x + y + x

    # Custom dispatch/collect pair: inputs go through two_to_all_dispatch_fn and
    # outputs are gathered with collect_all_to_all.
    @register(dispatch_mode={"dispatch_fn": two_to_all_dispatch_fn, "collect_fn": collect_all_to_all})
    def foo_custom(self, x, y):
        """Return ``self._x + y + x`` for this worker's share of the inputs."""
        return self._x + y + x

In [None]:
# Build TestActor workers (each constructed with x=2) on the original 4-GPU pool.
# NOTE: this rebinds class_with_args and worker_group from earlier cells.
class_with_args = RayClassWithInitArgs(cls=TestActor, x=2)
worker_group = RayWorkerGroup(resource_pool, class_with_args)

# Custom dispatch: the two-element inputs are interleaved across the 4 workers.
# Despite its name, output_ref holds the collected values (compared to plain ints below).
output_ref = worker_group.foo_custom(x=[1, 2], y=[5, 6])
assert output_ref == [8, 10, 8, 10]
# Worker 0: 2 + 5 + 1 = 8
# Worker 1: 2 + 6 + 2 = 10
# Worker 2: 2 + 5 + 1 = 8
# Worker 3: 2 + 6 + 2 = 10

# Rank-zero execution: a single value comes back instead of a per-worker list.
output_ref = worker_group.foo_rank_zero(x=1, y=2)
assert output_ref == 5
# Worker 0: 2 + 2 + 1 = 5
# Worker 0: 2 + 2 + 1 = 5



In [27]:
# Shut down the Ray instance that was initialised in the first cell.
ray.shutdown()