NVMegatronRayWorkerGroup

由于 Ray 的限制，目前 RayResourcePool 只支持 max_colocate_count=1，也就是说每个 GPU 上只能运行一个进程。因此，我们需要重启 Ray 并初始化一个新的 resource_pool 来演示 NVMegatronRayWorkerGroup。

下面我们借助 NVMegatronRayWorkerGroup 实现一个 worker：它先初始化 Megatron，然后运行一个按张量并行（tp）切分的 Llama MLP 层。这里我们使用了一种较为复杂的调度模式 MEGATRON_COMPUTE。这种调度模式假设用户传入的数据已经按 DP 维度划分：数据会被分发到同一 dp 组内的所有 tp/pp 进程，最终只收集 tp=0 且位于最后一个 pp 阶段的进程的输出。这样，对于只在 driver（驱动进程）上编写代码的用户来说，RPC 背后的 Megatron 就是透明的。

In [1]:
import warnings

# The filter is installed before ray/torch are imported, presumably so that
# warnings raised while importing them are silenced as well.
warnings.filterwarnings("ignore")
import ray
import torch

# Start Ray; the working directory is packaged and uploaded to the cluster.
ray.init(runtime_env={"working_dir": "/data2/zzd/rl_llm/verl"})

2025-04-29 11:47:03,316	INFO worker.py:1832 -- Started a local Ray instance. View the dashboard at 127.0.0.1:8265
2025-04-29 11:47:03,677	INFO packaging.py:575 -- Creating a file package for local module '/data2/zzd/rl_llm/verl'.
2025-04-29 11:47:04,120	INFO packaging.py:367 -- Pushing file package 'gcs://_ray_pkg_7d8903d1395f05be.zip' (13.73MiB) to Ray cluster...
2025-04-29 11:47:04,177	INFO packaging.py:380 -- Successfully pushed file package 'gcs://_ray_pkg_7d8903d1395f05be.zip'.


0,1
Python version:,3.10.16
Ray version:,2.43.0
Dashboard:,http://127.0.0.1:8265


(MLPLayerWorker pid=2512825) [2025-04-29 11:47:21,452] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)




In [None]:
import sys
# Add this directory to the module search path ahead of the `verl.*` imports
# below. NOTE(review): same hard-coded path as the Ray `working_dir` — keep
# the two in sync when running on another machine.
sys.path.append("/data2/zzd/rl_llm/verl")

import os
from megatron.core import parallel_state as mpu
from megatron.core import tensor_parallel
from megatron.core import ModelParallelConfig
from omegaconf import OmegaConf

from verl.single_controller.base.decorator import Dispatch, register
from verl.single_controller.base.megatron.worker import MegatronWorker
from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool
from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
from verl.models.llama.megatron.layers import ParallelLlamaMLP
from verl.utils.torch_dtypes import PrecisionType

In [3]:
# GPU-backed pool with at most one process per GPU (max_colocate_count=1).
# NOTE(review): `[4]` presumably requests 4 worker processes on a single node
# (the worker group built from this pool reports world_size 4) — confirm.
resource_pool = RayResourcePool(
    [4],
    use_gpu=True,
    max_colocate_count=1,
)

In [None]:
@ray.remote
class MLPLayerWorker(MegatronWorker):
    """Ray remote worker holding one tensor-parallel shard of a Llama MLP layer.

    Construction initialises the default ``torch.distributed`` process group
    with the NCCL backend and sets up Megatron model parallelism with a
    tensor-parallel size of 4 and pipeline/context/expert sizes of 1. The
    layer itself is created later by ``init_model``.
    """

    def __init__(self):
        super().__init__()
        # LOCAL_RANK must be present in the environment; it selects the CUDA device.
        local_rank = int(os.environ["LOCAL_RANK"])
        torch.distributed.init_process_group(backend="nccl")
        torch.cuda.set_device(local_rank)

        parallel_layout = dict(
            tensor_model_parallel_size=4,
            pipeline_model_parallel_size=1,
            virtual_pipeline_model_parallel_size=None,
            pipeline_model_parallel_split_rank=None,
            use_sharp=False,
            context_parallel_size=1,
            expert_model_parallel_size=1,
            nccl_communicator_config_path=None,
        )
        mpu.initialize_model_parallel(**parallel_layout)
        # Fixed seed (10) for Megatron's model-parallel CUDA RNG.
        tensor_parallel.model_parallel_cuda_manual_seed(10)

    @register(Dispatch.ONE_TO_ALL)
    def init_model(self, config):
        """Create the ``ParallelLlamaMLP`` layer and store it on the worker.

        The Megatron config mirrors the parallel sizes that were set up in
        ``__init__`` and uses fp32 for both parameters and pipeline tensors.

        Args:
            config: Layer configuration, passed unchanged to ``ParallelLlamaMLP``.
        """
        fp32 = PrecisionType.to_dtype("fp32")
        megatron_config = ModelParallelConfig(
            tensor_model_parallel_size=mpu.get_tensor_model_parallel_world_size(),
            pipeline_model_parallel_size=mpu.get_pipeline_model_parallel_world_size(),
            virtual_pipeline_model_parallel_size=mpu.get_virtual_pipeline_model_parallel_world_size(),
            sequence_parallel=False,
            params_dtype=fp32,
            pipeline_dtype=fp32,
        )
        self.parallel_layer = ParallelLlamaMLP(
            config=config,
            megatron_config=megatron_config,
        )

    @register(Dispatch.ONE_TO_ALL)
    def get_weights(self):
        """Return a dict mapping parameter names to the layer's parameters."""
        return dict(self.parallel_layer.named_parameters())

    @register(Dispatch.MEGATRON_COMPUTE)
    def run_layer(self, x):
        """Move ``x`` to the CUDA device and return the layer's output for it.

        Args:
            x: Input tensor for the MLP layer.

        Returns:
            The result of calling the parallel layer on the device copy of ``x``.
        """
        device_input = x.to("cuda")
        return self.parallel_layer(device_input)

In [5]:
# Wrap the worker class (no constructor arguments are supplied) and spawn the
# worker group on the resource pool created earlier.
layer_cls = RayClassWithInitArgs(cls=MLPLayerWorker)
layer_worker_group = NVMegatronRayWorkerGroup(
    resource_pool=resource_pool,
    ray_cls_with_init=layer_cls,
)



In [6]:
# Report the group's world size followed by its tp / pp / dp sizes.
print(
    layer_worker_group.world_size,
    layer_worker_group.tp_size,
    layer_worker_group.pp_size,
    layer_worker_group.dp_size,
)

4 4 1 1


In [7]:
# Layer configuration handed to every worker; "tp" is taken from the worker
# group so it matches the tensor-parallel size the workers actually use.
config = OmegaConf.create({
    "hidden_size": 4096,
    "intermediate_size": 11008,
    "hidden_act": "silu",
    "pretraining_tp": 1,
    "tp": layer_worker_group.tp_size,
})
# Build the layer on all workers; the call yields one result per worker.
layer_worker_group.init_model(config)

[None, None, None, None]

In [8]:
batch_size, seq_len = 16, 2048
# The last dimension (4096) matches "hidden_size" in the layer config.
x = torch.rand(batch_size, seq_len, 4096, dtype=torch.float32)
# The argument must be a list with one entry per data-parallel (dp) group;
# dp_size is 1 here, so the list has exactly one element.
output = layer_worker_group.run_layer([x])
print(output[0].shape)

torch.Size([16, 2048, 4096])


In [9]:
# Shut down the Ray instance that was started by `ray.init` at the top of
# this notebook.
ray.shutdown()