jd-opensource · liutongxuan · Oct 24, 2025 · Oct 22, 2025 · XuZhang99 · Oct 23, 2025
diff --git a/examples/generate.py b/examples/generate.py
@@ -1,4 +1,5 @@
-# python examples/generate.py --model='/path/models/Qwen2-7B-Instruct' --devices='npu:0' 
+# python examples/generate.py --model='/path/models/Qwen2-7B-Instruct' --devices='npu:0'
+# python generate.py --model='/path/models/Qwen2-7B-Instruct' --devices='npu:0,npu:1'
 
 from xllm import ArgumentParser, LLM, RequestParams
 

diff --git a/examples/generate_vlm.py b/examples/generate_vlm.py
@@ -1,4 +1,5 @@
-# python examples/generate_vlm.py --model='/path/models/Qwen2.5-VL-7B' --devices='npu:0' --master_node_addr=127.0.0.1:8888
+# python examples/generate_vlm.py --model='/path/models/Qwen2.5-VL-7B' --devices='npu:0'
+# python generate_vlm.py --model='/path/models/Qwen2.5-VL-7B' --devices='npu:0,npu:1'
 
 import os
 import signal

diff --git a/setup.py b/setup.py
@@ -610,7 +610,8 @@ def apply_patch():
                 },
         zip_safe=False,
         py_modules=["xllm/launch_xllm", "xllm/__init__",
-                    "xllm/pybind/llm", "xllm/pybind/vlm", "xllm/pybind/args"],
+                    "xllm/pybind/llm", "xllm/pybind/vlm",
+                    "xllm/pybind/util", "xllm/pybind/args"],
         entry_points={
             'console_scripts': [
                 'xllm = xllm.launch_xllm:launch_xllm'

diff --git a/xllm/core/common/options.h b/xllm/core/common/options.h
@@ -170,6 +170,11 @@ class Options {
   PROPERTY(int, max_requests_per_batch) = 0;
 
   PROPERTY(bool, enable_continuous_kvcache) = false;
+
+  // Offline inference: enables offline-inference mode (default: false).
+  PROPERTY(bool, enable_offline_inference) = false;
+  // Offline inference: path to the spawn worker binary.
+  PROPERTY(std::string, spawn_worker_path) = "";
 };
 
 }  // namespace xllm
diff --git a/xllm/core/distributed_runtime/CMakeLists.txt b/xllm/core/distributed_runtime/CMakeLists.txt
@@ -4,6 +4,8 @@ if(USE_NPU)
   include_directories(
     ${CMAKE_SOURCE_DIR}/third_party/spdlog/include
   )
+
+  add_subdirectory(spawn_worker_server)
 endif()
 
 cc_library(

diff --git a/xllm/core/distributed_runtime/dist_manager.cpp b/xllm/core/distributed_runtime/dist_manager.cpp
@@ -141,6 +141,9 @@ void DistManager::setup_multi_node_workers(
     // Node2: 0+4, 1+4, 2+4, 3+4
     const int32_t rank = static_cast<int32_t>(i) + base_rank;
 
+    // For an offline inference task on multiple devices (gpu/npu/mpu/...),
+    // every worker except the first (i > 0) is launched as a spawned process.
+    bool use_spawn_worker = options.enable_offline_inference() && i > 0;
     ParallelArgs parallel_args(rank, world_size, dp_size, nullptr, ep_size);
     servers_.emplace_back(std::make_unique<WorkerServer>(i,
                                                          master_node_addr,
@@ -149,7 +152,8 @@ void DistManager::setup_multi_node_workers(
                                                          parallel_args,
                                                          devices[i],
                                                          worker_server_options,
-                                                         worker_type));
+                                                         worker_type,
+                                                         use_spawn_worker));
   }
 
   // Master node need to wait all workers done

diff --git a/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt b/xllm/core/distributed_runtime/spawn_worker_server/CMakeLists.txt
@@ -0,0 +1,24 @@
+# Build rules for the standalone spawn_worker executable. The parent
+# CMakeLists.txt adds this directory only inside its if(USE_NPU) branch.
+include(cc_binary)
+
+# spawn_worker: executable built from the SpawnWorkerServer implementation
+# (spawn_worker_server.cpp) and its main() entry point
+# (spawn_worker_server_process.cpp).
+cc_binary(
+  NAME 
+    spawn_worker
+  HDRS
+    spawn_worker_server.h
+  SRCS
+    spawn_worker_server.cpp
+    spawn_worker_server_process.cpp
+  DEPS
+    :models
+    :model
+    :distributed_runtime
+    absl::strings
+    xllm_kernels
+    ascendcl
+    nnopbase
+    atb
+    c_sec
+    spdlog::spdlog
+)
+
+# Building export_module also builds spawn_worker first.
+add_dependencies(export_module spawn_worker)
diff --git a/xllm/core/distributed_runtime/spawn_worker_server/spawn_worker_server.cpp b/xllm/core/distributed_runtime/spawn_worker_server/spawn_worker_server.cpp
@@ -0,0 +1,84 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "spawn_worker_server.h"
+
+#include <absl/strings/str_split.h>
+#if defined(USE_NPU)
+#include <acl/acl.h>
+#endif
+#include <signal.h>
+#include <sys/prctl.h>
+
+#include <cstdlib>
+
+#include "core/distributed_runtime/worker_server.h"
+#include "core/platform/device.h"
+#include "core/runtime/options.h"
+
+namespace xllm {
+
+// Keep-running flag polled by run(); starts as true and is cleared by
+// handle_signal().
+bool SpawnWorkerServer::g_running_ = true;
+
+// Sets up a worker inside the spawned process.
+//
+// Builds the runtime options from the values forwarded by the engine
+// process, mirrors some of them into the global FLAGS_*, selects the NPU
+// device (USE_NPU builds) and constructs a WorkerServer for the given ranks.
+//
+// master_node_addr:    address of the master node, forwarded as-is.
+// local_rank:          worker index passed to WorkerServer.
+// global_rank:         rank passed to ParallelArgs.
+// world_size:          world size passed to ParallelArgs.
+// device_idx:          index used to build the "npu:<idx>" device name.
+// num_decoding_tokens: forwarded into the runtime options.
+// block_size:          forwarded into the runtime options and FLAGS_block_size.
+SpawnWorkerServer::SpawnWorkerServer(const std::string& master_node_addr,
+                                     int local_rank,
+                                     int global_rank,
+                                     int world_size,
+                                     int device_idx,
+                                     int num_decoding_tokens,
+                                     int block_size) {
+  // TODO: pass whole xllm::runtime::Options here from main process.
+  xllm::runtime::Options worker_options;
+  worker_options.block_size(block_size)
+      .num_decoding_tokens(num_decoding_tokens)
+      .enable_schedule_overlap(false)
+      .enable_offline_inference(true)
+      .master_node_addr(master_node_addr);
+
+  // Keep the process-wide flags in sync with the options built above.
+  FLAGS_enable_schedule_overlap = false;
+  FLAGS_master_node_addr = master_node_addr;
+  FLAGS_block_size = block_size;
+
+  // NOTE(review): `worker_done` and `server` below are locals, so both are
+  // destroyed when this constructor returns -- confirm this matches the
+  // lifetime WorkerServer expects.
+  std::atomic<bool> worker_done{false};
+#if defined(USE_NPU)
+  // Select the NPU identified by device_idx and initialise its context.
+  xllm::Device device("npu:" + std::to_string(device_idx));
+  device.set_device();
+  device.init_device_context();
+  FLAGS_enable_atb_comm_multiprocess = true;
+#endif
+
+  // NOTE(review): `device` is only declared when USE_NPU is defined, yet it
+  // is used unconditionally below.
+  // The third and fifth ParallelArgs arguments are fixed to 1 here.
+  constexpr int kDpSize = 1;
+  constexpr int kEpSize = 1;
+  ParallelArgs worker_parallel_args(
+      global_rank, world_size, kDpSize, nullptr, kEpSize);
+  WorkerServer server(local_rank,
+                      master_node_addr,
+                      worker_done,
+                      worker_parallel_args,
+                      device,
+                      worker_options,
+                      WorkerType::LLM,
+                      /*use_spawn_worker=*/false);
+}
+
+// Signal handler installed by run(): clears g_running_ so the wait loop in
+// run() exits. The signal number itself is not inspected.
+// NOTE(review): g_running_ is a plain bool written from a signal handler;
+// consider volatile std::sig_atomic_t or a lock-free atomic.
+void SpawnWorkerServer::handle_signal(int signum) {
+  SpawnWorkerServer::g_running_ = false;
+}
+
+// Registers handle_signal for SIGINT and SIGTERM, then keeps the calling
+// thread parked until g_running_ becomes false.
+void SpawnWorkerServer::run() {
+  signal(SIGINT, handle_signal);
+  signal(SIGTERM, handle_signal);
+
+  // The flag is re-checked before every 5-second sleep.
+  for (;;) {
+    if (!g_running_) {
+      break;
+    }
+    sleep(5);
+  }
+}
+
+}  // namespace xllm
diff --git a/xllm/core/distributed_runtime/spawn_worker_server/spawn_worker_server.h b/xllm/core/distributed_runtime/spawn_worker_server/spawn_worker_server.h
@@ -0,0 +1,41 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#pragma once
+
+#include <string>
+
+namespace xllm {
+
+// Server object hosted by the spawn_worker process: the constructor sets up
+// the worker from values forwarded by the engine process, and run() keeps
+// the process alive until a termination signal is handled.
+class SpawnWorkerServer final {
+ public:
+  // Keep-running flag: run() loops while it is true; handle_signal() sets it
+  // to false.
+  static bool g_running_;
+
+  // Signal handler that clears g_running_.
+  static void handle_signal(int signum);
+
+  // Arguments correspond one-to-one to the positional command-line arguments
+  // parsed by the spawn_worker main().
+  explicit SpawnWorkerServer(const std::string& master_node_addr,
+                             int local_rank,
+                             int global_rank,
+                             int world_size,
+                             int device_idx,
+                             int num_decoding_tokens,
+                             int block_size);
+
+  ~SpawnWorkerServer() = default;
+
+  // Installs the signal handlers and waits until g_running_ is cleared.
+  void run();
+};
+
+}  // namespace xllm
diff --git a/xllm/core/distributed_runtime/spawn_worker_server/spawn_worker_server_process.cpp b/xllm/core/distributed_runtime/spawn_worker_server/spawn_worker_server_process.cpp
@@ -0,0 +1,73 @@
+/* Copyright 2025 The xLLM Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    https://github.com/jd-opensource/xllm/blob/main/LICENSE
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <signal.h>
+#include <sys/prctl.h>
+
+#include "spawn_worker_server.h"
+
+// Entry point of the spawn_worker binary.
+//
+// Positional arguments supplied by the engine process. argv[0] is the
+// program name, so argc must be at least 8:
+//   argv[1] master_node_addr
+//   argv[2] local_rank
+//   argv[3] global_rank
+//   argv[4] world_size
+//   argv[5] device_idx
+//   argv[6] num_decoding_tokens
+//   argv[7] block_size
+//
+// Returns 1 when too few arguments are given, EXIT_FAILURE when prctl()
+// fails, and 0 after SpawnWorkerServer::run() returns.
+int main(int argc, char* argv[]) {
+  // Number of positional arguments expected after argv[0].
+  constexpr int kNumWorkerArgs = 7;
+  // argv[kNumWorkerArgs] (block_size) is read below, so argc must be at least
+  // kNumWorkerArgs + 1; otherwise atoi() would be handed the null argv[argc].
+  if (argc < kNumWorkerArgs + 1) {
+    LOG(ERROR) << "Spawn worker process received wrong args. Need "
+               << kNumWorkerArgs << " args, received " << (argc - 1);
+    return 1;
+  }
+
+  // Ask the kernel to deliver SIGHUP to this process when its parent exits,
+  // so the worker does not outlive the engine process.
+  if (prctl(PR_SET_PDEATHSIG, SIGHUP) == -1) {
+    perror("prctl");
+    return EXIT_FAILURE;
+  }
+
+  std::string master_node_addr = std::string(argv[1]);
+  int local_rank = atoi(argv[2]);
+  int global_rank = atoi(argv[3]);
+  int world_size = atoi(argv[4]);
+  int device_idx = atoi(argv[5]);
+  int num_decoding_tokens = atoi(argv[6]);
+  int block_size = atoi(argv[7]);
+
+  LOG(INFO) << "Spawn worker: "
+            << "master_node_addr = " << master_node_addr
+            << ", local_rank = " << local_rank
+            << ", global_rank = " << global_rank
+            << ", world_size = " << world_size
+            << ", device_idx = " << device_idx
+            << ", num_decoding_tokens = " << num_decoding_tokens
+            << ", block_size = " << block_size << "\n";
+
+  xllm::SpawnWorkerServer worker(master_node_addr,
+                                 local_rank,
+                                 global_rank,
+                                 world_size,
+                                 device_idx,
+                                 num_decoding_tokens,
+                                 block_size);
+
+  // Blocks until a handled termination signal clears the running flag.
+  worker.run();
+
+  return 0;
+}