From cf813a9c031d6a5f70e10717d000362950e5f09f Mon Sep 17 00:00:00 2001
From: phantomlei <phantomlei3@gmail.com>
Date: Thu, 6 Nov 2025 20:47:32 +0800
Subject: [PATCH] feat: add timeout env variable for process groups.

---
 xllm/core/runtime/dit_engine.cpp | 12 ++++++++++--
 xllm/core/runtime/llm_engine.cpp | 12 ++++++++++--
 xllm/core/runtime/vlm_engine.cpp | 13 +++++++++++--
 xllm/core/util/env_var.cpp       | 32 ++++++++++++++++++++++++++++++++
 xllm/core/util/env_var.h         | 14 ++++++++++++++
 5 files changed, 77 insertions(+), 6 deletions(-)
 mode change 100755 => 100644 xllm/core/runtime/vlm_engine.cpp

diff --git a/xllm/core/runtime/dit_engine.cpp b/xllm/core/runtime/dit_engine.cpp
index 3d33e2e5b..a0c43caf6 100644
--- a/xllm/core/runtime/dit_engine.cpp
+++ b/xllm/core/runtime/dit_engine.cpp
@@ -22,6 +22,7 @@ limitations under the License.
 #include "core/common/metrics.h"
 #include "framework/parallel_state/parallel_args.h"
 #include "framework/parallel_state/parallel_state.h"
+#include "util/env_var.h"
 #include "util/timer.h"
 #include "worker.h"
 
@@ -60,8 +61,15 @@ DiTEngine::DiTEngine(const runtime::Options& options) : options_(options) {
     for (auto& worker : workers_) {
       futures.emplace_back(worker->process_group_test_async());
     }
-    // wait up to 4 seconds for all futures to complete
-    folly::collectAll(futures).within(std::chrono::seconds(4)).get();
+    // Wait for all futures to complete with a configurable timeout.
+    // The timeout can be adjusted via the
+    // XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4
+    // seconds). This is particularly important in multi-node multi-device
+    // scenarios where network latency may require a longer timeout period.
+    const int timeout_seconds = util::get_process_group_test_timeout_seconds();
+    folly::collectAll(futures)
+        .within(std::chrono::seconds(timeout_seconds))
+        .get();
   }
 }
 
diff --git a/xllm/core/runtime/llm_engine.cpp b/xllm/core/runtime/llm_engine.cpp
index b2eddb5c9..1d7da3444 100644
--- a/xllm/core/runtime/llm_engine.cpp
+++ b/xllm/core/runtime/llm_engine.cpp
@@ -36,6 +36,7 @@ limitations under the License.
 #include "llm_worker_impl.h"
 #include "runtime/worker.h"
 #include "server/xllm_server_registry.h"
+#include "util/env_var.h"
 #include "util/pretty_print.h"
 #include "util/utils.h"
 
@@ -106,8 +107,15 @@ void LLMEngine::process_group_test() {
     for (auto& worker : worker_clients_) {
       futures.emplace_back(worker->process_group_test_async());
     }
-    // wait up to 4 seconds for all futures to complete
-    folly::collectAll(futures).within(std::chrono::seconds(4)).get();
+    // Wait for all futures to complete with a configurable timeout.
+    // The timeout can be adjusted via the
+    // XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4
+    // seconds). This is particularly important in multi-node multi-device
+    // scenarios where network latency may require a longer timeout period.
+    const int timeout_seconds = util::get_process_group_test_timeout_seconds();
+    folly::collectAll(futures)
+        .within(std::chrono::seconds(timeout_seconds))
+        .get();
   }
 #endif
 }
diff --git a/xllm/core/runtime/vlm_engine.cpp b/xllm/core/runtime/vlm_engine.cpp
old mode 100755
new mode 100644
index 3a75a1803..29468641b
--- a/xllm/core/runtime/vlm_engine.cpp
+++ b/xllm/core/runtime/vlm_engine.cpp
@@ -27,6 +27,7 @@ limitations under the License.
 #include "framework/model/model_args.h"
 #include "framework/model_loader.h"
 #include "framework/parallel_state/parallel_state.h"
+#include "util/env_var.h"
 #include "util/pretty_print.h"
 #include "util/utils.h"
 #include "worker.h"
@@ -75,8 +76,16 @@ void VLMEngine::process_group_test() {
     for (auto& worker : workers_) {
       futures.emplace_back(worker->process_group_test_async());
     }
-    // wait up to 4 seconds for all futures to complete
-    folly::collectAll(futures).within(std::chrono::seconds(4)).get();
+    // Wait for all futures to complete with a configurable timeout.
+    // The timeout can be adjusted via the
+    // XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4
+    // seconds). This is particularly important in multi-node multi-device
+    // communication scenarios where network latency may require a longer
+    // timeout period.
+    const int timeout_seconds = util::get_process_group_test_timeout_seconds();
+    folly::collectAll(futures)
+        .within(std::chrono::seconds(timeout_seconds))
+        .get();
   }
 #endif
 }
diff --git a/xllm/core/util/env_var.cpp b/xllm/core/util/env_var.cpp
index 6aac81ef7..928b792dc 100644
--- a/xllm/core/util/env_var.cpp
+++ b/xllm/core/util/env_var.cpp
@@ -15,7 +15,9 @@ limitations under the License.
 
 #include "env_var.h"
 
+#include <climits>
 #include <cstdlib>
+#include <cstring>
 
 namespace xllm {
 namespace util {
@@ -30,5 +32,35 @@ bool get_bool_env(const std::string& key, bool defaultValue) {
           strVal == "True");
 }
 
+int get_int_env(const std::string& key, int defaultValue) {
+  const char* val = std::getenv(key.c_str());
+  if (val == nullptr) {
+    return defaultValue;
+  }
+  // Use strtol for proper error handling
+  char* endptr;
+  long int result = std::strtol(val, &endptr, 10);
+  // Check if conversion was successful (endptr points to end of string or valid
+  // terminator)
+  if (endptr == val || *endptr != '\0') {
+    return defaultValue;
+  }
+  // Check for overflow/underflow
+  if (result < INT_MIN || result > INT_MAX) {
+    return defaultValue;
+  }
+  return static_cast<int>(result);
+}
+
+int get_process_group_test_timeout_seconds() {
+  // Default timeout is 4 seconds, but can be overridden via environment
+  // variable to accommodate multi-node multi-device communication scenarios
+  // where network latency may require a longer timeout period.
+  constexpr int kDefaultTimeoutSeconds = 4;
+  constexpr const char* kTimeoutEnvVar =
+      "XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS";
+  return get_int_env(kTimeoutEnvVar, kDefaultTimeoutSeconds);
+}
+
 }  // namespace util
 }  // namespace xllm
diff --git a/xllm/core/util/env_var.h b/xllm/core/util/env_var.h
index cbd69d67f..c10a61fe7 100644
--- a/xllm/core/util/env_var.h
+++ b/xllm/core/util/env_var.h
@@ -22,5 +22,19 @@ namespace util {
 
 bool get_bool_env(const std::string& key, bool defaultValue);
 
+// Get an integer value from an environment variable.
+// Returns the default value if the environment variable is not set or cannot be
+// parsed.
+int get_int_env(const std::string& key, int defaultValue);
+
+// Get the timeout in seconds for process group test operations.
+// This timeout is used when waiting for process group initialization tests to
+// complete in multi-device/multi-node scenarios. The default value is 4
+// seconds, but can be overridden by setting the
+// XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable. This is
+// particularly useful in multi-node multi-device communication scenarios where
+// network latency may cause the default 4-second timeout to be insufficient.
+int get_process_group_test_timeout_seconds();
+
 }  // namespace util
 }  // namespace xllm