From cf813a9c031d6a5f70e10717d000362950e5f09f Mon Sep 17 00:00:00 2001 From: phantomlei Date: Thu, 6 Nov 2025 20:47:32 +0800 Subject: [PATCH] feat: add timeout env variable for process groups. --- xllm/core/runtime/dit_engine.cpp | 12 ++++++++++-- xllm/core/runtime/llm_engine.cpp | 12 ++++++++++-- xllm/core/runtime/vlm_engine.cpp | 13 +++++++++++-- xllm/core/util/env_var.cpp | 32 ++++++++++++++++++++++++++++++++ xllm/core/util/env_var.h | 14 ++++++++++++++ 5 files changed, 77 insertions(+), 6 deletions(-) mode change 100755 => 100644 xllm/core/runtime/vlm_engine.cpp diff --git a/xllm/core/runtime/dit_engine.cpp b/xllm/core/runtime/dit_engine.cpp index 3d33e2e5b..a0c43caf6 100644 --- a/xllm/core/runtime/dit_engine.cpp +++ b/xllm/core/runtime/dit_engine.cpp @@ -22,6 +22,7 @@ limitations under the License. #include "core/common/metrics.h" #include "framework/parallel_state/parallel_args.h" #include "framework/parallel_state/parallel_state.h" +#include "util/env_var.h" #include "util/timer.h" #include "worker.h" @@ -60,8 +61,15 @@ DiTEngine::DiTEngine(const runtime::Options& options) : options_(options) { for (auto& worker : workers_) { futures.emplace_back(worker->process_group_test_async()); } - // wait up to 4 seconds for all futures to complete - folly::collectAll(futures).within(std::chrono::seconds(4)).get(); + // Wait for all futures to complete with a configurable timeout. + // The timeout can be adjusted via the + // XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4 + // seconds). This is particularly important in multi-node multi-device + // scenarios where network latency may require a longer timeout period. + const int timeout_seconds = util::get_process_group_test_timeout_seconds(); + folly::collectAll(futures) + .within(std::chrono::seconds(timeout_seconds)) + .get(); } } diff --git a/xllm/core/runtime/llm_engine.cpp b/xllm/core/runtime/llm_engine.cpp index b2eddb5c9..1d7da3444 100644 --- a/xllm/core/runtime/llm_engine.cpp +++ b/xllm/core/runtime/llm_engine.cpp @@ -36,6 +36,7 @@ limitations under the License. #include "llm_worker_impl.h" #include "runtime/worker.h" #include "server/xllm_server_registry.h" +#include "util/env_var.h" #include "util/pretty_print.h" #include "util/utils.h" @@ -106,8 +107,15 @@ void LLMEngine::process_group_test() { for (auto& worker : worker_clients_) { futures.emplace_back(worker->process_group_test_async()); } - // wait up to 4 seconds for all futures to complete - folly::collectAll(futures).within(std::chrono::seconds(4)).get(); + // Wait for all futures to complete with a configurable timeout. + // The timeout can be adjusted via the + // XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4 + // seconds). This is particularly important in multi-node multi-device + // scenarios where network latency may require a longer timeout period. + const int timeout_seconds = util::get_process_group_test_timeout_seconds(); + folly::collectAll(futures) + .within(std::chrono::seconds(timeout_seconds)) + .get(); } #endif } diff --git a/xllm/core/runtime/vlm_engine.cpp b/xllm/core/runtime/vlm_engine.cpp old mode 100755 new mode 100644 index 3a75a1803..29468641b --- a/xllm/core/runtime/vlm_engine.cpp +++ b/xllm/core/runtime/vlm_engine.cpp @@ -27,6 +27,7 @@ limitations under the License. #include "framework/model/model_args.h" #include "framework/model_loader.h" #include "framework/parallel_state/parallel_state.h" +#include "util/env_var.h" #include "util/pretty_print.h" #include "util/utils.h" #include "worker.h" @@ -75,8 +76,16 @@ void VLMEngine::process_group_test() { for (auto& worker : workers_) { futures.emplace_back(worker->process_group_test_async()); } - // wait up to 4 seconds for all futures to complete - folly::collectAll(futures).within(std::chrono::seconds(4)).get(); + // Wait for all futures to complete with a configurable timeout. + // The timeout can be adjusted via the + // XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4 + // seconds). This is particularly important in multi-node multi-device + // communication scenarios where network latency may require a longer + // timeout period. + const int timeout_seconds = util::get_process_group_test_timeout_seconds(); + folly::collectAll(futures) + .within(std::chrono::seconds(timeout_seconds)) + .get(); } #endif } diff --git a/xllm/core/util/env_var.cpp b/xllm/core/util/env_var.cpp index 6aac81ef7..928b792dc 100644 --- a/xllm/core/util/env_var.cpp +++ b/xllm/core/util/env_var.cpp @@ -15,7 +15,9 @@ limitations under the License. #include "env_var.h" +#include #include +#include namespace xllm { namespace util { @@ -30,5 +32,35 @@ bool get_bool_env(const std::string& key, bool defaultValue) { strVal == "True"); } +int get_int_env(const std::string& key, int defaultValue) { + const char* val = std::getenv(key.c_str()); + if (val == nullptr) { + return defaultValue; + } + // Use strtol for proper error handling + char* endptr; + long int result = std::strtol(val, &endptr, 10); + // Check if conversion was successful (endptr points to end of string or valid + // terminator) + if (endptr == val || *endptr != '\0') { + return defaultValue; + } + // Check for overflow/underflow + if (result < INT_MIN || result > INT_MAX) { + return defaultValue; + } + return static_cast(result); +} + +int get_process_group_test_timeout_seconds() { + // Default timeout is 4 seconds, but can be overridden via environment + // variable to accommodate multi-node multi-device communication scenarios + // where network latency may require a longer timeout period. + constexpr int kDefaultTimeoutSeconds = 4; + constexpr const char* kTimeoutEnvVar = + "XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS"; + return get_int_env(kTimeoutEnvVar, kDefaultTimeoutSeconds); +} + } // namespace util } // namespace xllm diff --git a/xllm/core/util/env_var.h b/xllm/core/util/env_var.h index cbd69d67f..c10a61fe7 100644 --- a/xllm/core/util/env_var.h +++ b/xllm/core/util/env_var.h @@ -22,5 +22,19 @@ namespace util { bool get_bool_env(const std::string& key, bool defaultValue); +// Get an integer value from an environment variable. +// Returns the default value if the environment variable is not set or cannot be +// parsed. +int get_int_env(const std::string& key, int defaultValue); + +// Get the timeout in seconds for process group test operations. +// This timeout is used when waiting for process group initialization tests to +// complete in multi-device/multi-node scenarios. The default value is 4 +// seconds, but can be overridden by setting the +// XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable. This is +// particularly useful in multi-node multi-device communication scenarios where +// network latency may cause the default 4-second timeout to be insufficient. +int get_process_group_test_timeout_seconds(); + } // namespace util } // namespace xllm