Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions xllm/core/runtime/dit_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ limitations under the License.
#include "core/common/metrics.h"
#include "framework/parallel_state/parallel_args.h"
#include "framework/parallel_state/parallel_state.h"
#include "util/env_var.h"
#include "util/timer.h"
#include "worker.h"

Expand Down Expand Up @@ -60,8 +61,15 @@ DiTEngine::DiTEngine(const runtime::Options& options) : options_(options) {
for (auto& worker : workers_) {
futures.emplace_back(worker->process_group_test_async());
}
// wait up to 4 seconds for all futures to complete
folly::collectAll(futures).within(std::chrono::seconds(4)).get();
// Wait for all futures to complete with a configurable timeout.
// The timeout can be adjusted via the
// XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4
// seconds). This is particularly important in multi-node multi-device
// scenarios where network latency may require a longer timeout period.
const int timeout_seconds = util::get_process_group_test_timeout_seconds();
folly::collectAll(futures)
.within(std::chrono::seconds(timeout_seconds))
.get();
}
}

Expand Down
12 changes: 10 additions & 2 deletions xllm/core/runtime/llm_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ limitations under the License.
#include "llm_worker_impl.h"
#include "runtime/worker.h"
#include "server/xllm_server_registry.h"
#include "util/env_var.h"
#include "util/pretty_print.h"
#include "util/utils.h"

Expand Down Expand Up @@ -106,8 +107,15 @@ void LLMEngine::process_group_test() {
for (auto& worker : worker_clients_) {
futures.emplace_back(worker->process_group_test_async());
}
// wait up to 4 seconds for all futures to complete
folly::collectAll(futures).within(std::chrono::seconds(4)).get();
// Wait for all futures to complete with a configurable timeout.
// The timeout can be adjusted via the
// XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4
// seconds). This is particularly important in multi-node multi-device
// scenarios where network latency may require a longer timeout period.
const int timeout_seconds = util::get_process_group_test_timeout_seconds();
folly::collectAll(futures)
.within(std::chrono::seconds(timeout_seconds))
.get();
}
#endif
}
Expand Down
13 changes: 11 additions & 2 deletions xllm/core/runtime/vlm_engine.cpp
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ limitations under the License.
#include "framework/model/model_args.h"
#include "framework/model_loader.h"
#include "framework/parallel_state/parallel_state.h"
#include "util/env_var.h"
#include "util/pretty_print.h"
#include "util/utils.h"
#include "worker.h"
Expand Down Expand Up @@ -75,8 +76,16 @@ void VLMEngine::process_group_test() {
for (auto& worker : workers_) {
futures.emplace_back(worker->process_group_test_async());
}
// wait up to 4 seconds for all futures to complete
folly::collectAll(futures).within(std::chrono::seconds(4)).get();
// Wait for all futures to complete with a configurable timeout.
// The timeout can be adjusted via the
// XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable (default: 4
// seconds). This is particularly important in multi-node multi-device
// communication scenarios where network latency may require a longer
// timeout period.
const int timeout_seconds = util::get_process_group_test_timeout_seconds();
folly::collectAll(futures)
.within(std::chrono::seconds(timeout_seconds))
.get();
}
#endif
}
Expand Down
32 changes: 32 additions & 0 deletions xllm/core/util/env_var.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@ limitations under the License.

#include "env_var.h"

#include <climits>
#include <cstdlib>
#include <cstring>

namespace xllm {
namespace util {
Expand All @@ -30,5 +32,35 @@ bool get_bool_env(const std::string& key, bool defaultValue) {
strVal == "True");
}

// Reads the environment variable `key` and parses it as a base-10 integer.
//
// Returns `defaultValue` when the variable is unset, when its value is not
// entirely a valid integer (empty, non-numeric, or with trailing characters),
// or when the parsed value does not fit in an int.
int get_int_env(const std::string& key, int defaultValue) {
  const char* val = std::getenv(key.c_str());
  if (val == nullptr) {
    return defaultValue;
  }
  // Parse as long long rather than long: on platforms where long has the same
  // width as int, strtol clamps an overflowing input to LONG_MAX/LONG_MIN,
  // which would slip through the range check below. With strtoll, any value
  // beyond the int range (including one clamped to LLONG_MAX/LLONG_MIN) is
  // still detectable by comparing against INT_MIN/INT_MAX.
  char* endptr = nullptr;
  const long long result = std::strtoll(val, &endptr, 10);
  // Reject input where no digits were consumed or where unparsed characters
  // remain after the number.
  if (endptr == val || *endptr != '\0') {
    return defaultValue;
  }
  // Reject values that cannot be represented as an int.
  if (result < INT_MIN || result > INT_MAX) {
    return defaultValue;
  }
  return static_cast<int>(result);
}

// Returns the timeout, in seconds, to use when waiting for the process group
// test futures to complete.
//
// The value is read from the XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS
// environment variable so it can be raised in multi-node multi-device
// scenarios where network latency may require a longer timeout period. The
// default of 4 seconds is returned when the variable is unset, cannot be
// parsed as an int, or is not a positive number.
int get_process_group_test_timeout_seconds() {
  constexpr int kDefaultTimeoutSeconds = 4;
  constexpr const char* kTimeoutEnvVar =
      "XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS";
  const int timeout_seconds =
      get_int_env(kTimeoutEnvVar, kDefaultTimeoutSeconds);
  // Callers pass this value straight into a timed wait; a zero or negative
  // duration is not a meaningful bound there, so treat it as a
  // misconfiguration and keep the default instead.
  return timeout_seconds > 0 ? timeout_seconds : kDefaultTimeoutSeconds;
}

} // namespace util
} // namespace xllm
14 changes: 14 additions & 0 deletions xllm/core/util/env_var.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,19 @@ namespace util {

bool get_bool_env(const std::string& key, bool defaultValue);

// Get an integer value from an environment variable.
// The value is parsed as a base-10 integer and must make up the whole string.
// Returns defaultValue if the environment variable is not set, cannot be
// parsed in full (e.g. it is empty or has trailing characters), or parses to
// a value outside the range of int.
int get_int_env(const std::string& key, int defaultValue);

// Get the timeout in seconds for process group test operations.
// The engines use this value as the upper bound when waiting for the
// asynchronous process group tests of all workers to complete. The default
// is 4 seconds; it can be overridden by setting the
// XLLM_PROCESS_GROUP_ASYNC_TIMEOUT_SECONDS environment variable to an
// integer number of seconds. This is particularly useful in multi-node
// multi-device communication scenarios where network latency may cause the
// default 4-second timeout to be insufficient.
int get_process_group_test_timeout_seconds();

} // namespace util
} // namespace xllm