From 77c56ff55e3a71047a487950d81f78c3b40e409a Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Mon, 10 Feb 2025 16:04:52 +0700
Subject: [PATCH 1/2] fix: add default cpu_threads

---
 engine/services/model_service.cc             | 25 +++++++++++++++++++
 engine/services/model_service.h              |  2 ++
 engine/utils/hardware/gguf/gguf_file.h       | 12 ++++++---
 .../utils/hardware/gguf/gguf_file_estimate.h |  2 +-
 4 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc
index 6dc1642fb..496646758 100644
--- a/engine/services/model_service.cc
+++ b/engine/services/model_service.cc
@@ -939,6 +939,11 @@ cpp::result ModelService::StartModel(
 
   json_helper::MergeJson(json_data, params_override);
 
+  // Set default cpu_threads if it is not configured
+  if (!json_data.isMember("cpu_threads")) {
+    json_data["cpu_threads"] = GetCpuThreads();
+  }
+
   // Set the latest ctx_len
   if (ctx_len) {
     json_data["ctx_len"] =
@@ -1321,6 +1326,26 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
   return warning;
 }
 
+int ModelService::GetCpuThreads() const {
+  auto hw_thread_num = std::thread::hardware_concurrency();
+  auto default_engine = engine_svc_->GetDefaultEngineVariant(kLlamaEngine);
+  auto is_gpu_mode = false;
+  if (default_engine.has_error()) {
+    CTL_INF("Could not get default engine");
+  } else {
+    auto& de = default_engine.value();
+    is_gpu_mode = (de.variant.find("cuda") != std::string::npos) ||
+                  (de.variant.find("vulkan") != std::string::npos) ||
+                  (de.variant.find("mac") != std::string::npos);
+  }
+
+  if (is_gpu_mode) {
+    return std::max(hw_thread_num / 4, 1u);
+  } else {
+    return std::max(hw_thread_num / 2, 1u);
+  }
+}
+
 cpp::result, std::string>
 ModelService::GetModelMetadata(const std::string& model_id) const {
   if (model_id.empty()) {
diff --git a/engine/services/model_service.h b/engine/services/model_service.h
index 17f2c0ddb..dcf99430f 100644
--- a/engine/services/model_service.h
+++ b/engine/services/model_service.h
@@ -112,6 +112,8 @@ class ModelService {
       const std::string& model_path, int ngl, int ctx_len, int n_batch = 2048,
       int n_ubatch = 2048, const std::string& kv_cache_type = "f16");
 
+  int GetCpuThreads() const;
+
   std::shared_ptr db_service_;
   std::shared_ptr hw_service_;
   std::shared_ptr download_service_;
diff --git a/engine/utils/hardware/gguf/gguf_file.h b/engine/utils/hardware/gguf/gguf_file.h
index 361668242..640c1b49f 100644
--- a/engine/utils/hardware/gguf/gguf_file.h
+++ b/engine/utils/hardware/gguf/gguf_file.h
@@ -7,11 +7,11 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
-#include
 
 #ifdef _WIN32
 #include
 #include
 #else
 #include
 #include
 #include
 #endif
 
 #include "ggml.h"
-#include "utils/string_utils.h"
 #include "utils/logging_utils.h"
+#include "utils/string_utils.h"
 
 // #define GGUF_LOG(msg) \
 //   do { \
@@ -246,11 +246,15 @@ struct GGUFHelper {
     file_size = std::filesystem::file_size(file_path);
 
     int fd = open(file_path.c_str(), O_RDONLY);
+    if (fd == -1) {
+      CTL_INF("Failed to open file: " << file_path << ", error: " << errno);
+      return false;
+    }
     // Memory-map the file
     data = static_cast(
         mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0));
     if (data == MAP_FAILED) {
-      perror("Error mapping file");
+      CTL_INF("Error mapping file");
       close(fd);
       return false;
     }
@@ -482,7 +486,7 @@ struct GGUFFile {
 inline std::optional ParseGgufFile(const std::string& path) {
   GGUFFile gf;
   GGUFHelper h;
-  if(!h.OpenAndMMap(path)) {
+  if (!h.OpenAndMMap(path)) {
     return std::nullopt;
   }
diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h
index 12a7e72e1..402a70958 100644
--- a/engine/utils/hardware/gguf/gguf_file_estimate.h
+++ b/engine/utils/hardware/gguf/gguf_file_estimate.h
@@ -64,7 +64,6 @@ inline float GetQuantBit(const std::string& kv_cache_t) {
 
 inline std::optional EstimateLLaMACppRun(
     const std::string& file_path, const RunConfig& rc) {
-  Estimation res;
   // token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit/16 bytes
   //RAM = token_embeddings_size + ((total_ngl-ngl) >=1 ? Output_layer_size + (total_ngl - ngl - 1 ) / (total_ngl-1) * (total_file_size - token_embeddings_size - Output_layer_size) : 0 ) (bytes)
 
@@ -72,6 +71,7 @@ inline std::optional EstimateLLaMACppRun(
   auto gf = ParseGgufFile(file_path);
   if (!gf)
     return std::nullopt;
+  Estimation res;
   int32_t embedding_length = 0;
   int64_t n_vocab = 0;
   int32_t num_block = 0;

From 85a07760ecad7d26a6b9da31c21fcd3b74ee6b4c Mon Sep 17 00:00:00 2001
From: sangjanai
Date: Fri, 14 Feb 2025 08:56:40 +0700
Subject: [PATCH 2/2] fix: use half of cpu threads

---
 engine/services/model_service.cc | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc
index 496646758..8f968437c 100644
--- a/engine/services/model_service.cc
+++ b/engine/services/model_service.cc
@@ -1327,23 +1327,7 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
 }
 
 int ModelService::GetCpuThreads() const {
-  auto hw_thread_num = std::thread::hardware_concurrency();
-  auto default_engine = engine_svc_->GetDefaultEngineVariant(kLlamaEngine);
-  auto is_gpu_mode = false;
-  if (default_engine.has_error()) {
-    CTL_INF("Could not get default engine");
-  } else {
-    auto& de = default_engine.value();
-    is_gpu_mode = (de.variant.find("cuda") != std::string::npos) ||
-                  (de.variant.find("vulkan") != std::string::npos) ||
-                  (de.variant.find("mac") != std::string::npos);
-  }
-
-  if (is_gpu_mode) {
-    return std::max(hw_thread_num / 4, 1u);
-  } else {
-    return std::max(hw_thread_num / 2, 1u);
-  }
+  return std::max(std::thread::hardware_concurrency() / 2, 1u);
 }
 
 cpp::result, std::string>
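
---
Note (not part of the patches above): the heuristic that PATCH 2/2 settles on
is small enough to check in isolation. The standalone sketch below only
mirrors the final GetCpuThreads() logic from the patch -- half of the
reported hardware threads, clamped to at least 1 -- outside the ModelService
class; the free function name DefaultCpuThreads is made up for the example.

#include <algorithm>  // std::max
#include <iostream>
#include <thread>     // std::thread::hardware_concurrency

// Mirrors ModelService::GetCpuThreads() after PATCH 2/2.
// hardware_concurrency() may legally return 0 when the value cannot be
// determined; 0 / 2 == 0, so the std::max clamp guarantees at least 1.
unsigned int DefaultCpuThreads() {
  return std::max(std::thread::hardware_concurrency() / 2, 1u);
}

int main() {
  std::cout << "hardware_concurrency: "
            << std::thread::hardware_concurrency() << '\n';
  std::cout << "default cpu_threads:  " << DefaultCpuThreads() << '\n';
  return 0;
}

On an 8-thread machine this prints a default of 4; on a machine where
hardware_concurrency() reports 0, it prints 1, matching the clamp in the
patch.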