From 32a7bae0a67327d5fe9064e806e571bd16d803e8 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Thu, 12 Jun 2025 10:11:02 +0530 Subject: [PATCH 01/10] refactor: remove --pooling flag from model loading The --pooling flag was removed because mean pooling is not needed for chat models. This fixes the regression. --- engine/extensions/local-engine/local_engine.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index 2bba11a7b..9c0b9b6b5 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -561,8 +561,6 @@ void LocalEngine::LoadModel(std::shared_ptr json_body, params.push_back("--port"); params.push_back(std::to_string(s.port)); - params.push_back("--pooling"); - params.push_back("mean"); params.push_back("--jinja"); From d9ea60013be82db0a19b78794b044dfb51cec702 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Thu, 12 Jun 2025 10:34:28 +0530 Subject: [PATCH 02/10] feat(local-engine): add ctx_len parameter support Adds support for the ctx_len parameter by appending --ctx-size with its value. Removed outdated parameter mappings from the kParamsMap to reflect current implementation details and ensure consistency. 
--- engine/extensions/local-engine/local_engine.cc | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index 9c0b9b6b5..6ae9c72ba 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -23,16 +23,6 @@ const std::unordered_set kIgnoredParams = { "stop", "tfs_z", "typ_p", "caching_enabled"}; const std::unordered_map kParamsMap = { - {"cpu_threads", "--threads"}, - {"n_ubatch", "--ubatch-size"}, - {"n_batch", "--batch-size"}, - {"n_parallel", "--parallel"}, - {"temperature", "--temp"}, - {"top_k", "--top-k"}, - {"top_p", "--top-p"}, - {"min_p", "--min-p"}, - {"dynatemp_exponent", "--dynatemp-exp"}, - {"ctx_len", "--ctx-size"}, {"ngl", "-ngl"}, }; @@ -85,6 +75,12 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { res.push_back("--ignore_eos"); } continue; + } else if (member == "ctx_len") { + if (!root[member].isNull()) { + res.push_back("--ctx-size"); + res.push_back(root[member].asString()); + } + continue; } res.push_back("--" + member); From 4a02ef566503be303e4e26654e26c4e55de8d23b Mon Sep 17 00:00:00 2001 From: Akarshan Date: Thu, 12 Jun 2025 11:29:15 +0530 Subject: [PATCH 03/10] feat: add conditional model parameters based on path When the model path contains both "jan" and "nano" (case-insensitive), automatically add default sampling parameters (--temp 0.7, --top-p 0.8, --top-k 20, --min-p 0) to adjust generation behavior. This improves flexibility by enabling model-specific configurations without manual parameter tuning. Also includes necessary headers for string manipulation and fixes whitespace in ctx_len handling. 
--- .../extensions/local-engine/local_engine.cc | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index 6ae9c72ba..37c6293f0 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -1,6 +1,8 @@ #include "local_engine.h" +#include #include #include +#include #include #include "utils/curl_utils.h" #include "utils/json_helper.h" @@ -37,13 +39,29 @@ int GenerateRandomInteger(int min, int max) { std::vector ConvertJsonToParamsVector(const Json::Value& root) { std::vector res; - std::string errors; for (const auto& member : root.getMemberNames()) { if (member == "model_path" || member == "llama_model_path") { if (!root[member].isNull()) { + const std::string path = root[member].asString(); res.push_back("--model"); - res.push_back(root[member].asString()); + res.push_back(path); + + // If path contains both "Jan" and "nano", case-insensitive, add special params + std::string lowered = path; + std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c) { + return std::tolower(c); + }); + if (lowered.find("jan") != std::string::npos && lowered.find("nano") != std::string::npos) { + res.push_back("--temp"); + res.push_back("0.7"); + res.push_back("--top-p"); + res.push_back("0.8"); + res.push_back("--top-k"); + res.push_back("20"); + res.push_back("--min-p"); + res.push_back("0"); + } } continue; } else if (kIgnoredParams.find(member) != kIgnoredParams.end()) { @@ -76,13 +94,14 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { } continue; } else if (member == "ctx_len") { - if (!root[member].isNull()) { - res.push_back("--ctx-size"); - res.push_back(root[member].asString()); - } - continue; + if (!root[member].isNull()) { + res.push_back("--ctx-size"); + res.push_back(root[member].asString()); + } + continue; } + // Generic handling for other 
members res.push_back("--" + member); if (root[member].isString()) { res.push_back(root[member].asString()); @@ -101,7 +120,7 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { ss << "\"" << value.asString() << "\""; first = false; } - ss << "] "; + ss << "]"; res.push_back(ss.str()); } } @@ -109,6 +128,7 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { return res; } + constexpr const auto kMinDataChunkSize = 6u; struct OaiInfo { From 41023d39b7fbfb1d2649ea29b02152303bef827b Mon Sep 17 00:00:00 2001 From: Akarshan Date: Thu, 12 Jun 2025 11:41:35 +0530 Subject: [PATCH 04/10] chore: remove redundant comment The comment was redundant as the code's purpose is clear without it, improving readability. --- engine/extensions/local-engine/local_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index 37c6293f0..0722ef1d4 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -34,7 +34,7 @@ int GenerateRandomInteger(int min, int max) { std::uniform_int_distribution<> dis( min, max); // Distribution for the desired range - return dis(gen); // Generate and return a random integer within the range + return dis(gen); } std::vector ConvertJsonToParamsVector(const Json::Value& root) { From 87ad9bc7ae058d5d10d109a2e8c727236261e071 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Thu, 12 Jun 2025 12:47:56 +0530 Subject: [PATCH 05/10] feat: add new parameters and flags to local engine configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit introduces new configuration parameters and their corresponding command-line flags for the local engine. 
The changes include: - Adding "flash_attn" to ignored parameters - Mapping UI parameters to CLI flags (e.g., cpu_threads → --threads) - Expanding support for various model configuration options These additions enhance the flexibility of the local engine by enabling fine-grained control over performance and behavior through both UI and CLI interfaces. --- engine/extensions/local-engine/local_engine.cc | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index 0722ef1d4..c1c894518 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -22,9 +22,20 @@ const std::unordered_set kIgnoredParams = { "user_prompt", "min_keep", "mirostat", "mirostat_eta", "mirostat_tau", "text_model", "version", "n_probs", "object", "penalize_nl", "precision", "size", + "flash_attn", "stop", "tfs_z", "typ_p", "caching_enabled"}; const std::unordered_map kParamsMap = { + {"cpu_threads", "--threads"}, + {"n_ubatch", "--ubatch-size"}, + {"n_batch", "--batch-size"}, + {"n_parallel", "--parallel"}, + {"temperature", "--temp"}, + {"top_k", "--top-k"}, + {"top_p", "--top-p"}, + {"min_p", "--min-p"}, + {"dynatemp_exponent", "--dynatemp-exp"}, + {"ctx_len", "--ctx-size"}, {"ngl", "-ngl"}, }; From 8725f3844dd023074fa47a4f9837c2b45044d539 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Thu, 12 Jun 2025 12:50:24 +0530 Subject: [PATCH 06/10] feat: add support for 'qwen' in parameter conversion The condition was updated to include 'qwen' in the check for triggering specific parameters ('--temp', '--top-p', etc.), aligning it with the existing 'jan' and 'nano' validation logic. This allows the same parameter configuration to apply to 'qwen' models as well as the original keywords. 
--- engine/extensions/local-engine/local_engine.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index c1c894518..a0f53af2f 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -1,6 +1,7 @@ #include "local_engine.h" #include #include +#include #include #include #include @@ -63,7 +64,7 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c) { return std::tolower(c); }); - if (lowered.find("jan") != std::string::npos && lowered.find("nano") != std::string::npos) { + if ((lowered.find("jan") != std::string::npos && lowered.find("nano") != std::string::npos) || lowered.find("qwen") != std::string::npos) { res.push_back("--temp"); res.push_back("0.7"); res.push_back("--top-p"); From 7ae8a15aec30b0325797a348f714e58405463ef1 Mon Sep 17 00:00:00 2001 From: Akarshan Date: Thu, 12 Jun 2025 13:07:48 +0530 Subject: [PATCH 07/10] fix: remove deprecated parameters and adjust ignored list Removed deprecated parameters such as "dynatemp_exponent" and "ctx_len" handling logic, which were no longer needed. Added "flash_attn" back to the ignored parameters list. Cleaned up the parameter conversion logic by removing conditional blocks for specific model optimizations that are no longer required. 
--- .../extensions/local-engine/local_engine.cc | 21 +------------------ 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index a0f53af2f..02cb05517 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -23,8 +23,7 @@ const std::unordered_set kIgnoredParams = { "user_prompt", "min_keep", "mirostat", "mirostat_eta", "mirostat_tau", "text_model", "version", "n_probs", "object", "penalize_nl", "precision", "size", - "flash_attn", - "stop", "tfs_z", "typ_p", "caching_enabled"}; + "flash_attn", "stop", "tfs_z", "typ_p", "caching_enabled"}; const std::unordered_map kParamsMap = { {"cpu_threads", "--threads"}, @@ -35,7 +34,6 @@ const std::unordered_map kParamsMap = { {"top_k", "--top-k"}, {"top_p", "--top-p"}, {"min_p", "--min-p"}, - {"dynatemp_exponent", "--dynatemp-exp"}, {"ctx_len", "--ctx-size"}, {"ngl", "-ngl"}, }; @@ -64,16 +62,6 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c) { return std::tolower(c); }); - if ((lowered.find("jan") != std::string::npos && lowered.find("nano") != std::string::npos) || lowered.find("qwen") != std::string::npos) { - res.push_back("--temp"); - res.push_back("0.7"); - res.push_back("--top-p"); - res.push_back("0.8"); - res.push_back("--top-k"); - res.push_back("20"); - res.push_back("--min-p"); - res.push_back("0"); - } } continue; } else if (kIgnoredParams.find(member) != kIgnoredParams.end()) { @@ -105,14 +93,7 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { res.push_back("--ignore_eos"); } continue; - } else if (member == "ctx_len") { - if (!root[member].isNull()) { - res.push_back("--ctx-size"); - res.push_back(root[member].asString()); - } - continue; } - // Generic handling for other members res.push_back("--" + member); if 
(root[member].isString()) { From f46f74b9be45c0464a1b80ecd12b43bfdc325e2f Mon Sep 17 00:00:00 2001 From: Louis Date: Thu, 12 Jun 2025 15:12:04 +0700 Subject: [PATCH 08/10] Revert "fix: remove deprecated parameters and adjust ignored list" This reverts commit 7ae8a15aec30b0325797a348f714e58405463ef1. --- .../extensions/local-engine/local_engine.cc | 21 ++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index 02cb05517..a0f53af2f 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -23,7 +23,8 @@ const std::unordered_set kIgnoredParams = { "user_prompt", "min_keep", "mirostat", "mirostat_eta", "mirostat_tau", "text_model", "version", "n_probs", "object", "penalize_nl", "precision", "size", - "flash_attn", "stop", "tfs_z", "typ_p", "caching_enabled"}; + "flash_attn", + "stop", "tfs_z", "typ_p", "caching_enabled"}; const std::unordered_map kParamsMap = { {"cpu_threads", "--threads"}, @@ -34,6 +35,7 @@ const std::unordered_map kParamsMap = { {"top_k", "--top-k"}, {"top_p", "--top-p"}, {"min_p", "--min-p"}, + {"dynatemp_exponent", "--dynatemp-exp"}, {"ctx_len", "--ctx-size"}, {"ngl", "-ngl"}, }; @@ -62,6 +64,16 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c) { return std::tolower(c); }); + if ((lowered.find("jan") != std::string::npos && lowered.find("nano") != std::string::npos) || lowered.find("qwen") != std::string::npos) { + res.push_back("--temp"); + res.push_back("0.7"); + res.push_back("--top-p"); + res.push_back("0.8"); + res.push_back("--top-k"); + res.push_back("20"); + res.push_back("--min-p"); + res.push_back("0"); + } } continue; } else if (kIgnoredParams.find(member) != kIgnoredParams.end()) { @@ -93,7 +105,14 @@ std::vector ConvertJsonToParamsVector(const Json::Value& 
root) { res.push_back("--ignore_eos"); } continue; + } else if (member == "ctx_len") { + if (!root[member].isNull()) { + res.push_back("--ctx-size"); + res.push_back(root[member].asString()); + } + continue; } + // Generic handling for other members res.push_back("--" + member); if (root[member].isString()) { From b28b774798b08de5163eb3d5f8ec332ae4771f43 Mon Sep 17 00:00:00 2001 From: Louis Date: Thu, 12 Jun 2025 15:10:29 +0700 Subject: [PATCH 09/10] fix: params should not be overridden --- .../extensions/local-engine/local_engine.cc | 10 ------- engine/services/model_service.cc | 30 ++++++++++++------- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/engine/extensions/local-engine/local_engine.cc b/engine/extensions/local-engine/local_engine.cc index a0f53af2f..beda1f44b 100644 --- a/engine/extensions/local-engine/local_engine.cc +++ b/engine/extensions/local-engine/local_engine.cc @@ -64,16 +64,6 @@ std::vector ConvertJsonToParamsVector(const Json::Value& root) { std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c) { return std::tolower(c); }); - if ((lowered.find("jan") != std::string::npos && lowered.find("nano") != std::string::npos) || lowered.find("qwen") != std::string::npos) { - res.push_back("--temp"); - res.push_back("0.7"); - res.push_back("--top-p"); - res.push_back("0.8"); - res.push_back("--top-k"); - res.push_back("20"); - res.push_back("--min-p"); - res.push_back("0"); - } } continue; } else if (kIgnoredParams.find(member) != kIgnoredParams.end()) { diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 68f0fe070..1d1180259 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -165,8 +165,8 @@ ModelService::ModelService(std::shared_ptr db_service, download_service_{download_service}, inference_svc_(inference_service), engine_svc_(engine_svc), - task_queue_(task_queue) { - // ProcessBgrTasks(); + task_queue_(task_queue){ + // 
ProcessBgrTasks(); }; void ModelService::ForceIndexingModelList() { @@ -557,6 +557,8 @@ cpp::result ModelService::StartModel( if (auto& o = params_override["ctx_len"]; !o.isNull()) { ctx_len = o.asInt(); } + Json::Value model_load_params; + json_helper::MergeJson(model_load_params, params_override); try { constexpr const int kDefautlContextLength = 8192; @@ -630,6 +632,8 @@ cpp::result ModelService::StartModel( #else json_data["model_path"] = fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string(); + model_load_params["model_path"] = + fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string(); #endif } else { LOG_WARN << "model_path is empty"; @@ -642,6 +646,8 @@ cpp::result ModelService::StartModel( #else json_data["mmproj"] = fmu::ToAbsoluteCortexDataPath(fs::path(mc.mmproj)).string(); + model_load_params["model_path"] = + fmu::ToAbsoluteCortexDataPath(fs::path(mc.mmproj)).string(); #endif } json_data["system_prompt"] = mc.system_template; @@ -655,6 +661,8 @@ cpp::result ModelService::StartModel( } json_data["model"] = model_handle; + model_load_params["model"] = model_handle; + // params_override["model"] = model_handle; if (auto& cpt = custom_prompt_template; !cpt.value_or("").empty()) { auto parse_prompt_result = string_utils::ParsePrompt(cpt.value()); json_data["system_prompt"] = parse_prompt_result.system_prompt; @@ -662,8 +670,6 @@ cpp::result ModelService::StartModel( json_data["ai_prompt"] = parse_prompt_result.ai_prompt; } - json_helper::MergeJson(json_data, params_override); - // Set default cpu_threads if it is not configured if (!json_data.isMember("cpu_threads")) { json_data["cpu_threads"] = GetCpuThreads(); @@ -686,12 +692,12 @@ cpp::result ModelService::StartModel( assert(!!inference_svc_); - auto ir = - inference_svc_->LoadModel(std::make_shared(json_data)); + auto ir = inference_svc_->LoadModel( + std::make_shared(model_load_params)); auto status = std::get<0>(ir)["status_code"].asInt(); auto data = std::get<1>(ir); - if (status == 
drogon::k200OK) { + if (status == drogon::k200OK) { return StartModelResult{/* .success = */ true, /* .warning = */ may_fallback_res.value()}; } else if (status == drogon::k409Conflict) { @@ -1031,13 +1037,15 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl, auto es = hardware::EstimateLLaMACppRun(model_path, rc); if (!!es && (*es).gpu_mode.vram_MiB > free_vram_MiB && is_cuda) { - CTL_WRN("Not enough VRAM - " << "required: " << (*es).gpu_mode.vram_MiB - << ", available: " << free_vram_MiB); + CTL_WRN("Not enough VRAM - " + << "required: " << (*es).gpu_mode.vram_MiB + << ", available: " << free_vram_MiB); } if (!!es && (*es).cpu_mode.ram_MiB > free_ram_MiB) { - CTL_WRN("Not enough RAM - " << "required: " << (*es).cpu_mode.ram_MiB - << ", available: " << free_ram_MiB); + CTL_WRN("Not enough RAM - " + << "required: " << (*es).cpu_mode.ram_MiB + << ", available: " << free_ram_MiB); } return warning; From 656795cf4931d67bb05c7484aa6c5ace97d5f040 Mon Sep 17 00:00:00 2001 From: Louis Date: Thu, 12 Jun 2025 15:17:13 +0700 Subject: [PATCH 10/10] chore: clean up --- engine/services/model_service.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 1d1180259..2da6c749e 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -662,7 +662,6 @@ cpp::result ModelService::StartModel( json_data["model"] = model_handle; model_load_params["model"] = model_handle; - // params_override["model"] = model_handle; if (auto& cpt = custom_prompt_template; !cpt.value_or("").empty()) { auto parse_prompt_result = string_utils::ParsePrompt(cpt.value()); json_data["system_prompt"] = parse_prompt_result.system_prompt;