28 changes: 22 additions & 6 deletions engine/extensions/local-engine/local_engine.cc
@@ -1,6 +1,9 @@
#include "local_engine.h"
#include <algorithm>
#include <cctype>
#include <random>
#include <string>
#include <thread>
#include <string.h>
#include <unordered_set>
#include "utils/curl_utils.h"
#include "utils/json_helper.h"
@@ -20,6 +23,7 @@ const std::unordered_set<std::string> kIgnoredParams = {
"user_prompt", "min_keep", "mirostat", "mirostat_eta",
"mirostat_tau", "text_model", "version", "n_probs",
"object", "penalize_nl", "precision", "size",
"flash_attn",
"stop", "tfs_z", "typ_p", "caching_enabled"};

const std::unordered_map<std::string, std::string> kParamsMap = {
@@ -42,18 +46,24 @@ int GenerateRandomInteger(int min, int max) {
std::uniform_int_distribution<> dis(
min, max); // Distribution for the desired range

- return dis(gen);  // Generate and return a random integer within the range
+ return dis(gen);
}
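Note on GenerateRandomInteger as shown above: constructing a std::random_device and a std::mt19937 on every call re-seeds the engine each time, which is comparatively expensive. A sketch of a cheaper variant, assuming a per-thread engine is acceptable here (this is not what the change does):

    #include <random>

    // Sketch: seed one engine per thread and reuse it across calls.
    int GenerateRandomInteger(int min, int max) {
      thread_local std::mt19937 gen{std::random_device{}()};  // seeded once
      std::uniform_int_distribution<> dis(min, max);          // cheap to construct
      return dis(gen);
    }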

std::vector<std::string> ConvertJsonToParamsVector(const Json::Value& root) {
std::vector<std::string> res;
std::string errors;

for (const auto& member : root.getMemberNames()) {
if (member == "model_path" || member == "llama_model_path") {
if (!root[member].isNull()) {
+ const std::string path = root[member].asString();
res.push_back("--model");
- res.push_back(root[member].asString());
+ res.push_back(path);

+ // If the path contains both "jan" and "nano" (case-insensitive), add
+ // model-specific parameters.
+ std::string lowered = path;
+ std::transform(lowered.begin(), lowered.end(), lowered.begin(),
+                [](unsigned char c) { return std::tolower(c); });
+ if (lowered.find("jan") != std::string::npos &&
+     lowered.find("nano") != std::string::npos) {
+   // The specific flags appended for this model are not shown in this diff.
+ }
}
continue;
} else if (kIgnoredParams.find(member) != kIgnoredParams.end()) {
@@ -85,8 +95,15 @@ std::vector<std::string> ConvertJsonToParamsVector(const Json::Value& root) {
res.push_back("--ignore_eos");
}
continue;
+ } else if (member == "ctx_len") {
+   if (!root[member].isNull()) {
+     res.push_back("--ctx-size");
+     res.push_back(root[member].asString());
+   }
+   continue;
}

+ // Generic handling for other members
res.push_back("--" + member);
if (root[member].isString()) {
res.push_back(root[member].asString());
@@ -105,14 +122,15 @@ std::vector<std::string> ConvertJsonToParamsVector(const Json::Value& root) {
ss << "\"" << value.asString() << "\"";
first = false;
}
ss << "] ";
ss << "]";
res.push_back(ss.str());
}
}

return res;
}
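For a concrete picture of ConvertJsonToParamsVector, a usage sketch (assuming jsoncpp and the function above; the output is approximate, since parts of the conversion are elided in this diff, and the jan/nano branch adds nothing visible here):

    #include <iostream>
    #include <json/json.h>  // jsoncpp, as used by this file

    int main() {
      Json::Value body;
      body["model_path"] = "/models/jan-nano-4b.gguf";
      body["ctx_len"] = 8192;
      body["caching_enabled"] = true;  // listed in kIgnoredParams, so dropped

      for (const auto& arg : ConvertJsonToParamsVector(body))
        std::cout << arg << ' ';
      // Roughly: --model /models/jan-nano-4b.gguf --ctx-size 8192
    }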


constexpr const auto kMinDataChunkSize = 6u;

struct OaiInfo {
@@ -561,8 +579,6 @@ void LocalEngine::LoadModel(std::shared_ptr<Json::Value> json_body,
params.push_back("--port");
params.push_back(std::to_string(s.port));

params.push_back("--pooling");
params.push_back("mean");

params.push_back("--jinja");

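That is all for local_engine.cc. For context, the params vector assembled in LoadModel (--model, --port, --jinja, plus the converted request body) ends up as the llama-server command line. The launch code is outside this diff; a minimal POSIX-flavored sketch of the idea, with SpawnServer and the binary name being illustrative assumptions:

    #include <string>
    #include <vector>
    #include <unistd.h>  // fork, execvp (POSIX)

    // Illustrative only; the real process management is not shown here.
    pid_t SpawnServer(std::vector<std::string> params) {
      std::vector<char*> argv;
      std::string bin = "llama-server";    // assumed binary name
      argv.push_back(bin.data());
      for (auto& p : params) argv.push_back(p.data());
      argv.push_back(nullptr);             // execvp needs a null terminator

      pid_t pid = fork();
      if (pid == 0) {                      // child: become the server
        execvp(argv[0], argv.data());
        _exit(127);                        // reached only if exec fails
      }
      return pid;                          // parent: child pid, or -1 on failure
    }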
29 changes: 18 additions & 11 deletions engine/services/model_service.cc
@@ -165,8 +165,8 @@ ModelService::ModelService(std::shared_ptr<DatabaseService> db_service,
download_service_{download_service},
inference_svc_(inference_service),
engine_svc_(engine_svc),
- task_queue_(task_queue) {
-   // ProcessBgrTasks();
+ task_queue_(task_queue){
+   // ProcessBgrTasks();
};

void ModelService::ForceIndexingModelList() {
@@ -557,6 +557,8 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
if (auto& o = params_override["ctx_len"]; !o.isNull()) {
ctx_len = o.asInt();
}
+ Json::Value model_load_params;
+ json_helper::MergeJson(model_load_params, params_override);

try {
constexpr const int kDefaultContextLength = 8192;
@@ -630,6 +632,8 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
#else
json_data["model_path"] =
fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string();
model_load_params["model_path"] =
fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string();
#endif
} else {
LOG_WARN << "model_path is empty";
@@ -642,6 +646,8 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
#else
json_data["mmproj"] =
fmu::ToAbsoluteCortexDataPath(fs::path(mc.mmproj)).string();
model_load_params["model_path"] =
fmu::ToAbsoluteCortexDataPath(fs::path(mc.mmproj)).string();
#endif
}
json_data["system_prompt"] = mc.system_template;
@@ -655,15 +661,14 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
}

json_data["model"] = model_handle;
model_load_params["model"] = model_handle;
if (auto& cpt = custom_prompt_template; !cpt.value_or("").empty()) {
auto parse_prompt_result = string_utils::ParsePrompt(cpt.value());
json_data["system_prompt"] = parse_prompt_result.system_prompt;
json_data["user_prompt"] = parse_prompt_result.user_prompt;
json_data["ai_prompt"] = parse_prompt_result.ai_prompt;
}

- json_helper::MergeJson(json_data, params_override);

// Set default cpu_threads if it is not configured
if (!json_data.isMember("cpu_threads")) {
json_data["cpu_threads"] = GetCpuThreads();
@@ -686,12 +691,12 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(

assert(!!inference_svc_);

- auto ir =
-     inference_svc_->LoadModel(std::make_shared<Json::Value>(json_data));
+ auto ir = inference_svc_->LoadModel(
+     std::make_shared<Json::Value>(model_load_params));
auto status = std::get<0>(ir)["status_code"].asInt();
auto data = std::get<1>(ir);

if (status == drogon::k200OK) {
return StartModelResult{/* .success = */ true,
/* .warning = */ may_fallback_res.value()};
} else if (status == drogon::k409Conflict) {
@@ -1031,13 +1036,15 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
auto es = hardware::EstimateLLaMACppRun(model_path, rc);

if (!!es && (*es).gpu_mode.vram_MiB > free_vram_MiB && is_cuda) {
CTL_WRN("Not enough VRAM - " << "required: " << (*es).gpu_mode.vram_MiB
<< ", available: " << free_vram_MiB);
CTL_WRN("Not enough VRAM - "
<< "required: " << (*es).gpu_mode.vram_MiB
<< ", available: " << free_vram_MiB);
}

if (!!es && (*es).cpu_mode.ram_MiB > free_ram_MiB) {
CTL_WRN("Not enough RAM - " << "required: " << (*es).cpu_mode.ram_MiB
<< ", available: " << free_ram_MiB);
CTL_WRN("Not enough RAM - "
<< "required: " << (*es).cpu_mode.ram_MiB
<< ", available: " << free_ram_MiB);
}

return warning;
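A closing note on the model_service.cc refactor: LoadModel now receives model_load_params, which starts as a copy of params_override (via json_helper::MergeJson) and then has model_path, mmproj, and model written on top, rather than the fully merged json_data. MergeJson itself is not part of this diff; a typical recursive merge, stated here as an assumption about its semantics:

    #include <json/json.h>

    // Assumed behavior of json_helper::MergeJson (not shown in this diff):
    // copy each member of src into dst, recursing into nested objects so
    // keys already in dst survive unless src overrides them.
    void MergeJson(Json::Value& dst, const Json::Value& src) {
      for (const auto& key : src.getMemberNames()) {
        if (src[key].isObject() && dst[key].isObject()) {
          MergeJson(dst[key], src[key]);  // deep-merge nested objects
        } else {
          dst[key] = src[key];            // scalars and arrays: src wins
        }
      }
    }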