diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index d30fd93b5..834c8ec45 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -434,8 +434,6 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
 
   gpt_params params;
   // By default will setting based on number of handlers
-  int drogon_thread = drogon::app().getThreadNum() - 5;
-  LOG_INFO << "Drogon thread is:" << drogon_thread;
   if (jsonBody) {
     if (!jsonBody["mmproj"].isNull()) {
       LOG_INFO << "MMPROJ FILE detected, multi-model enabled!";
@@ -447,7 +445,7 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
     params.embedding = jsonBody.get("embedding", true).asBool();
     // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
    params.n_batch = jsonBody.get("n_batch", 512).asInt();
-    params.n_parallel = jsonBody.get("n_parallel", drogon_thread).asInt();
+    params.n_parallel = jsonBody.get("n_parallel", 1).asInt();
     params.n_threads =
         jsonBody.get("cpu_threads", std::thread::hardware_concurrency())
             .asInt();
diff --git a/main.cc b/main.cc
index 0c4534493..8a0e2f1ff 100644
--- a/main.cc
+++ b/main.cc
@@ -2,6 +2,8 @@
 #include <climits> // for PATH_MAX
 #include <drogon/HttpAppFramework.h>
 #include <drogon/drogon.h>
+#include <algorithm>
+#include <thread>
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <libgen.h> // for dirname()
@@ -11,6 +13,7 @@
 #include <unistd.h> // for readlink()
 #elif defined(_WIN32)
 #include <windows.h>
+#undef max
 #else
 #error "Unsupported platform!"
 #endif
@@ -35,11 +38,13 @@ int main(int argc, char *argv[]) {
     port = std::atoi(argv[3]); // Convert string argument to int
   }
 
+  int logical_cores = std::thread::hardware_concurrency();
+  int drogon_thread_num = std::max(thread_num, logical_cores);
   nitro_utils::nitro_logo();
   LOG_INFO << "Server started, listening at: " << host << ":" << port;
   LOG_INFO << "Please load your model";
   drogon::app().addListener(host, port);
-  drogon::app().setThreadNum(thread_num + 5);
+  drogon::app().setThreadNum(drogon_thread_num);
   LOG_INFO << "Number of thread is:" << drogon::app().getThreadNum();
   drogon::app().run();