From 9734cbdd07b3059e5328ce4e586ee83cc8ce4c86 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Fri, 12 Jan 2024 08:41:27 +0700
Subject: [PATCH 1/2] bug: fix busy waiting with proper CV

---
 controllers/llamaCPP.cc |  7 ++++++-
 controllers/llamaCPP.h  | 14 ++++++++++----
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index fc2705cdc..4b5c72495 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -456,7 +456,7 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
       log_enable();
       std::string llama_log_folder = jsonBody["llama_log_folder"].asString();
       log_set_target(llama_log_folder + "llama.log");
-    } // Set folder for llama log 
+    } // Set folder for llama log
   }
 #ifdef GGML_USE_CUBLAS
   LOG_INFO << "Setting up GGML CUBLAS PARAMS";
@@ -483,7 +483,10 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
     return false; // Indicate failure
   }
   llama.initialize();
+
   model_loaded = true;
+  llama.model_loaded_external = true;
+
   LOG_INFO << "Started background task here!";
   backgroundThread = std::thread(&llamaCPP::backgroundTask, this);
   warmupModel();
@@ -535,6 +538,8 @@ void llamaCPP::backgroundTask() {
 void llamaCPP::stopBackgroundTask() {
   if (model_loaded) {
     model_loaded = false;
+    llama.condition_tasks.notify_one();
+    llama.model_loaded_external = false;
     LOG_INFO << "changed to false";
     if (backgroundThread.joinable()) {
       backgroundThread.join();
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index e7e084e7c..700a53f64 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -503,6 +503,9 @@ struct llama_server_context {
   int32_t id_gen;
   int32_t n_ctx; // total context for all clients / slots
 
+  // Internal
+  std::atomic<bool> model_loaded_external = false;
+
   // system prompt
   bool system_need_update = false;
 
@@ -1538,10 +1541,13 @@ struct llama_server_context {
                 "cache\n");
         kv_cache_clear();
       }
-      std::this_thread::sleep_for(std::chrono::milliseconds(5));
-      // TODO: Need to implement queueing using CV for better performance
-      // std::unique_lock<std::mutex> lock(mutex_tasks);
-      // condition_tasks.wait(lock, [&] { return !queue_tasks.empty(); });
+      // std::this_thread::sleep_for(std::chrono::milliseconds(5));
+      // TODO: Need to implement queueing using CV for better performance
+      std::unique_lock<std::mutex> lock(mutex_tasks);
+      condition_tasks.wait(lock, [&] {
+        return (!queue_tasks.empty() && model_loaded_external) ||
+               (!model_loaded_external);
+      });
     }
 
     for (llama_client_slot &slot : slots) {

From 9a31ee8eeca41c672dff6946a032dd7c5d659387 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Fri, 12 Jan 2024 08:55:09 +0700
Subject: [PATCH 2/2] remove usage of load state outside of external llama cpp dependency

---
 controllers/llamaCPP.cc | 16 +++++++---------
 controllers/llamaCPP.h  |  2 +-
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 4b5c72495..f770fd066 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -153,7 +153,7 @@ void llamaCPP::chatCompletion(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
-  if (!model_loaded) {
+  if (!llama.model_loaded_external) {
     Json::Value jsonResp;
     jsonResp["message"] =
         "Model has not been loaded, please load model into nitro";
@@ -391,7 +391,7 @@ void llamaCPP::unloadModel(
     std::function<void(const HttpResponsePtr &)> &&callback) {
   Json::Value jsonResp;
   jsonResp["message"] = "No model loaded";
-  if (model_loaded) {
+  if (llama.model_loaded_external) {
     stopBackgroundTask();
 
     llama_free(llama.ctx);
@@ -408,7 +408,7 @@ void llamaCPP::modelStatus(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
   Json::Value jsonResp;
-  bool is_model_loaded = this->model_loaded;
+  bool is_model_loaded = llama.model_loaded_external;
   if (is_model_loaded) {
     jsonResp["model_loaded"] = is_model_loaded;
     jsonResp["model_data"] = llama.get_model_props().dump();
@@ -484,7 +484,6 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
   }
   llama.initialize();
 
-  model_loaded = true;
   llama.model_loaded_external = true;
 
   LOG_INFO << "Started background task here!";
@@ -497,7 +496,7 @@ void llamaCPP::loadModel(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
-  if (model_loaded) {
+  if (llama.model_loaded_external) {
     LOG_INFO << "model loaded";
     Json::Value jsonResp;
     jsonResp["message"] = "Model already loaded";
@@ -525,7 +524,7 @@ void llamaCPP::loadModel(
 }
 
 void llamaCPP::backgroundTask() {
-  while (model_loaded) {
+  while (llama.model_loaded_external) {
     // model_loaded =
     llama.update_slots();
   }
@@ -536,10 +535,9 @@ void llamaCPP::backgroundTask() {
 }
 
 void llamaCPP::stopBackgroundTask() {
-  if (model_loaded) {
-    model_loaded = false;
-    llama.condition_tasks.notify_one();
+  if (llama.model_loaded_external) {
     llama.model_loaded_external = false;
+    llama.condition_tasks.notify_one();
     LOG_INFO << "changed to false";
     if (backgroundThread.joinable()) {
       backgroundThread.join();
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 700a53f64..2d4b9884b 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2560,7 +2560,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
 
 private:
   llama_server_context llama;
-  std::atomic<bool> model_loaded = false;
+  // std::atomic<bool> model_loaded = false;
   size_t sent_count = 0;
   size_t sent_token_probs_index = 0;
   std::thread backgroundThread;
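
The patches above replace the 5 ms polling sleep in update_slots() with a condition-variable wait whose predicate also observes the model_loaded_external flag, so the background thread idles without burning CPU and still wakes promptly on unload. Below is a minimal, self-contained sketch of that wait/notify shape; the names (TaskQueue, push, stop, worker) are illustrative placeholders, not the nitro/llamaCPP API.

// Sketch of the pattern: the worker blocks on a condition_variable instead of
// polling, the producer notifies after pushing work, and shutdown clears the
// atomic flag before notifying so the wait predicate releases the thread.
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

class TaskQueue {
public:
  void push(int task) {
    {
      std::lock_guard<std::mutex> lock(mutex_tasks);
      queue_tasks.push(task);
    }
    condition_tasks.notify_one(); // wake the worker when work arrives
  }

  void stop() {
    model_loaded_external = false; // clear the flag first...
    condition_tasks.notify_one();  // ...then notify, as in stopBackgroundTask()
  }

  void worker() {
    while (model_loaded_external) {
      std::unique_lock<std::mutex> lock(mutex_tasks);
      // Same predicate shape as the patch: proceed when there is work and we
      // are still loaded, or fall through immediately once unloading starts.
      condition_tasks.wait(lock, [&] {
        return (!queue_tasks.empty() && model_loaded_external) ||
               (!model_loaded_external);
      });
      if (!model_loaded_external)
        break;
      int task = queue_tasks.front();
      queue_tasks.pop();
      lock.unlock(); // do the actual work outside the lock
      std::cout << "processed task " << task << "\n";
    }
  }

private:
  std::queue<int> queue_tasks;
  std::mutex mutex_tasks;
  std::condition_variable condition_tasks;
  std::atomic<bool> model_loaded_external{true};
};

int main() {
  TaskQueue q;
  std::thread t(&TaskQueue::worker, &q);
  q.push(1);
  q.push(2);
  std::this_thread::sleep_for(std::chrono::milliseconds(50));
  q.stop();
  t.join(); // the CV wakeup lets the thread exit without busy waiting
}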