diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index fc2705cdc..f770fd066 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -153,7 +153,7 @@ void llamaCPP::chatCompletion(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
-  if (!model_loaded) {
+  if (!llama.model_loaded_external) {
     Json::Value jsonResp;
     jsonResp["message"] =
         "Model has not been loaded, please load model into nitro";
@@ -391,7 +391,7 @@ void llamaCPP::unloadModel(
     std::function<void(const HttpResponsePtr &)> &&callback) {
   Json::Value jsonResp;
   jsonResp["message"] = "No model loaded";
-  if (model_loaded) {
+  if (llama.model_loaded_external) {
     stopBackgroundTask();
 
     llama_free(llama.ctx);
@@ -408,7 +408,7 @@ void llamaCPP::modelStatus(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
   Json::Value jsonResp;
-  bool is_model_loaded = this->model_loaded;
+  bool is_model_loaded = llama.model_loaded_external;
   if (is_model_loaded) {
     jsonResp["model_loaded"] = is_model_loaded;
     jsonResp["model_data"] = llama.get_model_props().dump();
@@ -456,7 +456,7 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
       log_enable();
       std::string llama_log_folder = jsonBody["llama_log_folder"].asString();
       log_set_target(llama_log_folder + "llama.log");
-    } // Set folder for llama log
+    } // Set folder for llama log
   }
 #ifdef GGML_USE_CUBLAS
   LOG_INFO << "Setting up GGML CUBLAS PARAMS";
@@ -483,7 +483,9 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
     return false; // Indicate failure
   }
   llama.initialize();
-  model_loaded = true;
+
+  llama.model_loaded_external = true;
+
   LOG_INFO << "Started background task here!";
   backgroundThread = std::thread(&llamaCPP::backgroundTask, this);
   warmupModel();
@@ -494,7 +496,7 @@ void llamaCPP::loadModel(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
-  if (model_loaded) {
+  if (llama.model_loaded_external) {
     LOG_INFO << "model loaded";
     Json::Value jsonResp;
     jsonResp["message"] = "Model already loaded";
@@ -522,7 +524,7 @@ void llamaCPP::loadModel(
 }
 
 void llamaCPP::backgroundTask() {
-  while (model_loaded) {
+  while (llama.model_loaded_external) {
     // model_loaded =
     llama.update_slots();
   }
@@ -533,8 +535,9 @@ void llamaCPP::backgroundTask() {
 }
 
 void llamaCPP::stopBackgroundTask() {
-  if (model_loaded) {
-    model_loaded = false;
+  if (llama.model_loaded_external) {
+    llama.model_loaded_external = false;
+    llama.condition_tasks.notify_one();
     LOG_INFO << "changed to false";
     if (backgroundThread.joinable()) {
       backgroundThread.join();
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index e7e084e7c..2d4b9884b 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -503,6 +503,9 @@ struct llama_server_context {
   int32_t id_gen;
   int32_t n_ctx; // total context for all clients / slots
 
+  // Internal
+  std::atomic<bool> model_loaded_external = false;
+
   // system prompt
   bool system_need_update = false;
 
@@ -1538,10 +1541,13 @@ struct llama_server_context {
                 "cache\n");
         kv_cache_clear();
       }
-      std::this_thread::sleep_for(std::chrono::milliseconds(5));
-      // TODO: Need to implement queueing using CV for better performance
-      // std::unique_lock<std::mutex> lock(mutex_tasks);
-      // condition_tasks.wait(lock, [&] { return !queue_tasks.empty(); });
+      // std::this_thread::sleep_for(std::chrono::milliseconds(5));
+      // TODO: Need to implement queueing using CV for better performance
+      std::unique_lock<std::mutex> lock(mutex_tasks);
+      condition_tasks.wait(lock, [&] {
+        return (!queue_tasks.empty() && model_loaded_external) ||
+               (!model_loaded_external);
+      });
     }
 
     for (llama_client_slot &slot : slots) {
@@ -2554,7 +2560,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
 
 private:
   llama_server_context llama;
-  std::atomic<bool> model_loaded = false;
+  //std::atomic<bool> model_loaded = false;
   size_t sent_count = 0;
   size_t sent_token_probs_index = 0;
   std::thread backgroundThread;
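
Note on the change above: loadModelImpl() now sets llama.model_loaded_external, update_slots() replaces its fixed 5 ms polling sleep with a condition_variable wait, and stopBackgroundTask() clears the flag and calls condition_tasks.notify_one() so the blocked background thread wakes and can exit instead of waiting for the next poll. The following is a minimal, self-contained sketch of that wake-on-work / wake-on-shutdown pattern; it is not nitro code, and the names TaskQueue, push_task, worker_loop, and stop are illustrative only.

// Minimal sketch (not nitro code) of the pattern introduced by the patch:
// a worker thread blocks on a condition variable instead of polling, and a
// stop() call flips a flag and notifies the same condition variable so the
// thread exits promptly.
#include <chrono>
#include <condition_variable>
#include <deque>
#include <functional>
#include <iostream>
#include <mutex>
#include <thread>

struct TaskQueue {
  std::deque<std::function<void()>> queue_tasks;
  std::mutex mutex_tasks;
  std::condition_variable condition_tasks;
  bool running = true; // guarded by mutex_tasks

  void push_task(std::function<void()> task) {
    {
      std::lock_guard<std::mutex> lock(mutex_tasks);
      queue_tasks.push_back(std::move(task));
    }
    condition_tasks.notify_one(); // wake the worker for the new task
  }

  void worker_loop() {
    for (;;) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> lock(mutex_tasks);
        // Same predicate shape as the wait() added in update_slots():
        // wake when there is work, or when we are shutting down.
        condition_tasks.wait(lock,
                             [&] { return !queue_tasks.empty() || !running; });
        if (!running)
          return;
        task = std::move(queue_tasks.front());
        queue_tasks.pop_front();
      }
      task(); // run the task outside the lock
    }
  }

  void stop() {
    {
      // Flip the flag under the same mutex the waiter uses, so the notify
      // cannot slip in between the predicate check and the block.
      std::lock_guard<std::mutex> lock(mutex_tasks);
      running = false;
    }
    condition_tasks.notify_one(); // mirrors stopBackgroundTask() above
  }
};

int main() {
  TaskQueue q;
  std::thread worker(&TaskQueue::worker_loop, &q);
  q.push_task([] { std::cout << "task ran\n"; });
  std::this_thread::sleep_for(std::chrono::milliseconds(50)); // demo only
  q.stop();
  worker.join();
}

The sleep in main() only gives the worker time to drain the queue before shutdown; the detail carried over from the patch is that stop() notifies the condition variable after clearing the flag, otherwise the worker could stay blocked in wait() indefinitely.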