From 2c362644a8bf5ff9f590a6aa72965b088aa5e6cb Mon Sep 17 00:00:00 2001
From: Bach Dao
Date: Mon, 26 Feb 2024 17:44:27 +0700
Subject: [PATCH 1/5] Fix race when model is loaded first time

---
 controllers/llamaCPP.cc | 62 ++++++++++++++++++++++++++---------------
 controllers/llamaCPP.h  |  4 +++
 2 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 6a34ac41c..03b860545 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -536,31 +536,40 @@ void llamaCPP::modelStatus(
 void llamaCPP::loadModel(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
-  if (llama.model_loaded_external) {
-    LOG_INFO << "model loaded";
-    Json::Value jsonResp;
-    jsonResp["message"] = "Model already loaded";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    resp->setStatusCode(drogon::k409Conflict);
-    callback(resp);
+
+  if(llama.model_loaded_external.load(std::memory_order_acquire)){
+    model_loaded_response(req, callback);
     return;
   }
+  bool modelLoadedSuccess;
+  const auto& jsonBody = req->getJsonObject();
+
+  {
+    std::scoped_lock lck{load_model_mutex};
+    if (llama.model_loaded_external.load(std::memory_order_relaxed)) {
+      model_loaded_response(req, callback);
+      return;
+    }
+    modelLoadedSuccess = loadModelImpl(jsonBody);
+    llama.model_loaded_external.store(modelLoadedSuccess, std::memory_order_relaxed);
+  }
 
-  const auto &jsonBody = req->getJsonObject();
-  if (!loadModelImpl(jsonBody)) {
-    // Error occurred during model loading
-    Json::Value jsonResp;
-    jsonResp["message"] = "Failed to load model";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    resp->setStatusCode(drogon::k500InternalServerError);
-    callback(resp);
-  } else {
-    // Model loaded successfully
-    Json::Value jsonResp;
-    jsonResp["message"] = "Model loaded successfully";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    callback(resp);
+  if (modelLoadedSuccess) {
+    // Model loaded successfully
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model loaded successfully";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+    return;
   }
+
+  // Error occurred during model loading
+  Json::Value jsonResp;
+  jsonResp["message"] = "Failed to load model";
+  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+  resp->setStatusCode(drogon::k500InternalServerError);
+  callback(resp);
+
 }
 
 bool llamaCPP::loadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
@@ -654,7 +663,6 @@ bool llamaCPP::loadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   queue = new trantor::ConcurrentTaskQueue(llama.params.n_parallel,
                                            "llamaCPP");
 
-  llama.model_loaded_external = true;
 
   LOG_INFO << "Started background task here!";
   backgroundThread = std::thread(&llamaCPP::backgroundTask, this);
@@ -683,3 +691,13 @@ void llamaCPP::stopBackgroundTask() {
     }
   }
 }
+void llamaCPP::model_loaded_response(
+    const HttpRequestPtr ptr,
+    std::function<void(const HttpResponsePtr &)> callback) {
+  LOG_INFO << "model loaded";
+  Json::Value jsonResp;
+  jsonResp["message"] = "Model already loaded";
+  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+  resp->setStatusCode(drogon::k409Conflict);
+  callback(resp);
+}
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 292565b99..9b13a7453 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2262,6 +2262,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP>, public ChatProvider {
   std::atomic<int> no_of_chats = 0;
   int clean_cache_threshold;
   std::string grammar_file_content;
+  std::mutex load_model_mutex;
 
   /**
   * Queue to handle the inference tasks
@@ -2277,5 +2278,8 @@ class llamaCPP : public drogon::HttpController<llamaCPP>, public ChatProvider {
   void warmupModel();
   void backgroundTask();
   void stopBackgroundTask();
+  void model_loaded_response(
+      const HttpRequestPtr ptr,
+      std::function<void(const HttpResponsePtr &)> function);
 };
 }; // namespace inferences

From bcca08ec72946abbb43d6cffbbec85c8d55f836c Mon Sep 17 00:00:00 2001
From: Bach Dao
Date: Tue, 27 Feb 2024 13:42:46 +0700
Subject: [PATCH 2/5] Correct the last write to model_loaded_external flag with memory_order_release

---
 controllers/llamaCPP.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 03b860545..bc473173a 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -551,7 +551,7 @@ void llamaCPP::loadModel(
       return;
     }
     modelLoadedSuccess = loadModelImpl(jsonBody);
-    llama.model_loaded_external.store(modelLoadedSuccess, std::memory_order_relaxed);
+    llama.model_loaded_external.store(modelLoadedSuccess, std::memory_order_release);
   }
 
   if (modelLoadedSuccess) {

From 22164e5c2628ebad86d4b124c7adaf06554ea2a9 Mon Sep 17 00:00:00 2001
From: Bach Dao
Date: Tue, 27 Feb 2024 13:56:03 +0700
Subject: [PATCH 3/5] Refactor: change function's name to comply with convention

---
 controllers/llamaCPP.cc | 6 +++---
 controllers/llamaCPP.h  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index bc473173a..6ebc571e1 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -538,7 +538,7 @@ void llamaCPP::loadModel(
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
   if(llama.model_loaded_external.load(std::memory_order_acquire)){
-    model_loaded_response(req, callback);
+    modelLoadedResponse(req, callback);
     return;
   }
   bool modelLoadedSuccess;
@@ -547,7 +547,7 @@ void llamaCPP::loadModel(
   {
     std::scoped_lock lck{load_model_mutex};
     if (llama.model_loaded_external.load(std::memory_order_relaxed)) {
-      model_loaded_response(req, callback);
+      modelLoadedResponse(req, callback);
       return;
     }
     modelLoadedSuccess = loadModelImpl(jsonBody);
@@ -691,7 +691,7 @@ void llamaCPP::stopBackgroundTask() {
     }
   }
 }
-void llamaCPP::model_loaded_response(
+void llamaCPP::modelLoadedResponse(
     const HttpRequestPtr ptr,
     std::function<void(const HttpResponsePtr &)> callback) {
   LOG_INFO << "model loaded";
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 9b13a7453..6c2f7fd9a 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2278,7 +2278,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP>, public ChatProvider {
   void warmupModel();
   void backgroundTask();
   void stopBackgroundTask();
-  void model_loaded_response(
+  void modelLoadedResponse(
       const HttpRequestPtr ptr,
       std::function<void(const HttpResponsePtr &)> function);
 };

From 5a8af99f07e494b29f5bf24a40740dfab282cbfc Mon Sep 17 00:00:00 2001
From: Bach Dao
Date: Thu, 29 Feb 2024 21:34:24 +0700
Subject: [PATCH 4/5] Change to std::lock_guard for compatibility with C++11

---
 controllers/llamaCPP.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index fe1f5ed43..ac148d270 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -554,7 +554,7 @@ void llamaCPP::loadModel(
   const auto& jsonBody = req->getJsonObject();
 
   {
-    std::scoped_lock lck{load_model_mutex};
+    std::lock_guard<std::mutex> lck{load_model_mutex};
     if (llama.model_loaded_external.load(std::memory_order_relaxed)) {
       ModelLoadedResponse(callback);
       return;
     }

From f5ebb168e28156b457606e774c5099ea0ddf87f7 Mon Sep 17 00:00:00 2001
From: Bach Dao
Date: Sun, 3 Mar 2024 10:11:21 +0700
Subject: [PATCH 5/5] Fixed a typo

---
 controllers/llamaCPP.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index ac148d270..c2872ad77 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -541,7 +541,7 @@ void llamaCPP::ModelStatus(
   return;
 }
 
-void llamaCPP::loadModel(
+void llamaCPP::LoadModel(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
@@ -559,7 +559,7 @@ void llamaCPP::loadModel(
       ModelLoadedResponse(callback);
       return;
     }
-    modelLoadedSuccess = loadModelImpl(jsonBody);
+    modelLoadedSuccess = LoadModelImpl(jsonBody);
     llama.model_loaded_external.store(modelLoadedSuccess, std::memory_order_release);
   }
   if (modelLoadedSuccess) {
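
Editor's note, not part of the patch series: the standalone sketch below illustrates the double-checked locking shape the five patches converge on (acquire load on the fast path, std::lock_guard plus a relaxed re-check under the mutex, release store after the one-time load). The names ModelHolder, EnsureLoaded, and LoadWeights are hypothetical stand-ins; the real controller routes results through drogon HTTP callbacks instead of returning a bool.

// Minimal sketch of the pattern in the patches above, with hypothetical names.
#include <atomic>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

class ModelHolder {
 public:
  // Returns true if the model is usable after the call, no matter which
  // thread actually performed the load.
  bool EnsureLoaded() {
    // Fast path: the acquire load pairs with the release store below, so a
    // thread that observes `true` also observes the initialised model state.
    if (loaded_.load(std::memory_order_acquire)) {
      return true;
    }
    std::lock_guard<std::mutex> lock(mutex_);
    // Second check under the lock: another thread may have completed the
    // load while we were waiting for the mutex.
    if (loaded_.load(std::memory_order_relaxed)) {
      return true;
    }
    const bool ok = LoadWeights();  // expensive one-time work
    loaded_.store(ok, std::memory_order_release);
    return ok;
  }

 private:
  bool LoadWeights() {
    // Placeholder for the real model initialisation.
    return true;
  }

  std::atomic<bool> loaded_{false};
  std::mutex mutex_;
};

int main() {
  ModelHolder holder;
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) {
    threads.emplace_back([&holder, i] {
      std::cout << "thread " << i << " -> " << holder.EnsureLoaded() << '\n';
    });
  }
  for (auto &t : threads) t.join();
}

The release store pairs with the fast-path acquire load, so a reader that sees the flag set also sees the writes made by the load; the relaxed re-read inside the critical section is safe because the mutex already orders it against the store made by the loading thread.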