From 2c362644a8bf5ff9f590a6aa72965b088aa5e6cb Mon Sep 17 00:00:00 2001
From: Bach Dao
Date: Mon, 26 Feb 2024 17:44:27 +0700
Subject: [PATCH 1/5] Fix race when model is loaded first time

---
 controllers/llamaCPP.cc | 62 ++++++++++++++++++++++++++---------------
 controllers/llamaCPP.h  |  4 +++
 2 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 6a34ac41c..03b860545 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -536,31 +536,40 @@ void llamaCPP::modelStatus(
 void llamaCPP::loadModel(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
-  if (llama.model_loaded_external) {
-    LOG_INFO << "model loaded";
-    Json::Value jsonResp;
-    jsonResp["message"] = "Model already loaded";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    resp->setStatusCode(drogon::k409Conflict);
-    callback(resp);
+
+  if(llama.model_loaded_external.load(std::memory_order_acquire)){
+    model_loaded_response(req, callback);
     return;
   }
+  bool modelLoadedSuccess;
+  const auto& jsonBody = req->getJsonObject();
+
+  {
+    std::scoped_lock lck{load_model_mutex};
+    if (llama.model_loaded_external.load(std::memory_order_relaxed)) {
+      model_loaded_response(req, callback);
+      return;
+    }
+    modelLoadedSuccess = loadModelImpl(jsonBody);
+    llama.model_loaded_external.store(modelLoadedSuccess, std::memory_order_relaxed);
+  }
 
-  const auto &jsonBody = req->getJsonObject();
-  if (!loadModelImpl(jsonBody)) {
-    // Error occurred during model loading
-    Json::Value jsonResp;
-    jsonResp["message"] = "Failed to load model";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    resp->setStatusCode(drogon::k500InternalServerError);
-    callback(resp);
-  } else {
-    // Model loaded successfully
-    Json::Value jsonResp;
-    jsonResp["message"] = "Model loaded successfully";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    callback(resp);
+  if (modelLoadedSuccess) {
+    // Model loaded successfully
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model loaded successfully";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+    return;
   }
+
+  // Error occurred during model loading
+  Json::Value jsonResp;
+  jsonResp["message"] = "Failed to load model";
+  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+  resp->setStatusCode(drogon::k500InternalServerError);
+  callback(resp);
+
 }
 
 bool llamaCPP::loadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
@@ -654,7 +663,6 @@ bool llamaCPP::loadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   queue = new trantor::ConcurrentTaskQueue(llama.params.n_parallel,
                                            "llamaCPP");
 
-  llama.model_loaded_external = true;
 
   LOG_INFO << "Started background task here!";
   backgroundThread = std::thread(&llamaCPP::backgroundTask, this);
@@ -683,3 +691,13 @@ void llamaCPP::stopBackgroundTask() {
     }
   }
 }
+void llamaCPP::model_loaded_response(
+    const HttpRequestPtr ptr,
+    std::function<void(const HttpResponsePtr &)> callback) {
+  LOG_INFO << "model loaded";
+  Json::Value jsonResp;
+  jsonResp["message"] = "Model already loaded";
+  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+  resp->setStatusCode(drogon::k409Conflict);
+  callback(resp);
+}
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 292565b99..9b13a7453 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2262,6 +2262,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP>, public ChatProvider {
   std::atomic<int> no_of_chats = 0;
   int clean_cache_threshold;
   std::string grammar_file_content;
+  std::mutex load_model_mutex;
 
   /**
   * Queue to handle the inference tasks
@@ -2277,5 +2278,8 @@ class llamaCPP : public drogon::HttpController<llamaCPP>, public ChatProvider {
   void warmupModel();
   void backgroundTask();
   void stopBackgroundTask();
+  void model_loaded_response(
+      const HttpRequestPtr ptr,
+      std::function<void(const HttpResponsePtr &)> function);
 };
 }; // namespace inferences

From bcca08ec72946abbb43d6cffbbec85c8d55f836c Mon Sep 17 00:00:00 2001
From: Bach Dao
Date: Tue, 27 Feb 2024 13:42:46 +0700
Subject: [PATCH 2/5] Correct the last write to model_loaded_external flag with memory_order_release

---
 controllers/llamaCPP.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 03b860545..bc473173a 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -551,7 +551,7 @@ void llamaCPP::loadModel(
       return;
     }
     modelLoadedSuccess = loadModelImpl(jsonBody);
-    llama.model_loaded_external.store(modelLoadedSuccess, std::memory_order_relaxed);
+    llama.model_loaded_external.store(modelLoadedSuccess, std::memory_order_release);
   }
 
   if (modelLoadedSuccess) {

From 22164e5c2628ebad86d4b124c7adaf06554ea2a9 Mon Sep 17 00:00:00 2001
From: Bach Dao
Date: Tue, 27 Feb 2024 13:56:03 +0700
Subject: [PATCH 3/5] Refactor: change function's name to comply with convention

---
 controllers/llamaCPP.cc | 6 +++---
 controllers/llamaCPP.h  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index bc473173a..6ebc571e1 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -538,7 +538,7 @@ void llamaCPP::loadModel(
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
   if(llama.model_loaded_external.load(std::memory_order_acquire)){
-    model_loaded_response(req, callback);
+    modelLoadedResponse(req, callback);
     return;
   }
   bool modelLoadedSuccess;
@@ -547,7 +547,7 @@ void llamaCPP::loadModel(
   {
     std::scoped_lock lck{load_model_mutex};
     if (llama.model_loaded_external.load(std::memory_order_relaxed)) {
-      model_loaded_response(req, callback);
+      modelLoadedResponse(req, callback);
       return;
     }
     modelLoadedSuccess = loadModelImpl(jsonBody);
@@ -691,7 +691,7 @@ void llamaCPP::stopBackgroundTask() {
     }
   }
 }
-void llamaCPP::model_loaded_response(
+void llamaCPP::modelLoadedResponse(
     const HttpRequestPtr ptr,
     std::function<void(const HttpResponsePtr &)> callback) {
   LOG_INFO << "model loaded";
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 9b13a7453..6c2f7fd9a 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2278,7 +2278,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP>, public ChatProvider {
   void warmupModel();
   void backgroundTask();
   void stopBackgroundTask();
-  void model_loaded_response(
+  void modelLoadedResponse(
       const HttpRequestPtr ptr,
       std::function<void(const HttpResponsePtr &)> function);
 };

From 5a8af99f07e494b29f5bf24a40740dfab282cbfc Mon Sep 17 00:00:00 2001
From: Bach Dao
Date: Thu, 29 Feb 2024 21:34:24 +0700
Subject: [PATCH 4/5] Change to std::lock_guard for compatibility with C++11

---
 controllers/llamaCPP.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index fe1f5ed43..ac148d270 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -554,7 +554,7 @@ void llamaCPP::loadModel(
   const auto& jsonBody = req->getJsonObject();
 
   {
-    std::scoped_lock lck{load_model_mutex};
+    std::lock_guard<std::mutex> lck{load_model_mutex};
     if (llama.model_loaded_external.load(std::memory_order_relaxed)) {
       ModelLoadedResponse(callback);
       return;
     }

From f5ebb168e28156b457606e774c5099ea0ddf87f7 Mon Sep 17 00:00:00 2001
From: Bach Dao
Date: Sun, 3 Mar 2024 10:11:21 +0700
Subject: [PATCH 5/5] Fixed a typo

---
 controllers/llamaCPP.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index ac148d270..c2872ad77 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -541,7 +541,7 @@ void llamaCPP::ModelStatus(
   return;
 }
 
-void llamaCPP::loadModel(
+void llamaCPP::LoadModel(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
@@ -559,7 +559,7 @@ void llamaCPP::loadModel(
       ModelLoadedResponse(callback);
       return;
     }
-    modelLoadedSuccess = loadModelImpl(jsonBody);
+    modelLoadedSuccess = LoadModelImpl(jsonBody);
     llama.model_loaded_external.store(modelLoadedSuccess, std::memory_order_release);
   }
   if (modelLoadedSuccess) {
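
Editor's note, not part of the patch series: the standalone sketch below illustrates the double-checked locking shape the five patches converge on (acquire load on the fast path, std::lock_guard plus a relaxed re-check under the mutex, release store after the one-time load). The names ModelHolder, EnsureLoaded, and LoadWeights are hypothetical stand-ins; the real controller routes results through drogon HTTP callbacks instead of returning a bool.

// Minimal sketch of the pattern in the patches above, with hypothetical names.
#include <atomic>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>

class ModelHolder {
 public:
  // Returns true if the model is usable after the call, no matter which
  // thread actually performed the load.
  bool EnsureLoaded() {
    // Fast path: the acquire load pairs with the release store below, so a
    // thread that observes `true` also observes the initialised model state.
    if (loaded_.load(std::memory_order_acquire)) {
      return true;
    }
    std::lock_guard<std::mutex> lock(mutex_);
    // Second check under the lock: another thread may have completed the
    // load while we were waiting for the mutex.
    if (loaded_.load(std::memory_order_relaxed)) {
      return true;
    }
    const bool ok = LoadWeights();  // expensive one-time work
    loaded_.store(ok, std::memory_order_release);
    return ok;
  }

 private:
  bool LoadWeights() {
    // Placeholder for the real model initialisation.
    return true;
  }

  std::atomic<bool> loaded_{false};
  std::mutex mutex_;
};

int main() {
  ModelHolder holder;
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) {
    threads.emplace_back([&holder, i] {
      std::cout << "thread " << i << " -> " << holder.EnsureLoaded() << '\n';
    });
  }
  for (auto &t : threads) t.join();
}

The release store pairs with the fast-path acquire load, so a reader that sees the flag set also sees the writes made by the load; the relaxed re-read inside the critical section is safe because the mutex already orders it against the store made by the loading thread.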