janhq · tikikun · Jan 12, 2024 · Jan 12, 2024 · Jan 12, 2024
diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
@@ -153,7 +153,7 @@ void llamaCPP::chatCompletion(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
-  if (!model_loaded) {
+  if (!llama.model_loaded_external) {
     Json::Value jsonResp;
     jsonResp["message"] =
         "Model has not been loaded, please load model into nitro";
@@ -391,7 +391,7 @@ void llamaCPP::unloadModel(
     std::function<void(const HttpResponsePtr &)> &&callback) {
   Json::Value jsonResp;
   jsonResp["message"] = "No model loaded";
-  if (model_loaded) {
+  if (llama.model_loaded_external) {
     stopBackgroundTask();
 
     llama_free(llama.ctx);
@@ -408,7 +408,7 @@ void llamaCPP::modelStatus(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
   Json::Value jsonResp;
-  bool is_model_loaded = this->model_loaded;
+  bool is_model_loaded = llama.model_loaded_external;
   if (is_model_loaded) {
     jsonResp["model_loaded"] = is_model_loaded;
     jsonResp["model_data"] = llama.get_model_props().dump();
@@ -456,7 +456,7 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
       log_enable();
       std::string llama_log_folder = jsonBody["llama_log_folder"].asString();
       log_set_target(llama_log_folder + "llama.log");
-    }    // Set folder for llama log
+    } // Set folder for llama log
   }
 #ifdef GGML_USE_CUBLAS
   LOG_INFO << "Setting up GGML CUBLAS PARAMS";
@@ -483,7 +483,9 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
     return false; // Indicate failure
   }
   llama.initialize();
-  model_loaded = true;
+
+  llama.model_loaded_external = true;
+
   LOG_INFO << "Started background task here!";
   backgroundThread = std::thread(&llamaCPP::backgroundTask, this);
   warmupModel();
@@ -494,7 +496,7 @@ void llamaCPP::loadModel(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
 
-  if (model_loaded) {
+  if (llama.model_loaded_external) {
     LOG_INFO << "model loaded";
     Json::Value jsonResp;
     jsonResp["message"] = "Model already loaded";
@@ -522,7 +524,7 @@ void llamaCPP::loadModel(
 }
 
 void llamaCPP::backgroundTask() {
-  while (model_loaded) {
+  while (llama.model_loaded_external) {
     // model_loaded =
     llama.update_slots();
   }
@@ -533,8 +535,9 @@ void llamaCPP::backgroundTask() {
 }
 
 void llamaCPP::stopBackgroundTask() {
-  if (model_loaded) {
-    model_loaded = false;
+  if (llama.model_loaded_external) {
+    llama.model_loaded_external = false;
+    llama.condition_tasks.notify_one();
     LOG_INFO << "changed to false";
     if (backgroundThread.joinable()) {
       backgroundThread.join();

diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
@@ -503,6 +503,9 @@ struct llama_server_context {
   int32_t id_gen;
   int32_t n_ctx; // total context for all clients / slots
 
+  // True while a model is loaded; set and cleared by the llamaCPP controller
+  std::atomic<bool> model_loaded_external = false;
+
   // system prompt
   bool system_need_update = false;
 
@@ -1538,10 +1541,13 @@ struct llama_server_context {
                 "cache\n");
         kv_cache_clear();
       }
-      std::this_thread::sleep_for(std::chrono::milliseconds(5));
-      // TODO: Need to implement queueing using CV for better performance
-      // std::unique_lock<std::mutex> lock(mutex_tasks);
-      // condition_tasks.wait(lock, [&] { return !queue_tasks.empty(); });
+      // Block until a task is queued or model_loaded_external is cleared.
+      // NOTE(review): flag is cleared without mutex_tasks; wakeup may be lost
+      std::unique_lock<std::mutex> lock(mutex_tasks);
+      condition_tasks.wait(lock, [&] {
+        return (!queue_tasks.empty() && model_loaded_external) ||
+               (!model_loaded_external);
+      });
     }
 
     for (llama_client_slot &slot : slots) {
@@ -2554,7 +2560,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
 
 private:
   llama_server_context llama;
-  std::atomic<bool> model_loaded = false;
+  // model_loaded now lives in llama_server_context::model_loaded_external
   size_t sent_count = 0;
   size_t sent_token_probs_index = 0;
   std::thread backgroundThread;