From 4cfa440b16b78b74b1fe6d668f337907df1c2990 Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:08:02 +0700 Subject: [PATCH 1/8] feat: add dynamic loading model through api --- config.json | 8 +---- controllers/llamaCPP.cc | 67 ++++++++++++++++++++++++++++++++++++++ controllers/llamaCPP.h | 71 ++++++++++++++++++++++------------------- main.cc | 23 ++++++------- 4 files changed, 118 insertions(+), 51 deletions(-) diff --git a/config.json b/config.json index 2a7f093cc..add7da3a6 100644 --- a/config.json +++ b/config.json @@ -4,11 +4,5 @@ "address": "127.0.0.1", "port": 3928 } - ], - "custom_config": { - "llama_model_path": "/Users/alandao/Documents/codes/nitro.cpp_temp/models/llama2_7b_chat_uncensored.Q4_0.gguf", - "ctx_len": 2048, - "ngl": 100, - "embedding":true - } + ] } diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index 5a12621f7..679859f5c 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,15 @@ std::string create_return_json(const std::string &id, const std::string &model, void llamaCPP::chatCompletion( const HttpRequestPtr &req, std::function &&callback) { + if (!model_loaded) { + Json::Value jsonResp; + jsonResp["message"] = "Model is not loaded yet"; + auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp); + resp->setStatusCode(drogon::k500InternalServerError); + callback(resp); + return; + } + const auto &jsonBody = req->getJsonObject(); std::string formatted_output = "Below is a conversation between an AI system named ASSISTANT and USER\n"; @@ -203,6 +213,15 @@ void llamaCPP::chatCompletion( void llamaCPP::embedding( const HttpRequestPtr &req, std::function &&callback) { + if (!model_loaded) { + Json::Value jsonResp; + jsonResp["message"] = "Model is not loaded yet"; + auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp); + resp->setStatusCode(drogon::k500InternalServerError); + callback(resp); + return; + } + auto lock = llama.lock(); const auto &jsonBody = req->getJsonObject(); @@ -225,3 +244,51 @@ void llamaCPP::embedding( resp->setContentTypeString("application/json"); callback(resp); } + +void llamaCPP::loadModel( + const HttpRequestPtr &req, + std::function &&callback) { + + const auto &jsonBody = req->getJsonObject(); + + gpt_params params; + if (jsonBody) { + params.model = (*jsonBody)["llama_model_path"].asString(); + params.n_gpu_layers = (*jsonBody)["ngl"].asInt(); + params.n_ctx = (*jsonBody)["ctx_len"].asInt(); + params.embedding = (*jsonBody)["embedding"].asBool(); + } +#ifdef GGML_USE_CUBLAS + LOG_INFO << "Setting up GGML CUBLAS PARAMS"; + params.mul_mat_q = false; +#endif // GGML_USE_CUBLAS + if (params.model_alias == "unknown") { + params.model_alias = params.model; + } + + llama_backend_init(params.numa); + + LOG_INFO_LLAMA("build info", + {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); + LOG_INFO_LLAMA("system info", + { + {"n_threads", params.n_threads}, + {"total_threads", std::thread::hardware_concurrency()}, + {"system_info", llama_print_system_info()}, + }); + + // load the model + if (!llama.loadModel(params)) { + LOG_ERROR << "Error loading the model will exit the program"; + Json::Value jsonResp; + jsonResp["message"] = "Model loaded failed"; + auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp); + resp->setStatusCode(drogon::k500InternalServerError); + callback(resp); + } + Json::Value jsonResp; + jsonResp["message"] = "Model loaded successfully"; + model_loaded = true; + auto resp = 
drogon::HttpResponse::newHttpJsonResponse(jsonResp); + callback(resp); +} diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index e4c6c881d..ce6a23e02 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1,3 +1,4 @@ +#include #if defined(_WIN32) #define NOMINMAX #endif @@ -1311,51 +1312,55 @@ namespace inferences { class llamaCPP : public drogon::HttpController { public: llamaCPP() { - gpt_params params; - auto conf = drogon::app().getCustomConfig(); - params.model = conf["llama_model_path"].asString(); - params.n_gpu_layers = conf["ngl"].asInt(); - params.n_ctx = conf["ctx_len"].asInt(); - params.embedding = conf["embedding"].asBool(); -#ifdef GGML_USE_CUBLAS - LOG_INFO << "Setting up GGML CUBLAS PARAMS"; - params.mul_mat_q = false; -#endif // GGML_USE_CUBLAS - if (params.model_alias == "unknown") { - params.model_alias = params.model; - } - - llama_backend_init(params.numa); - - LOG_INFO_LLAMA("build info", - {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); - LOG_INFO_LLAMA("system info", - { - {"n_threads", params.n_threads}, - {"total_threads", std::thread::hardware_concurrency()}, - {"system_info", llama_print_system_info()}, - }); - - // load the model - if (!llama.loadModel(params)) { - LOG_ERROR << "Error loading the model will exit the program"; - std::terminate(); - } - nitro_utils::nitro_logo(); +// gpt_params params; +// auto conf = drogon::app().getCustomConfig(); +// params.model = conf["llama_model_path"].asString(); +// params.n_gpu_layers = conf["ngl"].asInt(); +// params.n_ctx = conf["ctx_len"].asInt(); +// params.embedding = conf["embedding"].asBool(); +//#ifdef GGML_USE_CUBLAS +// LOG_INFO << "Setting up GGML CUBLAS PARAMS"; +// params.mul_mat_q = false; +//#endif // GGML_USE_CUBLAS +// if (params.model_alias == "unknown") { +// params.model_alias = params.model; +// } +// +// llama_backend_init(params.numa); +// +// LOG_INFO_LLAMA("build info", +// {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); +// LOG_INFO_LLAMA("system info", +// { +// {"n_threads", params.n_threads}, +// {"total_threads", std::thread::hardware_concurrency()}, +// {"system_info", llama_print_system_info()}, +// }); +// +// // load the model +// if (!llama.loadModel(params)) { +// LOG_ERROR << "Error loading the model will exit the program"; +// std::terminate(); +// } } METHOD_LIST_BEGIN // list path definitions here; - METHOD_ADD(llamaCPP::chatCompletion, "chat_completion"); - METHOD_ADD(llamaCPP::embedding,"embedding"); + METHOD_ADD(llamaCPP::chatCompletion, "chat_completion",Post); + METHOD_ADD(llamaCPP::embedding,"embedding",Post); + METHOD_ADD(llamaCPP::loadModel,"loadmodel",Post); // PATH_ADD("/llama/chat_completion", Post); METHOD_LIST_END void chatCompletion(const HttpRequestPtr &req, std::function &&callback); void embedding(const HttpRequestPtr &req, std::function &&callback); + void loadModel(const HttpRequestPtr &req, + std::function &&callback); + private: llama_server_context llama; + bool model_loaded = false; size_t sent_count = 0; size_t sent_token_probs_index = 0; }; diff --git a/main.cc b/main.cc index 5ac05720e..fa9fb5ef8 100644 --- a/main.cc +++ b/main.cc @@ -1,4 +1,4 @@ - +#include "controllers/nitro_utils.h" #include // for PATH_MAX #include @@ -43,16 +43,15 @@ int main() { char path[MAX_PATH]; char dir[MAX_PATH]; // char dir[MAX_PATH]; - if(GetModuleFileNameA(NULL, path, sizeof(path))) { - char* lastBackslash = strrchr(path, '\\'); - if (lastBackslash == nullptr) { - return 1; - } - lastBackslash[0] = '\0'; - strcpy(dir, path); - 
configPath = std::string(dir) + "/config/config.json"; - } - else { + if (GetModuleFileNameA(NULL, path, sizeof(path))) { + char *lastBackslash = strrchr(path, '\\'); + if (lastBackslash == nullptr) { + return 1; + } + lastBackslash[0] = '\0'; + strcpy(dir, path); + configPath = std::string(dir) + "/config/config.json"; + } else { LOG_ERROR << "Failed to get binary location!"; return 1; } @@ -66,6 +65,8 @@ int main() { auto app_conf = drogon::app().getCustomConfig(); LOG_INFO << app_conf["llama_model_file"].asString(); + nitro_utils::nitro_logo(); + LOG_INFO << "Server started, please load your model"; // drogon::app().addListener("0.0.0.0", 8080); drogon::app().run(); From 571e03b45e44e5d02beb6d2de769a8f851cd2cc8 Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:14:28 +0700 Subject: [PATCH 2/8] feat: add dynamic loading model through api --- main.cc | 60 ++++++++++++--------------------------------------------- 1 file changed, 12 insertions(+), 48 deletions(-) diff --git a/main.cc b/main.cc index fa9fb5ef8..4b703d7ea 100644 --- a/main.cc +++ b/main.cc @@ -14,60 +14,24 @@ #error "Unsupported platform!" #endif -int main() { - std::string configPath; +int main(int argc, char *argv[]) { -#if defined(__APPLE__) && defined(__MACH__) - char path[PATH_MAX]; - uint32_t size = sizeof(path); - if (_NSGetExecutablePath(path, &size) == 0) { - path[size] = '\0'; // Null-terminate the string - char *dir = dirname(path); - configPath = std::string(dir) + "/config/config.json"; - } else { - LOG_ERROR << "Failed to get binary location!"; - return 1; - } -#elif defined(__linux__) - char path[PATH_MAX]; - ssize_t len = readlink("/proc/self/exe", path, sizeof(path) - 1); - if (len != -1) { - path[len] = '\0'; - char *dir = dirname(path); - configPath = std::string(dir) + "/config/config.json"; - } else { - LOG_ERROR << "Failed to get binary location!"; - return 1; - } -#elif defined(_WIN32) - char path[MAX_PATH]; - char dir[MAX_PATH]; - // char dir[MAX_PATH]; - if (GetModuleFileNameA(NULL, path, sizeof(path))) { - char *lastBackslash = strrchr(path, '\\'); - if (lastBackslash == nullptr) { - return 1; - } - lastBackslash[0] = '\0'; - strcpy(dir, path); - configPath = std::string(dir) + "/config/config.json"; - } else { - LOG_ERROR << "Failed to get binary location!"; - return 1; + std::string host = "127.0.0.1"; + int port = 3928; + + // Check for host argument + if (argc > 1) { + host = argv[1]; } -#else - LOG_ERROR << "Unsupported platform!"; - return 1; -#endif - // Set HTTP listener address and port - drogon::app().loadConfigFile(configPath); - auto app_conf = drogon::app().getCustomConfig(); + // Check for port argument + if (argc > 2) { + port = std::atoi(argv[2]); // Convert string argument to int + } - LOG_INFO << app_conf["llama_model_file"].asString(); nitro_utils::nitro_logo(); LOG_INFO << "Server started, please load your model"; - // drogon::app().addListener("0.0.0.0", 8080); + drogon::app().addListener(host, port); drogon::app().run(); return 0; From 108bbd5a53bf2800667f482c2bb9677ea25f515c Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:19:32 +0700 Subject: [PATCH 3/8] feat: no config file --- main.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.cc b/main.cc index 4b703d7ea..ff3ca04f8 100644 --- a/main.cc +++ b/main.cc @@ -30,7 +30,8 @@ int main(int argc, char *argv[]) { } nitro_utils::nitro_logo(); - LOG_INFO << "Server started, please load your model"; + LOG_INFO << "Server started, listening at: " << host << ":" << port + << 
"please load your model"; drogon::app().addListener(host, port); drogon::app().run(); From 177ee9b6811f4481fac7a845b7c46fe1c526409d Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:21:33 +0700 Subject: [PATCH 4/8] chore: format code --- controllers/llamaCPP.h | 69 +++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index ce6a23e02..4a7a4f3a3 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1312,42 +1312,44 @@ namespace inferences { class llamaCPP : public drogon::HttpController { public: llamaCPP() { -// gpt_params params; -// auto conf = drogon::app().getCustomConfig(); -// params.model = conf["llama_model_path"].asString(); -// params.n_gpu_layers = conf["ngl"].asInt(); -// params.n_ctx = conf["ctx_len"].asInt(); -// params.embedding = conf["embedding"].asBool(); -//#ifdef GGML_USE_CUBLAS -// LOG_INFO << "Setting up GGML CUBLAS PARAMS"; -// params.mul_mat_q = false; -//#endif // GGML_USE_CUBLAS -// if (params.model_alias == "unknown") { -// params.model_alias = params.model; -// } -// -// llama_backend_init(params.numa); -// -// LOG_INFO_LLAMA("build info", -// {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); -// LOG_INFO_LLAMA("system info", -// { -// {"n_threads", params.n_threads}, -// {"total_threads", std::thread::hardware_concurrency()}, -// {"system_info", llama_print_system_info()}, -// }); -// -// // load the model -// if (!llama.loadModel(params)) { -// LOG_ERROR << "Error loading the model will exit the program"; -// std::terminate(); -// } + // gpt_params params; + // auto conf = drogon::app().getCustomConfig(); + // params.model = conf["llama_model_path"].asString(); + // params.n_gpu_layers = conf["ngl"].asInt(); + // params.n_ctx = conf["ctx_len"].asInt(); + // params.embedding = conf["embedding"].asBool(); + // #ifdef GGML_USE_CUBLAS + // LOG_INFO << "Setting up GGML CUBLAS PARAMS"; + // params.mul_mat_q = false; + // #endif // GGML_USE_CUBLAS + // if (params.model_alias == "unknown") { + // params.model_alias = params.model; + // } + // + // llama_backend_init(params.numa); + // + // LOG_INFO_LLAMA("build info", + // {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); + // LOG_INFO_LLAMA("system info", + // { + // {"n_threads", params.n_threads}, + // {"total_threads", + // std::thread::hardware_concurrency()}, + // {"system_info", llama_print_system_info()}, + // }); + // + // // load the model + // if (!llama.loadModel(params)) { + // LOG_ERROR << "Error loading the model will exit the program"; + // std::terminate(); + // } + // deprecate this if find no usecase } METHOD_LIST_BEGIN // list path definitions here; - METHOD_ADD(llamaCPP::chatCompletion, "chat_completion",Post); - METHOD_ADD(llamaCPP::embedding,"embedding",Post); - METHOD_ADD(llamaCPP::loadModel,"loadmodel",Post); + METHOD_ADD(llamaCPP::chatCompletion, "chat_completion", Post); + METHOD_ADD(llamaCPP::embedding, "embedding", Post); + METHOD_ADD(llamaCPP::loadModel, "loadmodel", Post); // PATH_ADD("/llama/chat_completion", Post); METHOD_LIST_END void chatCompletion(const HttpRequestPtr &req, @@ -1357,7 +1359,6 @@ class llamaCPP : public drogon::HttpController { void loadModel(const HttpRequestPtr &req, std::function &&callback); - private: llama_server_context llama; bool model_loaded = false; From 636700774a0a030cb5f6965047d7578743932aa0 Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:29:04 +0700 Subject: [PATCH 5/8] feat: no config file --- 
main.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.cc b/main.cc index ff3ca04f8..4c394c635 100644 --- a/main.cc +++ b/main.cc @@ -30,8 +30,8 @@ int main(int argc, char *argv[]) { } nitro_utils::nitro_logo(); - LOG_INFO << "Server started, listening at: " << host << ":" << port - << "please load your model"; + LOG_INFO << "Server started, listening at: " << host << ":" << port; + LOG_INFO << "please load your model"; drogon::app().addListener(host, port); drogon::app().run(); From 96d38b97b2f05c1c276e02ab923524cea6f57736 Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:29:24 +0700 Subject: [PATCH 6/8] feat: no config file --- main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cc b/main.cc index 4c394c635..c09cba781 100644 --- a/main.cc +++ b/main.cc @@ -31,7 +31,7 @@ int main(int argc, char *argv[]) { nitro_utils::nitro_logo(); LOG_INFO << "Server started, listening at: " << host << ":" << port; - LOG_INFO << "please load your model"; + LOG_INFO << "Please load your model"; drogon::app().addListener(host, port); drogon::app().run(); From 9bb11daec9f720a166b5c57e5d786cee03a1c4a7 Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:48:25 +0700 Subject: [PATCH 7/8] chore: format code --- controllers/llamaCPP.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 4a7a4f3a3..9cf14b848 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1188,7 +1188,7 @@ static void parse_options_completion(const json &body, } llama.params.antiprompt.clear(); - const auto &stop = body.find("stop"); + ế mà mua chưa, e tính nhờ mua 2 hộp thể const auto &stop = body.find("stop"); if (stop != body.end() && stop->is_array()) { for (const auto &word : *stop) { if (!word.empty()) { From a7a38180db554c1395482635fc5ca1035a3944b3 Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:49:07 +0700 Subject: [PATCH 8/8] chore: format code --- controllers/llamaCPP.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 9cf14b848..4a7a4f3a3 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1188,7 +1188,7 @@ static void parse_options_completion(const json &body, } llama.params.antiprompt.clear(); - ế mà mua chưa, e tính nhờ mua 2 hộp thể const auto &stop = body.find("stop"); + const auto &stop = body.find("stop"); if (stop != body.end() && stop->is_array()) { for (const auto &word : *stop) { if (!word.empty()) {
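
Usage note: taken together, these patches drop the config-file startup path. The server now boots without a model, reading an optional listen host from argv[1] and port from argv[2] (defaulting to 127.0.0.1:3928), and a model is loaded afterwards by POSTing a JSON body with "llama_model_path", "ngl", "ctx_len" and "embedding" to the new loadModel handler registered as "loadmodel". The C++ sketch below shows one way a client could drive that endpoint with drogon's own HttpClient; it is an illustration under stated assumptions, not part of the patches: the model path is a placeholder, and the request path /inferences/llamacpp/loadmodel is inferred from the controller's namespace and class registration rather than spelled out here.

#include <drogon/drogon.h>
#include <json/json.h>

int main() {
  // JSON body with the fields llamaCPP::loadModel reads from the request.
  Json::Value body;
  body["llama_model_path"] = "/path/to/model.gguf"; // placeholder, not a real path
  body["ngl"] = 100;       // number of GPU layers to offload
  body["ctx_len"] = 2048;  // context length
  body["embedding"] = true;

  auto client = drogon::HttpClient::newHttpClient("http://127.0.0.1:3928");
  auto req = drogon::HttpRequest::newHttpJsonRequest(body);
  req->setMethod(drogon::Post);
  req->setPath("/inferences/llamacpp/loadmodel"); // assumed controller path

  client->sendRequest(req, [](drogon::ReqResult result,
                              const drogon::HttpResponsePtr &resp) {
    if (result == drogon::ReqResult::Ok && resp) {
      LOG_INFO << "loadmodel response: " << std::string(resp->getBody());
    } else {
      LOG_ERROR << "loadmodel request failed";
    }
    drogon::app().quit(); // stop the event loop once the reply arrives
  });

  drogon::app().run(); // sendRequest completes on the running event loop
  return 0;
}

Because loading a large GGUF file can take a while, a client would typically wait for this call to return before issuing chat_completion or embedding requests; as of patch 1, both of those handlers respond with HTTP 500 and "Model is not loaded yet" until loadModel has succeeded.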