From 4cfa440b16b78b74b1fe6d668f337907df1c2990 Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:08:02 +0700 Subject: [PATCH 1/8] feat: add dynamic loading model through api --- config.json | 8 +---- controllers/llamaCPP.cc | 67 ++++++++++++++++++++++++++++++++++++++ controllers/llamaCPP.h | 71 ++++++++++++++++++++++------------------- main.cc | 23 ++++++------- 4 files changed, 118 insertions(+), 51 deletions(-) diff --git a/config.json b/config.json index 2a7f093cc..add7da3a6 100644 --- a/config.json +++ b/config.json @@ -4,11 +4,5 @@ "address": "127.0.0.1", "port": 3928 } - ], - "custom_config": { - "llama_model_path": "/Users/alandao/Documents/codes/nitro.cpp_temp/models/llama2_7b_chat_uncensored.Q4_0.gguf", - "ctx_len": 2048, - "ngl": 100, - "embedding":true - } + ] } diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index 5a12621f7..679859f5c 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,15 @@ std::string create_return_json(const std::string &id, const std::string &model, void llamaCPP::chatCompletion( const HttpRequestPtr &req, std::function &&callback) { + if (!model_loaded) { + Json::Value jsonResp; + jsonResp["message"] = "Model is not loaded yet"; + auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp); + resp->setStatusCode(drogon::k500InternalServerError); + callback(resp); + return; + } + const auto &jsonBody = req->getJsonObject(); std::string formatted_output = "Below is a conversation between an AI system named ASSISTANT and USER\n"; @@ -203,6 +213,15 @@ void llamaCPP::chatCompletion( void llamaCPP::embedding( const HttpRequestPtr &req, std::function &&callback) { + if (!model_loaded) { + Json::Value jsonResp; + jsonResp["message"] = "Model is not loaded yet"; + auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp); + resp->setStatusCode(drogon::k500InternalServerError); + callback(resp); + return; + } + auto lock = llama.lock(); const auto &jsonBody = req->getJsonObject(); @@ -225,3 +244,51 @@ void llamaCPP::embedding( resp->setContentTypeString("application/json"); callback(resp); } + +void llamaCPP::loadModel( + const HttpRequestPtr &req, + std::function &&callback) { + + const auto &jsonBody = req->getJsonObject(); + + gpt_params params; + if (jsonBody) { + params.model = (*jsonBody)["llama_model_path"].asString(); + params.n_gpu_layers = (*jsonBody)["ngl"].asInt(); + params.n_ctx = (*jsonBody)["ctx_len"].asInt(); + params.embedding = (*jsonBody)["embedding"].asBool(); + } +#ifdef GGML_USE_CUBLAS + LOG_INFO << "Setting up GGML CUBLAS PARAMS"; + params.mul_mat_q = false; +#endif // GGML_USE_CUBLAS + if (params.model_alias == "unknown") { + params.model_alias = params.model; + } + + llama_backend_init(params.numa); + + LOG_INFO_LLAMA("build info", + {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); + LOG_INFO_LLAMA("system info", + { + {"n_threads", params.n_threads}, + {"total_threads", std::thread::hardware_concurrency()}, + {"system_info", llama_print_system_info()}, + }); + + // load the model + if (!llama.loadModel(params)) { + LOG_ERROR << "Error loading the model will exit the program"; + Json::Value jsonResp; + jsonResp["message"] = "Model loaded failed"; + auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp); + resp->setStatusCode(drogon::k500InternalServerError); + callback(resp); + } + Json::Value jsonResp; + jsonResp["message"] = "Model loaded successfully"; + model_loaded = true; + auto resp = 
drogon::HttpResponse::newHttpJsonResponse(jsonResp); + callback(resp); +} diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index e4c6c881d..ce6a23e02 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1,3 +1,4 @@ +#include #if defined(_WIN32) #define NOMINMAX #endif @@ -1311,51 +1312,55 @@ namespace inferences { class llamaCPP : public drogon::HttpController { public: llamaCPP() { - gpt_params params; - auto conf = drogon::app().getCustomConfig(); - params.model = conf["llama_model_path"].asString(); - params.n_gpu_layers = conf["ngl"].asInt(); - params.n_ctx = conf["ctx_len"].asInt(); - params.embedding = conf["embedding"].asBool(); -#ifdef GGML_USE_CUBLAS - LOG_INFO << "Setting up GGML CUBLAS PARAMS"; - params.mul_mat_q = false; -#endif // GGML_USE_CUBLAS - if (params.model_alias == "unknown") { - params.model_alias = params.model; - } - - llama_backend_init(params.numa); - - LOG_INFO_LLAMA("build info", - {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); - LOG_INFO_LLAMA("system info", - { - {"n_threads", params.n_threads}, - {"total_threads", std::thread::hardware_concurrency()}, - {"system_info", llama_print_system_info()}, - }); - - // load the model - if (!llama.loadModel(params)) { - LOG_ERROR << "Error loading the model will exit the program"; - std::terminate(); - } - nitro_utils::nitro_logo(); +// gpt_params params; +// auto conf = drogon::app().getCustomConfig(); +// params.model = conf["llama_model_path"].asString(); +// params.n_gpu_layers = conf["ngl"].asInt(); +// params.n_ctx = conf["ctx_len"].asInt(); +// params.embedding = conf["embedding"].asBool(); +//#ifdef GGML_USE_CUBLAS +// LOG_INFO << "Setting up GGML CUBLAS PARAMS"; +// params.mul_mat_q = false; +//#endif // GGML_USE_CUBLAS +// if (params.model_alias == "unknown") { +// params.model_alias = params.model; +// } +// +// llama_backend_init(params.numa); +// +// LOG_INFO_LLAMA("build info", +// {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); +// LOG_INFO_LLAMA("system info", +// { +// {"n_threads", params.n_threads}, +// {"total_threads", std::thread::hardware_concurrency()}, +// {"system_info", llama_print_system_info()}, +// }); +// +// // load the model +// if (!llama.loadModel(params)) { +// LOG_ERROR << "Error loading the model will exit the program"; +// std::terminate(); +// } } METHOD_LIST_BEGIN // list path definitions here; - METHOD_ADD(llamaCPP::chatCompletion, "chat_completion"); - METHOD_ADD(llamaCPP::embedding,"embedding"); + METHOD_ADD(llamaCPP::chatCompletion, "chat_completion",Post); + METHOD_ADD(llamaCPP::embedding,"embedding",Post); + METHOD_ADD(llamaCPP::loadModel,"loadmodel",Post); // PATH_ADD("/llama/chat_completion", Post); METHOD_LIST_END void chatCompletion(const HttpRequestPtr &req, std::function &&callback); void embedding(const HttpRequestPtr &req, std::function &&callback); + void loadModel(const HttpRequestPtr &req, + std::function &&callback); + private: llama_server_context llama; + bool model_loaded = false; size_t sent_count = 0; size_t sent_token_probs_index = 0; }; diff --git a/main.cc b/main.cc index 5ac05720e..fa9fb5ef8 100644 --- a/main.cc +++ b/main.cc @@ -1,4 +1,4 @@ - +#include "controllers/nitro_utils.h" #include // for PATH_MAX #include @@ -43,16 +43,15 @@ int main() { char path[MAX_PATH]; char dir[MAX_PATH]; // char dir[MAX_PATH]; - if(GetModuleFileNameA(NULL, path, sizeof(path))) { - char* lastBackslash = strrchr(path, '\\'); - if (lastBackslash == nullptr) { - return 1; - } - lastBackslash[0] = '\0'; - strcpy(dir, path); - 
configPath = std::string(dir) + "/config/config.json"; - } - else { + if (GetModuleFileNameA(NULL, path, sizeof(path))) { + char *lastBackslash = strrchr(path, '\\'); + if (lastBackslash == nullptr) { + return 1; + } + lastBackslash[0] = '\0'; + strcpy(dir, path); + configPath = std::string(dir) + "/config/config.json"; + } else { LOG_ERROR << "Failed to get binary location!"; return 1; } @@ -66,6 +65,8 @@ int main() { auto app_conf = drogon::app().getCustomConfig(); LOG_INFO << app_conf["llama_model_file"].asString(); + nitro_utils::nitro_logo(); + LOG_INFO << "Server started, please load your model"; // drogon::app().addListener("0.0.0.0", 8080); drogon::app().run(); From 571e03b45e44e5d02beb6d2de769a8f851cd2cc8 Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:14:28 +0700 Subject: [PATCH 2/8] feat: add dynamic loading model through api --- main.cc | 60 ++++++++++++--------------------------------------------- 1 file changed, 12 insertions(+), 48 deletions(-) diff --git a/main.cc b/main.cc index fa9fb5ef8..4b703d7ea 100644 --- a/main.cc +++ b/main.cc @@ -14,60 +14,24 @@ #error "Unsupported platform!" #endif -int main() { - std::string configPath; +int main(int argc, char *argv[]) { -#if defined(__APPLE__) && defined(__MACH__) - char path[PATH_MAX]; - uint32_t size = sizeof(path); - if (_NSGetExecutablePath(path, &size) == 0) { - path[size] = '\0'; // Null-terminate the string - char *dir = dirname(path); - configPath = std::string(dir) + "/config/config.json"; - } else { - LOG_ERROR << "Failed to get binary location!"; - return 1; - } -#elif defined(__linux__) - char path[PATH_MAX]; - ssize_t len = readlink("/proc/self/exe", path, sizeof(path) - 1); - if (len != -1) { - path[len] = '\0'; - char *dir = dirname(path); - configPath = std::string(dir) + "/config/config.json"; - } else { - LOG_ERROR << "Failed to get binary location!"; - return 1; - } -#elif defined(_WIN32) - char path[MAX_PATH]; - char dir[MAX_PATH]; - // char dir[MAX_PATH]; - if (GetModuleFileNameA(NULL, path, sizeof(path))) { - char *lastBackslash = strrchr(path, '\\'); - if (lastBackslash == nullptr) { - return 1; - } - lastBackslash[0] = '\0'; - strcpy(dir, path); - configPath = std::string(dir) + "/config/config.json"; - } else { - LOG_ERROR << "Failed to get binary location!"; - return 1; + std::string host = "127.0.0.1"; + int port = 3928; + + // Check for host argument + if (argc > 1) { + host = argv[1]; } -#else - LOG_ERROR << "Unsupported platform!"; - return 1; -#endif - // Set HTTP listener address and port - drogon::app().loadConfigFile(configPath); - auto app_conf = drogon::app().getCustomConfig(); + // Check for port argument + if (argc > 2) { + port = std::atoi(argv[2]); // Convert string argument to int + } - LOG_INFO << app_conf["llama_model_file"].asString(); nitro_utils::nitro_logo(); LOG_INFO << "Server started, please load your model"; - // drogon::app().addListener("0.0.0.0", 8080); + drogon::app().addListener(host, port); drogon::app().run(); return 0; From 108bbd5a53bf2800667f482c2bb9677ea25f515c Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:19:32 +0700 Subject: [PATCH 3/8] feat: no config file --- main.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/main.cc b/main.cc index 4b703d7ea..ff3ca04f8 100644 --- a/main.cc +++ b/main.cc @@ -30,7 +30,8 @@ int main(int argc, char *argv[]) { } nitro_utils::nitro_logo(); - LOG_INFO << "Server started, please load your model"; + LOG_INFO << "Server started, listening at: " << host << ":" << port + << 
"please load your model"; drogon::app().addListener(host, port); drogon::app().run(); From 177ee9b6811f4481fac7a845b7c46fe1c526409d Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:21:33 +0700 Subject: [PATCH 4/8] chore: format code --- controllers/llamaCPP.h | 69 +++++++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index ce6a23e02..4a7a4f3a3 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1312,42 +1312,44 @@ namespace inferences { class llamaCPP : public drogon::HttpController { public: llamaCPP() { -// gpt_params params; -// auto conf = drogon::app().getCustomConfig(); -// params.model = conf["llama_model_path"].asString(); -// params.n_gpu_layers = conf["ngl"].asInt(); -// params.n_ctx = conf["ctx_len"].asInt(); -// params.embedding = conf["embedding"].asBool(); -//#ifdef GGML_USE_CUBLAS -// LOG_INFO << "Setting up GGML CUBLAS PARAMS"; -// params.mul_mat_q = false; -//#endif // GGML_USE_CUBLAS -// if (params.model_alias == "unknown") { -// params.model_alias = params.model; -// } -// -// llama_backend_init(params.numa); -// -// LOG_INFO_LLAMA("build info", -// {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); -// LOG_INFO_LLAMA("system info", -// { -// {"n_threads", params.n_threads}, -// {"total_threads", std::thread::hardware_concurrency()}, -// {"system_info", llama_print_system_info()}, -// }); -// -// // load the model -// if (!llama.loadModel(params)) { -// LOG_ERROR << "Error loading the model will exit the program"; -// std::terminate(); -// } + // gpt_params params; + // auto conf = drogon::app().getCustomConfig(); + // params.model = conf["llama_model_path"].asString(); + // params.n_gpu_layers = conf["ngl"].asInt(); + // params.n_ctx = conf["ctx_len"].asInt(); + // params.embedding = conf["embedding"].asBool(); + // #ifdef GGML_USE_CUBLAS + // LOG_INFO << "Setting up GGML CUBLAS PARAMS"; + // params.mul_mat_q = false; + // #endif // GGML_USE_CUBLAS + // if (params.model_alias == "unknown") { + // params.model_alias = params.model; + // } + // + // llama_backend_init(params.numa); + // + // LOG_INFO_LLAMA("build info", + // {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); + // LOG_INFO_LLAMA("system info", + // { + // {"n_threads", params.n_threads}, + // {"total_threads", + // std::thread::hardware_concurrency()}, + // {"system_info", llama_print_system_info()}, + // }); + // + // // load the model + // if (!llama.loadModel(params)) { + // LOG_ERROR << "Error loading the model will exit the program"; + // std::terminate(); + // } + // deprecate this if find no usecase } METHOD_LIST_BEGIN // list path definitions here; - METHOD_ADD(llamaCPP::chatCompletion, "chat_completion",Post); - METHOD_ADD(llamaCPP::embedding,"embedding",Post); - METHOD_ADD(llamaCPP::loadModel,"loadmodel",Post); + METHOD_ADD(llamaCPP::chatCompletion, "chat_completion", Post); + METHOD_ADD(llamaCPP::embedding, "embedding", Post); + METHOD_ADD(llamaCPP::loadModel, "loadmodel", Post); // PATH_ADD("/llama/chat_completion", Post); METHOD_LIST_END void chatCompletion(const HttpRequestPtr &req, @@ -1357,7 +1359,6 @@ class llamaCPP : public drogon::HttpController { void loadModel(const HttpRequestPtr &req, std::function &&callback); - private: llama_server_context llama; bool model_loaded = false; From 636700774a0a030cb5f6965047d7578743932aa0 Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:29:04 +0700 Subject: [PATCH 5/8] feat: no config file --- 
main.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.cc b/main.cc index ff3ca04f8..4c394c635 100644 --- a/main.cc +++ b/main.cc @@ -30,8 +30,8 @@ int main(int argc, char *argv[]) { } nitro_utils::nitro_logo(); - LOG_INFO << "Server started, listening at: " << host << ":" << port - << "please load your model"; + LOG_INFO << "Server started, listening at: " << host << ":" << port; + LOG_INFO << "please load your model"; drogon::app().addListener(host, port); drogon::app().run(); From 96d38b97b2f05c1c276e02ab923524cea6f57736 Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:29:24 +0700 Subject: [PATCH 6/8] feat: no config file --- main.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.cc b/main.cc index 4c394c635..c09cba781 100644 --- a/main.cc +++ b/main.cc @@ -31,7 +31,7 @@ int main(int argc, char *argv[]) { nitro_utils::nitro_logo(); LOG_INFO << "Server started, listening at: " << host << ":" << port; - LOG_INFO << "please load your model"; + LOG_INFO << "Please load your model"; drogon::app().addListener(host, port); drogon::app().run(); From 9bb11daec9f720a166b5c57e5d786cee03a1c4a7 Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:48:25 +0700 Subject: [PATCH 7/8] chore: format code --- controllers/llamaCPP.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 4a7a4f3a3..9cf14b848 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1188,7 +1188,7 @@ static void parse_options_completion(const json &body, } llama.params.antiprompt.clear(); - const auto &stop = body.find("stop"); + ế mà mua chưa, e tính nhờ mua 2 hộp thể const auto &stop = body.find("stop"); if (stop != body.end() && stop->is_array()) { for (const auto &word : *stop) { if (!word.empty()) { From a7a38180db554c1395482635fc5ca1035a3944b3 Mon Sep 17 00:00:00 2001 From: tikikun Date: Wed, 11 Oct 2023 17:49:07 +0700 Subject: [PATCH 8/8] chore: format code --- controllers/llamaCPP.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 9cf14b848..4a7a4f3a3 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1188,7 +1188,7 @@ static void parse_options_completion(const json &body, } llama.params.antiprompt.clear(); - ế mà mua chưa, e tính nhờ mua 2 hộp thể const auto &stop = body.find("stop"); + const auto &stop = body.find("stop"); if (stop != body.end() && stop->is_array()) { for (const auto &word : *stop) { if (!word.empty()) {
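
Usage note: taken together, these patches drop the config-file startup path. The server now boots without a model, reading an optional listen host from argv[1] and port from argv[2] (defaulting to 127.0.0.1:3928), and a model is loaded afterwards by POSTing a JSON body with "llama_model_path", "ngl", "ctx_len" and "embedding" to the new loadModel handler registered as "loadmodel". The C++ sketch below shows one way a client could drive that endpoint with drogon's own HttpClient; it is an illustration under stated assumptions, not part of the patches: the model path is a placeholder, and the request path /inferences/llamacpp/loadmodel is inferred from the controller's namespace and class registration rather than spelled out here.

#include <drogon/drogon.h>
#include <json/json.h>

int main() {
  // JSON body with the fields llamaCPP::loadModel reads from the request.
  Json::Value body;
  body["llama_model_path"] = "/path/to/model.gguf"; // placeholder, not a real path
  body["ngl"] = 100;       // number of GPU layers to offload
  body["ctx_len"] = 2048;  // context length
  body["embedding"] = true;

  auto client = drogon::HttpClient::newHttpClient("http://127.0.0.1:3928");
  auto req = drogon::HttpRequest::newHttpJsonRequest(body);
  req->setMethod(drogon::Post);
  req->setPath("/inferences/llamacpp/loadmodel"); // assumed controller path

  client->sendRequest(req, [](drogon::ReqResult result,
                              const drogon::HttpResponsePtr &resp) {
    if (result == drogon::ReqResult::Ok && resp) {
      LOG_INFO << "loadmodel response: " << std::string(resp->getBody());
    } else {
      LOG_ERROR << "loadmodel request failed";
    }
    drogon::app().quit(); // stop the event loop once the reply arrives
  });

  drogon::app().run(); // sendRequest completes on the running event loop
  return 0;
}

Because loading a large GGUF file can take a while, a client would typically wait for this call to return before issuing chat_completion or embedding requests; as of patch 1, both of those handlers respond with HTTP 500 and "Model is not loaded yet" until loadModel has succeeded.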