From dcd9acd0f9e2b92d3befe5f9871a079d657e5926 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 5 Feb 2024 08:33:32 +0700
Subject: [PATCH 1/6] move private methods to private

---
 controllers/llamaCPP.h | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index a9581da21..2fce524ea 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2548,18 +2548,9 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
                    std::function<void(const HttpResponsePtr &)> &&callback);
   void unloadModel(const HttpRequestPtr &req,
                    std::function<void(const HttpResponsePtr &)> &&callback);
-
   void modelStatus(const HttpRequestPtr &req,
                    std::function<void(const HttpResponsePtr &)> &&callback);
-  bool loadModelImpl(const Json::Value &jsonBody);
-
-  void warmupModel();
-
-  void backgroundTask();
-
-  void stopBackgroundTask();
-
  private:
   llama_server_context llama;
   // std::atomic<bool> model_loaded = false;
@@ -2577,5 +2568,10 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   std::atomic<bool> single_queue_is_busy; // This value only used under the
                                           // condition n_parallel is 1
   std::string grammar_file_content;
+
+  bool loadModelImpl(const Json::Value &jsonBody);
+  void warmupModel();
+  void backgroundTask();
+  void stopBackgroundTask();
 };
 }; // namespace inferences

From e5a973f5e0447b80e3cd92f9001717b5303faffe Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 5 Feb 2024 08:58:55 +0700
Subject: [PATCH 2/6] move private methods to private

---
 controllers/llamaCPP.cc | 8 ++++----
 controllers/llamaCPP.h  | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 416d22cd7..71f9fe13f 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -22,8 +22,8 @@ std::shared_ptr<inferenceState> create_inference_state(llamaCPP *instance) {
 // --------------------------------------------
 
 // Function to check if the model is loaded
-void check_model_loaded(
-    llama_server_context &llama, const HttpRequestPtr &req,
+void llamaCPP::checkModelLoaded(
+    const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &callback) {
   if (!llama.model_loaded_external) {
     Json::Value jsonResp;
@@ -152,7 +152,7 @@ void llamaCPP::chatCompletion(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
   // Check if model is loaded
-  check_model_loaded(llama, req, callback);
+  checkModelLoaded(req, callback);
 
   const auto &jsonBody = req->getJsonObject();
   std::string formatted_output = pre_prompt;
@@ -405,7 +405,7 @@ void llamaCPP::chatCompletion(
 void llamaCPP::embedding(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
-  check_model_loaded(llama, req, callback);
+  checkModelLoaded(req, callback);
 
   auto state = create_inference_state(this);
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 2fce524ea..9d39a85f9 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2573,5 +2573,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   void warmupModel();
   void backgroundTask();
   void stopBackgroundTask();
+  void checkModelLoaded(const HttpRequestPtr &req,
+                        std::function<void(const HttpResponsePtr &)> &callback);
 };
 }; // namespace inferences

From a321e8238f1655207a83c20bf46eaf3cda2eaad2 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 5 Feb 2024 13:46:10 +0700
Subject: [PATCH 3/6] refactor handler and impl for separation of concerns

---
 controllers/llamaCPP.cc | 89 +++++++++++++++++++++++------------------
 controllers/llamaCPP.h  | 10 +++--
 2 files changed, 58 insertions(+), 41 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 71f9fe13f..9e04dd13a 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -2,6 +2,7 @@
#include "llama.h" #include "log.h" #include "utils/nitro_utils.h" +#include using namespace inferences; using json = nlohmann::json; @@ -23,7 +24,6 @@ std::shared_ptr create_inference_state(llamaCPP *instance) { // Function to check if the model is loaded void llamaCPP::checkModelLoaded( - const HttpRequestPtr &req, std::function &callback) { if (!llama.model_loaded_external) { Json::Value jsonResp; @@ -151,10 +151,17 @@ void llamaCPP::chatCompletion( const HttpRequestPtr &req, std::function &&callback) { + const auto &jsonBody = req->getJsonObject(); // Check if model is loaded - checkModelLoaded(req, callback); + checkModelLoaded(callback); + + chatCompletionImpl(jsonBody, callback); +} + +void llamaCPP::chatCompletionImpl( + std::shared_ptr jsonBody, + std::function &callback) { - const auto &jsonBody = req->getJsonObject(); std::string formatted_output = pre_prompt; json data; @@ -402,17 +409,23 @@ void llamaCPP::chatCompletion( } } } + void llamaCPP::embedding( const HttpRequestPtr &req, std::function &&callback) { - checkModelLoaded(req, callback); + checkModelLoaded(callback); + const auto &jsonBody = req->getJsonObject(); - auto state = create_inference_state(this); + embeddingImpl(jsonBody, callback); + return; +} - const auto &jsonBody = req->getJsonObject(); +void llamaCPP::embeddingImpl( + std::shared_ptr jsonBody, + std::function &callback) { Json::Value responseData(Json::arrayValue); - + auto state = create_inference_state(this); if (jsonBody->isMember("input")) { // If single queue is busy, we will wait if not we will just go ahead and // process and make it busy, and yet i'm aware not DRY, i have the same @@ -464,7 +477,6 @@ void llamaCPP::embedding( resp->setBody(Json::writeString(Json::StreamWriterBuilder(), root)); resp->setContentTypeString("application/json"); callback(resp); - return; } void llamaCPP::unloadModel( @@ -502,30 +514,30 @@ void llamaCPP::modelStatus( return; } -bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) { +bool llamaCPP::loadModelImpl(std::shared_ptr jsonBody) { gpt_params params; - // By default will setting based on number of handlers if (jsonBody) { - if (!jsonBody["mmproj"].isNull()) { + if (!jsonBody->operator[]("mmproj").isNull()) { LOG_INFO << "MMPROJ FILE detected, multi-model enabled!"; - params.mmproj = jsonBody["mmproj"].asString(); + params.mmproj = jsonBody->operator[]("mmproj").asString(); } - if (!jsonBody["grp_attn_n"].isNull()) { + if (!jsonBody->operator[]("grp_attn_n").isNull()) { - params.grp_attn_n = jsonBody["grp_attn_n"].asInt(); + params.grp_attn_n = jsonBody->operator[]("grp_attn_n").asInt(); } - if (!jsonBody["grp_attn_w"].isNull()) { + if (!jsonBody->operator[]("grp_attn_w").isNull()) { - params.grp_attn_w = jsonBody["grp_attn_w"].asInt(); + params.grp_attn_w = jsonBody->operator[]("grp_attn_w").asInt(); } - if (!jsonBody["mlock"].isNull()) { - params.use_mlock = jsonBody["mlock"].asBool(); + if (!jsonBody->operator[]("mlock").isNull()) { + params.use_mlock = jsonBody->operator[]("mlock").asBool(); } - if (!jsonBody["grammar_file"].isNull()) { - std::string grammar_file = jsonBody["grammar_file"].asString(); + if (!jsonBody->operator[]("grammar_file").isNull()) { + std::string grammar_file = + jsonBody->operator[]("grammar_file").asString(); std::ifstream file(grammar_file); if (!file) { LOG_ERROR << "Grammar file not found"; @@ -536,30 +548,31 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) { } }; - params.model = jsonBody["llama_model_path"].asString(); - params.n_gpu_layers = jsonBody.get("ngl", 
-    params.n_ctx = jsonBody.get("ctx_len", 2048).asInt();
-    params.embedding = jsonBody.get("embedding", true).asBool();
+    params.model = jsonBody->operator[]("llama_model_path").asString();
+    params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
+    params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
+    params.embedding = jsonBody->get("embedding", true).asBool();
     // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
-    params.n_batch = jsonBody.get("n_batch", 512).asInt();
-    params.n_parallel = jsonBody.get("n_parallel", 1).asInt();
+    params.n_batch = jsonBody->get("n_batch", 512).asInt();
+    params.n_parallel = jsonBody->get("n_parallel", 1).asInt();
     params.n_threads =
-        jsonBody.get("cpu_threads", std::thread::hardware_concurrency())
+        jsonBody->get("cpu_threads", std::thread::hardware_concurrency())
             .asInt();
-    params.cont_batching = jsonBody.get("cont_batching", false).asBool();
+    params.cont_batching = jsonBody->get("cont_batching", false).asBool();
     this->clean_cache_threshold =
-        jsonBody.get("clean_cache_threshold", 5).asInt();
-    this->caching_enabled = jsonBody.get("caching_enabled", false).asBool();
-    this->user_prompt = jsonBody.get("user_prompt", "USER: ").asString();
-    this->ai_prompt = jsonBody.get("ai_prompt", "ASSISTANT: ").asString();
+        jsonBody->get("clean_cache_threshold", 5).asInt();
+    this->caching_enabled = jsonBody->get("caching_enabled", false).asBool();
+    this->user_prompt = jsonBody->get("user_prompt", "USER: ").asString();
+    this->ai_prompt = jsonBody->get("ai_prompt", "ASSISTANT: ").asString();
     this->system_prompt =
-        jsonBody.get("system_prompt", "ASSISTANT's RULE: ").asString();
-    this->pre_prompt = jsonBody.get("pre_prompt", "").asString();
-    this->repeat_last_n = jsonBody.get("repeat_last_n", 32).asInt();
+        jsonBody->get("system_prompt", "ASSISTANT's RULE: ").asString();
+    this->pre_prompt = jsonBody->get("pre_prompt", "").asString();
+    this->repeat_last_n = jsonBody->get("repeat_last_n", 32).asInt();
 
-    if (!jsonBody["llama_log_folder"].isNull()) {
+    if (!jsonBody->operator[]("llama_log_folder").isNull()) {
       log_enable();
-      std::string llama_log_folder = jsonBody["llama_log_folder"].asString();
+      std::string llama_log_folder =
+          jsonBody->operator[]("llama_log_folder").asString();
       log_set_target(llama_log_folder + "llama.log");
     } // Set folder for llama log
   }
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 9d39a85f9..a2c06f008 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2569,11 +2569,15 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
                                     // condition n_parallel is 1
   std::string grammar_file_content;
 
-  bool loadModelImpl(const Json::Value &jsonBody);
+  bool loadModelImpl(std::shared_ptr<Json::Value> jsonBody);
+  void
+  chatCompletionImpl(std::shared_ptr<Json::Value> jsonBody,
+                     std::function<void(const HttpResponsePtr &)> &callback);
+  void embeddingImpl(std::shared_ptr<Json::Value> jsonBody,
+                     std::function<void(const HttpResponsePtr &)> &callback);
+  void checkModelLoaded(std::function<void(const HttpResponsePtr &)> &callback);
   void warmupModel();
   void backgroundTask();
   void stopBackgroundTask();
-  void checkModelLoaded(const HttpRequestPtr &req,
-                        std::function<void(const HttpResponsePtr &)> &callback);
 };
 }; // namespace inferences

From a78a9d8f008be8f7e3bcebace61f01a48f4f0111 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 5 Feb 2024 14:02:22 +0700
Subject: [PATCH 4/6] add a few inline docs

---
 controllers/llamaCPP.cc | 80 +++++++++++++++++++++++------------------
 1 file changed, 46 insertions(+), 34 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 9e04dd13a..9966dc6fa 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -7,6 +7,13 @@
 using namespace inferences;
 using json = nlohmann::json;
 
+/**
+ * There is a need to save the state of the current ongoing inference of a
+ * handler; this struct holds that state.
+ *
+ * @param inst Pointer to the llamaCPP instance this inference task is
+ *             associated with.
+ */
 struct inferenceState {
   bool is_stopped = false;
   bool is_streaming = false;
@@ -16,13 +23,19 @@ struct inferenceState {
   inferenceState(llamaCPP *inst) : instance(inst) {}
 };
 
+/**
+ * This function creates the smart pointer to inferenceState, so that the
+ * inferenceState keeps persisting even though the lambda used for streaming
+ * might go out of scope and the handler has already moved on.
+ */
 std::shared_ptr<inferenceState> create_inference_state(llamaCPP *instance) {
   return std::make_shared<inferenceState>(instance);
 }
 
-// --------------------------------------------
-
-// Function to check if the model is loaded
+/**
+ * Check if the model is already loaded; if not, return a message to the user.
+ * @param callback the function used to return the message to the user
+ */
 void llamaCPP::checkModelLoaded(
     std::function<void(const HttpResponsePtr &)> &callback) {
   if (!llama.model_loaded_external) {
     Json::Value jsonResp;
@@ -513,6 +526,36 @@ void llamaCPP::modelStatus(
   callback(resp);
   return;
 }
+void llamaCPP::loadModel(
+    const HttpRequestPtr &req,
+    std::function<void(const HttpResponsePtr &)> &&callback) {
+
+  if (llama.model_loaded_external) {
+    LOG_INFO << "model loaded";
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model already loaded";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k409Conflict);
+    callback(resp);
+    return;
+  }
+
+  const auto &jsonBody = req->getJsonObject();
+  if (!loadModelImpl(jsonBody)) {
+    // Error occurred during model loading
+    Json::Value jsonResp;
+    jsonResp["message"] = "Failed to load model";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+  } else {
+    // Model loaded successfully
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model loaded successfully";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+  }
+}
 
 bool llamaCPP::loadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
 
@@ -610,37 +653,6 @@ bool llamaCPP::loadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   return true;
 }
 
-void llamaCPP::loadModel(
-    const HttpRequestPtr &req,
-    std::function<void(const HttpResponsePtr &)> &&callback) {
-
-  if (llama.model_loaded_external) {
-    LOG_INFO << "model loaded";
-    Json::Value jsonResp;
-    jsonResp["message"] = "Model already loaded";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    resp->setStatusCode(drogon::k409Conflict);
-    callback(resp);
-    return;
-  }
-
-  const auto &jsonBody = req->getJsonObject();
-  if (!loadModelImpl(jsonBody)) {
-    // Error occurred during model loading
-    Json::Value jsonResp;
-    jsonResp["message"] = "Failed to load model";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    resp->setStatusCode(drogon::k500InternalServerError);
-    callback(resp);
-  } else {
-    // Model loaded successfully
-    Json::Value jsonResp;
-    jsonResp["message"] = "Model loaded successfully";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    callback(resp);
-  }
-}
-
 void llamaCPP::backgroundTask() {
   while (llama.model_loaded_external) {
     // model_loaded =

From 69fa41d2e18829e54c591fe7d89aaf04c9c16b43 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 5 Feb 2024 14:10:44 +0700
Subject: [PATCH 5/6] handle prelight as a single endpoint

---
 controllers/llamaCPP.cc | 2 +-
 controllers/llamaCPP.h  | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 9966dc6fa..44d977f38 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -149,7 +149,7 @@ void llamaCPP::warmupModel() {
   return;
 }
 
-void llamaCPP::chatCompletionPrelight(
+void llamaCPP::handlePrelight(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
   auto resp = drogon::HttpResponse::newHttpResponse();
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index a2c06f008..adc179489 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2530,18 +2530,17 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
 
   // Openai compatible path
   ADD_METHOD_TO(llamaCPP::chatCompletion, "/v1/chat/completions", Post);
-  ADD_METHOD_TO(llamaCPP::chatCompletionPrelight, "/v1/chat/completions",
-                Options);
+  ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/chat/completions", Options);
 
   ADD_METHOD_TO(llamaCPP::embedding, "/v1/embeddings", Post);
+  ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/embeddings", Options);
 
   // PATH_ADD("/llama/chat_completion", Post);
   METHOD_LIST_END
   void chatCompletion(const HttpRequestPtr &req,
                       std::function<void(const HttpResponsePtr &)> &&callback);
-  void chatCompletionPrelight(
-      const HttpRequestPtr &req,
-      std::function<void(const HttpResponsePtr &)> &&callback);
+  void handlePrelight(const HttpRequestPtr &req,
+                      std::function<void(const HttpResponsePtr &)> &&callback);
   void embedding(const HttpRequestPtr &req,
                  std::function<void(const HttpResponsePtr &)> &&callback);
   void loadModel(const HttpRequestPtr &req,

From 96deb0eea2e4dbaeab6d61d79fda49d895b07cc0 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Mon, 5 Feb 2024 14:11:33 +0700
Subject: [PATCH 6/6] redundant include

---
 controllers/llamaCPP.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 44d977f38..ea207a220 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -2,7 +2,6 @@
 #include "llama.h"
 #include "log.h"
 #include "utils/nitro_utils.h"
-#include
 
 using namespace inferences;
 using json = nlohmann::json;
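
The doc comment added in PATCH 4/6 explains why create_inference_state() hands
out a std::shared_ptr: the per-request inferenceState has to stay alive while a
streaming lambda is still running, even after the handler that created it has
returned. Below is a minimal, self-contained sketch of that keep-alive pattern.
It uses a hypothetical InferenceStateSketch type and a plain callback queue
rather than the project's own structs and drogon's event loop, so it is
illustrative only.

    #include <functional>
    #include <iostream>
    #include <memory>
    #include <vector>

    // Hypothetical stand-in for the controller's per-request inferenceState.
    struct InferenceStateSketch {
      bool is_stopped = false;
      int task_id = -1;
    };

    int main() {
      // Stands in for the asynchronous queue that would later run the
      // streaming callback.
      std::vector<std::function<void()>> pending;

      {
        auto state = std::make_shared<InferenceStateSketch>();
        state->task_id = 42;
        // The lambda copies the shared_ptr, so the state outlives this scope
        // (the "handler") and is destroyed only after its last user is done.
        pending.push_back([state] {
          std::cout << "streaming chunk for task " << state->task_id << "\n";
        });
      }  // the handler has "moved on"; state is still alive inside the lambda

      for (auto &cb : pending) cb();  // prints: streaming chunk for task 42
      return 0;
    }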