diff --git a/config.json b/config.json
index 2a7f093cc..add7da3a6 100644
--- a/config.json
+++ b/config.json
@@ -4,11 +4,5 @@
       "address": "127.0.0.1",
       "port": 3928
     }
-  ],
-  "custom_config": {
-    "llama_model_path": "/Users/alandao/Documents/codes/nitro.cpp_temp/models/llama2_7b_chat_uncensored.Q4_0.gguf",
-    "ctx_len": 2048,
-    "ngl": 100,
-    "embedding":true
-  }
+  ]
 }
diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 5a12621f7..679859f5c 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -41,6 +42,15 @@ std::string create_return_json(const std::string &id, const std::string &model,
 void llamaCPP::chatCompletion(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
+  if (!model_loaded) {
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model is not loaded yet";
+    auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+    return;
+  }
+
   const auto &jsonBody = req->getJsonObject();
   std::string formatted_output =
       "Below is a conversation between an AI system named ASSISTANT and USER\n";
@@ -203,6 +213,15 @@ void llamaCPP::chatCompletion(
 void llamaCPP::embedding(
     const HttpRequestPtr &req,
     std::function<void(const HttpResponsePtr &)> &&callback) {
+  if (!model_loaded) {
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model is not loaded yet";
+    auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+    return;
+  }
+
   auto lock = llama.lock();
 
   const auto &jsonBody = req->getJsonObject();
@@ -225,3 +244,52 @@ void llamaCPP::embedding(
   resp->setContentTypeString("application/json");
   callback(resp);
 }
+
+void llamaCPP::loadModel(
+    const HttpRequestPtr &req,
+    std::function<void(const HttpResponsePtr &)> &&callback) {
+
+  const auto &jsonBody = req->getJsonObject();
+
+  gpt_params params;
+  if (jsonBody) {
+    params.model = (*jsonBody)["llama_model_path"].asString();
+    params.n_gpu_layers = (*jsonBody)["ngl"].asInt();
+    params.n_ctx = (*jsonBody)["ctx_len"].asInt();
+    params.embedding = (*jsonBody)["embedding"].asBool();
+  }
+#ifdef GGML_USE_CUBLAS
+  LOG_INFO << "Setting up GGML CUBLAS PARAMS";
+  params.mul_mat_q = false;
+#endif // GGML_USE_CUBLAS
+  if (params.model_alias == "unknown") {
+    params.model_alias = params.model;
+  }
+
+  llama_backend_init(params.numa);
+
+  LOG_INFO_LLAMA("build info",
+                 {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
+  LOG_INFO_LLAMA("system info",
+                 {
+                     {"n_threads", params.n_threads},
+                     {"total_threads", std::thread::hardware_concurrency()},
+                     {"system_info", llama_print_system_info()},
+                 });
+
+  // load the model
+  if (!llama.loadModel(params)) {
+    LOG_ERROR << "Error loading the model";
+    Json::Value jsonResp;
+    jsonResp["message"] = "Failed to load model";
+    auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+    return; // do not fall through to the success response below
+  }
+  Json::Value jsonResp;
+  jsonResp["message"] = "Model loaded successfully";
+  model_loaded = true;
+  auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
+  callback(resp);
+}
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index e4c6c881d..4a7a4f3a3 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -1,3 +1,4 @@
+#include
 #if defined(_WIN32)
 #define NOMINMAX
 #endif
@@ -1311,51 +1312,56 @@ namespace inferences {
 class llamaCPP : public drogon::HttpController<llamaCPP> {
 public:
   llamaCPP() {
-    gpt_params params;
-    auto conf = drogon::app().getCustomConfig();
-    params.model = conf["llama_model_path"].asString();
-    params.n_gpu_layers = conf["ngl"].asInt();
-    params.n_ctx = conf["ctx_len"].asInt();
-    params.embedding = conf["embedding"].asBool();
-#ifdef GGML_USE_CUBLAS
-    LOG_INFO << "Setting up GGML CUBLAS PARAMS";
-    params.mul_mat_q = false;
-#endif // GGML_USE_CUBLAS
-    if (params.model_alias == "unknown") {
-      params.model_alias = params.model;
-    }
-
-    llama_backend_init(params.numa);
-
-    LOG_INFO_LLAMA("build info",
-                   {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
-    LOG_INFO_LLAMA("system info",
-                   {
-                       {"n_threads", params.n_threads},
-                       {"total_threads", std::thread::hardware_concurrency()},
-                       {"system_info", llama_print_system_info()},
-                   });
-
-    // load the model
-    if (!llama.loadModel(params)) {
-      LOG_ERROR << "Error loading the model will exit the program";
-      std::terminate();
-    }
-    nitro_utils::nitro_logo();
+    // gpt_params params;
+    // auto conf = drogon::app().getCustomConfig();
+    // params.model = conf["llama_model_path"].asString();
+    // params.n_gpu_layers = conf["ngl"].asInt();
+    // params.n_ctx = conf["ctx_len"].asInt();
+    // params.embedding = conf["embedding"].asBool();
+    // #ifdef GGML_USE_CUBLAS
+    //   LOG_INFO << "Setting up GGML CUBLAS PARAMS";
+    //   params.mul_mat_q = false;
+    // #endif // GGML_USE_CUBLAS
+    // if (params.model_alias == "unknown") {
+    //   params.model_alias = params.model;
+    // }
+    //
+    // llama_backend_init(params.numa);
+    //
+    // LOG_INFO_LLAMA("build info",
+    //                {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
+    // LOG_INFO_LLAMA("system info",
+    //                {
+    //                    {"n_threads", params.n_threads},
+    //                    {"total_threads",
+    //                     std::thread::hardware_concurrency()},
+    //                    {"system_info", llama_print_system_info()},
+    //                });
+    //
+    // // load the model
+    // if (!llama.loadModel(params)) {
+    //   LOG_ERROR << "Error loading the model will exit the program";
+    //   std::terminate();
+    // }
+    // deprecate this constructor if no use case is found
   }
   METHOD_LIST_BEGIN
   // list path definitions here;
-  METHOD_ADD(llamaCPP::chatCompletion, "chat_completion");
-  METHOD_ADD(llamaCPP::embedding,"embedding");
+  METHOD_ADD(llamaCPP::chatCompletion, "chat_completion", Post);
+  METHOD_ADD(llamaCPP::embedding, "embedding", Post);
+  METHOD_ADD(llamaCPP::loadModel, "loadmodel", Post);
   // PATH_ADD("/llama/chat_completion", Post);
   METHOD_LIST_END
   void chatCompletion(const HttpRequestPtr &req,
                       std::function<void(const HttpResponsePtr &)> &&callback);
   void embedding(const HttpRequestPtr &req,
                  std::function<void(const HttpResponsePtr &)> &&callback);
+  void loadModel(const HttpRequestPtr &req,
+                 std::function<void(const HttpResponsePtr &)> &&callback);
 
 private:
   llama_server_context llama;
+  bool model_loaded = false;
   size_t sent_count = 0;
   size_t sent_token_probs_index = 0;
 };
diff --git a/main.cc b/main.cc
index 5ac05720e..c09cba781 100644
--- a/main.cc
+++ b/main.cc
@@ -1,4 +1,4 @@
-
+#include "controllers/nitro_utils.h"
 #include // for PATH_MAX
 #include
 
@@ -14,59 +14,25 @@
 #error "Unsupported platform!"
 #endif
 
-int main() {
-  std::string configPath;
+int main(int argc, char *argv[]) {
 
-#if defined(__APPLE__) && defined(__MACH__)
-  char path[PATH_MAX];
-  uint32_t size = sizeof(path);
-  if (_NSGetExecutablePath(path, &size) == 0) {
-    path[size] = '\0'; // Null-terminate the string
-    char *dir = dirname(path);
-    configPath = std::string(dir) + "/config/config.json";
-  } else {
-    LOG_ERROR << "Failed to get binary location!";
-    return 1;
-  }
-#elif defined(__linux__)
-  char path[PATH_MAX];
-  ssize_t len = readlink("/proc/self/exe", path, sizeof(path) - 1);
-  if (len != -1) {
-    path[len] = '\0';
-    char *dir = dirname(path);
-    configPath = std::string(dir) + "/config/config.json";
-  } else {
-    LOG_ERROR << "Failed to get binary location!";
-    return 1;
-  }
-#elif defined(_WIN32)
-  char path[MAX_PATH];
-  char dir[MAX_PATH];
-  // char dir[MAX_PATH];
-  if(GetModuleFileNameA(NULL, path, sizeof(path))) {
-    char* lastBackslash = strrchr(path, '\\');
-    if (lastBackslash == nullptr) {
-      return 1;
-    }
-    lastBackslash[0] = '\0';
-    strcpy(dir, path);
-    configPath = std::string(dir) + "/config/config.json";
-  }
-  else {
-    LOG_ERROR << "Failed to get binary location!";
-    return 1;
+  std::string host = "127.0.0.1";
+  int port = 3928;
+
+  // Check for host argument
+  if (argc > 1) {
+    host = argv[1];
   }
-#else
-  LOG_ERROR << "Unsupported platform!";
-  return 1;
-#endif
 
-  // Set HTTP listener address and port
-  drogon::app().loadConfigFile(configPath);
-  auto app_conf = drogon::app().getCustomConfig();
+  // Check for port argument
+  if (argc > 2) {
+    port = std::atoi(argv[2]); // Convert string argument to int
+  }
 
-  LOG_INFO << app_conf["llama_model_file"].asString();
-  // drogon::app().addListener("0.0.0.0", 8080);
+  nitro_utils::nitro_logo();
+  LOG_INFO << "Server started, listening at: " << host << ":" << port;
+  LOG_INFO << "Please load your model";
+  drogon::app().addListener(host, port);
   drogon::app().run();
   return 0;
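Note: with this change the model is no longer loaded from config.json at startup; it must be loaded at runtime through the new loadmodel endpoint registered via METHOD_ADD. The full route is derived from the controller's namespace and class name (inferences::llamaCPP), so under drogon's default mapping it looks like /inferences/llamaCPP/loadmodel -- verify against your drogon path settings. A minimal request-body sketch, reusing the keys read in llamaCPP::loadModel (the model path below is a placeholder, the other values mirror the removed custom_config defaults):

{
  "llama_model_path": "/path/to/your-model.gguf",
  "ctx_len": 2048,
  "ngl": 100,
  "embedding": true
}

POST this JSON to the running server (default 127.0.0.1:3928, or the host/port passed as argv[1]/argv[2] to main) before calling chat_completion or embedding; until the model is loaded, both handlers return the 500 "Model is not loaded yet" response added above.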