8 changes: 1 addition & 7 deletions config.json
@@ -4,11 +4,5 @@
"address": "127.0.0.1",
"port": 3928
}
],
"custom_config": {
"llama_model_path": "/Users/alandao/Documents/codes/nitro.cpp_temp/models/llama2_7b_chat_uncensored.Q4_0.gguf",
"ctx_len": 2048,
"ngl": 100,
"embedding":true
}
]
}
67 changes: 67 additions & 0 deletions controllers/llamaCPP.cc
@@ -4,6 +4,7 @@
#include <chrono>
#include <cstring>
#include <drogon/HttpResponse.h>
#include <drogon/HttpTypes.h>
#include <regex>
#include <thread>

@@ -41,6 +42,15 @@ std::string create_return_json(const std::string &id, const std::string &model,
void llamaCPP::chatCompletion(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {
if (!model_loaded) {
Json::Value jsonResp;
jsonResp["message"] = "Model is not loaded yet";
auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
resp->setStatusCode(drogon::k500InternalServerError);
callback(resp);
return;
}

const auto &jsonBody = req->getJsonObject();
std::string formatted_output =
"Below is a conversation between an AI system named ASSISTANT and USER\n";
@@ -203,6 +213,15 @@ void llamaCPP::chatCompletion(
void llamaCPP::embedding(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {
if (!model_loaded) {
Json::Value jsonResp;
jsonResp["message"] = "Model is not loaded yet";
auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
resp->setStatusCode(drogon::k500InternalServerError);
callback(resp);
return;
}

auto lock = llama.lock();

const auto &jsonBody = req->getJsonObject();
@@ -225,3 +244,51 @@ void llamaCPP::embedding(
resp->setContentTypeString("application/json");
callback(resp);
}

void llamaCPP::loadModel(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {

const auto &jsonBody = req->getJsonObject();

gpt_params params;
if (jsonBody) {
params.model = (*jsonBody)["llama_model_path"].asString();
params.n_gpu_layers = (*jsonBody)["ngl"].asInt();
params.n_ctx = (*jsonBody)["ctx_len"].asInt();
params.embedding = (*jsonBody)["embedding"].asBool();
}
#ifdef GGML_USE_CUBLAS
LOG_INFO << "Setting up GGML CUBLAS PARAMS";
params.mul_mat_q = false;
#endif // GGML_USE_CUBLAS
if (params.model_alias == "unknown") {
params.model_alias = params.model;
}

llama_backend_init(params.numa);

LOG_INFO_LLAMA("build info",
{{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
LOG_INFO_LLAMA("system info",
{
{"n_threads", params.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});

// load the model
if (!llama.loadModel(params)) {
LOG_ERROR << "Error loading the model";
Json::Value jsonResp;
jsonResp["message"] = "Failed to load model";
auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
resp->setStatusCode(drogon::k500InternalServerError);
callback(resp);
return; // do not mark the model as loaded or send a second response
}
Json::Value jsonResp;
jsonResp["message"] = "Model loaded successfully";
model_loaded = true;
auto resp = drogon::HttpResponse::newHttpJsonResponse(jsonResp);
callback(resp);
}
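For reference, a minimal sketch of how the new loadmodel endpoint might be exercised with drogon's own HTTP client once the server is up. The /inferences/llamacpp route prefix (assumed to be derived from the inferences::llamaCPP controller) and the model path are illustrative assumptions, not values taken from this PR; the address and port are the defaults used elsewhere in the diff.

```cpp
// Sketch only: client for the new POST loadmodel endpoint.
// Assumptions: controller mounted at /inferences/llamacpp, server on 127.0.0.1:3928,
// and a placeholder model path.
#include <drogon/drogon.h>
#include <json/json.h>

int main() {
  auto client = drogon::HttpClient::newHttpClient("http://127.0.0.1:3928");

  Json::Value body;
  body["llama_model_path"] = "/path/to/model.gguf"; // placeholder path
  body["ctx_len"] = 2048;
  body["ngl"] = 100;
  body["embedding"] = true;

  auto req = drogon::HttpRequest::newHttpJsonRequest(body);
  req->setMethod(drogon::Post);
  req->setPath("/inferences/llamacpp/loadmodel"); // assumed route prefix

  client->sendRequest(req, [](drogon::ReqResult result,
                              const drogon::HttpResponsePtr &resp) {
    if (result == drogon::ReqResult::Ok && resp) {
      LOG_INFO << "loadmodel response: " << std::string(resp->getBody());
    } else {
      LOG_ERROR << "loadmodel request failed";
    }
    drogon::app().quit(); // stop the event loop after the single request
  });

  drogon::app().run(); // drive the client's event loop until quit()
  return 0;
}
```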
72 changes: 39 additions & 33 deletions controllers/llamaCPP.h
@@ -1,3 +1,4 @@
#include <drogon/HttpTypes.h>
#if defined(_WIN32)
#define NOMINMAX
#endif
@@ -1311,51 +1312,56 @@ namespace inferences {
class llamaCPP : public drogon::HttpController<llamaCPP> {
public:
llamaCPP() {
gpt_params params;
auto conf = drogon::app().getCustomConfig();
params.model = conf["llama_model_path"].asString();
params.n_gpu_layers = conf["ngl"].asInt();
params.n_ctx = conf["ctx_len"].asInt();
params.embedding = conf["embedding"].asBool();
#ifdef GGML_USE_CUBLAS
LOG_INFO << "Setting up GGML CUBLAS PARAMS";
params.mul_mat_q = false;
#endif // GGML_USE_CUBLAS
if (params.model_alias == "unknown") {
params.model_alias = params.model;
}

llama_backend_init(params.numa);

LOG_INFO_LLAMA("build info",
{{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
LOG_INFO_LLAMA("system info",
{
{"n_threads", params.n_threads},
{"total_threads", std::thread::hardware_concurrency()},
{"system_info", llama_print_system_info()},
});

// load the model
if (!llama.loadModel(params)) {
LOG_ERROR << "Error loading the model will exit the program";
std::terminate();
}
nitro_utils::nitro_logo();
// gpt_params params;
// auto conf = drogon::app().getCustomConfig();
// params.model = conf["llama_model_path"].asString();
// params.n_gpu_layers = conf["ngl"].asInt();
// params.n_ctx = conf["ctx_len"].asInt();
// params.embedding = conf["embedding"].asBool();
// #ifdef GGML_USE_CUBLAS
// LOG_INFO << "Setting up GGML CUBLAS PARAMS";
// params.mul_mat_q = false;
// #endif // GGML_USE_CUBLAS
// if (params.model_alias == "unknown") {
// params.model_alias = params.model;
// }
//
// llama_backend_init(params.numa);
//
// LOG_INFO_LLAMA("build info",
// {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}});
// LOG_INFO_LLAMA("system info",
// {
// {"n_threads", params.n_threads},
// {"total_threads",
// std::thread::hardware_concurrency()},
// {"system_info", llama_print_system_info()},
// });
//
// // load the model
// if (!llama.loadModel(params)) {
// LOG_ERROR << "Error loading the model will exit the program";
// std::terminate();
// }
// deprecate this if find no usecase
}
METHOD_LIST_BEGIN
// list path definitions here;
METHOD_ADD(llamaCPP::chatCompletion, "chat_completion");
METHOD_ADD(llamaCPP::embedding,"embedding");
METHOD_ADD(llamaCPP::chatCompletion, "chat_completion", Post);
METHOD_ADD(llamaCPP::embedding, "embedding", Post);
METHOD_ADD(llamaCPP::loadModel, "loadmodel", Post);
// PATH_ADD("/llama/chat_completion", Post);
METHOD_LIST_END
void chatCompletion(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void embedding(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void loadModel(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);

private:
llama_server_context llama;
bool model_loaded = false;
size_t sent_count = 0;
size_t sent_token_probs_index = 0;
};
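Because chatCompletion and embedding now bail out while model_loaded is false, the guard can be observed directly from a client: hitting chat_completion before any loadmodel call should return the 500 response with "Model is not loaded yet" added in this PR. A rough sketch, again assuming the /inferences/llamacpp prefix:

```cpp
// Sketch only: probing the model_loaded guard before a model is loaded.
// Assumption: controller mounted at /inferences/llamacpp on 127.0.0.1:3928.
#include <drogon/drogon.h>
#include <json/json.h>

int main() {
  auto client = drogon::HttpClient::newHttpClient("http://127.0.0.1:3928");

  // The guard runs before the request body is inspected, so an empty
  // JSON object is enough to trigger it when no model has been loaded.
  auto req = drogon::HttpRequest::newHttpJsonRequest(Json::Value(Json::objectValue));
  req->setMethod(drogon::Post);
  req->setPath("/inferences/llamacpp/chat_completion"); // assumed route prefix

  client->sendRequest(req, [](drogon::ReqResult result,
                              const drogon::HttpResponsePtr &resp) {
    if (result == drogon::ReqResult::Ok && resp &&
        resp->getStatusCode() == drogon::k500InternalServerError) {
      LOG_INFO << "Guard hit as expected: " << std::string(resp->getBody());
    }
    drogon::app().quit();
  });

  drogon::app().run();
  return 0;
}
```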
66 changes: 16 additions & 50 deletions main.cc
@@ -1,4 +1,4 @@

#include "controllers/nitro_utils.h"
#include <climits> // for PATH_MAX
#include <drogon/drogon.h>

@@ -14,59 +14,25 @@
#error "Unsupported platform!"
#endif

int main() {
std::string configPath;
int main(int argc, char *argv[]) {

#if defined(__APPLE__) && defined(__MACH__)
char path[PATH_MAX];
uint32_t size = sizeof(path);
if (_NSGetExecutablePath(path, &size) == 0) {
path[size] = '\0'; // Null-terminate the string
char *dir = dirname(path);
configPath = std::string(dir) + "/config/config.json";
} else {
LOG_ERROR << "Failed to get binary location!";
return 1;
}
#elif defined(__linux__)
char path[PATH_MAX];
ssize_t len = readlink("/proc/self/exe", path, sizeof(path) - 1);
if (len != -1) {
path[len] = '\0';
char *dir = dirname(path);
configPath = std::string(dir) + "/config/config.json";
} else {
LOG_ERROR << "Failed to get binary location!";
return 1;
}
#elif defined(_WIN32)
char path[MAX_PATH];
char dir[MAX_PATH];
// char dir[MAX_PATH];
if(GetModuleFileNameA(NULL, path, sizeof(path))) {
char* lastBackslash = strrchr(path, '\\');
if (lastBackslash == nullptr) {
return 1;
}
lastBackslash[0] = '\0';
strcpy(dir, path);
configPath = std::string(dir) + "/config/config.json";
}
else {
LOG_ERROR << "Failed to get binary location!";
return 1;
std::string host = "127.0.0.1";
int port = 3928;

// Check for host argument
if (argc > 1) {
host = argv[1];
}
#else
LOG_ERROR << "Unsupported platform!";
return 1;
#endif

// Set HTTP listener address and port
drogon::app().loadConfigFile(configPath);
auto app_conf = drogon::app().getCustomConfig();
// Check for port argument
if (argc > 2) {
port = std::atoi(argv[2]); // Convert string argument to int
}

LOG_INFO << app_conf["llama_model_file"].asString();
// drogon::app().addListener("0.0.0.0", 8080);
nitro_utils::nitro_logo();
LOG_INFO << "Server started, listening at: " << host << ":" << port;
LOG_INFO << "Please load your model";
drogon::app().addListener(host, port);
drogon::app().run();

return 0;