88 changes: 47 additions & 41 deletions controllers/llamaCPP.cc

@@ -359,47 +359,32 @@ void llamaCPP::modelStatus(
   return;
 }

-void llamaCPP::loadModel(
-    const HttpRequestPtr &req,
-    std::function<void(const HttpResponsePtr &)> &&callback) {
-
-  if (model_loaded) {
-    LOG_INFO << "model loaded";
-    Json::Value jsonResp;
-    jsonResp["message"] = "Model already loaded";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    resp->setStatusCode(drogon::k409Conflict);
-    callback(resp);
-    return;
-  }
-
-  const auto &jsonBody = req->getJsonObject();
+bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {

   gpt_params params;

   // By default will setting based on number of handlers
   int drogon_thread = drogon::app().getThreadNum();
   LOG_INFO << "Drogon thread is:" << drogon_thread;
   if (jsonBody) {
-    params.model = (*jsonBody)["llama_model_path"].asString();
-    params.n_gpu_layers = (*jsonBody).get("ngl", 100).asInt();
-    params.n_ctx = (*jsonBody).get("ctx_len", 2048).asInt();
-    params.embedding = (*jsonBody).get("embedding", true).asBool();
+    params.model = jsonBody["llama_model_path"].asString();
+    params.n_gpu_layers = jsonBody.get("ngl", 100).asInt();
+    params.n_ctx = jsonBody.get("ctx_len", 2048).asInt();
+    params.embedding = jsonBody.get("embedding", true).asBool();
     // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
-    params.n_batch = (*jsonBody).get("n_batch", 512).asInt();
-    params.n_parallel = (*jsonBody).get("n_parallel", drogon_thread).asInt();
+    params.n_batch = jsonBody.get("n_batch", 512).asInt();
+    params.n_parallel = jsonBody.get("n_parallel", drogon_thread).asInt();
     params.n_threads =
-        (*jsonBody)
-            .get("cpu_threads", std::thread::hardware_concurrency())
+        jsonBody.get("cpu_threads", std::thread::hardware_concurrency())
             .asInt();
-    params.cont_batching = (*jsonBody).get("cont_batching", false).asBool();
+    params.cont_batching = jsonBody.get("cont_batching", false).asBool();

-    this->user_prompt = (*jsonBody).get("user_prompt", "USER: ").asString();
-    this->ai_prompt = (*jsonBody).get("ai_prompt", "ASSISTANT: ").asString();
+    this->user_prompt = jsonBody.get("user_prompt", "USER: ").asString();
+    this->ai_prompt = jsonBody.get("ai_prompt", "ASSISTANT: ").asString();
     this->system_prompt =
-        (*jsonBody).get("system_prompt", "ASSISTANT's RULE: ").asString();
-    this->pre_prompt = (*jsonBody).get("pre_prompt", "").asString();
-    this->repeat_last_n = (*jsonBody).get("repeat_last_n", 32).asInt();
+        jsonBody.get("system_prompt", "ASSISTANT's RULE: ").asString();
+    this->pre_prompt = jsonBody.get("pre_prompt", "").asString();
+    this->repeat_last_n = jsonBody.get("repeat_last_n", 32).asInt();
   }
 #ifdef GGML_USE_CUBLAS
   LOG_INFO << "Setting up GGML CUBLAS PARAMS";
@@ -422,25 +407,46 @@ void llamaCPP::loadModel(

   // load the model
   if (!llama.load_model(params)) {
-    LOG_ERROR << "Error loading the model will exit the program";
-    Json::Value jsonResp;
-    jsonResp["message"] = "Failed to load model";
-    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
-    resp->setStatusCode(drogon::k500InternalServerError);
-    callback(resp);
+    LOG_ERROR << "Error loading the model";
+    return false; // Indicate failure
   }
   llama.initialize();

-  Json::Value jsonResp;
-  jsonResp["message"] = "Model loaded successfully";
   model_loaded = true;
-  auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);

   LOG_INFO << "Started background task here!";
   backgroundThread = std::thread(&llamaCPP::backgroundTask, this);
   warmupModel();
+  return true;
+}

-  callback(resp);
+void llamaCPP::loadModel(
+    const HttpRequestPtr &req,
+    std::function<void(const HttpResponsePtr &)> &&callback) {
+
+  if (model_loaded) {
+    LOG_INFO << "model loaded";
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model already loaded";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k409Conflict);
+    callback(resp);
+    return;
+  }
+
+  const auto &jsonBody = req->getJsonObject();
+  if (!loadModelImpl(*jsonBody)) {
+    // Error occurred during model loading
+    Json::Value jsonResp;
+    jsonResp["message"] = "Failed to load model";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    resp->setStatusCode(drogon::k500InternalServerError);
+    callback(resp);
+  } else {
+    // Model loaded successfully
+    Json::Value jsonResp;
+    jsonResp["message"] = "Model loaded successfully";
+    auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
+    callback(resp);
+  }
 }

 void llamaCPP::backgroundTask() {
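The refactor above splits the old HTTP handler in two: `loadModelImpl` does the actual work against a plain `Json::Value`, while `loadModel` keeps the Drogon plumbing. Every optional field rides on jsoncpp's `Value::get(key, default)`, which returns the stored value when the key is present and the supplied default otherwise. A minimal standalone sketch of that defaulting behavior (assuming jsoncpp is available; the model path is hypothetical):

```cpp
#include <json/json.h> // jsoncpp (on some distros: <jsoncpp/json/json.h>)
#include <iostream>

int main() {
  Json::Value body;
  body["llama_model_path"] = "./models/example.gguf"; // hypothetical path
  body["ctx_len"] = 4096;

  // Present keys win; absent keys fall back to the default,
  // mirroring how loadModelImpl fills gpt_params.
  std::cout << body.get("ctx_len", 2048).asInt() << "\n"; // prints 4096
  std::cout << body.get("ngl", 100).asInt() << "\n";      // prints 100
  return 0;
}
```

Note that `req->getJsonObject()` yields a null pointer when the request carries no parseable JSON body, so the `*jsonBody` dereference in the new `loadModel` assumes a well-formed request.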
24 changes: 24 additions & 0 deletions controllers/llamaCPP.h

@@ -1,3 +1,5 @@
+#include <iostream>
+#include <string>
 #if defined(_WIN32)
 #define NOMINMAX
 #endif

@@ -2117,6 +2119,26 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
     // log_disable(); // Disable the log to file feature, reduce bloat for
     // target
     // system ()
+    std::vector<std::string> llama_models =
+        nitro_utils::listFilesInDir(nitro_utils::models_folder);
+    std::string model_index;
+    if (llama_models.size() > 0) {
+      LOG_INFO << "Found models folder, here are the llama models you have:";
+      int index_val = 0;
+      for (auto llama_model : llama_models) {
+        LOG_INFO << "index: " << index_val++ << "| model: " << llama_model;
+      }
+      std::cout
+          << "Please type the index of the model you want to load here >> ";
+      std::cin >> model_index;
+      Json::Value jsonBody;
+      jsonBody["llama_model_path"] = nitro_utils::models_folder + "/" +
+                                     llama_models[std::stoi(model_index)];
+      loadModelImpl(jsonBody);
+    } else {
+      LOG_INFO << "Not found models folder, start server as usual";
+    }
   }

   METHOD_LIST_BEGIN
@@ -2145,6 +2167,8 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
   void modelStatus(const HttpRequestPtr &req,
                    std::function<void(const HttpResponsePtr &)> &&callback);

+  bool loadModelImpl(const Json::Value &jsonBody);
+
   void warmupModel();

   void backgroundTask();
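One caveat with the interactive prompt in the constructor: `std::stoi` throws on non-numeric input, and nothing bounds-checks the chosen index against `llama_models`, so a stray keystroke would abort startup. A guarded read might look like the sketch below (a hypothetical helper, not part of this PR):

```cpp
#include <iostream>
#include <string>
#include <vector>

// Hypothetical helper: re-prompt until the user types a valid model index.
static int promptForModelIndex(const std::vector<std::string> &models) {
  while (true) {
    std::cout << "Please type the index of the model you want to load here >> ";
    std::string line;
    if (!(std::cin >> line)) return 0; // EOF: fall back to the first model
    try {
      int idx = std::stoi(line);
      if (idx >= 0 && idx < static_cast<int>(models.size()))
        return idx;
    } catch (const std::exception &) {
      // not a number: fall through and re-prompt
    }
    std::cout << "Invalid index, try again.\n";
  }
}
```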
42 changes: 42 additions & 0 deletions utils/nitro_utils.h

@@ -6,9 +6,51 @@
 #include <drogon/HttpResponse.h>
 #include <iostream>
 #include <ostream>
+// Include platform-specific headers
+#ifdef _WIN32
+#include <windows.h>
+#else
+#include <dirent.h>
+#endif

 namespace nitro_utils {

+inline std::string models_folder = "./models";
+
+inline std::vector<std::string> listFilesInDir(const std::string &path) {
+  std::vector<std::string> files;
+
+#ifdef _WIN32
+  // Windows-specific code
+  WIN32_FIND_DATA findFileData;
+  HANDLE hFind = FindFirstFile((path + "\\*").c_str(), &findFileData);
+
+  if (hFind != INVALID_HANDLE_VALUE) {
+    do {
+      if (!(findFileData.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
+        files.push_back(findFileData.cFileName);
+      }
+    } while (FindNextFile(hFind, &findFileData) != 0);
+    FindClose(hFind);
+  }
+#else
+  // POSIX-specific code (Linux, Unix, macOS)
+  DIR *dir;
+  struct dirent *ent;
+
+  if ((dir = opendir(path.c_str())) != NULL) {
+    while ((ent = readdir(dir)) != NULL) {
+      if (ent->d_type == DT_REG) { // Check if it's a regular file
+        files.push_back(ent->d_name);
+      }
+    }
+    closedir(dir);
+  }
+#endif
+
+  return files;
+}
+
 inline std::string rtrim(const std::string &str) {
   size_t end = str.find_last_not_of("\n\t ");
   return (end == std::string::npos) ? "" : str.substr(0, end + 1);