45 changes: 45 additions & 0 deletions controllers/llamaCPP.cc
@@ -7,6 +7,7 @@
#include <drogon/HttpTypes.h>
#include <regex>
#include <thread>
#include <trantor/utils/Logger.h>

using namespace inferences;

@@ -39,6 +40,49 @@ std::string create_return_json(const std::string &id, const std::string &model,
return Json::writeString(writer, root);
}

void llamaCPP::warmupModel() {
auto lock = llama.lock();
llama.rewind();
llama_reset_timings(llama.ctx);

llama.prompt = "hello";
llama.params.n_predict = 1;
llama.loadPrompt();
llama.beginCompletion();
size_t stop_pos = std::string::npos;

while (llama.has_next_token) {
const completion_token_output token_with_probs = llama.doCompletion();
const std::string token_text =
token_with_probs.tok == -1
? ""
: llama_token_to_piece(llama.ctx, token_with_probs.tok);

stop_pos = llama.findStoppingStrings(llama.generated_text,
token_text.size(), STOP_FULL);
}

if (stop_pos == std::string::npos) {
stop_pos = llama.findStoppingStrings(llama.generated_text, 0, STOP_PARTIAL);
}
if (stop_pos != std::string::npos) {
llama.generated_text.erase(llama.generated_text.begin() + stop_pos,
llama.generated_text.end());
}
auto probs = llama.generated_token_probs;
if (llama.params.sampling_params.n_probs > 0 && llama.stopped_word) {
const std::vector<llama_token> stop_word_toks =
llama_tokenize(llama.ctx, llama.stopping_word, false);
probs = std::vector<completion_token_output>(
llama.generated_token_probs.begin(),
llama.generated_token_probs.end() - stop_word_toks.size());
}

LOG_INFO << "Warm-up generated text:" << llama.generated_text;
LOG_INFO << "Warm-up finish";
return;
}

void llamaCPP::chatCompletion(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {
@@ -297,5 +341,6 @@ void llamaCPP::loadModel(
jsonResp["message"] = "Model loaded successfully";
model_loaded = true;
auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
warmupModel();
callback(resp);
}
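
Note: the warm-up added above runs automatically on model load. loadModel() calls warmupModel() before sending its response (see the last hunk of this file), so the first user-facing completion no longer pays the cold-start cost, at the price of a slightly slower load call. A minimal client-side sketch of triggering it follows; the port, route, and JSON field name are assumptions for illustration and are not confirmed by this diff.

// Minimal sketch, not part of this PR. Assumes the server listens on
// 127.0.0.1:3928, mounts the load route at /inferences/llamacpp/loadmodel,
// and accepts a "llama_model_path" field; all three are hypothetical here.
#include <drogon/drogon.h>
#include <drogon/HttpClient.h>
#include <json/json.h>
#include <iostream>

int main() {
  auto client = drogon::HttpClient::newHttpClient("http://127.0.0.1:3928");

  Json::Value body;
  body["llama_model_path"] = "/path/to/model.gguf"; // hypothetical model path
  auto req = drogon::HttpRequest::newHttpJsonRequest(body);
  req->setMethod(drogon::Post);
  req->setPath("/inferences/llamacpp/loadmodel"); // assumed route

  client->sendRequest(req, [](drogon::ReqResult result,
                              const drogon::HttpResponsePtr &resp) {
    // By the time this callback fires, warmupModel() has already run on the
    // server side, since it is invoked before callback(resp) in loadModel().
    if (result == drogon::ReqResult::Ok && resp) {
      std::cout << resp->getBody() << std::endl;
    }
    drogon::app().quit(); // stop the event loop once the response arrives
  });

  drogon::app().run(); // drives the client's callbacks
  return 0;
}

Because warmupModel() runs synchronously inside the load handler, clients should budget for the extra single-token generation time on the load request itself.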
43 changes: 24 additions & 19 deletions controllers/llamaCPP.h
@@ -525,12 +525,12 @@ struct llama_server_context {
if (llama_decode(ctx,
llama_batch_get_one(&embd[n_past], n_eval, n_past, 0))) {
LOG_ERROR_LLAMA("failed to eval",
{
{"n_eval", n_eval},
{"n_past", n_past},
{"embd",
tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())},
});
{
{"n_eval", n_eval},
{"n_past", n_past},
{"embd", tokens_to_str(ctx, embd.cbegin() + n_past,
embd.cend())},
});
has_next_token = false;
return result;
}
@@ -677,9 +677,9 @@ struct llama_server_context {
static const int n_embd = llama_n_embd(model);
if (!params.embedding) {
LOG_WARNING_LLAMA("embedding disabled",
{
{"params.embedding", params.embedding},
});
{
{"params.embedding", params.embedding},
});
return std::vector<float>(n_embd, 0.0f);
}
const float *data = llama_get_embeddings(ctx);
@@ -891,17 +891,19 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
}
}
#else
LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not possible "
"to set a tensor split.\n",
{});
LOG_WARNING_LLAMA(
"llama.cpp was compiled without cuBLAS. It is not possible "
"to set a tensor split.\n",
{});
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
#ifdef GGML_USE_CUBLAS
params.mul_mat_q = false;
#else
LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. Disabling "
"mul_mat_q kernels has no effect.\n",
{});
LOG_WARNING_LLAMA(
"warning: llama.cpp was compiled without cuBLAS. Disabling "
"mul_mat_q kernels has no effect.\n",
{});
#endif // GGML_USE_CUBLAS
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
@@ -911,9 +913,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not possible "
"to set a main GPU.",
{});
LOG_WARNING_LLAMA(
"llama.cpp was compiled without cuBLAS. It is not possible "
"to set a main GPU.",
{});
#endif
} else if (arg == "--lora") {
if (++i >= argc) {
@@ -1260,7 +1263,8 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
public:
llamaCPP() {
// Some default values for now below
log_disable(); //Disable the log to file feature, reduce bloat for target system ()
log_disable(); // Disable the log to file feature, reduce bloat for target
// system ()
}
METHOD_LIST_BEGIN
// list path definitions here;
@@ -1275,6 +1279,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
std::function<void(const HttpResponsePtr &)> &&callback);
void loadModel(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void warmupModel();

private:
llama_server_context llama;