This repository was archived by the owner on Jul 4, 2025. It is now read-only.
Merged
15 changes: 14 additions & 1 deletion controllers/llamaCPP.cc
@@ -145,6 +145,17 @@ void llamaCPP::warmupModel() {
return;
}

void llamaCPP::chatCompletionPrelight(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {
auto resp = drogon::HttpResponse::newHttpResponse();
resp->setStatusCode(drogon::HttpStatusCode::k200OK);
resp->addHeader("Access-Control-Allow-Origin", "*");
resp->addHeader("Access-Control-Allow-Methods", "POST, OPTIONS");
resp->addHeader("Access-Control-Allow-Headers", "*");
callback(resp);
}

void llamaCPP::chatCompletion(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {
@@ -454,7 +465,9 @@ void llamaCPP::backgroundTask() {
// model_loaded =
llama.update_slots();
}
LOG_INFO << "Background task stopped!";
LOG_INFO << "Background task stopped! ";
llama.kv_cache_clear();
LOG_INFO << "KV cache cleared!";
return;
}

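The new chatCompletionPrelight handler only echoes CORS headers with a 200 so that browser clients can pass the preflight check before POSTing to /v1/chat/completions. Below is a minimal sketch of exercising the new OPTIONS route with Drogon's own HttpClient; the base URL (http://localhost:3928) and the standalone main() are assumptions for illustration, not part of this change.

#include <drogon/drogon.h>

int main() {
  // Assumed base URL; adjust to wherever the server is listening.
  auto client = drogon::HttpClient::newHttpClient("http://localhost:3928");
  auto req = drogon::HttpRequest::newHttpRequest();
  req->setMethod(drogon::Options);
  req->setPath("/v1/chat/completions");
  client->sendRequest(req, [](drogon::ReqResult result,
                              const drogon::HttpResponsePtr &resp) {
    if (result == drogon::ReqResult::Ok) {
      // These are the headers added by chatCompletionPrelight.
      LOG_INFO << "Allow-Origin: "
               << resp->getHeader("Access-Control-Allow-Origin");
      LOG_INFO << "Allow-Methods: "
               << resp->getHeader("Access-Control-Allow-Methods");
    }
    drogon::app().quit();
  });
  drogon::app().run();
  return 0;
}

A browser issues the same OPTIONS request automatically before a cross-origin JSON POST, so a 200 plus these headers is all the preflight needs.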
287 changes: 5 additions & 282 deletions controllers/llamaCPP.h
@@ -1775,288 +1775,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
"LLaVA.\n");
printf("\n");
}

static void server_params_parse(int argc, char **argv, server_params &sparams,
gpt_params &params,
llama_server_context &llama) {
gpt_params default_params;
server_params default_sparams;
std::string arg;
bool invalid_param = false;

for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "--port") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.port = std::stoi(argv[i]);
} else if (arg == "--host") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.hostname = argv[i];
} else if (arg == "--path") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.public_path = argv[i];
} else if (arg == "--timeout" || arg == "-to") {
if (++i >= argc) {
invalid_param = true;
break;
}
sparams.read_timeout = std::stoi(argv[i]);
sparams.write_timeout = std::stoi(argv[i]);
} else if (arg == "-m" || arg == "--model") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model = argv[i];
} else if (arg == "-a" || arg == "--alias") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.model_alias = argv[i];
} else if (arg == "-h" || arg == "--help") {
server_print_usage(argv[0], default_params, default_sparams);
exit(0);
} else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_ctx = std::stoi(argv[i]);
} else if (arg == "--rope-scaling") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::string value(argv[i]);
/**/ if (value == "none") {
params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE;
} else if (value == "linear") {
params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR;
} else if (value == "yarn") {
params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN;
} else {
invalid_param = true;
break;
}
} else if (arg == "--rope-freq-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_base = std::stof(argv[i]);
} else if (arg == "--rope-freq-scale") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rope_freq_scale = std::stof(argv[i]);
} else if (arg == "--yarn-ext-factor") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_ext_factor = std::stof(argv[i]);
} else if (arg == "--yarn-attn-factor") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_attn_factor = std::stof(argv[i]);
} else if (arg == "--yarn-beta-fast") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_fast = std::stof(argv[i]);
} else if (arg == "--yarn-beta-slow") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.yarn_beta_slow = std::stof(argv[i]);
} else if (arg == "--memory-f32" || arg == "--memory_f32") {
params.memory_f16 = false;
} else if (arg == "--threads" || arg == "-t") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads = std::stoi(argv[i]);
} else if (arg == "--threads-batch" || arg == "-tb") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_threads_batch = std::stoi(argv[i]);
} else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_batch = std::stoi(argv[i]);
params.n_batch = std::min(512, params.n_batch);
} else if (arg == "--gpu-layers" || arg == "-ngl" ||
arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
params.n_gpu_layers = std::stoi(argv[i]);
#else
LOG_WARNING_LLAMA(
"Not compiled with GPU offload support, --n-gpu-layers option will "
"be ignored. "
"See main README.md for information on enabling GPU BLAS support",
{{"n_gpu_layers", params.n_gpu_layers}});
#endif
} else if (arg == "--tensor-split" || arg == "-ts") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
std::string arg_next = argv[i];

// split string by , and /
const std::regex regex{R"([,/]+)"};
std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex,
-1};
std::vector<std::string> split_arg{it, {}};
GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);

for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
if (i_device < split_arg.size()) {
params.tensor_split[i_device] = std::stof(split_arg[i_device]);
} else {
params.tensor_split[i_device] = 0.0f;
}
}
#else
LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not "
"possible to set a tensor split.\n",
{});
#endif // GGML_USE_CUBLAS
} else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
#ifdef GGML_USE_CUBLAS
params.mul_mat_q = false;
#else
LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. "
"Disabling mul_mat_q kernels has no effect.\n",
{});
#endif // GGML_USE_CUBLAS
} else if (arg == "--main-gpu" || arg == "-mg") {
if (++i >= argc) {
invalid_param = true;
break;
}
#ifdef GGML_USE_CUBLAS
params.main_gpu = std::stoi(argv[i]);
#else
LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not "
"possible to set a main GPU.",
{});
#endif
} else if (arg == "--lora") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f));
params.use_mmap = false;
} else if (arg == "--lora-scaled") {
if (++i >= argc) {
invalid_param = true;
break;
}
const char *lora_adapter = argv[i];
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_adapter.push_back(
std::make_tuple(lora_adapter, std::stof(argv[i])));
params.use_mmap = false;
} else if (arg == "--lora-base") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.lora_base = argv[i];
} else if (arg == "-v" || arg == "--verbose") {
#if SERVER_VERBOSE != 1
LOG_WARNING_LLAMA("server.cpp is not built with verbose logging.", {});
#else
server_verbose = true;
#endif
} else if (arg == "--mlock") {
params.use_mlock = true;
} else if (arg == "--no-mmap") {
params.use_mmap = false;
} else if (arg == "--numa") {
params.numa = true;
} else if (arg == "--embedding") {
params.embedding = true;
} else if (arg == "-cb" || arg == "--cont-batching") {
params.cont_batching = true;
} else if (arg == "-np" || arg == "--parallel") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_parallel = std::stoi(argv[i]);
} else if (arg == "-n" || arg == "--n-predict") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.n_predict = std::stoi(argv[i]);
} else if (arg == "-spf" || arg == "--system-prompt-file") {
if (++i >= argc) {
invalid_param = true;
break;
}
std::ifstream file(argv[i]);
if (!file) {
fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
invalid_param = true;
break;
}
std::string systm_content;
std::copy(std::istreambuf_iterator<char>(file),
std::istreambuf_iterator<char>(),
std::back_inserter(systm_content));
llama.process_system_prompt_data(json::parse(systm_content));
} else if (arg == "--mmproj") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.mmproj = argv[i];
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
server_print_usage(argv[0], default_params, default_sparams);
exit(1);
}
}

if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
server_print_usage(argv[0], default_params, default_sparams);
exit(1);
}
}

static json
format_partial_response(llama_server_context &llama, llama_client_slot *slot,
const std::string &content,
@@ -2150,12 +1868,17 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {

// Openai compatible path
ADD_METHOD_TO(llamaCPP::chatCompletion, "/v1/chat/completions", Post);
ADD_METHOD_TO(llamaCPP::chatCompletionPrelight, "/v1/chat/completions",
Options);

ADD_METHOD_TO(llamaCPP::embedding, "/v1/embeddings", Post);

// PATH_ADD("/llama/chat_completion", Post);
METHOD_LIST_END
void chatCompletion(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void chatCompletionPrelight(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void embedding(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void loadModel(const HttpRequestPtr &req,
2 changes: 1 addition & 1 deletion llama.cpp