From a25c902485bc49a74f2a49db0047d4fcc5816976 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Fri, 8 Dec 2023 20:47:19 +0700
Subject: [PATCH 1/8] version bump

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index fbbc42827..fe680e3d1 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit fbbc42827b2949b95bcde23ce47bb47d006c895d
+Subproject commit fe680e3d1080a765e5d3150ffd7bab189742898d

From fd1f9fc98010a2409a03b033fd7e0321777798d0 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Fri, 8 Dec 2023 21:03:22 +0700
Subject: [PATCH 2/8] Fix issue with version bump

---
 controllers/llamaCPP.h | 282 -----------------------------------------
 1 file changed, 282 deletions(-)

diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 9e80d46e2..8741ea288 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -1775,288 +1775,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
          "LLaVA.\n");
   printf("\n");
 }
-
-static void server_params_parse(int argc, char **argv, server_params &sparams,
-                                gpt_params &params,
-                                llama_server_context &llama) {
-  gpt_params default_params;
-  server_params default_sparams;
-  std::string arg;
-  bool invalid_param = false;
-
-  for (int i = 1; i < argc; i++) {
-    arg = argv[i];
-    if (arg == "--port") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.port = std::stoi(argv[i]);
-    } else if (arg == "--host") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.hostname = argv[i];
-    } else if (arg == "--path") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.public_path = argv[i];
-    } else if (arg == "--timeout" || arg == "-to") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      sparams.read_timeout = std::stoi(argv[i]);
-      sparams.write_timeout = std::stoi(argv[i]);
-    } else if (arg == "-m" || arg == "--model") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.model = argv[i];
-    } else if (arg == "-a" || arg == "--alias") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.model_alias = argv[i];
-    } else if (arg == "-h" || arg == "--help") {
-      server_print_usage(argv[0], default_params, default_sparams);
-      exit(0);
-    } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_ctx = std::stoi(argv[i]);
-    } else if (arg == "--rope-scaling") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      std::string value(argv[i]);
-      /**/ if (value == "none") {
-        params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE;
-      } else if (value == "linear") {
-        params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR;
-      } else if (value == "yarn") {
-        params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN;
-      } else {
-        invalid_param = true;
-        break;
-      }
-    } else if (arg == "--rope-freq-base") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.rope_freq_base = std::stof(argv[i]);
-    } else if (arg == "--rope-freq-scale") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.rope_freq_scale = std::stof(argv[i]);
-    } else if (arg == "--yarn-ext-factor") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.yarn_ext_factor = std::stof(argv[i]);
-    } else if (arg == "--yarn-attn-factor") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.yarn_attn_factor = std::stof(argv[i]);
-    } else if (arg == "--yarn-beta-fast") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.yarn_beta_fast = std::stof(argv[i]);
-    } else if (arg == "--yarn-beta-slow") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.yarn_beta_slow = std::stof(argv[i]);
-    } else if (arg == "--memory-f32" || arg == "--memory_f32") {
-      params.memory_f16 = false;
-    } else if (arg == "--threads" || arg == "-t") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_threads = std::stoi(argv[i]);
-    } else if (arg == "--threads-batch" || arg == "-tb") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_threads_batch = std::stoi(argv[i]);
-    } else if (arg == "-b" || arg == "--batch-size") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-      params.n_batch = std::stoi(argv[i]);
-      params.n_batch = std::min(512, params.n_batch);
-    } else if (arg == "--gpu-layers" || arg == "-ngl" ||
-               arg == "--n-gpu-layers") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD
-      params.n_gpu_layers = std::stoi(argv[i]);
-#else
-      LOG_WARNING_LLAMA(
-          "Not compiled with GPU offload support, --n-gpu-layers option will "
-          "be ignored. "
-          "See main README.md for information on enabling GPU BLAS support",
-          {{"n_gpu_layers", params.n_gpu_layers}});
-#endif
-    } else if (arg == "--tensor-split" || arg == "-ts") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-#ifdef GGML_USE_CUBLAS
-      std::string arg_next = argv[i];
-
-      // split string by , and /
-      const std::regex regex{R"([,/]+)"};
-      std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex,
-                                    -1};
-      std::vector<std::string> split_arg{it, {}};
-      GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES);
-
-      for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) {
-        if (i_device < split_arg.size()) {
-          params.tensor_split[i_device] = std::stof(split_arg[i_device]);
-        } else {
-          params.tensor_split[i_device] = 0.0f;
-        }
-      }
-#else
-      LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not "
-                        "possible to set a tensor split.\n",
-                        {});
-#endif // GGML_USE_CUBLAS
-    } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
-#ifdef GGML_USE_CUBLAS
-      params.mul_mat_q = false;
-#else
-      LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. "
-                        "Disabling mul_mat_q kernels has no effect.\n",
-                        {});
-#endif // GGML_USE_CUBLAS
-    } else if (arg == "--main-gpu" || arg == "-mg") {
-      if (++i >= argc) {
-        invalid_param = true;
-        break;
-      }
-#ifdef GGML_USE_CUBLAS
-      params.main_gpu = std::stoi(argv[i]);
-#else
-      LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not "
It is not " - "possible to set a main GPU.", - {}); -#endif - } else if (arg == "--lora") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); - params.use_mmap = false; - } else if (arg == "--lora-scaled") { - if (++i >= argc) { - invalid_param = true; - break; - } - const char *lora_adapter = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.push_back( - std::make_tuple(lora_adapter, std::stof(argv[i]))); - params.use_mmap = false; - } else if (arg == "--lora-base") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_base = argv[i]; - } else if (arg == "-v" || arg == "--verbose") { -#if SERVER_VERBOSE != 1 - LOG_WARNING_LLAMA("server.cpp is not built with verbose logging.", {}); -#else - server_verbose = true; -#endif - } else if (arg == "--mlock") { - params.use_mlock = true; - } else if (arg == "--no-mmap") { - params.use_mmap = false; - } else if (arg == "--numa") { - params.numa = true; - } else if (arg == "--embedding") { - params.embedding = true; - } else if (arg == "-cb" || arg == "--cont-batching") { - params.cont_batching = true; - } else if (arg == "-np" || arg == "--parallel") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_parallel = std::stoi(argv[i]); - } else if (arg == "-n" || arg == "--n-predict") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_predict = std::stoi(argv[i]); - } else if (arg == "-spf" || arg == "--system-prompt-file") { - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::string systm_content; - std::copy(std::istreambuf_iterator(file), - std::istreambuf_iterator(), - std::back_inserter(systm_content)); - llama.process_system_prompt_data(json::parse(systm_content)); - } else if (arg == "--mmproj") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.mmproj = argv[i]; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - server_print_usage(argv[0], default_params, default_sparams); - exit(1); - } - } - - if (invalid_param) { - fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); - server_print_usage(argv[0], default_params, default_sparams); - exit(1); - } -} - static json format_partial_response(llama_server_context &llama, llama_client_slot *slot, const std::string &content, From 6d0015ddea6e0df79fbe84e4186c9ff711ae869d Mon Sep 17 00:00:00 2001 From: tikikun Date: Fri, 8 Dec 2023 21:11:21 +0700 Subject: [PATCH 3/8] kv cache clear when the model is stopped --- controllers/llamaCPP.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index a7121e18d..a852c0872 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -454,7 +454,9 @@ void llamaCPP::backgroundTask() { // model_loaded = llama.update_slots(); } - LOG_INFO << "Background task stopped!"; + LOG_INFO << "Background task stopped! 
"; + llama.kv_cache_clear(); + LOG_INFO << "KV cache cleared!"; return; } From 3537b5828eaa2f971807a5c724c425850ced0d63 Mon Sep 17 00:00:00 2001 From: tikikun Date: Fri, 8 Dec 2023 21:29:13 +0700 Subject: [PATCH 4/8] disable logigng --- controllers/llamaCPP.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 8741ea288..c73e06aeb 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1834,7 +1834,7 @@ class llamaCPP : public drogon::HttpController { public: llamaCPP() { // Some default values for now below - log_disable(); // Disable the log to file feature, reduce bloat for + //log_disable(); // Disable the log to file feature, reduce bloat for // target // system () std::vector llama_models = From a4fd01ebb98ed34395b1657f4a7642eb248cae9f Mon Sep 17 00:00:00 2001 From: tikikun Date: Fri, 8 Dec 2023 21:29:30 +0700 Subject: [PATCH 5/8] enable logging --- controllers/llamaCPP.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index c73e06aeb..8741ea288 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1834,7 +1834,7 @@ class llamaCPP : public drogon::HttpController { public: llamaCPP() { // Some default values for now below - //log_disable(); // Disable the log to file feature, reduce bloat for + log_disable(); // Disable the log to file feature, reduce bloat for // target // system () std::vector llama_models = From d41d3c58680b21fe044500232666c5f3273624ea Mon Sep 17 00:00:00 2001 From: tikikun Date: Sun, 10 Dec 2023 08:37:37 +0700 Subject: [PATCH 6/8] add prelight --- controllers/llamaCPP.cc | 11 +++++++++++ controllers/llamaCPP.h | 5 +++++ 2 files changed, 16 insertions(+) diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index a852c0872..6ab37bdb7 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -145,6 +145,17 @@ void llamaCPP::warmupModel() { return; } +void llamaCPP::chatCompletionPrelight( + const HttpRequestPtr &req, + std::function &&callback) { + auto resp = drogon::HttpResponse::newHttpResponse(); + resp->setStatusCode(drogon::HttpStatusCode::k200OK); + resp->addHeader("Access-Control-Allow-Origin", "*"); + resp->addHeader("Access-Control-Allow-Methods", "POST, OPTIONS"); + resp->addHeader("Access-Control-Allow-Headers", "*"); + callback(resp); +} + void llamaCPP::chatCompletion( const HttpRequestPtr &req, std::function &&callback) { diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 8741ea288..52c8b76a1 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -1868,12 +1868,17 @@ class llamaCPP : public drogon::HttpController { // Openai compatible path ADD_METHOD_TO(llamaCPP::chatCompletion, "/v1/chat/completions", Post); + ADD_METHOD_TO(llamaCPP::chatCompletionPrelight, "/v1/chat/completions", + Options); + ADD_METHOD_TO(llamaCPP::embedding, "/v1/embeddings", Post); // PATH_ADD("/llama/chat_completion", Post); METHOD_LIST_END void chatCompletion(const HttpRequestPtr &req, std::function &&callback); + void chatCompletionPrelight(const HttpRequestPtr &req, + std::function &&callback); void embedding(const HttpRequestPtr &req, std::function &&callback); void loadModel(const HttpRequestPtr &req, From 9c4e5f0113a42ab86f03da191b40d5e74db20f22 Mon Sep 17 00:00:00 2001 From: tikikun Date: Sun, 10 Dec 2023 18:57:00 +0700 Subject: [PATCH 7/8] test mmq --- controllers/llamaCPP.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
index 6ab37bdb7..91cbae12b 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -399,7 +399,7 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
   }
 #ifdef GGML_USE_CUBLAS
   LOG_INFO << "Setting up GGML CUBLAS PARAMS";
-  params.mul_mat_q = false;
+  params.mul_mat_q = true;
 #endif // GGML_USE_CUBLAS
   if (params.model_alias == "unknown") {
     params.model_alias = params.model;

From a1c6b973419cecf2d89f7732795aac20993eb67e Mon Sep 17 00:00:00 2001
From: tikikun
Date: Sun, 10 Dec 2023 19:01:34 +0700
Subject: [PATCH 8/8] redo the test

---
 controllers/llamaCPP.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 91cbae12b..6ab37bdb7 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -399,7 +399,7 @@ bool llamaCPP::loadModelImpl(const Json::Value &jsonBody) {
   }
 #ifdef GGML_USE_CUBLAS
   LOG_INFO << "Setting up GGML CUBLAS PARAMS";
-  params.mul_mat_q = true;
+  params.mul_mat_q = false;
 #endif // GGML_USE_CUBLAS
   if (params.model_alias == "unknown") {
     params.model_alias = params.model;