From 9b801076f14761aff8823f4b0d43ae6744f4b5e9 Mon Sep 17 00:00:00 2001
From: automaticcat
Date: Sat, 2 Dec 2023 09:22:36 +0700
Subject: [PATCH 1/5] fix chatml scenario

llamaCPP.cc
---
 controllers/llamaCPP.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index c317c114d..378655ae1 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -198,6 +198,8 @@ void llamaCPP::chatCompletion(
     data["prompt"] = formatted_output;
     for (const auto &stop_word : (*jsonBody)["stop"]) {
       stopWords.push_back(stop_word.asString());
+      // Ensure success case for chatML
+      stopWords.push_back("<|im_end|>");
     }
     // specify default stop words
     stopWords.push_back(nitro_utils::rtrim(user_prompt));

From e234418696b5c90a4a5f09a18c66dbac34ca3c2b Mon Sep 17 00:00:00 2001
From: automaticcat
Date: Sat, 2 Dec 2023 15:08:37 +0700
Subject: [PATCH 2/5] add default case

---
 controllers/llamaCPP.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 378655ae1..b4d94fd79 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -198,10 +198,10 @@ void llamaCPP::chatCompletion(
     data["prompt"] = formatted_output;
     for (const auto &stop_word : (*jsonBody)["stop"]) {
       stopWords.push_back(stop_word.asString());
-      // Ensure success case for chatML
-      stopWords.push_back("<|im_end|>");
     }
     // specify default stop words
+    // Ensure success case for chatML
+    stopWords.push_back("<|im_end|>");
     stopWords.push_back(nitro_utils::rtrim(user_prompt));
     data["stop"] = stopWords;
   }

From c2a0ff99fdd5ac8a02b8b72a5f31195d3eb1860b Mon Sep 17 00:00:00 2001
From: automaticcat
Date: Sat, 2 Dec 2023 16:51:54 +0700
Subject: [PATCH 3/5] update pre-prompt

---
 controllers/llamaCPP.cc | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index b4d94fd79..56b42d4b6 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -388,14 +388,7 @@ void llamaCPP::loadModel(
     this->ai_prompt = (*jsonBody).get("ai_prompt", "ASSISTANT: ").asString();
     this->system_prompt =
         (*jsonBody).get("system_prompt", "ASSISTANT's RULE: ").asString();
-    this->pre_prompt =
-        (*jsonBody)
-            .get("pre_prompt",
-                 "A chat between a curious user and an artificial "
-                 "intelligence "
-                 "assistant. The assistant follows the given rules no matter "
-                 "what.\\n")
-            .asString();
+    this->pre_prompt = (*jsonBody).get("pre_prompt","").asString();
   }
 #ifdef GGML_USE_CUBLAS
   LOG_INFO << "Setting up GGML CUBLAS PARAMS";

From 59737700faa836761a6d77124039f4a98aa72e2a Mon Sep 17 00:00:00 2001
From: tikikun
Date: Sat, 2 Dec 2023 17:09:28 +0700
Subject: [PATCH 4/5] add customization for batch size

---
 README.md               | 1 +
 controllers/llamaCPP.cc | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4b666767a..4d2f902d6 100644
--- a/README.md
+++ b/README.md
@@ -107,6 +107,7 @@ Table of parameters
 | `system_prompt` | String | The prompt to use for system rules. |
 | `pre_prompt` | String | The prompt to use for internal configuration. |
 | `cpu_threads` | Integer | The number of threads to use for inferencing (CPU MODE ONLY) |
+| `n_batch` | Integer | The batch size for prompt eval step |
 
 ***OPTIONAL***: You can run Nitro on a different port like 5000 instead of 3928 by running it manually in terminal
 ```zsh
diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 56b42d4b6..4143ed1a9 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -376,7 +376,7 @@ void llamaCPP::loadModel(
     params.n_ctx = (*jsonBody).get("ctx_len", 2048).asInt();
     params.embedding = (*jsonBody).get("embedding", true).asBool();
     // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
-
+    params.n_batch = (*jsonBody).get("n_batch",512).asInt();
     params.n_parallel = (*jsonBody).get("n_parallel", drogon_thread).asInt();
     params.n_threads =
         (*jsonBody)

From 16c46a7d02e4083475c20d22e77a75dcff3c3058 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Sat, 2 Dec 2023 17:56:10 +0700
Subject: [PATCH 5/5] add repeat last n

---
 controllers/llamaCPP.cc | 8 ++++++--
 controllers/llamaCPP.h  | 1 +
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 4143ed1a9..92771d498 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -170,6 +170,9 @@ void llamaCPP::chatCompletion(
     data["cache_prompt"] = true;
     data["n_keep"] = -1;
 
+    // Passing load value
+    data["repeat_last_n"] = this->repeat_last_n;
+
     data["stream"] = (*jsonBody).get("stream", false).asBool();
     data["n_predict"] = (*jsonBody).get("max_tokens", 500).asInt();
     data["top_p"] = (*jsonBody).get("top_p", 0.95).asFloat();
@@ -376,7 +379,7 @@ void llamaCPP::loadModel(
     params.n_ctx = (*jsonBody).get("ctx_len", 2048).asInt();
     params.embedding = (*jsonBody).get("embedding", true).asBool();
     // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
-    params.n_batch = (*jsonBody).get("n_batch",512).asInt();
+    params.n_batch = (*jsonBody).get("n_batch", 512).asInt();
     params.n_parallel = (*jsonBody).get("n_parallel", drogon_thread).asInt();
     params.n_threads =
         (*jsonBody)
@@ -388,7 +391,8 @@ void llamaCPP::loadModel(
     this->ai_prompt = (*jsonBody).get("ai_prompt", "ASSISTANT: ").asString();
     this->system_prompt =
         (*jsonBody).get("system_prompt", "ASSISTANT's RULE: ").asString();
-    this->pre_prompt = (*jsonBody).get("pre_prompt","").asString();
+    this->pre_prompt = (*jsonBody).get("pre_prompt", "").asString();
+    this->repeat_last_n = (*jsonBody).get("repeat_last_n", 32).asInt();
   }
 #ifdef GGML_USE_CUBLAS
   LOG_INFO << "Setting up GGML CUBLAS PARAMS";
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 3061149ab..b470e79ae 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -2161,5 +2161,6 @@ class llamaCPP : public drogon::HttpController {
   std::string ai_prompt;
   std::string system_prompt;
   std::string pre_prompt;
+  int repeat_last_n;
 };
 }; // namespace inferences
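
For anyone trying these patches out, the sketch below shows how the new knobs could be passed when loading a model. It is only an illustration: the `loadmodel` endpoint path and the `llama_model_path` value are assumptions not shown in this diff (the README excerpt above only mentions the default port 3928), while `n_batch` (default 512), `repeat_last_n` (default 32), and the now-empty `pre_prompt` default are the values introduced by the patches.

```zsh
# Illustrative request only: the endpoint path and model path are assumptions,
# not part of these patches. n_batch falls back to 512 and repeat_last_n to 32
# when omitted, matching the defaults read in loadModel above.
curl -X POST http://localhost:3928/inferences/llamacpp/loadmodel \
  -H 'Content-Type: application/json' \
  -d '{
        "llama_model_path": "/path/to/model.gguf",
        "ctx_len": 2048,
        "cpu_threads": 4,
        "n_batch": 256,
        "repeat_last_n": 64,
        "pre_prompt": ""
      }'
```

Because `<|im_end|>` is now always appended to the stop-word list and `pre_prompt` defaults to an empty string, ChatML-style models stop cleanly without the caller having to pass extra configuration.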