From 6a970fe9cb0b2afcd65507732b025055f0372de1 Mon Sep 17 00:00:00 2001
From: tikikun
Date: Tue, 2 Jan 2024 16:28:25 +0700
Subject: [PATCH 1/6] feat: bump version to latest

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index a7aee47b9..26f3071d7 160000
--- a/llama.cpp
+++ b/llama.cpp
@@ -1 +1 @@
-Subproject commit a7aee47b98e45539d491071b25778b833b77e387
+Subproject commit 26f3071d714f0b27ad7f021a46a66a1085480258

From 37899029f4c0f1258f2e829d58f1af7187b6e968 Mon Sep 17 00:00:00 2001
From: Alan Dao
Date: Tue, 2 Jan 2024 17:11:15 +0700
Subject: [PATCH 2/6] upgraded server with latest api

---
 controllers/llamaCPP.cc |   6 +-
 controllers/llamaCPP.h  | 875 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 768 insertions(+), 113 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 834c8ec45..57770849d 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -135,7 +135,7 @@ void llamaCPP::warmupModel() {
   pseudo["prompt"] = "Hello";
   pseudo["n_predict"] = 2;
   pseudo["stream"] = false;
-  const int task_id = llama.request_completion(pseudo, false, false);
+  const int task_id = llama.request_completion(pseudo, false, false, -1);
   std::string completion_text;
   task_result result = llama.next_result(task_id);
   if (!result.error && result.stop) {
@@ -292,7 +292,7 @@ void llamaCPP::chatCompletion(
   LOG_INFO << "Current completion text";
   LOG_INFO << formatted_output;
 #endif
-  const int task_id = llama.request_completion(data, false, false);
+  const int task_id = llama.request_completion(data, false, false, -1);
   LOG_INFO << "Resolved request for task_id:" << task_id;

   if (is_streamed) {
@@ -383,7 +383,7 @@ void llamaCPP::embedding(
     prompt = "";
   }
   const int task_id = llama.request_completion(
-      {{"prompt", prompt}, {"n_predict", 0}}, false, true);
+      {{"prompt", prompt}, {"n_predict", 0}}, false, true, -1);
   task_result result = llama.next_result(task_id);
   std::vector embedding_result = result.result_json["embedding"];
   auto resp = nitro_utils::nitroHttpResponse();
diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index d61536272..947a622c2 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -13,12 +13,10 @@
 #include
 #include
 #include
-#include
 #include

 // External
 #include "common.h"
-#include "grammar-parser.h"
 #include "llama.h"

 #include "../../llama.cpp/examples/llava/clip.h"
@@ -34,7 +32,11 @@
 // auto generated files (update with ./deps.sh)
-#include
+#include
+#include
+#include
+
+#include
 #include
 #include
 #include
@@ -43,10 +45,13 @@
 #define SERVER_VERBOSE 1
 #endif

+#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613"
+
 using json = nlohmann::json;

 struct server_params {
   std::string hostname = "127.0.0.1";
+  std::string api_key;
   std::string public_path = "examples/server/public";
   int32_t port = 8080;
   int32_t read_timeout = 600;
@@ -85,7 +90,7 @@ static inline bool is_base64(uint8_t c) {
   return (isalnum(c) || (c == '+') || (c == '/'));
 }

-static std::vector base64_decode(std::string const &encoded_string) {
+static std::vector base64_decode(const std::string &encoded_string) {
   int i = 0;
   int j = 0;
   int in_ = 0;
@@ -155,15 +160,23 @@ struct task_server {
   json data;
   bool infill_mode = false;
   bool embedding_mode = false;
+  int multitask_id = -1;
 };

 struct task_result {
   int id;
+  int multitask_id = -1;
   bool stop;
   bool error;
   json result_json;
 };

+struct task_multi {
+  int id;
+  std::set subtasks_remaining{};
+  std::vector results{};
+};
+
 // TODO: can become bool if we can't find use
of more states enum slot_state { IDLE, @@ -198,7 +211,7 @@ struct slot_image { float *image_embedding = nullptr; int32_t image_tokens = 0; - clip_image_u8 img_data; + clip_image_u8 *img_data; std::string prefix_prompt; // before of this image }; @@ -274,7 +287,8 @@ static void server_log(const char *level, const char *function, int line, const std::string str = log.dump(-1, ' ', false, json::error_handler_t::replace); - LOG_INFO << str; + printf("%.*s\n", (int)str.size(), str.data()); + fflush(stdout); } // format incomplete utf-8 multibyte character for output @@ -345,7 +359,6 @@ struct llama_client_slot { int32_t num_prompt_tokens = 0; int32_t num_prompt_tokens_processed = 0; - int32_t multibyte_pending = 0; json prompt; std::string generated_text; @@ -361,6 +374,9 @@ struct llama_client_slot { bool stopped_word = false; bool stopped_limit = false; + bool oaicompat = false; + std::string oaicompat_model; + std::string stopping_word; // sampling @@ -380,6 +396,9 @@ struct llama_client_slot { double t_prompt_processing; // ms double t_token_generation; // ms + // multitasks + int multitask_id = -1; + void reset() { num_prompt_tokens = 0; generated_text = ""; @@ -388,7 +407,6 @@ struct llama_client_slot { stopped_word = false; stopped_limit = false; stopping_word = ""; - multibyte_pending = 0; n_past = 0; sent_count = 0; sent_token_probs_index = 0; @@ -398,12 +416,13 @@ struct llama_client_slot { for (slot_image &img : images) { free(img.image_embedding); - delete[] img.img_data.data; + if (img.img_data) { + clip_image_u8_free(img.img_data); + } img.prefix_prompt = ""; } images.clear(); - // llama_set_rng_seed(ctx, params.seed); in batched the seed matter??????? } bool has_budget(gpt_params &global_params) { @@ -453,7 +472,7 @@ struct llama_client_slot { }; } - void print_timings() { + void print_timings() const { LOG_TEE("\n"); LOG_TEE("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per " "token, %8.2f tokens per second)\n", @@ -483,6 +502,7 @@ struct llama_server_context { bool multimodal = false; bool clean_kv_cache = true; bool all_slots_are_idle = false; + bool add_bos_token = true; int32_t id_gen; int32_t n_ctx; // total context for all clients / slots @@ -501,8 +521,11 @@ struct llama_server_context { std::vector queue_tasks; std::vector queue_results; - std::mutex mutex_tasks; + std::vector queue_multitasks; + std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks + std::condition_variable condition_tasks; std::mutex mutex_results; + std::condition_variable condition_results; ~llama_server_context() { if (ctx) { @@ -555,6 +578,8 @@ struct llama_server_context { n_ctx = llama_n_ctx(ctx); + add_bos_token = llama_should_add_bos_token(model); + return true; } @@ -587,6 +612,13 @@ struct llama_server_context { std::vector tokenize(const json &json_prompt, bool add_bos) const { + // TODO: currently, we tokenize using special tokens by default + // this is not always correct (see + // https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) + // but it's better compared to completely ignoring ChatML and other + // chat templates + const bool TMP_FORCE_SPECIAL = true; + // If `add_bos` is true, we only add BOS, when json_prompt is a string, // or the first element of the json_prompt array is a string. 
std::vector prompt_tokens; @@ -598,10 +630,10 @@ struct llama_server_context { auto s = p.template get(); std::vector p; if (first) { - p = ::llama_tokenize(ctx, s, add_bos); + p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); first = false; } else { - p = ::llama_tokenize(ctx, s, false); + p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); } prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); } else { @@ -613,7 +645,7 @@ struct llama_server_context { } } else { auto s = json_prompt.template get(); - prompt_tokens = ::llama_tokenize(ctx, s, add_bos); + prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); } return prompt_tokens; @@ -641,12 +673,22 @@ struct llama_server_context { slot_params default_params; llama_sampling_params default_sparams; + if (data.count("__oaicompat") != 0) { + slot->oaicompat = true; + slot->oaicompat_model = + json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + } else { + slot->oaicompat = false; + slot->oaicompat_model = ""; + } + slot->params.stream = json_value(data, "stream", false); slot->params.cache_prompt = json_value(data, "cache_prompt", false); slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict); slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); + slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); slot->sparams.tfs_z = json_value(data, "tfs_z", default_sparams.tfs_z); slot->sparams.typical_p = json_value(data, "typical_p", default_sparams.typical_p); @@ -693,6 +735,38 @@ struct llama_server_context { slot->prompt = ""; } + slot->sparams.penalty_prompt_tokens.clear(); + slot->sparams.use_penalty_prompt_tokens = false; + const auto &penalty_prompt = data.find("penalty_prompt"); + if (penalty_prompt != data.end()) { + if (penalty_prompt->is_string()) { + const auto penalty_prompt_string = penalty_prompt->get(); + auto penalty_tokens = + llama_tokenize(model, penalty_prompt_string, false); + slot->sparams.penalty_prompt_tokens.swap(penalty_tokens); + if (slot->params.n_predict > 0) { + slot->sparams.penalty_prompt_tokens.reserve( + slot->sparams.penalty_prompt_tokens.size() + + slot->params.n_predict); + } + slot->sparams.use_penalty_prompt_tokens = true; + } else if (penalty_prompt->is_array()) { + const auto n_tokens = penalty_prompt->size(); + slot->sparams.penalty_prompt_tokens.reserve( + n_tokens + std::max(0, slot->params.n_predict)); + const int n_vocab = llama_n_vocab(model); + for (const auto &penalty_token : *penalty_prompt) { + if (penalty_token.is_number_integer()) { + const auto tok = penalty_token.get(); + if (tok >= 0 && tok < n_vocab) { + slot->sparams.penalty_prompt_tokens.push_back(tok); + } + } + } + slot->sparams.use_penalty_prompt_tokens = true; + } + } + slot->sparams.logit_bias.clear(); if (json_value(data, "ignore_eos", false)) { @@ -731,29 +805,20 @@ struct llama_server_context { const auto &images_data = data.find("image_data"); if (images_data != data.end() && images_data->is_array()) { for (const auto &img : *images_data) { - std::string data_b64 = img["data"].get(); + const std::vector image_buffer = + base64_decode(img["data"].get()); + slot_image img_sl; img_sl.id = img.count("id") != 0 ? 
img["id"].get() : slot->images.size(); - int width, height, channels; - std::vector image_buffer = base64_decode(data_b64); - data_b64.clear(); - auto data = - stbi_load_from_memory(image_buffer.data(), image_buffer.size(), - &width, &height, &channels, 3); - if (!data) { + img_sl.img_data = clip_image_u8_init(); + if (!clip_image_load_from_bytes( + image_buffer.data(), image_buffer.size(), img_sl.img_data)) { LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, img_sl.id); return false; } - LOG_TEE("slot %i - image loaded [id: %i] resolution (%i x %i)\n", - slot->id, img_sl.id, width, height); - img_sl.img_data.nx = width; - img_sl.img_data.ny = height; - img_sl.img_data.size = width * height * 3; - img_sl.img_data.data = new uint8_t[width * height * 3](); - memcpy(img_sl.img_data.data, data, width * height * 3); - stbi_image_free(data); + LOG_TEE("slot %i - loaded image\n", slot->id); img_sl.request_encode_image = true; slot->images.push_back(img_sl); } @@ -807,6 +872,7 @@ struct llama_server_context { llama_sampling_free(slot->ctx_sampling); } slot->ctx_sampling = llama_sampling_init(slot->sparams); + llama_set_rng_seed(ctx, slot->params.seed); slot->command = LOAD_PROMPT; all_slots_are_idle = false; @@ -823,7 +889,7 @@ struct llama_server_context { } void update_system_prompt() { - system_tokens = ::llama_tokenize(ctx, system_prompt, true); + system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token); llama_batch_clear(batch); @@ -905,25 +971,36 @@ struct llama_server_context { slot.generated_text += token_str; slot.has_next_token = true; - if (slot.multibyte_pending > 0) { - slot.multibyte_pending -= token_str.size(); - } else if (token_str.size() == 1) { - const char c = token_str[0]; - // 2-byte characters: 110xxxxx 10xxxxxx + if (slot.ctx_sampling->params.use_penalty_prompt_tokens && + result.tok != -1) { + // we can change penalty_prompt_tokens because it is always created from + // scratch each request + slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); + } + + // check if there is incomplete UTF-8 character at the end + bool incomplete = false; + for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) { + unsigned char c = slot.generated_text[slot.generated_text.size() - i]; + if ((c & 0xC0) == 0x80) { + // continuation byte: 10xxxxxx + continue; + } if ((c & 0xE0) == 0xC0) { - slot.multibyte_pending = 1; - // 3-byte characters: 1110xxxx 10xxxxxx 10xxxxxx + // 2-byte character: 110xxxxx ... + incomplete = i < 2; } else if ((c & 0xF0) == 0xE0) { - slot.multibyte_pending = 2; - // 4-byte characters: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // 3-byte character: 1110xxxx ... + incomplete = i < 3; } else if ((c & 0xF8) == 0xF0) { - slot.multibyte_pending = 3; - } else { - slot.multibyte_pending = 0; + // 4-byte character: 11110xxx ... 
+ incomplete = i < 4; } + // else 1-byte character or invalid byte + break; } - if (slot.multibyte_pending == 0) { + if (!incomplete) { size_t pos = std::min(slot.sent_count, slot.generated_text.size()); const std::string str_test = slot.generated_text.substr(pos); bool is_stop_full = false; @@ -955,7 +1032,7 @@ struct llama_server_context { } } - if (slot.multibyte_pending > 0 && !slot.has_next_token) { + if (incomplete) { slot.has_next_token = true; } @@ -993,8 +1070,8 @@ struct llama_server_context { if (!img.request_encode_image) { continue; } - clip_image_f32 img_res; - if (!clip_image_preprocess(clp_ctx, &img.img_data, &img_res, + clip_image_f32 *img_res = clip_image_f32_init(); + if (!clip_image_preprocess(clp_ctx, img.img_data, img_res, /*pad2square =*/true)) { LOG_TEE("Error processing the given image"); clip_free(clp_ctx); @@ -1008,24 +1085,51 @@ struct llama_server_context { return false; } LOG_TEE("slot %i - encoding image [id: %i]\n", slot.id, img.id); - if (!clip_image_encode(clp_ctx, params.n_threads, &img_res, + if (!clip_image_encode(clp_ctx, params.n_threads, img_res, img.image_embedding)) { LOG_TEE("Unable to encode image\n"); return false; } + clip_image_f32_free(img_res); img.request_encode_image = false; } return slot.images.size() > 0; } - void send_error(int id, std::string error) { - std::lock_guard lock(mutex_results); + void send_error(task_server &task, std::string error) { + std::unique_lock lock(mutex_results); task_result res; - res.id = id; + res.id = task.id; + res.multitask_id = task.multitask_id; + res.stop = false; res.error = true; res.result_json = {{"content", error}}; queue_results.push_back(res); + condition_results.notify_all(); + } + + void add_multi_task(int id, std::vector &sub_ids) { + std::lock_guard lock(mutex_tasks); + task_multi multi; + multi.id = id; + std::copy(sub_ids.begin(), sub_ids.end(), + std::inserter(multi.subtasks_remaining, + multi.subtasks_remaining.end())); + queue_multitasks.push_back(multi); + condition_tasks.notify_one(); + } + + void update_multi_task(int multitask_id, int subtask_id, + task_result &result) { + std::lock_guard lock(mutex_tasks); + for (auto &multitask : queue_multitasks) { + if (multitask.id == multitask_id) { + multitask.subtasks_remaining.erase(subtask_id); + multitask.results.push_back(result); + condition_tasks.notify_one(); + } + } } json get_model_props() { return get_formated_generation(slots[0]); } @@ -1039,15 +1143,18 @@ struct llama_server_context { {"n_ctx", slot.n_ctx}, {"model", params.model_alias}, {"seed", slot.params.seed}, - {"temp", slot.sparams.temp}, + {"temperature", slot.sparams.temp}, {"top_k", slot.sparams.top_k}, {"top_p", slot.sparams.top_p}, + {"min_p", slot.sparams.min_p}, {"tfs_z", slot.sparams.tfs_z}, {"typical_p", slot.sparams.typical_p}, {"repeat_last_n", slot.sparams.penalty_last_n}, {"repeat_penalty", slot.sparams.penalty_repeat}, {"presence_penalty", slot.sparams.penalty_present}, {"frequency_penalty", slot.sparams.penalty_freq}, + {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, + {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, {"mirostat", slot.sparams.mirostat}, {"mirostat_tau", slot.sparams.mirostat_tau}, {"mirostat_eta", slot.sparams.mirostat_eta}, @@ -1065,9 +1172,10 @@ struct llama_server_context { void send_partial_response(llama_client_slot &slot, completion_token_output tkn) { - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); task_result res; res.id = slot.task_id; + res.multitask_id = 
slot.multitask_id; res.error = false; res.stop = false; @@ -1095,13 +1203,20 @@ struct llama_server_context { probs_vector_to_json(ctx, probs_output); } + if (slot.oaicompat) { + res.result_json["oaicompat_token_ctr"] = slot.n_decoded; + res.result_json["model"] = slot.oaicompat_model; + } + queue_results.push_back(res); + condition_results.notify_all(); } void send_final_response(llama_client_slot &slot) { - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); task_result res; res.id = slot.task_id; + res.multitask_id = slot.multitask_id; res.error = false; res.stop = true; @@ -1139,13 +1254,25 @@ struct llama_server_context { probs_vector_to_json(ctx, probs); } + if (slot.oaicompat) { + res.result_json["oaicompat_token_ctr"] = slot.n_decoded; + res.result_json["model"] = slot.oaicompat_model; + } + + // parent multitask, if any, needs to be updated + if (slot.multitask_id != -1) { + update_multi_task(slot.multitask_id, slot.task_id, res); + } + queue_results.push_back(res); + condition_results.notify_all(); } void send_embedding(llama_client_slot &slot) { - std::lock_guard lock(mutex_results); + std::unique_lock lock(mutex_results); task_result res; res.id = slot.task_id; + res.multitask_id = slot.multitask_id; res.error = false; res.stop = true; @@ -1166,31 +1293,50 @@ struct llama_server_context { }; } queue_results.push_back(res); + condition_results.notify_all(); } - int request_completion(json data, bool infill, bool embedding) { - std::lock_guard lock(mutex_tasks); + int request_completion(json data, bool infill, bool embedding, + int multitask_id) { + std::unique_lock lock(mutex_tasks); task_server task; task.id = id_gen++; - task.data = data; + task.target_id = 0; + task.data = std::move(data); task.infill_mode = infill; task.embedding_mode = embedding; task.type = COMPLETION_TASK; + task.multitask_id = multitask_id; + + // when a completion task's prompt array is not a singleton, we split it + // into multiple requests + if (task.data.at("prompt").size() > 1) { + lock.unlock(); // entering new func scope + return split_multiprompt_task(task); + } + + // otherwise, it's a single-prompt task, we actually queue it queue_tasks.push_back(task); + condition_tasks.notify_one(); return task.id; } task_result next_result(int task_id) { while (true) { - std::this_thread::sleep_for(std::chrono::microseconds(5)); - std::lock_guard lock(mutex_results); - - if (queue_results.empty()) { - continue; - } + std::unique_lock lock(mutex_results); + condition_results.wait(lock, [&] { return !queue_results.empty(); }); for (int i = 0; i < (int)queue_results.size(); i++) { + // for now, tasks that have associated parent multitasks just get erased + // once multitask picks up the result + if (queue_results[i].multitask_id == task_id) { + update_multi_task(task_id, queue_results[i].id, queue_results[i]); + queue_results.erase(queue_results.begin() + i); + continue; + } + if (queue_results[i].id == task_id) { + assert(queue_results[i].multitask_id == -1); task_result res = queue_results[i]; queue_results.erase(queue_results.begin() + i); return res; @@ -1274,16 +1420,38 @@ struct llama_server_context { } void request_cancel(int task_id) { - std::lock_guard lock(mutex_tasks); + std::unique_lock lock(mutex_tasks); task_server task; task.id = id_gen++; task.type = CANCEL_TASK; task.target_id = task_id; queue_tasks.push_back(task); + condition_tasks.notify_one(); + } + + int split_multiprompt_task(task_server &multiprompt_task) { + int prompt_count = 
multiprompt_task.data.at("prompt").size(); + assert(prompt_count > 1); + + int multitask_id = id_gen++; + std::vector subtask_ids(prompt_count); + for (int i = 0; i < prompt_count; i++) { + json subtask_data = multiprompt_task.data; + subtask_data["prompt"] = subtask_data["prompt"][i]; + + // subtasks inherit everything else (infill mode, embedding mode, etc.) + subtask_ids[i] = + request_completion(subtask_data, multiprompt_task.infill_mode, + multiprompt_task.embedding_mode, multitask_id); + } + + // queue up the multitask so we can track its subtask progression + add_multi_task(multitask_id, subtask_ids); + return multitask_id; } void process_tasks() { - std::lock_guard lock(mutex_tasks); + std::unique_lock lock(mutex_tasks); while (!queue_tasks.empty()) { task_server task = queue_tasks.front(); queue_tasks.erase(queue_tasks.begin()); @@ -1294,7 +1462,7 @@ struct llama_server_context { if (slot == nullptr) { LOG_TEE("slot unavailable\n"); // send error result - send_error(task.id, "slot unavailable"); + send_error(task, "slot unavailable"); return; } @@ -1307,10 +1475,11 @@ struct llama_server_context { slot->infill = task.infill_mode; slot->embedding = task.embedding_mode; slot->task_id = task.id; + slot->multitask_id = task.multitask_id; if (!launch_slot_with_data(slot, task.data)) { // send error result - send_error(task.id, "internal_error"); + send_error(task, "internal_error"); break; } } break; @@ -1324,6 +1493,35 @@ struct llama_server_context { } break; } } + + // remove finished multitasks from the queue of multitasks, and add the + // corresponding result to the result queue + auto queue_iterator = queue_multitasks.begin(); + while (queue_iterator != queue_multitasks.end()) { + if (queue_iterator->subtasks_remaining.empty()) { + // all subtasks done == multitask is done + task_result aggregate_result; + aggregate_result.id = queue_iterator->id; + aggregate_result.stop = true; + aggregate_result.error = false; + + // collect json results into one json result + std::vector result_jsons; + for (auto &subres : queue_iterator->results) { + result_jsons.push_back(subres.result_json); + aggregate_result.error = aggregate_result.error && subres.error; + } + aggregate_result.result_json = json{"results", result_jsons}; + + std::lock_guard lock(mutex_results); + queue_results.push_back(aggregate_result); + condition_results.notify_all(); + + queue_iterator = queue_multitasks.erase(queue_iterator); + } else { + ++queue_iterator; + } + } } bool update_slots() { @@ -1344,8 +1542,8 @@ struct llama_server_context { "cache\n"); kv_cache_clear(); } - // avoid 100% usage of cpu all time - std::this_thread::sleep_for(std::chrono::milliseconds(5)); + std::unique_lock lock(mutex_tasks); + condition_tasks.wait(lock, [&] { return !queue_tasks.empty(); }); } for (llama_client_slot &slot : slots) { @@ -1466,54 +1664,55 @@ struct llama_server_context { } else { prompt_tokens = tokenize( slot.prompt, - system_prompt.empty()); // add BOS if there isn't system prompt + system_prompt.empty() && + add_bos_token); // add BOS if there isn't system prompt } slot.num_prompt_tokens = prompt_tokens.size(); + if (slot.params.n_keep < 0) { + slot.params.n_keep = slot.num_prompt_tokens; + } + slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); + + // if input prompt is too big, truncate it + if (slot.num_prompt_tokens >= slot.n_ctx) { + const int n_left = slot.n_ctx - slot.params.n_keep; + const int n_block_size = n_left / 2; + const int erased_blocks = + (slot.num_prompt_tokens - slot.params.n_keep 
- n_block_size) / + n_block_size; + + std::vector new_tokens(prompt_tokens.begin(), + prompt_tokens.begin() + + slot.params.n_keep); + new_tokens.insert(new_tokens.end(), + prompt_tokens.begin() + slot.params.n_keep + + erased_blocks * n_block_size, + prompt_tokens.end()); + + LOG_VERBOSE( + "input truncated", + { + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, + {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), + new_tokens.cend())}, + }); + slot.truncated = true; + prompt_tokens = new_tokens; + + slot.num_prompt_tokens = prompt_tokens.size(); + GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); + } + if (!slot.params.cache_prompt) { llama_sampling_reset(slot.ctx_sampling); slot.n_past = 0; slot.num_prompt_tokens_processed = slot.num_prompt_tokens; } else { - if (slot.params.n_keep < 0) { - slot.params.n_keep = slot.num_prompt_tokens; - } - slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - - // if input prompt is too big, truncate it - if (slot.num_prompt_tokens >= slot.n_ctx) { - const int n_left = slot.n_ctx - slot.params.n_keep; - const int n_block_size = n_left / 2; - const int erased_blocks = - (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / - n_block_size; - - std::vector new_tokens(prompt_tokens.begin(), - prompt_tokens.begin() + - slot.params.n_keep); - new_tokens.insert(new_tokens.end(), - prompt_tokens.begin() + slot.params.n_keep + - erased_blocks * n_block_size, - prompt_tokens.end()); - - LOG_VERBOSE( - "input truncated", - { - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), - new_tokens.cend())}, - }); - slot.truncated = true; - prompt_tokens = new_tokens; - - slot.num_prompt_tokens = prompt_tokens.size(); - GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); - } - // push the prompt into the sampling context (do not apply grammar) for (auto &token : prompt_tokens) { llama_sampling_accept(slot.ctx_sampling, ctx, token, false); @@ -1559,7 +1758,7 @@ struct llama_server_context { // process the prefix of first image std::vector prefix_tokens = - has_images ? tokenize(slot.images[0].prefix_prompt, true) + has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token) : prompt_tokens; for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) { llama_batch_add(batch, prefix_tokens[slot.n_past], @@ -1757,6 +1956,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --path PUBLIC_PATH path from which to serve static files " "(default %s)\n", sparams.public_path.c_str()); + printf(" --api-key API_KEY optional api key to enhance server security. 
" + "If set, requests must include this key for access.\n"); printf(" -to N, --timeout N server read/write timeout in seconds " "(default: %d)\n", sparams.read_timeout); @@ -1773,8 +1974,462 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, "prompt of all slots), this is useful for chat applications.\n"); printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for " "LLaVA.\n"); + printf(" --log-disable disables logging to a file.\n"); printf("\n"); } + +static void server_params_parse(int argc, char **argv, server_params &sparams, + gpt_params ¶ms, + llama_server_context &llama) { + gpt_params default_params; + server_params default_sparams; + std::string arg; + bool invalid_param = false; + + for (int i = 1; i < argc; i++) { + arg = argv[i]; + if (arg == "--port") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.port = std::stoi(argv[i]); + } else if (arg == "--host") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.hostname = argv[i]; + } else if (arg == "--path") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.public_path = argv[i]; + } else if (arg == "--api-key") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.api_key = argv[i]; + } else if (arg == "--timeout" || arg == "-to") { + if (++i >= argc) { + invalid_param = true; + break; + } + sparams.read_timeout = std::stoi(argv[i]); + sparams.write_timeout = std::stoi(argv[i]); + } else if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model = argv[i]; + } else if (arg == "-a" || arg == "--alias") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_alias = argv[i]; + } else if (arg == "-h" || arg == "--help") { + server_print_usage(argv[0], default_params, default_sparams); + exit(0); + } else if (arg == "-c" || arg == "--ctx-size" || arg == "--ctx_size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_ctx = std::stoi(argv[i]); + } else if (arg == "--rope-scaling") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string value(argv[i]); + /**/ if (value == "none") { + params.rope_scaling_type = LLAMA_ROPE_SCALING_NONE; + } else if (value == "linear") { + params.rope_scaling_type = LLAMA_ROPE_SCALING_LINEAR; + } else if (value == "yarn") { + params.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; + } else { + invalid_param = true; + break; + } + } else if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_base = std::stof(argv[i]); + } else if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.rope_freq_scale = std::stof(argv[i]); + } else if (arg == "--yarn-ext-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_ext_factor = std::stof(argv[i]); + } else if (arg == "--yarn-attn-factor") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_attn_factor = std::stof(argv[i]); + } else if (arg == "--yarn-beta-fast") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_fast = std::stof(argv[i]); + } else if (arg == "--yarn-beta-slow") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.yarn_beta_slow = std::stof(argv[i]); + } else if (arg == "--threads" || arg == "-t") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads = std::stoi(argv[i]); + } else if (arg == "--threads-batch" || arg == 
"-tb") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_threads_batch = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_batch = std::stoi(argv[i]); + params.n_batch = std::min(512, params.n_batch); + } else if (arg == "--gpu-layers" || arg == "-ngl" || + arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD + params.n_gpu_layers = std::stoi(argv[i]); +#else + LOG_WARNING_LLAMA( + "Not compiled with GPU offload support, --n-gpu-layers option will " + "be ignored. " + "See main README.md for information on enabling GPU BLAS support", + {{"n_gpu_layers", params.n_gpu_layers}}); +#endif + } else if (arg == "--tensor-split" || arg == "-ts") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef GGML_USE_CUBLAS + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{R"([,/]+)"}; + std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, + -1}; + std::vector split_arg{it, {}}; + GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); + + for (size_t i_device = 0; i_device < LLAMA_MAX_DEVICES; ++i_device) { + if (i_device < split_arg.size()) { + params.tensor_split[i_device] = std::stof(split_arg[i_device]); + } else { + params.tensor_split[i_device] = 0.0f; + } + } +#else + LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. It is not " + "possible to set a tensor split.\n", + {}); +#endif // GGML_USE_CUBLAS + } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { +#ifdef GGML_USE_CUBLAS + params.mul_mat_q = false; +#else + LOG_WARNING_LLAMA("warning: llama.cpp was compiled without cuBLAS. " + "Disabling mul_mat_q kernels has no effect.\n", + {}); +#endif // GGML_USE_CUBLAS + } else if (arg == "--main-gpu" || arg == "-mg") { + if (++i >= argc) { + invalid_param = true; + break; + } +#ifdef GGML_USE_CUBLAS + params.main_gpu = std::stoi(argv[i]); +#else + LOG_WARNING_LLAMA("llama.cpp was compiled without cuBLAS. 
It is not " + "possible to set a main GPU.", + {}); +#endif + } else if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter.push_back(std::make_tuple(argv[i], 1.0f)); + params.use_mmap = false; + } else if (arg == "--lora-scaled") { + if (++i >= argc) { + invalid_param = true; + break; + } + const char *lora_adapter = argv[i]; + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_adapter.push_back( + std::make_tuple(lora_adapter, std::stof(argv[i]))); + params.use_mmap = false; + } else if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.lora_base = argv[i]; + } else if (arg == "-v" || arg == "--verbose") { +#if SERVER_VERBOSE != 1 + LOG_WARNING_LLAMA("server.cpp is not built with verbose logging.", {}); +#else + server_verbose = true; +#endif + } else if (arg == "--mlock") { + params.use_mlock = true; + } else if (arg == "--no-mmap") { + params.use_mmap = false; + } else if (arg == "--numa") { + params.numa = true; + } else if (arg == "--embedding") { + params.embedding = true; + } else if (arg == "-cb" || arg == "--cont-batching") { + params.cont_batching = true; + } else if (arg == "-np" || arg == "--parallel") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_parallel = std::stoi(argv[i]); + } else if (arg == "-n" || arg == "--n-predict") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_predict = std::stoi(argv[i]); + } else if (arg == "-spf" || arg == "--system-prompt-file") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + std::string systm_content; + std::copy(std::istreambuf_iterator(file), + std::istreambuf_iterator(), + std::back_inserter(systm_content)); + llama.process_system_prompt_data(json::parse(systm_content)); + } else if (arg == "--mmproj") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.mmproj = argv[i]; + } else if (arg == "--log-disable") { + log_set_target(stdout); + LOG_INFO_LLAMA("logging to file is disabled.", {}); + } else { + fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); + server_print_usage(argv[0], default_params, default_sparams); + exit(1); + } + } + + if (invalid_param) { + fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); + server_print_usage(argv[0], default_params, default_sparams); + exit(1); + } +} + +static std::string random_string() { + static const std::string str( + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + std::random_device rd; + std::mt19937 generator(rd()); + + std::string result(32, ' '); + + for (int i = 0; i < 32; ++i) { + result[i] = str[generator() % str.size()]; + } + + return result; +} + +static std::string gen_chatcmplid() { + std::stringstream chatcmplid; + chatcmplid << "chatcmpl-" << random_string(); + return chatcmplid.str(); +} +static json format_final_response_oaicompat(const json &request, + const task_result &response, + bool streaming = false) { + json result = response.result_json; + + bool stopped_word = result.count("stopped_word") != 0; + bool stopped_eos = json_value(result, "stopped_eos", false); + int num_tokens_predicted = json_value(result, "tokens_predicted", 0); + int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); + std::string content = json_value(result, "content", std::string("")); + + std::string 
finish_reason = "length"; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + + json choices = + streaming ? json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}) + : json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"message", json{{"content", content}, + {"role", "assistant"}}}}}); + + std::time_t t = std::time(0); + + json res = + json{{"choices", choices}, + {"created", t}, + {"model", + json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, + {"object", streaming ? "chat.completion.chunk" : "chat.completion"}, + {"usage", + json{{"completion_tokens", num_tokens_predicted}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, + {"id", gen_chatcmplid()}}; + + if (server_verbose) { + res["__verbose"] = result; + } + + if (result.contains("completion_probabilities")) { + res["completion_probabilities"] = + json_value(result, "completion_probabilities", json::array()); + } + + return res; +} + +// return value is vector as there is one case where we might need to generate +// two responses +static std::vector +format_partial_response_oaicompat(const task_result &response) { + json result = response.result_json; + + if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { + return std::vector({response.result_json}); + } + + bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; + std::string modelname = + json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); + + bool stopped_word = json_value(result, "stopped_word", false); + bool stopped_eos = json_value(result, "stopped_eos", false); + bool stopped_limit = json_value(result, "stopped_limit", false); + std::string content = json_value(result, "content", std::string("")); + + std::string finish_reason; + if (stopped_word || stopped_eos) { + finish_reason = "stop"; + } + if (stopped_limit) { + finish_reason = "length"; + } + + std::time_t t = std::time(0); + + json choices; + + if (!finish_reason.empty()) { + choices = json::array({json{{"finish_reason", finish_reason}, + {"index", 0}, + {"delta", json::object()}}}); + } else { + if (first) { + if (content.empty()) { + choices = json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}}); + } else { + // We have to send this as two updates to conform to openai behavior + json initial_ret = + json{{"choices", + json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"role", "assistant"}}}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + json second_ret = + json{{"choices", + json::array({json{{"finish_reason", nullptr}, + {"index", 0}, + {"delta", json{{"content", content}}}}})}, + {"created", t}, + {"id", gen_chatcmplid()}, + {"model", modelname}, + {"object", "chat.completion.chunk"}}; + + return std::vector({initial_ret, second_ret}); + } + } else { + // Some idiosyncrasy in task processing logic makes several trailing calls + // with empty content, we ignore these at the calee site. 
+      if (content.empty()) {
+        return std::vector({json::object()});
+      }
+
+      choices = json::array({json{
+          {"finish_reason", nullptr},
+          {"index", 0},
+          {"delta",
+           json{
+               {"content", content},
+           }},
+      }});
+    }
+  }
+
+  json ret = json{{"choices", choices},
+                  {"created", t},
+                  {"id", gen_chatcmplid()},
+                  {"model", modelname},
+                  {"object", "chat.completion.chunk"}};
+
+  return std::vector({ret});
+}
+
 static json format_partial_response(llama_server_context &llama,
                                     llama_client_slot *slot,
                                     const std::string &content,

From a995c93d0be6494f6c6e36f115a73037a54f6987 Mon Sep 17 00:00:00 2001
From: Alan Dao
Date: Tue, 2 Jan 2024 17:15:54 +0700
Subject: [PATCH 3/6] redundant includes

---
 controllers/llamaCPP.cc | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 57770849d..957556beb 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -1,14 +1,7 @@
 #include "llamaCPP.h"
 #include "llama.h"
 #include "utils/nitro_utils.h"
-#include
-#include
-#include
-#include
-#include
-#include
 #include
-#include
 using namespace inferences;
 using json = nlohmann::json;

From 1a574d2d300f369f152780fb4614a0a29054ad36 Mon Sep 17 00:00:00 2001
From: Alan Dao
Date: Tue, 2 Jan 2024 17:18:04 +0700
Subject: [PATCH 4/6] remove redundant includes

---
 controllers/llamaCPP.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index 957556beb..91c7c3a3d 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -1,7 +1,6 @@
 #include "llamaCPP.h"
 #include "llama.h"
 #include "utils/nitro_utils.h"
-#include
 using namespace inferences;
 using json = nlohmann::json;

From af08bec556085256f4039d86f57ac1bf68ea6f83 Mon Sep 17 00:00:00 2001
From: Alan Dao
Date: Tue, 2 Jan 2024 17:18:31 +0700
Subject: [PATCH 5/6] remove redundant includes

---
 controllers/llamaCPP.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 947a622c2..3d7334b42 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -8,12 +8,7 @@
 #include "log.h"
 #include "utils/nitro_utils.h"
-#include
 #include
-#include
-#include
-#include
-#include

 // External
 #include "common.h"

From 797b251fa4085d561fe0228da3ab8791722d57a2 Mon Sep 17 00:00:00 2001
From: Alan Dao
Date: Tue, 2 Jan 2024 17:19:44 +0700
Subject: [PATCH 6/6] add back fstream

---
 controllers/llamaCPP.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h
index 3d7334b42..e4152ed85 100644
--- a/controllers/llamaCPP.h
+++ b/controllers/llamaCPP.h
@@ -9,6 +9,7 @@
 #include "log.h"
 #include "utils/nitro_utils.h"
 #include
+#include

 // External
 #include "common.h"
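
For reference, a minimal sketch of how a caller drives the updated API from PATCH 2/6, where request_completion() gained a fourth multitask_id argument and next_result() now blocks on a condition variable instead of polling. This is illustrative only and not part of the patches above; the helper name run_blocking_completion and the payload values are assumptions, while the llama_server_context calls mirror the updated warmupModel() in controllers/llamaCPP.cc.

// Illustrative sketch, not part of the patch series. Assumes the llamaCPP.h
// from PATCH 2/6 is available; helper name and payload values are made up.
#include "llamaCPP.h"  // llama_server_context, task_result, nlohmann json alias

static json run_blocking_completion(llama_server_context &llama,
                                    const std::string &prompt) {
  json data;
  data["prompt"] = prompt;   // a single string prompt is never split
  data["n_predict"] = 16;
  data["stream"] = false;

  // New 4th argument: the parent multitask id; -1 means a standalone task.
  // If "prompt" were a JSON array with several elements, request_completion()
  // would instead split it via split_multiprompt_task() and return a
  // multitask id whose final result aggregates the per-prompt results.
  const int task_id =
      llama.request_completion(data, /*infill=*/false, /*embedding=*/false,
                               /*multitask_id=*/-1);

  // next_result() now waits on condition_results rather than spin-polling.
  task_result result = llama.next_result(task_id);
  return result.error ? json{{"error", result.result_json}}
                      : result.result_json;
}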