From ed3e23f01f66bdeb4f4d88a87ae8ef05e159ed8d Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Tue, 30 Apr 2024 16:18:51 +0700 Subject: [PATCH 01/10] fix: struct cortex-cpp --- .gitmodules | 6 ++++++ cortex-cpp/.gitmodules | 7 ------- cortex-cpp/llama.cpp | 1 + cortex-cpp/whisper.cpp | 1 + 4 files changed, 8 insertions(+), 7 deletions(-) create mode 100644 .gitmodules delete mode 100644 cortex-cpp/.gitmodules create mode 160000 cortex-cpp/llama.cpp create mode 160000 cortex-cpp/whisper.cpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..ea95ac179 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "cortex-cpp/llama.cpp"] + path = cortex-cpp/llama.cpp + url = https://github.com/ggerganov/llama.cpp +[submodule "cortex-cpp/whisper.cpp"] + path = cortex-cpp/whisper.cpp + url = https://github.com/ggerganov/whisper.cpp.git diff --git a/cortex-cpp/.gitmodules b/cortex-cpp/.gitmodules deleted file mode 100644 index e2f71d456..000000000 --- a/cortex-cpp/.gitmodules +++ /dev/null @@ -1,7 +0,0 @@ -[submodule "llama.cpp"] - path = llama.cpp - url = https://github.com/ggerganov/llama.cpp - branch = master -[submodule "whisper.cpp"] - path = whisper.cpp - url = https://github.com/ggerganov/whisper.cpp.git diff --git a/cortex-cpp/llama.cpp b/cortex-cpp/llama.cpp new file mode 160000 index 000000000..8a56075b0 --- /dev/null +++ b/cortex-cpp/llama.cpp @@ -0,0 +1 @@ +Subproject commit 8a56075b07a8b571bf95a912ffdce4c928c2b414 diff --git a/cortex-cpp/whisper.cpp b/cortex-cpp/whisper.cpp new file mode 160000 index 000000000..9fab28135 --- /dev/null +++ b/cortex-cpp/whisper.cpp @@ -0,0 +1 @@ +Subproject commit 9fab28135c7867bb7eccd9ebcd2ea8d52e42ca81 From 45aabe388adc44c1b6723f8745db910caa662588 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Tue, 30 Apr 2024 16:47:11 +0700 Subject: [PATCH 02/10] fix: split llama engine --- .gitmodules | 3 - cortex-cpp/CMakeLists.txt | 11 +- cortex-cpp/common/base.h | 3 +- cortex-cpp/context/llama_server_context.h | 2260 ----------------- cortex-cpp/controllers/llamaCPP.cc | 809 +----- cortex-cpp/controllers/llamaCPP.h | 95 +- cortex-cpp/cortex-common/EngineI.h | 27 + .../engines/cortex.llamacpp/engine.cmake | 21 + cortex-cpp/llama.cpp | 1 - cortex-cpp/models/chat_completion_request.h | 36 - cortex-cpp/nitro_deps/CMakeLists.txt | 11 +- cortex-cpp/test/components/test_models.cc | 53 - cortex-cpp/utils/dylib.h | 316 +++ 13 files changed, 532 insertions(+), 3114 deletions(-) delete mode 100644 cortex-cpp/context/llama_server_context.h create mode 100644 cortex-cpp/cortex-common/EngineI.h create mode 100644 cortex-cpp/engines/cortex.llamacpp/engine.cmake delete mode 160000 cortex-cpp/llama.cpp delete mode 100644 cortex-cpp/models/chat_completion_request.h delete mode 100644 cortex-cpp/test/components/test_models.cc create mode 100644 cortex-cpp/utils/dylib.h diff --git a/.gitmodules b/.gitmodules index ea95ac179..65a042814 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "cortex-cpp/llama.cpp"] - path = cortex-cpp/llama.cpp - url = https://github.com/ggerganov/llama.cpp [submodule "cortex-cpp/whisper.cpp"] path = cortex-cpp/whisper.cpp url = https://github.com/ggerganov/whisper.cpp.git diff --git a/cortex-cpp/CMakeLists.txt b/cortex-cpp/CMakeLists.txt index eba4fee0c..9be937dc4 100644 --- a/cortex-cpp/CMakeLists.txt +++ b/cortex-cpp/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.5) project(nitro C CXX) +# include(engines/cortex.llamacpp/engine.cmake) include(CheckIncludeFileCXX) 
check_include_file_cxx(any HAS_ANY) @@ -55,8 +56,6 @@ endif() add_compile_definitions(NITRO_VERSION="${NITRO_VERSION}") -add_subdirectory(llama.cpp/examples/llava) -add_subdirectory(llama.cpp) add_subdirectory(whisper.cpp) add_subdirectory(test) @@ -68,8 +67,9 @@ add_executable(${PROJECT_NAME} main.cc) # target_link_libraries(${PROJECT_NAME} PRIVATE nitro_deps) # # and comment out the following lines + find_package(Drogon CONFIG REQUIRED) -target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon common llama whisper llava +target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon whisper ${CMAKE_THREAD_LIBS_INIT}) # ############################################################################## @@ -89,6 +89,7 @@ aux_source_directory(controllers CTL_SRC) aux_source_directory(common COMMON_SRC) aux_source_directory(context CONTEXT_SRC) aux_source_directory(models MODEL_SRC) +aux_source_directory(cortex-common CORTEX_COMMON) # aux_source_directory(filters FILTER_SRC) aux_source_directory(plugins # PLUGIN_SRC) @@ -97,10 +98,10 @@ aux_source_directory(models MODEL_SRC) # namespaces. drogon_create_views(${PROJECT_NAME} # ${CMAKE_CURRENT_SOURCE_DIR}/views ${CMAKE_CURRENT_BINARY_DIR} TRUE) -target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ) # ${CMAKE_CURRENT_SOURCE_DIR}/models) target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC}) # ${FILTER_SRC} ${PLUGIN_SRC} ${MODEL_SRC}) # ############################################################################## # uncomment the following line for dynamically loading views set_property(TARGET -# ${PROJECT_NAME} PROPERTY ENABLE_EXPORTS ON) +# ${PROJECT_NAME} PROPERTY ENABLE_EXPORTS ON) \ No newline at end of file diff --git a/cortex-cpp/common/base.h b/cortex-cpp/common/base.h index e87d07488..adfe42a19 100644 --- a/cortex-cpp/common/base.h +++ b/cortex-cpp/common/base.h @@ -1,6 +1,5 @@ #pragma once #include -#include using namespace drogon; @@ -25,7 +24,7 @@ class BaseChatCompletion { // General chat method virtual void ChatCompletion( - inferences::ChatCompletionRequest &&completion, + const HttpRequestPtr& req, std::function&& callback) = 0; }; diff --git a/cortex-cpp/context/llama_server_context.h b/cortex-cpp/context/llama_server_context.h deleted file mode 100644 index 21792f11b..000000000 --- a/cortex-cpp/context/llama_server_context.h +++ /dev/null @@ -1,2260 +0,0 @@ -#include -#include -#include -#include - -// External -#include "clip.h" -#include "common.h" -#include "llama.h" -#include "llava.h" -#include "stb_image.h" -#include "utils/json.hpp" - -#if defined(_WIN32) -#define NOMINMAX -#endif - -using json = nlohmann::json; - -#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" - -struct server_params { - std::string hostname = "127.0.0.1"; - std::string api_key; - std::string public_path = "examples/server/public"; - int32_t port = 8080; - int32_t read_timeout = 600; - int32_t write_timeout = 600; -}; - -static bool server_verbose = false; - -#if SERVER_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) \ - do { \ - if (server_verbose) { \ - server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ - } \ - } while (0) -#endif - -#define LOG_ERROR_LLAMA(MSG, ...) \ - server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING_LLAMA(MSG, ...) \ - server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO_LLAMA(MSG, ...) 
\ - server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) - -// -// base64 utils (TODO: move to common in the future) -// - -static const std::string base64_chars = - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; - -static inline bool is_base64(uint8_t c) { - return (isalnum(c) || (c == '+') || (c == '/')); -} - -static std::vector base64_decode(const std::string& encoded_string) { - int i = 0; - int j = 0; - int in_ = 0; - - int in_len = encoded_string.size(); - - uint8_t char_array_4[4]; - uint8_t char_array_3[3]; - - std::vector ret; - - while (in_len-- && (encoded_string[in_] != '=') && - is_base64(encoded_string[in_])) { - char_array_4[i++] = encoded_string[in_]; - in_++; - if (i == 4) { - for (i = 0; i < 4; i++) { - char_array_4[i] = base64_chars.find(char_array_4[i]); - } - - char_array_3[0] = - ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = - ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (i = 0; (i < 3); i++) { - ret.push_back(char_array_3[i]); - } - i = 0; - } - } - - if (i) { - for (j = i; j < 4; j++) { - char_array_4[j] = 0; - } - - for (j = 0; j < 4; j++) { - char_array_4[j] = base64_chars.find(char_array_4[j]); - } - - char_array_3[0] = - ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = - ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (j = 0; (j < i - 1); j++) { - ret.push_back(char_array_3[j]); - } - } - - return ret; -} - -// -// parallel -// - -enum task_type { COMPLETION_TASK, CANCEL_TASK }; - -struct task_server { - int id; - int target_id; - task_type type; - json data; - bool infill_mode = false; - bool embedding_mode = false; - int multitask_id = -1; -}; - -struct task_result { - int id; - int multitask_id = -1; - bool stop; - bool error; - json result_json; -}; - -struct task_multi { - int id; - std::set subtasks_remaining{}; - std::vector results{}; -}; - -// TODO: can become bool if we can't find use of more states -enum slot_state { - IDLE, - PROCESSING, -}; - -enum slot_command { - NONE, - LOAD_PROMPT, - RELEASE, -}; - -struct slot_params { - bool stream = true; - bool cache_prompt = - false; // remember the prompt to avoid reprocessing all prompt - - uint32_t seed = -1; // RNG seed - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_predict = -1; // new tokens to predict - - std::vector antiprompt; - - json input_prefix; - json input_suffix; -}; - -struct slot_image { - int32_t id; - - bool request_encode_image = false; - float* image_embedding = nullptr; - int32_t image_tokens = 0; - - clip_image_u8* img_data; - - std::string prefix_prompt; // before of this image -}; - -// completion token output with probabilities -struct completion_token_output { - struct token_prob { - llama_token tok; - float prob; - }; - - std::vector probs; - llama_token tok; - std::string text_to_send; -}; - -static size_t common_part(const std::vector& a, - const std::vector& b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} - return i; -} - -enum stop_type { - STOP_FULL, - STOP_PARTIAL, -}; - -enum class ModelType { LLM = 0, EMBEDDING }; - -static bool ends_with(const std::string& str, const std::string& suffix) { - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - 
-static size_t find_partial_stop_string(const std::string& stop, - const std::string& text) { - if (!text.empty() && !stop.empty()) { - const char text_last_char = text.back(); - for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { - if (stop[char_index] == text_last_char) { - const std::string current_partial = stop.substr(0, char_index + 1); - if (ends_with(text, current_partial)) { - return text.size() - char_index - 1; - } - } - } - } - return std::string::npos; -} - -// TODO: reuse llama_detokenize -template -static std::string tokens_to_str(llama_context* ctx, Iter begin, Iter end) { - std::string ret; - for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); - } - return ret; -} - -static void server_log(const char* level, const char* function, int line, - const char* message, - const nlohmann::ordered_json& extra) { - nlohmann::ordered_json log{ - {"timestamp", time(nullptr)}, {"level", level}, - {"function", function}, {"line", line}, - {"message", message}, - }; - - if (!extra.empty()) { - log.merge_patch(extra); - } - - const std::string str = - log.dump(-1, ' ', false, json::error_handler_t::replace); - printf("%.*s\n", (int)str.size(), str.data()); - fflush(stdout); -} - -// format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context* ctx, - const llama_token token) { - std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); - // if the size is 1 and first bit is 1, meaning it's a partial character - // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) { - std::stringstream ss; - ss << std::hex << (out[0] & 0xff); - std::string res(ss.str()); - out = "byte: \\x" + res; - } - return out; -} - -// convert a vector of completion_token_output to json -static json probs_vector_to_json( - const llama_context* ctx, - const std::vector& probs) { - json out = json::array(); - for (const auto& prob : probs) { - json probs_for_token = json::array(); - for (const auto& p : prob.probs) { - std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json{ - {"tok_str", tok_str}, - {"prob", p.prob}, - }); - } - std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json{ - {"content", tok_str}, - {"probs", probs_for_token}, - }); - } - return out; -} - -template -static T json_value(const json& body, const std::string& key, - const T& default_value) { - // Fallback null to default value - return body.contains(key) && !body.at(key).is_null() - ? 
body.value(key, default_value) - : default_value; -} - -struct llama_client_slot { - int id; - int task_id = -1; - - struct slot_params params; - - slot_state state = IDLE; - slot_command command = NONE; - - // used to determine the slot that has been used the longest - int64_t t_last_used = -1; - - // generation props - int32_t n_ctx = 0; // context size per slot - int32_t n_past = 0; - int32_t n_decoded = 0; - int32_t n_remaining = -1; - int32_t i_batch = -1; - - int32_t num_prompt_tokens = 0; - int32_t num_prompt_tokens_processed = 0; - - json prompt; - std::string generated_text; - llama_token sampled; - std::vector cache_tokens; - std::vector generated_token_probs; - - bool infill = false; - bool embedding = false; - bool has_next_token = true; - bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; - - bool oaicompat = false; - std::string oaicompat_model; - - std::string stopping_word; - - // sampling - struct llama_sampling_params sparams; - llama_sampling_context* ctx_sampling = nullptr; - - // multimodal - std::vector images; - - // stats - size_t sent_count = 0; - size_t sent_token_probs_index = 0; - - int64_t t_start_process_prompt; - int64_t t_start_genereration; - - double t_prompt_processing; // ms - double t_token_generation; // ms - - // multitasks - int multitask_id = -1; - - void reset() { - num_prompt_tokens = 0; - generated_text = ""; - truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; - stopping_word = ""; - n_past = 0; - sent_count = 0; - sent_token_probs_index = 0; - infill = false; - - generated_token_probs.clear(); - - for (slot_image& img : images) { - free(img.image_embedding); - if (img.img_data) { - clip_image_u8_free(img.img_data); - } - img.prefix_prompt = ""; - } - - images.clear(); - } - - bool has_budget(gpt_params& global_params) { - n_remaining = -1; - if (params.n_predict != -1) { - n_remaining = params.n_predict - n_decoded; - } else if (global_params.n_predict != -1) { - n_remaining = global_params.n_predict - n_decoded; - } - return n_remaining > 0 || n_remaining == -1; // no budget || limitless - } - - bool available() const { return state == IDLE && command == NONE; } - - bool is_processing() const { - return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; - } - - void add_token_string(const completion_token_output& token) { - if (command == RELEASE) { - return; - } - cache_tokens.push_back(token.tok); - generated_token_probs.push_back(token); - } - - void release() { - if (state == IDLE || state == PROCESSING) { - t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; - command = RELEASE; - } - } - - json get_formated_timings() { - return json{ - {"prompt_n", num_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", - t_prompt_processing / num_prompt_tokens_processed}, - {"prompt_per_second", - 1e3 / t_prompt_processing * num_prompt_tokens_processed}, - - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, - {"predicted_per_token_ms", t_token_generation / n_decoded}, - {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, - }; - } - - void print_timings() const { - LOG_DEBUG << __func__ << ": prompt eval time = " << t_prompt_processing - << "ms / " << num_prompt_tokens_processed << " tokens (" - << t_prompt_processing / num_prompt_tokens_processed - << " ms per " - "token, " - << 1e3 / t_prompt_processing * num_prompt_tokens_processed - << " tokens per 
second)"; - LOG_DEBUG << __func__ << ": eval time = " << t_token_generation - << " ms / " << n_decoded << " runs (" - << t_token_generation / n_decoded - << " ms per " - "token, " - << 1e3 / t_token_generation * n_decoded - << " tokens per second)\n"; - LOG_DEBUG << __func__ << ": total time = " - << t_prompt_processing + t_token_generation << " ms"; - } -}; - -struct llama_server_context { - llama_model* model = nullptr; - llama_context* ctx = nullptr; - - clip_ctx* clp_ctx = nullptr; - - gpt_params params; - - llama_batch batch; - - bool multimodal = false; - bool clean_kv_cache = true; - bool all_slots_are_idle = false; - bool add_bos_token = true; - - int32_t id_gen; - int32_t n_ctx; // total context for all clients / slots - - // Internal - std::atomic model_loaded_external = false; - - // system prompt - bool system_need_update = false; - - std::string system_prompt; - std::vector system_tokens; - - std::string name_user; // this should be the antiprompt - std::string name_assistant; - - // slots / clients - std::vector slots; - - std::vector queue_tasks; - std::vector queue_results; - std::vector queue_multitasks; - std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks - std::condition_variable condition_tasks; - std::mutex mutex_results; - std::condition_variable condition_results; - ModelType model_type = ModelType::LLM; - - ~llama_server_context() { - if (ctx) { - llama_free(ctx); - ctx = nullptr; - } - if (model) { - llama_free_model(model); - model = nullptr; - } - } - - bool load_model(const gpt_params& params_) { - params = params_; - if (!params.mmproj.empty()) { - multimodal = true; - LOG_DEBUG << "Multi Modal Mode Enabled"; - clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/1); - if (clp_ctx == nullptr) { - LOG_ERROR_LLAMA("unable to load clip model", - {{"model", params.mmproj}}); - return false; - } - - if (params.n_ctx < - 2048) { // request larger context for the image embedding - params.n_ctx = 2048; - } - } - - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == nullptr) { - LOG_ERROR_LLAMA("llama.cpp unable to load model", - {{"model", params.model}}); - return false; - } - - if (multimodal) { - const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); - const int n_embd_llm = llama_n_embd(model); - if (n_embd_clip != n_embd_llm) { - LOG_DEBUG << __func__ << ": embedding dim of the multimodal projector (" - << n_embd_clip - << ") is not " - "equal to that of LLaMA (" - << n_embd_llm - << "). 
Make sure that you use the " - "correct mmproj file."; - llama_free(ctx); - llama_free_model(model); - return false; - } - } - - if (ctx == nullptr) { - LOG_ERROR_LLAMA("Unable to get llama.cpp context", {}); - return false; - } - n_ctx = llama_n_ctx(ctx); - - add_bos_token = llama_should_add_bos_token(model); - - return true; - } - - void initialize() { - id_gen = 0; - - // create slots - all_slots_are_idle = true; - - const int32_t n_ctx_slot = n_ctx / params.n_parallel; - - LOG_DEBUG << "Available slots: "; - for (int i = 0; i < params.n_parallel; i++) { - llama_client_slot slot; - - slot.id = i; - slot.n_ctx = n_ctx_slot; - slot.reset(); - - LOG_DEBUG << " -> Slot " << slot.id << " - max context: " << n_ctx_slot; - slots.push_back(slot); - } - - try { - batch = llama_batch_init(n_ctx, 0, params.n_parallel); - } catch (const std::exception& e) { - LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata", - {{"exception", e.what()}, - {"n_tokens_alloc", n_ctx}, - {"embd", 0}, - {"n_seq_max", params.n_parallel}}); - } - - // empty system prompt - system_prompt = ""; - system_tokens.clear(); - } - - std::vector tokenize(const json& json_prompt, - bool add_bos) const { - // TODO: currently, we tokenize using special tokens by default - // this is not always correct (see - // https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) - // but it's better compared to completely ignoring ChatML and other - // chat templates - const bool TMP_FORCE_SPECIAL = true; - - // If `add_bos` is true, we only add BOS, when json_prompt is a string, - // or the first element of the json_prompt array is a string. - std::vector prompt_tokens; - - if (json_prompt.is_array()) { - bool first = true; - for (const auto& p : json_prompt) { - if (p.is_string()) { - auto s = p.template get(); - std::vector p; - if (first) { - p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); - first = false; - } else { - p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); - } - prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); - } else { - if (first) { - first = false; - } - prompt_tokens.push_back(p.template get()); - } - } - } else { - auto s = json_prompt.template get(); - prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); - } - - return prompt_tokens; - } - - llama_client_slot* get_slot(int id) { - int64_t t_last = ggml_time_us(); - llama_client_slot* last_used = nullptr; - - for (llama_client_slot& slot : slots) { - if (slot.id == id && slot.available()) { - return &slot; - } - - if (slot.available() && slot.t_last_used < t_last) { - last_used = &slot; - t_last = slot.t_last_used; - } - } - - return last_used; - } - - bool launch_slot_with_data(llama_client_slot*& slot, json data) { - slot_params default_params; - llama_sampling_params default_sparams; - - if (data.count("__oaicompat") != 0) { - slot->oaicompat = true; - slot->oaicompat_model = - json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - } else { - slot->oaicompat = false; - slot->oaicompat_model = ""; - } - - slot->params.stream = json_value(data, "stream", false); - slot->params.cache_prompt = json_value(data, "cache_prompt", false); - slot->params.n_predict = - json_value(data, "n_predict", default_params.n_predict); - slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); - slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); - slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); - slot->sparams.tfs_z = json_value(data, "tfs_z", 
default_sparams.tfs_z); - slot->sparams.typical_p = - json_value(data, "typical_p", default_sparams.typical_p); - slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot->sparams.penalty_last_n = - json_value(data, "repeat_last_n", default_sparams.penalty_last_n); - slot->sparams.penalty_repeat = - json_value(data, "repeat_penalty", default_sparams.penalty_repeat); - slot->sparams.penalty_freq = - json_value(data, "frequency_penalty", default_sparams.penalty_freq); - slot->sparams.penalty_present = - json_value(data, "presence_penalty", default_sparams.penalty_present); - slot->sparams.mirostat = - json_value(data, "mirostat", default_sparams.mirostat); - slot->sparams.mirostat_tau = - json_value(data, "mirostat_tau", default_sparams.mirostat_tau); - slot->sparams.mirostat_eta = - json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot->sparams.penalize_nl = - json_value(data, "penalize_nl", default_sparams.penalize_nl); - slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); - slot->sparams.grammar = - json_value(data, "grammar", default_sparams.grammar); - slot->sparams.n_probs = - json_value(data, "n_probs", default_sparams.n_probs); - - // infill - if (data.count("input_prefix") != 0) { - slot->params.input_prefix = data["input_prefix"]; - } else { - slot->params.input_prefix = ""; - } - - if (data.count("input_suffix") != 0) { - slot->params.input_suffix = data["input_suffix"]; - } else { - slot->params.input_suffix = ""; - } - - if (data.count("prompt") != 0) { - slot->prompt = data["prompt"]; - } else { - slot->prompt = ""; - } - - slot->sparams.penalty_prompt_tokens.clear(); - slot->sparams.use_penalty_prompt_tokens = false; - const auto& penalty_prompt = data.find("penalty_prompt"); - if (penalty_prompt != data.end()) { - if (penalty_prompt->is_string()) { - const auto penalty_prompt_string = penalty_prompt->get(); - auto penalty_tokens = - llama_tokenize(model, penalty_prompt_string, false); - slot->sparams.penalty_prompt_tokens.swap(penalty_tokens); - if (slot->params.n_predict > 0) { - slot->sparams.penalty_prompt_tokens.reserve( - slot->sparams.penalty_prompt_tokens.size() + - slot->params.n_predict); - } - slot->sparams.use_penalty_prompt_tokens = true; - } else if (penalty_prompt->is_array()) { - const auto n_tokens = penalty_prompt->size(); - slot->sparams.penalty_prompt_tokens.reserve( - n_tokens + std::max(0, slot->params.n_predict)); - const int n_vocab = llama_n_vocab(model); - for (const auto& penalty_token : *penalty_prompt) { - if (penalty_token.is_number_integer()) { - const auto tok = penalty_token.get(); - if (tok >= 0 && tok < n_vocab) { - slot->sparams.penalty_prompt_tokens.push_back(tok); - } - } - } - slot->sparams.use_penalty_prompt_tokens = true; - } - } - - slot->sparams.logit_bias.clear(); - - if (json_value(data, "ignore_eos", false)) { - slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; - } - - const auto& logit_bias = data.find("logit_bias"); - if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_n_vocab(model); - for (const auto& el : *logit_bias) { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - if (el[1].is_number()) { - slot->sparams.logit_bias[tok] = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - slot->sparams.logit_bias[tok] = -INFINITY; - } - } - } - } - } - - 
slot->params.antiprompt.clear(); - - const auto& stop = data.find("stop"); - if (stop != data.end() && stop->is_array()) { - for (const auto& word : *stop) { - if (!word.empty()) { - slot->params.antiprompt.push_back(word); - } - } - } - - if (multimodal) { - const auto& images_data = data.find("image_data"); - if (images_data != data.end() && images_data->is_array()) { - for (const auto& img : *images_data) { - const std::vector image_buffer = - base64_decode(img["data"].get()); - - slot_image img_sl; - img_sl.id = - img.count("id") != 0 ? img["id"].get() : slot->images.size(); - img_sl.img_data = clip_image_u8_init(); - if (!clip_image_load_from_bytes( - image_buffer.data(), image_buffer.size(), img_sl.img_data)) { - LOG_DEBUG << "slot " << slot->id - << " - failed to load image [id: " << img_sl.id << "]"; - return false; - } - LOG_DEBUG << "slot " << slot->id << " - loaded image"; - img_sl.request_encode_image = true; - slot->images.push_back(img_sl); - } - // process prompt - // example: system prompt [img-102] user [img-103] describe [img-134] -> - // [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, - // {id: 134, prefix: ' describe '}]} - if (slot->images.size() > 0 && !slot->prompt.is_array()) { - std::string prompt = slot->prompt.get(); - size_t pos = 0, begin_prefix = 0; - std::string pattern = "[img-"; - while ((pos = prompt.find(pattern, pos)) != std::string::npos) { - size_t end_prefix = pos; - pos += pattern.length(); - size_t end_pos = prompt.find("]", pos); - if (end_pos != std::string::npos) { - std::string image_id = prompt.substr(pos, end_pos - pos); - try { - int img_id = std::stoi(image_id); - bool found = false; - for (slot_image& img : slot->images) { - if (img.id == img_id) { - found = true; - img.prefix_prompt = - prompt.substr(begin_prefix, end_prefix - begin_prefix); - begin_prefix = end_pos + 1; - break; - } - } - if (!found) { - LOG_DEBUG << "ERROR: Image with id: " << img_id - << ", not found.\n"; - slot->images.clear(); - return false; - } - } catch (const std::invalid_argument& e) { - LOG_DEBUG << "Invalid image number id in prompt"; - slot->images.clear(); - return false; - } - } - } - slot->prompt = ""; - slot->params.input_suffix = prompt.substr(begin_prefix); - slot->params.cache_prompt = - false; // multimodal doesn't support cache prompt - } - } - } - - if (slot->ctx_sampling != nullptr) { - llama_sampling_free(slot->ctx_sampling); - } - slot->ctx_sampling = llama_sampling_init(slot->sparams); - llama_set_rng_seed(ctx, slot->params.seed); - slot->command = LOAD_PROMPT; - - all_slots_are_idle = false; - - LOG_DEBUG << "slot " << slot->id - << " is processing [task id: " << slot->task_id << "]"; - - return true; - } - - void kv_cache_clear() { - // clear the entire KV cache - llama_kv_cache_clear(ctx); - } - - void update_system_prompt() { - system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token); - - llama_batch_clear(batch); - - kv_cache_clear(); - - for (int i = 0; i < (int)system_tokens.size(); ++i) { - llama_batch_add(batch, system_tokens[i], i, {0}, false); - } - - if (llama_decode(ctx, batch) != 0) { - LOG_WARN << __func__ << ": llama_decode() failed"; - return; - } - - // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i < params.n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); - } - - LOG_DEBUG << "system prompt updated"; - system_need_update = false; - } - - void notify_system_prompt_changed() { - // release all slots - for (llama_client_slot& slot : slots) { 
- slot.release(); - } - - system_need_update = true; - } - - void process_system_prompt_data(const json& sys_props) { - system_prompt = sys_props.value("prompt", ""); - name_user = sys_props.value("anti_prompt", ""); - name_assistant = sys_props.value("assistant_name", ""); - - if (slots.size() > 0) { - notify_system_prompt_changed(); - } - } - - static size_t find_stopping_strings(const std::string& text, - const size_t last_token_size, - const stop_type type, - llama_client_slot& slot) { - size_t stop_pos = std::string::npos; - - for (const std::string& word : slot.params.antiprompt) { - size_t pos; - if (type == STOP_FULL) { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; - pos = text.find(word, from_pos); - } else { - pos = find_partial_stop_string(word, text); - } - if (pos != std::string::npos && - (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_FULL) { - slot.stopped_word = true; - slot.stopping_word = word; - slot.has_next_token = false; - } - stop_pos = pos; - } - } - - return stop_pos; - } - - bool process_token(completion_token_output& result, llama_client_slot& slot) { - // remember which tokens were sampled - used for repetition penalties during - // sampling - const std::string token_str = llama_token_to_piece(ctx, result.tok); - slot.sampled = result.tok; - - // search stop word and delete it - slot.generated_text += token_str; - slot.has_next_token = true; - - if (slot.ctx_sampling->params.use_penalty_prompt_tokens && - result.tok != -1) { - // we can change penalty_prompt_tokens because it is always created from - // scratch each request - slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); - } - - // check if there is incomplete UTF-8 character at the end - bool incomplete = false; - for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) { - unsigned char c = slot.generated_text[slot.generated_text.size() - i]; - if ((c & 0xC0) == 0x80) { - // continuation byte: 10xxxxxx - continue; - } - if ((c & 0xE0) == 0xC0) { - // 2-byte character: 110xxxxx ... - incomplete = i < 2; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character: 1110xxxx ... - incomplete = i < 3; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character: 11110xxx ... 
- incomplete = i < 4; - } - // else 1-byte character or invalid byte - break; - } - - if (!incomplete) { - size_t pos = std::min(slot.sent_count, slot.generated_text.size()); - const std::string str_test = slot.generated_text.substr(pos); - bool is_stop_full = false; - size_t stop_pos = - find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); - if (stop_pos != std::string::npos) { - is_stop_full = true; - slot.generated_text.erase(slot.generated_text.begin() + pos + stop_pos, - slot.generated_text.end()); - pos = std::min(slot.sent_count, slot.generated_text.size()); - } else { - is_stop_full = false; - stop_pos = find_stopping_strings(str_test, token_str.size(), - STOP_PARTIAL, slot); - } - - // check if there is any token to predict - if (stop_pos == std::string::npos || - (!slot.has_next_token && !is_stop_full && stop_pos > 0)) { - // no send the stop word in the response - result.text_to_send = - slot.generated_text.substr(pos, std::string::npos); - slot.sent_count += result.text_to_send.size(); - // add the token to slot queue and cache - } - slot.add_token_string(result); - if (slot.params.stream) { - send_partial_response(slot, result); - } - } - - if (incomplete) { - slot.has_next_token = true; - } - - // check the limits - if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params)) { - slot.stopped_limit = true; - slot.has_next_token = false; - } - - if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) { - slot.stopped_eos = true; - slot.has_next_token = false; - LOG_VERBOSE("eos token found", {}); - } - - LOG_VERBOSE( - "next token", - { - {"token", result.tok}, - {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"num_tokens_predicted", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }); - - return slot.has_next_token; // continue - } - bool process_images(llama_client_slot& slot) const { - for (slot_image& img : slot.images) { - if (!img.request_encode_image) { - continue; - } - - if (!llava_image_embed_make_with_clip_img( - clp_ctx, params.n_threads, img.img_data, &img.image_embedding, - &img.image_tokens)) { - LOG_DEBUG << "Error processing the given image"; - return false; - } - - img.request_encode_image = false; - } - - return slot.images.size() > 0; - } - void send_error(task_server& task, std::string error) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = task.id; - res.multitask_id = task.multitask_id; - res.stop = false; - res.error = true; - res.result_json = {{"content", error}}; - queue_results.push_back(res); - condition_results.notify_all(); - } - - void add_multi_task(int id, std::vector& sub_ids) { - std::lock_guard lock(mutex_tasks); - task_multi multi; - multi.id = id; - std::copy(sub_ids.begin(), sub_ids.end(), - std::inserter(multi.subtasks_remaining, - multi.subtasks_remaining.end())); - queue_multitasks.push_back(multi); - condition_tasks.notify_one(); - } - - void update_multi_task(int multitask_id, int subtask_id, - task_result& result) { - std::lock_guard lock(mutex_tasks); - for (auto& multitask : queue_multitasks) { - if (multitask.id == multitask_id) { - multitask.subtasks_remaining.erase(subtask_id); - multitask.results.push_back(result); - condition_tasks.notify_one(); - } - } - } - - json get_model_props() { return get_formated_generation(slots[0]); } - - 
json get_formated_generation(llama_client_slot& slot) { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); - const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && - eos_bias->second < 0.0f && - std::isinf(eos_bias->second); - return json{ - {"n_ctx", slot.n_ctx}, - {"model", params.model_alias}, - {"seed", slot.params.seed}, - {"temperature", slot.sparams.temp}, - {"top_k", slot.sparams.top_k}, - {"top_p", slot.sparams.top_p}, - {"min_p", slot.sparams.min_p}, - {"tfs_z", slot.sparams.tfs_z}, - {"typical_p", slot.sparams.typical_p}, - {"repeat_last_n", slot.sparams.penalty_last_n}, - {"repeat_penalty", slot.sparams.penalty_repeat}, - {"presence_penalty", slot.sparams.penalty_present}, - {"frequency_penalty", slot.sparams.penalty_freq}, - {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, - {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, - {"mirostat", slot.sparams.mirostat}, - {"mirostat_tau", slot.sparams.mirostat_tau}, - {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, - {"stop", slot.params.antiprompt}, - {"n_predict", slot.params.n_predict}, - {"n_keep", params.n_keep}, - {"ignore_eos", ignore_eos}, - {"stream", slot.params.stream}, - {"logit_bias", slot.sparams.logit_bias}, - {"n_probs", slot.sparams.n_probs}, - {"grammar", slot.sparams.grammar}, - }; - } - - void send_partial_response(llama_client_slot& slot, - completion_token_output tkn) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.multitask_id = slot.multitask_id; - res.error = false; - res.stop = false; - - res.result_json = json{{"content", tkn.text_to_send}, - {"stop", false}, - {"slot_id", slot.id}, - {"multimodal", multimodal}}; - - if (slot.sparams.n_probs > 0) { - std::vector probs_output = {}; - const std::vector to_send_toks = - llama_tokenize(ctx, tkn.text_to_send, false); - size_t probs_pos = std::min(slot.sent_token_probs_index, - slot.generated_token_probs.size()); - size_t probs_stop_pos = - std::min(slot.sent_token_probs_index + to_send_toks.size(), - slot.generated_token_probs.size()); - if (probs_pos < probs_stop_pos) { - probs_output = std::vector( - slot.generated_token_probs.begin() + probs_pos, - slot.generated_token_probs.begin() + probs_stop_pos); - } - slot.sent_token_probs_index = probs_stop_pos; - res.result_json["completion_probabilities"] = - probs_vector_to_json(ctx, probs_output); - } - - if (slot.oaicompat) { - res.result_json["oaicompat_token_ctr"] = slot.n_decoded; - res.result_json["model"] = slot.oaicompat_model; - } - - queue_results.push_back(res); - condition_results.notify_all(); - } - - void send_final_response(llama_client_slot& slot) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.multitask_id = slot.multitask_id; - res.error = false; - res.stop = true; - - res.result_json = - json{{"content", !slot.params.stream ? 
slot.generated_text : ""}, - {"slot_id", slot.id}, - {"stop", true}, - {"model", params.model_alias}, - {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.num_prompt_tokens}, - {"generation_settings", get_formated_generation(slot)}, - {"prompt", slot.prompt}, - {"truncated", slot.truncated}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - {"tokens_cached", slot.n_past}, - {"timings", slot.get_formated_timings()}}; - - if (slot.sparams.n_probs > 0) { - std::vector probs = {}; - if (!slot.params.stream && slot.stopped_word) { - const std::vector stop_word_toks = - llama_tokenize(ctx, slot.stopping_word, false); - probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.end() - stop_word_toks.size()); - } else { - probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.begin() + slot.sent_token_probs_index); - } - res.result_json["completion_probabilities"] = - probs_vector_to_json(ctx, probs); - } - - if (slot.oaicompat) { - res.result_json["oaicompat_token_ctr"] = slot.n_decoded; - res.result_json["model"] = slot.oaicompat_model; - } - - // parent multitask, if any, needs to be updated - if (slot.multitask_id != -1) { - update_multi_task(slot.multitask_id, slot.task_id, res); - } - - queue_results.push_back(res); - condition_results.notify_all(); - } - - void send_embedding(llama_client_slot& slot) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.multitask_id = slot.multitask_id; - res.error = false; - res.stop = true; - - const int n_embd = llama_n_embd(model); - - std::vector embd_res(n_embd, 0.0f); - - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { - continue; - } - - const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); - } - - if (embd == NULL) { - LOG_ERROR << "failed to get embeddings" - << " token " << batch.token[i] << ", seq_id " - << batch.seq_id[i][0]; - - res.result_json = json{ - {"embedding", std::vector(n_embd, 0.0f)}, - }; - - continue; - } - - llama_embd_normalize(embd, embd_res.data(), n_embd); - } - res.result_json = json{ - {"embedding", embd_res}, - }; - - queue_results.push_back(res); - condition_results.notify_all(); - } - - int request_completion(json data, bool infill, bool embedding, - int multitask_id) { - std::unique_lock lock(mutex_tasks); - task_server task; - task.id = id_gen++; - task.target_id = 0; - task.data = std::move(data); - task.infill_mode = infill; - task.embedding_mode = embedding; - task.type = COMPLETION_TASK; - task.multitask_id = multitask_id; - - // when a completion task's prompt array is not a singleton, we split it - // into multiple requests - if (task.data.at("prompt").size() > 1) { - lock.unlock(); // entering new func scope - return split_multiprompt_task(task); - } - - // otherwise, it's a single-prompt task, we actually queue it - queue_tasks.push_back(task); - condition_tasks.notify_one(); - return task.id; - } - - task_result next_result(int task_id) { - while (true) { - std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&] { return !queue_results.empty(); }); - - for (int i = 0; i < (int)queue_results.size(); i++) { - // for now, tasks that have associated parent multitasks just get erased - // once multitask picks up the result - if 
(queue_results[i].multitask_id == task_id) { - update_multi_task(task_id, queue_results[i].id, queue_results[i]); - queue_results.erase(queue_results.begin() + i); - continue; - } - - if (queue_results[i].id == task_id) { - if (queue_results[i].multitask_id != -1) { - LOG_ERROR_LLAMA("Incorrect multitask ID", {{"task_id", task_id}}); - } - task_result res = queue_results[i]; - queue_results.erase(queue_results.begin() + i); - return res; - } - } - } - - // never reached - // return task_result{-1, false, false, {}}; - } - - // for multiple images processing - bool ingest_images(llama_client_slot& slot, int n_batch) { - int image_idx = 0; - - while (image_idx < (int)slot.images.size()) { - slot_image& img = slot.images[image_idx]; - - // process prefix prompt - for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { - const int32_t n_tokens = - std::min(n_batch, (int32_t)(batch.n_tokens - i)); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, - 0, - 0, // unused - }; - if (llama_decode(ctx, batch_view)) { - LOG_DEBUG << __func__ << " : failed to eval\n"; - return false; - } - } - - // process image with llm - for (int i = 0; i < img.image_tokens; i += n_batch) { - int n_eval = img.image_tokens - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - - const int n_embd = llama_n_embd(model); - llama_batch batch_img = { - n_eval, nullptr, (img.image_embedding + i * n_embd), - nullptr, nullptr, nullptr, - nullptr, slot.n_past, 1, - 0, - }; - if (llama_decode(ctx, batch_img)) { - LOG_DEBUG << __func__ << " : failed to eval image"; - return false; - } - slot.n_past += n_eval; - } - image_idx++; - - llama_batch_clear(batch); - - // append prefix of next image - const auto json_prompt = - (image_idx >= (int)slot.images.size()) - ? slot.params.input_suffix - : // no more images, then process suffix prompt - (json)(slot.images[image_idx].prefix_prompt); - - std::vector append_tokens = - tokenize(json_prompt, false); // has next image - for (int i = 0; i < (int)append_tokens.size(); ++i) { - llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true); - slot.n_past += 1; - } - } - - return true; - } - - void request_cancel(int task_id) { - std::unique_lock lock(mutex_tasks); - task_server task; - task.id = id_gen++; - task.type = CANCEL_TASK; - task.target_id = task_id; - queue_tasks.push_back(task); - condition_tasks.notify_one(); - } - - int split_multiprompt_task(task_server& multiprompt_task) { - int prompt_count = multiprompt_task.data.at("prompt").size(); - assert(prompt_count > 1); - - int multitask_id = id_gen++; - std::vector subtask_ids(prompt_count); - for (int i = 0; i < prompt_count; i++) { - json subtask_data = multiprompt_task.data; - subtask_data["prompt"] = subtask_data["prompt"][i]; - - // subtasks inherit everything else (infill mode, embedding mode, etc.) 
- subtask_ids[i] = - request_completion(subtask_data, multiprompt_task.infill_mode, - multiprompt_task.embedding_mode, multitask_id); - } - - // queue up the multitask so we can track its subtask progression - add_multi_task(multitask_id, subtask_ids); - return multitask_id; - } - - void process_tasks() { - std::unique_lock lock(mutex_tasks); - while (!queue_tasks.empty()) { - task_server task = queue_tasks.front(); - queue_tasks.erase(queue_tasks.begin()); - switch (task.type) { - case COMPLETION_TASK: { - llama_client_slot* slot = - get_slot(json_value(task.data, "slot_id", -1)); - if (slot == nullptr) { - LOG_DEBUG << "slot unavailable"; - // send error result - send_error(task, "slot unavailable"); - return; - } - - if (task.data.contains("system_prompt")) { - process_system_prompt_data(task.data["system_prompt"]); - } - - slot->reset(); - - slot->infill = task.infill_mode; - slot->embedding = task.embedding_mode; - slot->task_id = task.id; - slot->multitask_id = task.multitask_id; - - if (!launch_slot_with_data(slot, task.data)) { - // send error result - send_error(task, "internal_error"); - break; - } - } break; - case CANCEL_TASK: { // release slot linked with the task id - for (auto& slot : slots) { - if (slot.task_id == task.target_id) { - slot.release(); - break; - } - } - } break; - } - } - - // remove finished multitasks from the queue of multitasks, and add the - // corresponding result to the result queue - auto queue_iterator = queue_multitasks.begin(); - while (queue_iterator != queue_multitasks.end()) { - if (queue_iterator->subtasks_remaining.empty()) { - // all subtasks done == multitask is done - task_result aggregate_result; - aggregate_result.id = queue_iterator->id; - aggregate_result.stop = true; - aggregate_result.error = false; - - // collect json results into one json result - std::vector result_jsons; - for (auto& subres : queue_iterator->results) { - result_jsons.push_back(subres.result_json); - aggregate_result.error = aggregate_result.error && subres.error; - } - aggregate_result.result_json = json{"results", result_jsons}; - - std::lock_guard lock(mutex_results); - queue_results.push_back(aggregate_result); - condition_results.notify_all(); - - queue_iterator = queue_multitasks.erase(queue_iterator); - } else { - ++queue_iterator; - } - } - } - - bool update_slots() { - // attend tasks - process_tasks(); - - // update the system prompt wait until all slots are idle state - if (system_need_update && all_slots_are_idle) { - LOG_DEBUG << "updating system prompt"; - update_system_prompt(); - } - - llama_batch_clear(batch); - - if (all_slots_are_idle) { - if (system_prompt.empty() && clean_kv_cache) { - LOG_DEBUG - << "all slots are idle and system prompt is empty, clear the KV " - "cache"; - kv_cache_clear(); - } - // std::this_thread::sleep_for(std::chrono::milliseconds(5)); - // TODO: Need to implement queueing using CV for better performance - std::unique_lock lock(mutex_tasks); - condition_tasks.wait(lock, [&] { - return (!queue_tasks.empty() && model_loaded_external) || - (queue_tasks.empty() && !model_loaded_external); - }); - } - - for (llama_client_slot& slot : slots) { - if (slot.is_processing() && - slot.cache_tokens.size() >= (size_t)slot.n_ctx) { - // Shift context - const int n_left = slot.n_past - slot.params.n_keep - 1; - const int n_discard = n_left / 2; - - LOG_DEBUG << "slot " << slot.id - << " context shift - n_keep = " << slot.params.n_keep - << ", n_left = " << n_left << ", n_discard: " << n_discard; - llama_kv_cache_seq_rm(ctx, slot.id, 
slot.params.n_keep + 1, - slot.params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, slot.id, slot.params.n_keep + 1 + n_discard, - slot.n_past, -n_discard); - - for (size_t i = slot.params.n_keep + 1 + n_discard; - i < slot.cache_tokens.size(); i++) { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; - } - - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); - - slot.n_past -= n_discard; - - slot.truncated = true; - - LOG_VERBOSE("context shift", { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - }); - } - } - - // decode any currently ongoing sequences - for (auto& slot : slots) { - // release the slot - if (slot.command == RELEASE) { - slot.state = IDLE; - slot.command = NONE; - slot.t_last_used = ggml_time_us(); - - LOG_DEBUG << "slot " << slot.id << " released (" - << (int)slot.cache_tokens.size() << " tokens in cache)"; - - continue; - } - - if (slot.state == IDLE) { - continue; - } - - slot.i_batch = batch.n_tokens; - - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, - {slot.id}, true); - - slot.n_decoded += 1; - slot.n_past += 1; - } - - // process in chunks of params.n_batch - int32_t n_batch = params.n_batch; - - // assign workload to the slots - if (params.cont_batching || batch.n_tokens == 0) { - for (auto& slot : slots) { - const bool has_prompt = slot.prompt.is_array() || - (slot.prompt.is_string() && - !slot.prompt.get().empty()) || - !slot.images.empty(); - - // empty prompt passed -> release the slot and send empty response - if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) { - slot.release(); - slot.print_timings(); - send_final_response(slot); - continue; - } - - // need process the prompt - if (slot.state == IDLE && slot.command == LOAD_PROMPT) { - slot.state = PROCESSING; - slot.command = NONE; - std::vector prompt_tokens; - slot.t_start_process_prompt = ggml_time_us(); - slot.t_start_genereration = 0; - - if (slot.infill) { - bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(' ') == 0 && - params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); - suff_rm_leading_spc = false; - } - auto prefix_tokens = tokenize(slot.params.input_prefix, false); - auto suffix_tokens = tokenize(slot.params.input_suffix, false); - - const int space_token = - 29871; // TODO: this should not be hardcoded - if (suff_rm_leading_spc && !suffix_tokens.empty() && - suffix_tokens[0] == space_token) { - suffix_tokens.erase(suffix_tokens.begin()); - } - - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_prefix(model)); - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_bos(model)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), - llama_token_suffix(model)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), - suffix_tokens.end()); - prefix_tokens.push_back(llama_token_middle(model)); - prompt_tokens = prefix_tokens; - } else { - prompt_tokens = tokenize( - slot.prompt, - system_prompt.empty() && - add_bos_token); // add BOS if there isn't system prompt - } - - slot.num_prompt_tokens = prompt_tokens.size(); - - if (slot.params.n_keep < 0) { - slot.params.n_keep = slot.num_prompt_tokens; - } - slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - - // if input prompt is too big, truncate it - if (slot.num_prompt_tokens >= slot.n_ctx) { - const int n_left = slot.n_ctx - slot.params.n_keep; - const int n_block_size = n_left / 2; - const int erased_blocks = - (slot.num_prompt_tokens - slot.params.n_keep - 
n_block_size) / - n_block_size; - - std::vector new_tokens( - prompt_tokens.begin(), - prompt_tokens.begin() + slot.params.n_keep); - new_tokens.insert(new_tokens.end(), - prompt_tokens.begin() + slot.params.n_keep + - erased_blocks * n_block_size, - prompt_tokens.end()); - - LOG_VERBOSE( - "input truncated", - { - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), - new_tokens.cend())}, - }); - slot.truncated = true; - prompt_tokens = new_tokens; - - slot.num_prompt_tokens = prompt_tokens.size(); - GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); - } - - if (!slot.params.cache_prompt) { - llama_sampling_reset(slot.ctx_sampling); - - slot.n_past = 0; - slot.num_prompt_tokens_processed = slot.num_prompt_tokens; - } else { - // push the prompt into the sampling context (do not apply grammar) - for (auto& token : prompt_tokens) { - llama_sampling_accept(slot.ctx_sampling, ctx, token, false); - } - - slot.n_past = common_part(slot.cache_tokens, prompt_tokens); - slot.num_prompt_tokens_processed = - slot.num_prompt_tokens - slot.n_past; - - LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past - << " tokens | to process: " - << slot.num_prompt_tokens_processed << " tokens"; - } - - LOG_DEBUG << "slot " << slot.id << " : kv cache rm - [" - << (int)system_tokens.size() + slot.n_past << ", end)"; - - llama_kv_cache_seq_rm(ctx, slot.id, - system_tokens.size() + slot.n_past, -1); - - slot.cache_tokens = prompt_tokens; - - if (slot.n_past == slot.num_prompt_tokens) { - // we have to evaluate at least 1 token to generate logits. - LOG_DEBUG << "slot " << slot.id - << " : we have to evaluate at least 1 token to " - "generate logits"; - slot.n_past--; - } - - LOG_VERBOSE( - "prompt ingested", - { - {"n_past", slot.n_past}, - {"cached", - tokens_to_str(ctx, slot.cache_tokens.cbegin(), - slot.cache_tokens.cbegin() + slot.n_past)}, - {"to_eval", - tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, - slot.cache_tokens.cend())}, - }); - - const bool has_images = process_images(slot); - - // process the prefix of first image - std::vector prefix_tokens = - has_images ? 
tokenize(slot.images[0].prefix_prompt, add_bos_token) - : prompt_tokens; - for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) { - llama_batch_add(batch, prefix_tokens[slot.n_past], - system_tokens.size() + slot.n_past, {slot.id}, - false); - } - - if (has_images && !ingest_images(slot, n_batch)) { - LOG_DEBUG << "failed processing images"; - return false; - } - - // extract the logits only for the last token - if (batch.n_tokens > 0) { - batch.logits[batch.n_tokens - 1] = true; - } - - slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; - } - } - } - - if (batch.n_tokens == 0) { - all_slots_are_idle = true; - return true; - } - - for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, (int32_t)(batch.n_tokens - i)); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, - 0, - 0, // unused - }; - - const int ret = llama_decode(ctx, batch_view); - if (ret != 0) { - if (n_batch == 1 || ret < 0) { - // if you get here, it means the KV cache is full - try increasing it - // via the context size - LOG_DEBUG << __func__ - << " : failed to decode the batch, n_batch = " << n_batch - << ", ret = " << ret; - return false; - } - - LOG_DEBUG - << __func__ - << " : failed to find free space in the KV cache, retrying with " - "smaller n_batch = " - << n_batch / 2; - - // retry with half the batch size to try to find a free slot in the KV - // cache - n_batch /= 2; - i -= n_batch; - continue; - } - - for (auto& slot : slots) { - if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) { - continue; - } - - // prompt evaluated for embedding - if (slot.embedding) { - send_embedding(slot); - slot.release(); - slot.i_batch = -1; - return true; - } - - completion_token_output result; - const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, - NULL, slot.i_batch - i); - - llama_sampling_accept(slot.ctx_sampling, ctx, id, true); - - if (slot.n_decoded == 1) { - slot.t_start_genereration = ggml_time_us(); - slot.t_prompt_processing = - (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; - } - - llama_token_data_array cur_p = {slot.ctx_sampling->cur.data(), - slot.ctx_sampling->cur.size(), false}; - result.tok = id; - - const int32_t n_probs = slot.sparams.n_probs; - if (slot.sparams.temp <= 0 && n_probs > 0) { - // for llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, &cur_p); - } - - for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) { - result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); - } - - if (!process_token(result, slot)) { - slot.release(); - slot.print_timings(); - send_final_response(slot); - } - - slot.i_batch = -1; - } - } - return true; - } -}; - -static void server_print_usage(const char* argv0, const gpt_params& params, - const server_params& sparams) { - printf("usage: %s [options]\n", argv0); - printf("\n"); - printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -v, --verbose verbose output (default: %s)\n", - server_verbose ? 
"enabled" : "disabled"); - printf( - " -t N, --threads N number of threads to use during " - "computation (default: %d)\n", - params.n_threads); - printf( - " -tb N, --threads-batch N number of threads to use during batch " - "and prompt processing (default: same as --threads)\n"); - printf( - " -c N, --ctx-size N size of the prompt context (default: " - "%d)\n", - params.n_ctx); - printf(" --rope-scaling {none,linear,yarn}\n"); - printf( - " RoPE frequency scaling method, defaults " - "to linear unless specified by the model\n"); - printf( - " --rope-freq-base N RoPE base frequency (default: loaded " - "from model)\n"); - printf( - " --rope-freq-scale N RoPE frequency scaling factor, expands " - "context by a factor of 1/N\n"); - printf( - " --yarn-ext-factor N YaRN: extrapolation mix factor (default: " - "1.0, 0.0 = full interpolation)\n"); - printf( - " --yarn-attn-factor N YaRN: scale sqrt(t) or attention " - "magnitude (default: 1.0)\n"); - printf( - " --yarn-beta-slow N YaRN: high correction dim or alpha " - "(default: %.1f)\n", - params.yarn_beta_slow); - printf( - " --yarn-beta-fast N YaRN: low correction dim or beta " - "(default: %.1f)\n", - params.yarn_beta_fast); - printf( - " -b N, --batch-size N batch size for prompt processing " - "(default: %d)\n", - params.n_batch); - printf( - " --memory-f32 use f32 instead of f16 for memory " - "key+value (default: disabled)\n"); - printf( - " not recommended: doubles context memory " - "required and no measurable increase in quality\n"); - if (llama_supports_mlock()) { - printf( - " --mlock force system to keep model in RAM " - "rather than swapping or compressing\n"); - } - if (llama_supports_mmap()) { - printf( - " --no-mmap do not memory-map model (slower load " - "but may reduce pageouts if not using mlock)\n"); - } - printf( - " --numa attempt optimizations that help on some " - "NUMA systems\n"); - if (llama_supports_gpu_offload()) { - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf( - " how to split the model across multiple " - "GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf( - " - layer (default): split layers and " - "KV across GPUs\n"); - printf(" - row: split rows across GPUs\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf( - " fraction of the model to offload to " - "each GPU, comma-separated list of proportions, e.g. 3,1\n"); - printf( - " -mg i, --main-gpu i the GPU to use for the model (with " - "split-mode = none),\n"); - printf( - " or for intermediate results and KV " - "(with split-mode = row)\n"); - } - printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: %s)\n", - params.model.c_str()); - printf(" -a ALIAS, --alias ALIAS\n"); - printf( - " set an alias for the model, will be " - "added as `model` field in completion response\n"); - printf( - " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - printf( - " --lora-base FNAME optional model to use as a base for the " - "layers modified by the LoRA adapter\n"); - printf( - " --host ip address to listen (default (default: " - "%s)\n", - sparams.hostname.c_str()); - printf(" --port PORT port to listen (default (default: %d)\n", - sparams.port); - printf( - " --path PUBLIC_PATH path from which to serve static files " - "(default %s)\n", - sparams.public_path.c_str()); - printf( - " --api-key API_KEY optional api key to enhance server " - "security. 
If set, requests must include this key for access.\n"); - printf( - " --api-key-file FNAME path to file containing api keys " - "delimited by new lines. If set, requests must include one of the " - "keys for access.\n"); - printf( - " -to N, --timeout N server read/write timeout in seconds " - "(default: %d)\n", - sparams.read_timeout); - printf( - " --embedding enable embedding vector output (default: " - "%s)\n", - params.embedding ? "enabled" : "disabled"); - printf( - " -np N, --parallel N number of slots for process requests " - "(default: %d)\n", - params.n_parallel); - printf( - " -cb, --cont-batching enable continuous batching (a.k.a " - "dynamic batching) (default: disabled)\n"); - printf(" -spf FNAME, --system-prompt-file FNAME\n"); - printf( - " set a file to load a system prompt " - "(initial " - "prompt of all slots), this is useful for chat applications.\n"); - printf( - " --mmproj MMPROJ_FILE path to a multimodal projector file for " - "LLaVA.\n"); - printf(" --log-disable disables logging to a file.\n"); - printf("\n"); - printf(" --override-kv KEY=TYPE:VALUE\n"); - printf( - " advanced option to override model " - "metadata by key. may be specified multiple times.\n"); - printf( - " types: int, float, bool. example: " - "--override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf( - " -gan N, --grp-attn-n N set the group attention factor to extend " - "context size through self-extend(default: 1=disabled), used together " - "with group attention width `--grp-attn-w`"); - printf( - " -gaw N, --grp-attn-w N set the group attention width to extend " - "context size through self-extend(default: 512), used together with " - "group attention factor `--grp-attn-n`"); - printf("\n"); -} -static std::string random_string() { - static const std::string str( - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - - std::random_device rd; - std::mt19937 generator(rd()); - - std::string result(32, ' '); - - for (int i = 0; i < 32; ++i) { - result[i] = str[generator() % str.size()]; - } - - return result; -} - -static std::string gen_chatcmplid() { - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - return chatcmplid.str(); -} -static json format_final_response_oaicompat(const json& request, - const task_result& response, - bool streaming = false) { - json result = response.result_json; - - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - - json choices = - streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", content}, - {"role", "assistant"}}}}}); - - std::time_t t = std::time(0); - - json res = - json{{"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, - {"usage", - json{{"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, - {"id", gen_chatcmplid()}}; - - if (server_verbose) { - res["__verbose"] = result; - } - - if (result.contains("completion_probabilities")) { - res["completion_probabilities"] = - json_value(result, "completion_probabilities", json::array()); - } - - return res; -} - -// return value is vector as there is one case where we might need to generate -// two responses -static std::vector format_partial_response_oaicompat( - const task_result& response) { - json result = response.result_json; - - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { - return std::vector({response.result_json}); - } - - bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = - json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - if (stopped_limit) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = - json{{"choices", - json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - json second_ret = - json{{"choices", - json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"content", content}}}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. 
- if (content.empty()) { - return std::vector({json::object()}); - } - - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); - } - } - - json ret = json{{"choices", choices}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({ret}); -} - -static json format_partial_response( - llama_server_context& llama, llama_client_slot* slot, - const std::string& content, - const std::vector& probs) { - json res = json{{"content", content}, - {"stop", false}, - {"slot_id", slot->id}, - {"multimodal", llama.multimodal}}; - - if (slot->sparams.n_probs > 0) { - res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); - } - - return res; -} - -static json format_tokenizer_response(const std::vector& tokens) { - return json{{"tokens", tokens}}; -} - -static json format_detokenized_response(std::string content) { - return json{{"content", content}}; -} - -struct token_translator { - llama_context* ctx; - std::string operator()(llama_token tok) const { - return llama_token_to_piece(ctx, tok); - } - std::string operator()(const completion_token_output& cto) const { - return (*this)(cto.tok); - } -}; - -static void append_to_generated_text_from_generated_token_probs( - llama_server_context& llama, llama_client_slot* slot) { - auto& gtps = slot->generated_token_probs; - auto translator = token_translator{llama.ctx}; - auto add_strlen = [=](size_t sum, const completion_token_output& cto) { - return sum + translator(cto).size(); - }; - const size_t len = - std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen); - if (slot->generated_text.capacity() < slot->generated_text.size() + len) { - slot->generated_text.reserve(slot->generated_text.size() + len); - } - for (const completion_token_output& cto : gtps) { - slot->generated_text += translator(cto); - } -} diff --git a/cortex-cpp/controllers/llamaCPP.cc b/cortex-cpp/controllers/llamaCPP.cc index 69284d6e9..3beaae0e9 100644 --- a/cortex-cpp/controllers/llamaCPP.cc +++ b/cortex-cpp/controllers/llamaCPP.cc @@ -1,757 +1,148 @@ #include "llamaCPP.h" +#include #include #include -#include "log.h" + +#include "trantor/utils/Logger.h" #include "utils/logging_utils.h" #include "utils/nitro_utils.h" -// External -#include "common.h" -#include "llama.h" - using namespace inferences; using json = nlohmann::json; - -/** - * The state of the inference task - */ -enum InferenceStatus { PENDING, RUNNING, EOS, FINISHED }; - -/** - * There is a need to save state of current ongoing inference status of a - * handler, this struct is to solve that issue - * - * @param inst Pointer to the llamaCPP instance this inference task is - * associated with. 
- */ -struct inferenceState { - int task_id; - InferenceStatus inference_status = PENDING; - llamaCPP* instance; - // Check if we receive the first token, set it to false after receiving - bool is_first_token = true; - - inferenceState(llamaCPP* inst) : instance(inst) {} -}; - -/** - * This function is to create the smart pointer to inferenceState, hence the - * inferenceState will be persisting even tho the lambda in streaming might go - * out of scope and the handler already moved on - */ -std::shared_ptr create_inference_state(llamaCPP* instance) { - return std::make_shared(instance); -} - -/** - * Check if model already loaded if not return message to user - * @param callback the function to return message to user - */ -bool llamaCPP::CheckModelLoaded( - const std::function& callback) { - if (!llama.model_loaded_external) { - LOG_ERROR << "Model has not been loaded"; - Json::Value jsonResp; - jsonResp["message"] = - "Model has not been loaded, please load model into nitro"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k409Conflict); - callback(resp); - return false; - } - return true; -} - -Json::Value create_embedding_payload(const std::vector& embedding, - int prompt_tokens) { - Json::Value dataItem; - - dataItem["object"] = "embedding"; - - Json::Value embeddingArray(Json::arrayValue); - for (const auto& value : embedding) { - embeddingArray.append(value); - } - dataItem["embedding"] = embeddingArray; - dataItem["index"] = 0; - - return dataItem; -} - -Json::Value create_full_return_json(const std::string& id, - const std::string& model, - const std::string& content, - const std::string& system_fingerprint, - int prompt_tokens, int completion_tokens, - Json::Value finish_reason = Json::Value()) { - Json::Value root; - - root["id"] = id; - root["model"] = model; - root["created"] = static_cast(std::time(nullptr)); - root["object"] = "chat.completion"; - root["system_fingerprint"] = system_fingerprint; - - Json::Value choicesArray(Json::arrayValue); - Json::Value choice; - - choice["index"] = 0; - Json::Value message; - message["role"] = "assistant"; - message["content"] = content; - choice["message"] = message; - choice["finish_reason"] = finish_reason; - - choicesArray.append(choice); - root["choices"] = choicesArray; - - Json::Value usage; - usage["prompt_tokens"] = prompt_tokens; - usage["completion_tokens"] = completion_tokens; - usage["total_tokens"] = prompt_tokens + completion_tokens; - root["usage"] = usage; - - return root; -} - -std::string create_return_json(const std::string& id, const std::string& model, - const std::string& content, - Json::Value finish_reason = Json::Value()) { - Json::Value root; - - root["id"] = id; - root["model"] = model; - root["created"] = static_cast(std::time(nullptr)); - root["object"] = "chat.completion.chunk"; - - Json::Value choicesArray(Json::arrayValue); - Json::Value choice; - - choice["index"] = 0; - Json::Value delta; - delta["content"] = content; - choice["delta"] = delta; - choice["finish_reason"] = finish_reason; - - choicesArray.append(choice); - root["choices"] = choicesArray; - - Json::StreamWriterBuilder writer; - writer["indentation"] = ""; // This sets the indentation to an empty string, - // producing compact output. 
- return Json::writeString(writer, root); -} - -llamaCPP::llamaCPP() - : queue(new trantor::ConcurrentTaskQueue(llama.params.n_parallel, - "llamaCPP")) { +namespace inferences { +llamaCPP::llamaCPP() { + dylib_ = std::make_unique("./engines/cortex.llamacpp", "engine"); + auto func = dylib_->get_function("get_engine"); + engine_ = func(); // Some default values for now below - log_disable(); // Disable the log to file feature, reduce bloat for - // target - // system () + // log_disable(); // Disable the log to file feature, reduce bloat for + // target + // system () }; -llamaCPP::~llamaCPP() { - StopBackgroundTask(); -} - -void llamaCPP::WarmupModel() { - json pseudo; - - LOG_INFO << "Warm-up model"; - pseudo["prompt"] = "Hello"; - pseudo["n_predict"] = 2; - pseudo["stream"] = false; - const int task_id = llama.request_completion(pseudo, false, false, -1); - std::string completion_text; - task_result result = llama.next_result(task_id); - if (!result.error && result.stop) { - LOG_INFO << result.result_json.dump(-1, ' ', false, - json::error_handler_t::replace); - } - return; -} +llamaCPP::~llamaCPP() {} void llamaCPP::ChatCompletion( - inferences::ChatCompletionRequest&& completion, - std::function&& callback) { - // Check if model is loaded - if (CheckModelLoaded(callback)) { - // Model is loaded - // Do Inference - InferenceImpl(std::move(completion), std::move(callback)); - } -} - -void llamaCPP::InferenceImpl( - inferences::ChatCompletionRequest&& completion, + const HttpRequestPtr& req, std::function&& callback) { - if (llama.model_type == ModelType::EMBEDDING) { - LOG_WARN << "Not support completion for embedding model"; - Json::Value jsonResp; - jsonResp["message"] = "Not support completion for embedding model"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k400BadRequest); - callback(resp); - return; - } - std::string formatted_output = pre_prompt; - int request_id = ++no_of_requests; - LOG_INFO_REQUEST(request_id) << "Generating reponse for inference request"; - - json data; - json stopWords; - int no_images = 0; - // To set default value - - // Increase number of chats received and clean the prompt - //no_of_chats++; - //if (no_of_chats % clean_cache_threshold == 0) { - // LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!"; - // llama.kv_cache_clear(); - // LOG_INFO_REQUEST(request_id) << "Cache cleaned"; - //} - - // Default values to enable auto caching - //data["cache_prompt"] = caching_enabled; - data["cache_prompt"] = false; - data["n_keep"] = -1; - - // Passing load value - data["repeat_last_n"] = this->repeat_last_n; - LOG_INFO_REQUEST(request_id) - << "Stop words:" << completion.stop.toStyledString(); - - data["stream"] = completion.stream; - data["n_predict"] = completion.max_tokens; - data["top_p"] = completion.top_p; - data["temperature"] = completion.temperature; - data["frequency_penalty"] = completion.frequency_penalty; - data["presence_penalty"] = completion.presence_penalty; - const Json::Value& messages = completion.messages; - - if (!grammar_file_content.empty()) { - data["grammar"] = grammar_file_content; - }; - - if (!llama.multimodal) { - for (const auto& message : messages) { - std::string input_role = message["role"].asString(); - std::string role; - if (input_role == "user") { - role = user_prompt; - std::string content = message["content"].asString(); - formatted_output += role + content; - } else if (input_role == "assistant") { - role = ai_prompt; - std::string content = 
message["content"].asString(); - formatted_output += role + content; - } else if (input_role == "system") { - role = system_prompt; - std::string content = message["content"].asString(); - formatted_output = role + content + formatted_output; - - } else { - role = input_role; - std::string content = message["content"].asString(); - formatted_output += role + content; - } - } - formatted_output += ai_prompt; - } else { - data["image_data"] = json::array(); - for (const auto& message : messages) { - std::string input_role = message["role"].asString(); - std::string role; - if (input_role == "user") { - formatted_output += role; - for (auto content_piece : message["content"]) { - role = user_prompt; - - json content_piece_image_data; - content_piece_image_data["data"] = ""; - - auto content_piece_type = content_piece["type"].asString(); - if (content_piece_type == "text") { - auto text = content_piece["text"].asString(); - formatted_output += text; - } else if (content_piece_type == "image_url") { - auto image_url = content_piece["image_url"]["url"].asString(); - std::string base64_image_data; - if (image_url.find("http") != std::string::npos) { - LOG_INFO_REQUEST(request_id) - << "Remote image detected but not supported yet"; - } else if (image_url.find("data:image") != std::string::npos) { - LOG_INFO_REQUEST(request_id) << "Base64 image detected"; - base64_image_data = nitro_utils::extractBase64(image_url); - LOG_INFO_REQUEST(request_id) << base64_image_data; - } else { - LOG_INFO_REQUEST(request_id) << "Local image detected"; - nitro_utils::processLocalImage( - image_url, [&](const std::string& base64Image) { - base64_image_data = base64Image; - }); - LOG_INFO_REQUEST(request_id) << base64_image_data; - } - content_piece_image_data["data"] = base64_image_data; - - formatted_output += "[img-" + std::to_string(no_images) + "]"; - content_piece_image_data["id"] = no_images; - data["image_data"].push_back(content_piece_image_data); - no_images++; - } - } - - } else if (input_role == "assistant") { - role = ai_prompt; - std::string content = message["content"].asString(); - formatted_output += role + content; - } else if (input_role == "system") { - role = system_prompt; - std::string content = message["content"].asString(); - formatted_output = role + content + formatted_output; - - } else { - role = input_role; - std::string content = message["content"].asString(); - formatted_output += role + content; - } - } - formatted_output += ai_prompt; - LOG_INFO_REQUEST(request_id) << formatted_output; - } - - data["prompt"] = formatted_output; - for (const auto& stop_word : completion.stop) { - stopWords.push_back(stop_word.asString()); - } - // specify default stop words - // Ensure success case for chatML - stopWords.push_back("<|im_end|>"); - stopWords.push_back(nitro_utils::rtrim(user_prompt)); - data["stop"] = stopWords; - - bool is_streamed = data["stream"]; -// Enable full message debugging -#ifdef DEBUG - LOG_INFO_REQUEST(request_id) << "Current completion text"; - LOG_INFO_REQUEST(request_id) << formatted_output; -#endif - - if (is_streamed) { - LOG_INFO_REQUEST(request_id) << "Streamed, waiting for respone"; - auto state = create_inference_state(this); - - auto chunked_content_provider = [state, data, request_id]( - char* pBuffer, - std::size_t nBuffSize) -> std::size_t { - if (state->inference_status == PENDING) { - state->inference_status = RUNNING; - } else if (state->inference_status == FINISHED) { - return 0; - } - - if (!pBuffer) { - LOG_WARN_REQUEST(request_id) - "Connection closed 
or buffer is null. Reset context"; - state->inference_status = FINISHED; - return 0; - } - - if (state->inference_status == EOS) { - LOG_INFO_REQUEST(request_id) << "End of result"; - const std::string str = - "data: " + - create_return_json(nitro_utils::generate_random_string(20), "_", "", - "stop") + - "\n\n" + "data: [DONE]" + "\n\n"; - - LOG_VERBOSE("data stream", - {{"request_id": request_id}, {"to_send", str}}); - std::size_t nRead = std::min(str.size(), nBuffSize); - memcpy(pBuffer, str.data(), nRead); - state->inference_status = FINISHED; - return nRead; - } - - task_result result = state->instance->llama.next_result(state->task_id); - if (!result.error) { - std::string to_send = result.result_json["content"]; - - // trim the leading space if it is the first token - if (std::exchange(state->is_first_token, false)) { - nitro_utils::ltrim(to_send); - } - - const std::string str = - "data: " + - create_return_json(nitro_utils::generate_random_string(20), "_", - to_send) + - "\n\n"; - - std::size_t nRead = std::min(str.size(), nBuffSize); - memcpy(pBuffer, str.data(), nRead); - - if (result.stop) { - LOG_INFO_REQUEST(request_id) << "Reached result stop"; - state->inference_status = EOS; - return nRead; - } - - // Make sure nBufferSize is not zero - // Otherwise it stop streaming - if (!nRead) { - state->inference_status = FINISHED; - } - - return nRead; - } else { - LOG_ERROR_REQUEST(request_id) << "Error during inference"; - } - state->inference_status = FINISHED; - return 0; - }; - // Queued task - state->instance->queue->runTaskInQueue([cb = std::move(callback), state, - data, chunked_content_provider, - request_id]() { - state->task_id = - state->instance->llama.request_completion(data, false, false, -1); - - // Start streaming response - auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider, - "chat_completions.txt"); - cb(resp); - - int retries = 0; - - // Since this is an async task, we will wait for the task to be - // completed - while (state->inference_status != FINISHED && retries < 10 && - state->instance->llama.model_loaded_external) { - // Should wait chunked_content_provider lambda to be called within - // 3s - if (state->inference_status == PENDING) { - retries += 1; - } - if (state->inference_status != RUNNING) - LOG_INFO_REQUEST(request_id) - << "Wait for task to be released:" << state->task_id; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - LOG_INFO_REQUEST(request_id) << "Task completed, release it"; - // Request completed, release it - state->instance->llama.request_cancel(state->task_id); - LOG_INFO_REQUEST(request_id) << "Inference completed"; - }); + LOG_TRACE << "Start chat completion"; + auto json_body = req->getJsonObject(); + bool is_stream = (*json_body).get("stream", false).asBool(); + auto q = std::make_shared(); + engine_->HandleChatCompletion(json_body, + [q](Json::Value status, Json::Value res) { + q->push(std::make_pair(status, res)); + }); + LOG_TRACE << "Wait to chat completion responses"; + if (is_stream) { + ProcessStreamRes(std::move(callback), q); } else { - queue->runTaskInQueue( - [this, request_id, cb = std::move(callback), d = std::move(data)]() { - Json::Value respData; - int task_id = llama.request_completion(d, false, false, -1); - LOG_INFO_REQUEST(request_id) << "Non stream, waiting for respone"; - if (!json_value(d, "stream", false)) { - std::string completion_text; - task_result result = llama.next_result(task_id); - if (!result.error && result.stop) { - int prompt_tokens = 
result.result_json["tokens_evaluated"]; - int predicted_tokens = result.result_json["tokens_predicted"]; - std::string to_send = result.result_json["content"]; - nitro_utils::ltrim(to_send); - respData = create_full_return_json( - nitro_utils::generate_random_string(20), "_", to_send, "_", - prompt_tokens, predicted_tokens); - } else { - respData["message"] = "Internal error during inference"; - LOG_ERROR_REQUEST(request_id) << "Error during inference"; - } - auto resp = nitro_utils::nitroHttpJsonResponse(respData); - cb(resp); - LOG_INFO_REQUEST(request_id) << "Inference completed"; - } - }); + ProcessNonStreamRes(std::move(callback), *q); } + LOG_TRACE << "Done chat completion"; } void llamaCPP::Embedding( const HttpRequestPtr& req, std::function&& callback) { - // Check if model is loaded - if (CheckModelLoaded(callback)) { - // Model is loaded - const auto& jsonBody = req->getJsonObject(); - // Run embedding - EmbeddingImpl(jsonBody, std::move(callback)); - return; - } -} - -void llamaCPP::EmbeddingImpl( - std::shared_ptr jsonBody, - std::function&& callback) { - int request_id = ++no_of_requests; - LOG_INFO_REQUEST(request_id) << "Generating reponse for embedding request"; - // Queue embedding task - auto state = create_inference_state(this); - - state->instance->queue->runTaskInQueue([this, state, jsonBody, callback, - request_id]() { - Json::Value responseData(Json::arrayValue); - - if (jsonBody->isMember("input")) { - const Json::Value& input = (*jsonBody)["input"]; - if (input.isString()) { - // Process the single string input - state->task_id = llama.request_completion( - {{"prompt", input.asString()}, {"n_predict", 0}}, false, true, -1); - task_result result = llama.next_result(state->task_id); - std::vector embedding_result = result.result_json["embedding"]; - responseData.append(create_embedding_payload(embedding_result, 0)); - } else if (input.isArray()) { - // Process each element in the array input - for (const auto& elem : input) { - if (elem.isString()) { - const int task_id = llama.request_completion( - {{"prompt", elem.asString()}, {"n_predict", 0}}, false, true, - -1); - task_result result = llama.next_result(task_id); - std::vector embedding_result = - result.result_json["embedding"]; - responseData.append(create_embedding_payload(embedding_result, 0)); - } - } - } - } - - Json::Value root; - root["data"] = responseData; - root["model"] = "_"; - root["object"] = "list"; - Json::Value usage; - usage["prompt_tokens"] = 0; - usage["total_tokens"] = 0; - root["usage"] = usage; - - auto resp = nitro_utils::nitroHttpJsonResponse(root); - callback(resp); - LOG_INFO_REQUEST(request_id) << "Embedding completed"; - }); + LOG_TRACE << "Start embedding"; + SyncQueue q; + engine_->HandleEmbedding(req->getJsonObject(), + [&q](Json::Value status, Json::Value res) { + q.push(std::make_pair(status, res)); + }); + LOG_TRACE << "Wait to embedding"; + ProcessNonStreamRes(std::move(callback), q); + LOG_TRACE << "Done embedding"; } void llamaCPP::UnloadModel( const HttpRequestPtr& req, std::function&& callback) { - Json::Value jsonResp; - if (CheckModelLoaded(callback)) { - StopBackgroundTask(); - - llama_free(llama.ctx); - llama_free_model(llama.model); - llama.ctx = nullptr; - llama.model = nullptr; - jsonResp["message"] = "Model unloaded successfully"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - callback(resp); - LOG_INFO << "Model unloaded successfully"; - } + LOG_TRACE << "Start unload model"; + engine_->UnloadModel( + req->getJsonObject(), + [cb = 
std::move(callback)](Json::Value status, Json::Value res) { + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode( + static_cast(status["status_code"].asInt())); + cb(resp); + }); + LOG_TRACE << "Done unload model"; } void llamaCPP::ModelStatus( const HttpRequestPtr& req, std::function&& callback) { - Json::Value jsonResp; - bool is_model_loaded = llama.model_loaded_external; - if (CheckModelLoaded(callback)) { - jsonResp["model_loaded"] = is_model_loaded; - jsonResp["model_data"] = llama.get_model_props().dump(); - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - callback(resp); - LOG_INFO << "Model status responded"; - } + LOG_TRACE << "Start to get model status"; + engine_->GetModelStatus( + req->getJsonObject(), + [cb = std::move(callback)](Json::Value status, Json::Value res) { + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode( + static_cast(status["status_code"].asInt())); + cb(resp); + }); + LOG_TRACE << "Done get model status"; } void llamaCPP::LoadModel( const HttpRequestPtr& req, std::function&& callback) { - - if (!nitro_utils::isAVX2Supported() && ggml_cpu_has_avx2()) { - LOG_ERROR << "AVX2 is not supported by your processor"; - Json::Value jsonResp; - jsonResp["message"] = - "AVX2 is not supported by your processor, please download and replace " - "the correct Nitro asset version"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k500InternalServerError); - callback(resp); - return; - } - - if (llama.model_loaded_external) { - LOG_INFO << "Model already loaded"; - Json::Value jsonResp; - jsonResp["message"] = "Model already loaded"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k409Conflict); - callback(resp); - return; - } - - const auto& jsonBody = req->getJsonObject(); - if (!LoadModelImpl(jsonBody)) { - // Error occurred during model loading - Json::Value jsonResp; - jsonResp["message"] = "Failed to load model"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k500InternalServerError); - callback(resp); - } else { - // Model loaded successfully - Json::Value jsonResp; - jsonResp["message"] = "Model loaded successfully"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - callback(resp); - LOG_INFO << "Model loaded successfully"; - } -} - -bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { - gpt_params params; - std::string model_type; - // By default will setting based on number of handlers - if (jsonBody) { - if (!jsonBody->operator[]("mmproj").isNull()) { - LOG_INFO << "MMPROJ FILE detected, multi-model enabled!"; - params.mmproj = jsonBody->operator[]("mmproj").asString(); - } - if (!jsonBody->operator[]("grp_attn_n").isNull()) { - params.grp_attn_n = jsonBody->operator[]("grp_attn_n").asInt(); - } - if (!jsonBody->operator[]("grp_attn_w").isNull()) { - params.grp_attn_w = jsonBody->operator[]("grp_attn_w").asInt(); - } - if (!jsonBody->operator[]("mlock").isNull()) { - params.use_mlock = jsonBody->operator[]("mlock").asBool(); + LOG_TRACE << "Load model"; + engine_->LoadModel( + req->getJsonObject(), + [cb = std::move(callback)](Json::Value status, Json::Value res) { + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode( + static_cast(status["status_code"].asInt())); + cb(resp); + }); + LOG_TRACE << "Done load model"; +} + +void llamaCPP::ProcessStreamRes(std::function cb, + std::shared_ptr q) { + auto err_or_done = std::make_shared(false); + auto 
chunked_content_provider = [q, err_or_done]( + char* buf, + std::size_t buf_size) -> std::size_t { + if (buf == nullptr) { + LOG_TRACE << "Buf is null"; + return 0; } - if (!jsonBody->operator[]("grammar_file").isNull()) { - std::string grammar_file = - jsonBody->operator[]("grammar_file").asString(); - std::ifstream file(grammar_file); - if (!file) { - LOG_ERROR << "Grammar file not found"; - } else { - std::stringstream grammarBuf; - grammarBuf << file.rdbuf(); - grammar_file_content = grammarBuf.str(); - } - }; - - Json::Value model_path = jsonBody->operator[]("llama_model_path"); - if (model_path.isNull()) { - LOG_ERROR << "Missing model path in request"; - } else { - if (std::filesystem::exists( - std::filesystem::path(model_path.asString()))) { - params.model = model_path.asString(); - } else { - LOG_ERROR << "Could not find model in path " << model_path.asString(); - } + if (*err_or_done) { + LOG_TRACE << "Done"; + return 0; } - params.n_gpu_layers = jsonBody->get("ngl", 100).asInt(); - params.n_ctx = jsonBody->get("ctx_len", 2048).asInt(); - params.embedding = jsonBody->get("embedding", true).asBool(); - model_type = jsonBody->get("model_type", "llm").asString(); - if (model_type == "llm") { - llama.model_type = ModelType::LLM; - } else { - llama.model_type = ModelType::EMBEDDING; - } - // Check if n_parallel exists in jsonBody, if not, set to drogon_thread - params.n_batch = jsonBody->get("n_batch", 512).asInt(); - params.n_parallel = jsonBody->get("n_parallel", 1).asInt(); - params.n_threads = - jsonBody->get("cpu_threads", std::thread::hardware_concurrency()) - .asInt(); - params.cont_batching = jsonBody->get("cont_batching", false).asBool(); - this->clean_cache_threshold = - jsonBody->get("clean_cache_threshold", 5).asInt(); - this->caching_enabled = jsonBody->get("caching_enabled", false).asBool(); - this->user_prompt = jsonBody->get("user_prompt", "USER: ").asString(); - this->ai_prompt = jsonBody->get("ai_prompt", "ASSISTANT: ").asString(); - this->system_prompt = - jsonBody->get("system_prompt", "ASSISTANT's RULE: ").asString(); - this->pre_prompt = jsonBody->get("pre_prompt", "").asString(); - this->repeat_last_n = jsonBody->get("repeat_last_n", 32).asInt(); + auto [status, res] = q->wait_and_pop(); - if (!jsonBody->operator[]("llama_log_folder").isNull()) { - log_enable(); - std::string llama_log_folder = - jsonBody->operator[]("llama_log_folder").asString(); - log_set_target(llama_log_folder + "llama.log"); - } // Set folder for llama log - } - if (params.model_alias == "unknown") { - params.model_alias = params.model; - } - - llama_backend_init(); - - // LOG_INFO_LLAMA("build info", - // {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); - LOG_INFO_LLAMA("system info", - { - {"n_threads", params.n_threads}, - {"total_threads", std::thread::hardware_concurrency()}, - {"system_info", llama_print_system_info()}, - }); - - // load the model - if (!llama.load_model(params)) { - LOG_ERROR << "Error loading the model"; - return false; // Indicate failure - } - llama.initialize(); - - if (queue != nullptr) { - delete queue; - } - - queue = new trantor::ConcurrentTaskQueue(llama.params.n_parallel, "llamaCPP"); - - llama.model_loaded_external = true; + if (status["has_error"].asBool() || status["is_done"].asBool()) { + *err_or_done = true; + } - LOG_INFO << "Started background task here!"; - backgroundThread = std::thread(&llamaCPP::BackgroundTask, this); + auto str = res["data"].asString(); + LOG_TRACE << "data: " << str; + std::size_t n = std::min(str.size(), buf_size); + 
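      // Copy at most buf_size bytes of the current chunk into the buffer supplied by Drogon
      // and report the number of bytes written via the return value.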
memcpy(buf, str.data(), n); - // For model like nomic-embed-text-v1.5.f16.gguf, etc, we don't need to warm up model. - // So we use this variable to differentiate with other models - if (llama.model_type == ModelType::LLM) { - WarmupModel(); - } - return true; -} + return n; + }; -void llamaCPP::BackgroundTask() { - while (llama.model_loaded_external) { - // model_loaded = - llama.update_slots(); - } - LOG_INFO << "Background task stopped! "; - llama.kv_cache_clear(); - LOG_INFO << "KV cache cleared!"; - return; + auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider, + "chat_completions.txt"); + cb(resp); } -void llamaCPP::StopBackgroundTask() { - if (llama.model_loaded_external) { - llama.model_loaded_external = false; - llama.condition_tasks.notify_one(); - LOG_INFO << "Stopping background task! "; - if (backgroundThread.joinable()) { - backgroundThread.join(); - } - LOG_INFO << "Background task stopped! "; - } +void llamaCPP::ProcessNonStreamRes( + std::function cb, SyncQueue& q) { + auto [status, res] = q.wait_and_pop(); + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode( + static_cast(status["status_code"].asInt())); + cb(resp); } +} // namespace inferences \ No newline at end of file diff --git a/cortex-cpp/controllers/llamaCPP.h b/cortex-cpp/controllers/llamaCPP.h index 531c18b20..62b554f6b 100644 --- a/cortex-cpp/controllers/llamaCPP.h +++ b/cortex-cpp/controllers/llamaCPP.h @@ -11,18 +11,15 @@ #define CPPHTTPLIB_NO_EXCEPTIONS 1 #endif -#include +#include #include #include -#include #include "common/base.h" -#include "context/llama_server_context.h" -#include "stb_image.h" +#include "cortex-common/EngineI.h" +#include "trantor/utils/SerialTaskQueue.h" +#include "utils/dylib.h" #include "utils/json.hpp" - -#include "models/chat_completion_request.h" - #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 #endif @@ -37,6 +34,8 @@ class llamaCPP : public drogon::HttpController, public BaseModel, public BaseChatCompletion, public BaseEmbedding { + struct SyncQueue; + public: llamaCPP(); ~llamaCPP(); @@ -59,7 +58,7 @@ class llamaCPP : public drogon::HttpController, // PATH_ADD("/llama/chat_completion", Post); METHOD_LIST_END void ChatCompletion( - inferences::ChatCompletionRequest&& completion, + const HttpRequestPtr& req, std::function&& callback) override; void Embedding( const HttpRequestPtr& req, @@ -75,36 +74,52 @@ class llamaCPP : public drogon::HttpController, std::function&& callback) override; private: - llama_server_context llama; - // std::atomic model_loaded = false; - size_t sent_count = 0; - size_t sent_token_probs_index = 0; - std::thread backgroundThread; - std::string user_prompt; - std::string ai_prompt; - std::string system_prompt; - std::string pre_prompt; - int repeat_last_n; - bool caching_enabled; - std::atomic no_of_requests = 0; - std::atomic no_of_chats = 0; - int clean_cache_threshold; - std::string grammar_file_content; - - /** - * Queue to handle the inference tasks - */ - trantor::ConcurrentTaskQueue* queue; - - bool LoadModelImpl(std::shared_ptr jsonBody); - void InferenceImpl(inferences::ChatCompletionRequest&& completion, - std::function&& callback); - void EmbeddingImpl(std::shared_ptr jsonBody, - std::function&& callback); - bool CheckModelLoaded( - const std::function& callback); - void WarmupModel(); - void BackgroundTask(); - void StopBackgroundTask(); + void ProcessStreamRes(std::function cb, + std::shared_ptr q); + void ProcessNonStreamRes(std::function cb, + SyncQueue& q); + + private: + struct SyncQueue { + 
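    // Minimal blocking queue: the engine callback pushes (status, result) pairs and the
    // HTTP handler blocks in wait_and_pop() until a pair is available.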
void push(std::pair&& p) { + std::unique_lock l(mtx); + q.push(p); + cond.notify_one(); + } + + std::pair wait_and_pop() { + std::unique_lock l(mtx); + cond.wait(l, [this] { return !q.empty(); }); + auto res = q.front(); + q.pop(); + return res; + } + + std::mutex mtx; + std::condition_variable cond; + // Status and result + std::queue> q; + }; + struct StreamStatus { + void Done() { + std::unique_lock l(m); + stream_done = true; + cv.notify_all(); + } + + void Wait() { + std::unique_lock l(m); + cv.wait(l, [this] { return stream_done; }); + } + + private: + std::mutex m; + std::condition_variable cv; + bool stream_done = false; + }; + + private: + std::unique_ptr dylib_; + EngineI* engine_; }; -}; // namespace inferences +}; // namespace inferences \ No newline at end of file diff --git a/cortex-cpp/cortex-common/EngineI.h b/cortex-cpp/cortex-common/EngineI.h new file mode 100644 index 000000000..b8770b230 --- /dev/null +++ b/cortex-cpp/cortex-common/EngineI.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +#include "json/value.h" + +class EngineI { + public: + virtual ~EngineI() {} + + virtual void HandleChatCompletion( + std::shared_ptr jsonBody, + std::function&& callback) = 0; + virtual void HandleEmbedding( + std::shared_ptr jsonBody, + std::function&& callback) = 0; + virtual void LoadModel( + std::shared_ptr jsonBody, + std::function&& callback) = 0; + virtual void UnloadModel( + std::shared_ptr jsonBody, + std::function&& callback) = 0; + virtual void GetModelStatus( + std::shared_ptr jsonBody, + std::function&& callback) = 0; +}; diff --git a/cortex-cpp/engines/cortex.llamacpp/engine.cmake b/cortex-cpp/engines/cortex.llamacpp/engine.cmake new file mode 100644 index 000000000..5b9070510 --- /dev/null +++ b/cortex-cpp/engines/cortex.llamacpp/engine.cmake @@ -0,0 +1,21 @@ +set(LIBLLAMA_ENGINE_VERSION v1.0) + +if(UNIX) + set(LIBRARY_NAME llama-engine-lib.so) +else() + set(LIBRARY_NAME engine.dll) +endif() + +set(LIBLLAMA_ENGINE_URL https://github.com/.../releases/download/${LIBLLAMA_ENGINE_VERSION}/${LIBRARY_NAME}) +set(LIBLLAMA_ENGINE_PATH ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME}) + +MESSAGE("CMAKE_BINARY_DIR = " ${CMAKE_BINARY_DIR}) + +file(DOWNLOAD ${LIBLLAMA_ENGINE_URL} ${LIBLLAMA_ENGINE_PATH} STATUS LIBLLAMA_ENGINE_DOWNLOAD_STATUS) +list(GET LIBLLAMA_ENGINE_DOWNLOAD_STATUS 0 LIBLLAMA_ENGINE_DOWNLOAD_STATUS_NO) + +if(LIBLLAMA_ENGINE_DOWNLOAD_STATUS_NO) + message(STATUS "Pre-built library not downloaded. 
(${LIBLLAMA_ENGINE_DOWNLOAD_STATUS})") +else() + message(STATUS "Linking downloaded pre-built library.") +endif() \ No newline at end of file diff --git a/cortex-cpp/llama.cpp b/cortex-cpp/llama.cpp deleted file mode 160000 index 8a56075b0..000000000 --- a/cortex-cpp/llama.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 8a56075b07a8b571bf95a912ffdce4c928c2b414 diff --git a/cortex-cpp/models/chat_completion_request.h b/cortex-cpp/models/chat_completion_request.h deleted file mode 100644 index f4fd087f5..000000000 --- a/cortex-cpp/models/chat_completion_request.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once -#include - -namespace inferences { -struct ChatCompletionRequest { - bool stream = false; - int max_tokens = 500; - float top_p = 0.95f; - float temperature = 0.8f; - float frequency_penalty = 0; - float presence_penalty = 0; - Json::Value stop = Json::Value(Json::arrayValue); - Json::Value messages = Json::Value(Json::arrayValue); -}; -} // namespace inferences - -namespace drogon { -template <> -inline inferences::ChatCompletionRequest fromRequest(const HttpRequest& req) { - auto jsonBody = req.getJsonObject(); - inferences::ChatCompletionRequest completion; - if (jsonBody) { - completion.stream = (*jsonBody).get("stream", false).asBool(); - completion.max_tokens = (*jsonBody).get("max_tokens", 500).asInt(); - completion.top_p = (*jsonBody).get("top_p", 0.95).asFloat(); - completion.temperature = (*jsonBody).get("temperature", 0.8).asFloat(); - completion.frequency_penalty = - (*jsonBody).get("frequency_penalty", 0).asFloat(); - completion.presence_penalty = - (*jsonBody).get("presence_penalty", 0).asFloat(); - completion.messages = (*jsonBody)["messages"]; - completion.stop = (*jsonBody)["stop"]; - } - return completion; -} -} // namespace inferences diff --git a/cortex-cpp/nitro_deps/CMakeLists.txt b/cortex-cpp/nitro_deps/CMakeLists.txt index 29b9b0186..667024cd7 100644 --- a/cortex-cpp/nitro_deps/CMakeLists.txt +++ b/cortex-cpp/nitro_deps/CMakeLists.txt @@ -5,6 +5,9 @@ project(MyProject) include(ExternalProject) # Define variables +if(UNIX AND NOT APPLE) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(THIRD_PARTY_INSTALL_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../build_deps/_install) #if(NOT THIRD_PARTY_INSTALL_PATH ) @@ -34,8 +37,7 @@ ExternalProject_Add( brotli GIT_REPOSITORY https://github.com/google/brotli GIT_TAG v1.1.0 - CMAKE_ARGS - -DCMAKE_BUILD_TYPE=Release + CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF -DSHARE_INSTALL_PREFIX=${CMAKE_CURRENT_SOURCE_DIR}/share -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} @@ -47,6 +49,7 @@ ExternalProject_Add( GIT_TAG 1.9.5 CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} ) @@ -65,7 +68,7 @@ ExternalProject_Add( GIT_REPOSITORY https://github.com/drogonframework/drogon GIT_TAG v1.9.2 CMAKE_ARGS - -DCMAKE_BUILD_TYPE=release + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DOPENSSL_USE_STATIC_LIBS=TRUE -DZLIB_USE_STATIC_LIBS=${ZLIB_USE_STATIC_LIBS} -DBUILD_ORM=OFF @@ -86,7 +89,6 @@ ExternalProject_Add( GIT_TAG v1.14.0 CMAKE_ARGS -Dgtest_force_shared_crt=ON - -DCMAKE_BUILD_TYPE=release -DCMAKE_PREFIX_PATH=${THIRD_PARTY_INSTALL_PATH} -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} ) @@ -111,4 +113,3 @@ if(ZLIB_USE_STATIC_LIBS) endif() # target_link_libraries( ...) 
- diff --git a/cortex-cpp/test/components/test_models.cc b/cortex-cpp/test/components/test_models.cc deleted file mode 100644 index ac27e97dd..000000000 --- a/cortex-cpp/test/components/test_models.cc +++ /dev/null @@ -1,53 +0,0 @@ -#include "gtest/gtest.h" -#include "models/chat_completion_request.h" - -using inferences::ChatCompletionRequest; - -class ModelTest : public ::testing::Test { -}; - - -TEST_F(ModelTest, should_parse_request) { - { - Json::Value data; - auto req = drogon::HttpRequest::newHttpJsonRequest(data); - - auto res = - drogon::fromRequest(*req.get()); - - EXPECT_EQ(res.stream, false); - EXPECT_EQ(res.max_tokens, 500); - EXPECT_EQ(res.top_p, 0.95f); - EXPECT_EQ(res.temperature, 0.8f); - EXPECT_EQ(res.frequency_penalty, 0); - EXPECT_EQ(res.presence_penalty, 0); - EXPECT_EQ(res.stop, Json::Value{}); - EXPECT_EQ(res.messages, Json::Value{}); - } - - { - Json::Value data; - data["stream"] = true; - data["max_tokens"] = 400; - data["top_p"] = 0.8; - data["temperature"] = 0.7; - data["frequency_penalty"] = 0.1; - data["presence_penalty"] = 0.2; - data["messages"] = "message"; - data["stop"] = "stop"; - - auto req = drogon::HttpRequest::newHttpJsonRequest(data); - - auto res = - drogon::fromRequest(*req.get()); - - EXPECT_EQ(res.stream, true); - EXPECT_EQ(res.max_tokens, 400); - EXPECT_EQ(res.top_p, 0.8f); - EXPECT_EQ(res.temperature, 0.7f); - EXPECT_EQ(res.frequency_penalty, 0.1f); - EXPECT_EQ(res.presence_penalty, 0.2f); - EXPECT_EQ(res.stop, Json::Value{"stop"}); - EXPECT_EQ(res.messages, Json::Value{"message"}); - } -} diff --git a/cortex-cpp/utils/dylib.h b/cortex-cpp/utils/dylib.h new file mode 100644 index 000000000..806e2ee67 --- /dev/null +++ b/cortex-cpp/utils/dylib.h @@ -0,0 +1,316 @@ +/** + * @file dylib.hpp + * @version 2.2.1 + * @brief C++ cross-platform wrapper around dynamic loading of shared libraries + * @link https://github.com/martin-olivier/dylib + * + * @author Martin Olivier + * @copyright (c) 2023 Martin Olivier + * + * This library is released under MIT license + */ + +#pragma once + +#include +#include +#include + +#if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L) +#define DYLIB_CPP17 +#include +#endif + +#if (defined(_WIN32) || defined(_WIN64)) +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#define DYLIB_UNDEFINE_LEAN_AND_MEAN +#endif +#ifndef NOMINMAX +#define NOMINMAX +#define DYLIB_UNDEFINE_NOMINMAX +#endif +#include +#ifdef DYLIB_UNDEFINE_LEAN_AND_MEAN +#undef WIN32_LEAN_AND_MEAN +#undef DYLIB_UNDEFINE_LEAN_AND_MEAN +#endif +#ifdef DYLIB_UNDEFINE_NOMINMAX +#undef NOMINMAX +#undef DYLIB_UNDEFINE_NOMINMAX +#endif +#else +#include +#endif + +#if (defined(_WIN32) || defined(_WIN64)) +#define DYLIB_WIN_MAC_OTHER(win_def, mac_def, other_def) win_def +#define DYLIB_WIN_OTHER(win_def, other_def) win_def +#elif defined(__APPLE__) +#define DYLIB_WIN_MAC_OTHER(win_def, mac_def, other_def) mac_def +#define DYLIB_WIN_OTHER(win_def, other_def) other_def +#else +#define DYLIB_WIN_MAC_OTHER(win_def, mac_def, other_def) other_def +#define DYLIB_WIN_OTHER(win_def, other_def) other_def +#endif + +/** + * The `dylib` class represents a single dynamic library instance, + * allowing the access of symbols like functions or global variables + */ +class dylib { +public: + struct filename_components { + static constexpr const char *prefix = DYLIB_WIN_OTHER("", "lib"); + static constexpr const char *suffix = DYLIB_WIN_MAC_OTHER(".dll", ".dylib", ".so"); + }; + using native_handle_type = DYLIB_WIN_OTHER(HINSTANCE, void *); + using 
native_symbol_type = DYLIB_WIN_OTHER(FARPROC, void *); + + static_assert(std::is_pointer::value, "Expecting HINSTANCE to be a pointer"); + static_assert(std::is_pointer::value, "Expecting FARPROC to be a pointer"); + + static constexpr bool add_filename_decorations = true; + static constexpr bool no_filename_decorations = false; + + /** + * This exception is raised when a library fails to load or a symbol fails to resolve + */ + class exception : public std::runtime_error { + using std::runtime_error::runtime_error; + }; + + /** + * This exception is raised when a library fails to load + */ + class load_error : public exception { + using exception::exception; + }; + + /** + * This exception is raised when a symbol fails to resolve + */ + class symbol_error : public exception { + using exception::exception; + }; + + dylib(const dylib&) = delete; + dylib& operator=(const dylib&) = delete; + + dylib(dylib &&other) noexcept : m_handle(other.m_handle) { + other.m_handle = nullptr; + } + + dylib& operator=(dylib &&other) noexcept { + if (this != &other) + std::swap(m_handle, other.m_handle); + return *this; + } + + /** + * Loads a dynamic library + * + * @throws `dylib::load_error` if the library could not be opened (including + * the case of the library file not being found) + * @throws `std::invalid_argument` if the arguments are null + * + * @param dir_path the directory path where the dynamic library is located + * @param name the name of the dynamic library to load + * @param decorations adds OS-specific decorations to the library name + */ + ///@{ + dylib(const char *dir_path, const char *lib_name, bool decorations = add_filename_decorations) { + if (!dir_path) + throw std::invalid_argument("The directory path is null"); + if (!lib_name) + throw std::invalid_argument("The library name is null"); + + std::string final_name = lib_name; + std::string final_path = dir_path; + + if (decorations) + final_name = filename_components::prefix + final_name + filename_components::suffix; + + if (!final_path.empty() && final_path.find_last_of('/') != final_path.size() - 1) + final_path += '/'; + + m_handle = open((final_path + final_name).c_str()); + + if (!m_handle) + throw load_error("Could not load library \"" + final_path + final_name + "\"\n" + get_error_description()); + } + + dylib(const std::string &dir_path, const std::string &lib_name, bool decorations = add_filename_decorations) + : dylib(dir_path.c_str(), lib_name.c_str(), decorations) {} + + dylib(const std::string &dir_path, const char *lib_name, bool decorations = add_filename_decorations) + : dylib(dir_path.c_str(), lib_name, decorations) {} + + dylib(const char *dir_path, const std::string &lib_name, bool decorations = add_filename_decorations) + : dylib(dir_path, lib_name.c_str(), decorations) {} + + explicit dylib(const std::string &lib_name, bool decorations = add_filename_decorations) + : dylib("", lib_name.c_str(), decorations) {} + + explicit dylib(const char *lib_name, bool decorations = add_filename_decorations) + : dylib("", lib_name, decorations) {} + +#ifdef DYLIB_CPP17 + explicit dylib(const std::filesystem::path &lib_path) + : dylib("", lib_path.string().c_str(), no_filename_decorations) {} + + dylib(const std::filesystem::path &dir_path, const std::string &lib_name, bool decorations = add_filename_decorations) + : dylib(dir_path.string().c_str(), lib_name.c_str(), decorations) {} + + dylib(const std::filesystem::path &dir_path, const char *lib_name, bool decorations = add_filename_decorations) + : 
dylib(dir_path.string().c_str(), lib_name, decorations) {} +#endif + ///@} + + ~dylib() { + if (m_handle) + close(m_handle); + } + + /** + * Get a symbol from the currently loaded dynamic library + * + * @throws `dylib::symbol_error` if the symbol could not be found + * @throws `std::invalid_argument` if the argument or library handle is null + * + * @param symbol_name the symbol name to lookup + * + * @return a pointer to the requested symbol + */ + native_symbol_type get_symbol(const char *symbol_name) const { + if (!symbol_name) + throw std::invalid_argument("The symbol name to lookup is null"); + if (!m_handle) + throw std::logic_error("The dynamic library handle is null. This object may have been moved from."); + + auto symbol = locate_symbol(m_handle, symbol_name); + + if (symbol == nullptr) + throw symbol_error("Could not get symbol \"" + std::string(symbol_name) + "\"\n" + get_error_description()); + return symbol; + } + + native_symbol_type get_symbol(const std::string &symbol_name) const { + return get_symbol(symbol_name.c_str()); + } + + /** + * Get a function from the currently loaded dynamic library + * + * @throws `dylib::symbol_error` if the function could not be found + * @throws `std::invalid_argument` if the argument is null + * + * @tparam T the function type, e.g., `double(int, int)` + * @param symbol_name the function name to lookup + * + * @return a pointer to the requested function + */ + template + T *get_function(const char *symbol_name) const { +#if (defined(__GNUC__) && __GNUC__ >= 8) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-function-type" +#endif + return reinterpret_cast(get_symbol(symbol_name)); +#if (defined(__GNUC__) && __GNUC__ >= 8) +#pragma GCC diagnostic pop +#endif + } + + template + T *get_function(const std::string &symbol_name) const { + return get_function(symbol_name.c_str()); + } + + /** + * Get a variable from the currently loaded dynamic library + * + * @throws `dylib::symbol_error` if the variable could not be found + * @throws `std::invalid_argument` if the argument is null + * + * @tparam T the variable type + * @param symbol_name the variable name to lookup + * + * @return a reference to the requested variable + */ + template + T &get_variable(const char *symbol_name) const { + return *reinterpret_cast(get_symbol(symbol_name)); + } + + template + T &get_variable(const std::string &symbol_name) const { + return get_variable(symbol_name.c_str()); + } + + /** + * Check if a symbol exists in the currently loaded dynamic library. 
+ * This method will return false if no dynamic library is currently loaded + * or if the symbol name is nullptr + * + * @param symbol_name the symbol name to look for + * + * @return true if the symbol exists in the dynamic library, false otherwise + */ + bool has_symbol(const char *symbol_name) const noexcept { + if (!m_handle || !symbol_name) + return false; + return locate_symbol(m_handle, symbol_name) != nullptr; + } + + bool has_symbol(const std::string &symbol) const noexcept { + return has_symbol(symbol.c_str()); + } + + /** + * @return the dynamic library handle + */ + native_handle_type native_handle() noexcept { + return m_handle; + } + +protected: + native_handle_type m_handle{nullptr}; + + static native_handle_type open(const char *path) noexcept { +#if (defined(_WIN32) || defined(_WIN64)) + return LoadLibraryA(path); +#else + return dlopen(path, RTLD_NOW | RTLD_LOCAL); +#endif + } + + static native_symbol_type locate_symbol(native_handle_type lib, const char *name) noexcept { + return DYLIB_WIN_OTHER(GetProcAddress, dlsym)(lib, name); + } + + static void close(native_handle_type lib) noexcept { + DYLIB_WIN_OTHER(FreeLibrary, dlclose)(lib); + } + + static std::string get_error_description() noexcept { +#if (defined(_WIN32) || defined(_WIN64)) + constexpr const size_t BUF_SIZE = 512; + const auto error_code = GetLastError(); + if (!error_code) + return "No error reported by GetLastError"; + char description[BUF_SIZE]; + const auto lang = MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US); + const DWORD length = + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, nullptr, error_code, lang, description, BUF_SIZE, nullptr); + return (length == 0) ? "Unknown error (FormatMessage failed)" : description; +#else + const auto description = dlerror(); + return (description == nullptr) ? "No error reported by dlerror" : description; +#endif + } +}; + +#undef DYLIB_WIN_MAC_OTHER +#undef DYLIB_WIN_OTHER +#undef DYLIB_CPP17 \ No newline at end of file From f1a7105325b2174fee8b033b49f35bcf1da93019 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 3 May 2024 08:29:13 +0700 Subject: [PATCH 03/10] feat: llama engine --- .github/workflows/build.yml | 60 ++++++++++++------- cortex-cpp/CMakeLists.txt | 2 +- .../engines/cortex.llamacpp/engine.cmake | 51 ++++++++++++++-- 3 files changed, 88 insertions(+), 25 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 90e509d55..318b984a1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -171,7 +171,9 @@ jobs: shell: bash run: | mkdir -p nitro + mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ + cp build/engines/cortex.llamacpp/libengine.so nitro/engines/cortex.llamacpp/ tar -czvf nitro.tar.gz nitro - name: Upload Artifact @@ -218,7 +220,11 @@ jobs: contents: write strategy: matrix: - cuda: ["12-0", "11-7"] + include: + - cuda: "12-0" + flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF" + - cuda: "11-7" + flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON" steps: - name: Clone @@ -232,7 +238,7 @@ jobs: run: | ./install_deps.sh mkdir build && cd build - cmake -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. + cmake ${{matrix.flags}} -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. 
make -j $(nproc) ls -la @@ -240,7 +246,9 @@ jobs: shell: bash run: | mkdir -p nitro + mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ + cp build/engines/cortex.llamacpp/libengine.so nitro/engines/cortex.llamacpp/ tar -czvf nitro.tar.gz nitro - name: Upload Artifact @@ -295,7 +303,9 @@ jobs: run: | mkdir -p nitro cp llama.cpp/ggml-metal.metal nitro/ + mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ + cp build/engines/cortex.llamacpp/libengine.dylib nitro/engines/cortex.llamacpp/ - name: Upload Artifact uses: actions/upload-artifact@v2 @@ -361,7 +371,9 @@ jobs: shell: bash run: | mkdir -p nitro + mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ + cp build/engines/cortex.llamacpp/libengine.dylib nitro/engines/cortex.llamacpp/ - name: Upload Artifact uses: actions/upload-artifact@v2 @@ -408,8 +420,10 @@ jobs: - name: bundle universal binary run: | mkdir -p nitro + mkdir -p nitro/engines/cortex.llamacpp/ ls ./nitro-mac-amd64 lipo -create ./nitro-mac-amd64/nitro ./nitro-mac-arm64/nitro -output ./nitro/nitro + lipo -create ./nitro-mac-amd64/engines/cortex.llamacpp/libengine.dylib ./nitro-mac-arm64/engines/cortex.llamacpp/libengine.dylib -output ./nitro/engines/cortex.llamacpp/libengine.dylib cp ./nitro-mac-arm64/ggml-metal.metal ./nitro/ggml-metal.metal tar -czvf nitro.tar.gz nitro @@ -503,6 +517,8 @@ jobs: id: pack_artifacts shell: cmd run: | + mkdir .\build\Release\engines\cortex.llamacpp\ + robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll robocopy build\bin\Release\ .\build\Release\ llama.dll robocopy build\bin\Release\ .\build\Release\ whisper.dll @@ -558,8 +574,25 @@ jobs: strategy: matrix: - cuda: ["12-0", "11-7"] - instructions: ["amd64-avx2", "amd64-avx", "amd64-avx512"] + include: + - cuda: "12-0" + instructions: "-DLLAMA_NATIVE=OFF" + flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + - cuda: "12-0" + instructions: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" + flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + - cuda: "12-0" + instructions: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" + flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + - cuda: "11-7" + instructions: "-DLLAMA_NATIVE=OFF" + flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + - cuda: "11-7" + instructions: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" + flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + - cuda: "11-7" + instructions: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" + flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" steps: - name: Setup VSWhere.exe @@ -600,21 +633,6 @@ jobs: with: dotnet-version: "6.0.x" - # Conditional instruction check and set environment variable - - name: Set INSTRUCTION Based on Instructions ${{ matrix.instructions }} - shell: cmd - run: | - IF "${{ matrix.instructions }}" == "amd64-avx2" ( - echo "INSTRUCTION=-DLLAMA_NATIVE=OFF" >> $env:GITHUB_ENV - echo 
"INSTRUCTION=-DLLAMA_NATIVE=OFF" - ) ELSE IF "${{ matrix.instructions }}" == "amd64-avx" ( - echo "INSTRUCTION=-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" >> $env:GITHUB_ENV - echo "INSTRUCTION=-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - ) ELSE IF "${{ matrix.instructions }}" == "amd64-avx512" ( - echo "INSTRUCTION=-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" >> $env:GITHUB_ENV - echo "INSTRUCTION=-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - ) - - name: Build id: cmake_build shell: cmd @@ -623,7 +641,7 @@ jobs: cmake --build ./build_deps/nitro_deps --config Release mkdir -p build cd build - cmake .. %INSTRUCTION% -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} + cmake .. ${{ matrix.instructions }} ${{ matrix.flags }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} cmake --build . --config Release -j "%NUMBER_OF_PROCESSORS%" - name: Pack artifacts @@ -631,6 +649,8 @@ jobs: shell: cmd run: | set PATH=%PATH%;C:\Program Files\7-Zip\ + mkdir .\build\Release\engines\cortex.llamacpp\ + robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll robocopy build\bin\Release\ .\build\Release\ llama.dll robocopy build\bin\Release\ .\build\Release\ whisper.dll diff --git a/cortex-cpp/CMakeLists.txt b/cortex-cpp/CMakeLists.txt index 9be937dc4..4bb269aaa 100644 --- a/cortex-cpp/CMakeLists.txt +++ b/cortex-cpp/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.5) project(nitro C CXX) -# include(engines/cortex.llamacpp/engine.cmake) +include(engines/cortex.llamacpp/engine.cmake) include(CheckIncludeFileCXX) check_include_file_cxx(any HAS_ANY) diff --git a/cortex-cpp/engines/cortex.llamacpp/engine.cmake b/cortex-cpp/engines/cortex.llamacpp/engine.cmake index 5b9070510..4b879a1ca 100644 --- a/cortex-cpp/engines/cortex.llamacpp/engine.cmake +++ b/cortex-cpp/engines/cortex.llamacpp/engine.cmake @@ -1,21 +1,64 @@ set(LIBLLAMA_ENGINE_VERSION v1.0) -if(UNIX) - set(LIBRARY_NAME llama-engine-lib.so) +# Download library based on instruction +# TODO(sang) correct library name when cortex.llamacpp CI is done +if(UNIX AND NOT APPLE) + if(CUDA_12_0) + set(LIBRARY_NAME linux-amd64-cuda-12-0.tar.gz) + elseif(CUDA_11_7) + set(LIBRARY_NAME linux-amd64-cuda-11-7.tar.gz) + elseif(LLAMA_VULKAN) + set(LIBRARY_NAME linux-amd64-vulkan.tar.gz) + elseif(DLLAMA_AVX512) + set(LIBRARY_NAME linux-amd64-avx512.tar.gz) + elseif(DLLAMA_AVX2) + set(LIBRARY_NAME linux-amd64-avx2.tar.gz) + else() + set(LIBRARY_NAME linux-amd64-avx.tar.gz) + endif() +elseif(UNIX) + set(LIBRARY_NAME mac-universal.tar.gz) else() - set(LIBRARY_NAME engine.dll) + if(CUDA_12_0) + if(DLLAMA_AVX512) + set(LIBRARY_NAME win-amd64-avx512-cuda-12-0.tar.gz) + elseif(DLLAMA_AVX2) + set(LIBRARY_NAME win-amd64-avx2-cuda-12-0.tar.gz) + else() + set(LIBRARY_NAME win-amd64-avx-cuda-12-0.tar.gz) + endif() + elseif(CUDA_11_7) + if(DLLAMA_AVX512) + set(LIBRARY_NAME win-amd64-avx512-cuda-11-7.tar.gz) + elseif(DLLAMA_AVX2) + set(LIBRARY_NAME win-amd64-avx2-cuda-11-7.tar.gz) + else() + set(LIBRARY_NAME win-amd64-avx-cuda-11-7.tar.gz) + endif() + elseif(LLAMA_VULKAN) + set(LIBRARY_NAME win-amd64-vulkan.tar.gz) + elseif(DLLAMA_AVX512) + set(LIBRARY_NAME win-amd64-avx512.tar.gz) + elseif(DLLAMA_AVX2) + set(LIBRARY_NAME win-amd64-avx2.tar.gz) + else() + set(LIBRARY_NAME win-amd64-avx.tar.gz) + endif() endif() set(LIBLLAMA_ENGINE_URL 
https://github.com/.../releases/download/${LIBLLAMA_ENGINE_VERSION}/${LIBRARY_NAME}) + set(LIBLLAMA_ENGINE_PATH ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME}) -MESSAGE("CMAKE_BINARY_DIR = " ${CMAKE_BINARY_DIR}) +# MESSAGE("CMAKE_BINARY_DIR = " ${CMAKE_BINARY_DIR}) file(DOWNLOAD ${LIBLLAMA_ENGINE_URL} ${LIBLLAMA_ENGINE_PATH} STATUS LIBLLAMA_ENGINE_DOWNLOAD_STATUS) list(GET LIBLLAMA_ENGINE_DOWNLOAD_STATUS 0 LIBLLAMA_ENGINE_DOWNLOAD_STATUS_NO) +# MESSAGE("file = " ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME}) if(LIBLLAMA_ENGINE_DOWNLOAD_STATUS_NO) message(STATUS "Pre-built library not downloaded. (${LIBLLAMA_ENGINE_DOWNLOAD_STATUS})") else() message(STATUS "Linking downloaded pre-built library.") + file(ARCHIVE_EXTRACT INPUT ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME} DESTINATION ${CMAKE_BINARY_DIR}/engines/) endif() \ No newline at end of file From 775a62488537b2f88fcabd3138db5b3ea24f0c98 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 3 May 2024 09:39:05 +0700 Subject: [PATCH 04/10] fix: only load engine when load model --- cortex-cpp/controllers/llamaCPP.cc | 105 +++++++++++++++++++++++++---- cortex-cpp/controllers/llamaCPP.h | 2 + 2 files changed, 95 insertions(+), 12 deletions(-) diff --git a/cortex-cpp/controllers/llamaCPP.cc b/cortex-cpp/controllers/llamaCPP.cc index 3beaae0e9..d3e489072 100644 --- a/cortex-cpp/controllers/llamaCPP.cc +++ b/cortex-cpp/controllers/llamaCPP.cc @@ -11,21 +11,35 @@ using namespace inferences; using json = nlohmann::json; namespace inferences { -llamaCPP::llamaCPP() { - dylib_ = std::make_unique("./engines/cortex.llamacpp", "engine"); - auto func = dylib_->get_function("get_engine"); - engine_ = func(); - // Some default values for now below - // log_disable(); // Disable the log to file feature, reduce bloat for - // target - // system () -}; +namespace { +constexpr static auto kLlamaEngine = "cortex.llamacpp"; +constexpr static auto kLlamaLibPath = "./engines/cortex.llamacpp"; +} // namespace + +llamaCPP::llamaCPP() + : engine_{nullptr} { + + // Some default values for now below + // log_disable(); // Disable the log to file feature, reduce bloat for + // target + // system () + }; llamaCPP::~llamaCPP() {} void llamaCPP::ChatCompletion( const HttpRequestPtr& req, std::function&& callback) { + if (!IsEngineLoaded()) { + Json::Value res; + res["message"] = "Engine is not loaded yet"; + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(k409Conflict); + callback(resp); + LOG_WARN << "Engine is not loaded yet"; + return; + } + LOG_TRACE << "Start chat completion"; auto json_body = req->getJsonObject(); bool is_stream = (*json_body).get("stream", false).asBool(); @@ -46,6 +60,16 @@ void llamaCPP::ChatCompletion( void llamaCPP::Embedding( const HttpRequestPtr& req, std::function&& callback) { + if (!IsEngineLoaded()) { + Json::Value res; + res["message"] = "Engine is not loaded yet"; + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(k409Conflict); + callback(resp); + LOG_WARN << "Engine is not loaded yet"; + return; + } + LOG_TRACE << "Start embedding"; SyncQueue q; engine_->HandleEmbedding(req->getJsonObject(), @@ -60,6 +84,15 @@ void llamaCPP::Embedding( void llamaCPP::UnloadModel( const HttpRequestPtr& req, std::function&& callback) { + if (!IsEngineLoaded()) { + Json::Value res; + res["message"] = "Engine is not loaded yet"; + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(k409Conflict); + callback(resp); + LOG_WARN << "Engine is not loaded yet"; + return; + } LOG_TRACE << 
"Start unload model"; engine_->UnloadModel( req->getJsonObject(), @@ -75,6 +108,16 @@ void llamaCPP::UnloadModel( void llamaCPP::ModelStatus( const HttpRequestPtr& req, std::function&& callback) { + if (!IsEngineLoaded()) { + Json::Value res; + res["message"] = "Engine is not loaded yet"; + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(k409Conflict); + callback(resp); + LOG_WARN << "Engine is not loaded yet"; + return; + } + LOG_TRACE << "Start to get model status"; engine_->GetModelStatus( req->getJsonObject(), @@ -90,6 +133,40 @@ void llamaCPP::ModelStatus( void llamaCPP::LoadModel( const HttpRequestPtr& req, std::function&& callback) { + auto engine_type = + (*(req->getJsonObject())).get("engine", kLlamaEngine).asString(); + if (!dylib_ || engine_type != cur_engine_name_) { + cur_engine_name_ = engine_type; + // TODO: change this when we get more engines + auto get_engine_path = [](std::string_view e) { + if (e == kLlamaEngine) { + return kLlamaLibPath; + } + return kLlamaLibPath; + }; + + try { + dylib_ = + std::make_unique(get_engine_path(cur_engine_name_), "engine"); + } catch (const dylib::load_error& e) { + LOG_ERROR << "Could not load engine: " << e.what(); + dylib_.reset(); + engine_ = nullptr; + } + + if (!dylib_) { + Json::Value res; + res["message"] = "Could not load engine " + cur_engine_name_; + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(k500InternalServerError); + callback(resp); + return; + } + auto func = dylib_->get_function("get_engine"); + engine_ = func(); + LOG_INFO << "Loaded engine: " << cur_engine_name_; + } + LOG_TRACE << "Load model"; engine_->LoadModel( req->getJsonObject(), @@ -105,9 +182,8 @@ void llamaCPP::LoadModel( void llamaCPP::ProcessStreamRes(std::function cb, std::shared_ptr q) { auto err_or_done = std::make_shared(false); - auto chunked_content_provider = [q, err_or_done]( - char* buf, - std::size_t buf_size) -> std::size_t { + auto chunked_content_provider = + [q, err_or_done](char* buf, std::size_t buf_size) -> std::size_t { if (buf == nullptr) { LOG_TRACE << "Buf is null"; return 0; @@ -145,4 +221,9 @@ void llamaCPP::ProcessNonStreamRes( static_cast(status["status_code"].asInt())); cb(resp); } + +bool llamaCPP::IsEngineLoaded() { + return !!engine_; +} + } // namespace inferences \ No newline at end of file diff --git a/cortex-cpp/controllers/llamaCPP.h b/cortex-cpp/controllers/llamaCPP.h index 62b554f6b..691b20cb4 100644 --- a/cortex-cpp/controllers/llamaCPP.h +++ b/cortex-cpp/controllers/llamaCPP.h @@ -78,6 +78,7 @@ class llamaCPP : public drogon::HttpController, std::shared_ptr q); void ProcessNonStreamRes(std::function cb, SyncQueue& q); + bool IsEngineLoaded(); private: struct SyncQueue { @@ -121,5 +122,6 @@ class llamaCPP : public drogon::HttpController, private: std::unique_ptr dylib_; EngineI* engine_; + std::string cur_engine_name_; }; }; // namespace inferences \ No newline at end of file From 9adc9d94b7938a4b342ed7a50c13f83d9bf42110 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 3 May 2024 13:16:28 +0700 Subject: [PATCH 05/10] chore: bump whisper.cpp --- cortex-cpp/whisper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex-cpp/whisper.cpp b/cortex-cpp/whisper.cpp index 9fab28135..58210d6a7 160000 --- a/cortex-cpp/whisper.cpp +++ b/cortex-cpp/whisper.cpp @@ -1 +1 @@ -Subproject commit 9fab28135c7867bb7eccd9ebcd2ea8d52e42ca81 +Subproject commit 58210d6a7634ea1e42e0a2dab611f4a0518731dc From ad799c35707d8be7082245ea11cdeef3c78e584e Mon Sep 17 
00:00:00 2001 From: vansangpfiev Date: Sat, 4 May 2024 08:56:32 +0700 Subject: [PATCH 06/10] fix: correct library name --- .github/workflows/build.yml | 6 +- .../engines/cortex.llamacpp/engine.cmake | 68 +++++++++++-------- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 318b984a1..2871f9a4b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -294,7 +294,7 @@ jobs: run: | ./install_deps.sh mkdir build && cd build - cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. + cmake -DMAC_ARM64=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. CC=gcc-8 make -j $(sysctl -n hw.ncpu) ls -la @@ -363,7 +363,7 @@ jobs: run: | ./install_deps.sh mkdir build && cd build - cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. + cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. CC=gcc-8 make -j $(sysctl -n hw.ncp) ls -la @@ -423,7 +423,7 @@ jobs: mkdir -p nitro/engines/cortex.llamacpp/ ls ./nitro-mac-amd64 lipo -create ./nitro-mac-amd64/nitro ./nitro-mac-arm64/nitro -output ./nitro/nitro - lipo -create ./nitro-mac-amd64/engines/cortex.llamacpp/libengine.dylib ./nitro-mac-arm64/engines/cortex.llamacpp/libengine.dylib -output ./nitro/engines/cortex.llamacpp/libengine.dylib + lipo -create ./nitro-mac-amd64/engines/cortex.llamacpp/libengine.dylib ./nitro-mac-arm64/engines/cortex.llamacpp/libengine.dylib ./nitro/engines/cortex.llamacpp/libengine.dylib cp ./nitro-mac-arm64/ggml-metal.metal ./nitro/ggml-metal.metal tar -czvf nitro.tar.gz nitro diff --git a/cortex-cpp/engines/cortex.llamacpp/engine.cmake b/cortex-cpp/engines/cortex.llamacpp/engine.cmake index 4b879a1ca..be7df6b12 100644 --- a/cortex-cpp/engines/cortex.llamacpp/engine.cmake +++ b/cortex-cpp/engines/cortex.llamacpp/engine.cmake @@ -1,52 +1,62 @@ -set(LIBLLAMA_ENGINE_VERSION v1.0) +# cortex.llamacpp release version +set(VERSION 0.1.0) +set(ENGINE_VERSION v${VERSION}) -# Download library based on instruction -# TODO(sang) correct library name when cortex.llamacpp CI is done +# MESSAGE("ENGINE_VERSION=" ${ENGINE_VERSION}) + +# Download library based on instructions if(UNIX AND NOT APPLE) if(CUDA_12_0) - set(LIBRARY_NAME linux-amd64-cuda-12-0.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-linux-amd64-cuda-12-0.tar.gz) elseif(CUDA_11_7) - set(LIBRARY_NAME linux-amd64-cuda-11-7.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-linux-amd64-cuda-11-7.tar.gz) elseif(LLAMA_VULKAN) - set(LIBRARY_NAME linux-amd64-vulkan.tar.gz) - elseif(DLLAMA_AVX512) - set(LIBRARY_NAME linux-amd64-avx512.tar.gz) - elseif(DLLAMA_AVX2) - set(LIBRARY_NAME linux-amd64-avx2.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-linux-amd64-vulkan.tar.gz) + elseif(LLAMA_AVX512) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-linux-amd64-avx512.tar.gz) + elseif(NOT LLAMA_AVX2) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-linux-amd64-avx.tar.gz) else() - set(LIBRARY_NAME linux-amd64-avx.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-linux-amd64-avx2.tar.gz) endif() elseif(UNIX) - set(LIBRARY_NAME mac-universal.tar.gz) + if(MAC_ARM64) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-mac-arm64.tar.gz) + else() + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-mac-amd64.tar.gz) + endif() else() if(CUDA_12_0) - if(DLLAMA_AVX512) - set(LIBRARY_NAME win-amd64-avx512-cuda-12-0.tar.gz) - elseif(DLLAMA_AVX2) - set(LIBRARY_NAME win-amd64-avx2-cuda-12-0.tar.gz) + if(LLAMA_AVX512) + set(LIBRARY_NAME 
cortex.llamacpp-${VERSION}-windows-amd64-avx512-cuda-12-0.tar.gz) + elseif(NOT LLAMA_AVX2) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx-cuda-12-0.tar.gz) else() - set(LIBRARY_NAME win-amd64-avx-cuda-12-0.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx2-cuda-12-0.tar.gz) endif() elseif(CUDA_11_7) - if(DLLAMA_AVX512) - set(LIBRARY_NAME win-amd64-avx512-cuda-11-7.tar.gz) - elseif(DLLAMA_AVX2) - set(LIBRARY_NAME win-amd64-avx2-cuda-11-7.tar.gz) + if(LLAMA_AVX512) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx512-cuda-11-7.tar.gz) + elseif(NOT LLAMA_AVX2) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx-cuda-11-7.tar.gz) else() - set(LIBRARY_NAME win-amd64-avx-cuda-11-7.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx2-cuda-11-7.tar.gz) endif() elseif(LLAMA_VULKAN) - set(LIBRARY_NAME win-amd64-vulkan.tar.gz) - elseif(DLLAMA_AVX512) - set(LIBRARY_NAME win-amd64-avx512.tar.gz) - elseif(DLLAMA_AVX2) - set(LIBRARY_NAME win-amd64-avx2.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-vulkan.tar.gz) + elseif(LLAMA_AVX512) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx512.tar.gz) + elseif(NOT LLAMA_AVX2) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx.tar.gz) else() - set(LIBRARY_NAME win-amd64-avx.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx2.tar.gz) endif() endif() -set(LIBLLAMA_ENGINE_URL https://github.com/.../releases/download/${LIBLLAMA_ENGINE_VERSION}/${LIBRARY_NAME}) + +set(LIBLLAMA_ENGINE_URL https://github.com/janhq/cortex.llamacpp/releases/download/${ENGINE_VERSION}/${LIBRARY_NAME}) +# MESSAGE("LIBLLAMA_ENGINE_URL="${LIBLLAMA_ENGINE_URL}) +# MESSAGE("LIBARRY_NAME=" ${LIBRARY_NAME}) set(LIBLLAMA_ENGINE_PATH ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME}) From 132ba7e2bc3fd521d04e71af26202afd3cf4e677 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 6 May 2024 09:19:49 +0700 Subject: [PATCH 07/10] fix: ci-cd --- .github/workflows/build.yml | 172 +++++++++++++++++++----------------- cortex-cpp/install_deps.sh | 0 2 files changed, 89 insertions(+), 83 deletions(-) mode change 100644 => 100755 cortex-cpp/install_deps.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2871f9a4b..112c1023d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -98,7 +98,7 @@ jobs: local max_retries=3 local tag while [ $retries -lt $max_retries ]; do - tag=$(curl -s https://api.github.com/repos/janhq/nitro/releases/latest | jq -r .tag_name) + tag=$(curl -s https://api.github.com/repos/janhq/cortex/releases/latest | jq -r .tag_name) if [ -n "$tag" ] && [ "$tag" != "null" ]; then echo $tag return @@ -161,6 +161,7 @@ jobs: id: make_build run: | ldd --version + cd cortex-cpp ./install_deps.sh mkdir build && cd build cmake ${{ matrix.defines }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. 
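Aside: the hunk above moves the Linux build into the cortex-cpp/ subdirectory. A rough local approximation of those CI steps, assuming a Linux host and borrowing the amd64-avx flags from the build matrix (other targets only swap the -D options; this is a sketch, not an official script):

    # Local equivalent of the restructured Linux CI build steps
    cd cortex-cpp
    ./install_deps.sh                              # prepare bundled third-party dependencies
    mkdir -p build && cd build
    cmake -DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF ..   # instruction-set flags taken from the matrix
    make -j "$(nproc)"
    ls -la                                         # expect ./nitro plus engines/cortex.llamacpp/
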
@@ -170,35 +171,35 @@ jobs: - name: Package shell: bash run: | - mkdir -p nitro - mkdir -p nitro/engines/cortex.llamacpp - cp build/nitro nitro/ - cp build/engines/cortex.llamacpp/libengine.so nitro/engines/cortex.llamacpp/ - tar -czvf nitro.tar.gz nitro + mkdir -p cortex-cpp/nitro + mkdir -p cortex-cpp/nitro/engines/cortex.llamacpp + cp cortex-cpp/build/nitro cortex-cpp/nitro/ + cp cortex-cpp/build/engines/cortex.llamacpp/libengine.so cortex-cpp/nitro/engines/cortex.llamacpp/ + tar -czvf cortex-cpp/nitro.tar.gz cortex-cpp/nitro - name: Upload Artifact uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' with: name: nitro-linux-${{ matrix.build }} - path: ./nitro + path: ./cortex-cpp/nitro - name: Run e2e testing - LLama.CPP shell: bash - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} + if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' && matrix.build != 'amd64-avx' }} run: | # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} + cd cortex-cpp/nitro + chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rm -rf uploads/ - name: Run e2e testing - Whisper.CPP shell: bash - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} + if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' && matrix.build != 'amd64-avx' }} run: | # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} + cd cortex-cpp/nitro + chmod +x ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} rm -rf uploads/ - uses: actions/upload-release-asset@v1.0.1 @@ -207,7 +208,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz + asset_path: ./cortex-cpp/nitro.tar.gz asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-linux-${{ matrix.build }}.tar.gz asset_content_type: application/gzip @@ -222,9 +223,9 @@ jobs: matrix: include: - cuda: "12-0" - flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF" + flags: "-DCUDA_12_0=ON" - cuda: "11-7" - flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON" + flags: "-DCUDA_11_7=ON" steps: - name: Clone @@ -236,6 +237,7 @@ jobs: - name: Build id: make_build run: | + cd cortex-cpp ./install_deps.sh mkdir build && cd build cmake ${{matrix.flags}} -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. 
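Aside: the packaging hunks before and after this point place the downloaded engine next to the binary as engines/cortex.llamacpp/libengine.so (libengine.dylib on macOS, engine.dll on Windows). Because llamaCPP::LoadModel in the earlier controller patch resolves the engine from the relative path "./engines/cortex.llamacpp", the packaged server has to be started from the directory that contains that folder. A minimal sketch, with an arbitrarily chosen port and the same invocation the e2e scripts use:

    # Run the packaged server from inside the unpacked artifact so the
    # relative engine path "./engines/cortex.llamacpp" resolves
    tar -xzvf nitro.tar.gz
    cd nitro
    ./nitro 1 127.0.0.1 10000 &    # <threads> <host> <port>

A later load-model request may then carry an optional "engine" field; per the controller patch it defaults to "cortex.llamacpp" (kLlamaEngine), which is currently the only engine path the controller maps.
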
@@ -245,6 +247,7 @@ jobs: - name: Package shell: bash run: | + cd cortex-cpp mkdir -p nitro mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ @@ -256,7 +259,16 @@ jobs: if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' with: name: nitro-linux-amd64-cuda-${{ matrix.cuda }} - path: ./nitro + path: ./cortex-cpp/nitro + + - name: Run e2e testing - LLama.CPP + shell: bash + if: ${{ matrix.cuda != '12-0'}} + run: | + # run e2e testing + cd cortex-cpp/nitro + chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} + rm -rf uploads/ - uses: actions/upload-release-asset@v1.0.1 if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') @@ -264,7 +276,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz + asset_path: ./cortex-cpp/nitro.tar.gz asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-linux-amd64-cuda-${{ matrix.cuda }}.tar.gz asset_content_type: application/gzip @@ -292,6 +304,7 @@ jobs: - name: Build id: cmake_build run: | + cd cortex-cpp ./install_deps.sh mkdir build && cd build cmake -DMAC_ARM64=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. @@ -301,8 +314,8 @@ jobs: - name: Package shell: bash run: | + cd cortex-cpp mkdir -p nitro - cp llama.cpp/ggml-metal.metal nitro/ mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ cp build/engines/cortex.llamacpp/libengine.dylib nitro/engines/cortex.llamacpp/ @@ -311,22 +324,21 @@ jobs: uses: actions/upload-artifact@v2 with: name: nitro-mac-arm64 - path: ./nitro + path: ./cortex-cpp/nitro - name: Run e2e testing - LLama.CPP - shell: bash run: | # run e2e testing - cd nitro/ - chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} + cd cortex-cpp/nitro/ + chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rm -rf uploads/ - name: Run e2e testing - Whisper.CPP - shell: bash run: | + cd cortex-cpp # To test with CoreML if [[ ! 
-f "/tmp/testwhisper-encoder.mlmodelc" ]]; then - wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-encoder.mlmodelc.zip + curl -LO https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-encoder.mlmodelc.zip unzip ggml-tiny-encoder.mlmodelc.zip rm ggml-tiny-encoder.mlmodelc.zip rm -rf /tmp/testwhisper-encoder.mlmodelc @@ -334,11 +346,11 @@ jobs: fi # run e2e testing cd nitro - chmod +x ../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} + chmod +x ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} rm -rf uploads/ macOS-amd64-build: - runs-on: macos-latest + runs-on: macos-13 needs: [create-draft-release, set-nitro-version] if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' timeout-minutes: 40 @@ -361,6 +373,7 @@ jobs: - name: Build id: cmake_build run: | + cd cortex-cpp ./install_deps.sh mkdir build && cd build cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. @@ -370,6 +383,7 @@ jobs: - name: Package shell: bash run: | + cd cortex-cpp mkdir -p nitro mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ @@ -379,22 +393,24 @@ jobs: uses: actions/upload-artifact@v2 with: name: nitro-mac-amd64 - path: ./nitro + path: ./cortex-cpp/nitro - name: Run e2e testing - LLama.CPP shell: bash run: | # run e2e testing + cd cortex-cpp cd nitro - chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} + chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rm -rf uploads/ - name: Run e2e testing - Whisper.CPP shell: bash run: | # run e2e testing + cd cortex-cpp cd nitro - chmod +x ../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} + chmod +x ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} rm -rf uploads/ universal-nitro-artifact-macos: @@ -409,29 +425,29 @@ jobs: uses: actions/download-artifact@v2 with: name: nitro-mac-amd64 - path: ./nitro-mac-amd64 + path: ./cortex-cpp/nitro-mac-amd64 - name: download artifact arm64 uses: actions/download-artifact@v2 with: name: nitro-mac-arm64 - path: ./nitro-mac-arm64 + path: ./cortex-cpp/nitro-mac-arm64 - name: bundle universal binary run: | + cd cortex-cpp mkdir -p nitro mkdir -p nitro/engines/cortex.llamacpp/ ls ./nitro-mac-amd64 lipo -create ./nitro-mac-amd64/nitro ./nitro-mac-arm64/nitro -output ./nitro/nitro - lipo -create ./nitro-mac-amd64/engines/cortex.llamacpp/libengine.dylib ./nitro-mac-arm64/engines/cortex.llamacpp/libengine.dylib ./nitro/engines/cortex.llamacpp/libengine.dylib - cp ./nitro-mac-arm64/ggml-metal.metal ./nitro/ggml-metal.metal + lipo -create ./nitro-mac-amd64/engines/cortex.llamacpp/libengine.dylib ./nitro-mac-arm64/engines/cortex.llamacpp/libengine.dylib -output ./nitro/engines/cortex.llamacpp/libengine.dylib tar -czvf nitro.tar.gz nitro - name: Upload Artifact uses: actions/upload-artifact@v2 with: name: nitro-mac-universal - path: 
./nitro + path: ./cortex-cpp/nitro - uses: actions/upload-release-asset@v1.0.1 if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') @@ -439,7 +455,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz + asset_path: ./cortex-cpp/nitro.tar.gz asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-mac-universal.tar.gz asset_content_type: application/gzip @@ -506,6 +522,7 @@ jobs: id: cmake_build shell: cmd run: | + cd cortex-cpp cmake -S ./nitro_deps -B ./build_deps/nitro_deps cmake --build ./build_deps/nitro_deps --config Release mkdir -p build @@ -517,14 +534,15 @@ jobs: id: pack_artifacts shell: cmd run: | + cd cortex-cpp mkdir .\build\Release\engines\cortex.llamacpp\ robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll robocopy build\bin\Release\ .\build\Release\ llama.dll robocopy build\bin\Release\ .\build\Release\ whisper.dll - robocopy .github\patches\windows\ .\build\Release\ msvcp140.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140_1.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140.dll + robocopy ..\.github\patches\windows\ .\build\Release\ msvcp140.dll + robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140_1.dll + robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140.dll robocopy "$env:SDL2_DIR\..\lib\2.28.5\" .\build\Release\ SDL2.dll dotnet tool install --global AzureSignTool azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe" @@ -535,16 +553,18 @@ jobs: shell: cmd if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} run: | + cd cortex-cpp cd build\Release - ..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} + ..\..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rmdir /S /Q .\build\Release\uploads - name: Run e2e testing - Whisper.cpp shell: cmd if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} run: | + cd cortex-cpp cd build\Release - ..\..\.github\scripts\e2e-test-whisper-windows.bat nitro.exe ${{ env.WHISPER_MODEL_URL }} + ..\..\..\.github\scripts\e2e-test-whisper-windows.bat nitro.exe ${{ env.WHISPER_MODEL_URL }} rmdir /S /Q .\build\Release\uploads - name: Upload Artifact @@ -552,7 +572,7 @@ jobs: if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' with: name: nitro-win-${{ matrix.build }} - path: ./build/Release + path: ./cortex-cpp/build/Release - uses: actions/upload-release-asset@v1.0.1 if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') @@ -560,7 +580,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz + asset_path: ./cortex-cpp/nitro.tar.gz asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-${{ matrix.build }}.tar.gz asset_content_type: application/gzip @@ -576,59 +596,43 @@ jobs: matrix: include: 
- cuda: "12-0" - instructions: "-DLLAMA_NATIVE=OFF" - flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + instructions: "amd64-avx2" + inst-flags: "-DLLAMA_NATIVE=OFF" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "12-0" - instructions: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + instructions: "amd64-avx" + inst-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "12-0" - instructions: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + instructions: "amd64-avx512" + inst-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "11-7" - instructions: "-DLLAMA_NATIVE=OFF" - flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + instructions: "amd64-avx2" + inst-flags: "-DLLAMA_NATIVE=OFF" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "11-7" - instructions: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + instructions: "amd64-avx" + inst-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "11-7" - instructions: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + instructions: "amd64-avx512" + inst-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" steps: - - name: Setup VSWhere.exe - uses: warrenbuckley/Setup-VSWhere@v1 - with: - version: latest - silent: true - env: - ACTIONS_ALLOW_UNSECURE_COMMANDS: true - - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1 - - name: Fetch SDL2 and set SDL2_DIR version 2.28.5 run: | curl -L -o sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-2.28.5/SDL2-devel-2.28.5-VC.zip 7z x sdl2.zip -aoa echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-2.28.5/cmake" >> $env:GITHUB_ENV - - name: actions-setup-cmake - uses: jwlawson/actions-setup-cmake@v1.14.1 - - name: Clone id: checkout uses: actions/checkout@v3 with: submodules: recursive - - name: Setup VSWhere.exe - uses: warrenbuckley/Setup-VSWhere@v1 - with: - version: latest - silent: true - env: - ACTIONS_ALLOW_UNSECURE_COMMANDS: true - - uses: actions/setup-dotnet@v3 with: 
dotnet-version: "6.0.x" @@ -637,11 +641,12 @@ jobs: id: cmake_build shell: cmd run: | + cd cortex-cpp cmake -S ./nitro_deps -B ./build_deps/nitro_deps cmake --build ./build_deps/nitro_deps --config Release mkdir -p build cd build - cmake .. ${{ matrix.instructions }} ${{ matrix.flags }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} + cmake .. ${{ matrix.inst-flags }} ${{ matrix.cmake-flags }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} cmake --build . --config Release -j "%NUMBER_OF_PROCESSORS%" - name: Pack artifacts @@ -649,14 +654,15 @@ jobs: shell: cmd run: | set PATH=%PATH%;C:\Program Files\7-Zip\ + cd cortex-cpp mkdir .\build\Release\engines\cortex.llamacpp\ robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll robocopy build\bin\Release\ .\build\Release\ llama.dll robocopy build\bin\Release\ .\build\Release\ whisper.dll - robocopy .github\patches\windows\ .\build\Release\ msvcp140.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140_1.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140.dll + robocopy ..\.github\patches\windows\ .\build\Release\ msvcp140.dll + robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140_1.dll + robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140.dll robocopy "$env:SDL2_DIR\..\lib\2.28.5\" .\build\Release\ SDL2.dll dotnet tool install --global AzureSignTool %USERPROFILE%\.dotnet\tools\azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe" @@ -668,7 +674,7 @@ jobs: if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' with: name: nitro-win-${{ matrix.instructions }}-cuda-${{ matrix.cuda }} - path: ./build/Release + path: ./cortex-cpp/build/Release - uses: actions/upload-release-asset@v1.0.1 if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') @@ -676,7 +682,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz + asset_path: ./cortex-cpp/nitro.tar.gz asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-${{ matrix.instructions }}-cuda-${{ matrix.cuda }}.tar.gz asset_content_type: application/gzip diff --git a/cortex-cpp/install_deps.sh b/cortex-cpp/install_deps.sh old mode 100644 new mode 100755 From 95ec19a4d5edd83a669c2f23b568e3330d1f8b25 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Tue, 7 May 2024 08:35:36 +0700 Subject: [PATCH 08/10] fix: avx2 flag --- .github/workflows/build.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 112c1023d..41d2d6025 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -132,7 +132,7 @@ jobs: matrix: include: - build: "amd64-avx2" - defines: "-DLLAMA_NATIVE=OFF" + defines: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" - build: "amd64-avx" defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - build: "amd64-avx512" @@ -469,7 +469,7 @@ jobs: matrix: include: - build: "amd64-avx2" - defines: "-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF 
-DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + defines: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - build: "amd64-avx" defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - build: "amd64-avx512" @@ -597,7 +597,7 @@ jobs: include: - cuda: "12-0" instructions: "amd64-avx2" - inst-flags: "-DLLAMA_NATIVE=OFF" + inst-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "12-0" instructions: "amd64-avx" @@ -609,7 +609,7 @@ jobs: cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "11-7" instructions: "amd64-avx2" - inst-flags: "-DLLAMA_NATIVE=OFF" + inst-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "11-7" instructions: "amd64-avx" From e8e1100fed6e2c67482dfaf181a79f79b251e194 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Tue, 7 May 2024 11:32:54 +0700 Subject: [PATCH 09/10] refactor: remove audio --- .../scripts/e2e-test-whisper-linux-and-mac.sh | 93 -- .github/scripts/e2e-test-whisper-windows.bat | 102 --- .github/workflows/build.yml | 89 +- .gitmodules | 3 - cortex-cpp/CMakeLists.txt | 6 +- cortex-cpp/common/base.cc | 0 cortex-cpp/common/base.h | 16 - cortex-cpp/context/whisper_server_context.cc | 796 ------------------ cortex-cpp/context/whisper_server_context.h | 165 ---- cortex-cpp/controllers/audio.cc | 300 ------- cortex-cpp/controllers/audio.h | 74 -- cortex-cpp/whisper.cpp | 1 - 12 files changed, 13 insertions(+), 1632 deletions(-) delete mode 100755 .github/scripts/e2e-test-whisper-linux-and-mac.sh delete mode 100644 .github/scripts/e2e-test-whisper-windows.bat delete mode 100644 cortex-cpp/common/base.cc delete mode 100644 cortex-cpp/context/whisper_server_context.cc delete mode 100644 cortex-cpp/context/whisper_server_context.h delete mode 100644 cortex-cpp/controllers/audio.cc delete mode 100644 cortex-cpp/controllers/audio.h delete mode 160000 cortex-cpp/whisper.cpp diff --git a/.github/scripts/e2e-test-whisper-linux-and-mac.sh b/.github/scripts/e2e-test-whisper-linux-and-mac.sh deleted file mode 100755 index 4c8a1e9eb..000000000 --- a/.github/scripts/e2e-test-whisper-linux-and-mac.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash - -## Example run command -# ./linux-and-mac.sh './jan/plugins/@janhq/inference-plugin/dist/nitro/nitro_mac_arm64' https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/resolve/main/tinyllama-1.1b-chat-v0.3.Q2_K.gguf - -# Check for required arguments -if [[ $# -ne 2 ]]; then - echo "Usage: $0 " - exit 1 -fi - -rm /tmp/response1.log /tmp/response2.log /tmp/nitro.log - -BINARY_PATH=$1 -DOWNLOAD_URL=$2 - -# Random port to ensure it's not used -min=10000 -max=11000 -range=$((max - min + 1)) -PORT=$((RANDOM % range + min)) - -# Start the binary file -"$BINARY_PATH" 1 127.0.0.1 $PORT >/tmp/nitro.log & - -# Get the process id of the binary file -pid=$! - -if ! ps -p $pid >/dev/null; then - echo "nitro failed to start. 
Logs:" - cat /tmp/nitro.log - exit 1 -fi - -# Wait for a few seconds to let the server start -sleep 5 - -# Check if /tmp/testwhisper exists, if not, download it -if [[ ! -f "/tmp/testwhisper" ]]; then - curl --connect-timeout 300 $DOWNLOAD_URL --output /tmp/testwhisper -fi - -# Run the curl commands -response1=$(curl --connect-timeout 60 -o /tmp/response1.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/audio/load_model" \ - --header 'Content-Type: application/json' \ - --data '{ - "model_path": "/tmp/testwhisper", - "model_id": "whisper.cpp" -}') - -response2=$( - curl --connect-timeout 60 -o /tmp/response2.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/audio/transcriptions" \ - --header 'Access-Control-Allow-Origin: *' \ - --form 'file=@"../whisper.cpp/samples/jfk.wav"' \ - --form 'model_id="whisper.cpp"' \ - --form 'temperature="0.0"' \ - --form 'prompt="The transcript is about OpenAI which makes technology like DALL·E, GPT-3, and ChatGPT with the hope of one day building an AGI system that benefits all of humanity. The president is trying to raly people to support the cause."' \ - -) - -error_occurred=0 -if [[ "$response1" -ne 200 ]]; then - echo "The first curl command failed with status code: $response1" - cat /tmp/response1.log - error_occurred=1 -fi - -if [[ "$response2" -ne 200 ]]; then - echo "The second curl command failed with status code: $response2" - cat /tmp/response2.log - error_occurred=1 -fi - -if [[ "$error_occurred" -eq 1 ]]; then - echo "Nitro test run failed!!!!!!!!!!!!!!!!!!!!!!" - echo "Nitro Error Logs:" - cat /tmp/nitro.log - kill $pid - exit 1 -fi - -echo "----------------------" -echo "Log load model:" -cat /tmp/response1.log - -echo "----------------------" -echo "Log run test:" -cat /tmp/response2.log - -echo "Nitro test run successfully!" - -# Kill the server process -kill $pid diff --git a/.github/scripts/e2e-test-whisper-windows.bat b/.github/scripts/e2e-test-whisper-windows.bat deleted file mode 100644 index 6eb2037ea..000000000 --- a/.github/scripts/e2e-test-whisper-windows.bat +++ /dev/null @@ -1,102 +0,0 @@ -@echo off - -set "TEMP=C:\Users\%UserName%\AppData\Local\Temp" -set "MODEL_PATH=%TEMP%\testwhisper" - -rem Check for required arguments -if "%~2"=="" ( - echo Usage: %~0 ^ ^ - exit /b 1 -) - -set "BINARY_PATH=%~1" -set "DOWNLOAD_URL=%~2" - -for %%i in ("%BINARY_PATH%") do set "BINARY_NAME=%%~nxi" - -echo BINARY_NAME=%BINARY_NAME% - -del %TEMP%\response1.log 2>nul -del %TEMP%\response2.log 2>nul -del %TEMP%\nitro.log 2>nul - -set /a min=9999 -set /a max=11000 -set /a range=max-min+1 -set /a PORT=%min% + %RANDOM% %% %range% - -rem Start the binary file -start /B "" "%BINARY_PATH%" 1 "127.0.0.1" %PORT% > %TEMP%\nitro.log 2>&1 - -ping -n 6 127.0.0.1 %PORT% > nul - -rem Capture the PID of the started process with "nitro" in its name -for /f "tokens=2" %%a in ('tasklist /fi "imagename eq %BINARY_NAME%" /fo list ^| findstr /B "PID:"') do ( - set "pid=%%a" -) - -echo pid=%pid% - -if not defined pid ( - echo nitro failed to start. 
Logs: - type %TEMP%\nitro.log - exit /b 1 -) - -rem Wait for a few seconds to let the server start - -rem Check if %TEMP%\testwhisper exists, if not, download it -if not exist "%MODEL_PATH%" ( - curl.exe --connect-timeout 300 %DOWNLOAD_URL% --output "%MODEL_PATH%" -) - -rem Define JSON strings for curl data -call set "MODEL_PATH_STRING=%%MODEL_PATH:\=\\%%" -set "curl_data1={\"model_path\":\"%MODEL_PATH_STRING%\",\"model_id\":\"whisper\"}" - -rem Run the curl commands and capture the status code -curl.exe --connect-timeout 60 -o %TEMP%\response1.log -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/v1/audio/load_model" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1_code.log 2>&1 - -curl --connect-timeout 60 -o %TEMP%\response2.log -s -w "%%{http_code}" --location "http://localhost:%PORT%/v1/audio/transcriptions" ^ ---form "file=@../..//whisper.cpp/samples/jfk.wav" ^ ---form "model_id=whisper" > %TEMP%\response2_code.log 2>&1 - -set "error_occurred=0" - -rem Read the status codes from the log files -for /f %%a in (%TEMP%\response1_code.log) do set "response1=%%a" -for /f %%a in (%TEMP%\response2_code.log) do set "response2=%%a" - -if "%response1%" neq "200" ( - echo The first curl command failed with status code: %response1% - type %TEMP%\response1.log - set "error_occurred=1" -) - -if "%response2%" neq "200" ( - echo The second curl command failed with status code: %response2% - type %TEMP%\response2.log - set "error_occurred=1" -) - -if "%error_occurred%"=="1" ( - echo Nitro test run failed!!!!!!!!!!!!!!!!!!!!!! - echo Nitro Error Logs: - type %TEMP%\nitro.log - taskkill /f /pid %pid% - exit /b 1 -) - - -echo ---------------------- -echo Log load model: -type %TEMP%\response1.log - -echo ---------------------- -echo "Log run test:" -type %TEMP%\response2.log - -echo Nitro test run successfully! - -rem Kill the server process -taskkill /f /im nitro.exe 2>nul || exit /B 0 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 41d2d6025..dc0719f95 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -48,7 +48,6 @@ on: env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf - WHISPER_MODEL_URL: https://delta.jan.ai/ggml-tiny-q5_1.bin EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf jobs: @@ -193,15 +192,6 @@ jobs: chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rm -rf uploads/ - - name: Run e2e testing - Whisper.CPP - shell: bash - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' && matrix.build != 'amd64-avx' }} - run: | - # run e2e testing - cd cortex-cpp/nitro - chmod +x ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} - rm -rf uploads/ - - uses: actions/upload-release-asset@v1.0.1 if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') env: @@ -240,7 +230,7 @@ jobs: cd cortex-cpp ./install_deps.sh mkdir build && cd build - cmake ${{matrix.flags}} -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. 
+ cmake ${{matrix.flags}} -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. make -j $(nproc) ls -la @@ -294,13 +284,6 @@ jobs: with: submodules: recursive - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install cmake sdl2 - - name: Build id: cmake_build run: | @@ -333,22 +316,6 @@ jobs: chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rm -rf uploads/ - - name: Run e2e testing - Whisper.CPP - run: | - cd cortex-cpp - # To test with CoreML - if [[ ! -f "/tmp/testwhisper-encoder.mlmodelc" ]]; then - curl -LO https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-encoder.mlmodelc.zip - unzip ggml-tiny-encoder.mlmodelc.zip - rm ggml-tiny-encoder.mlmodelc.zip - rm -rf /tmp/testwhisper-encoder.mlmodelc - mv ggml-tiny-encoder.mlmodelc /tmp/testwhisper-encoder.mlmodelc - fi - # run e2e testing - cd nitro - chmod +x ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} - rm -rf uploads/ - macOS-amd64-build: runs-on: macos-13 needs: [create-draft-release, set-nitro-version] @@ -404,15 +371,6 @@ jobs: chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rm -rf uploads/ - - name: Run e2e testing - Whisper.CPP - shell: bash - run: | - # run e2e testing - cd cortex-cpp - cd nitro - chmod +x ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} - rm -rf uploads/ - universal-nitro-artifact-macos: runs-on: macos-latest needs: [create-draft-release, set-nitro-version, macOS-silicon-build, macOS-amd64-build] @@ -469,13 +427,13 @@ jobs: matrix: include: - build: "amd64-avx2" - defines: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + defines: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - build: "amd64-avx" - defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - build: "amd64-avx512" - defines: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + defines: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - build: "amd64-vulkan" - defines: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + defines: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" # - build: "arm64" # defines: "-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON" @@ -501,12 +459,6 @@ jobs: - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1 - - name: Fetch SDL2 and set SDL2_DIR version 2.28.5 - run: | - 
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-2.28.5/SDL2-devel-2.28.5-VC.zip - 7z x sdl2.zip -aoa - echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-2.28.5/cmake" >> $env:GITHUB_ENV - - name: actions-setup-cmake uses: jwlawson/actions-setup-cmake@v1.14.1 @@ -539,11 +491,9 @@ jobs: robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll robocopy build\bin\Release\ .\build\Release\ llama.dll - robocopy build\bin\Release\ .\build\Release\ whisper.dll robocopy ..\.github\patches\windows\ .\build\Release\ msvcp140.dll robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140_1.dll robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140.dll - robocopy "$env:SDL2_DIR\..\lib\2.28.5\" .\build\Release\ SDL2.dll dotnet tool install --global AzureSignTool azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe" 7z a -ttar temp.tar .\build\Release\* @@ -558,15 +508,6 @@ jobs: ..\..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rmdir /S /Q .\build\Release\uploads - - name: Run e2e testing - Whisper.cpp - shell: cmd - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} - run: | - cd cortex-cpp - cd build\Release - ..\..\..\.github\scripts\e2e-test-whisper-windows.bat nitro.exe ${{ env.WHISPER_MODEL_URL }} - rmdir /S /Q .\build\Release\uploads - - name: Upload Artifact uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' @@ -598,35 +539,29 @@ jobs: - cuda: "12-0" instructions: "amd64-avx2" inst-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" - cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - cuda: "12-0" instructions: "amd64-avx" inst-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - cuda: "12-0" instructions: "amd64-avx512" inst-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - cuda: "11-7" instructions: "amd64-avx2" inst-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" - cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - cuda: "11-7" instructions: "amd64-avx" inst-flags: "-DLLAMA_AVX2=OFF 
-DLLAMA_NATIVE=OFF" - cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - cuda: "11-7" instructions: "amd64-avx512" inst-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" steps: - - name: Fetch SDL2 and set SDL2_DIR version 2.28.5 - run: | - curl -L -o sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-2.28.5/SDL2-devel-2.28.5-VC.zip - 7z x sdl2.zip -aoa - echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-2.28.5/cmake" >> $env:GITHUB_ENV - - name: Clone id: checkout uses: actions/checkout@v3 @@ -659,11 +594,9 @@ jobs: robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll robocopy build\bin\Release\ .\build\Release\ llama.dll - robocopy build\bin\Release\ .\build\Release\ whisper.dll robocopy ..\.github\patches\windows\ .\build\Release\ msvcp140.dll robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140_1.dll robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140.dll - robocopy "$env:SDL2_DIR\..\lib\2.28.5\" .\build\Release\ SDL2.dll dotnet tool install --global AzureSignTool %USERPROFILE%\.dotnet\tools\azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe" 7z a -ttar temp.tar .\build\Release\* diff --git a/.gitmodules b/.gitmodules index 65a042814..e69de29bb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "cortex-cpp/whisper.cpp"] - path = cortex-cpp/whisper.cpp - url = https://github.com/ggerganov/whisper.cpp.git diff --git a/cortex-cpp/CMakeLists.txt b/cortex-cpp/CMakeLists.txt index 4bb269aaa..97be0e86d 100644 --- a/cortex-cpp/CMakeLists.txt +++ b/cortex-cpp/CMakeLists.txt @@ -56,7 +56,6 @@ endif() add_compile_definitions(NITRO_VERSION="${NITRO_VERSION}") -add_subdirectory(whisper.cpp) add_subdirectory(test) add_executable(${PROJECT_NAME} main.cc) @@ -69,7 +68,7 @@ add_executable(${PROJECT_NAME} main.cc) # and comment out the following lines find_package(Drogon CONFIG REQUIRED) -target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon whisper +target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon ${CMAKE_THREAD_LIBS_INIT}) # ############################################################################## @@ -87,7 +86,6 @@ endif() aux_source_directory(controllers CTL_SRC) aux_source_directory(common COMMON_SRC) -aux_source_directory(context CONTEXT_SRC) aux_source_directory(models MODEL_SRC) aux_source_directory(cortex-common CORTEX_COMMON) # aux_source_directory(filters FILTER_SRC) aux_source_directory(plugins @@ -100,7 +98,7 @@ aux_source_directory(cortex-common CORTEX_COMMON) target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ) # ${CMAKE_CURRENT_SOURCE_DIR}/models) -target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC}) 
+target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC} ${COMMON_SRC}) # ${FILTER_SRC} ${PLUGIN_SRC} ${MODEL_SRC}) # ############################################################################## # uncomment the following line for dynamically loading views set_property(TARGET diff --git a/cortex-cpp/common/base.cc b/cortex-cpp/common/base.cc deleted file mode 100644 index e69de29bb..000000000 diff --git a/cortex-cpp/common/base.h b/cortex-cpp/common/base.h index adfe42a19..43d612c1b 100644 --- a/cortex-cpp/common/base.h +++ b/cortex-cpp/common/base.h @@ -37,21 +37,5 @@ class BaseEmbedding { const HttpRequestPtr& req, std::function&& callback) = 0; - // The derived class can also override other methods if needed -}; - -class BaseAudio { - public: - virtual ~BaseAudio() {} - // Transcribes audio into the input language. - virtual void CreateTranscription( - const HttpRequestPtr& req, - std::function&& callback) = 0; - - // Translates audio into the input language. - virtual void CreateTranslation( - const HttpRequestPtr& req, - std::function&& callback) = 0; - // The derived class can also override other methods if needed }; \ No newline at end of file diff --git a/cortex-cpp/context/whisper_server_context.cc b/cortex-cpp/context/whisper_server_context.cc deleted file mode 100644 index a4ccbe710..000000000 --- a/cortex-cpp/context/whisper_server_context.cc +++ /dev/null @@ -1,796 +0,0 @@ -#include "whisper_server_context.h" -#include "utils/dr_wav.h" -#include -#include -#include -#include "utils/json.hpp" - -using json = nlohmann::json; - -bool read_wav(const std::string& fname, std::vector& pcmf32, - std::vector>& pcmf32s, bool stereo) { - drwav wav; - std::vector wav_data; // used for pipe input from stdin - - if (fname == "-") { - { - uint8_t buf[1024]; - while (true) { - const size_t n = fread(buf, 1, sizeof(buf), stdin); - if (n == 0) { - break; - } - wav_data.insert(wav_data.end(), buf, buf + n); - } - } - - if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == - false) { - fprintf(stderr, "error: failed to open WAV file from stdin\n"); - return false; - } - - fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, - wav_data.size()); - } else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) { - fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str()); - return false; - } - - if (wav.channels != 1 && wav.channels != 2) { - fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, - fname.c_str()); - return false; - } - - if (stereo && wav.channels != 2) { - fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", - __func__, fname.c_str()); - return false; - } - - if (wav.sampleRate != COMMON_SAMPLE_RATE) { - fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, - fname.c_str(), COMMON_SAMPLE_RATE / 1000); - return false; - } - - if (wav.bitsPerSample != 16) { - fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, - fname.c_str()); - return false; - } - - const uint64_t n = - wav_data.empty() - ? 
wav.totalPCMFrameCount - : wav_data.size() / (wav.channels * wav.bitsPerSample / 8); - - std::vector pcm16; - pcm16.resize(n * wav.channels); - drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); - drwav_uninit(&wav); - - // convert to mono, float - pcmf32.resize(n); - if (wav.channels == 1) { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[i]) / 32768.0f; - } - } else { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; - } - } - - if (stereo) { - // convert to stereo, float - pcmf32s.resize(2); - - pcmf32s[0].resize(n); - pcmf32s[1].resize(n); - for (uint64_t i = 0; i < n; i++) { - pcmf32s[0][i] = float(pcm16[2 * i]) / 32768.0f; - pcmf32s[1][i] = float(pcm16[2 * i + 1]) / 32768.0f; - } - } - - return true; -} - -std::string output_str(struct whisper_context* ctx, - const whisper_params& params, - std::vector> pcmf32s) { - std::stringstream result; - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - std::string speaker = ""; - - if (params.diarize && pcmf32s.size() == 2) { - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - speaker = estimate_diarization_speaker(pcmf32s, t0, t1); - } - - result << speaker << text << "\n"; - } - return result.str(); -} - -std::string estimate_diarization_speaker( - std::vector> pcmf32s, int64_t t0, int64_t t1, - bool id_only) { - std::string speaker = ""; - const int64_t n_samples = pcmf32s[0].size(); - - const int64_t is0 = timestamp_to_sample(t0, n_samples); - const int64_t is1 = timestamp_to_sample(t1, n_samples); - - double energy0 = 0.0f; - double energy1 = 0.0f; - - for (int64_t j = is0; j < is1; j++) { - energy0 += fabs(pcmf32s[0][j]); - energy1 += fabs(pcmf32s[1][j]); - } - - if (energy0 > 1.1 * energy1) { - speaker = "0"; - } else if (energy1 > 1.1 * energy0) { - speaker = "1"; - } else { - speaker = "?"; - } - - // printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, speaker = - // %s\n", is0, is1, energy0, energy1, speaker.c_str()); - - if (!id_only) { - speaker.insert(0, "(speaker "); - speaker.append(")"); - } - - return speaker; -} - -// 500 -> 00:05.000 -// 6000 -> 01:00.000 -std::string to_timestamp(int64_t t, bool comma) { - int64_t msec = t * 10; - int64_t hr = msec / (1000 * 60 * 60); - msec = msec - hr * (1000 * 60 * 60); - int64_t min = msec / (1000 * 60); - msec = msec - min * (1000 * 60); - int64_t sec = msec / 1000; - msec = msec - sec * 1000; - - char buf[32]; - snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int)hr, (int)min, - (int)sec, comma ? 
"," : ".", (int)msec); - - return std::string(buf); -} - -int timestamp_to_sample(int64_t t, int n_samples) { - return (std::max)(0, (std::min)((int)n_samples - 1, - (int)((t * WHISPER_SAMPLE_RATE) / 100))); -} - -bool is_file_exist(const char* fileName) { - std::ifstream infile(fileName); - return infile.good(); -} - -void whisper_print_usage(int /*argc*/, char** argv, - const whisper_params& params) { - fprintf(stderr, "\n"); - fprintf(stderr, "usage: %s [options] \n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, - " -h, --help [default] show this help " - "message and exit\n"); - fprintf(stderr, - " -t N, --threads N [%-7d] number of threads to use " - "during computation\n", - params.n_threads); - fprintf(stderr, - " -p N, --processors N [%-7d] number of processors to use " - "during computation\n", - params.n_processors); - fprintf( - stderr, - " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", - params.offset_t_ms); - fprintf(stderr, - " -on N, --offset-n N [%-7d] segment index offset\n", - params.offset_n); - fprintf(stderr, - " -d N, --duration N [%-7d] duration of audio to " - "process in milliseconds\n", - params.duration_ms); - fprintf(stderr, - " -mc N, --max-context N [%-7d] maximum number of text " - "context tokens to store\n", - params.max_context); - fprintf(stderr, - " -ml N, --max-len N [%-7d] maximum segment length in " - "characters\n", - params.max_len); - fprintf(stderr, - " -sow, --split-on-word [%-7s] split on word rather than " - "on token\n", - params.split_on_word ? "true" : "false"); - fprintf(stderr, - " -bo N, --best-of N [%-7d] number of best candidates " - "to keep\n", - params.best_of); - fprintf(stderr, - " -bs N, --beam-size N [%-7d] beam size for beam search\n", - params.beam_size); - fprintf(stderr, - " -wt N, --word-thold N [%-7.2f] word timestamp " - "probability threshold\n", - params.word_thold); - fprintf(stderr, - " -et N, --entropy-thold N [%-7.2f] entropy threshold for " - "decoder fail\n", - params.entropy_thold); - fprintf(stderr, - " -lpt N, --logprob-thold N [%-7.2f] log probability threshold " - "for decoder fail\n", - params.logprob_thold); - // fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by - // x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); - fprintf(stderr, - " -debug, --debug-mode [%-7s] enable debug mode (eg. dump " - "log_mel)\n", - params.debug_mode ? "true" : "false"); - fprintf(stderr, - " -tr, --translate [%-7s] translate from source " - "language to english\n", - params.translate ? "true" : "false"); - fprintf(stderr, - " -di, --diarize [%-7s] stereo audio diarization\n", - params.diarize ? "true" : "false"); - fprintf(stderr, - " -tdrz, --tinydiarize [%-7s] enable tinydiarize " - "(requires a tdrz model)\n", - params.tinydiarize ? "true" : "false"); - fprintf(stderr, - " -nf, --no-fallback [%-7s] do not use temperature " - "fallback while decoding\n", - params.no_fallback ? "true" : "false"); - fprintf(stderr, - " -ps, --print-special [%-7s] print special tokens\n", - params.print_special ? "true" : "false"); - fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", - params.print_colors ? "true" : "false"); - fprintf(stderr, - " -pr, --print-realtime [%-7s] print output in realtime\n", - params.print_realtime ? "true" : "false"); - fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", - params.print_progress ? "true" : "false"); - fprintf(stderr, - " -nt, --no-timestamps [%-7s] do not print timestamps\n", - params.no_timestamps ? 
"true" : "false"); - fprintf(stderr, - " -l LANG, --language LANG [%-7s] spoken language ('auto' for " - "auto-detect)\n", - params.language.c_str()); - fprintf(stderr, - " -dl, --detect-language [%-7s] exit after automatically " - "detecting language\n", - params.detect_language ? "true" : "false"); - fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", - params.prompt.c_str()); - fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", - params.model.c_str()); - fprintf(stderr, - " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used " - "for encode inference\n", - params.openvino_encode_device.c_str()); - fprintf(stderr, - " --convert, [%-7s] Convert audio to WAV, " - "requires ffmpeg on the server", - params.ffmpeg_converter ? "true" : "false"); - fprintf(stderr, "\n"); -} - -bool whisper_params_parse(int argc, char** argv, whisper_params& params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-h" || arg == "--help") { - whisper_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(argv[++i]); - } else if (arg == "-p" || arg == "--processors") { - params.n_processors = std::stoi(argv[++i]); - } else if (arg == "-ot" || arg == "--offset-t") { - params.offset_t_ms = std::stoi(argv[++i]); - } else if (arg == "-on" || arg == "--offset-n") { - params.offset_n = std::stoi(argv[++i]); - } else if (arg == "-d" || arg == "--duration") { - params.duration_ms = std::stoi(argv[++i]); - } else if (arg == "-mc" || arg == "--max-context") { - params.max_context = std::stoi(argv[++i]); - } else if (arg == "-ml" || arg == "--max-len") { - params.max_len = std::stoi(argv[++i]); - } else if (arg == "-bo" || arg == "--best-of") { - params.best_of = std::stoi(argv[++i]); - } else if (arg == "-bs" || arg == "--beam-size") { - params.beam_size = std::stoi(argv[++i]); - } else if (arg == "-wt" || arg == "--word-thold") { - params.word_thold = std::stof(argv[++i]); - } else if (arg == "-et" || arg == "--entropy-thold") { - params.entropy_thold = std::stof(argv[++i]); - } else if (arg == "-lpt" || arg == "--logprob-thold") { - params.logprob_thold = std::stof(argv[++i]); - } - // else if (arg == "-su" || arg == "--speed-up") { params.speed_up - // = true; } - else if (arg == "-debug" || arg == "--debug-mode") { - params.debug_mode = true; - } else if (arg == "-tr" || arg == "--translate") { - params.translate = true; - } else if (arg == "-di" || arg == "--diarize") { - params.diarize = true; - } else if (arg == "-tdrz" || arg == "--tinydiarize") { - params.tinydiarize = true; - } else if (arg == "-sow" || arg == "--split-on-word") { - params.split_on_word = true; - } else if (arg == "-nf" || arg == "--no-fallback") { - params.no_fallback = true; - } else if (arg == "-fp" || arg == "--font-path") { - params.font_path = argv[++i]; - } else if (arg == "-ps" || arg == "--print-special") { - params.print_special = true; - } else if (arg == "-pc" || arg == "--print-colors") { - params.print_colors = true; - } else if (arg == "-pr" || arg == "--print-realtime") { - params.print_realtime = true; - } else if (arg == "-pp" || arg == "--print-progress") { - params.print_progress = true; - } else if (arg == "-nt" || arg == "--no-timestamps") { - params.no_timestamps = true; - } else if (arg == "-l" || arg == "--language") { - params.language = argv[++i]; - } else if (arg == "-dl" || arg == "--detect-language") { - params.detect_language = true; - } else if (arg == "--prompt") { - params.prompt = 
argv[++i]; - } else if (arg == "-m" || arg == "--model") { - params.model = argv[++i]; - } else if (arg == "-oved" || arg == "--ov-e-device") { - params.openvino_encode_device = argv[++i]; - } else if (arg == "-ng" || arg == "--no-gpu") { - params.use_gpu = false; - } else if (arg == "--convert") { - params.ffmpeg_converter = true; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - whisper_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -void check_ffmpeg_availibility() { - int result = system("ffmpeg -version"); - - if (result == 0) { - std::cout << "ffmpeg is available." << std::endl; - } else { - // ffmpeg is not available - std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed "; - std::cout << "and that its executable is included in your system's PATH. "; - exit(0); - } -} - -bool convert_to_wav(const std::string& temp_filename, std::string& error_resp) { - std::ostringstream cmd_stream; - std::string converted_filename_temp = temp_filename + "_temp.wav"; - cmd_stream << "ffmpeg -i \"" << temp_filename - << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" - << converted_filename_temp << "\" 2>&1"; - std::string cmd = cmd_stream.str(); - - int status = std::system(cmd.c_str()); - if (status != 0) { - error_resp = "{\"error\":\"FFmpeg conversion failed.\"}"; - return false; - } - - // Remove the original file - if (remove(temp_filename.c_str()) != 0) { - error_resp = "{\"error\":\"Failed to remove the original file.\"}"; - return false; - } - - // Rename the temporary file to match the original filename - if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) { - error_resp = "{\"error\":\"Failed to rename the temporary file.\"}"; - return false; - } - return true; -} - -void whisper_print_progress_callback(struct whisper_context* /*ctx*/, - struct whisper_state* /*state*/, - int progress, void* user_data) { - int progress_step = - ((whisper_print_user_data*)user_data)->params->progress_step; - int* progress_prev = &(((whisper_print_user_data*)user_data)->progress_prev); - if (progress >= *progress_prev + progress_step) { - *progress_prev += progress_step; - fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress); - } -} - -void whisper_print_segment_callback(struct whisper_context* ctx, - struct whisper_state* /*state*/, int n_new, - void* user_data) { - const auto& params = *((whisper_print_user_data*)user_data)->params; - const auto& pcmf32s = *((whisper_print_user_data*)user_data)->pcmf32s; - - const int n_segments = whisper_full_n_segments(ctx); - - std::string speaker = ""; - - int64_t t0 = 0; - int64_t t1 = 0; - - // print the last n_new segments - const int s0 = n_segments - n_new; - - if (s0 == 0) { - printf("\n"); - } - - for (int i = s0; i < n_segments; i++) { - if (!params.no_timestamps || params.diarize) { - t0 = whisper_full_get_segment_t0(ctx, i); - t1 = whisper_full_get_segment_t1(ctx, i); - } - - if (!params.no_timestamps) { - printf("[%s --> %s] ", to_timestamp(t0).c_str(), - to_timestamp(t1).c_str()); - } - - if (params.diarize && pcmf32s.size() == 2) { - speaker = estimate_diarization_speaker(pcmf32s, t0, t1); - } - - if (params.print_colors) { - for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) { - if (params.print_special == false) { - const whisper_token id = whisper_full_get_token_id(ctx, i, j); - if (id >= whisper_token_eot(ctx)) { - continue; - } - } - - const char* text = whisper_full_get_token_text(ctx, i, j); - const float p = whisper_full_get_token_p(ctx, i, j); - - 
const int col = (std::max)( - 0, (std::min)((int)k_colors.size() - 1, - (int)((std::pow)(p, 3) * float(k_colors.size())))); - - printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, - "\033[0m"); - } - } else { - const char* text = whisper_full_get_segment_text(ctx, i); - - printf("%s%s", speaker.c_str(), text); - } - - if (params.tinydiarize) { - if (whisper_full_get_segment_speaker_turn_next(ctx, i)) { - printf("%s", params.tdrz_speaker_turn.c_str()); - } - } - - // with timestamps or speakers: each segment on new line - if (!params.no_timestamps || params.diarize) { - printf("\n"); - } - fflush(stdout); - } -} - -whisper_server_context::~whisper_server_context() { - if (ctx) { - whisper_print_timings(ctx); - whisper_free(ctx); - ctx = nullptr; - } -} - -bool whisper_server_context::load_model(std::string& model_path) { - whisper_mutex.lock(); - - // clean up - whisper_free(ctx); - - // whisper init - ctx = whisper_init_from_file_with_params(model_path.c_str(), cparams); - - // TODO perhaps load prior model here instead of exit - if (ctx == nullptr) { - whisper_mutex.unlock(); - return false; - } - - // initialize openvino encoder. this has no effect on whisper.cpp builds that - // don't have OpenVINO configured - whisper_ctx_init_openvino_encoder( - ctx, nullptr, params.openvino_encode_device.c_str(), nullptr); - - // check if the model is in the file system - whisper_mutex.unlock(); - return true; -} - -std::string whisper_server_context::inference( - std::string& input_file_path, std::string language, std::string prompt, - std::string response_format, float temperature, bool translate) { - // acquire whisper model mutex lock - whisper_mutex.lock(); - - // audio arrays - std::vector pcmf32; // mono-channel F32 PCM - std::vector> pcmf32s; // stereo-channel F32 PCM - - // if file is not wav, convert to wav - if (params.ffmpeg_converter) { - std::string error_resp = "Failed to execute ffmpeg command converting " + - input_file_path + " to wav"; - const bool is_converted = convert_to_wav(input_file_path, error_resp); - if (!is_converted) { - whisper_mutex.unlock(); - LOG_ERROR << error_resp; - throw std::runtime_error(error_resp); - } - } - - // read wav content into pcmf32 - if (!read_wav(input_file_path, pcmf32, pcmf32s, params.diarize)) { - std::string error_resp = "Failed to read WAV file " + input_file_path; - LOG_ERROR << error_resp; - whisper_mutex.unlock(); - throw std::runtime_error(error_resp); - } - - printf("Successfully loaded %s\n", input_file_path.c_str()); - - params.translate = translate; - params.language = language; - params.response_format = response_format; - if (!whisper_is_multilingual(ctx)) { - if (params.language != "en" || params.translate) { - params.language = "en"; - params.translate = false; - LOG_WARN - << "Model " << model_id - << " is not multilingual, ignoring language and translation options"; - } - } - if (params.detect_language) { - params.language = "auto"; - } - - // print some processing info - std::string processing_info = - "Model " + model_id + " processing " + input_file_path + " (" + - std::to_string(pcmf32.size()) + " samples, " + - std::to_string(float(pcmf32.size()) / WHISPER_SAMPLE_RATE) + " sec), " + - std::to_string(params.n_threads) + " threads, " + - std::to_string(params.n_processors) + - " processors, lang = " + params.language + - ", task = " + (params.translate ? "translate" : "transcribe") + ", " + - (params.tinydiarize ? "tdrz = 1, " : "") + - (params.no_timestamps ? 
"timestamps = 0" : "timestamps = 1"); - LOG_INFO << processing_info; - - // run the inference - { - std::string msg = "Running whisper.cpp inference of model " + model_id + - " on " + input_file_path; - LOG_INFO << msg; - whisper_full_params wparams = - whisper_full_default_params(WHISPER_SAMPLING_GREEDY); - - wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH - : WHISPER_SAMPLING_GREEDY; - - wparams.print_realtime = false; - wparams.print_progress = params.print_progress; - wparams.print_timestamps = !params.no_timestamps; - wparams.print_special = params.print_special; - wparams.translate = params.translate; - wparams.language = params.language.c_str(); - wparams.detect_language = params.detect_language; - wparams.n_threads = params.n_threads; - wparams.n_max_text_ctx = - params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx; - wparams.offset_ms = params.offset_t_ms; - wparams.duration_ms = params.duration_ms; - - wparams.thold_pt = params.word_thold; - wparams.max_len = params.max_len == 0 ? 60 : params.max_len; - wparams.split_on_word = params.split_on_word; - - wparams.speed_up = params.speed_up; - wparams.debug_mode = params.debug_mode; - - wparams.tdrz_enable = params.tinydiarize; // [TDRZ] - - wparams.initial_prompt = prompt.c_str(); - - wparams.greedy.best_of = params.best_of; - wparams.beam_search.beam_size = params.beam_size; - - wparams.temperature = temperature; - wparams.temperature_inc = params.temperature_inc; - wparams.entropy_thold = params.entropy_thold; - wparams.logprob_thold = params.logprob_thold; - - wparams.no_timestamps = params.no_timestamps; - - whisper_print_user_data user_data = {¶ms, &pcmf32s, 0}; - - // this callback is called on each new segment - if (params.print_realtime) { - wparams.new_segment_callback = whisper_print_segment_callback; - wparams.new_segment_callback_user_data = &user_data; - } - - if (wparams.print_progress) { - wparams.progress_callback = whisper_print_progress_callback; - wparams.progress_callback_user_data = &user_data; - } - - // examples for abort mechanism - // in examples below, we do not abort the processing, but we could if the - // flag is set to true - - // the callback is called before every encoder run - if it returns false, - // the processing is aborted - { - static bool is_aborted = - false; // NOTE: this should be atomic to avoid data race - - wparams.encoder_begin_callback = [](struct whisper_context* /*ctx*/, - struct whisper_state* /*state*/, - void* user_data) { - bool is_aborted = *(bool*)user_data; - return !is_aborted; - }; - wparams.encoder_begin_callback_user_data = &is_aborted; - } - - // the callback is called before every computation - if it returns true, the - // computation is aborted - { - static bool is_aborted = - false; // NOTE: this should be atomic to avoid data race - - wparams.abort_callback = [](void* user_data) { - bool is_aborted = *(bool*)user_data; - return is_aborted; - }; - wparams.abort_callback_user_data = &is_aborted; - } - - if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), - params.n_processors) != 0) { - std::string error_resp = "Failed to process audio"; - LOG_ERROR << error_resp; - whisper_mutex.unlock(); - throw std::runtime_error(error_resp); - } - } - - // return results to user - std::string result; - if (params.response_format == text_format) { - result = output_str(ctx, params, pcmf32s); - } else if (params.response_format == srt_format) { - std::stringstream ss; - const int n_segments = whisper_full_n_segments(ctx); - for 
(int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - std::string speaker = ""; - - if (params.diarize && pcmf32s.size() == 2) { - speaker = estimate_diarization_speaker(pcmf32s, t0, t1); - } - - ss << i + 1 + params.offset_n << "\n"; - ss << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n"; - ss << speaker << text << "\n\n"; - } - result = ss.str(); - } else if (params.response_format == vtt_format) { - std::stringstream ss; - - ss << "WEBVTT\n\n"; - - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - std::string speaker = ""; - - if (params.diarize && pcmf32s.size() == 2) { - speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true); - speaker.insert(0, ""); - } - - ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n"; - ss << speaker << text << "\n\n"; - } - result = ss.str(); - } else if (params.response_format == vjson_format) { - /* try to match openai/whisper's Python format */ - std::string results = output_str(ctx, params, pcmf32s); - json jres = json{{"text", results}}; - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - json segment = json{ - {"id", i}, - {"text", whisper_full_get_segment_text(ctx, i)}, - }; - - if (!params.no_timestamps) { - segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01; - segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01; - } - - const int n_tokens = whisper_full_n_tokens(ctx, i); - for (int j = 0; j < n_tokens; ++j) { - whisper_token_data token = whisper_full_get_token_data(ctx, i, j); - if (token.id >= whisper_token_eot(ctx)) { - continue; - } - - segment["tokens"].push_back(token.id); - json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}}; - if (!params.no_timestamps) { - word["start"] = token.t0 * 0.01; - word["end"] = token.t1 * 0.01; - } - word["probability"] = token.p; - segment["words"].push_back(word); - } - jres["segments"].push_back(segment); - } - result = jres.dump(-1, ' ', false, json::error_handler_t::replace); - } else { - std::string results = output_str(ctx, params, pcmf32s); - json jres = json{{"text", results}}; - result = jres.dump(-1, ' ', false, json::error_handler_t::replace); - } - - // reset params to thier defaults - params = default_params; - - // return whisper model mutex lock - whisper_mutex.unlock(); - LOG_INFO << "Successfully processed " << input_file_path << ": " << result; - - return result; -} diff --git a/cortex-cpp/context/whisper_server_context.h b/cortex-cpp/context/whisper_server_context.h deleted file mode 100644 index da29e4d9f..000000000 --- a/cortex-cpp/context/whisper_server_context.h +++ /dev/null @@ -1,165 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include - -#include "whisper.h" - -// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9] -// Lowest is red, middle is yellow, highest is green. 
-const std::vector k_colors = { - "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", - "\033[38;5;220m", "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", - "\033[38;5;118m", "\033[38;5;82m", -}; - -// output formats -const std::string json_format = "json"; -const std::string text_format = "text"; -const std::string srt_format = "srt"; -const std::string vjson_format = "verbose_json"; -const std::string vtt_format = "vtt"; - -#define COMMON_SAMPLE_RATE 16000 - -struct whisper_params { - int32_t n_threads = - (std::min)(4, (int32_t)std::thread::hardware_concurrency()); - int32_t n_processors = 1; - int32_t offset_t_ms = 0; - int32_t offset_n = 0; - int32_t duration_ms = 0; - int32_t progress_step = 5; - int32_t max_context = -1; - int32_t max_len = 0; - int32_t best_of = 2; - int32_t beam_size = -1; - - float word_thold = 0.01f; - float entropy_thold = 2.40f; - float logprob_thold = -1.00f; - float temperature = 0.00f; - float temperature_inc = 0.20f; - - bool speed_up = false; - bool debug_mode = false; - bool translate = false; - bool detect_language = false; - bool diarize = false; - bool tinydiarize = false; - bool split_on_word = false; - bool no_fallback = false; - bool print_special = false; - bool print_colors = false; - bool print_realtime = false; - bool print_progress = false; - bool no_timestamps = false; - bool use_gpu = true; - bool ffmpeg_converter = false; - - std::string language = "en"; - std::string prompt = ""; - std::string font_path = - "/System/Library/Fonts/Supplemental/Courier New Bold.ttf"; - std::string model = "models/ggml-base.en.bin"; - - std::string response_format = json_format; - - // [TDRZ] speaker turn string - std::string tdrz_speaker_turn = - " [SPEAKER_TURN]"; // TODO: set from command line - - std::string openvino_encode_device = "CPU"; -}; - -// Read WAV audio file and store the PCM data into pcmf32 -// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE -// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain -// 2 channel PCM -bool read_wav(const std::string& fname, std::vector& pcmf32, - std::vector>& pcmf32s, bool stereo); - -std::string output_str(struct whisper_context* ctx, - const whisper_params& params, - std::vector> pcmf32s); - -std::string estimate_diarization_speaker( - std::vector> pcmf32s, int64_t t0, int64_t t1, - bool id_only = false); - -// 500 -> 00:05.000 -// 6000 -> 01:00.000 -std::string to_timestamp(int64_t t, bool comma = false); - -int timestamp_to_sample(int64_t t, int n_samples); - -bool is_file_exist(const char* fileName); - -void whisper_print_usage(int /*argc*/, char** argv, - const whisper_params& params); - -bool whisper_params_parse(int argc, char** argv, whisper_params& params); - -void check_ffmpeg_availibility(); - -bool convert_to_wav(const std::string& temp_filename, std::string& error_resp); - -void whisper_print_progress_callback(struct whisper_context* /*ctx*/, - struct whisper_state* /*state*/, - int progress, void* user_data); - -void whisper_print_segment_callback(struct whisper_context* ctx, - struct whisper_state* /*state*/, int n_new, - void* user_data); - -struct whisper_print_user_data { - const whisper_params* params; - - const std::vector>* pcmf32s; - int progress_prev; -}; - -struct whisper_server_context { - whisper_params params; - whisper_params default_params; - std::mutex whisper_mutex; - std::string model_id; - - struct whisper_context_params cparams; - struct whisper_context* ctx = nullptr; - - whisper_server_context() = default; // 
add this line - - // Constructor - whisper_server_context(const std::string& model_id) { - this->model_id = model_id; - this->cparams = whisper_context_params(); - this->ctx = nullptr; - // store default params so we can reset after each inference request - this->default_params = whisper_params(); - this->params = whisper_params(); - } - - // Move constructor - whisper_server_context(whisper_server_context&& other) noexcept - : params(std::move(other.params)), - default_params(std::move(other.default_params)), - whisper_mutex() // std::mutex is not movable, so we initialize a new one - , - model_id(std::move(other.model_id)), - cparams(std::move(other.cparams)), - ctx(std::exchange( - other.ctx, - nullptr)) // ctx is a raw pointer, so we use std::exchange - {} - - bool load_model(std::string& model_path); - - std::string inference(std::string& input_file_path, std::string languague, - std::string prompt, std::string response_format, - float temperature, bool translate); - - ~whisper_server_context(); -}; \ No newline at end of file diff --git a/cortex-cpp/controllers/audio.cc b/cortex-cpp/controllers/audio.cc deleted file mode 100644 index 91fd76d5b..000000000 --- a/cortex-cpp/controllers/audio.cc +++ /dev/null @@ -1,300 +0,0 @@ -#include "audio.h" - -#include "utils/nitro_utils.h" -#include "whisper.h" - -using namespace v1; - -audio::audio() { - whisper_print_system_info(); -}; - -audio::~audio() {} - -std::optional audio::ParseModelId( - const std::shared_ptr& jsonBody, - const std::function& callback) { - if (!jsonBody->isMember("model_id")) { - LOG_INFO << "No model_id found in request body"; - Json::Value jsonResp; - jsonResp["message"] = "No model_id found in request body"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k400BadRequest); - callback(resp); - return std::nullopt; // Signal that an error occurred - } - - return (*jsonBody)["model_id"].asString(); -} - -void audio::LoadModel(const HttpRequestPtr& req, - std::function&& callback) { - const auto jsonBody = req->getJsonObject(); - auto optional_model_id = ParseModelId(jsonBody, callback); - if (!optional_model_id) { - return; - } - std::string model_id = *optional_model_id; - - // Check if model is already loaded - if (whispers.find(model_id) != whispers.end()) { - std::string error_msg = "Model " + model_id + " already loaded"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k409Conflict); - callback(resp); - return; - } - - // Model not loaded, load it - // Parse model path from request - std::string model_path = (*jsonBody)["model_path"].asString(); - if (!is_file_exist(model_path.c_str())) { - std::string error_msg = "Model " + model_path + " not found"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k404NotFound); - callback(resp); - return; - } - - whisper_server_context whisper = whisper_server_context(model_id); - bool model_loaded = whisper.load_model(model_path); - // If model failed to load, return a 500 error - if (!model_loaded) { - whisper.~whisper_server_context(); - std::string error_msg = "Failed to load model " + model_path; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k500InternalServerError); - callback(resp); - 
return; - } - - // Warm up the model - // Parse warm up audio path from request - if (jsonBody->isMember("warm_up_audio_path")) { - std::string warm_up_msg = "Warming up model " + model_id; - LOG_INFO << warm_up_msg; - std::string warm_up_audio_path = - (*jsonBody)["warm_up_audio_path"].asString(); - // Return 400 error if warm up audio path is not found - if (!is_file_exist(warm_up_audio_path.c_str())) { - std::string error_msg = - "Warm up audio " + warm_up_audio_path + - " not found, please provide a valid path or don't specify it at all"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k400BadRequest); - callback(resp); - return; - } else { - LOG_INFO << "Warming up model " << model_id << " with audio " - << warm_up_audio_path << " ..."; - std::string warm_up_result = whisper.inference(warm_up_audio_path, "en", - "", text_format, 0, false); - LOG_INFO << "Warm up model " << model_id << " completed"; - } - } else { - LOG_INFO << "No warm up audio provided, skipping warm up"; - } - - // Model loaded successfully, add it to the map of loaded models - // and return a 200 response - whispers.emplace(model_id, std::move(whisper)); - Json::Value jsonResp; - std::string success_msg = "Model " + model_id + " loaded successfully"; - jsonResp["message"] = success_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k200OK); - callback(resp); - return; -} - -void audio::UnloadModel( - const HttpRequestPtr& req, - std::function&& callback) { - const auto& jsonBody = req->getJsonObject(); - auto optional_model_id = ParseModelId(jsonBody, callback); - if (!optional_model_id) { - return; - } - std::string model_id = *optional_model_id; - - // If model is not loaded, return a 404 error - if (whispers.find(model_id) == whispers.end()) { - std::string error_msg = - "Model " + model_id + - " has not been loaded, please load that model into nitro"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k404NotFound); - callback(resp); - return; - } - - // Model loaded, unload it - whispers[model_id].~whisper_server_context(); - whispers.erase(model_id); - - // Return a 200 response - Json::Value jsonResp; - std::string success_msg = "Model " + model_id + " unloaded successfully"; - LOG_INFO << success_msg; - jsonResp["message"] = success_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k200OK); - callback(resp); - return; -} - -void audio::ListModels(const HttpRequestPtr& req, - std::function&& callback) { - // Return a list of all loaded models - Json::Value jsonResp; - Json::Value models; - for (auto const& model : whispers) { - models.append(model.first); - } - jsonResp["models"] = models; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k200OK); - callback(resp); - return; -} - -void audio::TranscriptionImpl( - const HttpRequestPtr& req, - std::function&& callback, bool translate) { - MultiPartParser partParser; - Json::Value jsonResp; - if (partParser.parse(req) != 0 || partParser.getFiles().size() != 1) { - auto resp = HttpResponse::newHttpResponse(); - resp->setBody("Must have exactly one file"); - resp->setStatusCode(k403Forbidden); - callback(resp); - return; - } - auto& file = partParser.getFiles()[0]; - const auto& formFields = partParser.getParameters(); - - // Check 
if model_id are present in the request. If not, return a 400 error - if (formFields.find("model_id") == formFields.end()) { - LOG_INFO << "No model_id found in request body"; - Json::Value jsonResp; - jsonResp["message"] = "No model_id found in request body"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k400BadRequest); - callback(resp); - return; - } - - std::string model_id = formFields.at("model_id"); - - // Parse all other optional parameters from the request - std::string language = formFields.find("language") != formFields.end() - ? formFields.at("language") - : "en"; - std::string prompt = formFields.find("prompt") != formFields.end() - ? formFields.at("prompt") - : ""; - std::string response_format = - formFields.find("response_format") != formFields.end() - ? formFields.at("response_format") - : json_format; - float temperature = formFields.find("temperature") != formFields.end() - ? std::stof(formFields.at("temperature")) - : 0; - - // Check if model is loaded. If not, return a 404 error - if (whispers.find(model_id) == whispers.end()) { - std::string error_msg = - "Model " + model_id + - " has not been loaded, please load that model into nitro"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k404NotFound); - callback(resp); - return; - } - - // Save input file to temp location - std::string temp_dir = - std::filesystem::temp_directory_path().string() + "/" + - std::to_string(std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count()); - // Create the directory - std::filesystem::create_directory(temp_dir); - // Save the file to the directory, with its original name - std::string temp_file_path = temp_dir + "/" + file.getFileName(); - file.saveAs(temp_file_path); - - // Run inference - std::string result; - try { - result = - whispers[model_id].inference(temp_file_path, language, prompt, - response_format, temperature, translate); - } catch (const std::exception& e) { - std::remove(temp_file_path.c_str()); - Json::Value jsonResp; - jsonResp["message"] = e.what(); - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k500InternalServerError); - callback(resp); - return; - } - // TODO: Need to remove the entire temp directory, not just the file - std::remove(temp_file_path.c_str()); - - auto resp = nitro_utils::nitroHttpResponse(); - resp->setBody(result); - resp->setStatusCode(k200OK); - // Set content type based on response format - if (response_format == json_format || response_format == vjson_format) { - resp->addHeader("Content-Type", "application/json"); - } else if (response_format == text_format) { - resp->addHeader("Content-Type", "text/html"); - } else if (response_format == srt_format) { - resp->addHeader("Content-Type", "application/x-subrip"); - } else if (response_format == vtt_format) { - resp->addHeader("Content-Type", "text/vtt"); - } - callback(resp); - return; -} - -void audio::ModelStatus( - const HttpRequestPtr& req, - std::function&& callback) { - auto resp = nitro_utils::nitroHttpResponse(); - resp->setStatusCode(k200OK); - resp->setContentTypeCode(drogon::CT_APPLICATION_JSON); - resp->setBody("Unimplemented"); - callback(resp); -} - -void audio::CreateTranscription( - const HttpRequestPtr& req, - std::function&& callback) { - return TranscriptionImpl(req, std::move(callback), false); -} - -void audio::CreateTranslation( - const HttpRequestPtr& 
req, - std::function&& callback) { - return TranscriptionImpl(req, std::move(callback), true); -} \ No newline at end of file diff --git a/cortex-cpp/controllers/audio.h b/cortex-cpp/controllers/audio.h deleted file mode 100644 index 19b1efb6d..000000000 --- a/cortex-cpp/controllers/audio.h +++ /dev/null @@ -1,74 +0,0 @@ -#pragma once - -#include -#include -#include -#include "common/base.h" - -#define DR_WAV_IMPLEMENTATION -#include "utils/dr_wav.h" - -#include "utils/json.hpp" - -// Whisper Context -#include "context/whisper_server_context.h" - -using json = nlohmann::ordered_json; - -using namespace drogon; - -namespace v1 { - -class audio : public drogon::HttpController