From ed3e23f01f66bdeb4f4d88a87ae8ef05e159ed8d Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Tue, 30 Apr 2024 16:18:51 +0700 Subject: [PATCH 01/10] fix: struct cortex-cpp --- .gitmodules | 6 ++++++ cortex-cpp/.gitmodules | 7 ------- cortex-cpp/llama.cpp | 1 + cortex-cpp/whisper.cpp | 1 + 4 files changed, 8 insertions(+), 7 deletions(-) create mode 100644 .gitmodules delete mode 100644 cortex-cpp/.gitmodules create mode 160000 cortex-cpp/llama.cpp create mode 160000 cortex-cpp/whisper.cpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..ea95ac179 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,6 @@ +[submodule "cortex-cpp/llama.cpp"] + path = cortex-cpp/llama.cpp + url = https://github.com/ggerganov/llama.cpp +[submodule "cortex-cpp/whisper.cpp"] + path = cortex-cpp/whisper.cpp + url = https://github.com/ggerganov/whisper.cpp.git diff --git a/cortex-cpp/.gitmodules b/cortex-cpp/.gitmodules deleted file mode 100644 index e2f71d456..000000000 --- a/cortex-cpp/.gitmodules +++ /dev/null @@ -1,7 +0,0 @@ -[submodule "llama.cpp"] - path = llama.cpp - url = https://github.com/ggerganov/llama.cpp - branch = master -[submodule "whisper.cpp"] - path = whisper.cpp - url = https://github.com/ggerganov/whisper.cpp.git diff --git a/cortex-cpp/llama.cpp b/cortex-cpp/llama.cpp new file mode 160000 index 000000000..8a56075b0 --- /dev/null +++ b/cortex-cpp/llama.cpp @@ -0,0 +1 @@ +Subproject commit 8a56075b07a8b571bf95a912ffdce4c928c2b414 diff --git a/cortex-cpp/whisper.cpp b/cortex-cpp/whisper.cpp new file mode 160000 index 000000000..9fab28135 --- /dev/null +++ b/cortex-cpp/whisper.cpp @@ -0,0 +1 @@ +Subproject commit 9fab28135c7867bb7eccd9ebcd2ea8d52e42ca81 From 45aabe388adc44c1b6723f8745db910caa662588 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Tue, 30 Apr 2024 16:47:11 +0700 Subject: [PATCH 02/10] fix: split llama engine --- .gitmodules | 3 - cortex-cpp/CMakeLists.txt | 11 +- cortex-cpp/common/base.h | 3 +- cortex-cpp/context/llama_server_context.h | 2260 ----------------- cortex-cpp/controllers/llamaCPP.cc | 809 +----- cortex-cpp/controllers/llamaCPP.h | 95 +- cortex-cpp/cortex-common/EngineI.h | 27 + .../engines/cortex.llamacpp/engine.cmake | 21 + cortex-cpp/llama.cpp | 1 - cortex-cpp/models/chat_completion_request.h | 36 - cortex-cpp/nitro_deps/CMakeLists.txt | 11 +- cortex-cpp/test/components/test_models.cc | 53 - cortex-cpp/utils/dylib.h | 316 +++ 13 files changed, 532 insertions(+), 3114 deletions(-) delete mode 100644 cortex-cpp/context/llama_server_context.h create mode 100644 cortex-cpp/cortex-common/EngineI.h create mode 100644 cortex-cpp/engines/cortex.llamacpp/engine.cmake delete mode 160000 cortex-cpp/llama.cpp delete mode 100644 cortex-cpp/models/chat_completion_request.h delete mode 100644 cortex-cpp/test/components/test_models.cc create mode 100644 cortex-cpp/utils/dylib.h diff --git a/.gitmodules b/.gitmodules index ea95ac179..65a042814 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "cortex-cpp/llama.cpp"] - path = cortex-cpp/llama.cpp - url = https://github.com/ggerganov/llama.cpp [submodule "cortex-cpp/whisper.cpp"] path = cortex-cpp/whisper.cpp url = https://github.com/ggerganov/whisper.cpp.git diff --git a/cortex-cpp/CMakeLists.txt b/cortex-cpp/CMakeLists.txt index eba4fee0c..9be937dc4 100644 --- a/cortex-cpp/CMakeLists.txt +++ b/cortex-cpp/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.5) project(nitro C CXX) +# include(engines/cortex.llamacpp/engine.cmake) include(CheckIncludeFileCXX) 
check_include_file_cxx(any HAS_ANY) @@ -55,8 +56,6 @@ endif() add_compile_definitions(NITRO_VERSION="${NITRO_VERSION}") -add_subdirectory(llama.cpp/examples/llava) -add_subdirectory(llama.cpp) add_subdirectory(whisper.cpp) add_subdirectory(test) @@ -68,8 +67,9 @@ add_executable(${PROJECT_NAME} main.cc) # target_link_libraries(${PROJECT_NAME} PRIVATE nitro_deps) # # and comment out the following lines + find_package(Drogon CONFIG REQUIRED) -target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon common llama whisper llava +target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon whisper ${CMAKE_THREAD_LIBS_INIT}) # ############################################################################## @@ -89,6 +89,7 @@ aux_source_directory(controllers CTL_SRC) aux_source_directory(common COMMON_SRC) aux_source_directory(context CONTEXT_SRC) aux_source_directory(models MODEL_SRC) +aux_source_directory(cortex-common CORTEX_COMMON) # aux_source_directory(filters FILTER_SRC) aux_source_directory(plugins # PLUGIN_SRC) @@ -97,10 +98,10 @@ aux_source_directory(models MODEL_SRC) # namespaces. drogon_create_views(${PROJECT_NAME} # ${CMAKE_CURRENT_SOURCE_DIR}/views ${CMAKE_CURRENT_BINARY_DIR} TRUE) -target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) +target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ) # ${CMAKE_CURRENT_SOURCE_DIR}/models) target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC}) # ${FILTER_SRC} ${PLUGIN_SRC} ${MODEL_SRC}) # ############################################################################## # uncomment the following line for dynamically loading views set_property(TARGET -# ${PROJECT_NAME} PROPERTY ENABLE_EXPORTS ON) +# ${PROJECT_NAME} PROPERTY ENABLE_EXPORTS ON) \ No newline at end of file diff --git a/cortex-cpp/common/base.h b/cortex-cpp/common/base.h index e87d07488..adfe42a19 100644 --- a/cortex-cpp/common/base.h +++ b/cortex-cpp/common/base.h @@ -1,6 +1,5 @@ #pragma once #include -#include using namespace drogon; @@ -25,7 +24,7 @@ class BaseChatCompletion { // General chat method virtual void ChatCompletion( - inferences::ChatCompletionRequest &&completion, + const HttpRequestPtr& req, std::function&& callback) = 0; }; diff --git a/cortex-cpp/context/llama_server_context.h b/cortex-cpp/context/llama_server_context.h deleted file mode 100644 index 21792f11b..000000000 --- a/cortex-cpp/context/llama_server_context.h +++ /dev/null @@ -1,2260 +0,0 @@ -#include -#include -#include -#include - -// External -#include "clip.h" -#include "common.h" -#include "llama.h" -#include "llava.h" -#include "stb_image.h" -#include "utils/json.hpp" - -#if defined(_WIN32) -#define NOMINMAX -#endif - -using json = nlohmann::json; - -#define DEFAULT_OAICOMPAT_MODEL "gpt-3.5-turbo-0613" - -struct server_params { - std::string hostname = "127.0.0.1"; - std::string api_key; - std::string public_path = "examples/server/public"; - int32_t port = 8080; - int32_t read_timeout = 600; - int32_t write_timeout = 600; -}; - -static bool server_verbose = false; - -#if SERVER_VERBOSE != 1 -#define LOG_VERBOSE(MSG, ...) -#else -#define LOG_VERBOSE(MSG, ...) \ - do { \ - if (server_verbose) { \ - server_log("VERBOSE", __func__, __LINE__, MSG, __VA_ARGS__); \ - } \ - } while (0) -#endif - -#define LOG_ERROR_LLAMA(MSG, ...) \ - server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_WARNING_LLAMA(MSG, ...) \ - server_log("WARNING", __func__, __LINE__, MSG, __VA_ARGS__) -#define LOG_INFO_LLAMA(MSG, ...) 
\ - server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) - -// -// base64 utils (TODO: move to common in the future) -// - -static const std::string base64_chars = - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; - -static inline bool is_base64(uint8_t c) { - return (isalnum(c) || (c == '+') || (c == '/')); -} - -static std::vector base64_decode(const std::string& encoded_string) { - int i = 0; - int j = 0; - int in_ = 0; - - int in_len = encoded_string.size(); - - uint8_t char_array_4[4]; - uint8_t char_array_3[3]; - - std::vector ret; - - while (in_len-- && (encoded_string[in_] != '=') && - is_base64(encoded_string[in_])) { - char_array_4[i++] = encoded_string[in_]; - in_++; - if (i == 4) { - for (i = 0; i < 4; i++) { - char_array_4[i] = base64_chars.find(char_array_4[i]); - } - - char_array_3[0] = - ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = - ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (i = 0; (i < 3); i++) { - ret.push_back(char_array_3[i]); - } - i = 0; - } - } - - if (i) { - for (j = i; j < 4; j++) { - char_array_4[j] = 0; - } - - for (j = 0; j < 4; j++) { - char_array_4[j] = base64_chars.find(char_array_4[j]); - } - - char_array_3[0] = - ((char_array_4[0]) << 2) + ((char_array_4[1] & 0x30) >> 4); - char_array_3[1] = - ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); - char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; - - for (j = 0; (j < i - 1); j++) { - ret.push_back(char_array_3[j]); - } - } - - return ret; -} - -// -// parallel -// - -enum task_type { COMPLETION_TASK, CANCEL_TASK }; - -struct task_server { - int id; - int target_id; - task_type type; - json data; - bool infill_mode = false; - bool embedding_mode = false; - int multitask_id = -1; -}; - -struct task_result { - int id; - int multitask_id = -1; - bool stop; - bool error; - json result_json; -}; - -struct task_multi { - int id; - std::set subtasks_remaining{}; - std::vector results{}; -}; - -// TODO: can become bool if we can't find use of more states -enum slot_state { - IDLE, - PROCESSING, -}; - -enum slot_command { - NONE, - LOAD_PROMPT, - RELEASE, -}; - -struct slot_params { - bool stream = true; - bool cache_prompt = - false; // remember the prompt to avoid reprocessing all prompt - - uint32_t seed = -1; // RNG seed - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_predict = -1; // new tokens to predict - - std::vector antiprompt; - - json input_prefix; - json input_suffix; -}; - -struct slot_image { - int32_t id; - - bool request_encode_image = false; - float* image_embedding = nullptr; - int32_t image_tokens = 0; - - clip_image_u8* img_data; - - std::string prefix_prompt; // before of this image -}; - -// completion token output with probabilities -struct completion_token_output { - struct token_prob { - llama_token tok; - float prob; - }; - - std::vector probs; - llama_token tok; - std::string text_to_send; -}; - -static size_t common_part(const std::vector& a, - const std::vector& b) { - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) {} - return i; -} - -enum stop_type { - STOP_FULL, - STOP_PARTIAL, -}; - -enum class ModelType { LLM = 0, EMBEDDING }; - -static bool ends_with(const std::string& str, const std::string& suffix) { - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} - 
-static size_t find_partial_stop_string(const std::string& stop, - const std::string& text) { - if (!text.empty() && !stop.empty()) { - const char text_last_char = text.back(); - for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) { - if (stop[char_index] == text_last_char) { - const std::string current_partial = stop.substr(0, char_index + 1); - if (ends_with(text, current_partial)) { - return text.size() - char_index - 1; - } - } - } - } - return std::string::npos; -} - -// TODO: reuse llama_detokenize -template -static std::string tokens_to_str(llama_context* ctx, Iter begin, Iter end) { - std::string ret; - for (; begin != end; ++begin) { - ret += llama_token_to_piece(ctx, *begin); - } - return ret; -} - -static void server_log(const char* level, const char* function, int line, - const char* message, - const nlohmann::ordered_json& extra) { - nlohmann::ordered_json log{ - {"timestamp", time(nullptr)}, {"level", level}, - {"function", function}, {"line", line}, - {"message", message}, - }; - - if (!extra.empty()) { - log.merge_patch(extra); - } - - const std::string str = - log.dump(-1, ' ', false, json::error_handler_t::replace); - printf("%.*s\n", (int)str.size(), str.data()); - fflush(stdout); -} - -// format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context* ctx, - const llama_token token) { - std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); - // if the size is 1 and first bit is 1, meaning it's a partial character - // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) { - std::stringstream ss; - ss << std::hex << (out[0] & 0xff); - std::string res(ss.str()); - out = "byte: \\x" + res; - } - return out; -} - -// convert a vector of completion_token_output to json -static json probs_vector_to_json( - const llama_context* ctx, - const std::vector& probs) { - json out = json::array(); - for (const auto& prob : probs) { - json probs_for_token = json::array(); - for (const auto& p : prob.probs) { - std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json{ - {"tok_str", tok_str}, - {"prob", p.prob}, - }); - } - std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json{ - {"content", tok_str}, - {"probs", probs_for_token}, - }); - } - return out; -} - -template -static T json_value(const json& body, const std::string& key, - const T& default_value) { - // Fallback null to default value - return body.contains(key) && !body.at(key).is_null() - ? 
body.value(key, default_value) - : default_value; -} - -struct llama_client_slot { - int id; - int task_id = -1; - - struct slot_params params; - - slot_state state = IDLE; - slot_command command = NONE; - - // used to determine the slot that has been used the longest - int64_t t_last_used = -1; - - // generation props - int32_t n_ctx = 0; // context size per slot - int32_t n_past = 0; - int32_t n_decoded = 0; - int32_t n_remaining = -1; - int32_t i_batch = -1; - - int32_t num_prompt_tokens = 0; - int32_t num_prompt_tokens_processed = 0; - - json prompt; - std::string generated_text; - llama_token sampled; - std::vector cache_tokens; - std::vector generated_token_probs; - - bool infill = false; - bool embedding = false; - bool has_next_token = true; - bool truncated = false; - bool stopped_eos = false; - bool stopped_word = false; - bool stopped_limit = false; - - bool oaicompat = false; - std::string oaicompat_model; - - std::string stopping_word; - - // sampling - struct llama_sampling_params sparams; - llama_sampling_context* ctx_sampling = nullptr; - - // multimodal - std::vector images; - - // stats - size_t sent_count = 0; - size_t sent_token_probs_index = 0; - - int64_t t_start_process_prompt; - int64_t t_start_genereration; - - double t_prompt_processing; // ms - double t_token_generation; // ms - - // multitasks - int multitask_id = -1; - - void reset() { - num_prompt_tokens = 0; - generated_text = ""; - truncated = false; - stopped_eos = false; - stopped_word = false; - stopped_limit = false; - stopping_word = ""; - n_past = 0; - sent_count = 0; - sent_token_probs_index = 0; - infill = false; - - generated_token_probs.clear(); - - for (slot_image& img : images) { - free(img.image_embedding); - if (img.img_data) { - clip_image_u8_free(img.img_data); - } - img.prefix_prompt = ""; - } - - images.clear(); - } - - bool has_budget(gpt_params& global_params) { - n_remaining = -1; - if (params.n_predict != -1) { - n_remaining = params.n_predict - n_decoded; - } else if (global_params.n_predict != -1) { - n_remaining = global_params.n_predict - n_decoded; - } - return n_remaining > 0 || n_remaining == -1; // no budget || limitless - } - - bool available() const { return state == IDLE && command == NONE; } - - bool is_processing() const { - return (state == IDLE && command == LOAD_PROMPT) || state == PROCESSING; - } - - void add_token_string(const completion_token_output& token) { - if (command == RELEASE) { - return; - } - cache_tokens.push_back(token.tok); - generated_token_probs.push_back(token); - } - - void release() { - if (state == IDLE || state == PROCESSING) { - t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; - command = RELEASE; - } - } - - json get_formated_timings() { - return json{ - {"prompt_n", num_prompt_tokens_processed}, - {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", - t_prompt_processing / num_prompt_tokens_processed}, - {"prompt_per_second", - 1e3 / t_prompt_processing * num_prompt_tokens_processed}, - - {"predicted_n", n_decoded}, - {"predicted_ms", t_token_generation}, - {"predicted_per_token_ms", t_token_generation / n_decoded}, - {"predicted_per_second", 1e3 / t_token_generation * n_decoded}, - }; - } - - void print_timings() const { - LOG_DEBUG << __func__ << ": prompt eval time = " << t_prompt_processing - << "ms / " << num_prompt_tokens_processed << " tokens (" - << t_prompt_processing / num_prompt_tokens_processed - << " ms per " - "token, " - << 1e3 / t_prompt_processing * num_prompt_tokens_processed - << " tokens per 
second)"; - LOG_DEBUG << __func__ << ": eval time = " << t_token_generation - << " ms / " << n_decoded << " runs (" - << t_token_generation / n_decoded - << " ms per " - "token, " - << 1e3 / t_token_generation * n_decoded - << " tokens per second)\n"; - LOG_DEBUG << __func__ << ": total time = " - << t_prompt_processing + t_token_generation << " ms"; - } -}; - -struct llama_server_context { - llama_model* model = nullptr; - llama_context* ctx = nullptr; - - clip_ctx* clp_ctx = nullptr; - - gpt_params params; - - llama_batch batch; - - bool multimodal = false; - bool clean_kv_cache = true; - bool all_slots_are_idle = false; - bool add_bos_token = true; - - int32_t id_gen; - int32_t n_ctx; // total context for all clients / slots - - // Internal - std::atomic model_loaded_external = false; - - // system prompt - bool system_need_update = false; - - std::string system_prompt; - std::vector system_tokens; - - std::string name_user; // this should be the antiprompt - std::string name_assistant; - - // slots / clients - std::vector slots; - - std::vector queue_tasks; - std::vector queue_results; - std::vector queue_multitasks; - std::mutex mutex_tasks; // also guards id_gen, and queue_multitasks - std::condition_variable condition_tasks; - std::mutex mutex_results; - std::condition_variable condition_results; - ModelType model_type = ModelType::LLM; - - ~llama_server_context() { - if (ctx) { - llama_free(ctx); - ctx = nullptr; - } - if (model) { - llama_free_model(model); - model = nullptr; - } - } - - bool load_model(const gpt_params& params_) { - params = params_; - if (!params.mmproj.empty()) { - multimodal = true; - LOG_DEBUG << "Multi Modal Mode Enabled"; - clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/1); - if (clp_ctx == nullptr) { - LOG_ERROR_LLAMA("unable to load clip model", - {{"model", params.mmproj}}); - return false; - } - - if (params.n_ctx < - 2048) { // request larger context for the image embedding - params.n_ctx = 2048; - } - } - - std::tie(model, ctx) = llama_init_from_gpt_params(params); - if (model == nullptr) { - LOG_ERROR_LLAMA("llama.cpp unable to load model", - {{"model", params.model}}); - return false; - } - - if (multimodal) { - const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); - const int n_embd_llm = llama_n_embd(model); - if (n_embd_clip != n_embd_llm) { - LOG_DEBUG << __func__ << ": embedding dim of the multimodal projector (" - << n_embd_clip - << ") is not " - "equal to that of LLaMA (" - << n_embd_llm - << "). 
Make sure that you use the " - "correct mmproj file."; - llama_free(ctx); - llama_free_model(model); - return false; - } - } - - if (ctx == nullptr) { - LOG_ERROR_LLAMA("Unable to get llama.cpp context", {}); - return false; - } - n_ctx = llama_n_ctx(ctx); - - add_bos_token = llama_should_add_bos_token(model); - - return true; - } - - void initialize() { - id_gen = 0; - - // create slots - all_slots_are_idle = true; - - const int32_t n_ctx_slot = n_ctx / params.n_parallel; - - LOG_DEBUG << "Available slots: "; - for (int i = 0; i < params.n_parallel; i++) { - llama_client_slot slot; - - slot.id = i; - slot.n_ctx = n_ctx_slot; - slot.reset(); - - LOG_DEBUG << " -> Slot " << slot.id << " - max context: " << n_ctx_slot; - slots.push_back(slot); - } - - try { - batch = llama_batch_init(n_ctx, 0, params.n_parallel); - } catch (const std::exception& e) { - LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata", - {{"exception", e.what()}, - {"n_tokens_alloc", n_ctx}, - {"embd", 0}, - {"n_seq_max", params.n_parallel}}); - } - - // empty system prompt - system_prompt = ""; - system_tokens.clear(); - } - - std::vector tokenize(const json& json_prompt, - bool add_bos) const { - // TODO: currently, we tokenize using special tokens by default - // this is not always correct (see - // https://github.com/ggerganov/llama.cpp/pull/4160#issuecomment-1824826216) - // but it's better compared to completely ignoring ChatML and other - // chat templates - const bool TMP_FORCE_SPECIAL = true; - - // If `add_bos` is true, we only add BOS, when json_prompt is a string, - // or the first element of the json_prompt array is a string. - std::vector prompt_tokens; - - if (json_prompt.is_array()) { - bool first = true; - for (const auto& p : json_prompt) { - if (p.is_string()) { - auto s = p.template get(); - std::vector p; - if (first) { - p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); - first = false; - } else { - p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL); - } - prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end()); - } else { - if (first) { - first = false; - } - prompt_tokens.push_back(p.template get()); - } - } - } else { - auto s = json_prompt.template get(); - prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL); - } - - return prompt_tokens; - } - - llama_client_slot* get_slot(int id) { - int64_t t_last = ggml_time_us(); - llama_client_slot* last_used = nullptr; - - for (llama_client_slot& slot : slots) { - if (slot.id == id && slot.available()) { - return &slot; - } - - if (slot.available() && slot.t_last_used < t_last) { - last_used = &slot; - t_last = slot.t_last_used; - } - } - - return last_used; - } - - bool launch_slot_with_data(llama_client_slot*& slot, json data) { - slot_params default_params; - llama_sampling_params default_sparams; - - if (data.count("__oaicompat") != 0) { - slot->oaicompat = true; - slot->oaicompat_model = - json_value(data, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - } else { - slot->oaicompat = false; - slot->oaicompat_model = ""; - } - - slot->params.stream = json_value(data, "stream", false); - slot->params.cache_prompt = json_value(data, "cache_prompt", false); - slot->params.n_predict = - json_value(data, "n_predict", default_params.n_predict); - slot->sparams.top_k = json_value(data, "top_k", default_sparams.top_k); - slot->sparams.top_p = json_value(data, "top_p", default_sparams.top_p); - slot->sparams.min_p = json_value(data, "min_p", default_sparams.min_p); - slot->sparams.tfs_z = json_value(data, "tfs_z", 
default_sparams.tfs_z); - slot->sparams.typical_p = - json_value(data, "typical_p", default_sparams.typical_p); - slot->sparams.temp = json_value(data, "temperature", default_sparams.temp); - slot->sparams.penalty_last_n = - json_value(data, "repeat_last_n", default_sparams.penalty_last_n); - slot->sparams.penalty_repeat = - json_value(data, "repeat_penalty", default_sparams.penalty_repeat); - slot->sparams.penalty_freq = - json_value(data, "frequency_penalty", default_sparams.penalty_freq); - slot->sparams.penalty_present = - json_value(data, "presence_penalty", default_sparams.penalty_present); - slot->sparams.mirostat = - json_value(data, "mirostat", default_sparams.mirostat); - slot->sparams.mirostat_tau = - json_value(data, "mirostat_tau", default_sparams.mirostat_tau); - slot->sparams.mirostat_eta = - json_value(data, "mirostat_eta", default_sparams.mirostat_eta); - slot->sparams.penalize_nl = - json_value(data, "penalize_nl", default_sparams.penalize_nl); - slot->params.n_keep = json_value(data, "n_keep", slot->params.n_keep); - slot->params.seed = json_value(data, "seed", default_params.seed); - slot->sparams.grammar = - json_value(data, "grammar", default_sparams.grammar); - slot->sparams.n_probs = - json_value(data, "n_probs", default_sparams.n_probs); - - // infill - if (data.count("input_prefix") != 0) { - slot->params.input_prefix = data["input_prefix"]; - } else { - slot->params.input_prefix = ""; - } - - if (data.count("input_suffix") != 0) { - slot->params.input_suffix = data["input_suffix"]; - } else { - slot->params.input_suffix = ""; - } - - if (data.count("prompt") != 0) { - slot->prompt = data["prompt"]; - } else { - slot->prompt = ""; - } - - slot->sparams.penalty_prompt_tokens.clear(); - slot->sparams.use_penalty_prompt_tokens = false; - const auto& penalty_prompt = data.find("penalty_prompt"); - if (penalty_prompt != data.end()) { - if (penalty_prompt->is_string()) { - const auto penalty_prompt_string = penalty_prompt->get(); - auto penalty_tokens = - llama_tokenize(model, penalty_prompt_string, false); - slot->sparams.penalty_prompt_tokens.swap(penalty_tokens); - if (slot->params.n_predict > 0) { - slot->sparams.penalty_prompt_tokens.reserve( - slot->sparams.penalty_prompt_tokens.size() + - slot->params.n_predict); - } - slot->sparams.use_penalty_prompt_tokens = true; - } else if (penalty_prompt->is_array()) { - const auto n_tokens = penalty_prompt->size(); - slot->sparams.penalty_prompt_tokens.reserve( - n_tokens + std::max(0, slot->params.n_predict)); - const int n_vocab = llama_n_vocab(model); - for (const auto& penalty_token : *penalty_prompt) { - if (penalty_token.is_number_integer()) { - const auto tok = penalty_token.get(); - if (tok >= 0 && tok < n_vocab) { - slot->sparams.penalty_prompt_tokens.push_back(tok); - } - } - } - slot->sparams.use_penalty_prompt_tokens = true; - } - } - - slot->sparams.logit_bias.clear(); - - if (json_value(data, "ignore_eos", false)) { - slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; - } - - const auto& logit_bias = data.find("logit_bias"); - if (logit_bias != data.end() && logit_bias->is_array()) { - const int n_vocab = llama_n_vocab(model); - for (const auto& el : *logit_bias) { - if (el.is_array() && el.size() == 2 && el[0].is_number_integer()) { - llama_token tok = el[0].get(); - if (tok >= 0 && tok < n_vocab) { - if (el[1].is_number()) { - slot->sparams.logit_bias[tok] = el[1].get(); - } else if (el[1].is_boolean() && !el[1].get()) { - slot->sparams.logit_bias[tok] = -INFINITY; - } - } - } - } - } - - 
slot->params.antiprompt.clear(); - - const auto& stop = data.find("stop"); - if (stop != data.end() && stop->is_array()) { - for (const auto& word : *stop) { - if (!word.empty()) { - slot->params.antiprompt.push_back(word); - } - } - } - - if (multimodal) { - const auto& images_data = data.find("image_data"); - if (images_data != data.end() && images_data->is_array()) { - for (const auto& img : *images_data) { - const std::vector image_buffer = - base64_decode(img["data"].get()); - - slot_image img_sl; - img_sl.id = - img.count("id") != 0 ? img["id"].get() : slot->images.size(); - img_sl.img_data = clip_image_u8_init(); - if (!clip_image_load_from_bytes( - image_buffer.data(), image_buffer.size(), img_sl.img_data)) { - LOG_DEBUG << "slot " << slot->id - << " - failed to load image [id: " << img_sl.id << "]"; - return false; - } - LOG_DEBUG << "slot " << slot->id << " - loaded image"; - img_sl.request_encode_image = true; - slot->images.push_back(img_sl); - } - // process prompt - // example: system prompt [img-102] user [img-103] describe [img-134] -> - // [{id: 102, prefix: 'system prompt '}, {id: 103, prefix: ' user '}, - // {id: 134, prefix: ' describe '}]} - if (slot->images.size() > 0 && !slot->prompt.is_array()) { - std::string prompt = slot->prompt.get(); - size_t pos = 0, begin_prefix = 0; - std::string pattern = "[img-"; - while ((pos = prompt.find(pattern, pos)) != std::string::npos) { - size_t end_prefix = pos; - pos += pattern.length(); - size_t end_pos = prompt.find("]", pos); - if (end_pos != std::string::npos) { - std::string image_id = prompt.substr(pos, end_pos - pos); - try { - int img_id = std::stoi(image_id); - bool found = false; - for (slot_image& img : slot->images) { - if (img.id == img_id) { - found = true; - img.prefix_prompt = - prompt.substr(begin_prefix, end_prefix - begin_prefix); - begin_prefix = end_pos + 1; - break; - } - } - if (!found) { - LOG_DEBUG << "ERROR: Image with id: " << img_id - << ", not found.\n"; - slot->images.clear(); - return false; - } - } catch (const std::invalid_argument& e) { - LOG_DEBUG << "Invalid image number id in prompt"; - slot->images.clear(); - return false; - } - } - } - slot->prompt = ""; - slot->params.input_suffix = prompt.substr(begin_prefix); - slot->params.cache_prompt = - false; // multimodal doesn't support cache prompt - } - } - } - - if (slot->ctx_sampling != nullptr) { - llama_sampling_free(slot->ctx_sampling); - } - slot->ctx_sampling = llama_sampling_init(slot->sparams); - llama_set_rng_seed(ctx, slot->params.seed); - slot->command = LOAD_PROMPT; - - all_slots_are_idle = false; - - LOG_DEBUG << "slot " << slot->id - << " is processing [task id: " << slot->task_id << "]"; - - return true; - } - - void kv_cache_clear() { - // clear the entire KV cache - llama_kv_cache_clear(ctx); - } - - void update_system_prompt() { - system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token); - - llama_batch_clear(batch); - - kv_cache_clear(); - - for (int i = 0; i < (int)system_tokens.size(); ++i) { - llama_batch_add(batch, system_tokens[i], i, {0}, false); - } - - if (llama_decode(ctx, batch) != 0) { - LOG_WARN << __func__ << ": llama_decode() failed"; - return; - } - - // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i < params.n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); - } - - LOG_DEBUG << "system prompt updated"; - system_need_update = false; - } - - void notify_system_prompt_changed() { - // release all slots - for (llama_client_slot& slot : slots) { 
- slot.release(); - } - - system_need_update = true; - } - - void process_system_prompt_data(const json& sys_props) { - system_prompt = sys_props.value("prompt", ""); - name_user = sys_props.value("anti_prompt", ""); - name_assistant = sys_props.value("assistant_name", ""); - - if (slots.size() > 0) { - notify_system_prompt_changed(); - } - } - - static size_t find_stopping_strings(const std::string& text, - const size_t last_token_size, - const stop_type type, - llama_client_slot& slot) { - size_t stop_pos = std::string::npos; - - for (const std::string& word : slot.params.antiprompt) { - size_t pos; - if (type == STOP_FULL) { - const size_t tmp = word.size() + last_token_size; - const size_t from_pos = text.size() > tmp ? text.size() - tmp : 0; - pos = text.find(word, from_pos); - } else { - pos = find_partial_stop_string(word, text); - } - if (pos != std::string::npos && - (stop_pos == std::string::npos || pos < stop_pos)) { - if (type == STOP_FULL) { - slot.stopped_word = true; - slot.stopping_word = word; - slot.has_next_token = false; - } - stop_pos = pos; - } - } - - return stop_pos; - } - - bool process_token(completion_token_output& result, llama_client_slot& slot) { - // remember which tokens were sampled - used for repetition penalties during - // sampling - const std::string token_str = llama_token_to_piece(ctx, result.tok); - slot.sampled = result.tok; - - // search stop word and delete it - slot.generated_text += token_str; - slot.has_next_token = true; - - if (slot.ctx_sampling->params.use_penalty_prompt_tokens && - result.tok != -1) { - // we can change penalty_prompt_tokens because it is always created from - // scratch each request - slot.ctx_sampling->params.penalty_prompt_tokens.push_back(result.tok); - } - - // check if there is incomplete UTF-8 character at the end - bool incomplete = false; - for (unsigned i = 1; i < 5 && i <= slot.generated_text.size(); ++i) { - unsigned char c = slot.generated_text[slot.generated_text.size() - i]; - if ((c & 0xC0) == 0x80) { - // continuation byte: 10xxxxxx - continue; - } - if ((c & 0xE0) == 0xC0) { - // 2-byte character: 110xxxxx ... - incomplete = i < 2; - } else if ((c & 0xF0) == 0xE0) { - // 3-byte character: 1110xxxx ... - incomplete = i < 3; - } else if ((c & 0xF8) == 0xF0) { - // 4-byte character: 11110xxx ... 
- incomplete = i < 4; - } - // else 1-byte character or invalid byte - break; - } - - if (!incomplete) { - size_t pos = std::min(slot.sent_count, slot.generated_text.size()); - const std::string str_test = slot.generated_text.substr(pos); - bool is_stop_full = false; - size_t stop_pos = - find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); - if (stop_pos != std::string::npos) { - is_stop_full = true; - slot.generated_text.erase(slot.generated_text.begin() + pos + stop_pos, - slot.generated_text.end()); - pos = std::min(slot.sent_count, slot.generated_text.size()); - } else { - is_stop_full = false; - stop_pos = find_stopping_strings(str_test, token_str.size(), - STOP_PARTIAL, slot); - } - - // check if there is any token to predict - if (stop_pos == std::string::npos || - (!slot.has_next_token && !is_stop_full && stop_pos > 0)) { - // no send the stop word in the response - result.text_to_send = - slot.generated_text.substr(pos, std::string::npos); - slot.sent_count += result.text_to_send.size(); - // add the token to slot queue and cache - } - slot.add_token_string(result); - if (slot.params.stream) { - send_partial_response(slot, result); - } - } - - if (incomplete) { - slot.has_next_token = true; - } - - // check the limits - if (slot.n_decoded > 2 && slot.has_next_token && !slot.has_budget(params)) { - slot.stopped_limit = true; - slot.has_next_token = false; - } - - if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) { - slot.stopped_eos = true; - slot.has_next_token = false; - LOG_VERBOSE("eos token found", {}); - } - - LOG_VERBOSE( - "next token", - { - {"token", result.tok}, - {"token_text", tokens_to_output_formatted_string(ctx, result.tok)}, - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, - {"num_tokens_predicted", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - }); - - return slot.has_next_token; // continue - } - bool process_images(llama_client_slot& slot) const { - for (slot_image& img : slot.images) { - if (!img.request_encode_image) { - continue; - } - - if (!llava_image_embed_make_with_clip_img( - clp_ctx, params.n_threads, img.img_data, &img.image_embedding, - &img.image_tokens)) { - LOG_DEBUG << "Error processing the given image"; - return false; - } - - img.request_encode_image = false; - } - - return slot.images.size() > 0; - } - void send_error(task_server& task, std::string error) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = task.id; - res.multitask_id = task.multitask_id; - res.stop = false; - res.error = true; - res.result_json = {{"content", error}}; - queue_results.push_back(res); - condition_results.notify_all(); - } - - void add_multi_task(int id, std::vector& sub_ids) { - std::lock_guard lock(mutex_tasks); - task_multi multi; - multi.id = id; - std::copy(sub_ids.begin(), sub_ids.end(), - std::inserter(multi.subtasks_remaining, - multi.subtasks_remaining.end())); - queue_multitasks.push_back(multi); - condition_tasks.notify_one(); - } - - void update_multi_task(int multitask_id, int subtask_id, - task_result& result) { - std::lock_guard lock(mutex_tasks); - for (auto& multitask : queue_multitasks) { - if (multitask.id == multitask_id) { - multitask.subtasks_remaining.erase(subtask_id); - multitask.results.push_back(result); - condition_tasks.notify_one(); - } - } - } - - json get_model_props() { return get_formated_generation(slots[0]); } - - 
json get_formated_generation(llama_client_slot& slot) { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); - const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && - eos_bias->second < 0.0f && - std::isinf(eos_bias->second); - return json{ - {"n_ctx", slot.n_ctx}, - {"model", params.model_alias}, - {"seed", slot.params.seed}, - {"temperature", slot.sparams.temp}, - {"top_k", slot.sparams.top_k}, - {"top_p", slot.sparams.top_p}, - {"min_p", slot.sparams.min_p}, - {"tfs_z", slot.sparams.tfs_z}, - {"typical_p", slot.sparams.typical_p}, - {"repeat_last_n", slot.sparams.penalty_last_n}, - {"repeat_penalty", slot.sparams.penalty_repeat}, - {"presence_penalty", slot.sparams.penalty_present}, - {"frequency_penalty", slot.sparams.penalty_freq}, - {"penalty_prompt_tokens", slot.sparams.penalty_prompt_tokens}, - {"use_penalty_prompt_tokens", slot.sparams.use_penalty_prompt_tokens}, - {"mirostat", slot.sparams.mirostat}, - {"mirostat_tau", slot.sparams.mirostat_tau}, - {"mirostat_eta", slot.sparams.mirostat_eta}, - {"penalize_nl", slot.sparams.penalize_nl}, - {"stop", slot.params.antiprompt}, - {"n_predict", slot.params.n_predict}, - {"n_keep", params.n_keep}, - {"ignore_eos", ignore_eos}, - {"stream", slot.params.stream}, - {"logit_bias", slot.sparams.logit_bias}, - {"n_probs", slot.sparams.n_probs}, - {"grammar", slot.sparams.grammar}, - }; - } - - void send_partial_response(llama_client_slot& slot, - completion_token_output tkn) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.multitask_id = slot.multitask_id; - res.error = false; - res.stop = false; - - res.result_json = json{{"content", tkn.text_to_send}, - {"stop", false}, - {"slot_id", slot.id}, - {"multimodal", multimodal}}; - - if (slot.sparams.n_probs > 0) { - std::vector probs_output = {}; - const std::vector to_send_toks = - llama_tokenize(ctx, tkn.text_to_send, false); - size_t probs_pos = std::min(slot.sent_token_probs_index, - slot.generated_token_probs.size()); - size_t probs_stop_pos = - std::min(slot.sent_token_probs_index + to_send_toks.size(), - slot.generated_token_probs.size()); - if (probs_pos < probs_stop_pos) { - probs_output = std::vector( - slot.generated_token_probs.begin() + probs_pos, - slot.generated_token_probs.begin() + probs_stop_pos); - } - slot.sent_token_probs_index = probs_stop_pos; - res.result_json["completion_probabilities"] = - probs_vector_to_json(ctx, probs_output); - } - - if (slot.oaicompat) { - res.result_json["oaicompat_token_ctr"] = slot.n_decoded; - res.result_json["model"] = slot.oaicompat_model; - } - - queue_results.push_back(res); - condition_results.notify_all(); - } - - void send_final_response(llama_client_slot& slot) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.multitask_id = slot.multitask_id; - res.error = false; - res.stop = true; - - res.result_json = - json{{"content", !slot.params.stream ? 
slot.generated_text : ""}, - {"slot_id", slot.id}, - {"stop", true}, - {"model", params.model_alias}, - {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.num_prompt_tokens}, - {"generation_settings", get_formated_generation(slot)}, - {"prompt", slot.prompt}, - {"truncated", slot.truncated}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, - {"tokens_cached", slot.n_past}, - {"timings", slot.get_formated_timings()}}; - - if (slot.sparams.n_probs > 0) { - std::vector probs = {}; - if (!slot.params.stream && slot.stopped_word) { - const std::vector stop_word_toks = - llama_tokenize(ctx, slot.stopping_word, false); - probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.end() - stop_word_toks.size()); - } else { - probs = std::vector( - slot.generated_token_probs.begin(), - slot.generated_token_probs.begin() + slot.sent_token_probs_index); - } - res.result_json["completion_probabilities"] = - probs_vector_to_json(ctx, probs); - } - - if (slot.oaicompat) { - res.result_json["oaicompat_token_ctr"] = slot.n_decoded; - res.result_json["model"] = slot.oaicompat_model; - } - - // parent multitask, if any, needs to be updated - if (slot.multitask_id != -1) { - update_multi_task(slot.multitask_id, slot.task_id, res); - } - - queue_results.push_back(res); - condition_results.notify_all(); - } - - void send_embedding(llama_client_slot& slot) { - std::unique_lock lock(mutex_results); - task_result res; - res.id = slot.task_id; - res.multitask_id = slot.multitask_id; - res.error = false; - res.stop = true; - - const int n_embd = llama_n_embd(model); - - std::vector embd_res(n_embd, 0.0f); - - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { - continue; - } - - const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); - } - - if (embd == NULL) { - LOG_ERROR << "failed to get embeddings" - << " token " << batch.token[i] << ", seq_id " - << batch.seq_id[i][0]; - - res.result_json = json{ - {"embedding", std::vector(n_embd, 0.0f)}, - }; - - continue; - } - - llama_embd_normalize(embd, embd_res.data(), n_embd); - } - res.result_json = json{ - {"embedding", embd_res}, - }; - - queue_results.push_back(res); - condition_results.notify_all(); - } - - int request_completion(json data, bool infill, bool embedding, - int multitask_id) { - std::unique_lock lock(mutex_tasks); - task_server task; - task.id = id_gen++; - task.target_id = 0; - task.data = std::move(data); - task.infill_mode = infill; - task.embedding_mode = embedding; - task.type = COMPLETION_TASK; - task.multitask_id = multitask_id; - - // when a completion task's prompt array is not a singleton, we split it - // into multiple requests - if (task.data.at("prompt").size() > 1) { - lock.unlock(); // entering new func scope - return split_multiprompt_task(task); - } - - // otherwise, it's a single-prompt task, we actually queue it - queue_tasks.push_back(task); - condition_tasks.notify_one(); - return task.id; - } - - task_result next_result(int task_id) { - while (true) { - std::unique_lock lock(mutex_results); - condition_results.wait(lock, [&] { return !queue_results.empty(); }); - - for (int i = 0; i < (int)queue_results.size(); i++) { - // for now, tasks that have associated parent multitasks just get erased - // once multitask picks up the result - if 
(queue_results[i].multitask_id == task_id) { - update_multi_task(task_id, queue_results[i].id, queue_results[i]); - queue_results.erase(queue_results.begin() + i); - continue; - } - - if (queue_results[i].id == task_id) { - if (queue_results[i].multitask_id != -1) { - LOG_ERROR_LLAMA("Incorrect multitask ID", {{"task_id", task_id}}); - } - task_result res = queue_results[i]; - queue_results.erase(queue_results.begin() + i); - return res; - } - } - } - - // never reached - // return task_result{-1, false, false, {}}; - } - - // for multiple images processing - bool ingest_images(llama_client_slot& slot, int n_batch) { - int image_idx = 0; - - while (image_idx < (int)slot.images.size()) { - slot_image& img = slot.images[image_idx]; - - // process prefix prompt - for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { - const int32_t n_tokens = - std::min(n_batch, (int32_t)(batch.n_tokens - i)); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, - 0, - 0, // unused - }; - if (llama_decode(ctx, batch_view)) { - LOG_DEBUG << __func__ << " : failed to eval\n"; - return false; - } - } - - // process image with llm - for (int i = 0; i < img.image_tokens; i += n_batch) { - int n_eval = img.image_tokens - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - - const int n_embd = llama_n_embd(model); - llama_batch batch_img = { - n_eval, nullptr, (img.image_embedding + i * n_embd), - nullptr, nullptr, nullptr, - nullptr, slot.n_past, 1, - 0, - }; - if (llama_decode(ctx, batch_img)) { - LOG_DEBUG << __func__ << " : failed to eval image"; - return false; - } - slot.n_past += n_eval; - } - image_idx++; - - llama_batch_clear(batch); - - // append prefix of next image - const auto json_prompt = - (image_idx >= (int)slot.images.size()) - ? slot.params.input_suffix - : // no more images, then process suffix prompt - (json)(slot.images[image_idx].prefix_prompt); - - std::vector append_tokens = - tokenize(json_prompt, false); // has next image - for (int i = 0; i < (int)append_tokens.size(); ++i) { - llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true); - slot.n_past += 1; - } - } - - return true; - } - - void request_cancel(int task_id) { - std::unique_lock lock(mutex_tasks); - task_server task; - task.id = id_gen++; - task.type = CANCEL_TASK; - task.target_id = task_id; - queue_tasks.push_back(task); - condition_tasks.notify_one(); - } - - int split_multiprompt_task(task_server& multiprompt_task) { - int prompt_count = multiprompt_task.data.at("prompt").size(); - assert(prompt_count > 1); - - int multitask_id = id_gen++; - std::vector subtask_ids(prompt_count); - for (int i = 0; i < prompt_count; i++) { - json subtask_data = multiprompt_task.data; - subtask_data["prompt"] = subtask_data["prompt"][i]; - - // subtasks inherit everything else (infill mode, embedding mode, etc.) 
- subtask_ids[i] = - request_completion(subtask_data, multiprompt_task.infill_mode, - multiprompt_task.embedding_mode, multitask_id); - } - - // queue up the multitask so we can track its subtask progression - add_multi_task(multitask_id, subtask_ids); - return multitask_id; - } - - void process_tasks() { - std::unique_lock lock(mutex_tasks); - while (!queue_tasks.empty()) { - task_server task = queue_tasks.front(); - queue_tasks.erase(queue_tasks.begin()); - switch (task.type) { - case COMPLETION_TASK: { - llama_client_slot* slot = - get_slot(json_value(task.data, "slot_id", -1)); - if (slot == nullptr) { - LOG_DEBUG << "slot unavailable"; - // send error result - send_error(task, "slot unavailable"); - return; - } - - if (task.data.contains("system_prompt")) { - process_system_prompt_data(task.data["system_prompt"]); - } - - slot->reset(); - - slot->infill = task.infill_mode; - slot->embedding = task.embedding_mode; - slot->task_id = task.id; - slot->multitask_id = task.multitask_id; - - if (!launch_slot_with_data(slot, task.data)) { - // send error result - send_error(task, "internal_error"); - break; - } - } break; - case CANCEL_TASK: { // release slot linked with the task id - for (auto& slot : slots) { - if (slot.task_id == task.target_id) { - slot.release(); - break; - } - } - } break; - } - } - - // remove finished multitasks from the queue of multitasks, and add the - // corresponding result to the result queue - auto queue_iterator = queue_multitasks.begin(); - while (queue_iterator != queue_multitasks.end()) { - if (queue_iterator->subtasks_remaining.empty()) { - // all subtasks done == multitask is done - task_result aggregate_result; - aggregate_result.id = queue_iterator->id; - aggregate_result.stop = true; - aggregate_result.error = false; - - // collect json results into one json result - std::vector result_jsons; - for (auto& subres : queue_iterator->results) { - result_jsons.push_back(subres.result_json); - aggregate_result.error = aggregate_result.error && subres.error; - } - aggregate_result.result_json = json{"results", result_jsons}; - - std::lock_guard lock(mutex_results); - queue_results.push_back(aggregate_result); - condition_results.notify_all(); - - queue_iterator = queue_multitasks.erase(queue_iterator); - } else { - ++queue_iterator; - } - } - } - - bool update_slots() { - // attend tasks - process_tasks(); - - // update the system prompt wait until all slots are idle state - if (system_need_update && all_slots_are_idle) { - LOG_DEBUG << "updating system prompt"; - update_system_prompt(); - } - - llama_batch_clear(batch); - - if (all_slots_are_idle) { - if (system_prompt.empty() && clean_kv_cache) { - LOG_DEBUG - << "all slots are idle and system prompt is empty, clear the KV " - "cache"; - kv_cache_clear(); - } - // std::this_thread::sleep_for(std::chrono::milliseconds(5)); - // TODO: Need to implement queueing using CV for better performance - std::unique_lock lock(mutex_tasks); - condition_tasks.wait(lock, [&] { - return (!queue_tasks.empty() && model_loaded_external) || - (queue_tasks.empty() && !model_loaded_external); - }); - } - - for (llama_client_slot& slot : slots) { - if (slot.is_processing() && - slot.cache_tokens.size() >= (size_t)slot.n_ctx) { - // Shift context - const int n_left = slot.n_past - slot.params.n_keep - 1; - const int n_discard = n_left / 2; - - LOG_DEBUG << "slot " << slot.id - << " context shift - n_keep = " << slot.params.n_keep - << ", n_left = " << n_left << ", n_discard: " << n_discard; - llama_kv_cache_seq_rm(ctx, slot.id, 
slot.params.n_keep + 1, - slot.params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, slot.id, slot.params.n_keep + 1 + n_discard, - slot.n_past, -n_discard); - - for (size_t i = slot.params.n_keep + 1 + n_discard; - i < slot.cache_tokens.size(); i++) { - slot.cache_tokens[i - n_discard] = slot.cache_tokens[i]; - } - - slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard); - - slot.n_past -= n_discard; - - slot.truncated = true; - - LOG_VERBOSE("context shift", { - {"n_ctx", n_ctx}, - {"n_keep", params.n_keep}, - {"n_left", n_left}, - }); - } - } - - // decode any currently ongoing sequences - for (auto& slot : slots) { - // release the slot - if (slot.command == RELEASE) { - slot.state = IDLE; - slot.command = NONE; - slot.t_last_used = ggml_time_us(); - - LOG_DEBUG << "slot " << slot.id << " released (" - << (int)slot.cache_tokens.size() << " tokens in cache)"; - - continue; - } - - if (slot.state == IDLE) { - continue; - } - - slot.i_batch = batch.n_tokens; - - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past, - {slot.id}, true); - - slot.n_decoded += 1; - slot.n_past += 1; - } - - // process in chunks of params.n_batch - int32_t n_batch = params.n_batch; - - // assign workload to the slots - if (params.cont_batching || batch.n_tokens == 0) { - for (auto& slot : slots) { - const bool has_prompt = slot.prompt.is_array() || - (slot.prompt.is_string() && - !slot.prompt.get().empty()) || - !slot.images.empty(); - - // empty prompt passed -> release the slot and send empty response - if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) { - slot.release(); - slot.print_timings(); - send_final_response(slot); - continue; - } - - // need process the prompt - if (slot.state == IDLE && slot.command == LOAD_PROMPT) { - slot.state = PROCESSING; - slot.command = NONE; - std::vector prompt_tokens; - slot.t_start_process_prompt = ggml_time_us(); - slot.t_start_genereration = 0; - - if (slot.infill) { - bool suff_rm_leading_spc = true; - if (params.input_suffix.find_first_of(' ') == 0 && - params.input_suffix.size() > 1) { - params.input_suffix.erase(0, 1); - suff_rm_leading_spc = false; - } - auto prefix_tokens = tokenize(slot.params.input_prefix, false); - auto suffix_tokens = tokenize(slot.params.input_suffix, false); - - const int space_token = - 29871; // TODO: this should not be hardcoded - if (suff_rm_leading_spc && !suffix_tokens.empty() && - suffix_tokens[0] == space_token) { - suffix_tokens.erase(suffix_tokens.begin()); - } - - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_prefix(model)); - prefix_tokens.insert(prefix_tokens.begin(), - llama_token_bos(model)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), - llama_token_suffix(model)); - prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), - suffix_tokens.end()); - prefix_tokens.push_back(llama_token_middle(model)); - prompt_tokens = prefix_tokens; - } else { - prompt_tokens = tokenize( - slot.prompt, - system_prompt.empty() && - add_bos_token); // add BOS if there isn't system prompt - } - - slot.num_prompt_tokens = prompt_tokens.size(); - - if (slot.params.n_keep < 0) { - slot.params.n_keep = slot.num_prompt_tokens; - } - slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - - // if input prompt is too big, truncate it - if (slot.num_prompt_tokens >= slot.n_ctx) { - const int n_left = slot.n_ctx - slot.params.n_keep; - const int n_block_size = n_left / 2; - const int erased_blocks = - (slot.num_prompt_tokens - slot.params.n_keep - 
n_block_size) / - n_block_size; - - std::vector new_tokens( - prompt_tokens.begin(), - prompt_tokens.begin() + slot.params.n_keep); - new_tokens.insert(new_tokens.end(), - prompt_tokens.begin() + slot.params.n_keep + - erased_blocks * n_block_size, - prompt_tokens.end()); - - LOG_VERBOSE( - "input truncated", - { - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, - {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), - new_tokens.cend())}, - }); - slot.truncated = true; - prompt_tokens = new_tokens; - - slot.num_prompt_tokens = prompt_tokens.size(); - GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); - } - - if (!slot.params.cache_prompt) { - llama_sampling_reset(slot.ctx_sampling); - - slot.n_past = 0; - slot.num_prompt_tokens_processed = slot.num_prompt_tokens; - } else { - // push the prompt into the sampling context (do not apply grammar) - for (auto& token : prompt_tokens) { - llama_sampling_accept(slot.ctx_sampling, ctx, token, false); - } - - slot.n_past = common_part(slot.cache_tokens, prompt_tokens); - slot.num_prompt_tokens_processed = - slot.num_prompt_tokens - slot.n_past; - - LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past - << " tokens | to process: " - << slot.num_prompt_tokens_processed << " tokens"; - } - - LOG_DEBUG << "slot " << slot.id << " : kv cache rm - [" - << (int)system_tokens.size() + slot.n_past << ", end)"; - - llama_kv_cache_seq_rm(ctx, slot.id, - system_tokens.size() + slot.n_past, -1); - - slot.cache_tokens = prompt_tokens; - - if (slot.n_past == slot.num_prompt_tokens) { - // we have to evaluate at least 1 token to generate logits. - LOG_DEBUG << "slot " << slot.id - << " : we have to evaluate at least 1 token to " - "generate logits"; - slot.n_past--; - } - - LOG_VERBOSE( - "prompt ingested", - { - {"n_past", slot.n_past}, - {"cached", - tokens_to_str(ctx, slot.cache_tokens.cbegin(), - slot.cache_tokens.cbegin() + slot.n_past)}, - {"to_eval", - tokens_to_str(ctx, slot.cache_tokens.cbegin() + slot.n_past, - slot.cache_tokens.cend())}, - }); - - const bool has_images = process_images(slot); - - // process the prefix of first image - std::vector prefix_tokens = - has_images ? 
tokenize(slot.images[0].prefix_prompt, add_bos_token) - : prompt_tokens; - for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) { - llama_batch_add(batch, prefix_tokens[slot.n_past], - system_tokens.size() + slot.n_past, {slot.id}, - false); - } - - if (has_images && !ingest_images(slot, n_batch)) { - LOG_DEBUG << "failed processing images"; - return false; - } - - // extract the logits only for the last token - if (batch.n_tokens > 0) { - batch.logits[batch.n_tokens - 1] = true; - } - - slot.n_decoded = 0; - slot.i_batch = batch.n_tokens - 1; - } - } - } - - if (batch.n_tokens == 0) { - all_slots_are_idle = true; - return true; - } - - for (int32_t i = 0; i < (int32_t)batch.n_tokens; i += n_batch) { - const int32_t n_tokens = std::min(n_batch, (int32_t)(batch.n_tokens - i)); - llama_batch batch_view = { - n_tokens, - batch.token + i, - nullptr, - batch.pos + i, - batch.n_seq_id + i, - batch.seq_id + i, - batch.logits + i, - 0, - 0, - 0, // unused - }; - - const int ret = llama_decode(ctx, batch_view); - if (ret != 0) { - if (n_batch == 1 || ret < 0) { - // if you get here, it means the KV cache is full - try increasing it - // via the context size - LOG_DEBUG << __func__ - << " : failed to decode the batch, n_batch = " << n_batch - << ", ret = " << ret; - return false; - } - - LOG_DEBUG - << __func__ - << " : failed to find free space in the KV cache, retrying with " - "smaller n_batch = " - << n_batch / 2; - - // retry with half the batch size to try to find a free slot in the KV - // cache - n_batch /= 2; - i -= n_batch; - continue; - } - - for (auto& slot : slots) { - if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) { - continue; - } - - // prompt evaluated for embedding - if (slot.embedding) { - send_embedding(slot); - slot.release(); - slot.i_batch = -1; - return true; - } - - completion_token_output result; - const llama_token id = llama_sampling_sample(slot.ctx_sampling, ctx, - NULL, slot.i_batch - i); - - llama_sampling_accept(slot.ctx_sampling, ctx, id, true); - - if (slot.n_decoded == 1) { - slot.t_start_genereration = ggml_time_us(); - slot.t_prompt_processing = - (slot.t_start_genereration - slot.t_start_process_prompt) / 1e3; - } - - llama_token_data_array cur_p = {slot.ctx_sampling->cur.data(), - slot.ctx_sampling->cur.size(), false}; - result.tok = id; - - const int32_t n_probs = slot.sparams.n_probs; - if (slot.sparams.temp <= 0 && n_probs > 0) { - // for llama_sample_token_greedy we need to sort candidates - llama_sample_softmax(ctx, &cur_p); - } - - for (size_t i = 0; i < std::min(cur_p.size, (size_t)n_probs); ++i) { - result.probs.push_back({cur_p.data[i].id, cur_p.data[i].p}); - } - - if (!process_token(result, slot)) { - slot.release(); - slot.print_timings(); - send_final_response(slot); - } - - slot.i_batch = -1; - } - } - return true; - } -}; - -static void server_print_usage(const char* argv0, const gpt_params& params, - const server_params& sparams) { - printf("usage: %s [options]\n", argv0); - printf("\n"); - printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -v, --verbose verbose output (default: %s)\n", - server_verbose ? 
"enabled" : "disabled"); - printf( - " -t N, --threads N number of threads to use during " - "computation (default: %d)\n", - params.n_threads); - printf( - " -tb N, --threads-batch N number of threads to use during batch " - "and prompt processing (default: same as --threads)\n"); - printf( - " -c N, --ctx-size N size of the prompt context (default: " - "%d)\n", - params.n_ctx); - printf(" --rope-scaling {none,linear,yarn}\n"); - printf( - " RoPE frequency scaling method, defaults " - "to linear unless specified by the model\n"); - printf( - " --rope-freq-base N RoPE base frequency (default: loaded " - "from model)\n"); - printf( - " --rope-freq-scale N RoPE frequency scaling factor, expands " - "context by a factor of 1/N\n"); - printf( - " --yarn-ext-factor N YaRN: extrapolation mix factor (default: " - "1.0, 0.0 = full interpolation)\n"); - printf( - " --yarn-attn-factor N YaRN: scale sqrt(t) or attention " - "magnitude (default: 1.0)\n"); - printf( - " --yarn-beta-slow N YaRN: high correction dim or alpha " - "(default: %.1f)\n", - params.yarn_beta_slow); - printf( - " --yarn-beta-fast N YaRN: low correction dim or beta " - "(default: %.1f)\n", - params.yarn_beta_fast); - printf( - " -b N, --batch-size N batch size for prompt processing " - "(default: %d)\n", - params.n_batch); - printf( - " --memory-f32 use f32 instead of f16 for memory " - "key+value (default: disabled)\n"); - printf( - " not recommended: doubles context memory " - "required and no measurable increase in quality\n"); - if (llama_supports_mlock()) { - printf( - " --mlock force system to keep model in RAM " - "rather than swapping or compressing\n"); - } - if (llama_supports_mmap()) { - printf( - " --no-mmap do not memory-map model (slower load " - "but may reduce pageouts if not using mlock)\n"); - } - printf( - " --numa attempt optimizations that help on some " - "NUMA systems\n"); - if (llama_supports_gpu_offload()) { - printf(" -ngl N, --n-gpu-layers N\n"); - printf(" number of layers to store in VRAM\n"); - printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); - printf( - " how to split the model across multiple " - "GPUs, one of:\n"); - printf(" - none: use one GPU only\n"); - printf( - " - layer (default): split layers and " - "KV across GPUs\n"); - printf(" - row: split rows across GPUs\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf( - " fraction of the model to offload to " - "each GPU, comma-separated list of proportions, e.g. 3,1\n"); - printf( - " -mg i, --main-gpu i the GPU to use for the model (with " - "split-mode = none),\n"); - printf( - " or for intermediate results and KV " - "(with split-mode = row)\n"); - } - printf(" -m FNAME, --model FNAME\n"); - printf(" model path (default: %s)\n", - params.model.c_str()); - printf(" -a ALIAS, --alias ALIAS\n"); - printf( - " set an alias for the model, will be " - "added as `model` field in completion response\n"); - printf( - " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - printf( - " --lora-base FNAME optional model to use as a base for the " - "layers modified by the LoRA adapter\n"); - printf( - " --host ip address to listen (default (default: " - "%s)\n", - sparams.hostname.c_str()); - printf(" --port PORT port to listen (default (default: %d)\n", - sparams.port); - printf( - " --path PUBLIC_PATH path from which to serve static files " - "(default %s)\n", - sparams.public_path.c_str()); - printf( - " --api-key API_KEY optional api key to enhance server " - "security. 
If set, requests must include this key for access.\n"); - printf( - " --api-key-file FNAME path to file containing api keys " - "delimited by new lines. If set, requests must include one of the " - "keys for access.\n"); - printf( - " -to N, --timeout N server read/write timeout in seconds " - "(default: %d)\n", - sparams.read_timeout); - printf( - " --embedding enable embedding vector output (default: " - "%s)\n", - params.embedding ? "enabled" : "disabled"); - printf( - " -np N, --parallel N number of slots for process requests " - "(default: %d)\n", - params.n_parallel); - printf( - " -cb, --cont-batching enable continuous batching (a.k.a " - "dynamic batching) (default: disabled)\n"); - printf(" -spf FNAME, --system-prompt-file FNAME\n"); - printf( - " set a file to load a system prompt " - "(initial " - "prompt of all slots), this is useful for chat applications.\n"); - printf( - " --mmproj MMPROJ_FILE path to a multimodal projector file for " - "LLaVA.\n"); - printf(" --log-disable disables logging to a file.\n"); - printf("\n"); - printf(" --override-kv KEY=TYPE:VALUE\n"); - printf( - " advanced option to override model " - "metadata by key. may be specified multiple times.\n"); - printf( - " types: int, float, bool. example: " - "--override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf( - " -gan N, --grp-attn-n N set the group attention factor to extend " - "context size through self-extend(default: 1=disabled), used together " - "with group attention width `--grp-attn-w`"); - printf( - " -gaw N, --grp-attn-w N set the group attention width to extend " - "context size through self-extend(default: 512), used together with " - "group attention factor `--grp-attn-n`"); - printf("\n"); -} -static std::string random_string() { - static const std::string str( - "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); - - std::random_device rd; - std::mt19937 generator(rd()); - - std::string result(32, ' '); - - for (int i = 0; i < 32; ++i) { - result[i] = str[generator() % str.size()]; - } - - return result; -} - -static std::string gen_chatcmplid() { - std::stringstream chatcmplid; - chatcmplid << "chatcmpl-" << random_string(); - return chatcmplid.str(); -} -static json format_final_response_oaicompat(const json& request, - const task_result& response, - bool streaming = false) { - json result = response.result_json; - - bool stopped_word = result.count("stopped_word") != 0; - bool stopped_eos = json_value(result, "stopped_eos", false); - int num_tokens_predicted = json_value(result, "tokens_predicted", 0); - int num_prompt_tokens = json_value(result, "tokens_evaluated", 0); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason = "length"; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - - json choices = - streaming ? json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}) - : json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"message", json{{"content", content}, - {"role", "assistant"}}}}}); - - std::time_t t = std::time(0); - - json res = - json{{"choices", choices}, - {"created", t}, - {"model", - json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))}, - {"object", streaming ? 
"chat.completion.chunk" : "chat.completion"}, - {"usage", - json{{"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, - {"id", gen_chatcmplid()}}; - - if (server_verbose) { - res["__verbose"] = result; - } - - if (result.contains("completion_probabilities")) { - res["completion_probabilities"] = - json_value(result, "completion_probabilities", json::array()); - } - - return res; -} - -// return value is vector as there is one case where we might need to generate -// two responses -static std::vector format_partial_response_oaicompat( - const task_result& response) { - json result = response.result_json; - - if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) { - return std::vector({response.result_json}); - } - - bool first = json_value(result, "oaicompat_token_ctr", 0) == 0; - std::string modelname = - json_value(result, "model", std::string(DEFAULT_OAICOMPAT_MODEL)); - - bool stopped_word = json_value(result, "stopped_word", false); - bool stopped_eos = json_value(result, "stopped_eos", false); - bool stopped_limit = json_value(result, "stopped_limit", false); - std::string content = json_value(result, "content", std::string("")); - - std::string finish_reason; - if (stopped_word || stopped_eos) { - finish_reason = "stop"; - } - if (stopped_limit) { - finish_reason = "length"; - } - - std::time_t t = std::time(0); - - json choices; - - if (!finish_reason.empty()) { - choices = json::array({json{{"finish_reason", finish_reason}, - {"index", 0}, - {"delta", json::object()}}}); - } else { - if (first) { - if (content.empty()) { - choices = json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}}); - } else { - // We have to send this as two updates to conform to openai behavior - json initial_ret = - json{{"choices", - json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"role", "assistant"}}}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - json second_ret = - json{{"choices", - json::array({json{{"finish_reason", nullptr}, - {"index", 0}, - {"delta", json{{"content", content}}}}})}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({initial_ret, second_ret}); - } - } else { - // Some idiosyncrasy in task processing logic makes several trailing calls - // with empty content, we ignore these at the calee site. 
- if (content.empty()) { - return std::vector({json::object()}); - } - - choices = json::array({json{ - {"finish_reason", nullptr}, - {"index", 0}, - {"delta", - json{ - {"content", content}, - }}, - }}); - } - } - - json ret = json{{"choices", choices}, - {"created", t}, - {"id", gen_chatcmplid()}, - {"model", modelname}, - {"object", "chat.completion.chunk"}}; - - return std::vector({ret}); -} - -static json format_partial_response( - llama_server_context& llama, llama_client_slot* slot, - const std::string& content, - const std::vector& probs) { - json res = json{{"content", content}, - {"stop", false}, - {"slot_id", slot->id}, - {"multimodal", llama.multimodal}}; - - if (slot->sparams.n_probs > 0) { - res["completion_probabilities"] = probs_vector_to_json(llama.ctx, probs); - } - - return res; -} - -static json format_tokenizer_response(const std::vector& tokens) { - return json{{"tokens", tokens}}; -} - -static json format_detokenized_response(std::string content) { - return json{{"content", content}}; -} - -struct token_translator { - llama_context* ctx; - std::string operator()(llama_token tok) const { - return llama_token_to_piece(ctx, tok); - } - std::string operator()(const completion_token_output& cto) const { - return (*this)(cto.tok); - } -}; - -static void append_to_generated_text_from_generated_token_probs( - llama_server_context& llama, llama_client_slot* slot) { - auto& gtps = slot->generated_token_probs; - auto translator = token_translator{llama.ctx}; - auto add_strlen = [=](size_t sum, const completion_token_output& cto) { - return sum + translator(cto).size(); - }; - const size_t len = - std::accumulate(gtps.begin(), gtps.end(), size_t(0), add_strlen); - if (slot->generated_text.capacity() < slot->generated_text.size() + len) { - slot->generated_text.reserve(slot->generated_text.size() + len); - } - for (const completion_token_output& cto : gtps) { - slot->generated_text += translator(cto); - } -} diff --git a/cortex-cpp/controllers/llamaCPP.cc b/cortex-cpp/controllers/llamaCPP.cc index 69284d6e9..3beaae0e9 100644 --- a/cortex-cpp/controllers/llamaCPP.cc +++ b/cortex-cpp/controllers/llamaCPP.cc @@ -1,757 +1,148 @@ #include "llamaCPP.h" +#include #include #include -#include "log.h" + +#include "trantor/utils/Logger.h" #include "utils/logging_utils.h" #include "utils/nitro_utils.h" -// External -#include "common.h" -#include "llama.h" - using namespace inferences; using json = nlohmann::json; - -/** - * The state of the inference task - */ -enum InferenceStatus { PENDING, RUNNING, EOS, FINISHED }; - -/** - * There is a need to save state of current ongoing inference status of a - * handler, this struct is to solve that issue - * - * @param inst Pointer to the llamaCPP instance this inference task is - * associated with. 
- */ -struct inferenceState { - int task_id; - InferenceStatus inference_status = PENDING; - llamaCPP* instance; - // Check if we receive the first token, set it to false after receiving - bool is_first_token = true; - - inferenceState(llamaCPP* inst) : instance(inst) {} -}; - -/** - * This function is to create the smart pointer to inferenceState, hence the - * inferenceState will be persisting even tho the lambda in streaming might go - * out of scope and the handler already moved on - */ -std::shared_ptr create_inference_state(llamaCPP* instance) { - return std::make_shared(instance); -} - -/** - * Check if model already loaded if not return message to user - * @param callback the function to return message to user - */ -bool llamaCPP::CheckModelLoaded( - const std::function& callback) { - if (!llama.model_loaded_external) { - LOG_ERROR << "Model has not been loaded"; - Json::Value jsonResp; - jsonResp["message"] = - "Model has not been loaded, please load model into nitro"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k409Conflict); - callback(resp); - return false; - } - return true; -} - -Json::Value create_embedding_payload(const std::vector& embedding, - int prompt_tokens) { - Json::Value dataItem; - - dataItem["object"] = "embedding"; - - Json::Value embeddingArray(Json::arrayValue); - for (const auto& value : embedding) { - embeddingArray.append(value); - } - dataItem["embedding"] = embeddingArray; - dataItem["index"] = 0; - - return dataItem; -} - -Json::Value create_full_return_json(const std::string& id, - const std::string& model, - const std::string& content, - const std::string& system_fingerprint, - int prompt_tokens, int completion_tokens, - Json::Value finish_reason = Json::Value()) { - Json::Value root; - - root["id"] = id; - root["model"] = model; - root["created"] = static_cast(std::time(nullptr)); - root["object"] = "chat.completion"; - root["system_fingerprint"] = system_fingerprint; - - Json::Value choicesArray(Json::arrayValue); - Json::Value choice; - - choice["index"] = 0; - Json::Value message; - message["role"] = "assistant"; - message["content"] = content; - choice["message"] = message; - choice["finish_reason"] = finish_reason; - - choicesArray.append(choice); - root["choices"] = choicesArray; - - Json::Value usage; - usage["prompt_tokens"] = prompt_tokens; - usage["completion_tokens"] = completion_tokens; - usage["total_tokens"] = prompt_tokens + completion_tokens; - root["usage"] = usage; - - return root; -} - -std::string create_return_json(const std::string& id, const std::string& model, - const std::string& content, - Json::Value finish_reason = Json::Value()) { - Json::Value root; - - root["id"] = id; - root["model"] = model; - root["created"] = static_cast(std::time(nullptr)); - root["object"] = "chat.completion.chunk"; - - Json::Value choicesArray(Json::arrayValue); - Json::Value choice; - - choice["index"] = 0; - Json::Value delta; - delta["content"] = content; - choice["delta"] = delta; - choice["finish_reason"] = finish_reason; - - choicesArray.append(choice); - root["choices"] = choicesArray; - - Json::StreamWriterBuilder writer; - writer["indentation"] = ""; // This sets the indentation to an empty string, - // producing compact output. 
- return Json::writeString(writer, root); -} - -llamaCPP::llamaCPP() - : queue(new trantor::ConcurrentTaskQueue(llama.params.n_parallel, - "llamaCPP")) { +namespace inferences { +llamaCPP::llamaCPP() { + dylib_ = std::make_unique("./engines/cortex.llamacpp", "engine"); + auto func = dylib_->get_function("get_engine"); + engine_ = func(); // Some default values for now below - log_disable(); // Disable the log to file feature, reduce bloat for - // target - // system () + // log_disable(); // Disable the log to file feature, reduce bloat for + // target + // system () }; -llamaCPP::~llamaCPP() { - StopBackgroundTask(); -} - -void llamaCPP::WarmupModel() { - json pseudo; - - LOG_INFO << "Warm-up model"; - pseudo["prompt"] = "Hello"; - pseudo["n_predict"] = 2; - pseudo["stream"] = false; - const int task_id = llama.request_completion(pseudo, false, false, -1); - std::string completion_text; - task_result result = llama.next_result(task_id); - if (!result.error && result.stop) { - LOG_INFO << result.result_json.dump(-1, ' ', false, - json::error_handler_t::replace); - } - return; -} +llamaCPP::~llamaCPP() {} void llamaCPP::ChatCompletion( - inferences::ChatCompletionRequest&& completion, - std::function&& callback) { - // Check if model is loaded - if (CheckModelLoaded(callback)) { - // Model is loaded - // Do Inference - InferenceImpl(std::move(completion), std::move(callback)); - } -} - -void llamaCPP::InferenceImpl( - inferences::ChatCompletionRequest&& completion, + const HttpRequestPtr& req, std::function&& callback) { - if (llama.model_type == ModelType::EMBEDDING) { - LOG_WARN << "Not support completion for embedding model"; - Json::Value jsonResp; - jsonResp["message"] = "Not support completion for embedding model"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k400BadRequest); - callback(resp); - return; - } - std::string formatted_output = pre_prompt; - int request_id = ++no_of_requests; - LOG_INFO_REQUEST(request_id) << "Generating reponse for inference request"; - - json data; - json stopWords; - int no_images = 0; - // To set default value - - // Increase number of chats received and clean the prompt - //no_of_chats++; - //if (no_of_chats % clean_cache_threshold == 0) { - // LOG_INFO_REQUEST(request_id) << "Clean cache threshold reached!"; - // llama.kv_cache_clear(); - // LOG_INFO_REQUEST(request_id) << "Cache cleaned"; - //} - - // Default values to enable auto caching - //data["cache_prompt"] = caching_enabled; - data["cache_prompt"] = false; - data["n_keep"] = -1; - - // Passing load value - data["repeat_last_n"] = this->repeat_last_n; - LOG_INFO_REQUEST(request_id) - << "Stop words:" << completion.stop.toStyledString(); - - data["stream"] = completion.stream; - data["n_predict"] = completion.max_tokens; - data["top_p"] = completion.top_p; - data["temperature"] = completion.temperature; - data["frequency_penalty"] = completion.frequency_penalty; - data["presence_penalty"] = completion.presence_penalty; - const Json::Value& messages = completion.messages; - - if (!grammar_file_content.empty()) { - data["grammar"] = grammar_file_content; - }; - - if (!llama.multimodal) { - for (const auto& message : messages) { - std::string input_role = message["role"].asString(); - std::string role; - if (input_role == "user") { - role = user_prompt; - std::string content = message["content"].asString(); - formatted_output += role + content; - } else if (input_role == "assistant") { - role = ai_prompt; - std::string content = 
message["content"].asString(); - formatted_output += role + content; - } else if (input_role == "system") { - role = system_prompt; - std::string content = message["content"].asString(); - formatted_output = role + content + formatted_output; - - } else { - role = input_role; - std::string content = message["content"].asString(); - formatted_output += role + content; - } - } - formatted_output += ai_prompt; - } else { - data["image_data"] = json::array(); - for (const auto& message : messages) { - std::string input_role = message["role"].asString(); - std::string role; - if (input_role == "user") { - formatted_output += role; - for (auto content_piece : message["content"]) { - role = user_prompt; - - json content_piece_image_data; - content_piece_image_data["data"] = ""; - - auto content_piece_type = content_piece["type"].asString(); - if (content_piece_type == "text") { - auto text = content_piece["text"].asString(); - formatted_output += text; - } else if (content_piece_type == "image_url") { - auto image_url = content_piece["image_url"]["url"].asString(); - std::string base64_image_data; - if (image_url.find("http") != std::string::npos) { - LOG_INFO_REQUEST(request_id) - << "Remote image detected but not supported yet"; - } else if (image_url.find("data:image") != std::string::npos) { - LOG_INFO_REQUEST(request_id) << "Base64 image detected"; - base64_image_data = nitro_utils::extractBase64(image_url); - LOG_INFO_REQUEST(request_id) << base64_image_data; - } else { - LOG_INFO_REQUEST(request_id) << "Local image detected"; - nitro_utils::processLocalImage( - image_url, [&](const std::string& base64Image) { - base64_image_data = base64Image; - }); - LOG_INFO_REQUEST(request_id) << base64_image_data; - } - content_piece_image_data["data"] = base64_image_data; - - formatted_output += "[img-" + std::to_string(no_images) + "]"; - content_piece_image_data["id"] = no_images; - data["image_data"].push_back(content_piece_image_data); - no_images++; - } - } - - } else if (input_role == "assistant") { - role = ai_prompt; - std::string content = message["content"].asString(); - formatted_output += role + content; - } else if (input_role == "system") { - role = system_prompt; - std::string content = message["content"].asString(); - formatted_output = role + content + formatted_output; - - } else { - role = input_role; - std::string content = message["content"].asString(); - formatted_output += role + content; - } - } - formatted_output += ai_prompt; - LOG_INFO_REQUEST(request_id) << formatted_output; - } - - data["prompt"] = formatted_output; - for (const auto& stop_word : completion.stop) { - stopWords.push_back(stop_word.asString()); - } - // specify default stop words - // Ensure success case for chatML - stopWords.push_back("<|im_end|>"); - stopWords.push_back(nitro_utils::rtrim(user_prompt)); - data["stop"] = stopWords; - - bool is_streamed = data["stream"]; -// Enable full message debugging -#ifdef DEBUG - LOG_INFO_REQUEST(request_id) << "Current completion text"; - LOG_INFO_REQUEST(request_id) << formatted_output; -#endif - - if (is_streamed) { - LOG_INFO_REQUEST(request_id) << "Streamed, waiting for respone"; - auto state = create_inference_state(this); - - auto chunked_content_provider = [state, data, request_id]( - char* pBuffer, - std::size_t nBuffSize) -> std::size_t { - if (state->inference_status == PENDING) { - state->inference_status = RUNNING; - } else if (state->inference_status == FINISHED) { - return 0; - } - - if (!pBuffer) { - LOG_WARN_REQUEST(request_id) - "Connection closed 
or buffer is null. Reset context"; - state->inference_status = FINISHED; - return 0; - } - - if (state->inference_status == EOS) { - LOG_INFO_REQUEST(request_id) << "End of result"; - const std::string str = - "data: " + - create_return_json(nitro_utils::generate_random_string(20), "_", "", - "stop") + - "\n\n" + "data: [DONE]" + "\n\n"; - - LOG_VERBOSE("data stream", - {{"request_id": request_id}, {"to_send", str}}); - std::size_t nRead = std::min(str.size(), nBuffSize); - memcpy(pBuffer, str.data(), nRead); - state->inference_status = FINISHED; - return nRead; - } - - task_result result = state->instance->llama.next_result(state->task_id); - if (!result.error) { - std::string to_send = result.result_json["content"]; - - // trim the leading space if it is the first token - if (std::exchange(state->is_first_token, false)) { - nitro_utils::ltrim(to_send); - } - - const std::string str = - "data: " + - create_return_json(nitro_utils::generate_random_string(20), "_", - to_send) + - "\n\n"; - - std::size_t nRead = std::min(str.size(), nBuffSize); - memcpy(pBuffer, str.data(), nRead); - - if (result.stop) { - LOG_INFO_REQUEST(request_id) << "Reached result stop"; - state->inference_status = EOS; - return nRead; - } - - // Make sure nBufferSize is not zero - // Otherwise it stop streaming - if (!nRead) { - state->inference_status = FINISHED; - } - - return nRead; - } else { - LOG_ERROR_REQUEST(request_id) << "Error during inference"; - } - state->inference_status = FINISHED; - return 0; - }; - // Queued task - state->instance->queue->runTaskInQueue([cb = std::move(callback), state, - data, chunked_content_provider, - request_id]() { - state->task_id = - state->instance->llama.request_completion(data, false, false, -1); - - // Start streaming response - auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider, - "chat_completions.txt"); - cb(resp); - - int retries = 0; - - // Since this is an async task, we will wait for the task to be - // completed - while (state->inference_status != FINISHED && retries < 10 && - state->instance->llama.model_loaded_external) { - // Should wait chunked_content_provider lambda to be called within - // 3s - if (state->inference_status == PENDING) { - retries += 1; - } - if (state->inference_status != RUNNING) - LOG_INFO_REQUEST(request_id) - << "Wait for task to be released:" << state->task_id; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - LOG_INFO_REQUEST(request_id) << "Task completed, release it"; - // Request completed, release it - state->instance->llama.request_cancel(state->task_id); - LOG_INFO_REQUEST(request_id) << "Inference completed"; - }); + LOG_TRACE << "Start chat completion"; + auto json_body = req->getJsonObject(); + bool is_stream = (*json_body).get("stream", false).asBool(); + auto q = std::make_shared(); + engine_->HandleChatCompletion(json_body, + [q](Json::Value status, Json::Value res) { + q->push(std::make_pair(status, res)); + }); + LOG_TRACE << "Wait to chat completion responses"; + if (is_stream) { + ProcessStreamRes(std::move(callback), q); } else { - queue->runTaskInQueue( - [this, request_id, cb = std::move(callback), d = std::move(data)]() { - Json::Value respData; - int task_id = llama.request_completion(d, false, false, -1); - LOG_INFO_REQUEST(request_id) << "Non stream, waiting for respone"; - if (!json_value(d, "stream", false)) { - std::string completion_text; - task_result result = llama.next_result(task_id); - if (!result.error && result.stop) { - int prompt_tokens = 
result.result_json["tokens_evaluated"]; - int predicted_tokens = result.result_json["tokens_predicted"]; - std::string to_send = result.result_json["content"]; - nitro_utils::ltrim(to_send); - respData = create_full_return_json( - nitro_utils::generate_random_string(20), "_", to_send, "_", - prompt_tokens, predicted_tokens); - } else { - respData["message"] = "Internal error during inference"; - LOG_ERROR_REQUEST(request_id) << "Error during inference"; - } - auto resp = nitro_utils::nitroHttpJsonResponse(respData); - cb(resp); - LOG_INFO_REQUEST(request_id) << "Inference completed"; - } - }); + ProcessNonStreamRes(std::move(callback), *q); } + LOG_TRACE << "Done chat completion"; } void llamaCPP::Embedding( const HttpRequestPtr& req, std::function&& callback) { - // Check if model is loaded - if (CheckModelLoaded(callback)) { - // Model is loaded - const auto& jsonBody = req->getJsonObject(); - // Run embedding - EmbeddingImpl(jsonBody, std::move(callback)); - return; - } -} - -void llamaCPP::EmbeddingImpl( - std::shared_ptr jsonBody, - std::function&& callback) { - int request_id = ++no_of_requests; - LOG_INFO_REQUEST(request_id) << "Generating reponse for embedding request"; - // Queue embedding task - auto state = create_inference_state(this); - - state->instance->queue->runTaskInQueue([this, state, jsonBody, callback, - request_id]() { - Json::Value responseData(Json::arrayValue); - - if (jsonBody->isMember("input")) { - const Json::Value& input = (*jsonBody)["input"]; - if (input.isString()) { - // Process the single string input - state->task_id = llama.request_completion( - {{"prompt", input.asString()}, {"n_predict", 0}}, false, true, -1); - task_result result = llama.next_result(state->task_id); - std::vector embedding_result = result.result_json["embedding"]; - responseData.append(create_embedding_payload(embedding_result, 0)); - } else if (input.isArray()) { - // Process each element in the array input - for (const auto& elem : input) { - if (elem.isString()) { - const int task_id = llama.request_completion( - {{"prompt", elem.asString()}, {"n_predict", 0}}, false, true, - -1); - task_result result = llama.next_result(task_id); - std::vector embedding_result = - result.result_json["embedding"]; - responseData.append(create_embedding_payload(embedding_result, 0)); - } - } - } - } - - Json::Value root; - root["data"] = responseData; - root["model"] = "_"; - root["object"] = "list"; - Json::Value usage; - usage["prompt_tokens"] = 0; - usage["total_tokens"] = 0; - root["usage"] = usage; - - auto resp = nitro_utils::nitroHttpJsonResponse(root); - callback(resp); - LOG_INFO_REQUEST(request_id) << "Embedding completed"; - }); + LOG_TRACE << "Start embedding"; + SyncQueue q; + engine_->HandleEmbedding(req->getJsonObject(), + [&q](Json::Value status, Json::Value res) { + q.push(std::make_pair(status, res)); + }); + LOG_TRACE << "Wait to embedding"; + ProcessNonStreamRes(std::move(callback), q); + LOG_TRACE << "Done embedding"; } void llamaCPP::UnloadModel( const HttpRequestPtr& req, std::function&& callback) { - Json::Value jsonResp; - if (CheckModelLoaded(callback)) { - StopBackgroundTask(); - - llama_free(llama.ctx); - llama_free_model(llama.model); - llama.ctx = nullptr; - llama.model = nullptr; - jsonResp["message"] = "Model unloaded successfully"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - callback(resp); - LOG_INFO << "Model unloaded successfully"; - } + LOG_TRACE << "Start unload model"; + engine_->UnloadModel( + req->getJsonObject(), + [cb = 
std::move(callback)](Json::Value status, Json::Value res) { + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode( + static_cast(status["status_code"].asInt())); + cb(resp); + }); + LOG_TRACE << "Done unload model"; } void llamaCPP::ModelStatus( const HttpRequestPtr& req, std::function&& callback) { - Json::Value jsonResp; - bool is_model_loaded = llama.model_loaded_external; - if (CheckModelLoaded(callback)) { - jsonResp["model_loaded"] = is_model_loaded; - jsonResp["model_data"] = llama.get_model_props().dump(); - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - callback(resp); - LOG_INFO << "Model status responded"; - } + LOG_TRACE << "Start to get model status"; + engine_->GetModelStatus( + req->getJsonObject(), + [cb = std::move(callback)](Json::Value status, Json::Value res) { + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode( + static_cast(status["status_code"].asInt())); + cb(resp); + }); + LOG_TRACE << "Done get model status"; } void llamaCPP::LoadModel( const HttpRequestPtr& req, std::function&& callback) { - - if (!nitro_utils::isAVX2Supported() && ggml_cpu_has_avx2()) { - LOG_ERROR << "AVX2 is not supported by your processor"; - Json::Value jsonResp; - jsonResp["message"] = - "AVX2 is not supported by your processor, please download and replace " - "the correct Nitro asset version"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k500InternalServerError); - callback(resp); - return; - } - - if (llama.model_loaded_external) { - LOG_INFO << "Model already loaded"; - Json::Value jsonResp; - jsonResp["message"] = "Model already loaded"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k409Conflict); - callback(resp); - return; - } - - const auto& jsonBody = req->getJsonObject(); - if (!LoadModelImpl(jsonBody)) { - // Error occurred during model loading - Json::Value jsonResp; - jsonResp["message"] = "Failed to load model"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(drogon::k500InternalServerError); - callback(resp); - } else { - // Model loaded successfully - Json::Value jsonResp; - jsonResp["message"] = "Model loaded successfully"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - callback(resp); - LOG_INFO << "Model loaded successfully"; - } -} - -bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { - gpt_params params; - std::string model_type; - // By default will setting based on number of handlers - if (jsonBody) { - if (!jsonBody->operator[]("mmproj").isNull()) { - LOG_INFO << "MMPROJ FILE detected, multi-model enabled!"; - params.mmproj = jsonBody->operator[]("mmproj").asString(); - } - if (!jsonBody->operator[]("grp_attn_n").isNull()) { - params.grp_attn_n = jsonBody->operator[]("grp_attn_n").asInt(); - } - if (!jsonBody->operator[]("grp_attn_w").isNull()) { - params.grp_attn_w = jsonBody->operator[]("grp_attn_w").asInt(); - } - if (!jsonBody->operator[]("mlock").isNull()) { - params.use_mlock = jsonBody->operator[]("mlock").asBool(); + LOG_TRACE << "Load model"; + engine_->LoadModel( + req->getJsonObject(), + [cb = std::move(callback)](Json::Value status, Json::Value res) { + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode( + static_cast(status["status_code"].asInt())); + cb(resp); + }); + LOG_TRACE << "Done load model"; +} + +void llamaCPP::ProcessStreamRes(std::function cb, + std::shared_ptr q) { + auto err_or_done = std::make_shared(false); + auto 
chunked_content_provider = [q, err_or_done]( + char* buf, + std::size_t buf_size) -> std::size_t { + if (buf == nullptr) { + LOG_TRACE << "Buf is null"; + return 0; } - if (!jsonBody->operator[]("grammar_file").isNull()) { - std::string grammar_file = - jsonBody->operator[]("grammar_file").asString(); - std::ifstream file(grammar_file); - if (!file) { - LOG_ERROR << "Grammar file not found"; - } else { - std::stringstream grammarBuf; - grammarBuf << file.rdbuf(); - grammar_file_content = grammarBuf.str(); - } - }; - - Json::Value model_path = jsonBody->operator[]("llama_model_path"); - if (model_path.isNull()) { - LOG_ERROR << "Missing model path in request"; - } else { - if (std::filesystem::exists( - std::filesystem::path(model_path.asString()))) { - params.model = model_path.asString(); - } else { - LOG_ERROR << "Could not find model in path " << model_path.asString(); - } + if (*err_or_done) { + LOG_TRACE << "Done"; + return 0; } - params.n_gpu_layers = jsonBody->get("ngl", 100).asInt(); - params.n_ctx = jsonBody->get("ctx_len", 2048).asInt(); - params.embedding = jsonBody->get("embedding", true).asBool(); - model_type = jsonBody->get("model_type", "llm").asString(); - if (model_type == "llm") { - llama.model_type = ModelType::LLM; - } else { - llama.model_type = ModelType::EMBEDDING; - } - // Check if n_parallel exists in jsonBody, if not, set to drogon_thread - params.n_batch = jsonBody->get("n_batch", 512).asInt(); - params.n_parallel = jsonBody->get("n_parallel", 1).asInt(); - params.n_threads = - jsonBody->get("cpu_threads", std::thread::hardware_concurrency()) - .asInt(); - params.cont_batching = jsonBody->get("cont_batching", false).asBool(); - this->clean_cache_threshold = - jsonBody->get("clean_cache_threshold", 5).asInt(); - this->caching_enabled = jsonBody->get("caching_enabled", false).asBool(); - this->user_prompt = jsonBody->get("user_prompt", "USER: ").asString(); - this->ai_prompt = jsonBody->get("ai_prompt", "ASSISTANT: ").asString(); - this->system_prompt = - jsonBody->get("system_prompt", "ASSISTANT's RULE: ").asString(); - this->pre_prompt = jsonBody->get("pre_prompt", "").asString(); - this->repeat_last_n = jsonBody->get("repeat_last_n", 32).asInt(); + auto [status, res] = q->wait_and_pop(); - if (!jsonBody->operator[]("llama_log_folder").isNull()) { - log_enable(); - std::string llama_log_folder = - jsonBody->operator[]("llama_log_folder").asString(); - log_set_target(llama_log_folder + "llama.log"); - } // Set folder for llama log - } - if (params.model_alias == "unknown") { - params.model_alias = params.model; - } - - llama_backend_init(); - - // LOG_INFO_LLAMA("build info", - // {{"build", BUILD_NUMBER}, {"commit", BUILD_COMMIT}}); - LOG_INFO_LLAMA("system info", - { - {"n_threads", params.n_threads}, - {"total_threads", std::thread::hardware_concurrency()}, - {"system_info", llama_print_system_info()}, - }); - - // load the model - if (!llama.load_model(params)) { - LOG_ERROR << "Error loading the model"; - return false; // Indicate failure - } - llama.initialize(); - - if (queue != nullptr) { - delete queue; - } - - queue = new trantor::ConcurrentTaskQueue(llama.params.n_parallel, "llamaCPP"); - - llama.model_loaded_external = true; + if (status["has_error"].asBool() || status["is_done"].asBool()) { + *err_or_done = true; + } - LOG_INFO << "Started background task here!"; - backgroundThread = std::thread(&llamaCPP::BackgroundTask, this); + auto str = res["data"].asString(); + LOG_TRACE << "data: " << str; + std::size_t n = std::min(str.size(), buf_size); + 
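      // Copy at most buf_size bytes of the current chunk into the buffer supplied by Drogon
      // and report the number of bytes written via the return value.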
memcpy(buf, str.data(), n); - // For model like nomic-embed-text-v1.5.f16.gguf, etc, we don't need to warm up model. - // So we use this variable to differentiate with other models - if (llama.model_type == ModelType::LLM) { - WarmupModel(); - } - return true; -} + return n; + }; -void llamaCPP::BackgroundTask() { - while (llama.model_loaded_external) { - // model_loaded = - llama.update_slots(); - } - LOG_INFO << "Background task stopped! "; - llama.kv_cache_clear(); - LOG_INFO << "KV cache cleared!"; - return; + auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider, + "chat_completions.txt"); + cb(resp); } -void llamaCPP::StopBackgroundTask() { - if (llama.model_loaded_external) { - llama.model_loaded_external = false; - llama.condition_tasks.notify_one(); - LOG_INFO << "Stopping background task! "; - if (backgroundThread.joinable()) { - backgroundThread.join(); - } - LOG_INFO << "Background task stopped! "; - } +void llamaCPP::ProcessNonStreamRes( + std::function cb, SyncQueue& q) { + auto [status, res] = q.wait_and_pop(); + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode( + static_cast(status["status_code"].asInt())); + cb(resp); } +} // namespace inferences \ No newline at end of file diff --git a/cortex-cpp/controllers/llamaCPP.h b/cortex-cpp/controllers/llamaCPP.h index 531c18b20..62b554f6b 100644 --- a/cortex-cpp/controllers/llamaCPP.h +++ b/cortex-cpp/controllers/llamaCPP.h @@ -11,18 +11,15 @@ #define CPPHTTPLIB_NO_EXCEPTIONS 1 #endif -#include +#include #include #include -#include #include "common/base.h" -#include "context/llama_server_context.h" -#include "stb_image.h" +#include "cortex-common/EngineI.h" +#include "trantor/utils/SerialTaskQueue.h" +#include "utils/dylib.h" #include "utils/json.hpp" - -#include "models/chat_completion_request.h" - #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 #endif @@ -37,6 +34,8 @@ class llamaCPP : public drogon::HttpController, public BaseModel, public BaseChatCompletion, public BaseEmbedding { + struct SyncQueue; + public: llamaCPP(); ~llamaCPP(); @@ -59,7 +58,7 @@ class llamaCPP : public drogon::HttpController, // PATH_ADD("/llama/chat_completion", Post); METHOD_LIST_END void ChatCompletion( - inferences::ChatCompletionRequest&& completion, + const HttpRequestPtr& req, std::function&& callback) override; void Embedding( const HttpRequestPtr& req, @@ -75,36 +74,52 @@ class llamaCPP : public drogon::HttpController, std::function&& callback) override; private: - llama_server_context llama; - // std::atomic model_loaded = false; - size_t sent_count = 0; - size_t sent_token_probs_index = 0; - std::thread backgroundThread; - std::string user_prompt; - std::string ai_prompt; - std::string system_prompt; - std::string pre_prompt; - int repeat_last_n; - bool caching_enabled; - std::atomic no_of_requests = 0; - std::atomic no_of_chats = 0; - int clean_cache_threshold; - std::string grammar_file_content; - - /** - * Queue to handle the inference tasks - */ - trantor::ConcurrentTaskQueue* queue; - - bool LoadModelImpl(std::shared_ptr jsonBody); - void InferenceImpl(inferences::ChatCompletionRequest&& completion, - std::function&& callback); - void EmbeddingImpl(std::shared_ptr jsonBody, - std::function&& callback); - bool CheckModelLoaded( - const std::function& callback); - void WarmupModel(); - void BackgroundTask(); - void StopBackgroundTask(); + void ProcessStreamRes(std::function cb, + std::shared_ptr q); + void ProcessNonStreamRes(std::function cb, + SyncQueue& q); + + private: + struct SyncQueue { + 
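    // Minimal blocking queue: the engine callback pushes (status, result) pairs and the
    // HTTP handler blocks in wait_and_pop() until a pair is available.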
void push(std::pair&& p) { + std::unique_lock l(mtx); + q.push(p); + cond.notify_one(); + } + + std::pair wait_and_pop() { + std::unique_lock l(mtx); + cond.wait(l, [this] { return !q.empty(); }); + auto res = q.front(); + q.pop(); + return res; + } + + std::mutex mtx; + std::condition_variable cond; + // Status and result + std::queue> q; + }; + struct StreamStatus { + void Done() { + std::unique_lock l(m); + stream_done = true; + cv.notify_all(); + } + + void Wait() { + std::unique_lock l(m); + cv.wait(l, [this] { return stream_done; }); + } + + private: + std::mutex m; + std::condition_variable cv; + bool stream_done = false; + }; + + private: + std::unique_ptr dylib_; + EngineI* engine_; }; -}; // namespace inferences +}; // namespace inferences \ No newline at end of file diff --git a/cortex-cpp/cortex-common/EngineI.h b/cortex-cpp/cortex-common/EngineI.h new file mode 100644 index 000000000..b8770b230 --- /dev/null +++ b/cortex-cpp/cortex-common/EngineI.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +#include "json/value.h" + +class EngineI { + public: + virtual ~EngineI() {} + + virtual void HandleChatCompletion( + std::shared_ptr jsonBody, + std::function&& callback) = 0; + virtual void HandleEmbedding( + std::shared_ptr jsonBody, + std::function&& callback) = 0; + virtual void LoadModel( + std::shared_ptr jsonBody, + std::function&& callback) = 0; + virtual void UnloadModel( + std::shared_ptr jsonBody, + std::function&& callback) = 0; + virtual void GetModelStatus( + std::shared_ptr jsonBody, + std::function&& callback) = 0; +}; diff --git a/cortex-cpp/engines/cortex.llamacpp/engine.cmake b/cortex-cpp/engines/cortex.llamacpp/engine.cmake new file mode 100644 index 000000000..5b9070510 --- /dev/null +++ b/cortex-cpp/engines/cortex.llamacpp/engine.cmake @@ -0,0 +1,21 @@ +set(LIBLLAMA_ENGINE_VERSION v1.0) + +if(UNIX) + set(LIBRARY_NAME llama-engine-lib.so) +else() + set(LIBRARY_NAME engine.dll) +endif() + +set(LIBLLAMA_ENGINE_URL https://github.com/.../releases/download/${LIBLLAMA_ENGINE_VERSION}/${LIBRARY_NAME}) +set(LIBLLAMA_ENGINE_PATH ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME}) + +MESSAGE("CMAKE_BINARY_DIR = " ${CMAKE_BINARY_DIR}) + +file(DOWNLOAD ${LIBLLAMA_ENGINE_URL} ${LIBLLAMA_ENGINE_PATH} STATUS LIBLLAMA_ENGINE_DOWNLOAD_STATUS) +list(GET LIBLLAMA_ENGINE_DOWNLOAD_STATUS 0 LIBLLAMA_ENGINE_DOWNLOAD_STATUS_NO) + +if(LIBLLAMA_ENGINE_DOWNLOAD_STATUS_NO) + message(STATUS "Pre-built library not downloaded. 
(${LIBLLAMA_ENGINE_DOWNLOAD_STATUS})") +else() + message(STATUS "Linking downloaded pre-built library.") +endif() \ No newline at end of file diff --git a/cortex-cpp/llama.cpp b/cortex-cpp/llama.cpp deleted file mode 160000 index 8a56075b0..000000000 --- a/cortex-cpp/llama.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 8a56075b07a8b571bf95a912ffdce4c928c2b414 diff --git a/cortex-cpp/models/chat_completion_request.h b/cortex-cpp/models/chat_completion_request.h deleted file mode 100644 index f4fd087f5..000000000 --- a/cortex-cpp/models/chat_completion_request.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once -#include - -namespace inferences { -struct ChatCompletionRequest { - bool stream = false; - int max_tokens = 500; - float top_p = 0.95f; - float temperature = 0.8f; - float frequency_penalty = 0; - float presence_penalty = 0; - Json::Value stop = Json::Value(Json::arrayValue); - Json::Value messages = Json::Value(Json::arrayValue); -}; -} // namespace inferences - -namespace drogon { -template <> -inline inferences::ChatCompletionRequest fromRequest(const HttpRequest& req) { - auto jsonBody = req.getJsonObject(); - inferences::ChatCompletionRequest completion; - if (jsonBody) { - completion.stream = (*jsonBody).get("stream", false).asBool(); - completion.max_tokens = (*jsonBody).get("max_tokens", 500).asInt(); - completion.top_p = (*jsonBody).get("top_p", 0.95).asFloat(); - completion.temperature = (*jsonBody).get("temperature", 0.8).asFloat(); - completion.frequency_penalty = - (*jsonBody).get("frequency_penalty", 0).asFloat(); - completion.presence_penalty = - (*jsonBody).get("presence_penalty", 0).asFloat(); - completion.messages = (*jsonBody)["messages"]; - completion.stop = (*jsonBody)["stop"]; - } - return completion; -} -} // namespace inferences diff --git a/cortex-cpp/nitro_deps/CMakeLists.txt b/cortex-cpp/nitro_deps/CMakeLists.txt index 29b9b0186..667024cd7 100644 --- a/cortex-cpp/nitro_deps/CMakeLists.txt +++ b/cortex-cpp/nitro_deps/CMakeLists.txt @@ -5,6 +5,9 @@ project(MyProject) include(ExternalProject) # Define variables +if(UNIX AND NOT APPLE) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +endif() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(THIRD_PARTY_INSTALL_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../build_deps/_install) #if(NOT THIRD_PARTY_INSTALL_PATH ) @@ -34,8 +37,7 @@ ExternalProject_Add( brotli GIT_REPOSITORY https://github.com/google/brotli GIT_TAG v1.1.0 - CMAKE_ARGS - -DCMAKE_BUILD_TYPE=Release + CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF -DSHARE_INSTALL_PREFIX=${CMAKE_CURRENT_SOURCE_DIR}/share -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} @@ -47,6 +49,7 @@ ExternalProject_Add( GIT_TAG 1.9.5 CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} ) @@ -65,7 +68,7 @@ ExternalProject_Add( GIT_REPOSITORY https://github.com/drogonframework/drogon GIT_TAG v1.9.2 CMAKE_ARGS - -DCMAKE_BUILD_TYPE=release + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DOPENSSL_USE_STATIC_LIBS=TRUE -DZLIB_USE_STATIC_LIBS=${ZLIB_USE_STATIC_LIBS} -DBUILD_ORM=OFF @@ -86,7 +89,6 @@ ExternalProject_Add( GIT_TAG v1.14.0 CMAKE_ARGS -Dgtest_force_shared_crt=ON - -DCMAKE_BUILD_TYPE=release -DCMAKE_PREFIX_PATH=${THIRD_PARTY_INSTALL_PATH} -DCMAKE_INSTALL_PREFIX=${THIRD_PARTY_INSTALL_PATH} ) @@ -111,4 +113,3 @@ if(ZLIB_USE_STATIC_LIBS) endif() # target_link_libraries( ...) 
- diff --git a/cortex-cpp/test/components/test_models.cc b/cortex-cpp/test/components/test_models.cc deleted file mode 100644 index ac27e97dd..000000000 --- a/cortex-cpp/test/components/test_models.cc +++ /dev/null @@ -1,53 +0,0 @@ -#include "gtest/gtest.h" -#include "models/chat_completion_request.h" - -using inferences::ChatCompletionRequest; - -class ModelTest : public ::testing::Test { -}; - - -TEST_F(ModelTest, should_parse_request) { - { - Json::Value data; - auto req = drogon::HttpRequest::newHttpJsonRequest(data); - - auto res = - drogon::fromRequest(*req.get()); - - EXPECT_EQ(res.stream, false); - EXPECT_EQ(res.max_tokens, 500); - EXPECT_EQ(res.top_p, 0.95f); - EXPECT_EQ(res.temperature, 0.8f); - EXPECT_EQ(res.frequency_penalty, 0); - EXPECT_EQ(res.presence_penalty, 0); - EXPECT_EQ(res.stop, Json::Value{}); - EXPECT_EQ(res.messages, Json::Value{}); - } - - { - Json::Value data; - data["stream"] = true; - data["max_tokens"] = 400; - data["top_p"] = 0.8; - data["temperature"] = 0.7; - data["frequency_penalty"] = 0.1; - data["presence_penalty"] = 0.2; - data["messages"] = "message"; - data["stop"] = "stop"; - - auto req = drogon::HttpRequest::newHttpJsonRequest(data); - - auto res = - drogon::fromRequest(*req.get()); - - EXPECT_EQ(res.stream, true); - EXPECT_EQ(res.max_tokens, 400); - EXPECT_EQ(res.top_p, 0.8f); - EXPECT_EQ(res.temperature, 0.7f); - EXPECT_EQ(res.frequency_penalty, 0.1f); - EXPECT_EQ(res.presence_penalty, 0.2f); - EXPECT_EQ(res.stop, Json::Value{"stop"}); - EXPECT_EQ(res.messages, Json::Value{"message"}); - } -} diff --git a/cortex-cpp/utils/dylib.h b/cortex-cpp/utils/dylib.h new file mode 100644 index 000000000..806e2ee67 --- /dev/null +++ b/cortex-cpp/utils/dylib.h @@ -0,0 +1,316 @@ +/** + * @file dylib.hpp + * @version 2.2.1 + * @brief C++ cross-platform wrapper around dynamic loading of shared libraries + * @link https://github.com/martin-olivier/dylib + * + * @author Martin Olivier + * @copyright (c) 2023 Martin Olivier + * + * This library is released under MIT license + */ + +#pragma once + +#include +#include +#include + +#if ((defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) || __cplusplus >= 201703L) +#define DYLIB_CPP17 +#include +#endif + +#if (defined(_WIN32) || defined(_WIN64)) +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#define DYLIB_UNDEFINE_LEAN_AND_MEAN +#endif +#ifndef NOMINMAX +#define NOMINMAX +#define DYLIB_UNDEFINE_NOMINMAX +#endif +#include +#ifdef DYLIB_UNDEFINE_LEAN_AND_MEAN +#undef WIN32_LEAN_AND_MEAN +#undef DYLIB_UNDEFINE_LEAN_AND_MEAN +#endif +#ifdef DYLIB_UNDEFINE_NOMINMAX +#undef NOMINMAX +#undef DYLIB_UNDEFINE_NOMINMAX +#endif +#else +#include +#endif + +#if (defined(_WIN32) || defined(_WIN64)) +#define DYLIB_WIN_MAC_OTHER(win_def, mac_def, other_def) win_def +#define DYLIB_WIN_OTHER(win_def, other_def) win_def +#elif defined(__APPLE__) +#define DYLIB_WIN_MAC_OTHER(win_def, mac_def, other_def) mac_def +#define DYLIB_WIN_OTHER(win_def, other_def) other_def +#else +#define DYLIB_WIN_MAC_OTHER(win_def, mac_def, other_def) other_def +#define DYLIB_WIN_OTHER(win_def, other_def) other_def +#endif + +/** + * The `dylib` class represents a single dynamic library instance, + * allowing the access of symbols like functions or global variables + */ +class dylib { +public: + struct filename_components { + static constexpr const char *prefix = DYLIB_WIN_OTHER("", "lib"); + static constexpr const char *suffix = DYLIB_WIN_MAC_OTHER(".dll", ".dylib", ".so"); + }; + using native_handle_type = DYLIB_WIN_OTHER(HINSTANCE, void *); + using 
native_symbol_type = DYLIB_WIN_OTHER(FARPROC, void *); + + static_assert(std::is_pointer::value, "Expecting HINSTANCE to be a pointer"); + static_assert(std::is_pointer::value, "Expecting FARPROC to be a pointer"); + + static constexpr bool add_filename_decorations = true; + static constexpr bool no_filename_decorations = false; + + /** + * This exception is raised when a library fails to load or a symbol fails to resolve + */ + class exception : public std::runtime_error { + using std::runtime_error::runtime_error; + }; + + /** + * This exception is raised when a library fails to load + */ + class load_error : public exception { + using exception::exception; + }; + + /** + * This exception is raised when a symbol fails to resolve + */ + class symbol_error : public exception { + using exception::exception; + }; + + dylib(const dylib&) = delete; + dylib& operator=(const dylib&) = delete; + + dylib(dylib &&other) noexcept : m_handle(other.m_handle) { + other.m_handle = nullptr; + } + + dylib& operator=(dylib &&other) noexcept { + if (this != &other) + std::swap(m_handle, other.m_handle); + return *this; + } + + /** + * Loads a dynamic library + * + * @throws `dylib::load_error` if the library could not be opened (including + * the case of the library file not being found) + * @throws `std::invalid_argument` if the arguments are null + * + * @param dir_path the directory path where the dynamic library is located + * @param name the name of the dynamic library to load + * @param decorations adds OS-specific decorations to the library name + */ + ///@{ + dylib(const char *dir_path, const char *lib_name, bool decorations = add_filename_decorations) { + if (!dir_path) + throw std::invalid_argument("The directory path is null"); + if (!lib_name) + throw std::invalid_argument("The library name is null"); + + std::string final_name = lib_name; + std::string final_path = dir_path; + + if (decorations) + final_name = filename_components::prefix + final_name + filename_components::suffix; + + if (!final_path.empty() && final_path.find_last_of('/') != final_path.size() - 1) + final_path += '/'; + + m_handle = open((final_path + final_name).c_str()); + + if (!m_handle) + throw load_error("Could not load library \"" + final_path + final_name + "\"\n" + get_error_description()); + } + + dylib(const std::string &dir_path, const std::string &lib_name, bool decorations = add_filename_decorations) + : dylib(dir_path.c_str(), lib_name.c_str(), decorations) {} + + dylib(const std::string &dir_path, const char *lib_name, bool decorations = add_filename_decorations) + : dylib(dir_path.c_str(), lib_name, decorations) {} + + dylib(const char *dir_path, const std::string &lib_name, bool decorations = add_filename_decorations) + : dylib(dir_path, lib_name.c_str(), decorations) {} + + explicit dylib(const std::string &lib_name, bool decorations = add_filename_decorations) + : dylib("", lib_name.c_str(), decorations) {} + + explicit dylib(const char *lib_name, bool decorations = add_filename_decorations) + : dylib("", lib_name, decorations) {} + +#ifdef DYLIB_CPP17 + explicit dylib(const std::filesystem::path &lib_path) + : dylib("", lib_path.string().c_str(), no_filename_decorations) {} + + dylib(const std::filesystem::path &dir_path, const std::string &lib_name, bool decorations = add_filename_decorations) + : dylib(dir_path.string().c_str(), lib_name.c_str(), decorations) {} + + dylib(const std::filesystem::path &dir_path, const char *lib_name, bool decorations = add_filename_decorations) + : 
dylib(dir_path.string().c_str(), lib_name, decorations) {} +#endif + ///@} + + ~dylib() { + if (m_handle) + close(m_handle); + } + + /** + * Get a symbol from the currently loaded dynamic library + * + * @throws `dylib::symbol_error` if the symbol could not be found + * @throws `std::invalid_argument` if the argument or library handle is null + * + * @param symbol_name the symbol name to lookup + * + * @return a pointer to the requested symbol + */ + native_symbol_type get_symbol(const char *symbol_name) const { + if (!symbol_name) + throw std::invalid_argument("The symbol name to lookup is null"); + if (!m_handle) + throw std::logic_error("The dynamic library handle is null. This object may have been moved from."); + + auto symbol = locate_symbol(m_handle, symbol_name); + + if (symbol == nullptr) + throw symbol_error("Could not get symbol \"" + std::string(symbol_name) + "\"\n" + get_error_description()); + return symbol; + } + + native_symbol_type get_symbol(const std::string &symbol_name) const { + return get_symbol(symbol_name.c_str()); + } + + /** + * Get a function from the currently loaded dynamic library + * + * @throws `dylib::symbol_error` if the function could not be found + * @throws `std::invalid_argument` if the argument is null + * + * @tparam T the function type, e.g., `double(int, int)` + * @param symbol_name the function name to lookup + * + * @return a pointer to the requested function + */ + template + T *get_function(const char *symbol_name) const { +#if (defined(__GNUC__) && __GNUC__ >= 8) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wcast-function-type" +#endif + return reinterpret_cast(get_symbol(symbol_name)); +#if (defined(__GNUC__) && __GNUC__ >= 8) +#pragma GCC diagnostic pop +#endif + } + + template + T *get_function(const std::string &symbol_name) const { + return get_function(symbol_name.c_str()); + } + + /** + * Get a variable from the currently loaded dynamic library + * + * @throws `dylib::symbol_error` if the variable could not be found + * @throws `std::invalid_argument` if the argument is null + * + * @tparam T the variable type + * @param symbol_name the variable name to lookup + * + * @return a reference to the requested variable + */ + template + T &get_variable(const char *symbol_name) const { + return *reinterpret_cast(get_symbol(symbol_name)); + } + + template + T &get_variable(const std::string &symbol_name) const { + return get_variable(symbol_name.c_str()); + } + + /** + * Check if a symbol exists in the currently loaded dynamic library. 
+ * This method will return false if no dynamic library is currently loaded + * or if the symbol name is nullptr + * + * @param symbol_name the symbol name to look for + * + * @return true if the symbol exists in the dynamic library, false otherwise + */ + bool has_symbol(const char *symbol_name) const noexcept { + if (!m_handle || !symbol_name) + return false; + return locate_symbol(m_handle, symbol_name) != nullptr; + } + + bool has_symbol(const std::string &symbol) const noexcept { + return has_symbol(symbol.c_str()); + } + + /** + * @return the dynamic library handle + */ + native_handle_type native_handle() noexcept { + return m_handle; + } + +protected: + native_handle_type m_handle{nullptr}; + + static native_handle_type open(const char *path) noexcept { +#if (defined(_WIN32) || defined(_WIN64)) + return LoadLibraryA(path); +#else + return dlopen(path, RTLD_NOW | RTLD_LOCAL); +#endif + } + + static native_symbol_type locate_symbol(native_handle_type lib, const char *name) noexcept { + return DYLIB_WIN_OTHER(GetProcAddress, dlsym)(lib, name); + } + + static void close(native_handle_type lib) noexcept { + DYLIB_WIN_OTHER(FreeLibrary, dlclose)(lib); + } + + static std::string get_error_description() noexcept { +#if (defined(_WIN32) || defined(_WIN64)) + constexpr const size_t BUF_SIZE = 512; + const auto error_code = GetLastError(); + if (!error_code) + return "No error reported by GetLastError"; + char description[BUF_SIZE]; + const auto lang = MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US); + const DWORD length = + FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM, nullptr, error_code, lang, description, BUF_SIZE, nullptr); + return (length == 0) ? "Unknown error (FormatMessage failed)" : description; +#else + const auto description = dlerror(); + return (description == nullptr) ? "No error reported by dlerror" : description; +#endif + } +}; + +#undef DYLIB_WIN_MAC_OTHER +#undef DYLIB_WIN_OTHER +#undef DYLIB_CPP17 \ No newline at end of file From f1a7105325b2174fee8b033b49f35bcf1da93019 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 3 May 2024 08:29:13 +0700 Subject: [PATCH 03/10] feat: llama engine --- .github/workflows/build.yml | 60 ++++++++++++------- cortex-cpp/CMakeLists.txt | 2 +- .../engines/cortex.llamacpp/engine.cmake | 51 ++++++++++++++-- 3 files changed, 88 insertions(+), 25 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 90e509d55..318b984a1 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -171,7 +171,9 @@ jobs: shell: bash run: | mkdir -p nitro + mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ + cp build/engines/cortex.llamacpp/libengine.so nitro/engines/cortex.llamacpp/ tar -czvf nitro.tar.gz nitro - name: Upload Artifact @@ -218,7 +220,11 @@ jobs: contents: write strategy: matrix: - cuda: ["12-0", "11-7"] + include: + - cuda: "12-0" + flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF" + - cuda: "11-7" + flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON" steps: - name: Clone @@ -232,7 +238,7 @@ jobs: run: | ./install_deps.sh mkdir build && cd build - cmake -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. + cmake ${{matrix.flags}} -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. 
make -j $(nproc) ls -la @@ -240,7 +246,9 @@ jobs: shell: bash run: | mkdir -p nitro + mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ + cp build/engines/cortex.llamacpp/libengine.so nitro/engines/cortex.llamacpp/ tar -czvf nitro.tar.gz nitro - name: Upload Artifact @@ -295,7 +303,9 @@ jobs: run: | mkdir -p nitro cp llama.cpp/ggml-metal.metal nitro/ + mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ + cp build/engines/cortex.llamacpp/libengine.dylib nitro/engines/cortex.llamacpp/ - name: Upload Artifact uses: actions/upload-artifact@v2 @@ -361,7 +371,9 @@ jobs: shell: bash run: | mkdir -p nitro + mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ + cp build/engines/cortex.llamacpp/libengine.dylib nitro/engines/cortex.llamacpp/ - name: Upload Artifact uses: actions/upload-artifact@v2 @@ -408,8 +420,10 @@ jobs: - name: bundle universal binary run: | mkdir -p nitro + mkdir -p nitro/engines/cortex.llamacpp/ ls ./nitro-mac-amd64 lipo -create ./nitro-mac-amd64/nitro ./nitro-mac-arm64/nitro -output ./nitro/nitro + lipo -create ./nitro-mac-amd64/engines/cortex.llamacpp/libengine.dylib ./nitro-mac-arm64/engines/cortex.llamacpp/libengine.dylib -output ./nitro/engines/cortex.llamacpp/libengine.dylib cp ./nitro-mac-arm64/ggml-metal.metal ./nitro/ggml-metal.metal tar -czvf nitro.tar.gz nitro @@ -503,6 +517,8 @@ jobs: id: pack_artifacts shell: cmd run: | + mkdir .\build\Release\engines\cortex.llamacpp\ + robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll robocopy build\bin\Release\ .\build\Release\ llama.dll robocopy build\bin\Release\ .\build\Release\ whisper.dll @@ -558,8 +574,25 @@ jobs: strategy: matrix: - cuda: ["12-0", "11-7"] - instructions: ["amd64-avx2", "amd64-avx", "amd64-avx512"] + include: + - cuda: "12-0" + instructions: "-DLLAMA_NATIVE=OFF" + flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + - cuda: "12-0" + instructions: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" + flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + - cuda: "12-0" + instructions: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" + flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + - cuda: "11-7" + instructions: "-DLLAMA_NATIVE=OFF" + flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + - cuda: "11-7" + instructions: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" + flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + - cuda: "11-7" + instructions: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" + flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" steps: - name: Setup VSWhere.exe @@ -600,21 +633,6 @@ jobs: with: dotnet-version: "6.0.x" - # Conditional instruction check and set environment variable - - name: Set INSTRUCTION Based on Instructions ${{ matrix.instructions }} - shell: cmd - run: | - IF "${{ matrix.instructions }}" == "amd64-avx2" ( - echo "INSTRUCTION=-DLLAMA_NATIVE=OFF" >> $env:GITHUB_ENV - echo 
"INSTRUCTION=-DLLAMA_NATIVE=OFF" - ) ELSE IF "${{ matrix.instructions }}" == "amd64-avx" ( - echo "INSTRUCTION=-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" >> $env:GITHUB_ENV - echo "INSTRUCTION=-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - ) ELSE IF "${{ matrix.instructions }}" == "amd64-avx512" ( - echo "INSTRUCTION=-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" >> $env:GITHUB_ENV - echo "INSTRUCTION=-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - ) - - name: Build id: cmake_build shell: cmd @@ -623,7 +641,7 @@ jobs: cmake --build ./build_deps/nitro_deps --config Release mkdir -p build cd build - cmake .. %INSTRUCTION% -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} + cmake .. ${{ matrix.instructions }} ${{ matrix.flags }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} cmake --build . --config Release -j "%NUMBER_OF_PROCESSORS%" - name: Pack artifacts @@ -631,6 +649,8 @@ jobs: shell: cmd run: | set PATH=%PATH%;C:\Program Files\7-Zip\ + mkdir .\build\Release\engines\cortex.llamacpp\ + robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll robocopy build\bin\Release\ .\build\Release\ llama.dll robocopy build\bin\Release\ .\build\Release\ whisper.dll diff --git a/cortex-cpp/CMakeLists.txt b/cortex-cpp/CMakeLists.txt index 9be937dc4..4bb269aaa 100644 --- a/cortex-cpp/CMakeLists.txt +++ b/cortex-cpp/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.5) project(nitro C CXX) -# include(engines/cortex.llamacpp/engine.cmake) +include(engines/cortex.llamacpp/engine.cmake) include(CheckIncludeFileCXX) check_include_file_cxx(any HAS_ANY) diff --git a/cortex-cpp/engines/cortex.llamacpp/engine.cmake b/cortex-cpp/engines/cortex.llamacpp/engine.cmake index 5b9070510..4b879a1ca 100644 --- a/cortex-cpp/engines/cortex.llamacpp/engine.cmake +++ b/cortex-cpp/engines/cortex.llamacpp/engine.cmake @@ -1,21 +1,64 @@ set(LIBLLAMA_ENGINE_VERSION v1.0) -if(UNIX) - set(LIBRARY_NAME llama-engine-lib.so) +# Download library based on instruction +# TODO(sang) correct library name when cortex.llamacpp CI is done +if(UNIX AND NOT APPLE) + if(CUDA_12_0) + set(LIBRARY_NAME linux-amd64-cuda-12-0.tar.gz) + elseif(CUDA_11_7) + set(LIBRARY_NAME linux-amd64-cuda-11-7.tar.gz) + elseif(LLAMA_VULKAN) + set(LIBRARY_NAME linux-amd64-vulkan.tar.gz) + elseif(DLLAMA_AVX512) + set(LIBRARY_NAME linux-amd64-avx512.tar.gz) + elseif(DLLAMA_AVX2) + set(LIBRARY_NAME linux-amd64-avx2.tar.gz) + else() + set(LIBRARY_NAME linux-amd64-avx.tar.gz) + endif() +elseif(UNIX) + set(LIBRARY_NAME mac-universal.tar.gz) else() - set(LIBRARY_NAME engine.dll) + if(CUDA_12_0) + if(DLLAMA_AVX512) + set(LIBRARY_NAME win-amd64-avx512-cuda-12-0.tar.gz) + elseif(DLLAMA_AVX2) + set(LIBRARY_NAME win-amd64-avx2-cuda-12-0.tar.gz) + else() + set(LIBRARY_NAME win-amd64-avx-cuda-12-0.tar.gz) + endif() + elseif(CUDA_11_7) + if(DLLAMA_AVX512) + set(LIBRARY_NAME win-amd64-avx512-cuda-11-7.tar.gz) + elseif(DLLAMA_AVX2) + set(LIBRARY_NAME win-amd64-avx2-cuda-11-7.tar.gz) + else() + set(LIBRARY_NAME win-amd64-avx-cuda-11-7.tar.gz) + endif() + elseif(LLAMA_VULKAN) + set(LIBRARY_NAME win-amd64-vulkan.tar.gz) + elseif(DLLAMA_AVX512) + set(LIBRARY_NAME win-amd64-avx512.tar.gz) + elseif(DLLAMA_AVX2) + set(LIBRARY_NAME win-amd64-avx2.tar.gz) + else() + set(LIBRARY_NAME win-amd64-avx.tar.gz) + endif() endif() set(LIBLLAMA_ENGINE_URL 
https://github.com/.../releases/download/${LIBLLAMA_ENGINE_VERSION}/${LIBRARY_NAME}) + set(LIBLLAMA_ENGINE_PATH ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME}) -MESSAGE("CMAKE_BINARY_DIR = " ${CMAKE_BINARY_DIR}) +# MESSAGE("CMAKE_BINARY_DIR = " ${CMAKE_BINARY_DIR}) file(DOWNLOAD ${LIBLLAMA_ENGINE_URL} ${LIBLLAMA_ENGINE_PATH} STATUS LIBLLAMA_ENGINE_DOWNLOAD_STATUS) list(GET LIBLLAMA_ENGINE_DOWNLOAD_STATUS 0 LIBLLAMA_ENGINE_DOWNLOAD_STATUS_NO) +# MESSAGE("file = " ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME}) if(LIBLLAMA_ENGINE_DOWNLOAD_STATUS_NO) message(STATUS "Pre-built library not downloaded. (${LIBLLAMA_ENGINE_DOWNLOAD_STATUS})") else() message(STATUS "Linking downloaded pre-built library.") + file(ARCHIVE_EXTRACT INPUT ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME} DESTINATION ${CMAKE_BINARY_DIR}/engines/) endif() \ No newline at end of file From 775a62488537b2f88fcabd3138db5b3ea24f0c98 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 3 May 2024 09:39:05 +0700 Subject: [PATCH 04/10] fix: only load engine when load model --- cortex-cpp/controllers/llamaCPP.cc | 105 +++++++++++++++++++++++++---- cortex-cpp/controllers/llamaCPP.h | 2 + 2 files changed, 95 insertions(+), 12 deletions(-) diff --git a/cortex-cpp/controllers/llamaCPP.cc b/cortex-cpp/controllers/llamaCPP.cc index 3beaae0e9..d3e489072 100644 --- a/cortex-cpp/controllers/llamaCPP.cc +++ b/cortex-cpp/controllers/llamaCPP.cc @@ -11,21 +11,35 @@ using namespace inferences; using json = nlohmann::json; namespace inferences { -llamaCPP::llamaCPP() { - dylib_ = std::make_unique("./engines/cortex.llamacpp", "engine"); - auto func = dylib_->get_function("get_engine"); - engine_ = func(); - // Some default values for now below - // log_disable(); // Disable the log to file feature, reduce bloat for - // target - // system () -}; +namespace { +constexpr static auto kLlamaEngine = "cortex.llamacpp"; +constexpr static auto kLlamaLibPath = "./engines/cortex.llamacpp"; +} // namespace + +llamaCPP::llamaCPP() + : engine_{nullptr} { + + // Some default values for now below + // log_disable(); // Disable the log to file feature, reduce bloat for + // target + // system () + }; llamaCPP::~llamaCPP() {} void llamaCPP::ChatCompletion( const HttpRequestPtr& req, std::function&& callback) { + if (!IsEngineLoaded()) { + Json::Value res; + res["message"] = "Engine is not loaded yet"; + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(k409Conflict); + callback(resp); + LOG_WARN << "Engine is not loaded yet"; + return; + } + LOG_TRACE << "Start chat completion"; auto json_body = req->getJsonObject(); bool is_stream = (*json_body).get("stream", false).asBool(); @@ -46,6 +60,16 @@ void llamaCPP::ChatCompletion( void llamaCPP::Embedding( const HttpRequestPtr& req, std::function&& callback) { + if (!IsEngineLoaded()) { + Json::Value res; + res["message"] = "Engine is not loaded yet"; + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(k409Conflict); + callback(resp); + LOG_WARN << "Engine is not loaded yet"; + return; + } + LOG_TRACE << "Start embedding"; SyncQueue q; engine_->HandleEmbedding(req->getJsonObject(), @@ -60,6 +84,15 @@ void llamaCPP::Embedding( void llamaCPP::UnloadModel( const HttpRequestPtr& req, std::function&& callback) { + if (!IsEngineLoaded()) { + Json::Value res; + res["message"] = "Engine is not loaded yet"; + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(k409Conflict); + callback(resp); + LOG_WARN << "Engine is not loaded yet"; + return; + } LOG_TRACE << 
"Start unload model"; engine_->UnloadModel( req->getJsonObject(), @@ -75,6 +108,16 @@ void llamaCPP::UnloadModel( void llamaCPP::ModelStatus( const HttpRequestPtr& req, std::function&& callback) { + if (!IsEngineLoaded()) { + Json::Value res; + res["message"] = "Engine is not loaded yet"; + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(k409Conflict); + callback(resp); + LOG_WARN << "Engine is not loaded yet"; + return; + } + LOG_TRACE << "Start to get model status"; engine_->GetModelStatus( req->getJsonObject(), @@ -90,6 +133,40 @@ void llamaCPP::ModelStatus( void llamaCPP::LoadModel( const HttpRequestPtr& req, std::function&& callback) { + auto engine_type = + (*(req->getJsonObject())).get("engine", kLlamaEngine).asString(); + if (!dylib_ || engine_type != cur_engine_name_) { + cur_engine_name_ = engine_type; + // TODO: change this when we get more engines + auto get_engine_path = [](std::string_view e) { + if (e == kLlamaEngine) { + return kLlamaLibPath; + } + return kLlamaLibPath; + }; + + try { + dylib_ = + std::make_unique(get_engine_path(cur_engine_name_), "engine"); + } catch (const dylib::load_error& e) { + LOG_ERROR << "Could not load engine: " << e.what(); + dylib_.reset(); + engine_ = nullptr; + } + + if (!dylib_) { + Json::Value res; + res["message"] = "Could not load engine " + cur_engine_name_; + auto resp = nitro_utils::nitroHttpJsonResponse(res); + resp->setStatusCode(k500InternalServerError); + callback(resp); + return; + } + auto func = dylib_->get_function("get_engine"); + engine_ = func(); + LOG_INFO << "Loaded engine: " << cur_engine_name_; + } + LOG_TRACE << "Load model"; engine_->LoadModel( req->getJsonObject(), @@ -105,9 +182,8 @@ void llamaCPP::LoadModel( void llamaCPP::ProcessStreamRes(std::function cb, std::shared_ptr q) { auto err_or_done = std::make_shared(false); - auto chunked_content_provider = [q, err_or_done]( - char* buf, - std::size_t buf_size) -> std::size_t { + auto chunked_content_provider = + [q, err_or_done](char* buf, std::size_t buf_size) -> std::size_t { if (buf == nullptr) { LOG_TRACE << "Buf is null"; return 0; @@ -145,4 +221,9 @@ void llamaCPP::ProcessNonStreamRes( static_cast(status["status_code"].asInt())); cb(resp); } + +bool llamaCPP::IsEngineLoaded() { + return !!engine_; +} + } // namespace inferences \ No newline at end of file diff --git a/cortex-cpp/controllers/llamaCPP.h b/cortex-cpp/controllers/llamaCPP.h index 62b554f6b..691b20cb4 100644 --- a/cortex-cpp/controllers/llamaCPP.h +++ b/cortex-cpp/controllers/llamaCPP.h @@ -78,6 +78,7 @@ class llamaCPP : public drogon::HttpController, std::shared_ptr q); void ProcessNonStreamRes(std::function cb, SyncQueue& q); + bool IsEngineLoaded(); private: struct SyncQueue { @@ -121,5 +122,6 @@ class llamaCPP : public drogon::HttpController, private: std::unique_ptr dylib_; EngineI* engine_; + std::string cur_engine_name_; }; }; // namespace inferences \ No newline at end of file From 9adc9d94b7938a4b342ed7a50c13f83d9bf42110 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 3 May 2024 13:16:28 +0700 Subject: [PATCH 05/10] chore: bump whisper.cpp --- cortex-cpp/whisper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex-cpp/whisper.cpp b/cortex-cpp/whisper.cpp index 9fab28135..58210d6a7 160000 --- a/cortex-cpp/whisper.cpp +++ b/cortex-cpp/whisper.cpp @@ -1 +1 @@ -Subproject commit 9fab28135c7867bb7eccd9ebcd2ea8d52e42ca81 +Subproject commit 58210d6a7634ea1e42e0a2dab611f4a0518731dc From ad799c35707d8be7082245ea11cdeef3c78e584e Mon Sep 17 
00:00:00 2001 From: vansangpfiev Date: Sat, 4 May 2024 08:56:32 +0700 Subject: [PATCH 06/10] fix: correct library name --- .github/workflows/build.yml | 6 +- .../engines/cortex.llamacpp/engine.cmake | 68 +++++++++++-------- 2 files changed, 42 insertions(+), 32 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 318b984a1..2871f9a4b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -294,7 +294,7 @@ jobs: run: | ./install_deps.sh mkdir build && cd build - cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. + cmake -DMAC_ARM64=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. CC=gcc-8 make -j $(sysctl -n hw.ncpu) ls -la @@ -363,7 +363,7 @@ jobs: run: | ./install_deps.sh mkdir build && cd build - cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. + cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. CC=gcc-8 make -j $(sysctl -n hw.ncp) ls -la @@ -423,7 +423,7 @@ jobs: mkdir -p nitro/engines/cortex.llamacpp/ ls ./nitro-mac-amd64 lipo -create ./nitro-mac-amd64/nitro ./nitro-mac-arm64/nitro -output ./nitro/nitro - lipo -create ./nitro-mac-amd64/engines/cortex.llamacpp/libengine.dylib ./nitro-mac-arm64/engines/cortex.llamacpp/libengine.dylib -output ./nitro/engines/cortex.llamacpp/libengine.dylib + lipo -create ./nitro-mac-amd64/engines/cortex.llamacpp/libengine.dylib ./nitro-mac-arm64/engines/cortex.llamacpp/libengine.dylib ./nitro/engines/cortex.llamacpp/libengine.dylib cp ./nitro-mac-arm64/ggml-metal.metal ./nitro/ggml-metal.metal tar -czvf nitro.tar.gz nitro diff --git a/cortex-cpp/engines/cortex.llamacpp/engine.cmake b/cortex-cpp/engines/cortex.llamacpp/engine.cmake index 4b879a1ca..be7df6b12 100644 --- a/cortex-cpp/engines/cortex.llamacpp/engine.cmake +++ b/cortex-cpp/engines/cortex.llamacpp/engine.cmake @@ -1,52 +1,62 @@ -set(LIBLLAMA_ENGINE_VERSION v1.0) +# cortex.llamacpp release version +set(VERSION 0.1.0) +set(ENGINE_VERSION v${VERSION}) -# Download library based on instruction -# TODO(sang) correct library name when cortex.llamacpp CI is done +# MESSAGE("ENGINE_VERSION=" ${ENGINE_VERSION}) + +# Download library based on instructions if(UNIX AND NOT APPLE) if(CUDA_12_0) - set(LIBRARY_NAME linux-amd64-cuda-12-0.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-linux-amd64-cuda-12-0.tar.gz) elseif(CUDA_11_7) - set(LIBRARY_NAME linux-amd64-cuda-11-7.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-linux-amd64-cuda-11-7.tar.gz) elseif(LLAMA_VULKAN) - set(LIBRARY_NAME linux-amd64-vulkan.tar.gz) - elseif(DLLAMA_AVX512) - set(LIBRARY_NAME linux-amd64-avx512.tar.gz) - elseif(DLLAMA_AVX2) - set(LIBRARY_NAME linux-amd64-avx2.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-linux-amd64-vulkan.tar.gz) + elseif(LLAMA_AVX512) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-linux-amd64-avx512.tar.gz) + elseif(NOT LLAMA_AVX2) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-linux-amd64-avx.tar.gz) else() - set(LIBRARY_NAME linux-amd64-avx.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-linux-amd64-avx2.tar.gz) endif() elseif(UNIX) - set(LIBRARY_NAME mac-universal.tar.gz) + if(MAC_ARM64) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-mac-arm64.tar.gz) + else() + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-mac-amd64.tar.gz) + endif() else() if(CUDA_12_0) - if(DLLAMA_AVX512) - set(LIBRARY_NAME win-amd64-avx512-cuda-12-0.tar.gz) - elseif(DLLAMA_AVX2) - set(LIBRARY_NAME win-amd64-avx2-cuda-12-0.tar.gz) + if(LLAMA_AVX512) + set(LIBRARY_NAME 
cortex.llamacpp-${VERSION}-windows-amd64-avx512-cuda-12-0.tar.gz) + elseif(NOT LLAMA_AVX2) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx-cuda-12-0.tar.gz) else() - set(LIBRARY_NAME win-amd64-avx-cuda-12-0.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx2-cuda-12-0.tar.gz) endif() elseif(CUDA_11_7) - if(DLLAMA_AVX512) - set(LIBRARY_NAME win-amd64-avx512-cuda-11-7.tar.gz) - elseif(DLLAMA_AVX2) - set(LIBRARY_NAME win-amd64-avx2-cuda-11-7.tar.gz) + if(LLAMA_AVX512) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx512-cuda-11-7.tar.gz) + elseif(NOT LLAMA_AVX2) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx-cuda-11-7.tar.gz) else() - set(LIBRARY_NAME win-amd64-avx-cuda-11-7.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx2-cuda-11-7.tar.gz) endif() elseif(LLAMA_VULKAN) - set(LIBRARY_NAME win-amd64-vulkan.tar.gz) - elseif(DLLAMA_AVX512) - set(LIBRARY_NAME win-amd64-avx512.tar.gz) - elseif(DLLAMA_AVX2) - set(LIBRARY_NAME win-amd64-avx2.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-vulkan.tar.gz) + elseif(LLAMA_AVX512) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx512.tar.gz) + elseif(NOT LLAMA_AVX2) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx.tar.gz) else() - set(LIBRARY_NAME win-amd64-avx.tar.gz) + set(LIBRARY_NAME cortex.llamacpp-${VERSION}-windows-amd64-avx2.tar.gz) endif() endif() -set(LIBLLAMA_ENGINE_URL https://github.com/.../releases/download/${LIBLLAMA_ENGINE_VERSION}/${LIBRARY_NAME}) + +set(LIBLLAMA_ENGINE_URL https://github.com/janhq/cortex.llamacpp/releases/download/${ENGINE_VERSION}/${LIBRARY_NAME}) +# MESSAGE("LIBLLAMA_ENGINE_URL="${LIBLLAMA_ENGINE_URL}) +# MESSAGE("LIBARRY_NAME=" ${LIBRARY_NAME}) set(LIBLLAMA_ENGINE_PATH ${CMAKE_BINARY_DIR}/engines/${LIBRARY_NAME}) From 132ba7e2bc3fd521d04e71af26202afd3cf4e677 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 6 May 2024 09:19:49 +0700 Subject: [PATCH 07/10] fix: ci-cd --- .github/workflows/build.yml | 172 +++++++++++++++++++----------------- cortex-cpp/install_deps.sh | 0 2 files changed, 89 insertions(+), 83 deletions(-) mode change 100644 => 100755 cortex-cpp/install_deps.sh diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 2871f9a4b..112c1023d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -98,7 +98,7 @@ jobs: local max_retries=3 local tag while [ $retries -lt $max_retries ]; do - tag=$(curl -s https://api.github.com/repos/janhq/nitro/releases/latest | jq -r .tag_name) + tag=$(curl -s https://api.github.com/repos/janhq/cortex/releases/latest | jq -r .tag_name) if [ -n "$tag" ] && [ "$tag" != "null" ]; then echo $tag return @@ -161,6 +161,7 @@ jobs: id: make_build run: | ldd --version + cd cortex-cpp ./install_deps.sh mkdir build && cd build cmake ${{ matrix.defines }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. 
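Aside: the hunk above moves the Linux build into the cortex-cpp/ subdirectory. A rough local approximation of those CI steps, assuming a Linux host and borrowing the amd64-avx flags from the build matrix (other targets only swap the -D options; this is a sketch, not an official script):

    # Local equivalent of the restructured Linux CI build steps
    cd cortex-cpp
    ./install_deps.sh                              # prepare bundled third-party dependencies
    mkdir -p build && cd build
    cmake -DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF ..   # instruction-set flags taken from the matrix
    make -j "$(nproc)"
    ls -la                                         # expect ./nitro plus engines/cortex.llamacpp/
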
@@ -170,35 +171,35 @@ jobs: - name: Package shell: bash run: | - mkdir -p nitro - mkdir -p nitro/engines/cortex.llamacpp - cp build/nitro nitro/ - cp build/engines/cortex.llamacpp/libengine.so nitro/engines/cortex.llamacpp/ - tar -czvf nitro.tar.gz nitro + mkdir -p cortex-cpp/nitro + mkdir -p cortex-cpp/nitro/engines/cortex.llamacpp + cp cortex-cpp/build/nitro cortex-cpp/nitro/ + cp cortex-cpp/build/engines/cortex.llamacpp/libengine.so cortex-cpp/nitro/engines/cortex.llamacpp/ + tar -czvf cortex-cpp/nitro.tar.gz cortex-cpp/nitro - name: Upload Artifact uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' with: name: nitro-linux-${{ matrix.build }} - path: ./nitro + path: ./cortex-cpp/nitro - name: Run e2e testing - LLama.CPP shell: bash - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} + if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' && matrix.build != 'amd64-avx' }} run: | # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} + cd cortex-cpp/nitro + chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rm -rf uploads/ - name: Run e2e testing - Whisper.CPP shell: bash - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} + if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' && matrix.build != 'amd64-avx' }} run: | # run e2e testing - cd nitro - chmod +x ../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} + cd cortex-cpp/nitro + chmod +x ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} rm -rf uploads/ - uses: actions/upload-release-asset@v1.0.1 @@ -207,7 +208,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz + asset_path: ./cortex-cpp/nitro.tar.gz asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-linux-${{ matrix.build }}.tar.gz asset_content_type: application/gzip @@ -222,9 +223,9 @@ jobs: matrix: include: - cuda: "12-0" - flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF" + flags: "-DCUDA_12_0=ON" - cuda: "11-7" - flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON" + flags: "-DCUDA_11_7=ON" steps: - name: Clone @@ -236,6 +237,7 @@ jobs: - name: Build id: make_build run: | + cd cortex-cpp ./install_deps.sh mkdir build && cd build cmake ${{matrix.flags}} -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. 
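Aside: the packaging hunks before and after this point place the downloaded engine next to the binary as engines/cortex.llamacpp/libengine.so (libengine.dylib on macOS, engine.dll on Windows). Because llamaCPP::LoadModel in the earlier controller patch resolves the engine from the relative path "./engines/cortex.llamacpp", the packaged server has to be started from the directory that contains that folder. A minimal sketch, with an arbitrarily chosen port and the same invocation the e2e scripts use:

    # Run the packaged server from inside the unpacked artifact so the
    # relative engine path "./engines/cortex.llamacpp" resolves
    tar -xzvf nitro.tar.gz
    cd nitro
    ./nitro 1 127.0.0.1 10000 &    # <threads> <host> <port>

A later load-model request may then carry an optional "engine" field; per the controller patch it defaults to "cortex.llamacpp" (kLlamaEngine), which is currently the only engine path the controller maps.
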
@@ -245,6 +247,7 @@ jobs: - name: Package shell: bash run: | + cd cortex-cpp mkdir -p nitro mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ @@ -256,7 +259,16 @@ jobs: if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' with: name: nitro-linux-amd64-cuda-${{ matrix.cuda }} - path: ./nitro + path: ./cortex-cpp/nitro + + - name: Run e2e testing - LLama.CPP + shell: bash + if: ${{ matrix.cuda != '12-0'}} + run: | + # run e2e testing + cd cortex-cpp/nitro + chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} + rm -rf uploads/ - uses: actions/upload-release-asset@v1.0.1 if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') @@ -264,7 +276,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz + asset_path: ./cortex-cpp/nitro.tar.gz asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-linux-amd64-cuda-${{ matrix.cuda }}.tar.gz asset_content_type: application/gzip @@ -292,6 +304,7 @@ jobs: - name: Build id: cmake_build run: | + cd cortex-cpp ./install_deps.sh mkdir build && cd build cmake -DMAC_ARM64=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. @@ -301,8 +314,8 @@ jobs: - name: Package shell: bash run: | + cd cortex-cpp mkdir -p nitro - cp llama.cpp/ggml-metal.metal nitro/ mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ cp build/engines/cortex.llamacpp/libengine.dylib nitro/engines/cortex.llamacpp/ @@ -311,22 +324,21 @@ jobs: uses: actions/upload-artifact@v2 with: name: nitro-mac-arm64 - path: ./nitro + path: ./cortex-cpp/nitro - name: Run e2e testing - LLama.CPP - shell: bash run: | # run e2e testing - cd nitro/ - chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} + cd cortex-cpp/nitro/ + chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rm -rf uploads/ - name: Run e2e testing - Whisper.CPP - shell: bash run: | + cd cortex-cpp # To test with CoreML if [[ ! 
-f "/tmp/testwhisper-encoder.mlmodelc" ]]; then - wget https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-encoder.mlmodelc.zip + curl -LO https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-encoder.mlmodelc.zip unzip ggml-tiny-encoder.mlmodelc.zip rm ggml-tiny-encoder.mlmodelc.zip rm -rf /tmp/testwhisper-encoder.mlmodelc @@ -334,11 +346,11 @@ jobs: fi # run e2e testing cd nitro - chmod +x ../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} + chmod +x ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} rm -rf uploads/ macOS-amd64-build: - runs-on: macos-latest + runs-on: macos-13 needs: [create-draft-release, set-nitro-version] if: always() && (needs.create-draft-release.result == 'success' || needs.create-draft-release.result == 'skipped') && needs.set-nitro-version.result == 'success' timeout-minutes: 40 @@ -361,6 +373,7 @@ jobs: - name: Build id: cmake_build run: | + cd cortex-cpp ./install_deps.sh mkdir build && cd build cmake -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. @@ -370,6 +383,7 @@ jobs: - name: Package shell: bash run: | + cd cortex-cpp mkdir -p nitro mkdir -p nitro/engines/cortex.llamacpp cp build/nitro nitro/ @@ -379,22 +393,24 @@ jobs: uses: actions/upload-artifact@v2 with: name: nitro-mac-amd64 - path: ./nitro + path: ./cortex-cpp/nitro - name: Run e2e testing - LLama.CPP shell: bash run: | # run e2e testing + cd cortex-cpp cd nitro - chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} + chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rm -rf uploads/ - name: Run e2e testing - Whisper.CPP shell: bash run: | # run e2e testing + cd cortex-cpp cd nitro - chmod +x ../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} + chmod +x ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} rm -rf uploads/ universal-nitro-artifact-macos: @@ -409,29 +425,29 @@ jobs: uses: actions/download-artifact@v2 with: name: nitro-mac-amd64 - path: ./nitro-mac-amd64 + path: ./cortex-cpp/nitro-mac-amd64 - name: download artifact arm64 uses: actions/download-artifact@v2 with: name: nitro-mac-arm64 - path: ./nitro-mac-arm64 + path: ./cortex-cpp/nitro-mac-arm64 - name: bundle universal binary run: | + cd cortex-cpp mkdir -p nitro mkdir -p nitro/engines/cortex.llamacpp/ ls ./nitro-mac-amd64 lipo -create ./nitro-mac-amd64/nitro ./nitro-mac-arm64/nitro -output ./nitro/nitro - lipo -create ./nitro-mac-amd64/engines/cortex.llamacpp/libengine.dylib ./nitro-mac-arm64/engines/cortex.llamacpp/libengine.dylib ./nitro/engines/cortex.llamacpp/libengine.dylib - cp ./nitro-mac-arm64/ggml-metal.metal ./nitro/ggml-metal.metal + lipo -create ./nitro-mac-amd64/engines/cortex.llamacpp/libengine.dylib ./nitro-mac-arm64/engines/cortex.llamacpp/libengine.dylib -output ./nitro/engines/cortex.llamacpp/libengine.dylib tar -czvf nitro.tar.gz nitro - name: Upload Artifact uses: actions/upload-artifact@v2 with: name: nitro-mac-universal - path: 
./nitro + path: ./cortex-cpp/nitro - uses: actions/upload-release-asset@v1.0.1 if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') @@ -439,7 +455,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz + asset_path: ./cortex-cpp/nitro.tar.gz asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-mac-universal.tar.gz asset_content_type: application/gzip @@ -506,6 +522,7 @@ jobs: id: cmake_build shell: cmd run: | + cd cortex-cpp cmake -S ./nitro_deps -B ./build_deps/nitro_deps cmake --build ./build_deps/nitro_deps --config Release mkdir -p build @@ -517,14 +534,15 @@ jobs: id: pack_artifacts shell: cmd run: | + cd cortex-cpp mkdir .\build\Release\engines\cortex.llamacpp\ robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll robocopy build\bin\Release\ .\build\Release\ llama.dll robocopy build\bin\Release\ .\build\Release\ whisper.dll - robocopy .github\patches\windows\ .\build\Release\ msvcp140.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140_1.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140.dll + robocopy ..\.github\patches\windows\ .\build\Release\ msvcp140.dll + robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140_1.dll + robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140.dll robocopy "$env:SDL2_DIR\..\lib\2.28.5\" .\build\Release\ SDL2.dll dotnet tool install --global AzureSignTool azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe" @@ -535,16 +553,18 @@ jobs: shell: cmd if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} run: | + cd cortex-cpp cd build\Release - ..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} + ..\..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rmdir /S /Q .\build\Release\uploads - name: Run e2e testing - Whisper.cpp shell: cmd if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} run: | + cd cortex-cpp cd build\Release - ..\..\.github\scripts\e2e-test-whisper-windows.bat nitro.exe ${{ env.WHISPER_MODEL_URL }} + ..\..\..\.github\scripts\e2e-test-whisper-windows.bat nitro.exe ${{ env.WHISPER_MODEL_URL }} rmdir /S /Q .\build\Release\uploads - name: Upload Artifact @@ -552,7 +572,7 @@ jobs: if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' with: name: nitro-win-${{ matrix.build }} - path: ./build/Release + path: ./cortex-cpp/build/Release - uses: actions/upload-release-asset@v1.0.1 if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') @@ -560,7 +580,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz + asset_path: ./cortex-cpp/nitro.tar.gz asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-${{ matrix.build }}.tar.gz asset_content_type: application/gzip @@ -576,59 +596,43 @@ jobs: matrix: include: 
- cuda: "12-0" - instructions: "-DLLAMA_NATIVE=OFF" - flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + instructions: "amd64-avx2" + inst-flags: "-DLLAMA_NATIVE=OFF" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "12-0" - instructions: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + instructions: "amd64-avx" + inst-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "12-0" - instructions: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - flags: "-DCUDA_12_0=ON -DCUDA_11_7=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + instructions: "amd64-avx512" + inst-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "11-7" - instructions: "-DLLAMA_NATIVE=OFF" - flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + instructions: "amd64-avx2" + inst-flags: "-DLLAMA_NATIVE=OFF" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "11-7" - instructions: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + instructions: "amd64-avx" + inst-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "11-7" - instructions: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - flags: "-DCUDA_12_0=OFF -DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + instructions: "amd64-avx512" + inst-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" steps: - - name: Setup VSWhere.exe - uses: warrenbuckley/Setup-VSWhere@v1 - with: - version: latest - silent: true - env: - ACTIONS_ALLOW_UNSECURE_COMMANDS: true - - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1 - - name: Fetch SDL2 and set SDL2_DIR version 2.28.5 run: | curl -L -o sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-2.28.5/SDL2-devel-2.28.5-VC.zip 7z x sdl2.zip -aoa echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-2.28.5/cmake" >> $env:GITHUB_ENV - - name: actions-setup-cmake - uses: jwlawson/actions-setup-cmake@v1.14.1 - - name: Clone id: checkout uses: actions/checkout@v3 with: submodules: recursive - - name: Setup VSWhere.exe - uses: warrenbuckley/Setup-VSWhere@v1 - with: - version: latest - silent: true - env: - ACTIONS_ALLOW_UNSECURE_COMMANDS: true - - uses: actions/setup-dotnet@v3 with: 
dotnet-version: "6.0.x" @@ -637,11 +641,12 @@ jobs: id: cmake_build shell: cmd run: | + cd cortex-cpp cmake -S ./nitro_deps -B ./build_deps/nitro_deps cmake --build ./build_deps/nitro_deps --config Release mkdir -p build cd build - cmake .. ${{ matrix.instructions }} ${{ matrix.flags }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} + cmake .. ${{ matrix.inst-flags }} ${{ matrix.cmake-flags }} -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} cmake --build . --config Release -j "%NUMBER_OF_PROCESSORS%" - name: Pack artifacts @@ -649,14 +654,15 @@ jobs: shell: cmd run: | set PATH=%PATH%;C:\Program Files\7-Zip\ + cd cortex-cpp mkdir .\build\Release\engines\cortex.llamacpp\ robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll robocopy build\bin\Release\ .\build\Release\ llama.dll robocopy build\bin\Release\ .\build\Release\ whisper.dll - robocopy .github\patches\windows\ .\build\Release\ msvcp140.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140_1.dll - robocopy .github\patches\windows\ .\build\Release\ vcruntime140.dll + robocopy ..\.github\patches\windows\ .\build\Release\ msvcp140.dll + robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140_1.dll + robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140.dll robocopy "$env:SDL2_DIR\..\lib\2.28.5\" .\build\Release\ SDL2.dll dotnet tool install --global AzureSignTool %USERPROFILE%\.dotnet\tools\azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe" @@ -668,7 +674,7 @@ jobs: if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' with: name: nitro-win-${{ matrix.instructions }}-cuda-${{ matrix.cuda }} - path: ./build/Release + path: ./cortex-cpp/build/Release - uses: actions/upload-release-asset@v1.0.1 if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') @@ -676,7 +682,7 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} with: upload_url: ${{ needs.create-draft-release.outputs.upload_url }} - asset_path: ./nitro.tar.gz + asset_path: ./cortex-cpp/nitro.tar.gz asset_name: nitro-${{ needs.create-draft-release.outputs.version }}-win-${{ matrix.instructions }}-cuda-${{ matrix.cuda }}.tar.gz asset_content_type: application/gzip diff --git a/cortex-cpp/install_deps.sh b/cortex-cpp/install_deps.sh old mode 100644 new mode 100755 From 95ec19a4d5edd83a669c2f23b568e3330d1f8b25 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Tue, 7 May 2024 08:35:36 +0700 Subject: [PATCH 08/10] fix: avx2 flag --- .github/workflows/build.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 112c1023d..41d2d6025 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -132,7 +132,7 @@ jobs: matrix: include: - build: "amd64-avx2" - defines: "-DLLAMA_NATIVE=OFF" + defines: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" - build: "amd64-avx" defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - build: "amd64-avx512" @@ -469,7 +469,7 @@ jobs: matrix: include: - build: "amd64-avx2" - defines: "-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF 
-DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + defines: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - build: "amd64-avx" defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - build: "amd64-avx512" @@ -597,7 +597,7 @@ jobs: include: - cuda: "12-0" instructions: "amd64-avx2" - inst-flags: "-DLLAMA_NATIVE=OFF" + inst-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "12-0" instructions: "amd64-avx" @@ -609,7 +609,7 @@ jobs: cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "11-7" instructions: "amd64-avx2" - inst-flags: "-DLLAMA_NATIVE=OFF" + inst-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" - cuda: "11-7" instructions: "amd64-avx" From e8e1100fed6e2c67482dfaf181a79f79b251e194 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Tue, 7 May 2024 11:32:54 +0700 Subject: [PATCH 09/10] refactor: remove audio --- .../scripts/e2e-test-whisper-linux-and-mac.sh | 93 -- .github/scripts/e2e-test-whisper-windows.bat | 102 --- .github/workflows/build.yml | 89 +- .gitmodules | 3 - cortex-cpp/CMakeLists.txt | 6 +- cortex-cpp/common/base.cc | 0 cortex-cpp/common/base.h | 16 - cortex-cpp/context/whisper_server_context.cc | 796 ------------------ cortex-cpp/context/whisper_server_context.h | 165 ---- cortex-cpp/controllers/audio.cc | 300 ------- cortex-cpp/controllers/audio.h | 74 -- cortex-cpp/whisper.cpp | 1 - 12 files changed, 13 insertions(+), 1632 deletions(-) delete mode 100755 .github/scripts/e2e-test-whisper-linux-and-mac.sh delete mode 100644 .github/scripts/e2e-test-whisper-windows.bat delete mode 100644 cortex-cpp/common/base.cc delete mode 100644 cortex-cpp/context/whisper_server_context.cc delete mode 100644 cortex-cpp/context/whisper_server_context.h delete mode 100644 cortex-cpp/controllers/audio.cc delete mode 100644 cortex-cpp/controllers/audio.h delete mode 160000 cortex-cpp/whisper.cpp diff --git a/.github/scripts/e2e-test-whisper-linux-and-mac.sh b/.github/scripts/e2e-test-whisper-linux-and-mac.sh deleted file mode 100755 index 4c8a1e9eb..000000000 --- a/.github/scripts/e2e-test-whisper-linux-and-mac.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash - -## Example run command -# ./linux-and-mac.sh './jan/plugins/@janhq/inference-plugin/dist/nitro/nitro_mac_arm64' https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/resolve/main/tinyllama-1.1b-chat-v0.3.Q2_K.gguf - -# Check for required arguments -if [[ $# -ne 2 ]]; then - echo "Usage: $0 " - exit 1 -fi - -rm /tmp/response1.log /tmp/response2.log /tmp/nitro.log - -BINARY_PATH=$1 -DOWNLOAD_URL=$2 - -# Random port to ensure it's not used -min=10000 -max=11000 -range=$((max - min + 1)) -PORT=$((RANDOM % range + min)) - -# Start the binary file -"$BINARY_PATH" 1 127.0.0.1 $PORT >/tmp/nitro.log & - -# Get the process id of the binary file -pid=$! - -if ! ps -p $pid >/dev/null; then - echo "nitro failed to start. 
Logs:" - cat /tmp/nitro.log - exit 1 -fi - -# Wait for a few seconds to let the server start -sleep 5 - -# Check if /tmp/testwhisper exists, if not, download it -if [[ ! -f "/tmp/testwhisper" ]]; then - curl --connect-timeout 300 $DOWNLOAD_URL --output /tmp/testwhisper -fi - -# Run the curl commands -response1=$(curl --connect-timeout 60 -o /tmp/response1.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/audio/load_model" \ - --header 'Content-Type: application/json' \ - --data '{ - "model_path": "/tmp/testwhisper", - "model_id": "whisper.cpp" -}') - -response2=$( - curl --connect-timeout 60 -o /tmp/response2.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/audio/transcriptions" \ - --header 'Access-Control-Allow-Origin: *' \ - --form 'file=@"../whisper.cpp/samples/jfk.wav"' \ - --form 'model_id="whisper.cpp"' \ - --form 'temperature="0.0"' \ - --form 'prompt="The transcript is about OpenAI which makes technology like DALL·E, GPT-3, and ChatGPT with the hope of one day building an AGI system that benefits all of humanity. The president is trying to raly people to support the cause."' \ - -) - -error_occurred=0 -if [[ "$response1" -ne 200 ]]; then - echo "The first curl command failed with status code: $response1" - cat /tmp/response1.log - error_occurred=1 -fi - -if [[ "$response2" -ne 200 ]]; then - echo "The second curl command failed with status code: $response2" - cat /tmp/response2.log - error_occurred=1 -fi - -if [[ "$error_occurred" -eq 1 ]]; then - echo "Nitro test run failed!!!!!!!!!!!!!!!!!!!!!!" - echo "Nitro Error Logs:" - cat /tmp/nitro.log - kill $pid - exit 1 -fi - -echo "----------------------" -echo "Log load model:" -cat /tmp/response1.log - -echo "----------------------" -echo "Log run test:" -cat /tmp/response2.log - -echo "Nitro test run successfully!" - -# Kill the server process -kill $pid diff --git a/.github/scripts/e2e-test-whisper-windows.bat b/.github/scripts/e2e-test-whisper-windows.bat deleted file mode 100644 index 6eb2037ea..000000000 --- a/.github/scripts/e2e-test-whisper-windows.bat +++ /dev/null @@ -1,102 +0,0 @@ -@echo off - -set "TEMP=C:\Users\%UserName%\AppData\Local\Temp" -set "MODEL_PATH=%TEMP%\testwhisper" - -rem Check for required arguments -if "%~2"=="" ( - echo Usage: %~0 ^ ^ - exit /b 1 -) - -set "BINARY_PATH=%~1" -set "DOWNLOAD_URL=%~2" - -for %%i in ("%BINARY_PATH%") do set "BINARY_NAME=%%~nxi" - -echo BINARY_NAME=%BINARY_NAME% - -del %TEMP%\response1.log 2>nul -del %TEMP%\response2.log 2>nul -del %TEMP%\nitro.log 2>nul - -set /a min=9999 -set /a max=11000 -set /a range=max-min+1 -set /a PORT=%min% + %RANDOM% %% %range% - -rem Start the binary file -start /B "" "%BINARY_PATH%" 1 "127.0.0.1" %PORT% > %TEMP%\nitro.log 2>&1 - -ping -n 6 127.0.0.1 %PORT% > nul - -rem Capture the PID of the started process with "nitro" in its name -for /f "tokens=2" %%a in ('tasklist /fi "imagename eq %BINARY_NAME%" /fo list ^| findstr /B "PID:"') do ( - set "pid=%%a" -) - -echo pid=%pid% - -if not defined pid ( - echo nitro failed to start. 
Logs: - type %TEMP%\nitro.log - exit /b 1 -) - -rem Wait for a few seconds to let the server start - -rem Check if %TEMP%\testwhisper exists, if not, download it -if not exist "%MODEL_PATH%" ( - curl.exe --connect-timeout 300 %DOWNLOAD_URL% --output "%MODEL_PATH%" -) - -rem Define JSON strings for curl data -call set "MODEL_PATH_STRING=%%MODEL_PATH:\=\\%%" -set "curl_data1={\"model_path\":\"%MODEL_PATH_STRING%\",\"model_id\":\"whisper\"}" - -rem Run the curl commands and capture the status code -curl.exe --connect-timeout 60 -o %TEMP%\response1.log -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/v1/audio/load_model" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1_code.log 2>&1 - -curl --connect-timeout 60 -o %TEMP%\response2.log -s -w "%%{http_code}" --location "http://localhost:%PORT%/v1/audio/transcriptions" ^ ---form "file=@../..//whisper.cpp/samples/jfk.wav" ^ ---form "model_id=whisper" > %TEMP%\response2_code.log 2>&1 - -set "error_occurred=0" - -rem Read the status codes from the log files -for /f %%a in (%TEMP%\response1_code.log) do set "response1=%%a" -for /f %%a in (%TEMP%\response2_code.log) do set "response2=%%a" - -if "%response1%" neq "200" ( - echo The first curl command failed with status code: %response1% - type %TEMP%\response1.log - set "error_occurred=1" -) - -if "%response2%" neq "200" ( - echo The second curl command failed with status code: %response2% - type %TEMP%\response2.log - set "error_occurred=1" -) - -if "%error_occurred%"=="1" ( - echo Nitro test run failed!!!!!!!!!!!!!!!!!!!!!! - echo Nitro Error Logs: - type %TEMP%\nitro.log - taskkill /f /pid %pid% - exit /b 1 -) - - -echo ---------------------- -echo Log load model: -type %TEMP%\response1.log - -echo ---------------------- -echo "Log run test:" -type %TEMP%\response2.log - -echo Nitro test run successfully! - -rem Kill the server process -taskkill /f /im nitro.exe 2>nul || exit /B 0 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 41d2d6025..dc0719f95 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -48,7 +48,6 @@ on: env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf - WHISPER_MODEL_URL: https://delta.jan.ai/ggml-tiny-q5_1.bin EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf jobs: @@ -193,15 +192,6 @@ jobs: chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rm -rf uploads/ - - name: Run e2e testing - Whisper.CPP - shell: bash - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' && matrix.build != 'amd64-avx' }} - run: | - # run e2e testing - cd cortex-cpp/nitro - chmod +x ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} - rm -rf uploads/ - - uses: actions/upload-release-asset@v1.0.1 if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') env: @@ -240,7 +230,7 @@ jobs: cd cortex-cpp ./install_deps.sh mkdir build && cd build - cmake ${{matrix.flags}} -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. 
+ cmake ${{matrix.flags}} -DLLAMA_NATIVE=OFF -DLLAMA_CUDA=ON -DNITRO_VERSION=${{ needs.set-nitro-version.outputs.version }} .. make -j $(nproc) ls -la @@ -294,13 +284,6 @@ jobs: with: submodules: recursive - - name: Dependencies - id: depends - continue-on-error: true - run: | - brew update - brew install cmake sdl2 - - name: Build id: cmake_build run: | @@ -333,22 +316,6 @@ jobs: chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rm -rf uploads/ - - name: Run e2e testing - Whisper.CPP - run: | - cd cortex-cpp - # To test with CoreML - if [[ ! -f "/tmp/testwhisper-encoder.mlmodelc" ]]; then - curl -LO https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-encoder.mlmodelc.zip - unzip ggml-tiny-encoder.mlmodelc.zip - rm ggml-tiny-encoder.mlmodelc.zip - rm -rf /tmp/testwhisper-encoder.mlmodelc - mv ggml-tiny-encoder.mlmodelc /tmp/testwhisper-encoder.mlmodelc - fi - # run e2e testing - cd nitro - chmod +x ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} - rm -rf uploads/ - macOS-amd64-build: runs-on: macos-13 needs: [create-draft-release, set-nitro-version] @@ -404,15 +371,6 @@ jobs: chmod +x ../../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rm -rf uploads/ - - name: Run e2e testing - Whisper.CPP - shell: bash - run: | - # run e2e testing - cd cortex-cpp - cd nitro - chmod +x ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh && ../../.github/scripts/e2e-test-whisper-linux-and-mac.sh ./nitro ${{ env.WHISPER_MODEL_URL }} - rm -rf uploads/ - universal-nitro-artifact-macos: runs-on: macos-latest needs: [create-draft-release, set-nitro-version, macOS-silicon-build, macOS-amd64-build] @@ -469,13 +427,13 @@ jobs: matrix: include: - build: "amd64-avx2" - defines: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + defines: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - build: "amd64-avx" - defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + defines: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - build: "amd64-avx512" - defines: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + defines: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - build: "amd64-vulkan" - defines: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + defines: "-DLLAMA_VULKAN=ON -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" # - build: "arm64" # defines: "-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON" @@ -501,12 +459,6 @@ jobs: - name: Add msbuild to PATH uses: microsoft/setup-msbuild@v1 - - name: Fetch SDL2 and set SDL2_DIR version 2.28.5 - run: | - 
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-2.28.5/SDL2-devel-2.28.5-VC.zip - 7z x sdl2.zip -aoa - echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-2.28.5/cmake" >> $env:GITHUB_ENV - - name: actions-setup-cmake uses: jwlawson/actions-setup-cmake@v1.14.1 @@ -539,11 +491,9 @@ jobs: robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll robocopy build\bin\Release\ .\build\Release\ llama.dll - robocopy build\bin\Release\ .\build\Release\ whisper.dll robocopy ..\.github\patches\windows\ .\build\Release\ msvcp140.dll robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140_1.dll robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140.dll - robocopy "$env:SDL2_DIR\..\lib\2.28.5\" .\build\Release\ SDL2.dll dotnet tool install --global AzureSignTool azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe" 7z a -ttar temp.tar .\build\Release\* @@ -558,15 +508,6 @@ jobs: ..\..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }} rmdir /S /Q .\build\Release\uploads - - name: Run e2e testing - Whisper.cpp - shell: cmd - if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }} - run: | - cd cortex-cpp - cd build\Release - ..\..\..\.github\scripts\e2e-test-whisper-windows.bat nitro.exe ${{ env.WHISPER_MODEL_URL }} - rmdir /S /Q .\build\Release\uploads - - name: Upload Artifact uses: actions/upload-artifact@v2 if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || github.event_name == 'pull_request' @@ -598,35 +539,29 @@ jobs: - cuda: "12-0" instructions: "amd64-avx2" inst-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" - cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - cuda: "12-0" instructions: "amd64-avx" inst-flags: "-DLLAMA_AVX2=OFF -DLLAMA_NATIVE=OFF" - cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - cuda: "12-0" instructions: "amd64-avx512" inst-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + cmake-flags: "-DCUDA_12_0=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - cuda: "11-7" instructions: "amd64-avx2" inst-flags: "-DLLAMA_AVX2=ON -DLLAMA_NATIVE=OFF" - cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - cuda: "11-7" instructions: "amd64-avx" inst-flags: "-DLLAMA_AVX2=OFF 
-DLLAMA_NATIVE=OFF" - cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" - cuda: "11-7" instructions: "amd64-avx512" inst-flags: "-DLLAMA_AVX512=ON -DLLAMA_NATIVE=OFF" - cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DWHISPER_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE -DWHISPER_SDL2=ON" + cmake-flags: "-DCUDA_11_7=ON -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE=RELEASE" steps: - - name: Fetch SDL2 and set SDL2_DIR version 2.28.5 - run: | - curl -L -o sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-2.28.5/SDL2-devel-2.28.5-VC.zip - 7z x sdl2.zip -aoa - echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-2.28.5/cmake" >> $env:GITHUB_ENV - - name: Clone id: checkout uses: actions/checkout@v3 @@ -659,11 +594,9 @@ jobs: robocopy .\build\engines\cortex.llamacpp\ .\build\Release\engines\cortex.llamacpp\ engine.dll robocopy build_deps\_install\bin\ .\build\Release\ zlib.dll robocopy build\bin\Release\ .\build\Release\ llama.dll - robocopy build\bin\Release\ .\build\Release\ whisper.dll robocopy ..\.github\patches\windows\ .\build\Release\ msvcp140.dll robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140_1.dll robocopy ..\.github\patches\windows\ .\build\Release\ vcruntime140.dll - robocopy "$env:SDL2_DIR\..\lib\2.28.5\" .\build\Release\ SDL2.dll dotnet tool install --global AzureSignTool %USERPROFILE%\.dotnet\tools\azuresigntool.exe sign -kvu "${{ secrets.AZURE_KEY_VAULT_URI }}" -kvi "${{ secrets.AZURE_CLIENT_ID }}" -kvt "${{ secrets.AZURE_TENANT_ID }}" -kvs "${{ secrets.AZURE_CLIENT_SECRET }}" -kvc ${{ secrets.AZURE_CERT_NAME }} -tr http://timestamp.globalsign.com/tsa/r6advanced1 -v ".\build\Release\nitro.exe" 7z a -ttar temp.tar .\build\Release\* diff --git a/.gitmodules b/.gitmodules index 65a042814..e69de29bb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "cortex-cpp/whisper.cpp"] - path = cortex-cpp/whisper.cpp - url = https://github.com/ggerganov/whisper.cpp.git diff --git a/cortex-cpp/CMakeLists.txt b/cortex-cpp/CMakeLists.txt index 4bb269aaa..97be0e86d 100644 --- a/cortex-cpp/CMakeLists.txt +++ b/cortex-cpp/CMakeLists.txt @@ -56,7 +56,6 @@ endif() add_compile_definitions(NITRO_VERSION="${NITRO_VERSION}") -add_subdirectory(whisper.cpp) add_subdirectory(test) add_executable(${PROJECT_NAME} main.cc) @@ -69,7 +68,7 @@ add_executable(${PROJECT_NAME} main.cc) # and comment out the following lines find_package(Drogon CONFIG REQUIRED) -target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon whisper +target_link_libraries(${PROJECT_NAME} PRIVATE Drogon::Drogon ${CMAKE_THREAD_LIBS_INIT}) # ############################################################################## @@ -87,7 +86,6 @@ endif() aux_source_directory(controllers CTL_SRC) aux_source_directory(common COMMON_SRC) -aux_source_directory(context CONTEXT_SRC) aux_source_directory(models MODEL_SRC) aux_source_directory(cortex-common CORTEX_COMMON) # aux_source_directory(filters FILTER_SRC) aux_source_directory(plugins @@ -100,7 +98,7 @@ aux_source_directory(cortex-common CORTEX_COMMON) target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR} ) # ${CMAKE_CURRENT_SOURCE_DIR}/models) -target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC} ${COMMON_SRC} ${CONTEXT_SRC}) 
+target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC} ${COMMON_SRC}) # ${FILTER_SRC} ${PLUGIN_SRC} ${MODEL_SRC}) # ############################################################################## # uncomment the following line for dynamically loading views set_property(TARGET diff --git a/cortex-cpp/common/base.cc b/cortex-cpp/common/base.cc deleted file mode 100644 index e69de29bb..000000000 diff --git a/cortex-cpp/common/base.h b/cortex-cpp/common/base.h index adfe42a19..43d612c1b 100644 --- a/cortex-cpp/common/base.h +++ b/cortex-cpp/common/base.h @@ -37,21 +37,5 @@ class BaseEmbedding { const HttpRequestPtr& req, std::function&& callback) = 0; - // The derived class can also override other methods if needed -}; - -class BaseAudio { - public: - virtual ~BaseAudio() {} - // Transcribes audio into the input language. - virtual void CreateTranscription( - const HttpRequestPtr& req, - std::function&& callback) = 0; - - // Translates audio into the input language. - virtual void CreateTranslation( - const HttpRequestPtr& req, - std::function&& callback) = 0; - // The derived class can also override other methods if needed }; \ No newline at end of file diff --git a/cortex-cpp/context/whisper_server_context.cc b/cortex-cpp/context/whisper_server_context.cc deleted file mode 100644 index a4ccbe710..000000000 --- a/cortex-cpp/context/whisper_server_context.cc +++ /dev/null @@ -1,796 +0,0 @@ -#include "whisper_server_context.h" -#include "utils/dr_wav.h" -#include -#include -#include -#include "utils/json.hpp" - -using json = nlohmann::json; - -bool read_wav(const std::string& fname, std::vector& pcmf32, - std::vector>& pcmf32s, bool stereo) { - drwav wav; - std::vector wav_data; // used for pipe input from stdin - - if (fname == "-") { - { - uint8_t buf[1024]; - while (true) { - const size_t n = fread(buf, 1, sizeof(buf), stdin); - if (n == 0) { - break; - } - wav_data.insert(wav_data.end(), buf, buf + n); - } - } - - if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == - false) { - fprintf(stderr, "error: failed to open WAV file from stdin\n"); - return false; - } - - fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, - wav_data.size()); - } else if (drwav_init_file(&wav, fname.c_str(), nullptr) == false) { - fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname.c_str()); - return false; - } - - if (wav.channels != 1 && wav.channels != 2) { - fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", __func__, - fname.c_str()); - return false; - } - - if (stereo && wav.channels != 2) { - fprintf(stderr, "%s: WAV file '%s' must be stereo for diarization\n", - __func__, fname.c_str()); - return false; - } - - if (wav.sampleRate != COMMON_SAMPLE_RATE) { - fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", __func__, - fname.c_str(), COMMON_SAMPLE_RATE / 1000); - return false; - } - - if (wav.bitsPerSample != 16) { - fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", __func__, - fname.c_str()); - return false; - } - - const uint64_t n = - wav_data.empty() - ? 
wav.totalPCMFrameCount - : wav_data.size() / (wav.channels * wav.bitsPerSample / 8); - - std::vector pcm16; - pcm16.resize(n * wav.channels); - drwav_read_pcm_frames_s16(&wav, n, pcm16.data()); - drwav_uninit(&wav); - - // convert to mono, float - pcmf32.resize(n); - if (wav.channels == 1) { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[i]) / 32768.0f; - } - } else { - for (uint64_t i = 0; i < n; i++) { - pcmf32[i] = float(pcm16[2 * i] + pcm16[2 * i + 1]) / 65536.0f; - } - } - - if (stereo) { - // convert to stereo, float - pcmf32s.resize(2); - - pcmf32s[0].resize(n); - pcmf32s[1].resize(n); - for (uint64_t i = 0; i < n; i++) { - pcmf32s[0][i] = float(pcm16[2 * i]) / 32768.0f; - pcmf32s[1][i] = float(pcm16[2 * i + 1]) / 32768.0f; - } - } - - return true; -} - -std::string output_str(struct whisper_context* ctx, - const whisper_params& params, - std::vector> pcmf32s) { - std::stringstream result; - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - std::string speaker = ""; - - if (params.diarize && pcmf32s.size() == 2) { - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - speaker = estimate_diarization_speaker(pcmf32s, t0, t1); - } - - result << speaker << text << "\n"; - } - return result.str(); -} - -std::string estimate_diarization_speaker( - std::vector> pcmf32s, int64_t t0, int64_t t1, - bool id_only) { - std::string speaker = ""; - const int64_t n_samples = pcmf32s[0].size(); - - const int64_t is0 = timestamp_to_sample(t0, n_samples); - const int64_t is1 = timestamp_to_sample(t1, n_samples); - - double energy0 = 0.0f; - double energy1 = 0.0f; - - for (int64_t j = is0; j < is1; j++) { - energy0 += fabs(pcmf32s[0][j]); - energy1 += fabs(pcmf32s[1][j]); - } - - if (energy0 > 1.1 * energy1) { - speaker = "0"; - } else if (energy1 > 1.1 * energy0) { - speaker = "1"; - } else { - speaker = "?"; - } - - // printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, speaker = - // %s\n", is0, is1, energy0, energy1, speaker.c_str()); - - if (!id_only) { - speaker.insert(0, "(speaker "); - speaker.append(")"); - } - - return speaker; -} - -// 500 -> 00:05.000 -// 6000 -> 01:00.000 -std::string to_timestamp(int64_t t, bool comma) { - int64_t msec = t * 10; - int64_t hr = msec / (1000 * 60 * 60); - msec = msec - hr * (1000 * 60 * 60); - int64_t min = msec / (1000 * 60); - msec = msec - min * (1000 * 60); - int64_t sec = msec / 1000; - msec = msec - sec * 1000; - - char buf[32]; - snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int)hr, (int)min, - (int)sec, comma ? 
"," : ".", (int)msec); - - return std::string(buf); -} - -int timestamp_to_sample(int64_t t, int n_samples) { - return (std::max)(0, (std::min)((int)n_samples - 1, - (int)((t * WHISPER_SAMPLE_RATE) / 100))); -} - -bool is_file_exist(const char* fileName) { - std::ifstream infile(fileName); - return infile.good(); -} - -void whisper_print_usage(int /*argc*/, char** argv, - const whisper_params& params) { - fprintf(stderr, "\n"); - fprintf(stderr, "usage: %s [options] \n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, - " -h, --help [default] show this help " - "message and exit\n"); - fprintf(stderr, - " -t N, --threads N [%-7d] number of threads to use " - "during computation\n", - params.n_threads); - fprintf(stderr, - " -p N, --processors N [%-7d] number of processors to use " - "during computation\n", - params.n_processors); - fprintf( - stderr, - " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", - params.offset_t_ms); - fprintf(stderr, - " -on N, --offset-n N [%-7d] segment index offset\n", - params.offset_n); - fprintf(stderr, - " -d N, --duration N [%-7d] duration of audio to " - "process in milliseconds\n", - params.duration_ms); - fprintf(stderr, - " -mc N, --max-context N [%-7d] maximum number of text " - "context tokens to store\n", - params.max_context); - fprintf(stderr, - " -ml N, --max-len N [%-7d] maximum segment length in " - "characters\n", - params.max_len); - fprintf(stderr, - " -sow, --split-on-word [%-7s] split on word rather than " - "on token\n", - params.split_on_word ? "true" : "false"); - fprintf(stderr, - " -bo N, --best-of N [%-7d] number of best candidates " - "to keep\n", - params.best_of); - fprintf(stderr, - " -bs N, --beam-size N [%-7d] beam size for beam search\n", - params.beam_size); - fprintf(stderr, - " -wt N, --word-thold N [%-7.2f] word timestamp " - "probability threshold\n", - params.word_thold); - fprintf(stderr, - " -et N, --entropy-thold N [%-7.2f] entropy threshold for " - "decoder fail\n", - params.entropy_thold); - fprintf(stderr, - " -lpt N, --logprob-thold N [%-7.2f] log probability threshold " - "for decoder fail\n", - params.logprob_thold); - // fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by - // x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); - fprintf(stderr, - " -debug, --debug-mode [%-7s] enable debug mode (eg. dump " - "log_mel)\n", - params.debug_mode ? "true" : "false"); - fprintf(stderr, - " -tr, --translate [%-7s] translate from source " - "language to english\n", - params.translate ? "true" : "false"); - fprintf(stderr, - " -di, --diarize [%-7s] stereo audio diarization\n", - params.diarize ? "true" : "false"); - fprintf(stderr, - " -tdrz, --tinydiarize [%-7s] enable tinydiarize " - "(requires a tdrz model)\n", - params.tinydiarize ? "true" : "false"); - fprintf(stderr, - " -nf, --no-fallback [%-7s] do not use temperature " - "fallback while decoding\n", - params.no_fallback ? "true" : "false"); - fprintf(stderr, - " -ps, --print-special [%-7s] print special tokens\n", - params.print_special ? "true" : "false"); - fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", - params.print_colors ? "true" : "false"); - fprintf(stderr, - " -pr, --print-realtime [%-7s] print output in realtime\n", - params.print_realtime ? "true" : "false"); - fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", - params.print_progress ? "true" : "false"); - fprintf(stderr, - " -nt, --no-timestamps [%-7s] do not print timestamps\n", - params.no_timestamps ? 
"true" : "false"); - fprintf(stderr, - " -l LANG, --language LANG [%-7s] spoken language ('auto' for " - "auto-detect)\n", - params.language.c_str()); - fprintf(stderr, - " -dl, --detect-language [%-7s] exit after automatically " - "detecting language\n", - params.detect_language ? "true" : "false"); - fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", - params.prompt.c_str()); - fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", - params.model.c_str()); - fprintf(stderr, - " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used " - "for encode inference\n", - params.openvino_encode_device.c_str()); - fprintf(stderr, - " --convert, [%-7s] Convert audio to WAV, " - "requires ffmpeg on the server", - params.ffmpeg_converter ? "true" : "false"); - fprintf(stderr, "\n"); -} - -bool whisper_params_parse(int argc, char** argv, whisper_params& params) { - for (int i = 1; i < argc; i++) { - std::string arg = argv[i]; - - if (arg == "-h" || arg == "--help") { - whisper_print_usage(argc, argv, params); - exit(0); - } else if (arg == "-t" || arg == "--threads") { - params.n_threads = std::stoi(argv[++i]); - } else if (arg == "-p" || arg == "--processors") { - params.n_processors = std::stoi(argv[++i]); - } else if (arg == "-ot" || arg == "--offset-t") { - params.offset_t_ms = std::stoi(argv[++i]); - } else if (arg == "-on" || arg == "--offset-n") { - params.offset_n = std::stoi(argv[++i]); - } else if (arg == "-d" || arg == "--duration") { - params.duration_ms = std::stoi(argv[++i]); - } else if (arg == "-mc" || arg == "--max-context") { - params.max_context = std::stoi(argv[++i]); - } else if (arg == "-ml" || arg == "--max-len") { - params.max_len = std::stoi(argv[++i]); - } else if (arg == "-bo" || arg == "--best-of") { - params.best_of = std::stoi(argv[++i]); - } else if (arg == "-bs" || arg == "--beam-size") { - params.beam_size = std::stoi(argv[++i]); - } else if (arg == "-wt" || arg == "--word-thold") { - params.word_thold = std::stof(argv[++i]); - } else if (arg == "-et" || arg == "--entropy-thold") { - params.entropy_thold = std::stof(argv[++i]); - } else if (arg == "-lpt" || arg == "--logprob-thold") { - params.logprob_thold = std::stof(argv[++i]); - } - // else if (arg == "-su" || arg == "--speed-up") { params.speed_up - // = true; } - else if (arg == "-debug" || arg == "--debug-mode") { - params.debug_mode = true; - } else if (arg == "-tr" || arg == "--translate") { - params.translate = true; - } else if (arg == "-di" || arg == "--diarize") { - params.diarize = true; - } else if (arg == "-tdrz" || arg == "--tinydiarize") { - params.tinydiarize = true; - } else if (arg == "-sow" || arg == "--split-on-word") { - params.split_on_word = true; - } else if (arg == "-nf" || arg == "--no-fallback") { - params.no_fallback = true; - } else if (arg == "-fp" || arg == "--font-path") { - params.font_path = argv[++i]; - } else if (arg == "-ps" || arg == "--print-special") { - params.print_special = true; - } else if (arg == "-pc" || arg == "--print-colors") { - params.print_colors = true; - } else if (arg == "-pr" || arg == "--print-realtime") { - params.print_realtime = true; - } else if (arg == "-pp" || arg == "--print-progress") { - params.print_progress = true; - } else if (arg == "-nt" || arg == "--no-timestamps") { - params.no_timestamps = true; - } else if (arg == "-l" || arg == "--language") { - params.language = argv[++i]; - } else if (arg == "-dl" || arg == "--detect-language") { - params.detect_language = true; - } else if (arg == "--prompt") { - params.prompt = 
argv[++i]; - } else if (arg == "-m" || arg == "--model") { - params.model = argv[++i]; - } else if (arg == "-oved" || arg == "--ov-e-device") { - params.openvino_encode_device = argv[++i]; - } else if (arg == "-ng" || arg == "--no-gpu") { - params.use_gpu = false; - } else if (arg == "--convert") { - params.ffmpeg_converter = true; - } else { - fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); - whisper_print_usage(argc, argv, params); - exit(0); - } - } - - return true; -} - -void check_ffmpeg_availibility() { - int result = system("ffmpeg -version"); - - if (result == 0) { - std::cout << "ffmpeg is available." << std::endl; - } else { - // ffmpeg is not available - std::cout << "ffmpeg is not found. Please ensure that ffmpeg is installed "; - std::cout << "and that its executable is included in your system's PATH. "; - exit(0); - } -} - -bool convert_to_wav(const std::string& temp_filename, std::string& error_resp) { - std::ostringstream cmd_stream; - std::string converted_filename_temp = temp_filename + "_temp.wav"; - cmd_stream << "ffmpeg -i \"" << temp_filename - << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" - << converted_filename_temp << "\" 2>&1"; - std::string cmd = cmd_stream.str(); - - int status = std::system(cmd.c_str()); - if (status != 0) { - error_resp = "{\"error\":\"FFmpeg conversion failed.\"}"; - return false; - } - - // Remove the original file - if (remove(temp_filename.c_str()) != 0) { - error_resp = "{\"error\":\"Failed to remove the original file.\"}"; - return false; - } - - // Rename the temporary file to match the original filename - if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) { - error_resp = "{\"error\":\"Failed to rename the temporary file.\"}"; - return false; - } - return true; -} - -void whisper_print_progress_callback(struct whisper_context* /*ctx*/, - struct whisper_state* /*state*/, - int progress, void* user_data) { - int progress_step = - ((whisper_print_user_data*)user_data)->params->progress_step; - int* progress_prev = &(((whisper_print_user_data*)user_data)->progress_prev); - if (progress >= *progress_prev + progress_step) { - *progress_prev += progress_step; - fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress); - } -} - -void whisper_print_segment_callback(struct whisper_context* ctx, - struct whisper_state* /*state*/, int n_new, - void* user_data) { - const auto& params = *((whisper_print_user_data*)user_data)->params; - const auto& pcmf32s = *((whisper_print_user_data*)user_data)->pcmf32s; - - const int n_segments = whisper_full_n_segments(ctx); - - std::string speaker = ""; - - int64_t t0 = 0; - int64_t t1 = 0; - - // print the last n_new segments - const int s0 = n_segments - n_new; - - if (s0 == 0) { - printf("\n"); - } - - for (int i = s0; i < n_segments; i++) { - if (!params.no_timestamps || params.diarize) { - t0 = whisper_full_get_segment_t0(ctx, i); - t1 = whisper_full_get_segment_t1(ctx, i); - } - - if (!params.no_timestamps) { - printf("[%s --> %s] ", to_timestamp(t0).c_str(), - to_timestamp(t1).c_str()); - } - - if (params.diarize && pcmf32s.size() == 2) { - speaker = estimate_diarization_speaker(pcmf32s, t0, t1); - } - - if (params.print_colors) { - for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) { - if (params.print_special == false) { - const whisper_token id = whisper_full_get_token_id(ctx, i, j); - if (id >= whisper_token_eot(ctx)) { - continue; - } - } - - const char* text = whisper_full_get_token_text(ctx, i, j); - const float p = whisper_full_get_token_p(ctx, i, j); - - 
const int col = (std::max)( - 0, (std::min)((int)k_colors.size() - 1, - (int)((std::pow)(p, 3) * float(k_colors.size())))); - - printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, - "\033[0m"); - } - } else { - const char* text = whisper_full_get_segment_text(ctx, i); - - printf("%s%s", speaker.c_str(), text); - } - - if (params.tinydiarize) { - if (whisper_full_get_segment_speaker_turn_next(ctx, i)) { - printf("%s", params.tdrz_speaker_turn.c_str()); - } - } - - // with timestamps or speakers: each segment on new line - if (!params.no_timestamps || params.diarize) { - printf("\n"); - } - fflush(stdout); - } -} - -whisper_server_context::~whisper_server_context() { - if (ctx) { - whisper_print_timings(ctx); - whisper_free(ctx); - ctx = nullptr; - } -} - -bool whisper_server_context::load_model(std::string& model_path) { - whisper_mutex.lock(); - - // clean up - whisper_free(ctx); - - // whisper init - ctx = whisper_init_from_file_with_params(model_path.c_str(), cparams); - - // TODO perhaps load prior model here instead of exit - if (ctx == nullptr) { - whisper_mutex.unlock(); - return false; - } - - // initialize openvino encoder. this has no effect on whisper.cpp builds that - // don't have OpenVINO configured - whisper_ctx_init_openvino_encoder( - ctx, nullptr, params.openvino_encode_device.c_str(), nullptr); - - // check if the model is in the file system - whisper_mutex.unlock(); - return true; -} - -std::string whisper_server_context::inference( - std::string& input_file_path, std::string language, std::string prompt, - std::string response_format, float temperature, bool translate) { - // acquire whisper model mutex lock - whisper_mutex.lock(); - - // audio arrays - std::vector pcmf32; // mono-channel F32 PCM - std::vector> pcmf32s; // stereo-channel F32 PCM - - // if file is not wav, convert to wav - if (params.ffmpeg_converter) { - std::string error_resp = "Failed to execute ffmpeg command converting " + - input_file_path + " to wav"; - const bool is_converted = convert_to_wav(input_file_path, error_resp); - if (!is_converted) { - whisper_mutex.unlock(); - LOG_ERROR << error_resp; - throw std::runtime_error(error_resp); - } - } - - // read wav content into pcmf32 - if (!read_wav(input_file_path, pcmf32, pcmf32s, params.diarize)) { - std::string error_resp = "Failed to read WAV file " + input_file_path; - LOG_ERROR << error_resp; - whisper_mutex.unlock(); - throw std::runtime_error(error_resp); - } - - printf("Successfully loaded %s\n", input_file_path.c_str()); - - params.translate = translate; - params.language = language; - params.response_format = response_format; - if (!whisper_is_multilingual(ctx)) { - if (params.language != "en" || params.translate) { - params.language = "en"; - params.translate = false; - LOG_WARN - << "Model " << model_id - << " is not multilingual, ignoring language and translation options"; - } - } - if (params.detect_language) { - params.language = "auto"; - } - - // print some processing info - std::string processing_info = - "Model " + model_id + " processing " + input_file_path + " (" + - std::to_string(pcmf32.size()) + " samples, " + - std::to_string(float(pcmf32.size()) / WHISPER_SAMPLE_RATE) + " sec), " + - std::to_string(params.n_threads) + " threads, " + - std::to_string(params.n_processors) + - " processors, lang = " + params.language + - ", task = " + (params.translate ? "translate" : "transcribe") + ", " + - (params.tinydiarize ? "tdrz = 1, " : "") + - (params.no_timestamps ? 
"timestamps = 0" : "timestamps = 1"); - LOG_INFO << processing_info; - - // run the inference - { - std::string msg = "Running whisper.cpp inference of model " + model_id + - " on " + input_file_path; - LOG_INFO << msg; - whisper_full_params wparams = - whisper_full_default_params(WHISPER_SAMPLING_GREEDY); - - wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH - : WHISPER_SAMPLING_GREEDY; - - wparams.print_realtime = false; - wparams.print_progress = params.print_progress; - wparams.print_timestamps = !params.no_timestamps; - wparams.print_special = params.print_special; - wparams.translate = params.translate; - wparams.language = params.language.c_str(); - wparams.detect_language = params.detect_language; - wparams.n_threads = params.n_threads; - wparams.n_max_text_ctx = - params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx; - wparams.offset_ms = params.offset_t_ms; - wparams.duration_ms = params.duration_ms; - - wparams.thold_pt = params.word_thold; - wparams.max_len = params.max_len == 0 ? 60 : params.max_len; - wparams.split_on_word = params.split_on_word; - - wparams.speed_up = params.speed_up; - wparams.debug_mode = params.debug_mode; - - wparams.tdrz_enable = params.tinydiarize; // [TDRZ] - - wparams.initial_prompt = prompt.c_str(); - - wparams.greedy.best_of = params.best_of; - wparams.beam_search.beam_size = params.beam_size; - - wparams.temperature = temperature; - wparams.temperature_inc = params.temperature_inc; - wparams.entropy_thold = params.entropy_thold; - wparams.logprob_thold = params.logprob_thold; - - wparams.no_timestamps = params.no_timestamps; - - whisper_print_user_data user_data = {¶ms, &pcmf32s, 0}; - - // this callback is called on each new segment - if (params.print_realtime) { - wparams.new_segment_callback = whisper_print_segment_callback; - wparams.new_segment_callback_user_data = &user_data; - } - - if (wparams.print_progress) { - wparams.progress_callback = whisper_print_progress_callback; - wparams.progress_callback_user_data = &user_data; - } - - // examples for abort mechanism - // in examples below, we do not abort the processing, but we could if the - // flag is set to true - - // the callback is called before every encoder run - if it returns false, - // the processing is aborted - { - static bool is_aborted = - false; // NOTE: this should be atomic to avoid data race - - wparams.encoder_begin_callback = [](struct whisper_context* /*ctx*/, - struct whisper_state* /*state*/, - void* user_data) { - bool is_aborted = *(bool*)user_data; - return !is_aborted; - }; - wparams.encoder_begin_callback_user_data = &is_aborted; - } - - // the callback is called before every computation - if it returns true, the - // computation is aborted - { - static bool is_aborted = - false; // NOTE: this should be atomic to avoid data race - - wparams.abort_callback = [](void* user_data) { - bool is_aborted = *(bool*)user_data; - return is_aborted; - }; - wparams.abort_callback_user_data = &is_aborted; - } - - if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), - params.n_processors) != 0) { - std::string error_resp = "Failed to process audio"; - LOG_ERROR << error_resp; - whisper_mutex.unlock(); - throw std::runtime_error(error_resp); - } - } - - // return results to user - std::string result; - if (params.response_format == text_format) { - result = output_str(ctx, params, pcmf32s); - } else if (params.response_format == srt_format) { - std::stringstream ss; - const int n_segments = whisper_full_n_segments(ctx); - for 
(int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - std::string speaker = ""; - - if (params.diarize && pcmf32s.size() == 2) { - speaker = estimate_diarization_speaker(pcmf32s, t0, t1); - } - - ss << i + 1 + params.offset_n << "\n"; - ss << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n"; - ss << speaker << text << "\n\n"; - } - result = ss.str(); - } else if (params.response_format == vtt_format) { - std::stringstream ss; - - ss << "WEBVTT\n\n"; - - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - const char* text = whisper_full_get_segment_text(ctx, i); - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - std::string speaker = ""; - - if (params.diarize && pcmf32s.size() == 2) { - speaker = estimate_diarization_speaker(pcmf32s, t0, t1, true); - speaker.insert(0, ""); - } - - ss << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n"; - ss << speaker << text << "\n\n"; - } - result = ss.str(); - } else if (params.response_format == vjson_format) { - /* try to match openai/whisper's Python format */ - std::string results = output_str(ctx, params, pcmf32s); - json jres = json{{"text", results}}; - const int n_segments = whisper_full_n_segments(ctx); - for (int i = 0; i < n_segments; ++i) { - json segment = json{ - {"id", i}, - {"text", whisper_full_get_segment_text(ctx, i)}, - }; - - if (!params.no_timestamps) { - segment["start"] = whisper_full_get_segment_t0(ctx, i) * 0.01; - segment["end"] = whisper_full_get_segment_t1(ctx, i) * 0.01; - } - - const int n_tokens = whisper_full_n_tokens(ctx, i); - for (int j = 0; j < n_tokens; ++j) { - whisper_token_data token = whisper_full_get_token_data(ctx, i, j); - if (token.id >= whisper_token_eot(ctx)) { - continue; - } - - segment["tokens"].push_back(token.id); - json word = json{{"word", whisper_full_get_token_text(ctx, i, j)}}; - if (!params.no_timestamps) { - word["start"] = token.t0 * 0.01; - word["end"] = token.t1 * 0.01; - } - word["probability"] = token.p; - segment["words"].push_back(word); - } - jres["segments"].push_back(segment); - } - result = jres.dump(-1, ' ', false, json::error_handler_t::replace); - } else { - std::string results = output_str(ctx, params, pcmf32s); - json jres = json{{"text", results}}; - result = jres.dump(-1, ' ', false, json::error_handler_t::replace); - } - - // reset params to thier defaults - params = default_params; - - // return whisper model mutex lock - whisper_mutex.unlock(); - LOG_INFO << "Successfully processed " << input_file_path << ": " << result; - - return result; -} diff --git a/cortex-cpp/context/whisper_server_context.h b/cortex-cpp/context/whisper_server_context.h deleted file mode 100644 index da29e4d9f..000000000 --- a/cortex-cpp/context/whisper_server_context.h +++ /dev/null @@ -1,165 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include - -#include "whisper.h" - -// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9] -// Lowest is red, middle is yellow, highest is green. 
-const std::vector k_colors = { - "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", - "\033[38;5;220m", "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", - "\033[38;5;118m", "\033[38;5;82m", -}; - -// output formats -const std::string json_format = "json"; -const std::string text_format = "text"; -const std::string srt_format = "srt"; -const std::string vjson_format = "verbose_json"; -const std::string vtt_format = "vtt"; - -#define COMMON_SAMPLE_RATE 16000 - -struct whisper_params { - int32_t n_threads = - (std::min)(4, (int32_t)std::thread::hardware_concurrency()); - int32_t n_processors = 1; - int32_t offset_t_ms = 0; - int32_t offset_n = 0; - int32_t duration_ms = 0; - int32_t progress_step = 5; - int32_t max_context = -1; - int32_t max_len = 0; - int32_t best_of = 2; - int32_t beam_size = -1; - - float word_thold = 0.01f; - float entropy_thold = 2.40f; - float logprob_thold = -1.00f; - float temperature = 0.00f; - float temperature_inc = 0.20f; - - bool speed_up = false; - bool debug_mode = false; - bool translate = false; - bool detect_language = false; - bool diarize = false; - bool tinydiarize = false; - bool split_on_word = false; - bool no_fallback = false; - bool print_special = false; - bool print_colors = false; - bool print_realtime = false; - bool print_progress = false; - bool no_timestamps = false; - bool use_gpu = true; - bool ffmpeg_converter = false; - - std::string language = "en"; - std::string prompt = ""; - std::string font_path = - "/System/Library/Fonts/Supplemental/Courier New Bold.ttf"; - std::string model = "models/ggml-base.en.bin"; - - std::string response_format = json_format; - - // [TDRZ] speaker turn string - std::string tdrz_speaker_turn = - " [SPEAKER_TURN]"; // TODO: set from command line - - std::string openvino_encode_device = "CPU"; -}; - -// Read WAV audio file and store the PCM data into pcmf32 -// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE -// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain -// 2 channel PCM -bool read_wav(const std::string& fname, std::vector& pcmf32, - std::vector>& pcmf32s, bool stereo); - -std::string output_str(struct whisper_context* ctx, - const whisper_params& params, - std::vector> pcmf32s); - -std::string estimate_diarization_speaker( - std::vector> pcmf32s, int64_t t0, int64_t t1, - bool id_only = false); - -// 500 -> 00:05.000 -// 6000 -> 01:00.000 -std::string to_timestamp(int64_t t, bool comma = false); - -int timestamp_to_sample(int64_t t, int n_samples); - -bool is_file_exist(const char* fileName); - -void whisper_print_usage(int /*argc*/, char** argv, - const whisper_params& params); - -bool whisper_params_parse(int argc, char** argv, whisper_params& params); - -void check_ffmpeg_availibility(); - -bool convert_to_wav(const std::string& temp_filename, std::string& error_resp); - -void whisper_print_progress_callback(struct whisper_context* /*ctx*/, - struct whisper_state* /*state*/, - int progress, void* user_data); - -void whisper_print_segment_callback(struct whisper_context* ctx, - struct whisper_state* /*state*/, int n_new, - void* user_data); - -struct whisper_print_user_data { - const whisper_params* params; - - const std::vector>* pcmf32s; - int progress_prev; -}; - -struct whisper_server_context { - whisper_params params; - whisper_params default_params; - std::mutex whisper_mutex; - std::string model_id; - - struct whisper_context_params cparams; - struct whisper_context* ctx = nullptr; - - whisper_server_context() = default; // 
add this line - - // Constructor - whisper_server_context(const std::string& model_id) { - this->model_id = model_id; - this->cparams = whisper_context_params(); - this->ctx = nullptr; - // store default params so we can reset after each inference request - this->default_params = whisper_params(); - this->params = whisper_params(); - } - - // Move constructor - whisper_server_context(whisper_server_context&& other) noexcept - : params(std::move(other.params)), - default_params(std::move(other.default_params)), - whisper_mutex() // std::mutex is not movable, so we initialize a new one - , - model_id(std::move(other.model_id)), - cparams(std::move(other.cparams)), - ctx(std::exchange( - other.ctx, - nullptr)) // ctx is a raw pointer, so we use std::exchange - {} - - bool load_model(std::string& model_path); - - std::string inference(std::string& input_file_path, std::string languague, - std::string prompt, std::string response_format, - float temperature, bool translate); - - ~whisper_server_context(); -}; \ No newline at end of file diff --git a/cortex-cpp/controllers/audio.cc b/cortex-cpp/controllers/audio.cc deleted file mode 100644 index 91fd76d5b..000000000 --- a/cortex-cpp/controllers/audio.cc +++ /dev/null @@ -1,300 +0,0 @@ -#include "audio.h" - -#include "utils/nitro_utils.h" -#include "whisper.h" - -using namespace v1; - -audio::audio() { - whisper_print_system_info(); -}; - -audio::~audio() {} - -std::optional audio::ParseModelId( - const std::shared_ptr& jsonBody, - const std::function& callback) { - if (!jsonBody->isMember("model_id")) { - LOG_INFO << "No model_id found in request body"; - Json::Value jsonResp; - jsonResp["message"] = "No model_id found in request body"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k400BadRequest); - callback(resp); - return std::nullopt; // Signal that an error occurred - } - - return (*jsonBody)["model_id"].asString(); -} - -void audio::LoadModel(const HttpRequestPtr& req, - std::function&& callback) { - const auto jsonBody = req->getJsonObject(); - auto optional_model_id = ParseModelId(jsonBody, callback); - if (!optional_model_id) { - return; - } - std::string model_id = *optional_model_id; - - // Check if model is already loaded - if (whispers.find(model_id) != whispers.end()) { - std::string error_msg = "Model " + model_id + " already loaded"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k409Conflict); - callback(resp); - return; - } - - // Model not loaded, load it - // Parse model path from request - std::string model_path = (*jsonBody)["model_path"].asString(); - if (!is_file_exist(model_path.c_str())) { - std::string error_msg = "Model " + model_path + " not found"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k404NotFound); - callback(resp); - return; - } - - whisper_server_context whisper = whisper_server_context(model_id); - bool model_loaded = whisper.load_model(model_path); - // If model failed to load, return a 500 error - if (!model_loaded) { - whisper.~whisper_server_context(); - std::string error_msg = "Failed to load model " + model_path; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k500InternalServerError); - callback(resp); - 
return; - } - - // Warm up the model - // Parse warm up audio path from request - if (jsonBody->isMember("warm_up_audio_path")) { - std::string warm_up_msg = "Warming up model " + model_id; - LOG_INFO << warm_up_msg; - std::string warm_up_audio_path = - (*jsonBody)["warm_up_audio_path"].asString(); - // Return 400 error if warm up audio path is not found - if (!is_file_exist(warm_up_audio_path.c_str())) { - std::string error_msg = - "Warm up audio " + warm_up_audio_path + - " not found, please provide a valid path or don't specify it at all"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k400BadRequest); - callback(resp); - return; - } else { - LOG_INFO << "Warming up model " << model_id << " with audio " - << warm_up_audio_path << " ..."; - std::string warm_up_result = whisper.inference(warm_up_audio_path, "en", - "", text_format, 0, false); - LOG_INFO << "Warm up model " << model_id << " completed"; - } - } else { - LOG_INFO << "No warm up audio provided, skipping warm up"; - } - - // Model loaded successfully, add it to the map of loaded models - // and return a 200 response - whispers.emplace(model_id, std::move(whisper)); - Json::Value jsonResp; - std::string success_msg = "Model " + model_id + " loaded successfully"; - jsonResp["message"] = success_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k200OK); - callback(resp); - return; -} - -void audio::UnloadModel( - const HttpRequestPtr& req, - std::function&& callback) { - const auto& jsonBody = req->getJsonObject(); - auto optional_model_id = ParseModelId(jsonBody, callback); - if (!optional_model_id) { - return; - } - std::string model_id = *optional_model_id; - - // If model is not loaded, return a 404 error - if (whispers.find(model_id) == whispers.end()) { - std::string error_msg = - "Model " + model_id + - " has not been loaded, please load that model into nitro"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k404NotFound); - callback(resp); - return; - } - - // Model loaded, unload it - whispers[model_id].~whisper_server_context(); - whispers.erase(model_id); - - // Return a 200 response - Json::Value jsonResp; - std::string success_msg = "Model " + model_id + " unloaded successfully"; - LOG_INFO << success_msg; - jsonResp["message"] = success_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k200OK); - callback(resp); - return; -} - -void audio::ListModels(const HttpRequestPtr& req, - std::function&& callback) { - // Return a list of all loaded models - Json::Value jsonResp; - Json::Value models; - for (auto const& model : whispers) { - models.append(model.first); - } - jsonResp["models"] = models; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k200OK); - callback(resp); - return; -} - -void audio::TranscriptionImpl( - const HttpRequestPtr& req, - std::function&& callback, bool translate) { - MultiPartParser partParser; - Json::Value jsonResp; - if (partParser.parse(req) != 0 || partParser.getFiles().size() != 1) { - auto resp = HttpResponse::newHttpResponse(); - resp->setBody("Must have exactly one file"); - resp->setStatusCode(k403Forbidden); - callback(resp); - return; - } - auto& file = partParser.getFiles()[0]; - const auto& formFields = partParser.getParameters(); - - // Check 
if model_id are present in the request. If not, return a 400 error - if (formFields.find("model_id") == formFields.end()) { - LOG_INFO << "No model_id found in request body"; - Json::Value jsonResp; - jsonResp["message"] = "No model_id found in request body"; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k400BadRequest); - callback(resp); - return; - } - - std::string model_id = formFields.at("model_id"); - - // Parse all other optional parameters from the request - std::string language = formFields.find("language") != formFields.end() - ? formFields.at("language") - : "en"; - std::string prompt = formFields.find("prompt") != formFields.end() - ? formFields.at("prompt") - : ""; - std::string response_format = - formFields.find("response_format") != formFields.end() - ? formFields.at("response_format") - : json_format; - float temperature = formFields.find("temperature") != formFields.end() - ? std::stof(formFields.at("temperature")) - : 0; - - // Check if model is loaded. If not, return a 404 error - if (whispers.find(model_id) == whispers.end()) { - std::string error_msg = - "Model " + model_id + - " has not been loaded, please load that model into nitro"; - LOG_INFO << error_msg; - Json::Value jsonResp; - jsonResp["message"] = error_msg; - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k404NotFound); - callback(resp); - return; - } - - // Save input file to temp location - std::string temp_dir = - std::filesystem::temp_directory_path().string() + "/" + - std::to_string(std::chrono::duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count()); - // Create the directory - std::filesystem::create_directory(temp_dir); - // Save the file to the directory, with its original name - std::string temp_file_path = temp_dir + "/" + file.getFileName(); - file.saveAs(temp_file_path); - - // Run inference - std::string result; - try { - result = - whispers[model_id].inference(temp_file_path, language, prompt, - response_format, temperature, translate); - } catch (const std::exception& e) { - std::remove(temp_file_path.c_str()); - Json::Value jsonResp; - jsonResp["message"] = e.what(); - auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp); - resp->setStatusCode(k500InternalServerError); - callback(resp); - return; - } - // TODO: Need to remove the entire temp directory, not just the file - std::remove(temp_file_path.c_str()); - - auto resp = nitro_utils::nitroHttpResponse(); - resp->setBody(result); - resp->setStatusCode(k200OK); - // Set content type based on response format - if (response_format == json_format || response_format == vjson_format) { - resp->addHeader("Content-Type", "application/json"); - } else if (response_format == text_format) { - resp->addHeader("Content-Type", "text/html"); - } else if (response_format == srt_format) { - resp->addHeader("Content-Type", "application/x-subrip"); - } else if (response_format == vtt_format) { - resp->addHeader("Content-Type", "text/vtt"); - } - callback(resp); - return; -} - -void audio::ModelStatus( - const HttpRequestPtr& req, - std::function&& callback) { - auto resp = nitro_utils::nitroHttpResponse(); - resp->setStatusCode(k200OK); - resp->setContentTypeCode(drogon::CT_APPLICATION_JSON); - resp->setBody("Unimplemented"); - callback(resp); -} - -void audio::CreateTranscription( - const HttpRequestPtr& req, - std::function&& callback) { - return TranscriptionImpl(req, std::move(callback), false); -} - -void audio::CreateTranslation( - const HttpRequestPtr& 
req, - std::function&& callback) { - return TranscriptionImpl(req, std::move(callback), true); -} \ No newline at end of file diff --git a/cortex-cpp/controllers/audio.h b/cortex-cpp/controllers/audio.h deleted file mode 100644 index 19b1efb6d..000000000 --- a/cortex-cpp/controllers/audio.h +++ /dev/null @@ -1,74 +0,0 @@ -#pragma once - -#include -#include -#include -#include "common/base.h" - -#define DR_WAV_IMPLEMENTATION -#include "utils/dr_wav.h" - -#include "utils/json.hpp" - -// Whisper Context -#include "context/whisper_server_context.h" - -using json = nlohmann::ordered_json; - -using namespace drogon; - -namespace v1 { - -class audio : public drogon::HttpController