From df7d27d9583c975be1046c1f49252bc6528bf2ba Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Sat, 13 Apr 2024 05:08:08 +0700 Subject: [PATCH 1/7] fix: embed, log for testing --- context/llama_server_context.h | 185 ++++++++++++++++++++------------- controllers/llamaCPP.cc | 4 +- 2 files changed, 113 insertions(+), 76 deletions(-) diff --git a/context/llama_server_context.h b/context/llama_server_context.h index 0aedc494c..5727897ca 100644 --- a/context/llama_server_context.h +++ b/context/llama_server_context.h @@ -1,15 +1,15 @@ +#include +#include #include #include -#include -#include // External #include "clip.h" #include "common.h" #include "llama.h" -#include "utils/json.hpp" -#include "stb_image.h" #include "llava.h" +#include "stb_image.h" +#include "utils/json.hpp" #if defined(_WIN32) #define NOMINMAX @@ -443,20 +443,22 @@ struct llama_client_slot { } void print_timings() const { - LOG_TEE("\n"); - LOG_TEE( - "%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per " - "token, %8.2f tokens per second)\n", - __func__, t_prompt_processing, num_prompt_tokens_processed, - t_prompt_processing / num_prompt_tokens_processed, - 1e3 / t_prompt_processing * num_prompt_tokens_processed); - LOG_TEE( - "%s: eval time = %10.2f ms / %5d runs (%8.2f ms per " - "token, %8.2f tokens per second)\n", - __func__, t_token_generation, n_decoded, t_token_generation / n_decoded, - 1e3 / t_token_generation * n_decoded); - LOG_TEE("%s: total time = %10.2f ms\n", __func__, - t_prompt_processing + t_token_generation); + LOG_DEBUG << __func__ << ": prompt eval time = " << t_prompt_processing + << "ms / " << num_prompt_tokens_processed << " tokens (" + << t_prompt_processing / num_prompt_tokens_processed + << " ms per " + "token, " + << 1e3 / t_prompt_processing * num_prompt_tokens_processed + << " tokens per second)"; + LOG_DEBUG << __func__ << ": eval time = " << t_token_generation + << " ms / " << n_decoded << " runs (" + << t_token_generation / n_decoded + << " ms per " + "token, " + << 1e3 / t_token_generation * n_decoded + << " tokens per second)\n"; + LOG_DEBUG << __func__ << ": total time = " + << t_prompt_processing + t_token_generation << " ms"; } }; @@ -516,7 +518,7 @@ struct llama_server_context { params = params_; if (!params.mmproj.empty()) { multimodal = true; - LOG_TEE("Multi Modal Mode Enabled"); + LOG_DEBUG << "Multi Modal Mode Enabled"; clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/1); if (clp_ctx == nullptr) { LOG_ERROR_LLAMA("unable to load clip model", @@ -532,7 +534,8 @@ struct llama_server_context { std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == nullptr) { - LOG_ERROR_LLAMA("llama.cpp unable to load model", {{"model", params.model}}); + LOG_ERROR_LLAMA("llama.cpp unable to load model", + {{"model", params.model}}); return false; } @@ -540,11 +543,13 @@ struct llama_server_context { const int n_embd_clip = clip_n_mmproj_embd(clp_ctx); const int n_embd_llm = llama_n_embd(model); if (n_embd_clip != n_embd_llm) { - LOG_TEE( - "%s: embedding dim of the multimodal projector (%d) is not " - "equal to that of LLaMA (%d). Make sure that you use the " - "correct mmproj file.\n", - __func__, n_embd_clip, n_embd_llm); + LOG_DEBUG << __func__ << ": embedding dim of the multimodal projector (" + << n_embd_clip + << ") is not " + "equal to that of LLaMA (" + << n_embd_llm + << "). 
Make sure that you use the " + "correct mmproj file."; llama_free(ctx); llama_free_model(model); return false; @@ -570,7 +575,7 @@ struct llama_server_context { const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_TEE("Available slots:\n"); + LOG_INFO << "Available slots: "; for (int i = 0; i < params.n_parallel; i++) { llama_client_slot slot; @@ -578,14 +583,18 @@ struct llama_server_context { slot.n_ctx = n_ctx_slot; slot.reset(); - LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot); + LOG_INFO << " -> Slot " << slot.id << " - max context: " << n_ctx_slot; slots.push_back(slot); } try { batch = llama_batch_init(n_ctx, 0, params.n_parallel); } catch (const std::exception& e) { - LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata" , {{"exception", e.what()}, {"n_tokens_alloc", n_ctx}, {"embd", 0}, {"n_seq_max", params.n_parallel}}); + LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata", + {{"exception", e.what()}, + {"n_tokens_alloc", n_ctx}, + {"embd", 0}, + {"n_seq_max", params.n_parallel}}); } // empty system prompt @@ -797,11 +806,11 @@ struct llama_server_context { img_sl.img_data = clip_image_u8_init(); if (!clip_image_load_from_bytes( image_buffer.data(), image_buffer.size(), img_sl.img_data)) { - LOG_TEE("slot %i - failed to load image [id: %i]\n", slot->id, - img_sl.id); + LOG_DEBUG << "slot " << slot->id + << " - failed to load image [id: " << img_sl.id << "]"; return false; } - LOG_TEE("slot %i - loaded image\n", slot->id); + LOG_INFO << "slot " << slot->id << " - loaded image"; img_sl.request_encode_image = true; slot->images.push_back(img_sl); } @@ -832,12 +841,13 @@ struct llama_server_context { } } if (!found) { - LOG_TEE("ERROR: Image with id: %i, not found.\n", img_id); + LOG_INFO << "ERROR: Image with id: " << img_id + << ", not found.\n"; slot->images.clear(); return false; } } catch (const std::invalid_argument& e) { - LOG_TEE("Invalid image number id in prompt\n"); + LOG_INFO << "Invalid image number id in prompt"; slot->images.clear(); return false; } @@ -860,7 +870,8 @@ struct llama_server_context { all_slots_are_idle = false; - LOG_TEE("slot %i is processing [task id: %i]\n", slot->id, slot->task_id); + LOG_INFO << "slot " << slot->id + << " is processing [task id: " << slot->task_id << "]"; return true; } @@ -882,7 +893,7 @@ struct llama_server_context { } if (llama_decode(ctx, batch) != 0) { - LOG_TEE("%s: llama_decode() failed\n", __func__); + LOG_WARN << __func__ << ": llama_decode() failed"; return; } @@ -891,7 +902,7 @@ struct llama_server_context { llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); } - LOG_TEE("system prompt updated\n"); + LOG_INFO << "system prompt updated"; system_need_update = false; } @@ -1055,7 +1066,7 @@ struct llama_server_context { if (!llava_image_embed_make_with_clip_img( clp_ctx, params.n_threads, img.img_data, &img.image_embedding, &img.image_tokens)) { - LOG_TEE("Error processing the given image"); + LOG_DEBUG << "Error processing the given image"; return false; } @@ -1253,12 +1264,37 @@ struct llama_server_context { {"embedding", std::vector(n_embd, 0.0f)}, }; } else { - const float* data = llama_get_embeddings(ctx); - std::vector embedding(data, data + n_embd); + std::vector embd_res(n_embd, 0.0f); + + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + continue; + } + + const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + } + + if (embd == NULL) 
{ + LOG_ERROR << "failed to get embeddings" + << " token " << batch.token[i] << ", seq_id " + << batch.seq_id[i][0]; + + res.result_json = json{ + {"embedding", std::vector(n_embd, 0.0f)}, + }; + + continue; + } + + llama_embd_normalize(embd, embd_res.data(), n_embd); + } res.result_json = json{ - {"embedding", embedding}, + {"embedding", embd_res}, }; } + queue_results.push_back(res); condition_results.notify_all(); } @@ -1341,7 +1377,7 @@ struct llama_server_context { 0, // unused }; if (llama_decode(ctx, batch_view)) { - LOG_TEE("%s : failed to eval\n", __func__); + LOG_INFO << __func__ << " : failed to eval\n"; return false; } } @@ -1361,7 +1397,7 @@ struct llama_server_context { 0, }; if (llama_decode(ctx, batch_img)) { - LOG_TEE("%s : failed to eval image\n", __func__); + LOG_INFO << __func__ << " : failed to eval image"; return false; } slot.n_past += n_eval; @@ -1429,7 +1465,7 @@ struct llama_server_context { llama_client_slot* slot = get_slot(json_value(task.data, "slot_id", -1)); if (slot == nullptr) { - LOG_TEE("slot unavailable\n"); + LOG_INFO << "slot unavailable"; // send error result send_error(task, "slot unavailable"); return; @@ -1499,7 +1535,7 @@ struct llama_server_context { // update the system prompt wait until all slots are idle state if (system_need_update && all_slots_are_idle) { - LOG_TEE("updating system prompt\n"); + LOG_DEBUG << "updating system prompt"; update_system_prompt(); } @@ -1507,10 +1543,9 @@ struct llama_server_context { if (all_slots_are_idle) { if (system_prompt.empty() && clean_kv_cache) { - LOG_TEE( - "all slots are idle and system prompt is empty, clear the KV " - "cache\n"); - kv_cache_clear(); + LOG_INFO + << "all slots are idle and system prompt is empty, clear the KV " + "cache"; } // std::this_thread::sleep_for(std::chrono::milliseconds(5)); // TODO: Need to implement queueing using CV for better performance @@ -1528,15 +1563,13 @@ struct llama_server_context { const int n_left = slot.n_past - slot.params.n_keep - 1; const int n_discard = n_left / 2; - LOG_TEE( - "slot %d: context shift - n_keep = %d, n_left = %d, n_discard " - "= %d\n", - slot.id, slot.params.n_keep, n_left, n_discard); + LOG_INFO << "slot " << slot.id + << " context shift - n_keep = " << slot.params.n_keep + << ", n_left = " << n_left << ", n_discard: " << n_discard; llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1, slot.params.n_keep + n_discard + 1); - llama_kv_cache_seq_add(ctx, slot.id, - slot.params.n_keep + 1 + n_discard, - slot.n_past, -n_discard); + llama_kv_cache_seq_add(ctx, slot.id, slot.params.n_keep + 1 + n_discard, + slot.n_past, -n_discard); for (size_t i = slot.params.n_keep + 1 + n_discard; i < slot.cache_tokens.size(); i++) { @@ -1565,8 +1598,8 @@ struct llama_server_context { slot.command = NONE; slot.t_last_used = ggml_time_us(); - LOG_TEE("slot %d released (%d tokens in cache)\n", slot.id, - (int)slot.cache_tokens.size()); + LOG_INFO << "slot " << slot.id << " released (" + << (int)slot.cache_tokens.size() << " tokens in cache)"; continue; } @@ -1699,12 +1732,13 @@ struct llama_server_context { slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; - LOG_TEE("slot %d : in cache: %i tokens | to process: %i tokens\n", - slot.id, slot.n_past, slot.num_prompt_tokens_processed); + LOG_INFO << "slot " << slot.id << " : in cache: " << slot.n_past + << " tokens | to process: " + << slot.num_prompt_tokens_processed << " tokens"; } - LOG_TEE("slot %d : kv cache rm - [%d, end)\n", slot.id, - (int)system_tokens.size() + slot.n_past); + 
LOG_INFO << "slot " << slot.id << " : kv cache rm - [" + << (int)system_tokens.size() + slot.n_past << ", end)"; llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); @@ -1713,10 +1747,9 @@ struct llama_server_context { if (slot.n_past == slot.num_prompt_tokens) { // we have to evaluate at least 1 token to generate logits. - LOG_TEE( - "slot %d : we have to evaluate at least 1 token to " - "generate logits\n", - slot.id); + LOG_INFO << "slot " << slot.id + << " : we have to evaluate at least 1 token to " + "generate logits"; slot.n_past--; } @@ -1745,7 +1778,7 @@ struct llama_server_context { } if (has_images && !ingest_images(slot, n_batch)) { - LOG_TEE("failed processing images\n"); + LOG_INFO << "failed processing images"; return false; } @@ -1785,15 +1818,17 @@ struct llama_server_context { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it // via the context size - LOG_TEE("%s : failed to decode the batch, n_batch = %d, ret = %d\n", - __func__, n_batch, ret); + LOG_INFO << __func__ + << " : failed to decode the batch, n_batch = " << n_batch + << ", ret = " << ret; return false; } - LOG_TEE( - "%s : failed to find free space in the KV cache, retrying with " - "smaller n_batch = %d\n", - __func__, n_batch / 2); + LOG_INFO + << __func__ + << " : failed to find free space in the KV cache, retrying with " + "smaller n_batch = " + << n_batch / 2; // retry with half the batch size to try to find a free slot in the KV // cache @@ -1870,7 +1905,8 @@ static void server_print_usage(const char* argv0, const gpt_params& params, " -tb N, --threads-batch N number of threads to use during batch " "and prompt processing (default: same as --threads)\n"); printf( - " -c N, --ctx-size N size of the prompt context (default: %d)\n", + " -c N, --ctx-size N size of the prompt context (default: " + "%d)\n", params.n_ctx); printf(" --rope-scaling {none,linear,yarn}\n"); printf( @@ -1988,7 +2024,8 @@ static void server_print_usage(const char* argv0, const gpt_params& params, "dynamic batching) (default: disabled)\n"); printf(" -spf FNAME, --system-prompt-file FNAME\n"); printf( - " set a file to load a system prompt (initial " + " set a file to load a system prompt " + "(initial " "prompt of all slots), this is useful for chat applications.\n"); printf( " --mmproj MMPROJ_FILE path to a multimodal projector file for " diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index e19a7f32b..951b325b5 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -147,7 +147,7 @@ llamaCPP::llamaCPP() : queue(new trantor::ConcurrentTaskQueue(llama.params.n_parallel, "llamaCPP")) { // Some default values for now below - log_disable(); // Disable the log to file feature, reduce bloat for + // log_disable(); // Disable the log to file feature, reduce bloat for // target // system () }; @@ -706,7 +706,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { LOG_INFO << "Started background task here!"; backgroundThread = std::thread(&llamaCPP::BackgroundTask, this); - WarmupModel(); + // WarmupModel(); return true; } From ff67a5be05ff575df1b3b89c35f8ff82380d7154 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Sat, 13 Apr 2024 14:32:29 +0700 Subject: [PATCH 2/7] fix: warmup model --- controllers/llamaCPP.cc | 24 +++++++++++++++--------- controllers/llamaCPP.h | 2 +- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index 951b325b5..0b36e02e0 100644 --- 
a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -145,25 +145,25 @@ std::string create_return_json(const std::string& id, const std::string& model, llamaCPP::llamaCPP() : queue(new trantor::ConcurrentTaskQueue(llama.params.n_parallel, - "llamaCPP")) { - // Some default values for now below - // log_disable(); // Disable the log to file feature, reduce bloat for - // target - // system () -}; + "llamaCPP")){ + // Some default values for now below + // log_disable(); // Disable the log to file feature, reduce bloat for + // target + // system () + }; llamaCPP::~llamaCPP() { StopBackgroundTask(); } -void llamaCPP::WarmupModel() { +void llamaCPP::WarmupModel(bool is_embedded_model) { json pseudo; LOG_INFO << "Warm-up model"; pseudo["prompt"] = "Hello"; pseudo["n_predict"] = 2; pseudo["stream"] = false; - const int task_id = llama.request_completion(pseudo, false, false, -1); + const int task_id = llama.request_completion(pseudo, false, is_embedded_model, -1); std::string completion_text; task_result result = llama.next_result(task_id); if (!result.error && result.stop) { @@ -606,6 +606,10 @@ void llamaCPP::LoadModel( bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { gpt_params params; + // For model like nomic-embed-text-v1.5.f16.gguf, etc, we need to warmup model with flag embedding = true. + // So we use this variable to differentiate with other models + // TODO: in case embedded model only, we should reject completion request from user? + bool is_embedded_model = false; // By default will setting based on number of handlers if (jsonBody) { if (!jsonBody->operator[]("mmproj").isNull()) { @@ -649,6 +653,8 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { params.n_gpu_layers = jsonBody->get("ngl", 100).asInt(); params.n_ctx = jsonBody->get("ctx_len", 2048).asInt(); + is_embedded_model = + !(*jsonBody)["embedding"].isNull() && (*jsonBody)["embedding"].asBool(); params.embedding = jsonBody->get("embedding", true).asBool(); // Check if n_parallel exists in jsonBody, if not, set to drogon_thread params.n_batch = jsonBody->get("n_batch", 512).asInt(); @@ -706,7 +712,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { LOG_INFO << "Started background task here!"; backgroundThread = std::thread(&llamaCPP::BackgroundTask, this); - // WarmupModel(); + WarmupModel(is_embedded_model); return true; } diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 900786c79..914caff60 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -101,7 +101,7 @@ class llamaCPP : public drogon::HttpController, void EmbeddingImpl(std::shared_ptr jsonBody, std::function&& callback); bool CheckModelLoaded(const std::function& callback); - void WarmupModel(); + void WarmupModel(bool is_embedded_model); void BackgroundTask(); void StopBackgroundTask(); }; From 4c0879482db1a83eab5e2a532d81639d627445e0 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Sat, 13 Apr 2024 14:35:44 +0700 Subject: [PATCH 3/7] fix: change log level --- context/llama_server_context.h | 38 +++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/context/llama_server_context.h b/context/llama_server_context.h index 5727897ca..841ec1108 100644 --- a/context/llama_server_context.h +++ b/context/llama_server_context.h @@ -575,7 +575,7 @@ struct llama_server_context { const int32_t n_ctx_slot = n_ctx / params.n_parallel; - LOG_INFO << "Available slots: "; + LOG_DEBUG << "Available slots: "; for (int i = 0; i < params.n_parallel; i++) { llama_client_slot slot; 
@@ -583,7 +583,7 @@ struct llama_server_context { slot.n_ctx = n_ctx_slot; slot.reset(); - LOG_INFO << " -> Slot " << slot.id << " - max context: " << n_ctx_slot; + LOG_DEBUG << " -> Slot " << slot.id << " - max context: " << n_ctx_slot; slots.push_back(slot); } @@ -810,7 +810,7 @@ struct llama_server_context { << " - failed to load image [id: " << img_sl.id << "]"; return false; } - LOG_INFO << "slot " << slot->id << " - loaded image"; + LOG_DEBUG << "slot " << slot->id << " - loaded image"; img_sl.request_encode_image = true; slot->images.push_back(img_sl); } @@ -841,13 +841,13 @@ struct llama_server_context { } } if (!found) { - LOG_INFO << "ERROR: Image with id: " << img_id + LOG_DEBUG << "ERROR: Image with id: " << img_id << ", not found.\n"; slot->images.clear(); return false; } } catch (const std::invalid_argument& e) { - LOG_INFO << "Invalid image number id in prompt"; + LOG_DEBUG << "Invalid image number id in prompt"; slot->images.clear(); return false; } @@ -870,7 +870,7 @@ struct llama_server_context { all_slots_are_idle = false; - LOG_INFO << "slot " << slot->id + LOG_DEBUG << "slot " << slot->id << " is processing [task id: " << slot->task_id << "]"; return true; @@ -902,7 +902,7 @@ struct llama_server_context { llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); } - LOG_INFO << "system prompt updated"; + LOG_DEBUG << "system prompt updated"; system_need_update = false; } @@ -1377,7 +1377,7 @@ struct llama_server_context { 0, // unused }; if (llama_decode(ctx, batch_view)) { - LOG_INFO << __func__ << " : failed to eval\n"; + LOG_DEBUG << __func__ << " : failed to eval\n"; return false; } } @@ -1397,7 +1397,7 @@ struct llama_server_context { 0, }; if (llama_decode(ctx, batch_img)) { - LOG_INFO << __func__ << " : failed to eval image"; + LOG_DEBUG << __func__ << " : failed to eval image"; return false; } slot.n_past += n_eval; @@ -1465,7 +1465,7 @@ struct llama_server_context { llama_client_slot* slot = get_slot(json_value(task.data, "slot_id", -1)); if (slot == nullptr) { - LOG_INFO << "slot unavailable"; + LOG_DEBUG << "slot unavailable"; // send error result send_error(task, "slot unavailable"); return; @@ -1543,7 +1543,7 @@ struct llama_server_context { if (all_slots_are_idle) { if (system_prompt.empty() && clean_kv_cache) { - LOG_INFO + LOG_DEBUG << "all slots are idle and system prompt is empty, clear the KV " "cache"; } @@ -1563,7 +1563,7 @@ struct llama_server_context { const int n_left = slot.n_past - slot.params.n_keep - 1; const int n_discard = n_left / 2; - LOG_INFO << "slot " << slot.id + LOG_DEBUG << "slot " << slot.id << " context shift - n_keep = " << slot.params.n_keep << ", n_left = " << n_left << ", n_discard: " << n_discard; llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1, @@ -1598,7 +1598,7 @@ struct llama_server_context { slot.command = NONE; slot.t_last_used = ggml_time_us(); - LOG_INFO << "slot " << slot.id << " released (" + LOG_DEBUG << "slot " << slot.id << " released (" << (int)slot.cache_tokens.size() << " tokens in cache)"; continue; @@ -1732,12 +1732,12 @@ struct llama_server_context { slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; - LOG_INFO << "slot " << slot.id << " : in cache: " << slot.n_past + LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past << " tokens | to process: " << slot.num_prompt_tokens_processed << " tokens"; } - LOG_INFO << "slot " << slot.id << " : kv cache rm - [" + LOG_DEBUG << "slot " << slot.id << " : kv cache rm - [" << (int)system_tokens.size() + slot.n_past << ", 
end)"; llama_kv_cache_seq_rm(ctx, slot.id, @@ -1747,7 +1747,7 @@ struct llama_server_context { if (slot.n_past == slot.num_prompt_tokens) { // we have to evaluate at least 1 token to generate logits. - LOG_INFO << "slot " << slot.id + LOG_DEBUG << "slot " << slot.id << " : we have to evaluate at least 1 token to " "generate logits"; slot.n_past--; @@ -1778,7 +1778,7 @@ struct llama_server_context { } if (has_images && !ingest_images(slot, n_batch)) { - LOG_INFO << "failed processing images"; + LOG_DEBUG << "failed processing images"; return false; } @@ -1818,13 +1818,13 @@ struct llama_server_context { if (n_batch == 1 || ret < 0) { // if you get here, it means the KV cache is full - try increasing it // via the context size - LOG_INFO << __func__ + LOG_DEBUG << __func__ << " : failed to decode the batch, n_batch = " << n_batch << ", ret = " << ret; return false; } - LOG_INFO + LOG_DEBUG << __func__ << " : failed to find free space in the KV cache, retrying with " "smaller n_batch = " From 30abf3a4cfc39e589b253c476ef60e51bb035c6c Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 15 Apr 2024 09:43:39 +0700 Subject: [PATCH 4/7] fix: restore kv_cache_clear --- context/llama_server_context.h | 1 + 1 file changed, 1 insertion(+) diff --git a/context/llama_server_context.h b/context/llama_server_context.h index 841ec1108..a4d8f7a89 100644 --- a/context/llama_server_context.h +++ b/context/llama_server_context.h @@ -1546,6 +1546,7 @@ struct llama_server_context { LOG_DEBUG << "all slots are idle and system prompt is empty, clear the KV " "cache"; + kv_cache_clear(); } // std::this_thread::sleep_for(std::chrono::milliseconds(5)); // TODO: Need to implement queueing using CV for better performance From a2cb6ba161f0c853b357cdabe61598706cbb074f Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 15 Apr 2024 16:36:47 +0700 Subject: [PATCH 5/7] fix: log_disable --- controllers/llamaCPP.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index 0b36e02e0..c40d7b284 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -145,12 +145,12 @@ std::string create_return_json(const std::string& id, const std::string& model, llamaCPP::llamaCPP() : queue(new trantor::ConcurrentTaskQueue(llama.params.n_parallel, - "llamaCPP")){ - // Some default values for now below - // log_disable(); // Disable the log to file feature, reduce bloat for - // target - // system () - }; + "llamaCPP")) { + // Some default values for now below + log_disable(); // Disable the log to file feature, reduce bloat for + // target + // system () +}; llamaCPP::~llamaCPP() { StopBackgroundTask(); @@ -163,7 +163,8 @@ void llamaCPP::WarmupModel(bool is_embedded_model) { pseudo["prompt"] = "Hello"; pseudo["n_predict"] = 2; pseudo["stream"] = false; - const int task_id = llama.request_completion(pseudo, false, is_embedded_model, -1); + const int task_id = + llama.request_completion(pseudo, false, is_embedded_model, -1); std::string completion_text; task_result result = llama.next_result(task_id); if (!result.error && result.stop) { From 833866106e06fe9478ecb4776156ccb94ba3c254 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 15 Apr 2024 17:33:03 +0700 Subject: [PATCH 6/7] fix: use params.embedding to decide warmup or not --- context/llama_server_context.h | 77 ++++++++++++++----------------- controllers/llamaCPP.cc | 82 +++++++++++++++++----------------- controllers/llamaCPP.h | 2 +- 3 files changed, 77 insertions(+), 84 deletions(-) 
diff --git a/context/llama_server_context.h b/context/llama_server_context.h index a4d8f7a89..3839e4b3f 100644 --- a/context/llama_server_context.h +++ b/context/llama_server_context.h @@ -842,7 +842,7 @@ struct llama_server_context { } if (!found) { LOG_DEBUG << "ERROR: Image with id: " << img_id - << ", not found.\n"; + << ", not found.\n"; slot->images.clear(); return false; } @@ -871,7 +871,7 @@ struct llama_server_context { all_slots_are_idle = false; LOG_DEBUG << "slot " << slot->id - << " is processing [task id: " << slot->task_id << "]"; + << " is processing [task id: " << slot->task_id << "]"; return true; } @@ -1255,45 +1255,36 @@ struct llama_server_context { res.stop = true; const int n_embd = llama_n_embd(model); - if (!params.embedding) { - LOG_WARNING_LLAMA("embedding disabled", - { - {"params.embedding", params.embedding}, - }); - res.result_json = json{ - {"embedding", std::vector(n_embd, 0.0f)}, - }; - } else { - std::vector embd_res(n_embd, 0.0f); - for (int i = 0; i < batch.n_tokens; ++i) { - if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { - continue; - } + std::vector embd_res(n_embd, 0.0f); - const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); - if (embd == NULL) { - embd = llama_get_embeddings_ith(ctx, i); - } + for (int i = 0; i < batch.n_tokens; ++i) { + if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) { + continue; + } - if (embd == NULL) { - LOG_ERROR << "failed to get embeddings" - << " token " << batch.token[i] << ", seq_id " - << batch.seq_id[i][0]; + const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]); + if (embd == NULL) { + embd = llama_get_embeddings_ith(ctx, i); + } - res.result_json = json{ - {"embedding", std::vector(n_embd, 0.0f)}, - }; + if (embd == NULL) { + LOG_ERROR << "failed to get embeddings" + << " token " << batch.token[i] << ", seq_id " + << batch.seq_id[i][0]; - continue; - } + res.result_json = json{ + {"embedding", std::vector(n_embd, 0.0f)}, + }; - llama_embd_normalize(embd, embd_res.data(), n_embd); + continue; } - res.result_json = json{ - {"embedding", embd_res}, - }; + + llama_embd_normalize(embd, embd_res.data(), n_embd); } + res.result_json = json{ + {"embedding", embd_res}, + }; queue_results.push_back(res); condition_results.notify_all(); @@ -1565,8 +1556,8 @@ struct llama_server_context { const int n_discard = n_left / 2; LOG_DEBUG << "slot " << slot.id - << " context shift - n_keep = " << slot.params.n_keep - << ", n_left = " << n_left << ", n_discard: " << n_discard; + << " context shift - n_keep = " << slot.params.n_keep + << ", n_left = " << n_left << ", n_discard: " << n_discard; llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1, slot.params.n_keep + n_discard + 1); llama_kv_cache_seq_add(ctx, slot.id, slot.params.n_keep + 1 + n_discard, @@ -1600,7 +1591,7 @@ struct llama_server_context { slot.t_last_used = ggml_time_us(); LOG_DEBUG << "slot " << slot.id << " released (" - << (int)slot.cache_tokens.size() << " tokens in cache)"; + << (int)slot.cache_tokens.size() << " tokens in cache)"; continue; } @@ -1734,12 +1725,12 @@ struct llama_server_context { slot.num_prompt_tokens - slot.n_past; LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past - << " tokens | to process: " - << slot.num_prompt_tokens_processed << " tokens"; + << " tokens | to process: " + << slot.num_prompt_tokens_processed << " tokens"; } LOG_DEBUG << "slot " << slot.id << " : kv cache rm - [" - << (int)system_tokens.size() + slot.n_past << ", end)"; + << (int)system_tokens.size() + 
slot.n_past << ", end)"; llama_kv_cache_seq_rm(ctx, slot.id, system_tokens.size() + slot.n_past, -1); @@ -1749,8 +1740,8 @@ struct llama_server_context { if (slot.n_past == slot.num_prompt_tokens) { // we have to evaluate at least 1 token to generate logits. LOG_DEBUG << "slot " << slot.id - << " : we have to evaluate at least 1 token to " - "generate logits"; + << " : we have to evaluate at least 1 token to " + "generate logits"; slot.n_past--; } @@ -1820,8 +1811,8 @@ struct llama_server_context { // if you get here, it means the KV cache is full - try increasing it // via the context size LOG_DEBUG << __func__ - << " : failed to decode the batch, n_batch = " << n_batch - << ", ret = " << ret; + << " : failed to decode the batch, n_batch = " << n_batch + << ", ret = " << ret; return false; } diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index c40d7b284..3ff0e899a 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -156,7 +156,7 @@ llamaCPP::~llamaCPP() { StopBackgroundTask(); } -void llamaCPP::WarmupModel(bool is_embedded_model) { +void llamaCPP::WarmupModel() { json pseudo; LOG_INFO << "Warm-up model"; @@ -164,7 +164,7 @@ void llamaCPP::WarmupModel(bool is_embedded_model) { pseudo["n_predict"] = 2; pseudo["stream"] = false; const int task_id = - llama.request_completion(pseudo, false, is_embedded_model, -1); + llama.request_completion(pseudo, false, false, -1); std::string completion_text; task_result result = llama.next_result(task_id); if (!result.error && result.stop) { @@ -405,36 +405,37 @@ void llamaCPP::InferenceImpl( return 0; }; // Queued task - state->instance->queue->runTaskInQueue( - [cb = std::move(callback), state, data, chunked_content_provider, request_id]() { - state->task_id = - state->instance->llama.request_completion(data, false, false, -1); - - // Start streaming response - auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider, - "chat_completions.txt"); - cb(resp); - - int retries = 0; - - // Since this is an async task, we will wait for the task to be - // completed - while (state->inference_status != FINISHED && retries < 10) { - // Should wait chunked_content_provider lambda to be called within - // 3s - if (state->inference_status == PENDING) { - retries += 1; - } - if (state->inference_status != RUNNING) - LOG_INFO_REQUEST(request_id) - << "Wait for task to be released:" << state->task_id; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - LOG_INFO_REQUEST(request_id) << "Task completed, release it"; - // Request completed, release it - state->instance->llama.request_cancel(state->task_id); - LOG_INFO_REQUEST(request_id) << "Inference completed"; - }); + state->instance->queue->runTaskInQueue([cb = std::move(callback), state, + data, chunked_content_provider, + request_id]() { + state->task_id = + state->instance->llama.request_completion(data, false, false, -1); + + // Start streaming response + auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider, + "chat_completions.txt"); + cb(resp); + + int retries = 0; + + // Since this is an async task, we will wait for the task to be + // completed + while (state->inference_status != FINISHED && retries < 10) { + // Should wait chunked_content_provider lambda to be called within + // 3s + if (state->inference_status == PENDING) { + retries += 1; + } + if (state->inference_status != RUNNING) + LOG_INFO_REQUEST(request_id) + << "Wait for task to be released:" << state->task_id; + 
std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } + LOG_INFO_REQUEST(request_id) << "Task completed, release it"; + // Request completed, release it + state->instance->llama.request_cancel(state->task_id); + LOG_INFO_REQUEST(request_id) << "Inference completed"; + }); } else { queue->runTaskInQueue( [this, request_id, cb = std::move(callback), d = std::move(data)]() { @@ -607,10 +608,7 @@ void llamaCPP::LoadModel( bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { gpt_params params; - // For model like nomic-embed-text-v1.5.f16.gguf, etc, we need to warmup model with flag embedding = true. - // So we use this variable to differentiate with other models - // TODO: in case embedded model only, we should reject completion request from user? - bool is_embedded_model = false; + // By default will setting based on number of handlers if (jsonBody) { if (!jsonBody->operator[]("mmproj").isNull()) { @@ -654,9 +652,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { params.n_gpu_layers = jsonBody->get("ngl", 100).asInt(); params.n_ctx = jsonBody->get("ctx_len", 2048).asInt(); - is_embedded_model = - !(*jsonBody)["embedding"].isNull() && (*jsonBody)["embedding"].asBool(); - params.embedding = jsonBody->get("embedding", true).asBool(); + params.embedding = jsonBody->get("embedding", false).asBool(); // Check if n_parallel exists in jsonBody, if not, set to drogon_thread params.n_batch = jsonBody->get("n_batch", 512).asInt(); params.n_parallel = jsonBody->get("n_parallel", 1).asInt(); @@ -713,7 +709,13 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { LOG_INFO << "Started background task here!"; backgroundThread = std::thread(&llamaCPP::BackgroundTask, this); - WarmupModel(is_embedded_model); + + // For model like nomic-embed-text-v1.5.f16.gguf, etc, we don't need to warm up model. + // So we use this variable to differentiate with other models + // TODO: in case embedded model only, we should reject completion request from user? 
+ if (!params.embedding) { + WarmupModel(); + } return true; } diff --git a/controllers/llamaCPP.h b/controllers/llamaCPP.h index 914caff60..900786c79 100644 --- a/controllers/llamaCPP.h +++ b/controllers/llamaCPP.h @@ -101,7 +101,7 @@ class llamaCPP : public drogon::HttpController, void EmbeddingImpl(std::shared_ptr jsonBody, std::function&& callback); bool CheckModelLoaded(const std::function& callback); - void WarmupModel(bool is_embedded_model); + void WarmupModel(); void BackgroundTask(); void StopBackgroundTask(); }; From c7bb799548c983178db944bca8f9fd3108fc7a2f Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 15 Apr 2024 18:19:45 +0700 Subject: [PATCH 7/7] fix: add model_type --- README.md | 1 + controllers/llamaCPP.cc | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 565e399dd..92fda215e 100644 --- a/README.md +++ b/README.md @@ -114,6 +114,7 @@ Table of parameters |`grp_attn_w`|Integer|Group attention width in self-extend| |`mlock`|Boolean|Prevent system swapping of the model to disk in macOS| |`grammar_file`| String |You can constrain the sampling using GBNF grammars by providing path to a grammar file| +|`model_type` | String | Model type we want to use: llm or embedding, default value is llm| ***OPTIONAL***: You can run Nitro on a different port like 5000 instead of 3928 by running it manually in terminal ```zsh diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc index 3ff0e899a..3e3015c2a 100644 --- a/controllers/llamaCPP.cc +++ b/controllers/llamaCPP.cc @@ -163,8 +163,7 @@ void llamaCPP::WarmupModel() { pseudo["prompt"] = "Hello"; pseudo["n_predict"] = 2; pseudo["stream"] = false; - const int task_id = - llama.request_completion(pseudo, false, false, -1); + const int task_id = llama.request_completion(pseudo, false, false, -1); std::string completion_text; task_result result = llama.next_result(task_id); if (!result.error && result.stop) { @@ -608,7 +607,7 @@ void llamaCPP::LoadModel( bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { gpt_params params; - + std::string model_type; // By default will setting based on number of handlers if (jsonBody) { if (!jsonBody->operator[]("mmproj").isNull()) { @@ -652,7 +651,8 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { params.n_gpu_layers = jsonBody->get("ngl", 100).asInt(); params.n_ctx = jsonBody->get("ctx_len", 2048).asInt(); - params.embedding = jsonBody->get("embedding", false).asBool(); + params.embedding = jsonBody->get("embedding", true).asBool(); + model_type = jsonBody->get("model_type", "llm").asString(); // Check if n_parallel exists in jsonBody, if not, set to drogon_thread params.n_batch = jsonBody->get("n_batch", 512).asInt(); params.n_parallel = jsonBody->get("n_parallel", 1).asInt(); @@ -713,7 +713,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr jsonBody) { // For model like nomic-embed-text-v1.5.f16.gguf, etc, we don't need to warm up model. // So we use this variable to differentiate with other models // TODO: in case embedded model only, we should reject completion request from user? - if (!params.embedding) { + if (model_type == "llm") { WarmupModel(); } return true;
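
For reference, below is a minimal sketch of the JSON body that the patched `LoadModelImpl()` would parse after the last commit, assuming the same jsoncpp API already used in `llamaCPP.cc`. Only keys visible in the hunks above (`ctx_len`, `ngl`, `n_batch`, `n_parallel`, `embedding`, `model_type`) are included; the model-path key is not shown in this patch series and is deliberately omitted, so treat this as an illustration of the new `model_type` gate rather than a complete load request.

```cpp
// Illustrative sketch only: constructs the JSON body that the patched
// LoadModelImpl() reads. Keys mirror the ones visible in the hunks above;
// the model-path key is not shown in these patches and is left out.
#include <json/json.h>
#include <iostream>

int main() {
  Json::Value body;
  body["ctx_len"] = 2048;            // -> params.n_ctx
  body["ngl"] = 100;                 // -> params.n_gpu_layers
  body["n_batch"] = 512;             // -> params.n_batch
  body["n_parallel"] = 1;            // -> params.n_parallel (number of slots)
  body["embedding"] = true;          // defaults to true after the final commit
  body["model_type"] = "embedding";  // "llm" (the default) would trigger WarmupModel()

  // With model_type != "llm", the patched code skips warm-up, which is the
  // stated intent for embedding-only models such as nomic-embed-text.
  std::cout << body.toStyledString();
  return 0;
}
```

With `model_type` set to `"llm"` (or omitted, since that is the documented default in the README), the same body would instead cause `WarmupModel()` to run a two-token completion before the server accepts requests.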