context/llama_server_context.h (108 changes: 68 additions & 40 deletions)
@@ -1,15 +1,15 @@
#include <mutex>
#include <set>
#include <string>
#include <vector>
#include <set>
#include <mutex>

// External
#include "clip.h"
#include "common.h"
#include "llama.h"
#include "utils/json.hpp"
#include "stb_image.h"
#include "llava.h"
#include "stb_image.h"
#include "utils/json.hpp"

#if defined(_WIN32)
#define NOMINMAX
@@ -532,7 +532,8 @@ struct llama_server_context {

std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == nullptr) {
LOG_ERROR_LLAMA("llama.cpp unable to load model", {{"model", params.model}});
LOG_ERROR_LLAMA("llama.cpp unable to load model",
{{"model", params.model}});
return false;
}

@@ -585,7 +586,11 @@ struct llama_server_context {
try {
batch = llama_batch_init(n_ctx, 0, params.n_parallel);
} catch (const std::exception& e) {
LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata" , {{"exception", e.what()}, {"n_tokens_alloc", n_ctx}, {"embd", 0}, {"n_seq_max", params.n_parallel}});
LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata",
{{"exception", e.what()},
{"n_tokens_alloc", n_ctx},
{"embd", 0},
{"n_seq_max", params.n_parallel}});
}

// empty system prompt
@@ -1244,19 +1249,35 @@ struct llama_server_context {
res.stop = true;

const int n_embd = llama_n_embd(model);
if (!params.embedding) {
LOG_WARNING_LLAMA("embedding disabled",
{
{"params.embedding", params.embedding},
});
res.result_json = json{
{"embedding", std::vector<float>(n_embd, 0.0f)},
};
} else {
const float* data = llama_get_embeddings(ctx);
std::vector<float> embedding(data, data + n_embd);

std::vector<float> embd_res(n_embd, 0.0f);

for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
continue;
}

const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
}

if (embd == NULL) {
LOG_ERROR << "failed to get embeddings "
<< "token: " << batch.token[i]
<< ", seq_id: " << batch.seq_id[i][0];

res.result_json = json{
{"embedding", std::vector<float>(n_embd, 0.0f)},
};

continue;
}

llama_embd_normalize(embd, embd_res.data(), n_embd);

res.result_json = json{
{"embedding", embedding},
{"embedding", embd_res},
};
}
queue_results.push_back(res);
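
Note on the hunk above: instead of reading one whole-context embedding via `llama_get_embeddings(ctx)`, the server now walks the batch, picks the tokens that carry logits for this slot (sequence id `slot.id + 1`), prefers the pooled per-sequence embedding from `llama_get_embeddings_seq`, falls back to `llama_get_embeddings_ith`, and normalizes the result with `llama_embd_normalize` before returning it. For reference, that normalization is roughly the following (a minimal sketch of the default L2 case, not the library source):

```cpp
#include <cmath>

// Sketch of L2 normalization as performed by llama_embd_normalize in the
// default case: scale to unit Euclidean length, leaving zeros if the norm is 0.
static void embd_normalize_l2(const float* inp, float* out, int n) {
  double sum = 0.0;
  for (int i = 0; i < n; ++i) {
    sum += static_cast<double>(inp[i]) * inp[i];
  }
  const double norm = std::sqrt(sum);
  for (int i = 0; i < n; ++i) {
    out[i] = norm > 0.0 ? static_cast<float>(inp[i] / norm) : 0.0f;
  }
}
```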
@@ -1380,7 +1401,7 @@ struct llama_server_context {
std::vector<llama_token> append_tokens =
tokenize(json_prompt, false); // has next image
for (int i = 0; i < (int)append_tokens.size(); ++i) {
llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true);
llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id + 1}, true);
slot.n_past += 1;
}
}
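
This hunk, and the matching ones below, change the per-slot sequence id from `slot.id` to `slot.id + 1`. The convention appears to be that sequence 0 is reserved for the shared system prompt, so each slot offsets its id by one when adding tokens to the batch, when shifting its KV cache, and when reading its sequence embedding. A one-line helper capturing that assumption (hypothetical, not part of the patch):

```cpp
#include <cstdint>

// Assumed sequence-id convention in this patch: sequence 0 carries the shared
// system prompt, so slot i writes its tokens to sequence i + 1.
static inline int32_t slot_seq_id(int slot_id) {
  return slot_id + 1;
}
```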
@@ -1523,27 +1544,28 @@ struct llama_server_context {

for (llama_client_slot& slot : slots) {
if (slot.is_processing() &&
slot.cache_tokens.size() >= (size_t)slot.n_ctx) {
(int)system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
// Shift context
const int n_left = slot.n_past - slot.params.n_keep - 1;
const int n_keep = slot.params.n_keep + add_bos_token;
const int n_left = (int)system_tokens.size() + slot.n_past - n_keep;
const int n_discard = n_left / 2;

LOG_TEE(
"slot %d: context shift - n_keep = %d, n_left = %d, n_discard "
"= %d\n",
slot.id, slot.params.n_keep, n_left, n_discard);
llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1,
slot.params.n_keep + n_discard + 1);
llama_kv_cache_seq_add(ctx, slot.id,
slot.params.n_keep + 1 + n_discard,
slot.n_past, -n_discard);

for (size_t i = slot.params.n_keep + 1 + n_discard;
i < slot.cache_tokens.size(); i++) {
slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
}
llama_kv_cache_seq_rm(ctx, slot.id + 1, n_keep, n_keep + n_discard);
llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard,
system_tokens.size() + slot.n_past, -n_discard);

if (slot.params.cache_prompt) {
for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size();
i++) {
slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
}

slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
}

slot.n_past -= n_discard;

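The rewritten context shift keeps `n_keep = params.n_keep + add_bos_token` tokens at the head of the sequence, removes the oldest half of the remaining `n_left` tokens from the KV cache (`llama_kv_cache_seq_rm`), slides the survivors back by `n_discard` positions (`llama_kv_cache_seq_add`), and, when `cache_prompt` is enabled, compacts `slot.cache_tokens` to match. A minimal sketch of that token-side compaction on a plain vector (hypothetical helper using the same arithmetic, not part of the patch):

```cpp
#include <cstdint>
#include <vector>

using llama_token = int32_t;  // matches the typedef in llama.h

// Sketch of the cache_tokens compaction after a context shift: keep the first
// n_keep tokens, drop the next n_discard, slide the remainder down in place.
static void shift_cached_tokens(std::vector<llama_token>& cache_tokens,
                                int n_keep, int n_discard) {
  for (size_t i = static_cast<size_t>(n_keep + n_discard);
       i < cache_tokens.size(); ++i) {
    cache_tokens[i - n_discard] = cache_tokens[i];
  }
  cache_tokens.resize(cache_tokens.size() - n_discard);
}
```
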
@@ -1557,6 +1579,9 @@ struct llama_server_context {
}
}

// start populating the batch for this iteration
llama_batch_clear(batch);

// decode any currently ongoing sequences
for (auto& slot : slots) {
// release the slot
@@ -1578,14 +1603,15 @@ struct llama_server_context {
slot.i_batch = batch.n_tokens;

llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past,
{slot.id}, true);
{slot.id + 1}, true);

slot.n_decoded += 1;
slot.n_past += 1;
}

// process in chunks of params.n_batch
int32_t n_batch = params.n_batch;
int32_t n_batch = llama_n_batch(ctx);
int32_t n_ubatch = llama_n_ubatch(ctx);

// assign workload to the slots
if (params.cont_batching || batch.n_tokens == 0) {
@@ -1641,8 +1667,7 @@ struct llama_server_context {
} else {
prompt_tokens = tokenize(
slot.prompt,
system_prompt.empty() &&
add_bos_token); // add BOS if there isn't system prompt
system_prompt.empty()); // add BOS if there isn't system prompt
}

slot.num_prompt_tokens = prompt_tokens.size();
@@ -1738,9 +1763,11 @@ struct llama_server_context {
std::vector<llama_token> prefix_tokens =
has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token)
: prompt_tokens;
for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) {
for (;
slot.n_past < slot.num_prompt_tokens && batch.n_tokens < n_batch;
++slot.n_past) {
llama_batch_add(batch, prefix_tokens[slot.n_past],
system_tokens.size() + slot.n_past, {slot.id},
system_tokens.size() + slot.n_past, {slot.id + 1},
false);
}

@@ -1803,7 +1830,8 @@ struct llama_server_context {
}

for (auto& slot : slots) {
if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) {
if (slot.state != PROCESSING || slot.i_batch < (int)i ||
slot.i_batch >= (int)(i + n_tokens)) {
continue;
}

@@ -1812,7 +1840,7 @@ struct llama_server_context {
send_embedding(slot);
slot.release();
slot.i_batch = -1;
return true;
continue;
}

completion_token_output result;
controllers/llamaCPP.cc (8 changes: 4 additions & 4 deletions)
@@ -154,14 +154,14 @@ llamaCPP::~llamaCPP() {
StopBackgroundTask();
}

void llamaCPP::WarmupModel() {
void llamaCPP::WarmupModel(bool is_embedding) {
json pseudo;

LOG_INFO << "Warm-up model";
pseudo["prompt"] = "Hello";
pseudo["n_predict"] = 2;
pseudo["stream"] = false;
const int task_id = llama.request_completion(pseudo, false, false, -1);
const int task_id = llama.request_completion(pseudo, false, is_embedding, -1);
std::string completion_text;
task_result result = llama.next_result(task_id);
if (!result.error && result.stop) {
@@ -624,7 +624,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {

params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
params.embedding = jsonBody->get("embedding", true).asBool();
params.embedding = jsonBody->get("embedding", false).asBool();
// Check if n_parallel exists in jsonBody, if not, set to drogon_thread
params.n_batch = jsonBody->get("n_batch", 512).asInt();
params.n_parallel = jsonBody->get("n_parallel", 1).asInt();
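
Note the default for `embedding` flips from `true` to `false` here, so a load request that wants the embedding endpoint must now opt in explicitly (the warm-up call below also follows this flag). A hypothetical load body using the field names read in this hunk (sketch only; the controller actually parses a `Json::Value`, nlohmann::json is used here just for brevity):

```cpp
#include "utils/json.hpp"  // nlohmann::json, as included by llama_server_context.h

// Hypothetical model-load body; field names match the jsonBody->get(...) calls above.
static const nlohmann::json kLoadBody = {
    {"ngl", 100},         // params.n_gpu_layers
    {"ctx_len", 2048},    // params.n_ctx
    {"embedding", true},  // now defaults to false, so set it for embedding models
    {"n_batch", 512},     // params.n_batch
    {"n_parallel", 1},    // params.n_parallel
};
```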
@@ -681,7 +681,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {

LOG_INFO << "Started background task here!";
backgroundThread = std::thread(&llamaCPP::BackgroundTask, this);
WarmupModel();
WarmupModel(params.embedding);
return true;
}

controllers/llamaCPP.h (2 changes: 1 addition & 1 deletion)
@@ -101,7 +101,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP>,
void EmbeddingImpl(std::shared_ptr<Json::Value> jsonBody,
std::function<void(const HttpResponsePtr&)>& callback);
bool CheckModelLoaded(std::function<void(const HttpResponsePtr&)>& callback);
void WarmupModel();
void WarmupModel(bool is_embedding);
void BackgroundTask();
void StopBackgroundTask();
};