This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 002cbc4

Revert "fix: make embedding work again" (#502)
1 parent f64a90f commit 002cbc4

3 files changed, 45 insertions(+), 73 deletions(-)
context/llama_server_context.h

Lines changed: 40 additions & 68 deletions
@@ -1,15 +1,15 @@
-#include <mutex>
-#include <set>
 #include <string>
 #include <vector>
+#include <set>
+#include <mutex>

 // External
 #include "clip.h"
 #include "common.h"
 #include "llama.h"
-#include "llava.h"
-#include "stb_image.h"
 #include "utils/json.hpp"
+#include "stb_image.h"
+#include "llava.h"

 #if defined(_WIN32)
 #define NOMINMAX
@@ -532,8 +532,7 @@ struct llama_server_context {

   std::tie(model, ctx) = llama_init_from_gpt_params(params);
   if (model == nullptr) {
-    LOG_ERROR_LLAMA("llama.cpp unable to load model",
-                    {{"model", params.model}});
+    LOG_ERROR_LLAMA("llama.cpp unable to load model", {{"model", params.model}});
     return false;
   }

@@ -586,11 +585,7 @@ struct llama_server_context {
   try {
     batch = llama_batch_init(n_ctx, 0, params.n_parallel);
   } catch (const std::exception& e) {
-    LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata",
-                    {{"exception", e.what()},
-                     {"n_tokens_alloc", n_ctx},
-                     {"embd", 0},
-                     {"n_seq_max", params.n_parallel}});
+    LOG_ERROR_LLAMA("Failed to allocate llama.cpp batch metadata" , {{"exception", e.what()}, {"n_tokens_alloc", n_ctx}, {"embd", 0}, {"n_seq_max", params.n_parallel}});
   }

   // empty system prompt
@@ -1249,35 +1244,19 @@ struct llama_server_context {
     res.stop = true;

     const int n_embd = llama_n_embd(model);
-
-    std::vector<float> embd_res(n_embd, 0.0f);
-
-    for (int i = 0; i < batch.n_tokens; ++i) {
-      if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
-        continue;
-      }
-
-      const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-      if (embd == NULL) {
-        embd = llama_get_embeddings_ith(ctx, i);
-      }
-
-      if (embd == NULL) {
-        LOG_ERROR << "failed to get embeddings "
-                  << "token: " << batch.token[i]
-                  << ", seq_id: " << batch.seq_id[i][0];
-
-        res.result_json = json{
-            {"embedding", std::vector<float>(n_embd, 0.0f)},
-        };
-
-        continue;
-      }
-
-      llama_embd_normalize(embd, embd_res.data(), n_embd);
-
+    if (!params.embedding) {
+      LOG_WARNING_LLAMA("embedding disabled",
+                        {
+                            {"params.embedding", params.embedding},
+                        });
       res.result_json = json{
-          {"embedding", embd_res},
+          {"embedding", std::vector<float>(n_embd, 0.0f)},
+      };
+    } else {
+      const float* data = llama_get_embeddings(ctx);
+      std::vector<float> embedding(data, data + n_embd);
+      res.result_json = json{
+          {"embedding", embedding},
       };
     }
     queue_results.push_back(res);
@@ -1401,7 +1380,7 @@ struct llama_server_context {
     std::vector<llama_token> append_tokens =
         tokenize(json_prompt, false); // has next image
     for (int i = 0; i < (int)append_tokens.size(); ++i) {
-      llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id + 1}, true);
+      llama_batch_add(batch, append_tokens[i], slot.n_past, {slot.id}, true);
       slot.n_past += 1;
     }
   }
@@ -1544,29 +1523,28 @@ struct llama_server_context {

   for (llama_client_slot& slot : slots) {
     if (slot.is_processing() &&
-        (int)system_tokens.size() + slot.n_past >= slot.n_ctx - 1) {
+        slot.cache_tokens.size() >= (size_t)slot.n_ctx) {
       // Shift context
-      const int n_keep = slot.params.n_keep + add_bos_token;
-      const int n_left = (int)system_tokens.size() + slot.n_past - n_keep;
+      const int n_left = slot.n_past - slot.params.n_keep - 1;
       const int n_discard = n_left / 2;

       LOG_TEE(
           "slot %d: context shift - n_keep = %d, n_left = %d, n_discard "
           "= %d\n",
           slot.id, slot.params.n_keep, n_left, n_discard);
-      llama_kv_cache_seq_rm(ctx, slot.id + 1, n_keep, n_keep + n_discard);
-      llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard,
-                             system_tokens.size() + slot.n_past, -n_discard);
-
-      if (slot.params.cache_prompt) {
-        for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size();
-             i++) {
-          slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
-        }
-
-        slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+      llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1,
+                            slot.params.n_keep + n_discard + 1);
+      llama_kv_cache_seq_add(ctx, slot.id,
+                             slot.params.n_keep + 1 + n_discard,
+                             slot.n_past, -n_discard);
+
+      for (size_t i = slot.params.n_keep + 1 + n_discard;
+           i < slot.cache_tokens.size(); i++) {
+        slot.cache_tokens[i - n_discard] = slot.cache_tokens[i];
       }

+      slot.cache_tokens.resize(slot.cache_tokens.size() - n_discard);
+
       slot.n_past -= n_discard;

       slot.truncated = true;
@@ -1579,9 +1557,6 @@ struct llama_server_context {
     }
   }

-  // start populating the batch for this iteration
-  llama_batch_clear(batch);
-
   // decode any currently ongoing sequences
   for (auto& slot : slots) {
     // release the slot
@@ -1603,15 +1578,14 @@ struct llama_server_context {
     slot.i_batch = batch.n_tokens;

     llama_batch_add(batch, slot.sampled, system_tokens.size() + slot.n_past,
-                    {slot.id + 1}, true);
+                    {slot.id}, true);

     slot.n_decoded += 1;
     slot.n_past += 1;
   }

   // process in chunks of params.n_batch
-  int32_t n_batch = llama_n_batch(ctx);
-  int32_t n_ubatch = llama_n_ubatch(ctx);
+  int32_t n_batch = params.n_batch;

   // assign workload to the slots
   if (params.cont_batching || batch.n_tokens == 0) {
@@ -1667,7 +1641,8 @@ struct llama_server_context {
       } else {
         prompt_tokens = tokenize(
             slot.prompt,
-            system_prompt.empty()); // add BOS if there isn't system prompt
+            system_prompt.empty() &&
+                add_bos_token); // add BOS if there isn't system prompt
       }

       slot.num_prompt_tokens = prompt_tokens.size();
@@ -1763,11 +1738,9 @@ struct llama_server_context {
       std::vector<llama_token> prefix_tokens =
           has_images ? tokenize(slot.images[0].prefix_prompt, add_bos_token)
                      : prompt_tokens;
-      for (;
-           slot.n_past < slot.num_prompt_tokens && batch.n_tokens < n_batch;
-           ++slot.n_past) {
+      for (; slot.n_past < (int)prefix_tokens.size(); ++slot.n_past) {
         llama_batch_add(batch, prefix_tokens[slot.n_past],
-                        system_tokens.size() + slot.n_past, {slot.id + 1},
+                        system_tokens.size() + slot.n_past, {slot.id},
                         false);
       }

@@ -1830,8 +1803,7 @@ struct llama_server_context {
     }

     for (auto& slot : slots) {
-      if (slot.state != PROCESSING || slot.i_batch < (int)i ||
-          slot.i_batch >= (int)(i + n_tokens)) {
+      if (slot.i_batch < (int)i || slot.i_batch >= (int)(i + n_tokens)) {
        continue;
       }

@@ -1840,7 +1812,7 @@ struct llama_server_context {
       send_embedding(slot);
       slot.release();
       slot.i_batch = -1;
-      continue;
+      return true;
     }

     completion_token_output result;
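
The hunk at old lines 1249-1278 is the heart of this revert: the removed code walked the decoded batch and returned a normalized per-sequence embedding (llama_get_embeddings_seq, falling back to llama_get_embeddings_ith, then llama_embd_normalize), while the restored code copies the raw context-level buffer from llama_get_embeddings, or zeros when params.embedding is off. The sketch below condenses that contrast into one helper; it reuses only the calls visible in the diff, and the function name extract_embedding and its parameters are illustrative, not part of this repository.

// Hypothetical sketch contrasting the two embedding paths touched by this
// revert. Assumes llama.cpp's "llama.h" and the llama_embd_normalize helper
// from "common.h"; it is not code from this project.
#include <vector>

#include "common.h"
#include "llama.h"

std::vector<float> extract_embedding(llama_context* ctx, const llama_model* model,
                                     llama_seq_id seq_id, bool per_sequence) {
  const int n_embd = llama_n_embd(model);

  if (per_sequence) {
    // Behaviour removed by the revert: per-sequence embedding, normalized.
    const float* embd = llama_get_embeddings_seq(ctx, seq_id);
    if (embd == nullptr) {
      return std::vector<float>(n_embd, 0.0f);  // nothing stored for this sequence
    }
    std::vector<float> out(n_embd, 0.0f);
    llama_embd_normalize(embd, out.data(), n_embd);
    return out;
  }

  // Behaviour restored by the revert: raw, unnormalized context-level buffer
  // (requires the context to have been created with params.embedding = true).
  const float* data = llama_get_embeddings(ctx);
  return std::vector<float>(data, data + n_embd);
}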

controllers/llamaCPP.cc

Lines changed: 4 additions & 4 deletions
@@ -156,14 +156,14 @@ llamaCPP::~llamaCPP() {
   StopBackgroundTask();
 }

-void llamaCPP::WarmupModel(bool is_embedding) {
+void llamaCPP::WarmupModel() {
   json pseudo;

   LOG_INFO << "Warm-up model";
   pseudo["prompt"] = "Hello";
   pseudo["n_predict"] = 2;
   pseudo["stream"] = false;
-  const int task_id = llama.request_completion(pseudo, false, is_embedding, -1);
+  const int task_id = llama.request_completion(pseudo, false, false, -1);
   std::string completion_text;
   task_result result = llama.next_result(task_id);
   if (!result.error && result.stop) {
@@ -645,7 +645,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {

   params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
   params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
-  params.embedding = jsonBody->get("embedding", false).asBool();
+  params.embedding = jsonBody->get("embedding", true).asBool();
   // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
   params.n_batch = jsonBody->get("n_batch", 512).asInt();
   params.n_parallel = jsonBody->get("n_parallel", 1).asInt();
@@ -702,7 +702,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {

   LOG_INFO << "Started background task here!";
   backgroundThread = std::thread(&llamaCPP::BackgroundTask, this);
-  WarmupModel(params.embedding);
+  WarmupModel();
   return true;
 }

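
In the controller, the revert flips the default of the "embedding" load-model option back to true and drops the warm-up flag, so embeddings are captured unless a client explicitly opts out. Below is a minimal sketch of a request body that does opt out, assuming the same jsoncpp Json::Value that LoadModelImpl parses; the helper function and its name are illustrative only.

// Illustrative only: builds a load-model body for LoadModelImpl above.
// After this revert, omitting "embedding" is equivalent to sending true.
#include <memory>

#include <json/json.h>  // jsoncpp's Json::Value, as used by the controller

std::shared_ptr<Json::Value> make_load_model_body() {
  auto body = std::make_shared<Json::Value>();
  (*body)["ctx_len"] = 2048;     // same default the controller falls back to
  (*body)["ngl"] = 100;          // GPU layers, default taken from the diff
  (*body)["embedding"] = false;  // explicit opt-out; the default is now true
  return body;
}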

controllers/llamaCPP.h

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP>,
   void EmbeddingImpl(std::shared_ptr<Json::Value> jsonBody,
                      std::function<void(const HttpResponsePtr&)>& callback);
   bool CheckModelLoaded(std::function<void(const HttpResponsePtr&)>& callback);
-  void WarmupModel(bool is_embedding);
+  void WarmupModel();
   void BackgroundTask();
   void StopBackgroundTask();
 };
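
The header change mirrors the controller edit above: WarmupModel no longer takes an embedding flag, so warm-up is always issued as a plain two-token completion. A sketch of the payload it builds follows, assuming the nlohmann json header already pulled in via utils/json.hpp; the free function is illustrative, since in the project this logic lives inside llamaCPP::WarmupModel.

// Illustrative sketch of the warm-up payload after this revert: always a
// plain completion, never an embedding request.
#include "utils/json.hpp"  // nlohmann::json, as included by llama_server_context.h

using json = nlohmann::json;

json make_warmup_request() {
  json pseudo;
  pseudo["prompt"] = "Hello";
  pseudo["n_predict"] = 2;
  pseudo["stream"] = false;
  // Handed to llama.request_completion(pseudo, false, false, -1); the third
  // argument is the embedding flag this revert hard-codes to false.
  return pseudo;
}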
