This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit b47f6e9

Revert "fix: use params.embedding to decide warmup or not"
This reverts commit 8338661.
1 parent: 8338661 · commit: b47f6e9

3 files changed (+84, -77 lines)


context/llama_server_context.h

Lines changed: 43 additions & 34 deletions
@@ -842,7 +842,7 @@ struct llama_server_context {
       }
       if (!found) {
         LOG_DEBUG << "ERROR: Image with id: " << img_id
-            << ", not found.\n";
+                  << ", not found.\n";
         slot->images.clear();
         return false;
       }
@@ -871,7 +871,7 @@ struct llama_server_context {
     all_slots_are_idle = false;

     LOG_DEBUG << "slot " << slot->id
-        << " is processing [task id: " << slot->task_id << "]";
+              << " is processing [task id: " << slot->task_id << "]";

     return true;
   }
@@ -1255,36 +1255,45 @@ struct llama_server_context {
     res.stop = true;

     const int n_embd = llama_n_embd(model);
+    if (!params.embedding) {
+      LOG_WARNING_LLAMA("embedding disabled",
+                        {
+                            {"params.embedding", params.embedding},
+                        });
+      res.result_json = json{
+          {"embedding", std::vector<float>(n_embd, 0.0f)},
+      };
+    } else {
+      std::vector<float> embd_res(n_embd, 0.0f);

-    std::vector<float> embd_res(n_embd, 0.0f);
+      for (int i = 0; i < batch.n_tokens; ++i) {
+        if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
+          continue;
+        }

-    for (int i = 0; i < batch.n_tokens; ++i) {
-      if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
-        continue;
-      }
+        const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
+        if (embd == NULL) {
+          embd = llama_get_embeddings_ith(ctx, i);
+        }

-      const float* embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
-      if (embd == NULL) {
-        embd = llama_get_embeddings_ith(ctx, i);
-      }
+        if (embd == NULL) {
+          LOG_ERROR << "failed to get embeddings"
+                    << " token " << batch.token[i] << ", seq_id "
+                    << batch.seq_id[i][0];

-      if (embd == NULL) {
-        LOG_ERROR << "failed to get embeddings"
-                  << " token " << batch.token[i] << ", seq_id "
-                  << batch.seq_id[i][0];
+          res.result_json = json{
+              {"embedding", std::vector<float>(n_embd, 0.0f)},
+          };

-        res.result_json = json{
-            {"embedding", std::vector<float>(n_embd, 0.0f)},
-        };
+          continue;
+        }

-        continue;
+        llama_embd_normalize(embd, embd_res.data(), n_embd);
       }
-
-      llama_embd_normalize(embd, embd_res.data(), n_embd);
+      res.result_json = json{
+          {"embedding", embd_res},
+      };
     }
-    res.result_json = json{
-        {"embedding", embd_res},
-    };

     queue_results.push_back(res);
     condition_results.notify_all();
@@ -1556,8 +1565,8 @@ struct llama_server_context {
         const int n_discard = n_left / 2;

         LOG_DEBUG << "slot " << slot.id
-            << " context shift - n_keep = " << slot.params.n_keep
-            << ", n_left = " << n_left << ", n_discard: " << n_discard;
+                  << " context shift - n_keep = " << slot.params.n_keep
+                  << ", n_left = " << n_left << ", n_discard: " << n_discard;
         llama_kv_cache_seq_rm(ctx, slot.id, slot.params.n_keep + 1,
                               slot.params.n_keep + n_discard + 1);
         llama_kv_cache_seq_add(ctx, slot.id, slot.params.n_keep + 1 + n_discard,
@@ -1591,7 +1600,7 @@ struct llama_server_context {
         slot.t_last_used = ggml_time_us();

         LOG_DEBUG << "slot " << slot.id << " released ("
-            << (int)slot.cache_tokens.size() << " tokens in cache)";
+                  << (int)slot.cache_tokens.size() << " tokens in cache)";

         continue;
       }
@@ -1725,12 +1734,12 @@ struct llama_server_context {
               slot.num_prompt_tokens - slot.n_past;

           LOG_DEBUG << "slot " << slot.id << " : in cache: " << slot.n_past
-              << " tokens | to process: "
-              << slot.num_prompt_tokens_processed << " tokens";
+                    << " tokens | to process: "
+                    << slot.num_prompt_tokens_processed << " tokens";
         }

         LOG_DEBUG << "slot " << slot.id << " : kv cache rm - ["
-            << (int)system_tokens.size() + slot.n_past << ", end)";
+                  << (int)system_tokens.size() + slot.n_past << ", end)";

         llama_kv_cache_seq_rm(ctx, slot.id,
                               system_tokens.size() + slot.n_past, -1);
@@ -1740,8 +1749,8 @@ struct llama_server_context {
         if (slot.n_past == slot.num_prompt_tokens) {
           // we have to evaluate at least 1 token to generate logits.
           LOG_DEBUG << "slot " << slot.id
-              << " : we have to evaluate at least 1 token to "
-                 "generate logits";
+                    << " : we have to evaluate at least 1 token to "
+                       "generate logits";
           slot.n_past--;
         }

@@ -1811,8 +1820,8 @@ struct llama_server_context {
       // if you get here, it means the KV cache is full - try increasing it
      // via the context size
       LOG_DEBUG << __func__
-          << " : failed to decode the batch, n_batch = " << n_batch
-          << ", ret = " << ret;
+                << " : failed to decode the batch, n_batch = " << n_batch
+                << ", ret = " << ret;
       return false;
     }

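The @@ -1255,36 +1255,45 @@ hunk above restores the guard around the embedding result: when params.embedding is disabled the slot answers with an all-zero vector of size n_embd, otherwise the per-sequence embeddings are fetched and normalized before being placed into result_json. Below is a minimal, self-contained C++ sketch of that decision, not the server code itself; the l2_normalize helper is a stand-in for llama.cpp's llama_embd_normalize, which to the best of my knowledge performs L2 normalization by default.

// Sketch only: mirrors the restored control flow, not llama_server_context itself.
#include <cmath>
#include <cstdio>
#include <vector>

// Stand-in for llama_embd_normalize(): plain L2 normalization.
static void l2_normalize(const float* in, float* out, int n) {
  double sum = 0.0;
  for (int i = 0; i < n; ++i) sum += static_cast<double>(in[i]) * in[i];
  const double norm = std::sqrt(sum);
  for (int i = 0; i < n; ++i)
    out[i] = norm > 0.0 ? static_cast<float>(in[i] / norm) : 0.0f;
}

// Embedding payload for one slot: zeros when embeddings are disabled or
// unavailable (the !params.embedding branch), a normalized vector otherwise.
static std::vector<float> embedding_result(bool embedding_enabled,
                                           const float* raw, int n_embd) {
  std::vector<float> res(n_embd, 0.0f);  // default: zero vector
  if (!embedding_enabled || raw == nullptr) return res;
  l2_normalize(raw, res.data(), n_embd);
  return res;
}

int main() {
  const float raw[4] = {3.0f, 0.0f, 4.0f, 0.0f};
  const auto on = embedding_result(true, raw, 4);
  const auto off = embedding_result(false, raw, 4);
  std::printf("enabled:  %.2f %.2f %.2f %.2f\n", on[0], on[1], on[2], on[3]);
  std::printf("disabled: %.2f %.2f %.2f %.2f\n", off[0], off[1], off[2], off[3]);
  return 0;
}

Running it prints 0.60 0.00 0.80 0.00 for the enabled case and all zeros for the disabled case.
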
controllers/llamaCPP.cc

Lines changed: 40 additions & 42 deletions
@@ -156,15 +156,15 @@ llamaCPP::~llamaCPP() {
   StopBackgroundTask();
 }

-void llamaCPP::WarmupModel() {
+void llamaCPP::WarmupModel(bool is_embedded_model) {
   json pseudo;

   LOG_INFO << "Warm-up model";
   pseudo["prompt"] = "Hello";
   pseudo["n_predict"] = 2;
   pseudo["stream"] = false;
   const int task_id =
-      llama.request_completion(pseudo, false, false, -1);
+      llama.request_completion(pseudo, false, is_embedded_model, -1);
   std::string completion_text;
   task_result result = llama.next_result(task_id);
   if (!result.error && result.stop) {
@@ -405,37 +405,36 @@ void llamaCPP::InferenceImpl(
       return 0;
     };
     // Queued task
-    state->instance->queue->runTaskInQueue([cb = std::move(callback), state,
-                                            data, chunked_content_provider,
-                                            request_id]() {
-      state->task_id =
-          state->instance->llama.request_completion(data, false, false, -1);
-
-      // Start streaming response
-      auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
-                                                   "chat_completions.txt");
-      cb(resp);
-
-      int retries = 0;
-
-      // Since this is an async task, we will wait for the task to be
-      // completed
-      while (state->inference_status != FINISHED && retries < 10) {
-        // Should wait chunked_content_provider lambda to be called within
-        // 3s
-        if (state->inference_status == PENDING) {
-          retries += 1;
-        }
-        if (state->inference_status != RUNNING)
-          LOG_INFO_REQUEST(request_id)
-              << "Wait for task to be released:" << state->task_id;
-        std::this_thread::sleep_for(std::chrono::milliseconds(100));
-      }
-      LOG_INFO_REQUEST(request_id) << "Task completed, release it";
-      // Request completed, release it
-      state->instance->llama.request_cancel(state->task_id);
-      LOG_INFO_REQUEST(request_id) << "Inference completed";
-    });
+    state->instance->queue->runTaskInQueue(
+        [cb = std::move(callback), state, data, chunked_content_provider, request_id]() {
+          state->task_id =
+              state->instance->llama.request_completion(data, false, false, -1);
+
+          // Start streaming response
+          auto resp = nitro_utils::nitroStreamResponse(chunked_content_provider,
+                                                       "chat_completions.txt");
+          cb(resp);
+
+          int retries = 0;
+
+          // Since this is an async task, we will wait for the task to be
+          // completed
+          while (state->inference_status != FINISHED && retries < 10) {
+            // Should wait chunked_content_provider lambda to be called within
+            // 3s
+            if (state->inference_status == PENDING) {
+              retries += 1;
+            }
+            if (state->inference_status != RUNNING)
+              LOG_INFO_REQUEST(request_id)
+                  << "Wait for task to be released:" << state->task_id;
+            std::this_thread::sleep_for(std::chrono::milliseconds(100));
+          }
+          LOG_INFO_REQUEST(request_id) << "Task completed, release it";
+          // Request completed, release it
+          state->instance->llama.request_cancel(state->task_id);
+          LOG_INFO_REQUEST(request_id) << "Inference completed";
+        });
   } else {
     queue->runTaskInQueue(
         [this, request_id, cb = std::move(callback), d = std::move(data)]() {
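
The queued task in the hunk above starts the streaming response and then waits for completion by polling the inference status every 100 ms, counting a retry only while the status is still PENDING and giving up after 10 such checks; the task is then released with request_cancel. Below is a stand-alone sketch of that poll-with-retries pattern; Status and wait_until_finished are illustrative names, not the controller's own types.

// Sketch of the poll-with-retries wait used by the queued inference task.
// Status and wait_until_finished are illustrative, not the controller's API.
#include <atomic>
#include <chrono>
#include <cstdio>
#include <thread>

enum class Status { PENDING, RUNNING, FINISHED };

// Poll `status` every 100 ms; count a retry only while it is still PENDING,
// and stop once it is FINISHED or the retry budget is spent.
static void wait_until_finished(const std::atomic<Status>& status,
                                int max_retries = 10) {
  int retries = 0;
  while (status.load() != Status::FINISHED && retries < max_retries) {
    if (status.load() == Status::PENDING) retries += 1;
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  }
}

int main() {
  std::atomic<Status> status{Status::PENDING};
  std::thread worker([&status] {
    std::this_thread::sleep_for(std::chrono::milliseconds(300));
    status = Status::RUNNING;
    std::this_thread::sleep_for(std::chrono::milliseconds(300));
    status = Status::FINISHED;
  });
  wait_until_finished(status);
  std::puts(status.load() == Status::FINISHED ? "finished" : "gave up");
  worker.join();
  return 0;
}
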
@@ -608,7 +607,10 @@ void llamaCPP::LoadModel(

 bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {
   gpt_params params;
-
+  // For model like nomic-embed-text-v1.5.f16.gguf, etc, we need to warmup model with flag embedding = true.
+  // So we use this variable to differentiate with other models
+  // TODO: in case embedded model only, we should reject completion request from user?
+  bool is_embedded_model = false;
   // By default will setting based on number of handlers
   if (jsonBody) {
     if (!jsonBody->operator[]("mmproj").isNull()) {
@@ -652,7 +654,9 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {

     params.n_gpu_layers = jsonBody->get("ngl", 100).asInt();
     params.n_ctx = jsonBody->get("ctx_len", 2048).asInt();
-    params.embedding = jsonBody->get("embedding", false).asBool();
+    is_embedded_model =
+        !(*jsonBody)["embedding"].isNull() && (*jsonBody)["embedding"].asBool();
+    params.embedding = jsonBody->get("embedding", true).asBool();
     // Check if n_parallel exists in jsonBody, if not, set to drogon_thread
     params.n_batch = jsonBody->get("n_batch", 512).asInt();
     params.n_parallel = jsonBody->get("n_parallel", 1).asInt();
@@ -709,13 +713,7 @@ bool llamaCPP::LoadModelImpl(std::shared_ptr<Json::Value> jsonBody) {

   LOG_INFO << "Started background task here!";
   backgroundThread = std::thread(&llamaCPP::BackgroundTask, this);
-
-  // For model like nomic-embed-text-v1.5.f16.gguf, etc, we don't need to warm up model.
-  // So we use this variable to differentiate with other models
-  // TODO: in case embedded model only, we should reject completion request from user?
-  if (!params.embedding) {
-    WarmupModel();
-  }
+  WarmupModel(is_embedded_model);
   return true;
 }

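The controllers/llamaCPP.cc hunks above restore the flow in which the "embedding" field of the load-model request drives warm-up: is_embedded_model is true only when the field is present and true, params.embedding now defaults to true, and WarmupModel(is_embedded_model) is always called, so embedding models are warmed up with the embedding flag set on the pseudo request. A small sketch of that derivation using jsoncpp's Json::Value follows; warm_up is a hypothetical stand-in for llamaCPP::WarmupModel.

// Sketch: how the "embedding" field of the load-model body maps to the
// warm-up call. warm_up() is a stand-in, not llamaCPP::WarmupModel itself.
#include <cstdio>
#include <json/json.h>  // jsoncpp, the library behind Json::Value here

static void warm_up(bool is_embedded_model) {
  // The real warm-up sends a pseudo completion ("Hello", n_predict = 2)
  // and forwards this flag as the embedding argument of request_completion.
  std::printf("warm-up with embedding flag = %s\n",
              is_embedded_model ? "true" : "false");
}

int main() {
  Json::Value jsonBody;
  jsonBody["embedding"] = true;  // e.g. a nomic-embed-text style model

  // Mirrors the restored LoadModelImpl logic:
  const bool is_embedded_model =
      !jsonBody["embedding"].isNull() && jsonBody["embedding"].asBool();
  const bool embedding_param = jsonBody.get("embedding", true).asBool();

  std::printf("params.embedding = %s\n", embedding_param ? "true" : "false");
  warm_up(is_embedded_model);
  return 0;
}
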
controllers/llamaCPP.h

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP>,
   void EmbeddingImpl(std::shared_ptr<Json::Value> jsonBody,
                      std::function<void(const HttpResponsePtr&)>&& callback);
   bool CheckModelLoaded(const std::function<void(const HttpResponsePtr&)>& callback);
-  void WarmupModel();
+  void WarmupModel(bool is_embedded_model);
   void BackgroundTask();
   void StopBackgroundTask();
 };
