From 876790f57a959608c9a44ea5ca95a5e9ff2c7ec1 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Tue, 16 Apr 2024 17:44:39 +0700
Subject: [PATCH 1/3] fix: e2e testing for linux and mac

---
 .github/scripts/e2e-test-llama-linux-and-mac.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/.github/scripts/e2e-test-llama-linux-and-mac.sh b/.github/scripts/e2e-test-llama-linux-and-mac.sh
index 5b7b9771d..7d888e08f 100644
--- a/.github/scripts/e2e-test-llama-linux-and-mac.sh
+++ b/.github/scripts/e2e-test-llama-linux-and-mac.sh
@@ -65,14 +65,12 @@ fi
 response2=$(
     curl --connect-timeout 60 -o /tmp/completion-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/chat/completions" \
         --header 'Content-Type: application/json' \
-        --header 'Accept: text/event-stream' \
-        --header 'Access-Control-Allow-Origin: *' \
         --data '{
             "messages": [
                 {"content": "Hello there", "role": "assistant"},
                 {"content": "Write a long and sad story for me", "role": "user"}
             ],
-            "stream": true,
+            "stream": false,
             "model": "gpt-3.5-turbo",
             "max_tokens": 50,
             "stop": ["hello"],

From 849f7b4abd465f0d0773d87f37fefe0bcf4c48e9 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Tue, 16 Apr 2024 19:30:26 +0700
Subject: [PATCH 2/3] fix: thread hang if unload model when still receiving
 stream data

---
 controllers/llamaCPP.cc | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/controllers/llamaCPP.cc b/controllers/llamaCPP.cc
index e4f33a27e..69284d6e9 100644
--- a/controllers/llamaCPP.cc
+++ b/controllers/llamaCPP.cc
@@ -190,8 +190,7 @@ void llamaCPP::InferenceImpl(
   if (llama.model_type == ModelType::EMBEDDING) {
     LOG_WARN << "Not support completion for embedding model";
     Json::Value jsonResp;
-    jsonResp["message"] =
-        "Not support completion for embedding model";
+    jsonResp["message"] = "Not support completion for embedding model";
     auto resp = nitro_utils::nitroHttpJsonResponse(jsonResp);
     resp->setStatusCode(drogon::k400BadRequest);
     callback(resp);
@@ -429,7 +428,8 @@ void llamaCPP::InferenceImpl(
 
   // Since this is an async task, we will wait for the task to be
   // completed
-  while (state->inference_status != FINISHED && retries < 10) {
+  while (state->inference_status != FINISHED && retries < 10 &&
+         state->instance->llama.model_loaded_external) {
     // Should wait chunked_content_provider lambda to be called within
     // 3s
     if (state->inference_status == PENDING) {
@@ -748,9 +748,10 @@ void llamaCPP::StopBackgroundTask() {
   if (llama.model_loaded_external) {
     llama.model_loaded_external = false;
     llama.condition_tasks.notify_one();
-    LOG_INFO << "Background task stopped! ";
+    LOG_INFO << "Stopping background task! ";
     if (backgroundThread.joinable()) {
       backgroundThread.join();
     }
+    LOG_INFO << "Background task stopped! ";
   }
 }

From 9608a0c905ff84c3ce85e475d6d0555d2b9240b7 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Tue, 16 Apr 2024 19:30:49 +0700
Subject: [PATCH 3/3] Revert "fix: e2e testing for linux and mac"

This reverts commit 876790f57a959608c9a44ea5ca95a5e9ff2c7ec1.
---
 .github/scripts/e2e-test-llama-linux-and-mac.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/scripts/e2e-test-llama-linux-and-mac.sh b/.github/scripts/e2e-test-llama-linux-and-mac.sh
index 7d888e08f..5b7b9771d 100644
--- a/.github/scripts/e2e-test-llama-linux-and-mac.sh
+++ b/.github/scripts/e2e-test-llama-linux-and-mac.sh
@@ -65,12 +65,14 @@ fi
 response2=$(
     curl --connect-timeout 60 -o /tmp/completion-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/chat/completions" \
         --header 'Content-Type: application/json' \
+        --header 'Accept: text/event-stream' \
+        --header 'Access-Control-Allow-Origin: *' \
         --data '{
             "messages": [
                 {"content": "Hello there", "role": "assistant"},
                 {"content": "Write a long and sad story for me", "role": "user"}
             ],
-            "stream": false,
+            "stream": true,
             "model": "gpt-3.5-turbo",
             "max_tokens": 50,
             "stop": ["hello"],
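
Notes on the series: PATCH 1/3 temporarily switched the e2e chat-completion
test to a non-streaming request and PATCH 3/3 reverts it exactly, so the net
change is PATCH 2/3's shutdown fix. That fix follows a common pattern: a
polling wait loop must also observe the unload flag, and the unloader must
clear the flag before waking and joining the worker; otherwise a request
still draining stream data can keep its thread spinning forever. Below is a
minimal, self-contained C++ sketch of that pattern; the names (Worker,
model_loaded, WaitForCompletion, Stop) are illustrative stand-ins, not
nitro's actual API.

// Sketch only: the shutdown-aware wait pattern from PATCH 2/3.
// All names below are hypothetical, not nitro's actual code.
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <thread>

enum class Status { PENDING, RUNNING, FINISHED };

struct Worker {
  std::atomic<bool> model_loaded{true};           // cleared by Stop()
  std::atomic<Status> inference_status{Status::PENDING};
  std::condition_variable condition_tasks;        // would wake a task loop (not shown)
  std::thread background_thread;

  // Before the fix, this loop checked only the status and the retry
  // budget, so an unload during streaming left it spinning. The extra
  // model_loaded check is the essence of the patch.
  void WaitForCompletion() {
    int retries = 0;
    while (inference_status != Status::FINISHED && retries < 10 &&
           model_loaded) {
      if (inference_status == Status::PENDING) {
        ++retries;                                // task not picked up yet
      }
      std::this_thread::sleep_for(std::chrono::milliseconds(300));
    }
  }

  // Mirrors StopBackgroundTask(): clear the flag first, wake any sleeper,
  // then join the background thread.
  void Stop() {
    if (model_loaded.exchange(false)) {
      condition_tasks.notify_one();
      std::cout << "Stopping background task!\n";
      if (background_thread.joinable()) background_thread.join();
      std::cout << "Background task stopped!\n";
    }
  }
};

Logging both before and after join(), as the patch does, makes a shutdown
hang visible: if only "Stopping background task!" ever appears in the log,
the join is stuck and the wait loop is not observing the unload flag.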