From 30687eee3fe202c8e148466db498fe9e4c0e90ab Mon Sep 17 00:00:00 2001
From: vansangpfiev <vansangpfiev@gmail.com>
Date: Tue, 16 Apr 2024 09:47:52 +0700
Subject: [PATCH 1/6] feat: e2e embedding endpoint testing for linux and mac

---
 .../e2e-test-embedding-linux-and-mac.sh       | 104 ++++++++++++++++++
 .github/workflows/build.yml                   |  26 +++++
 2 files changed, 130 insertions(+)
 create mode 100644 .github/scripts/e2e-test-embedding-linux-and-mac.sh

diff --git a/.github/scripts/e2e-test-embedding-linux-and-mac.sh b/.github/scripts/e2e-test-embedding-linux-and-mac.sh
new file mode 100644
index 000000000..f62cfca39
--- /dev/null
+++ b/.github/scripts/e2e-test-embedding-linux-and-mac.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+## Example run command
+# ./linux-and-mac.sh './jan/plugins/@janhq/inference-plugin/dist/nitro/nitro_mac_arm64' https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/resolve/main/tinyllama-1.1b-chat-v0.3.Q2_K.gguf
+
+# Check for required arguments
+if [[ $# -ne 2 ]]; then
+    echo "Usage: $0 <path_to_binary> <url_to_download>"
+    exit 1
+fi
+
+rm /tmp/response1.log /tmp/response2.log /tmp/nitro.log
+
+BINARY_PATH=$1
+DOWNLOAD_URL=$2
+
+# Random port to ensure it's not used
+min=10000
+max=11000
+range=$((max - min + 1))
+PORT=$((RANDOM % range + min))
+
+# Start the binary file
+"$BINARY_PATH" 1 127.0.0.1 $PORT >/tmp/nitro.log &
+
+# Get the process id of the binary file
+pid=$!
+
+if ! ps -p $pid >/dev/null; then
+    echo "nitro failed to start. Logs:"
+    cat /tmp/nitro.log
+    exit 1
+fi
+
+# Wait for a few seconds to let the server start
+sleep 5
+
+# Check if /tmp/test-embedding exists, if not, download it
+if [[ ! -f "/tmp/test-embedding" ]]; then
+    curl --connect-timeout 300 $DOWNLOAD_URL --output /tmp/test-embedding
+fi
+
+# Run the curl commands
+response1=$(curl --connect-timeout 60 -o /tmp/response1.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/inferences/llamacpp/loadModel" \
+    --header 'Content-Type: application/json' \
+    --data '{
+    "llama_model_path": "/tmp/test-embedding",
+    "ctx_len": 50,
+    "ngl": 32,
+    "embedding": true,
+    "model_type": "embedding"
+}')
+
+if ! ps -p $pid >/dev/null; then
+    echo "nitro failed to load model. Logs:"
+    cat /tmp/nitro.log
+    exit 1
+fi
+
+response2=$(
+    curl --connect-timeout 60 -o /tmp/response2.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/embeddings" \
+        --header 'Content-Type: application/json' \
+        --header 'Accept: text/event-stream' \
+        --header 'Access-Control-Allow-Origin: *' \
+        --data '{
+          "input": "Hello",
+          "model": "test-embedding",
+          "encoding_format": "float"       
+     }'
+)
+
+error_occurred=0
+if [[ "$response1" -ne 200 ]]; then
+    echo "The first curl command failed with status code: $response1"
+    cat /tmp/response1.log
+    error_occurred=1
+fi
+
+if [[ "$response2" -ne 200 ]]; then
+    echo "The second curl command failed with status code: $response2"
+    cat /tmp/response2.log
+    error_occurred=1
+fi
+
+if [[ "$error_occurred" -eq 1 ]]; then
+    echo "Nitro test run failed!!!!!!!!!!!!!!!!!!!!!!"
+    echo "Nitro Error Logs:"
+    cat /tmp/nitro.log
+    kill $pid
+    exit 1
+fi
+
+echo "----------------------"
+echo "Log load model:"
+cat /tmp/response1.log
+
+echo "----------------------"
+echo "Log run test:"
+cat /tmp/response2.log
+
+echo "Nitro test run successfully!"
+
+# Kill the server process
+kill $pid
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 5d6f44115..c342c1a58 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -49,6 +49,7 @@ env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
   LLM_MODEL_URL: https://delta.jan.ai/tinyllama-1.1b-chat-v0.3.Q2_K.gguf
   WHISPER_MODEL_URL: https://delta.jan.ai/ggml-tiny-q5_1.bin
+  EMBEDDING_MODEL_URL: https://catalog.jan.ai/dist/models/embeds/nomic-embed-text-v1.5.f16.gguf
 
 jobs:
   create-draft-release:
@@ -188,6 +189,15 @@ jobs:
           cd nitro
           chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }}
           rm -rf uploads/
+      
+      - name: Run e2e testing - Embedding
+        shell: bash
+        if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }}
+        run: |
+          # run e2e testing
+          cd nitro
+          chmod +x ../.github/scripts/e2e-test-embedding-linux-and-mac.sh && ../.github/scripts/e2e-test-embedding-linux-and-mac.sh ./nitro ${{ env.EMBEDDING_MODEL_URL }}
+          rm -rf uploads/
 
       - name: Run e2e testing - Whisper.CPP
         shell: bash
@@ -309,6 +319,14 @@ jobs:
           cd nitro/
           chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }}
           rm -rf uploads/
+      
+      - name: Run e2e testing - Embedding
+        shell: bash
+        run: |
+          # run e2e testing
+          cd nitro/
+          chmod +x ../.github/scripts/e2e-test-embedding-linux-and-mac.sh && ../.github/scripts/e2e-test-embedding-linux-and-mac.sh ./nitro ${{ env.EMBEDDING_MODEL_URL }}
+          rm -rf uploads/
 
       - name: Run e2e testing - Whisper.CPP
         shell: bash
@@ -375,6 +393,14 @@ jobs:
           cd nitro
           chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }}
           rm -rf uploads/
+      
+      - name: Run e2e testing - Embedding
+        shell: bash
+        run: |
+          # run e2e testing
+          cd nitro
+          chmod +x ../.github/scripts/e2e-test-embedding-linux-and-mac.sh && ../.github/scripts/e2e-test-embedding-linux-and-mac.sh ./nitro ${{ env.EMBEDDING_MODEL_URL }}
+          rm -rf uploads/
 
       - name: Run e2e testing - Whisper.CPP
         shell: bash

From 1307204b432d02310ee6fe5d0f5a1e17e0e5b436 Mon Sep 17 00:00:00 2001
From: vansangpfiev <vansangpfiev@gmail.com>
Date: Tue, 16 Apr 2024 10:24:05 +0700
Subject: [PATCH 2/6] feat: e2e embedding endpoint testing for windows

---
 .../scripts/e2e-test-embedding-windows.bat    | 110 ++++++++++++++++++
 .github/workflows/build.yml                   |   8 ++
 2 files changed, 118 insertions(+)
 create mode 100644 .github/scripts/e2e-test-embedding-windows.bat

diff --git a/.github/scripts/e2e-test-embedding-windows.bat b/.github/scripts/e2e-test-embedding-windows.bat
new file mode 100644
index 000000000..9358230b1
--- /dev/null
+++ b/.github/scripts/e2e-test-embedding-windows.bat
@@ -0,0 +1,110 @@
+@echo off
+
+set "TEMP=C:\Users\%UserName%\AppData\Local\Temp"
+set "MODEL_PATH=%TEMP%\test-embedding"
+
+rem Check for required arguments
+if "%~2"=="" (
+    echo Usage: %~0 ^<path_to_binary^> ^<url_to_download^>
+    exit /b 1
+)
+
+set "BINARY_PATH=%~1"
+set "DOWNLOAD_URL=%~2"
+
+for %%i in ("%BINARY_PATH%") do set "BINARY_NAME=%%~nxi"
+
+echo BINARY_NAME=%BINARY_NAME%
+
+del %TEMP%\response1.log 2>nul
+del %TEMP%\response2.log 2>nul
+del %TEMP%\nitro.log 2>nul
+
+set /a min=9999
+set /a max=11000
+set /a range=max-min+1
+set /a PORT=%min% + %RANDOM% %% %range%
+
+rem Start the binary file
+start /B "" "%BINARY_PATH%" 1 "127.0.0.1" %PORT% > %TEMP%\nitro.log 2>&1
+
+ping -n 6 127.0.0.1 %PORT% > nul
+
+rem Capture the PID of the started process with "nitro" in its name
+for /f "tokens=2" %%a in ('tasklist /fi "imagename eq %BINARY_NAME%" /fo list ^| findstr /B "PID:"') do (
+    set "pid=%%a"
+)
+
+echo pid=%pid%
+
+if not defined pid (
+    echo nitro failed to start. Logs:
+    type %TEMP%\nitro.log
+    exit /b 1
+)
+
+rem Wait for a few seconds to let the server start
+
+rem Check if %TEMP%\testmodel exists, if not, download it
+if not exist "%MODEL_PATH%" (
+    curl.exe --connect-timeout 300 %DOWNLOAD_URL% --output "%MODEL_PATH%"
+)
+
+rem Define JSON strings for curl data
+call set "MODEL_PATH_STRING=%%MODEL_PATH:\=\\%%"
+set "curl_data1={\"llama_model_path\":\"%MODEL_PATH_STRING%\", \"embedding\": true, \"model_type\": \"embedding\"}"
+set "curl_data2={\"input\": \"Hello\", \"model\": \"test-embedding\", \"encoding_format\": \"float\"}"
+
+rem Print the values of curl_data1 and curl_data2 for debugging
+echo curl_data1=%curl_data1%
+echo curl_data2=%curl_data2%
+
+rem Run the curl commands and capture the status code
+curl.exe --connect-timeout 60 -o "%TEMP%\response1.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/loadModel" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1.log 2>&1
+
+curl.exe --connect-timeout 60 -o "%TEMP%\response2.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/v1/embeddings" ^
+--header "Content-Type: application/json" ^
+--header "Accept: text/event-stream" ^
+--header "Access-Control-Allow-Origin: *" ^
+--data "%curl_data2%" > %TEMP%\response2.log 2>&1
+
+set "error_occurred=0"
+
+rem Read the status codes from the log files
+for /f %%a in (%TEMP%\response1.log) do set "response1=%%a"
+for /f %%a in (%TEMP%\response2.log) do set "response2=%%a"
+
+if "%response1%" neq "200" (
+    echo The first curl command failed with status code: %response1%
+    type %TEMP%\response1.log
+    set "error_occurred=1"
+)
+
+if "%response2%" neq "200" (
+    echo The second curl command failed with status code: %response2%
+    type %TEMP%\response2.log
+    set "error_occurred=1"
+)
+
+if "%error_occurred%"=="1" (
+    echo Nitro test run failed!!!!!!!!!!!!!!!!!!!!!!
+    echo Nitro Error Logs:
+    type %TEMP%\nitro.log
+    taskkill /f /pid %pid%
+    exit /b 1
+)
+
+
+echo ----------------------
+echo Log load model:
+type %TEMP%\response1.log
+
+echo ----------------------
+echo "Log run test:"
+type %TEMP%\response2.log
+
+echo Nitro test run successfully!
+
+rem Kill the server process
+@REM taskkill /f /pid %pid%
+taskkill /f /im nitro.exe 2>nul || exit /B 0
\ No newline at end of file
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c342c1a58..bcb6095ac 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -548,6 +548,14 @@ jobs:
           ..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }}
           rmdir /S /Q .\build\Release\uploads
 
+      - name: Run e2e testing - Embedding
+        shell: cmd
+        if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }}
+        run: |
+          cd build\Release
+          ..\..\.github\scripts\e2e-test-embedding-windows.bat nitro.exe ${{ env.EMBEDDING_MODEL_URL }}
+          rmdir /S /Q .\build\Release\uploads
+
       - name: Run e2e testing - Whisper.cpp
         shell: cmd
         if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }}

From 5e216339c4867eca088ffcb6a7e558dc6b02789d Mon Sep 17 00:00:00 2001
From: vansangpfiev <vansangpfiev@gmail.com>
Date: Tue, 16 Apr 2024 11:43:10 +0700
Subject: [PATCH 3/6] fix: move e2e embedding linux and mac to llama

---
 .../e2e-test-embedding-linux-and-mac.sh       | 104 ------------------
 .../scripts/e2e-test-llama-linux-and-mac.sh   |  93 +++++++++++++---
 .github/workflows/build.yml                   |  31 +-----
 3 files changed, 83 insertions(+), 145 deletions(-)
 delete mode 100644 .github/scripts/e2e-test-embedding-linux-and-mac.sh

diff --git a/.github/scripts/e2e-test-embedding-linux-and-mac.sh b/.github/scripts/e2e-test-embedding-linux-and-mac.sh
deleted file mode 100644
index f62cfca39..000000000
--- a/.github/scripts/e2e-test-embedding-linux-and-mac.sh
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/bin/bash
-
-## Example run command
-# ./linux-and-mac.sh './jan/plugins/@janhq/inference-plugin/dist/nitro/nitro_mac_arm64' https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/resolve/main/tinyllama-1.1b-chat-v0.3.Q2_K.gguf
-
-# Check for required arguments
-if [[ $# -ne 2 ]]; then
-    echo "Usage: $0 <path_to_binary> <url_to_download>"
-    exit 1
-fi
-
-rm /tmp/response1.log /tmp/response2.log /tmp/nitro.log
-
-BINARY_PATH=$1
-DOWNLOAD_URL=$2
-
-# Random port to ensure it's not used
-min=10000
-max=11000
-range=$((max - min + 1))
-PORT=$((RANDOM % range + min))
-
-# Start the binary file
-"$BINARY_PATH" 1 127.0.0.1 $PORT >/tmp/nitro.log &
-
-# Get the process id of the binary file
-pid=$!
-
-if ! ps -p $pid >/dev/null; then
-    echo "nitro failed to start. Logs:"
-    cat /tmp/nitro.log
-    exit 1
-fi
-
-# Wait for a few seconds to let the server start
-sleep 5
-
-# Check if /tmp/test-embedding exists, if not, download it
-if [[ ! -f "/tmp/test-embedding" ]]; then
-    curl --connect-timeout 300 $DOWNLOAD_URL --output /tmp/test-embedding
-fi
-
-# Run the curl commands
-response1=$(curl --connect-timeout 60 -o /tmp/response1.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/inferences/llamacpp/loadModel" \
-    --header 'Content-Type: application/json' \
-    --data '{
-    "llama_model_path": "/tmp/test-embedding",
-    "ctx_len": 50,
-    "ngl": 32,
-    "embedding": true,
-    "model_type": "embedding"
-}')
-
-if ! ps -p $pid >/dev/null; then
-    echo "nitro failed to load model. Logs:"
-    cat /tmp/nitro.log
-    exit 1
-fi
-
-response2=$(
-    curl --connect-timeout 60 -o /tmp/response2.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/embeddings" \
-        --header 'Content-Type: application/json' \
-        --header 'Accept: text/event-stream' \
-        --header 'Access-Control-Allow-Origin: *' \
-        --data '{
-          "input": "Hello",
-          "model": "test-embedding",
-          "encoding_format": "float"       
-     }'
-)
-
-error_occurred=0
-if [[ "$response1" -ne 200 ]]; then
-    echo "The first curl command failed with status code: $response1"
-    cat /tmp/response1.log
-    error_occurred=1
-fi
-
-if [[ "$response2" -ne 200 ]]; then
-    echo "The second curl command failed with status code: $response2"
-    cat /tmp/response2.log
-    error_occurred=1
-fi
-
-if [[ "$error_occurred" -eq 1 ]]; then
-    echo "Nitro test run failed!!!!!!!!!!!!!!!!!!!!!!"
-    echo "Nitro Error Logs:"
-    cat /tmp/nitro.log
-    kill $pid
-    exit 1
-fi
-
-echo "----------------------"
-echo "Log load model:"
-cat /tmp/response1.log
-
-echo "----------------------"
-echo "Log run test:"
-cat /tmp/response2.log
-
-echo "Nitro test run successfully!"
-
-# Kill the server process
-kill $pid
diff --git a/.github/scripts/e2e-test-llama-linux-and-mac.sh b/.github/scripts/e2e-test-llama-linux-and-mac.sh
index e97c51f63..5b7b9771d 100644
--- a/.github/scripts/e2e-test-llama-linux-and-mac.sh
+++ b/.github/scripts/e2e-test-llama-linux-and-mac.sh
@@ -4,15 +4,16 @@
 # ./linux-and-mac.sh './jan/plugins/@janhq/inference-plugin/dist/nitro/nitro_mac_arm64' https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/resolve/main/tinyllama-1.1b-chat-v0.3.Q2_K.gguf
 
 # Check for required arguments
-if [[ $# -ne 2 ]]; then
-    echo "Usage: $0 <path_to_binary> <url_to_download>"
+if [[ $# -ne 3 ]]; then
+    echo "Usage: $0 <path_to_binary> <url_to_download_llm> <url_to_download_embedding>"
     exit 1
 fi
 
-rm /tmp/response1.log /tmp/response2.log /tmp/nitro.log
+rm -f /tmp/load-llm-model-res.log /tmp/completion-res.log /tmp/unload-model-res.log /tmp/load-embedding-model-res.log /tmp/embedding-res.log /tmp/nitro.log
 
 BINARY_PATH=$1
-DOWNLOAD_URL=$2
+DOWNLOAD_LLM_URL=$2
+DOWNLOAD_EMBEDDING_URL=$3
 
 # Random port to ensure it's not used
 min=10000
@@ -37,11 +38,16 @@ sleep 5
 
 # Check if /tmp/testllm exists, if not, download it
 if [[ ! -f "/tmp/testllm" ]]; then
-    curl --connect-timeout 300 $DOWNLOAD_URL --output /tmp/testllm
+    curl --connect-timeout 300 --fail --location "$DOWNLOAD_LLM_URL" --output /tmp/testllm
+fi
+
+# Check if /tmp/test-embedding exists, if not, download it
+if [[ ! -f "/tmp/test-embedding" ]]; then
+    curl --connect-timeout 300 --fail --location "$DOWNLOAD_EMBEDDING_URL" --output /tmp/test-embedding
 fi
 
 # Run the curl commands
-response1=$(curl --connect-timeout 60 -o /tmp/response1.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/inferences/llamacpp/loadModel" \
+response1=$(curl --connect-timeout 60 -o /tmp/load-llm-model-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/inferences/llamacpp/loadModel" \
     --header 'Content-Type: application/json' \
     --data '{
     "llama_model_path": "/tmp/testllm",
@@ -57,7 +63,7 @@ if ! ps -p $pid >/dev/null; then
 fi
 
 response2=$(
-    curl --connect-timeout 60 -o /tmp/response2.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/chat/completions" \
+    curl --connect-timeout 60 -o /tmp/completion-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/chat/completions" \
         --header 'Content-Type: application/json' \
         --header 'Accept: text/event-stream' \
         --header 'Access-Control-Allow-Origin: *' \
@@ -76,16 +82,65 @@ response2=$(
      }'
 )
 
+# unload model
+response3=$(curl --connect-timeout 60 -o /tmp/unload-model-res.log --request GET -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/inferences/llamacpp/unloadModel" \
+    --header 'Content-Type: application/json' \
+    --data '{
+    "llama_model_path": "/tmp/testllm"
+}')
+
+# load embedding model
+response4=$(curl --connect-timeout 60 -o /tmp/load-embedding-model-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/inferences/llamacpp/loadModel" \
+    --header 'Content-Type: application/json' \
+    --data '{
+    "llama_model_path": "/tmp/test-embedding",
+    "ctx_len": 50,
+    "ngl": 32,
+    "embedding": true,
+    "model_type": "embedding"
+}')
+
+# request embedding
+response5=$(
+    curl --connect-timeout 60 -o /tmp/embedding-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/v1/embeddings" \
+        --header 'Content-Type: application/json' \
+        --header 'Accept: text/event-stream' \
+        --header 'Access-Control-Allow-Origin: *' \
+        --data '{
+          "input": "Hello",
+          "model": "test-embedding",
+          "encoding_format": "float"       
+     }'
+)
+
 error_occurred=0
 if [[ "$response1" -ne 200 ]]; then
-    echo "The first curl command failed with status code: $response1"
-    cat /tmp/response1.log
+    echo "The load llm model curl command failed with status code: $response1"
+    cat /tmp/load-llm-model-res.log
     error_occurred=1
 fi
 
 if [[ "$response2" -ne 200 ]]; then
-    echo "The second curl command failed with status code: $response2"
-    cat /tmp/response2.log
+    echo "The completion curl command failed with status code: $response2"
+    cat /tmp/completion-res.log
+    error_occurred=1
+fi
+
+if [[ "$response3" -ne 200 ]]; then
+    echo "The unload model curl command failed with status code: $response3"
+    cat /tmp/unload-model-res.log
+    error_occurred=1
+fi
+
+if [[ "$response4" -ne 200 ]]; then
+    echo "The load embedding model curl command failed with status code: $response4"
+    cat /tmp/load-embedding-model-res.log
+    error_occurred=1
+fi
+
+if [[ "$response5" -ne 200 ]]; then
+    echo "The embedding curl command failed with status code: $response5"
+    cat /tmp/embedding-res.log
     error_occurred=1
 fi
 
@@ -99,11 +154,23 @@ fi
 
 echo "----------------------"
 echo "Log load model:"
-cat /tmp/response1.log
+cat /tmp/load-llm-model-res.log
+
+echo "----------------------"
+echo "Log run test:"
+cat /tmp/completion-res.log
+
+echo "----------------------"
+echo "Log unload model:"
+cat /tmp/unload-model-res.log
+
+echo "----------------------"
+echo "Log load embedding model:"
+cat /tmp/load-embedding-model-res.log
 
 echo "----------------------"
 echo "Log run test:"
-cat /tmp/response2.log
+cat /tmp/embedding-res.log
 
 echo "Nitro test run successfully!"
 
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index bcb6095ac..e1b57b4c1 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -187,16 +187,7 @@ jobs:
         run: |
           # run e2e testing
           cd nitro
-          chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }}
-          rm -rf uploads/
-      
-      - name: Run e2e testing - Embedding
-        shell: bash
-        if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }}
-        run: |
-          # run e2e testing
-          cd nitro
-          chmod +x ../.github/scripts/e2e-test-embedding-linux-and-mac.sh && ../.github/scripts/e2e-test-embedding-linux-and-mac.sh ./nitro ${{ env.EMBEDDING_MODEL_URL }}
+          chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }}
           rm -rf uploads/
 
       - name: Run e2e testing - Whisper.CPP
@@ -317,15 +308,7 @@ jobs:
         run: |
           # run e2e testing
           cd nitro/
-          chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }}
-          rm -rf uploads/
-      
-      - name: Run e2e testing - Embedding
-        shell: bash
-        run: |
-          # run e2e testing
-          cd nitro/
-          chmod +x ../.github/scripts/e2e-test-embedding-linux-and-mac.sh && ../.github/scripts/e2e-test-embedding-linux-and-mac.sh ./nitro ${{ env.EMBEDDING_MODEL_URL }}
+          chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }}
           rm -rf uploads/
 
       - name: Run e2e testing - Whisper.CPP
@@ -391,15 +374,7 @@ jobs:
         run: |
           # run e2e testing
           cd nitro
-          chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }}
-          rm -rf uploads/
-      
-      - name: Run e2e testing - Embedding
-        shell: bash
-        run: |
-          # run e2e testing
-          cd nitro
-          chmod +x ../.github/scripts/e2e-test-embedding-linux-and-mac.sh && ../.github/scripts/e2e-test-embedding-linux-and-mac.sh ./nitro ${{ env.EMBEDDING_MODEL_URL }}
+          chmod +x ../.github/scripts/e2e-test-llama-linux-and-mac.sh && ../.github/scripts/e2e-test-llama-linux-and-mac.sh ./nitro ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }}
           rm -rf uploads/
 
       - name: Run e2e testing - Whisper.CPP

From af83b1461fc68df5cb457866cb88d536b3ef50b6 Mon Sep 17 00:00:00 2001
From: vansangpfiev <vansangpfiev@gmail.com>
Date: Tue, 16 Apr 2024 13:40:43 +0700
Subject: [PATCH 4/6] fix: move e2e embedding windows to llama

---
 .github/scripts/e2e-test-llama-windows.bat | 84 ++++++++++++++++++----
 1 file changed, 71 insertions(+), 13 deletions(-)

diff --git a/.github/scripts/e2e-test-llama-windows.bat b/.github/scripts/e2e-test-llama-windows.bat
index a6526f358..9ceb9e5c5 100644
--- a/.github/scripts/e2e-test-llama-windows.bat
+++ b/.github/scripts/e2e-test-llama-windows.bat
@@ -1,16 +1,18 @@
 @echo off
 
 set "TEMP=C:\Users\%UserName%\AppData\Local\Temp"
-set "MODEL_PATH=%TEMP%\testllm"
+set "MODEL_LLM_PATH=%TEMP%\testllm"
+set "MODEL_EMBEDDING_PATH=%TEMP%\test-embedding"
 
 rem Check for required arguments
-if "%~2"=="" (
-    echo Usage: %~0 ^<path_to_binary^> ^<url_to_download^>
+if "%~3"=="" (
+    echo Usage: %~0 ^<path_to_binary^> ^<url_to_download_llm^> ^<url_to_download_embedding^>
     exit /b 1
 )
 
 set "BINARY_PATH=%~1"
-set "DOWNLOAD_URL=%~2"
+set "DOWNLOAD_LLM_URL=%~2"
+set "DOWNLOAD_EMBEDDING_URL=%~3"
 
 for %%i in ("%BINARY_PATH%") do set "BINARY_NAME=%%~nxi"
 
@@ -18,6 +20,9 @@ echo BINARY_NAME=%BINARY_NAME%
 
 del %TEMP%\response1.log 2>nul
 del %TEMP%\response2.log 2>nul
+del %TEMP%\response3.log 2>nul
+del %TEMP%\response4.log 2>nul
+del %TEMP%\response5.log 2>nul
 del %TEMP%\nitro.log 2>nul
 
 set /a min=9999
@@ -46,33 +51,56 @@ if not defined pid (
 rem Wait for a few seconds to let the server start
 
 rem Check if %TEMP%\testmodel exists, if not, download it
-if not exist "%MODEL_PATH%" (
-    curl.exe --connect-timeout 300 %DOWNLOAD_URL% --output "%MODEL_PATH%"
+if not exist "%MODEL_LLM_PATH%" (
+    curl.exe --connect-timeout 300 --fail --location "%DOWNLOAD_LLM_URL%" --output "%MODEL_LLM_PATH%"
+)
+
+if not exist "%MODEL_EMBEDDING_PATH%" (
+    curl.exe --connect-timeout 300 --fail --location "%DOWNLOAD_EMBEDDING_URL%" --output "%MODEL_EMBEDDING_PATH%"
 )
 
 rem Define JSON strings for curl data
-call set "MODEL_PATH_STRING=%%MODEL_PATH:\=\\%%"
-set "curl_data1={\"llama_model_path\":\"%MODEL_PATH_STRING%\"}"
+call set "MODEL_LLM_PATH_STRING=%%MODEL_LLM_PATH:\=\\%%"
+call set "MODEL_EMBEDDING_PATH_STRING=%%MODEL_EMBEDDING_PATH:\=\\%%"
+set "curl_data1={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}"
 set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":true,\"model\":\"gpt-3.5-turbo\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"
+set "curl_data3={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}"
+set "curl_data4={\"llama_model_path\":\"%MODEL_EMBEDDING_PATH_STRING%\", \"embedding\": true, \"model_type\": \"embedding\"}"
+set "curl_data5={\"input\": \"Hello\", \"model\": \"test-embedding\", \"encoding_format\": \"float\"}"
 
-rem Print the values of curl_data1 and curl_data2 for debugging
+rem Print the values of curl_data for debugging
 echo curl_data1=%curl_data1%
 echo curl_data2=%curl_data2%
+echo curl_data3=%curl_data3%
+echo curl_data4=%curl_data4%
+echo curl_data5=%curl_data5%
 
 rem Run the curl commands and capture the status code
 curl.exe --connect-timeout 60 -o "%TEMP%\response1.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/loadModel" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1.log 2>&1
 
 curl.exe --connect-timeout 60 -o "%TEMP%\response2.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/chat_completion" ^
 --header "Content-Type: application/json" ^
---header "Accept: text/event-stream" ^
---header "Access-Control-Allow-Origin: *" ^
 --data "%curl_data2%" > %TEMP%\response2.log 2>&1
 
+rem give it some time to receive full response
+timeout /t 5
+
+curl.exe --connect-timeout 60 -o "%TEMP%\response3.log" --request GET -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/unloadModel" --header "Content-Type: application/json" --data "%curl_data3%" > %TEMP%\response3.log 2>&1
+
+curl.exe --connect-timeout 60 -o "%TEMP%\response4.log" --request POST -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/loadModel" --header "Content-Type: application/json" --data "%curl_data4%" > %TEMP%\response4.log 2>&1
+
+curl.exe --connect-timeout 60 -o "%TEMP%\response5.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/v1/embeddings" ^
+--header "Content-Type: application/json" ^
+--data "%curl_data5%" > %TEMP%\response5.log 2>&1
+
 set "error_occurred=0"
 
 rem Read the status codes from the log files
 for /f %%a in (%TEMP%\response1.log) do set "response1=%%a"
 for /f %%a in (%TEMP%\response2.log) do set "response2=%%a"
+for /f %%a in (%TEMP%\response3.log) do set "response3=%%a"
+for /f %%a in (%TEMP%\response4.log) do set "response4=%%a"
+for /f %%a in (%TEMP%\response5.log) do set "response5=%%a"
 
 if "%response1%" neq "200" (
     echo The first curl command failed with status code: %response1%
@@ -86,6 +114,24 @@ if "%response2%" neq "200" (
     set "error_occurred=1"
 )
 
+if "%response3%" neq "200" (
+    echo The third curl command failed with status code: %response3%
+    type %TEMP%\response3.log
+    set "error_occurred=1"
+)
+
+if "%response4%" neq "200" (
+    echo The fourth curl command failed with status code: %response4%
+    type %TEMP%\response4.log
+    set "error_occurred=1"
+)
+
+if "%response5%" neq "200" (
+    echo The fifth curl command failed with status code: %response5%
+    type %TEMP%\response5.log
+    set "error_occurred=1"
+)
+
 if "%error_occurred%"=="1" (
     echo Nitro test run failed!!!!!!!!!!!!!!!!!!!!!!
     echo Nitro Error Logs:
@@ -96,13 +142,25 @@ if "%error_occurred%"=="1" (
 
 
 echo ----------------------
-echo Log load model:
+echo Log load llm model:
 type %TEMP%\response1.log
 
 echo ----------------------
-echo "Log run test:"
+echo Log run test:
 type %TEMP%\response2.log
 
+echo ----------------------
+echo Log unload model:
+type %TEMP%\response3.log
+
+echo ----------------------
+echo Log load embedding model:
+type %TEMP%\response4.log
+
+echo ----------------------
+echo Log run embedding test:
+type %TEMP%\response5.log
+
 echo Nitro test run successfully!
 
 rem Kill the server process

From abcbed2debf55a90d42fce59b7967befb23529fd Mon Sep 17 00:00:00 2001
From: vansangpfiev <vansangpfiev@gmail.com>
Date: Tue, 16 Apr 2024 13:55:42 +0700
Subject: [PATCH 5/6] fix: remove embedding e2e

---
 .../scripts/e2e-test-embedding-windows.bat    | 110 ------------------
 .github/workflows/build.yml                   |  10 +-
 2 files changed, 1 insertion(+), 119 deletions(-)
 delete mode 100644 .github/scripts/e2e-test-embedding-windows.bat

diff --git a/.github/scripts/e2e-test-embedding-windows.bat b/.github/scripts/e2e-test-embedding-windows.bat
deleted file mode 100644
index 9358230b1..000000000
--- a/.github/scripts/e2e-test-embedding-windows.bat
+++ /dev/null
@@ -1,110 +0,0 @@
-@echo off
-
-set "TEMP=C:\Users\%UserName%\AppData\Local\Temp"
-set "MODEL_PATH=%TEMP%\test-embedding"
-
-rem Check for required arguments
-if "%~2"=="" (
-    echo Usage: %~0 ^<path_to_binary^> ^<url_to_download^>
-    exit /b 1
-)
-
-set "BINARY_PATH=%~1"
-set "DOWNLOAD_URL=%~2"
-
-for %%i in ("%BINARY_PATH%") do set "BINARY_NAME=%%~nxi"
-
-echo BINARY_NAME=%BINARY_NAME%
-
-del %TEMP%\response1.log 2>nul
-del %TEMP%\response2.log 2>nul
-del %TEMP%\nitro.log 2>nul
-
-set /a min=9999
-set /a max=11000
-set /a range=max-min+1
-set /a PORT=%min% + %RANDOM% %% %range%
-
-rem Start the binary file
-start /B "" "%BINARY_PATH%" 1 "127.0.0.1" %PORT% > %TEMP%\nitro.log 2>&1
-
-ping -n 6 127.0.0.1 %PORT% > nul
-
-rem Capture the PID of the started process with "nitro" in its name
-for /f "tokens=2" %%a in ('tasklist /fi "imagename eq %BINARY_NAME%" /fo list ^| findstr /B "PID:"') do (
-    set "pid=%%a"
-)
-
-echo pid=%pid%
-
-if not defined pid (
-    echo nitro failed to start. Logs:
-    type %TEMP%\nitro.log
-    exit /b 1
-)
-
-rem Wait for a few seconds to let the server start
-
-rem Check if %TEMP%\testmodel exists, if not, download it
-if not exist "%MODEL_PATH%" (
-    curl.exe --connect-timeout 300 %DOWNLOAD_URL% --output "%MODEL_PATH%"
-)
-
-rem Define JSON strings for curl data
-call set "MODEL_PATH_STRING=%%MODEL_PATH:\=\\%%"
-set "curl_data1={\"llama_model_path\":\"%MODEL_PATH_STRING%\", \"embedding\": true, \"model_type\": \"embedding\"}"
-set "curl_data2={\"input\": \"Hello\", \"model\": \"test-embedding\", \"encoding_format\": \"float\"}"
-
-rem Print the values of curl_data1 and curl_data2 for debugging
-echo curl_data1=%curl_data1%
-echo curl_data2=%curl_data2%
-
-rem Run the curl commands and capture the status code
-curl.exe --connect-timeout 60 -o "%TEMP%\response1.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/loadModel" --header "Content-Type: application/json" --data "%curl_data1%" > %TEMP%\response1.log 2>&1
-
-curl.exe --connect-timeout 60 -o "%TEMP%\response2.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/v1/embeddings" ^
---header "Content-Type: application/json" ^
---header "Accept: text/event-stream" ^
---header "Access-Control-Allow-Origin: *" ^
---data "%curl_data2%" > %TEMP%\response2.log 2>&1
-
-set "error_occurred=0"
-
-rem Read the status codes from the log files
-for /f %%a in (%TEMP%\response1.log) do set "response1=%%a"
-for /f %%a in (%TEMP%\response2.log) do set "response2=%%a"
-
-if "%response1%" neq "200" (
-    echo The first curl command failed with status code: %response1%
-    type %TEMP%\response1.log
-    set "error_occurred=1"
-)
-
-if "%response2%" neq "200" (
-    echo The second curl command failed with status code: %response2%
-    type %TEMP%\response2.log
-    set "error_occurred=1"
-)
-
-if "%error_occurred%"=="1" (
-    echo Nitro test run failed!!!!!!!!!!!!!!!!!!!!!!
-    echo Nitro Error Logs:
-    type %TEMP%\nitro.log
-    taskkill /f /pid %pid%
-    exit /b 1
-)
-
-
-echo ----------------------
-echo Log load model:
-type %TEMP%\response1.log
-
-echo ----------------------
-echo "Log run test:"
-type %TEMP%\response2.log
-
-echo Nitro test run successfully!
-
-rem Kill the server process
-@REM taskkill /f /pid %pid%
-taskkill /f /im nitro.exe 2>nul || exit /B 0
\ No newline at end of file
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index e1b57b4c1..2705ba701 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -520,15 +520,7 @@ jobs:
         if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }}
         run: |
           cd build\Release
-          ..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }}
-          rmdir /S /Q .\build\Release\uploads
-
-      - name: Run e2e testing - Embedding
-        shell: cmd
-        if: ${{ matrix.build != 'arm64' && matrix.build != 'amd64-vulkan' && matrix.build != 'amd64-avx512' }}
-        run: |
-          cd build\Release
-          ..\..\.github\scripts\e2e-test-embedding-windows.bat nitro.exe ${{ env.EMBEDDING_MODEL_URL }}
+          ..\..\.github\scripts\e2e-test-llama-windows.bat nitro.exe ${{ env.LLM_MODEL_URL }} ${{ env.EMBEDDING_MODEL_URL }}
           rmdir /S /Q .\build\Release\uploads
 
       - name: Run e2e testing - Whisper.cpp

From bb5a6b1bcad06468bad5b85c3cb5e85ccd18c27e Mon Sep 17 00:00:00 2001
From: vansangpfiev <vansangpfiev@gmail.com>
Date: Tue, 16 Apr 2024 14:25:45 +0700
Subject: [PATCH 6/6] fix: e2e windows - disable streaming, drop 5s wait

---
 .github/scripts/e2e-test-llama-windows.bat | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.github/scripts/e2e-test-llama-windows.bat b/.github/scripts/e2e-test-llama-windows.bat
index 9ceb9e5c5..cddca1e0b 100644
--- a/.github/scripts/e2e-test-llama-windows.bat
+++ b/.github/scripts/e2e-test-llama-windows.bat
@@ -63,7 +63,7 @@ rem Define JSON strings for curl data
 call set "MODEL_LLM_PATH_STRING=%%MODEL_LLM_PATH:\=\\%%"
 call set "MODEL_EMBEDDING_PATH_STRING=%%MODEL_EMBEDDING_PATH:\=\\%%"
 set "curl_data1={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}"
-set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":true,\"model\":\"gpt-3.5-turbo\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"
+set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":false,\"model\":\"gpt-3.5-turbo\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"
 set "curl_data3={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}"
 set "curl_data4={\"llama_model_path\":\"%MODEL_EMBEDDING_PATH_STRING%\", \"embedding\": true, \"model_type\": \"embedding\"}"
 set "curl_data5={\"input\": \"Hello\", \"model\": \"test-embedding\", \"encoding_format\": \"float\"}"
@@ -82,9 +82,6 @@ curl.exe --connect-timeout 60 -o "%TEMP%\response2.log" -s -w "%%{http_code}" --
 --header "Content-Type: application/json" ^
 --data "%curl_data2%" > %TEMP%\response2.log 2>&1
 
-rem give it some time to receive full response
-timeout /t 5
-
 curl.exe --connect-timeout 60 -o "%TEMP%\response3.log" --request GET -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/unloadModel" --header "Content-Type: application/json" --data "%curl_data3%" > %TEMP%\response3.log 2>&1
 
 curl.exe --connect-timeout 60 -o "%TEMP%\response4.log" --request POST -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/inferences/llamacpp/loadModel" --header "Content-Type: application/json" --data "%curl_data4%" > %TEMP%\response4.log 2>&1