janhq · vansangpfiev · Aug 5, 2024 · Aug 2, 2024 · Aug 2, 2024 · Aug 2, 2024
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -101,6 +101,14 @@ jobs:
             vulkan: true
             sccache: true
             sccache-conf-path: "/tmp/sccache.conf"
+          - os: "linux"
+            name: "amd64-noavx-cuda-11-7"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: "/tmp/sccache.conf"
           - os: "linux"
             name: "amd64-avx2-cuda-11-7"
             runs-on: "ubuntu-20-04-cuda-11-7"
@@ -125,6 +133,14 @@ jobs:
             vulkan: false
             sccache: true
             sccache-conf-path: "/tmp/sccache.conf"
+          - os: "linux"
+            name: "amd64-noavx-cuda-12-0"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: "/tmp/sccache.conf"
           - os: "linux"
             name: "amd64-avx2-cuda-12-0"
             runs-on: "ubuntu-20-04-cuda-12-0"
@@ -205,6 +221,14 @@ jobs:
             vulkan: true
             sccache: false
             sccache-conf-path: ""
+          - os: "windows"
+            name: "amd64-noavx-cuda-12-0"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: 'C:\sccache.conf'
           - os: "windows"
             name: "amd64-avx2-cuda-12-0"
             runs-on: "windows-cuda-12-0"
@@ -229,6 +253,14 @@ jobs:
             vulkan: false
             sccache: true
             sccache-conf-path: 'C:\sccache.conf'
+          - os: "windows"
+            name: "amd64-noavx-cuda-11-7"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: 'C:\sccache.conf'
           - os: "windows"
             name: "amd64-avx2-cuda-11-7"
             runs-on: "windows-cuda-11-7"

diff --git a/.github/workflows/nightly-build.yml b/.github/workflows/nightly-build.yml
@@ -101,6 +101,14 @@ jobs:
             vulkan: true
             sccache: true
             sccache-conf-path: "/tmp/sccache.conf"
+          - os: "linux"
+            name: "amd64-noavx-cuda-11-7"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: "/tmp/sccache.conf"
           - os: "linux"
             name: "amd64-avx2-cuda-11-7"
             runs-on: "ubuntu-20-04-cuda-11-7"
@@ -125,6 +133,14 @@ jobs:
             vulkan: false
             sccache: true
             sccache-conf-path: "/tmp/sccache.conf"
+          - os: "linux"
+            name: "amd64-noavx-cuda-12-0"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: "/tmp/sccache.conf"
           - os: "linux"
             name: "amd64-avx2-cuda-12-0"
             runs-on: "ubuntu-20-04-cuda-12-0"
@@ -205,6 +221,14 @@ jobs:
             vulkan: true
             sccache: false
             sccache-conf-path: ""
+          - os: "windows"
+            name: "amd64-noavx-cuda-12-0"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: 'C:\sccache.conf'
           - os: "windows"
             name: "amd64-avx2-cuda-12-0"
             runs-on: "windows-cuda-12-0"
@@ -229,6 +253,14 @@ jobs:
             vulkan: false
             sccache: true
             sccache-conf-path: 'C:\sccache.conf'
+          - os: "windows"
+            name: "amd64-noavx-cuda-11-7"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: 'C:\sccache.conf'
           - os: "windows"
             name: "amd64-avx2-cuda-11-7"
             runs-on: "windows-cuda-11-7"

diff --git a/.github/workflows/template-quality-gate-pr.yml b/.github/workflows/template-quality-gate-pr.yml
@@ -68,6 +68,14 @@ jobs:
             vulkan: true
             sccache: true
             sccache-conf-path: "/tmp/sccache.conf"
+          - os: "linux"
+            name: "amd64-noavx-cuda-11-7"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: "/tmp/sccache.conf"
           - os: "linux"
             name: "amd64-avx2-cuda-11-7"
             runs-on: "ubuntu-20-04-cuda-11-7"
@@ -92,6 +100,14 @@ jobs:
             vulkan: false
             sccache: true
             sccache-conf-path: "/tmp/sccache.conf"
+          - os: "linux"
+            name: "amd64-noavx-cuda-12-0"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: "/tmp/sccache.conf"
           - os: "linux"
             name: "amd64-avx2-cuda-12-0"
             runs-on: "ubuntu-20-04-cuda-12-0"
@@ -172,6 +188,14 @@ jobs:
             vulkan: true
             sccache: false
             sccache-conf-path: ""
+          - os: "windows"
+            name: "amd64-noavx-cuda-12-0"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: 'C:\sccache.conf'
           - os: "windows"
             name: "amd64-avx2-cuda-12-0"
             runs-on: "windows-cuda-12-0"
@@ -196,6 +220,14 @@ jobs:
             vulkan: false
             sccache: true
             sccache-conf-path: 'C:\sccache.conf'
+          - os: "windows"
+            name: "amd64-noavx-cuda-11-7"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: 'C:\sccache.conf'
           - os: "windows"
             name: "amd64-avx2-cuda-11-7"
             runs-on: "windows-cuda-11-7"

diff --git a/.github/workflows/template-quality-gate-submodule.yml b/.github/workflows/template-quality-gate-submodule.yml
@@ -68,6 +68,14 @@ jobs:
             vulkan: true
             sccache: true
             sccache-conf-path: "/tmp/sccache.conf"
+          - os: "linux"
+            name: "amd64-noavx-cuda-11-7"
+            runs-on: "ubuntu-20-04-cuda-11-7"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: "/tmp/sccache.conf"
           - os: "linux"
             name: "amd64-avx2-cuda-11-7"
             runs-on: "ubuntu-20-04-cuda-11-7"
@@ -92,6 +100,14 @@ jobs:
             vulkan: false
             sccache: true
             sccache-conf-path: "/tmp/sccache.conf"
+          - os: "linux"
+            name: "amd64-noavx-cuda-12-0"
+            runs-on: "ubuntu-20-04-cuda-12-0"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: "/tmp/sccache.conf"
           - os: "linux"
             name: "amd64-avx2-cuda-12-0"
             runs-on: "ubuntu-20-04-cuda-12-0"
@@ -172,6 +188,14 @@ jobs:
             vulkan: true
             sccache: false
             sccache-conf-path: ""
+          - os: "windows"
+            name: "amd64-noavx-cuda-12-0"
+            runs-on: "windows-cuda-12-0"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: 'C:\sccache.conf'
           - os: "windows"
             name: "amd64-avx2-cuda-12-0"
             runs-on: "windows-cuda-12-0"
@@ -196,6 +220,14 @@ jobs:
             vulkan: false
             sccache: true
             sccache-conf-path: 'C:\sccache.conf'
+          - os: "windows"
+            name: "amd64-noavx-cuda-11-7"
+            runs-on: "windows-cuda-11-7"
+            cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
+            run-e2e: false
+            vulkan: false
+            sccache: true
+            sccache-conf-path: 'C:\sccache.conf'
           - os: "windows"
             name: "amd64-avx2-cuda-11-7"
             runs-on: "windows-cuda-11-7"

diff --git a/examples/server/server.cc b/examples/server/server.cc
@@ -49,6 +49,24 @@ class Server {
   };
 };
 
+// Wrapper that funnels every parse through one Json::Reader instance while
+// holding a mutex, so calls to Parse() on the same SyncJsonReader are
+// serialized on the reader `r`.
+struct SyncJsonReader {
+ public:
+  // Parses `document` into `root` while holding the internal mutex.
+  // Returns the result of Json::Reader::parse() so that callers can detect
+  // malformed input; callers that ignore the return value keep working.
+  bool Parse(const std::string& document, Json::Value& root) {
+    std::lock_guard<std::mutex> l(m);
+    return r.parse(document, root);
+  }
+
+ private:
+  Json::Reader r;  // the single reader instance; only used while `m` is held
+  std::mutex m;    // guards `r`
+};
+
 std::function<void(int)> shutdown_handler;
 std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
 
@@ -78,7 +90,7 @@ int main(int argc, char** argv) {
   }
 
   Server server;
-  Json::Reader r;
+  SyncJsonReader r;
   auto svr = std::make_unique<httplib::Server>();
 
   if (!svr->bind_to_port(hostname, port)) {
@@ -127,7 +139,7 @@ int main(int argc, char** argv) {
     resp.set_header("Access-Control-Allow-Origin",
                     req.get_header_value("Origin"));
     auto req_body = std::make_shared<Json::Value>();
-    r.parse(req.body, *req_body);
+    r.Parse(req.body, *req_body);
     server.engine_->LoadModel(
         req_body, [&server, &resp](Json::Value status, Json::Value res) {
           resp.set_content(res.toStyledString().c_str(),
@@ -141,7 +153,7 @@ int main(int argc, char** argv) {
     resp.set_header("Access-Control-Allow-Origin",
                     req.get_header_value("Origin"));
     auto req_body = std::make_shared<Json::Value>();
-    r.parse(req.body, *req_body);
+    r.Parse(req.body, *req_body);
     server.engine_->UnloadModel(
         req_body, [&server, &resp](Json::Value status, Json::Value res) {
           resp.set_content(res.toStyledString().c_str(),
@@ -155,7 +167,7 @@ int main(int argc, char** argv) {
     resp.set_header("Access-Control-Allow-Origin",
                     req.get_header_value("Origin"));
     auto req_body = std::make_shared<Json::Value>();
-    r.parse(req.body, *req_body);
+    r.Parse(req.body, *req_body);
     bool is_stream = (*req_body).get("stream", false).asBool();
     // This is an async call, need to use queue
     auto q = std::make_shared<SyncQueue>();
@@ -175,7 +187,7 @@ int main(int argc, char** argv) {
     resp.set_header("Access-Control-Allow-Origin",
                     req.get_header_value("Origin"));
     auto req_body = std::make_shared<Json::Value>();
-    r.parse(req.body, *req_body);
+    r.Parse(req.body, *req_body);
     // This is an async call, need to use queue
     SyncQueue q;
     server.engine_->HandleEmbedding(
@@ -190,7 +202,7 @@ int main(int argc, char** argv) {
     resp.set_header("Access-Control-Allow-Origin",
                     req.get_header_value("Origin"));
     auto req_body = std::make_shared<Json::Value>();
-    r.parse(req.body, *req_body);
+    r.Parse(req.body, *req_body);
     server.engine_->GetModelStatus(
         req_body, [&server, &resp](Json::Value status, Json::Value res) {
           resp.set_content(res.toStyledString().c_str(),
@@ -204,7 +216,7 @@ int main(int argc, char** argv) {
     resp.set_header("Access-Control-Allow-Origin",
                     req.get_header_value("Origin"));
     auto req_body = std::make_shared<Json::Value>();
-    r.parse(req.body, *req_body);
+    r.Parse(req.body, *req_body);
     server.engine_->GetModels(
         req_body, [&server, &resp](Json::Value status, Json::Value res) {
           resp.set_content(res.toStyledString().c_str(),
@@ -222,10 +234,10 @@ int main(int argc, char** argv) {
   svr->Get("/models", handle_get_running_models);
   std::atomic<bool> running = true;
   svr->Delete("/destroy",
-            [&](const httplib::Request& req, httplib::Response& resp) {
-              LOG_INFO << "Received Stop command";
-              running = false;
-            });
+              [&](const httplib::Request& req, httplib::Response& resp) {
+                LOG_INFO << "Received Stop command";
+                running = false;
+              });
 
   LOG_INFO << "HTTP server listening: " << hostname << ":" << port;
   svr->new_task_queue = [] {