Skip to content
This repository was archived by the owner on Jul 4, 2025. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,14 @@ jobs:
vulkan: true
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-noavx-cuda-11-7"
runs-on: "ubuntu-20-04-cuda-11-7"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-avx2-cuda-11-7"
runs-on: "ubuntu-20-04-cuda-11-7"
Expand All @@ -125,6 +133,14 @@ jobs:
vulkan: false
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-noavx-cuda-12-0"
runs-on: "ubuntu-20-04-cuda-12-0"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-avx2-cuda-12-0"
runs-on: "ubuntu-20-04-cuda-12-0"
Expand Down Expand Up @@ -205,6 +221,14 @@ jobs:
vulkan: true
sccache: false
sccache-conf-path: ""
- os: "windows"
name: "amd64-noavx-cuda-12-0"
runs-on: "windows-cuda-12-0"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: 'C:\sccache.conf'
- os: "windows"
name: "amd64-avx2-cuda-12-0"
runs-on: "windows-cuda-12-0"
Expand All @@ -229,6 +253,14 @@ jobs:
vulkan: false
sccache: true
sccache-conf-path: 'C:\sccache.conf'
- os: "windows"
name: "amd64-noavx-cuda-11-7"
runs-on: "windows-cuda-11-7"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: 'C:\sccache.conf'
- os: "windows"
name: "amd64-avx2-cuda-11-7"
runs-on: "windows-cuda-11-7"
Expand Down
32 changes: 32 additions & 0 deletions .github/workflows/nightly-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,14 @@ jobs:
vulkan: true
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-noavx-cuda-11-7"
runs-on: "ubuntu-20-04-cuda-11-7"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-avx2-cuda-11-7"
runs-on: "ubuntu-20-04-cuda-11-7"
Expand All @@ -125,6 +133,14 @@ jobs:
vulkan: false
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-noavx-cuda-12-0"
runs-on: "ubuntu-20-04-cuda-12-0"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-avx2-cuda-12-0"
runs-on: "ubuntu-20-04-cuda-12-0"
Expand Down Expand Up @@ -205,6 +221,14 @@ jobs:
vulkan: true
sccache: false
sccache-conf-path: ""
- os: "windows"
name: "amd64-noavx-cuda-12-0"
runs-on: "windows-cuda-12-0"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: 'C:\sccache.conf'
- os: "windows"
name: "amd64-avx2-cuda-12-0"
runs-on: "windows-cuda-12-0"
Expand All @@ -229,6 +253,14 @@ jobs:
vulkan: false
sccache: true
sccache-conf-path: 'C:\sccache.conf'
- os: "windows"
name: "amd64-noavx-cuda-11-7"
runs-on: "windows-cuda-11-7"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{needs.create-draft-release.outputs.version}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: 'C:\sccache.conf'
- os: "windows"
name: "amd64-avx2-cuda-11-7"
runs-on: "windows-cuda-11-7"
Expand Down
32 changes: 32 additions & 0 deletions .github/workflows/template-quality-gate-pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,14 @@ jobs:
vulkan: true
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-noavx-cuda-11-7"
runs-on: "ubuntu-20-04-cuda-11-7"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-avx2-cuda-11-7"
runs-on: "ubuntu-20-04-cuda-11-7"
Expand All @@ -92,6 +100,14 @@ jobs:
vulkan: false
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-noavx-cuda-12-0"
runs-on: "ubuntu-20-04-cuda-12-0"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-avx2-cuda-12-0"
runs-on: "ubuntu-20-04-cuda-12-0"
Expand Down Expand Up @@ -172,6 +188,14 @@ jobs:
vulkan: true
sccache: false
sccache-conf-path: ""
- os: "windows"
name: "amd64-noavx-cuda-12-0"
runs-on: "windows-cuda-12-0"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: 'C:\sccache.conf'
- os: "windows"
name: "amd64-avx2-cuda-12-0"
runs-on: "windows-cuda-12-0"
Expand All @@ -196,6 +220,14 @@ jobs:
vulkan: false
sccache: true
sccache-conf-path: 'C:\sccache.conf'
- os: "windows"
name: "amd64-noavx-cuda-11-7"
runs-on: "windows-cuda-11-7"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: 'C:\sccache.conf'
- os: "windows"
name: "amd64-avx2-cuda-11-7"
runs-on: "windows-cuda-11-7"
Expand Down
32 changes: 32 additions & 0 deletions .github/workflows/template-quality-gate-submodule.yml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,14 @@ jobs:
vulkan: true
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-noavx-cuda-11-7"
runs-on: "ubuntu-20-04-cuda-11-7"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_AVX2=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-avx2-cuda-11-7"
runs-on: "ubuntu-20-04-cuda-11-7"
Expand All @@ -92,6 +100,14 @@ jobs:
vulkan: false
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-noavx-cuda-12-0"
runs-on: "ubuntu-20-04-cuda-12-0"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_FMA=OFF -DGGML_AVX2=OFF -DBUILD_SHARED_LIBS=OFF -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_BUILD_TYPE='Release' -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: "/tmp/sccache.conf"
- os: "linux"
name: "amd64-avx2-cuda-12-0"
runs-on: "ubuntu-20-04-cuda-12-0"
Expand Down Expand Up @@ -172,6 +188,14 @@ jobs:
vulkan: true
sccache: false
sccache-conf-path: ""
- os: "windows"
name: "amd64-noavx-cuda-12-0"
runs-on: "windows-cuda-12-0"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: 'C:\sccache.conf'
- os: "windows"
name: "amd64-avx2-cuda-12-0"
runs-on: "windows-cuda-12-0"
Expand All @@ -196,6 +220,14 @@ jobs:
vulkan: false
sccache: true
sccache-conf-path: 'C:\sccache.conf'
- os: "windows"
name: "amd64-noavx-cuda-11-7"
runs-on: "windows-cuda-11-7"
cmake-flags: "-DCORTEXLLAMA_VERSION=${{github.event.pull_request.head.sha}} -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=OFF -DCMAKE_BUILD_TYPE='Release' -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CUDA_COMPILER_LAUNCHER=sccache -GNinja"
run-e2e: false
vulkan: false
sccache: true
sccache-conf-path: 'C:\sccache.conf'
- os: "windows"
name: "amd64-avx2-cuda-11-7"
runs-on: "windows-cuda-11-7"
Expand Down
34 changes: 23 additions & 11 deletions examples/server/server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,18 @@ class Server {
};
};

/// Mutex-guarded wrapper around a single Json::Reader instance.
///
/// Each call to Parse() holds the internal mutex for the whole parse, so calls
/// made through the same SyncJsonReader are serialized with each other.
/// NOTE(review): presumably introduced because one reader is shared by HTTP
/// handlers that may run on different threads - confirm against the server's
/// threading model.
struct SyncJsonReader {
 public:
  /// Parses `document` into `root` while holding the internal lock.
  ///
  /// @param document JSON text to parse.
  /// @param root     Receives the parsed value.
  /// @return The value returned by Json::Reader::parse (true when the document
  ///         was parsed successfully). Previously this result was discarded,
  ///         so callers could not detect malformed input; existing callers
  ///         that ignore the return value keep working unchanged.
  bool Parse(const std::string& document, Json::Value& root) {
    std::lock_guard<std::mutex> l(m);
    return r.parse(document, root);
  }

 private:
  Json::Reader r;  // underlying reader; only touched while `m` is held
  std::mutex m;    // serializes every use of `r`
};

std::function<void(int)> shutdown_handler;
std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;

Expand Down Expand Up @@ -78,7 +90,7 @@ int main(int argc, char** argv) {
}

Server server;
Json::Reader r;
SyncJsonReader r;
auto svr = std::make_unique<httplib::Server>();

if (!svr->bind_to_port(hostname, port)) {
Expand Down Expand Up @@ -127,7 +139,7 @@ int main(int argc, char** argv) {
resp.set_header("Access-Control-Allow-Origin",
req.get_header_value("Origin"));
auto req_body = std::make_shared<Json::Value>();
r.parse(req.body, *req_body);
r.Parse(req.body, *req_body);
server.engine_->LoadModel(
req_body, [&server, &resp](Json::Value status, Json::Value res) {
resp.set_content(res.toStyledString().c_str(),
Expand All @@ -141,7 +153,7 @@ int main(int argc, char** argv) {
resp.set_header("Access-Control-Allow-Origin",
req.get_header_value("Origin"));
auto req_body = std::make_shared<Json::Value>();
r.parse(req.body, *req_body);
r.Parse(req.body, *req_body);
server.engine_->UnloadModel(
req_body, [&server, &resp](Json::Value status, Json::Value res) {
resp.set_content(res.toStyledString().c_str(),
Expand All @@ -155,7 +167,7 @@ int main(int argc, char** argv) {
resp.set_header("Access-Control-Allow-Origin",
req.get_header_value("Origin"));
auto req_body = std::make_shared<Json::Value>();
r.parse(req.body, *req_body);
r.Parse(req.body, *req_body);
bool is_stream = (*req_body).get("stream", false).asBool();
// This is an async call, need to use queue
auto q = std::make_shared<SyncQueue>();
Expand All @@ -175,7 +187,7 @@ int main(int argc, char** argv) {
resp.set_header("Access-Control-Allow-Origin",
req.get_header_value("Origin"));
auto req_body = std::make_shared<Json::Value>();
r.parse(req.body, *req_body);
r.Parse(req.body, *req_body);
// This is an async call, need to use queue
SyncQueue q;
server.engine_->HandleEmbedding(
Expand All @@ -190,7 +202,7 @@ int main(int argc, char** argv) {
resp.set_header("Access-Control-Allow-Origin",
req.get_header_value("Origin"));
auto req_body = std::make_shared<Json::Value>();
r.parse(req.body, *req_body);
r.Parse(req.body, *req_body);
server.engine_->GetModelStatus(
req_body, [&server, &resp](Json::Value status, Json::Value res) {
resp.set_content(res.toStyledString().c_str(),
Expand All @@ -204,7 +216,7 @@ int main(int argc, char** argv) {
resp.set_header("Access-Control-Allow-Origin",
req.get_header_value("Origin"));
auto req_body = std::make_shared<Json::Value>();
r.parse(req.body, *req_body);
r.Parse(req.body, *req_body);
server.engine_->GetModels(
req_body, [&server, &resp](Json::Value status, Json::Value res) {
resp.set_content(res.toStyledString().c_str(),
Expand All @@ -222,10 +234,10 @@ int main(int argc, char** argv) {
svr->Get("/models", handle_get_running_models);
std::atomic<bool> running = true;
svr->Delete("/destroy",
[&](const httplib::Request& req, httplib::Response& resp) {
LOG_INFO << "Received Stop command";
running = false;
});
[&](const httplib::Request& req, httplib::Response& resp) {
LOG_INFO << "Received Stop command";
running = false;
});

LOG_INFO << "HTTP server listening: " << hostname << ":" << port;
svr->new_task_queue = [] {
Expand Down