3 changes: 2 additions & 1 deletion CMakeLists.txt
@@ -76,6 +76,7 @@ else()
endif()

aux_source_directory(controllers CTL_SRC)
aux_source_directory(common COMMON_SRC)
# aux_source_directory(filters FILTER_SRC) aux_source_directory(plugins
# PLUGIN_SRC) aux_source_directory(models MODEL_SRC)

@@ -86,7 +87,7 @@ aux_source_directory(controllers CTL_SRC)

target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
# ${CMAKE_CURRENT_SOURCE_DIR}/models)
target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC})
target_sources(${PROJECT_NAME} PRIVATE ${CTL_SRC} ${COMMON_SRC})
# ${FILTER_SRC} ${PLUGIN_SRC} ${MODEL_SRC})
# ##############################################################################
# uncomment the following line for dynamically loading views set_property(TARGET
Empty file added common/base.cc
Empty file.
42 changes: 42 additions & 0 deletions common/base.h
@@ -0,0 +1,42 @@
#pragma once
#include <drogon/HttpController.h>

using namespace drogon;

class BaseProvider {
public:
virtual ~BaseProvider() {}

// General inference method
virtual void
inference(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) = 0;

// Model management
virtual void
loadModel(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) = 0;
virtual void
unloadModel(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) = 0;
virtual void
modelStatus(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) = 0;
};

class ChatProvider : public BaseProvider {
public:
virtual ~ChatProvider() {}

// Implement embedding functionality specific to chat
virtual void
embedding(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) = 0;

// The derived class can also override other methods if needed
};
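
The interface above is what llamaCPP now implements. As a rough illustration of the contract, a minimal sketch of another concrete provider follows; the EchoProvider name and its trivial bodies are hypothetical and not part of this change, and only the declarations in common/base.h are assumed.

#pragma once
#include "common/base.h"

// Hypothetical provider, shown only to illustrate the contract:
// every pure virtual method from BaseProvider/ChatProvider must be overridden.
class EchoProvider : public ChatProvider {
public:
  void inference(const HttpRequestPtr &req,
                 std::function<void(const HttpResponsePtr &)> &&callback) override {
    auto body = req->getJsonObject();
    // Echo the request JSON back (empty JSON if the body was not parseable).
    callback(HttpResponse::newHttpJsonResponse(body ? *body : Json::Value()));
  }
  void loadModel(const HttpRequestPtr &req,
                 std::function<void(const HttpResponsePtr &)> &&callback) override {
    callback(HttpResponse::newHttpResponse()); // nothing to load in this sketch
  }
  void unloadModel(const HttpRequestPtr &req,
                   std::function<void(const HttpResponsePtr &)> &&callback) override {
    callback(HttpResponse::newHttpResponse());
  }
  void modelStatus(const HttpRequestPtr &req,
                   std::function<void(const HttpResponsePtr &)> &&callback) override {
    callback(HttpResponse::newHttpResponse());
  }
  void embedding(const HttpRequestPtr &req,
                 std::function<void(const HttpResponsePtr &)> &&callback) override {
    callback(HttpResponse::newHttpResponse());
  }
};

Splitting BaseProvider from ChatProvider keeps the model-management methods common while embedding stays specific to chat-capable providers.
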
26 changes: 12 additions & 14 deletions controllers/llamaCPP.cc
@@ -132,6 +132,15 @@ std::string create_return_json(const std::string &id, const std::string &model,
return Json::writeString(writer, root);
}

llamaCPP::llamaCPP() {
// Some default values for now below
log_disable(); // Disable the log-to-file feature to reduce bloat on the
// target system
}

llamaCPP::~llamaCPP() { stopBackgroundTask(); }

void llamaCPP::warmupModel() {
json pseudo;

@@ -148,29 +157,18 @@ void llamaCPP::warmupModel() {
return;
}

void llamaCPP::handlePrelight(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {
auto resp = drogon::HttpResponse::newHttpResponse();
resp->setStatusCode(drogon::HttpStatusCode::k200OK);
resp->addHeader("Access-Control-Allow-Origin", "*");
resp->addHeader("Access-Control-Allow-Methods", "POST, OPTIONS");
resp->addHeader("Access-Control-Allow-Headers", "*");
callback(resp);
}

void llamaCPP::chatCompletion(
void llamaCPP::inference(
const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) {

const auto &jsonBody = req->getJsonObject();
// Check if model is loaded
checkModelLoaded(callback);

chatCompletionImpl(jsonBody, callback);
inferenceImpl(jsonBody, callback);
}

void llamaCPP::chatCompletionImpl(
void llamaCPP::inferenceImpl(
std::shared_ptr<Json::Value> jsonBody,
std::function<void(const HttpResponsePtr &)> &callback) {

51 changes: 24 additions & 27 deletions controllers/llamaCPP.h
@@ -24,6 +24,7 @@
#define CPPHTTPLIB_NO_EXCEPTIONS 1
#endif

#include "common/base.h"
#include "utils/json.hpp"

// auto generated files (update with ./deps.sh)
@@ -2510,45 +2511,42 @@ append_to_generated_text_from_generated_token_probs(llama_server_context &llama,
using namespace drogon;

namespace inferences {
class llamaCPP : public drogon::HttpController<llamaCPP> {
class llamaCPP : public drogon::HttpController<llamaCPP>, public ChatProvider {
public:
llamaCPP() {
// Some default values for now below
log_disable(); // Disable the log to file feature, reduce bloat for
// target
// system ()
}

~llamaCPP() { stopBackgroundTask(); }
llamaCPP();
~llamaCPP();
METHOD_LIST_BEGIN
// list path definitions here;
METHOD_ADD(llamaCPP::chatCompletion, "chat_completion", Post);
METHOD_ADD(llamaCPP::inference, "chat_completion", Post);
METHOD_ADD(llamaCPP::embedding, "embedding", Post);
METHOD_ADD(llamaCPP::loadModel, "loadmodel", Post);
METHOD_ADD(llamaCPP::unloadModel, "unloadmodel", Get);
METHOD_ADD(llamaCPP::modelStatus, "modelstatus", Get);

// Openai compatible path
ADD_METHOD_TO(llamaCPP::chatCompletion, "/v1/chat/completions", Post);
ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/chat/completions", Options);
ADD_METHOD_TO(llamaCPP::inference, "/v1/chat/completions", Post);
// ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/chat/completions", Options); NOTE: preflight will be added back when browser support is properly planned

ADD_METHOD_TO(llamaCPP::embedding, "/v1/embeddings", Post);
ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/embeddings", Options);
//ADD_METHOD_TO(llamaCPP::handlePrelight, "/v1/embeddings", Options);

// PATH_ADD("/llama/chat_completion", Post);
METHOD_LIST_END
void chatCompletion(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void handlePrelight(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void embedding(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void loadModel(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void unloadModel(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void modelStatus(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback);
void
inference(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) override;
void
embedding(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) override;
void
loadModel(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) override;
void
unloadModel(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) override;
void
modelStatus(const HttpRequestPtr &req,
std::function<void(const HttpResponsePtr &)> &&callback) override;

private:
llama_server_context llama;
@@ -2569,8 +2567,7 @@ class llamaCPP : public drogon::HttpController<llamaCPP> {
std::string grammar_file_content;

bool loadModelImpl(std::shared_ptr<Json::Value> jsonBody);
void
chatCompletionImpl(std::shared_ptr<Json::Value> jsonBody,
void inferenceImpl(std::shared_ptr<Json::Value> jsonBody,
std::function<void(const HttpResponsePtr &)> &callback);
void embeddingImpl(std::shared_ptr<Json::Value> jsonBody,
std::function<void(const HttpResponsePtr &)> &callback);
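
The OpenAI-compatible route is still /v1/chat/completions; only the handler behind it was renamed from chatCompletion to inference. A minimal client-side sketch using drogon's HttpClient follows, assuming the server listens on 127.0.0.1:3928 and accepts an OpenAI-style messages array (both are assumptions, neither is specified in this diff).

#include <drogon/drogon.h>
#include <json/json.h>
#include <iostream>

int main() {
  // Assumed host/port; adjust to wherever the server actually listens.
  auto client = drogon::HttpClient::newHttpClient("http://127.0.0.1:3928");

  Json::Value body;
  body["messages"][0]["role"] = "user";
  body["messages"][0]["content"] = "Hello";

  auto req = drogon::HttpRequest::newHttpJsonRequest(body);
  req->setMethod(drogon::Post);
  req->setPath("/v1/chat/completions"); // route registered via ADD_METHOD_TO above

  client->sendRequest(req, [](drogon::ReqResult result,
                              const drogon::HttpResponsePtr &resp) {
    if (result == drogon::ReqResult::Ok && resp) {
      std::cout << resp->getBody() << std::endl;
    }
    drogon::app().quit(); // stop the event loop once the reply arrives
  });

  drogon::app().run(); // the default HttpClient runs on drogon's main loop
  return 0;
}
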