
Commit 832c57c

feat: engines endpoint and cortex.python

1 parent e65ad47

11 files changed: +271 −76 lines changed

.github/scripts/e2e-test-python-runtime-linux-and-mac.sh

Whitespace-only changes.

.github/scripts/e2e-test-python-runtime-windows.bat

Whitespace-only changes.

cortex-cpp/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@ cmake_minimum_required(VERSION 3.5)
 project(cortex-cpp C CXX)
 
 include(engines/cortex.llamacpp/engine.cmake)
+include(engines/cortex.python/engine.cmake)
 include(CheckIncludeFileCXX)
 
 check_include_file_cxx(any HAS_ANY)
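
Each engine.cmake pulls the corresponding engine library into the build tree; at runtime, server.cc (further below) opens that library with cortex_cpp::dylib and resolves a get_engine factory symbol. A minimal sketch of the contract such a library would have to satisfy, where MyPythonEngine is a hypothetical concrete implementation (only the symbol name and signature are taken from the get_function<CortexPythonEngineI*()>("get_engine") lookup in server.cc):

// Sketch only: MyPythonEngine is hypothetical. The exported symbol must be
// unmangled, since server.cc resolves it by the plain name "get_engine".
class MyPythonEngine : public CortexPythonEngineI {
  // ... implements IsSupported(), HandlePythonFileExecutionRequest(), ...
};

extern "C" CortexPythonEngineI* get_engine() {
  return new MyPythonEngine();
}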

cortex-cpp/common/base.h

Lines changed: 9 additions & 2 deletions

@@ -8,14 +8,21 @@ class BaseModel {
   virtual ~BaseModel() {}
 
   // Model management
-  virtual void LoadModel(const HttpRequestPtr& req,
-                         std::function<void(const HttpResponsePtr&)>&& callback) = 0;
+  virtual void LoadModel(
+      const HttpRequestPtr& req,
+      std::function<void(const HttpResponsePtr&)>&& callback) = 0;
   virtual void UnloadModel(
       const HttpRequestPtr& req,
       std::function<void(const HttpResponsePtr&)>&& callback) = 0;
   virtual void ModelStatus(
       const HttpRequestPtr& req,
       std::function<void(const HttpResponsePtr&)>&& callback) = 0;
+  virtual void GetEngines(
+      const HttpRequestPtr& req,
+      std::function<void(const HttpResponsePtr&)>&& callback) = 0;
+  virtual void FineTuning(
+      const HttpRequestPtr& req,
+      std::function<void(const HttpResponsePtr&)>&& callback) = 0;
 };
 
 class BaseChatCompletion {
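
Because GetEngines and FineTuning are pure virtual, every concrete subclass of BaseModel must now override them. A minimal sketch of the matching declarations, assuming a controller class like the server edited below (the actual server.h change is among the 11 files in this commit but is not shown in this view):

// Sketch only: names mirror the definitions in server.cc below; the real
// declarations live in the server.h change of this commit.
class server : public inferences::BaseModel /* , drogon controller bases */ {
 public:
  void GetEngines(const HttpRequestPtr& req,
                  std::function<void(const HttpResponsePtr&)>&& callback) override;
  void FineTuning(const HttpRequestPtr& req,
                  std::function<void(const HttpResponsePtr&)>&& callback) override;
  // ... LoadModel, UnloadModel, ModelStatus, ChatCompletion, Embedding ...
};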

cortex-cpp/controllers/server.cc

Lines changed: 146 additions & 62 deletions

@@ -13,24 +13,25 @@ using json = nlohmann::json;
 namespace inferences {
 namespace {
 constexpr static auto kLlamaEngine = "cortex.llamacpp";
-constexpr static auto kLlamaLibPath = "/engines/cortex.llamacpp";
+constexpr static auto kPythonRuntimeEngine = "cortex.python";
 }  // namespace
 
-server::server()
-    : engine_{nullptr} {
+server::server(){
 
-  // Some default values for now below
-  // log_disable(); // Disable the log to file feature, reduce bloat for
-  // target
-  // system ()
-};
+    // Some default values for now below
+    // log_disable(); // Disable the log to file feature, reduce bloat for
+    // target
+    // system ()
+};
 
 server::~server() {}
 
 void server::ChatCompletion(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
-  if (!IsEngineLoaded()) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+  if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -44,10 +45,11 @@ void server::ChatCompletion(
   auto json_body = req->getJsonObject();
   bool is_stream = (*json_body).get("stream", false).asBool();
   auto q = std::make_shared<SyncQueue>();
-  engine_->HandleChatCompletion(json_body,
-                                [q](Json::Value status, Json::Value res) {
-                                  q->push(std::make_pair(status, res));
-                                });
+  std::get<EngineI*>(engines_[engine_type].engine)
+      ->HandleChatCompletion(json_body,
+                             [q](Json::Value status, Json::Value res) {
+                               q->push(std::make_pair(status, res));
+                             });
   LOG_TRACE << "Wait to chat completion responses";
   if (is_stream) {
     ProcessStreamRes(std::move(callback), q);
@@ -59,7 +61,9 @@ void server::ChatCompletion(
 
 void server::Embedding(const HttpRequestPtr& req,
                        std::function<void(const HttpResponsePtr&)>&& callback) {
-  if (!IsEngineLoaded()) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+  if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -71,10 +75,11 @@ void server::Embedding(const HttpRequestPtr& req,
 
   LOG_TRACE << "Start embedding";
   SyncQueue q;
-  engine_->HandleEmbedding(req->getJsonObject(),
-                           [&q](Json::Value status, Json::Value res) {
-                             q.push(std::make_pair(status, res));
-                           });
+  std::get<EngineI*>(engines_[engine_type].engine)
+      ->HandleEmbedding(req->getJsonObject(),
+                        [&q](Json::Value status, Json::Value res) {
+                          q.push(std::make_pair(status, res));
+                        });
   LOG_TRACE << "Wait to embedding";
   ProcessNonStreamRes(std::move(callback), q);
   LOG_TRACE << "Done embedding";
@@ -83,7 +88,9 @@ void server::Embedding(const HttpRequestPtr& req,
 void server::UnloadModel(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
-  if (!IsEngineLoaded()) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+  if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -93,21 +100,24 @@ void server::UnloadModel(
     return;
   }
   LOG_TRACE << "Start unload model";
-  engine_->UnloadModel(
-      req->getJsonObject(),
-      [cb = std::move(callback)](Json::Value status, Json::Value res) {
-        auto resp = cortex_utils::nitroHttpJsonResponse(res);
-        resp->setStatusCode(
-            static_cast<drogon::HttpStatusCode>(status["status_code"].asInt()));
-        cb(resp);
-      });
+  std::get<EngineI*>(engines_[engine_type].engine)
+      ->UnloadModel(
+          req->getJsonObject(),
+          [cb = std::move(callback)](Json::Value status, Json::Value res) {
+            auto resp = cortex_utils::nitroHttpJsonResponse(res);
+            resp->setStatusCode(static_cast<drogon::HttpStatusCode>(
+                status["status_code"].asInt()));
+            cb(resp);
+          });
   LOG_TRACE << "Done unload model";
 }
 
 void server::ModelStatus(
     const HttpRequestPtr& req,
     std::function<void(const HttpResponsePtr&)>&& callback) {
-  if (!IsEngineLoaded()) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
+  if (!IsEngineLoaded(engine_type)) {
     Json::Value res;
     res["message"] = "Engine is not loaded yet";
     auto resp = cortex_utils::nitroHttpJsonResponse(res);
@@ -118,64 +128,138 @@ void server::ModelStatus(
   }
 
   LOG_TRACE << "Start to get model status";
-  engine_->GetModelStatus(
-      req->getJsonObject(),
-      [cb = std::move(callback)](Json::Value status, Json::Value res) {
-        auto resp = cortex_utils::nitroHttpJsonResponse(res);
-        resp->setStatusCode(
-            static_cast<drogon::HttpStatusCode>(status["status_code"].asInt()));
-        cb(resp);
-      });
+  std::get<EngineI*>(engines_[engine_type].engine)
+      ->GetModelStatus(
+          req->getJsonObject(),
+          [cb = std::move(callback)](Json::Value status, Json::Value res) {
+            auto resp = cortex_utils::nitroHttpJsonResponse(res);
+            resp->setStatusCode(static_cast<drogon::HttpStatusCode>(
+                status["status_code"].asInt()));
+            cb(resp);
+          });
   LOG_TRACE << "Done get model status";
 }
 
+void server::GetEngines(
+    const HttpRequestPtr& req,
+    std::function<void(const HttpResponsePtr&)>&& callback) {
+  Json::Value res;
+  Json::Value engine_array(Json::arrayValue);
+  for (const auto& [s, _] : engines_) {
+    Json::Value val;
+    val["id"] = s;
+    val["object"] = "engine";
+    engine_array.append(val);
+  }
+
+  res["object"] = "list";
+  res["data"] = engine_array;
+
+  auto resp = cortex_utils::nitroHttpJsonResponse(res);
+  callback(resp);
+}
+
+void server::FineTuning(
+    const HttpRequestPtr& req,
+    std::function<void(const HttpResponsePtr&)>&& callback) {
+  auto engine_type =
+      (*(req->getJsonObject())).get("engine", kPythonRuntimeEngine).asString();
+
+  if (engines_.find(engine_type) == engines_.end()) {
+    try {
+      std::string abs_path =
+          cortex_utils::GetCurrentPath() + cortex_utils::kPythonRuntimeLibPath;
+      engines_[engine_type].dl =
+          std::make_unique<cortex_cpp::dylib>(abs_path, "engine");
+    } catch (const cortex_cpp::dylib::load_error& e) {
+
+      LOG_ERROR << "Could not load engine: " << e.what();
+      engines_.erase(engine_type);
+
+      Json::Value res;
+      res["message"] = "Could not load engine " + engine_type;
+      auto resp = cortex_utils::nitroHttpJsonResponse(res);
+      resp->setStatusCode(k500InternalServerError);
+      callback(resp);
+      return;
+    }
+
+    auto func = engines_[engine_type].dl->get_function<CortexPythonEngineI*()>(
+        "get_engine");
+    engines_[engine_type].engine = func();
+    LOG_INFO << "Loaded engine: " << engine_type;
+  }
+
+  LOG_TRACE << "Start to fine-tuning";
+  auto& en = std::get<CortexPythonEngineI*>(engines_[engine_type].engine);
+  if (en->IsSupported("HandlePythonFileExecutionRequest")) {
+    en->HandlePythonFileExecutionRequest(
+        req->getJsonObject(),
+        [cb = std::move(callback)](Json::Value status, Json::Value res) {
+          auto resp = cortex_utils::nitroHttpJsonResponse(res);
+          resp->setStatusCode(static_cast<drogon::HttpStatusCode>(
+              status["status_code"].asInt()));
+          cb(resp);
+        });
+  } else {
+    Json::Value res;
+    res["message"] = "Method is not supported yet";
+    auto resp = cortex_utils::nitroHttpJsonResponse(res);
+    resp->setStatusCode(k500InternalServerError);
+    callback(resp);
+    LOG_WARN << "Method is not supported yet";
+  }
+  LOG_TRACE << "Done fine-tuning";
+}
+
 void server::LoadModel(const HttpRequestPtr& req,
                        std::function<void(const HttpResponsePtr&)>&& callback) {
   auto engine_type =
       (*(req->getJsonObject())).get("engine", kLlamaEngine).asString();
-  if (!dylib_ || engine_type != cur_engine_name_) {
-    cur_engine_name_ = engine_type;
-    // TODO: change this when we get more engines
+
+  // We have not loaded engine yet, should load it before using it
+  if (engines_.find(engine_type) == engines_.end()) {
+    // TODO(sang) we cannot run cortex.llamacpp and cortex.tensorrt-llm at the same time.
+    // So need an unload engine machanism to handle.
     auto get_engine_path = [](std::string_view e) {
       if (e == kLlamaEngine) {
-        return kLlamaLibPath;
+        return cortex_utils::kLlamaLibPath;
       }
-      return kLlamaLibPath;
+      return cortex_utils::kLlamaLibPath;
     };
 
     try {
-      std::string abs_path = cortex_utils::GetCurrentPath() +
-                             get_engine_path(cur_engine_name_);
-      dylib_ =
+      std::string abs_path =
+          cortex_utils::GetCurrentPath() + get_engine_path(engine_type);
+      engines_[engine_type].dl =
           std::make_unique<cortex_cpp::dylib>(abs_path, "engine");
     } catch (const cortex_cpp::dylib::load_error& e) {
       LOG_ERROR << "Could not load engine: " << e.what();
-      dylib_.reset();
-      engine_ = nullptr;
-    }
+      engines_.erase(engine_type);
 
-    if (!dylib_) {
       Json::Value res;
-      res["message"] = "Could not load engine " + cur_engine_name_;
+      res["message"] = "Could not load engine " + engine_type;
       auto resp = cortex_utils::nitroHttpJsonResponse(res);
       resp->setStatusCode(k500InternalServerError);
       callback(resp);
       return;
     }
-    auto func = dylib_->get_function<EngineI*()>("get_engine");
-    engine_ = func();
-    LOG_INFO << "Loaded engine: " << cur_engine_name_;
+
+    auto func =
+        engines_[engine_type].dl->get_function<EngineI*()>("get_engine");
+    engines_[engine_type].engine = func();
+    LOG_INFO << "Loaded engine: " << engine_type;
   }
 
   LOG_TRACE << "Load model";
-  engine_->LoadModel(
-      req->getJsonObject(),
-      [cb = std::move(callback)](Json::Value status, Json::Value res) {
-        auto resp = cortex_utils::nitroHttpJsonResponse(res);
-        resp->setStatusCode(
-            static_cast<drogon::HttpStatusCode>(status["status_code"].asInt()));
-        cb(resp);
-      });
+  auto& en = std::get<EngineI*>(engines_[engine_type].engine);
+  en->LoadModel(req->getJsonObject(), [cb = std::move(callback)](
+                                          Json::Value status, Json::Value res) {
    auto resp = cortex_utils::nitroHttpJsonResponse(res);
+    resp->setStatusCode(
+        static_cast<drogon::HttpStatusCode>(status["status_code"].asInt()));
+    cb(resp);
+  });
   LOG_TRACE << "Done load model";
 }
 
@@ -222,8 +306,8 @@ void server::ProcessNonStreamRes(std::function<void(const HttpResponsePtr&)> cb,
   cb(resp);
 }
 
-bool server::IsEngineLoaded() {
-  return !!engine_;
+bool server::IsEngineLoaded(const std::string& e) {
+  return engines_.find(e) != engines_.end();
 }
 
 }  // namespace inferences
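
This refactor replaces the single dylib_/engine_ pair with a map keyed by engine name, so cortex.llamacpp and cortex.python can be loaded side by side. The server.h half of the commit is not shown in this view; the sketch below is inferred solely from the accesses visible above (engines_[engine_type].dl and std::get<EngineI*>(engines_[engine_type].engine)), and the struct and member names are hypothetical:

// Hypothetical sketch of the map value type, inferred from server.cc usage.
#include <memory>
#include <string>
#include <unordered_map>
#include <variant>

struct EngineEntry {
  // Keeps the dynamically loaded library alive while the engine is in use.
  std::unique_ptr<cortex_cpp::dylib> dl;
  // Entry point returned by the library's exported get_engine() factory.
  std::variant<EngineI*, CortexPythonEngineI*> engine;
};

std::unordered_map<std::string, EngineEntry> engines_;

Given that map, GetEngines simply lists the keys, so a server with both engines loaded would answer with JSON shaped like:

{"object": "list", "data": [{"id": "cortex.llamacpp", "object": "engine"}, {"id": "cortex.python", "object": "engine"}]}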
