@@ -13,24 +13,25 @@ using json = nlohmann::json;
1313namespace inferences {
1414namespace {
1515constexpr static auto kLlamaEngine = " cortex.llamacpp" ;
16- constexpr static auto kLlamaLibPath = " /engines/ cortex.llamacpp " ;
16+ constexpr static auto kPythonRuntimeEngine = " cortex.python " ;
1717} // namespace
1818
19- server::server ()
20- : engine_{nullptr } {
19+ server::server (){
2120
22- // Some default values for now below
23- // log_disable(); // Disable the log to file feature, reduce bloat for
24- // target
25- // system ()
26- };
21+ // Some default values for now below
22+ // log_disable(); // Disable the log to file feature, reduce bloat for
23+ // target
24+ // system ()
25+ };
2726
2827server::~server () {}
2928
3029void server::ChatCompletion (
3130 const HttpRequestPtr& req,
3231 std::function<void (const HttpResponsePtr&)>&& callback) {
33- if (!IsEngineLoaded ()) {
32+ auto engine_type =
33+ (*(req->getJsonObject ())).get (" engine" , kLlamaEngine ).asString ();
34+ if (!IsEngineLoaded (engine_type)) {
3435 Json::Value res;
3536 res[" message" ] = " Engine is not loaded yet" ;
3637 auto resp = cortex_utils::nitroHttpJsonResponse (res);
@@ -44,10 +45,11 @@ void server::ChatCompletion(
4445 auto json_body = req->getJsonObject ();
4546 bool is_stream = (*json_body).get (" stream" , false ).asBool ();
4647 auto q = std::make_shared<SyncQueue>();
47- engine_->HandleChatCompletion (json_body,
48- [q](Json::Value status, Json::Value res) {
49- q->push (std::make_pair (status, res));
50- });
48+ std::get<EngineI*>(engines_[engine_type].engine )
49+ ->HandleChatCompletion (json_body,
50+ [q](Json::Value status, Json::Value res) {
51+ q->push (std::make_pair (status, res));
52+ });
5153 LOG_TRACE << " Wait to chat completion responses" ;
5254 if (is_stream) {
5355 ProcessStreamRes (std::move (callback), q);
@@ -59,7 +61,9 @@ void server::ChatCompletion(
5961
6062void server::Embedding (const HttpRequestPtr& req,
6163 std::function<void (const HttpResponsePtr&)>&& callback) {
62- if (!IsEngineLoaded ()) {
64+ auto engine_type =
65+ (*(req->getJsonObject ())).get (" engine" , kLlamaEngine ).asString ();
66+ if (!IsEngineLoaded (engine_type)) {
6367 Json::Value res;
6468 res[" message" ] = " Engine is not loaded yet" ;
6569 auto resp = cortex_utils::nitroHttpJsonResponse (res);
@@ -71,10 +75,11 @@ void server::Embedding(const HttpRequestPtr& req,
7175
7276 LOG_TRACE << " Start embedding" ;
7377 SyncQueue q;
74- engine_->HandleEmbedding (req->getJsonObject (),
75- [&q](Json::Value status, Json::Value res) {
76- q.push (std::make_pair (status, res));
77- });
78+ std::get<EngineI*>(engines_[engine_type].engine )
79+ ->HandleEmbedding (req->getJsonObject (),
80+ [&q](Json::Value status, Json::Value res) {
81+ q.push (std::make_pair (status, res));
82+ });
7883 LOG_TRACE << " Wait to embedding" ;
7984 ProcessNonStreamRes (std::move (callback), q);
8085 LOG_TRACE << " Done embedding" ;
@@ -83,7 +88,9 @@ void server::Embedding(const HttpRequestPtr& req,
8388void server::UnloadModel (
8489 const HttpRequestPtr& req,
8590 std::function<void (const HttpResponsePtr&)>&& callback) {
86- if (!IsEngineLoaded ()) {
91+ auto engine_type =
92+ (*(req->getJsonObject ())).get (" engine" , kLlamaEngine ).asString ();
93+ if (!IsEngineLoaded (engine_type)) {
8794 Json::Value res;
8895 res[" message" ] = " Engine is not loaded yet" ;
8996 auto resp = cortex_utils::nitroHttpJsonResponse (res);
@@ -93,21 +100,24 @@ void server::UnloadModel(
93100 return ;
94101 }
95102 LOG_TRACE << " Start unload model" ;
96- engine_->UnloadModel (
97- req->getJsonObject (),
98- [cb = std::move (callback)](Json::Value status, Json::Value res) {
99- auto resp = cortex_utils::nitroHttpJsonResponse (res);
100- resp->setStatusCode (
101- static_cast <drogon::HttpStatusCode>(status[" status_code" ].asInt ()));
102- cb (resp);
103- });
103+ std::get<EngineI*>(engines_[engine_type].engine )
104+ ->UnloadModel (
105+ req->getJsonObject (),
106+ [cb = std::move (callback)](Json::Value status, Json::Value res) {
107+ auto resp = cortex_utils::nitroHttpJsonResponse (res);
108+ resp->setStatusCode (static_cast <drogon::HttpStatusCode>(
109+ status[" status_code" ].asInt ()));
110+ cb (resp);
111+ });
104112 LOG_TRACE << " Done unload model" ;
105113}
106114
107115void server::ModelStatus (
108116 const HttpRequestPtr& req,
109117 std::function<void (const HttpResponsePtr&)>&& callback) {
110- if (!IsEngineLoaded ()) {
118+ auto engine_type =
119+ (*(req->getJsonObject ())).get (" engine" , kLlamaEngine ).asString ();
120+ if (!IsEngineLoaded (engine_type)) {
111121 Json::Value res;
112122 res[" message" ] = " Engine is not loaded yet" ;
113123 auto resp = cortex_utils::nitroHttpJsonResponse (res);
@@ -118,64 +128,138 @@ void server::ModelStatus(
118128 }
119129
120130 LOG_TRACE << " Start to get model status" ;
121- engine_->GetModelStatus (
122- req->getJsonObject (),
123- [cb = std::move (callback)](Json::Value status, Json::Value res) {
124- auto resp = cortex_utils::nitroHttpJsonResponse (res);
125- resp->setStatusCode (
126- static_cast <drogon::HttpStatusCode>(status[" status_code" ].asInt ()));
127- cb (resp);
128- });
131+ std::get<EngineI*>(engines_[engine_type].engine )
132+ ->GetModelStatus (
133+ req->getJsonObject (),
134+ [cb = std::move (callback)](Json::Value status, Json::Value res) {
135+ auto resp = cortex_utils::nitroHttpJsonResponse (res);
136+ resp->setStatusCode (static_cast <drogon::HttpStatusCode>(
137+ status[" status_code" ].asInt ()));
138+ cb (resp);
139+ });
129140 LOG_TRACE << " Done get model status" ;
130141}
131142
143+ void server::GetEngines (
144+ const HttpRequestPtr& req,
145+ std::function<void (const HttpResponsePtr&)>&& callback) {
146+ Json::Value res;
147+ Json::Value engine_array (Json::arrayValue);
148+ for (const auto & [s, _] : engines_) {
149+ Json::Value val;
150+ val[" id" ] = s;
151+ val[" object" ] = " engine" ;
152+ engine_array.append (val);
153+ }
154+
155+ res[" object" ] = " list" ;
156+ res[" data" ] = engine_array;
157+
158+ auto resp = cortex_utils::nitroHttpJsonResponse (res);
159+ callback (resp);
160+ }
161+
162+ void server::FineTuning (
163+ const HttpRequestPtr& req,
164+ std::function<void (const HttpResponsePtr&)>&& callback) {
165+ auto engine_type =
166+ (*(req->getJsonObject ())).get (" engine" , kPythonRuntimeEngine ).asString ();
167+
168+ if (engines_.find (engine_type) == engines_.end ()) {
169+ try {
170+ std::string abs_path =
171+ cortex_utils::GetCurrentPath () + cortex_utils::kPythonRuntimeLibPath ;
172+ engines_[engine_type].dl =
173+ std::make_unique<cortex_cpp::dylib>(abs_path, " engine" );
174+ } catch (const cortex_cpp::dylib::load_error& e) {
175+
176+ LOG_ERROR << " Could not load engine: " << e.what ();
177+ engines_.erase (engine_type);
178+
179+ Json::Value res;
180+ res[" message" ] = " Could not load engine " + engine_type;
181+ auto resp = cortex_utils::nitroHttpJsonResponse (res);
182+ resp->setStatusCode (k500InternalServerError);
183+ callback (resp);
184+ return ;
185+ }
186+
187+ auto func = engines_[engine_type].dl ->get_function <CortexPythonEngineI*()>(
188+ " get_engine" );
189+ engines_[engine_type].engine = func ();
190+ LOG_INFO << " Loaded engine: " << engine_type;
191+ }
192+
193+ LOG_TRACE << " Start to fine-tuning" ;
194+ auto & en = std::get<CortexPythonEngineI*>(engines_[engine_type].engine );
195+ if (en->IsSupported (" HandlePythonFileExecutionRequest" )) {
196+ en->HandlePythonFileExecutionRequest (
197+ req->getJsonObject (),
198+ [cb = std::move (callback)](Json::Value status, Json::Value res) {
199+ auto resp = cortex_utils::nitroHttpJsonResponse (res);
200+ resp->setStatusCode (static_cast <drogon::HttpStatusCode>(
201+ status[" status_code" ].asInt ()));
202+ cb (resp);
203+ });
204+ } else {
205+ Json::Value res;
206+ res[" message" ] = " Method is not supported yet" ;
207+ auto resp = cortex_utils::nitroHttpJsonResponse (res);
208+ resp->setStatusCode (k500InternalServerError);
209+ callback (resp);
210+ LOG_WARN << " Method is not supported yet" ;
211+ }
212+ LOG_TRACE << " Done fine-tuning" ;
213+ }
214+
132215void server::LoadModel (const HttpRequestPtr& req,
133216 std::function<void (const HttpResponsePtr&)>&& callback) {
134217 auto engine_type =
135218 (*(req->getJsonObject ())).get (" engine" , kLlamaEngine ).asString ();
136- if (!dylib_ || engine_type != cur_engine_name_) {
137- cur_engine_name_ = engine_type;
138- // TODO: change this when we get more engines
219+
220+ // We have not loaded engine yet, should load it before using it
221+ if (engines_.find (engine_type) == engines_.end ()) {
222+ // TODO(sang) we cannot run cortex.llamacpp and cortex.tensorrt-llm at the same time.
223+ // So need an unload engine machanism to handle.
139224 auto get_engine_path = [](std::string_view e) {
140225 if (e == kLlamaEngine ) {
141- return kLlamaLibPath ;
226+ return cortex_utils:: kLlamaLibPath ;
142227 }
143- return kLlamaLibPath ;
228+ return cortex_utils:: kLlamaLibPath ;
144229 };
145230
146231 try {
147- std::string abs_path = cortex_utils::GetCurrentPath () +
148- get_engine_path (cur_engine_name_ );
149- dylib_ =
232+ std::string abs_path =
233+ cortex_utils::GetCurrentPath () + get_engine_path (engine_type );
234+ engines_[engine_type]. dl =
150235 std::make_unique<cortex_cpp::dylib>(abs_path, " engine" );
151236 } catch (const cortex_cpp::dylib::load_error& e) {
152237 LOG_ERROR << " Could not load engine: " << e.what ();
153- dylib_.reset ();
154- engine_ = nullptr ;
155- }
238+ engines_.erase (engine_type);
156239
157- if (!dylib_) {
158240 Json::Value res;
159- res[" message" ] = " Could not load engine " + cur_engine_name_ ;
241+ res[" message" ] = " Could not load engine " + engine_type ;
160242 auto resp = cortex_utils::nitroHttpJsonResponse (res);
161243 resp->setStatusCode (k500InternalServerError);
162244 callback (resp);
163245 return ;
164246 }
165- auto func = dylib_->get_function <EngineI*()>(" get_engine" );
166- engine_ = func ();
167- LOG_INFO << " Loaded engine: " << cur_engine_name_;
247+
248+ auto func =
249+ engines_[engine_type].dl ->get_function <EngineI*()>(" get_engine" );
250+ engines_[engine_type].engine = func ();
251+ LOG_INFO << " Loaded engine: " << engine_type;
168252 }
169253
170254 LOG_TRACE << " Load model" ;
171- engine_-> LoadModel (
172- req->getJsonObject (),
173- [cb = std::move (callback)]( Json::Value status, Json::Value res) {
174- auto resp = cortex_utils::nitroHttpJsonResponse (res);
175- resp->setStatusCode (
176- static_cast <drogon::HttpStatusCode>(status[" status_code" ].asInt ()));
177- cb (resp);
178- });
255+ auto & en = std::get<EngineI*>(engines_[engine_type]. engine );
256+ en-> LoadModel ( req->getJsonObject (), [cb = std::move (callback)](
257+ Json::Value status, Json::Value res) {
258+ auto resp = cortex_utils::nitroHttpJsonResponse (res);
259+ resp->setStatusCode (
260+ static_cast <drogon::HttpStatusCode>(status[" status_code" ].asInt ()));
261+ cb (resp);
262+ });
179263 LOG_TRACE << " Done load model" ;
180264}
181265
@@ -222,8 +306,8 @@ void server::ProcessNonStreamRes(std::function<void(const HttpResponsePtr&)> cb,
222306 cb (resp);
223307}
224308
225- bool server::IsEngineLoaded () {
226- return !!engine_ ;
309+ bool server::IsEngineLoaded (const std::string& e ) {
310+ return engines_. find (e) != engines_. end () ;
227311}
228312
229313} // namespace inferences
// 0 commit comments (GitHub page residue from the scraped diff view)