14 changes: 7 additions & 7 deletions docs/docs/architecture/cortex-db.mdx
@@ -16,12 +16,12 @@ This document outlines Cortex database architecture which is designed to store a
files and more.

## Table Structure
### schema Table
The `schema` table is designed to hold schema version for cortex database. Below is the structure of the table:
### schema_version Table
The `schema_version` table is designed to hold the schema version for the cortex database. Below is the structure of the table:

| Column Name | Data Type | Description |
|--------------------|-----------|---------------------------------------------------------|
| schema_version | INTEGER | A unique schema version for database. |
| version | INTEGER | A unique schema version for database. |

### models Table
The `models` table is designed to hold metadata about various AI models. Below is the structure of the table:
@@ -63,10 +63,10 @@ Below is the structure of the table:
| api_key | TEXT | |
| url | TEXT | |
| version | TEXT | The current version of the engine. |
| variant | TEXT | |
| variant | TEXT | A string that specifies the specific configuration or build variant of the engine. |
| status | TEXT | Current status of the engine (e.g., "downloaded", "downloadable"). |
| metadata | TEXT | Additional metadata or information about the engine. |
| date_ceated | TEXT | Date when the engine was downloaded. |
| date_created | TEXT | Date when the engine was downloaded. |
| date_updated | TEXT | Date when the engine was last updated. |

### files Table
@@ -78,5 +78,5 @@ The `files` table is designed to hold metadata about objects downloaded via Cortex
| object | TEXT | The type of object. |
| purpose | TEXT | Purpose of the file. |
| filename | TEXT | The name of the file. |
| created_at | INTEGER | Date when file was created |
| bytes | INTEGER | |
| created_at | INTEGER | Date when file was created. |
| bytes | INTEGER | Size of the file on disk in bytes. |
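
The INTEGER/TEXT column types above suggest a SQLite store. As a minimal sketch of how the `schema_version` table could be read, assuming the standard SQLite C API and a placeholder database path (the real path and connection handling live elsewhere in Cortex):

```cpp
#include <sqlite3.h>
#include <cstdio>

// Minimal sketch: read the single row from the schema_version table.
// "cortex.db" is a placeholder path, not the actual location used by Cortex.
int main() {
  sqlite3* db = nullptr;
  if (sqlite3_open("cortex.db", &db) != SQLITE_OK) {
    std::fprintf(stderr, "open failed: %s\n", sqlite3_errmsg(db));
    return 1;
  }
  sqlite3_stmt* stmt = nullptr;
  if (sqlite3_prepare_v2(db, "SELECT version FROM schema_version;", -1, &stmt,
                         nullptr) == SQLITE_OK &&
      sqlite3_step(stmt) == SQLITE_ROW) {
    std::printf("schema version: %d\n", sqlite3_column_int(stmt, 0));
  }
  sqlite3_finalize(stmt);
  sqlite3_close(db);
  return 0;
}
```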
11 changes: 11 additions & 0 deletions engine/common/hardware_common.h
@@ -69,6 +69,16 @@ struct NvidiaAddInfo {
};
struct AmdAddInfo {};
using GPUAddInfo = std::variant<NvidiaAddInfo, AmdAddInfo>;

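// Values mirror Vulkan's VkPhysicalDeviceType (OTHER = 0 through CPU = 4, MAX_ENUM = 0x7FFFFFFF).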
enum class GpuType {
kGpuTypeOther = 0,
kGpuTypeIntegrated = 1,
kGpuTypeDiscrete = 2,
kGpuTypeVirtual = 3,
kGpuTypeCpu = 4,
kGpuTypeMaxEnum = 0x7FFFFFFF
};

struct GPU {
std::string id;
uint32_t device_id;
@@ -80,6 +90,7 @@ struct GPU {
std::string uuid;
bool is_activated = true;
std::string vendor;
GpuType gpu_type;
};

inline Json::Value ToJson(const std::vector<GPU>& gpus) {
9 changes: 4 additions & 5 deletions engine/controllers/models.cc
@@ -218,11 +218,10 @@ void Models::ListModel(
obj["id"] = model_entry.model;
obj["model"] = model_entry.model;
obj["status"] = "downloaded";
// TODO(sang) Temporarily remove this estimation
// auto es = model_service_->GetEstimation(model_entry.model);
// if (es.has_value() && !!es.value()) {
// obj["recommendation"] = hardware::ToJson(*(es.value()));
// }
auto es = model_service_->GetEstimation(model_entry.model);
if (es.has_value()) {
obj["recommendation"] = hardware::ToJson(*es);
}
data.append(std::move(obj));
yaml_handler.Reset();
} else if (model_config.engine == kPythonEngine) {
8 changes: 6 additions & 2 deletions engine/main.cc
@@ -37,6 +37,7 @@
#include "utils/file_manager_utils.h"
#include "utils/logging_utils.h"
#include "utils/system_info_utils.h"
#include "utils/task_queue.h"

#if defined(__APPLE__) && defined(__MACH__)
#include <libgen.h> // for dirname()
@@ -177,8 +178,11 @@ void RunServer(std::optional<std::string> host, std::optional<int> port,
download_service, dylib_path_manager, db_service);
auto inference_svc = std::make_shared<InferenceService>(engine_service);
auto model_src_svc = std::make_shared<ModelSourceService>(db_service);
auto model_service = std::make_shared<ModelService>(
db_service, hw_service, download_service, inference_svc, engine_service);
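  // Cap the background task queue at two worker threads.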
cortex::TaskQueue task_queue(
std::min(2u, std::thread::hardware_concurrency()), "background_task");
auto model_service =
std::make_shared<ModelService>(db_service, hw_service, download_service,
inference_svc, engine_service, task_queue);
inference_svc->SetModelService(model_service);

auto file_watcher_srv = std::make_shared<FileWatcherService>(
42 changes: 28 additions & 14 deletions engine/services/hardware_service.cc
@@ -52,7 +52,7 @@ HardwareInfo HardwareService::GetHardwareInfo() {
};
}

return HardwareInfo{.cpu = cortex::hw::GetCPUInfo(),
return HardwareInfo{.cpu = cpu_info_.GetCPUInfo(),
.os = cortex::hw::GetOSInfo(),
.ram = cortex::hw::GetMemoryInfo(),
.storage = cortex::hw::GetStorageInfo(),
@@ -207,9 +207,6 @@ bool HardwareService::Restart(const std::string& host, int port) {
if (!TryConnectToServer(host, port)) {
return false;
}
std::cout << "Server started" << std::endl;
std::cout << "API Documentation available at: http://" << host << ":"
<< port << std::endl;
}

#endif
Expand Down Expand Up @@ -322,23 +319,40 @@ void HardwareService::UpdateHardwareInfos() {
}
}
CTL_INF("Activated GPUs before: " << debug_b);
auto has_nvidia = [&gpus] {
for (auto const& g : gpus) {
if (g.vendor == cortex::hw::kNvidiaStr) {
return true;
}
}
return false;
}();

for (auto const& gpu : gpus) {
// ignore error
// Note: only support NVIDIA for now, so hardware_id = software_id
if (db_service_->HasHardwareEntry(gpu.uuid)) {
auto res = db_service_->UpdateHardwareEntry(gpu.uuid, std::stoi(gpu.id),
std::stoi(gpu.id));
std::stoi(gpu.id));
if (res.has_error()) {
CTL_WRN(res.error());
}
} else {
auto res =
db_service_->AddHardwareEntry(HwEntry{.uuid = gpu.uuid,
.type = "gpu",
.hardware_id = std::stoi(gpu.id),
.software_id = std::stoi(gpu.id),
.activated = true,
.priority = INT_MAX});
// iGPU should be deactivated by default
// Only activate Nvidia GPUs if both AMD and Nvidia GPUs exist
auto activated = [&gpu, &gpus, has_nvidia] {
if (gpu.gpu_type != cortex::hw::GpuType::kGpuTypeDiscrete)
return false;
if (has_nvidia && gpu.vendor != cortex::hw::kNvidiaStr)
return false;
return true;
};

auto res = db_service_->AddHardwareEntry(
HwEntry{.uuid = gpu.uuid,
.type = "gpu",
.hardware_id = std::stoi(gpu.id),
.software_id = std::stoi(gpu.id),
.activated = activated(),
.priority = INT_MAX});
if (res.has_error()) {
CTL_WRN(res.error());
}
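Since the default-activation rule added above is easy to miss inside `UpdateHardwareInfos()`, here is a self-contained restatement of it. The `GPU`/`GpuType` definitions below are trimmed stand-ins for the real ones in `engine/common/hardware_common.h`, and the exact value of `kNvidiaStr` is an assumption:

```cpp
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Trimmed stand-ins for the definitions in engine/common/hardware_common.h.
enum class GpuType { kGpuTypeOther, kGpuTypeIntegrated, kGpuTypeDiscrete };
constexpr const char* kNvidiaStr = "NVIDIA";  // assumed value

struct GPU {
  std::string vendor;
  GpuType gpu_type;
};

// Default-activation rule from UpdateHardwareInfos():
//  - integrated and other GPU types start deactivated;
//  - if any NVIDIA GPU is present, non-NVIDIA discrete GPUs start deactivated too.
bool ActivatedByDefault(const GPU& gpu, bool has_nvidia) {
  if (gpu.gpu_type != GpuType::kGpuTypeDiscrete)
    return false;
  if (has_nvidia && gpu.vendor != kNvidiaStr)
    return false;
  return true;
}

int main() {
  std::vector<GPU> gpus = {{"NVIDIA", GpuType::kGpuTypeDiscrete},
                           {"AMD", GpuType::kGpuTypeDiscrete},
                           {"Intel", GpuType::kGpuTypeIntegrated}};
  bool has_nvidia = std::any_of(gpus.begin(), gpus.end(), [](const GPU& g) {
    return g.vendor == kNvidiaStr;
  });
  for (const auto& g : gpus) {
    std::cout << g.vendor << " -> "
              << (ActivatedByDefault(g, has_nvidia) ? "activated" : "deactivated")
              << '\n';
  }
  // Prints: NVIDIA -> activated, AMD -> deactivated, Intel -> deactivated.
}
```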
3 changes: 2 additions & 1 deletion engine/services/hardware_service.h
@@ -1,8 +1,8 @@
#pragma once
#include <stdint.h>
#include <mutex>
#include <string>
#include <vector>
#include <mutex>

#include "common/hardware_config.h"
#include "database_service.h"
@@ -41,4 +41,5 @@ class HardwareService {
std::shared_ptr<DatabaseService> db_service_ = nullptr;
std::optional<cortex::hw::ActivateHardwareConfig> ahc_;
std::mutex mtx_;
cortex::hw::CpuInfo cpu_info_;
};
63 changes: 60 additions & 3 deletions engine/services/model_service.cc
@@ -143,6 +143,21 @@ cpp::result<DownloadTask, std::string> GetDownloadTask(
}
} // namespace

ModelService::ModelService(std::shared_ptr<DatabaseService> db_service,
std::shared_ptr<HardwareService> hw_service,
std::shared_ptr<DownloadService> download_service,
std::shared_ptr<InferenceService> inference_service,
std::shared_ptr<EngineServiceI> engine_svc,
cortex::TaskQueue& task_queue)
: db_service_(db_service),
hw_service_(hw_service),
download_service_{download_service},
inference_svc_(inference_service),
engine_svc_(engine_svc),
task_queue_(task_queue) {
ProcessBgrTasks();
};

void ModelService::ForceIndexingModelList() {
CTL_INF("Force indexing model list");

@@ -331,8 +346,17 @@ cpp::result<DownloadTask, std::string> ModelService::HandleDownloadUrlAsync(
return download_service_->AddTask(downloadTask, on_finished);
}

std::optional<hardware::Estimation> ModelService::GetEstimation(
const std::string& model_handle) {
std::lock_guard l(es_mtx_);
if (auto it = es_.find(model_handle); it != es_.end()) {
return it->second;
}
return std::nullopt;
}

cpp::result<std::optional<hardware::Estimation>, std::string>
ModelService::GetEstimation(const std::string& model_handle,
ModelService::EstimateModel(const std::string& model_handle,
const std::string& kv_cache, int n_batch,
int n_ubatch) {
namespace fs = std::filesystem;
@@ -548,7 +572,7 @@ ModelService::DownloadModelFromCortexsoAsync(
// Close the file
pyvenv_cfg.close();
// Add executable permission to python
set_permission_utils::SetExecutePermissionsRecursive(venv_path);
(void)set_permission_utils::SetExecutePermissionsRecursive(venv_path);
} else {
CTL_ERR("Failed to extract venv.zip");
};
@@ -828,7 +852,7 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
CTL_WRN("Error: " + res.error());
for (auto& depend : depends) {
if (depend != model_handle) {
StopModel(depend);
auto sr = StopModel(depend);
}
}
return cpp::fail("Model failed to start dependency '" + depend +
@@ -945,6 +969,11 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(

json_helper::MergeJson(json_data, params_override);

// Set default cpu_threads if it is not configured
if (!json_data.isMember("cpu_threads")) {
json_data["cpu_threads"] = GetCpuThreads();
}

// Set the latest ctx_len
if (ctx_len) {
json_data["ctx_len"] =
@@ -1329,6 +1358,10 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
return warning;
}

int ModelService::GetCpuThreads() const {
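  // Default to half the hardware threads, but never fewer than one.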
return std::max(std::thread::hardware_concurrency() / 2, 1u);
}

cpp::result<std::shared_ptr<ModelMetadata>, std::string>
ModelService::GetModelMetadata(const std::string& model_id) const {
if (model_id.empty()) {
@@ -1381,4 +1414,28 @@ std::string ModelService::GetEngineByModelId(
auto mc = yaml_handler.GetModelConfig();
CTL_DBG(mc.engine);
return mc.engine;
}

void ModelService::ProcessBgrTasks() {
CTL_INF("Start processing background tasks")
auto cb = [this] {
CTL_DBG("Estimate model resource usage");
auto list_entry = db_service_->LoadModelList();
if (list_entry) {
for (const auto& model_entry : list_entry.value()) {
// Only process local models
if (model_entry.status == cortex::db::ModelStatus::Downloaded) {
auto es = EstimateModel(model_entry.model);
if (es.has_value()) {
std::lock_guard l(es_mtx_);
es_[model_entry.model] = es.value();
}
}
}
}
};

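  // Run the estimation pass once immediately, then refresh it every 10 seconds.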
auto clone = cb;
task_queue_.RunInQueue(std::move(cb));
task_queue_.RunEvery(std::chrono::seconds(10), std::move(clone));
}
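The diff exercises `cortex::TaskQueue` only through `RunInQueue` (run once on a worker thread) and `RunEvery` (re-run on an interval). A minimal stand-in covering just that surface, not the real implementation in `utils/task_queue.h`, might look like:

```cpp
#include <chrono>
#include <condition_variable>
#include <cstdio>
#include <functional>
#include <mutex>
#include <queue>
#include <string>
#include <thread>
#include <vector>

class TaskQueue {
 public:
  TaskQueue(unsigned n_threads, std::string /*name*/) {
    if (n_threads == 0) n_threads = 1;
    for (unsigned i = 0; i < n_threads; ++i)
      workers_.emplace_back([this] { Loop(); });
  }
  ~TaskQueue() {
    {
      std::lock_guard<std::mutex> l(m_);
      stop_ = true;
    }
    cv_.notify_all();
    for (auto& t : timers_) t.join();    // timers stop enqueueing first
    for (auto& w : workers_) w.join();   // workers drain what is left
  }
  void RunInQueue(std::function<void()> f) {
    {
      std::lock_guard<std::mutex> l(m_);
      q_.push(std::move(f));
    }
    cv_.notify_all();
  }
  void RunEvery(std::chrono::seconds d, std::function<void()> f) {
    // One timer thread per periodic task: re-enqueue f on every tick until stop.
    timers_.emplace_back([this, d, f = std::move(f)] {
      std::unique_lock<std::mutex> l(m_);
      while (!cv_.wait_for(l, d, [this] { return stop_; })) {
        q_.push(f);
        cv_.notify_all();
      }
    });
  }

 private:
  void Loop() {
    for (;;) {
      std::function<void()> task;
      {
        std::unique_lock<std::mutex> l(m_);
        cv_.wait(l, [this] { return stop_ || !q_.empty(); });
        if (stop_ && q_.empty()) return;
        task = std::move(q_.front());
        q_.pop();
      }
      task();
    }
  }
  std::mutex m_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> q_;
  std::vector<std::thread> workers_;
  std::vector<std::thread> timers_;
  bool stop_ = false;
};

int main() {
  TaskQueue q(2, "background_task");
  q.RunInQueue([] { std::puts("one-off estimation pass"); });
  q.RunEvery(std::chrono::seconds(1), [] { std::puts("periodic refresh"); });
  std::this_thread::sleep_for(std::chrono::milliseconds(3500));
}  // ~TaskQueue stops the timer threads and joins the workers
```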
22 changes: 15 additions & 7 deletions engine/services/model_service.h
@@ -10,6 +10,7 @@
#include "services/download_service.h"
#include "services/hardware_service.h"
#include "utils/hardware/gguf/gguf_file_estimate.h"
#include "utils/task_queue.h"

class InferenceService;

@@ -35,12 +36,8 @@ class ModelService {
std::shared_ptr<HardwareService> hw_service,
std::shared_ptr<DownloadService> download_service,
std::shared_ptr<InferenceService> inference_service,
std::shared_ptr<EngineServiceI> engine_svc)
: db_service_(db_service),
hw_service_(hw_service),
download_service_{download_service},
inference_svc_(inference_service),
engine_svc_(engine_svc) {};
std::shared_ptr<EngineServiceI> engine_svc,
cortex::TaskQueue& task_queue);

cpp::result<std::string, std::string> AbortDownloadModel(
const std::string& task_id);
@@ -81,7 +78,10 @@

bool HasModel(const std::string& id) const;

cpp::result<std::optional<hardware::Estimation>, std::string> GetEstimation(
std::optional<hardware::Estimation> GetEstimation(
const std::string& model_handle);

cpp::result<std::optional<hardware::Estimation>, std::string> EstimateModel(
const std::string& model_handle, const std::string& kv_cache = "f16",
int n_batch = 2048, int n_ubatch = 2048);

@@ -112,6 +112,10 @@
const std::string& model_path, int ngl, int ctx_len, int n_batch = 2048,
int n_ubatch = 2048, const std::string& kv_cache_type = "f16");

void ProcessBgrTasks();

int GetCpuThreads() const;

std::shared_ptr<DatabaseService> db_service_;
std::shared_ptr<HardwareService> hw_service_;
std::shared_ptr<DownloadService> download_service_;
@@ -124,4 +128,8 @@
*/
std::unordered_map<std::string, std::shared_ptr<ModelMetadata>>
loaded_model_metadata_map_;

std::mutex es_mtx_;
std::unordered_map<std::string, std::optional<hardware::Estimation>> es_;
cortex::TaskQueue& task_queue_;
};