From 2659789b53826ba9988c4400035bf855b78266f3 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 11 Nov 2024 16:33:32 +0700 Subject: [PATCH 1/8] feat: model estimation --- engine/cli/commands/model_list_cmd.cc | 18 + engine/controllers/models.cc | 4 + engine/main.cc | 10 + engine/services/model_service.cc | 63 +- engine/services/model_service.h | 4 + engine/utils/hardware/gguf/ggml.h | 235 ++++++++ engine/utils/hardware/gguf/gguf_file.h | 539 ++++++++++++++++++ .../utils/hardware/gguf/gguf_file_estimate.h | 172 ++++++ 8 files changed, 1028 insertions(+), 17 deletions(-) create mode 100644 engine/utils/hardware/gguf/ggml.h create mode 100644 engine/utils/hardware/gguf/gguf_file.h create mode 100644 engine/utils/hardware/gguf/gguf_file_estimate.h diff --git a/engine/cli/commands/model_list_cmd.cc b/engine/cli/commands/model_list_cmd.cc index 41fe61d1c..75abea0e8 100644 --- a/engine/cli/commands/model_list_cmd.cc +++ b/engine/cli/commands/model_list_cmd.cc @@ -38,6 +38,9 @@ void ModelListCmd::Exec(const std::string& host, int port, column_headers.push_back("Version"); } + column_headers.push_back("CPU Mode"); + column_headers.push_back("GPU Mode"); + Row_t header{column_headers.begin(), column_headers.end()}; table.add_row(header); table.format().font_color(Color::green); @@ -70,6 +73,21 @@ void ModelListCmd::Exec(const std::string& host, int port, row.push_back(v["version"].asString()); } + if(auto& r = v["recommendation"]; !r.isNull()) { + if(!r["cpu_mode"].isNull()) { + row.push_back("RAM: " + r["cpu_mode"]["ram"].asString() + " MiB"); + } + + if(!r["gpu_mode"].isNull()) { + std::string s; + s += "ngl: " + r["gpu_mode"][0]["ngl"].asString() + " - "; + s += "context length: " + r["gpu_mode"][0]["context_length"].asString() + " - "; + s += "RAM: " + r["gpu_mode"][0]["ram"].asString() + " MiB - "; + s += "VRAM: " + r["gpu_mode"][0]["vram"].asString() + " MiB"; + row.push_back(s); + } + } + table.add_row({row.begin(), row.end()}); } } diff --git a/engine/controllers/models.cc b/engine/controllers/models.cc index 888983d7c..30e8eaad6 100644 --- a/engine/controllers/models.cc +++ b/engine/controllers/models.cc @@ -171,6 +171,10 @@ void Models::ListModel( Json::Value obj = model_config.ToJson(); obj["id"] = model_entry.model; obj["model"] = model_entry.model; + auto es = model_service_->GetEstimation(model_entry.model); + if (es.has_value()) { + obj["recommendation"] = hardware::ToJson(es.value()); + } data.append(std::move(obj)); yaml_handler.Reset(); } catch (const std::exception& e) { diff --git a/engine/main.cc b/engine/main.cc index 8eab545b9..e0ef37a27 100644 --- a/engine/main.cc +++ b/engine/main.cc @@ -18,6 +18,7 @@ #include "utils/event_processor.h" #include "utils/file_logger.h" #include "utils/file_manager_utils.h" +#include "utils/hardware/gguf/gguf_file_estimate.h" #include "utils/logging_utils.h" #include "utils/system_info_utils.h" @@ -197,6 +198,15 @@ void RunServer(std::optional port, bool ignore_cout) { } int main(int argc, char* argv[]) { + // hardware::RunConfig rc = {.ngl = 33, + // .ctx_len = 2048, + // .n_batch = 2048, + // .n_ubatch = 2048, + // .kv_cache_type = "f16"}; + // auto res = hardware::EstimateLLaMACppRun( + // "C:\\Users\\vansa\\cortexcpp-nightly\\models\\cortex.so\\llama3.1\\8b-gguf\\model.gguf", + // rc); + // return 0; // Stop the program if the system is not supported auto system_info = system_info_utils::GetSystemInfo(); if (system_info->arch == system_info_utils::kUnsupported || diff --git a/engine/services/model_service.cc 
b/engine/services/model_service.cc index 80d02f4c7..02454efb7 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -335,6 +335,41 @@ cpp::result ModelService::HandleDownloadUrlAsync( return download_service_->AddTask(downloadTask, on_finished); } +cpp::result ModelService::GetEstimation( + const std::string& model_handle) { + namespace fs = std::filesystem; + namespace fmu = file_manager_utils; + cortex::db::Models modellist_handler; + config::YamlHandler yaml_handler; + + try { + auto model_entry = modellist_handler.GetModelInfo(model_handle); + if (model_entry.has_error()) { + CTL_WRN("Error: " + model_entry.error()); + return cpp::fail(model_entry.error()); + } + auto file_path = fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.value().path_to_model_yaml)) + .parent_path() / + "model.gguf"; + yaml_handler.ModelConfigFromFile( + fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.value().path_to_model_yaml)) + .string()); + auto mc = yaml_handler.GetModelConfig(); + + return hardware::EstimateLLaMACppRun(file_path.string(), + {.ngl = mc.ngl, + .ctx_len = mc.ctx_len, + .n_batch = 2048, + .n_ubatch = 2048, + .kv_cache_type = "f16"}); + } catch (const std::exception& e) { + return cpp::fail("Fail to get model status with ID '" + model_handle + + "': " + e.what()); + } +} + cpp::result ModelService::HandleUrl( const std::string& url) { auto url_obj = url_parser::FromUrlString(url); @@ -771,27 +806,21 @@ cpp::result ModelService::StartModel( auto const& mp = json_data["model_path"].asString(); auto ngl = json_data["ngl"].asInt(); - // Bypass for now - auto vram_needed_MiB = 0u; - auto ram_needed_MiB = 0u; - - if (vram_needed_MiB > free_vram_MiB && is_cuda) { - CTL_WRN("Not enough VRAM - " << "required: " << vram_needed_MiB + hardware::RunConfig rc = {.ngl = ngl, + .ctx_len = json_data["ctx_len"].asInt(), + .n_batch = 2048, + .n_ubatch = 2048, + .kv_cache_type = "f16"}; + auto es = hardware::EstimateLLaMACppRun(mp, rc); + + if (es.gpu_mode.vram_MiB > free_vram_MiB && is_cuda) { + CTL_WRN("Not enough VRAM - " << "required: " << es.gpu_mode.vram_MiB << ", available: " << free_vram_MiB); - - return cpp::fail( - "Not enough VRAM - required: " + std::to_string(vram_needed_MiB) + - " MiB, available: " + std::to_string(free_vram_MiB) + - " MiB - Should adjust ngl to " + - std::to_string(free_vram_MiB / (vram_needed_MiB / ngl) - 1)); } - if (ram_needed_MiB > free_ram_MiB) { - CTL_WRN("Not enough RAM - " << "required: " << ram_needed_MiB + if (es.cpu_mode.ram_MiB > free_ram_MiB) { + CTL_WRN("Not enough RAM - " << "required: " << es.cpu_mode.ram_MiB << ", available: " << free_ram_MiB); - return cpp::fail( - "Not enough RAM - required: " + std::to_string(ram_needed_MiB) + - " MiB,, available: " + std::to_string(free_ram_MiB) + " MiB"); } assert(!!inference_svc_); diff --git a/engine/services/model_service.h b/engine/services/model_service.h index a29f092d4..323fdd588 100644 --- a/engine/services/model_service.h +++ b/engine/services/model_service.h @@ -8,6 +8,7 @@ #include "database/models.h" #include "services/download_service.h" #include "services/inference_service.h" +#include "utils/hardware/gguf/gguf_file_estimate.h" struct ModelPullInfo { std::string id; @@ -97,6 +98,9 @@ class ModelService { bool HasModel(const std::string& id) const; + cpp::result GetEstimation( + const std::string& model_handle); + private: /** * Handle downloading model which have following pattern: author/model_name diff --git a/engine/utils/hardware/gguf/ggml.h 
b/engine/utils/hardware/gguf/ggml.h new file mode 100644 index 000000000..d04cf4146 --- /dev/null +++ b/engine/utils/hardware/gguf/ggml.h @@ -0,0 +1,235 @@ +#pragma once +#include +#include +#include +#include +#include "utils/result.hpp" + +namespace hardware { +enum GGMLType { + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, + // GGML_TYPE_Q4_2 = 4, support has been removed + // GGML_TYPE_Q4_3 = 5, support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, + GGML_TYPE_IQ2_XXS = 16, + GGML_TYPE_IQ2_XS = 17, + GGML_TYPE_IQ3_XXS = 18, + GGML_TYPE_IQ1_S = 19, + GGML_TYPE_IQ4_NL = 20, + GGML_TYPE_IQ3_S = 21, + GGML_TYPE_IQ2_S = 22, + GGML_TYPE_IQ4_XS = 23, + GGML_TYPE_I8 = 24, + GGML_TYPE_I16 = 25, + GGML_TYPE_I32 = 26, + GGML_TYPE_I64 = 27, + GGML_TYPE_F64 = 28, + GGML_TYPE_IQ1_M = 29, + GGML_TYPE_BF16 = 30, + GGML_TYPE_Q4_0_4_4 = 31, + GGML_TYPE_Q4_0_4_8 = 32, + GGML_TYPE_Q4_0_8_8 = 33, + GGML_TYPE_TQ1_0 = 34, + GGML_TYPE_TQ2_0 = 35, + GGML_TYPE_COUNT, +}; + +inline float GetQuantBit(GGMLType gt) { + switch (gt) { + case GGML_TYPE_F32: + return 32.0; + case GGML_TYPE_F16: + return 16.0; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + return 2.5625; + case GGML_TYPE_Q3_K: + return 3.4375; + case GGML_TYPE_Q4_K: + return 4.5; + case GGML_TYPE_Q5_K: + return 5.5; + case GGML_TYPE_Q6_K: + return 6.5625; + case GGML_TYPE_Q8_K: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_I64: + case GGML_TYPE_F64: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_BF16: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: + + default: + return 8.0; + } +} + +inline std::string to_string(GGMLType t) { + switch (t) { + case GGML_TYPE_F32: + return "F32"; + case GGML_TYPE_F16: + return "F16"; + case GGML_TYPE_Q4_0: + return "Q4_0"; + case GGML_TYPE_Q4_1: + return "Q4_1"; + case GGML_TYPE_Q5_0: + return "Q5_0"; + case GGML_TYPE_Q5_1: + return "Q5_1"; + case GGML_TYPE_Q8_0: + return "Q8_0"; + case GGML_TYPE_Q8_1: + return "Q8_1"; + case GGML_TYPE_Q2_K: + return "Q2_K"; + case GGML_TYPE_Q3_K: + return "Q3_K"; + case GGML_TYPE_Q4_K: + return "Q4_K"; + case GGML_TYPE_Q5_K: + return "Q5_K"; + case GGML_TYPE_Q6_K: + return "Q6_K"; + case GGML_TYPE_Q8_K: + return "Q8_K"; + case GGML_TYPE_IQ2_XXS: + return "IQ2_XXS"; + case GGML_TYPE_IQ2_XS: + return "IQ2_XS"; + case GGML_TYPE_IQ3_XXS: + return "IQ3_XXS"; + case GGML_TYPE_IQ1_S: + return "IQ1_S"; + case GGML_TYPE_IQ4_NL: + return "IQ4_NL"; + case GGML_TYPE_IQ3_S: + return "IQ3_S"; + case GGML_TYPE_IQ2_S: + return "IQ2_S"; + case GGML_TYPE_IQ4_XS: + return "IQ4_XS"; + case GGML_TYPE_I8: + return "I8"; + case GGML_TYPE_I16: + return "I16"; + case GGML_TYPE_I32: + return "I32"; + case GGML_TYPE_I64: + return "I64"; + case GGML_TYPE_F64: + return "F64"; + case GGML_TYPE_IQ1_M: + return "IQ1_M"; + case GGML_TYPE_BF16: + return "BF16"; + case GGML_TYPE_Q4_0_4_4: + return "Q4_0_4_4"; + case GGML_TYPE_Q4_0_4_8: + return "Q4_0_4_8"; + case 
GGML_TYPE_Q4_0_8_8: + return "Q4_0_8_8"; + case GGML_TYPE_TQ1_0: + return "TQ1_0"; + case GGML_TYPE_TQ2_0: + return "TQ2_0"; + default: + return "Invalid"; + } +} + +struct GGMLTypeTrait { + uint64_t block_size; + uint64_t type_size; + bool is_quantized; +}; + +const std::unordered_map kGGMLTypeTraits = { + {GGML_TYPE_F32, {.block_size = 1, .type_size = 4}}, + {GGML_TYPE_F16, {.block_size = 1, .type_size = 2}}, + {GGML_TYPE_Q4_0, {.block_size = 32, .type_size = 18, .is_quantized = true}}, + {GGML_TYPE_Q4_1, {.block_size = 32, .type_size = 20, .is_quantized = true}}, + {GGML_TYPE_Q5_0, {.block_size = 32, .type_size = 22, .is_quantized = true}}, + {GGML_TYPE_Q5_1, {.block_size = 32, .type_size = 24, .is_quantized = true}}, + {GGML_TYPE_Q8_0, {.block_size = 32, .type_size = 34, .is_quantized = true}}, + {GGML_TYPE_Q8_1, {.block_size = 32, .type_size = 36, .is_quantized = true}}, + {GGML_TYPE_Q2_K, + {.block_size = 256, .type_size = 84, .is_quantized = true}}, + {GGML_TYPE_Q3_K, + {.block_size = 256, .type_size = 110, .is_quantized = true}}, + {GGML_TYPE_Q4_K, + {.block_size = 256, .type_size = 144, .is_quantized = true}}, + {GGML_TYPE_Q5_K, + {.block_size = 256, .type_size = 176, .is_quantized = true}}, + {GGML_TYPE_Q6_K, + {.block_size = 256, .type_size = 210, .is_quantized = true}}, + {GGML_TYPE_Q8_K, + {.block_size = 256, .type_size = 292, .is_quantized = true}}, + {GGML_TYPE_IQ2_XXS, + {.block_size = 256, .type_size = 66, .is_quantized = true}}, + {GGML_TYPE_IQ2_XS, + {.block_size = 256, .type_size = 74, .is_quantized = true}}, + {GGML_TYPE_IQ3_XXS, + {.block_size = 256, .type_size = 98, .is_quantized = true}}, + {GGML_TYPE_IQ1_S, + {.block_size = 256, .type_size = 50, .is_quantized = true}}, + {GGML_TYPE_IQ4_NL, + {.block_size = 32, .type_size = 18, .is_quantized = true}}, + {GGML_TYPE_IQ3_S, + {.block_size = 256, .type_size = 110, .is_quantized = true}}, + {GGML_TYPE_IQ2_S, + {.block_size = 256, .type_size = 82, .is_quantized = true}}, + {GGML_TYPE_IQ4_XS, + {.block_size = 256, .type_size = 136, .is_quantized = true}}, + {GGML_TYPE_I8, {.block_size = 1, .type_size = 1}}, + {GGML_TYPE_I16, {.block_size = 1, .type_size = 2}}, + {GGML_TYPE_I32, {.block_size = 1, .type_size = 4}}, + {GGML_TYPE_I64, {.block_size = 1, .type_size = 8}}, + {GGML_TYPE_F64, {.block_size = 1, .type_size = 8}}, + {GGML_TYPE_IQ1_M, + {.block_size = 256, .type_size = 56, .is_quantized = true}}, + {GGML_TYPE_BF16, {.block_size = 1, .type_size = 2}}, + {GGML_TYPE_Q4_0_4_4, + {.block_size = 32, .type_size = 18, .is_quantized = true}}, + {GGML_TYPE_Q4_0_4_8, + {.block_size = 32, .type_size = 18, .is_quantized = true}}, + {GGML_TYPE_Q4_0_8_8, + {.block_size = 32, .type_size = 18, .is_quantized = true}}, + {GGML_TYPE_TQ1_0, + {.block_size = 256, .type_size = 54, .is_quantized = true}}, + {GGML_TYPE_TQ2_0, + {.block_size = 256, .type_size = 66, .is_quantized = true}}, +}; +} // namespace hardware \ No newline at end of file diff --git a/engine/utils/hardware/gguf/gguf_file.h b/engine/utils/hardware/gguf/gguf_file.h new file mode 100644 index 000000000..9980247f0 --- /dev/null +++ b/engine/utils/hardware/gguf/gguf_file.h @@ -0,0 +1,539 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#include +#else +#include // For memory-mapped file +#include // For file descriptors +#endif + +#include "ggml.h" +#include "utils/string_utils.h" + +// #define GGUF_LOG(msg) \ +// do { \ +// std::cout << __FILE__ << 
"(@" << __LINE__ << "): " << msg << '\n'; \ +// } while (false) + +#define GGUF_LOG(msg) +namespace hardware { +#undef min +#undef max + +using GGUFMagic = uint32_t; +constexpr const GGUFMagic kGGUFMagicGGML = 0x67676d6c; +constexpr const GGUFMagic kGGUFMagicGGMF = 0x67676d66; +constexpr const GGUFMagic kGGUFMagicGGJT = 0x67676a74; +constexpr const GGUFMagic kGGUFMagicGGUFLe = 0x46554747; // GGUF +constexpr const GGUFMagic kGGUFMagicGGUFBe = 0x47475546; // GGUF + +using GGUFVersion = uint32_t; +constexpr const GGUFVersion kGGUFVersionV1 = 1; +constexpr const GGUFVersion kGGUFVersionV2 = 2; +constexpr const GGUFVersion kGGUFVersionV3 = 3; + +enum GGUFMetadataValueType : uint32_t { + GGUFMetadataValueTypeUint8 = 0, + GGUFMetadataValueTypeInt8, + GGUFMetadataValueTypeUint16, + GGUFMetadataValueTypeInt16, + GGUFMetadataValueTypeUint32, + GGUFMetadataValueTypeInt32, + GGUFMetadataValueTypeFloat32, + GGUFMetadataValueTypeBool, + GGUFMetadataValueTypeString, + GGUFMetadataValueTypeArray, + GGUFMetadataValueTypeUint64, + GGUFMetadataValueTypeInt64, + GGUFMetadataValueTypeFloat64, + _GGUFMetadataValueTypeCount // Unknown +}; + +struct GGUFMetadataKV { + // Key is the key of the metadata key-value pair, + // which is no larger than 64 bytes long. + std::string key; // Using std::string for dynamic string handling + + // ValueType is the type of the metadata value. + GGUFMetadataValueType value_type; // Enum to represent value types + + // Value is the value of the metadata key-value pair. + std::any value; +}; + +struct GGUFMetadataKVArrayValue { + /* Basic */ + + // type is the type of the array item. + GGUFMetadataValueType type; // Enum to represent value types + + // Len is the length of the array. + uint64_t len; // Using uint64_t for length + + // Array holds all array items. + std::vector arr; + /* Appendix */ + + // start_offset is the offset in bytes of the GGUFMetadataKVArrayValue in the GGUFFile file. + int64_t start_offset; // Using int64_t for offset + + // Size is the size of the array in bytes. 
+  int64_t size;  // Using int64_t for size
+};
+
+inline std::string to_string(GGUFMetadataValueType vt, const std::any& v) {
+  switch (vt) {
+    case GGUFMetadataValueTypeUint8:
+      return std::to_string(std::any_cast<uint8_t>(v));
+    case GGUFMetadataValueTypeInt8:
+      return std::to_string(std::any_cast<int8_t>(v));
+    case GGUFMetadataValueTypeUint16:
+      return std::to_string(std::any_cast<uint16_t>(v));
+    case GGUFMetadataValueTypeInt16:
+      return std::to_string(std::any_cast<int16_t>(v));
+    case GGUFMetadataValueTypeUint32:
+      return std::to_string(std::any_cast<uint32_t>(v));
+    case GGUFMetadataValueTypeInt32:
+      return std::to_string(std::any_cast<int32_t>(v));
+    case GGUFMetadataValueTypeFloat32:
+      return std::to_string(std::any_cast<float>(v));
+    case GGUFMetadataValueTypeBool:
+      return std::to_string(std::any_cast<bool>(v));
+    case GGUFMetadataValueTypeString:
+      return std::any_cast<std::string>(v);
+    case GGUFMetadataValueTypeUint64:
+      return std::to_string(std::any_cast<uint64_t>(v));
+    case GGUFMetadataValueTypeInt64:
+      return std::to_string(std::any_cast<int64_t>(v));
+    case GGUFMetadataValueTypeFloat64:
+      return std::to_string(std::any_cast<double>(v));
+    default:
+      break;
+  }
+  return "array";
+}
+
+inline std::string to_string(const GGUFMetadataKVArrayValue& arr_v) {
+  std::string res;
+  auto num = std::min(size_t(5), arr_v.arr.size());
+  for (size_t i = 0; i < num; i++) {
+    res += to_string(arr_v.type, arr_v.arr[i]) + " ";
+  }
+  return res;
+}
+
+inline std::string to_string(const GGUFMetadataKV& kv) {
+  switch (kv.value_type) {
+    case GGUFMetadataValueTypeUint8:
+      return std::to_string(std::any_cast<uint8_t>(kv.value));
+    case GGUFMetadataValueTypeInt8:
+      return std::to_string(std::any_cast<int8_t>(kv.value));
+    case GGUFMetadataValueTypeUint16:
+      return std::to_string(std::any_cast<uint16_t>(kv.value));
+    case GGUFMetadataValueTypeInt16:
+      return std::to_string(std::any_cast<int16_t>(kv.value));
+    case GGUFMetadataValueTypeUint32:
+      return std::to_string(std::any_cast<uint32_t>(kv.value));
+    case GGUFMetadataValueTypeInt32:
+      return std::to_string(std::any_cast<int32_t>(kv.value));
+    case GGUFMetadataValueTypeFloat32:
+      return std::to_string(std::any_cast<float>(kv.value));
+    case GGUFMetadataValueTypeBool:
+      return std::to_string(std::any_cast<bool>(kv.value));
+    case GGUFMetadataValueTypeString:
+      return std::any_cast<std::string>(kv.value);
+    case GGUFMetadataValueTypeUint64:
+      return std::to_string(std::any_cast<uint64_t>(kv.value));
+    case GGUFMetadataValueTypeInt64:
+      return std::to_string(std::any_cast<int64_t>(kv.value));
+    case GGUFMetadataValueTypeFloat64:
+      return std::to_string(std::any_cast<double>(kv.value));
+    case GGUFMetadataValueTypeArray:
+      return to_string(std::any_cast<GGUFMetadataKVArrayValue>(kv.value));
+    default:
+      break;
+  }
+  return "Invalid type ";
+}
+
+struct GGUFTensorInfo {
+  /* Basic */
+  std::string name;
+
+  // NDimensions is the number of dimensions of the tensor.
+  uint32_t n_dimensions;
+  // Dimensions is the dimensions of the tensor,
+  // the length is NDimensions.
+  std::vector<uint64_t> dimensions;
+  // type is the type of the tensor.
+  GGMLType type;
+  // Offset is the offset in bytes of the tensor's data in this file.
+  //
+  // The offset is relative to tensor data, not to the start of the file.
+  uint64_t offset;
+
+  /* Appendix */
+
+  // StartOffset is the offset in bytes of the GGUFTensorInfo in the GGUFFile file.
+  //
+  // The offset is the start of the file.
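// Example entries (hypothetical shapes and types) for the two tensors that
// gguf_file_estimate.h later looks up by name to pick quant_bit_in and
// quant_bit_out via GetQuantBit():
//   { name = "token_embd.weight", n_dimensions = 2, dimensions = {4096, 128256}, type = GGML_TYPE_Q4_K }
//   { name = "output.weight",     n_dimensions = 2, dimensions = {4096, 128256}, type = GGML_TYPE_Q6_K }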
+ int64_t start_offset; +}; + +struct GGUFHelper { + uint8_t* data; + uint8_t* d_close; + uint64_t file_size; + + bool OpenAndMMap(const std::string& file_path) { +#ifdef _WIN32 + HANDLE file_handle = INVALID_HANDLE_VALUE; + HANDLE file_mapping = nullptr; + file_handle = + CreateFileA(file_path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, + OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); + if (file_handle == INVALID_HANDLE_VALUE) { + std::cout << "Failed to open file" << std::endl; + return false; + } + // Get the file size + LARGE_INTEGER file_size_struct; + if (!GetFileSizeEx(file_handle, &file_size_struct)) { + CloseHandle(file_handle); + std::cout << "Failed to open file" << std::endl; + return false; + } + file_size = static_cast(file_size_struct.QuadPart); + + // Create a file mapping object + file_mapping = + CreateFileMappingA(file_handle, nullptr, PAGE_READONLY, 0, 0, nullptr); + if (file_mapping == nullptr) { + CloseHandle(file_handle); + std::cout << "Failed to create file mapping" << std::endl; + return false; + } + + // Map the file into memory + data = static_cast( + MapViewOfFile(file_mapping, FILE_MAP_READ, 0, 0, file_size)); + if (data == nullptr) { + CloseHandle(file_mapping); + CloseHandle(file_handle); + std::cout << "Failed to map file" << std::endl; + return false; + } + + // Close the file handle, as it is no longer needed after mapping + CloseHandle(file_handle); + d_close = data; +#else + file_size = std::filesystem::file_size(file_path); + + int fd = open(file_path.c_str(), O_RDONLY); + // Memory-map the file + data = static_cast( + mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0)); + if (data == MAP_FAILED) { + perror("Error mapping file"); + close(fd); + return false; + } + + close(fd); + d_close = data; +#endif + return true; + } + + ~GGUFHelper() { Close(); } + + void Close() { +#ifdef _WIN32 + if (d_close != nullptr) { + UnmapViewOfFile(d_close); + d_close = nullptr; + } +#else + if (d_close != nullptr && d_close != MAP_FAILED) { + munmap(d_close, file_size); + d_close = nullptr; + } +#endif + } + + template + T Read() { + static_assert(std::is_floating_point::value || + std::is_integral::value || std::is_same::value); + T res = *reinterpret_cast(data); + data += sizeof(T); + return res; + } + + std::string ReadString() { + auto l = Read(); + std::string res(reinterpret_cast(data), l); + auto r = res; + data += l; + return r; + } + + GGUFMetadataKVArrayValue ReadArray() { + GGUFMetadataKVArrayValue v; + v.start_offset = (data - d_close); + v.type = static_cast(Read()); + auto arr_length = Read(); + for (uint64_t i = 0; i < arr_length; ++i) { + switch (v.type) { + case GGUFMetadataValueTypeUint8: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeInt8: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeUint16: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeInt16: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeUint32: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeInt32: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeFloat32: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeBool: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeString: + v.arr.push_back(ReadString()); + break; + case GGUFMetadataValueTypeUint64: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeInt64: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeFloat64: + v.arr.push_back(Read()); + break; + default: + std::cout << "Invalid type: 
" << std::to_string(v.type); + } + } + v.size = data - v.start_offset - d_close - 4 - 8; + return v; + } + + std::any ReadValue(GGUFMetadataValueType vt) { + switch (vt) { + case GGUFMetadataValueTypeUint8: + return Read(); + case GGUFMetadataValueTypeInt8: + return Read(); + case GGUFMetadataValueTypeUint16: + return Read(); + case GGUFMetadataValueTypeInt16: + return Read(); + case GGUFMetadataValueTypeUint32: + return Read(); + case GGUFMetadataValueTypeInt32: + return Read(); + case GGUFMetadataValueTypeFloat32: + return Read(); + case GGUFMetadataValueTypeBool: + return Read(); + case GGUFMetadataValueTypeString: + return ReadString(); + case GGUFMetadataValueTypeArray: + return ReadArray(); + case GGUFMetadataValueTypeUint64: + return Read(); + case GGUFMetadataValueTypeInt64: + return Read(); + case GGUFMetadataValueTypeFloat64: + return Read(); + default: + std::cout << "Invalid type: " << vt; + return {}; + } + } + + GGUFMetadataKV ReadMetadataKV() { + GGUFMetadataKV kv; + kv.key = ReadString(); + auto vt = Read(); + kv.value_type = GGUFMetadataValueType(vt); + kv.value = ReadValue(kv.value_type); + return kv; + } + + std::shared_ptr ReadTensorInfo() { + auto ti = std::make_shared(); + ti->start_offset = data - d_close; + ti->name = ReadString(); + ti->n_dimensions = Read(); + ti->dimensions.resize(ti->n_dimensions); + for (size_t i = 0; i < ti->n_dimensions; i++) { + ti->dimensions[i] = Read(); + } + auto v = Read(); + ti->type = GGMLType(v); + ti->offset = Read(); + return ti; + } +}; + +constexpr const auto ErrGGUFFileInvalidFormat = "invalid GGUF format"; + +struct GGUFHeader { + // Magic is a magic number that announces that this is a GGUF file. + GGUFMagic magic; + // Version is a version of the GGUF file format. + GGUFVersion version; + // TensorCount is the number of tensors in the file. + uint64_t tensor_count; + // MetadataKVCount is the number of key-value pairs in the metadata. + uint64_t metadata_kv_count; + // MetadataKV are the key-value pairs in the metadata, + std::vector metadata_kv; + + std::pair Get(const std::string& name) { + for (auto const& kv : metadata_kv) { + if (kv.key == name) { + return std::pair(kv, true); + } + } + return std::pair(GGUFMetadataKV{}, false); + } +}; + +struct GGUFFile { + /* Basic */ + + // header is the header of the GGUF file. + GGUFHeader header; + // tensor_infos are the tensor infos of the GGUF file, + // the size of TensorInfos is equal to `Header.TensorCount`. + std::vector> tensor_infos; + + // padding is the padding size of the GGUF file, + // which is used to split Header and TensorInfos from tensor data. + int64_t padding; + // split_paddings holds the padding size slice of the GGUF file splits, + // each item represents splitting Header and TensorInfos from tensor data. + // + // The length of split_paddings is the number of split files. + std::vector split_paddings; + // tensor_data_start_offset is the offset in bytes of the tensor data in this file. + // + // The offset is the start of the file. + int64_t tensor_data_start_offset; + // split_tensor_data_start_offsets holds the offset slice in bytes of the tensor data of the GGUF file splits, + // each item represents the offset of the tensor data in the split file. + // + // The length of split_tensor_data_start_offsets is the number of split files. + std::vector split_tensor_data_start_offsets; + + /* Appendix */ + + // size is the size of the GGUF file, + // if the file is split, the size is the sum of all split files. 
+ uint64_t size; + // split_sizes holds the size slice of the GGUF file splits, + // each item represents the size of the split file. + // + // The length of split_sizes is the number of split files. + std::vector split_sizes; + // model_size is the size of the model when loading. + uint64_t model_size; + // split_model_sizes holds the size slice of the model, + // each item represents a size when loading of the split file. + // + // The length of split_model_sizes is the number of split files. + std::vector split_model_sizes; + + // model_parameters is the number of the model parameters. + uint64_t model_parameters; + // model_bits_per_weight is the bits per weight of the model, + // which describes how many bits are used to store a weight, + // higher is better. + double model_bits_per_weight; +}; + +inline GGUFFile ParseGgufFile(const std::string& path) { + GGUFFile gf; + GGUFHelper h; + h.OpenAndMMap(path); + + GGUFMagic magic = h.Read(); + // GGUF_LOG("magic: " << magic); + gf.header.magic = magic; + GGUFVersion version = h.Read(); + auto tensor_count = h.Read(); + // GGUF_LOG("tensor_count: " << tensor_count); + gf.header.tensor_count += tensor_count; + + auto metadata_kv_count = h.Read(); + gf.header.metadata_kv_count += metadata_kv_count; + // GGUF_LOG("metadata_kv_count: " << metadata_kv_count); + + // metadata kv + { + std::vector kvs; + kvs.resize(metadata_kv_count); + for (size_t i = 0; i < metadata_kv_count; i++) { + kvs[i] = h.ReadMetadataKV(); + GGUF_LOG("i: " << i << " " << kvs[i].value_type << " " << kvs[i].key + << ": " << to_string(kvs[i])); + } + for (auto const& kv : kvs) { + if (kv.key == "split.no") { + gf.header.metadata_kv_count--; + continue; + } + gf.header.metadata_kv.push_back(kv); + } + } + + { + std::vector> tis; + tis.resize(tensor_count); + for (size_t i = 0; i < tensor_count; i++) { + tis[i] = h.ReadTensorInfo(); + // auto tto_string = [](const std::vector& ds) -> std::string { + // std::string res = "["; + // for (auto d : ds) + // res += std::to_string(d) + " "; + // return res + "]"; + // }; + // auto ds = tto_string(tis[i]->dimensions); + // GGUF_LOG("i: " << i << " name: " << tis[i]->name + // << " type: " << to_string(tis[i]->type) << " dimensions: " + // << std::to_string(tis[i]->n_dimensions) << " " << ds); + } + gf.tensor_infos = tis; + } + return gf; +} +} // namespace hardware \ No newline at end of file diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h new file mode 100644 index 000000000..31b328608 --- /dev/null +++ b/engine/utils/hardware/gguf/gguf_file_estimate.h @@ -0,0 +1,172 @@ +#pragma once +#include +#include +#include "gguf_file.h" +#include "json/json.h" + +namespace hardware { +inline uint64_t BytesToMiB(uint64_t b) { + return (double)b / 1024 / 1024; +}; +struct RunConfig { + int ngl; + int ctx_len; + int n_batch; + int n_ubatch; + std::string kv_cache_type; +}; + +struct CpuMode { + int64_t ram_MiB; +}; + +struct GpuMode { + int64_t ram_MiB; + int64_t vram_MiB; + int ngl; + int ctx_len; +}; + +struct Estimation { + CpuMode cpu_mode; + GpuMode gpu_mode; +}; + +inline Json::Value ToJson(const Estimation& es) { + Json::Value res; + Json::Value cpu; + cpu["ram"] = es.cpu_mode.ram_MiB; + Json::Value gpus(Json::arrayValue); + Json::Value gpu; + gpu["ram"] = es.gpu_mode.ram_MiB; + gpu["vram"] = es.gpu_mode.vram_MiB; + gpu["ngl"] = es.gpu_mode.ngl; + gpu["context_length"] = es.gpu_mode.ctx_len; + gpus.append(gpu); + res["cpu_mode"] = cpu; + res["gpu_mode"] = gpus; + return res; 
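// Illustrative JSON shape produced by ToJson() (numbers are made up); this is
// the "recommendation" object that controllers/models.cc attaches and that
// cli/commands/model_list_cmd.cc reads back:
//   "recommendation": {
//     "cpu_mode": { "ram": 6100 },
//     "gpu_mode": [ { "ram": 501, "vram": 5600, "ngl": 33, "context_length": 8192 } ]
//   }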
+} + +inline float GetQuantBit(const std::string& kv_cache_t) { + if (kv_cache_t == "f16") { + return 16.0; + } else if (kv_cache_t == "q8_0") { + return 8.0; + } else if (kv_cache_t == "q4_0") { + return 4.5; + } + return 16.0; +} + +inline Estimation EstimateLLaMACppRun(const std::string& file_path, + const RunConfig& rc) { + Estimation res; + // token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit/16 bytes + //RAM = token_embeddings_size + ((total_ngl-ngl) >=1 ? Output_layer_size + (total_ngl - ngl - 1 ) / (total_ngl-1) * (total_file_size - token_embeddings_size - Output_layer_size) : 0 ) (bytes) + + // VRAM = total_file_size - RAM (bytes) + auto gf = ParseGgufFile(file_path); + int32_t embedding_length = 0; + int64_t n_vocab = 0; + int32_t num_block = 0; + int32_t total_ngl = 0; + auto file_size = std::filesystem::file_size(file_path); + for (auto const& kv : gf.header.metadata_kv) { + if (kv.key == "llama.embedding_length") { + embedding_length = std::any_cast(kv.value); + } else if (kv.key == "tokenizer.ggml.tokens") { + n_vocab = std::any_cast(kv.value).arr.size(); + } else if (kv.key == "llama.block_count") { + num_block = std::any_cast(kv.value); + total_ngl = num_block + 1; + } + } + + // token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit_in/16 bytes + int32_t quant_bit_in = 0; + int32_t quant_bit_out = 0; + + for (auto const& ti : gf.tensor_infos) { + if (ti->name == "output.weight") { + quant_bit_out = GetQuantBit(ti->type); + std::cout << ti->type << std::endl; + } else if (ti->name == "token_embd.weight") { + quant_bit_in = GetQuantBit(ti->type); + std::cout << ti->type << std::endl; + } + } + // output.weight + // token_embd.weight + std::cout << "embedding_length: " << embedding_length << std::endl; + std::cout << "n_vocab: " << n_vocab << std::endl; + std::cout << "file_size: " << file_size << std::endl; + // Model weight + int64_t token_embeddings_size = + n_vocab * embedding_length * 2 * quant_bit_in / 16; + int64_t output_layer_size = + n_vocab * embedding_length * 2 * quant_bit_out / 16; + // RAM = token_embeddings_size + ((total_ngl-ngl) >=1 ? 
output_layer_size + (total_ngl - ngl - 1 ) / (total_ngl-1) * (total_file_size - token_embeddings_size - output_layer_size) : 0 ) (bytes) + int64_t offload = 0; + if (total_ngl >= rc.ngl + 1) { + offload = output_layer_size + + (double)(total_ngl - rc.ngl - 1) / (total_ngl - 1) * + (file_size - token_embeddings_size - output_layer_size); + } + + int64_t ram_usage = token_embeddings_size + offload; + int64_t vram_usage = file_size - ram_usage; + std::cout << "token_embeddings_size: " << BytesToMiB(token_embeddings_size) + << std::endl; + std::cout << "output_layer_size: " << BytesToMiB(output_layer_size) + << std::endl; + std::cout << "ram_usage: " << BytesToMiB(ram_usage) << std::endl; + std::cout << "vram_usage: " << BytesToMiB(vram_usage) << std::endl; + + // KV cache + // kv_cache_size = ctx_len/8192 * hidden_dim/4096 * quant_bit/16 * num_block/33 * 1 (GB) + auto hidden_dim = embedding_length; + int kv_quant_bit = + GetQuantBit(rc.kv_cache_type); // f16, 8 bits for q8_0, 4.5 bits for q4_0 + int64_t kv_cache_size = (double)(1024 * 1024 * 1024) * rc.ctx_len / 8192 * + hidden_dim / 4096 * kv_quant_bit / 16 * num_block / + 33; //(bytes) + + std::cout << "kv_cache_size: " << BytesToMiB(kv_cache_size) << std::endl; + + // VRAM = (min(n_batch, n_ubatch))/ 512 * 266 (MiB) + int64_t preprocessing_buffer_size = + (double)std::min(rc.n_batch, rc.n_ubatch) / 512 * 266 * 1024 * 1024 / + 4; //(bytes) + if (total_ngl != rc.ngl) { + preprocessing_buffer_size += output_layer_size; + } + std::cout << "preprocessing_buffer_size: " + << BytesToMiB(preprocessing_buffer_size) << std::endl; + + // CPU mode + { + // Model weight + int64_t model_weight = file_size; + // KV cache + // Buffer + res.cpu_mode.ram_MiB = + BytesToMiB(model_weight + kv_cache_size + preprocessing_buffer_size); + } + // GPU mode + { + res.gpu_mode.ctx_len = rc.ctx_len; + res.gpu_mode.ngl = rc.ngl; + res.gpu_mode.ram_MiB = BytesToMiB(ram_usage); + res.gpu_mode.vram_MiB = + BytesToMiB(vram_usage + kv_cache_size + preprocessing_buffer_size); + } + return res; +} +// CPU_Mapped model buffer size = 35.16 MiB +// CUDA0 model buffer size = 601.02 MiB +// CUDA0 KV buffer size = 88.00 MiB +// CUDA_Host output buffer size = 0.12 MiB +// CUDA0 compute buffer size = 266.00 MiB +// CUDA_Host compute buffer size = 48.02 MiB +} // namespace hardware \ No newline at end of file From e4bcb3ab96ab8ed88846439ea9a65e5f89b7d3d9 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 22 Nov 2024 09:40:00 +0700 Subject: [PATCH 2/8] fix: cleanup and improve --- engine/cli/command_line_parser.cc | 13 ++++-- engine/cli/command_line_parser.h | 2 + engine/cli/commands/model_list_cmd.cc | 40 ++++++++++++------- engine/cli/commands/model_list_cmd.h | 3 +- engine/main.cc | 9 ----- engine/services/model_service.cc | 13 ++++-- .../utils/hardware/gguf/gguf_file_estimate.h | 15 ++++--- 7 files changed, 58 insertions(+), 37 deletions(-) diff --git a/engine/cli/command_line_parser.cc b/engine/cli/command_line_parser.cc index bb41dbe8b..8b540d1fd 100644 --- a/engine/cli/command_line_parser.cc +++ b/engine/cli/command_line_parser.cc @@ -246,14 +246,19 @@ void CommandLineParser::SetupModelCommands() { "Display engine"); list_models_cmd->add_flag("-v,--version", cml_data_.display_version, "Display version"); + list_models_cmd->add_flag("--cpu_mode", cml_data_.display_cpu_mode, + "Display cpu mode"); + list_models_cmd->add_flag("--gpu_mode", cml_data_.display_gpu_mode, + "Display gpu mode"); list_models_cmd->group(kSubcommands); list_models_cmd->callback([this]() { if 
(std::exchange(executed_, true)) return; - commands::ModelListCmd().Exec(cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), - cml_data_.filter, cml_data_.display_engine, - cml_data_.display_version); + commands::ModelListCmd().Exec( + cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.filter, + cml_data_.display_engine, cml_data_.display_version, + cml_data_.display_cpu_mode, cml_data_.display_gpu_mode); }); auto get_models_cmd = diff --git a/engine/cli/command_line_parser.h b/engine/cli/command_line_parser.h index bce83222a..f7ca3f507 100644 --- a/engine/cli/command_line_parser.h +++ b/engine/cli/command_line_parser.h @@ -64,6 +64,8 @@ class CommandLineParser { // for model list bool display_engine = false; bool display_version = false; + bool display_cpu_mode = false; + bool display_gpu_mode = false; std::string filter = ""; std::string log_level = "INFO"; diff --git a/engine/cli/commands/model_list_cmd.cc b/engine/cli/commands/model_list_cmd.cc index 75abea0e8..e598e22d5 100644 --- a/engine/cli/commands/model_list_cmd.cc +++ b/engine/cli/commands/model_list_cmd.cc @@ -19,7 +19,8 @@ using Row_t = void ModelListCmd::Exec(const std::string& host, int port, const std::string& filter, bool display_engine, - bool display_version) { + bool display_version, bool display_cpu_mode, + bool display_gpu_mode) { // Start server if server is not started yet if (!commands::IsServerAlive(host, port)) { CLI_LOG("Starting server ..."); @@ -38,9 +39,12 @@ void ModelListCmd::Exec(const std::string& host, int port, column_headers.push_back("Version"); } - column_headers.push_back("CPU Mode"); - column_headers.push_back("GPU Mode"); - + if (display_cpu_mode) { + column_headers.push_back("CPU Mode"); + } + if (display_gpu_mode) { + column_headers.push_back("GPU Mode"); + } Row_t header{column_headers.begin(), column_headers.end()}; table.add_row(header); table.format().font_color(Color::green); @@ -73,18 +77,26 @@ void ModelListCmd::Exec(const std::string& host, int port, row.push_back(v["version"].asString()); } - if(auto& r = v["recommendation"]; !r.isNull()) { - if(!r["cpu_mode"].isNull()) { - row.push_back("RAM: " + r["cpu_mode"]["ram"].asString() + " MiB"); + if (auto& r = v["recommendation"]; !r.isNull()) { + if (display_cpu_mode) { + if (!r["cpu_mode"].isNull()) { + row.push_back("RAM: " + r["cpu_mode"]["ram"].asString() + + " MiB"); + } } - if(!r["gpu_mode"].isNull()) { - std::string s; - s += "ngl: " + r["gpu_mode"][0]["ngl"].asString() + " - "; - s += "context length: " + r["gpu_mode"][0]["context_length"].asString() + " - "; - s += "RAM: " + r["gpu_mode"][0]["ram"].asString() + " MiB - "; - s += "VRAM: " + r["gpu_mode"][0]["vram"].asString() + " MiB"; - row.push_back(s); + if (display_gpu_mode) { + if (!r["gpu_mode"].isNull()) { + std::string s; + s += "ngl: " + r["gpu_mode"][0]["ngl"].asString() + " - "; + s += "context: " + + r["gpu_mode"][0]["context_length"].asString() + " - "; + s += "RAM: " + r["gpu_mode"][0]["ram"].asString() + " MiB - "; + s += "VRAM: " + r["gpu_mode"][0]["vram"].asString() + " MiB - "; + s += "recommend ngl: " + + r["gpu_mode"][0]["recommend_ngl"].asString(); + row.push_back(s); + } } } diff --git a/engine/cli/commands/model_list_cmd.h b/engine/cli/commands/model_list_cmd.h index 2e7c446e7..791c1ecf6 100644 --- a/engine/cli/commands/model_list_cmd.h +++ b/engine/cli/commands/model_list_cmd.h @@ -7,6 +7,7 @@ namespace commands { class ModelListCmd { public: void Exec(const std::string& host, int port, const 
std::string& filter, - bool display_engine = false, bool display_version = false); + bool display_engine = false, bool display_version = false, + bool display_cpu_mode = false, bool display_gpu_mode = false); }; } // namespace commands diff --git a/engine/main.cc b/engine/main.cc index e0ef37a27..095461333 100644 --- a/engine/main.cc +++ b/engine/main.cc @@ -198,15 +198,6 @@ void RunServer(std::optional port, bool ignore_cout) { } int main(int argc, char* argv[]) { - // hardware::RunConfig rc = {.ngl = 33, - // .ctx_len = 2048, - // .n_batch = 2048, - // .n_ubatch = 2048, - // .kv_cache_type = "f16"}; - // auto res = hardware::EstimateLLaMACppRun( - // "C:\\Users\\vansa\\cortexcpp-nightly\\models\\cortex.so\\llama3.1\\8b-gguf\\model.gguf", - // rc); - // return 0; // Stop the program if the system is not supported auto system_info = system_info_utils::GetSystemInfo(); if (system_info->arch == system_info_utils::kUnsupported || diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 02454efb7..61bbc2447 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -357,13 +357,19 @@ cpp::result ModelService::GetEstimation( fs::path(model_entry.value().path_to_model_yaml)) .string()); auto mc = yaml_handler.GetModelConfig(); - + services::HardwareService hw_svc; + auto hw_info = hw_svc.GetHardwareInfo(); + auto free_vram_MiB = 0u; + for (const auto& gpu : hw_info.gpus) { + free_vram_MiB += gpu.free_vram; + } return hardware::EstimateLLaMACppRun(file_path.string(), {.ngl = mc.ngl, .ctx_len = mc.ctx_len, .n_batch = 2048, .n_ubatch = 2048, - .kv_cache_type = "f16"}); + .kv_cache_type = "f16", + .free_vram_MiB = free_vram_MiB}); } catch (const std::exception& e) { return cpp::fail("Fail to get model status with ID '" + model_handle + "': " + e.what()); @@ -810,7 +816,8 @@ cpp::result ModelService::StartModel( .ctx_len = json_data["ctx_len"].asInt(), .n_batch = 2048, .n_ubatch = 2048, - .kv_cache_type = "f16"}; + .kv_cache_type = "f16", + .free_vram_MiB = free_vram_MiB}; auto es = hardware::EstimateLLaMACppRun(mp, rc); if (es.gpu_mode.vram_MiB > free_vram_MiB && is_cuda) { diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h index 31b328608..2d22886aa 100644 --- a/engine/utils/hardware/gguf/gguf_file_estimate.h +++ b/engine/utils/hardware/gguf/gguf_file_estimate.h @@ -14,6 +14,7 @@ struct RunConfig { int n_batch; int n_ubatch; std::string kv_cache_type; + int64_t free_vram_MiB; }; struct CpuMode { @@ -25,6 +26,7 @@ struct GpuMode { int64_t vram_MiB; int ngl; int ctx_len; + int recommend_ngl; }; struct Estimation { @@ -42,6 +44,7 @@ inline Json::Value ToJson(const Estimation& es) { gpu["vram"] = es.gpu_mode.vram_MiB; gpu["ngl"] = es.gpu_mode.ngl; gpu["context_length"] = es.gpu_mode.ctx_len; + gpu["recommend_ngl"] = es.gpu_mode.recommend_ngl; gpus.append(gpu); res["cpu_mode"] = cpu; res["gpu_mode"] = gpus; @@ -160,13 +163,13 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path, res.gpu_mode.ram_MiB = BytesToMiB(ram_usage); res.gpu_mode.vram_MiB = BytesToMiB(vram_usage + kv_cache_size + preprocessing_buffer_size); + if (rc.free_vram_MiB > res.gpu_mode.vram_MiB) { + res.gpu_mode.recommend_ngl = total_ngl; + } else { + res.gpu_mode.recommend_ngl = + (double)rc.free_vram_MiB / res.gpu_mode.vram_MiB * rc.ngl; + } } return res; } -// CPU_Mapped model buffer size = 35.16 MiB -// CUDA0 model buffer size = 601.02 MiB -// CUDA0 KV buffer size = 88.00 MiB -// CUDA_Host output 
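// Worked example of the estimate above (all input values are assumed for
// illustration, not measured): n_vocab = 128256, embedding_length = 4096,
// block_count = 32 (so total_ngl = 33), token_embd.weight stored as Q8_0
// (8 effective bits), ctx_len = 8192, f16 KV cache, ngl = 33:
//   token_embeddings_size = 128256 * 4096 * 2 * 8 / 16 = 525,336,576 B ~ 501 MiB
//   offload    = 0 (total_ngl >= ngl + 1 is false)  ->  ram_usage ~ 501 MiB
//   vram_usage = file_size - ram_usage
//   kv_cache_size = 1 GiB * 8192/8192 * 4096/4096 * 16/16 * 32/33 ~ 993 MiB
//   recommend_ngl: if the estimated gpu_mode.vram_MiB were 5600 with only
//   4200 MiB of free VRAM, 4200.0 / 5600 * 33 = 24.75 -> 24 layers.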
buffer size = 0.12 MiB -// CUDA0 compute buffer size = 266.00 MiB -// CUDA_Host compute buffer size = 48.02 MiB } // namespace hardware \ No newline at end of file From 8cb01c9b95e53611290fae858a77e5f73ce9613d Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 22 Nov 2024 13:43:56 +0700 Subject: [PATCH 3/8] chore: cleanup --- engine/services/model_service.cc | 204 ++++++++++-------- engine/services/model_service.h | 7 +- engine/utils/hardware/gguf/ggml.h | 54 ++--- engine/utils/hardware/gguf/gguf_file.h | 2 - .../utils/hardware/gguf/gguf_file_estimate.h | 28 +-- 5 files changed, 156 insertions(+), 139 deletions(-) diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 61bbc2447..dda89f565 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -336,7 +336,8 @@ cpp::result ModelService::HandleDownloadUrlAsync( } cpp::result ModelService::GetEstimation( - const std::string& model_handle) { + const std::string& model_handle, const std::string& kv_cache, int n_batch, + int n_ubatch) { namespace fs = std::filesystem; namespace fmu = file_manager_utils; cortex::db::Models modellist_handler; @@ -366,9 +367,9 @@ cpp::result ModelService::GetEstimation( return hardware::EstimateLLaMACppRun(file_path.string(), {.ngl = mc.ngl, .ctx_len = mc.ctx_len, - .n_batch = 2048, - .n_ubatch = 2048, - .kv_cache_type = "f16", + .n_batch = n_batch, + .n_ubatch = n_ubatch, + .kv_cache_type = kv_cache, .free_vram_MiB = free_vram_MiB}); } catch (const std::exception& e) { return cpp::fail("Fail to get model status with ID '" + model_handle + @@ -739,95 +740,11 @@ cpp::result ModelService::StartModel( #undef ASSIGN_IF_PRESENT CTL_INF(json_data.toStyledString()); - // TODO(sang) move this into another function - // Calculate ram/vram needed to load model - services::HardwareService hw_svc; - auto hw_info = hw_svc.GetHardwareInfo(); - assert(!!engine_svc_); - auto default_engine = engine_svc_->GetDefaultEngineVariant(kLlamaEngine); - bool is_cuda = false; - if (default_engine.has_error()) { - CTL_INF("Could not get default engine"); - } else { - auto& de = default_engine.value(); - is_cuda = de.variant.find("cuda") != std::string::npos; - CTL_INF("is_cuda: " << is_cuda); - } - - std::optional warning; - if (is_cuda && !system_info_utils::IsNvidiaSmiAvailable()) { - CTL_INF( - "Running cuda variant but nvidia-driver is not installed yet, " - "fallback to CPU mode"); - auto res = engine_svc_->GetInstalledEngineVariants(kLlamaEngine); - if (res.has_error()) { - CTL_WRN("Could not get engine variants"); - return cpp::fail("Nvidia-driver is not installed!"); - } else { - auto& es = res.value(); - std::sort( - es.begin(), es.end(), - [](const EngineVariantResponse& e1, - const EngineVariantResponse& e2) { return e1.name > e2.name; }); - for (auto& e : es) { - CTL_INF(e.name << " " << e.version << " " << e.engine); - // Select the first CPU candidate - if (e.name.find("cuda") == std::string::npos) { - auto r = engine_svc_->SetDefaultEngineVariant(kLlamaEngine, - e.version, e.name); - if (r.has_error()) { - CTL_WRN("Could not set default engine variant"); - return cpp::fail("Nvidia-driver is not installed!"); - } else { - CTL_INF("Change default engine to: " << e.name); - auto rl = engine_svc_->LoadEngine(kLlamaEngine); - if (rl.has_error()) { - return cpp::fail("Nvidia-driver is not installed!"); - } else { - CTL_INF("Engine started"); - is_cuda = false; - warning = "Nvidia-driver is not installed, use CPU variant: " + - e.version + "-" + e.name; - break; - } 
- } - } - } - // If we reach here, means that no CPU variant to fallback - if (!warning) { - return cpp::fail( - "Nvidia-driver is not installed, no available CPU version to " - "fallback"); - } - } - } - // If in GPU acceleration mode: - // We use all visible GPUs, so only need to sum all free vram - auto free_vram_MiB = 0u; - for (const auto& gpu : hw_info.gpus) { - free_vram_MiB += gpu.free_vram; - } - - auto free_ram_MiB = hw_info.ram.available_MiB; - - auto const& mp = json_data["model_path"].asString(); - auto ngl = json_data["ngl"].asInt(); - hardware::RunConfig rc = {.ngl = ngl, - .ctx_len = json_data["ctx_len"].asInt(), - .n_batch = 2048, - .n_ubatch = 2048, - .kv_cache_type = "f16", - .free_vram_MiB = free_vram_MiB}; - auto es = hardware::EstimateLLaMACppRun(mp, rc); - - if (es.gpu_mode.vram_MiB > free_vram_MiB && is_cuda) { - CTL_WRN("Not enough VRAM - " << "required: " << es.gpu_mode.vram_MiB - << ", available: " << free_vram_MiB); - } - - if (es.cpu_mode.ram_MiB > free_ram_MiB) { - CTL_WRN("Not enough RAM - " << "required: " << es.cpu_mode.ram_MiB - << ", available: " << free_ram_MiB); + auto may_fallback_res = MayFallbackToCpu(json_data["model_path"].asString(), + json_data["ngl"].asInt(), + json_data["ctx_len"].asInt()); + if (may_fallback_res.has_error()) { + return cpp::fail(may_fallback_res.error()); } assert(!!inference_svc_); @@ -836,10 +753,12 @@ cpp::result ModelService::StartModel( auto status = std::get<0>(ir)["status_code"].asInt(); auto data = std::get<1>(ir); if (status == httplib::StatusCode::OK_200) { - return StartModelResult{.success = true, .warning = warning}; + return StartModelResult{ + .success = true, .warning = may_fallback_res.value_or(std::nullopt)}; } else if (status == httplib::StatusCode::Conflict_409) { CTL_INF("Model '" + model_handle + "' is already loaded"); - return StartModelResult{.success = true, .warning = warning}; + return StartModelResult{ + .success = true, .warning = may_fallback_res.value_or(std::nullopt)}; } else { // only report to user the error CTL_ERR("Model failed to start with status code: " << status); @@ -1068,3 +987,98 @@ cpp::result ModelService::AbortDownloadModel( const std::string& task_id) { return download_service_->StopTask(task_id); } + +cpp::result, std::string> +ModelService::MayFallbackToCpu(const std::string& model_path, int ngl, + int ctx_len, int n_batch, int n_ubatch, + const std::string& kv_cache_type) { + services::HardwareService hw_svc; + auto hw_info = hw_svc.GetHardwareInfo(); + assert(!!engine_svc_); + auto default_engine = engine_svc_->GetDefaultEngineVariant(kLlamaEngine); + bool is_cuda = false; + if (default_engine.has_error()) { + CTL_INF("Could not get default engine"); + } else { + auto& de = default_engine.value(); + is_cuda = de.variant.find("cuda") != std::string::npos; + CTL_INF("is_cuda: " << is_cuda); + } + + std::optional warning; + if (is_cuda && !system_info_utils::IsNvidiaSmiAvailable()) { + CTL_INF( + "Running cuda variant but nvidia-driver is not installed yet, " + "fallback to CPU mode"); + auto res = engine_svc_->GetInstalledEngineVariants(kLlamaEngine); + if (res.has_error()) { + CTL_WRN("Could not get engine variants"); + return cpp::fail("Nvidia-driver is not installed!"); + } else { + auto& es = res.value(); + std::sort( + es.begin(), es.end(), + [](const EngineVariantResponse& e1, const EngineVariantResponse& e2) { + return e1.name > e2.name; + }); + for (auto& e : es) { + CTL_INF(e.name << " " << e.version << " " << e.engine); + // Select the first CPU candidate + if 
(e.name.find("cuda") == std::string::npos) { + auto r = engine_svc_->SetDefaultEngineVariant(kLlamaEngine, e.version, + e.name); + if (r.has_error()) { + CTL_WRN("Could not set default engine variant"); + return cpp::fail("Nvidia-driver is not installed!"); + } else { + CTL_INF("Change default engine to: " << e.name); + auto rl = engine_svc_->LoadEngine(kLlamaEngine); + if (rl.has_error()) { + return cpp::fail("Nvidia-driver is not installed!"); + } else { + CTL_INF("Engine started"); + is_cuda = false; + warning = "Nvidia-driver is not installed, use CPU variant: " + + e.version + "-" + e.name; + break; + } + } + } + } + // If we reach here, means that no CPU variant to fallback + if (!warning) { + return cpp::fail( + "Nvidia-driver is not installed, no available CPU version to " + "fallback"); + } + } + } + // If in GPU acceleration mode: + // We use all visible GPUs, so only need to sum all free vram + auto free_vram_MiB = 0u; + for (const auto& gpu : hw_info.gpus) { + free_vram_MiB += gpu.free_vram; + } + + auto free_ram_MiB = hw_info.ram.available_MiB; + + hardware::RunConfig rc = {.ngl = ngl, + .ctx_len = ctx_len, + .n_batch = n_batch, + .n_ubatch = n_ubatch, + .kv_cache_type = kv_cache_type, + .free_vram_MiB = free_vram_MiB}; + auto es = hardware::EstimateLLaMACppRun(model_path, rc); + + if (es.gpu_mode.vram_MiB > free_vram_MiB && is_cuda) { + CTL_WRN("Not enough VRAM - " << "required: " << es.gpu_mode.vram_MiB + << ", available: " << free_vram_MiB); + } + + if (es.cpu_mode.ram_MiB > free_ram_MiB) { + CTL_WRN("Not enough RAM - " << "required: " << es.cpu_mode.ram_MiB + << ", available: " << free_ram_MiB); + } + + return warning; +} diff --git a/engine/services/model_service.h b/engine/services/model_service.h index 323fdd588..65e4e018f 100644 --- a/engine/services/model_service.h +++ b/engine/services/model_service.h @@ -99,7 +99,8 @@ class ModelService { bool HasModel(const std::string& id) const; cpp::result GetEstimation( - const std::string& model_handle); + const std::string& model_handle, const std::string& kv_cache = "f16", + int n_batch = 2048, int n_ubatch = 2048); private: /** @@ -116,6 +117,10 @@ class ModelService { cpp::result HandleCortexsoModel( const std::string& modelName); + cpp::result, std::string> MayFallbackToCpu( + const std::string& model_path, int ngl, int ctx_len, int n_batch = 2048, + int n_ubatch = 2048, const std::string& kv_cache_type = "f16"); + std::shared_ptr download_service_; std::shared_ptr inference_svc_; std::unordered_set bypass_stop_check_set_; diff --git a/engine/utils/hardware/gguf/ggml.h b/engine/utils/hardware/gguf/ggml.h index d04cf4146..e898fc796 100644 --- a/engine/utils/hardware/gguf/ggml.h +++ b/engine/utils/hardware/gguf/ggml.h @@ -48,47 +48,47 @@ enum GGMLType { inline float GetQuantBit(GGMLType gt) { switch (gt) { + case GGML_TYPE_I32: case GGML_TYPE_F32: return 32.0; + case GGML_TYPE_I16: + case GGML_TYPE_BF16: case GGML_TYPE_F16: return 16.0; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + return 2.31; case GGML_TYPE_Q2_K: - return 2.5625; + return 2.5625; + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: case GGML_TYPE_Q3_K: - return 3.4375; + return 3.4375; + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_K: return 4.5; 
+ case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_K: - return 5.5; + return 5.5; case GGML_TYPE_Q6_K: return 6.5625; - case GGML_TYPE_Q8_K: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ4_XS: case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q8_K: + return 8.0; + case GGML_TYPE_I64: case GGML_TYPE_F64: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_BF16: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: + return 64.0; default: return 8.0; diff --git a/engine/utils/hardware/gguf/gguf_file.h b/engine/utils/hardware/gguf/gguf_file.h index 9980247f0..1263debf2 100644 --- a/engine/utils/hardware/gguf/gguf_file.h +++ b/engine/utils/hardware/gguf/gguf_file.h @@ -429,8 +429,6 @@ struct GGUFHeader { }; struct GGUFFile { - /* Basic */ - // header is the header of the GGUF file. GGUFHeader header; // tensor_infos are the tensor infos of the GGUF file, diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h index 2d22886aa..ca60351f5 100644 --- a/engine/utils/hardware/gguf/gguf_file_estimate.h +++ b/engine/utils/hardware/gguf/gguf_file_estimate.h @@ -93,17 +93,17 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path, for (auto const& ti : gf.tensor_infos) { if (ti->name == "output.weight") { quant_bit_out = GetQuantBit(ti->type); - std::cout << ti->type << std::endl; + // std::cout << ti->type << std::endl; } else if (ti->name == "token_embd.weight") { quant_bit_in = GetQuantBit(ti->type); - std::cout << ti->type << std::endl; + // std::cout << ti->type << std::endl; } } // output.weight // token_embd.weight - std::cout << "embedding_length: " << embedding_length << std::endl; - std::cout << "n_vocab: " << n_vocab << std::endl; - std::cout << "file_size: " << file_size << std::endl; + // std::cout << "embedding_length: " << embedding_length << std::endl; + // std::cout << "n_vocab: " << n_vocab << std::endl; + // std::cout << "file_size: " << file_size << std::endl; // Model weight int64_t token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit_in / 16; @@ -119,12 +119,12 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path, int64_t ram_usage = token_embeddings_size + offload; int64_t vram_usage = file_size - ram_usage; - std::cout << "token_embeddings_size: " << BytesToMiB(token_embeddings_size) - << std::endl; - std::cout << "output_layer_size: " << BytesToMiB(output_layer_size) - << std::endl; - std::cout << "ram_usage: " << BytesToMiB(ram_usage) << std::endl; - std::cout << "vram_usage: " << BytesToMiB(vram_usage) << std::endl; + // std::cout << "token_embeddings_size: " << BytesToMiB(token_embeddings_size) + // << std::endl; + // std::cout << "output_layer_size: " << BytesToMiB(output_layer_size) + // << std::endl; + // std::cout << "ram_usage: " << BytesToMiB(ram_usage) << std::endl; + // std::cout << "vram_usage: " << BytesToMiB(vram_usage) << std::endl; // KV cache // kv_cache_size = ctx_len/8192 * hidden_dim/4096 * quant_bit/16 * num_block/33 * 1 (GB) @@ -135,7 +135,7 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path, hidden_dim / 4096 * kv_quant_bit / 16 * num_block / 33; //(bytes) - std::cout << "kv_cache_size: " << BytesToMiB(kv_cache_size) << std::endl; + // std::cout 
From 440348f27b456844419e1e918fdb752d2dc16862 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 22 Nov 2024 14:01:50 +0700
Subject: [PATCH 4/8] chore: API docs

---
 docs/static/openapi/cortex.json | 53 +++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/docs/static/openapi/cortex.json b/docs/static/openapi/cortex.json
index da31ab64b..78430294f 100644
--- a/docs/static/openapi/cortex.json
+++ b/docs/static/openapi/cortex.json
@@ -3940,6 +3940,55 @@
         },
         "required": ["description", "name", "productName", "status"]
       },
+      "CpuModeDto": {
+        "type": "object",
+        "properties": {
+          "ram": {
+            "type": "number",
+            "example": 1024
+          }
+        }
+      },
+      "GpuModeDto": {
+        "type": "object",
+        "properties": {
+          "ram": {
+            "type": "number",
+            "example": 1024
+          },
+          "vram": {
+            "type": "number",
+            "example": 1024
+          },
+          "ngl": {
+            "type": "number",
+            "example": 30
+          },
+          "context_length": {
+            "type": "number",
+            "example": 4096
+          },
+          "recommend_ngl": {
+            "type": "number",
+            "example": 33
+          }
+        }
+      },
+      "RecommendDto": {
+        "type": "object",
+        "properties": {
+          "cpu_mode": {
+            "type": "object",
+            "$ref": "#/components/schemas/CpuModeDto"
+          },
+          "gpu_mode": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/GpuModeDto"
+            }
+          }
+        }
+      },
       "ModelDto": {
         "type": "object",
         "properties": {
@@ -4064,6 +4113,10 @@
           "type": "string",
           "description": "The engine to use.",
           "example": "llamacpp"
+        },
+        "recommendation": {
+          "type": "object",
+          "$ref": "#/components/schemas/RecommendDto"
         }
       },
       "required": ["id"]

From dcf0841ff4af2ce571d401d5ca3976edb442e98c Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 22 Nov 2024 14:08:18 +0700
Subject: [PATCH 5/8] chore: CLI docs

---
 docs/docs/cli/models/index.mdx | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/docs/cli/models/index.mdx b/docs/docs/cli/models/index.mdx
index 5b29069a6..b75bf9d49 100644
--- a/docs/docs/cli/models/index.mdx
+++ b/docs/docs/cli/models/index.mdx
@@ -120,8 +120,11 @@ For example, it returns the following:w
 | Option                     | Description                                        | Required | Default value | Example              |
 |----------------------------|----------------------------------------------------|----------|---------------|----------------------|
-| `-h`, `--help`             | Display help for command.                          | No       | -             | `-h`                 |
-
+| `-h`, `--help`             | Display help for command.                          | No       | -             | `-h`                 |
+| `-e`, `--engine`           | Display engines.                                   | No       | -             | `--engine`           |
+| `-v`, `--version`          | Display version for model.                         | No       | -             | `--version`          |
+| `--cpu_mode`               | Display CPU mode.                                  | No       | -             | `--cpu_mode`         |
+| `--gpu_mode`               | Display GPU mode.                                  | No       | -             | `--gpu_mode`         |

 ## `cortex models start`

 :::info
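The `CpuModeDto`, `GpuModeDto`, and `RecommendDto` schemas and the new CLI columns above describe the same `recommendation` object from two sides, and a client can read those fields to decide between GPU and CPU mode before starting a model. The snippet below is only a sketch of that decision, not code from this series: it uses jsoncpp (as the CLI code does), the payload literal follows the schemas above, and the free-memory numbers are invented for the example.

```cpp
// Sketch: choosing a run mode from a RecommendDto-shaped payload (illustrative).
#include <json/json.h>

#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>

int main() {
  // Example payload shaped like RecommendDto above; the values are made up.
  const std::string payload = R"({
    "cpu_mode": { "ram": 5750 },
    "gpu_mode": [
      { "ram": 469, "vram": 5280, "ngl": 33, "context_length": 8192,
        "recommend_ngl": 33 }
    ]
  })";

  Json::Value rec;
  Json::CharReaderBuilder builder;
  std::string errs;
  std::istringstream iss(payload);
  if (!Json::parseFromStream(builder, iss, &rec, &errs)) {
    std::cerr << "Parse error: " << errs << std::endl;
    return 1;
  }

  const int64_t free_vram_MiB = 6000;  // assumed free VRAM on this machine
  const int64_t free_ram_MiB = 16000;  // assumed free RAM on this machine

  if (!rec["gpu_mode"].isNull() &&
      rec["gpu_mode"][0]["vram"].asInt64() <= free_vram_MiB) {
    std::cout << "GPU mode fits; use ngl = "
              << rec["gpu_mode"][0]["recommend_ngl"].asInt() << std::endl;
  } else if (rec["cpu_mode"]["ram"].asInt64() <= free_ram_MiB) {
    std::cout << "Fall back to CPU mode ("
              << rec["cpu_mode"]["ram"].asInt64() << " MiB RAM)" << std::endl;
  } else {
    std::cout << "Model likely does not fit on this machine" << std::endl;
  }
  return 0;
}
```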
From 3f7bbe01e643818090592339e232000eaac8c46f Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 22 Nov 2024 15:03:10 +0700
Subject: [PATCH 6/8] fix: correct calculation

---
 engine/utils/hardware/gguf/gguf_file_estimate.h | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h
index ca60351f5..7b1a5519d 100644
--- a/engine/utils/hardware/gguf/gguf_file_estimate.h
+++ b/engine/utils/hardware/gguf/gguf_file_estimate.h
@@ -76,16 +76,18 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path,
   int32_t total_ngl = 0;
   auto file_size = std::filesystem::file_size(file_path);
   for (auto const& kv : gf.header.metadata_kv) {
-    if (kv.key == "llama.embedding_length") {
+    if (kv.key.find("embedding_length") != std::string::npos) {
       embedding_length = std::any_cast(kv.value);
     } else if (kv.key == "tokenizer.ggml.tokens") {
       n_vocab = std::any_cast(kv.value).arr.size();
-    } else if (kv.key == "llama.block_count") {
+    } else if (kv.key.find("block_count") != std::string::npos) {
       num_block = std::any_cast(kv.value);
       total_ngl = num_block + 1;
     }
   }

+  // std::cout << n_vocab << std::endl;
+
   // token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit_in/16 bytes
   int32_t quant_bit_in = 0;
   int32_t quant_bit_out = 0;
@@ -139,8 +141,8 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path,

   // VRAM = (min(n_batch, n_ubatch))/ 512 * 266 (MiB)
   int64_t preprocessing_buffer_size =
-      (double)std::min(rc.n_batch, rc.n_ubatch) / 512 * 266 * 1024 * 1024 /
-      4;  //(bytes)
+      (double)std::min(rc.n_batch, rc.n_ubatch) / 512 * 266 * 1024 * 1024 *
+      n_vocab / 128256 /*llama3 n_vocab*/;  //(bytes)
   if (total_ngl != rc.ngl) {
     preprocessing_buffer_size += output_layer_size;
   }
@@ -161,7 +163,10 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path,
     res.gpu_mode.ctx_len = rc.ctx_len;
     res.gpu_mode.ngl = rc.ngl;
     res.gpu_mode.ram_MiB = BytesToMiB(ram_usage);
+    // Reserve an extra ~100-200 MiB for small scratch buffers used during processing
+    constexpr const int64_t kDeltaVramMiB = 200;
     res.gpu_mode.vram_MiB =
+        kDeltaVramMiB +
         BytesToMiB(vram_usage + kv_cache_size + preprocessing_buffer_size);
     if (rc.free_vram_MiB > res.gpu_mode.vram_MiB) {
       res.gpu_mode.recommend_ngl = total_ngl;

From 12f07f4e1251cb00df9974c532881e92e0333cbe Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 22 Nov 2024 16:20:37 +0700
Subject: [PATCH 7/8] fix: handle macOS

---
 engine/services/model_service.cc                | 9 +++++++++
 engine/utils/hardware/gguf/gguf_file_estimate.h | 3 +++
 2 files changed, 12 insertions(+)

diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc
index dda89f565..48de10296 100644
--- a/engine/services/model_service.cc
+++ b/engine/services/model_service.cc
@@ -364,6 +364,11 @@ cpp::result ModelService::GetEstimation(
   for (const auto& gpu : hw_info.gpus) {
     free_vram_MiB += gpu.free_vram;
   }
+
+#if defined(__APPLE__) && defined(__MACH__)
+  free_vram_MiB = hw_info.ram.available_MiB;
+#endif
+
   return hardware::EstimateLLaMACppRun(file_path.string(),
                                        {.ngl = mc.ngl,
                                         .ctx_len = mc.ctx_len,
@@ -1062,6 +1067,10 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
   auto free_ram_MiB = hw_info.ram.available_MiB;

+#if defined(__APPLE__) && defined(__MACH__)
+  free_vram_MiB = free_ram_MiB;
+#endif
+
   hardware::RunConfig rc = {.ngl = ngl,
                             .ctx_len = ctx_len,
                             .n_batch = n_batch,
diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h
index 7b1a5519d..fde0b0ac0 100644
--- a/engine/utils/hardware/gguf/gguf_file_estimate.h
+++ b/engine/utils/hardware/gguf/gguf_file_estimate.h
@@ -174,6 +174,9 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path,
       res.gpu_mode.recommend_ngl =
           (double)rc.free_vram_MiB / res.gpu_mode.vram_MiB * rc.ngl;
     }
+#if defined(__APPLE__) && defined(__MACH__)
+    res.cpu_mode.ram_MiB = res.gpu_mode.vram_MiB + res.gpu_mode.ram_MiB;
+#endif
   }
   return res;
 }
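PATCH 6 above rescales the prompt-processing buffer by vocabulary size (relative to llama3's 128256-entry vocabulary) and pads the VRAM figure with ~200 MiB of head-room before comparing it to free VRAM; when the total does not fit, the recommended `ngl` is scaled down proportionally. The sketch below replays that GPU-mode budget on its own; the byte counts and free-VRAM figure are assumptions, not values taken from the patches.

```cpp
// Sketch of the GPU-mode budget after the fix above (illustrative inputs).
#include <algorithm>
#include <cstdint>
#include <iostream>

int64_t ToMiB(int64_t bytes) { return bytes / (1024 * 1024); }

int main() {
  // Assumed inputs; the real values come from the GGUF file and RunConfig.
  const int64_t n_batch = 2048, n_ubatch = 2048;
  const int64_t n_vocab = 128256;                      // llama3-sized vocab
  const int64_t output_layer_size = 525336576;         // bytes, example only
  const int64_t vram_usage = 4800LL * 1024 * 1024;     // offloaded weights
  const int64_t kv_cache_size = 1024LL * 1024 * 1024;  // f16 KV cache
  const int32_t ngl = 33, total_ngl = 33;
  const int64_t free_vram_MiB = 6000;

  // Preprocessing buffer: min(n_batch, n_ubatch)/512 * 266 MiB, scaled by
  // the vocabulary size relative to llama3's 128256 entries.
  int64_t preprocessing_buffer_size =
      (double)std::min(n_batch, n_ubatch) / 512 * 266 * 1024 * 1024 *
      n_vocab / 128256;
  if (total_ngl != ngl) {
    // The output layer is kept in the buffer when not all layers are offloaded.
    preprocessing_buffer_size += output_layer_size;
  }

  // ~200 MiB of head-room for small scratch buffers during processing.
  constexpr int64_t kDeltaVramMiB = 200;
  const int64_t vram_MiB =
      kDeltaVramMiB +
      ToMiB(vram_usage + kv_cache_size + preprocessing_buffer_size);

  // Scale ngl down proportionally if the estimate does not fit.
  const int32_t recommend_ngl =
      free_vram_MiB > vram_MiB
          ? total_ngl
          : static_cast<int32_t>((double)free_vram_MiB / vram_MiB * ngl);

  std::cout << "estimated VRAM: " << vram_MiB
            << " MiB, recommended ngl: " << recommend_ngl << std::endl;
  return 0;
}
```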
From f2651acd43db860d77e2d6d7a030bb61906a2e8a Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Tue, 26 Nov 2024 08:47:00 +0700
Subject: [PATCH 8/8] chore: typo

---
 engine/cli/commands/model_list_cmd.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/engine/cli/commands/model_list_cmd.cc b/engine/cli/commands/model_list_cmd.cc
index e598e22d5..cfc7a8d3a 100644
--- a/engine/cli/commands/model_list_cmd.cc
+++ b/engine/cli/commands/model_list_cmd.cc
@@ -93,7 +93,7 @@ void ModelListCmd::Exec(const std::string& host, int port,
               r["gpu_mode"][0]["context_length"].asString() + " - ";
           s += "RAM: " + r["gpu_mode"][0]["ram"].asString() + " MiB - ";
           s += "VRAM: " + r["gpu_mode"][0]["vram"].asString() + " MiB - ";
-          s += "recommend ngl: " +
+          s += "recommended ngl: " +
               r["gpu_mode"][0]["recommend_ngl"].asString();
           row.push_back(s);
         }
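One last note on the macOS branches in PATCH 7: Apple Silicon machines expose a single unified memory pool, so the estimate there treats available system RAM as the VRAM budget and reports the CPU-mode requirement as the sum of the GPU-mode RAM and VRAM figures. A minimal sketch of that adjustment, with hypothetical struct and field names standing in for the real ones:

```cpp
// Minimal sketch of the unified-memory adjustment (names are hypothetical).
#include <cstdint>
#include <iostream>

struct GpuModeEstimate {
  int64_t ram_MiB;
  int64_t vram_MiB;
};
struct CpuModeEstimate {
  int64_t ram_MiB;
};

int main() {
  GpuModeEstimate gpu{469, 5280};  // example figures from an estimate
  CpuModeEstimate cpu{5750};
  int64_t free_ram_MiB = 16384;    // assumed available system RAM
  int64_t free_vram_MiB = 0;       // no discrete-GPU VRAM on Apple Silicon

#if defined(__APPLE__) && defined(__MACH__)
  // Unified memory: the VRAM budget is the available system RAM, and a
  // CPU-only run must hold both halves of the split in that same pool.
  free_vram_MiB = free_ram_MiB;
  cpu.ram_MiB = gpu.ram_MiB + gpu.vram_MiB;
#endif

  std::cout << "free_vram_MiB: " << free_vram_MiB
            << ", free_ram_MiB: " << free_ram_MiB
            << ", cpu_mode ram_MiB: " << cpu.ram_MiB << std::endl;
  return 0;
}
```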