From 9e3bf4717032d8982e5ca8123cd8c4903daf18df Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Mon, 10 Mar 2025 11:03:57 +0700
Subject: [PATCH 1/3] chore: suppress warnings on Windows

---
 engine/cli/main.cc                            |  3 +-
 engine/common/message.h                       |  3 +-
 engine/config/model_config.h                  |  6 +--
 engine/main.cc                                |  2 +-
 engine/services/model_service.cc              |  8 ++--
 engine/utils/command_executor.h               |  3 +-
 engine/utils/cortex_utils.h                   |  2 +
 engine/utils/format_utils.h                   |  2 +-
 engine/utils/hardware/cpu_info.h              |  2 +-
 engine/utils/hardware/gguf/ggml.h             | 22 +++++------
 .../utils/hardware/gguf/gguf_file_estimate.h  | 37 ++++++++++---------
 engine/utils/hardware/gpu/vulkan/vulkan_gpu.h | 16 +++++---
 engine/utils/huggingface_utils.h              |  4 +-
 engine/utils/url_parser.h                     |  3 +-
 14 files changed, 62 insertions(+), 51 deletions(-)

diff --git a/engine/cli/main.cc b/engine/cli/main.cc
index 8ed4beb61..66f14c116 100644
--- a/engine/cli/main.cc
+++ b/engine/cli/main.cc
@@ -8,6 +8,7 @@
 #include "utils/file_manager_utils.h"
 #include "utils/logging_utils.h"
 #include "utils/system_info_utils.h"
+#include "utils/widechar_conv.h"
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <libgen.h>  // for dirname()
@@ -46,7 +47,7 @@ void SetupLogger(trantor::FileLogger& async_logger, bool verbose) {
 
     std::filesystem::create_directories(
 #if defined(_WIN32)
-        std::filesystem::u8path(config.logFolderPath) /
+        std::filesystem::path(cortex::wc::Utf8ToWstring(config.logFolderPath)) /
 #else
         std::filesystem::path(config.logFolderPath) /
 #endif
diff --git a/engine/common/message.h b/engine/common/message.h
index d31c4f0d3..62ad69ab9 100644
--- a/engine/common/message.h
+++ b/engine/common/message.h
@@ -107,7 +107,8 @@ struct Message : JsonSerializable {
         std::move(root.get("object", "thread.message").asString());
     message.created_at = root["created_at"].asUInt();
     if (message.created_at == 0 && root["created"].asUInt64() != 0) {
-      message.created_at = root["created"].asUInt64() / 1000;
+      message.created_at =
+          static_cast<uint32_t>(root["created"].asUInt64() / 1000);
     }
     message.thread_id = std::move(root["thread_id"].asString());
     message.status = StatusFromString(std::move(root["status"].asString()));
diff --git a/engine/config/model_config.h b/engine/config/model_config.h
index 1d51cfb01..687f50681 100644
--- a/engine/config/model_config.h
+++ b/engine/config/model_config.h
@@ -35,8 +35,7 @@ struct RemoteModelConfig {
 
     // Load basic string fields
     model = json.get("model", model).asString();
-    header_template =
-        json.get("header_template", header_template).asString();
+    header_template = json.get("header_template", header_template).asString();
     engine = json.get("engine", engine).asString();
     version = json.get("version", version).asString();
     created =
@@ -405,7 +404,8 @@ struct ModelConfig {
     oss << format_utils::print_comment("END REQUIRED");
     oss << format_utils::print_comment("BEGIN OPTIONAL");
 
-    oss << format_utils::print_float("size", size);
+    oss << format_utils::print_kv("size", std::to_string(size),
+                                  format_utils::MAGENTA);
     oss << format_utils::print_bool("stream", stream);
     oss << format_utils::print_float("top_p", top_p);
     oss << format_utils::print_float("temperature", temperature);
diff --git a/engine/main.cc b/engine/main.cc
index a51d825fc..492dc9629 100644
--- a/engine/main.cc
+++ b/engine/main.cc
@@ -105,7 +105,7 @@ void RunServer(std::optional<std::string> host, std::optional<int> port,
   // Create logs/ folder and setup log to file
   std::filesystem::create_directories(
 #if defined(_WIN32)
-      std::filesystem::u8path(config.logFolderPath) /
+      std::filesystem::path(cortex::wc::Utf8ToWstring(config.logFolderPath)) /
 #else
       std::filesystem::path(config.logFolderPath) /
 #endif
diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc
index c13f7cf19..3129362ce 100644
--- a/engine/services/model_service.cc
+++ b/engine/services/model_service.cc
@@ -315,7 +315,7 @@ cpp::result<std::string, std::string> ModelService::HandleDownloadUrlAsync(
 
   try {
     std::filesystem::create_directories(local_path.parent_path());
-  } catch (const std::filesystem::filesystem_error& e) {
+  } catch (const std::filesystem::filesystem_error&) {
     // if file exist, remove it
     std::filesystem::remove(local_path.parent_path());
     std::filesystem::create_directories(local_path.parent_path());
@@ -380,7 +380,7 @@ ModelService::EstimateModel(const std::string& model_handle,
     auto mc = yaml_handler.GetModelConfig();
     assert(hw_service_);
     auto hw_info = hw_service_->GetHardwareInfo();
-    auto free_vram_MiB = 0u;
+    int64_t free_vram_MiB = 0;
     for (const auto& gpu : hw_info.gpus) {
       free_vram_MiB += gpu.free_vram;
     }
@@ -444,7 +444,7 @@ cpp::result<bool, std::string> ModelService::HandleUrl(
 
   try {
     std::filesystem::create_directories(local_path.parent_path());
-  } catch (const std::filesystem::filesystem_error& e) {
+  } catch (const std::filesystem::filesystem_error&) {
     // if file exist, remove it
     std::filesystem::remove(local_path.parent_path());
     std::filesystem::create_directories(local_path.parent_path());
@@ -1326,7 +1326,7 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
   }
   // If in GPU acceleration mode:
   // We use all visible GPUs, so only need to sum all free vram
-  auto free_vram_MiB = 0u;
+  int64_t free_vram_MiB = 0;
   for (const auto& gpu : hw_info.gpus) {
     free_vram_MiB += gpu.free_vram;
   }
diff --git a/engine/utils/command_executor.h b/engine/utils/command_executor.h
index 87460e2c1..2a6064521 100644
--- a/engine/utils/command_executor.h
+++ b/engine/utils/command_executor.h
@@ -37,7 +37,8 @@ class CommandExecutor {
    std::array<char, 128> buffer;
     std::string result;
 
-    while (fgets(buffer.data(), buffer.size(), m_pipe.get()) != nullptr) {
+    while (fgets(buffer.data(), static_cast<int>(buffer.size()),
+                 m_pipe.get()) != nullptr) {
       result += buffer.data();
     }
 
diff --git a/engine/utils/cortex_utils.h b/engine/utils/cortex_utils.h
index f58fcfe8f..23f4330f4 100644
--- a/engine/utils/cortex_utils.h
+++ b/engine/utils/cortex_utils.h
@@ -1,4 +1,6 @@
 #pragma once
+
+
 #include
 #include
 #include
diff --git a/engine/utils/format_utils.h b/engine/utils/format_utils.h
index 5dccee359..871588179 100644
--- a/engine/utils/format_utils.h
+++ b/engine/utils/format_utils.h
@@ -67,7 +67,7 @@ inline std::string WriteKeyValue(const std::string& key,
         strValue.pop_back();
       }
       out_file << strValue;
-    } catch (const std::exception& e) {
+    } catch (const std::exception&) {
       out_file << value;  // If not a float, write as is
     }
   } else {
diff --git a/engine/utils/hardware/cpu_info.h b/engine/utils/hardware/cpu_info.h
index af7a85a4b..ac5e1c83a 100644
--- a/engine/utils/hardware/cpu_info.h
+++ b/engine/utils/hardware/cpu_info.h
@@ -187,7 +187,7 @@ struct CpuInfo {
       return CPU{};
     auto cpu = res[0];
     cortex::cpuid::CpuInfo inst;
-    float usage = GetCPUUsage();
+    auto usage = static_cast<float>(GetCPUUsage());
     return CPU{.cores = cpu.numPhysicalCores(),
                .arch = std::string(GetArch()),
                .model = cpu.modelName(),
diff --git a/engine/utils/hardware/gguf/ggml.h b/engine/utils/hardware/gguf/ggml.h
index 7a8f480a1..f56fb9172 100644
--- a/engine/utils/hardware/gguf/ggml.h
+++ b/engine/utils/hardware/gguf/ggml.h
@@ -49,21 +49,21 @@ inline float GetQuantBit(GGMLType gt) {
   switch (gt) {
     case GGML_TYPE_I32:
     case GGML_TYPE_F32:
-      return 32.0;
+      return 32.0f;
     case GGML_TYPE_I16:
     case GGML_TYPE_BF16:
     case GGML_TYPE_F16:
-      return 16.0;
+      return 16.0f;
     case GGML_TYPE_IQ2_S:
     case GGML_TYPE_IQ2_XXS:
     case GGML_TYPE_IQ2_XS:
-      return 2.31;
+      return 2.31f;
     case GGML_TYPE_Q2_K:
-      return 2.5625;
+      return 2.5625f;
     case GGML_TYPE_IQ3_XXS:
     case GGML_TYPE_IQ3_S:
     case GGML_TYPE_Q3_K:
-      return 3.4375;
+      return 3.4375f;
     case GGML_TYPE_Q4_0_4_4:
     case GGML_TYPE_Q4_0_4_8:
     case GGML_TYPE_Q4_0_8_8:
@@ -72,25 +72,25 @@ inline float GetQuantBit(GGMLType gt) {
     case GGML_TYPE_Q4_0:
     case GGML_TYPE_Q4_1:
     case GGML_TYPE_Q4_K:
-      return 4.5;
+      return 4.5f;
     case GGML_TYPE_Q5_0:
     case GGML_TYPE_Q5_1:
     case GGML_TYPE_Q5_K:
-      return 5.5;
+      return 5.5f;
     case GGML_TYPE_Q6_K:
-      return 6.5625;
+      return 6.5625f;
     case GGML_TYPE_I8:
     case GGML_TYPE_Q8_0:
     case GGML_TYPE_Q8_1:
     case GGML_TYPE_Q8_K:
-      return 8.0;
+      return 8.0f;
 
     case GGML_TYPE_I64:
     case GGML_TYPE_F64:
-      return 64.0;
+      return 64.0f;
 
     default:
-      return 8.0;
+      return 8.0f;
   }
 }
diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h
index 402a70958..df7188b67 100644
--- a/engine/utils/hardware/gguf/gguf_file_estimate.h
+++ b/engine/utils/hardware/gguf/gguf_file_estimate.h
@@ -6,7 +6,7 @@
 namespace hardware {
 
 inline uint64_t BytesToMiB(uint64_t b) {
-  return (double)b / 1024 / 1024;
+  return static_cast<uint64_t>((double)b / 1024 / 1024);
 };
 struct RunConfig {
   int ngl;
@@ -91,8 +91,8 @@ inline std::optional<Estimation> EstimateLLaMACppRun(
   // std::cout << n_vocab << std::endl;
 
   // token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit_in/16 bytes
-  int32_t quant_bit_in = 0;
-  int32_t quant_bit_out = 0;
+  float quant_bit_in = 0;
+  float quant_bit_out = 0;
   for (auto const& ti : (*gf).tensor_infos) {
     if (ti->name == "output.weight") {
@@ -109,16 +109,17 @@ inline std::optional<Estimation> EstimateLLaMACppRun(
   // std::cout << "n_vocab: " << n_vocab << std::endl;
   // std::cout << "file_size: " << file_size << std::endl;
   // Model weight
-  int64_t token_embeddings_size =
-      n_vocab * embedding_length * 2 * quant_bit_in / 16;
-  int64_t output_layer_size =
-      n_vocab * embedding_length * 2 * quant_bit_out / 16;
+  auto token_embeddings_size =
+      static_cast<int64_t>(n_vocab * embedding_length * 2 * quant_bit_in / 16);
+  auto output_layer_size =
+      static_cast<int64_t>(n_vocab * embedding_length * 2 * quant_bit_out / 16);
   // RAM = token_embeddings_size + ((total_ngl-ngl) >=1 ? output_layer_size + (total_ngl - ngl - 1 ) / (total_ngl-1) * (total_file_size - token_embeddings_size - output_layer_size) : 0 ) (bytes)
   int64_t offload = 0;
   if (total_ngl >= rc.ngl + 1) {
-    offload = output_layer_size +
-              (double)(total_ngl - rc.ngl - 1) / (total_ngl - 1) *
-                  (file_size - token_embeddings_size - output_layer_size);
+    offload = static_cast<int64_t>(
+        output_layer_size +
+        (double)(total_ngl - rc.ngl - 1) / (total_ngl - 1) *
+            (file_size - token_embeddings_size - output_layer_size));
   }
 
   int64_t ram_usage = token_embeddings_size + offload;
@@ -133,18 +134,18 @@ inline std::optional<Estimation> EstimateLLaMACppRun(
   // KV cache
   // kv_cache_size = ctx_len/8192 * hidden_dim/4096 * quant_bit/16 * num_block/33 * 1 (GB)
   auto hidden_dim = embedding_length;
-  int kv_quant_bit =
+  auto kv_quant_bit =
       GetQuantBit(rc.kv_cache_type);  // f16, 8 bits for q8_0, 4.5 bits for q4_0
-  int64_t kv_cache_size = (double)(1024 * 1024 * 1024) * rc.ctx_len / 8192 *
-                          hidden_dim / 4096 * kv_quant_bit / 16 * num_block /
-                          33;  //(bytes)
+  auto kv_cache_size = static_cast<int64_t>(
+      (double)(1024 * 1024 * 1024) * rc.ctx_len / 8192 * hidden_dim / 4096 *
+      kv_quant_bit / 16 * num_block / 33);  //(bytes)
   // std::cout << "kv_cache_size: " << BytesToMiB(kv_cache_size) << std::endl;
 
   // VRAM = (min(n_batch, n_ubatch))/ 512 * 266 (MiB)
-  int64_t preprocessing_buffer_size =
+  auto preprocessing_buffer_size = static_cast<int64_t>(
       (double)std::min(rc.n_batch, rc.n_ubatch) / 512 * 266 * 1024 * 1024 *
-      n_vocab / 128256 /*llama3 n_vocab*/;  //(bytes)
+      n_vocab / 128256 /*llama3 n_vocab*/);  //(bytes)
   if (total_ngl != rc.ngl) {
     preprocessing_buffer_size += output_layer_size;
   }
@@ -173,8 +174,8 @@ inline std::optional<Estimation> EstimateLLaMACppRun(
   if (rc.free_vram_MiB > res.gpu_mode.vram_MiB) {
     res.gpu_mode.recommend_ngl = total_ngl;
   } else {
-    res.gpu_mode.recommend_ngl =
-        (double)rc.free_vram_MiB / res.gpu_mode.vram_MiB * rc.ngl;
+    res.gpu_mode.recommend_ngl = static_cast<int>(
+        (double)rc.free_vram_MiB / res.gpu_mode.vram_MiB * rc.ngl);
   }
 #if defined(__APPLE__) && defined(__MACH__)
   res.cpu_mode.ram_MiB = res.gpu_mode.vram_MiB + res.gpu_mode.ram_MiB;
diff --git a/engine/utils/hardware/gpu/vulkan/vulkan_gpu.h b/engine/utils/hardware/gpu/vulkan/vulkan_gpu.h
index 27899ca77..15a40c97e 100644
--- a/engine/utils/hardware/gpu/vulkan/vulkan_gpu.h
+++ b/engine/utils/hardware/gpu/vulkan/vulkan_gpu.h
@@ -433,8 +433,8 @@ class VulkanGpu {
      for (uint32_t i = 0; i < memory_properties.memoryHeapCount; ++i) {
        if (memory_properties.memoryHeaps[i].flags &
            VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
-          gpu_avail_MiB +=
-              memory_properties.memoryHeaps[i].size / (1024ull * 1024ull);
+          gpu_avail_MiB += static_cast<int>(
+              memory_properties.memoryHeaps[i].size / (1024ull * 1024ull));
        }
      }
 
@@ -449,8 +449,10 @@ class VulkanGpu {
       used_vram_MiB = gpus_usages[device_properties.deviceName];
 #endif
 
-      int free_vram_MiB =
-          total_vram_MiB > used_vram_MiB ? total_vram_MiB - used_vram_MiB : 0;
+      auto free_vram_MiB =
+          total_vram_MiB > used_vram_MiB
+              ? static_cast<int64_t>(total_vram_MiB - used_vram_MiB)
+              : 0;
       if (device_properties.vendorID == kNvidiaVendor ||
           device_properties.vendorID == kAmdVendor) {
         gpus.emplace_back(cortex::hw::GPU{
@@ -507,8 +509,10 @@ class VulkanGpu {
         total_vram_MiB = gpus_[i].free_vram;
         used_vram_MiB = gpus_usages[gpus_[i].name];
 #endif
-        int free_vram_MiB =
-            total_vram_MiB > used_vram_MiB ? total_vram_MiB - used_vram_MiB : 0;
+        auto free_vram_MiB =
+            total_vram_MiB > used_vram_MiB
+                ? static_cast<int64_t>(total_vram_MiB - used_vram_MiB)
+                : 0;
         gpus_[i].free_vram = free_vram_MiB;
       }
 
diff --git a/engine/utils/huggingface_utils.h b/engine/utils/huggingface_utils.h
index 1c0ab906c..14c19084a 100644
--- a/engine/utils/huggingface_utils.h
+++ b/engine/utils/huggingface_utils.h
@@ -308,7 +308,7 @@ inline std::optional<std::string> GetDefaultBranch(
       return default_branch.as<std::string>();
     }
     return std::nullopt;
-  } catch (const std::exception& e) {
+  } catch (const std::exception&) {
     return std::nullopt;
   }
 }
@@ -328,7 +328,7 @@ inline std::optional<std::string> GetModelAuthorCortexsoHub(
       return author.as<std::string>();
     }
     return std::nullopt;
-  } catch (const std::exception& e) {
+  } catch (const std::exception&) {
     return std::nullopt;
   }
 }
diff --git a/engine/utils/url_parser.h b/engine/utils/url_parser.h
index 4802ba1a1..483c44312 100644
--- a/engine/utils/url_parser.h
+++ b/engine/utils/url_parser.h
@@ -153,7 +153,8 @@ inline std::string FromUrl(const Url& url) {
     } catch (const std::bad_variant_access& e) {
       // Handle the case where the variant does not match any of the expected types
       // This should not happen if the map was created correctly
-      throw std::runtime_error("Invalid variant type in queries map");
+      throw std::runtime_error(
+          std::string("Invalid variant type in queries map: ") + e.what());
     }
   }

From 4e669c2956e6adb414f0e02155f7448d04754ee0 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Mon, 10 Mar 2025 11:43:38 +0700
Subject: [PATCH 2/3] chore: msvc

---
 engine/services/hardware_service.cc | 38 +++++++++++++++++++++++-
 engine/utils/cortex_utils.h         | 12 ++++++--
 engine/utils/engine_matcher_utils.h | 46 ++++++++++++++++-------------
 3 files changed, 71 insertions(+), 25 deletions(-)

diff --git a/engine/services/hardware_service.cc b/engine/services/hardware_service.cc
index e6bcc89ef..93b2d70f8 100644
--- a/engine/services/hardware_service.cc
+++ b/engine/services/hardware_service.cc
@@ -106,9 +106,19 @@ bool HardwareService::Restart(const std::string& host, int port) {
     return false;
   }
 
+#ifdef _MSC_VER
+  char* value = nullptr;
+  size_t len = 0;
+  _dupenv_s(&value, &len, "CUDA_VISIBLE_DEVICES");
+#else
   const char* value = std::getenv("CUDA_VISIBLE_DEVICES");
+#endif
+
   if (value) {
     LOG_INFO << "CUDA_VISIBLE_DEVICES is set to: " << value;
+#ifdef _MSC_VER
+    free(value);
+#endif
   } else {
     LOG_WARN << "CUDA_VISIBLE_DEVICES is not set.";
   }
@@ -128,9 +138,18 @@ bool HardwareService::Restart(const std::string& host, int port) {
     return false;
   }
 
+#ifdef _MSC_VER
+  char* vk_value = nullptr;
+  _dupenv_s(&vk_value, &len, "GGML_VK_VISIBLE_DEVICES");
+#else
   const char* vk_value = std::getenv("GGML_VK_VISIBLE_DEVICES");
+#endif
+
   if (vk_value) {
     LOG_INFO << "GGML_VK_VISIBLE_DEVICES is set to: " << vk_value;
+#ifdef _MSC_VER
+    free(vk_value);
+#endif
   } else {
     LOG_WARN << "GGML_VK_VISIBLE_DEVICES is not set.";
   }
@@ -240,7 +259,7 @@ bool HardwareService::SetActivateHardwareConfig(
   auto priority = [&ahc](int software_id) -> int {
     for (size_t i = 0; i < ahc.gpus.size(); i++) {
       if (ahc.gpus[i] == software_id)
-        return i;
+        return static_cast<int>(i);
       break;
     }
     return INT_MAX;
   };
@@ -390,16 +409,33 @@ void HardwareService::UpdateHardwareInfos() {
 #if defined(_WIN32) || defined(_WIN64) || defined(__linux__)
   bool has_deactivated_gpu = a.value().size() != activated_gpu_af.size();
   if (!gpus.empty() && has_deactivated_gpu) {
+#ifdef _MSC_VER
+    char* value = nullptr;
+    size_t len = 0;
+    _dupenv_s(&value, &len, "CUDA_VISIBLE_DEVICES");
+#else
     const char* value = std::getenv("CUDA_VISIBLE_DEVICES");
+#endif
     if (value) {
       LOG_INFO << "CUDA_VISIBLE_DEVICES: " << value;
+#ifdef _MSC_VER
+      free(value);
+#endif
     } else {
       need_restart = true;
     }
 
+#ifdef _MSC_VER
+    char* vk_value = nullptr;
+    _dupenv_s(&vk_value, &len, "GGML_VK_VISIBLE_DEVICES");
+#else
     const char* vk_value = std::getenv("GGML_VK_VISIBLE_DEVICES");
+#endif
     if (vk_value) {
       LOG_INFO << "GGML_VK_VISIBLE_DEVICES: " << vk_value;
+#ifdef _MSC_VER
+      free(vk_value);
+#endif
     } else {
       need_restart = true;
     }
diff --git a/engine/utils/cortex_utils.h b/engine/utils/cortex_utils.h
index 23f4330f4..6dcd590fc 100644
--- a/engine/utils/cortex_utils.h
+++ b/engine/utils/cortex_utils.h
@@ -1,6 +1,5 @@
 #pragma once
-
 #include
 #include
 #include
@@ -31,9 +30,16 @@ inline std::string logs_cli_base_name = "./logs/cortex-cli.log";
 // example: Mon, 25 Nov 2024 09:57:03 GMT
 inline std::string GetDateRFC1123() {
   std::time_t now = std::time(nullptr);
-  std::tm* gmt_time = std::gmtime(&now);
+  std::tm gmt_time = {};
+#ifdef _MSC_VER
+  gmtime_s(&gmt_time, &now);
+  std::ostringstream oss;
+  oss << std::put_time(&gmt_time, "%a, %d %b %Y %H:%M:%S GMT");
+#else
+  std::tm* gmt_time_ptr = std::gmtime(&now);
   std::ostringstream oss;
-  oss << std::put_time(gmt_time, "%a, %d %b %Y %H:%M:%S GMT");
+  oss << std::put_time(gmt_time_ptr, "%a, %d %b %Y %H:%M:%S GMT");
+#endif
   return oss.str();
 }
diff --git a/engine/utils/engine_matcher_utils.h b/engine/utils/engine_matcher_utils.h
index 28c0f0c2a..0b0cb26be 100644
--- a/engine/utils/engine_matcher_utils.h
+++ b/engine/utils/engine_matcher_utils.h
@@ -51,52 +51,56 @@ inline std::string GetSuitableCudaVariant(
   std::regex cuda_reg("cuda-(\\d+)-(\\d+)");
   std::smatch match;
 
-  int requestedMajor = 0;
-  int requestedMinor = 0;
+  int requested_major = 0;
+  int requested_minor = 0;
 
   if (!cuda_version.empty()) {
-    // Split the provided CUDA version into major and minor parts
-    sscanf(cuda_version.c_str(), "%d.%d", &requestedMajor, &requestedMinor);
+// Split the provided CUDA version into major and minor parts
+#if defined(_MSC_VER)
+    sscanf_s(cuda_version.c_str(), "%d.%d", &requested_major, &requested_minor);
+#else
+    sscanf(cuda_version.c_str(), "%d.%d", &requested_major, &requested_minor);
+#endif
   }
 
-  std::string selectedVariant;
-  int bestMatchMajor = -1;
-  int bestMatchMinor = -1;
+  std::string selected_variant;
+  int best_match_major = -1;
+  int best_match_minor = -1;
 
   for (const auto& variant : variants) {
     if (std::regex_search(variant, match, cuda_reg)) {
       // Found a CUDA version in the variant
-      int variantMajor = std::stoi(match[1]);
-      int variantMinor = std::stoi(match[2]);
+      int variant_major = std::stoi(match[1]);
+      int variant_minor = std::stoi(match[2]);
 
-      if (requestedMajor == variantMajor) {
+      if (requested_major == variant_major) {
         // If the major versions match, prefer the closest minor version
-        if (requestedMinor >= variantMinor &&
-            (variantMajor > bestMatchMajor ||
-             (variantMajor == bestMatchMajor &&
-              variantMinor > bestMatchMinor))) {
-          selectedVariant = variant;
-          bestMatchMajor = variantMajor;
-          bestMatchMinor = variantMinor;
+        if (requested_minor >= variant_minor &&
+            (variant_major > best_match_major ||
+             (variant_major == best_match_major &&
+              variant_minor > best_match_minor))) {
+          selected_variant = variant;
+          best_match_major = variant_major;
+          best_match_minor = variant_minor;
         }
       }
     }
   }
 
   // If no CUDA version is provided, select the variant without any CUDA in the name
-  if (selectedVariant.empty()) {
+  if (selected_variant.empty()) {
     LOG_WARN
         << "No suitable CUDA variant found, selecting a variant without CUDA";
     for (const auto& variant : variants) {
       if (variant.find("cuda") == std::string::npos) {
(variant.find("cuda") == std::string::npos) { - selectedVariant = variant; - LOG_INFO << "Found variant without CUDA: " << selectedVariant << "\n"; + selected_variant = variant; + LOG_INFO << "Found variant without CUDA: " << selected_variant << "\n"; break; } } } - return selectedVariant; + return selected_variant; } inline std::string ValidateTensorrtLlm(const std::vector& variants, From c5156938dbaa86f060262e6f5527a99da829757e Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 10 Mar 2025 11:52:08 +0700 Subject: [PATCH 3/3] chore: remove c-style cast --- engine/utils/hardware/gguf/gguf_file_estimate.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h index df7188b67..292c185ed 100644 --- a/engine/utils/hardware/gguf/gguf_file_estimate.h +++ b/engine/utils/hardware/gguf/gguf_file_estimate.h @@ -6,7 +6,7 @@ namespace hardware { inline uint64_t BytesToMiB(uint64_t b) { - return static_cast((double)b / 1024 / 1024); + return static_cast(static_cast(b) / 1024 / 1024); }; struct RunConfig { int ngl; @@ -118,7 +118,7 @@ inline std::optional EstimateLLaMACppRun( if (total_ngl >= rc.ngl + 1) { offload = static_cast( output_layer_size + - (double)(total_ngl - rc.ngl - 1) / (total_ngl - 1) * + static_cast(total_ngl - rc.ngl - 1) / (total_ngl - 1) * (file_size - token_embeddings_size - output_layer_size)); } @@ -137,15 +137,15 @@ inline std::optional EstimateLLaMACppRun( auto kv_quant_bit = GetQuantBit(rc.kv_cache_type); // f16, 8 bits for q8_0, 4.5 bits for q4_0 auto kv_cache_size = static_cast( - (double)(1024 * 1024 * 1024) * rc.ctx_len / 8192 * hidden_dim / 4096 * - kv_quant_bit / 16 * num_block / 33); //(bytes) + static_cast(1024 * 1024 * 1024) * rc.ctx_len / 8192 * hidden_dim / + 4096 * kv_quant_bit / 16 * num_block / 33); //(bytes) // std::cout << "kv_cache_size: " << BytesToMiB(kv_cache_size) << std::endl; // VRAM = (min(n_batch, n_ubatch))/ 512 * 266 (MiB) auto preprocessing_buffer_size = static_cast( - (double)std::min(rc.n_batch, rc.n_ubatch) / 512 * 266 * 1024 * 1024 * - n_vocab / 128256 /*llama3 n_vocab*/); //(bytes) + static_cast(std::min(rc.n_batch, rc.n_ubatch)) / 512 * 266 * + 1024 * 1024 * n_vocab / 128256 /*llama3 n_vocab*/); //(bytes) if (total_ngl != rc.ngl) { preprocessing_buffer_size += output_layer_size; } @@ -174,8 +174,9 @@ inline std::optional EstimateLLaMACppRun( if (rc.free_vram_MiB > res.gpu_mode.vram_MiB) { res.gpu_mode.recommend_ngl = total_ngl; } else { - res.gpu_mode.recommend_ngl = static_cast( - (double)rc.free_vram_MiB / res.gpu_mode.vram_MiB * rc.ngl); + res.gpu_mode.recommend_ngl = + static_cast(static_cast(rc.free_vram_MiB) / + res.gpu_mode.vram_MiB * rc.ngl); } #if defined(__APPLE__) && defined(__MACH__) res.cpu_mode.ram_MiB = res.gpu_mode.vram_MiB + res.gpu_mode.ram_MiB;