From 9e3bf4717032d8982e5ca8123cd8c4903daf18df Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Mon, 10 Mar 2025 11:03:57 +0700
Subject: [PATCH 1/3] chore: suppress warnings on Windows

---
 engine/cli/main.cc                            |  3 +-
 engine/common/message.h                       |  3 +-
 engine/config/model_config.h                  |  6 +--
 engine/main.cc                                |  2 +-
 engine/services/model_service.cc              |  8 ++--
 engine/utils/command_executor.h               |  3 +-
 engine/utils/cortex_utils.h                   |  2 +
 engine/utils/format_utils.h                   |  2 +-
 engine/utils/hardware/cpu_info.h              |  2 +-
 engine/utils/hardware/gguf/ggml.h             | 22 +++++------
 .../utils/hardware/gguf/gguf_file_estimate.h  | 37 ++++++++++---------
 engine/utils/hardware/gpu/vulkan/vulkan_gpu.h | 16 +++++---
 engine/utils/huggingface_utils.h              |  4 +-
 engine/utils/url_parser.h                     |  3 +-
 14 files changed, 62 insertions(+), 51 deletions(-)

diff --git a/engine/cli/main.cc b/engine/cli/main.cc
index 8ed4beb61..66f14c116 100644
--- a/engine/cli/main.cc
+++ b/engine/cli/main.cc
@@ -8,6 +8,7 @@
 #include "utils/file_manager_utils.h"
 #include "utils/logging_utils.h"
 #include "utils/system_info_utils.h"
+#include "utils/widechar_conv.h"
 
 #if defined(__APPLE__) && defined(__MACH__)
 #include <libgen.h>  // for dirname()
@@ -46,7 +47,7 @@ void SetupLogger(trantor::FileLogger& async_logger, bool verbose) {
 
     std::filesystem::create_directories(
 #if defined(_WIN32)
-        std::filesystem::u8path(config.logFolderPath) /
+        std::filesystem::path(cortex::wc::Utf8ToWstring(config.logFolderPath)) /
 #else
         std::filesystem::path(config.logFolderPath) /
 #endif
diff --git a/engine/common/message.h b/engine/common/message.h
index d31c4f0d3..62ad69ab9 100644
--- a/engine/common/message.h
+++ b/engine/common/message.h
@@ -107,7 +107,8 @@ struct Message : JsonSerializable {
         std::move(root.get("object", "thread.message").asString());
     message.created_at = root["created_at"].asUInt();
     if (message.created_at == 0 && root["created"].asUInt64() != 0) {
-      message.created_at = root["created"].asUInt64() / 1000;
+      message.created_at =
+          static_cast<uint32_t>(root["created"].asUInt64() / 1000);
     }
     message.thread_id = std::move(root["thread_id"].asString());
     message.status = StatusFromString(std::move(root["status"].asString()));
diff --git a/engine/config/model_config.h b/engine/config/model_config.h
index 1d51cfb01..687f50681 100644
--- a/engine/config/model_config.h
+++ b/engine/config/model_config.h
@@ -35,8 +35,7 @@ struct RemoteModelConfig {
 
     // Load basic string fields
     model = json.get("model", model).asString();
-    header_template =
-        json.get("header_template", header_template).asString();
+    header_template = json.get("header_template", header_template).asString();
     engine = json.get("engine", engine).asString();
     version = json.get("version", version).asString();
     created =
@@ -405,7 +404,8 @@ struct ModelConfig {
     oss << format_utils::print_comment("END REQUIRED");
     oss << format_utils::print_comment("BEGIN OPTIONAL");
 
-    oss << format_utils::print_float("size", size);
+    oss << format_utils::print_kv("size", std::to_string(size),
+                                  format_utils::MAGENTA);
     oss << format_utils::print_bool("stream", stream);
     oss << format_utils::print_float("top_p", top_p);
     oss << format_utils::print_float("temperature", temperature);
diff --git a/engine/main.cc b/engine/main.cc
index a51d825fc..492dc9629 100644
--- a/engine/main.cc
+++ b/engine/main.cc
@@ -105,7 +105,7 @@ void RunServer(std::optional<std::string> host, std::optional<int> port,
   // Create logs/ folder and setup log to file
   std::filesystem::create_directories(
 #if defined(_WIN32)
-      std::filesystem::u8path(config.logFolderPath) /
+      std::filesystem::path(cortex::wc::Utf8ToWstring(config.logFolderPath)) /
 #else
       std::filesystem::path(config.logFolderPath) /
 #endif
diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc
index c13f7cf19..3129362ce 100644
--- a/engine/services/model_service.cc
+++ b/engine/services/model_service.cc
@@ -315,7 +315,7 @@ cpp::result<std::string, std::string> ModelService::HandleDownloadUrlAsync(
 
   try {
     std::filesystem::create_directories(local_path.parent_path());
-  } catch (const std::filesystem::filesystem_error& e) {
+  } catch (const std::filesystem::filesystem_error&) {
     // if file exist, remove it
     std::filesystem::remove(local_path.parent_path());
     std::filesystem::create_directories(local_path.parent_path());
@@ -380,7 +380,7 @@ ModelService::EstimateModel(const std::string& model_handle,
     auto mc = yaml_handler.GetModelConfig();
     assert(hw_service_);
     auto hw_info = hw_service_->GetHardwareInfo();
-    auto free_vram_MiB = 0u;
+    int64_t free_vram_MiB = 0;
     for (const auto& gpu : hw_info.gpus) {
       free_vram_MiB += gpu.free_vram;
     }
@@ -444,7 +444,7 @@ cpp::result<bool, std::string> ModelService::HandleUrl(
 
   try {
     std::filesystem::create_directories(local_path.parent_path());
-  } catch (const std::filesystem::filesystem_error& e) {
+  } catch (const std::filesystem::filesystem_error&) {
     // if file exist, remove it
     std::filesystem::remove(local_path.parent_path());
     std::filesystem::create_directories(local_path.parent_path());
@@ -1326,7 +1326,7 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
   }
   // If in GPU acceleration mode:
   // We use all visible GPUs, so only need to sum all free vram
-  auto free_vram_MiB = 0u;
+  int64_t free_vram_MiB = 0;
   for (const auto& gpu : hw_info.gpus) {
     free_vram_MiB += gpu.free_vram;
   }
diff --git a/engine/utils/command_executor.h b/engine/utils/command_executor.h
index 87460e2c1..2a6064521 100644
--- a/engine/utils/command_executor.h
+++ b/engine/utils/command_executor.h
@@ -37,7 +37,8 @@ class CommandExecutor {
    std::array<char, 128> buffer;
     std::string result;
 
-    while (fgets(buffer.data(), buffer.size(), m_pipe.get()) != nullptr) {
+    while (fgets(buffer.data(), static_cast<int>(buffer.size()),
+                 m_pipe.get()) != nullptr) {
       result += buffer.data();
     }
 
diff --git a/engine/utils/cortex_utils.h b/engine/utils/cortex_utils.h
index f58fcfe8f..23f4330f4 100644
--- a/engine/utils/cortex_utils.h
+++ b/engine/utils/cortex_utils.h
@@ -1,4 +1,6 @@
 #pragma once
+
+
 #include
 #include
 #include
diff --git a/engine/utils/format_utils.h b/engine/utils/format_utils.h
index 5dccee359..871588179 100644
--- a/engine/utils/format_utils.h
+++ b/engine/utils/format_utils.h
@@ -67,7 +67,7 @@ inline std::string WriteKeyValue(const std::string& key,
         strValue.pop_back();
       }
       out_file << strValue;
-    } catch (const std::exception& e) {
+    } catch (const std::exception&) {
       out_file << value;  // If not a float, write as is
     }
   } else {
diff --git a/engine/utils/hardware/cpu_info.h b/engine/utils/hardware/cpu_info.h
index af7a85a4b..ac5e1c83a 100644
--- a/engine/utils/hardware/cpu_info.h
+++ b/engine/utils/hardware/cpu_info.h
@@ -187,7 +187,7 @@ struct CpuInfo {
       return CPU{};
     auto cpu = res[0];
     cortex::cpuid::CpuInfo inst;
-    float usage = GetCPUUsage();
+    auto usage = static_cast<float>(GetCPUUsage());
     return CPU{.cores = cpu.numPhysicalCores(),
                .arch = std::string(GetArch()),
                .model = cpu.modelName(),
diff --git a/engine/utils/hardware/gguf/ggml.h b/engine/utils/hardware/gguf/ggml.h
index 7a8f480a1..f56fb9172 100644
--- a/engine/utils/hardware/gguf/ggml.h
+++ b/engine/utils/hardware/gguf/ggml.h
@@ -49,21 +49,21 @@ inline float GetQuantBit(GGMLType gt) {
   switch (gt) {
     case GGML_TYPE_I32:
     case GGML_TYPE_F32:
-      return 32.0;
+      return 32.0f;
     case GGML_TYPE_I16:
     case GGML_TYPE_BF16:
     case GGML_TYPE_F16:
-      return 16.0;
+      return 16.0f;
     case GGML_TYPE_IQ2_S:
     case GGML_TYPE_IQ2_XXS:
     case GGML_TYPE_IQ2_XS:
-      return 2.31;
+      return 2.31f;
     case GGML_TYPE_Q2_K:
-      return 2.5625;
+      return 2.5625f;
     case GGML_TYPE_IQ3_XXS:
     case GGML_TYPE_IQ3_S:
     case GGML_TYPE_Q3_K:
-      return 3.4375;
+      return 3.4375f;
     case GGML_TYPE_Q4_0_4_4:
     case GGML_TYPE_Q4_0_4_8:
     case GGML_TYPE_Q4_0_8_8:
@@ -72,25 +72,25 @@ inline float GetQuantBit(GGMLType gt) {
     case GGML_TYPE_Q4_0:
     case GGML_TYPE_Q4_1:
     case GGML_TYPE_Q4_K:
-      return 4.5;
+      return 4.5f;
     case GGML_TYPE_Q5_0:
     case GGML_TYPE_Q5_1:
     case GGML_TYPE_Q5_K:
-      return 5.5;
+      return 5.5f;
     case GGML_TYPE_Q6_K:
-      return 6.5625;
+      return 6.5625f;
     case GGML_TYPE_I8:
     case GGML_TYPE_Q8_0:
     case GGML_TYPE_Q8_1:
     case GGML_TYPE_Q8_K:
-      return 8.0;
+      return 8.0f;
 
     case GGML_TYPE_I64:
     case GGML_TYPE_F64:
-      return 64.0;
+      return 64.0f;
 
     default:
-      return 8.0;
+      return 8.0f;
   }
 }
diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h
index 402a70958..df7188b67 100644
--- a/engine/utils/hardware/gguf/gguf_file_estimate.h
+++ b/engine/utils/hardware/gguf/gguf_file_estimate.h
@@ -6,7 +6,7 @@
 namespace hardware {
 
 inline uint64_t BytesToMiB(uint64_t b) {
-  return (double)b / 1024 / 1024;
+  return static_cast<uint64_t>((double)b / 1024 / 1024);
 };
 struct RunConfig {
   int ngl;
@@ -91,8 +91,8 @@ inline std::optional<Estimation> EstimateLLaMACppRun(
   // std::cout << n_vocab << std::endl;
 
   // token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit_in/16 bytes
-  int32_t quant_bit_in = 0;
-  int32_t quant_bit_out = 0;
+  float quant_bit_in = 0;
+  float quant_bit_out = 0;
   for (auto const& ti : (*gf).tensor_infos) {
     if (ti->name == "output.weight") {
@@ -109,16 +109,17 @@ inline std::optional<Estimation> EstimateLLaMACppRun(
   // std::cout << "n_vocab: " << n_vocab << std::endl;
   // std::cout << "file_size: " << file_size << std::endl;
   // Model weight
-  int64_t token_embeddings_size =
-      n_vocab * embedding_length * 2 * quant_bit_in / 16;
-  int64_t output_layer_size =
-      n_vocab * embedding_length * 2 * quant_bit_out / 16;
+  auto token_embeddings_size =
+      static_cast<int64_t>(n_vocab * embedding_length * 2 * quant_bit_in / 16);
+  auto output_layer_size =
+      static_cast<int64_t>(n_vocab * embedding_length * 2 * quant_bit_out / 16);
   // RAM = token_embeddings_size + ((total_ngl-ngl) >=1 ? output_layer_size + (total_ngl - ngl - 1 ) / (total_ngl-1) * (total_file_size - token_embeddings_size - output_layer_size) : 0 ) (bytes)
   int64_t offload = 0;
   if (total_ngl >= rc.ngl + 1) {
-    offload = output_layer_size +
-              (double)(total_ngl - rc.ngl - 1) / (total_ngl - 1) *
-                  (file_size - token_embeddings_size - output_layer_size);
+    offload = static_cast<int64_t>(
+        output_layer_size +
+        (double)(total_ngl - rc.ngl - 1) / (total_ngl - 1) *
+            (file_size - token_embeddings_size - output_layer_size));
   }
 
   int64_t ram_usage = token_embeddings_size + offload;
@@ -133,18 +134,18 @@ inline std::optional<Estimation> EstimateLLaMACppRun(
   // KV cache
   // kv_cache_size = ctx_len/8192 * hidden_dim/4096 * quant_bit/16 * num_block/33 * 1 (GB)
   auto hidden_dim = embedding_length;
-  int kv_quant_bit =
+  auto kv_quant_bit =
       GetQuantBit(rc.kv_cache_type);  // f16, 8 bits for q8_0, 4.5 bits for q4_0
-  int64_t kv_cache_size = (double)(1024 * 1024 * 1024) * rc.ctx_len / 8192 *
-                          hidden_dim / 4096 * kv_quant_bit / 16 * num_block /
-                          33;  //(bytes)
+  auto kv_cache_size = static_cast<int64_t>(
+      (double)(1024 * 1024 * 1024) * rc.ctx_len / 8192 * hidden_dim / 4096 *
+      kv_quant_bit / 16 * num_block / 33);  //(bytes)
   // std::cout << "kv_cache_size: " << BytesToMiB(kv_cache_size) << std::endl;
 
   // VRAM = (min(n_batch, n_ubatch))/ 512 * 266 (MiB)
-  int64_t preprocessing_buffer_size =
+  auto preprocessing_buffer_size = static_cast<int64_t>(
       (double)std::min(rc.n_batch, rc.n_ubatch) / 512 * 266 * 1024 * 1024 *
-      n_vocab / 128256 /*llama3 n_vocab*/;  //(bytes)
+      n_vocab / 128256 /*llama3 n_vocab*/);  //(bytes)
   if (total_ngl != rc.ngl) {
     preprocessing_buffer_size += output_layer_size;
   }
@@ -173,8 +174,8 @@ inline std::optional<Estimation> EstimateLLaMACppRun(
   if (rc.free_vram_MiB > res.gpu_mode.vram_MiB) {
     res.gpu_mode.recommend_ngl = total_ngl;
   } else {
-    res.gpu_mode.recommend_ngl =
-        (double)rc.free_vram_MiB / res.gpu_mode.vram_MiB * rc.ngl;
+    res.gpu_mode.recommend_ngl = static_cast<int>(
+        (double)rc.free_vram_MiB / res.gpu_mode.vram_MiB * rc.ngl);
   }
 #if defined(__APPLE__) && defined(__MACH__)
   res.cpu_mode.ram_MiB = res.gpu_mode.vram_MiB + res.gpu_mode.ram_MiB;
diff --git a/engine/utils/hardware/gpu/vulkan/vulkan_gpu.h b/engine/utils/hardware/gpu/vulkan/vulkan_gpu.h
index 27899ca77..15a40c97e 100644
--- a/engine/utils/hardware/gpu/vulkan/vulkan_gpu.h
+++ b/engine/utils/hardware/gpu/vulkan/vulkan_gpu.h
@@ -433,8 +433,8 @@ class VulkanGpu {
      for (uint32_t i = 0; i < memory_properties.memoryHeapCount; ++i) {
        if (memory_properties.memoryHeaps[i].flags &
            VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) {
-          gpu_avail_MiB +=
-              memory_properties.memoryHeaps[i].size / (1024ull * 1024ull);
+          gpu_avail_MiB += static_cast<int>(
+              memory_properties.memoryHeaps[i].size / (1024ull * 1024ull));
        }
      }
 
@@ -449,8 +449,10 @@ class VulkanGpu {
       used_vram_MiB = gpus_usages[device_properties.deviceName];
 #endif
 
-      int free_vram_MiB =
-          total_vram_MiB > used_vram_MiB ? total_vram_MiB - used_vram_MiB : 0;
+      auto free_vram_MiB =
+          total_vram_MiB > used_vram_MiB
+              ? static_cast<int64_t>(total_vram_MiB - used_vram_MiB)
+              : 0;
       if (device_properties.vendorID == kNvidiaVendor ||
           device_properties.vendorID == kAmdVendor) {
         gpus.emplace_back(cortex::hw::GPU{
@@ -507,8 +509,10 @@ class VulkanGpu {
         total_vram_MiB = gpus_[i].free_vram;
         used_vram_MiB = gpus_usages[gpus_[i].name];
 #endif
-        int free_vram_MiB =
-            total_vram_MiB > used_vram_MiB ? total_vram_MiB - used_vram_MiB : 0;
+        auto free_vram_MiB =
+            total_vram_MiB > used_vram_MiB
+                ? static_cast<int64_t>(total_vram_MiB - used_vram_MiB)
+                : 0;
         gpus_[i].free_vram = free_vram_MiB;
       }
 
diff --git a/engine/utils/huggingface_utils.h b/engine/utils/huggingface_utils.h
index 1c0ab906c..14c19084a 100644
--- a/engine/utils/huggingface_utils.h
+++ b/engine/utils/huggingface_utils.h
@@ -308,7 +308,7 @@ inline std::optional<std::string> GetDefaultBranch(
       return default_branch.as<std::string>();
     }
     return std::nullopt;
-  } catch (const std::exception& e) {
+  } catch (const std::exception&) {
     return std::nullopt;
   }
 }
@@ -328,7 +328,7 @@ inline std::optional<std::string> GetModelAuthorCortexsoHub(
       return author.as<std::string>();
     }
     return std::nullopt;
-  } catch (const std::exception& e) {
+  } catch (const std::exception&) {
     return std::nullopt;
   }
 }
diff --git a/engine/utils/url_parser.h b/engine/utils/url_parser.h
index 4802ba1a1..483c44312 100644
--- a/engine/utils/url_parser.h
+++ b/engine/utils/url_parser.h
@@ -153,7 +153,8 @@ inline std::string FromUrl(const Url& url) {
     } catch (const std::bad_variant_access& e) {
       // Handle the case where the variant does not match any of the expected types
       // This should not happen if the map was created correctly
-      throw std::runtime_error("Invalid variant type in queries map");
+      throw std::runtime_error(
+          std::string("Invalid variant type in queries map: ") + e.what());
     }
   }

From 4e669c2956e6adb414f0e02155f7448d04754ee0 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Mon, 10 Mar 2025 11:43:38 +0700
Subject: [PATCH 2/3] chore: msvc

---
 engine/services/hardware_service.cc | 38 +++++++++++++++++++++++-
 engine/utils/cortex_utils.h         | 12 ++++++--
 engine/utils/engine_matcher_utils.h | 46 ++++++++++++++++-------------
 3 files changed, 71 insertions(+), 25 deletions(-)

diff --git a/engine/services/hardware_service.cc b/engine/services/hardware_service.cc
index e6bcc89ef..93b2d70f8 100644
--- a/engine/services/hardware_service.cc
+++ b/engine/services/hardware_service.cc
@@ -106,9 +106,19 @@ bool HardwareService::Restart(const std::string& host, int port) {
     return false;
   }
 
+#ifdef _MSC_VER
+  char* value = nullptr;
+  size_t len = 0;
+  _dupenv_s(&value, &len, "CUDA_VISIBLE_DEVICES");
+#else
   const char* value = std::getenv("CUDA_VISIBLE_DEVICES");
+#endif
+
   if (value) {
     LOG_INFO << "CUDA_VISIBLE_DEVICES is set to: " << value;
+#ifdef _MSC_VER
+    free(value);
+#endif
   } else {
     LOG_WARN << "CUDA_VISIBLE_DEVICES is not set.";
   }
@@ -128,9 +138,18 @@ bool HardwareService::Restart(const std::string& host, int port) {
     return false;
   }
 
+#ifdef _MSC_VER
+  char* vk_value = nullptr;
+  _dupenv_s(&vk_value, &len, "GGML_VK_VISIBLE_DEVICES");
+#else
   const char* vk_value = std::getenv("GGML_VK_VISIBLE_DEVICES");
+#endif
+
   if (vk_value) {
     LOG_INFO << "GGML_VK_VISIBLE_DEVICES is set to: " << vk_value;
+#ifdef _MSC_VER
+    free(vk_value);
+#endif
   } else {
     LOG_WARN << "GGML_VK_VISIBLE_DEVICES is not set.";
   }
@@ -240,7 +259,7 @@ bool HardwareService::SetActivateHardwareConfig(
   auto priority = [&ahc](int software_id) -> int {
     for (size_t i = 0; i < ahc.gpus.size(); i++) {
       if (ahc.gpus[i] == software_id)
-        return i;
+        return static_cast<int>(i);
       break;
     }
     return INT_MAX;
   };
@@ -390,16 +409,33 @@ void HardwareService::UpdateHardwareInfos() {
 #if defined(_WIN32) || defined(_WIN64) || defined(__linux__)
   bool has_deactivated_gpu = a.value().size() != activated_gpu_af.size();
   if (!gpus.empty() && has_deactivated_gpu) {
+#ifdef _MSC_VER
+    char* value = nullptr;
+    size_t len = 0;
+    _dupenv_s(&value, &len, "CUDA_VISIBLE_DEVICES");
+#else
     const char* value = std::getenv("CUDA_VISIBLE_DEVICES");
+#endif
     if (value) {
       LOG_INFO << "CUDA_VISIBLE_DEVICES: " << value;
+#ifdef _MSC_VER
+      free(value);
+#endif
     } else {
       need_restart = true;
     }
 
+#ifdef _MSC_VER
+    char* vk_value = nullptr;
+    _dupenv_s(&vk_value, &len, "GGML_VK_VISIBLE_DEVICES");
+#else
     const char* vk_value = std::getenv("GGML_VK_VISIBLE_DEVICES");
+#endif
     if (vk_value) {
       LOG_INFO << "GGML_VK_VISIBLE_DEVICES: " << vk_value;
+#ifdef _MSC_VER
+      free(vk_value);
+#endif
     } else {
       need_restart = true;
     }
diff --git a/engine/utils/cortex_utils.h b/engine/utils/cortex_utils.h
index 23f4330f4..6dcd590fc 100644
--- a/engine/utils/cortex_utils.h
+++ b/engine/utils/cortex_utils.h
@@ -1,6 +1,5 @@
 #pragma once
-
 #include
 #include
 #include
@@ -31,9 +30,16 @@ inline std::string logs_cli_base_name = "./logs/cortex-cli.log";
 // example: Mon, 25 Nov 2024 09:57:03 GMT
 inline std::string GetDateRFC1123() {
   std::time_t now = std::time(nullptr);
-  std::tm* gmt_time = std::gmtime(&now);
+  std::tm gmt_time = {};
+#ifdef _MSC_VER
+  gmtime_s(&gmt_time, &now);
+  std::ostringstream oss;
+  oss << std::put_time(&gmt_time, "%a, %d %b %Y %H:%M:%S GMT");
+#else
+  std::tm* gmt_time_ptr = std::gmtime(&now);
   std::ostringstream oss;
-  oss << std::put_time(gmt_time, "%a, %d %b %Y %H:%M:%S GMT");
+  oss << std::put_time(gmt_time_ptr, "%a, %d %b %Y %H:%M:%S GMT");
+#endif
   return oss.str();
 }
diff --git a/engine/utils/engine_matcher_utils.h b/engine/utils/engine_matcher_utils.h
index 28c0f0c2a..0b0cb26be 100644
--- a/engine/utils/engine_matcher_utils.h
+++ b/engine/utils/engine_matcher_utils.h
@@ -51,52 +51,56 @@ inline std::string GetSuitableCudaVariant(
   std::regex cuda_reg("cuda-(\\d+)-(\\d+)");
   std::smatch match;
 
-  int requestedMajor = 0;
-  int requestedMinor = 0;
+  int requested_major = 0;
+  int requested_minor = 0;
 
   if (!cuda_version.empty()) {
-    // Split the provided CUDA version into major and minor parts
-    sscanf(cuda_version.c_str(), "%d.%d", &requestedMajor, &requestedMinor);
+// Split the provided CUDA version into major and minor parts
+#if defined(_MSC_VER)
+    sscanf_s(cuda_version.c_str(), "%d.%d", &requested_major, &requested_minor);
+#else
+    sscanf(cuda_version.c_str(), "%d.%d", &requested_major, &requested_minor);
+#endif
   }
 
-  std::string selectedVariant;
-  int bestMatchMajor = -1;
-  int bestMatchMinor = -1;
+  std::string selected_variant;
+  int best_match_major = -1;
+  int best_match_minor = -1;
 
   for (const auto& variant : variants) {
     if (std::regex_search(variant, match, cuda_reg)) {
       // Found a CUDA version in the variant
-      int variantMajor = std::stoi(match[1]);
-      int variantMinor = std::stoi(match[2]);
+      int variant_major = std::stoi(match[1]);
+      int variant_minor = std::stoi(match[2]);
 
-      if (requestedMajor == variantMajor) {
+      if (requested_major == variant_major) {
         // If the major versions match, prefer the closest minor version
-        if (requestedMinor >= variantMinor &&
-            (variantMajor > bestMatchMajor ||
-             (variantMajor == bestMatchMajor &&
-              variantMinor > bestMatchMinor))) {
-          selectedVariant = variant;
-          bestMatchMajor = variantMajor;
-          bestMatchMinor = variantMinor;
+        if (requested_minor >= variant_minor &&
+            (variant_major > best_match_major ||
+             (variant_major == best_match_major &&
+              variant_minor > best_match_minor))) {
+          selected_variant = variant;
+          best_match_major = variant_major;
+          best_match_minor = variant_minor;
         }
       }
     }
   }
 
   // If no CUDA version is provided, select the variant without any CUDA in the name
-  if (selectedVariant.empty()) {
+  if (selected_variant.empty()) {
     LOG_WARN
         << "No suitable CUDA variant found, selecting a variant without CUDA";
     for (const auto& variant : variants) {
       if (variant.find("cuda") == std::string::npos) {
(variant.find("cuda") == std::string::npos) { - selectedVariant = variant; - LOG_INFO << "Found variant without CUDA: " << selectedVariant << "\n"; + selected_variant = variant; + LOG_INFO << "Found variant without CUDA: " << selected_variant << "\n"; break; } } } - return selectedVariant; + return selected_variant; } inline std::string ValidateTensorrtLlm(const std::vector& variants, From c5156938dbaa86f060262e6f5527a99da829757e Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 10 Mar 2025 11:52:08 +0700 Subject: [PATCH 3/3] chore: remove c-style cast --- engine/utils/hardware/gguf/gguf_file_estimate.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h index df7188b67..292c185ed 100644 --- a/engine/utils/hardware/gguf/gguf_file_estimate.h +++ b/engine/utils/hardware/gguf/gguf_file_estimate.h @@ -6,7 +6,7 @@ namespace hardware { inline uint64_t BytesToMiB(uint64_t b) { - return static_cast((double)b / 1024 / 1024); + return static_cast(static_cast(b) / 1024 / 1024); }; struct RunConfig { int ngl; @@ -118,7 +118,7 @@ inline std::optional EstimateLLaMACppRun( if (total_ngl >= rc.ngl + 1) { offload = static_cast( output_layer_size + - (double)(total_ngl - rc.ngl - 1) / (total_ngl - 1) * + static_cast(total_ngl - rc.ngl - 1) / (total_ngl - 1) * (file_size - token_embeddings_size - output_layer_size)); } @@ -137,15 +137,15 @@ inline std::optional EstimateLLaMACppRun( auto kv_quant_bit = GetQuantBit(rc.kv_cache_type); // f16, 8 bits for q8_0, 4.5 bits for q4_0 auto kv_cache_size = static_cast( - (double)(1024 * 1024 * 1024) * rc.ctx_len / 8192 * hidden_dim / 4096 * - kv_quant_bit / 16 * num_block / 33); //(bytes) + static_cast(1024 * 1024 * 1024) * rc.ctx_len / 8192 * hidden_dim / + 4096 * kv_quant_bit / 16 * num_block / 33); //(bytes) // std::cout << "kv_cache_size: " << BytesToMiB(kv_cache_size) << std::endl; // VRAM = (min(n_batch, n_ubatch))/ 512 * 266 (MiB) auto preprocessing_buffer_size = static_cast( - (double)std::min(rc.n_batch, rc.n_ubatch) / 512 * 266 * 1024 * 1024 * - n_vocab / 128256 /*llama3 n_vocab*/); //(bytes) + static_cast(std::min(rc.n_batch, rc.n_ubatch)) / 512 * 266 * + 1024 * 1024 * n_vocab / 128256 /*llama3 n_vocab*/); //(bytes) if (total_ngl != rc.ngl) { preprocessing_buffer_size += output_layer_size; } @@ -174,8 +174,9 @@ inline std::optional EstimateLLaMACppRun( if (rc.free_vram_MiB > res.gpu_mode.vram_MiB) { res.gpu_mode.recommend_ngl = total_ngl; } else { - res.gpu_mode.recommend_ngl = static_cast( - (double)rc.free_vram_MiB / res.gpu_mode.vram_MiB * rc.ngl); + res.gpu_mode.recommend_ngl = + static_cast(static_cast(rc.free_vram_MiB) / + res.gpu_mode.vram_MiB * rc.ngl); } #if defined(__APPLE__) && defined(__MACH__) res.cpu_mode.ram_MiB = res.gpu_mode.vram_MiB + res.gpu_mode.ram_MiB;