engine/controllers/models.cc (4 changes: 2 additions & 2 deletions)

@@ -184,8 +184,8 @@ void Models::ListModel(
       obj["model"] = model_entry.model;
       obj["model"] = model_entry.model;
       auto es = model_service_->GetEstimation(model_entry.model);
-      if (es.has_value()) {
-        obj["recommendation"] = hardware::ToJson(es.value());
+      if (es.has_value() && !!es.value()) {
+        obj["recommendation"] = hardware::ToJson(*(es.value()));
       }
       data.append(std::move(obj));
       yaml_handler.Reset();
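Context for the check above: GetEstimation now returns cpp::result<std::optional<hardware::Estimation>, std::string>, so a caller has to distinguish a hard failure (the result carries an error string) from a soft one (the result succeeded but the optional is empty because the GGUF file could not be parsed). A minimal, self-contained sketch of that two-level check, using hypothetical Result and Estimation stand-ins rather than the real cpp::result and hardware::Estimation types:

#include <iostream>
#include <optional>
#include <string>

// Hypothetical stand-in for cpp::result<T, std::string>; only here to keep
// the sketch self-contained.
template <typename T>
struct Result {
  std::optional<T> val;  // set on success
  std::string err;       // set on failure
  bool has_value() const { return val.has_value(); }
  const T& value() const { return *val; }
  const std::string& error() const { return err; }
};

// Simplified placeholder for hardware::Estimation.
struct Estimation {
  int vram_MiB = 0;
};

// Mirrors the new shape: a successful result can still carry an empty
// optional when the GGUF file could not be parsed.
Result<std::optional<Estimation>> GetEstimation(bool db_ok, bool gguf_ok) {
  if (!db_ok) return {std::nullopt, "model not found"};
  if (!gguf_ok) return {std::optional<Estimation>{}, ""};
  return {std::optional<Estimation>{Estimation{512}}, ""};
}

int main() {
  auto es = GetEstimation(true, false);
  // Same two-level check the controller performs above.
  if (es.has_value() && !!es.value()) {
    std::cout << "recommendation: " << (*es.value()).vram_MiB << " MiB\n";
  } else if (!es.has_value()) {
    std::cout << "error: " << es.error() << "\n";
  } else {
    std::cout << "no recommendation (estimation unavailable)\n";
  }
  return 0;
}

The extra optional layer lets the API distinguish "the model exists but no estimation is available" from an outright error, which is why the controller checks both levels before emitting the recommendation field.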
engine/services/model_service.cc (17 changes: 9 additions & 8 deletions)

@@ -341,9 +341,10 @@ cpp::result<DownloadTask, std::string> ModelService::HandleDownloadUrlAsync(
   return download_service_->AddTask(downloadTask, on_finished);
 }
 
-cpp::result<hardware::Estimation, std::string> ModelService::GetEstimation(
-    const std::string& model_handle, const std::string& kv_cache, int n_batch,
-    int n_ubatch) {
+cpp::result<std::optional<hardware::Estimation>, std::string>
+ModelService::GetEstimation(const std::string& model_handle,
+                            const std::string& kv_cache, int n_batch,
+                            int n_ubatch) {
   namespace fs = std::filesystem;
   namespace fmu = file_manager_utils;
   cortex::db::Models modellist_handler;

@@ -918,7 +919,7 @@ cpp::result<bool, std::string> ModelService::GetModelStatus(
   if (status == drogon::k200OK) {
     return true;
   } else {
-    CTL_ERR("Model failed to get model status with status code: " << status);
+    CTL_WRN("Model failed to get model status with status code: " << status);
     return cpp::fail("Model failed to get model status: " +
                      data["message"].asString());
   }

@@ -1146,13 +1147,13 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
                        .free_vram_MiB = free_vram_MiB};
   auto es = hardware::EstimateLLaMACppRun(model_path, rc);
 
-  if (es.gpu_mode.vram_MiB > free_vram_MiB && is_cuda) {
-    CTL_WRN("Not enough VRAM - " << "required: " << es.gpu_mode.vram_MiB
+  if (!!es && (*es).gpu_mode.vram_MiB > free_vram_MiB && is_cuda) {
+    CTL_WRN("Not enough VRAM - " << "required: " << (*es).gpu_mode.vram_MiB
                                  << ", available: " << free_vram_MiB);
   }
 
-  if (es.cpu_mode.ram_MiB > free_ram_MiB) {
-    CTL_WRN("Not enough RAM - " << "required: " << es.cpu_mode.ram_MiB
+  if (!!es && (*es).cpu_mode.ram_MiB > free_ram_MiB) {
+    CTL_WRN("Not enough RAM - " << "required: " << (*es).cpu_mode.ram_MiB
                                 << ", available: " << free_ram_MiB);
   }
 
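The !!es guards added in MayFallbackToCpu rely on std::optional's explicit operator bool, so !!es is simply a terser spelling of es.has_value(); when the estimation is missing, both the VRAM and RAM warnings are skipped. A small sketch of the equivalence (field names are illustrative, not the real hardware::Estimation layout):

#include <cassert>
#include <cstdint>
#include <optional>

struct Estimation {
  std::uint64_t gpu_vram_MiB = 0;  // illustrative field, not the real layout
};

int main() {
  std::optional<Estimation> es;        // empty: estimation unavailable
  assert(!!es == es.has_value());      // both are false while es is empty

  es = Estimation{4096};               // engaged: safe to dereference
  assert(!!es == es.has_value());      // both are true now
  assert((*es).gpu_vram_MiB == 4096);  // (*es).x is equivalent to es->x
  return 0;
}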
engine/services/model_service.h (2 changes: 1 addition & 1 deletion)

@@ -97,7 +97,7 @@ class ModelService {
 
   bool HasModel(const std::string& id) const;
 
-  cpp::result<hardware::Estimation, std::string> GetEstimation(
+  cpp::result<std::optional<hardware::Estimation>, std::string> GetEstimation(
       const std::string& model_handle, const std::string& kv_cache = "f16",
       int n_batch = 2048, int n_ubatch = 2048);
 
engine/utils/hardware/gguf/gguf_file.h (20 changes: 11 additions & 9 deletions)

@@ -11,6 +11,7 @@
 #include <unordered_set>
 #include <variant>
 #include <vector>
+#include <optional>
 
 #ifdef _WIN32
 #include <io.h>

@@ -23,13 +24,14 @@
 
 #include "ggml.h"
 #include "utils/string_utils.h"
+#include "utils/logging_utils.h"
 
 // #define GGUF_LOG(msg)                                                  \
 //   do {                                                                 \
 //     std::cout << __FILE__ << "(@" << __LINE__ << "): " << msg << '\n'; \
 //   } while (false)
 
-#define GGUF_LOG(msg)
+#define GGUF_LOG(msg)
 namespace hardware {
 #undef min
 #undef max

@@ -169,8 +171,6 @@ inline std::string to_string(const GGUFMetadataKV& kv) {
   return "Invalid type ";
 }
 
-
-
 struct GGUFTensorInfo {
   /* Basic */
   std::string name;

@@ -208,14 +208,14 @@ struct GGUFHelper {
         CreateFileA(file_path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr,
                     OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
     if (file_handle == INVALID_HANDLE_VALUE) {
-      std::cout << "Failed to open file" << std::endl;
+      CTL_INF("Failed to open file: " << file_path);
      return false;
    }
    // Get the file size
    LARGE_INTEGER file_size_struct;
    if (!GetFileSizeEx(file_handle, &file_size_struct)) {
      CloseHandle(file_handle);
-      std::cout << "Failed to open file" << std::endl;
+      CTL_INF("Failed to get file size: " << file_path);
      return false;
    }
    file_size = static_cast<size_t>(file_size_struct.QuadPart);

@@ -225,7 +225,7 @@
         CreateFileMappingA(file_handle, nullptr, PAGE_READONLY, 0, 0, nullptr);
     if (file_mapping == nullptr) {
       CloseHandle(file_handle);
-      std::cout << "Failed to create file mapping" << std::endl;
+      CTL_INF("Failed to create file mapping: " << file_path);
       return false;
     }
 

@@ -235,7 +235,7 @@
     if (data == nullptr) {
       CloseHandle(file_mapping);
       CloseHandle(file_handle);
-      std::cout << "Failed to map file" << std::endl;
+      CTL_INF("Failed to map file:: " << file_path);
       return false;
     }
 

@@ -479,10 +479,12 @@ struct GGUFFile {
   double model_bits_per_weight;
 };
 
-inline GGUFFile ParseGgufFile(const std::string& path) {
+inline std::optional<GGUFFile> ParseGgufFile(const std::string& path) {
   GGUFFile gf;
   GGUFHelper h;
-  h.OpenAndMMap(path);
+  if (!h.OpenAndMMap(path)) {
+    return std::nullopt;
+  }
 
   GGUFMagic magic = h.Read<GGUFMagic>();
   // GGUF_LOG("magic: " << magic);
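ParseGgufFile now reports an unreadable or un-mappable file as std::nullopt instead of returning a default-constructed GGUFFile. A minimal sketch of the resulting call-site pattern, with a trimmed-down placeholder standing in for the real GGUFFile struct:

#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Trimmed-down placeholder; the real hardware::GGUFFile carries the parsed
// header, tensor infos, and size/bits-per-weight statistics.
struct GGUFFile {
  std::vector<std::string> tensor_names;
};

// Same contract as the patched ParseGgufFile: nullopt when the file cannot
// be opened and memory-mapped, a populated value otherwise.
std::optional<GGUFFile> ParseGgufFile(const std::string& path) {
  if (path.empty()) return std::nullopt;  // stands in for OpenAndMMap failing
  return GGUFFile{{"token_embd.weight", "output.weight"}};
}

int main() {
  if (auto gf = ParseGgufFile("model.gguf")) {
    std::cout << "parsed " << gf->tensor_names.size() << " tensors\n";
  } else {
    std::cout << "could not parse GGUF file\n";  // caller decides how to degrade
  }
  return 0;
}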
engine/utils/hardware/gguf/gguf_file_estimate.h (10 changes: 6 additions & 4 deletions)

@@ -62,20 +62,22 @@ inline float GetQuantBit(const std::string& kv_cache_t) {
   return 16.0;
 }
 
-inline Estimation EstimateLLaMACppRun(const std::string& file_path,
-                                      const RunConfig& rc) {
+inline std::optional<Estimation> EstimateLLaMACppRun(
+    const std::string& file_path, const RunConfig& rc) {
   Estimation res;
   // token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit/16 bytes
   //RAM = token_embeddings_size + ((total_ngl-ngl) >=1 ? Output_layer_size + (total_ngl - ngl - 1 ) / (total_ngl-1) * (total_file_size - token_embeddings_size - Output_layer_size) : 0 ) (bytes)
 
   // VRAM = total_file_size - RAM (bytes)
   auto gf = ParseGgufFile(file_path);
+  if (!gf)
+    return std::nullopt;
   int32_t embedding_length = 0;
   int64_t n_vocab = 0;
   int32_t num_block = 0;
   int32_t total_ngl = 0;
   auto file_size = std::filesystem::file_size(file_path);
-  for (auto const& kv : gf.header.metadata_kv) {
+  for (auto const& kv : (*gf).header.metadata_kv) {
     if (kv.key.find("embedding_length") != std::string::npos) {
       embedding_length = std::any_cast<uint32_t>(kv.value);
     } else if (kv.key == "tokenizer.ggml.tokens") {

@@ -92,7 +94,7 @@
   int32_t quant_bit_in = 0;
   int32_t quant_bit_out = 0;
 
-  for (auto const& ti : gf.tensor_infos) {
+  for (auto const& ti : (*gf).tensor_infos) {
     if (ti->name == "output.weight") {
       quant_bit_out = GetQuantBit(ti->type);
       // std::cout << ti->type << std::endl;
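Taken together, the patch threads the "file could not be parsed" case through the whole chain: ParseGgufFile returns std::nullopt, EstimateLLaMACppRun forwards it, GetEstimation wraps it in a successful cpp::result, and the controller simply omits the recommendation field. A compact sketch of that propagation, using simplified placeholder types in place of the real ones:

#include <iostream>
#include <optional>
#include <string>

struct GGUFFile { int n_tensors = 0; };      // placeholder for the real struct
struct Estimation { int cpu_ram_MiB = 0; };  // placeholder for hardware::Estimation

std::optional<GGUFFile> ParseGgufFile(const std::string& path) {
  if (path.empty()) return std::nullopt;     // stands in for an mmap failure
  return GGUFFile{291};
}

// Forwards the parser's nullopt instead of estimating from a default object.
std::optional<Estimation> EstimateLLaMACppRun(const std::string& path) {
  auto gf = ParseGgufFile(path);
  if (!gf)
    return std::nullopt;
  return Estimation{(*gf).n_tensors * 4};    // dummy arithmetic for the sketch
}

int main() {
  for (const std::string path : {"model.gguf", ""}) {
    auto es = EstimateLLaMACppRun(path);
    if (!!es) {
      std::cout << path << ": ~" << (*es).cpu_ram_MiB << " MiB RAM\n";
    } else {
      std::cout << "(missing file): no estimation, no recommendation field\n";
    }
  }
  return 0;
}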