From 2659789b53826ba9988c4400035bf855b78266f3 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Mon, 11 Nov 2024 16:33:32 +0700 Subject: [PATCH 1/8] feat: model estimation --- engine/cli/commands/model_list_cmd.cc | 18 + engine/controllers/models.cc | 4 + engine/main.cc | 10 + engine/services/model_service.cc | 63 +- engine/services/model_service.h | 4 + engine/utils/hardware/gguf/ggml.h | 235 ++++++++ engine/utils/hardware/gguf/gguf_file.h | 539 ++++++++++++++++++ .../utils/hardware/gguf/gguf_file_estimate.h | 172 ++++++ 8 files changed, 1028 insertions(+), 17 deletions(-) create mode 100644 engine/utils/hardware/gguf/ggml.h create mode 100644 engine/utils/hardware/gguf/gguf_file.h create mode 100644 engine/utils/hardware/gguf/gguf_file_estimate.h diff --git a/engine/cli/commands/model_list_cmd.cc b/engine/cli/commands/model_list_cmd.cc index 41fe61d1c..75abea0e8 100644 --- a/engine/cli/commands/model_list_cmd.cc +++ b/engine/cli/commands/model_list_cmd.cc @@ -38,6 +38,9 @@ void ModelListCmd::Exec(const std::string& host, int port, column_headers.push_back("Version"); } + column_headers.push_back("CPU Mode"); + column_headers.push_back("GPU Mode"); + Row_t header{column_headers.begin(), column_headers.end()}; table.add_row(header); table.format().font_color(Color::green); @@ -70,6 +73,21 @@ void ModelListCmd::Exec(const std::string& host, int port, row.push_back(v["version"].asString()); } + if(auto& r = v["recommendation"]; !r.isNull()) { + if(!r["cpu_mode"].isNull()) { + row.push_back("RAM: " + r["cpu_mode"]["ram"].asString() + " MiB"); + } + + if(!r["gpu_mode"].isNull()) { + std::string s; + s += "ngl: " + r["gpu_mode"][0]["ngl"].asString() + " - "; + s += "context length: " + r["gpu_mode"][0]["context_length"].asString() + " - "; + s += "RAM: " + r["gpu_mode"][0]["ram"].asString() + " MiB - "; + s += "VRAM: " + r["gpu_mode"][0]["vram"].asString() + " MiB"; + row.push_back(s); + } + } + table.add_row({row.begin(), row.end()}); } } diff --git a/engine/controllers/models.cc b/engine/controllers/models.cc index 888983d7c..30e8eaad6 100644 --- a/engine/controllers/models.cc +++ b/engine/controllers/models.cc @@ -171,6 +171,10 @@ void Models::ListModel( Json::Value obj = model_config.ToJson(); obj["id"] = model_entry.model; obj["model"] = model_entry.model; + auto es = model_service_->GetEstimation(model_entry.model); + if (es.has_value()) { + obj["recommendation"] = hardware::ToJson(es.value()); + } data.append(std::move(obj)); yaml_handler.Reset(); } catch (const std::exception& e) { diff --git a/engine/main.cc b/engine/main.cc index 8eab545b9..e0ef37a27 100644 --- a/engine/main.cc +++ b/engine/main.cc @@ -18,6 +18,7 @@ #include "utils/event_processor.h" #include "utils/file_logger.h" #include "utils/file_manager_utils.h" +#include "utils/hardware/gguf/gguf_file_estimate.h" #include "utils/logging_utils.h" #include "utils/system_info_utils.h" @@ -197,6 +198,15 @@ void RunServer(std::optional port, bool ignore_cout) { } int main(int argc, char* argv[]) { + // hardware::RunConfig rc = {.ngl = 33, + // .ctx_len = 2048, + // .n_batch = 2048, + // .n_ubatch = 2048, + // .kv_cache_type = "f16"}; + // auto res = hardware::EstimateLLaMACppRun( + // "C:\\Users\\vansa\\cortexcpp-nightly\\models\\cortex.so\\llama3.1\\8b-gguf\\model.gguf", + // rc); + // return 0; // Stop the program if the system is not supported auto system_info = system_info_utils::GetSystemInfo(); if (system_info->arch == system_info_utils::kUnsupported || diff --git a/engine/services/model_service.cc 
b/engine/services/model_service.cc index 80d02f4c7..02454efb7 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -335,6 +335,41 @@ cpp::result ModelService::HandleDownloadUrlAsync( return download_service_->AddTask(downloadTask, on_finished); } +cpp::result ModelService::GetEstimation( + const std::string& model_handle) { + namespace fs = std::filesystem; + namespace fmu = file_manager_utils; + cortex::db::Models modellist_handler; + config::YamlHandler yaml_handler; + + try { + auto model_entry = modellist_handler.GetModelInfo(model_handle); + if (model_entry.has_error()) { + CTL_WRN("Error: " + model_entry.error()); + return cpp::fail(model_entry.error()); + } + auto file_path = fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.value().path_to_model_yaml)) + .parent_path() / + "model.gguf"; + yaml_handler.ModelConfigFromFile( + fmu::ToAbsoluteCortexDataPath( + fs::path(model_entry.value().path_to_model_yaml)) + .string()); + auto mc = yaml_handler.GetModelConfig(); + + return hardware::EstimateLLaMACppRun(file_path.string(), + {.ngl = mc.ngl, + .ctx_len = mc.ctx_len, + .n_batch = 2048, + .n_ubatch = 2048, + .kv_cache_type = "f16"}); + } catch (const std::exception& e) { + return cpp::fail("Fail to get model status with ID '" + model_handle + + "': " + e.what()); + } +} + cpp::result ModelService::HandleUrl( const std::string& url) { auto url_obj = url_parser::FromUrlString(url); @@ -771,27 +806,21 @@ cpp::result ModelService::StartModel( auto const& mp = json_data["model_path"].asString(); auto ngl = json_data["ngl"].asInt(); - // Bypass for now - auto vram_needed_MiB = 0u; - auto ram_needed_MiB = 0u; - - if (vram_needed_MiB > free_vram_MiB && is_cuda) { - CTL_WRN("Not enough VRAM - " << "required: " << vram_needed_MiB + hardware::RunConfig rc = {.ngl = ngl, + .ctx_len = json_data["ctx_len"].asInt(), + .n_batch = 2048, + .n_ubatch = 2048, + .kv_cache_type = "f16"}; + auto es = hardware::EstimateLLaMACppRun(mp, rc); + + if (es.gpu_mode.vram_MiB > free_vram_MiB && is_cuda) { + CTL_WRN("Not enough VRAM - " << "required: " << es.gpu_mode.vram_MiB << ", available: " << free_vram_MiB); - - return cpp::fail( - "Not enough VRAM - required: " + std::to_string(vram_needed_MiB) + - " MiB, available: " + std::to_string(free_vram_MiB) + - " MiB - Should adjust ngl to " + - std::to_string(free_vram_MiB / (vram_needed_MiB / ngl) - 1)); } - if (ram_needed_MiB > free_ram_MiB) { - CTL_WRN("Not enough RAM - " << "required: " << ram_needed_MiB + if (es.cpu_mode.ram_MiB > free_ram_MiB) { + CTL_WRN("Not enough RAM - " << "required: " << es.cpu_mode.ram_MiB << ", available: " << free_ram_MiB); - return cpp::fail( - "Not enough RAM - required: " + std::to_string(ram_needed_MiB) + - " MiB,, available: " + std::to_string(free_ram_MiB) + " MiB"); } assert(!!inference_svc_); diff --git a/engine/services/model_service.h b/engine/services/model_service.h index a29f092d4..323fdd588 100644 --- a/engine/services/model_service.h +++ b/engine/services/model_service.h @@ -8,6 +8,7 @@ #include "database/models.h" #include "services/download_service.h" #include "services/inference_service.h" +#include "utils/hardware/gguf/gguf_file_estimate.h" struct ModelPullInfo { std::string id; @@ -97,6 +98,9 @@ class ModelService { bool HasModel(const std::string& id) const; + cpp::result GetEstimation( + const std::string& model_handle); + private: /** * Handle downloading model which have following pattern: author/model_name diff --git a/engine/utils/hardware/gguf/ggml.h 
b/engine/utils/hardware/gguf/ggml.h new file mode 100644 index 000000000..d04cf4146 --- /dev/null +++ b/engine/utils/hardware/gguf/ggml.h @@ -0,0 +1,235 @@ +#pragma once +#include +#include +#include +#include +#include "utils/result.hpp" + +namespace hardware { +enum GGMLType { + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, + // GGML_TYPE_Q4_2 = 4, support has been removed + // GGML_TYPE_Q4_3 = 5, support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, + GGML_TYPE_IQ2_XXS = 16, + GGML_TYPE_IQ2_XS = 17, + GGML_TYPE_IQ3_XXS = 18, + GGML_TYPE_IQ1_S = 19, + GGML_TYPE_IQ4_NL = 20, + GGML_TYPE_IQ3_S = 21, + GGML_TYPE_IQ2_S = 22, + GGML_TYPE_IQ4_XS = 23, + GGML_TYPE_I8 = 24, + GGML_TYPE_I16 = 25, + GGML_TYPE_I32 = 26, + GGML_TYPE_I64 = 27, + GGML_TYPE_F64 = 28, + GGML_TYPE_IQ1_M = 29, + GGML_TYPE_BF16 = 30, + GGML_TYPE_Q4_0_4_4 = 31, + GGML_TYPE_Q4_0_4_8 = 32, + GGML_TYPE_Q4_0_8_8 = 33, + GGML_TYPE_TQ1_0 = 34, + GGML_TYPE_TQ2_0 = 35, + GGML_TYPE_COUNT, +}; + +inline float GetQuantBit(GGMLType gt) { + switch (gt) { + case GGML_TYPE_F32: + return 32.0; + case GGML_TYPE_F16: + return 16.0; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q2_K: + return 2.5625; + case GGML_TYPE_Q3_K: + return 3.4375; + case GGML_TYPE_Q4_K: + return 4.5; + case GGML_TYPE_Q5_K: + return 5.5; + case GGML_TYPE_Q6_K: + return 6.5625; + case GGML_TYPE_Q8_K: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_I8: + case GGML_TYPE_I16: + case GGML_TYPE_I32: + case GGML_TYPE_I64: + case GGML_TYPE_F64: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_BF16: + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: + + default: + return 8.0; + } +} + +inline std::string to_string(GGMLType t) { + switch (t) { + case GGML_TYPE_F32: + return "F32"; + case GGML_TYPE_F16: + return "F16"; + case GGML_TYPE_Q4_0: + return "Q4_0"; + case GGML_TYPE_Q4_1: + return "Q4_1"; + case GGML_TYPE_Q5_0: + return "Q5_0"; + case GGML_TYPE_Q5_1: + return "Q5_1"; + case GGML_TYPE_Q8_0: + return "Q8_0"; + case GGML_TYPE_Q8_1: + return "Q8_1"; + case GGML_TYPE_Q2_K: + return "Q2_K"; + case GGML_TYPE_Q3_K: + return "Q3_K"; + case GGML_TYPE_Q4_K: + return "Q4_K"; + case GGML_TYPE_Q5_K: + return "Q5_K"; + case GGML_TYPE_Q6_K: + return "Q6_K"; + case GGML_TYPE_Q8_K: + return "Q8_K"; + case GGML_TYPE_IQ2_XXS: + return "IQ2_XXS"; + case GGML_TYPE_IQ2_XS: + return "IQ2_XS"; + case GGML_TYPE_IQ3_XXS: + return "IQ3_XXS"; + case GGML_TYPE_IQ1_S: + return "IQ1_S"; + case GGML_TYPE_IQ4_NL: + return "IQ4_NL"; + case GGML_TYPE_IQ3_S: + return "IQ3_S"; + case GGML_TYPE_IQ2_S: + return "IQ2_S"; + case GGML_TYPE_IQ4_XS: + return "IQ4_XS"; + case GGML_TYPE_I8: + return "I8"; + case GGML_TYPE_I16: + return "I16"; + case GGML_TYPE_I32: + return "I32"; + case GGML_TYPE_I64: + return "I64"; + case GGML_TYPE_F64: + return "F64"; + case GGML_TYPE_IQ1_M: + return "IQ1_M"; + case GGML_TYPE_BF16: + return "BF16"; + case GGML_TYPE_Q4_0_4_4: + return "Q4_0_4_4"; + case GGML_TYPE_Q4_0_4_8: + return "Q4_0_4_8"; + case 
GGML_TYPE_Q4_0_8_8: + return "Q4_0_8_8"; + case GGML_TYPE_TQ1_0: + return "TQ1_0"; + case GGML_TYPE_TQ2_0: + return "TQ2_0"; + default: + return "Invalid"; + } +} + +struct GGMLTypeTrait { + uint64_t block_size; + uint64_t type_size; + bool is_quantized; +}; + +const std::unordered_map kGGMLTypeTraits = { + {GGML_TYPE_F32, {.block_size = 1, .type_size = 4}}, + {GGML_TYPE_F16, {.block_size = 1, .type_size = 2}}, + {GGML_TYPE_Q4_0, {.block_size = 32, .type_size = 18, .is_quantized = true}}, + {GGML_TYPE_Q4_1, {.block_size = 32, .type_size = 20, .is_quantized = true}}, + {GGML_TYPE_Q5_0, {.block_size = 32, .type_size = 22, .is_quantized = true}}, + {GGML_TYPE_Q5_1, {.block_size = 32, .type_size = 24, .is_quantized = true}}, + {GGML_TYPE_Q8_0, {.block_size = 32, .type_size = 34, .is_quantized = true}}, + {GGML_TYPE_Q8_1, {.block_size = 32, .type_size = 36, .is_quantized = true}}, + {GGML_TYPE_Q2_K, + {.block_size = 256, .type_size = 84, .is_quantized = true}}, + {GGML_TYPE_Q3_K, + {.block_size = 256, .type_size = 110, .is_quantized = true}}, + {GGML_TYPE_Q4_K, + {.block_size = 256, .type_size = 144, .is_quantized = true}}, + {GGML_TYPE_Q5_K, + {.block_size = 256, .type_size = 176, .is_quantized = true}}, + {GGML_TYPE_Q6_K, + {.block_size = 256, .type_size = 210, .is_quantized = true}}, + {GGML_TYPE_Q8_K, + {.block_size = 256, .type_size = 292, .is_quantized = true}}, + {GGML_TYPE_IQ2_XXS, + {.block_size = 256, .type_size = 66, .is_quantized = true}}, + {GGML_TYPE_IQ2_XS, + {.block_size = 256, .type_size = 74, .is_quantized = true}}, + {GGML_TYPE_IQ3_XXS, + {.block_size = 256, .type_size = 98, .is_quantized = true}}, + {GGML_TYPE_IQ1_S, + {.block_size = 256, .type_size = 50, .is_quantized = true}}, + {GGML_TYPE_IQ4_NL, + {.block_size = 32, .type_size = 18, .is_quantized = true}}, + {GGML_TYPE_IQ3_S, + {.block_size = 256, .type_size = 110, .is_quantized = true}}, + {GGML_TYPE_IQ2_S, + {.block_size = 256, .type_size = 82, .is_quantized = true}}, + {GGML_TYPE_IQ4_XS, + {.block_size = 256, .type_size = 136, .is_quantized = true}}, + {GGML_TYPE_I8, {.block_size = 1, .type_size = 1}}, + {GGML_TYPE_I16, {.block_size = 1, .type_size = 2}}, + {GGML_TYPE_I32, {.block_size = 1, .type_size = 4}}, + {GGML_TYPE_I64, {.block_size = 1, .type_size = 8}}, + {GGML_TYPE_F64, {.block_size = 1, .type_size = 8}}, + {GGML_TYPE_IQ1_M, + {.block_size = 256, .type_size = 56, .is_quantized = true}}, + {GGML_TYPE_BF16, {.block_size = 1, .type_size = 2}}, + {GGML_TYPE_Q4_0_4_4, + {.block_size = 32, .type_size = 18, .is_quantized = true}}, + {GGML_TYPE_Q4_0_4_8, + {.block_size = 32, .type_size = 18, .is_quantized = true}}, + {GGML_TYPE_Q4_0_8_8, + {.block_size = 32, .type_size = 18, .is_quantized = true}}, + {GGML_TYPE_TQ1_0, + {.block_size = 256, .type_size = 54, .is_quantized = true}}, + {GGML_TYPE_TQ2_0, + {.block_size = 256, .type_size = 66, .is_quantized = true}}, +}; +} // namespace hardware \ No newline at end of file diff --git a/engine/utils/hardware/gguf/gguf_file.h b/engine/utils/hardware/gguf/gguf_file.h new file mode 100644 index 000000000..9980247f0 --- /dev/null +++ b/engine/utils/hardware/gguf/gguf_file.h @@ -0,0 +1,539 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#include +#include +#else +#include // For memory-mapped file +#include // For file descriptors +#endif + +#include "ggml.h" +#include "utils/string_utils.h" + +// #define GGUF_LOG(msg) \ +// do { \ +// std::cout << __FILE__ << 
"(@" << __LINE__ << "): " << msg << '\n'; \ +// } while (false) + +#define GGUF_LOG(msg) +namespace hardware { +#undef min +#undef max + +using GGUFMagic = uint32_t; +constexpr const GGUFMagic kGGUFMagicGGML = 0x67676d6c; +constexpr const GGUFMagic kGGUFMagicGGMF = 0x67676d66; +constexpr const GGUFMagic kGGUFMagicGGJT = 0x67676a74; +constexpr const GGUFMagic kGGUFMagicGGUFLe = 0x46554747; // GGUF +constexpr const GGUFMagic kGGUFMagicGGUFBe = 0x47475546; // GGUF + +using GGUFVersion = uint32_t; +constexpr const GGUFVersion kGGUFVersionV1 = 1; +constexpr const GGUFVersion kGGUFVersionV2 = 2; +constexpr const GGUFVersion kGGUFVersionV3 = 3; + +enum GGUFMetadataValueType : uint32_t { + GGUFMetadataValueTypeUint8 = 0, + GGUFMetadataValueTypeInt8, + GGUFMetadataValueTypeUint16, + GGUFMetadataValueTypeInt16, + GGUFMetadataValueTypeUint32, + GGUFMetadataValueTypeInt32, + GGUFMetadataValueTypeFloat32, + GGUFMetadataValueTypeBool, + GGUFMetadataValueTypeString, + GGUFMetadataValueTypeArray, + GGUFMetadataValueTypeUint64, + GGUFMetadataValueTypeInt64, + GGUFMetadataValueTypeFloat64, + _GGUFMetadataValueTypeCount // Unknown +}; + +struct GGUFMetadataKV { + // Key is the key of the metadata key-value pair, + // which is no larger than 64 bytes long. + std::string key; // Using std::string for dynamic string handling + + // ValueType is the type of the metadata value. + GGUFMetadataValueType value_type; // Enum to represent value types + + // Value is the value of the metadata key-value pair. + std::any value; +}; + +struct GGUFMetadataKVArrayValue { + /* Basic */ + + // type is the type of the array item. + GGUFMetadataValueType type; // Enum to represent value types + + // Len is the length of the array. + uint64_t len; // Using uint64_t for length + + // Array holds all array items. + std::vector arr; + /* Appendix */ + + // start_offset is the offset in bytes of the GGUFMetadataKVArrayValue in the GGUFFile file. + int64_t start_offset; // Using int64_t for offset + + // Size is the size of the array in bytes. 
+  int64_t size;  // Using int64_t for size
+};
+
+inline std::string to_string(GGUFMetadataValueType vt, const std::any& v) {
+  switch (vt) {
+    case GGUFMetadataValueTypeUint8:
+      return std::to_string(std::any_cast<uint8_t>(v));
+    case GGUFMetadataValueTypeInt8:
+      return std::to_string(std::any_cast<int8_t>(v));
+    case GGUFMetadataValueTypeUint16:
+      return std::to_string(std::any_cast<uint16_t>(v));
+    case GGUFMetadataValueTypeInt16:
+      return std::to_string(std::any_cast<int16_t>(v));
+    case GGUFMetadataValueTypeUint32:
+      return std::to_string(std::any_cast<uint32_t>(v));
+    case GGUFMetadataValueTypeInt32:
+      return std::to_string(std::any_cast<int32_t>(v));
+    case GGUFMetadataValueTypeFloat32:
+      return std::to_string(std::any_cast<float>(v));
+    case GGUFMetadataValueTypeBool:
+      return std::to_string(std::any_cast<bool>(v));
+    case GGUFMetadataValueTypeString:
+      return std::any_cast<std::string>(v);
+    case GGUFMetadataValueTypeUint64:
+      return std::to_string(std::any_cast<uint64_t>(v));
+    case GGUFMetadataValueTypeInt64:
+      return std::to_string(std::any_cast<int64_t>(v));
+    case GGUFMetadataValueTypeFloat64:
+      return std::to_string(std::any_cast<double>(v));
+    default:
+      break;
+  }
+  return "array";
+}
+
+inline std::string to_string(const GGUFMetadataKVArrayValue& arr_v) {
+  std::string res;
+  auto num = std::min(size_t(5), arr_v.arr.size());
+  for (size_t i = 0; i < num; i++) {
+    res += to_string(arr_v.type, arr_v.arr[i]) + " ";
+  }
+  return res;
+}
+
+inline std::string to_string(const GGUFMetadataKV& kv) {
+  switch (kv.value_type) {
+    case GGUFMetadataValueTypeUint8:
+      return std::to_string(std::any_cast<uint8_t>(kv.value));
+    case GGUFMetadataValueTypeInt8:
+      return std::to_string(std::any_cast<int8_t>(kv.value));
+    case GGUFMetadataValueTypeUint16:
+      return std::to_string(std::any_cast<uint16_t>(kv.value));
+    case GGUFMetadataValueTypeInt16:
+      return std::to_string(std::any_cast<int16_t>(kv.value));
+    case GGUFMetadataValueTypeUint32:
+      return std::to_string(std::any_cast<uint32_t>(kv.value));
+    case GGUFMetadataValueTypeInt32:
+      return std::to_string(std::any_cast<int32_t>(kv.value));
+    case GGUFMetadataValueTypeFloat32:
+      return std::to_string(std::any_cast<float>(kv.value));
+    case GGUFMetadataValueTypeBool:
+      return std::to_string(std::any_cast<bool>(kv.value));
+    case GGUFMetadataValueTypeString:
+      return std::any_cast<std::string>(kv.value);
+    case GGUFMetadataValueTypeUint64:
+      return std::to_string(std::any_cast<uint64_t>(kv.value));
+    case GGUFMetadataValueTypeInt64:
+      return std::to_string(std::any_cast<int64_t>(kv.value));
+    case GGUFMetadataValueTypeFloat64:
+      return std::to_string(std::any_cast<double>(kv.value));
+    case GGUFMetadataValueTypeArray:
+      return to_string(std::any_cast<GGUFMetadataKVArrayValue>(kv.value));
+    default:
+      break;
+  }
+  return "Invalid type ";
+}
+
+struct GGUFTensorInfo {
+  /* Basic */
+  std::string name;
+
+  // NDimensions is the number of dimensions of the tensor.
+  uint32_t n_dimensions;
+  // Dimensions is the dimensions of the tensor,
+  // the length is NDimensions.
+  std::vector<uint64_t> dimensions;
+  // type is the type of the tensor.
+  GGMLType type;
+  // Offset is the offset in bytes of the tensor's data in this file.
+  //
+  // The offset is relative to tensor data, not to the start of the file.
+  uint64_t offset;
+
+  /* Appendix */
+
+  // StartOffset is the offset in bytes of the GGUFTensorInfo in the GGUFFile file.
+  //
+  // The offset is the start of the file.
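// Example entries (hypothetical shapes and types) for the two tensors that
// gguf_file_estimate.h later looks up by name to pick quant_bit_in and
// quant_bit_out via GetQuantBit():
//   { name = "token_embd.weight", n_dimensions = 2, dimensions = {4096, 128256}, type = GGML_TYPE_Q4_K }
//   { name = "output.weight",     n_dimensions = 2, dimensions = {4096, 128256}, type = GGML_TYPE_Q6_K }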
+ int64_t start_offset; +}; + +struct GGUFHelper { + uint8_t* data; + uint8_t* d_close; + uint64_t file_size; + + bool OpenAndMMap(const std::string& file_path) { +#ifdef _WIN32 + HANDLE file_handle = INVALID_HANDLE_VALUE; + HANDLE file_mapping = nullptr; + file_handle = + CreateFileA(file_path.c_str(), GENERIC_READ, FILE_SHARE_READ, nullptr, + OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); + if (file_handle == INVALID_HANDLE_VALUE) { + std::cout << "Failed to open file" << std::endl; + return false; + } + // Get the file size + LARGE_INTEGER file_size_struct; + if (!GetFileSizeEx(file_handle, &file_size_struct)) { + CloseHandle(file_handle); + std::cout << "Failed to open file" << std::endl; + return false; + } + file_size = static_cast(file_size_struct.QuadPart); + + // Create a file mapping object + file_mapping = + CreateFileMappingA(file_handle, nullptr, PAGE_READONLY, 0, 0, nullptr); + if (file_mapping == nullptr) { + CloseHandle(file_handle); + std::cout << "Failed to create file mapping" << std::endl; + return false; + } + + // Map the file into memory + data = static_cast( + MapViewOfFile(file_mapping, FILE_MAP_READ, 0, 0, file_size)); + if (data == nullptr) { + CloseHandle(file_mapping); + CloseHandle(file_handle); + std::cout << "Failed to map file" << std::endl; + return false; + } + + // Close the file handle, as it is no longer needed after mapping + CloseHandle(file_handle); + d_close = data; +#else + file_size = std::filesystem::file_size(file_path); + + int fd = open(file_path.c_str(), O_RDONLY); + // Memory-map the file + data = static_cast( + mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0)); + if (data == MAP_FAILED) { + perror("Error mapping file"); + close(fd); + return false; + } + + close(fd); + d_close = data; +#endif + return true; + } + + ~GGUFHelper() { Close(); } + + void Close() { +#ifdef _WIN32 + if (d_close != nullptr) { + UnmapViewOfFile(d_close); + d_close = nullptr; + } +#else + if (d_close != nullptr && d_close != MAP_FAILED) { + munmap(d_close, file_size); + d_close = nullptr; + } +#endif + } + + template + T Read() { + static_assert(std::is_floating_point::value || + std::is_integral::value || std::is_same::value); + T res = *reinterpret_cast(data); + data += sizeof(T); + return res; + } + + std::string ReadString() { + auto l = Read(); + std::string res(reinterpret_cast(data), l); + auto r = res; + data += l; + return r; + } + + GGUFMetadataKVArrayValue ReadArray() { + GGUFMetadataKVArrayValue v; + v.start_offset = (data - d_close); + v.type = static_cast(Read()); + auto arr_length = Read(); + for (uint64_t i = 0; i < arr_length; ++i) { + switch (v.type) { + case GGUFMetadataValueTypeUint8: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeInt8: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeUint16: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeInt16: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeUint32: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeInt32: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeFloat32: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeBool: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeString: + v.arr.push_back(ReadString()); + break; + case GGUFMetadataValueTypeUint64: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeInt64: + v.arr.push_back(Read()); + break; + case GGUFMetadataValueTypeFloat64: + v.arr.push_back(Read()); + break; + default: + std::cout << "Invalid type: 
" << std::to_string(v.type); + } + } + v.size = data - v.start_offset - d_close - 4 - 8; + return v; + } + + std::any ReadValue(GGUFMetadataValueType vt) { + switch (vt) { + case GGUFMetadataValueTypeUint8: + return Read(); + case GGUFMetadataValueTypeInt8: + return Read(); + case GGUFMetadataValueTypeUint16: + return Read(); + case GGUFMetadataValueTypeInt16: + return Read(); + case GGUFMetadataValueTypeUint32: + return Read(); + case GGUFMetadataValueTypeInt32: + return Read(); + case GGUFMetadataValueTypeFloat32: + return Read(); + case GGUFMetadataValueTypeBool: + return Read(); + case GGUFMetadataValueTypeString: + return ReadString(); + case GGUFMetadataValueTypeArray: + return ReadArray(); + case GGUFMetadataValueTypeUint64: + return Read(); + case GGUFMetadataValueTypeInt64: + return Read(); + case GGUFMetadataValueTypeFloat64: + return Read(); + default: + std::cout << "Invalid type: " << vt; + return {}; + } + } + + GGUFMetadataKV ReadMetadataKV() { + GGUFMetadataKV kv; + kv.key = ReadString(); + auto vt = Read(); + kv.value_type = GGUFMetadataValueType(vt); + kv.value = ReadValue(kv.value_type); + return kv; + } + + std::shared_ptr ReadTensorInfo() { + auto ti = std::make_shared(); + ti->start_offset = data - d_close; + ti->name = ReadString(); + ti->n_dimensions = Read(); + ti->dimensions.resize(ti->n_dimensions); + for (size_t i = 0; i < ti->n_dimensions; i++) { + ti->dimensions[i] = Read(); + } + auto v = Read(); + ti->type = GGMLType(v); + ti->offset = Read(); + return ti; + } +}; + +constexpr const auto ErrGGUFFileInvalidFormat = "invalid GGUF format"; + +struct GGUFHeader { + // Magic is a magic number that announces that this is a GGUF file. + GGUFMagic magic; + // Version is a version of the GGUF file format. + GGUFVersion version; + // TensorCount is the number of tensors in the file. + uint64_t tensor_count; + // MetadataKVCount is the number of key-value pairs in the metadata. + uint64_t metadata_kv_count; + // MetadataKV are the key-value pairs in the metadata, + std::vector metadata_kv; + + std::pair Get(const std::string& name) { + for (auto const& kv : metadata_kv) { + if (kv.key == name) { + return std::pair(kv, true); + } + } + return std::pair(GGUFMetadataKV{}, false); + } +}; + +struct GGUFFile { + /* Basic */ + + // header is the header of the GGUF file. + GGUFHeader header; + // tensor_infos are the tensor infos of the GGUF file, + // the size of TensorInfos is equal to `Header.TensorCount`. + std::vector> tensor_infos; + + // padding is the padding size of the GGUF file, + // which is used to split Header and TensorInfos from tensor data. + int64_t padding; + // split_paddings holds the padding size slice of the GGUF file splits, + // each item represents splitting Header and TensorInfos from tensor data. + // + // The length of split_paddings is the number of split files. + std::vector split_paddings; + // tensor_data_start_offset is the offset in bytes of the tensor data in this file. + // + // The offset is the start of the file. + int64_t tensor_data_start_offset; + // split_tensor_data_start_offsets holds the offset slice in bytes of the tensor data of the GGUF file splits, + // each item represents the offset of the tensor data in the split file. + // + // The length of split_tensor_data_start_offsets is the number of split files. + std::vector split_tensor_data_start_offsets; + + /* Appendix */ + + // size is the size of the GGUF file, + // if the file is split, the size is the sum of all split files. 
+ uint64_t size; + // split_sizes holds the size slice of the GGUF file splits, + // each item represents the size of the split file. + // + // The length of split_sizes is the number of split files. + std::vector split_sizes; + // model_size is the size of the model when loading. + uint64_t model_size; + // split_model_sizes holds the size slice of the model, + // each item represents a size when loading of the split file. + // + // The length of split_model_sizes is the number of split files. + std::vector split_model_sizes; + + // model_parameters is the number of the model parameters. + uint64_t model_parameters; + // model_bits_per_weight is the bits per weight of the model, + // which describes how many bits are used to store a weight, + // higher is better. + double model_bits_per_weight; +}; + +inline GGUFFile ParseGgufFile(const std::string& path) { + GGUFFile gf; + GGUFHelper h; + h.OpenAndMMap(path); + + GGUFMagic magic = h.Read(); + // GGUF_LOG("magic: " << magic); + gf.header.magic = magic; + GGUFVersion version = h.Read(); + auto tensor_count = h.Read(); + // GGUF_LOG("tensor_count: " << tensor_count); + gf.header.tensor_count += tensor_count; + + auto metadata_kv_count = h.Read(); + gf.header.metadata_kv_count += metadata_kv_count; + // GGUF_LOG("metadata_kv_count: " << metadata_kv_count); + + // metadata kv + { + std::vector kvs; + kvs.resize(metadata_kv_count); + for (size_t i = 0; i < metadata_kv_count; i++) { + kvs[i] = h.ReadMetadataKV(); + GGUF_LOG("i: " << i << " " << kvs[i].value_type << " " << kvs[i].key + << ": " << to_string(kvs[i])); + } + for (auto const& kv : kvs) { + if (kv.key == "split.no") { + gf.header.metadata_kv_count--; + continue; + } + gf.header.metadata_kv.push_back(kv); + } + } + + { + std::vector> tis; + tis.resize(tensor_count); + for (size_t i = 0; i < tensor_count; i++) { + tis[i] = h.ReadTensorInfo(); + // auto tto_string = [](const std::vector& ds) -> std::string { + // std::string res = "["; + // for (auto d : ds) + // res += std::to_string(d) + " "; + // return res + "]"; + // }; + // auto ds = tto_string(tis[i]->dimensions); + // GGUF_LOG("i: " << i << " name: " << tis[i]->name + // << " type: " << to_string(tis[i]->type) << " dimensions: " + // << std::to_string(tis[i]->n_dimensions) << " " << ds); + } + gf.tensor_infos = tis; + } + return gf; +} +} // namespace hardware \ No newline at end of file diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h new file mode 100644 index 000000000..31b328608 --- /dev/null +++ b/engine/utils/hardware/gguf/gguf_file_estimate.h @@ -0,0 +1,172 @@ +#pragma once +#include +#include +#include "gguf_file.h" +#include "json/json.h" + +namespace hardware { +inline uint64_t BytesToMiB(uint64_t b) { + return (double)b / 1024 / 1024; +}; +struct RunConfig { + int ngl; + int ctx_len; + int n_batch; + int n_ubatch; + std::string kv_cache_type; +}; + +struct CpuMode { + int64_t ram_MiB; +}; + +struct GpuMode { + int64_t ram_MiB; + int64_t vram_MiB; + int ngl; + int ctx_len; +}; + +struct Estimation { + CpuMode cpu_mode; + GpuMode gpu_mode; +}; + +inline Json::Value ToJson(const Estimation& es) { + Json::Value res; + Json::Value cpu; + cpu["ram"] = es.cpu_mode.ram_MiB; + Json::Value gpus(Json::arrayValue); + Json::Value gpu; + gpu["ram"] = es.gpu_mode.ram_MiB; + gpu["vram"] = es.gpu_mode.vram_MiB; + gpu["ngl"] = es.gpu_mode.ngl; + gpu["context_length"] = es.gpu_mode.ctx_len; + gpus.append(gpu); + res["cpu_mode"] = cpu; + res["gpu_mode"] = gpus; + return res; 
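// Illustrative JSON shape produced by ToJson() (numbers are made up); this is
// the "recommendation" object that controllers/models.cc attaches and that
// cli/commands/model_list_cmd.cc reads back:
//   "recommendation": {
//     "cpu_mode": { "ram": 6100 },
//     "gpu_mode": [ { "ram": 501, "vram": 5600, "ngl": 33, "context_length": 8192 } ]
//   }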
+} + +inline float GetQuantBit(const std::string& kv_cache_t) { + if (kv_cache_t == "f16") { + return 16.0; + } else if (kv_cache_t == "q8_0") { + return 8.0; + } else if (kv_cache_t == "q4_0") { + return 4.5; + } + return 16.0; +} + +inline Estimation EstimateLLaMACppRun(const std::string& file_path, + const RunConfig& rc) { + Estimation res; + // token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit/16 bytes + //RAM = token_embeddings_size + ((total_ngl-ngl) >=1 ? Output_layer_size + (total_ngl - ngl - 1 ) / (total_ngl-1) * (total_file_size - token_embeddings_size - Output_layer_size) : 0 ) (bytes) + + // VRAM = total_file_size - RAM (bytes) + auto gf = ParseGgufFile(file_path); + int32_t embedding_length = 0; + int64_t n_vocab = 0; + int32_t num_block = 0; + int32_t total_ngl = 0; + auto file_size = std::filesystem::file_size(file_path); + for (auto const& kv : gf.header.metadata_kv) { + if (kv.key == "llama.embedding_length") { + embedding_length = std::any_cast(kv.value); + } else if (kv.key == "tokenizer.ggml.tokens") { + n_vocab = std::any_cast(kv.value).arr.size(); + } else if (kv.key == "llama.block_count") { + num_block = std::any_cast(kv.value); + total_ngl = num_block + 1; + } + } + + // token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit_in/16 bytes + int32_t quant_bit_in = 0; + int32_t quant_bit_out = 0; + + for (auto const& ti : gf.tensor_infos) { + if (ti->name == "output.weight") { + quant_bit_out = GetQuantBit(ti->type); + std::cout << ti->type << std::endl; + } else if (ti->name == "token_embd.weight") { + quant_bit_in = GetQuantBit(ti->type); + std::cout << ti->type << std::endl; + } + } + // output.weight + // token_embd.weight + std::cout << "embedding_length: " << embedding_length << std::endl; + std::cout << "n_vocab: " << n_vocab << std::endl; + std::cout << "file_size: " << file_size << std::endl; + // Model weight + int64_t token_embeddings_size = + n_vocab * embedding_length * 2 * quant_bit_in / 16; + int64_t output_layer_size = + n_vocab * embedding_length * 2 * quant_bit_out / 16; + // RAM = token_embeddings_size + ((total_ngl-ngl) >=1 ? 
output_layer_size + (total_ngl - ngl - 1 ) / (total_ngl-1) * (total_file_size - token_embeddings_size - output_layer_size) : 0 ) (bytes) + int64_t offload = 0; + if (total_ngl >= rc.ngl + 1) { + offload = output_layer_size + + (double)(total_ngl - rc.ngl - 1) / (total_ngl - 1) * + (file_size - token_embeddings_size - output_layer_size); + } + + int64_t ram_usage = token_embeddings_size + offload; + int64_t vram_usage = file_size - ram_usage; + std::cout << "token_embeddings_size: " << BytesToMiB(token_embeddings_size) + << std::endl; + std::cout << "output_layer_size: " << BytesToMiB(output_layer_size) + << std::endl; + std::cout << "ram_usage: " << BytesToMiB(ram_usage) << std::endl; + std::cout << "vram_usage: " << BytesToMiB(vram_usage) << std::endl; + + // KV cache + // kv_cache_size = ctx_len/8192 * hidden_dim/4096 * quant_bit/16 * num_block/33 * 1 (GB) + auto hidden_dim = embedding_length; + int kv_quant_bit = + GetQuantBit(rc.kv_cache_type); // f16, 8 bits for q8_0, 4.5 bits for q4_0 + int64_t kv_cache_size = (double)(1024 * 1024 * 1024) * rc.ctx_len / 8192 * + hidden_dim / 4096 * kv_quant_bit / 16 * num_block / + 33; //(bytes) + + std::cout << "kv_cache_size: " << BytesToMiB(kv_cache_size) << std::endl; + + // VRAM = (min(n_batch, n_ubatch))/ 512 * 266 (MiB) + int64_t preprocessing_buffer_size = + (double)std::min(rc.n_batch, rc.n_ubatch) / 512 * 266 * 1024 * 1024 / + 4; //(bytes) + if (total_ngl != rc.ngl) { + preprocessing_buffer_size += output_layer_size; + } + std::cout << "preprocessing_buffer_size: " + << BytesToMiB(preprocessing_buffer_size) << std::endl; + + // CPU mode + { + // Model weight + int64_t model_weight = file_size; + // KV cache + // Buffer + res.cpu_mode.ram_MiB = + BytesToMiB(model_weight + kv_cache_size + preprocessing_buffer_size); + } + // GPU mode + { + res.gpu_mode.ctx_len = rc.ctx_len; + res.gpu_mode.ngl = rc.ngl; + res.gpu_mode.ram_MiB = BytesToMiB(ram_usage); + res.gpu_mode.vram_MiB = + BytesToMiB(vram_usage + kv_cache_size + preprocessing_buffer_size); + } + return res; +} +// CPU_Mapped model buffer size = 35.16 MiB +// CUDA0 model buffer size = 601.02 MiB +// CUDA0 KV buffer size = 88.00 MiB +// CUDA_Host output buffer size = 0.12 MiB +// CUDA0 compute buffer size = 266.00 MiB +// CUDA_Host compute buffer size = 48.02 MiB +} // namespace hardware \ No newline at end of file From e4bcb3ab96ab8ed88846439ea9a65e5f89b7d3d9 Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 22 Nov 2024 09:40:00 +0700 Subject: [PATCH 2/8] fix: cleanup and improve --- engine/cli/command_line_parser.cc | 13 ++++-- engine/cli/command_line_parser.h | 2 + engine/cli/commands/model_list_cmd.cc | 40 ++++++++++++------- engine/cli/commands/model_list_cmd.h | 3 +- engine/main.cc | 9 ----- engine/services/model_service.cc | 13 ++++-- .../utils/hardware/gguf/gguf_file_estimate.h | 15 ++++--- 7 files changed, 58 insertions(+), 37 deletions(-) diff --git a/engine/cli/command_line_parser.cc b/engine/cli/command_line_parser.cc index bb41dbe8b..8b540d1fd 100644 --- a/engine/cli/command_line_parser.cc +++ b/engine/cli/command_line_parser.cc @@ -246,14 +246,19 @@ void CommandLineParser::SetupModelCommands() { "Display engine"); list_models_cmd->add_flag("-v,--version", cml_data_.display_version, "Display version"); + list_models_cmd->add_flag("--cpu_mode", cml_data_.display_cpu_mode, + "Display cpu mode"); + list_models_cmd->add_flag("--gpu_mode", cml_data_.display_gpu_mode, + "Display gpu mode"); list_models_cmd->group(kSubcommands); list_models_cmd->callback([this]() { if 
(std::exchange(executed_, true)) return; - commands::ModelListCmd().Exec(cml_data_.config.apiServerHost, - std::stoi(cml_data_.config.apiServerPort), - cml_data_.filter, cml_data_.display_engine, - cml_data_.display_version); + commands::ModelListCmd().Exec( + cml_data_.config.apiServerHost, + std::stoi(cml_data_.config.apiServerPort), cml_data_.filter, + cml_data_.display_engine, cml_data_.display_version, + cml_data_.display_cpu_mode, cml_data_.display_gpu_mode); }); auto get_models_cmd = diff --git a/engine/cli/command_line_parser.h b/engine/cli/command_line_parser.h index bce83222a..f7ca3f507 100644 --- a/engine/cli/command_line_parser.h +++ b/engine/cli/command_line_parser.h @@ -64,6 +64,8 @@ class CommandLineParser { // for model list bool display_engine = false; bool display_version = false; + bool display_cpu_mode = false; + bool display_gpu_mode = false; std::string filter = ""; std::string log_level = "INFO"; diff --git a/engine/cli/commands/model_list_cmd.cc b/engine/cli/commands/model_list_cmd.cc index 75abea0e8..e598e22d5 100644 --- a/engine/cli/commands/model_list_cmd.cc +++ b/engine/cli/commands/model_list_cmd.cc @@ -19,7 +19,8 @@ using Row_t = void ModelListCmd::Exec(const std::string& host, int port, const std::string& filter, bool display_engine, - bool display_version) { + bool display_version, bool display_cpu_mode, + bool display_gpu_mode) { // Start server if server is not started yet if (!commands::IsServerAlive(host, port)) { CLI_LOG("Starting server ..."); @@ -38,9 +39,12 @@ void ModelListCmd::Exec(const std::string& host, int port, column_headers.push_back("Version"); } - column_headers.push_back("CPU Mode"); - column_headers.push_back("GPU Mode"); - + if (display_cpu_mode) { + column_headers.push_back("CPU Mode"); + } + if (display_gpu_mode) { + column_headers.push_back("GPU Mode"); + } Row_t header{column_headers.begin(), column_headers.end()}; table.add_row(header); table.format().font_color(Color::green); @@ -73,18 +77,26 @@ void ModelListCmd::Exec(const std::string& host, int port, row.push_back(v["version"].asString()); } - if(auto& r = v["recommendation"]; !r.isNull()) { - if(!r["cpu_mode"].isNull()) { - row.push_back("RAM: " + r["cpu_mode"]["ram"].asString() + " MiB"); + if (auto& r = v["recommendation"]; !r.isNull()) { + if (display_cpu_mode) { + if (!r["cpu_mode"].isNull()) { + row.push_back("RAM: " + r["cpu_mode"]["ram"].asString() + + " MiB"); + } } - if(!r["gpu_mode"].isNull()) { - std::string s; - s += "ngl: " + r["gpu_mode"][0]["ngl"].asString() + " - "; - s += "context length: " + r["gpu_mode"][0]["context_length"].asString() + " - "; - s += "RAM: " + r["gpu_mode"][0]["ram"].asString() + " MiB - "; - s += "VRAM: " + r["gpu_mode"][0]["vram"].asString() + " MiB"; - row.push_back(s); + if (display_gpu_mode) { + if (!r["gpu_mode"].isNull()) { + std::string s; + s += "ngl: " + r["gpu_mode"][0]["ngl"].asString() + " - "; + s += "context: " + + r["gpu_mode"][0]["context_length"].asString() + " - "; + s += "RAM: " + r["gpu_mode"][0]["ram"].asString() + " MiB - "; + s += "VRAM: " + r["gpu_mode"][0]["vram"].asString() + " MiB - "; + s += "recommend ngl: " + + r["gpu_mode"][0]["recommend_ngl"].asString(); + row.push_back(s); + } } } diff --git a/engine/cli/commands/model_list_cmd.h b/engine/cli/commands/model_list_cmd.h index 2e7c446e7..791c1ecf6 100644 --- a/engine/cli/commands/model_list_cmd.h +++ b/engine/cli/commands/model_list_cmd.h @@ -7,6 +7,7 @@ namespace commands { class ModelListCmd { public: void Exec(const std::string& host, int port, const 
std::string& filter, - bool display_engine = false, bool display_version = false); + bool display_engine = false, bool display_version = false, + bool display_cpu_mode = false, bool display_gpu_mode = false); }; } // namespace commands diff --git a/engine/main.cc b/engine/main.cc index e0ef37a27..095461333 100644 --- a/engine/main.cc +++ b/engine/main.cc @@ -198,15 +198,6 @@ void RunServer(std::optional port, bool ignore_cout) { } int main(int argc, char* argv[]) { - // hardware::RunConfig rc = {.ngl = 33, - // .ctx_len = 2048, - // .n_batch = 2048, - // .n_ubatch = 2048, - // .kv_cache_type = "f16"}; - // auto res = hardware::EstimateLLaMACppRun( - // "C:\\Users\\vansa\\cortexcpp-nightly\\models\\cortex.so\\llama3.1\\8b-gguf\\model.gguf", - // rc); - // return 0; // Stop the program if the system is not supported auto system_info = system_info_utils::GetSystemInfo(); if (system_info->arch == system_info_utils::kUnsupported || diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 02454efb7..61bbc2447 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -357,13 +357,19 @@ cpp::result ModelService::GetEstimation( fs::path(model_entry.value().path_to_model_yaml)) .string()); auto mc = yaml_handler.GetModelConfig(); - + services::HardwareService hw_svc; + auto hw_info = hw_svc.GetHardwareInfo(); + auto free_vram_MiB = 0u; + for (const auto& gpu : hw_info.gpus) { + free_vram_MiB += gpu.free_vram; + } return hardware::EstimateLLaMACppRun(file_path.string(), {.ngl = mc.ngl, .ctx_len = mc.ctx_len, .n_batch = 2048, .n_ubatch = 2048, - .kv_cache_type = "f16"}); + .kv_cache_type = "f16", + .free_vram_MiB = free_vram_MiB}); } catch (const std::exception& e) { return cpp::fail("Fail to get model status with ID '" + model_handle + "': " + e.what()); @@ -810,7 +816,8 @@ cpp::result ModelService::StartModel( .ctx_len = json_data["ctx_len"].asInt(), .n_batch = 2048, .n_ubatch = 2048, - .kv_cache_type = "f16"}; + .kv_cache_type = "f16", + .free_vram_MiB = free_vram_MiB}; auto es = hardware::EstimateLLaMACppRun(mp, rc); if (es.gpu_mode.vram_MiB > free_vram_MiB && is_cuda) { diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h index 31b328608..2d22886aa 100644 --- a/engine/utils/hardware/gguf/gguf_file_estimate.h +++ b/engine/utils/hardware/gguf/gguf_file_estimate.h @@ -14,6 +14,7 @@ struct RunConfig { int n_batch; int n_ubatch; std::string kv_cache_type; + int64_t free_vram_MiB; }; struct CpuMode { @@ -25,6 +26,7 @@ struct GpuMode { int64_t vram_MiB; int ngl; int ctx_len; + int recommend_ngl; }; struct Estimation { @@ -42,6 +44,7 @@ inline Json::Value ToJson(const Estimation& es) { gpu["vram"] = es.gpu_mode.vram_MiB; gpu["ngl"] = es.gpu_mode.ngl; gpu["context_length"] = es.gpu_mode.ctx_len; + gpu["recommend_ngl"] = es.gpu_mode.recommend_ngl; gpus.append(gpu); res["cpu_mode"] = cpu; res["gpu_mode"] = gpus; @@ -160,13 +163,13 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path, res.gpu_mode.ram_MiB = BytesToMiB(ram_usage); res.gpu_mode.vram_MiB = BytesToMiB(vram_usage + kv_cache_size + preprocessing_buffer_size); + if (rc.free_vram_MiB > res.gpu_mode.vram_MiB) { + res.gpu_mode.recommend_ngl = total_ngl; + } else { + res.gpu_mode.recommend_ngl = + (double)rc.free_vram_MiB / res.gpu_mode.vram_MiB * rc.ngl; + } } return res; } -// CPU_Mapped model buffer size = 35.16 MiB -// CUDA0 model buffer size = 601.02 MiB -// CUDA0 KV buffer size = 88.00 MiB -// CUDA_Host output 
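// Worked example of the estimate above (all input values are assumed for
// illustration, not measured): n_vocab = 128256, embedding_length = 4096,
// block_count = 32 (so total_ngl = 33), token_embd.weight stored as Q8_0
// (8 effective bits), ctx_len = 8192, f16 KV cache, ngl = 33:
//   token_embeddings_size = 128256 * 4096 * 2 * 8 / 16 = 525,336,576 B ~ 501 MiB
//   offload    = 0 (total_ngl >= ngl + 1 is false)  ->  ram_usage ~ 501 MiB
//   vram_usage = file_size - ram_usage
//   kv_cache_size = 1 GiB * 8192/8192 * 4096/4096 * 16/16 * 32/33 ~ 993 MiB
//   recommend_ngl: if the estimated gpu_mode.vram_MiB were 5600 with only
//   4200 MiB of free VRAM, 4200.0 / 5600 * 33 = 24.75 -> 24 layers.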
buffer size = 0.12 MiB -// CUDA0 compute buffer size = 266.00 MiB -// CUDA_Host compute buffer size = 48.02 MiB } // namespace hardware \ No newline at end of file From 8cb01c9b95e53611290fae858a77e5f73ce9613d Mon Sep 17 00:00:00 2001 From: vansangpfiev Date: Fri, 22 Nov 2024 13:43:56 +0700 Subject: [PATCH 3/8] chore: cleanup --- engine/services/model_service.cc | 204 ++++++++++-------- engine/services/model_service.h | 7 +- engine/utils/hardware/gguf/ggml.h | 54 ++--- engine/utils/hardware/gguf/gguf_file.h | 2 - .../utils/hardware/gguf/gguf_file_estimate.h | 28 +-- 5 files changed, 156 insertions(+), 139 deletions(-) diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc index 61bbc2447..dda89f565 100644 --- a/engine/services/model_service.cc +++ b/engine/services/model_service.cc @@ -336,7 +336,8 @@ cpp::result ModelService::HandleDownloadUrlAsync( } cpp::result ModelService::GetEstimation( - const std::string& model_handle) { + const std::string& model_handle, const std::string& kv_cache, int n_batch, + int n_ubatch) { namespace fs = std::filesystem; namespace fmu = file_manager_utils; cortex::db::Models modellist_handler; @@ -366,9 +367,9 @@ cpp::result ModelService::GetEstimation( return hardware::EstimateLLaMACppRun(file_path.string(), {.ngl = mc.ngl, .ctx_len = mc.ctx_len, - .n_batch = 2048, - .n_ubatch = 2048, - .kv_cache_type = "f16", + .n_batch = n_batch, + .n_ubatch = n_ubatch, + .kv_cache_type = kv_cache, .free_vram_MiB = free_vram_MiB}); } catch (const std::exception& e) { return cpp::fail("Fail to get model status with ID '" + model_handle + @@ -739,95 +740,11 @@ cpp::result ModelService::StartModel( #undef ASSIGN_IF_PRESENT CTL_INF(json_data.toStyledString()); - // TODO(sang) move this into another function - // Calculate ram/vram needed to load model - services::HardwareService hw_svc; - auto hw_info = hw_svc.GetHardwareInfo(); - assert(!!engine_svc_); - auto default_engine = engine_svc_->GetDefaultEngineVariant(kLlamaEngine); - bool is_cuda = false; - if (default_engine.has_error()) { - CTL_INF("Could not get default engine"); - } else { - auto& de = default_engine.value(); - is_cuda = de.variant.find("cuda") != std::string::npos; - CTL_INF("is_cuda: " << is_cuda); - } - - std::optional warning; - if (is_cuda && !system_info_utils::IsNvidiaSmiAvailable()) { - CTL_INF( - "Running cuda variant but nvidia-driver is not installed yet, " - "fallback to CPU mode"); - auto res = engine_svc_->GetInstalledEngineVariants(kLlamaEngine); - if (res.has_error()) { - CTL_WRN("Could not get engine variants"); - return cpp::fail("Nvidia-driver is not installed!"); - } else { - auto& es = res.value(); - std::sort( - es.begin(), es.end(), - [](const EngineVariantResponse& e1, - const EngineVariantResponse& e2) { return e1.name > e2.name; }); - for (auto& e : es) { - CTL_INF(e.name << " " << e.version << " " << e.engine); - // Select the first CPU candidate - if (e.name.find("cuda") == std::string::npos) { - auto r = engine_svc_->SetDefaultEngineVariant(kLlamaEngine, - e.version, e.name); - if (r.has_error()) { - CTL_WRN("Could not set default engine variant"); - return cpp::fail("Nvidia-driver is not installed!"); - } else { - CTL_INF("Change default engine to: " << e.name); - auto rl = engine_svc_->LoadEngine(kLlamaEngine); - if (rl.has_error()) { - return cpp::fail("Nvidia-driver is not installed!"); - } else { - CTL_INF("Engine started"); - is_cuda = false; - warning = "Nvidia-driver is not installed, use CPU variant: " + - e.version + "-" + e.name; - break; - } 
- } - } - } - // If we reach here, means that no CPU variant to fallback - if (!warning) { - return cpp::fail( - "Nvidia-driver is not installed, no available CPU version to " - "fallback"); - } - } - } - // If in GPU acceleration mode: - // We use all visible GPUs, so only need to sum all free vram - auto free_vram_MiB = 0u; - for (const auto& gpu : hw_info.gpus) { - free_vram_MiB += gpu.free_vram; - } - - auto free_ram_MiB = hw_info.ram.available_MiB; - - auto const& mp = json_data["model_path"].asString(); - auto ngl = json_data["ngl"].asInt(); - hardware::RunConfig rc = {.ngl = ngl, - .ctx_len = json_data["ctx_len"].asInt(), - .n_batch = 2048, - .n_ubatch = 2048, - .kv_cache_type = "f16", - .free_vram_MiB = free_vram_MiB}; - auto es = hardware::EstimateLLaMACppRun(mp, rc); - - if (es.gpu_mode.vram_MiB > free_vram_MiB && is_cuda) { - CTL_WRN("Not enough VRAM - " << "required: " << es.gpu_mode.vram_MiB - << ", available: " << free_vram_MiB); - } - - if (es.cpu_mode.ram_MiB > free_ram_MiB) { - CTL_WRN("Not enough RAM - " << "required: " << es.cpu_mode.ram_MiB - << ", available: " << free_ram_MiB); + auto may_fallback_res = MayFallbackToCpu(json_data["model_path"].asString(), + json_data["ngl"].asInt(), + json_data["ctx_len"].asInt()); + if (may_fallback_res.has_error()) { + return cpp::fail(may_fallback_res.error()); } assert(!!inference_svc_); @@ -836,10 +753,12 @@ cpp::result ModelService::StartModel( auto status = std::get<0>(ir)["status_code"].asInt(); auto data = std::get<1>(ir); if (status == httplib::StatusCode::OK_200) { - return StartModelResult{.success = true, .warning = warning}; + return StartModelResult{ + .success = true, .warning = may_fallback_res.value_or(std::nullopt)}; } else if (status == httplib::StatusCode::Conflict_409) { CTL_INF("Model '" + model_handle + "' is already loaded"); - return StartModelResult{.success = true, .warning = warning}; + return StartModelResult{ + .success = true, .warning = may_fallback_res.value_or(std::nullopt)}; } else { // only report to user the error CTL_ERR("Model failed to start with status code: " << status); @@ -1068,3 +987,98 @@ cpp::result ModelService::AbortDownloadModel( const std::string& task_id) { return download_service_->StopTask(task_id); } + +cpp::result, std::string> +ModelService::MayFallbackToCpu(const std::string& model_path, int ngl, + int ctx_len, int n_batch, int n_ubatch, + const std::string& kv_cache_type) { + services::HardwareService hw_svc; + auto hw_info = hw_svc.GetHardwareInfo(); + assert(!!engine_svc_); + auto default_engine = engine_svc_->GetDefaultEngineVariant(kLlamaEngine); + bool is_cuda = false; + if (default_engine.has_error()) { + CTL_INF("Could not get default engine"); + } else { + auto& de = default_engine.value(); + is_cuda = de.variant.find("cuda") != std::string::npos; + CTL_INF("is_cuda: " << is_cuda); + } + + std::optional warning; + if (is_cuda && !system_info_utils::IsNvidiaSmiAvailable()) { + CTL_INF( + "Running cuda variant but nvidia-driver is not installed yet, " + "fallback to CPU mode"); + auto res = engine_svc_->GetInstalledEngineVariants(kLlamaEngine); + if (res.has_error()) { + CTL_WRN("Could not get engine variants"); + return cpp::fail("Nvidia-driver is not installed!"); + } else { + auto& es = res.value(); + std::sort( + es.begin(), es.end(), + [](const EngineVariantResponse& e1, const EngineVariantResponse& e2) { + return e1.name > e2.name; + }); + for (auto& e : es) { + CTL_INF(e.name << " " << e.version << " " << e.engine); + // Select the first CPU candidate + if 
(e.name.find("cuda") == std::string::npos) { + auto r = engine_svc_->SetDefaultEngineVariant(kLlamaEngine, e.version, + e.name); + if (r.has_error()) { + CTL_WRN("Could not set default engine variant"); + return cpp::fail("Nvidia-driver is not installed!"); + } else { + CTL_INF("Change default engine to: " << e.name); + auto rl = engine_svc_->LoadEngine(kLlamaEngine); + if (rl.has_error()) { + return cpp::fail("Nvidia-driver is not installed!"); + } else { + CTL_INF("Engine started"); + is_cuda = false; + warning = "Nvidia-driver is not installed, use CPU variant: " + + e.version + "-" + e.name; + break; + } + } + } + } + // If we reach here, means that no CPU variant to fallback + if (!warning) { + return cpp::fail( + "Nvidia-driver is not installed, no available CPU version to " + "fallback"); + } + } + } + // If in GPU acceleration mode: + // We use all visible GPUs, so only need to sum all free vram + auto free_vram_MiB = 0u; + for (const auto& gpu : hw_info.gpus) { + free_vram_MiB += gpu.free_vram; + } + + auto free_ram_MiB = hw_info.ram.available_MiB; + + hardware::RunConfig rc = {.ngl = ngl, + .ctx_len = ctx_len, + .n_batch = n_batch, + .n_ubatch = n_ubatch, + .kv_cache_type = kv_cache_type, + .free_vram_MiB = free_vram_MiB}; + auto es = hardware::EstimateLLaMACppRun(model_path, rc); + + if (es.gpu_mode.vram_MiB > free_vram_MiB && is_cuda) { + CTL_WRN("Not enough VRAM - " << "required: " << es.gpu_mode.vram_MiB + << ", available: " << free_vram_MiB); + } + + if (es.cpu_mode.ram_MiB > free_ram_MiB) { + CTL_WRN("Not enough RAM - " << "required: " << es.cpu_mode.ram_MiB + << ", available: " << free_ram_MiB); + } + + return warning; +} diff --git a/engine/services/model_service.h b/engine/services/model_service.h index 323fdd588..65e4e018f 100644 --- a/engine/services/model_service.h +++ b/engine/services/model_service.h @@ -99,7 +99,8 @@ class ModelService { bool HasModel(const std::string& id) const; cpp::result GetEstimation( - const std::string& model_handle); + const std::string& model_handle, const std::string& kv_cache = "f16", + int n_batch = 2048, int n_ubatch = 2048); private: /** @@ -116,6 +117,10 @@ class ModelService { cpp::result HandleCortexsoModel( const std::string& modelName); + cpp::result, std::string> MayFallbackToCpu( + const std::string& model_path, int ngl, int ctx_len, int n_batch = 2048, + int n_ubatch = 2048, const std::string& kv_cache_type = "f16"); + std::shared_ptr download_service_; std::shared_ptr inference_svc_; std::unordered_set bypass_stop_check_set_; diff --git a/engine/utils/hardware/gguf/ggml.h b/engine/utils/hardware/gguf/ggml.h index d04cf4146..e898fc796 100644 --- a/engine/utils/hardware/gguf/ggml.h +++ b/engine/utils/hardware/gguf/ggml.h @@ -48,47 +48,47 @@ enum GGMLType { inline float GetQuantBit(GGMLType gt) { switch (gt) { + case GGML_TYPE_I32: case GGML_TYPE_F32: return 32.0; + case GGML_TYPE_I16: + case GGML_TYPE_BF16: case GGML_TYPE_F16: return 16.0; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q8_1: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + return 2.31; case GGML_TYPE_Q2_K: - return 2.5625; + return 2.5625; + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: case GGML_TYPE_Q3_K: - return 3.4375; + return 3.4375; + case GGML_TYPE_Q4_0_4_4: + case GGML_TYPE_Q4_0_4_8: + case GGML_TYPE_Q4_0_8_8: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: case GGML_TYPE_Q4_K: return 4.5; 
+ case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: case GGML_TYPE_Q5_K: - return 5.5; + return 5.5; case GGML_TYPE_Q6_K: return 6.5625; - case GGML_TYPE_Q8_K: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ4_XS: case GGML_TYPE_I8: - case GGML_TYPE_I16: - case GGML_TYPE_I32: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q8_1: + case GGML_TYPE_Q8_K: + return 8.0; + case GGML_TYPE_I64: case GGML_TYPE_F64: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_BF16: - case GGML_TYPE_Q4_0_4_4: - case GGML_TYPE_Q4_0_4_8: - case GGML_TYPE_Q4_0_8_8: - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: + return 64.0; default: return 8.0; diff --git a/engine/utils/hardware/gguf/gguf_file.h b/engine/utils/hardware/gguf/gguf_file.h index 9980247f0..1263debf2 100644 --- a/engine/utils/hardware/gguf/gguf_file.h +++ b/engine/utils/hardware/gguf/gguf_file.h @@ -429,8 +429,6 @@ struct GGUFHeader { }; struct GGUFFile { - /* Basic */ - // header is the header of the GGUF file. GGUFHeader header; // tensor_infos are the tensor infos of the GGUF file, diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h index 2d22886aa..ca60351f5 100644 --- a/engine/utils/hardware/gguf/gguf_file_estimate.h +++ b/engine/utils/hardware/gguf/gguf_file_estimate.h @@ -93,17 +93,17 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path, for (auto const& ti : gf.tensor_infos) { if (ti->name == "output.weight") { quant_bit_out = GetQuantBit(ti->type); - std::cout << ti->type << std::endl; + // std::cout << ti->type << std::endl; } else if (ti->name == "token_embd.weight") { quant_bit_in = GetQuantBit(ti->type); - std::cout << ti->type << std::endl; + // std::cout << ti->type << std::endl; } } // output.weight // token_embd.weight - std::cout << "embedding_length: " << embedding_length << std::endl; - std::cout << "n_vocab: " << n_vocab << std::endl; - std::cout << "file_size: " << file_size << std::endl; + // std::cout << "embedding_length: " << embedding_length << std::endl; + // std::cout << "n_vocab: " << n_vocab << std::endl; + // std::cout << "file_size: " << file_size << std::endl; // Model weight int64_t token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit_in / 16; @@ -119,12 +119,12 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path, int64_t ram_usage = token_embeddings_size + offload; int64_t vram_usage = file_size - ram_usage; - std::cout << "token_embeddings_size: " << BytesToMiB(token_embeddings_size) - << std::endl; - std::cout << "output_layer_size: " << BytesToMiB(output_layer_size) - << std::endl; - std::cout << "ram_usage: " << BytesToMiB(ram_usage) << std::endl; - std::cout << "vram_usage: " << BytesToMiB(vram_usage) << std::endl; + // std::cout << "token_embeddings_size: " << BytesToMiB(token_embeddings_size) + // << std::endl; + // std::cout << "output_layer_size: " << BytesToMiB(output_layer_size) + // << std::endl; + // std::cout << "ram_usage: " << BytesToMiB(ram_usage) << std::endl; + // std::cout << "vram_usage: " << BytesToMiB(vram_usage) << std::endl; // KV cache // kv_cache_size = ctx_len/8192 * hidden_dim/4096 * quant_bit/16 * num_block/33 * 1 (GB) @@ -135,7 +135,7 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path, hidden_dim / 4096 * kv_quant_bit / 16 * num_block / 33; //(bytes) - std::cout << "kv_cache_size: " << BytesToMiB(kv_cache_size) << std::endl; + // std::cout 
From 440348f27b456844419e1e918fdb752d2dc16862 Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 22 Nov 2024 14:01:50 +0700
Subject: [PATCH 4/8] chore: API docs

---
 docs/static/openapi/cortex.json | 53 +++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/docs/static/openapi/cortex.json b/docs/static/openapi/cortex.json
index da31ab64b..78430294f 100644
--- a/docs/static/openapi/cortex.json
+++ b/docs/static/openapi/cortex.json
@@ -3940,6 +3940,55 @@
         },
         "required": ["description", "name", "productName", "status"]
       },
+      "CpuModeDto": {
+        "type": "object",
+        "properties": {
+          "ram": {
+            "type": "number",
+            "example": 1024
+          }
+        }
+      },
+      "GpuModeDto": {
+        "type": "object",
+        "properties": {
+          "ram": {
+            "type": "number",
+            "example": 1024
+          },
+          "vram": {
+            "type": "number",
+            "example": 1024
+          },
+          "ngl": {
+            "type": "number",
+            "example": 30
+          },
+          "context_length": {
+            "type": "number",
+            "example": 4096
+          },
+          "recommend_ngl": {
+            "type": "number",
+            "example": 33
+          }
+        }
+      },
+      "RecommendDto": {
+        "type": "object",
+        "properties": {
+          "cpu_mode": {
+            "type": "object",
+            "$ref": "#/components/schemas/CpuModeDto"
+          },
+          "gpu_mode": {
+            "type": "array",
+            "items": {
+              "$ref": "#/components/schemas/GpuModeDto"
+            }
+          }
+        }
+      },
       "ModelDto": {
         "type": "object",
         "properties": {
@@ -4064,6 +4113,10 @@
           "type": "string",
           "description": "The engine to use.",
           "example": "llamacpp"
+        },
+        "recommendation": {
+          "type": "object",
+          "$ref": "#/components/schemas/RecommendDto"
         }
       },
       "required": ["id"]

From dcf0841ff4af2ce571d401d5ca3976edb442e98c Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 22 Nov 2024 14:08:18 +0700
Subject: [PATCH 5/8] chore: CLI docs

---
 docs/docs/cli/models/index.mdx | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/docs/docs/cli/models/index.mdx b/docs/docs/cli/models/index.mdx
index 5b29069a6..b75bf9d49 100644
--- a/docs/docs/cli/models/index.mdx
+++ b/docs/docs/cli/models/index.mdx
@@ -120,8 +120,11 @@ For example, it returns the following:w
 | Option                     | Description                                        | Required | Default value | Example              |
 |----------------------------|----------------------------------------------------|----------|---------------|----------------------|
-| `-h`, `--help`             | Display help for command.                          | No       | -             | `-h`                 |
-
+| `-h`, `--help`             | Display help for command.                          | No       | -             | `-h`                 |
+| `-e`, `--engine`           | Display engines.                                   | No       | -             | `--engine`           |
+| `-v`, `--version`          | Display version for model.                         | No       | -             | `--version`          |
+| `--cpu_mode`               | Display CPU mode.                                  | No       | -             | `--cpu_mode`         |
+| `--gpu_mode`               | Display GPU mode.                                  | No       | -             | `--gpu_mode`         |

 ## `cortex models start`

 :::info
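The `CpuModeDto`, `GpuModeDto`, and `RecommendDto` schemas and the new CLI columns above describe the same `recommendation` object from two sides, and a client can read those fields to decide between GPU and CPU mode before starting a model. The snippet below is only a sketch of that decision, not code from this series: it uses jsoncpp (as the CLI code does), the payload literal follows the schemas above, and the free-memory numbers are invented for the example.

```cpp
// Sketch: choosing a run mode from a RecommendDto-shaped payload (illustrative).
#include <json/json.h>

#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>

int main() {
  // Example payload shaped like RecommendDto above; the values are made up.
  const std::string payload = R"({
    "cpu_mode": { "ram": 5750 },
    "gpu_mode": [
      { "ram": 469, "vram": 5280, "ngl": 33, "context_length": 8192,
        "recommend_ngl": 33 }
    ]
  })";

  Json::Value rec;
  Json::CharReaderBuilder builder;
  std::string errs;
  std::istringstream iss(payload);
  if (!Json::parseFromStream(builder, iss, &rec, &errs)) {
    std::cerr << "Parse error: " << errs << std::endl;
    return 1;
  }

  const int64_t free_vram_MiB = 6000;  // assumed free VRAM on this machine
  const int64_t free_ram_MiB = 16000;  // assumed free RAM on this machine

  if (!rec["gpu_mode"].isNull() &&
      rec["gpu_mode"][0]["vram"].asInt64() <= free_vram_MiB) {
    std::cout << "GPU mode fits; use ngl = "
              << rec["gpu_mode"][0]["recommend_ngl"].asInt() << std::endl;
  } else if (rec["cpu_mode"]["ram"].asInt64() <= free_ram_MiB) {
    std::cout << "Fall back to CPU mode ("
              << rec["cpu_mode"]["ram"].asInt64() << " MiB RAM)" << std::endl;
  } else {
    std::cout << "Model likely does not fit on this machine" << std::endl;
  }
  return 0;
}
```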
From 3f7bbe01e643818090592339e232000eaac8c46f Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 22 Nov 2024 15:03:10 +0700
Subject: [PATCH 6/8] fix: correct calculation

---
 engine/utils/hardware/gguf/gguf_file_estimate.h | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h
index ca60351f5..7b1a5519d 100644
--- a/engine/utils/hardware/gguf/gguf_file_estimate.h
+++ b/engine/utils/hardware/gguf/gguf_file_estimate.h
@@ -76,16 +76,18 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path,
   int32_t total_ngl = 0;
   auto file_size = std::filesystem::file_size(file_path);
   for (auto const& kv : gf.header.metadata_kv) {
-    if (kv.key == "llama.embedding_length") {
+    if (kv.key.find("embedding_length") != std::string::npos) {
       embedding_length = std::any_cast(kv.value);
     } else if (kv.key == "tokenizer.ggml.tokens") {
       n_vocab = std::any_cast(kv.value).arr.size();
-    } else if (kv.key == "llama.block_count") {
+    } else if (kv.key.find("block_count") != std::string::npos) {
       num_block = std::any_cast(kv.value);
       total_ngl = num_block + 1;
     }
   }

+  // std::cout << n_vocab << std::endl;
+
   // token_embeddings_size = n_vocab * embedding_length * 2 * quant_bit_in/16 bytes
   int32_t quant_bit_in = 0;
   int32_t quant_bit_out = 0;
@@ -139,8 +141,8 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path,

   // VRAM = (min(n_batch, n_ubatch))/ 512 * 266 (MiB)
   int64_t preprocessing_buffer_size =
-      (double)std::min(rc.n_batch, rc.n_ubatch) / 512 * 266 * 1024 * 1024 /
-      4;  //(bytes)
+      (double)std::min(rc.n_batch, rc.n_ubatch) / 512 * 266 * 1024 * 1024 *
+      n_vocab / 128256 /*llama3 n_vocab*/;  //(bytes)
   if (total_ngl != rc.ngl) {
     preprocessing_buffer_size += output_layer_size;
   }
@@ -161,7 +163,10 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path,
     res.gpu_mode.ctx_len = rc.ctx_len;
     res.gpu_mode.ngl = rc.ngl;
     res.gpu_mode.ram_MiB = BytesToMiB(ram_usage);
+    // Reserve an extra ~100-200 MiB for small scratch buffers used during processing
+    constexpr const int64_t kDeltaVramMiB = 200;
     res.gpu_mode.vram_MiB =
+        kDeltaVramMiB +
         BytesToMiB(vram_usage + kv_cache_size + preprocessing_buffer_size);
     if (rc.free_vram_MiB > res.gpu_mode.vram_MiB) {
       res.gpu_mode.recommend_ngl = total_ngl;

From 12f07f4e1251cb00df9974c532881e92e0333cbe Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Fri, 22 Nov 2024 16:20:37 +0700
Subject: [PATCH 7/8] fix: handle macOS

---
 engine/services/model_service.cc                | 9 +++++++++
 engine/utils/hardware/gguf/gguf_file_estimate.h | 3 +++
 2 files changed, 12 insertions(+)

diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc
index dda89f565..48de10296 100644
--- a/engine/services/model_service.cc
+++ b/engine/services/model_service.cc
@@ -364,6 +364,11 @@ cpp::result ModelService::GetEstimation(
   for (const auto& gpu : hw_info.gpus) {
     free_vram_MiB += gpu.free_vram;
   }
+
+#if defined(__APPLE__) && defined(__MACH__)
+  free_vram_MiB = hw_info.ram.available_MiB;
+#endif
+
   return hardware::EstimateLLaMACppRun(file_path.string(),
                                        {.ngl = mc.ngl,
                                         .ctx_len = mc.ctx_len,
@@ -1062,6 +1067,10 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
   auto free_ram_MiB = hw_info.ram.available_MiB;

+#if defined(__APPLE__) && defined(__MACH__)
+  free_vram_MiB = free_ram_MiB;
+#endif
+
   hardware::RunConfig rc = {.ngl = ngl,
                             .ctx_len = ctx_len,
                             .n_batch = n_batch,
diff --git a/engine/utils/hardware/gguf/gguf_file_estimate.h b/engine/utils/hardware/gguf/gguf_file_estimate.h
index 7b1a5519d..fde0b0ac0 100644
--- a/engine/utils/hardware/gguf/gguf_file_estimate.h
+++ b/engine/utils/hardware/gguf/gguf_file_estimate.h
@@ -174,6 +174,9 @@ inline Estimation EstimateLLaMACppRun(const std::string& file_path,
       res.gpu_mode.recommend_ngl =
           (double)rc.free_vram_MiB / res.gpu_mode.vram_MiB * rc.ngl;
     }
+#if defined(__APPLE__) && defined(__MACH__)
+    res.cpu_mode.ram_MiB = res.gpu_mode.vram_MiB + res.gpu_mode.ram_MiB;
+#endif
   }
   return res;
 }
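PATCH 6 above rescales the prompt-processing buffer by vocabulary size (relative to llama3's 128256-entry vocabulary) and pads the VRAM figure with ~200 MiB of head-room before comparing it to free VRAM; when the total does not fit, the recommended `ngl` is scaled down proportionally. The sketch below replays that GPU-mode budget on its own; the byte counts and free-VRAM figure are assumptions, not values taken from the patches.

```cpp
// Sketch of the GPU-mode budget after the fix above (illustrative inputs).
#include <algorithm>
#include <cstdint>
#include <iostream>

int64_t ToMiB(int64_t bytes) { return bytes / (1024 * 1024); }

int main() {
  // Assumed inputs; the real values come from the GGUF file and RunConfig.
  const int64_t n_batch = 2048, n_ubatch = 2048;
  const int64_t n_vocab = 128256;                      // llama3-sized vocab
  const int64_t output_layer_size = 525336576;         // bytes, example only
  const int64_t vram_usage = 4800LL * 1024 * 1024;     // offloaded weights
  const int64_t kv_cache_size = 1024LL * 1024 * 1024;  // f16 KV cache
  const int32_t ngl = 33, total_ngl = 33;
  const int64_t free_vram_MiB = 6000;

  // Preprocessing buffer: min(n_batch, n_ubatch)/512 * 266 MiB, scaled by
  // the vocabulary size relative to llama3's 128256 entries.
  int64_t preprocessing_buffer_size =
      (double)std::min(n_batch, n_ubatch) / 512 * 266 * 1024 * 1024 *
      n_vocab / 128256;
  if (total_ngl != ngl) {
    // The output layer is kept in the buffer when not all layers are offloaded.
    preprocessing_buffer_size += output_layer_size;
  }

  // ~200 MiB of head-room for small scratch buffers during processing.
  constexpr int64_t kDeltaVramMiB = 200;
  const int64_t vram_MiB =
      kDeltaVramMiB +
      ToMiB(vram_usage + kv_cache_size + preprocessing_buffer_size);

  // Scale ngl down proportionally if the estimate does not fit.
  const int32_t recommend_ngl =
      free_vram_MiB > vram_MiB
          ? total_ngl
          : static_cast<int32_t>((double)free_vram_MiB / vram_MiB * ngl);

  std::cout << "estimated VRAM: " << vram_MiB
            << " MiB, recommended ngl: " << recommend_ngl << std::endl;
  return 0;
}
```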
From f2651acd43db860d77e2d6d7a030bb61906a2e8a Mon Sep 17 00:00:00 2001
From: vansangpfiev
Date: Tue, 26 Nov 2024 08:47:00 +0700
Subject: [PATCH 8/8] chore: typo

---
 engine/cli/commands/model_list_cmd.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/engine/cli/commands/model_list_cmd.cc b/engine/cli/commands/model_list_cmd.cc
index e598e22d5..cfc7a8d3a 100644
--- a/engine/cli/commands/model_list_cmd.cc
+++ b/engine/cli/commands/model_list_cmd.cc
@@ -93,7 +93,7 @@ void ModelListCmd::Exec(const std::string& host, int port,
               r["gpu_mode"][0]["context_length"].asString() + " - ";
           s += "RAM: " + r["gpu_mode"][0]["ram"].asString() + " MiB - ";
           s += "VRAM: " + r["gpu_mode"][0]["vram"].asString() + " MiB - ";
-          s += "recommend ngl: " +
+          s += "recommended ngl: " +
               r["gpu_mode"][0]["recommend_ngl"].asString();
           row.push_back(s);
         }
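One last note on the macOS branches in PATCH 7: Apple Silicon machines expose a single unified memory pool, so the estimate there treats available system RAM as the VRAM budget and reports the CPU-mode requirement as the sum of the GPU-mode RAM and VRAM figures. A minimal sketch of that adjustment, with hypothetical struct and field names standing in for the real ones:

```cpp
// Minimal sketch of the unified-memory adjustment (names are hypothetical).
#include <cstdint>
#include <iostream>

struct GpuModeEstimate {
  int64_t ram_MiB;
  int64_t vram_MiB;
};
struct CpuModeEstimate {
  int64_t ram_MiB;
};

int main() {
  GpuModeEstimate gpu{469, 5280};  // example figures from an estimate
  CpuModeEstimate cpu{5750};
  int64_t free_ram_MiB = 16384;    // assumed available system RAM
  int64_t free_vram_MiB = 0;       // no discrete-GPU VRAM on Apple Silicon

#if defined(__APPLE__) && defined(__MACH__)
  // Unified memory: the VRAM budget is the available system RAM, and a
  // CPU-only run must hold both halves of the split in that same pool.
  free_vram_MiB = free_ram_MiB;
  cpu.ram_MiB = gpu.ram_MiB + gpu.vram_MiB;
#endif

  std::cout << "free_vram_MiB: " << free_vram_MiB
            << ", free_ram_MiB: " << free_ram_MiB
            << ", cpu_mode ram_MiB: " << cpu.ram_MiB << std::endl;
  return 0;
}
```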