diff --git a/engine/config/gguf_parser.cc b/engine/config/gguf_parser.cc
index c97c79c30..9acc97de2 100644
--- a/engine/config/gguf_parser.cc
+++ b/engine/config/gguf_parser.cc
@@ -2,12 +2,12 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
-#include
 
 #ifdef _WIN32
 #include
@@ -70,7 +70,7 @@ void GGUFHandler::OpenFile(const std::string& file_path) {
 #else
   file_size_ = std::filesystem::file_size(file_path);
-  
+
   int file_descriptor = open(file_path.c_str(), O_RDONLY);
   // Memory-map the file
   data_ = static_cast(
@@ -105,7 +105,8 @@ std::pair GGUFHandler::ReadString(
   std::memcpy(&length, data_ + offset, sizeof(uint64_t));
 
   if (offset + 8 + length > file_size_) {
-    throw std::runtime_error("GGUF metadata string length exceeds file size.\n");
+    throw std::runtime_error(
+        "GGUF metadata string length exceeds file size.\n");
   }
 
   std::string value(reinterpret_cast(data_ + offset + 8), length);
@@ -578,9 +579,8 @@ void GGUFHandler::ModelConfigFromMetadata() {
   model_config_.model = name;
   model_config_.id = name;
   model_config_.version = std::to_string(version);
-  model_config_.max_tokens =
-      std::min(kDefaultMaxContextLength, max_tokens);
-  model_config_.ctx_len = std::min(kDefaultMaxContextLength, max_tokens);
+  model_config_.max_tokens = max_tokens;
+  model_config_.ctx_len = max_tokens;
   model_config_.ngl = ngl;
 }
 
diff --git a/engine/services/model_service.cc b/engine/services/model_service.cc
index c13f7cf19..dbf2cd8a0 100644
--- a/engine/services/model_service.cc
+++ b/engine/services/model_service.cc
@@ -954,6 +954,7 @@ cpp::result ModelService::StartModel(
     json_data["user_prompt"] = mc.user_template;
     json_data["ai_prompt"] = mc.ai_template;
     json_data["ctx_len"] = std::min(kDefautlContextLength, mc.ctx_len);
+    json_data["max_tokens"] = std::min(kDefautlContextLength, mc.ctx_len);
     max_model_context_length = mc.ctx_len;
   } else {
     bypass_stop_check_set_.insert(model_handle);
@@ -978,6 +979,8 @@ cpp::result ModelService::StartModel(
   if (ctx_len) {
     json_data["ctx_len"] =
         std::min(ctx_len.value(), max_model_context_length);
+    json_data["max_tokens"] =
+        std::min(ctx_len.value(), max_model_context_length);
   }
   CTL_INF(json_data.toStyledString());
   auto may_fallback_res = MayFallbackToCpu(json_data["model_path"].asString(),
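
Net effect of the patch: the GGUF parser now stores the model's full context length from metadata instead of pre-capping it, and `StartModel` becomes the single place where `ctx_len` and `max_tokens` are clamped. The sketch below is a minimal illustration of the two `std::min` calls in `model_service.cc`; the free-standing function, its parameters, and the concrete value of `kDefautlContextLength` are assumptions for illustration, not code from the repository.

```cpp
#include <algorithm>
#include <optional>

// Identifier spelled as in the diff; the value here is an assumption.
constexpr int kDefautlContextLength = 8192;

// Illustrative only: resolve the ctx_len / max_tokens sent to the engine.
// model_ctx_len comes from GGUF metadata (no longer pre-capped by the
// parser); requested_ctx_len is an optional per-start override.
int ResolveContextLength(int model_ctx_len,
                         std::optional<int> requested_ctx_len) {
  // First clamp the model's own context length to the service default.
  int effective = std::min(kDefautlContextLength, model_ctx_len);
  // A caller-supplied ctx_len may override that, but never beyond the
  // model's real maximum (max_model_context_length in the diff).
  if (requested_ctx_len) {
    effective = std::min(requested_ctx_len.value(), model_ctx_len);
  }
  // The same value is written to both json_data["ctx_len"] and
  // json_data["max_tokens"], which is what the added lines ensure.
  return effective;
}
```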