From 55b7707e99f5d3a941f6d96c98d852c1713a00ab Mon Sep 17 00:00:00 2001 From: rmatif Date: Thu, 28 Aug 2025 16:38:13 +0000 Subject: [PATCH 01/13] opt tensor loading --- model.cpp | 379 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 223 insertions(+), 156 deletions(-) diff --git a/model.cpp b/model.cpp index 1be057158..a1f81c448 100644 --- a/model.cpp +++ b/model.cpp @@ -5,6 +5,11 @@ #include #include #include +#include +#include +#include +#include +#include #include "gguf_reader.hpp" #include "model.h" @@ -1948,238 +1953,300 @@ std::vector remove_duplicates(const std::vector& v std::vector res; std::unordered_map name_to_index_map; - for (size_t i = 0; i < vec.size(); ++i) { - const std::string& current_name = vec[i].name; - auto it = name_to_index_map.find(current_name); + for (const auto& ts : vec) { + const std::string& current_name = ts.name; + auto it = name_to_index_map.find(current_name); if (it != name_to_index_map.end()) { - res[it->second] = vec[i]; + // Found a duplicate, overwrite the existing one in res + res[it->second] = ts; } else { - name_to_index_map[current_name] = i; - res.push_back(vec[i]); + // Not a duplicate, add to map and push to res + name_to_index_map[current_name] = res.size(); + res.push_back(ts); } } - - // vec.resize(name_to_index_map.size()); - return res; } bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { std::vector processed_tensor_storages; - for (auto& tensor_storage : tensor_storages) { - // LOG_DEBUG("%s", name.c_str()); + + { + std::unordered_map processed_map; + std::mutex map_mutex; - if (is_unused_tensor(tensor_storage.name)) { - continue; + int n_threads = std::min((int)std::thread::hardware_concurrency(), (int)tensor_storages.size()); + if (n_threads < 1) { + n_threads = 1; } + std::vector workers; - preprocess_tensor(tensor_storage, processed_tensor_storages); + for (int i = 0; i < n_threads; ++i) { + workers.emplace_back([&, thread_id = i]() { + + std::unordered_map local_processed_map; + std::vector temp_storages; + + for (size_t j = thread_id; j < tensor_storages.size(); j += n_threads) { + const auto& tensor_storage = tensor_storages[j]; + if (is_unused_tensor(tensor_storage.name)) { + continue; + } + + temp_storages.clear(); + preprocess_tensor(tensor_storage, temp_storages); + + for (const auto& ts : temp_storages) { + local_processed_map[ts.name] = ts; + } + } + + if (!local_processed_map.empty()) { + std::lock_guard lock(map_mutex); + processed_map.merge(local_processed_map); + } + }); + } + for (auto& w : workers) { + w.join(); + } + + processed_tensor_storages.reserve(processed_map.size()); + for (auto const& [name, ts] : processed_map) { + processed_tensor_storages.push_back(ts); + } } - std::vector dedup = remove_duplicates(processed_tensor_storages); - processed_tensor_storages = dedup; - bool success = true; + bool success = true; + size_t total_tensors_processed = 0; + const size_t total_tensors_to_process = processed_tensor_storages.size(); + const int64_t t_start = ggml_time_ms(); + for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) { std::string file_path = file_paths_[file_index]; LOG_DEBUG("loading tensors from %s", file_path.c_str()); - std::ifstream file(file_path, std::ios::binary); - if (!file.is_open()) { - LOG_ERROR("failed to open '%s'", file_path.c_str()); - return false; + std::vector file_tensors; + for (const auto& ts : processed_tensor_storages) { + if (ts.file_index == file_index) { + file_tensors.push_back(&ts); + } + } + if 
(file_tensors.empty()) { + continue; } bool is_zip = false; - for (auto& tensor_storage : tensor_storages) { - if (tensor_storage.file_index != file_index) { - continue; - } - if (tensor_storage.index_in_zip >= 0) { + for (auto const& ts : file_tensors) { + if (ts->index_in_zip >= 0) { is_zip = true; break; } } - struct zip_t* zip = NULL; - if (is_zip) { - zip = zip_open(file_path.c_str(), 0, 'r'); - if (zip == NULL) { - LOG_ERROR("failed to open zip '%s'", file_path.c_str()); - return false; - } + int n_threads = is_zip ? 1 : std::min((int)std::thread::hardware_concurrency(), (int)file_tensors.size()); + if (n_threads < 1) { + n_threads = 1; } - std::vector read_buffer; - std::vector convert_buffer; + std::atomic tensor_idx(0); + std::atomic failed(false); + std::vector workers; - auto read_data = [&](const TensorStorage& tensor_storage, char* buf, size_t n) { - if (zip != NULL) { - zip_entry_openbyindex(zip, tensor_storage.index_in_zip); - size_t entry_size = zip_entry_size(zip); - if (entry_size != n) { - read_buffer.resize(entry_size); - zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size); - memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n); + for (int i = 0; i < n_threads; ++i) { + workers.emplace_back([&, file_path, is_zip]() { + std::ifstream file; + struct zip_t* zip = NULL; + if (is_zip) { + zip = zip_open(file_path.c_str(), 0, 'r'); + if (zip == NULL) { + LOG_ERROR("failed to open zip '%s'", file_path.c_str()); + failed = true; + return; + } } else { - zip_entry_noallocread(zip, (void*)buf, n); - } - zip_entry_close(zip); - } else { - file.seekg(tensor_storage.offset); - file.read(buf, n); - if (!file) { - LOG_ERROR("read tensor data failed: '%s'", file_path.c_str()); - return false; + file.open(file_path, std::ios::binary); + if (!file.is_open()) { + LOG_ERROR("failed to open '%s'", file_path.c_str()); + failed = true; + return; + } } - } - return true; - }; - int tensor_count = 0; - int64_t t0 = ggml_time_ms(); - int64_t t1 = t0; - bool partial = true; - int tensor_max = (int)processed_tensor_storages.size(); - pretty_progress(0, tensor_max, 0.0f); - for (auto& tensor_storage : processed_tensor_storages) { - if (tensor_storage.file_index != file_index) { - ++tensor_count; - continue; - } - ggml_tensor* dst_tensor = NULL; - success = on_new_tensor_cb(tensor_storage, &dst_tensor); - if (!success) { - LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str()); - break; - } + std::vector read_buffer; + std::vector convert_buffer; - if (dst_tensor == NULL) { - ++tensor_count; - continue; - } + while (true) { + size_t idx = tensor_idx.fetch_add(1); + if (idx >= file_tensors.size() || failed) { + break; + } - size_t nbytes_to_read = tensor_storage.nbytes_to_read(); + const TensorStorage& tensor_storage = *file_tensors[idx]; + ggml_tensor* dst_tensor = NULL; - if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) { - // for the CPU and Metal backend, we can copy directly into the tensor - if (tensor_storage.type == dst_tensor->type) { - GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes()); - if (tensor_storage.is_f64 || tensor_storage.is_i64) { - read_buffer.resize(tensor_storage.nbytes_to_read()); - read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); - } else { - read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read); + if (!on_new_tensor_cb(tensor_storage, &dst_tensor)) { + LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str()); + failed = true; + 
break; + } + + if (dst_tensor == NULL) { + continue; } - if (tensor_storage.is_bf16) { + size_t nbytes_to_read = tensor_storage.nbytes_to_read(); + + auto read_data = [&](char* buf, size_t n) { + if (zip != NULL) { + zip_entry_openbyindex(zip, tensor_storage.index_in_zip); + size_t entry_size = zip_entry_size(zip); + if (entry_size != n) { + read_buffer.resize(entry_size); + zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size); + memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n); + } else { + zip_entry_noallocread(zip, (void*)buf, n); + } + zip_entry_close(zip); + } else { + file.seekg(tensor_storage.offset); + file.read(buf, n); + if (!file) { + LOG_ERROR("read tensor data failed: '%s'", file_path.c_str()); + failed = true; + } + } + }; + + if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) { + if (tensor_storage.type == dst_tensor->type) { + GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes()); + if (tensor_storage.is_f64 || tensor_storage.is_i64) { + read_buffer.resize(tensor_storage.nbytes_to_read()); + read_data((char*)read_buffer.data(), nbytes_to_read); + } else { + read_data((char*)dst_tensor->data, nbytes_to_read); + } + + if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e4m3) { + bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e4m3) { // inplace op - f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e5m2) { + f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e5m2) { // inplace op - f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); - } else if (tensor_storage.is_f64) { - f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements()); - } else if (tensor_storage.is_i64) { - i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements()); - } - } else { - read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); - read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); - - if (tensor_storage.is_bf16) { + f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_f64) { + f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_i64) { + i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements()); + } + } else { + read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); + read_data((char*)read_buffer.data(), nbytes_to_read); + + if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e4m3) { + bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e4m3) { // inplace op - f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e5m2) { + 
f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e5m2) { // inplace op - f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f64) { + f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f64) { // inplace op - f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_i64) { + f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_i64) { // inplace op - i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); - } + i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); + } - convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, - dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); - } - } else { - read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); - read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); + convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, + dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); + } + } else { + read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); + read_data((char*)read_buffer.data(), nbytes_to_read); - if (tensor_storage.is_bf16) { + if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e4m3) { + bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e4m3) { // inplace op - f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e5m2) { + f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e5m2) { // inplace op - f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f64) { + f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f64) { // inplace op - f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_i64) { + f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_i64) { // inplace op - i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); - } + i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); + } - if (tensor_storage.type == dst_tensor->type) { + if (tensor_storage.type == dst_tensor->type) { // copy to device memory - ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor)); - } else { + ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, 
ggml_nbytes(dst_tensor)); + } else { // convert first, then copy to device memory - convert_buffer.resize(ggml_nbytes(dst_tensor)); - convert_tensor((void*)read_buffer.data(), tensor_storage.type, - (void*)convert_buffer.data(), dst_tensor->type, - (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); - ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); + convert_buffer.resize(ggml_nbytes(dst_tensor)); + convert_tensor((void*)read_buffer.data(), tensor_storage.type, + (void*)convert_buffer.data(), dst_tensor->type, + (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); + ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); + } + } } - } - ++tensor_count; - int64_t t2 = ggml_time_ms(); - if ((t2 - t1) >= 200) { - t1 = t2; - pretty_progress(tensor_count, tensor_max, (t1 - t0) / (1000.0f * tensor_count)); - partial = tensor_count != tensor_max; - } + if (zip != NULL) { + zip_close(zip); + } + }); } - if (partial) { - if (tensor_count >= 1) { - t1 = ggml_time_ms(); - pretty_progress(tensor_count, tensor_max, (t1 - t0) / (1000.0f * tensor_count)); - } - if (tensor_count < tensor_max) { - printf("\n"); + while (true) { + size_t current_idx = tensor_idx.load(); + if (current_idx >= file_tensors.size() || failed) { + break; } + pretty_progress(total_tensors_processed + current_idx, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f); + std::this_thread::sleep_for(std::chrono::milliseconds(200)); } - if (zip != NULL) { - zip_close(zip); + for (auto& w : workers) { + w.join(); } - if (!success) { + if (failed) { + success = false; break; } + total_tensors_processed += file_tensors.size(); + } + + pretty_progress(total_tensors_processed, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f); + if (total_tensors_to_process > 0) { + printf("\n"); } + return success; } bool ModelLoader::load_tensors(std::map& tensors, std::set ignore_tensors) { std::set tensor_names_in_file; + std::mutex tensor_names_mutex; auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { const std::string& name = tensor_storage.name; // LOG_DEBUG("%s", tensor_storage.to_string().c_str()); - tensor_names_in_file.insert(name); + { + std::lock_guard lock(tensor_names_mutex); + tensor_names_in_file.insert(name); + } struct ggml_tensor* real; if (tensors.find(name) != tensors.end()) { From 6fa2b26c94612b135052b6f51ed6985da4cb0b29 Mon Sep 17 00:00:00 2001 From: rmatif Date: Sat, 6 Sep 2025 21:21:13 +0000 Subject: [PATCH 02/13] fix build failure --- model.cpp | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/model.cpp b/model.cpp index a1f81c448..e9db172de 100644 --- a/model.cpp +++ b/model.cpp @@ -1973,48 +1973,57 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { std::vector processed_tensor_storages; { - std::unordered_map processed_map; - std::mutex map_mutex; - int n_threads = std::min((int)std::thread::hardware_concurrency(), (int)tensor_storages.size()); if (n_threads < 1) { n_threads = 1; } + + std::vector > local_maps(n_threads); std::vector workers; + size_t chunk_size = (tensor_storages.size() + n_threads - 1) / n_threads; for (int i = 0; i < n_threads; ++i) { workers.emplace_back([&, thread_id = i]() { + const size_t start = thread_id * chunk_size; + const size_t end = std::min(start + chunk_size, tensor_storages.size()); - 
std::unordered_map local_processed_map; std::vector temp_storages; - - for (size_t j = thread_id; j < tensor_storages.size(); j += n_threads) { + for (size_t j = start; j < end; ++j) { const auto& tensor_storage = tensor_storages[j]; if (is_unused_tensor(tensor_storage.name)) { continue; } - + temp_storages.clear(); preprocess_tensor(tensor_storage, temp_storages); - - for (const auto& ts : temp_storages) { - local_processed_map[ts.name] = ts; - } - } - if (!local_processed_map.empty()) { - std::lock_guard lock(map_mutex); - processed_map.merge(local_processed_map); + for (size_t k = 0; k < temp_storages.size(); ++k) { + local_maps[thread_id][temp_storages[k].name] = temp_storages[k]; + } } }); } - for (auto& w : workers) { - w.join(); + + for (size_t i = 0; i < workers.size(); ++i) { + workers[i].join(); + } + + std::unordered_map processed_map; + size_t total_keys = 0; + for (int i = 0; i < n_threads; ++i) { + total_keys += local_maps[i].size(); } - + processed_map.reserve(total_keys); + + for (int i = 0; i < n_threads; ++i) { + for (std::unordered_map::const_iterator it = local_maps[i].begin(); it != local_maps[i].end(); ++it) { + processed_map[it->first] = it->second; + } + } + processed_tensor_storages.reserve(processed_map.size()); - for (auto const& [name, ts] : processed_map) { - processed_tensor_storages.push_back(ts); + for (std::unordered_map::const_iterator it = processed_map.begin(); it != processed_map.end(); ++it) { + processed_tensor_storages.push_back(it->second); } } From 12295b2ab599a6927dd867c5d837314553ee1395 Mon Sep 17 00:00:00 2001 From: rmatif Date: Sun, 7 Sep 2025 04:21:16 +0000 Subject: [PATCH 03/13] revert the changes --- model.cpp | 49 ++++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/model.cpp b/model.cpp index e9db172de..a1f81c448 100644 --- a/model.cpp +++ b/model.cpp @@ -1973,57 +1973,48 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { std::vector processed_tensor_storages; { + std::unordered_map processed_map; + std::mutex map_mutex; + int n_threads = std::min((int)std::thread::hardware_concurrency(), (int)tensor_storages.size()); if (n_threads < 1) { n_threads = 1; } - - std::vector > local_maps(n_threads); std::vector workers; - size_t chunk_size = (tensor_storages.size() + n_threads - 1) / n_threads; for (int i = 0; i < n_threads; ++i) { workers.emplace_back([&, thread_id = i]() { - const size_t start = thread_id * chunk_size; - const size_t end = std::min(start + chunk_size, tensor_storages.size()); + std::unordered_map local_processed_map; std::vector temp_storages; - for (size_t j = start; j < end; ++j) { + + for (size_t j = thread_id; j < tensor_storages.size(); j += n_threads) { const auto& tensor_storage = tensor_storages[j]; if (is_unused_tensor(tensor_storage.name)) { continue; } - + temp_storages.clear(); preprocess_tensor(tensor_storage, temp_storages); - - for (size_t k = 0; k < temp_storages.size(); ++k) { - local_maps[thread_id][temp_storages[k].name] = temp_storages[k]; + + for (const auto& ts : temp_storages) { + local_processed_map[ts.name] = ts; } } - }); - } - - for (size_t i = 0; i < workers.size(); ++i) { - workers[i].join(); - } - std::unordered_map processed_map; - size_t total_keys = 0; - for (int i = 0; i < n_threads; ++i) { - total_keys += local_maps[i].size(); + if (!local_processed_map.empty()) { + std::lock_guard lock(map_mutex); + processed_map.merge(local_processed_map); + } + }); } - processed_map.reserve(total_keys); - - for (int i = 0; 
i < n_threads; ++i) { - for (std::unordered_map::const_iterator it = local_maps[i].begin(); it != local_maps[i].end(); ++it) { - processed_map[it->first] = it->second; - } + for (auto& w : workers) { + w.join(); } - + processed_tensor_storages.reserve(processed_map.size()); - for (std::unordered_map::const_iterator it = processed_map.begin(); it != processed_map.end(); ++it) { - processed_tensor_storages.push_back(it->second); + for (auto const& [name, ts] : processed_map) { + processed_tensor_storages.push_back(ts); } } From 401c42c2c76f6e17a4cb021aeb185960843347ee Mon Sep 17 00:00:00 2001 From: rmatif Date: Sun, 7 Sep 2025 17:03:16 +0000 Subject: [PATCH 04/13] allow the use of n_threads --- model.cpp | 13 ++++++++----- model.h | 5 +++-- stable-diffusion.cpp | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/model.cpp b/model.cpp index 5ea24911f..e35cc9287 100644 --- a/model.cpp +++ b/model.cpp @@ -1969,7 +1969,7 @@ std::vector remove_duplicates(const std::vector& v return res; } -bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { +bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) { int64_t process_time_ms = 0; int64_t read_time_ms = 0; int64_t memcpy_time_ms = 0; @@ -1986,7 +1986,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { std::unordered_map processed_map; std::mutex map_mutex; - int n_threads = std::min((int)std::thread::hardware_concurrency(), (int)tensor_storages.size()); + int num_threads = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency(); + int n_threads = std::min(num_threads, (int)tensor_storages.size()); if (n_threads < 1) { n_threads = 1; } @@ -2058,7 +2059,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { } } - int n_threads = is_zip ? 1 : std::min((int)std::thread::hardware_concurrency(), (int)file_tensors.size()); + int num_threads = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency(); + int n_threads = is_zip ? 
1 : std::min(num_threads, (int)file_tensors.size()); if (n_threads < 1) { n_threads = 1; } @@ -2287,7 +2289,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { } bool ModelLoader::load_tensors(std::map& tensors, - std::set ignore_tensors) { + std::set ignore_tensors, + int n_threads) { std::set tensor_names_in_file; std::mutex tensor_names_mutex; auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { @@ -2330,7 +2333,7 @@ bool ModelLoader::load_tensors(std::map& tenso return true; }; - bool success = load_tensors(on_new_tensor_cb); + bool success = load_tensors(on_new_tensor_cb, n_threads); if (!success) { LOG_ERROR("load tensors from file failed"); return false; diff --git a/model.h b/model.h index fef6ace82..cfd988969 100644 --- a/model.h +++ b/model.h @@ -247,9 +247,10 @@ class ModelLoader { ggml_type get_diffusion_model_wtype(); ggml_type get_vae_wtype(); void set_wtype_override(ggml_type wtype, std::string prefix = ""); - bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb); + bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0); bool load_tensors(std::map& tensors, - std::set ignore_tensors = {}); + std::set ignore_tensors = {}, + int n_threads = 0); bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules); bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 17804c11b..57d6959a5 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -573,7 +573,7 @@ class StableDiffusionGGML { if (version == VERSION_SVD) { ignore_tensors.insert("conditioner.embedders.3"); } - bool success = model_loader.load_tensors(tensors, ignore_tensors); + bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads); if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); From 9e0d8e53cbfe0a85ea2fe32262286c1c08343fee Mon Sep 17 00:00:00 2001 From: rmatif Date: Sun, 7 Sep 2025 23:09:40 +0000 Subject: [PATCH 05/13] fix lora loading --- lora.hpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/lora.hpp b/lora.hpp index b7a27306c..7d3222760 100644 --- a/lora.hpp +++ b/lora.hpp @@ -115,7 +115,7 @@ struct LoraModel : public GGMLRunner { return "lora"; } - bool load_from_file(bool filter_tensor = false) { + bool load_from_file(bool filter_tensor = false, int n_threads = 0) { LOG_INFO("loading LoRA from '%s'", file_path.c_str()); if (load_failed) { @@ -131,15 +131,14 @@ struct LoraModel : public GGMLRunner { // LOG_INFO("skipping LoRA tesnor '%s'", name.c_str()); return true; } - // LOG_INFO("lora_tensor %s", name.c_str()); - for (int i = 0; i < LORA_TYPE_COUNT; i++) { - if (name.find(type_fingerprints[i]) != std::string::npos) { - type = (lora_t)i; - break; - } - } if (dry_run) { + for (int i = 0; i < LORA_TYPE_COUNT; i++) { + if (name.find(type_fingerprints[i]) != std::string::npos) { + type = (lora_t)i; + break; + } + } struct ggml_tensor* real = ggml_new_tensor(params_ctx, tensor_storage.type, tensor_storage.n_dims, @@ -153,11 +152,11 @@ struct LoraModel : public GGMLRunner { return true; }; - model_loader.load_tensors(on_new_tensor_cb); + model_loader.load_tensors(on_new_tensor_cb, 1); alloc_params_buffer(); - // exit(0); + dry_run = false; - model_loader.load_tensors(on_new_tensor_cb); + model_loader.load_tensors(on_new_tensor_cb, n_threads); LOG_DEBUG("lora type: \"%s\"/\"%s\"", 
lora_downs[type].c_str(), lora_ups[type].c_str()); From 507f4068933f57a3db6bc74679fde1ae9fa1a2a7 Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 8 Sep 2025 10:48:34 +0000 Subject: [PATCH 06/13] optimize lora loading --- lora.hpp | 50 +++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/lora.hpp b/lora.hpp index 7d3222760..3366e761b 100644 --- a/lora.hpp +++ b/lora.hpp @@ -123,36 +123,48 @@ struct LoraModel : public GGMLRunner { return false; } + std::unordered_map tensors_to_create; + std::mutex lora_mutex; bool dry_run = true; auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { - const std::string& name = tensor_storage.name; + if (dry_run) { + const std::string& name = tensor_storage.name; - if (filter_tensor && !contains(name, "lora")) { - // LOG_INFO("skipping LoRA tesnor '%s'", name.c_str()); - return true; - } + if (filter_tensor && !contains(name, "lora")) { + return true; + } - if (dry_run) { - for (int i = 0; i < LORA_TYPE_COUNT; i++) { - if (name.find(type_fingerprints[i]) != std::string::npos) { - type = (lora_t)i; - break; + { + std::lock_guard lock(lora_mutex); + for (int i = 0; i < LORA_TYPE_COUNT; i++) { + if (name.find(type_fingerprints[i]) != std::string::npos) { + type = (lora_t)i; + break; + } } + tensors_to_create[name] = tensor_storage; } - struct ggml_tensor* real = ggml_new_tensor(params_ctx, - tensor_storage.type, - tensor_storage.n_dims, - tensor_storage.ne); - lora_tensors[name] = real; } else { - auto real = lora_tensors[name]; - *dst_tensor = real; + const std::string& name = tensor_storage.name; + if (lora_tensors.count(name)) { + *dst_tensor = lora_tensors.at(name); + } } - return true; }; - model_loader.load_tensors(on_new_tensor_cb, 1); + model_loader.load_tensors(on_new_tensor_cb, n_threads); + + for (const auto& pair : tensors_to_create) { + const auto& name = pair.first; + const auto& ts = pair.second; + struct ggml_tensor* real = ggml_new_tensor(params_ctx, + ts.type, + ts.n_dims, + ts.ne); + lora_tensors[name] = real; + } + alloc_params_buffer(); dry_run = false; From e7cd3ca09275bf9e0df6fa3321b1ed40bd1893b8 Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 8 Sep 2025 10:49:32 +0000 Subject: [PATCH 07/13] add mutex --- lora.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lora.hpp b/lora.hpp index 3366e761b..fde54eef5 100644 --- a/lora.hpp +++ b/lora.hpp @@ -2,6 +2,7 @@ #define __LORA_HPP__ #include "ggml_extend.hpp" +#include #define LORA_GRAPH_BASE_SIZE 10240 From 289c329b27bfce5aa146b4e5f5f60a6698b78351 Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 8 Sep 2025 14:25:41 +0000 Subject: [PATCH 08/13] use atomic --- model.cpp | 101 ++++++++++++++++++++++++++---------------------------- 1 file changed, 48 insertions(+), 53 deletions(-) diff --git a/model.cpp b/model.cpp index e35cc9287..010848265 100644 --- a/model.cpp +++ b/model.cpp @@ -1970,24 +1970,22 @@ std::vector remove_duplicates(const std::vector& v } bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) { - int64_t process_time_ms = 0; - int64_t read_time_ms = 0; - int64_t memcpy_time_ms = 0; - int64_t copy_to_backend_time_ms = 0; - int64_t convert_time_ms = 0; - - int64_t prev_time_ms = 0; - int64_t curr_time_ms = 0; - int64_t start_time = ggml_time_ms(); - prev_time_ms = start_time; + int64_t process_time_ms = 0; + std::atomic read_time_ms(0); + std::atomic memcpy_time_ms(0); + std::atomic copy_to_backend_time_ms(0); + std::atomic 
convert_time_ms(0); + + int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency(); + + int64_t start_time = ggml_time_ms(); std::vector processed_tensor_storages; { std::unordered_map processed_map; std::mutex map_mutex; - int num_threads = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency(); - int n_threads = std::min(num_threads, (int)tensor_storages.size()); + int n_threads = std::min(num_threads_to_use, (int)tensor_storages.size()); if (n_threads < 1) { n_threads = 1; } @@ -2028,14 +2026,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } } - curr_time_ms = ggml_time_ms(); - process_time_ms = curr_time_ms - prev_time_ms; - prev_time_ms = curr_time_ms; + process_time_ms = ggml_time_ms() - start_time; bool success = true; size_t total_tensors_processed = 0; const size_t total_tensors_to_process = processed_tensor_storages.size(); const int64_t t_start = ggml_time_ms(); + int last_n_threads = 1; for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) { std::string file_path = file_paths_[file_index]; @@ -2059,11 +2056,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } } - int num_threads = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency(); - int n_threads = is_zip ? 1 : std::min(num_threads, (int)file_tensors.size()); + int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size()); if (n_threads < 1) { n_threads = 1; } + last_n_threads = n_threads; std::atomic tensor_idx(0); std::atomic failed(false); @@ -2093,6 +2090,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread std::vector convert_buffer; while (true) { + int64_t t0, t1; size_t idx = tensor_idx.fetch_add(1); if (idx >= file_tensors.size() || failed) { break; @@ -2101,6 +2099,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread const TensorStorage& tensor_storage = *file_tensors[idx]; ggml_tensor* dst_tensor = NULL; + t0 = ggml_time_ms(); + if (!on_new_tensor_cb(tensor_storage, &dst_tensor)) { LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str()); failed = true; @@ -2108,6 +2108,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } if (dst_tensor == NULL) { + t1 = ggml_time_ms(); + read_time_ms.fetch_add(t1 - t0); continue; } @@ -2118,28 +2120,19 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread zip_entry_openbyindex(zip, tensor_storage.index_in_zip); size_t entry_size = zip_entry_size(zip); if (entry_size != n) { + int64_t t_memcpy_start; read_buffer.resize(entry_size); - prev_time_ms = ggml_time_ms(); zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size); - curr_time_ms = ggml_time_ms(); - read_time_ms += curr_time_ms - prev_time_ms; - prev_time_ms = curr_time_ms; + t_memcpy_start = ggml_time_ms(); memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n); - curr_time_ms = ggml_time_ms(); - memcpy_time_ms += curr_time_ms - prev_time_ms; + memcpy_time_ms.fetch_add(ggml_time_ms() - t_memcpy_start); } else { - prev_time_ms = ggml_time_ms(); zip_entry_noallocread(zip, (void*)buf, n); - curr_time_ms = ggml_time_ms(); - read_time_ms += curr_time_ms - prev_time_ms; } zip_entry_close(zip); } else { - prev_time_ms = ggml_time_ms(); file.seekg(tensor_storage.offset); file.read(buf, n); - curr_time_ms = ggml_time_ms(); - read_time_ms += curr_time_ms - prev_time_ms; if 
(!file) { LOG_ERROR("read tensor data failed: '%s'", file_path.c_str()); failed = true; @@ -2156,8 +2149,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } else { read_data((char*)dst_tensor->data, nbytes_to_read); } + t1 = ggml_time_ms(); + read_time_ms.fetch_add(t1 - t0); - prev_time_ms = ggml_time_ms(); + t0 = ggml_time_ms(); if (tensor_storage.is_bf16) { // inplace op bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements()); @@ -2172,13 +2167,15 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } else if (tensor_storage.is_i64) { i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements()); } - curr_time_ms = ggml_time_ms(); - convert_time_ms += curr_time_ms - prev_time_ms; + t1 = ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); } else { read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); read_data((char*)read_buffer.data(), nbytes_to_read); + t1 = ggml_time_ms(); + read_time_ms.fetch_add(t1 - t0); - prev_time_ms = ggml_time_ms(); + t0 = ggml_time_ms(); if (tensor_storage.is_bf16) { // inplace op bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); @@ -2195,17 +2192,17 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread // inplace op i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); } - - convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, - dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); - curr_time_ms = ggml_time_ms(); - convert_time_ms += curr_time_ms - prev_time_ms; + convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); + t1 = ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); } } else { read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); read_data((char*)read_buffer.data(), nbytes_to_read); + t1 = ggml_time_ms(); + read_time_ms.fetch_add(t1 - t0); - prev_time_ms = ggml_time_ms(); + t0 = ggml_time_ms(); if (tensor_storage.is_bf16) { // inplace op bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); @@ -2229,20 +2226,18 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread convert_time_ms += curr_time_ms - prev_time_ms; prev_time_ms = curr_time_ms; ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor)); - curr_time_ms = ggml_time_ms(); - copy_to_backend_time_ms += curr_time_ms - prev_time_ms; + t1 = ggml_time_ms(); + copy_to_backend_time_ms.fetch_add(t1 - t0); } else { // convert first, then copy to device memory convert_buffer.resize(ggml_nbytes(dst_tensor)); - convert_tensor((void*)read_buffer.data(), tensor_storage.type, - (void*)convert_buffer.data(), dst_tensor->type, - (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); - curr_time_ms = ggml_time_ms(); - convert_time_ms += curr_time_ms - prev_time_ms; - prev_time_ms = curr_time_ms; + convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); + t1 = 
ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); + t0 = ggml_time_ms(); ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); - curr_time_ms = ggml_time_ms(); - copy_to_backend_time_ms += curr_time_ms - prev_time_ms; + t1 = ggml_time_ms(); + copy_to_backend_time_ms.fetch_add(t1 - t0); } } } @@ -2281,10 +2276,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread LOG_INFO("loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)", (end_time - start_time) / 1000.f, process_time_ms / 1000.f, - read_time_ms / 1000.f, - memcpy_time_ms / 1000.f, - convert_time_ms / 1000.f, - copy_to_backend_time_ms / 1000.f); + (read_time_ms.load() / (float)last_n_threads) / 1000.f, + (memcpy_time_ms.load() / (float)last_n_threads) / 1000.f, + (convert_time_ms.load() / (float)last_n_threads) / 1000.f, + (copy_to_backend_time_ms.load() / (float)last_n_threads) / 1000.f); return success; } From 62ba7f7c70ab7e1d9b934f63efc4b8eab349f358 Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 8 Sep 2025 15:04:01 +0000 Subject: [PATCH 09/13] fix build --- model.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/model.cpp b/model.cpp index 010848265..2bb05d192 100644 --- a/model.cpp +++ b/model.cpp @@ -2222,14 +2222,15 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread if (tensor_storage.type == dst_tensor->type) { // copy to device memory - curr_time_ms = ggml_time_ms(); - convert_time_ms += curr_time_ms - prev_time_ms; - prev_time_ms = curr_time_ms; + t1 = ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); + t0 = ggml_time_ms(); ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor)); t1 = ggml_time_ms(); copy_to_backend_time_ms.fetch_add(t1 - t0); } else { // convert first, then copy to device memory + convert_buffer.resize(ggml_nbytes(dst_tensor)); convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); t1 = ggml_time_ms(); From 1e72471d441489cefbdf4c2bb65a049598fed22e Mon Sep 17 00:00:00 2001 From: leejet Date: Sun, 14 Sep 2025 22:27:11 +0800 Subject: [PATCH 10/13] fix potential duplicate issue --- lora.hpp | 8 ++++---- model.cpp | 31 +++++++++++++++++++++---------- model.h | 2 +- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/lora.hpp b/lora.hpp index fde54eef5..f3db1fb52 100644 --- a/lora.hpp +++ b/lora.hpp @@ -1,8 +1,8 @@ #ifndef __LORA_HPP__ #define __LORA_HPP__ -#include "ggml_extend.hpp" #include +#include "ggml_extend.hpp" #define LORA_GRAPH_BASE_SIZE 10240 @@ -157,13 +157,13 @@ struct LoraModel : public GGMLRunner { model_loader.load_tensors(on_new_tensor_cb, n_threads); for (const auto& pair : tensors_to_create) { - const auto& name = pair.first; - const auto& ts = pair.second; + const auto& name = pair.first; + const auto& ts = pair.second; struct ggml_tensor* real = ggml_new_tensor(params_ctx, ts.type, ts.n_dims, ts.ne); - lora_tensors[name] = real; + lora_tensors[name] = real; } alloc_params_buffer(); diff --git a/model.cpp b/model.cpp index 2bb05d192..57757ea9c 100644 --- a/model.cpp +++ b/model.cpp @@ -1982,8 +1982,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread std::vector processed_tensor_storages; { - std::unordered_map processed_map; - std::mutex map_mutex; + struct IndexedStorage { 
+            size_t index;
+            TensorStorage ts;
+        };
+
+        std::mutex vec_mutex;
+        std::vector<IndexedStorage> all_results;
 
         int n_threads = std::min(num_threads_to_use, (int)tensor_storages.size());
         if (n_threads < 1) {
@@ -1993,7 +1998,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
 
         for (int i = 0; i < n_threads; ++i) {
             workers.emplace_back([&, thread_id = i]() {
-                std::unordered_map<std::string, TensorStorage> local_processed_map;
+                std::vector<IndexedStorage> local_results;
                 std::vector<TensorStorage> temp_storages;
 
                 for (size_t j = thread_id; j < tensor_storages.size(); j += n_threads) {
@@ -2006,13 +2011,14 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                     preprocess_tensor(tensor_storage, temp_storages);
 
                     for (const auto& ts : temp_storages) {
-                        local_processed_map[ts.name] = ts;
+                        local_results.push_back({j, ts});
                     }
                 }
 
-                if (!local_processed_map.empty()) {
-                    std::lock_guard<std::mutex> lock(map_mutex);
-                    processed_map.merge(local_processed_map);
+                if (!local_results.empty()) {
+                    std::lock_guard<std::mutex> lock(vec_mutex);
+                    all_results.insert(all_results.end(),
+                                       local_results.begin(), local_results.end());
                 }
             });
         }
@@ -2020,9 +2026,14 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             w.join();
         }
 
-        processed_tensor_storages.reserve(processed_map.size());
-        for (auto const& [name, ts] : processed_map) {
-            processed_tensor_storages.push_back(ts);
+        std::unordered_map<std::string, IndexedStorage> latest_map;
+        for (auto& entry : all_results) {
+            latest_map[entry.ts.name] = entry;
+        }
+
+        processed_tensor_storages.reserve(latest_map.size());
+        for (auto& [name, entry] : latest_map) {
+            processed_tensor_storages.push_back(entry.ts);
         }
     }
 
diff --git a/model.h b/model.h
index cfd988969..1dbcc9bb6 100644
--- a/model.h
+++ b/model.h
@@ -250,7 +250,7 @@ class ModelLoader {
     bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
     bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                       std::set<std::string> ignore_tensors = {},
-                      int n_threads = 0);
+                      int n_threads                        = 0);
     bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
     bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);

From ce092341edcfb8b7d74548753ee4b35d1e21daea Mon Sep 17 00:00:00 2001
From: leejet
Date: Sun, 14 Sep 2025 22:29:53 +0800
Subject: [PATCH 11/13] avoid duplicate lookup of lora tensor

---
 lora.hpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lora.hpp b/lora.hpp
index f3db1fb52..1149d3b0d 100644
--- a/lora.hpp
+++ b/lora.hpp
@@ -147,8 +147,9 @@ struct LoraModel : public GGMLRunner {
                 }
             } else {
                 const std::string& name = tensor_storage.name;
-                if (lora_tensors.count(name)) {
-                    *dst_tensor = lora_tensors.at(name);
+                auto iter = lora_tensors.find(name);
+                if (iter != lora_tensors.end()) {
+                    *dst_tensor = iter->second;
                 }
             }
             return true;

From 94ab11fabaa6ecceca7609728581001c5a69486a Mon Sep 17 00:00:00 2001
From: leejet
Date: Sun, 14 Sep 2025 22:35:42 +0800
Subject: [PATCH 12/13] fix progress bar

---
 lora.hpp  |  2 +-
 model.cpp | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/lora.hpp b/lora.hpp
index 1149d3b0d..222f61b1e 100644
--- a/lora.hpp
+++ b/lora.hpp
@@ -147,7 +147,7 @@ struct LoraModel : public GGMLRunner {
                 }
             } else {
                 const std::string& name = tensor_storage.name;
-                auto iter = lora_tensors.find(name);
+                auto iter               = lora_tensors.find(name);
                 if (iter != lora_tensors.end()) {
                     *dst_tensor = iter->second;
                 }
diff --git a/model.cpp b/model.cpp
index 57757ea9c..d5e6785d1 100644
--- a/model.cpp
+++ b/model.cpp
@@ -2264,7 +2264,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             if (current_idx >= file_tensors.size() || failed) {
                 break;
             }
-            pretty_progress(total_tensors_processed + current_idx, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f);
+            size_t curr_num = total_tensors_processed + current_idx;
+            pretty_progress(curr_num, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (curr_num + 1e-6f));
             std::this_thread::sleep_for(std::chrono::milliseconds(200));
         }
 
@@ -2277,11 +2278,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             break;
         }
         total_tensors_processed += file_tensors.size();
-    }
-
-    pretty_progress(total_tensors_processed, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f);
-    if (total_tensors_to_process > 0) {
-        printf("\n");
+        pretty_progress(total_tensors_processed, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (total_tensors_processed + 1e-6f));
+        if (total_tensors_processed < total_tensors_to_process) {
+            printf("\n");
+        }
     }
 
     int64_t end_time = ggml_time_ms();

From 4e408b0e2b052753c025f43b0beb713ac726bef4 Mon Sep 17 00:00:00 2001
From: leejet
Date: Sun, 14 Sep 2025 22:45:37 +0800
Subject: [PATCH 13/13] remove unused remove_duplicates

---
 model.cpp | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/model.cpp b/model.cpp
index d5e6785d1..8d0f46bd4 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1949,26 +1949,6 @@ std::string ModelLoader::load_umt5_tokenizer_json() {
     return json_str;
 }
 
-std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& vec) {
-    std::vector<TensorStorage> res;
-    std::unordered_map<std::string, size_t> name_to_index_map;
-
-    for (const auto& ts : vec) {
-        const std::string& current_name = ts.name;
-        auto it = name_to_index_map.find(current_name);
-
-        if (it != name_to_index_map.end()) {
-            // Found a duplicate, overwrite the existing one in res
-            res[it->second] = ts;
-        } else {
-            // Not a duplicate, add to map and push to res
-            name_to_index_map[current_name] = res.size();
-            res.push_back(ts);
-        }
-    }
-    return res;
-}
-
 bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) {
     int64_t process_time_ms = 0;
     std::atomic<int64_t> read_time_ms(0);