From 55b7707e99f5d3a941f6d96c98d852c1713a00ab Mon Sep 17 00:00:00 2001 From: rmatif Date: Thu, 28 Aug 2025 16:38:13 +0000 Subject: [PATCH 01/13] opt tensor loading --- model.cpp | 379 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 223 insertions(+), 156 deletions(-) diff --git a/model.cpp b/model.cpp index 1be057158..a1f81c448 100644 --- a/model.cpp +++ b/model.cpp @@ -5,6 +5,11 @@ #include #include #include +#include +#include +#include +#include +#include #include "gguf_reader.hpp" #include "model.h" @@ -1948,238 +1953,300 @@ std::vector remove_duplicates(const std::vector& v std::vector res; std::unordered_map name_to_index_map; - for (size_t i = 0; i < vec.size(); ++i) { - const std::string& current_name = vec[i].name; - auto it = name_to_index_map.find(current_name); + for (const auto& ts : vec) { + const std::string& current_name = ts.name; + auto it = name_to_index_map.find(current_name); if (it != name_to_index_map.end()) { - res[it->second] = vec[i]; + // Found a duplicate, overwrite the existing one in res + res[it->second] = ts; } else { - name_to_index_map[current_name] = i; - res.push_back(vec[i]); + // Not a duplicate, add to map and push to res + name_to_index_map[current_name] = res.size(); + res.push_back(ts); } } - - // vec.resize(name_to_index_map.size()); - return res; } bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { std::vector processed_tensor_storages; - for (auto& tensor_storage : tensor_storages) { - // LOG_DEBUG("%s", name.c_str()); + + { + std::unordered_map processed_map; + std::mutex map_mutex; - if (is_unused_tensor(tensor_storage.name)) { - continue; + int n_threads = std::min((int)std::thread::hardware_concurrency(), (int)tensor_storages.size()); + if (n_threads < 1) { + n_threads = 1; } + std::vector workers; - preprocess_tensor(tensor_storage, processed_tensor_storages); + for (int i = 0; i < n_threads; ++i) { + workers.emplace_back([&, thread_id = i]() { + + std::unordered_map local_processed_map; + std::vector temp_storages; + + for (size_t j = thread_id; j < tensor_storages.size(); j += n_threads) { + const auto& tensor_storage = tensor_storages[j]; + if (is_unused_tensor(tensor_storage.name)) { + continue; + } + + temp_storages.clear(); + preprocess_tensor(tensor_storage, temp_storages); + + for (const auto& ts : temp_storages) { + local_processed_map[ts.name] = ts; + } + } + + if (!local_processed_map.empty()) { + std::lock_guard lock(map_mutex); + processed_map.merge(local_processed_map); + } + }); + } + for (auto& w : workers) { + w.join(); + } + + processed_tensor_storages.reserve(processed_map.size()); + for (auto const& [name, ts] : processed_map) { + processed_tensor_storages.push_back(ts); + } } - std::vector dedup = remove_duplicates(processed_tensor_storages); - processed_tensor_storages = dedup; - bool success = true; + bool success = true; + size_t total_tensors_processed = 0; + const size_t total_tensors_to_process = processed_tensor_storages.size(); + const int64_t t_start = ggml_time_ms(); + for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) { std::string file_path = file_paths_[file_index]; LOG_DEBUG("loading tensors from %s", file_path.c_str()); - std::ifstream file(file_path, std::ios::binary); - if (!file.is_open()) { - LOG_ERROR("failed to open '%s'", file_path.c_str()); - return false; + std::vector file_tensors; + for (const auto& ts : processed_tensor_storages) { + if (ts.file_index == file_index) { + file_tensors.push_back(&ts); + } + } + if 
(file_tensors.empty()) { + continue; } bool is_zip = false; - for (auto& tensor_storage : tensor_storages) { - if (tensor_storage.file_index != file_index) { - continue; - } - if (tensor_storage.index_in_zip >= 0) { + for (auto const& ts : file_tensors) { + if (ts->index_in_zip >= 0) { is_zip = true; break; } } - struct zip_t* zip = NULL; - if (is_zip) { - zip = zip_open(file_path.c_str(), 0, 'r'); - if (zip == NULL) { - LOG_ERROR("failed to open zip '%s'", file_path.c_str()); - return false; - } + int n_threads = is_zip ? 1 : std::min((int)std::thread::hardware_concurrency(), (int)file_tensors.size()); + if (n_threads < 1) { + n_threads = 1; } - std::vector read_buffer; - std::vector convert_buffer; + std::atomic tensor_idx(0); + std::atomic failed(false); + std::vector workers; - auto read_data = [&](const TensorStorage& tensor_storage, char* buf, size_t n) { - if (zip != NULL) { - zip_entry_openbyindex(zip, tensor_storage.index_in_zip); - size_t entry_size = zip_entry_size(zip); - if (entry_size != n) { - read_buffer.resize(entry_size); - zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size); - memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n); + for (int i = 0; i < n_threads; ++i) { + workers.emplace_back([&, file_path, is_zip]() { + std::ifstream file; + struct zip_t* zip = NULL; + if (is_zip) { + zip = zip_open(file_path.c_str(), 0, 'r'); + if (zip == NULL) { + LOG_ERROR("failed to open zip '%s'", file_path.c_str()); + failed = true; + return; + } } else { - zip_entry_noallocread(zip, (void*)buf, n); - } - zip_entry_close(zip); - } else { - file.seekg(tensor_storage.offset); - file.read(buf, n); - if (!file) { - LOG_ERROR("read tensor data failed: '%s'", file_path.c_str()); - return false; + file.open(file_path, std::ios::binary); + if (!file.is_open()) { + LOG_ERROR("failed to open '%s'", file_path.c_str()); + failed = true; + return; + } } - } - return true; - }; - int tensor_count = 0; - int64_t t0 = ggml_time_ms(); - int64_t t1 = t0; - bool partial = true; - int tensor_max = (int)processed_tensor_storages.size(); - pretty_progress(0, tensor_max, 0.0f); - for (auto& tensor_storage : processed_tensor_storages) { - if (tensor_storage.file_index != file_index) { - ++tensor_count; - continue; - } - ggml_tensor* dst_tensor = NULL; - success = on_new_tensor_cb(tensor_storage, &dst_tensor); - if (!success) { - LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str()); - break; - } + std::vector read_buffer; + std::vector convert_buffer; - if (dst_tensor == NULL) { - ++tensor_count; - continue; - } + while (true) { + size_t idx = tensor_idx.fetch_add(1); + if (idx >= file_tensors.size() || failed) { + break; + } - size_t nbytes_to_read = tensor_storage.nbytes_to_read(); + const TensorStorage& tensor_storage = *file_tensors[idx]; + ggml_tensor* dst_tensor = NULL; - if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) { - // for the CPU and Metal backend, we can copy directly into the tensor - if (tensor_storage.type == dst_tensor->type) { - GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes()); - if (tensor_storage.is_f64 || tensor_storage.is_i64) { - read_buffer.resize(tensor_storage.nbytes_to_read()); - read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); - } else { - read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read); + if (!on_new_tensor_cb(tensor_storage, &dst_tensor)) { + LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str()); + failed = true; + 
break; + } + + if (dst_tensor == NULL) { + continue; } - if (tensor_storage.is_bf16) { + size_t nbytes_to_read = tensor_storage.nbytes_to_read(); + + auto read_data = [&](char* buf, size_t n) { + if (zip != NULL) { + zip_entry_openbyindex(zip, tensor_storage.index_in_zip); + size_t entry_size = zip_entry_size(zip); + if (entry_size != n) { + read_buffer.resize(entry_size); + zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size); + memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n); + } else { + zip_entry_noallocread(zip, (void*)buf, n); + } + zip_entry_close(zip); + } else { + file.seekg(tensor_storage.offset); + file.read(buf, n); + if (!file) { + LOG_ERROR("read tensor data failed: '%s'", file_path.c_str()); + failed = true; + } + } + }; + + if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) { + if (tensor_storage.type == dst_tensor->type) { + GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes()); + if (tensor_storage.is_f64 || tensor_storage.is_i64) { + read_buffer.resize(tensor_storage.nbytes_to_read()); + read_data((char*)read_buffer.data(), nbytes_to_read); + } else { + read_data((char*)dst_tensor->data, nbytes_to_read); + } + + if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e4m3) { + bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e4m3) { // inplace op - f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e5m2) { + f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e5m2) { // inplace op - f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); - } else if (tensor_storage.is_f64) { - f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements()); - } else if (tensor_storage.is_i64) { - i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements()); - } - } else { - read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); - read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); - - if (tensor_storage.is_bf16) { + f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_f64) { + f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_i64) { + i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements()); + } + } else { + read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); + read_data((char*)read_buffer.data(), nbytes_to_read); + + if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e4m3) { + bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e4m3) { // inplace op - f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e5m2) { + 
f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e5m2) { // inplace op - f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f64) { + f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f64) { // inplace op - f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_i64) { + f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_i64) { // inplace op - i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); - } + i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); + } - convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, - dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); - } - } else { - read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); - read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); + convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, + dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); + } + } else { + read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); + read_data((char*)read_buffer.data(), nbytes_to_read); - if (tensor_storage.is_bf16) { + if (tensor_storage.is_bf16) { // inplace op - bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e4m3) { + bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e4m3) { // inplace op - f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e5m2) { + f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e5m2) { // inplace op - f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f64) { + f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f64) { // inplace op - f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_i64) { + f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_i64) { // inplace op - i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); - } + i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); + } - if (tensor_storage.type == dst_tensor->type) { + if (tensor_storage.type == dst_tensor->type) { // copy to device memory - ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor)); - } else { + ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, 
ggml_nbytes(dst_tensor)); + } else { // convert first, then copy to device memory - convert_buffer.resize(ggml_nbytes(dst_tensor)); - convert_tensor((void*)read_buffer.data(), tensor_storage.type, - (void*)convert_buffer.data(), dst_tensor->type, - (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); - ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); + convert_buffer.resize(ggml_nbytes(dst_tensor)); + convert_tensor((void*)read_buffer.data(), tensor_storage.type, + (void*)convert_buffer.data(), dst_tensor->type, + (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); + ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); + } + } } - } - ++tensor_count; - int64_t t2 = ggml_time_ms(); - if ((t2 - t1) >= 200) { - t1 = t2; - pretty_progress(tensor_count, tensor_max, (t1 - t0) / (1000.0f * tensor_count)); - partial = tensor_count != tensor_max; - } + if (zip != NULL) { + zip_close(zip); + } + }); } - if (partial) { - if (tensor_count >= 1) { - t1 = ggml_time_ms(); - pretty_progress(tensor_count, tensor_max, (t1 - t0) / (1000.0f * tensor_count)); - } - if (tensor_count < tensor_max) { - printf("\n"); + while (true) { + size_t current_idx = tensor_idx.load(); + if (current_idx >= file_tensors.size() || failed) { + break; } + pretty_progress(total_tensors_processed + current_idx, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f); + std::this_thread::sleep_for(std::chrono::milliseconds(200)); } - if (zip != NULL) { - zip_close(zip); + for (auto& w : workers) { + w.join(); } - if (!success) { + if (failed) { + success = false; break; } + total_tensors_processed += file_tensors.size(); + } + + pretty_progress(total_tensors_processed, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f); + if (total_tensors_to_process > 0) { + printf("\n"); } + return success; } bool ModelLoader::load_tensors(std::map& tensors, std::set ignore_tensors) { std::set tensor_names_in_file; + std::mutex tensor_names_mutex; auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { const std::string& name = tensor_storage.name; // LOG_DEBUG("%s", tensor_storage.to_string().c_str()); - tensor_names_in_file.insert(name); + { + std::lock_guard lock(tensor_names_mutex); + tensor_names_in_file.insert(name); + } struct ggml_tensor* real; if (tensors.find(name) != tensors.end()) { From 6fa2b26c94612b135052b6f51ed6985da4cb0b29 Mon Sep 17 00:00:00 2001 From: rmatif Date: Sat, 6 Sep 2025 21:21:13 +0000 Subject: [PATCH 02/13] fix build failure --- model.cpp | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/model.cpp b/model.cpp index a1f81c448..e9db172de 100644 --- a/model.cpp +++ b/model.cpp @@ -1973,48 +1973,57 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { std::vector processed_tensor_storages; { - std::unordered_map processed_map; - std::mutex map_mutex; - int n_threads = std::min((int)std::thread::hardware_concurrency(), (int)tensor_storages.size()); if (n_threads < 1) { n_threads = 1; } + + std::vector > local_maps(n_threads); std::vector workers; + size_t chunk_size = (tensor_storages.size() + n_threads - 1) / n_threads; for (int i = 0; i < n_threads; ++i) { workers.emplace_back([&, thread_id = i]() { + const size_t start = thread_id * chunk_size; + const size_t end = std::min(start + chunk_size, tensor_storages.size()); - 
std::unordered_map local_processed_map; std::vector temp_storages; - - for (size_t j = thread_id; j < tensor_storages.size(); j += n_threads) { + for (size_t j = start; j < end; ++j) { const auto& tensor_storage = tensor_storages[j]; if (is_unused_tensor(tensor_storage.name)) { continue; } - + temp_storages.clear(); preprocess_tensor(tensor_storage, temp_storages); - - for (const auto& ts : temp_storages) { - local_processed_map[ts.name] = ts; - } - } - if (!local_processed_map.empty()) { - std::lock_guard lock(map_mutex); - processed_map.merge(local_processed_map); + for (size_t k = 0; k < temp_storages.size(); ++k) { + local_maps[thread_id][temp_storages[k].name] = temp_storages[k]; + } } }); } - for (auto& w : workers) { - w.join(); + + for (size_t i = 0; i < workers.size(); ++i) { + workers[i].join(); + } + + std::unordered_map processed_map; + size_t total_keys = 0; + for (int i = 0; i < n_threads; ++i) { + total_keys += local_maps[i].size(); } - + processed_map.reserve(total_keys); + + for (int i = 0; i < n_threads; ++i) { + for (std::unordered_map::const_iterator it = local_maps[i].begin(); it != local_maps[i].end(); ++it) { + processed_map[it->first] = it->second; + } + } + processed_tensor_storages.reserve(processed_map.size()); - for (auto const& [name, ts] : processed_map) { - processed_tensor_storages.push_back(ts); + for (std::unordered_map::const_iterator it = processed_map.begin(); it != processed_map.end(); ++it) { + processed_tensor_storages.push_back(it->second); } } From 12295b2ab599a6927dd867c5d837314553ee1395 Mon Sep 17 00:00:00 2001 From: rmatif Date: Sun, 7 Sep 2025 04:21:16 +0000 Subject: [PATCH 03/13] revert the changes --- model.cpp | 49 ++++++++++++++++++++----------------------------- 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/model.cpp b/model.cpp index e9db172de..a1f81c448 100644 --- a/model.cpp +++ b/model.cpp @@ -1973,57 +1973,48 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { std::vector processed_tensor_storages; { + std::unordered_map processed_map; + std::mutex map_mutex; + int n_threads = std::min((int)std::thread::hardware_concurrency(), (int)tensor_storages.size()); if (n_threads < 1) { n_threads = 1; } - - std::vector > local_maps(n_threads); std::vector workers; - size_t chunk_size = (tensor_storages.size() + n_threads - 1) / n_threads; for (int i = 0; i < n_threads; ++i) { workers.emplace_back([&, thread_id = i]() { - const size_t start = thread_id * chunk_size; - const size_t end = std::min(start + chunk_size, tensor_storages.size()); + std::unordered_map local_processed_map; std::vector temp_storages; - for (size_t j = start; j < end; ++j) { + + for (size_t j = thread_id; j < tensor_storages.size(); j += n_threads) { const auto& tensor_storage = tensor_storages[j]; if (is_unused_tensor(tensor_storage.name)) { continue; } - + temp_storages.clear(); preprocess_tensor(tensor_storage, temp_storages); - - for (size_t k = 0; k < temp_storages.size(); ++k) { - local_maps[thread_id][temp_storages[k].name] = temp_storages[k]; + + for (const auto& ts : temp_storages) { + local_processed_map[ts.name] = ts; } } - }); - } - - for (size_t i = 0; i < workers.size(); ++i) { - workers[i].join(); - } - std::unordered_map processed_map; - size_t total_keys = 0; - for (int i = 0; i < n_threads; ++i) { - total_keys += local_maps[i].size(); + if (!local_processed_map.empty()) { + std::lock_guard lock(map_mutex); + processed_map.merge(local_processed_map); + } + }); } - processed_map.reserve(total_keys); - - for (int i = 0; 
i < n_threads; ++i) { - for (std::unordered_map::const_iterator it = local_maps[i].begin(); it != local_maps[i].end(); ++it) { - processed_map[it->first] = it->second; - } + for (auto& w : workers) { + w.join(); } - + processed_tensor_storages.reserve(processed_map.size()); - for (std::unordered_map::const_iterator it = processed_map.begin(); it != processed_map.end(); ++it) { - processed_tensor_storages.push_back(it->second); + for (auto const& [name, ts] : processed_map) { + processed_tensor_storages.push_back(ts); } } From 401c42c2c76f6e17a4cb021aeb185960843347ee Mon Sep 17 00:00:00 2001 From: rmatif Date: Sun, 7 Sep 2025 17:03:16 +0000 Subject: [PATCH 04/13] allow the use of n_threads --- model.cpp | 13 ++++++++----- model.h | 5 +++-- stable-diffusion.cpp | 2 +- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/model.cpp b/model.cpp index 5ea24911f..e35cc9287 100644 --- a/model.cpp +++ b/model.cpp @@ -1969,7 +1969,7 @@ std::vector remove_duplicates(const std::vector& v return res; } -bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { +bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) { int64_t process_time_ms = 0; int64_t read_time_ms = 0; int64_t memcpy_time_ms = 0; @@ -1986,7 +1986,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { std::unordered_map processed_map; std::mutex map_mutex; - int n_threads = std::min((int)std::thread::hardware_concurrency(), (int)tensor_storages.size()); + int num_threads = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency(); + int n_threads = std::min(num_threads, (int)tensor_storages.size()); if (n_threads < 1) { n_threads = 1; } @@ -2058,7 +2059,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { } } - int n_threads = is_zip ? 1 : std::min((int)std::thread::hardware_concurrency(), (int)file_tensors.size()); + int num_threads = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency(); + int n_threads = is_zip ? 
1 : std::min(num_threads, (int)file_tensors.size()); if (n_threads < 1) { n_threads = 1; } @@ -2287,7 +2289,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) { } bool ModelLoader::load_tensors(std::map& tensors, - std::set ignore_tensors) { + std::set ignore_tensors, + int n_threads) { std::set tensor_names_in_file; std::mutex tensor_names_mutex; auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { @@ -2330,7 +2333,7 @@ bool ModelLoader::load_tensors(std::map& tenso return true; }; - bool success = load_tensors(on_new_tensor_cb); + bool success = load_tensors(on_new_tensor_cb, n_threads); if (!success) { LOG_ERROR("load tensors from file failed"); return false; diff --git a/model.h b/model.h index fef6ace82..cfd988969 100644 --- a/model.h +++ b/model.h @@ -247,9 +247,10 @@ class ModelLoader { ggml_type get_diffusion_model_wtype(); ggml_type get_vae_wtype(); void set_wtype_override(ggml_type wtype, std::string prefix = ""); - bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb); + bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0); bool load_tensors(std::map& tensors, - std::set ignore_tensors = {}); + std::set ignore_tensors = {}, + int n_threads = 0); bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules); bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 17804c11b..57d6959a5 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -573,7 +573,7 @@ class StableDiffusionGGML { if (version == VERSION_SVD) { ignore_tensors.insert("conditioner.embedders.3"); } - bool success = model_loader.load_tensors(tensors, ignore_tensors); + bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads); if (!success) { LOG_ERROR("load tensors from model loader failed"); ggml_free(ctx); From 9e0d8e53cbfe0a85ea2fe32262286c1c08343fee Mon Sep 17 00:00:00 2001 From: rmatif Date: Sun, 7 Sep 2025 23:09:40 +0000 Subject: [PATCH 05/13] fix lora loading --- lora.hpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/lora.hpp b/lora.hpp index b7a27306c..7d3222760 100644 --- a/lora.hpp +++ b/lora.hpp @@ -115,7 +115,7 @@ struct LoraModel : public GGMLRunner { return "lora"; } - bool load_from_file(bool filter_tensor = false) { + bool load_from_file(bool filter_tensor = false, int n_threads = 0) { LOG_INFO("loading LoRA from '%s'", file_path.c_str()); if (load_failed) { @@ -131,15 +131,14 @@ struct LoraModel : public GGMLRunner { // LOG_INFO("skipping LoRA tesnor '%s'", name.c_str()); return true; } - // LOG_INFO("lora_tensor %s", name.c_str()); - for (int i = 0; i < LORA_TYPE_COUNT; i++) { - if (name.find(type_fingerprints[i]) != std::string::npos) { - type = (lora_t)i; - break; - } - } if (dry_run) { + for (int i = 0; i < LORA_TYPE_COUNT; i++) { + if (name.find(type_fingerprints[i]) != std::string::npos) { + type = (lora_t)i; + break; + } + } struct ggml_tensor* real = ggml_new_tensor(params_ctx, tensor_storage.type, tensor_storage.n_dims, @@ -153,11 +152,11 @@ struct LoraModel : public GGMLRunner { return true; }; - model_loader.load_tensors(on_new_tensor_cb); + model_loader.load_tensors(on_new_tensor_cb, 1); alloc_params_buffer(); - // exit(0); + dry_run = false; - model_loader.load_tensors(on_new_tensor_cb); + model_loader.load_tensors(on_new_tensor_cb, n_threads); LOG_DEBUG("lora type: \"%s\"/\"%s\"", 
lora_downs[type].c_str(), lora_ups[type].c_str()); From 507f4068933f57a3db6bc74679fde1ae9fa1a2a7 Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 8 Sep 2025 10:48:34 +0000 Subject: [PATCH 06/13] optimize lora loading --- lora.hpp | 50 +++++++++++++++++++++++++++++++------------------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/lora.hpp b/lora.hpp index 7d3222760..3366e761b 100644 --- a/lora.hpp +++ b/lora.hpp @@ -123,36 +123,48 @@ struct LoraModel : public GGMLRunner { return false; } + std::unordered_map tensors_to_create; + std::mutex lora_mutex; bool dry_run = true; auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool { - const std::string& name = tensor_storage.name; + if (dry_run) { + const std::string& name = tensor_storage.name; - if (filter_tensor && !contains(name, "lora")) { - // LOG_INFO("skipping LoRA tesnor '%s'", name.c_str()); - return true; - } + if (filter_tensor && !contains(name, "lora")) { + return true; + } - if (dry_run) { - for (int i = 0; i < LORA_TYPE_COUNT; i++) { - if (name.find(type_fingerprints[i]) != std::string::npos) { - type = (lora_t)i; - break; + { + std::lock_guard lock(lora_mutex); + for (int i = 0; i < LORA_TYPE_COUNT; i++) { + if (name.find(type_fingerprints[i]) != std::string::npos) { + type = (lora_t)i; + break; + } } + tensors_to_create[name] = tensor_storage; } - struct ggml_tensor* real = ggml_new_tensor(params_ctx, - tensor_storage.type, - tensor_storage.n_dims, - tensor_storage.ne); - lora_tensors[name] = real; } else { - auto real = lora_tensors[name]; - *dst_tensor = real; + const std::string& name = tensor_storage.name; + if (lora_tensors.count(name)) { + *dst_tensor = lora_tensors.at(name); + } } - return true; }; - model_loader.load_tensors(on_new_tensor_cb, 1); + model_loader.load_tensors(on_new_tensor_cb, n_threads); + + for (const auto& pair : tensors_to_create) { + const auto& name = pair.first; + const auto& ts = pair.second; + struct ggml_tensor* real = ggml_new_tensor(params_ctx, + ts.type, + ts.n_dims, + ts.ne); + lora_tensors[name] = real; + } + alloc_params_buffer(); dry_run = false; From e7cd3ca09275bf9e0df6fa3321b1ed40bd1893b8 Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 8 Sep 2025 10:49:32 +0000 Subject: [PATCH 07/13] add mutex --- lora.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lora.hpp b/lora.hpp index 3366e761b..fde54eef5 100644 --- a/lora.hpp +++ b/lora.hpp @@ -2,6 +2,7 @@ #define __LORA_HPP__ #include "ggml_extend.hpp" +#include #define LORA_GRAPH_BASE_SIZE 10240 From 289c329b27bfce5aa146b4e5f5f60a6698b78351 Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 8 Sep 2025 14:25:41 +0000 Subject: [PATCH 08/13] use atomic --- model.cpp | 101 ++++++++++++++++++++++++++---------------------------- 1 file changed, 48 insertions(+), 53 deletions(-) diff --git a/model.cpp b/model.cpp index e35cc9287..010848265 100644 --- a/model.cpp +++ b/model.cpp @@ -1970,24 +1970,22 @@ std::vector remove_duplicates(const std::vector& v } bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) { - int64_t process_time_ms = 0; - int64_t read_time_ms = 0; - int64_t memcpy_time_ms = 0; - int64_t copy_to_backend_time_ms = 0; - int64_t convert_time_ms = 0; - - int64_t prev_time_ms = 0; - int64_t curr_time_ms = 0; - int64_t start_time = ggml_time_ms(); - prev_time_ms = start_time; + int64_t process_time_ms = 0; + std::atomic read_time_ms(0); + std::atomic memcpy_time_ms(0); + std::atomic copy_to_backend_time_ms(0); + std::atomic 
convert_time_ms(0); + + int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency(); + + int64_t start_time = ggml_time_ms(); std::vector processed_tensor_storages; { std::unordered_map processed_map; std::mutex map_mutex; - int num_threads = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency(); - int n_threads = std::min(num_threads, (int)tensor_storages.size()); + int n_threads = std::min(num_threads_to_use, (int)tensor_storages.size()); if (n_threads < 1) { n_threads = 1; } @@ -2028,14 +2026,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } } - curr_time_ms = ggml_time_ms(); - process_time_ms = curr_time_ms - prev_time_ms; - prev_time_ms = curr_time_ms; + process_time_ms = ggml_time_ms() - start_time; bool success = true; size_t total_tensors_processed = 0; const size_t total_tensors_to_process = processed_tensor_storages.size(); const int64_t t_start = ggml_time_ms(); + int last_n_threads = 1; for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) { std::string file_path = file_paths_[file_index]; @@ -2059,11 +2056,11 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } } - int num_threads = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency(); - int n_threads = is_zip ? 1 : std::min(num_threads, (int)file_tensors.size()); + int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size()); if (n_threads < 1) { n_threads = 1; } + last_n_threads = n_threads; std::atomic tensor_idx(0); std::atomic failed(false); @@ -2093,6 +2090,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread std::vector convert_buffer; while (true) { + int64_t t0, t1; size_t idx = tensor_idx.fetch_add(1); if (idx >= file_tensors.size() || failed) { break; @@ -2101,6 +2099,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread const TensorStorage& tensor_storage = *file_tensors[idx]; ggml_tensor* dst_tensor = NULL; + t0 = ggml_time_ms(); + if (!on_new_tensor_cb(tensor_storage, &dst_tensor)) { LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str()); failed = true; @@ -2108,6 +2108,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } if (dst_tensor == NULL) { + t1 = ggml_time_ms(); + read_time_ms.fetch_add(t1 - t0); continue; } @@ -2118,28 +2120,19 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread zip_entry_openbyindex(zip, tensor_storage.index_in_zip); size_t entry_size = zip_entry_size(zip); if (entry_size != n) { + int64_t t_memcpy_start; read_buffer.resize(entry_size); - prev_time_ms = ggml_time_ms(); zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size); - curr_time_ms = ggml_time_ms(); - read_time_ms += curr_time_ms - prev_time_ms; - prev_time_ms = curr_time_ms; + t_memcpy_start = ggml_time_ms(); memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n); - curr_time_ms = ggml_time_ms(); - memcpy_time_ms += curr_time_ms - prev_time_ms; + memcpy_time_ms.fetch_add(ggml_time_ms() - t_memcpy_start); } else { - prev_time_ms = ggml_time_ms(); zip_entry_noallocread(zip, (void*)buf, n); - curr_time_ms = ggml_time_ms(); - read_time_ms += curr_time_ms - prev_time_ms; } zip_entry_close(zip); } else { - prev_time_ms = ggml_time_ms(); file.seekg(tensor_storage.offset); file.read(buf, n); - curr_time_ms = ggml_time_ms(); - read_time_ms += curr_time_ms - prev_time_ms; if 
(!file) { LOG_ERROR("read tensor data failed: '%s'", file_path.c_str()); failed = true; @@ -2156,8 +2149,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } else { read_data((char*)dst_tensor->data, nbytes_to_read); } + t1 = ggml_time_ms(); + read_time_ms.fetch_add(t1 - t0); - prev_time_ms = ggml_time_ms(); + t0 = ggml_time_ms(); if (tensor_storage.is_bf16) { // inplace op bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements()); @@ -2172,13 +2167,15 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread } else if (tensor_storage.is_i64) { i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements()); } - curr_time_ms = ggml_time_ms(); - convert_time_ms += curr_time_ms - prev_time_ms; + t1 = ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); } else { read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); read_data((char*)read_buffer.data(), nbytes_to_read); + t1 = ggml_time_ms(); + read_time_ms.fetch_add(t1 - t0); - prev_time_ms = ggml_time_ms(); + t0 = ggml_time_ms(); if (tensor_storage.is_bf16) { // inplace op bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); @@ -2195,17 +2192,17 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread // inplace op i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); } - - convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, - dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); - curr_time_ms = ggml_time_ms(); - convert_time_ms += curr_time_ms - prev_time_ms; + convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); + t1 = ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); } } else { read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); read_data((char*)read_buffer.data(), nbytes_to_read); + t1 = ggml_time_ms(); + read_time_ms.fetch_add(t1 - t0); - prev_time_ms = ggml_time_ms(); + t0 = ggml_time_ms(); if (tensor_storage.is_bf16) { // inplace op bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); @@ -2229,20 +2226,18 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread convert_time_ms += curr_time_ms - prev_time_ms; prev_time_ms = curr_time_ms; ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor)); - curr_time_ms = ggml_time_ms(); - copy_to_backend_time_ms += curr_time_ms - prev_time_ms; + t1 = ggml_time_ms(); + copy_to_backend_time_ms.fetch_add(t1 - t0); } else { // convert first, then copy to device memory convert_buffer.resize(ggml_nbytes(dst_tensor)); - convert_tensor((void*)read_buffer.data(), tensor_storage.type, - (void*)convert_buffer.data(), dst_tensor->type, - (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); - curr_time_ms = ggml_time_ms(); - convert_time_ms += curr_time_ms - prev_time_ms; - prev_time_ms = curr_time_ms; + convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); + t1 = 
ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); + t0 = ggml_time_ms(); ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); - curr_time_ms = ggml_time_ms(); - copy_to_backend_time_ms += curr_time_ms - prev_time_ms; + t1 = ggml_time_ms(); + copy_to_backend_time_ms.fetch_add(t1 - t0); } } } @@ -2281,10 +2276,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread LOG_INFO("loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)", (end_time - start_time) / 1000.f, process_time_ms / 1000.f, - read_time_ms / 1000.f, - memcpy_time_ms / 1000.f, - convert_time_ms / 1000.f, - copy_to_backend_time_ms / 1000.f); + (read_time_ms.load() / (float)last_n_threads) / 1000.f, + (memcpy_time_ms.load() / (float)last_n_threads) / 1000.f, + (convert_time_ms.load() / (float)last_n_threads) / 1000.f, + (copy_to_backend_time_ms.load() / (float)last_n_threads) / 1000.f); return success; } From 62ba7f7c70ab7e1d9b934f63efc4b8eab349f358 Mon Sep 17 00:00:00 2001 From: rmatif Date: Mon, 8 Sep 2025 15:04:01 +0000 Subject: [PATCH 09/13] fix build --- model.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/model.cpp b/model.cpp index 010848265..2bb05d192 100644 --- a/model.cpp +++ b/model.cpp @@ -2222,14 +2222,15 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread if (tensor_storage.type == dst_tensor->type) { // copy to device memory - curr_time_ms = ggml_time_ms(); - convert_time_ms += curr_time_ms - prev_time_ms; - prev_time_ms = curr_time_ms; + t1 = ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); + t0 = ggml_time_ms(); ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor)); t1 = ggml_time_ms(); copy_to_backend_time_ms.fetch_add(t1 - t0); } else { // convert first, then copy to device memory + convert_buffer.resize(ggml_nbytes(dst_tensor)); convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); t1 = ggml_time_ms(); From 1e72471d441489cefbdf4c2bb65a049598fed22e Mon Sep 17 00:00:00 2001 From: leejet Date: Sun, 14 Sep 2025 22:27:11 +0800 Subject: [PATCH 10/13] fix potential duplicate issue --- lora.hpp | 8 ++++---- model.cpp | 31 +++++++++++++++++++++---------- model.h | 2 +- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/lora.hpp b/lora.hpp index fde54eef5..f3db1fb52 100644 --- a/lora.hpp +++ b/lora.hpp @@ -1,8 +1,8 @@ #ifndef __LORA_HPP__ #define __LORA_HPP__ -#include "ggml_extend.hpp" #include +#include "ggml_extend.hpp" #define LORA_GRAPH_BASE_SIZE 10240 @@ -157,13 +157,13 @@ struct LoraModel : public GGMLRunner { model_loader.load_tensors(on_new_tensor_cb, n_threads); for (const auto& pair : tensors_to_create) { - const auto& name = pair.first; - const auto& ts = pair.second; + const auto& name = pair.first; + const auto& ts = pair.second; struct ggml_tensor* real = ggml_new_tensor(params_ctx, ts.type, ts.n_dims, ts.ne); - lora_tensors[name] = real; + lora_tensors[name] = real; } alloc_params_buffer(); diff --git a/model.cpp b/model.cpp index 2bb05d192..57757ea9c 100644 --- a/model.cpp +++ b/model.cpp @@ -1982,8 +1982,13 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread std::vector processed_tensor_storages; { - std::unordered_map processed_map; - std::mutex map_mutex; + struct IndexedStorage { 
+            size_t index;
+            TensorStorage ts;
+        };
+
+        std::mutex vec_mutex;
+        std::vector<IndexedStorage> all_results;
 
         int n_threads = std::min(num_threads_to_use, (int)tensor_storages.size());
         if (n_threads < 1) {
@@ -1993,7 +1998,7 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
 
         for (int i = 0; i < n_threads; ++i) {
             workers.emplace_back([&, thread_id = i]() {
-                std::unordered_map<std::string, TensorStorage> local_processed_map;
+                std::vector<IndexedStorage> local_results;
                 std::vector<TensorStorage> temp_storages;
 
                 for (size_t j = thread_id; j < tensor_storages.size(); j += n_threads) {
@@ -2006,13 +2011,14 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
                     preprocess_tensor(tensor_storage, temp_storages);
 
                     for (const auto& ts : temp_storages) {
-                        local_processed_map[ts.name] = ts;
+                        local_results.push_back({j, ts});
                     }
                 }
 
-                if (!local_processed_map.empty()) {
-                    std::lock_guard<std::mutex> lock(map_mutex);
-                    processed_map.merge(local_processed_map);
+                if (!local_results.empty()) {
+                    std::lock_guard<std::mutex> lock(vec_mutex);
+                    all_results.insert(all_results.end(),
+                                       local_results.begin(), local_results.end());
                 }
             });
         }
@@ -2020,9 +2026,14 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             w.join();
         }
 
-        processed_tensor_storages.reserve(processed_map.size());
-        for (auto const& [name, ts] : processed_map) {
-            processed_tensor_storages.push_back(ts);
+        std::unordered_map<std::string, IndexedStorage> latest_map;
+        for (auto& entry : all_results) {
+            latest_map[entry.ts.name] = entry;
+        }
+
+        processed_tensor_storages.reserve(latest_map.size());
+        for (auto& [name, entry] : latest_map) {
+            processed_tensor_storages.push_back(entry.ts);
         }
     }
 
diff --git a/model.h b/model.h
index cfd988969..1dbcc9bb6 100644
--- a/model.h
+++ b/model.h
@@ -250,7 +250,7 @@ class ModelLoader {
     bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
     bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
                       std::set<std::string> ignore_tensors = {},
-                      int n_threads = 0);
+                      int n_threads                        = 0);
     bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
     bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);

From ce092341edcfb8b7d74548753ee4b35d1e21daea Mon Sep 17 00:00:00 2001
From: leejet
Date: Sun, 14 Sep 2025 22:29:53 +0800
Subject: [PATCH 11/13] avoid duplicate lookup of lora tensor

---
 lora.hpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lora.hpp b/lora.hpp
index f3db1fb52..1149d3b0d 100644
--- a/lora.hpp
+++ b/lora.hpp
@@ -147,8 +147,9 @@ struct LoraModel : public GGMLRunner {
                 }
             } else {
                 const std::string& name = tensor_storage.name;
-                if (lora_tensors.count(name)) {
-                    *dst_tensor = lora_tensors.at(name);
+                auto iter = lora_tensors.find(name);
+                if (iter != lora_tensors.end()) {
+                    *dst_tensor = iter->second;
                 }
             }
             return true;

From 94ab11fabaa6ecceca7609728581001c5a69486a Mon Sep 17 00:00:00 2001
From: leejet
Date: Sun, 14 Sep 2025 22:35:42 +0800
Subject: [PATCH 12/13] fix progress bar

---
 lora.hpp  |  2 +-
 model.cpp | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/lora.hpp b/lora.hpp
index 1149d3b0d..222f61b1e 100644
--- a/lora.hpp
+++ b/lora.hpp
@@ -147,7 +147,7 @@ struct LoraModel : public GGMLRunner {
                 }
             } else {
                 const std::string& name = tensor_storage.name;
-                auto iter = lora_tensors.find(name);
+                auto iter               = lora_tensors.find(name);
                 if (iter != lora_tensors.end()) {
                     *dst_tensor = iter->second;
                 }
diff --git a/model.cpp b/model.cpp
index 57757ea9c..d5e6785d1 100644
--- a/model.cpp
+++ b/model.cpp
@@ -2264,7 +2264,8 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             if (current_idx >= file_tensors.size() || failed) {
                 break;
             }
-            pretty_progress(total_tensors_processed + current_idx, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f);
+            size_t curr_num = total_tensors_processed + current_idx;
+            pretty_progress(curr_num, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (curr_num + 1e-6f));
             std::this_thread::sleep_for(std::chrono::milliseconds(200));
         }
 
@@ -2277,11 +2278,10 @@ bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_thread
             break;
         }
         total_tensors_processed += file_tensors.size();
-    }
-
-    pretty_progress(total_tensors_processed, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f);
-    if (total_tensors_to_process > 0) {
-        printf("\n");
+        pretty_progress(total_tensors_processed, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (total_tensors_processed + 1e-6f));
+        if (total_tensors_processed < total_tensors_to_process) {
+            printf("\n");
+        }
     }
 
     int64_t end_time = ggml_time_ms();

From 4e408b0e2b052753c025f43b0beb713ac726bef4 Mon Sep 17 00:00:00 2001
From: leejet
Date: Sun, 14 Sep 2025 22:45:37 +0800
Subject: [PATCH 13/13] remove unused remove_duplicates

---
 model.cpp | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/model.cpp b/model.cpp
index d5e6785d1..8d0f46bd4 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1949,26 +1949,6 @@ std::string ModelLoader::load_umt5_tokenizer_json() {
     return json_str;
 }
 
-std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& vec) {
-    std::vector<TensorStorage> res;
-    std::unordered_map<std::string, size_t> name_to_index_map;
-
-    for (const auto& ts : vec) {
-        const std::string& current_name = ts.name;
-        auto it = name_to_index_map.find(current_name);
-
-        if (it != name_to_index_map.end()) {
-            // Found a duplicate, overwrite the existing one in res
-            res[it->second] = ts;
-        } else {
-            // Not a duplicate, add to map and push to res
-            name_to_index_map[current_name] = res.size();
-            res.push_back(ts);
-        }
-    }
-    return res;
-}
-
 bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) {
     int64_t process_time_ms = 0;
     std::atomic<int64_t> read_time_ms(0);