diff --git a/lora.hpp b/lora.hpp
index b7a27306..222f61b1 100644
--- a/lora.hpp
+++ b/lora.hpp
@@ -1,6 +1,7 @@
 #ifndef __LORA_HPP__
 #define __LORA_HPP__
 
+#include <mutex>
 #include "ggml_extend.hpp"
 
 #define LORA_GRAPH_BASE_SIZE 10240
@@ -115,7 +116,7 @@ struct LoraModel : public GGMLRunner {
         return "lora";
     }
 
-    bool load_from_file(bool filter_tensor = false) {
+    bool load_from_file(bool filter_tensor = false, int n_threads = 0) {
         LOG_INFO("loading LoRA from '%s'", file_path.c_str());
 
         if (load_failed) {
@@ -123,41 +124,53 @@ struct LoraModel : public GGMLRunner {
             return false;
         }
 
+        std::unordered_map<std::string, TensorStorage> tensors_to_create;
+        std::mutex lora_mutex;
         bool dry_run          = true;
        auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
-            const std::string& name = tensor_storage.name;
+            if (dry_run) {
+                const std::string& name = tensor_storage.name;
 
-            if (filter_tensor && !contains(name, "lora")) {
-                // LOG_INFO("skipping LoRA tesnor '%s'", name.c_str());
-                return true;
-            }
-            // LOG_INFO("lora_tensor %s", name.c_str());
-            for (int i = 0; i < LORA_TYPE_COUNT; i++) {
-                if (name.find(type_fingerprints[i]) != std::string::npos) {
-                    type = (lora_t)i;
-                    break;
+                if (filter_tensor && !contains(name, "lora")) {
+                    return true;
                 }
-            }
 
-            if (dry_run) {
-                struct ggml_tensor* real = ggml_new_tensor(params_ctx,
-                                                           tensor_storage.type,
-                                                           tensor_storage.n_dims,
-                                                           tensor_storage.ne);
-                lora_tensors[name] = real;
+                {
+                    std::lock_guard<std::mutex> lock(lora_mutex);
+                    for (int i = 0; i < LORA_TYPE_COUNT; i++) {
+                        if (name.find(type_fingerprints[i]) != std::string::npos) {
+                            type = (lora_t)i;
+                            break;
+                        }
+                    }
+                    tensors_to_create[name] = tensor_storage;
+                }
             } else {
-                auto real   = lora_tensors[name];
-                *dst_tensor = real;
+                const std::string& name = tensor_storage.name;
+                auto iter               = lora_tensors.find(name);
+                if (iter != lora_tensors.end()) {
+                    *dst_tensor = iter->second;
+                }
             }
-
             return true;
         };
 
-        model_loader.load_tensors(on_new_tensor_cb);
+        model_loader.load_tensors(on_new_tensor_cb, n_threads);
+
+        for (const auto& pair : tensors_to_create) {
+            const auto& name         = pair.first;
+            const auto& ts           = pair.second;
+            struct ggml_tensor* real = ggml_new_tensor(params_ctx,
+                                                       ts.type,
+                                                       ts.n_dims,
+                                                       ts.ne);
+            lora_tensors[name] = real;
+        }
+
         alloc_params_buffer();
-        // exit(0);
 
         dry_run = false;
-        model_loader.load_tensors(on_new_tensor_cb);
+        model_loader.load_tensors(on_new_tensor_cb, n_threads);
 
         LOG_DEBUG("lora type: \"%s\"/\"%s\"", lora_downs[type].c_str(), lora_ups[type].c_str());
 
diff --git a/model.cpp b/model.cpp
index 4e42018c..8d0f46bd 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1,8 +1,13 @@
 #include <stdarg.h>
+#include <atomic>
+#include <chrono>
 #include <fstream>
+#include <functional>
+#include <mutex>
 #include <regex>
 #include <set>
 #include <sstream>
 #include <string>
+#include <thread>
 #include <unordered_map>
 #include <vector>
 
@@ -1944,292 +1949,344 @@ std::string ModelLoader::load_umt5_tokenizer_json() {
     return json_str;
 }
 
-std::vector<TensorStorage> remove_duplicates(const std::vector<TensorStorage>& vec) {
-    std::vector<TensorStorage> res;
-    std::unordered_map<std::string, size_t> name_to_index_map;
+bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads_p) {
+    int64_t process_time_ms = 0;
+    std::atomic<int64_t> read_time_ms(0);
+    std::atomic<int64_t> memcpy_time_ms(0);
+    std::atomic<int64_t> copy_to_backend_time_ms(0);
+    std::atomic<int64_t> convert_time_ms(0);
 
-    for (size_t i = 0; i < vec.size(); ++i) {
-        const std::string& current_name = vec[i].name;
-        auto it                         = name_to_index_map.find(current_name);
+    int num_threads_to_use = n_threads_p > 0 ? n_threads_p : (int)std::thread::hardware_concurrency();
 
-        if (it != name_to_index_map.end()) {
-            res[it->second] = vec[i];
-        } else {
-            name_to_index_map[current_name] = i;
-            res.push_back(vec[i]);
+    int64_t start_time = ggml_time_ms();
+    std::vector<TensorStorage> processed_tensor_storages;
+
+    {
+        struct IndexedStorage {
+            size_t index;
+            TensorStorage ts;
+        };
+
+        std::mutex vec_mutex;
+        std::vector<IndexedStorage> all_results;
+
+        int n_threads = std::min(num_threads_to_use, (int)tensor_storages.size());
+        if (n_threads < 1) {
+            n_threads = 1;
         }
-    }
+        std::vector<std::thread> workers;
 
-    // vec.resize(name_to_index_map.size());
+        for (int i = 0; i < n_threads; ++i) {
+            workers.emplace_back([&, thread_id = i]() {
+                std::vector<IndexedStorage> local_results;
+                std::vector<TensorStorage> temp_storages;
 
-    return res;
-}
+                for (size_t j = thread_id; j < tensor_storages.size(); j += n_threads) {
+                    const auto& tensor_storage = tensor_storages[j];
+                    if (is_unused_tensor(tensor_storage.name)) {
+                        continue;
+                    }
 
-bool ModelLoader::load_tensors(on_new_tensor_cb_t on_new_tensor_cb) {
-    int64_t process_time_ms         = 0;
-    int64_t read_time_ms            = 0;
-    int64_t memcpy_time_ms          = 0;
-    int64_t copy_to_backend_time_ms = 0;
-    int64_t convert_time_ms         = 0;
-
-    int64_t prev_time_ms = 0;
-    int64_t curr_time_ms = 0;
-    int64_t start_time   = ggml_time_ms();
-    prev_time_ms         = start_time;
-    std::vector<TensorStorage> processed_tensor_storages;
-    for (auto& tensor_storage : tensor_storages) {
-        // LOG_DEBUG("%s", name.c_str());
+                    temp_storages.clear();
+                    preprocess_tensor(tensor_storage, temp_storages);
 
-        if (is_unused_tensor(tensor_storage.name)) {
-            continue;
+                    for (const auto& ts : temp_storages) {
+                        local_results.push_back({j, ts});
+                    }
+                }
+
+                if (!local_results.empty()) {
+                    std::lock_guard<std::mutex> lock(vec_mutex);
+                    all_results.insert(all_results.end(),
+                                       local_results.begin(), local_results.end());
+                }
+            });
+        }
+        for (auto& w : workers) {
+            w.join();
         }
 
-        preprocess_tensor(tensor_storage, processed_tensor_storages);
+        std::unordered_map<std::string, IndexedStorage> latest_map;
+        for (auto& entry : all_results) {
+            latest_map[entry.ts.name] = entry;
+        }
+
+        processed_tensor_storages.reserve(latest_map.size());
+        for (auto& [name, entry] : latest_map) {
+            processed_tensor_storages.push_back(entry.ts);
+        }
     }
-    std::vector<TensorStorage> dedup = remove_duplicates(processed_tensor_storages);
-    processed_tensor_storages        = dedup;
 
-    curr_time_ms    = ggml_time_ms();
-    process_time_ms = curr_time_ms - prev_time_ms;
-    prev_time_ms    = curr_time_ms;
-
-    bool success = true;
+    process_time_ms = ggml_time_ms() - start_time;
+
+    bool success                          = true;
+    size_t total_tensors_processed        = 0;
+    const size_t total_tensors_to_process = processed_tensor_storages.size();
+    const int64_t t_start                 = ggml_time_ms();
+    int last_n_threads                    = 1;
+
     for (size_t file_index = 0; file_index < file_paths_.size(); file_index++) {
         std::string file_path = file_paths_[file_index];
         LOG_DEBUG("loading tensors from %s", file_path.c_str());
 
-        std::ifstream file(file_path, std::ios::binary);
-        if (!file.is_open()) {
-            LOG_ERROR("failed to open '%s'", file_path.c_str());
-            return false;
+        std::vector<const TensorStorage*> file_tensors;
+        for (const auto& ts : processed_tensor_storages) {
+            if (ts.file_index == file_index) {
+                file_tensors.push_back(&ts);
+            }
+        }
+        if (file_tensors.empty()) {
+            continue;
         }
 
         bool is_zip = false;
-        for (auto& tensor_storage : tensor_storages) {
-            if (tensor_storage.file_index != file_index) {
-                continue;
-            }
-            if (tensor_storage.index_in_zip >= 0) {
+        for (auto const& ts : file_tensors) {
+            if (ts->index_in_zip >= 0) {
                 is_zip = true;
                 break;
             }
         }
 
-        struct zip_t* zip = NULL;
-        if (is_zip) {
-            zip = zip_open(file_path.c_str(), 0, 'r');
-            if (zip == NULL) {
-                LOG_ERROR("failed to open zip '%s'", file_path.c_str());
-                return false;
-            }
+        int n_threads = is_zip ? 1 : std::min(num_threads_to_use, (int)file_tensors.size());
+        if (n_threads < 1) {
+            n_threads = 1;
         }
+        last_n_threads = n_threads;
 
-        std::vector<char> read_buffer;
-        std::vector<char> convert_buffer;
-
-        auto read_data = [&](const TensorStorage& tensor_storage, char* buf, size_t n) {
-            if (zip != NULL) {
-                zip_entry_openbyindex(zip, tensor_storage.index_in_zip);
-                size_t entry_size = zip_entry_size(zip);
-                if (entry_size != n) {
-                    read_buffer.resize(entry_size);
-                    prev_time_ms = ggml_time_ms();
-                    zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size);
-                    curr_time_ms = ggml_time_ms();
-                    read_time_ms += curr_time_ms - prev_time_ms;
-                    prev_time_ms = curr_time_ms;
-                    memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n);
-                    curr_time_ms = ggml_time_ms();
-                    memcpy_time_ms += curr_time_ms - prev_time_ms;
+        std::atomic<size_t> tensor_idx(0);
+        std::atomic<bool> failed(false);
+        std::vector<std::thread> workers;
+
+        for (int i = 0; i < n_threads; ++i) {
+            workers.emplace_back([&, file_path, is_zip]() {
+                std::ifstream file;
+                struct zip_t* zip = NULL;
+                if (is_zip) {
+                    zip = zip_open(file_path.c_str(), 0, 'r');
+                    if (zip == NULL) {
+                        LOG_ERROR("failed to open zip '%s'", file_path.c_str());
+                        failed = true;
+                        return;
+                    }
                 } else {
-                    prev_time_ms = ggml_time_ms();
-                    zip_entry_noallocread(zip, (void*)buf, n);
-                    curr_time_ms = ggml_time_ms();
-                    read_time_ms += curr_time_ms - prev_time_ms;
-                }
-                zip_entry_close(zip);
-            } else {
-                prev_time_ms = ggml_time_ms();
-                file.seekg(tensor_storage.offset);
-                file.read(buf, n);
-                curr_time_ms = ggml_time_ms();
-                read_time_ms += curr_time_ms - prev_time_ms;
-                if (!file) {
-                    LOG_ERROR("read tensor data failed: '%s'", file_path.c_str());
-                    return false;
+                    file.open(file_path, std::ios::binary);
+                    if (!file.is_open()) {
+                        LOG_ERROR("failed to open '%s'", file_path.c_str());
+                        failed = true;
+                        return;
+                    }
                 }
-            }
-            return true;
-        };
 
-        int tensor_count = 0;
-        int64_t t0       = ggml_time_ms();
-        int64_t t1       = t0;
-        bool partial     = true;
-        int tensor_max   = (int)processed_tensor_storages.size();
-        pretty_progress(0, tensor_max, 0.0f);
-        for (auto& tensor_storage : processed_tensor_storages) {
-            if (tensor_storage.file_index != file_index) {
-                ++tensor_count;
-                continue;
-            }
-            ggml_tensor* dst_tensor = NULL;
-            success                 = on_new_tensor_cb(tensor_storage, &dst_tensor);
-            if (!success) {
-                LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str());
-                break;
-            }
+                std::vector<char> read_buffer;
+                std::vector<char> convert_buffer;
 
-            if (dst_tensor == NULL) {
-                ++tensor_count;
-                continue;
-            }
+                while (true) {
+                    int64_t t0, t1;
+                    size_t idx = tensor_idx.fetch_add(1);
+                    if (idx >= file_tensors.size() || failed) {
+                        break;
+                    }
 
-            size_t nbytes_to_read = tensor_storage.nbytes_to_read();
+                    const TensorStorage& tensor_storage = *file_tensors[idx];
+                    ggml_tensor* dst_tensor             = NULL;
 
-            if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) {
-                // for the CPU and Metal backend, we can copy directly into the tensor
-                if (tensor_storage.type == dst_tensor->type) {
-                    GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes());
-                    if (tensor_storage.is_f64 || tensor_storage.is_i64) {
-                        read_buffer.resize(tensor_storage.nbytes_to_read());
-                        read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read);
-                    } else {
-                        read_data(tensor_storage, (char*)dst_tensor->data, nbytes_to_read);
-                    }
+                    t0 = ggml_time_ms();
 
-
prev_time_ms = ggml_time_ms(); - if (tensor_storage.is_bf16) { - // inplace op - bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e4m3) { - // inplace op - f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e5m2) { - // inplace op - f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); - } else if (tensor_storage.is_f64) { - f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements()); - } else if (tensor_storage.is_i64) { - i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements()); + if (!on_new_tensor_cb(tensor_storage, &dst_tensor)) { + LOG_WARN("process tensor failed: '%s'", tensor_storage.name.c_str()); + failed = true; + break; } - curr_time_ms = ggml_time_ms(); - convert_time_ms += curr_time_ms - prev_time_ms; - } else { - read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); - read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); - - prev_time_ms = ggml_time_ms(); - if (tensor_storage.is_bf16) { - // inplace op - bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e4m3) { - // inplace op - f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e5m2) { - // inplace op - f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f64) { - // inplace op - f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_i64) { - // inplace op - i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); + + if (dst_tensor == NULL) { + t1 = ggml_time_ms(); + read_time_ms.fetch_add(t1 - t0); + continue; } - convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, - dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); - curr_time_ms = ggml_time_ms(); - convert_time_ms += curr_time_ms - prev_time_ms; - } - } else { - read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); - read_data(tensor_storage, (char*)read_buffer.data(), nbytes_to_read); - - prev_time_ms = ggml_time_ms(); - if (tensor_storage.is_bf16) { - // inplace op - bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e4m3) { - // inplace op - f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f8_e5m2) { - // inplace op - f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_f64) { - // inplace op - f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); - } else if (tensor_storage.is_i64) { - // inplace op - i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); - } + size_t nbytes_to_read = tensor_storage.nbytes_to_read(); + + auto read_data = [&](char* buf, size_t n) 
{ + if (zip != NULL) { + zip_entry_openbyindex(zip, tensor_storage.index_in_zip); + size_t entry_size = zip_entry_size(zip); + if (entry_size != n) { + int64_t t_memcpy_start; + read_buffer.resize(entry_size); + zip_entry_noallocread(zip, (void*)read_buffer.data(), entry_size); + t_memcpy_start = ggml_time_ms(); + memcpy((void*)buf, (void*)(read_buffer.data() + tensor_storage.offset), n); + memcpy_time_ms.fetch_add(ggml_time_ms() - t_memcpy_start); + } else { + zip_entry_noallocread(zip, (void*)buf, n); + } + zip_entry_close(zip); + } else { + file.seekg(tensor_storage.offset); + file.read(buf, n); + if (!file) { + LOG_ERROR("read tensor data failed: '%s'", file_path.c_str()); + failed = true; + } + } + }; + + if (dst_tensor->buffer == NULL || ggml_backend_buffer_is_host(dst_tensor->buffer)) { + if (tensor_storage.type == dst_tensor->type) { + GGML_ASSERT(ggml_nbytes(dst_tensor) == tensor_storage.nbytes()); + if (tensor_storage.is_f64 || tensor_storage.is_i64) { + read_buffer.resize(tensor_storage.nbytes_to_read()); + read_data((char*)read_buffer.data(), nbytes_to_read); + } else { + read_data((char*)dst_tensor->data, nbytes_to_read); + } + t1 = ggml_time_ms(); + read_time_ms.fetch_add(t1 - t0); + + t0 = ggml_time_ms(); + if (tensor_storage.is_bf16) { + // inplace op + bf16_to_f32_vec((uint16_t*)dst_tensor->data, (float*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e4m3) { + // inplace op + f8_e4m3_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e5m2) { + // inplace op + f8_e5m2_to_f16_vec((uint8_t*)dst_tensor->data, (uint16_t*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_f64) { + f64_to_f32_vec((double*)read_buffer.data(), (float*)dst_tensor->data, tensor_storage.nelements()); + } else if (tensor_storage.is_i64) { + i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)dst_tensor->data, tensor_storage.nelements()); + } + t1 = ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); + } else { + read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); + read_data((char*)read_buffer.data(), nbytes_to_read); + t1 = ggml_time_ms(); + read_time_ms.fetch_add(t1 - t0); + + t0 = ggml_time_ms(); + if (tensor_storage.is_bf16) { + // inplace op + bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e4m3) { + // inplace op + f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e5m2) { + // inplace op + f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f64) { + // inplace op + f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_i64) { + // inplace op + i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); + } + convert_tensor((void*)read_buffer.data(), tensor_storage.type, dst_tensor->data, dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); + t1 = ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); + } + } else { + read_buffer.resize(std::max(tensor_storage.nbytes(), tensor_storage.nbytes_to_read())); + read_data((char*)read_buffer.data(), nbytes_to_read); + t1 = ggml_time_ms(); 
+ read_time_ms.fetch_add(t1 - t0); + + t0 = ggml_time_ms(); + if (tensor_storage.is_bf16) { + // inplace op + bf16_to_f32_vec((uint16_t*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e4m3) { + // inplace op + f8_e4m3_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f8_e5m2) { + // inplace op + f8_e5m2_to_f16_vec((uint8_t*)read_buffer.data(), (uint16_t*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_f64) { + // inplace op + f64_to_f32_vec((double*)read_buffer.data(), (float*)read_buffer.data(), tensor_storage.nelements()); + } else if (tensor_storage.is_i64) { + // inplace op + i64_to_i32_vec((int64_t*)read_buffer.data(), (int32_t*)read_buffer.data(), tensor_storage.nelements()); + } - if (tensor_storage.type == dst_tensor->type) { - // copy to device memory - curr_time_ms = ggml_time_ms(); - convert_time_ms += curr_time_ms - prev_time_ms; - prev_time_ms = curr_time_ms; - ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor)); - curr_time_ms = ggml_time_ms(); - copy_to_backend_time_ms += curr_time_ms - prev_time_ms; - } else { - // convert first, then copy to device memory - convert_buffer.resize(ggml_nbytes(dst_tensor)); - convert_tensor((void*)read_buffer.data(), tensor_storage.type, - (void*)convert_buffer.data(), dst_tensor->type, - (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); - curr_time_ms = ggml_time_ms(); - convert_time_ms += curr_time_ms - prev_time_ms; - prev_time_ms = curr_time_ms; - ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); - curr_time_ms = ggml_time_ms(); - copy_to_backend_time_ms += curr_time_ms - prev_time_ms; + if (tensor_storage.type == dst_tensor->type) { + // copy to device memory + t1 = ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); + t0 = ggml_time_ms(); + ggml_backend_tensor_set(dst_tensor, read_buffer.data(), 0, ggml_nbytes(dst_tensor)); + t1 = ggml_time_ms(); + copy_to_backend_time_ms.fetch_add(t1 - t0); + } else { + // convert first, then copy to device memory + + convert_buffer.resize(ggml_nbytes(dst_tensor)); + convert_tensor((void*)read_buffer.data(), tensor_storage.type, (void*)convert_buffer.data(), dst_tensor->type, (int)tensor_storage.nelements() / (int)tensor_storage.ne[0], (int)tensor_storage.ne[0]); + t1 = ggml_time_ms(); + convert_time_ms.fetch_add(t1 - t0); + t0 = ggml_time_ms(); + ggml_backend_tensor_set(dst_tensor, convert_buffer.data(), 0, ggml_nbytes(dst_tensor)); + t1 = ggml_time_ms(); + copy_to_backend_time_ms.fetch_add(t1 - t0); + } + } } - } - ++tensor_count; - int64_t t2 = ggml_time_ms(); - if ((t2 - t1) >= 200) { - t1 = t2; - pretty_progress(tensor_count, tensor_max, (t1 - t0) / (1000.0f * tensor_count)); - partial = tensor_count != tensor_max; - } + if (zip != NULL) { + zip_close(zip); + } + }); } - if (partial) { - if (tensor_count >= 1) { - t1 = ggml_time_ms(); - pretty_progress(tensor_count, tensor_max, (t1 - t0) / (1000.0f * tensor_count)); - } - if (tensor_count < tensor_max) { - printf("\n"); + while (true) { + size_t current_idx = tensor_idx.load(); + if (current_idx >= file_tensors.size() || failed) { + break; } + size_t curr_num = total_tensors_processed + current_idx; + pretty_progress(curr_num, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (curr_num + 1e-6f)); + 
std::this_thread::sleep_for(std::chrono::milliseconds(200));
         }
 
-        if (zip != NULL) {
-            zip_close(zip);
+        for (auto& w : workers) {
+            w.join();
         }
 
-        if (!success) {
+        if (failed) {
+            success = false;
             break;
         }
+        total_tensors_processed += file_tensors.size();
+        pretty_progress(total_tensors_processed, total_tensors_to_process, (ggml_time_ms() - t_start) / 1000.0f / (total_tensors_processed + 1e-6f));
+        if (total_tensors_processed < total_tensors_to_process) {
+            printf("\n");
+        }
     }
+
     int64_t end_time = ggml_time_ms();
     LOG_INFO("loading tensors completed, taking %.2fs (process: %.2fs, read: %.2fs, memcpy: %.2fs, convert: %.2fs, copy_to_backend: %.2fs)",
              (end_time - start_time) / 1000.f,
              process_time_ms / 1000.f,
-             read_time_ms / 1000.f,
-             memcpy_time_ms / 1000.f,
-             convert_time_ms / 1000.f,
-             copy_to_backend_time_ms / 1000.f);
+             (read_time_ms.load() / (float)last_n_threads) / 1000.f,
+             (memcpy_time_ms.load() / (float)last_n_threads) / 1000.f,
+             (convert_time_ms.load() / (float)last_n_threads) / 1000.f,
+             (copy_to_backend_time_ms.load() / (float)last_n_threads) / 1000.f);
     return success;
 }
 
 bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
-                               std::set<std::string> ignore_tensors) {
+                               std::set<std::string> ignore_tensors,
+                               int n_threads) {
     std::set<std::string> tensor_names_in_file;
+    std::mutex tensor_names_mutex;
     auto on_new_tensor_cb = [&](const TensorStorage& tensor_storage, ggml_tensor** dst_tensor) -> bool {
         const std::string& name = tensor_storage.name;
         // LOG_DEBUG("%s", tensor_storage.to_string().c_str());
-        tensor_names_in_file.insert(name);
+        {
+            std::lock_guard<std::mutex> lock(tensor_names_mutex);
+            tensor_names_in_file.insert(name);
+        }
 
         struct ggml_tensor* real;
         if (tensors.find(name) != tensors.end()) {
@@ -2263,7 +2320,7 @@ bool ModelLoader::load_tensors(std::map<std::string, struct ggml_tensor*>& tenso
         return true;
     };
 
-    bool success = load_tensors(on_new_tensor_cb);
+    bool success = load_tensors(on_new_tensor_cb, n_threads);
     if (!success) {
         LOG_ERROR("load tensors from file failed");
         return false;
diff --git a/model.h b/model.h
index fef6ace8..1dbcc9bb 100644
--- a/model.h
+++ b/model.h
@@ -247,9 +247,10 @@ class ModelLoader {
     ggml_type get_diffusion_model_wtype();
     ggml_type get_vae_wtype();
     void set_wtype_override(ggml_type wtype, std::string prefix = "");
-    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb);
+    bool load_tensors(on_new_tensor_cb_t on_new_tensor_cb, int n_threads = 0);
     bool load_tensors(std::map<std::string, struct ggml_tensor*>& tensors,
-                      std::set<std::string> ignore_tensors = {});
+                      std::set<std::string> ignore_tensors = {},
+                      int n_threads = 0);
     bool save_to_gguf_file(const std::string& file_path, ggml_type type, const std::string& tensor_type_rules);
     bool tensor_should_be_converted(const TensorStorage& tensor_storage, ggml_type type);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 17804c11..57d6959a 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -573,7 +573,7 @@ class StableDiffusionGGML {
         if (version == VERSION_SVD) {
             ignore_tensors.insert("conditioner.embedders.3");
         }
-        bool success = model_loader.load_tensors(tensors, ignore_tensors);
+        bool success = model_loader.load_tensors(tensors, ignore_tensors, n_threads);
         if (!success) {
             LOG_ERROR("load tensors from model loader failed");
             ggml_free(ctx);
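
Note (illustrative, not part of the patch): the new n_threads parameter simply flows from the caller down to ModelLoader::load_tensors, with 0 meaning "use std::thread::hardware_concurrency()". A minimal caller-side sketch under that assumption — the helper function name and file path below are hypothetical:

// Hypothetical usage sketch of the threaded loading path added by this patch.
// Assumes "model.h" declares ModelLoader as shown above; nothing here is part
// of the patch itself.
#include <map>
#include <set>
#include <string>

#include "model.h"

bool load_checkpoint_tensors(ModelLoader& model_loader,
                             std::map<std::string, struct ggml_tensor*>& tensors,
                             int n_threads /* 0 = auto-detect */) {
    // Tensors whose name prefix appears here are skipped, mirroring the
    // SVD case ("conditioner.embedders.3") in stable-diffusion.cpp.
    std::set<std::string> ignore_tensors;
    return model_loader.load_tensors(tensors, ignore_tensors, n_threads);
}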