From 267c1399f15a278ec8c3cdcf9c90dc94151fbc38 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 1 Apr 2025 23:44:05 +0200 Subject: [PATCH 01/11] common : refactor downloading system, handle mmproj with -hf option (#12694) * (wip) refactor downloading system [no ci] * fix all examples * fix mmproj with -hf * gemma3: update readme * only handle mmproj in llava example * fix multi-shard download * windows: fix problem with std::min and std::max * fix 2 --- common/arg.cpp | 675 ++++++++++++++++-- common/common.cpp | 495 +------------ common/common.h | 44 +- examples/batched-bench/batched-bench.cpp | 2 +- examples/batched/batched.cpp | 2 +- examples/export-lora/export-lora.cpp | 2 +- examples/gritlm/gritlm.cpp | 2 +- examples/llava/README-gemma3.md | 20 + examples/llava/gemma3-cli.cpp | 6 +- examples/llava/llava-cli.cpp | 6 +- examples/llava/minicpmv-cli.cpp | 6 +- examples/llava/qwen2vl-cli.cpp | 6 +- examples/parallel/parallel.cpp | 2 +- examples/passkey/passkey.cpp | 2 +- examples/server/server.cpp | 19 +- .../speculative-simple/speculative-simple.cpp | 2 +- examples/speculative/speculative.cpp | 2 +- examples/tts/tts.cpp | 7 +- tests/test-arg-parser.cpp | 8 +- 19 files changed, 673 insertions(+), 635 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 8292adaac655d..47c26955ea374 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1,9 +1,19 @@ +#include "gguf.h" // for reading GGUF splits #include "arg.h" #include "log.h" #include "sampling.h" #include "chat.h" +// fix problem with std::min and std::max +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + #include #include #include @@ -14,6 +24,14 @@ #include #include +//#define LLAMA_USE_CURL + +#if defined(LLAMA_USE_CURL) +#include +#include +#include +#endif + #include "json-schema-to-grammar.h" using json = nlohmann::ordered_json; @@ -126,46 +144,548 @@ std::string common_arg::to_string() { } // -// utils +// downloader +// + +struct common_hf_file_res { + std::string repo; // repo name with ":tag" removed + std::string ggufFile; + std::string mmprojFile; +}; + +#ifdef LLAMA_USE_CURL + +#ifdef __linux__ +#include +#elif defined(_WIN32) +# if !defined(PATH_MAX) +# define PATH_MAX MAX_PATH +# endif +#else +#include +#endif +#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 + +// +// CURL utils // -static void common_params_handle_model_default( - std::string & model, - const std::string & model_url, - std::string & hf_repo, - std::string & hf_file, - const std::string & hf_token, - const std::string & model_default) { - if (!hf_repo.empty()) { - // short-hand to avoid specifying --hf-file -> default it to --model - if (hf_file.empty()) { - if (model.empty()) { - auto auto_detected = common_get_hf_file(hf_repo, hf_token); - if (auto_detected.first.empty() || auto_detected.second.empty()) { - exit(1); // built without CURL, error message already printed +using curl_ptr = std::unique_ptr; + +// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one +struct curl_slist_ptr { + struct curl_slist * ptr = nullptr; + ~curl_slist_ptr() { + if (ptr) { + curl_slist_free_all(ptr); + } + } +}; + +#define CURL_MAX_RETRY 3 +#define CURL_RETRY_DELAY_SECONDS 2 + +static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) { + int remaining_attempts = max_attempts; + + while (remaining_attempts > 0) { + LOG_INF("%s: Trying to download from %s (attempt %d 
of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); + + CURLcode res = curl_easy_perform(curl); + if (res == CURLE_OK) { + return true; + } + + int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000; + LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); + + remaining_attempts--; + std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); + } + + LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); + + return false; +} + +// download one single file from remote URL to local path +static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) { + // Initialize libcurl + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; + if (!curl) { + LOG_ERR("%s: error initializing libcurl\n", __func__); + return false; + } + + bool force_download = false; + + // Set the URL, allow to follow http redirection + curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); + + // Check if hf-token or bearer-token was specified + if (!bearer_token.empty()) { + std::string auth_header = "Authorization: Bearer " + bearer_token; + http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); + } + +#if defined(_WIN32) + // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of + // operating system. Currently implemented under MS-Windows. + curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + + // Check if the file already exists locally + auto file_exists = std::filesystem::exists(path); + + // If the file exists, check its JSON metadata companion file. + std::string metadata_path = path + ".json"; + nlohmann::json metadata; + std::string etag; + std::string last_modified; + + if (file_exists) { + // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block). 
+ std::ifstream metadata_in(metadata_path); + if (metadata_in.good()) { + try { + metadata_in >> metadata; + LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); + if (metadata.contains("url") && metadata.at("url").is_string()) { + auto previous_url = metadata.at("url").get(); + if (previous_url != url) { + LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); + return false; + } } - hf_repo = auto_detected.first; - hf_file = auto_detected.second; - } else { - hf_file = model; + if (metadata.contains("etag") && metadata.at("etag").is_string()) { + etag = metadata.at("etag"); + } + if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) { + last_modified = metadata.at("lastModified"); + } + } catch (const nlohmann::json::exception & e) { + LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); + return false; } } - // make sure model path is present (for caching purposes) - if (model.empty()) { - // this is to avoid different repo having same file name, or same file name in different subdirs - std::string filename = hf_repo + "_" + hf_file; - // to make sure we don't have any slashes in the filename - string_replace_all(filename, "/", "_"); - model = fs_get_cache_file(filename); + } else { + LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); + } + + // Send a HEAD request to retrieve the etag and last-modified headers + struct common_load_model_from_url_headers { + std::string etag; + std::string last_modified; + }; + + common_load_model_from_url_headers headers; + + { + typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); + auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { + common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; + + static std::regex header_regex("([^:]+): (.*)\r\n"); + static std::regex etag_regex("ETag", std::regex_constants::icase); + static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase); + + std::string header(buffer, n_items); + std::smatch match; + if (std::regex_match(header, match, header_regex)) { + const std::string & key = match[1]; + const std::string & value = match[2]; + if (std::regex_match(key, match, etag_regex)) { + headers->etag = value; + } else if (std::regex_match(key, match, last_modified_regex)) { + headers->last_modified = value; + } + } + return n_items; + }; + + curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress + curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); + curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); + + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); + if (!was_perform_successful) { + return false; + } + + long http_code = 0; + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); + if (http_code != 200) { + // HEAD not supported, we don't know if the file has changed + // force trigger downloading + force_download = true; + LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); } - } else if (!model_url.empty()) { - if (model.empty()) { - auto f = string_split(model_url, '#').front(); - f = string_split(f, '?').front(); - model = 
fs_get_cache_file(string_split(f, '/').back()); + } + + bool should_download = !file_exists || force_download; + if (!should_download) { + if (!etag.empty() && etag != headers.etag) { + LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); + should_download = true; + } else if (!last_modified.empty() && last_modified != headers.last_modified) { + LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); + should_download = true; + } + } + if (should_download) { + std::string path_temporary = path + ".downloadInProgress"; + if (file_exists) { + LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); + if (remove(path.c_str()) != 0) { + LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); + return false; + } + } + + // Set the output file + + struct FILE_deleter { + void operator()(FILE * f) const { + fclose(f); + } + }; + + std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb")); + if (!outfile) { + LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str()); + return false; + } + + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); + auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { + return fwrite(data, size, nmemb, (FILE *)fd); + }; + curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L); + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get()); + + // display download progress + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); + + // helper function to hide password in URL + auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { + std::size_t protocol_pos = url.find("://"); + if (protocol_pos == std::string::npos) { + return url; // Malformed URL + } + + std::size_t at_pos = url.find('@', protocol_pos + 3); + if (at_pos == std::string::npos) { + return url; // No password in URL + } + + return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos); + }; + + // start the download + LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); + bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); + if (!was_perform_successful) { + return false; + } + + long http_code = 0; + curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code); + if (http_code < 200 || http_code >= 400) { + LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code); + return false; + } + + // Causes file to be closed explicitly here before we rename it. + outfile.reset(); + + // Write the updated JSON metadata file. 
+ metadata.update({ + {"url", url}, + {"etag", headers.etag}, + {"lastModified", headers.last_modified} + }); + std::ofstream(metadata_path) << metadata.dump(4); + LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); + + if (rename(path_temporary.c_str(), path.c_str()) != 0) { + LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); + return false; + } + } + + return true; +} + +// download multiple files from remote URLs to local paths +// the input is a vector of pairs +static bool common_download_file_multiple(const std::vector> & urls, const std::string & bearer_token) { + // Prepare download in parallel + std::vector> futures_download; + for (auto const & item : urls) { + futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair & it) -> bool { + return common_download_file_single(it.first, it.second, bearer_token); + }, item)); + } + + // Wait for all downloads to complete + for (auto & f : futures_download) { + if (!f.get()) { + return false; + } + } + + return true; +} + +static bool common_download_model( + const common_params_model & model, + const std::string & bearer_token) { + // Basic validation of the model.url + if (model.url.empty()) { + LOG_ERR("%s: invalid model url\n", __func__); + return false; + } + + if (!common_download_file_single(model.url, model.path, bearer_token)) { + return false; + } + + // check for additional GGUFs split to download + int n_split = 0; + { + struct gguf_init_params gguf_params = { + /*.no_alloc = */ true, + /*.ctx = */ NULL, + }; + auto * ctx_gguf = gguf_init_from_file(model.path.c_str(), gguf_params); + if (!ctx_gguf) { + LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, model.path.c_str()); + return false; + } + + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); + if (key_n_split >= 0) { + n_split = gguf_get_val_u16(ctx_gguf, key_n_split); + } + + gguf_free(ctx_gguf); + } + + if (n_split > 1) { + char split_prefix[PATH_MAX] = {0}; + char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + + // Verify the first split file format + // and extract split URL and PATH prefixes + { + if (!llama_split_prefix(split_prefix, sizeof(split_prefix), model.path.c_str(), 0, n_split)) { + LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, model.path.c_str(), n_split); + return false; + } + + if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model.url.c_str(), 0, n_split)) { + LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model.url.c_str(), n_split); + return false; + } + } + + std::vector> urls; + for (int idx = 1; idx < n_split; idx++) { + char split_path[PATH_MAX] = {0}; + llama_split_path(split_path, sizeof(split_path), split_prefix, idx, n_split); + + char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; + llama_split_path(split_url, sizeof(split_url), split_url_prefix, idx, n_split); + + if (std::string(split_path) == model.path) { + continue; // skip the already downloaded file + } + + urls.push_back({split_url, split_path}); + } + + // Download in parallel + common_download_file_multiple(urls, bearer_token); + } + + return true; +} + +/** + * Allow getting the HF file from the HF repo with tag (like ollama), for example: + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 + * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s + * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return 
the first GGUF file in repo) + * + * Return pair of (with "repo" already having tag removed) + * + * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. + */ +static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) { + auto parts = string_split(hf_repo_with_tag, ':'); + std::string tag = parts.size() > 1 ? parts.back() : "latest"; + std::string hf_repo = parts[0]; + if (string_split(hf_repo, '/').size() != 2) { + throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); + } + + // fetch model info from Hugging Face Hub API + curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); + curl_slist_ptr http_headers; + std::string res_str; + std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag; + curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); + auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { + static_cast(data)->append((char * ) ptr, size * nmemb); + return size * nmemb; + }; + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str); +#if defined(_WIN32) + curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif + if (!bearer_token.empty()) { + std::string auth_header = "Authorization: Bearer " + bearer_token; + http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + } + // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response + http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); + http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json"); + curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); + + CURLcode res = curl_easy_perform(curl.get()); + + if (res != CURLE_OK) { + throw std::runtime_error("error: cannot make GET request to HF API"); + } + + long res_code; + std::string ggufFile = ""; + std::string mmprojFile = ""; + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); + if (res_code == 200) { + // extract ggufFile.rfilename in json, using regex + { + std::regex pattern("\"ggufFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\""); + std::smatch match; + if (std::regex_search(res_str, match, pattern)) { + ggufFile = match[1].str(); + } + } + // extract mmprojFile.rfilename in json, using regex + { + std::regex pattern("\"mmprojFile\"[\\s\\S]*?\"rfilename\"\\s*:\\s*\"([^\"]+)\""); + std::smatch match; + if (std::regex_search(res_str, match, pattern)) { + mmprojFile = match[1].str(); + } + } + } else if (res_code == 401) { + throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); + } else { + throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); + } + + // check response + if (ggufFile.empty()) { + throw std::runtime_error("error: model does not have ggufFile"); + } + + return { hf_repo, ggufFile, mmprojFile }; +} + +#else + +static bool common_download_file_single(const std::string &, const std::string &, const std::string 
&) { + LOG_ERR("error: built without CURL, cannot download model from internet\n"); + return false; +} + +static bool common_download_file_multiple(const std::vector> &, const std::string &) { + LOG_ERR("error: built without CURL, cannot download model from the internet\n"); + return false; +} + +static bool common_download_model( + const common_params_model &, + const std::string &) { + LOG_ERR("error: built without CURL, cannot download model from the internet\n"); + return false; +} + +static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) { + LOG_ERR("error: built without CURL, cannot download model from the internet\n"); + return {}; +} + +#endif // LLAMA_USE_CURL + +// +// utils +// + +static void common_params_handle_model( + struct common_params_model & model, + const std::string & bearer_token, + const std::string & model_path_default, + bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files? + // handle pre-fill default model path and url based on hf_repo and hf_file + { + if (!model.hf_repo.empty()) { + // short-hand to avoid specifying --hf-file -> default it to --model + if (model.hf_file.empty()) { + if (model.path.empty()) { + auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token); + if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) { + exit(1); // built without CURL, error message already printed + } + model.hf_repo = auto_detected.repo; + model.hf_file = is_mmproj ? auto_detected.mmprojFile : auto_detected.ggufFile; + } else { + model.hf_file = model.path; + } + } + + // TODO: allow custom host + model.url = "https://huggingface.co/" + model.hf_repo + "/resolve/main/" + model.hf_file; + + // make sure model path is present (for caching purposes) + if (model.path.empty()) { + // this is to avoid different repo having same file name, or same file name in different subdirs + std::string filename = model.hf_repo + "_" + model.hf_file; + // to make sure we don't have any slashes in the filename + string_replace_all(filename, "/", "_"); + model.path = fs_get_cache_file(filename); + } + + } else if (!model.url.empty()) { + if (model.path.empty()) { + auto f = string_split(model.url, '#').front(); + f = string_split(f, '?').front(); + model.path = fs_get_cache_file(string_split(f, '/').back()); + } + + } else if (model.path.empty()) { + model.path = model_path_default; + } + } + + // then, download it if needed + if (!model.url.empty()) { + bool ok = common_download_model(model, bearer_token); + if (!ok) { + LOG_ERR("error: failed to download model from %s\n", model.url.c_str()); + exit(1); } - } else if (model.empty()) { - model = model_default; } } @@ -300,10 +820,16 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - // TODO: refactor model params in a common struct - common_params_handle_model_default(params.model, params.model_url, params.hf_repo, params.hf_file, params.hf_token, DEFAULT_MODEL_PATH); - common_params_handle_model_default(params.speculative.model, params.speculative.model_url, params.speculative.hf_repo, params.speculative.hf_file, params.hf_token, ""); - common_params_handle_model_default(params.vocoder.model, params.vocoder.model_url, params.vocoder.hf_repo, params.vocoder.hf_file, params.hf_token, ""); + common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH); + 
common_params_handle_model(params.speculative.model, params.hf_token, ""); + common_params_handle_model(params.vocoder.model, params.hf_token, ""); + + // allow --mmproj to be set from -hf + // assuming that mmproj is always in the same repo as text model + if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) { + params.mmproj.hf_repo = params.model.hf_repo; + } + common_params_handle_model(params.mmproj, params.hf_token, "", true); if (params.escape) { string_process_escapes(params.prompt); @@ -1561,7 +2087,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--mmproj"}, "FILE", "path to a multimodal projector file for LLaVA. see examples/llava/README.md", [](common_params & params, const std::string & value) { - params.mmproj = value; + params.mmproj.path = value; + } + ).set_examples({LLAMA_EXAMPLE_LLAVA})); + add_opt(common_arg( + {"--mmproj-url"}, "URL", + "URL to a multimodal projector file for LLaVA. see examples/llava/README.md", + [](common_params & params, const std::string & value) { + params.mmproj.url = value; } ).set_examples({LLAMA_EXAMPLE_LLAVA})); add_opt(common_arg( @@ -1790,14 +2323,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "or `--model-url` if set, otherwise %s)", DEFAULT_MODEL_PATH ), [](common_params & params, const std::string & value) { - params.model = value; + params.model.path = value; } ).set_examples({LLAMA_EXAMPLE_COMMON, LLAMA_EXAMPLE_EXPORT_LORA}).set_env("LLAMA_ARG_MODEL")); add_opt(common_arg( {"-mu", "--model-url"}, "MODEL_URL", "model download url (default: unused)", [](common_params & params, const std::string & value) { - params.model_url = value; + params.model.url = value; } ).set_env("LLAMA_ARG_MODEL_URL")); add_opt(common_arg( @@ -1806,35 +2339,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "example: unsloth/phi-4-GGUF:q4_k_m\n" "(default: unused)", [](common_params & params, const std::string & value) { - params.hf_repo = value; + params.model.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO")); add_opt(common_arg( {"-hfd", "-hfrd", "--hf-repo-draft"}, "/[:quant]", "Same as --hf-repo, but for the draft model (default: unused)", [](common_params & params, const std::string & value) { - params.speculative.hf_repo = value; + params.speculative.model.hf_repo = value; } ).set_env("LLAMA_ARG_HFD_REPO")); add_opt(common_arg( {"-hff", "--hf-file"}, "FILE", "Hugging Face model file. 
If specified, it will override the quant in --hf-repo (default: unused)", [](common_params & params, const std::string & value) { - params.hf_file = value; + params.model.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE")); add_opt(common_arg( {"-hfv", "-hfrv", "--hf-repo-v"}, "/[:quant]", "Hugging Face model repository for the vocoder model (default: unused)", [](common_params & params, const std::string & value) { - params.vocoder.hf_repo = value; + params.vocoder.model.hf_repo = value; } ).set_env("LLAMA_ARG_HF_REPO_V")); add_opt(common_arg( {"-hffv", "--hf-file-v"}, "FILE", "Hugging Face model file for the vocoder model (default: unused)", [](common_params & params, const std::string & value) { - params.vocoder.hf_file = value; + params.vocoder.model.hf_file = value; } ).set_env("LLAMA_ARG_HF_FILE_V")); add_opt(common_arg( @@ -2454,7 +2987,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-md", "--model-draft"}, "FNAME", "draft model for speculative decoding (default: unused)", [](common_params & params, const std::string & value) { - params.speculative.model = value; + params.speculative.model.path = value; } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODEL_DRAFT")); @@ -2462,7 +2995,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"-mv", "--model-vocoder"}, "FNAME", "vocoder model for audio generation (default: unused)", [](common_params & params, const std::string & value) { - params.vocoder.model = value; + params.vocoder.model.path = value; } ).set_examples({LLAMA_EXAMPLE_TTS, LLAMA_EXAMPLE_SERVER})); add_opt(common_arg( @@ -2485,10 +3018,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--tts-oute-default"}, string_format("use default OuteTTS models (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; - params.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; - params.vocoder.hf_repo = "ggml-org/WavTokenizer"; - params.vocoder.hf_file = "WavTokenizer-Large-75-F16.gguf"; + params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF"; + params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf"; + params.vocoder.model.hf_repo = "ggml-org/WavTokenizer"; + params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf"; } ).set_examples({LLAMA_EXAMPLE_TTS})); @@ -2496,8 +3029,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--embd-bge-small-en-default"}, string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF"; - params.hf_file = "bge-small-en-v1.5-q8_0.gguf"; + params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF"; + params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf"; params.pooling_type = LLAMA_POOLING_TYPE_NONE; params.embd_normalize = 2; params.n_ctx = 512; @@ -2510,8 +3043,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--embd-e5-small-en-default"}, string_format("use default e5-small-v2 model (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF"; - params.hf_file = "e5-small-v2-q8_0.gguf"; + params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF"; + params.model.hf_file = "e5-small-v2-q8_0.gguf"; params.pooling_type = LLAMA_POOLING_TYPE_NONE; params.embd_normalize = 2; params.n_ctx 
= 512; @@ -2524,8 +3057,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--embd-gte-small-default"}, string_format("use default gte-small model (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/gte-small-Q8_0-GGUF"; - params.hf_file = "gte-small-q8_0.gguf"; + params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF"; + params.model.hf_file = "gte-small-q8_0.gguf"; params.pooling_type = LLAMA_POOLING_TYPE_NONE; params.embd_normalize = 2; params.n_ctx = 512; @@ -2538,8 +3071,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--fim-qwen-1.5b-default"}, string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF"; - params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf"; + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf"; params.port = 8012; params.n_gpu_layers = 99; params.flash_attn = true; @@ -2554,8 +3087,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--fim-qwen-3b-default"}, string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF"; - params.hf_file = "qwen2.5-coder-3b-q8_0.gguf"; + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf"; params.port = 8012; params.n_gpu_layers = 99; params.flash_attn = true; @@ -2570,8 +3103,8 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--fim-qwen-7b-default"}, string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; - params.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; params.port = 8012; params.n_gpu_layers = 99; params.flash_attn = true; @@ -2586,10 +3119,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--fim-qwen-7b-spec"}, string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; - params.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; - params.speculative.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; - params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf"; + params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; params.speculative.n_gpu_layers = 99; params.port = 8012; params.n_gpu_layers = 99; @@ -2605,10 +3138,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--fim-qwen-14b-spec"}, string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"), [](common_params & params) { - params.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF"; - params.hf_file = "qwen2.5-coder-14b-q8_0.gguf"; - params.speculative.hf_repo = 
"ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; - params.speculative.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; + params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF"; + params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf"; + params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF"; + params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf"; params.speculative.n_gpu_layers = 99; params.port = 8012; params.n_gpu_layers = 99; diff --git a/common/common.cpp b/common/common.cpp index 18ffb4e738aee..22642c84afa40 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -51,45 +51,11 @@ #include #include #endif -#if defined(LLAMA_USE_CURL) -#include -#include -#include -#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data #endif -#if defined(LLAMA_USE_CURL) -#ifdef __linux__ -#include -#elif defined(_WIN32) -# if !defined(PATH_MAX) -# define PATH_MAX MAX_PATH -# endif -#else -#include -#endif -#define LLAMA_CURL_MAX_URL_LENGTH 2084 // Maximum URL Length in Chrome: 2083 - -// -// CURL utils -// - -using curl_ptr = std::unique_ptr; - -// cannot use unique_ptr for curl_slist, because we cannot update without destroying the old one -struct curl_slist_ptr { - struct curl_slist * ptr = nullptr; - ~curl_slist_ptr() { - if (ptr) { - curl_slist_free_all(ptr); - } - } -}; -#endif // LLAMA_USE_CURL - using json = nlohmann::ordered_json; // @@ -900,22 +866,14 @@ std::string fs_get_cache_file(const std::string & filename) { // // Model utils // + struct common_init_result common_init_from_params(common_params & params) { common_init_result iparams; auto mparams = common_model_params_to_llama(params); - llama_model * model = nullptr; - - if (!params.hf_repo.empty() && !params.hf_file.empty()) { - model = common_load_model_from_hf(params.hf_repo, params.hf_file, params.model, params.hf_token, mparams); - } else if (!params.model_url.empty()) { - model = common_load_model_from_url(params.model_url, params.model, params.hf_token, mparams); - } else { - model = llama_model_load_from_file(params.model.c_str(), mparams); - } - + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); if (model == NULL) { - LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.c_str()); + LOG_ERR("%s: failed to load model '%s'\n", __func__, params.model.path.c_str()); return iparams; } @@ -950,7 +908,7 @@ struct common_init_result common_init_from_params(common_params & params) { llama_context * lctx = llama_init_from_model(model, cparams); if (lctx == NULL) { - LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.c_str()); + LOG_ERR("%s: failed to create context with model '%s'\n", __func__, params.model.path.c_str()); llama_model_free(model); return iparams; } @@ -1164,451 +1122,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p return tpp; } -#ifdef LLAMA_USE_CURL - -#define CURL_MAX_RETRY 3 -#define CURL_RETRY_DELAY_SECONDS 2 - -static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds) { - int remaining_attempts = max_attempts; - - while (remaining_attempts > 0) { - LOG_INF("%s: Trying to download from %s (attempt %d of %d)...\n", __func__ , url.c_str(), max_attempts - remaining_attempts + 1, max_attempts); - - CURLcode res = curl_easy_perform(curl); - if (res == CURLE_OK) { - return true; - } - - int exponential_backoff_delay = std::pow(retry_delay_seconds, max_attempts - remaining_attempts) * 1000; - 
LOG_WRN("%s: curl_easy_perform() failed: %s, retrying after %d milliseconds...\n", __func__, curl_easy_strerror(res), exponential_backoff_delay); - - remaining_attempts--; - std::this_thread::sleep_for(std::chrono::milliseconds(exponential_backoff_delay)); - } - - LOG_ERR("%s: curl_easy_perform() failed after %d attempts\n", __func__, max_attempts); - - return false; -} - -static bool common_download_file(const std::string & url, const std::string & path, const std::string & hf_token) { - // Initialize libcurl - curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); - curl_slist_ptr http_headers; - if (!curl) { - LOG_ERR("%s: error initializing libcurl\n", __func__); - return false; - } - - bool force_download = false; - - // Set the URL, allow to follow http redirection - curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); - - // Check if hf-token or bearer-token was specified - if (!hf_token.empty()) { - std::string auth_header = "Authorization: Bearer " + hf_token; - http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); - } - -#if defined(_WIN32) - // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of - // operating system. Currently implemented under MS-Windows. - curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); -#endif - - // Check if the file already exists locally - auto file_exists = std::filesystem::exists(path); - - // If the file exists, check its JSON metadata companion file. - std::string metadata_path = path + ".json"; - nlohmann::json metadata; - std::string etag; - std::string last_modified; - - if (file_exists) { - // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block). 
- std::ifstream metadata_in(metadata_path); - if (metadata_in.good()) { - try { - metadata_in >> metadata; - LOG_INF("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str()); - if (metadata.contains("url") && metadata.at("url").is_string()) { - auto previous_url = metadata.at("url").get(); - if (previous_url != url) { - LOG_ERR("%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str()); - return false; - } - } - if (metadata.contains("etag") && metadata.at("etag").is_string()) { - etag = metadata.at("etag"); - } - if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) { - last_modified = metadata.at("lastModified"); - } - } catch (const nlohmann::json::exception & e) { - LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what()); - return false; - } - } - } else { - LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str()); - } - - // Send a HEAD request to retrieve the etag and last-modified headers - struct common_load_model_from_url_headers { - std::string etag; - std::string last_modified; - }; - - common_load_model_from_url_headers headers; - - { - typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); - auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { - common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata; - - static std::regex header_regex("([^:]+): (.*)\r\n"); - static std::regex etag_regex("ETag", std::regex_constants::icase); - static std::regex last_modified_regex("Last-Modified", std::regex_constants::icase); - - std::string header(buffer, n_items); - std::smatch match; - if (std::regex_match(header, match, header_regex)) { - const std::string & key = match[1]; - const std::string & value = match[2]; - if (std::regex_match(key, match, etag_regex)) { - headers->etag = value; - } else if (std::regex_match(key, match, last_modified_regex)) { - headers->last_modified = value; - } - } - return n_items; - }; - - curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 1L); // will trigger the HEAD verb - curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); // hide head request progress - curl_easy_setopt(curl.get(), CURLOPT_HEADERFUNCTION, static_cast(header_callback)); - curl_easy_setopt(curl.get(), CURLOPT_HEADERDATA, &headers); - - bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); - if (!was_perform_successful) { - return false; - } - - long http_code = 0; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &http_code); - if (http_code != 200) { - // HEAD not supported, we don't know if the file has changed - // force trigger downloading - force_download = true; - LOG_ERR("%s: HEAD invalid http status code received: %ld\n", __func__, http_code); - } - } - - bool should_download = !file_exists || force_download; - if (!should_download) { - if (!etag.empty() && etag != headers.etag) { - LOG_WRN("%s: ETag header is different (%s != %s): triggering a new download\n", __func__, etag.c_str(), headers.etag.c_str()); - should_download = true; - } else if (!last_modified.empty() && last_modified != headers.last_modified) { - LOG_WRN("%s: Last-Modified header is different (%s != %s): triggering a new download\n", __func__, last_modified.c_str(), headers.last_modified.c_str()); - should_download = true; - } - } - if (should_download) { - std::string path_temporary = path + 
".downloadInProgress"; - if (file_exists) { - LOG_WRN("%s: deleting previous downloaded file: %s\n", __func__, path.c_str()); - if (remove(path.c_str()) != 0) { - LOG_ERR("%s: unable to delete file: %s\n", __func__, path.c_str()); - return false; - } - } - - // Set the output file - - struct FILE_deleter { - void operator()(FILE * f) const { - fclose(f); - } - }; - - std::unique_ptr outfile(fopen(path_temporary.c_str(), "wb")); - if (!outfile) { - LOG_ERR("%s: error opening local file for writing: %s\n", __func__, path.c_str()); - return false; - } - - typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); - auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { - return fwrite(data, size, nmemb, (FILE *)fd); - }; - curl_easy_setopt(curl.get(), CURLOPT_NOBODY, 0L); - curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); - curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get()); - - // display download progress - curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 0L); - - // helper function to hide password in URL - auto llama_download_hide_password_in_url = [](const std::string & url) -> std::string { - std::size_t protocol_pos = url.find("://"); - if (protocol_pos == std::string::npos) { - return url; // Malformed URL - } - - std::size_t at_pos = url.find('@', protocol_pos + 3); - if (at_pos == std::string::npos) { - return url; // No password in URL - } - - return url.substr(0, protocol_pos + 3) + "********" + url.substr(at_pos); - }; - - // start the download - LOG_INF("%s: trying to download model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, - llama_download_hide_password_in_url(url).c_str(), path.c_str(), headers.etag.c_str(), headers.last_modified.c_str()); - bool was_perform_successful = curl_perform_with_retry(url, curl.get(), CURL_MAX_RETRY, CURL_RETRY_DELAY_SECONDS); - if (!was_perform_successful) { - return false; - } - - long http_code = 0; - curl_easy_getinfo (curl.get(), CURLINFO_RESPONSE_CODE, &http_code); - if (http_code < 200 || http_code >= 400) { - LOG_ERR("%s: invalid http status code received: %ld\n", __func__, http_code); - return false; - } - - // Causes file to be closed explicitly here before we rename it. - outfile.reset(); - - // Write the updated JSON metadata file. 
- metadata.update({ - {"url", url}, - {"etag", headers.etag}, - {"lastModified", headers.last_modified} - }); - std::ofstream(metadata_path) << metadata.dump(4); - LOG_INF("%s: file metadata saved: %s\n", __func__, metadata_path.c_str()); - - if (rename(path_temporary.c_str(), path.c_str()) != 0) { - LOG_ERR("%s: unable to rename file: %s to %s\n", __func__, path_temporary.c_str(), path.c_str()); - return false; - } - } - - return true; -} - -struct llama_model * common_load_model_from_url( - const std::string & model_url, - const std::string & local_path, - const std::string & hf_token, - const struct llama_model_params & params) { - // Basic validation of the model_url - if (model_url.empty()) { - LOG_ERR("%s: invalid model_url\n", __func__); - return NULL; - } - - if (!common_download_file(model_url, local_path, hf_token)) { - return NULL; - } - - // check for additional GGUFs split to download - int n_split = 0; - { - struct gguf_init_params gguf_params = { - /*.no_alloc = */ true, - /*.ctx = */ NULL, - }; - auto * ctx_gguf = gguf_init_from_file(local_path.c_str(), gguf_params); - if (!ctx_gguf) { - LOG_ERR("\n%s: failed to load input GGUF from %s\n", __func__, local_path.c_str()); - return NULL; - } - - auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_SPLIT_COUNT); - if (key_n_split >= 0) { - n_split = gguf_get_val_u16(ctx_gguf, key_n_split); - } - - gguf_free(ctx_gguf); - } - - if (n_split > 1) { - char split_prefix[PATH_MAX] = {0}; - char split_url_prefix[LLAMA_CURL_MAX_URL_LENGTH] = {0}; - - // Verify the first split file format - // and extract split URL and PATH prefixes - { - if (!llama_split_prefix(split_prefix, sizeof(split_prefix), local_path.c_str(), 0, n_split)) { - LOG_ERR("\n%s: unexpected model file name: %s n_split=%d\n", __func__, local_path.c_str(), n_split); - return NULL; - } - - if (!llama_split_prefix(split_url_prefix, sizeof(split_url_prefix), model_url.c_str(), 0, n_split)) { - LOG_ERR("\n%s: unexpected model url: %s n_split=%d\n", __func__, model_url.c_str(), n_split); - return NULL; - } - } - - // Prepare download in parallel - std::vector> futures_download; - for (int idx = 1; idx < n_split; idx++) { - futures_download.push_back(std::async(std::launch::async, [&split_prefix, &split_url_prefix, &n_split, hf_token](int download_idx) -> bool { - char split_path[PATH_MAX] = {0}; - llama_split_path(split_path, sizeof(split_path), split_prefix, download_idx, n_split); - - char split_url[LLAMA_CURL_MAX_URL_LENGTH] = {0}; - llama_split_path(split_url, sizeof(split_url), split_url_prefix, download_idx, n_split); - - return common_download_file(split_url, split_path, hf_token); - }, idx)); - } - - // Wait for all downloads to complete - for (auto & f : futures_download) { - if (!f.get()) { - return NULL; - } - } - } - - return llama_model_load_from_file(local_path.c_str(), params); -} - -struct llama_model * common_load_model_from_hf( - const std::string & repo, - const std::string & remote_path, - const std::string & local_path, - const std::string & hf_token, - const struct llama_model_params & params) { - // construct hugging face model url: - // - // --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf - // https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf - // - // --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf - // https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf - // - - std::string model_url = "https://huggingface.co/"; - model_url += 
repo; - model_url += "/resolve/main/"; - model_url += remote_path; - - return common_load_model_from_url(model_url, local_path, hf_token, params); -} - -/** - * Allow getting the HF file from the HF repo with tag (like ollama), for example: - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 - * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s - * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) - * - * Return pair of (with "repo" already having tag removed) - * - * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. - */ -std::pair common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & hf_token) { - auto parts = string_split(hf_repo_with_tag, ':'); - std::string tag = parts.size() > 1 ? parts.back() : "latest"; - std::string hf_repo = parts[0]; - if (string_split(hf_repo, '/').size() != 2) { - throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); - } - - // fetch model info from Hugging Face Hub API - json model_info; - curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); - curl_slist_ptr http_headers; - std::string res_str; - std::string url = "https://huggingface.co/v2/" + hf_repo + "/manifests/" + tag; - curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); - typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); - auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { - static_cast(data)->append((char * ) ptr, size * nmemb); - return size * nmemb; - }; - curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); - curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str); -#if defined(_WIN32) - curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); -#endif - if (!hf_token.empty()) { - std::string auth_header = "Authorization: Bearer " + hf_token; - http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); - } - // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response - http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); - http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json"); - curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); - - CURLcode res = curl_easy_perform(curl.get()); - - if (res != CURLE_OK) { - throw std::runtime_error("error: cannot make GET request to HF API"); - } - - long res_code; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); - if (res_code == 200) { - model_info = json::parse(res_str); - } else if (res_code == 401) { - throw std::runtime_error("error: model is private or does not exist; if you are accessing a gated model, please provide a valid HF token"); - } else { - throw std::runtime_error(string_format("error from HF API, response code: %ld, data: %s", res_code, res_str.c_str())); - } - - // check response - if (!model_info.contains("ggufFile")) { - throw std::runtime_error("error: model does not have ggufFile"); - } - json & gguf_file = model_info.at("ggufFile"); - if (!gguf_file.contains("rfilename")) { - throw std::runtime_error("error: ggufFile does not have rfilename"); - } - - return 
std::make_pair(hf_repo, gguf_file.at("rfilename")); -} - -#else - -struct llama_model * common_load_model_from_url( - const std::string & /*model_url*/, - const std::string & /*local_path*/, - const std::string & /*hf_token*/, - const struct llama_model_params & /*params*/) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__); - return nullptr; -} - -struct llama_model * common_load_model_from_hf( - const std::string & /*repo*/, - const std::string & /*remote_path*/, - const std::string & /*local_path*/, - const std::string & /*hf_token*/, - const struct llama_model_params & /*params*/) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); - return nullptr; -} - -std::pair common_get_hf_file(const std::string &, const std::string &) { - LOG_WRN("%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__); - return std::make_pair("", ""); -} - -#endif // LLAMA_USE_CURL - // // Batch utils // diff --git a/common/common.h b/common/common.h index 1c0f199774976..41ff9905e4416 100644 --- a/common/common.h +++ b/common/common.h @@ -184,6 +184,13 @@ struct common_params_sampling { std::string print() const; }; +struct common_params_model { + std::string path = ""; // model local path // NOLINT + std::string url = ""; // model url to download // NOLINT + std::string hf_repo = ""; // HF repo // NOLINT + std::string hf_file = ""; // HF file // NOLINT +}; + struct common_params_speculative { std::vector devices; // devices to use for offloading @@ -197,19 +204,11 @@ struct common_params_speculative { struct cpu_params cpuparams; struct cpu_params cpuparams_batch; - std::string hf_repo = ""; // HF repo // NOLINT - std::string hf_file = ""; // HF file // NOLINT - - std::string model = ""; // draft model for speculative decoding // NOLINT - std::string model_url = ""; // model url to download // NOLINT + struct common_params_model model; }; struct common_params_vocoder { - std::string hf_repo = ""; // HF repo // NOLINT - std::string hf_file = ""; // HF file // NOLINT - - std::string model = ""; // model path // NOLINT - std::string model_url = ""; // model url to download // NOLINT + struct common_params_model model; std::string speaker_file = ""; // speaker file path // NOLINT @@ -267,12 +266,10 @@ struct common_params { struct common_params_speculative speculative; struct common_params_vocoder vocoder; - std::string model = ""; // model path // NOLINT + struct common_params_model model; + std::string model_alias = ""; // model alias // NOLINT - std::string model_url = ""; // model url to download // NOLINT std::string hf_token = ""; // HF token // NOLINT - std::string hf_repo = ""; // HF repo // NOLINT - std::string hf_file = ""; // HF file // NOLINT std::string prompt = ""; // NOLINT std::string system_prompt = ""; // NOLINT std::string prompt_file = ""; // store the external prompt file name // NOLINT @@ -347,7 +344,7 @@ struct common_params { common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO; // multimodal models (see examples/llava) - std::string mmproj = ""; // path to multimodal projector // NOLINT + struct common_params_model mmproj; std::vector image; // path to image file(s) // embedding @@ -546,23 +543,6 @@ struct llama_model_params common_model_params_to_llama ( common_params struct llama_context_params common_context_params_to_llama(const common_params & params); struct ggml_threadpool_params 
ggml_threadpool_params_from_cpu_params(const cpu_params & params); -struct llama_model * common_load_model_from_url( - const std::string & model_url, - const std::string & local_path, - const std::string & hf_token, - const struct llama_model_params & params); - -struct llama_model * common_load_model_from_hf( - const std::string & repo, - const std::string & remote_path, - const std::string & local_path, - const std::string & hf_token, - const struct llama_model_params & params); - -std::pair common_get_hf_file( - const std::string & hf_repo_with_tag, - const std::string & hf_token); - // clear LoRA adapters from context, then apply new list of adapters void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora); diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 430e8be512653..0f4019293d581 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -38,7 +38,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = common_model_params_to_llama(params); - llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); if (model == NULL) { fprintf(stderr , "%s: error: unable to load model\n" , __func__); diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 21b95ef5e4e83..1a5de5928a526 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -41,7 +41,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = common_model_params_to_llama(params); - llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: error: unable to load model\n" , __func__); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index e7d0fbfffedb0..24dc85cf27336 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -421,7 +421,7 @@ int main(int argc, char ** argv) { g_verbose = (params.verbosity > 1); try { - lora_merge_ctx ctx(params.model, params.lora_adapters, params.out_file, params.cpuparams.n_threads); + lora_merge_ctx ctx(params.model.path, params.lora_adapters, params.out_file, params.cpuparams.n_threads); ctx.run_merge(); } catch (const std::exception & err) { fprintf(stderr, "%s\n", err.what()); diff --git a/examples/gritlm/gritlm.cpp b/examples/gritlm/gritlm.cpp index f7db7861c1ad5..539bc4d6027fb 100644 --- a/examples/gritlm/gritlm.cpp +++ b/examples/gritlm/gritlm.cpp @@ -168,7 +168,7 @@ int main(int argc, char * argv[]) { llama_backend_init(); - llama_model * model = llama_model_load_from_file(params.model.c_str(), mparams); + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), mparams); // create generation context llama_context * ctx = llama_init_from_model(model, cparams); diff --git a/examples/llava/README-gemma3.md b/examples/llava/README-gemma3.md index 20bf73fb5c043..3c25ee2583027 100644 --- a/examples/llava/README-gemma3.md +++ b/examples/llava/README-gemma3.md @@ -4,6 +4,26 @@ > > This is very experimental, only used for demo purpose. 
+## Quick start + +You can use a pre-quantized model from [ggml-org](https://huggingface.co/ggml-org)'s Hugging Face account + +```bash +# build +cmake -B build +cmake --build build --target llama-gemma3-cli + +# alternatively, install from brew (macOS) +brew install llama.cpp + +# run it +llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF +llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF +llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF + +# note: the 1B model does not support vision +``` + ## How to get mmproj.gguf? ```bash diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/gemma3-cli.cpp index c36bb2eda0c70..7813ac19f5d67 100644 --- a/examples/llava/gemma3-cli.cpp +++ b/examples/llava/gemma3-cli.cpp @@ -78,7 +78,7 @@ struct gemma3_context { } void init_clip_model(common_params & params) { - const char * clip_path = params.mmproj.c_str(); + const char * clip_path = params.mmproj.path.c_str(); ctx_clip = clip_model_load(clip_path, params.verbosity > 1); } @@ -232,13 +232,13 @@ int main(int argc, char ** argv) { common_init(); - if (params.mmproj.empty()) { + if (params.mmproj.path.empty()) { show_additional_info(argc, argv); return 1; } gemma3_context ctx(params); - printf("%s: %s\n", __func__, params.model.c_str()); + printf("%s: %s\n", __func__, params.model.path.c_str()); bool is_single_turn = !params.prompt.empty() && !params.image.empty(); diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp index 40aa0876f24a7..a15131343635c 100644 --- a/examples/llava/llava-cli.cpp +++ b/examples/llava/llava-cli.cpp @@ -225,7 +225,7 @@ static struct llama_model * llava_init(common_params * params) { llama_model_params model_params = common_model_params_to_llama(*params); - llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return NULL; @@ -234,7 +234,7 @@ static struct llama_model * llava_init(common_params * params) { } static struct llava_context * llava_init_context(common_params * params, llama_model * model) { - const char * clip_path = params->mmproj.c_str(); + const char * clip_path = params->mmproj.path.c_str(); auto prompt = params->prompt; if (prompt.empty()) { @@ -283,7 +283,7 @@ int main(int argc, char ** argv) { common_init(); - if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { + if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { print_usage(argc, argv); return 1; } diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp index 12f536cf5cfff..48fddeaa4d3c4 100644 --- a/examples/llava/minicpmv-cli.cpp +++ b/examples/llava/minicpmv-cli.cpp @@ -31,7 +31,7 @@ static struct llama_model * llava_init(common_params * params) { llama_model_params model_params = common_model_params_to_llama(*params); - llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return NULL; @@ -80,7 +80,7 @@ static void llava_free(struct llava_context * ctx_llava) { } static struct clip_ctx * clip_init_context(common_params * params) { - const char * clip_path = params->mmproj.c_str(); + const char * clip_path = params->mmproj.path.c_str(); auto prompt = params->prompt; if
(prompt.empty()) { @@ -290,7 +290,7 @@ int main(int argc, char ** argv) { common_init(); - if (params.mmproj.empty() || (params.image.empty())) { + if (params.mmproj.path.empty() || (params.image.empty())) { show_additional_info(argc, argv); return 1; } diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp index 132a7da543c2a..c6481e482a811 100644 --- a/examples/llava/qwen2vl-cli.cpp +++ b/examples/llava/qwen2vl-cli.cpp @@ -314,7 +314,7 @@ static struct llama_model * llava_init(common_params * params) { llama_model_params model_params = common_model_params_to_llama(*params); - llama_model * model = llama_model_load_from_file(params->model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); return NULL; @@ -323,7 +323,7 @@ static struct llama_model * llava_init(common_params * params) { } static struct llava_context * llava_init_context(common_params * params, llama_model * model) { - const char * clip_path = params->mmproj.c_str(); + const char * clip_path = params->mmproj.path.c_str(); auto prompt = params->prompt; if (prompt.empty()) { @@ -524,7 +524,7 @@ int main(int argc, char ** argv) { common_init(); - if (params.mmproj.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { + if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { print_usage(argc, argv); return 1; } diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 588632f0432b2..e0e6da631dad3 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -405,7 +405,7 @@ int main(int argc, char ** argv) { params.prompt_file = "used built-in defaults"; } LOG_INF("External prompt file: \033[32m%s\033[0m\n", params.prompt_file.c_str()); - LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.c_str()); + LOG_INF("Model and path used: \033[32m%s\033[0m\n\n", params.model.path.c_str()); LOG_INF("Total prompt tokens: %6d, speed: %5.2f t/s\n", n_total_prompt, (double) (n_total_prompt ) / (t_main_end - t_main_start) * 1e6); LOG_INF("Total gen tokens: %6d, speed: %5.2f t/s\n", n_total_gen, (double) (n_total_gen ) / (t_main_end - t_main_start) * 1e6); diff --git a/examples/passkey/passkey.cpp b/examples/passkey/passkey.cpp index ea3a6c1fca3ee..347ea4a698f2e 100644 --- a/examples/passkey/passkey.cpp +++ b/examples/passkey/passkey.cpp @@ -64,7 +64,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = common_model_params_to_llama(params); - llama_model * model = llama_model_load_from_file(params.model.c_str(), model_params); + llama_model * model = llama_model_load_from_file(params.model.path.c_str(), model_params); if (model == NULL) { LOG_ERR("%s: unable to load model\n" , __func__); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 17a292da153c1..d140f8c4469c9 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1876,7 +1876,7 @@ struct server_context { } bool load_model(const common_params & params) { - SRV_INF("loading model '%s'\n", params.model.c_str()); + SRV_INF("loading model '%s'\n", params.model.path.c_str()); params_base = params; @@ -1886,7 +1886,7 @@ struct server_context { ctx = llama_init.context.get(); if (model == nullptr) { - SRV_ERR("failed to load model, '%s'\n", params_base.model.c_str()); + SRV_ERR("failed to load model, '%s'\n", params_base.model.path.c_str()); 
return false; } @@ -1897,16 +1897,13 @@ struct server_context { add_bos_token = llama_vocab_get_add_bos(vocab); has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL; - if (!params_base.speculative.model.empty() || !params_base.speculative.hf_repo.empty()) { - SRV_INF("loading draft model '%s'\n", params_base.speculative.model.c_str()); + if (!params_base.speculative.model.path.empty() || !params_base.speculative.model.hf_repo.empty()) { + SRV_INF("loading draft model '%s'\n", params_base.speculative.model.path.c_str()); auto params_dft = params_base; params_dft.devices = params_base.speculative.devices; - params_dft.hf_file = params_base.speculative.hf_file; - params_dft.hf_repo = params_base.speculative.hf_repo; params_dft.model = params_base.speculative.model; - params_dft.model_url = params_base.speculative.model_url; params_dft.n_ctx = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx; params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers; params_dft.n_parallel = 1; @@ -1920,12 +1917,12 @@ struct server_context { model_dft = llama_init_dft.model.get(); if (model_dft == nullptr) { - SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.c_str()); + SRV_ERR("failed to load draft model, '%s'\n", params_base.speculative.model.path.c_str()); return false; } if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) { - SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.c_str(), params_base.model.c_str()); + SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str()); return false; } @@ -3865,7 +3862,7 @@ int main(int argc, char ** argv) { json data = { { "default_generation_settings", ctx_server.default_generation_settings_for_props }, { "total_slots", ctx_server.params_base.n_parallel }, - { "model_path", ctx_server.params_base.model }, + { "model_path", ctx_server.params_base.model.path }, { "chat_template", common_chat_templates_source(ctx_server.chat_templates.get()) }, { "bos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_bos(ctx_server.vocab), /* special= */ true)}, { "eos_token", common_token_to_piece(ctx_server.ctx, llama_vocab_eos(ctx_server.vocab), /* special= */ true)}, @@ -4131,7 +4128,7 @@ int main(int argc, char ** argv) { {"object", "list"}, {"data", { { - {"id", params.model_alias.empty() ? params.model : params.model_alias}, + {"id", params.model_alias.empty() ? 
params.model.path : params.model_alias}, {"object", "model"}, {"created", std::time(0)}, {"owned_by", "llamacpp"}, diff --git a/examples/speculative-simple/speculative-simple.cpp b/examples/speculative-simple/speculative-simple.cpp index a5d2bc9d09de7..0783ed4a4c43e 100644 --- a/examples/speculative-simple/speculative-simple.cpp +++ b/examples/speculative-simple/speculative-simple.cpp @@ -24,7 +24,7 @@ int main(int argc, char ** argv) { common_init(); - if (params.speculative.model.empty()) { + if (params.speculative.model.path.empty()) { LOG_ERR("%s: --model-draft is required\n", __func__); return 1; } diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 627d01bbcb5ad..561c308830351 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -46,7 +46,7 @@ int main(int argc, char ** argv) { common_init(); - if (params.speculative.model.empty()) { + if (params.speculative.model.path.empty()) { LOG_ERR("%s: --model-draft is required\n", __func__); return 1; } diff --git a/examples/tts/tts.cpp b/examples/tts/tts.cpp index c7ac94cc54e1b..0f047986965f8 100644 --- a/examples/tts/tts.cpp +++ b/examples/tts/tts.cpp @@ -577,12 +577,7 @@ int main(int argc, char ** argv) { const llama_vocab * vocab = llama_model_get_vocab(model_ttc); - // TODO: refactor in a common struct - params.model = params.vocoder.model; - params.model_url = params.vocoder.model_url; - params.hf_repo = params.vocoder.hf_repo; - params.hf_file = params.vocoder.hf_file; - + params.model = params.vocoder.model; params.embedding = true; common_init_result llama_init_cts = common_init_from_params(params); diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 69604b87ceec4..537fc63a4c975 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -77,7 +77,7 @@ int main(void) { argv = {"binary_name", "-m", "model_file.gguf"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.model == "model_file.gguf"); + assert(params.model.path == "model_file.gguf"); argv = {"binary_name", "-t", "1234"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); @@ -89,7 +89,7 @@ int main(void) { argv = {"binary_name", "-m", "abc.gguf", "--predict", "6789", "--batch-size", "9090"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.model == "abc.gguf"); + assert(params.model.path == "abc.gguf"); assert(params.n_predict == 6789); assert(params.n_batch == 9090); @@ -112,7 +112,7 @@ int main(void) { setenv("LLAMA_ARG_THREADS", "1010", true); argv = {"binary_name"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.model == "blah.gguf"); + assert(params.model.path == "blah.gguf"); assert(params.cpuparams.n_threads == 1010); @@ -122,7 +122,7 @@ int main(void) { setenv("LLAMA_ARG_THREADS", "1010", true); argv = {"binary_name", "-m", "overwritten.gguf"}; assert(true == common_params_parse(argv.size(), list_str_to_char(argv).data(), params, LLAMA_EXAMPLE_COMMON)); - assert(params.model == "overwritten.gguf"); + assert(params.model.path == "overwritten.gguf"); assert(params.cpuparams.n_threads == 1010); #endif // _WIN32 From 9bacd6b37461608385360fd64326c13247ccf18e Mon Sep 17 00:00:00 2001 From: Chenguang Li <757486878@qq.com> Date: Wed, 2 Apr 2025 15:22:13 +0800 
Subject: [PATCH 02/11] [CANN] get_rows and dup optimization (#12671) * [CANN]get_rows and dup optimization. Co-authored-by: hipudding Signed-off-by: noemotiovon * [CANN]GET_ROWS and CPY/DUP optimization Co-authored-by: hipudding Signed-off-by: noemotiovon * [CANN]code style adjustment Signed-off-by: noemotiovon * [CANN]code style adjustment Signed-off-by: noemotiovon * [CANN]code style adjustment Signed-off-by: noemotiovon * [CANN]code style adjustment Signed-off-by: noemotiovon --------- Signed-off-by: noemotiovon Co-authored-by: noemotiovon Co-authored-by: hipudding --- ggml/src/ggml-cann/CMakeLists.txt | 2 - ggml/src/ggml-cann/aclnn_ops.cpp | 467 +++++++++--------- ggml/src/ggml-cann/ggml-cann.cpp | 26 +- ggml/src/ggml-cann/kernels/CMakeLists.txt | 30 -- ggml/src/ggml-cann/kernels/ascendc_kernels.h | 19 - ggml/src/ggml-cann/kernels/dup.cpp | 234 --------- ggml/src/ggml-cann/kernels/get_row_f16.cpp | 197 -------- ggml/src/ggml-cann/kernels/get_row_f32.cpp | 190 ------- ggml/src/ggml-cann/kernels/get_row_q4_0.cpp | 204 -------- ggml/src/ggml-cann/kernels/get_row_q8_0.cpp | 191 ------- .../ggml-cann/kernels/quantize_f16_q8_0.cpp | 218 -------- .../ggml-cann/kernels/quantize_f32_q8_0.cpp | 216 -------- .../kernels/quantize_float_to_q4_0.cpp | 295 ----------- 13 files changed, 256 insertions(+), 2033 deletions(-) delete mode 100644 ggml/src/ggml-cann/kernels/CMakeLists.txt delete mode 100644 ggml/src/ggml-cann/kernels/ascendc_kernels.h delete mode 100644 ggml/src/ggml-cann/kernels/dup.cpp delete mode 100644 ggml/src/ggml-cann/kernels/get_row_f16.cpp delete mode 100644 ggml/src/ggml-cann/kernels/get_row_f32.cpp delete mode 100644 ggml/src/ggml-cann/kernels/get_row_q4_0.cpp delete mode 100644 ggml/src/ggml-cann/kernels/get_row_q8_0.cpp delete mode 100644 ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp delete mode 100644 ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp delete mode 100644 ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp diff --git a/ggml/src/ggml-cann/CMakeLists.txt b/ggml/src/ggml-cann/CMakeLists.txt index 05cf06bfab4fc..0d8e483b291c7 100644 --- a/ggml/src/ggml-cann/CMakeLists.txt +++ b/ggml/src/ggml-cann/CMakeLists.txt @@ -51,13 +51,11 @@ if (CANN_INSTALL_DIR) ${CANN_INSTALL_DIR}/acllib/include ) - add_subdirectory(kernels) list(APPEND CANN_LIBRARIES ascendcl nnopbase opapi acl_op_compiler - ascendc_kernels ) file(GLOB GGML_SOURCES_CANN "*.cpp") diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp index 6bb5d08349197..8482bb53761f4 100644 --- a/ggml/src/ggml-cann/aclnn_ops.cpp +++ b/ggml/src/ggml-cann/aclnn_ops.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -58,7 +59,6 @@ #include #include "ggml-impl.h" -#include "kernels/ascendc_kernels.h" #define GGML_COMMON_DECL_C @@ -99,6 +99,35 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src, ACL_CHECK(aclDestroyIntArray(repeats)); } +/** + * @brief Casts the elements of a tensor to a specified data type using the CANN backend. + * + * @details This function performs a type conversion on the elements of the input tensor `acl_src` + * and stores the results in the destination tensor `acl_dst`. The conversion type is + * determined based on the `dst` tensor's data type. + * + * @param ctx The context for the CANN backend operations. + * @param acl_src The source tensor whose elements will be cast. + * @param acl_dst The destination tensor that will store the casted elements. 
+ * @param dst The ggml tensor specifying the target data type. + */ +static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src, + aclTensor* acl_dst, ggml_tensor* dst) { + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, + ggml_cann_type_mapping(dst->type), + acl_dst, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream())); +} + void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ggml_tensor* src = dst->src[0]; GGML_ASSERT(ggml_can_repeat(src, dst)); @@ -889,173 +918,76 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src, } void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src = dst->src[0]; + ggml_tensor* src0 = dst->src[0]; - aclTensor* acl_src = ggml_cann_create_tensor(src); + aclTensor* acl_src = ggml_cann_create_tensor(src0); aclTensor* acl_dst = ggml_cann_create_tensor(dst); - - ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - src->extra = src_extra_allocator.get(); - dst->extra = dst_extra_allocator.get(); - ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); - ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); - - if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) && - ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - // TODO: simplify - if (src->type == GGML_TYPE_F16) { - if (dst->type == GGML_TYPE_Q8_0) { - aclrtlaunch_ascendc_quantize_f16_q8_0( - 24, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne); - return; - } - if (dst->type == GGML_TYPE_Q4_0) { - aclrtlaunch_ascendc_quantize_f16_to_q4_0( - 24, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne); - return; - } - if (dst->type == GGML_TYPE_F16) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - if (ggml_is_contiguous(dst)) { - const size_t src_type_size = ggml_type_size(src->type); - if (src->nb[0] == src_type_size) { - // src0 is contigous on first dimension, copy by rows - int64_t rows_num = ggml_nrows(src); - - aclrtlaunch_ascendc_dup_by_rows_fp16( - rows_num, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - return; - } - GGML_ABORT("fatal error"); - } - GGML_ABORT("fatal error"); - } - if (dst->type == GGML_TYPE_F32) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); - return; - } - if (ggml_is_contiguous(dst)) { - const size_t src_type_size = ggml_type_size(src->type); - if (src->nb[0] == src_type_size) { - // src0 is contigous on first 
dimension, copy by rows - int64_t rows_num = ggml_nrows(src); - aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32( - rows_num, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - return; - } - GGML_ABORT("fatal error"); - } - GGML_ABORT("fatal error"); - } - // TODO - GGML_ABORT("fatal error"); - } else if (src->type == GGML_TYPE_F32) { - // TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size - // && nb0 == type_size) - if (dst->type == GGML_TYPE_Q8_0) { - aclrtlaunch_ascendc_quantize_f32_q8_0( - 24, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne); - return; - } - if (dst->type == GGML_TYPE_Q4_0) { - aclrtlaunch_ascendc_quantize_f32_to_q4_0( - 24, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne); - return; + if (ggml_are_same_shape(src0, dst)) { + if (dst->type == src0->type) { + cann_copy(ctx, acl_src, acl_dst); + } else { + aclnn_cast(ctx, acl_src, acl_dst, dst); } - if (dst->type == GGML_TYPE_F32) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); + } else { + if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) { + if (dst->type == src0->type) { + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK(aclrtMemcpyAsync( + dst->data, cpy_size, src0->data, cpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); return; - } - if (ggml_is_contiguous(dst)) { - const size_t src_type_size = ggml_type_size(src->type); - if (src->nb[0] == src_type_size) { - // src0 is contigous on first dimension, copy by rows - int64_t rows_num = ggml_nrows(src); - aclrtlaunch_ascendc_dup_by_rows_fp32( - rows_num, ctx.stream(), src->data, dst->data, - ((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - return; - } - GGML_ABORT("fatal error"); } else { - // TODO: dst not contiguous - GGML_ABORT("fatal error"); - } - } - if (dst->type == GGML_TYPE_F16) { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); + ggml_cann_pool_alloc src_buffer_allocator( + ctx.pool(), + ggml_nelements(dst) * ggml_type_size(dst->type)); + void* src_trans_buffer = src_buffer_allocator.get(); + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; + } + aclTensor* src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), src0->ne, src_trans_nb, + GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_src, src_trans_tensor, dst); + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK(aclrtMemcpyAsync( + dst->data, cpy_size, src_trans_buffer, cpy_size, + ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream())); + ACL_CHECK(aclDestroyTensor(src_trans_tensor)); return; } - if (ggml_is_contiguous(dst)) { - const size_t src_type_size = ggml_type_size(src->type); - if (src->nb[0] == src_type_size) { - // src0 is contigous on first dimension, copy by rows - int64_t rows_num = ggml_nrows(src); - aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16( - rows_num, ctx.stream(), src->data, dst->data, - 
((ggml_tensor*)src->extra)->ne, - ((ggml_tensor*)src->extra)->nb, - ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - return; - } - GGML_ABORT("fatal error"); + } else if (ggml_is_contiguous(dst)) { + ggml_cann_pool_alloc src_buffer_allocator( + ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type)); + void* src_trans_buffer = src_buffer_allocator.get(); + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = ggml_type_size(dst->type); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; } - } - // TODO - GGML_ABORT("fatal error"); - } else { - if (ggml_are_same_shape(src, dst)) { - cann_copy(ctx, acl_src, acl_dst); - ACL_CHECK(aclDestroyTensor(acl_src)); - ACL_CHECK(aclDestroyTensor(acl_dst)); + aclTensor* src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ggml_cann_type_mapping(dst->type), + ggml_type_size(dst->type), src0->ne, src_trans_nb, + GGML_MAX_DIMS); + + aclnn_cast(ctx, acl_src, src_trans_tensor, dst); + + size_t cpy_size = ggml_nbytes(dst); + ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src_trans_buffer, + cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE, + ctx.stream())); + ACL_CHECK(aclDestroyTensor(src_trans_tensor)); return; + } else { + GGML_ABORT("Unsupported: dst is not contiguous."); } - GGML_ABORT("fatal error"); } + + ACL_CHECK(aclDestroyTensor(acl_src)); + ACL_CHECK(aclDestroyTensor(acl_dst)); } #ifdef __cplusplus @@ -2378,85 +2310,168 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) { ACL_CHECK(aclDestroyTensor(tmp_mask_tensor)); } -void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { - ggml_tensor* src0 = dst->src[0]; - ggml_tensor* src1 = dst->src[1]; +/** + * @brief Performs embedding operation on a 4D tensor using the CANN backend. + * + * This function extracts slices from the source tensor (`src_buffer`), + * index tensor (`index`), and destination tensor (`dst`), and performs an + * embedding operation on them. The embedding operation is applied by iterating + * over the last two dimensions of the source tensor, creating the necessary + * tensors for the source, index, and output, and executing the embedding operation. + * + * @param ctx The context for CANN backend operations. + * @param src_buffer The source buffer holding the data for the source tensor. + * @param src_ne The dimensions of the source tensor. + * @param src_nb The strides (byte offsets) of the source tensor. + * @param index The index tensor used in the embedding operation. + * @param dst The destination tensor where the result will be stored.
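+ *
+ * @note This helper is the building block of the refactored GGML_OP_GET_ROWS
+ * path: it performs the row gather by invoking aclnnEmbedding once per 2D
+ * slice (the loops over src_ne[3] and src_ne[2] below). It expects
+ * `src_buffer` to already hold elements of `dst`'s data type, so F16 and
+ * quantized sources are cast/dequantized by the caller first.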
+ */ +static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer, + int64_t* src_ne, size_t* src_nb, ggml_tensor* index, + ggml_tensor* dst) { + for (int64_t i = 0; i < src_ne[3]; i++) { + for (int64_t j = 0; j < src_ne[2]; j++) { + // src + int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]}; + size_t acl_src_nb[2] = {src_nb[0], src_nb[1]}; + aclTensor* acl_src_tensor = ggml_cann_create_tensor( + (char*)src_buffer + i * src_nb[3] + j * src_nb[2], + ggml_cann_type_mapping(dst->type), ggml_element_size(dst), + acl_src_ne, acl_src_nb, 2); + + // index + int64_t acl_index_ne[1] = {index->ne[0]}; + size_t acl_index_nb[1] = {index->nb[0]}; + aclTensor* acl_index = ggml_cann_create_tensor( + (char*)index->data + i * index->nb[2] + j * index->nb[1], + ggml_cann_type_mapping(index->type), ggml_element_size(index), + acl_index_ne, acl_index_nb, 1); + + // out + int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]}; + size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]}; + aclTensor* acl_out = ggml_cann_create_tensor( + (char*)dst->data + i * dst->nb[3] + j * dst->nb[2], + ggml_cann_type_mapping(dst->type), ggml_element_size(dst), + acl_out_ne, acl_out_nb, 2); + + uint64_t workspaceSize = 0; + aclOpExecutor* executor; + void* workspaceAddr = nullptr; + + ACL_CHECK(aclnnEmbeddingGetWorkspaceSize( + acl_src_tensor, acl_index, acl_out, &workspaceSize, &executor)); + + if (workspaceSize > 0) { + ggml_cann_pool_alloc workspace_allocator(ctx.pool(), + workspaceSize); + workspaceAddr = workspace_allocator.get(); + } + + ACL_CHECK(aclnnEmbedding(workspaceAddr, workspaceSize, executor, + ctx.stream())); + + ACL_CHECK(aclDestroyTensor(acl_src_tensor)); + ACL_CHECK(aclDestroyTensor(acl_index)); + ACL_CHECK(aclDestroyTensor(acl_out)); + } + } +} - ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor)); - src0->extra = src0_extra_allocator.get(); - src1->extra = src1_extra_allocator.get(); - dst->extra = dst_extra_allocator.get(); - ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); - ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); - ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst, - sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE, - ctx.stream())); +void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) { + ggml_tensor* src0 = dst->src[0]; // src + ggml_tensor* src1 = dst->src[1]; // index switch (src0->type) { case GGML_TYPE_F32: { -#ifdef ASCEND_310P - // Special operation for get_row_f32 kernel of 310P: clear the - // content of dest data buffer when row is not aligned to 32 bytes - if ((src0->ne[0] % 8) != 0) { - size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * - src0->ne[0] * ggml_type_size(GGML_TYPE_F32); - ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); - } -#endif - aclrtlaunch_ascendc_get_row_f32( - 24, ctx.stream(), src0->data, src1->data, dst->data, - ((ggml_tensor*)src0->extra)->ne, - ((ggml_tensor*)src0->extra)->nb, - ((ggml_tensor*)src1->extra)->ne, - ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1, + dst); break; } case GGML_TYPE_F16: { -#ifdef ASCEND_310P - // Special operation 
for get_row_f16 kernel of 310P: clear the - // content of dest data buffer when row is not aligned to 32 bytes - if ((src0->ne[0] % 16) != 0) { - size_t dst_len = - src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * - ggml_type_size( - GGML_TYPE_F32); // out is also f32, even input is f16 - ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len)); + aclTensor* acl_src0 = ggml_cann_create_tensor(src0); + ggml_cann_pool_alloc src_buffer_allocator( + ctx.pool(), ggml_nelements(src0) * sizeof(float_t)); + void* src_trans_buffer = src_buffer_allocator.get(); + size_t src_trans_nb[GGML_MAX_DIMS]; + src_trans_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS; i++) { + src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1]; } -#endif - aclrtlaunch_ascendc_get_row_f16( - 24, ctx.stream(), src0->data, src1->data, dst->data, - ((ggml_tensor*)src0->extra)->ne, - ((ggml_tensor*)src0->extra)->nb, - ((ggml_tensor*)src1->extra)->ne, - ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + aclTensor* src_trans_tensor = ggml_cann_create_tensor( + src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type), + src0->ne, src_trans_nb, GGML_MAX_DIMS); + aclnn_cast(ctx, acl_src0, src_trans_tensor, dst); + aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne, + src_trans_nb, src1, dst); + ACL_CHECK(aclDestroyTensor(acl_src0)); + ACL_CHECK(aclDestroyTensor(src_trans_tensor)); break; } - case GGML_TYPE_Q4_0: - aclrtlaunch_ascendc_get_row_q4_0( - 24, ctx.stream(), src0->data, src1->data, dst->data, - ((ggml_tensor*)src0->extra)->ne, - ((ggml_tensor*)src1->extra)->ne, - ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); - break; - case GGML_TYPE_Q8_0: - aclrtlaunch_ascendc_get_row_q8_0( - 24, ctx.stream(), src0->data, src1->data, dst->data, - ((ggml_tensor*)src0->extra)->ne, - ((ggml_tensor*)src1->extra)->ne, - ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne, - ((ggml_tensor*)dst->extra)->nb); + case GGML_TYPE_Q8_0: { + // add 1 dim for bcast mul. 
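+ // Layout assumed here for Q8_0 data in the CANN buffer: all int8 quants are
+ // stored contiguously, followed by one f16 scale per QK8_0 block starting at
+ // `scale_offset`. Viewing the quants as [..., ne0/QK8_0, QK8_0] and the
+ // scales as [..., ne0/QK8_0, 1] lets a single broadcast multiply (aclnn_mul)
+ // dequantize the whole tensor to f32 before the embedding (row gather) step.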
+ size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1], + dequant_nb[GGML_MAX_DIMS + 1]; + int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1], + *dequant_ne; + int64_t scale_offset = 0; + + // [3,4,5,64] -> [3,4,5,2,32] + weight_ne[0] = QK8_0; + weight_ne[1] = src0->ne[0] / QK8_0; + weight_nb[0] = sizeof(int8_t); + weight_nb[1] = weight_nb[0] * weight_ne[0]; + for (int i = 2; i < GGML_MAX_DIMS + 1; i++) { + weight_ne[i] = src0->ne[i - 1]; + weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1]; + } + + // [3,4,5,64] -> [3,4,5,2,1] + scale_ne[0] = 1; + scale_ne[1] = src0->ne[0] / QK8_0; + scale_nb[0] = sizeof(uint16_t); + scale_nb[1] = scale_nb[0] * scale_ne[0]; + for (int i = 2; i < GGML_MAX_DIMS + 1; i++) { + scale_ne[i] = src0->ne[i - 1]; + scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1]; + } + + // [3,4,5,64] -> [3,4,5,2,32] + dequant_ne = weight_ne; + dequant_nb[0] = sizeof(float_t); + for (int i = 1; i < GGML_MAX_DIMS + 1; i++) { + dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1]; + } + + scale_offset = ggml_nelements(src0) * sizeof(int8_t); + ggml_cann_pool_alloc dequant_buffer_allocator( + ctx.pool(), ggml_nelements(src0) * sizeof(float_t)); + + aclTensor* acl_weight_tensor = ggml_cann_create_tensor( + src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb, + GGML_MAX_DIMS + 1); + aclTensor* acl_scale_tensor = ggml_cann_create_tensor( + src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb, + GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset); + aclTensor* dequant_tensor = ggml_cann_create_tensor( + dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t), + dequant_ne, dequant_nb, GGML_MAX_DIMS + 1); + + aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor); + dequant_nb[0] = sizeof(float_t); + dequant_ne = src0->ne; + for (int i = 1; i < GGML_MAX_DIMS; i++) { + dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1]; + } + + aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(), + dequant_ne, dequant_nb, src1, dst); + + ACL_CHECK(aclDestroyTensor(dequant_tensor)); break; + } default: - GGML_ABORT("fatal error"); + GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS"); break; } } @@ -2797,8 +2812,8 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx, ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize( acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr, - nullptr, nullptr, nullptr, antiquantGroupSize, acl_output_tensor, - &workspaceSize, &executor)); + nullptr, nullptr, nullptr, antiquantGroupSize, + acl_output_tensor, &workspaceSize, &executor)); if (workspaceAddr == nullptr) { workspaceAddr = workspace_allocator.alloc(workspaceSize); } diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp index 68cd9920d1ace..da75f77f511a8 100644 --- a/ggml/src/ggml-cann/ggml-cann.cpp +++ b/ggml/src/ggml-cann/ggml-cann.cpp @@ -1704,7 +1704,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, switch (op->src[0]->type) { case GGML_TYPE_F32: case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: case GGML_TYPE_Q8_0: return true; default: @@ -1712,16 +1711,21 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, } } break; case GGML_OP_CPY: { - switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: - return true; - default: - return false; + ggml_tensor *src = op->src[0]; + if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) || + (src->type != GGML_TYPE_F32 && + src->type != GGML_TYPE_F16)) { + // 
only support F32 and F16. + return false; } - } + + if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) { + // unsupport dst is not contiguous. + return false; + } + + return true; + } break; case GGML_OP_CONT: { // TODO: support GGML_TYPE_BF16 switch (op->src[0]->type) { @@ -1762,9 +1766,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev, } return true; } + case GGML_OP_DUP: case GGML_OP_IM2COL: case GGML_OP_CONCAT: - case GGML_OP_DUP: case GGML_OP_REPEAT: case GGML_OP_NONE: case GGML_OP_RESHAPE: diff --git a/ggml/src/ggml-cann/kernels/CMakeLists.txt b/ggml/src/ggml-cann/kernels/CMakeLists.txt deleted file mode 100644 index d687220c3c57e..0000000000000 --- a/ggml/src/ggml-cann/kernels/CMakeLists.txt +++ /dev/null @@ -1,30 +0,0 @@ -file(GLOB SRC_FILES - get_row_f32.cpp - get_row_f16.cpp - get_row_q4_0.cpp - get_row_q8_0.cpp - quantize_f32_q8_0.cpp - quantize_f16_q8_0.cpp - quantize_float_to_q4_0.cpp - dup.cpp -) - -set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR}) -set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim") - -if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) - set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) -elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) - set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/ascendc_devkit/tikcpp/samples/cmake) -else() - message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the compiler package is installed.") -endif() -include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) - -ascendc_library(ascendc_kernels STATIC - ${SRC_FILES} -) - -message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.") -ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}") -# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP) diff --git a/ggml/src/ggml-cann/kernels/ascendc_kernels.h b/ggml/src/ggml-cann/kernels/ascendc_kernels.h deleted file mode 100644 index 7e153208cfdbc..0000000000000 --- a/ggml/src/ggml-cann/kernels/ascendc_kernels.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef ASCENDC_KERNELS_H -#define ASCENDC_KERNELS_H - -#include "aclrtlaunch_ascendc_get_row_f32.h" -#include "aclrtlaunch_ascendc_get_row_f16.h" -#include "aclrtlaunch_ascendc_get_row_q8_0.h" -#include "aclrtlaunch_ascendc_get_row_q4_0.h" - -#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h" -#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h" -#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h" -#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h" - -#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h" -#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h" -#include "aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16.h" -#include "aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32.h" - -#endif // ASCENDC_KERNELS_H diff --git a/ggml/src/ggml-cann/kernels/dup.cpp b/ggml/src/ggml-cann/kernels/dup.cpp deleted file mode 100644 index d9b9574494b72..0000000000000 --- a/ggml/src/ggml-cann/kernels/dup.cpp +++ /dev/null @@ -1,234 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; - -#define BUFFER_NUM 2 -const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template - -template -class DupByRows { - public: - __aicore__ inline DupByRows() {} - __aicore__ inline void init(GM_ADDR src, GM_ADDR dst, int64_t *input_ne_ub, - size_t *input_nb_ub) { - /* Dup by rows when src is contigous on first 
dimension and dst is - contiguous, each kernel process one row. - */ - - // Input has four dims. - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - // param - num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3]; - num_elem = input_ne_ub[0]; - - // index for (ne[1], ne[2], ne[3]): (idx_ne1, idx_ne2, idx_ne3) - idx_ne3 = op_block_idx / (input_ne_ub[1] * input_ne_ub[2]); - idx_ne2 = (op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2])) - / (input_ne_ub[1]); - idx_ne1 = op_block_idx - idx_ne3 * (input_ne_ub[1] * input_ne_ub[2]) - - idx_ne2 * input_ne_ub[1]; - - // src may not contiguous in dim [1,2,3], so stride decited by ne&nb - src_stride = input_nb_ub[3] * idx_ne3 + input_nb_ub[2] * idx_ne2 - + input_nb_ub[1] * idx_ne1; - - // dst is contiguous - dst_stride = op_block_idx * (input_ne_ub[0] * sizeof(DST_T)); - - src_gm.SetGlobalBuffer(reinterpret_cast<__gm__ SRC_T *>(src + - src_stride)); - dst_gm.SetGlobalBuffer(reinterpret_cast<__gm__ DST_T *>(dst + - dst_stride)); - - pipe.InitBuffer(src_queue, BUFFER_NUM, (sizeof(SRC_T) * num_elem + - 32 - 1) / 32 * 32); - pipe.InitBuffer(dst_queue, BUFFER_NUM, (sizeof(DST_T) * num_elem + - 32 - 1) / 32 * 32); - } - - __aicore__ inline void copy_in() { - LocalTensor src_local = src_queue.AllocTensor(); - const size_t elem_per_block = 32 / sizeof(SRC_T); - size_t tail = num_elem % elem_per_block; - size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem; - DataCopy(src_local, src_gm, cpy_elements_len); - src_queue.EnQue(src_local); - } - - __aicore__ inline void copy_out() { - LocalTensor dst_local = dst_queue.DeQue(); -#ifdef ASCEND_310P - const size_t elem_per_block = 32 / sizeof(DST_T); - size_t tail = num_elem % elem_per_block; - size_t len = num_elem & ~(elem_per_block - 1); - if (len > 0) { - DataCopy(dst_gm, dst_local, len); - } - if(tail != 0) { - for (size_t i = tail; i < elem_per_block; i++) { - dst_local[len + i].SetValue(0, 0); - } - SetAtomicAdd(); - DataCopy(dst_gm[len], dst_local[len], elem_per_block); - SetAtomicNone(); - } -#else - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = num_elem * sizeof(DST_T); - DataCopyPad(dst_gm, dst_local, dataCopyParams); -#endif - dst_queue.FreeTensor(dst_local); - } - - __aicore__ inline void dup() { - // main process, copy one row data from src to dst. - copy_in(); - - LocalTensor src_local = src_queue.DeQue(); - LocalTensor dst_local = dst_queue.AllocTensor(); - - int32_t BLOCK_NUM = 32 / sizeof(DST_T); - DataCopy(dst_local, src_local, (num_elem + BLOCK_NUM - 1) - / BLOCK_NUM * BLOCK_NUM); - dst_queue.EnQue(dst_local); - - src_queue.FreeTensor(src_local); - copy_out(); - } - - __aicore__ inline void dup_with_cast() { - // main process, copy one row data from src to dst. - // cast dtype from src to dst. 
- copy_in(); - - LocalTensor src_local = src_queue.DeQue(); - LocalTensor dst_local = dst_queue.AllocTensor(); - - Cast(dst_local, src_local, RoundMode::CAST_NONE, num_elem); - dst_queue.EnQue(dst_local); - - src_queue.FreeTensor(src_local); - copy_out(); - } - - private: - - TPipe pipe; - GlobalTensor src_gm; - GlobalTensor dst_gm; - - int64_t num_rows; - int64_t num_elem; - int64_t idx_ne3; - int64_t idx_ne2; - int64_t idx_ne1; - int64_t src_stride; - int64_t dst_stride; - - TQue src_queue; - TQue dst_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup(); -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup(); -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp32_to_fp16( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup_with_cast(); -} - -extern "C" __global__ __aicore__ void ascendc_dup_by_rows_fp16_to_fp32( - GM_ADDR src_gm, - GM_ADDR dst_gm, - GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, - GM_ADDR output_ne_gm, - GM_ADDR output_nb_gm) { - - // copy params from gm to ub. - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - DupByRows op; - op.init(src_gm, dst_gm, input_ne_ub, input_nb_ub); - op.dup_with_cast(); -} diff --git a/ggml/src/ggml-cann/kernels/get_row_f16.cpp b/ggml/src/ggml-cann/kernels/get_row_f16.cpp deleted file mode 100644 index 416b45104de5b..0000000000000 --- a/ggml/src/ggml-cann/kernels/get_row_f16.cpp +++ /dev/null @@ -1,197 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. 
-using namespace AscendC; - -#define BUFFER_NUM 2 - -class GET_ROW_F16 { - public: - __aicore__ inline GET_ROW_F16() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *indices_ne_ub, size_t *indices_nb_ub, - int64_t *output_ne_ub, size_t *output_nb_ub) { - // TODO, use template for F16/f32 - int64_t op_block_num = GetBlockNum(); - op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. - uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ half *)input); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - uint64_t input_local_buffer_size = ((input_ne[0] * sizeof(half) + 31) - & ~31); - uint64_t output_local_buffer_size = ((input_ne[0] * sizeof(float) + 31) - & ~31); - - local_buffer_elems = input_local_buffer_size / sizeof(half); - - // TODO, consider long row that can't put in UB. - // All data should asign to 32. It's ok because all data is align to 32. - pipe.InitBuffer(input_queue, BUFFER_NUM, input_local_buffer_size); - pipe.InitBuffer(output_queue, BUFFER_NUM, output_local_buffer_size); - } - - __aicore__ inline void copy_in(uint32_t offset, size_t len) { - size_t origin_len = len; - LocalTensor input_local = input_queue.AllocTensor(); - const size_t elem_per_block = 32 / sizeof(half); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if(tail != 0) { - len += elem_per_block; - } - DataCopy(input_local, input_gm[offset], len); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset, size_t len) { - LocalTensor output_local = output_queue.DeQue(); - const size_t elem_per_block = 32 / sizeof(float); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if (len > 0) { - DataCopy(output_gm[offset], output_local, len); - } - - if(tail != 0) { -#ifdef ASCEND_310P - for (size_t i = tail; i < elem_per_block; i++) { - output_local[len + i].SetValue(0, 0); - } - SetAtomicAdd(); - DataCopy(output_gm[offset + len], output_local[len], elem_per_block); - SetAtomicNone(); -#else - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(float); - DataCopyPad(output_gm[offset + len], output_local[len], - dataCopyParams); -#endif - } - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_row(int64_t idx) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * 
indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3]; - - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3]; - - copy_in(input_offset, input_ne[0]); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - - Cast(output_local, input_local, RoundMode::CAST_NONE, - local_buffer_elems); - output_queue.EnQue(output_local); - copy_out(output_offset, input_ne[0]); - - input_queue.FreeTensor(input_local); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - calculate_row(i); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - size_t local_buffer_elems; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - int64_t op_block_idx; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_f16( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, - GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_F16 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, - indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} diff --git a/ggml/src/ggml-cann/kernels/get_row_f32.cpp b/ggml/src/ggml-cann/kernels/get_row_f32.cpp deleted file mode 100644 index 02116905b18e4..0000000000000 --- a/ggml/src/ggml-cann/kernels/get_row_f32.cpp +++ /dev/null @@ -1,190 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; - -#define BUFFER_NUM 2 - -class GET_ROW_F32 { - public: - __aicore__ inline GET_ROW_F32() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *indices_ne_ub, size_t *indices_nb_ub, - int64_t *output_ne_ub, size_t *output_nb_ub) { - int64_t op_block_num = GetBlockNum(); - op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. 
- uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ float *)input); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - uint64_t local_buffer_size = ((input_ne[0] * sizeof(float) + 31) & ~31); - local_buffer_elems = local_buffer_size / sizeof(float); - - // TODO, consider long row that can't put in UB. - // All data should asign to 32. It's ok because all data is align to 32. - pipe.InitBuffer(input_queue, BUFFER_NUM, local_buffer_size); - pipe.InitBuffer(output_queue, BUFFER_NUM, local_buffer_size); - } - - __aicore__ inline void copy_in(uint32_t offset, size_t len) { - LocalTensor input_local = input_queue.AllocTensor(); - const size_t elem_per_block = 32 / sizeof(float); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if(tail != 0) { - len += elem_per_block; - } - DataCopy(input_local, input_gm[offset], len); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset, size_t len) { - LocalTensor output_local = output_queue.DeQue(); - const size_t elem_per_block = 32 / sizeof(float); - size_t tail = len % elem_per_block; - len = len & ~(elem_per_block - 1); - if (len > 0) { - DataCopy(output_gm[offset], output_local, len); - } - - if(tail != 0) { -#ifdef ASCEND_310P - for (size_t i = tail; i < elem_per_block; i++) { - output_local[len + i].SetValue(0, 0); - } - SetAtomicAdd(); - DataCopy(output_gm[offset + len], output_local[len], elem_per_block); - SetAtomicNone(); -#else - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = tail * sizeof(float); - DataCopyPad(output_gm[offset + len], output_local[len], - dataCopyParams); -#endif - } - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_row(int64_t idx) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3]; - - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3]; - - copy_in(input_offset, input_ne[0]); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - - DataCopy(output_local, input_local, local_buffer_elems); - output_queue.EnQue(output_local); - copy_out(output_offset, input_ne[0]); - - input_queue.FreeTensor(input_local); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - calculate_row(i); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - size_t 
local_buffer_elems; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - int64_t op_block_idx; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_f32( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR input_nb_gm, GM_ADDR indices_ne_gm, - GM_ADDR indices_nb_gm, GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_F32 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, input_nb_ub, - indices_ne_ub, indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} diff --git a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp deleted file mode 100644 index 4fbe722086cf0..0000000000000 --- a/ggml/src/ggml-cann/kernels/get_row_q4_0.cpp +++ /dev/null @@ -1,204 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; -#ifdef ASCEND_310P // 310P not support 4bit get row - extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, - GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support 4bit get row.\n"); - } -#else - -#define BUFFER_NUM 2 - -#define QK4_0 32 - -class GET_ROW_Q4_0 { - public: - __aicore__ inline GET_ROW_Q4_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, int64_t *indices_ne_ub, - size_t *indices_nb_ub, int64_t *output_ne_ub, - size_t *output_nb_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - scale_ne[i] = input_ne_ub[i]; - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // one scale for a group. - scale_ne[0] /= QK4_0; - - input_stride[0] = 1; - scale_stride[0] = 1; - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - group_size_in_row = input_ne[0] / QK4_0; - int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * - input_ne[3] / 2; - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. 
- uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ int4b_t *)input); - scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK4_0 * sizeof(int4b_t)); - pipe.InitBuffer(cast_queue, BUFFER_NUM, QK4_0 * sizeof(half)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK4_0 * sizeof(float)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - // 32 * sizeof(int4b_t) = 16, which is not aligned to 32, why no error? - DataCopy(input_local, input_gm[offset], QK4_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK4_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_group(int64_t idx, int64_t group) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3] + - group * QK4_0; - const int64_t scale_offset = selected_row_idx * scale_stride[1] + - indices_ne1_idx * scale_stride[2] + - indices_ne2_idx * scale_stride[3] + group; - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3] + - group * QK4_0; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor cast_local = cast_queue.AllocTensor(); - LocalTensor output_local = output_queue.AllocTensor(); - - // TODO: cast more data to speed up. - Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0); - Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0); - - // Only mul need compile by group. 
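        // Dequantization of one q4_0 group: the 32 signed int4 weights share a
        // single half-precision scale d, so after the two casts above each value
        // is recovered as x = q * d by the Muls below.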
- half scale = scale_gm.GetValue(scale_offset); - - Muls(output_local, output_local, (float)scale, QK4_0); - - input_queue.FreeTensor(input_local); - cast_queue.FreeTensor(cast_local); - output_queue.EnQue(output_local); - - copy_out(output_offset); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - calculate_group(i, j); - } - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t scale_ne[4]; - size_t scale_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t ir; - int64_t dr; - - int64_t group_size_in_row; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue cast_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_q4_0( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, - GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_Q4_0 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, - indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp b/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp deleted file mode 100644 index ba9ab3c04832f..0000000000000 --- a/ggml/src/ggml-cann/kernels/get_row_q8_0.cpp +++ /dev/null @@ -1,191 +0,0 @@ -#include "kernel_operator.h" - -// optimize me. Use template to avoid copy code. -using namespace AscendC; - -#define BUFFER_NUM 2 - -#define QK8_0 32 - -class GET_ROW_Q8_0 { - public: - __aicore__ inline GET_ROW_Q8_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR indices, GM_ADDR output, - int64_t *input_ne_ub, int64_t *indices_ne_ub, - size_t *indices_nb_ub, int64_t *output_ne_ub, - size_t *output_nb_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - indices_ne[i] = indices_ne_ub[i]; - indices_stride[i] = indices_nb_ub[i] / indices_nb_ub[0]; - scale_ne[i] = input_ne_ub[i]; - output_ne[i] = output_ne_ub[i]; - output_stride[i] = output_nb_ub[i] / output_nb_ub[0]; - } - - // one scale for a group. - scale_ne[0] /= QK8_0; - - input_stride[0] = 1; - scale_stride[0] = 1; - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - input_stride[i] = input_stride[i - 1] * input_ne[i - 1]; - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - group_size_in_row = input_ne[0] / QK8_0; - int64_t scale_offset = input_ne[0] * input_ne[1] * input_ne[2] * - input_ne[3] * sizeof(int8_t); - - // Indices has two dims. n_elements = all rows should get. - // dr, all rows should this thread get. 
- uint64_t n_elements = - indices_ne[0] * indices_ne[1] * indices_ne[2] * indices_ne[3]; - dr = n_elements / op_block_num; - - uint64_t tails = n_elements % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - input_gm.SetGlobalBuffer((__gm__ int8_t *)input); - scale_gm.SetGlobalBuffer((__gm__ half *)(input + scale_offset)); - indices_gm.SetGlobalBuffer((__gm__ int32_t *)indices); - output_gm.SetGlobalBuffer((__gm__ float *)output); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); - pipe.InitBuffer(cast_queue, BUFFER_NUM, QK8_0 * sizeof(half)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(float)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], QK8_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK8_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void calculate_group(int64_t idx, int64_t group) { - const int64_t indices_ne2_idx = idx / (indices_ne[0] * indices_ne[1]); - const int64_t indices_ne1_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1]) / - indices_ne[0]; - const int64_t indices_ne0_idx = - (idx - indices_ne2_idx * indices_ne[0] * indices_ne[1] - - indices_ne1_idx * indices_ne[0]); - - const int64_t indices_offset = indices_ne0_idx * indices_stride[0] + - indices_ne1_idx * indices_stride[1] + - indices_ne2_idx * indices_stride[2]; - const int32_t selected_row_idx = indices_gm.GetValue(indices_offset); - - const int64_t input_offset = selected_row_idx * input_stride[1] + - indices_ne1_idx * input_stride[2] + - indices_ne2_idx * input_stride[3] + - group * QK8_0; - const int64_t scale_offset = selected_row_idx * scale_stride[1] + - indices_ne1_idx * scale_stride[2] + - indices_ne2_idx * scale_stride[3] + group; - const int64_t output_offset = indices_ne0_idx * output_stride[1] + - indices_ne1_idx * output_stride[2] + - indices_ne2_idx * output_stride[3] + - group * QK8_0; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor cast_local = cast_queue.AllocTensor(); - LocalTensor output_local = output_queue.AllocTensor(); - - // TODO: cast more data to speed up. - Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); - Cast(output_local, cast_local, RoundMode::CAST_NONE, QK8_0); - - // Only mul need compile by group. 
- half scale = scale_gm.GetValue(scale_offset); - Muls(output_local, output_local, (float)scale, QK8_0); - - input_queue.FreeTensor(input_local); - cast_queue.FreeTensor(cast_local); - output_queue.EnQue(output_local); - - copy_out(output_offset); - } - - __aicore__ inline void calculate() { - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - calculate_group(i, j); - } - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t scale_ne[4]; - size_t scale_stride[4]; - - int64_t indices_ne[4]; - size_t indices_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t ir; - int64_t dr; - - int64_t group_size_in_row; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor indices_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue cast_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_get_row_q8_0( - GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm, - GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm, - GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) { - int64_t input_ne_ub[4]; - int64_t indices_ne_ub[4]; - size_t indices_nb_ub[4]; - int64_t output_ne_ub[4]; - size_t output_nb_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(indices_ne_gm, indices_ne_ub, 32); - copy_to_ub(indices_nb_gm, indices_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - copy_to_ub(output_nb_gm, output_nb_ub, 32); - - GET_ROW_Q8_0 op; - op.init(input_gm, indices_gm, output_gm, input_ne_ub, indices_ne_ub, - indices_nb_ub, output_ne_ub, output_nb_ub); - op.calculate(); -} diff --git a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp deleted file mode 100644 index 504b43afaa1f4..0000000000000 --- a/ggml/src/ggml-cann/kernels/quantize_f16_q8_0.cpp +++ /dev/null @@ -1,218 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; -#ifdef ASCEND_310P - extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f16->8bit quantization.\n"); - } -#else - -#define BUFFER_NUM 2 -#define QK8_0 32 - -class QUANTIZE_F16_Q8_0 { - public: - __aicore__ inline QUANTIZE_F16_Q8_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *output_ne_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - } - - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; - } - - scale_ne = input_ne; - scale_stride[0] = 1; - scale_stride[1] = input_ne[0] / QK8_0; - for (int i = 2; i < 4; i++) { - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - // split input tensor by rows. 
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; - dr = nr / op_block_num; - - uint64_t tails = nr % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - group_size_in_row = scale_stride[1]; - int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * - output_ne[3] * sizeof(uint8_t); - - input_gm.SetGlobalBuffer((__gm__ half *)input); - output_gm.SetGlobalBuffer((__gm__ int8_t *)output); - scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + ir * - group_size_in_row * - sizeof(half))); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(half)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); - pipe.InitBuffer(work_queue, 1, 32); - pipe.InitBuffer(max_queue, 1, 32); - pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); - pipe.InitBuffer(scale_queue, 1, 32); - pipe.InitBuffer(cast_queue ,1 ,QK8_0 * sizeof(float)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], QK8_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK8_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline half calculate_group(int64_t row, int64_t group) { - const int64_t i3 = row / (input_ne[1] * input_ne[2]); - const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; - const int64_t i1 = - row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; - - const int64_t input_offset = i1 * input_stride[1] + - i2 * input_stride[2] + - i3 * input_stride[3] + QK8_0 * group; - - const int64_t output_offset = i1 * output_stride[1] + - i2 * output_stride[2] + - i3 * output_stride[3] + QK8_0 * group; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - LocalTensor work_local = work_queue.AllocTensor(); - LocalTensor abs_local = abs_queue.AllocTensor(); - LocalTensor max_local = max_queue.AllocTensor(); - LocalTensor cast_local = cast_queue.AllocTensor(); - - Cast(cast_local, input_local, RoundMode::CAST_NONE, QK8_0); - Abs(abs_local, cast_local, QK8_0); - ReduceMax(max_local, abs_local, work_local, QK8_0); - - pipe_barrier(PIPE_ALL); - float d = max_local.GetValue(0); - d = d / ((1 << 7) - 1); - if (d != 0) { - Muls(cast_local, cast_local, 1.0f / d, QK8_0); - } - - Cast(cast_local, cast_local, RoundMode::CAST_ROUND, QK8_0); - Cast(input_local, cast_local, RoundMode::CAST_ROUND, QK8_0); - Cast(output_local, input_local, RoundMode::CAST_ROUND, QK8_0); - output_queue.EnQue(output_local); - copy_out(output_offset); - - input_queue.FreeTensor(input_local); - work_queue.FreeTensor(work_local); - abs_queue.FreeTensor(abs_local); - max_queue.FreeTensor(max_local); - cast_queue.FreeTensor(cast_local); - return (half)d; - } - - __aicore__ inline void calculate() { - LocalTensor scale_local = scale_queue.AllocTensor(); - uint32_t scale_local_offset = 0; - uint32_t scale_global_offset = 0; - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - half scale = calculate_group(i, j); - scale_local.SetValue(scale_local_offset++, scale); - if (scale_local_offset == 16) { - scale_local_offset = 0; - // TODO: OPTIMIZE ME - pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, 16); - 
pipe_barrier(PIPE_ALL); - scale_global_offset += 16; - } - } - } - - if (scale_local_offset != 0) { - pipe_barrier(PIPE_ALL); - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = scale_local_offset * sizeof(half); - DataCopyPad(scale_gm[scale_global_offset], scale_local, - dataCopyParams); - pipe_barrier(PIPE_ALL); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t *scale_ne; - size_t scale_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t group_size_in_row; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue work_queue; - TQue max_queue; - TQue abs_queue; - TQue scale_queue; - TQue cast_queue; - -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_F16_Q8_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp b/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp deleted file mode 100644 index 05b0bc1df59af..0000000000000 --- a/ggml/src/ggml-cann/kernels/quantize_f32_q8_0.cpp +++ /dev/null @@ -1,216 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; -#ifdef ASCEND_310P // 310P not support f32->8bit quantization - extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f32->8bit quantization.\n"); - } -#else - -#define BUFFER_NUM 2 -#define QK8_0 32 - -class QUANTIZE_F32_Q8_0 { - public: - __aicore__ inline QUANTIZE_F32_Q8_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *output_ne_ub) { - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - - output_ne[i] = output_ne_ub[i]; - } - - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; - } - - scale_ne = input_ne; - scale_stride[0] = 1; - scale_stride[1] = input_ne[0] / QK8_0; - for (int i = 2; i < 4; i++) { - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - // split input tensor by rows. 
- uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; - dr = nr / op_block_num; - - uint64_t tails = nr % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - group_size_in_row = scale_stride[1]; - int64_t output_size = output_ne[0] * output_ne[1] * output_ne[2] * - output_ne[3] * sizeof(uint8_t); - - input_gm.SetGlobalBuffer((__gm__ float *)input); - output_gm.SetGlobalBuffer((__gm__ int8_t *)output); - scale_gm.SetGlobalBuffer((__gm__ half *)(output + output_size + - ir * group_size_in_row * - sizeof(half))); - - pipe.InitBuffer(input_queue, BUFFER_NUM, QK8_0 * sizeof(float)); - pipe.InitBuffer(output_queue, BUFFER_NUM, QK8_0 * sizeof(int8_t)); - pipe.InitBuffer(work_queue, 1, 32); - pipe.InitBuffer(max_queue, 1, 32); - pipe.InitBuffer(abs_queue, 1, QK8_0 * sizeof(float)); - pipe.InitBuffer(cast_queue, 1, QK8_0 * sizeof(half)); - pipe.InitBuffer(scale_queue, 1, 32); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], QK8_0); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - LocalTensor output_local = output_queue.DeQue(); - DataCopy(output_gm[offset], output_local, QK8_0); - output_queue.FreeTensor(output_local); - } - - __aicore__ inline half calculate_group(int64_t row, int64_t group) { - const int64_t i3 = row / (input_ne[1] * input_ne[2]); - const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; - const int64_t i1 = - row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; - - const int64_t input_offset = i1 * input_stride[1] + - i2 * input_stride[2] + - i3 * input_stride[3] + QK8_0 * group; - - const int64_t output_offset = i1 * output_stride[1] + - i2 * output_stride[2] + - i3 * output_stride[3] + QK8_0 * group; - - copy_in(input_offset); - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - LocalTensor work_local = work_queue.AllocTensor(); - LocalTensor abs_local = abs_queue.AllocTensor(); - LocalTensor max_local = max_queue.AllocTensor(); - LocalTensor cast_local = cast_queue.AllocTensor(); - - Abs(abs_local, input_local, QK8_0); - ReduceMax(max_local, abs_local, work_local, QK8_0); - pipe_barrier(PIPE_ALL); - float d = max_local.GetValue(0); - d = d / ((1 << 7) - 1); - if (d != 0) { - Muls(input_local, input_local, 1.0f / d, QK8_0); - } - - Cast(input_local, input_local, RoundMode::CAST_ROUND, QK8_0); - Cast(cast_local, input_local, RoundMode::CAST_ROUND, QK8_0); - Cast(output_local, cast_local, RoundMode::CAST_ROUND, QK8_0); - output_queue.EnQue(output_local); - copy_out(output_offset); - - input_queue.FreeTensor(input_local); - work_queue.FreeTensor(work_local); - abs_queue.FreeTensor(abs_local); - max_queue.FreeTensor(max_local); - cast_queue.FreeTensor(cast_local); - - return (half)d; - } - - __aicore__ inline void calculate() { - LocalTensor scale_local = scale_queue.AllocTensor(); - uint32_t scale_local_offset = 0; - uint32_t scale_global_offset = 0; - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - half scale = calculate_group(i, j); - scale_local.SetValue(scale_local_offset++, scale); - if (scale_local_offset == 16) { - scale_local_offset = 0; - // TODO: OPTIMIZE ME - pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, 16); - pipe_barrier(PIPE_ALL); - scale_global_offset += 16; - } - } - } - - if 
(scale_local_offset != 0) { - pipe_barrier(PIPE_ALL); - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = scale_local_offset * sizeof(half); - DataCopyPad(scale_gm[scale_global_offset], scale_local, - dataCopyParams); - pipe_barrier(PIPE_ALL); - } - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t *scale_ne; - size_t scale_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t group_size_in_row; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue work_queue; - TQue max_queue; - TQue abs_queue; - TQue cast_queue; - TQue scale_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_F32_Q8_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P diff --git a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp b/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp deleted file mode 100644 index 1188937b74461..0000000000000 --- a/ggml/src/ggml-cann/kernels/quantize_float_to_q4_0.cpp +++ /dev/null @@ -1,295 +0,0 @@ -#include "kernel_operator.h" - -using namespace AscendC; -#ifdef ASCEND_310P // 310P not support float->4bit quantization - extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. - printf("Ascend310P not support f32->4bit quantization.\n"); - } - - extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - // let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed. 
- printf("Ascend310P not support f16->4bit quantization.\n"); - } -#else - -#define BUFFER_NUM 2 -#define Group_Size 32 - -template -class QUANTIZE_FLOAT_TO_Q4_0 { - public: - __aicore__ inline QUANTIZE_FLOAT_TO_Q4_0() {} - __aicore__ inline void init(GM_ADDR input, GM_ADDR output, - int64_t *input_ne_ub, size_t *input_nb_ub, - int64_t *output_ne_ub) { - // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4], - // permute=[0,0,0,0]): - // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL - int64_t op_block_num = GetBlockNum(); - int64_t op_block_idx = GetBlockIdx(); - - // input stride of data elements - for (int i = 0; i < 4; i++) { - input_ne[i] = input_ne_ub[i]; - input_stride[i] = input_nb_ub[i] / input_nb_ub[0]; - output_ne[i] = output_ne_ub[i]; - } - - // output stride of data elements - output_stride[0] = 1; - for (int i = 1; i < 4; i++) { - output_stride[i] = output_stride[i - 1] * output_ne[i - 1]; - } - - // scale saved one by one after data:. [group1_scale, group2_scale, ...] - scale_ne = input_ne; - scale_stride[0] = 1; - scale_stride[1] = input_ne[0] / Group_Size; - for (int i = 2; i < 4; i++) { - scale_stride[i] = scale_stride[i - 1] * scale_ne[i - 1]; - } - - // split input tensor by rows. - uint64_t nr = input_ne[1] * input_ne[2] * input_ne[3]; - dr = nr / op_block_num; - - uint64_t tails = nr % op_block_num; - if (op_block_idx < tails) { - dr += 1; - ir = dr * op_block_idx; - } else { - ir = dr * op_block_idx + tails; - } - - group_size_in_row = scale_stride[1]; - int64_t scale_offset = output_ne[0] * output_ne[1] * output_ne[2] * - output_ne[3] * sizeof(uint8_t) / 2; - - input_gm.SetGlobalBuffer((__gm__ SRC_T *)input); - output_gm.SetGlobalBuffer((__gm__ int8_t *)output); - scale_gm.SetGlobalBuffer((__gm__ half *)(output + scale_offset + ir * - group_size_in_row * - sizeof(half))); - - pipe.InitBuffer(input_queue, BUFFER_NUM, Group_Size * sizeof(SRC_T)); - pipe.InitBuffer(output_queue, BUFFER_NUM, - Group_Size * sizeof(int8_t) / 2); - pipe.InitBuffer(cast_queue , 1, Group_Size * sizeof(float)); - pipe.InitBuffer(work_queue, 1, Group_Size * sizeof(float)); - pipe.InitBuffer(max_queue, 1, Group_Size * sizeof(float)); - pipe.InitBuffer(min_queue, 1, Group_Size * sizeof(float)); - pipe.InitBuffer(scale_queue, 1, Group_Size / 2 * sizeof(half)); - pipe.InitBuffer(int8_queue, 1, Group_Size * sizeof(int8_t)); - pipe.InitBuffer(half_queue, 1, Group_Size * sizeof(half)); - } - - __aicore__ inline void copy_in(uint32_t offset) { - LocalTensor input_local = input_queue.AllocTensor(); - DataCopy(input_local, input_gm[offset], Group_Size); - input_queue.EnQue(input_local); - } - - __aicore__ inline void copy_out(uint32_t offset) { - // reinterpretcast Group_Size(32) * int4b_t to Group_Size / 2 * int8_t, - // and using DataCopyPad to avoid 32 bits align. 
- LocalTensor output_local = output_queue.DeQue(); - LocalTensor output_int8_local = - output_local.ReinterpretCast(); - - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = Group_Size / 2 * sizeof(int8_t); - DataCopyPad(output_gm[offset], output_int8_local, dataCopyParams); - - output_queue.FreeTensor(output_local); - } - - __aicore__ inline void input_to_cast(LocalTensor cast_local, - LocalTensor input_local) { - DataCopy(cast_local, input_local, Group_Size); - } - - __aicore__ inline void input_to_cast(LocalTensor cast_local, - LocalTensor input_local) { - Cast(cast_local, input_local, RoundMode::CAST_NONE, Group_Size); - } - - __aicore__ inline half calculate_group(int64_t row, int64_t group) { - const int64_t i3 = row / (input_ne[1] * input_ne[2]); - const int64_t i2 = (row - i3 * input_ne[1] * input_ne[2]) / input_ne[1]; - const int64_t i1 = - row - i3 * input_ne[1] * input_ne[2] - i2 * input_ne[1]; - - const int64_t input_offset = i1 * input_stride[1] + - i2 * input_stride[2] + - i3 * input_stride[3] + Group_Size * group; - - // output_offset is stride for output_gm which datatype is int8_t and - // divided by 2 is needed for int4b_t. - const int64_t output_offset = (i1 * output_stride[1] + - i2 * output_stride[2] + - i3 * output_stride[3] + - Group_Size * group) / 2; - copy_in(input_offset); - - LocalTensor input_local = input_queue.DeQue(); - LocalTensor output_local = output_queue.AllocTensor(); - LocalTensor cast_local = cast_queue.AllocTensor(); - LocalTensor work_local = work_queue.AllocTensor(); - LocalTensor max_local = max_queue.AllocTensor(); - LocalTensor min_local = min_queue.AllocTensor(); - LocalTensor int8_local = int8_queue.AllocTensor(); - LocalTensor half_local = half_queue.AllocTensor(); - - input_to_cast(cast_local, input_local); - - ReduceMax(max_local, cast_local, work_local, Group_Size); - ReduceMin(min_local, cast_local, work_local, Group_Size); - const float max_value = max_local.GetValue(0); - const float min_value = min_local.GetValue(0); - float d = max_value; - if (min_value < 0 && (-1 * min_value) > max_value) { - d = min_value; - } - - d = d / (-8); - if (d != 0) { - Muls(cast_local, cast_local, 1.0f / d, Group_Size); - } - - // range: [-8,8] -> [0.5,16.5] -> [0,16] -> [0,15] -> [-8,7] - float scalar = 8.5f; - Adds(cast_local, cast_local, scalar, Group_Size); - Cast(cast_local, cast_local, RoundMode::CAST_FLOOR, Group_Size); - scalar = 15.0f; - Mins(cast_local, cast_local, scalar, Group_Size); - scalar = -8.0f; - Adds(cast_local, cast_local, scalar, Group_Size); - - // float->half->int4b - Cast(half_local, cast_local, RoundMode::CAST_NONE, Group_Size); - Cast(output_local, half_local, RoundMode::CAST_NONE, Group_Size); - - output_queue.EnQue(output_local); - copy_out(output_offset); - - input_queue.FreeTensor(input_local); - work_queue.FreeTensor(work_local); - max_queue.FreeTensor(max_local); - min_queue.FreeTensor(min_local); - int8_queue.FreeTensor(int8_local); - half_queue.FreeTensor(half_local); - cast_queue.FreeTensor(cast_local); - return (half)d; - } - - __aicore__ inline void calculate() { - LocalTensor scale_local = scale_queue.AllocTensor(); - uint32_t scale_local_offset = 0; - uint32_t scale_global_offset = 0; - for (int64_t i = ir; i < ir + dr; i++) { - for (int64_t j = 0; j < group_size_in_row; j++) { - half scale = calculate_group(i, j); - scale_local.SetValue(scale_local_offset++, scale); - // Copy Group_Size/2 length data each time. 
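                // Scales are buffered and flushed Group_Size / 2 (= 16) halves at a
                // time: 16 * sizeof(half) is exactly 32 bytes, one aligned DataCopy
                // block; any remainder is written out below with DataCopyPad.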
- if (scale_local_offset == Group_Size / 2) { - scale_local_offset = 0; - // TODO: OPTIMIZE ME - pipe_barrier(PIPE_ALL); - DataCopy(scale_gm[scale_global_offset], scale_local, - Group_Size / 2); - pipe_barrier(PIPE_ALL); - scale_global_offset += Group_Size / 2; - } - } - } - - if (scale_local_offset != 0) { - pipe_barrier(PIPE_ALL); - DataCopyExtParams dataCopyParams; - dataCopyParams.blockCount = 1; - dataCopyParams.blockLen = scale_local_offset * sizeof(half); - DataCopyPad(scale_gm[scale_global_offset], scale_local, - dataCopyParams); - pipe_barrier(PIPE_ALL); - } - scale_queue.FreeTensor(scale_local); - } - - private: - int64_t input_ne[4]; - size_t input_stride[4]; - - int64_t *scale_ne; - size_t scale_stride[4]; - - int64_t output_ne[4]; - size_t output_stride[4]; - - int64_t group_size_in_row; - - int64_t ir; - int64_t dr; - - TPipe pipe; - GlobalTensor input_gm; - GlobalTensor scale_gm; - GlobalTensor output_gm; - TQue input_queue; - TQue output_queue; - TQue work_queue; - TQue max_queue; - TQue min_queue; - TQue scale_queue; - TQue cast_queue; - TQue int8_queue; - TQue half_queue; -}; - -template -__aicore__ inline void copy_to_ub(GM_ADDR gm, T *ub, size_t size) { - auto gm_ptr = (__gm__ uint8_t *)gm; - auto ub_ptr = (uint8_t *)(ub); - for (int32_t i = 0; i < size; ++i, ++ub_ptr, ++gm_ptr) { - *ub_ptr = *gm_ptr; - } -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_FLOAT_TO_Q4_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0( - GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm, - GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) { - int64_t input_ne_ub[4]; - size_t input_nb_ub[4]; - int64_t output_ne_ub[4]; - - copy_to_ub(input_ne_gm, input_ne_ub, 32); - copy_to_ub(input_nb_gm, input_nb_ub, 32); - copy_to_ub(output_ne_gm, output_ne_ub, 32); - - QUANTIZE_FLOAT_TO_Q4_0 op; - op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub); - op.calculate(); -} - -#endif // #ifdef ASCEND_310P From 42eb248f46e1175349e553b6eda6cb63027d74d1 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 2 Apr 2025 09:58:34 +0200 Subject: [PATCH 03/11] common : remove json.hpp from common.cpp (#12697) * common : remove json.hpp from common.cpp * fix comment --- common/common.cpp | 28 ---------------------------- common/common.h | 4 ---- examples/server/server.cpp | 11 ++++++----- examples/server/utils.hpp | 29 ++++++++++++++++++++++++++++- 4 files changed, 34 insertions(+), 38 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 22642c84afa40..e7269ead4f94e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -7,9 +7,6 @@ #include "common.h" #include "log.h" -// Change JSON_ASSERT from assert() to GGML_ASSERT: -#define JSON_ASSERT GGML_ASSERT -#include "json.hpp" #include "llama.h" #include @@ -56,8 +53,6 @@ #pragma warning(disable: 4244 4267) // possible loss of data #endif -using json = nlohmann::ordered_json; - // // CPU utils // @@ -1545,26 +1540,3 @@ common_control_vector_data common_control_vector_load(const std::vector -json common_grammar_trigger::to_json() const { - json out { - {"type", 
(int) type}, - {"value", value}, - }; - if (type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { - out["token"] = (int) token; - } - return out; -} - -template <> -common_grammar_trigger common_grammar_trigger::from_json(const json & in) { - common_grammar_trigger out; - out.type = (common_grammar_trigger_type) in.at("type").get(); - out.value = in.at("value").get(); - if (out.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { - out.token = (llama_token) in.at("token").get(); - } - return out; -} diff --git a/common/common.h b/common/common.h index 41ff9905e4416..ea7aef99d918a 100644 --- a/common/common.h +++ b/common/common.h @@ -121,10 +121,6 @@ struct common_grammar_trigger { common_grammar_trigger_type type; std::string value; llama_token token = LLAMA_TOKEN_NULL; - - // T can only be nlohmann::ordered_json - template T to_json() const; - template static common_grammar_trigger from_json(const T & in); }; // sampling parameters diff --git a/examples/server/server.cpp b/examples/server/server.cpp index d140f8c4469c9..760c3646433ad 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -133,7 +133,8 @@ struct slot_params { auto grammar_triggers = json::array(); for (const auto & trigger : sampling.grammar_triggers) { - grammar_triggers.push_back(trigger.to_json()); + server_grammar_trigger ct(std::move(trigger)); + grammar_triggers.push_back(ct.to_json()); } return json { @@ -372,9 +373,9 @@ struct server_task { const auto grammar_triggers = data.find("grammar_triggers"); if (grammar_triggers != data.end()) { for (const auto & t : *grammar_triggers) { - auto ct = common_grammar_trigger::from_json(t); - if (ct.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { - const auto & word = ct.value; + server_grammar_trigger ct(t); + if (ct.value.type == COMMON_GRAMMAR_TRIGGER_TYPE_WORD) { + const auto & word = ct.value.value; auto ids = common_tokenize(vocab, word, /* add_special= */ false, /* parse_special= */ true); if (ids.size() == 1) { auto token = ids[0]; @@ -392,7 +393,7 @@ struct server_task { params.sampling.grammar_triggers.push_back({COMMON_GRAMMAR_TRIGGER_TYPE_WORD, word}); } } else { - params.sampling.grammar_triggers.push_back(ct); + params.sampling.grammar_triggers.push_back(std::move(ct.value)); } } } diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index 58cdd6af92974..55cf3230d90ce 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -58,6 +58,32 @@ static T json_value(const json & body, const std::string & key, const T & defaul const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT); +// thin wrapper around common_grammar_trigger with (de)serialization functions +struct server_grammar_trigger { + common_grammar_trigger value; + + server_grammar_trigger() = default; + server_grammar_trigger(const common_grammar_trigger & value) : value(value) {} + server_grammar_trigger(const json & in) { + value.type = (common_grammar_trigger_type) in.at("type").get(); + value.value = in.at("value").get(); + if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { + value.token = (llama_token) in.at("token").get(); + } + } + + json to_json() const { + json out { + {"type", (int) value.type}, + {"value", value.value}, + }; + if (value.type == COMMON_GRAMMAR_TRIGGER_TYPE_TOKEN) { + out["token"] = (int) value.token; + } + return out; + } +}; + // // tokenizer and input processing utils // @@ -627,7 +653,8 @@ static json oaicompat_completion_params_parse( llama_params["grammar_lazy"] = chat_params.grammar_lazy; auto 
grammar_triggers = json::array(); for (const auto & trigger : chat_params.grammar_triggers) { - grammar_triggers.push_back(trigger.to_json()); + server_grammar_trigger ct(trigger); + grammar_triggers.push_back(ct.to_json()); } llama_params["grammar_triggers"] = grammar_triggers; llama_params["preserved_tokens"] = chat_params.preserved_tokens; From 83a88bd6affbe148a622ac730952ac5b8b585979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 2 Apr 2025 11:21:48 +0200 Subject: [PATCH 04/11] vocab : BailingMoE : change possessive quantifiers to greedy (#12677) --- src/llama-vocab.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 31e2055f83611..521a6ec5edbc9 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -411,7 +411,8 @@ struct llm_tokenizer_bpe : llm_tokenizer { regex_exprs = { // original regex from tokenizer.json // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+" - "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+", + // FIXME? Changed possessive quantifiers (?+ and ++) to greedy to avoid errors and imatrix hanging (tried atomic grouping but it's not supported?) + "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+", }; break; default: From a10b36c91a091f4606710fba4e9327fd71e0e738 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 2 Apr 2025 14:32:59 +0300 Subject: [PATCH 05/11] llama : refactor kv cache guard (#12695) * llama : refactor kv cache guard ggml-ci * cont : fix comment [no ci] * llama : fix kv_cache restore logic ggml-ci * context : simplify kv cache updates ggml-ci * cont : better name [no ci] * llama : fix llama_decode return code when could not find KV slot ggml-ci * context : change log err -> warn [no ci] * kv-cache : add comment + warning --- examples/parallel/parallel.cpp | 2 + src/llama-context.cpp | 59 +++---------------- src/llama-kv-cache.cpp | 69 +++++++++++++++++++--- src/llama-kv-cache.h | 104 ++++++++++++--------------------- 4 files changed, 107 insertions(+), 127 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index e0e6da631dad3..80698518e3102 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -106,6 +106,8 @@ int main(int argc, char ** argv) { common_params params; + params.n_predict = 128; + if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_PARALLEL)) { return 1; } diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 3479a8cca3d64..7d067afbe7399 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1201,33 +1201,7 @@ int llama_context::decode(llama_batch & inp_batch) { const int64_t n_tokens_all = batch.n_tokens; const int64_t n_embd = hparams.n_embd; - // TODO: remove this stuff - class batch_guard { - public: - batch_guard(llama_kv_cache_unified & kv_self) : kv_slot_restorer(kv_self) { - } - - ~batch_guard() { - if (!is_done) { - kv_slot_restorer.restore(); - } - } - - void done() { - is_done = true; - } - - void save(const llama_kv_cache_slot_info & slot_info) { - kv_slot_restorer.save(slot_info); - } - - private: - bool is_done = false; - - llama_kv_slot_restorer kv_slot_restorer; - }; - - batch_guard bg(*kv_self); + llama_kv_cache_guard kv_guard(kv_self.get()); 
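    // Guard contract: if decode() returns early (e.g. when no KV slot can be
    // found), ~llama_kv_cache_guard calls restore() and rolls back any pending
    // cell ranges; the kv_guard.commit() call at the end of decode() clears the
    // pending state so a successfully processed batch is kept.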
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT @@ -1280,6 +1254,9 @@ int llama_context::decode(llama_batch & inp_batch) { return -2; }; + // handle any pending defrags/shifts + kv_self_update(); + int64_t n_outputs_prev = 0; while (sbatch.n_tokens > 0) { @@ -1319,22 +1296,12 @@ int llama_context::decode(llama_batch & inp_batch) { // find KV slot { - kv_self_update(); + if (!kv_self->find_slot(ubatch)) { + LLAMA_LOG_WARN("%s: failed to find KV cache slot for ubatch of size %d\n", __func__, ubatch.n_tokens); - // if we have enough unused cells before the current head -> - // better to start searching from the beginning of the cache, hoping to fill it - if (kv_self->head > kv_self->used + 2*ubatch.n_tokens) { - kv_self->head = 0; + return 1; } - const auto slot_info = kv_self->find_slot(ubatch); - if (!slot_info) { - LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__); - return -3; - } - - bg.save(slot_info); - if (!kv_self->recurrent) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears @@ -1371,16 +1338,6 @@ int llama_context::decode(llama_batch & inp_batch) { } } - // update the kv ring buffer - { - kv_self->head += ubatch.n_tokens; - - // Ensure kv cache head points to a valid index. - if (kv_self->head >= kv_self->size) { - kv_self->head = 0; - } - } - // plot the computation graph in dot format (for debugging purposes) //if (n_past%100 == 0) { // ggml_graph_dump_dot(gf, NULL, "llama.dot"); @@ -1467,7 +1424,7 @@ int llama_context::decode(llama_batch & inp_batch) { } // finalize the batch processing - bg.done(); + kv_guard.commit(); // set output mappings { diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index 14c8933b4d6c4..7ba546c10ff74 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -11,8 +11,6 @@ #include #include -static const llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; - llama_kv_cache_unified::llama_kv_cache_unified(const llama_hparams & hparams, callbacks cbs) : hparams(hparams), cbs(std::move(cbs)) { } @@ -206,6 +204,8 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos return false; } } + + return true; } for (uint32_t i = 0; i < size; ++i) { @@ -446,16 +446,66 @@ void llama_kv_cache_unified::defrag() { } } +void llama_kv_cache_unified::restore() { + if (pending.ranges.empty()) { + return; + } + + // TODO: tmp - move to llama_kv_cache_recurrent + if (recurrent) { + seq_rm(-1, -1, -1); + return; + } + + uint32_t new_head = size; + + for (auto & range : pending.ranges) { + for (uint32_t i = range.c0; i < range.c1; ++i) { + cells[i].seq_id.clear(); + + // keep count of the number of used cells + if (cells[i].pos >= 0) { + used--; + } + + cells[i].pos = -1; + cells[i].src = -1; + } + + new_head = std::min(new_head, range.c0); + } + + if (new_head != size && new_head < head) { + head = new_head; + } +} + +void llama_kv_cache_unified::commit() { + if (pending.ranges.empty()) { + LLAMA_LOG_WARN("%s: no pending KV cache updates to commit - might indicate a bug (ref: %s)\n", + __func__, "https://github.com/ggml-org/llama.cpp/pull/12695"); + return; + } + + pending.ranges.clear(); +} + bool llama_kv_cache_unified::get_can_shift() const { return can_shift; } -llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( +bool llama_kv_cache_unified::find_slot( const llama_ubatch & ubatch) { const uint32_t n_tokens = ubatch.n_tokens; const uint32_t n_seqs = 
ubatch.n_seqs; const uint32_t n_seq_tokens = ubatch.n_seq_tokens; + // if we have enough unused cells before the current head -> + // better to start searching from the beginning of the cache, hoping to fill it + if (head > used + 2*ubatch.n_tokens) { + head = 0; + } + if (recurrent) { // For recurrent state architectures (like Mamba or RWKV), // each cache cell can store the state for a whole sequence. @@ -477,7 +527,7 @@ llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( // too big seq_id // TODO: would it be possible to resize the cache instead? LLAMA_LOG_ERROR("%s: seq_id=%d >= n_seq_max=%d Try using a bigger --parallel value\n", __func__, seq_id, size); - return llama_kv_cache_slot_info_failed; + return false; } if (j > 0) { llama_kv_cell & seq = cells[seq_id]; @@ -616,14 +666,14 @@ llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( [](const llama_kv_cell& cell){ return !cell.is_empty(); }); // sanity check - return llama_kv_cache_slot_info(n >= n_seqs); + return n >= n_seqs; } // otherwise, one cell per token. if (n_tokens > size) { LLAMA_LOG_ERROR("%s: n_tokens = %d > size = %d\n", __func__, n_tokens, size); - return llama_kv_cache_slot_info_failed; + return false; } uint32_t n_tested = 0; @@ -651,7 +701,7 @@ llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( if (n_tested >= size) { //LLAMA_LOG_ERROR("%s: failed to find a slot for %d tokens\n", __func__, n_tokens); - return llama_kv_cache_slot_info_failed; + return false; } } @@ -668,7 +718,9 @@ llama_kv_cache_slot_info llama_kv_cache_unified::find_slot( used += n_tokens; - return llama_kv_cache_slot_info(head, head + n_tokens); + pending.ranges.push_back({head, head + n_tokens}); + + return true; } uint32_t llama_kv_cache_unified::get_padding(const llama_cparams & cparams) const { @@ -1033,6 +1085,7 @@ bool llama_kv_cache_unified::state_read_meta(llama_io_read_i & io, uint32_t cell LLAMA_LOG_ERROR("%s: failed to find available cells in kv cache\n", __func__); return false; } + commit(); // DEBUG CHECK: kv.head should be our first cell, kv.head + cell_count - 1 should be our last cell (verify seq_id and pos values) // Assume that this is one contiguous block of cells diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 0a7ff8a4ea3e6..ff0ba3540d6e2 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -17,6 +17,9 @@ struct llama_ubatch; struct llama_kv_cache : public llama_memory_i { using llama_memory_i::llama_memory_i; + virtual void restore() = 0; // call if batch processing fails - restores the cache state + virtual void commit() = 0; // call after successful batch processing - clears any pending state + virtual int32_t get_n_tokens() const = 0; virtual uint32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache @@ -25,9 +28,24 @@ struct llama_kv_cache : public llama_memory_i { bool get_can_edit() const override { return get_can_shift(); } }; +struct llama_kv_cache_guard { + llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {} + + ~llama_kv_cache_guard() { + kv->restore(); + } + + void commit() { + kv->commit(); + } + +private: + llama_kv_cache * kv; +}; + struct llama_kv_cell { llama_pos pos = -1; - llama_pos delta = 0; + llama_pos delta = 0; int32_t src = -1; // used by recurrent state models to copy states int32_t tail = -1; @@ -46,17 +64,6 @@ struct llama_kv_cell { } }; -// a structure holds information about the slot found in llama_kv_cache_find_slot -struct llama_kv_cache_slot_info { - std::pair boundaries; // slot boundaries [begin, end) - 
bool found = false; // the slot was found - - explicit llama_kv_cache_slot_info(bool found_) : found{found_} {} - llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {} - - operator bool() const { return found; } -}; - // ring-buffer of cached KV data // TODO: pimpl // TODO: add notion of max sequences @@ -93,6 +100,9 @@ class llama_kv_cache_unified : public llama_kv_cache { void clear() override; void defrag() override; + virtual void restore() override; + virtual void commit() override; + bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override; void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override; void seq_keep(llama_seq_id seq_id) override; @@ -105,10 +115,9 @@ class llama_kv_cache_unified : public llama_kv_cache { // find an empty slot of size "n_tokens" in the cache // updates the cache head - // returns a structure holding information about the slot found // Note: On success, it's important that cache.head points // to the first cell of the slot. - llama_kv_cache_slot_info find_slot(const llama_ubatch & batch); + bool find_slot(const llama_ubatch & batch); // TODO: maybe not needed uint32_t get_padding(const llama_cparams & cparams) const; @@ -128,7 +137,19 @@ class llama_kv_cache_unified : public llama_kv_cache { // return true if cells have been moved bool defrag_prepare(int32_t n_max_nodes); - // state save/load + // commit/restore cache + + struct slot_range { + uint32_t c0 = 0; // note: these are cell indices, not sequence positions + uint32_t c1 = 0; + }; + + // pending cell updates that are not yet committed + struct { + std::vector ranges; + } pending; + + // state write/load void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const; void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1); @@ -183,59 +204,6 @@ class llama_kv_cache_unified : public llama_kv_cache { // using llama_kv_cache_unified::llama_kv_cache_unified; //}; -// -// kv cache restore -// - -// saves the kv_cache state for future recovery. -// used to rollback llama_kv_cache_find_slot changes. 
-struct llama_kv_slot_restorer { - struct llama_kv_cache_state { - uint32_t head = 0; - uint32_t n = 0; - } old_state; - - // for non-recurrent models only - // list of slots to restore - std::vector> slot_boundaries; - - bool do_restore = false; - - llama_kv_cache_unified & cache; - - explicit llama_kv_slot_restorer(llama_kv_cache_unified & cache) : cache(cache) { - old_state.head = cache.head; - old_state.n = cache.n; - } - - // saves a slot information for future restoration - void save(const llama_kv_cache_slot_info & slot) { - if (slot) { - do_restore = true; - if (slot.boundaries.first != slot.boundaries.second) { - slot_boundaries.push_back(slot.boundaries); - } - } - } - - // must be explicitly called to restore the kv_cache state - // and rollback changes from all llama_kv_cache_find_slot calls - void restore() { - if (do_restore) { - cache.head = old_state.head; - cache.n = old_state.n; - - if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased - cache.seq_rm(-1, -1, -1); - } else { - for (auto & slot : slot_boundaries) { - cache.seq_rm(-1, slot.first, slot.second); - } - } - } - } -}; - // TODO: maybe become part of the public llama_kv_cache in the future int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv); From e0e912f49b3195ef9d0c51378629ba03c9b972da Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Wed, 2 Apr 2025 14:52:01 +0200 Subject: [PATCH 06/11] llama : add option to override model tensor buffers (#11397) * llama : add option to override tensor buffers * ggml : fix possible underflow in ggml_nbytes --- common/arg.cpp | 40 ++++++++++++++++++++++++++++++++++++++ common/common.cpp | 10 ++++++++++ common/common.h | 1 + ggml/src/ggml.c | 6 ++++++ include/llama.h | 8 ++++++++ src/llama-context.cpp | 3 ++- src/llama-model-loader.cpp | 5 ++++- src/llama-model-loader.h | 8 +++++--- src/llama-model.cpp | 30 ++++++++++++++++++++++++++-- src/llama-model.h | 2 ++ src/llama-quant.cpp | 2 +- src/llama.cpp | 2 +- 12 files changed, 108 insertions(+), 9 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 47c26955ea374..fa22e86cd14e6 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1,6 +1,7 @@ #include "gguf.h" // for reading GGUF splits #include "arg.h" +#include "common.h" #include "log.h" #include "sampling.h" #include "chat.h" @@ -848,6 +849,10 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context params.kv_overrides.back().key[0] = 0; } + if (!params.tensor_buft_overrides.empty()) { + params.tensor_buft_overrides.push_back({nullptr, nullptr}); + } + if (params.reranking && params.embedding) { throw std::invalid_argument("error: either --embedding or --reranking can be specified, but not both"); } @@ -2180,6 +2185,41 @@ common_params_context common_params_parser_init(common_params & params, llama_ex exit(0); } )); + add_opt(common_arg( + {"--override-tensor", "-ot"}, "=,...", + "override tensor buffer type", [](common_params & params, const std::string & value) { + /* static */ std::map buft_list; + if (buft_list.empty()) { + // enumerate all the devices and add their buffer types to the list + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + auto * buft = ggml_backend_dev_buffer_type(dev); + if (buft) { + buft_list[ggml_backend_buft_name(buft)] = buft; + } + } + } + + for (const auto & override : string_split(value, ',')) { + std::string::size_type pos = override.find('='); + if (pos == std::string::npos) { + throw 
std::invalid_argument("invalid value"); + } + std::string tensor_name = override.substr(0, pos); + std::string buffer_type = override.substr(pos + 1); + + if (buft_list.find(buffer_type) == buft_list.end()) { + printf("Available buffer types:\n"); + for (const auto & it : buft_list) { + printf(" %s\n", ggml_backend_buft_name(it.second)); + } + throw std::invalid_argument("unknown buffer type"); + } + // FIXME: this leaks memory + params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)}); + } + } + )); add_opt(common_arg( {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N", "number of layers to store in VRAM", diff --git a/common/common.cpp b/common/common.cpp index e7269ead4f94e..d4882c5123cce 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1042,15 +1042,18 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { if (!params.devices.empty()) { mparams.devices = params.devices.data(); } + if (params.n_gpu_layers != -1) { mparams.n_gpu_layers = params.n_gpu_layers; } + mparams.main_gpu = params.main_gpu; mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; mparams.check_tensors = params.check_tensors; + if (params.kv_overrides.empty()) { mparams.kv_overrides = NULL; } else { @@ -1058,6 +1061,13 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { mparams.kv_overrides = params.kv_overrides.data(); } + if (params.tensor_buft_overrides.empty()) { + mparams.tensor_buft_overrides = NULL; + } else { + GGML_ASSERT(params.tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern"); + mparams.tensor_buft_overrides = params.tensor_buft_overrides.data(); + } + return mparams; } diff --git a/common/common.h b/common/common.h index ea7aef99d918a..725b5123d24f9 100644 --- a/common/common.h +++ b/common/common.h @@ -279,6 +279,7 @@ struct common_params { std::vector in_files; // all input files std::vector antiprompt; // strings upon which more user input is prompted (a.k.a. 
reverse prompts) std::vector kv_overrides; + std::vector tensor_buft_overrides; bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply) std::vector lora_adapters; // lora adapter path with user defined scale diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 161dd3fa94547..3e274d6ae3961 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -1159,6 +1159,12 @@ int64_t ggml_nrows(const struct ggml_tensor * tensor) { } size_t ggml_nbytes(const struct ggml_tensor * tensor) { + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + if (tensor->ne[i] <= 0) { + return 0; + } + } + size_t nbytes; const size_t blck_size = ggml_blck_size(tensor->type); if (blck_size == 1) { diff --git a/include/llama.h b/include/llama.h index 468ab1fa485da..fca2b034ba270 100644 --- a/include/llama.h +++ b/include/llama.h @@ -280,10 +280,18 @@ extern "C" { }; }; + struct llama_model_tensor_buft_override { + const char * pattern; + ggml_backend_buffer_type_t buft; + }; + struct llama_model_params { // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used) ggml_backend_dev_t * devices; + // NULL-terminated list of buffer types to use for tensors that match a pattern + const struct llama_model_tensor_buft_override * tensor_buft_overrides; + int32_t n_gpu_layers; // number of layers to store in VRAM enum llama_split_mode split_mode; // how to split the model across multiple GPUs diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 7d067afbe7399..3927079432d94 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -255,7 +255,8 @@ llama_context::llama_context( model.n_devices() > 1 && model.params.n_gpu_layers > (int) model.hparams.n_layer && model.params.split_mode == LLAMA_SPLIT_MODE_LAYER && - cparams.offload_kqv; + cparams.offload_kqv && + !model.has_tensor_overrides(); // pipeline parallelism requires support for async compute and events in all devices if (pipeline_parallel) { diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index 1be0f2d6d6c20..ec1d78e3144eb 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -445,7 +445,8 @@ llama_model_loader::llama_model_loader( std::vector & splits, bool use_mmap, bool check_tensors, - const struct llama_model_kv_override * param_overrides_p) { + const llama_model_kv_override * param_overrides_p, + const llama_model_tensor_buft_override * param_tensor_buft_overrides_p) { int trace = 0; if (getenv("LLAMA_TRACE")) { trace = atoi(getenv("LLAMA_TRACE")); @@ -457,6 +458,8 @@ llama_model_loader::llama_model_loader( } } + tensor_buft_overrides = param_tensor_buft_overrides_p; + // Load the main GGUF struct ggml_context * ctx = NULL; struct gguf_init_params params = { diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index fe35404b26889..0f52b011b6986 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -77,8 +77,9 @@ struct llama_model_loader { llama_mmaps mappings; - std::map weights_map; - std::unordered_map kv_overrides; + std::map weights_map; + std::unordered_map kv_overrides; + const llama_model_tensor_buft_override * tensor_buft_overrides; gguf_context_ptr meta; std::vector contexts; @@ -95,7 +96,8 @@ struct llama_model_loader { std::vector & splits, // optional, only need if the split does not follow naming scheme bool use_mmap, bool check_tensors, - const struct llama_model_kv_override * param_overrides_p); + const llama_model_kv_override * 
param_overrides_p, + const llama_model_tensor_buft_override * param_tensor_buft_overrides_p); template typename std::enable_if::value, bool>::type diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 8d525e1bec4e0..ca6e3ab2caeb1 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -378,9 +379,12 @@ struct llama_model::impl { layer_dev dev_input = {}; layer_dev dev_output = {}; std::vector dev_layer; + + bool has_tensor_overrides; }; llama_model::llama_model(const llama_model_params & params) : params(params), pimpl(std::make_unique()) { + pimpl->has_tensor_overrides = params.tensor_buft_overrides && params.tensor_buft_overrides[0].pattern; } llama_model::~llama_model() {} @@ -1571,9 +1575,26 @@ bool llama_model::load_tensors(llama_model_loader & ml) { GGML_ABORT("invalid layer %d for tensor %s", info.layer, tn.str().c_str()); } - ggml_backend_buffer_type_t buft = select_weight_buft(hparams, t_meta, op, *buft_list); + ggml_backend_buffer_type_t buft = nullptr; + + // check overrides + if (ml.tensor_buft_overrides) { + std::string tensor_name = tn.str(); + for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) { + std::regex pattern(overrides->pattern); + if (std::regex_search(tensor_name, pattern)) { + LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft)); + buft = overrides->buft; + break; + } + } + } + if (!buft) { - throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str())); + buft = select_weight_buft(hparams, t_meta, op, *buft_list); + if (!buft) { + throw std::runtime_error(format("failed to find a compatible buffer type for tensor %s", tn.str().c_str())); + } } // avoid using a host buffer when using mmap @@ -4151,6 +4172,10 @@ ggml_backend_buffer_type_t llama_model::select_buft(int il) const { }); } +bool llama_model::has_tensor_overrides() const { + return pimpl->has_tensor_overrides; +} + const ggml_tensor * llama_model::get_tensor(const char * name) const { auto it = std::find_if(tensors_by_name.begin(), tensors_by_name.end(), [name](const std::pair & it) { @@ -12319,6 +12344,7 @@ llm_graph_result_ptr llama_model::build_graph( llama_model_params llama_model_default_params() { llama_model_params result = { /*.devices =*/ nullptr, + /*.tensor_buft_overrides =*/ nullptr, /*.n_gpu_layers =*/ 0, /*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER, /*.main_gpu =*/ 0, diff --git a/src/llama-model.h b/src/llama-model.h index f1bf0df3a4ef6..91e6e8725acd2 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -382,6 +382,8 @@ struct llama_model { ggml_backend_buffer_type_t select_buft(int il) const; + bool has_tensor_overrides() const; + const struct ggml_tensor * get_tensor(const char * name) const; // TODO: move this to new llm_arch_model_i interface diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 09eb570779ce5..e3e10fa6cf77f 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -527,7 +527,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: } std::vector splits = {}; - llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides); + llama_model_loader ml(fname_inp, splits, use_mmap, /*check_tensors*/ true, kv_overrides, nullptr); ml.init_mappings(false); // no prefetching llama_model model(llama_model_default_params()); diff --git a/src/llama.cpp b/src/llama.cpp index 
81e1dd1d0873a..d5164720b2196 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -92,7 +92,7 @@ static int llama_model_load(const std::string & fname, std::vector model.t_start_us = tm.t_start_us; try { - llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides); + llama_model_loader ml(fname, splits, params.use_mmap, params.check_tensors, params.kv_overrides, params.tensor_buft_overrides); ml.print_info(); From 833e2b7409211a07df97716998c5002526642652 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 2 Apr 2025 16:38:54 +0300 Subject: [PATCH 07/11] model : print tensor size during load (#12711) * model : print tensor size during load * cont : fix units MB -> MiB Co-authored-by: Diego Devesa --------- Co-authored-by: Diego Devesa --- src/llama-model-loader.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index ec1d78e3144eb..ea73a8a7ba944 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -603,7 +603,9 @@ llama_model_loader::llama_model_loader( if (trace > 0) { const uint16_t sid = w.idx; - LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ]\n", __func__, sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str()); + LLAMA_LOG_INFO("%s: - tensor split %2d: %32s %-8s [ %s ] %8.2f MiB\n", __func__, + sid, ggml_get_name(tensor), ggml_type_name(type), llama_format_tensor_shape(tensor).c_str(), + ggml_nbytes(tensor)/1024.0f/1024.0f); } } From 92e3006bb69dfeb656ccf5c7c1c1efadb03c88c2 Mon Sep 17 00:00:00 2001 From: 0cc4m Date: Wed, 2 Apr 2025 19:12:30 +0200 Subject: [PATCH 08/11] Vulkan: Fix mmq int dot float cache size (#12722) --- ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp | 6 ++---- ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp index 42f81356e8f60..284a35caa68ad 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq.comp @@ -234,9 +234,9 @@ void main() { #endif #if QUANT_AUXF == 1 - FLOAT_TYPE cache_a_dm[TM]; + FLOAT_TYPE cache_a_dm[WMITER * TM]; #else - FLOAT_TYPE_VEC2 cache_a_dm[TM]; + FLOAT_TYPE_VEC2 cache_a_dm[WMITER * TM]; #endif FLOAT_TYPE_VEC2 cache_b_ds[TN]; @@ -247,7 +247,6 @@ void main() { const uint iqs = loadr_a; const uint buf_ib = loadc_a + l; - // Should ds be gated to a single thread? if (iqs == 0) { #if QUANT_AUXF == 1 buf_a_dm[buf_ib] = get_d(ib); @@ -276,7 +275,6 @@ void main() { const uint buf_ib = loadc_b + l; - // Should ds be gated to a single thread? 
if (iqs == 0) { buf_b_ds[buf_ib] = FLOAT_TYPE_VEC2(data_b[ib].ds); } diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp index c4c35e105a7a0..63b15471bd3aa 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mmq_funcs.comp @@ -17,7 +17,7 @@ i32vec2 repack(uint ib, uint iqs) { } ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) { - return ACC_TYPE(da * (float(q_sum) * dsb.x - 8.0 * dsb.y)); + return ACC_TYPE(da * (float(q_sum) * dsb.x - 8.0f * dsb.y)); } #endif @@ -51,7 +51,7 @@ i32vec2 repack(uint ib, uint iqs) { } ACC_TYPE mul_q8_1(int32_t q_sum, float da, vec2 dsb) { - return ACC_TYPE(da * (float(q_sum) * dsb.x - 16.0 * dsb.y)); + return ACC_TYPE(da * (float(q_sum) * dsb.x - 16.0f * dsb.y)); } #endif From be0a0f8cae039e2286f757612accebfb8f21b36e Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 2 Apr 2025 12:40:32 -0500 Subject: [PATCH 09/11] vulkan: Implement grouped query attention in the coopmat2 FA shader (#12559) When adjacent batches of Q share the same batches of K/V, batch them into the same workgroup. For example, when: dst(128,32,1,1) = FA(q(128,1,32,1), k(128,16640,8,1), v(128,16640,8,1)) previously we would run 32 workgroups computing 1 result each, now we will run 8 workgroups computing 4 results each. This doesn't directly translate to better performance (at least when you have >=32 SMs), but in a subsequent change I'll enable split_k which will scale much better with 4x fewer workgroups. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 25 ++++++- .../vulkan-shaders/flash_attn_cm2.comp | 66 ++++++++++++++----- 2 files changed, 71 insertions(+), 20 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index ee0969fe189b4..f60fe33aae18c 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -31,6 +31,7 @@ #define ROUNDUP_POW2(M, N) (((M) + (N) - 1) & ~((N) - 1)) #define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) +static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } #define VK_VENDOR_ID_AMD 0x1002 #define VK_VENDOR_ID_APPLE 0x106b @@ -501,6 +502,8 @@ struct vk_flash_attn_push_constants { uint32_t n_head_log2; float m0; float m1; + + uint32_t gqa_ratio; }; struct vk_op_push_constants { @@ -5402,7 +5405,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx const uint32_t nbm1 = mask ? mask->nb[1] : 0; const uint32_t D = neq0; - const uint32_t N = neq1; + uint32_t N = neq1; const uint32_t KV = nek1; GGML_ASSERT(ne0 == D); @@ -5460,6 +5463,22 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_pipeline pipeline = pipelines[aligned]; assert(pipeline); + uint32_t gqa_ratio = 1; + uint32_t qk_ratio = neq2 / nek2; + uint32_t workgroups_x = (uint32_t)neq1; + uint32_t workgroups_y = (uint32_t)neq2; + uint32_t workgroups_z = (uint32_t)neq3; + + if (N == 1 && qk_ratio > 1 && is_pow2(qk_ratio) && gqa_ratio <= flash_attention_num_small_rows && + qk_ratio * nek2 == neq2 && nek2 == nev2 && neq3 == 1 && nek3 == 1 && nev3 == 1) { + // grouped query attention - make the N dimension equal to gqa_ratio, reduce + // workgroups proportionally in y dimension. The shader will detect gqa_ratio > 1 + // and change addressing calculations to index Q's dimension 2. 
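+        // For the example in the commit message, q(128,1,32,1) with k/v(128,16640,8,1)
+        // gives neq2 = 32 and nek2 = 8, so qk_ratio = 4: N becomes 4 and workgroups_y
+        // drops from 32 to 8 (8 workgroups computing 4 results each).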
+ gqa_ratio = qk_ratio; + N = gqa_ratio; + workgroups_y /= N; + } + if (dryrun) { // Request descriptor sets ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); @@ -5549,7 +5568,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx v_stride, (uint32_t)nbv2, (uint32_t)nbv3, nbm1, scale, max_bias, logit_softcap, - mask != nullptr, n_head_log2, m0, m1 }; + mask != nullptr, n_head_log2, m0, m1, gqa_ratio }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, @@ -5558,7 +5577,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, - sizeof(vk_flash_attn_push_constants), &pc, { (uint32_t)neq1, (uint32_t)neq2, (uint32_t)neq3 }); + sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z }); } static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index df30355f635b8..cac8f107b5d74 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -61,6 +61,8 @@ layout (push_constant) uniform parameter { uint32_t n_head_log2; float m0; float m1; + + uint32_t gqa_ratio; } p; layout (binding = 0) readonly buffer Q {uint8_t data_q[];}; @@ -103,6 +105,28 @@ ACC_TYPE Max(const in uint32_t row, const in uint32_t col, const in ACC_TYPE ele #define DECODEFUNC #endif +// Store the output when doing grouped query attention. +// Rows index by Q's dimension 2, and the first N rows are valid. +D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) +{ + if (r < N && c < D) { + uint32_t offset = (iq2 + r) * D + c; + data_o[o_offset + offset] = D_TYPE(elem); + } + return elem; +} + +// Load the slope matrix, indexed by Q's dimension 2. +ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2) +{ + const uint32_t h = iq2 + (r & (p.gqa_ratio - 1)); + + const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); + const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); + + return ACC_TYPE(pow(base, ACC_TYPE(exph))); +} + void main() { #ifdef NEEDS_INIT_IQ_SHMEM init_iq_shmem(gl_WorkGroupSize); @@ -116,7 +140,9 @@ void main() { const uint32_t i = gl_WorkGroupID.x; - const uint32_t iq2 = gl_WorkGroupID.y; + // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y. + // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2. + const uint32_t iq2 = gl_WorkGroupID.y * p.gqa_ratio; const uint32_t iq3 = gl_WorkGroupID.z; // broadcast factors @@ -149,8 +175,10 @@ void main() { tensorLayoutK = setTensorLayoutDimensionNV(tensorLayoutK, KV, D); tensorLayoutV = setTensorLayoutDimensionNV(tensorLayoutV, KV, D); - // nb?1 are already divided by the type size and are in units of elements - uint32_t q_stride = p.nb01; + // nb?1 are already divided by the type size and are in units of elements. 
+ // When using grouped query attention, Q is indexed by iq2, so the stride + // should be nb02 (which is in bytes). + uint32_t q_stride = p.gqa_ratio > 1 ? (p.nb02 / 4) : p.nb01; uint32_t k_stride = p.nb11; uint32_t v_stride = p.nb21; // hint to the compiler that strides are aligned for the aligned variant of the shader @@ -182,16 +210,11 @@ void main() { L = coopmat(0); M = coopmat(-1.0/0.0); - ACC_TYPE slope = ACC_TYPE(1.0); + coopmat slopeMat = coopmat(1.0); // ALiBi if (p.max_bias > 0.0f) { - const uint32_t h = iq2; - - const ACC_TYPE base = ACC_TYPE(h < p.n_head_log2 ? p.m0 : p.m1); - const int exph = int(h < p.n_head_log2 ? h + 1 : 2*(h - p.n_head_log2) + 1); - - slope = pow(base, ACC_TYPE(exph)); + coopMatPerElementNV(slopeMat, slopeMat, perElemOpComputeSlope, iq2); } [[dont_unroll]] @@ -215,12 +238,16 @@ void main() { if (p.mask != 0) { tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV); tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV); + // When using grouped query attention, all rows use the same mask. + if (p.gqa_ratio > 1) { + tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, 0, 1); + } coopmat mv; coopMatLoadTensorNV(mv, data_m, 0, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc)); - S += slope*coopmat(mv); + S += slopeMat*coopmat(mv); } // Clear padding elements to -inf, so they don't contribute to rowmax @@ -297,13 +324,18 @@ void main() { O = Ldiag*O; - tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV); - tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D); - - // permute dimensions - tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2); uint32_t o_offset = iq3*p.ne2*p.ne1; coopmat O_D = coopmat(O); - coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, 1, 0, D), tensorViewPermute); + if (p.gqa_ratio > 1) { + coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N); + } else { + tensorLayoutNV<3, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutD = createTensorLayoutNV(3, gl_CooperativeMatrixClampModeConstantNV); + tensorLayoutD = setTensorLayoutDimensionNV(tensorLayoutD, p.ne2, p.ne1, D); + + // permute dimensions + tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2); + + coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, D), tensorViewPermute); + } } From 6f3bd38640f07e4dec7f145d2fbf093ce48c9544 Mon Sep 17 00:00:00 2001 From: bandoti <141645996+bandoti@users.noreply.github.com> Date: Wed, 2 Apr 2025 14:56:26 -0300 Subject: [PATCH 10/11] cmake: remove caching from vulkan coopmat checks (#12719) --- ggml/src/ggml-vulkan/CMakeLists.txt | 62 +++++++++++------------------ 1 file changed, 24 insertions(+), 38 deletions(-) diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt index e3c59b75fd5a3..51e8301ce2e63 100644 --- a/ggml/src/ggml-vulkan/CMakeLists.txt +++ b/ggml/src/ggml-vulkan/CMakeLists.txt @@ -23,49 +23,35 @@ if (Vulkan_FOUND) ../../include/ggml-vulkan.h ) - if(NOT DEFINED GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) - # Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported. - # If it's not, there will be an error to stderr. 
- # If it's supported, set a define to indicate that we should compile those shaders - execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp" - OUTPUT_VARIABLE glslc_output - ERROR_VARIABLE glslc_error) - - if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*") - message(STATUS "GL_KHR_cooperative_matrix not supported by glslc") - set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT OFF CACHE INTERNAL "Whether coopmat is supported by glslc") - else() - message(STATUS "GL_KHR_cooperative_matrix supported by glslc") - set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT ON CACHE INTERNAL "Whether coopmat is supported by glslc") - endif() - else() - message(STATUS "GL_KHR_cooperative_matrix support already defined: ${GGML_VULKAN_COOPMAT_GLSLC_SUPPORT}") - endif() + # Compile a test shader to determine whether GL_KHR_cooperative_matrix is supported. + # If it's not, there will be an error to stderr. + # If it's supported, set a define to indicate that we should compile those shaders + execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat_support.comp" + OUTPUT_VARIABLE glslc_output + ERROR_VARIABLE glslc_error) - if(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) + if (${glslc_error} MATCHES ".*extension not supported: GL_KHR_cooperative_matrix.*") + message(STATUS "GL_KHR_cooperative_matrix not supported by glslc") + set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT OFF) + else() + message(STATUS "GL_KHR_cooperative_matrix supported by glslc") + set(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT ON) add_compile_definitions(GGML_VULKAN_COOPMAT_GLSLC_SUPPORT) endif() - if(NOT DEFINED GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) - # Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported. - # If it's not, there will be an error to stderr. - # If it's supported, set a define to indicate that we should compile those shaders - execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp" - OUTPUT_VARIABLE glslc_output - ERROR_VARIABLE glslc_error) - - if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*") - message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc") - set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT OFF CACHE INTERNAL "Whether coopmat2 is supported by glslc") - else() - message(STATUS "GL_NV_cooperative_matrix2 supported by glslc") - set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT ON CACHE INTERNAL "Whether coopmat2 is supported by glslc") - endif() - else() - message(STATUS "GL_NV_cooperative_matrix2 support already defined: ${GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT}") - endif() + # Compile a test shader to determine whether GL_NV_cooperative_matrix2 is supported. + # If it's not, there will be an error to stderr. 
+ # If it's supported, set a define to indicate that we should compile those shaders + execute_process(COMMAND ${Vulkan_GLSLC_EXECUTABLE} -o - -fshader-stage=compute --target-env=vulkan1.3 "${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders/test_coopmat2_support.comp" + OUTPUT_VARIABLE glslc_output + ERROR_VARIABLE glslc_error) - if(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) + if (${glslc_error} MATCHES ".*extension not supported: GL_NV_cooperative_matrix2.*") + message(STATUS "GL_NV_cooperative_matrix2 not supported by glslc") + set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT OFF) + else() + message(STATUS "GL_NV_cooperative_matrix2 supported by glslc") + set(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT ON) add_compile_definitions(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) endif() From f01bd02376f919b05ee635f438311be8dfc91d7c Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 2 Apr 2025 14:25:08 -0500 Subject: [PATCH 11/11] vulkan: Implement split_k for coopmat2 flash attention. (#12627) When using group query attention, we have one workgroup per KV batch and this can be very few workgroups (e.g. just 8 in some models). Enable split_k to spread the work across SMs. This helps a lot when the KV cache is large. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 86 ++++++++++++++++--- .../vulkan-shaders/flash_attn_cm2.comp | 40 ++++++++- .../flash_attn_split_k_reduce.comp | 59 +++++++++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 1 + tests/test-backend-ops.cpp | 6 ++ 5 files changed, 176 insertions(+), 16 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index f60fe33aae18c..f6cc28603448a 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -353,6 +353,7 @@ struct vk_device_struct { vk_pipeline pipeline_flash_attn_f32_f16_D112[GGML_TYPE_COUNT][2][2][2]; vk_pipeline pipeline_flash_attn_f32_f16_D128[GGML_TYPE_COUNT][2][2][2]; vk_pipeline pipeline_flash_attn_f32_f16_D256[GGML_TYPE_COUNT][2][2][2]; + vk_pipeline pipeline_flash_attn_split_k_reduce; std::unordered_map pipelines; std::unordered_map pipeline_descriptor_set_requirements; @@ -504,6 +505,8 @@ struct vk_flash_attn_push_constants { float m1; uint32_t gqa_ratio; + uint32_t split_kv; + uint32_t k_num; }; struct vk_op_push_constants { @@ -1476,7 +1479,7 @@ static std::array fa_rows_cols(uint32_t D, uint32_t clamp, ggml_typ // small rows, large cols if (small_rows) { - return {flash_attention_num_small_rows, 128}; + return {flash_attention_num_small_rows, 64}; } // small cols to reduce register count if (ggml_is_quantized(type) || D == 256) { @@ -2332,6 +2335,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_get_rows_f32[GGML_TYPE_IQ4_NL], "get_rows_iq4_nl_f32", get_rows_iq4_nl_f32_len, get_rows_iq4_nl_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1024, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256 * 4, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_flash_attn_split_k_reduce, "fa_split_k_reduce", fa_split_k_reduce_len, fa_split_k_reduce_data, "main", 2, 3 * sizeof(uint32_t), {1, 1, 1}, {}, 1, true); ggml_vk_create_pipeline(device, device->pipeline_quantize_q8_1, "quantize_q8_1", quantize_q8_1_len, quantize_q8_1_data, "main", 2, 1 * sizeof(uint32_t), {32 * device->subgroup_size / 8, 
1, 1}, { device->subgroup_size }, 1); for (uint32_t i = 0; i < p021_max_gqa_ratio; ++i) { @@ -5479,9 +5483,38 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx workgroups_y /= N; } + uint32_t split_kv = KV; + uint32_t split_k = 1; + + if (gqa_ratio > 1 && ctx->device->shader_core_count > 0) { + GGML_ASSERT(workgroups_x == 1); + // Try to run two workgroups per SM. + split_k = ctx->device->shader_core_count * 2 / workgroups_y; + if (split_k > 1) { + // Try to evenly split KV into split_k chunks, but it needs to be a multiple + // of "align", so recompute split_k based on that. + split_kv = ROUNDUP_POW2(KV / split_k, pipelines[1]->align); + split_k = CEIL_DIV(KV, split_kv); + workgroups_x = split_k; + } + } + + // Reserve space for split_k temporaries. For each split, we need to store the O matrix (D x ne1) + // and the per-row m and L values (ne1 rows). + const uint64_t split_k_size = split_k > 1 ? (D * ne1 * sizeof(float) + ne1 * sizeof(float) * 2) * split_k : 0; + if (split_k_size > ctx->device->max_memory_allocation_size) { + GGML_ABORT("Requested preallocation size is too large"); + } + if (ctx->prealloc_size_split_k < split_k_size) { + ctx->prealloc_size_split_k = split_k_size; + } + if (dryrun) { // Request descriptor sets ggml_pipeline_request_descriptor_sets(ctx->device, pipeline, 1); + if (split_k > 1) { + ggml_pipeline_request_descriptor_sets(ctx->device, ctx->device->pipeline_flash_attn_split_k_reduce, 1); + } return; } @@ -5502,8 +5535,6 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_vk_sync_buffers(subctx); - vk_buffer d_Q = nullptr, d_K = nullptr, d_V = nullptr, d_D = nullptr, d_M = nullptr; size_t q_buf_offset = 0, k_buf_offset = 0, v_buf_offset = 0, d_buf_offset = 0, m_buf_offset = 0; @@ -5568,16 +5599,45 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx v_stride, (uint32_t)nbv2, (uint32_t)nbv3, nbm1, scale, max_bias, logit_softcap, - mask != nullptr, n_head_log2, m0, m1, gqa_ratio }; - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, - { - vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, - vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, - vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, - vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, - vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, - }, - sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z }); + mask != nullptr, n_head_log2, m0, m1, + gqa_ratio, split_kv, split_k }; + + ggml_vk_sync_buffers(subctx); + + if (split_k > 1) { + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, + }, + // We only use split_k when group query attention is enabled, which means + // there's no more than one tile of rows (i.e. workgroups_x would have been + // one). We reuse workgroups_x to mean the number of splits, so we need to + // cancel out the divide by wg_denoms[0]. 
+ sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z }); + + ggml_vk_sync_buffers(subctx); + const std::array pc2 = { D, (uint32_t)ne1, split_k }; + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce, + { + vk_subbuffer{ctx->prealloc_split_k, 0, VK_WHOLE_SIZE}, + vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, + }, + pc2.size() * uint32_t{sizeof(uint32_t)}, pc2.data(), { (uint32_t)ne1, 1, 1 }); + } else { + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { + vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_K, k_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_V, v_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_M, m_buf_offset, VK_WHOLE_SIZE}, + vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, + }, + sizeof(vk_flash_attn_push_constants), &pc, { workgroups_x, workgroups_y, workgroups_z }); + } } static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index cac8f107b5d74..d78092000d839 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -63,6 +63,8 @@ layout (push_constant) uniform parameter { float m1; uint32_t gqa_ratio; + uint32_t split_kv; + uint32_t k_num; } p; layout (binding = 0) readonly buffer Q {uint8_t data_q[];}; @@ -116,6 +118,16 @@ D_TYPE perElemOpGqaStore(const in uint32_t r, const in uint32_t c, const in D_TY return elem; } +// Store column zero. This is used to save per-row m and L values for split_k. +ACC_TYPE perElemOpStoreCol0(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t o_offset, const in uint32_t iq2, const in uint32_t N) +{ + if (r < N && c == 0) { + uint32_t offset = iq2 + r; + data_o[o_offset + offset] = D_TYPE(elem); + } + return elem; +} + // Load the slope matrix, indexed by Q's dimension 2. ACC_TYPE perElemOpComputeSlope(const in uint32_t r, const in uint32_t c, const in ACC_TYPE elem, const in uint32_t iq2) { @@ -135,10 +147,18 @@ void main() { const uint32_t N = p.N; const uint32_t KV = p.KV; + uint32_t i = gl_WorkGroupID.x; + uint32_t split_k_index = 0; + + if (p.k_num > 1) { + i = 0; + split_k_index = gl_WorkGroupID.x; + } + const uint32_t Tr = CEIL_DIV(N, Br); - const uint32_t Tc = CEIL_DIV(KV, Bc); - const uint32_t i = gl_WorkGroupID.x; + const uint32_t start_j = split_k_index * p.split_kv / Bc; + const uint32_t end_j = CEIL_DIV(min(KV, (split_k_index + 1) * p.split_kv), Bc); // When not using grouped query attention, all rows share the same iq2, equal to gl_WorkGroupID.y. // When using grouped query attention, each workgroup does gqa_ratio consecutive values of iq2. @@ -218,7 +238,7 @@ void main() { } [[dont_unroll]] - for (uint32_t j = 0; j < Tc; ++j) { + for (uint32_t j = start_j; j < end_j; ++j) { coopmat S = coopmat(0); @@ -312,6 +332,20 @@ void main() { O = coopMatMulAdd(P_A, V, O); } + // If there is split_k, then the split_k resolve shader does the final + // division by L. Store the intermediate O value and per-row m and L values. 
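+    // Buffer layout used below: k_num blocks of O (D * ne1 floats each), followed by,
+    // for each split, ne1 L values and then ne1 M values (consumed by the reduce shader).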
+ if (p.k_num > 1) { + coopmat O_D = coopmat(O); + + uint32_t o_offset = D * p.ne1 * split_k_index; + coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N); + + o_offset = D * p.ne1 * p.k_num + p.ne1 * split_k_index * 2; + coopMatPerElementNV(L, L, perElemOpStoreCol0, o_offset, iq2, N); + coopMatPerElementNV(M, M, perElemOpStoreCol0, o_offset + p.ne1, iq2, N); + return; + } + coopmat Ldiag; // resize L by using smear/reduce diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp new file mode 100644 index 0000000000000..a7e3956854c44 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_split_k_reduce.comp @@ -0,0 +1,59 @@ +#version 450 + +#extension GL_EXT_control_flow_attributes : enable + +#define BLOCK_SIZE 32 + +layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 0) readonly buffer A {float data_a[];}; +layout (binding = 1) writeonly buffer D {float data_d[];}; + +layout (push_constant) uniform parameter { + uint D; + uint N; + uint k_num; +} p; + +void main() { + // Each workgroup handles a row + const uint n = gl_WorkGroupID.x; + const uint tid = gl_LocalInvocationID.x; + + uint D = p.D; + uint N = p.N; + uint k_num = p.k_num; + + uint l_offset = D * N * k_num + n; + uint m_offset = D * N * k_num + N + n; + uint lm_stride = N * 2; + + // Compute the max m value for the row + float m_max = -1.0/0.0; + [[unroll]] for (uint k = 0; k < k_num; ++k) { + float m = data_a[m_offset + k * lm_stride]; + m_max = max(m_max, m); + } + + // Compute L based on m_max + float L = 0; + [[unroll]] for (uint k = 0; k < k_num; ++k) { + float l = data_a[l_offset + k * lm_stride]; + float m = data_a[m_offset + k * lm_stride]; + L += exp(m - m_max) * l; + } + + L = 1.0 / L; + + // Scale and sum the O contributions based on m_max and store the result to memory + for (uint d = tid; d < D; d += BLOCK_SIZE) { + float O = 0.0; + [[unroll]] for (uint k = 0; k < k_num; ++k) { + uint o_offset = D * N * k + D * n + d; + float m = data_a[m_offset + k * lm_stride]; + O += exp(m - m_max) * data_a[o_offset]; + } + O *= L; + data_d[D * n + d] = O; + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 2ac4caee70e17..cf74625cc56d5 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -465,6 +465,7 @@ void process_shaders() { string_to_spv("acc_f32", "acc.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}); + string_to_spv("fa_split_k_reduce", "flash_attn_split_k_reduce.comp", {}); string_to_spv("quantize_q8_1", "quantize_q8_1.comp", {}); string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 426a9557c4919..e61a126cf5b2f 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4516,6 +4516,12 @@ static std::vector> make_test_cases_perf() { } } + for (int kv : { 4096, 8192, 16384, }) { + for (int hs : { 64, 128, }) { + test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, 4, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16)); + } + } + return test_cases; }
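
For reference, flash_attn_split_k_reduce.comp performs the standard log-sum-exp merge of the per-split partial results: m = max_k m_k, L = sum_k exp(m_k - m) * L_k, and O = (1/L) * sum_k exp(m_k - m) * O_k, so splitting KV across workgroups leaves the result unchanged up to floating-point rounding. Below is a minimal host-side sketch of that reduction, assuming the buffer layout set up in ggml_vk_flash_attn (k_num blocks of O with D*N floats each, then per split N L values followed by N m values); the function name and the use of std::vector are illustrative only and not part of the patch.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <vector>

// Reference reduction over k_num partial flash-attention results.
// src layout: k_num blocks of O (D*N floats each), then for each split
// N L values followed by N m values. dst receives the final D*N output.
static void fa_split_k_reduce_ref(const std::vector<float> & src, std::vector<float> & dst,
                                  uint32_t D, uint32_t N, uint32_t k_num) {
    const size_t lm_base   = (size_t) D * N * k_num; // start of the L/m section
    const size_t lm_stride = (size_t) N * 2;         // per-split stride of the L/m rows

    for (uint32_t n = 0; n < N; ++n) {
        // row-wise maximum of m over all splits
        float m_max = -std::numeric_limits<float>::infinity();
        for (uint32_t k = 0; k < k_num; ++k) {
            m_max = std::max(m_max, src[lm_base + k * lm_stride + N + n]);
        }

        // combined normalizer: L = sum_k exp(m_k - m_max) * L_k
        float L = 0.0f;
        for (uint32_t k = 0; k < k_num; ++k) {
            const float l = src[lm_base + k * lm_stride + n];
            const float m = src[lm_base + k * lm_stride + N + n];
            L += std::exp(m - m_max) * l;
        }

        // O = (1/L) * sum_k exp(m_k - m_max) * O_k
        for (uint32_t d = 0; d < D; ++d) {
            float O = 0.0f;
            for (uint32_t k = 0; k < k_num; ++k) {
                const float m = src[lm_base + k * lm_stride + N + n];
                O += std::exp(m - m_max) * src[(size_t) D * N * k + (size_t) D * n + d];
            }
            dst[(size_t) D * n + d] = O / L;
        }
    }
}

As in the shader, subtracting the per-row m_max keeps the exponentials bounded, which is why the split does not need any extra synchronization beyond storing the per-split (O, L, m) triples.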