From 002cb1bb3345967fbe0fa7766c2d94c2da31ef45 Mon Sep 17 00:00:00 2001 From: Charles Xu Date: Mon, 11 Aug 2025 09:59:26 +0200 Subject: [PATCH 001/140] kleidiai: fix unsigned overflow bug (#15150) * kleidiai: fix unsigned overflow bug * address review comments --- ggml/src/ggml-cpu/kleidiai/kleidiai.cpp | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp index 3a513a55d7654..dff8fa244a1c9 100644 --- a/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp +++ b/ggml/src/ggml-cpu/kleidiai/kleidiai.cpp @@ -259,7 +259,10 @@ class tensor_traits : public ggml::cpu::tensor_traits { const int64_t m_start = 0; const int64_t n_step = static_cast(kernel->get_n_step()); - const int64_t num_threads = KAI_MIN(n / n_step, nth); + int64_t num_threads = KAI_MIN(n / n_step, nth); + if (num_threads <= 0) { + num_threads = 1; + } if (ith < num_threads) { const int64_t num_n_per_thread0 = round_down(n / num_threads, n_step); @@ -309,7 +312,8 @@ class tensor_traits : public ggml::cpu::tensor_traits { GGML_ASSERT(kernel); const int ith = params->ith; - const int nth = params->nth; + const int nth_raw = params->nth; + const int nth = nth_raw > 0 ? nth_raw : 1; const size_t k = ne00; const size_t m = ne11; @@ -327,9 +331,12 @@ class tensor_traits : public ggml::cpu::tensor_traits { const size_t num_n_per_thread = kai_roundup(kai_roundup(n, nth) / nth, n_step); const size_t n_start = ith * num_n_per_thread; - size_t n_to_process = num_n_per_thread; - if ((n_start + n_to_process) > n) { - n_to_process = n - n_start; + size_t n_to_process = 0; + if (n_start < n) { + n_to_process = num_n_per_thread; + if ((n_start + n_to_process) > n) { + n_to_process = n - n_start; + } } // Calculate number of columns to be processed per thread @@ -361,8 +368,10 @@ class tensor_traits : public ggml::cpu::tensor_traits { const void* lhs_ptr = (const void*)((const char *)lhs_packed + lhs_packed_offset); float *dst_ptr = reinterpret_cast(static_cast(dst->data) + dst_offset); - variant_call(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride, - sizeof(float), -FLT_MAX, FLT_MAX); + if (n_to_process > 0) { + variant_call(kernel->run_kernel, m, n_to_process, k, QK4_0, lhs_ptr, rhs_ptr, dst_ptr, dst_stride, + sizeof(float), -FLT_MAX, FLT_MAX); + } return true; } From a3a7874272e5a060079658eb5cca4617b7f99062 Mon Sep 17 00:00:00 2001 From: Julien Denize <40604584+juliendenize@users.noreply.github.com> Date: Mon, 11 Aug 2025 10:07:49 +0200 Subject: [PATCH 002/140] convert : improve Mistral models integration (#14737) * Improve Mistral models integration with llama.cpp * Revert changes and fix gguf * Revert change * refactor convert_mistral_to_gguf.py in convert_hf_to_gguf.py * Revert collateral * Rename model name * refactor * revert * remove duplicate * Remove duplication code * Fixes * Fix flake issues * Apply comments * Apply comments * Apply comments * Fix remote * add default chat template * Revert * nit --- convert_hf_to_gguf.py | 352 ++++++++++++++++++++++++--------- convert_lora_to_gguf.py | 2 +- gguf-py/gguf/tensor_mapping.py | 37 ++-- gguf-py/gguf/utility.py | 6 +- 4 files changed, 285 insertions(+), 112 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index b8c7d97a786c7..2f9ef7f5d3f58 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -28,6 +28,14 @@ if 'NO_LOCAL_GGUF' not in os.environ: sys.path.insert(1, str(Path(__file__).parent / 'gguf-py')) import gguf 
+from gguf.vocab import MistralTokenizerType, MistralVocab +from mistral_common.tokens.tokenizers.base import TokenizerVersion +from mistral_common.tokens.tokenizers.multimodal import DATASET_MEAN, DATASET_STD +from mistral_common.tokens.tokenizers.tekken import Tekkenizer +from mistral_common.tokens.tokenizers.sentencepiece import ( + SentencePieceTokenizer, +) + logger = logging.getLogger("hf-to-gguf") @@ -81,6 +89,8 @@ class ModelBase: block_count: int tensor_map: gguf.TensorNameMap + is_mistral_format: bool = False + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, @@ -106,16 +116,17 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) self.tensor_names = set(name for name in remote_tensors.keys()) - for name, remote_tensor in gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id).items(): + for name, remote_tensor in remote_tensors.items(): yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) self.get_tensors = get_remote_tensors else: - self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors") + prefix = "model" if not self.is_mistral_format else "consolidated" + self.part_names = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") - self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams + self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams self.tensor_names = None self.metadata_override = metadata_override self.model_name = model_name @@ -153,19 +164,23 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() - index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" - index_name += ".index.json" - index_file = self.dir_model / index_name - - if index_file.is_file(): - self.tensor_names = set() - logger.info(f"gguf: loading model weight map from '{index_name}'") - with open(index_file, "r", encoding="utf-8") as f: - index: dict[str, Any] = json.load(f) - weight_map = index.get("weight_map") - if weight_map is None or not isinstance(weight_map, dict): - raise ValueError(f"Can't load 'weight_map' from {index_name!r}") - self.tensor_names.update(weight_map.keys()) + if not self.is_mistral_format: + index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" + index_name += ".index.json" + index_file = self.dir_model / index_name + + if index_file.is_file(): + self.tensor_names = set() + logger.info(f"gguf: loading model weight map from '{index_name}'") + with open(index_file, "r", encoding="utf-8") as f: + index: dict[str, Any] = json.load(f) + weight_map = index.get("weight_map") + if weight_map is None or not isinstance(weight_map, dict): + raise ValueError(f"Can't load 'weight_map' from {index_name!r}") + self.tensor_names.update(weight_map.keys()) + else: + self.tensor_names = tensor_names_from_parts + weight_map = {} else: 
self.tensor_names = tensor_names_from_parts weight_map = {} @@ -426,7 +441,12 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str] return part_names @staticmethod - def load_hparams(dir_model: Path): + def load_hparams(dir_model: Path, is_mistral_format: bool): + if is_mistral_format: + with open(dir_model / "params.json", "r", encoding="utf-8") as f: + config = json.load(f) + return config + try: # for security reason, we don't allow loading remote code by default # if a model need remote code, we will fallback to config.json @@ -476,7 +496,10 @@ class TextModel(ModelBase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self.hf_arch = get_model_architecture(self.hparams, self.model_type) + if not self.is_mistral_format: + self.hf_arch = get_model_architecture(self.hparams, self.model_type) + else: + self.hf_arch = "" if "text_config" in self.hparams: # move the text_config to the root level @@ -542,14 +565,14 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count(n_head) logger.info(f"gguf: head count = {n_head}") - if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: + if (n_head_kv := self.find_hparam(["num_key_value_heads", "n_kv_heads"], optional=True)) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) logger.info(f"gguf: key-value head count = {n_head_kv}") if (rope_theta := self.hparams.get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base(rope_theta) logger.info(f"gguf: rope theta = {rope_theta}") - if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"])) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: @@ -1210,12 +1233,19 @@ def __init__(self, *args, **kwargs): raise TypeError("MmprojModel must be subclassed with model_arch = gguf.MODEL_ARCH.MMPROJ") # get n_embd of the text model - if "text_config" not in self.hparams: - self.hparams["text_config"] = {} - if "audio_config" not in self.hparams: - self.hparams["audio_config"] = {} - text_config = {**self.hparams, **self.hparams["text_config"]} - self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0)) + if not self.is_mistral_format: + if "text_config" not in self.hparams: + self.hparams["text_config"] = {} + if "audio_config" not in self.hparams: + self.hparams["audio_config"] = {} + text_config = {**self.hparams, **self.hparams["text_config"]} + self.n_embd_text = text_config.get("hidden_size", text_config.get("n_embd", 0)) + else: + text_config = { + k: v for k, v in self.hparams.items() if k not in ["vision_encoder", "audio_encoder"] + } + self.n_embd_text = text_config.get("hidden_dim", 0) + assert self.n_embd_text > 0, "n_embd not found in hparams" # move vision config to the top level, while preserving the original hparams in global_config @@ -1236,11 +1266,13 @@ def __init__(self, *args, **kwargs): self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) # load preprocessor config - with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f: - self.preprocessor_config = json.load(f) + if not self.is_mistral_format: + with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f: + self.preprocessor_config = json.load(f) def get_vision_config(self) -> dict[str, 
Any] | None: - return self.global_config.get("vision_config") + config_name = "vision_config" if not self.is_mistral_format else "vision_encoder" + return self.global_config.get(config_name) def get_audio_config(self) -> dict[str, Any] | None: return self.global_config.get("audio_config") @@ -1264,8 +1296,11 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"])) # preprocessor config - self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"]) - self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"]) + image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"] + image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"] + + self.gguf_writer.add_vision_image_mean(image_mean) + self.gguf_writer.add_vision_image_std(image_std) if self.has_audio_encoder: self.gguf_writer.add_clip_has_audio_encoder(True) @@ -1924,46 +1959,12 @@ def __init__(self, *args, **kwargs): if self.hf_arch == "VLlama3ForCausalLM": self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) - def set_vocab(self): - path_tekken_json = self.dir_model / "tekken.json" - path_tokenizer_json = self.dir_model / "tokenizer.json" - if path_tekken_json.is_file() and not path_tokenizer_json.is_file(): - return self.set_vocab_tekken() - - try: - self._set_vocab_sentencepiece() - except FileNotFoundError: - try: - self._set_vocab_llama_hf() - except (FileNotFoundError, TypeError): - # Llama 3 - self._set_vocab_gpt2() - - # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) - if self.hparams.get("vocab_size", 32000) == 32016: - special_vocab = gguf.SpecialVocab( - self.dir_model, load_merges=False, - special_token_types = ['prefix', 'suffix', 'middle', 'eot'] - ) - special_vocab._set_special_token("prefix", 32007) - special_vocab._set_special_token("suffix", 32008) - special_vocab._set_special_token("middle", 32009) - special_vocab._set_special_token("eot", 32010) - special_vocab.add_to_gguf(self.gguf_writer) - - tokenizer_config_file = self.dir_model / 'tokenizer_config.json' - if tokenizer_config_file.is_file(): - with open(tokenizer_config_file, "r", encoding="utf-8") as f: - tokenizer_config_json = json.load(f) - if "add_prefix_space" in tokenizer_config_json: - self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) - - # Apply to granite small models only - if self.hparams.get("vocab_size", 32000) == 49152: - self.gguf_writer.add_add_bos_token(False) + def _set_vocab_mistral(self): + vocab = MistralVocab(self.dir_model) + logger.info( + f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}." 
+ ) - def set_vocab_tekken(self): - vocab = gguf.vocab.MistralVocab(self.dir_model) self.gguf_writer.add_tokenizer_model(vocab.gguf_tokenizer_model) tokens = [] @@ -1979,7 +1980,7 @@ def set_vocab_tekken(self): f"token count ({len(tokens)}) != vocab size ({vocab.vocab_size})" ) - if vocab.tokenizer_type == gguf.vocab.MistralTokenizerType.tekken: + if vocab.tokenizer_type == MistralTokenizerType.tekken: self.gguf_writer.add_tokenizer_pre("tekken") self.gguf_writer.add_token_merges( vocab.extract_vocab_merges_from_model() @@ -2002,16 +2003,58 @@ def set_vocab_tekken(self): self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(False) - script_dir = Path(__file__).parent - template_path = script_dir / "models/templates/unsloth-mistral-Devstral-Small-2507.jinja" - with open(template_path, "r", encoding="utf-8") as f: - template = f.read() - self.gguf_writer.add_chat_template(template) + template_dir = Path(__file__).parent / "models/templates/" + + template = MistralModel.get_community_chat_template(vocab, template_dir) + self.gguf_writer.add_chat_template(template) + + def set_vocab(self): + if self.is_mistral_format: + return self._set_vocab_mistral() + + path_tekken_json = self.dir_model / "tekken.json" + path_tokenizer_json = self.dir_model / "tokenizer.json" + if path_tekken_json.is_file() and not path_tokenizer_json.is_file(): + self._set_vocab_mistral() + + try: + self._set_vocab_sentencepiece() + except FileNotFoundError: + try: + self._set_vocab_llama_hf() + except (FileNotFoundError, TypeError): + # Llama 3 + self._set_vocab_gpt2() + + # Apply to CodeLlama only (and ignore for Llama 3 with a vocab size of 128256) + if self.hparams.get("vocab_size", 32000) == 32016: + special_vocab = gguf.SpecialVocab( + self.dir_model, load_merges=False, + special_token_types = ['prefix', 'suffix', 'middle', 'eot'] + ) + special_vocab._set_special_token("prefix", 32007) + special_vocab._set_special_token("suffix", 32008) + special_vocab._set_special_token("middle", 32009) + special_vocab._set_special_token("eot", 32010) + special_vocab.add_to_gguf(self.gguf_writer) + + tokenizer_config_file = self.dir_model / 'tokenizer_config.json' + if tokenizer_config_file.is_file(): + with open(tokenizer_config_file, "r", encoding="utf-8") as f: + tokenizer_config_json = json.load(f) + if "add_prefix_space" in tokenizer_config_json: + self.gguf_writer.add_add_space_prefix(tokenizer_config_json["add_prefix_space"]) + + # Apply to granite small models only + if self.hparams.get("vocab_size", 32000) == 49152: + self.gguf_writer.add_add_bos_token(False) def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + + if not self.is_mistral_format: + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) if (rope_dim := hparams.get("head_dim")) is None: rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] @@ -2033,13 +2076,25 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None): _experts: list[dict[str, Tensor]] | None = None def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - n_head = self.hparams["num_attention_heads"] - n_kv_head = self.hparams.get("num_key_value_heads") + n_head = self.find_hparam(["n_heads", "num_attention_heads"]) + n_kv_head = self.find_hparam(["n_kv_heads", "num_key_value_heads"]) + + vision_prefixes = [ + "vision_encoder.", + "vision_language_adapter.", + "patch_merger.", + "pre_mm_projector_norm", + ] + 
is_multimodal_tensor = "vision_tower" in name \ or "vision_model" in name \ or "audio_tower" in name \ or "model.connector" in name \ - or "multi_modal_projector" in name + or "multi_modal_projector" in name \ + or any( + name.startswith(prefix) + for prefix in vision_prefixes + ) if is_multimodal_tensor: return [] # skip vision tensors @@ -2155,13 +2210,18 @@ class LlavaVisionModel(MmprojModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - if self.hparams["model_type"] == "pixtral": + if self.hparams.get("model_type") == "pixtral": # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5) self.img_break_tok_id = self.get_token_id("[IMG_BREAK]") - logger.info(f"Image break token id: {self.img_break_tok_id}") + elif self.is_mistral_format: + # hparams is already vision config here so norm_eps is only defined in global_config. + self.hparams["norm_eps"] = self.global_config.get("norm_eps", None) + assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json" + self.img_break_tok_id = self.find_vparam(["image_break_token_id"]) else: raise ValueError(f"Unsupported model type: {self.hparams['model_type']}") + logger.info(f"Image break token id: {self.img_break_tok_id}") def get_token_id(self, token: str) -> int: tokenizer_config_file = self.dir_model / 'tokenizer_config.json' @@ -2175,7 +2235,7 @@ def get_token_id(self, token: str) -> int: def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - if hparams["model_type"] == "pixtral": + if hparams.get("model_type") == "pixtral": self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) @@ -2193,18 +2253,30 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - n_head = self.hparams["num_attention_heads"] + n_head = ( + self.hparams["num_attention_heads"] if not self.is_mistral_format else self.find_vparam(["num_attention_heads"]) + ) n_kv_head = n_head - if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."): + valid_prefixes = ( + "multi_modal_projector.", + "vision_tower.", + "vision_encoder.", + "vision_language_adapter.", + "patch_merger.", + "pre_mm_projector_norm", + ) + + if any(name.startswith(prefix) for prefix in valid_prefixes): # process vision tensors - if name.endswith(("q_proj.weight", "q_proj.bias")): + if name.endswith(("q_proj.weight", "q_proj.bias")) and not self.is_mistral_format: data_torch = LlamaModel.permute(data_torch, n_head, n_head) - if name.endswith(("k_proj.weight", "k_proj.bias")): + if name.endswith(("k_proj.weight", "k_proj.bias")) and not self.is_mistral_format: data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) return [(self.map_tensor_name(name), data_torch)] - if self.img_break_tok_id > 0 and "embed_tokens.weight" in name: + embed_key = "embed_tokens.weight" if not self.is_mistral_format else "tok_embeddings.weight" + if self.img_break_tok_id > 0 and embed_key in name: logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") # for pixtral model, we need to extract the [IMG_BREAK] token embedding img_break_embd = data_torch[self.img_break_tok_id] @@ -4683,7 +4755,7 @@ class NomicBertModel(BertModel): def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): 
hparams = kwargs.pop("hparams", None) if hparams is None: - hparams = ModelBase.load_hparams(dir_model) + hparams = ModelBase.load_hparams(dir_model, False) self.is_moe = bool(hparams.get("moe_every_n_layers")) self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT @@ -8304,6 +8376,77 @@ def prepare_tensors(self): if len(experts) > 0: raise ValueError(f"Unprocessed experts: {experts}") + +class MistralModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.LLAMA + model_name = "Mistral" + hf_arch = "" + is_mistral_format = True + undo_permute = False + + @staticmethod + def get_community_chat_template(vocab: MistralVocab, templates_dir: Path): + assert TokenizerVersion is not None, "mistral_common is not installed" + assert isinstance(vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer)), ( + f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}" + ) + + if vocab.tokenizer.version == TokenizerVersion.v1: + return "mistral-v1" + elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.spm: + return "mistral-v3" + elif vocab.tokenizer.version == TokenizerVersion.v3 and vocab.tokenizer_type == MistralTokenizerType.tekken: + return "mistral-v3-tekken" + elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.spm: + return "mistral-v7" + elif vocab.tokenizer.version == TokenizerVersion.v7 and vocab.tokenizer_type == MistralTokenizerType.tekken: + return "mistral-v7-tekken" + elif vocab.tokenizer.version == TokenizerVersion.v11: + template_file = "Mistral-Small-3.2-24B-Instruct-2506.jinja" + elif vocab.tokenizer.version == TokenizerVersion.v13: + template_file = "unsloth-mistral-Devstral-Small-2507.jinja" + else: + raise ValueError(f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}") + + template_path = templates_dir / template_file + if not template_path.exists(): + raise FileNotFoundError(f"Template file not found: {template_path}") + + with open(template_path, "r", encoding="utf-8") as f: + template = f.read() + + return template + + +class PixtralModel(LlavaVisionModel): + model_name = "Pixtral" + hf_arch = "" + is_mistral_format = True + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.PIXTRAL) + + self.gguf_writer.add_vision_attention_layernorm_eps( + self.find_hparam(["norm_eps"]) + ) + self.gguf_writer.add_rope_freq_base(self.find_vparam(["rope_theta"])) + + self.gguf_writer.add_vision_use_silu(True) + + # spatial_merge_size + if self.find_vparam(["mm_projector_id"]) == "patch_merge": + self.gguf_writer.add_vision_spatial_merge_size( + self.find_vparam(["spatial_merge_size"]) + ) + + def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str: + if name == "vision_language_adapter.w_in.weight": + return "mm.1.weight" + elif name == "vision_language_adapter.w_out.weight": + return "mm.2.weight" + return super().map_tensor_name(name, try_suffixes) + ###### CONVERSION LOGIC ###### @@ -8454,6 +8597,10 @@ def parse_args() -> argparse.Namespace: "--mmproj", action="store_true", help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. 
A prefix 'mmproj-' will be added to the output file name.", ) + parser.add_argument( + "--mistral-format", action="store_true", + help="Whether the model is stored following the Mistral format.", + ) args = parser.parse_args() if not args.print_supported_models and args.model is None: @@ -8559,17 +8706,25 @@ def main() -> None: if "mmproj" not in fname_out.name: fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-") + is_mistral_format = args.mistral_format + with torch.inference_mode(): output_type = ftype_map[args.outtype] model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT - hparams = ModelBase.load_hparams(dir_model) - model_architecture = get_model_architecture(hparams, model_type) - logger.info(f"Model architecture: {model_architecture}") - try: - model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) - except NotImplementedError: - logger.error(f"Model {model_architecture} is not supported") - sys.exit(1) + hparams = ModelBase.load_hparams(dir_model, is_mistral_format) + if not is_mistral_format: + model_architecture = get_model_architecture(hparams, model_type) + logger.info(f"Model architecture: {model_architecture}") + try: + model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) + except NotImplementedError: + logger.error(f"Model {model_architecture} is not supported") + sys.exit(1) + elif args.mmproj: + assert hparams.get("vision_encoder") is not None, "This model does not support multimodal" + model_class = PixtralModel + else: + model_class = MistralModel model_instance = model_class(dir_model, output_type, fname_out, is_big_endian=args.bigendian, use_temp_file=args.use_temp_file, @@ -8578,7 +8733,8 @@ def main() -> None: split_max_tensors=args.split_max_tensors, split_max_size=split_str_to_n_bytes(args.split_max_size), dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, - remote_hf_model_id=hf_repo_id) + remote_hf_model_id=hf_repo_id, + ) if args.vocab_only: logger.info("Exporting model vocab...") diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index 00a6733cbd360..a67c0536a4128 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -340,7 +340,7 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: sys.exit(1) else: logger.info(f"Loading base model: {dir_base_model.name}") - hparams = ModelBase.load_hparams(dir_base_model) + hparams = ModelBase.load_hparams(dir_base_model, False) with torch.inference_mode(): try: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index dc7c03b464c25..c5c27980905de 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1119,7 +1119,8 @@ class TensorNameMap: "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1 "vpm.embeddings.patch_embedding", "model.vision_model.embeddings.patch_embedding", # SmolVLM - "vision_tower.patch_conv", # pixtral + "vision_tower.patch_conv", # pixtral-hf + "vision_encoder.patch_conv", # pixtral "vision_model.patch_embedding.linear", # llama 4 "visual.patch_embed.proj", # qwen2vl ), @@ -1138,7 +1139,8 @@ class TensorNameMap: "vpm.encoder.layers.{bid}.self_attn.q_proj", "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4 - "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral + "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf + 
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral "visual.blocks.{bid}.attn.q", # qwen2vl, generated ), @@ -1153,7 +1155,8 @@ class TensorNameMap: "vpm.encoder.layers.{bid}.self_attn.k_proj", "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4 - "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral + "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral "visual.blocks.{bid}.attn.k", # qwen2vl, generated ), @@ -1168,7 +1171,8 @@ class TensorNameMap: "vpm.encoder.layers.{bid}.self_attn.v_proj", "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4 - "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral + "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral "visual.blocks.{bid}.attn.v", # qwen2vl, generated ), @@ -1178,7 +1182,8 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1 "vpm.encoder.layers.{bid}.layer_norm1", "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM - "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral + "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral "vision_model.model.layers.{bid}.input_layernorm", # llama4 "visual.blocks.{bid}.norm1", # qwen2vl ), @@ -1190,7 +1195,8 @@ class TensorNameMap: "vpm.encoder.layers.{bid}.self_attn.out_proj", "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4 - "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral + "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral "visual.blocks.{bid}.attn.proj", # qwen2vl ), @@ -1201,7 +1207,8 @@ class TensorNameMap: "vpm.encoder.layers.{bid}.layer_norm2", "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4 - "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral + "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral "visual.blocks.{bid}.norm2", # qwen2vl ), @@ -1210,14 +1217,16 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1 "vpm.encoder.layers.{bid}.mlp.fc1", "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 - "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral + "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral "vision_model.model.layers.{bid}.mlp.fc1", # llama4 "visual.blocks.{bid}.mlp.fc1", # qwen2vl "visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl ), MODEL_TENSOR.V_ENC_FFN_GATE: ( - "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral + "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.feed_forward.w1", # pixtral "visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl ), @@ -1226,7 +1235,8 @@ class TensorNameMap: "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1 
"vpm.encoder.layers.{bid}.mlp.fc2", "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 - "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral + "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf + "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral "vision_model.model.layers.{bid}.mlp.fc2", # llama4 "visual.blocks.{bid}.mlp.fc2", # qwen2vl "visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl @@ -1244,7 +1254,8 @@ class TensorNameMap: MODEL_TENSOR.V_PRE_NORM: ( "vision_tower.vision_model.pre_layrnorm", - "vision_tower.ln_pre", # pixtral + "vision_tower.ln_pre", # pixtral-hf + "vision_encoder.ln_pre", # pixtral "vision_model.layernorm_pre", # llama4 ), @@ -1261,6 +1272,7 @@ class TensorNameMap: MODEL_TENSOR.V_MM_INP_NORM: ( "multi_modal_projector.norm", + "pre_mm_projector_norm", ), MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( @@ -1316,7 +1328,8 @@ class TensorNameMap: ), MODEL_TENSOR.V_MM_PATCH_MERGER: ( - "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 + "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1 - hf + "patch_merger.merging_layer", # mistral ), # audio (mtmd) diff --git a/gguf-py/gguf/utility.py b/gguf-py/gguf/utility.py index 00adcbc937398..769ccb02f0d91 100644 --- a/gguf-py/gguf/utility.py +++ b/gguf-py/gguf/utility.py @@ -145,7 +145,11 @@ def get_list_tensors_hf_model(cls, model_id: str) -> dict[str, RemoteTensor]: tensors[key] = val return tensors - raise ValueError(f"Model {model_id} does not have any safetensor files") + raise ValueError( + f"No safetensor file has been found for model {model_id}." + "If the repo has safetensor files, make sure the model is public or you have a " + "valid Hugging Face token set in the environment variable HF_TOKEN." + ) @classmethod def get_list_tensors(cls, url: str) -> dict[str, RemoteTensor]: From 1ebbaddff2a44b0599df659175d3274bd5bbeb81 Mon Sep 17 00:00:00 2001 From: Daniel Bevenius Date: Mon, 11 Aug 2025 10:21:24 +0200 Subject: [PATCH 003/140] perplexity : update comments/error msg to use decode [no ci] (#15227) This commit updates comments and error messages to use "decode" instead of "eval" in perplexity.cpp. The motivation for this is that `llama_eval` was renamed to `llama_decode` a while ago, but the comments and error messages still referred to "eval". This change ensures consistency and clarity. --- tools/perplexity/perplexity.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perplexity/perplexity.cpp b/tools/perplexity/perplexity.cpp index 189dcb3d72f5e..81bdc7c19cae4 100644 --- a/tools/perplexity/perplexity.cpp +++ b/tools/perplexity/perplexity.cpp @@ -525,7 +525,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params & } // We get the logits for all the tokens in the context window (params.n_ctx) - // from llama_eval above. Now, based on https://huggingface.co/docs/transformers/perplexity, + // from llama_decode below. Now, based on https://huggingface.co/docs/transformers/perplexity, // calculate the perplexity over the last half of the window (so the model always has // some context to predict the token). 
    //
@@ -559,7 +559,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
 
         for (int seq = 0; seq < n_seq_batch; seq++) {
             int seq_start = batch_start + seq*n_ctx;
 
-            // save original token and restore it after eval
+            // save original token and restore it after decode
             const auto token_org = tokens[seq_start];
 
             // add BOS token for the first batch of each chunk
@@ -584,7 +584,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
         }
 
         if (llama_decode(ctx, batch)) {
-            LOG_INF("%s : failed to eval\n", __func__);
+            LOG_INF("%s : failed to decode\n", __func__);
             return {tokens, -1, logit_history, prob_history};
         }
 

From 50e81bdf5db563ab57a9a722b08f96fa8a76c927 Mon Sep 17 00:00:00 2001
From: Sigbjørn Skjæret
Date: Mon, 11 Aug 2025 11:15:44 +0200
Subject: [PATCH 004/140] convert : fix merge conflicts (#15229)

---
 convert_hf_to_gguf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 2f9ef7f5d3f58..444e2cbdfbb6a 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -572,7 +572,7 @@ def set_gguf_parameters(self):
         if (rope_theta := self.hparams.get("rope_theta")) is not None:
             self.gguf_writer.add_rope_freq_base(rope_theta)
             logger.info(f"gguf: rope theta = {rope_theta}")
-        if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"])) is not None:
+        if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
             self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
             logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
         if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
@@ -3598,7 +3598,7 @@ class Qwen3MoeModel(Qwen2MoeModel):
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        hparams = ModelBase.load_hparams(self.dir_model)
+        hparams = ModelBase.load_hparams(self.dir_model, False)
         self.origin_hf_arch = hparams.get('architectures', [None])[0]
 
     def set_vocab(self):

From cd3069dfcbeee8e0e96cffb93b0ebb9e595e273a Mon Sep 17 00:00:00 2001
From: Daniel Bevenius
Date: Mon, 11 Aug 2025 11:21:19 +0200
Subject: [PATCH 005/140] kv-cache : log (debug) all streams in find_slot
 (#15176)

This commit updates `llama_kv_cache_unified::find_slot` to log
information for all streams when debug is enabled.

The motivation for this change is that currently, if a non-unified
kv-cache is used, only one stream will be logged because the code
uses `seq_to_stream[1]`.
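
For reference, the core of the change is the following per-stream loop
(a condensed sketch of the code added below; the enclosing find_slot
and its cache state are assumed to be in scope):

    // log every stream referenced by the current ubatch, instead of
    // hard-coding the stream of seq_id 1
    for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
        const auto seq_id    = ubatch.seq_id_unq[s];
        const auto stream_id = seq_to_stream[seq_id];

        const auto &   cells    = v_cells[stream_id];
        const uint32_t head_cur = v_heads[stream_id];

        LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n",
            __func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa);
    }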
--- src/llama-kv-cache-unified.cpp | 102 +++++++++++++++++---------------- 1 file changed, 53 insertions(+), 49 deletions(-) diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index e539142e6b8cd..4b58043c5899f 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -738,66 +738,70 @@ bool llama_kv_cache_unified::update(llama_context * lctx, bool do_shift, const d } llama_kv_cache_unified::slot_info llama_kv_cache_unified::find_slot(const llama_ubatch & ubatch, bool cont) const { - if (debug > 0) { - const auto & cells = v_cells[seq_to_stream[1]]; - - const uint32_t head_cur = v_heads[1]; - LLAMA_LOG_DEBUG("%s: n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", - __func__, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa); + if (debug > 0) { + for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) { + const auto seq_id = ubatch.seq_id_unq[s]; + const auto stream_id = seq_to_stream[seq_id]; + const auto & cells = v_cells[stream_id]; + const uint32_t head_cur = v_heads[stream_id]; + + LLAMA_LOG_DEBUG("%s: stream[%d], n = %5d, used = %5d, head = %5d, size = %5d, n_swa = %5d\n", + __func__, stream_id, cells.used_max_p1(), cells.get_used(), head_cur, get_size(), n_swa); + + if ((debug == 2 && n_swa > 0) || debug > 2) { + std::string ss; + for (uint32_t i = 0; i < cells.size(); ++i) { + if (cells.is_empty(i)) { + ss += '.'; + } else { + assert(cells.seq_count(i) >= 1); - if ((debug == 2 && n_swa > 0) || debug > 2) { - std::string ss; - for (uint32_t i = 0; i < cells.size(); ++i) { - if (cells.is_empty(i)) { - ss += '.'; - } else { - assert(cells.seq_count(i) >= 1); + if (cells.seq_count(i) == 1) { + ss += std::to_string(cells.seq_get(i)); + } else { + ss += 'M'; + } + } + if (i%256 == 255) { + ss += " *"; + ss += '\n'; + } + } + LLAMA_LOG_DEBUG("\n%s\n", ss.c_str()); + } - if (cells.seq_count(i) == 1) { - ss += std::to_string(cells.seq_get(i)); + if ((debug == 2 && n_swa > 0) || debug > 2) { + std::string ss; + for (uint32_t i = 0; i < cells.size(); ++i) { + std::string cur; + if (cells.is_empty(i)) { + cur = '.'; } else { - ss += 'M'; + cur = std::to_string(cells.pos_get(i)); + } + const int n = cur.size(); + for (int j = 0; j < 5 - n; ++j) { + cur += ' '; + } + ss += cur; + if (i%256 == 255) { + ss += " *"; + } + if (i%64 == 63) { + ss += '\n'; } } - if (i%256 == 255) { - ss += " *"; - ss += '\n'; - } + LLAMA_LOG_DEBUG("\n%s\n", ss.c_str()); } - LLAMA_LOG_DEBUG("\n%s\n", ss.c_str()); - } - if ((debug == 2 && n_swa > 0) || debug > 2) { - std::string ss; - for (uint32_t i = 0; i < cells.size(); ++i) { - std::string cur; - if (cells.is_empty(i)) { - cur = '.'; - } else { - cur = std::to_string(cells.pos_get(i)); - } - const int n = cur.size(); - for (int j = 0; j < 5 - n; ++j) { - cur += ' '; - } - ss += cur; - if (i%256 == 255) { - ss += " *"; - } - if (i%64 == 63) { - ss += '\n'; + for (int s = 0; s < LLAMA_MAX_SEQ; ++s) { + if (cells.seq_pos_min(s) < 0) { + continue; } - } - LLAMA_LOG_DEBUG("\n%s\n", ss.c_str()); - } - for (int s = 0; s < LLAMA_MAX_SEQ; ++s) { - if (cells.seq_pos_min(s) < 0) { - continue; + LLAMA_LOG_DEBUG("%s: stream[%d] min[%d] = %5d, max[%d] = %5d\n", __func__, stream_id, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s)); } - - LLAMA_LOG_DEBUG("%s: min[%d] = %5d, max[%d] = %5d\n", __func__, s, cells.seq_pos_min(s), s, cells.seq_pos_max(s)); } } From 228f724d9ce6c56e8cec75bfdabab4dd013def7f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 11 Aug 2025 13:58:24 +0300 Subject: [PATCH 006/140] 
kv-cache : fix seq_rm with seq_id == -1 (#15226) * kv-cache : fix seq_rm with seq_id == -1 ggml-ci * cont : iterate over streams ggml-ci --- src/llama-kv-cache-unified.cpp | 48 +++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/src/llama-kv-cache-unified.cpp b/src/llama-kv-cache-unified.cpp index 4b58043c5899f..88c88552aaad0 100644 --- a/src/llama-kv-cache-unified.cpp +++ b/src/llama-kv-cache-unified.cpp @@ -223,12 +223,7 @@ void llama_kv_cache_unified::clear(bool data) { } bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - GGML_ASSERT(seq_id >= 0 && (size_t) seq_id < seq_to_stream.size()); - - auto & cells = v_cells[seq_to_stream[seq_id]]; - auto & head = v_heads[seq_to_stream[seq_id]]; - - uint32_t new_head = cells.size(); + GGML_ASSERT(seq_id == -1 || (seq_id >= 0 && (size_t) seq_id < seq_to_stream.size())); if (p0 < 0) { p0 = 0; @@ -239,6 +234,11 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos } if (seq_id >= 0) { + auto & cells = v_cells[seq_to_stream[seq_id]]; + auto & head = v_heads[seq_to_stream[seq_id]]; + + uint32_t new_head = cells.size(); + for (uint32_t i = 0; i < cells.size(); ++i) { if (!cells.pos_in(i, p0, p1)) { continue; @@ -250,24 +250,36 @@ bool llama_kv_cache_unified::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos } } } + + // If we freed up a slot, set head to it so searching can start there. + if (new_head != cells.size() && new_head < head) { + head = new_head; + } } else { // match any sequence - for (uint32_t i = 0; i < cells.size(); ++i) { - if (!cells.pos_in(i, p0, p1)) { - continue; - } + for (uint32_t s = 0; s < n_stream; ++s) { + auto & cells = v_cells[s]; + auto & head = v_heads[s]; - cells.rm(i); + uint32_t new_head = cells.size(); - if (new_head == cells.size()) { - new_head = i; + for (uint32_t i = 0; i < cells.size(); ++i) { + if (!cells.pos_in(i, p0, p1)) { + continue; + } + + cells.rm(i); + + if (new_head == cells.size()) { + new_head = i; + } } - } - } - // If we freed up a slot, set head to it so searching can start there. - if (new_head != cells.size() && new_head < head) { - head = new_head; + // If we freed up a slot, set head to it so searching can start there. + if (new_head != cells.size() && new_head < head) { + head = new_head; + } + } } return true; From 27093afe78912494073eb043fec93a007e49653c Mon Sep 17 00:00:00 2001 From: Zagaj Date: Mon, 11 Aug 2025 14:27:54 +0200 Subject: [PATCH 007/140] readme : update infra list (#15234) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 954fff83dac08..96e30050d3b8b 100644 --- a/README.md +++ b/README.md @@ -240,7 +240,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
Infrastructure -- [Paddler](https://github.com/distantmagic/paddler) - Stateful load balancer custom-tailored for llama.cpp +- [Paddler](https://github.com/intentee/paddler) - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure - [GPUStack](https://github.com/gpustack/gpustack) - Manage GPU clusters for running LLMs - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server From 53d0a1265826f40c5dbc01d06aeab9e14fcbd69b Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 11 Aug 2025 14:48:41 +0200 Subject: [PATCH 008/140] server : allow specifying reasoning_format in HTTP request (#15238) --- common/arg.cpp | 6 +----- common/chat.cpp | 13 +++++++++++++ common/chat.h | 1 + tools/server/README.md | 6 ++++++ tools/server/public/index.html.gz | Bin 1914076 -> 1914095 bytes tools/server/server.cpp | 8 ++++++-- tools/server/webui/src/utils/app.context.tsx | 1 + 7 files changed, 28 insertions(+), 7 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0f01bb31454a4..3d18aaa171ce4 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2949,11 +2949,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n" "(default: auto)", [](common_params & params, const std::string & value) { - /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; } - else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; } - else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; } - else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; } - else { throw std::invalid_argument("invalid value"); } + params.reasoning_format = common_reasoning_format_from_name(value); } ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK")); add_opt(common_arg( diff --git a/common/chat.cpp b/common/chat.cpp index 316bd24170c9e..92fbbbe111f01 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -625,6 +625,19 @@ const char * common_reasoning_format_name(common_reasoning_format format) { } } +common_reasoning_format common_reasoning_format_from_name(const std::string & format) { + if (format == "none") { + return COMMON_REASONING_FORMAT_NONE; + } else if (format == "auto") { + return COMMON_REASONING_FORMAT_AUTO; + } else if (format == "deepseek") { + return COMMON_REASONING_FORMAT_DEEPSEEK; + } else if (format == "deepseek-legacy") { + return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; + } + throw std::runtime_error("Unknown reasoning format: " + format); +} + static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) { std::string arguments; if (builder.is_partial()) { diff --git a/common/chat.h b/common/chat.h index eb628d8bc275d..c4d6b2e85ea2a 100644 --- a/common/chat.h +++ b/common/chat.h @@ -191,6 +191,7 @@ std::string common_chat_format_example( const char* common_chat_format_name(common_chat_format format); const char* common_reasoning_format_name(common_reasoning_format format); +common_reasoning_format common_reasoning_format_from_name(const std::string & format); common_chat_msg common_chat_parse(const std::string & input, bool 
is_partial, const common_chat_syntax & syntax);
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
 
diff --git a/tools/server/README.md b/tools/server/README.md
index 87cef75730afb..af9264ddd38e4 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1132,6 +1132,12 @@ The `response_format` parameter supports both plain JSON output (e.g. `{"type":
 
 `chat_template_kwargs`: Allows sending additional parameters to the json templating system. For example: `{"enable_thinking": false}`
 
+`reasoning_format`: The reasoning format to use when parsing the generated output. If set to `none`, the raw generated text is returned.
+
+`thinking_forced_open`: Force a reasoning model to always output its reasoning. Only works with certain models.
+
+`parse_tool_calls`: Whether to parse tool calls from the generated output.
+
 *Examples:* You can use either Python `openai` library with appropriate checkpoints:
 
diff --git a/tools/server/public/index.html.gz b/tools/server/public/index.html.gz
index 4e25eef1760ca1b5e4a2ee89004c0387182073be..a472714efb74e89e79cc02d65f0349f279417a82 100644
GIT binary patch
delta 1641789
[base85-encoded binary delta omitted]
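
As an illustration of the per-request fields documented in the README
hunk above, a request body might look like this (a hypothetical example;
the field values shown are assumptions, not server defaults):

    {
      "messages": [
        {"role": "user", "content": "Why is the sky blue?"}
      ],
      "reasoning_format": "deepseek",
      "thinking_forced_open": false,
      "parse_tool_calls": true
    }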

6(nUYq!y%ESa>xoZGMuUIGjYG=g;@wzOl> z>X72MeetUDmuBsas=VW2S8CVSYyV@n_WS2jZCl*qO{lH&Lv22LkHbQk+$82zEn1k* zM*F0YTiQpy-8Fyk!{rVya(+|m#N$40VRzwVy7CH#(+|(T=x67gJ%7Y$r+OPnvN}1) z8xG`Rt;K+uv8gSz07d)k=#b-!C-6Gr=O?v-e^kHOX}!T&h(*8&>p0DcCujE%DeN2W zV8)@bVl+biU%7hqVcYiPluf!;lfd)HJlbe?hqK_|hx&i~>I^gbx9_gjHX56?m#;xD zCiUXJ@nlO5_5I72v9wflSfK;ALerUXtf_1qOUn!M$bb4j5~#Bs_T5LD7geMyn0BK+ zv(8Xx*}CK1MXlOE%qY3H=oH(I?K-+hkzX`&Jbipe9_Q=?hPE~GoknZ6jpKHWGxbO! zG>Zs>sY-vI+avD*1MW-A&~{(czO-HUHgy%?iQmHqJ9nPkd3N*lo%_Gsxqs)$i?E4D zt>A=(&Fm_Z^>8?|spFyE--8g|#()iFr^G(B8%J`9RXibc3$!LFO9s9~_N%jb`S^m8kx^XKHpGFB=EEJH{*lI-POf6Ovt% z3G;sj6N4^ewOOc0vGUNk&ZWvyhw}zsGKA1;xPuHQvN6kx9T=WunRKt_di^oSlefp` z)kFBM2-wLH=WPLT%wcsVA%_x59vm<7GVi|AkKIdx9|GGPO1ax;Ne~{Fq{bTi8D>)o zm+*IY>wCJW^$2=b?<|z1-9tP;$yJ2XGZcRwhGV2~x?2s^&dzc>P%Qe|Ycqr{xj7fB z7bgZaPe{m4)Um&dq|I5d&H4AIjl0!jw-Y?A)i!ib60+?+-J|Iy%a>JiUDv0C`d&Qj z*_5eTWKhSk5}yf0U=h4zEtrhub{GcxitqmJV|2Vt8<4Xi?RKRMoHSvLnkSMK7bOo#v;r zUERumfO8c2;zARqBn%YWRY-IX6sqcM#!3~+oYd;DQfT*C^;WUnXW7?U68@@&_LK*2 zY_&+ovrv}WrCEI(iCG*8I~WE^!&DUsR=d%bVwE2c+1fEJ1-fj6E2F-)461*hUk6fj zy50`fI2|e0#~CRea0Gi^XryArMt%yLWxmo(dE{&DhR{AP5ZcEx)rqBv_s}Xz_~|K# z+p4qApJz?|H2t`GHxR|A;jz7}NTQwNmdjmC0Js^mzEmG`{o}fhS9_%Bs5sc>L&<~2 zV=nF7bE=(pZKlq@*8nOr46T3Zr&`rHQu-VYDcyncnCrkIO)*k)>}gv_+&wn>;y!n1 z`-8YO)?Xn51c*mSW3ccn2d>QxwWqO_P`2D+Lw1}6>QbH z*bz$cZAXw1`l4DjO#zTdTrGfdQaSVAk&j(TBsDg=2<6yuL z8yOeBK*E-zE|wIKn;Lah?@*6WuXUcJzHhE>uH8sB;Z)sJ#t7D4d+pWE)y(djCP{sH zwK3UQY$VHIkMLtiC$py2cDGQRAebQfq?<~FQ0o!at}}n~Nj7L^Lwqi?RbIjF^IJ;G zw^NEA(iT2fp72wlEW;w|a?}unM(V<^uy^K)RYmIJU{eqp!~_^WNl#ThGR{6@RF}Lm zp8$yiJ{H#`1CZ~e2D$wtIXtv0+I^agk6_7SVl#!LBb-~gFJOo$dCe0He8QLR!MuW5 zE=q6(VE2FYR};0PL%|7!AH&4TrcUL9k=fuJrRXDWfz-lsLoEZ{fark@Mg7+#e5fq| zQ&2EIk~JP5mxS?2d$<;ga2+zFq`ll zdL)y9SqcVA%D5n78&q5WeTXU&2E3MprC1-oR%vc&5?kOARF;_{d^CM zKrAB)qn#Q$JPJBaEd{gN(1ZdUxsVr?+G7)uyHUH{19%2XTLr@j{XsZj@pf3djWoFV z^DXkJx4~q1V)l8mo|NF;Tdg(iCWQpoPMa$#}=z z(F>JbiuwTQf0VgbP5(H}Pw1qY@~@V#go4auHCS_;iAm8+-!GUZ)Hr`M?MCTd{MLVK zc3@KTA2Y!c$gO*;4Qy1jY3X5M2HX#~qt)rf6x)X3uP=CHpCzyh3GxXsd% z#qSHJdS5u5>B2$1wbwz6s&!%4I1hZ5XbF7yj=P^BDM(OI0uci-k;raa`EDIB zd`hGXTRqp%O~@)ahUpeecx0Co!>4~&#_-8!+tvNq&Dj|9oGRSx;44DLU&uq7sK@-Q z7ia&-9iRGsM;E#=7a#mr2B5}dhs;2&)$oBEyP|gmWCr63M@TMJq471l% zI;UJq_D{g%iN|qXzt7s1K zhLBP~Z@|KGZO-KB6tD89r(D~CzA}{xhHzO8rFjcq883S@ z&u~i5s>RLQsCJiX_m`cdhw`J_LEP$JVBo{QE32>so7?;+?XQR@@d;sj%=PC6UC4fV zjV|0r4|Yy@d$?Yge@6Zf+u0uTkw*-BNee;vSr~E~dOoGJP|rf)+n0aPt=xtQqqWBi z0xGZ?xRU|KZT|Ru1I3kRkwxfEohFJy-*90W=?fU@!0_>}Tc|zgm5c5pf8&@x!8@nu z<4e1su_te_LKq19?7!)S{9EBdri7)arnkmQPz){c|G*7$D-_sTJPphTerx*08>gFq zr?wIFyB`YB$lrWkEi8Y$(#*O#nd+|&S0Pr2r<0kfjIw3H$a7FTRRkMGLzA&jm49cv z(yVaQGd{UKpQ51y80|nm80PX+=Cb+VE~xEP9q8p1;7C5TM{*;nd{LJL7SO;y(M>+C z-`^KeL|I*^gMx9ZW5cx)#X6;JZznl`HjS0&l@S;46v4IQ?EruJAY*)#wfV5Fuej9> z6pAjfjw_ z7SdjPw}nl%ET(_Gs6Bit8@&Ab$WA1bcJ5Gn9Zw1$ueU~38Ei~;l(mBXSv+`|DHQfV z_wDWCY4Qr1Q|qZW4ni!bg9C6bfMtD43uoliw}%+C$djLveeKpWE-W&?Az`kNk&|@_ z5Z3R{iJH149$-PwZEtq&X%%YK|v8mRMFd>Um23KauUr*dioG+4Cpw5iIAz0oU*$~{SDW!a6|I65-vI~aiw|=*>KS%=!w2Ry}bTV zObVEvh*E#e;gjLO*Q)qXcNxXl>)>@RQGmehm0Y3tCVz)wZgq!xSi4t;o`x4u-3%uA zTb`TP7W09uCfq6AmjC<|%BmJSF=MYB;B_QG&X+C(i;_$CN;WClE1?|H=A&#~(O#*> zf6P8AGlXv;^SN}0ZbYJ_8`6KLnwF4BPQ(%TE00N6N6`gI;HkB2QD>{(I2=-KB{IjY(pyo$Z4iwqRc1 zUfQiwlzp*XQ|7byrTr|JS_oc6Ft*V4QMRLu=}2gVNT=vtzGMV+UhSfa@~>fhznYAf z;kSRF)SIXunot}p(h9iW!Lh*sNz`c;NjM|bD*D%{-U+q>;E~1D6nK%@4rsEIaq1Mb zPL?H-W}kwy8!|$L+X0|GdnCW0YCtOe88V=GO56IYa%&W)nRRH2 z3|6Y4ZuL75cxk<#(JcD0q;)( zQ{|jcR3gP#(FPY6X%Ab1)Qd7o?Kyc#wUTJKm(b9;mI?U|@&!wkE8@^tm@;~m z;hyQgva4e$eBkWN1AM8(TC@127A&Fp35W+Hcw<6?e(PWG-CwE*mMKES$ilv5H0*?n 
zV(?UEG4_bF>qIF*S~q@>{i5y@utR^Q;d8UxbxF%(Q@Z*X8vqC5mU)6XrmJRdv#yMr zZu|v|2{tEZBa8{OQtA^yzq>N#G|_#bU9XoqJGuM9j2>GH9U{%7rEh zL!#SVI&O5_WavA1WSSw@t${fo1dRxx@lExsqA_INby|Cg!5Z+60U-|q&M<#|2nU6w z$@pyn+fX&ZmX&KmCn=kQ;6}f&@WaPItY{3;+g&@aWPVF1p|os^U(TKz`mnf+HjvQG z3NW8a5x+P4(;F2N^$dl07xY{i8LGCG+WaN;;u}|id|H1MeuarDFy##3(ZsD%*|!_ zIu*-wMfEDy1euhbv7nH-!UfL&cxaqGo(&4FU_<+|t1Nc;0{Ti|3Sd;1fgso^!wQJ) zo9(g$V~P{xFW%RJy>l-ZYn!GBM8HNBBS@!z@p$S9dK#TlVZc%BYXEQyZ*GI_r#qb-mUscz`aNRnnog z?7#zg;HbbG%h|vzl}^(GF4o{6NH~bH_0EqCJQAki_C&4ljB$TySK2gYo4_skr&Z?| zfzV?9RK#Vl3ICQr`hs;QVu?c^evUM8B(S35*hKhDyFgkjVB(fJ-1lJ9iKRZp)Nf<$ z9ALk(9@4eL1B;U}b?e%+4DK0hDaIQwfZ-`y4RDI5QKMi8>{PPGQ45w>Q*pwAd8u~Q zf@x?|yJJmVt5JVm`Qi}FIpe}O`jZ<(=^7$3i}BOxeWx-lA4_A|DMrrpO1O3mBesX( z1B5iO8{*(_m~qoU3VGLfNl3P^hHM_25MBp2G>pDWq)9fD(ENki5F~nThW8+z66cdN z4X$x6#dg8$=#ErKDAPKo;~GU&Q`9Li(L+Qhw0$3P2eE%Nz5@VrvlhIDBR!FL34 zB`v)427=p&zvUhR-R*=MzQWPjOIDK|aByg1E1aA{zB1-+OWvqX4!4(DX55cz_9s@> z_EOFMT()oVrISQjS=M;BhP$AVS9N!TysEzoYdgFvk{Ne)l`N~ft0Wn6v?fZ<(Z}0> zrc-33RN#M}?~fmB<+1M2l4e$U=stkb*nUM>{7;gum7CMX4XqTlxGnHZZrrH904Jsqzu?xuGn$yvyGt z`Yn+AW<|c%9@7iVy;G>zHhQe2`4VAx&3&W34Lg6)_Yx+v5^JCU+ns!S=)ert5fKI> zqrs;h@;zKm14J*JXlDYDz|F)>YeNO~TigjtR&2V1mcoKM)liTHH}-v$AVUs+bp0$_9Q!|G`hzEP-CgO!wl=I_3c#Kl5gjx*p_5x2Oq?x zz@nb5)kY7#gxkC+#4Y0N!W9-xo!=w-E9?XuW$9(_!DL}UiW3S{&FMiJd0>X&-b(h} z^0IqZKz7DIVdhYsr?IzvJX9>fb+K@$P&a=EM=jFxXpy?*3H%0UM>?RBh(CM7O4Lpg zWh*f6UJx(LUa_mh;Nki@u-9TQj&<}&{?gABn}__qhu_%dDj!ZdEo^*Sh(p5|ZF`d; zBe@bS5@7V%{mQt1n6f+xm?bs8>0zmXs*MN8>{CR!Rzx{6HksF0p>!0P;Wtt=RI-2e zrf~WmEG=Re1jmi6+lOj`uh@FrnDNz4Ls%PBgot-X zz;VjZpg|riGF%=8TX`-Vfab|x5DWjBr8t`uhk8DnHT&7U6@+Pq+Jo=4CMT{B79#V@ zEZ&YmUK5(RYRySe;g%FzyfLIqw%UIqoK={~;Wg+jG^XIzGKp_3HzvVe+y_2JeK7Oj zuRXTC1q3I9p$QJn=BFUBMvCyjipmSCh<7%8jOE1E=!;U$iq5%GS?%0Ou zjdQ?1k~otzctkNW(g1a<>#;TgZug*?5nm&i50F}%UVa3A#pvaQyEgH31KWS{2`o*s zgrym1mgZCO^;kp`-kvvP*B*+#@2Ul~<|uPYJYP4k8;CK4i(Q-e&F;IM=10MD&ZJm_ z>|DAdic;srP7O$xIiv9zmuCSxFg)x^!dUOJeHs79TrX$4!xiL{eL}ZY`#j=ifm16`G{CFvggedgRIn0a)VW20!GG?># zm_KmWGJhT6s>$RQv*mJZ0Xa?&z68jwo))=iv6u{;jwpw~?Y0VtK0*9?^xf!>q5)UlnUFl~$UkJ~~ z*l}tSx`JS@7|&nFu*7nrp`H>Ao#+nrw16iVGtmgV|2s1;lITUt8fDO?olGpe!MmH0n$t#N(A$SohrPvw{=psVBP|y~UfbPT- zj&B(7zNt)4jd~)9rkl`-SX2tbc-%@`d(}pmGuRR`9Pmy|I zH3MWJwd=~57FVXuwZc+bQc9>9i>$b2SzpYHeFr&Y-x*}~orrPC&=oIs_*@^2MQv3h zVr`A)jDS8s;{r{GOwluj)V9tHXWRnXnwpK|<8BS{99jq`fRIOw`!Mc&lHD9|HZA_O zAtm_dGiaPYbp(G48*>^Pl|8>dS-NI~e)I3~*-^&wosqZ2pmQPUIhkY)}y`y6s~uZ;7el0&64{fDC`IDGaP(|44j|q|$G)6a9f{ z1AfXFARdkz0}~ntKlnhLSbjBXH_twu$PW|C8NVioe|09&ytC20qtM>`%E$Y*=P=@Z zhe#U}2KM;4gGq*43W%j^>e=8=J(b_1?B>q zkAHph>tV&mrb3B`@vNZgAT-LhlGj^gvIcl@eJA4-eT%C+P)+`4V}H()px$9K&Dmo} z!u)@wj2dUq6CcYA9W?Kgk8No7qmT~>#78Xvnhb9!`=)EqiNbFcfr}DJT#*6)~phK_W0ra398?e4nMd*X}(b>R3+ZJ%WANW7E9Po}M^7WUTG$>`C+3 z=wY6a1Lk{73IF`8B?!60U+gA^yUBm=WBPu|{+AuAvajJ(D%)k$2y8!*+o`^ln98OQ zl#NjzD~nu03obAvqGyg(k?8qAIrtvTnto}R;@mPlGY5g%LGu)lpBni{ZK8QP&z-ZC zO7hOQ|LQN_$(Qz64JcMPJ&VuGTl%bI=zR2#Z*)AjC--`$sb*tEZ8LEs*5%~Zbp|XFALC!z)y%glNG9;^O zIv|JJ=uEk0P!WBrZYYS$_u)FoAca=ia1g|n7+oV>^9o`Y%tiuLvvVS=96uHVkim$M6-Mtzw0v0yXV7b;GTaQW65Osumi(* z={gQ(4pY2wqC9W{ewYr5hbgvyiA&Fw2bPB|(457LVVeK>UO{&*FQbBd`}d05zZY>G zvqmSkW^n2ErN1sNJw_!d=_{+%oI}V~v{-OGkLUC>N1Vc=6n*$A`jAiR2rwOa)@P|$~M#7Fx*wYuur%JQn{gfo~o%NO*=YzSx z$;qe$=-XyPZ83#HIIlurFC^MH9LzGwJ5&?pQrTq{kFuzw`n_~@!@e=#{|w0tXMjuq zW)*h~UWvxd%g177&}emEC{+2I;HO3fVFse@LPJT!eOiCT!$3hoMJc5Td!-V5Um5)E z1B_`<<=<`V+Ex%)rub7-;lHIrl!+!vcLEd4Ctr?kl%9s7^1%W#n%u`Gob~`PMUP7Q z5eA>Ukh6Nx4=5+55&0RU7gi0CU|rZrV=mCmI&%6_#Uk!c*unwB6pf7HjhJsTVM4lx z-Y&x_>@2%O@KVS*HWt#X>`^Ir6MKJ>|GslQ~jTAA=CO|9&@y7GKiEf_SkB7=qr 
z%@&}oK#YxnkMse+J2;CXZU*#Y_Eq@4XJUBR^c%(yVX60$=JH(=M?@7aMieX?Dm?Z= z{(!FCD`}|z!j;!#^xj-C>>~2YQX7S(Mss;((>s5IXZ3zB?o;xxCJvfic)_?&f!*Xc zn({oJ+%uUx?N={G!Tm~0 z=W<0hZJ{b$z5stCHihuigZ=v`!XV<4#25OLoI&^oN_r1I4;Wa7Z+t2kUMp;wVcq_(lyRm zT1L;gzQtCSxi)F|BGvp}LDnn^k@4;MXY`VN;jYlcnhwE=uP za&Ph>173R(vTX2(^Kmvdr=w;k{(7fEF{*%xnlt$GTZk+^mrTXk(@Ihcx~^lQ=krBT zlwzx9Bj=rSAZAN1J`i1t1tf8nB{lubX84#DaFyWXLCfMo?Lu4?%$3|yt}#`Q^;A9n zMi*!`V&--<|6ZXLFy+H&ON-_##+!frrt!;Q#Xk2~u^L|_1Ow7TbZYrLw{e6w*pjZT z(J?M$*9_ZlTlwOJYb7iZdpJ5jiS@dU@}nENo3&vEhGvnQP61|ZT)x3wEe3||)Fy)L zS`O?g(y*d(BO7Aky7nK|po&LS#Rv^n=^L}nB9?0wwN^vG`sx*2EHM|gU2uO{lOX5; z5^bT8GSpE9%+>{0fh~JUbf&k&i(qu7zjv*bC-Fpd+!|!qEHT`Cj(1xn?|#!W{j6}+ zLhd*V1Hc4%04`3aEc`;Jz>S}ocolgv*T!%#{rnCAGl=R04Kccg_$aj*R?h z{I)+hgSjx|V`j{D?UTSDZ99KBsF14;ZY)y^412MoSYO}bm+6OXI`l{|{*p!(rq{Op z2DhDyYXy}2eEUL7M!zvcXjaCJ%#X&J2jO_}0yop(?#1p)xMORgX7jJlFF*Ijp(@kG zEQ*a*=zJz4wXm?X)O@fKxur{hyEN0N=7Uw*&6Es#eb$snf2_0bz59P^`DfcL(E&5= zaz&ow{=?bcc)4694J>OoTI?-=ppMsOgzywny86vsxU6M@;DlD^N*AB5W>>(la2$$1 zDAc7uM40Uxr%fOtVCCMCZzr! z@CVC9&HoY)w@dx=nOpnz(dAkj(HyX3kaMlgNBe9b&X>6?!n+Cz zgtutl87WhH2EA0OyC?cG__9O~C2psY1}dlU32LAs)JrIB+E$L8%|i#P!Pe%=myv>Avr+01~kKXB%*bV?;nYXnjiZU2@Dh_<4{&;dzAO>(m z2K6ed8v*zg2?EGLLJ*$1d*$&2X5{gXUL|<~B{jlA3R!<*LW(pdvm8d`mZ@%@5jcG` zk33xj%6?)4iIm4XDK8-M9-YhgMsZ34(>beD#dx;Nf#iWbz<3I$T+ZzlMo}pm1jRbL z@EuyS|48pBk7WCH$iTu{a$`}Zr;r5u$C3#7#BqCSj(Y4wne%tXH^{8=?E*LO6O7sqg0ep+t8vJ)h2qqFHkdL_&LBr6NT_MNyJ)mClH7mL zruog8(-J$;zXoheiUp6?p&d*X)nV?|KJf6j_3ib>$G3-bt=V1(du&{>7JpE`ePM{wPy8#mxAVY^G0?2K`tkQ!r*qvZyX83m-bSs1O;UDu z!8m2CR0lvke9*|6)J$s)fSQVwAi%SNHTA!p?gm?`O>y{G{B2y02&Wg>yQ3=SGodZP=(DLU8C*!Ic{Mr@_=VoooZqEImqGNwkX(_0Pkyuq-Fo*a2S z688ktgHkRaqi{ApNCmg%+Gg%=@xTEPZt!5$pjz4UPNB!VIRln~zU6%|aY%S`{|bL+ z(I(6s3?{OEvD8dpPQe`htvoKlumsg0)j=i$Rir!V^=*oISRN#U&!2I2+EnwfJO~#` zNyiOR{>qJR;_jTTLi3R@OQDQ?$=98{-|)6#hbN@ zd#fpU*?V{8-!=L77x{Oc|E}GUe^-CxU&*;9IoIyVzmk9LzWn<$|GmGOzFS9sQSkao zI)XU5i=T%u@k#&wJvjua^6%YM{EIH*)!nspvYVh05L%-8!^TeX;>A&N|4!Q2P42Ix zcwS%Q%k}%|$xgDe%0E|0ho7RI>&g0R`UW+Qq?)y(^f{}!XaC-nzjyDZuXleFlN~kv zwawxWKc6NCyGIF{`FWS69G>kaZ`P4;7kV#NihQW_6=ME1Q8+i+^bl#k?jBtwpM1)_ zRpdp=z^CBhY_n!(?DJW>4PMwj87=k8v%#=^Hl8kCeE2Xt8;l1ZKBU%zl($_E?lnq3 zZ94QCdl5?8n~CL1(*b@CKH-0S=#SI!X2Ppa+Bbh)?(Sw1jI+|_CKdx%blTZWco@WmW*w3HIdMF8!nw&`@52*iBau(0=ibEPy@$ z?-f8rV-aTYLE8yixh8)N<&$$Do;MGj%OMsb_FS(%r*hY%We&dF+8IN&dRCX1UuQ>-p=o@fU<)T99Dxo^ z{Mu2rna4L{xO-e)BJzbL<2P8NypACkej4l<5VBt+UK(SLWRTv3)~qHO{uxNoeo-7$ zPcg_PVGV3mm_1$<3QLzG9`wz^PgIF-UGuT5<(Ie-M=pO;HUnN^LqYsF#WTp#G!@Z2@|*R!+c^vBLf{vvm^4kupxff0h;*l1 zfeT6s-(kF0p%B0*1VgHh;?g?^1K~NrXXDyA@A=%uaCo#2&k?O=vmGZIljN`68bjj6 zO?!r>J;Q&IfN*)wOt8FMmQ6TwZ?@CTcKvX(%`9Fdj>doOR@oZuejGLY^|;nI;96X0oFMj#l*)E@7me7&`#adneB)i?7Erj__U6ET|*>>IJrOeQ4gMy}Iot7M?#vd2ofS;=19_Kd!Ni z^d3nex6f%U2vIH0OkLg^z=Hc+JXw5ts3j89agHsSV<+YE$KDXOE9@Y6%1$>tu#g%H zY!QFk+mJN(0Ps}qxa1>`LOZe?zCdyC!lLjVBdB&r8HF5$adV5Stnq-YVT<6gshtT? 
zq#PNH@dYaqcMLorU+&M44z_VQf7Myy)4(LE5^Z z(kc!jO!mCa_jJ(dFIIGTloO0!5T=BPs-tv!eev*YlPAI=_STE5Dr9tE#76iE!Z92K z!eWp$OFN6_U^uJ=GLI3@mUR}t;*EcBW}U@XyO6*_89_F}@nMS}kSMHJXYsGys=r|< z<6&*i8;LK2HXA${h>~&5)#~^tU6bj*E#ZU$^q%AYhdZ^O4&UE-KYrhTe|GfKvEm{& z?Jc*9OC7xN5o=oX;)Qmd;;QCGkzvAwrY3Z#fZ57aND#{mabI&32LBbuc;bJcVSqsK zQEZb1gUAVQ%WFHo){Dq5ZY}NdmFd5KCi%Qv&aBk+7mjn*cu&1f}$a&3Auo?JjESdhwdPUr&WPdNqln`l;Q4hUKSw)&FuYaW$_( zgb3K!9wHLj6c`bx5>`SL%K-TWtz=edpG1S?s|Lf1d1of>_lSQO`iFn@Zl7>++Fv5s zrr_KW>t^kz_wN@EZ!iCpF8|lV<$ql)|M#Qn<-+2>E!NZbt&5f9=c~)Fhwod><x;w9bFo?Ll8?UScQL^HGll_i(5+v>x{Q&M9*ZGkhQ*YKV5;lr8A^^2lWR;#dak2 z)*%%6@uWH0{nB@`q|mQ@-(c5Pv+BTpxm8CwcnvXGJyrd_i3%36V_-jQW3k!&3kp*c z|Bl;%lnQeW{HHFwYX3wCP(YjZM+AKpp~$CZpW&?_K>2OZMM{4Kf*Q2CL-bX^upYAt zi}EUxTBXY+2}j|0YCsg?KgnHngkEHi?2@b zmEhyPB0eq>aJPrRC<^}0k9^W==jkA+n8z(XwD2V^{whKKecJy#NSoQz-vOy>Q=Q3( zd=A?rN#tgSgx!C(PN0W2tE8mgSB)b?j2PvGH*~h!Irb(13^{AXb$_|KU zYS1yg23voOKIIo+YY{LT@$&m%Rv-R^wZ>7KwbGuIthBw7l?I8lTWKshyUtp|M&{R< zO?T_8B?6^rm3;_{48Dx^t?HUtVJ&dZZiRsX_s-`j#5q9;NlB`uGe~^G>SK%bT651~ zoiQb>GCpP36@&xj3+s8|!s-gcD#*1_=>prmPlb^!V;_Zp zV}%Oi@Dy4+5xPM0z-OGMgFXP(Q^q&ofrubC+$<)E;Q&2A!oOS1Mnr{2&ya2#o*;sy z!MYy_Z)jP6!AI=}PlJFM_n4>S!$(;qba0=zyexjN92G(tk!|3X#piegOFR`F{dKp- zHzMUk!#emjdEHHrOG31owq~0dz8xR>x{fyXtYay@k6Q`B&g-trY10t+nt5Z<)kQ`- zV?W86vtA|pxG(qu!DxP7+_!=;nIjjb@{Ve2l;N;{C_BwS(R!2!i;JDAY2ki5u_icp zbB8iQjLRvY7X~BQMTAPA)P8k_vGeuT(YQOoj4{0sy@%Jjc|{x4T%wT9f9)SFu!N|s z%S*Y!ocXzeNZeok{H%fIbvnlm{lHcBbnMp3sgn8$DV##GX0?kT;8R4fHq~G(YPVOC zVLzFF_LH~$!2{hs=Rc1Kx&4+r-`hUc29$N4;ma24;&=Hr z2ltjZ&HALoiG^S1?2A)ly5X2O-x!j-v^Cd%moUNwQTHz|F@Hw&6FxK%t<}`zx0TDw z{9EtSWw>~2O!S4LCe$$XlkugmtXqz|B#AG5QNkT_wG~IBFMDOJ_E;yfs4fH{)s)kR zN~S72Pp$-^Venh3p2&X`&HvK(<}$Ynn(D_ieidl-niq%az|00=^i9X=dz`^qvDybyQFphQvXqJ7p~ zy%?CYM2sb$!X*aV3jaTatOZ9&c!xwKHEcreCx(@B4>9Mr9i_ms>#Iv-Eg81&P{yGu z&YyXZjhG@#4Y(uW@)g(W)6|)1G@nw16AfZOG3Es)BRZL%C2fVti zoG<{x-{{&9zqhLW@yXfX%PmS)e>*$F4q4f3_Tlh;OZ=T~{j0KMt6Hl3>(*jrX{X@; zfk??do)#X&=U}N)O`%)+_y6&KzecxM3WQGxez$>}jhg{k;pS?=)vG-g{c!*K&e(6O z(TV&LE&1>s3#sgop?CVnTb;8JAPUrp$0ehzQ8=HJ(Ou_LH&48v%%arQqYXz#H5(sX zbf`}u95cCNpC=M61b2pT=e9pblvA(tdukhIF7=?Ec`d~~5-r6ka}45tDX_2k))ISQ zr=t=NqUoV-gEG#iVrB*t(TEIT905azqLEY})sUqc#>Dbv>%Z7Id|VHpY=k6Sac|A-FZa)M}G zXN*`{Lax;q(Ow(D{KKw)OVF!`be{dwA}43O$Z1QL{D93Ufp;pD(rORKC%K(EH!_Zk zabJMPP)qeRS_Y9|VXr$m8&dLMu65Bi_3CCV_H~!Fky5LdP-~98`fH`WlCDOE5Z7*K zmil3Y5L!E*{`{dXG6x)Y`)%^PiqXBjg)IWizcbJ`<MFcs7!M3`8(c#N_6Wa#Fb5 z2XJ=gZx6iHDAR+ILAM96Tc=B9!wmAAf55?z<)+ja=A}7);v{`)zj z1{`4`fA=zhcKqFc%M>G=xXkNPQ>X7G>sf-7_g)gqps1IZmC*^Xk3LPtl?`_rtW|so z6<)BTFL=JR+F0amw2YgwdXin=u&^O9oJ1<8990`7jmqssz;6EkqoEO|3-+4)zBO;t z#{Je`8lVsd1CzfvAhGO@0I2EfPl{g%H@VW3O(YEqC{NXY0LZQ>I84a7xx)zA+;519 z%w!;~i6|3V`1t&LUru1f`rU6(Ffi5IrXG+ii9CsN#^l7s3VVFiMvXjMfJsm_~@M{Rej0o7|`;<=ZhxmO)CMi-gm-8Sv|tEFV@ z6~kb*xqn&@qXKuwgS4_Vx5iBnq4QQYVlN$2Sqr`t#&0+K4R7T@asV@KJb=hVn_t$m zs-iyM%~jdyQ`Aegv{Ljzzn}d;L0=?c+2Jdz-<7I=Wj8TDF(`47Te%{Gtxj>LPjX*) zJ09U!-~d+qVLvxoB0kowd6~;%Vm}v*#m8JTESP()w+S|Uc~)OP&3$dcNasH;?#nZy z-p!OixIU)^xBE|4e_^G|#y!l4oRQNCdrK;7lvHLU*F~kk84gV_o@~WMQrsIAs7$bm zKY7)E$$nxjwMz1nt&8&u(^oW?U-MZb_(pBrid`wd3W$ml_RY#Km$(PwyJgjtU29CP zihyQ6ww?B-(mAy=Ovz~~9s7p#5hs&Hz4hS|N&K3ntFj;z?PcVtcZ6nyJRG@K(1xdzw#eYE)dR|XjgD?H;N6_ymt|Q}c z;f{)K+vtVH&Epq>LZ#14csSESwbLHa@ZE0`U6__Ohj3z-l^{5N0a*#d!bKqnYsfJi zcBiymk_06jxOQ`(fGcSRjcL|45kDNb>WUD*r60-9sfn@6n zGtb+BaeRR#`iKp!9Rk6)e>}nYwM?qx1MG_oTt;c<%w0ouudjMn(C9Xr9Q=uSGm)q* zB!$|ATvSXdtVF_yz+e&4XQCmIq|#R52r3`*4qW8pvX#XBx|0glc?NMAs=6tEMLoT4 zM>^9IJky<4`+k!;3E`JMvJXnC`xjql?is1W$UD}OuDm<7Fq&9{_E`! 
z{jmI3r?&qQG#N*5hwXaz$IliiTdpw18DbXzV1_>70HU&N4kmO5J8r|@xVk~ea_dB~ zNc53xQO4Sz(Rj0O%qa}_D8R6P!0>1Zz9_j$up?rRgkM`6^`~IY5V1*5*$GA|t^8YM z2|N4eJ{BCnWLl{_;0>Rb76%Vk>J|J1>p;|7Oq#wIjt9!+86 z?wYr3f&mdLV%BQ3Qvxp~tja(saQTn1PYU`onheXASgUAXgJWh)e;SQ{?8|j&|Ac#! z-0{En68LJHKukeT)F_H}t^MQn8N3jmfDy>G@r-do%y`*&L+sGojZZ01{zpU`sb3TO%uEX0zxUqG|BTd;@phfu#o zjkAJPj?c#du(q9a%a57E#e(rBi~}MQ#e`Smz+{b^qd%7^Ia`JeY!6p74(dQjfCdj- zC%puS!#{upLkFmHAUZg4nhBs4gKNK}nJzGSNXrK6%3s1fB5dLOX=Id49I8%s)AQp< z+yF@hwu`%e{q_l!;16lW)I^J01x-cT&6+y@LfY*_Q#J~d(7fqo6f4DthP^EamPTIW zk@4vUJxrvrk~xWtS|Mt*kV zTAp2hw}Vc+|DAjJHdDVpD1jO8&`@w_n)4A7(F~M(zMaq&G@)3TEpEhOH`L%;84RrM z9MK)wvMgZ8#lt`A)HSOC9|x`Jyau$TdPqxN!Uj6j4TrL%Rr`pWK}ajI_kuJ9Pzf}( z%Gt@mArOLS2%%LM=3eB+JTd3Bx$`Xx=(pT|Rj6GLh8Wg}54WJjK5*WgGzq9t0w#N> zj_JhO9%Dig(wWGRp2w6HqbMkEk9|3S+QAWNBTVH6PBz?DfXA7r7~`!61HXATu(|r- zZyf;%gw81}8c(*WjVb+FOzkxUS7OoIr2{Fa8Qb4?uo4X>M@OQ;5G*n2 zJiNovO1M|B1p8u%&mnrdufs=H$d_q<%VIq1Q2{IwvStJE1rTY$A(0mp%W4B)2wM%X zGZxB=9r(`?9&%@+N~FbnUfJM^3j!{7Nl^%iohypD-3_Yqu#cOO1Ip?E^iLV>yqwR? zV=_I1EGaiX*`8cmka=^pPzhPS0{Cyxd8O|*6S>T!Z;0OuFxfu<)hm{M>CR?<@cMrW zu-+g}wm=Byn~61~NWb7G#GUC^p!i1O!SvXsqrL_15BdHy3F&A4wVwNr z&F^$~Q@Q7FZYI26q+0?JOn>-ifCv_t;kWk3W*gwff7^%GKpyzbf(XCF-^g+P2@(cq z_vtj5HqBoJqkt#Z>&H4(2h5~@AB-K^$LsX{eYKA@aNZyo)qZU?`QcsdZZcl0-AmeQ zwY8+XR=b~^uGfA}cGqjaB-=Yqc`|BuKjX|8i1r!+P&}r`P?9veZ)i*OXtO4yQhVr$ z1eNnHY zH>nSs*F?!1l6u~78uvl1y$DS$UU2nUrtTJc(yUte(Eeg8#IG0_9H#NU{vLF!zdvjN zAk>s_$d8X|eIPVuO8DVjXcv{^AaIGd>RLDk;+ui#wcrZ_pqe$0fn0G_Yx-y!^6BgL z@o!&;h|muc^bQsXKX1E#&!vIP^?P$z)=Hz&K}!=hG~3c{uBg2>b%;d+)@-#3$Lkc( zSEe@-G-&|)tkFm?F><}x-T~Xswz#hBsu_ILIi@367^jL!se~-Xa3bUkS%t@XB#u%l zKkIypZS@z+f>o_A6|Sb{Gj2V9blP8>k;a9^e;5D?Is0wg75627!x1x(ZML7IBh!-r zV{w%n?9{rohmT-}VCM{wr#Jw2d&FNu@D~fq@$2RluSyqxwx^@?10GDblW~Ke2b%}0 z(v%~NatFIG!N9v97Kg9)fm!dl`Cm7UsZ7@e=l+Gkx!vk=Ml;f1H;9Ml4YUtM@QrDQ ze>L6@uSnwl#Sia)!V5#d;hVMa3M1gt+bSI-=+nC@Jxo?;8zjg+U`XJYO&6+llOxry z#88E=PS^R0oj@7M?zFaeh~4W5+Z7|JyxPZolAz1bK$(3M<0{W1L3n3~7PH?WI%q%a zd7T<|qCt3u49W2ixG?*sa#6~fZV`{YX|}_*Y=@v~pN-ppAqWp=-63*gGi>nD&KE%f z?QUqpOEulAmYvFvonugNB^}gu*Au)~@PI}@U9671Z(&SV;-nkA=raJS(iP*olc+}{j%&V9zI57fB+?l1Z*0T;x};AjZT=LD}ED! 
zB)1l*pgzohtx>zY5vGYb^`1aBBfq4vKo_xB63lpbWEx<302~_#{Nfv?_QRps4|^6g z5Kx7FXJ~{2MB1O3jm&l?RqH8{icC%`t~T>wTxIQp4dS0EL@sw=`J@b9MIoL8K=iY{ z-4}$1ZaR_zd7(lk!w|vl3v)aB0XXwjAl6}H*_|$bk_Hjs2EcWctr{P4jsgPhNIJOc zcsRmoC{?flPxp3noz9<%J$ouP)HxzWz`H8@` z1^t`mI~NYnYtW1uLev$?&k&IzLRb%(V|0@1zMQj!gLWCD`<_9%O5W{oYq~fc;VuZo zsljM}G=-%S4wbMm38=08;;&7hV*ryKSMuApp@XrfQy&e+wE;kC`~C_jc&-!33pcw1 z#0pj0BjTOOgAzYKiFA(XRI2lX{IdHg(xgigvu4Ju|H%OkwR_$ zzyHtwU4a_k$(0{M)NB13zs>NY+{Of~HgzF?$`@d~r32Gf9>I8?JEXUKzz(_CMc_~v zF<3*GF%4mqGX)AzCXL_QVu?~gm(%eKY$PpYk>m36hfNWy7v&KIHXMLk!a8Sj*tZpD z&I2=daj`8=crSR-QrCXJqYk;yi9j)P*0 zFW}xv#}1Tkc4=w)pmu-`Z7b#BcB#He6oyjeo+BzQTt_~zVO(qh=GGx=?}c|*$$Ld& zg*^~{T;KKW!^XiRY`C;IwJTwoGYKPq)yX#?5?nEaC{o+RzG^;#MoIG%O(ZO6+1rkN zG(|<$W3g^#uUg+r5BF%FYo_G9);sLQj&tV=3+Dw&Af##aqNB3C+IbLuhM%g)y@iFn zjIGa}v-KgCxB6)2%qH24xXeJ9DUNyee32AOY=~`pO^xFP4ESWWI8}L3XD=3i{ZA5r zKw7SPN3I|P+gU1j?fs*C6DFz${cUvpnjA*757blaW_MI*6gXRSOdIg_zVK%)Ao0XD zhI6+64z~R=>t&ao?;ozyb0o(Dqsl0?i?)RM`)bXd7d$L@MUTQ3-WqKw69~Ku2eixa zXrJ)qaVE<;x-9YP=vM1p9!+U~pZ{jQbl~>xf+AURFgaiWQGUC1%e5xqRRSL-w+3go zOzpRtBlx#a8?A&_Qn5c1?S3Eomjoz&`d8F`UlorEf1g5kDaj2Gl9>FC6V!%sM#@ix zJp==9q?5)`&I_p!7cvBC45dcR?5et9ITJf0I6l#6)J@RF5xrTute0_r1F>LOy{pi% zjfAucj)ziVs742Tk?@|5hg8}g?W-k3P!?1Yy|u4y-(GBPp*Tj24qk)pL~X=Y!H$jC zYL8$dVf`$%N$67Ad&Y-hupon&V%#Q7QL5$*e0Ce);b6h z(ZnihzSlgHZbD_cbB@M;0e)@x8ezbpue8uV;NAp98sc1d1qUb$QcTze)DdA}Wn+K} zY}Wx`3eH+VctH*aRgYJQLDy*Op;3|(qyiT5O06o~7G%Ccz(^NaG-Pl`jz0!{nmZ#K z5_2ZXA#8)yyA_Af8zvQIf-uS+FntKZ=~4uYeDT^@oYpH3a9l5cRUSg22vY?3=->=; zocu8K9nP;Z-qYIVda|7yBA*GVTMH?uz_T7=a|+8pgCkh;MQbn00Mu2s?Jt2 zWkIlAGW#b90>g@^kdwr|*PB41p_g0bpRC zPXKM%scsw~4#~oQ!oh=FZhx!XtA%%m2XbN9t_>FV*m2NAvhBNJd$GN7;O29iVaLNl z;$wQS)bujNsdfs((1Zn(->2R5bLF@K4?k@s-J{eV6!p#Z6k&apa~(F!XoVp7Me|0h z#6@()nt<>aq2g0UsGzQyYCW)!T!Vh)Qq+*xJ>DX_nV$x2leQ zoxwB;X;7K!VsIYh!Tgi6E&2!rtUAl5dE@lU%d;(TK3P^BRP&!J2dZi#9xaE)xZ3Wv5r7zqsg7~5dV;{$X4gcpd?ffemH1zez56!Q)| zOF6`S@eS@GlVgOhD@+cOVTfQjV$sIpw(bjmTRR|s1)N{iddW@=SIBdJ%?QgMjb}@t z^55~Y>3v!ing`XqN&ioTy6itOJ-WO+*{a%ZWWrOs2{HCNg-w*vi;VTA>UJ}IFwE{) z*q(l**kyP)g)^fJ+_sDWOfry01S*eVDh|WMBaJtlZOp)S%)^KkhH;E|v*UhZ8=v{5 zyDwvZ229%yYQdC)cDe`Hd~@|2gnH~xTsX4T;MfZ+P--8*9{J4`;c4q)jW6go6 z1PHc4e?7(?t3eoDez}uM6cgNCU~P_lKpQO!B*p(>?@PPdN|J3q-+uvWKNl(qFnB~* z*2ONsP{IS4vI{|hjASDl4W+>tEcI`Hdq++skw-@YyWDTxwchP6b9D0LJVZuDM(jA_ z?kwUH^ZLxTPedgqD3?@H0%aQNaFG)FnP$2AiyIV7#nNQ!h>=sEPO+nGe3#Qw0+)Zv zd8yjFY@PaZv+?=w&X8zwJ^$@PgYX*MnEDv%>P58yPh$(nfZA5sQ6Wxi%Sa}rbT=|j zFUy#zJHYeNmuh$B)lPn^tk`@W>*%qd>~`W;JF8n|_^x8)bB9hXj=9+tN#3kK9>ZOT zVf0f^DHu>24gsmk%UF(f;C~SV?Aw0~7BlwT6b*TFfe}vzIEA+Wd;=;`W-3-nAzzOm zChA3nh_skyi8cu@d!J3!|2atY=~UGJ6o$497Su$PR@PLAS+rU*}0?01nbuy#s%tRaNx_ zkGpjEX^)@9jQ7e8Q8Pcwhy1rXPAB!Q=tNPnC+;`~{@tt;#H|~bGve0UejLFWD$b6~ z<_J8Gi$Z2Rt_JzvxCY!nea&&YWE5`zW>Q%SFXB`4!F2!%4NLvijxP*MY!>I`Zt&5^ z^KP-){V3=cJ*q-qLD#Arf}DSP;~vd1l-a^NWvV2?_jgf7JSSsAxhgJa^EkfBmWe56 z)8b6#WhE*s$p6SQ@eXkH>Twx6!WDaCpt8R@$`59ZFi~9DnejisG4>r5SUdQ^9?J7B zf`0QL!REuT_&BvUAp6?X@9?V^CXhGaN zOWWLOIrJT-Hs*|ysCsJ{PpUw6)T*Mq({lFI53yr{UrEA-yXDmMxbd_Fl26$alF7O4SM(6Y^4`OUIBiZjR*z<~)-g?A)MoKyiy14Ethtf4UB4Km&i!z#^t}>Fs|8&Y|rrdjn*@>Z=AL zDkbBE)KASBaH==%kdtPk29?!|YPWvEJK^_rK;i^Zxak!1L*NU^JJ7G0Dj$_pAV_J> zkx*lQ?jVAM<<-0^eYq>8&4V>gLy7(F<+qyCiZ{|-(WHVWkEIJdh6RGgbroRA?8Jors4n zNR`luGC?XD^`^cx%)n6{&q|0-p-Bv3gpiFL2xb~x`MJ44tRMOLZX-{P@xq@Qa4FF# zWEL|@RJ3Ee+-lglgUPrL$kg%w1^7?>jGz27%>k+;LvMdl{Q|)vWH#n*whqRa9KF&_ z&zobQpSt8Og5UG-_QWNy4r-xQE_WCSy>gP7dk>{CP^5*csGM^!YIeKUW1hKs;+d8o z{dB~KQYJ$%DeIZpXuGpwj!0%Q2DpR`XyY8_BzcCj|G^4XLxn(*P@#NVuziaa<$S2+hM1 
z1qhlSWv5y(s>=veInX%7%Rn>HgHi)+#m!5hXRS7S#U)(Dm`T9>DyR@X#(F?D#>U2wAo=vCR1TmMp7qtynY~kimS~=T#q%pCk_F^p7%^`a z0mzr%SOQT5=DMr;mo!-dAOSj;RapWq5!4`4@Gh^x!%5H85M5>dt%jGBSpq`=W|!Dm z0zM4h_`sVmr^r*r-OuTlB3c4De{pV6r`d--yp$uyg7reAtZ}uDZB^S z7R0oDNac$vsHbJb_!UK|o;~>`cMhmVNDj=vm~FP0S>N3~-g@!JT6ycY!ZXY<9mYDe>6S8V~V{ln}I?rGOx`n&UmP82-9sxG^TaLtF<@JfvfB< zVBiNCk@t7Ep0%4D$Z6>d%3A{N0Nwy+5*&763vT9gKI1rg@oP@(K=Ro(z;gHiKoDiI zne7$RKqKyQrk?fHehF&`3dy(xVXlS#$cYpaQJn@MzZq%e-TJwAO9e}5b$e| z7TmI2)XW77n2YiT!!-1|k89f@Cv(1A^=%r=ngA)oe#4!!M~sJ?UW>=whO%@5h=Y>- z*E`9}yqFBVs7Nb_=4lwrB5H~sXA)v@T)c1CYMiGn$j)uxe0_y^Y|0uH| znPyR{zdUVF0wnn~e~yZE{K<|W<(P3J6`Ym1PG)=9@k&AE+3A9h@4veX{}C>bLv|R( zwH@4&gsG6tKTQUNGLW-a1)DK3GlYOnktFZX!7aqbzVE;OC~~0w8C>6*fpG$n?D5=zu`g+I^6n(e$BqtEb+C>b(GY znSNyg)v0QY8vJ|v64X+p10@yd2Vy*KN*^8ZetksXN z4pWTnrD&KXe_y{$giN|6Vqb2uJkVdN0NZsR0RK;OZG91?37Y-p_ks@~f^^ghn%{tr z`I{tlr#hR~DHe|6Xwla7ypc%k+b5W8avR};a$tdTCJ^`@WHrJ!B7WE^!#-LQYA=;* zjJ(lSxU>r-T%GoC7*|7Tzxpu^fr6nePUU4c#75;cf8uca#UL{hM$JL%aqa|=Px6?4 zO<_+=MmTfAdO86rXL-Nf*7;($j`;TpjQjgb7@5eLwrskDyc9%x332Sl z$BjtYf7&xvvh$~2xM40rw@zOT`uWkYGFAv!-Hl7o1h6*)sDQp1=us~KT4sAxU`yPb zTK=YSpA;QfH9gceVs*0LVA+J3guBHKdf!ZC@>rPdUALqp@RdPL*Fn`je1DvQr}NCo zXgGB608;KI`!9UAx3 zxaFwEDc%O6?^JyO4k9$v1iIgYl^OW<=R0$tRIZx7;`E6(k_dt$k|5V=Nhh!PD3&PA z-P)NGn-sgfn=Z0L;>&=<2nv%7%NF*)Mmlxt5Cdz*ZY4UAqZEby4GV&5rYR*39+}ZJ zfA{%1xgv00KqW6uK9-mY(+8wdr|`Z7W9E2RlDQbDDx)?FXcupX<$^?L%iqqoyW0&Q z%`yHUjB5sWU6W8`O-?zP5kbpHPi0;?Bxk#IU;FCi-qxF~Kh|-~+1Prq_2y5BIg;tk zy+Nh(_mTDDGu*ULhN6}Nr1@bfI;wDtf6$q7+eMsApmqj2rA*UesxkRef4;$3`>)73Pf3wiB zhp6Ia9*N~{IaH4!P(YPpN>$`t=#_Xy4Wl^QHYj6`Txah=6~~_;$sc&-fQg;WBsc7E z6?I~#n~eO>gBkf+PGk_q4--Y*589ECD>Nwg0B;OZJRHN(`Ieu$0QYZ;NX%Kwrs)Oa ztWv^-Ut|s}S1L4~a7tG#^sDVse_(Cu>C#iWc6&_V25n*{wzl`JkqoPrb8 zMR(Getk_1E(o84JNGT|-M_ZIuMf-T5Y2#PMQ!mJW++to0M?$#g0Oooft=6WOVKc3AwWgcfA{^!%PiogRSI#M zOKF2?`IP2kXJVo-3aQn#N{VlWcKU!;X9)I1DYb3K#8Kt6g9@5cTY>Ua;8+h7(5&z{ z2Se>)gX6M(OCR7CDJImo0v|Lks9rh6_M$XDta980#R4 zE~|L7(7d0QzoV~F37~(zgz>fId|6=?`24tDXAGg_^5Hg%^WS0X z8HNm=XRDIlf50V1w}%z?9;*ns@J!q27qC|0v&%=C%bv&ue+*#iII+Tq6I|h{s|=@( zJ1fd-UhU5yIbU~!=4UVs`tma$Ll;HoXAhJp<)ohNr5k7r)H~yXpmOrU%)aqLCZ~1V z7M(A4x?r>RnTK>3sxjMb!a^%>|C^oPtvtpZ-ko^H9^C`xw6>HRzgim(e~T}R? z4?b-`n%?hse|m+*#gCYWgZJF!7rOxW>wf;!!w+1W%=3k=!WMye_gWdkP?-l`Pf4Nay{A*51Dh(UmQ~Wg^I)`7Iolf@FVk7@-18^yH zOa@$rUjLQdX2kj()oFhLM#A9Z$IE#FB+fTlzEbmMe_2A=*`*$`Gs=~A+pl5D=;o}L?lGv7cs zf;FOBf5Kqg4KUb8bzG1x&xWxWvwnOecA*J!)7obQM@D-B6!ofYb8`*H7Xu*$(=HVs zVl+VY;388O?L%2Ow-LFcv?Z%|pN+{m%EJUn2F}39BzJF##A6jssjWYYZ4tY4zYFjV zNdcI~d3gEopG`>!dE-OI^C)8+zAp=a{CbG>e;EA0_?QaQSv2AFS{0Nb*L1mPpw6QQ z_ppe#u44rwXnc`~i?nY23=7E){BZcO49NmR)pU?hV*Z%WT}koM7+{uY)SLQ+oX}o* zRM;y6HV?Wlz!cyrgk{D#tTt(R`6>P~`q~v#b%n5~h^G`Wsf9%Ty z+2_st`VLPO@$6XN$?s!k#DRO-N^{@q6MZV`50(9+{OjWS4xX!Xuf0zF>u}SADw#nR z@PKN4^eVlO81_9ODFV5EO*r5*B z3>M5(_WZD<51bGP;9m$Jk@Nvj5cI+j zIVBE<)P)fhaq~WcE71LHE#mHE=MPxeQ_ohB=ZfI@@E1w3?pne z9Z1F#ej_-j4VN)!0w#Y-m2wX3jPeUo3FGa-!QfpLl~XHM4+z>At4uNN9xb1NW8f5J zk&@qbiyh-5U|_GuO*icNwr>}AO@8_P;y3wGZ~@{POWyFL8~M}E1u8x@fu9X*GvxlM zc*%>xT@Xk$`9DS5^rzydph25L!at2{XJ+$Ah^Al+LMyg&OND>AP0W4JBv8)@R>QV& zcF}Hvj$t#5C0(C@V&~BgK4lA^e14zEXl&KX|%etz;wG{Hol2MrYVQ2@ik^}={@>Wz6>mx=0Bc)FQ-&|*qhS3(GrCqi& z{QS&YB7Fu#X+qXX>!-C4N(+e3M9xmP>D+r2xkGz&SXOzV_%8n+O=^`E^3u2pxa}psv zBiD#p2zS2eGcq>eCO_)V%V3CqdJJ^S6&jsR{xsCXCC8+=Y3jA~)B?_;Q$$)2bRc*7 z*{CnbPlfSmJQBk*EShbEfsh44PAYLp1NR(!dL)TFp56+XzZ079w;BkfiL$*V--I=( zR2HqouyD9|xH!DHxR@&y4;KLQUBO(sILhJ2Uo(gO!|q}0=+`U+nEC#H_EEl~s843D z^tkvJn4}*5b-ukC$phH1lEvXg4w?KnesgB#=1|b#f_=&6vY;Iw2V{GJ{moq z@WmQ}#~wlHHpGG+*brlO`Fw!sHXObp?kSfZ;U0Mug$>{6%gREhUjJNtzpZpqw!=

!iYX{l_U$eg&Ff!V?vJ%j1#7y^zdWe}M5yh}lzyZcxc?lcd+-F9b6JAkVGYq8KQ7Ks8*rZ1Zf(_g zIHH6f=4gX1($WX2<#%zbU|3xMIYA)U#+NGog&grALg;!QRxDPhLhe!<@lWai4aI+{ zd%!IG|4{37%fSZ_qrju{jr(SXWdjiEX$?Y69dR#vPv>DaoK_##N)Xi2yjtvb>fH_8 zn4rT{oq#Juk*xH7=+=(y)^Ls$Kc?p}qNY(rW(NPmm4!oszTA#asSpq20#Z7nYz}jA z;4TFAnu0>QMvmw;{SCU8HQctrd0&6$jK+k&+r8_Z+PhvTA`W8*y3s%z*W2ef?xHX{ zD9rdnFHi5E>y%OebhNPJcb_V@;Ox8+{zsg{s^E-K7`ni z<;N;OL0bQT=HNr6Y`me^M*M$PKW{X-s))(~&3qeu7x-U$z!pR7;%S~q+)y7Q59tX>8#DimDrg&q$hPL8WeC$HK}UTa+-Yyj|XrzOH`-NWznaz zuoA9Y7;1TE9r@iz3Uc_l=sa5IM)7j;-k1nsCDz~@VPsxIyvo9p0@yS?xEh{a789# z@L&_>rN^5Jy~iQO%;bL%uc|($YExQw1NaVg;Dw=;@T1GO?JxMZ8bVLOkILip%K z=bI)7sHz$ZHlouO_#TXdzsOWdhoV@S^tu)K*NRP^3m$!*Ov!_X2JH`%Xk|s9mmUI9 zMj)6#1@KS89#MbhVW%Me6_f{zaw-RDx5MBDnR`?kX9@FE*~Ol42*B10GX|)Kz+O&g z#UNq~tk~9eQ0Is@#H|EF2LBH^$BPLg7K}UX9%%FVAXmw#gl9)6q{0-pA7WO8PP~-{ zr`AUuH?|YwYXM`Tm_cKsc@LT?w=@y9hBBD#6Y4m{$xDCG54E-NeiBr+4P5>h7ER~w z!hnY$G8Ez4P(RF7IF5pl(VX3|s(d_GgT)GjuTDtSS};6J*7mop+{xh~poGKKOG0`R zF72JS_Hf}&LsbjjB-g*+eW%|Wy!ThV*C3M7AjL#(P35L;e7JIZ3xXcuysX zv4#{@j~cx?a}wnb%L-$fO<4@}iZ5e+P>iRK-u%P~edu^L0%6DL|xv88$ z>twTA{Zd~$J*6^EA@c?te_a|Rqk)d_cl(||vEIDK#v2qMaLXUln_hk|kdy=Z!?Pzg zo@>X>0X6XCcJK6z5N0*uA^|&f7X)e$m%PDlLAf%+n{a3#v=Gpsva~rL#;Xid0GF_g z2)BQ}RYxaqT$^*L?P=O1&wsP0ho(C{%-~K`uy1h4JHh#mK$L-m@;HrV8XP1uAJ2gw z5ZnQz)?)_|#_Az^YD%@3+gP;uAofRZCK&D^bReqgQ`F zagon>C7)bV$tU3S2;Z4jIHfo@E(vxroH8P~;%XTy)pUi8h7}RYJdvPV-+EY}x^9Bm zn^wNWAFDxQYDdl#7!=Cf2+j9VQe*GG9c}!VGE=shL zezgnfIDH{~I0(FV=Jvn75>3$Kr7V9vNO%d#WmYu;AKcPuy}4HL>^&!UgUEeF|L&EAXxZpxMe zuY5YMn#u%<{VY(boT$qI^Y7^VUCwv%^dN`Q4tx=ps*k)Qb~$eM>3JDru`YY}9(n;T z!Fayj0#XtzzNE7{3w9)3r!{|F0DKp047Ut*kUv2Xs!Z`0!6Do3&>d%@4=~9)O(NGBBF>FZBeChDj#`{P)qC}qNqAgX!ARN}OJ8n&_j`vc^&EAyRl&eOOxO{+68fNHp8wK== z;z23Y{s6UyxLbds_0HPe{^@{@f@3D55aNrvLM@E+Mm*BTcBHq}NFRF>6(b(hBB2Jw zK^SKz;Ec3BAuaps`#UX$U^W5atk>a6DDPGEz)$5`LRHXI~n9e zZ!M}Z{&g}u2?0zBuYOoIEwk~^XZM)Y)%!m5GSwb0Qn`OU2>=A3#4%be0?!+7D8#1+ zANG`-in&hYWE90dHby#!6jkJvz^YPPk@y8$0uEUmYB2Se?!s-?{5<{2`Y{9lDfPDj zHdE}&9t^pG#x<11m>R|twGaKB5cLYz2ae6>9_E?4+%*RxOGggkJuN;G~=F)%a7_u=;0-3GvViHuHCou`CgVpN* zJwU?0O_+qql{HL4df}c)xJgMVOhR0$>65@Dgff#%f~%Wn5|ZUlVG<^`6gi>%D1R~u zh9M57i^(ITT@-ZuPR-O&|H7=ub+sbjm=~7o92lN2{uoDuzB+12bu?WEUC?tdR)^+R)l`9reMOI*v^`n zvt5}L8%P^$3XyNDN7lkB*BaDD7HfR(U`Q$SO%O+a+brq6zFxybfFmq75fL$m1sIuT{?Q1wsxO?_pRfh-c`lc$PMqmmy99%$JFu40Het zK3C;`zcBzQM4Y6uJ%^Tn^!soqKd_MCKbFQL!J$3qTzH{H7At0S;O#Rgz%vi&n~8Cj zeVi>~;ZKZr_Xc_08_!F)dgx?Hs2g)#yu?5OL%{o!PnGv)IVn*lr1X|`AY1A4(8~`R0$OF6(JG#Jd z`?J~jN2YRZA2Al13Yg&AN z?s-iiu7WO=u{_3#4Wjce`?G+H5O6=PZBVrW-jL{q^!hu1ToPp{sQdxCd>r`!hrb+< zfekTpU>NWnC~|Qb20)(hr|Em%8Kl;}2T0Q@_9WUQ@|Oq_s^+!;5UJ;L)x~?+U++!j zHoaFRij1H@&3A6lLp27%pf&WWFC&!Tte zblK>J`Rbwj4hjec3eu!%=MJ;-@h=?k0ALWGK=PS?X6@^C z?<+lUI6y%OlYp-T#qEjd8@1!wVuoi7V&(B0oKv6=zrfr>!=sLw=qjU{1)}H9`Rg^P z2iZZM*;4yJ$L7{@>e|Wqkr60TGvneF7+d^SNzh&~lj*o=tC-%zckQ?LBX6 z=SVo34H;!%eNvJHnKMuN&g)?70`9QAMtPyUofq0$_^k}kJSd8M+|CCLw!-dq-cjPh z^A*^Sb!)=AF}l4aE_~X~6Na?#dL=J3xbW2e2zJJ62=UaoaJ&L|6`-=*GS3jLe}RA= zfc&<9<;e}R@Om3Ke0Jk0Jl)O{60mR#6DgTGh4YI%$#0>*4*+|`To%svm9x**{+Nk5 zm_)gM8Y|YQ$k>C|D>=|l*O_ASeksB@1&H zQEy{%JbQsgWi*xdFhCUj*Gyp^*2WS15I%ozjI?ne;Dko={wn~K~M_~_2k?ZCy}D-Pzwq|#=z z=V>+ISGB^9M%Ek4?ww{_QH%0#?L# z0nA;Ii=j@NhDrt7u|f;PTc&FPvVS!*f%mGh4ouTr!-y{HmUpmXbyk58x$hQ#H=~w2 z`x9ERIxEN3>>iCw4`2mY2Ie7hcTL(wbM}<={y!|7RBm$lHB-JRVnPi%t_H@mv~__g z;3la3w4Fgyu+6Hq+1C2i-e(@kpPkiDKmS20Cchxo zd&zAu`EkDmk23Ca*q@vGdW#!>s8T%H`GMu?d4aPi`%?OBdeJdL|B=bmz}tJKtrusjifxVU2H&qg1NSRmP!=x4H7#i!V|GDRWuxAb%X$aNf@FCJW}8s zLfx(S1a7w$a9uC_Cz}y|J450o1MvcN^6u|grW=X6C+Pue$Qy;;Bzd%ssu7oA#`M(fKYiNYvjqd=Si#-K-Gb)L{GeM 
zSfyA7cpF4y%o9~9Ptc6KFj9HaAYhbP(KfJO`|Z#57EXDEcl#uNTz#DR#Dhiy=OX{@ z&-hzUe|r;utLkrm#NRG;dFA-qDbmpqS0Mx3JqI&MkbN6{2hqFrFAccZD0DAysP^%l zYyFf&n<`NaiT>!iL^v8X(o(!tDF!SCCw#^7yig=u&RK16M4Uk%?-cJnGF0Q^oySYd zcUQCRcI6{n82!;g&40Xu<%AgUMgORT=Y#tNd0Qb60e-DKggCWbxraN8Kf3bkzW?>W z{aW#14{-@nLALFSBfc)(!F`K*fArA2p4yk^f&wgm-Uq#@e-Ur4CEUD2ljJ-Iej^-F zaAK1_38JH@Vn9rzxkz7+8L3B!YaeakndSh|vxNhipx;Y`*XX_w9<3Dr7&2mtdk^F? z4n$<`az78S<1A|~U+%+XpTmXctA-TWquPEle@LJ;n3c*uirHXVrCGpaE+WVZ2;HDZ zg6%qglnoJjI;n>ye5QX}_MK_BLbXXBdZ%rtog)SrY!wpGM-Ok`<`XjtgWtW;;8%Za z@PRKYU(I-Q_VJT)IQVRvYd|RCUwRjGNhr=gDcx76cUAWZ{o~4*+m$E5^WR z21Akbbu-6F^bn{Ad}7stL9L~ABJ}^FUi^`N4kT_b@$qxk0qZ>GGy%wkPzEc9Zan9tF*x>t*!i_3+a*eAdcdFdii9?DjEdLu8R%t7!=2Szk zSUcmEdJBx?~eleqkVH%iW z;)5Ds_wL`mjo}((O90{rDN4}o0WM8{k`5GAxhRjmOAEMO#RBzSTjTT*)_?G94wEks z$sT$EMC-y{{P0L~X#Mr=Z012{G#g9#Y)r?Iw zTjH$Xz7REJ@^q5cjkR2L=pTMsInF@Iy5UmD1-EU_$P+D=CtAoSdqke-5qaA0>=8cM zBl5KI*%N%S2l!+Us3t*e11?acfJcdNEFVc2Eh~g$EG1)fHJ`rU+NR&y>^C9QZPize zMDt84skqH&1q~a|h25fiJmdm@l8;E$@uC={i@Fb-uVjNw;y4&%Eg30%pQ+>3OW~Wa z%PSiIxNY`vz>x{3LEr88qe|(Dep>)MvVFr}>>GJvu=(U5$`gGhPYf=f7*y%J2=r8C zJB#*42&VemoRk1^3qF-<`=N<^*A=^GizBBgPn7Dah4w|_e9<>G;(P~xb_antcn>*z z^ylZ5m;ZAh0nw=l70QADL{y1s{D7@C5h{MuC^?&LEgZuIMwL(;#y@s&!HQ1+^mV(E z)kS|>J;&*LtJTLGk?L3(NCFq|2~@IXDHA!bxF85{P{-+c8XYcsY~9kw$PdvmGEAc@ zbyBOfPcet!5$oFuK)=m@0WcNGSRDlS_Hq9w?)61Q84^MFW!K$(eTMkW+_t6XOQ>U& z<42@hxvS#6LL8J(J&zMWKny>Nl>Vv27yEgdt(Y zoXvL^RE1#&&4b5yIOM!m>&SW~QnQSH>rD}WYO<59mO85-8monWSrpXM zrdDVHw(G*EM>s;707(NsvVu)zR9fuL#JtQ4ljHBTEi!*m_wd9JL}vq9^#D}kgD zqfQ$Z_~)EntcdF;4g+tr+3x^$H4{dBj5;BgUhyQD@;SwFC2T_0wZvk7G1i0!i-MDN*T({)KQw0 zPWAslwP%z4EfVu-RrrC@r!Lo24n|6=5?k4l6bwVpS*p3T9l8m(uPD_yi) zIw&ao!-M2}LjRh0ZWQ!-MkdS^!^Z*Bx*u^axG)Q|GALqB?~assij4A>qG7O>Ays&f z!#&hd();D@84jX7s*lnhI-Y2ks-}H@QLu@BN%4l0jJ;h(a1ax3K~s{Z^$Oa~DC0ue6c=)8 zsd`OWoYfQ}JgHTs79XbAT?kU08?|Y#{`WwP$T5%F-D>wzB{&wCCy&x$N(AhNEok5i zB3H{~2>HHpOoC&$w1yUnD>fWn=3P*Ktd0V}dy_DVX@Yc2$O&}Q(<=lfUe|ju++8)l9;QILst zfH%5xXgdx_dWU+Zlt{QqzQB3BdI>wp>RgAIX-33&C;$8{l<(jGL5@8GoOMut@ir~8 z67g6|sRPT?m6Ybhl^hFV!vJ26Jga%2#^h`BgWO$uTBsEVIgj(U?4*mF+p?_U74_X~ z$1K2DRFMY3jhl%)a8VqU92m_scXJE-7o+_o^wQ#=_^TbbqE6f8c+mu+f&SLMpf!ypKUwnSs4eyY zoA#wS{PX0KEMOGF70ei}u;eed8g2)VzvFnQa5v zp_Zae16#CiKiC!ER1j1&Kt4gY;HT0W1Tgv-%)WDJ7)S8?lymV>e{yFvTb~CQJdP#g zNT9GT>&qoHLN?Yb4`Y(8u#|vYDCMm-V1cdrNd`kg zC{&jHin2-xR+MgZ;=etyD|;K(;xRI z1hqw6stDACRFdASl?;?Vtd$}yb&<`n1w_UU2GVP}sEg;d#4XQ%T7pxb4}_c-D;DFt zXGRK~RD$WWN?(zWiMhMx@FKtwoz)f;TZ{6C!d{kM$`JP+@e7%5Y1^@xHE;WW$qN?8;=QHOx1Ti!AztOJ+{qb@WztbGd!c?i+`aI?2=RvxdIbob z22e-e@?KB+(sZlI^{ii%Vqn@fp0{yj`=(z+EZ5Dn!b1Kkg#BImRH(T|B7U(5C;47V za$dc<#1TO5HlQNgsvkj|cbqM}>Eo}K`k_aX(veW6I^za^BLN`j>n!-@vN)E_2$v-y zQKA<;XiGPv=zP=Bt-9?eOY`HWPMqq^*ILhaK z$zQcoC8uI#G?=-F#!TSxs*Y_cQ>r^2w`}_1-hEp@S>{=omS!1mvx;%k7%}eMh{+u`#_R9BadhV<^8+KGn+nvJXiaL1|$bTfh$?SWD1RRNXtG}gB7vMIGj^C z5;ra6lPY&`^2h(hO{f12@iC6@h%ms$c;K&pz6dt(Y2S&$>f=oIC7A3wtQ3ovR42HM z*vHal%sR^KBAPaf!t^BO)3Y)}5`AchqFka<7+TGm5N{?nB%dIk*_*J#E)6{{szw zYMKUNq^H(?xzH@#1-wxK77TMCrcBex4Yw3;as)^3q<63l;k0Lwy?U4&cEEWahkOyT zq=Vwdr3^mq0Q0sZd3hN65i_@^2W*u zQb`RQ{-=gaIt)flbP#i##R*_E zCxLMW4^U?miC!Ik31j88Kp)%}kck9+xP6;MBd!@oQ$h`@L=gvnmx0{BxP3;QF)y7A zgF5Q!R>U}?VTa(SAdC$MYkGn_ne8XfnZ6TZi6cJ!i zXq$GIF(ZjO+vb7M(L_UeZZ?Y=ehM|l=MM;%vu{v7=do7(G6*6gLHUga6mtx95CyaG zI?!Uo%HWiy7Aga=#m(YV^~z6w=@Q%y1`bwCc7wGz1$6YM8M6%#Mq`$LVOgR~hyF3c zInC=llXM-5#ZHLS`%HUaF??6Tq~A4)&X=SyDAj=1Bb)r+Z*FUcO9g)bb{7g5V8a7a z9UHf+Z)~+T8eP=QOt_aHXsb`d<)j8_WXB($Pt=F$I;H7Rb?^67&B3A8v^vrhpoAM0 zIFxa-zmTmMIepZjBex>zmvKpee zY^#Cu2)dh^y*zs0r_(bC%IdG6bk6z5IFgY9(oJe9$ibx*A6&Fu1}K1=xIx|qqZPVK 
zQu#ht2I(`KD{~bZP3NhPpOIzVYf_ z(HHPIP1W`0CdJHg7ys{l)pN=hy8C|2Nt=k$maWZ4Sxbq3;tKVV+BEP1HrJ$%a0&9n zo+R)U`=i?JRxjDvC-fcxTuHT>q*T9}aJA*m?`a3{J{aoaIW|;0{k1HrgnKnb32@N{ zV&LFX0$57qN4$@Vq=jmybAD+qT)}>fdlC<1L;BB~m))ml%K(9>k==mu_3;IEX*yI< z>80PiLROT2)^IQ070fLzIdS4b(`!Rs@(xTTEQwOf;&JC;%xQTeP3Qhr6-KU59~vL| z+qW|eq!1m9LXdVoV+&!4XI6meln0vky5 zSwx`a*QulefR_N<1}sOuh(II*tqEh@m-uIyf9~?nJ^s1RKM!nINc6Lh(1Kut&*TjVk%Zi> zezgtX*)5>^frYO6>-W^tIWEV0X_1 zMpnTo35p&ZM>z^8GeTj}14IzpYm>%|*fKkRO#Ur!1pp)pF90iTp(Ego2Ko>Xd>pOP z6N`pgBFwZ4fX6AB_gH>*{{tyGko@Ih^Qi$|gIq*j*8~F#c6~VB8pnTu+O!`BQXleq ze=*mpK%XeCaL;6Cb8ox58(`dqQbK;XP;I@+GNwP8hq;*p1c z_^+Z8t9C&)6-}X0x;wKqpA5vEBb8wJLwy?HRu||Hrda^Kb1ddXfy4&ev@mI_%Md2Ry3Kj|Njy2D>DKhWIU{%o21mAi27{ zJnteC2p}CA@@Y7`M6bk@X+=|}=cY_sPnnRR znKH;mrc6y40j@{&T`z~3pZPEH_k0ByoIriSatMg>_A`zQO7Tp(TPx68o;YOxmtMrGsM4cS0n6#InXu0RuNCF_q3OWx*Ul~@|< z^t&WVMLh^=0uELMeykoWJv$sM-?_6ce>eDdd2L=E67=AKygXPrA|Ri@r&EgAUr5jZ zs>Zp3-B(8vMAQl`L_uGwat=X%Kf_Rls-g_7N)@6R1w_qvj~~sWh#VGLA=E$sS0pNwVBC>Q znU8a-JA|;rL{RGxLzGyDc6P{(jn((kcQ5;TdNl0xs1^|{(*s;7gxe*5qQ!DXc;oX~ zh<;`PTm%h=S)lyc3BqBO+rpstWakUzsiXzfQZGljwao`8n@X;QWHb|3nYAow1NBoX z2XmL)<|Akp{@xFBy~pEd8?Rtwez6!+a# z32n8(Cb?iJCjgeteqkJc`vF`lKa?0YFHwkF=CbJ5YzaW)4+&cBx01x>39RElo@_eQ z4M)4Gu9eB%+uiF#N@k;G5Lwqm_hc8&8Z{_7DB*`npqW$b5>zOPL(UXVRFLQ%OrQ!Y%yI|%Q5thTvW1hD!AAyH}tick7 z4mn?Vy7uh%z4F@L^W*isXKSz4aqhJ_WkOEBpt(qj9{O6jGrzJnzfqn4aP)0?G#u#n zd-+GBAp#yAeY-mv9v<$&`)OvlfX1>OnP1Sat{w|$(fPuE`itL=_ttjSkJt8&pRR9i zy_j<&1n5Lx`Y!#a&Seh|XO6zz$uB(^We?{TCCkM_fGWnHGK(WnJl=w;-m-*uytgtH zz%!Om7@-=`k|R8SDao(wuZM>VX~EeNnzk?=%(&~&v1N{FiN?Yv#~$9Py!*-!)w}+? zeb4@ikWj^ciLtV!P~pjuDz7?|YR&>SB+=4?;oarogL}iJ3itwF1bv>4cYchtU?+Na( z446qAIz>a}yv4&g8--u)5AKOy)`k5;8*z7*%I{j27jX6i`yLA|&$W*;k!9hi4V$ zR7r!EItYtLSnVbj2`qC(ONZt6VDa9ghfBaOK{ik8Pc~jY{jqLqXnphLGcE|dbg5Df zTwhO@zq7^lXMAsGEiE9=){39wvNFI-gR9hk=PO|2c^2CcHlCsGF+;))KI9Lyw7hg=-uh7U(AktJ|#C;#6bYK91O$#y%xIZcyJQ1`_Rl=9;))_OR%VZXTyn}v9wk4EpFdX0i+Qrct_7uYKmE*#cjcS<4GAx3B z+mIx9+!&pS{K2{XX|fj0OB_l`{oOY>{CL4q$IAyo5lF@e!f-(Xbm>efMMIy(_>>}n z?Bj30t#XGGV5ia@5Zk^@y9~apaNCqWber&>!NAD0Wgv@y7)Qickj(-0Y9I0l5wsE!T?!4cx&ZQb-45F(L*OB_ML zy#}&inPLGIgokpW7p*!(PtwLByydmu0L-`0Fm=Lx7bMlKbLEx{h#?GP@QWdTf$?CW zwtN-=wmzji9h3G5ntgfheY-y#$*13UQ8M`SMi7LbTt7&QxYiuzK2wPl53O3Np8B-Z9*QSE$ev31@*hfBMd3$VN1#4hyLQwp2 z?Zt1a!D;$dM+sZbmK};J>n9^Pk&!8OHbjp6m<07>n0Qh}x1chZ%NLV>@f{#}nsuHX zN?fFwm__HvL}ykP`^9A)Rg_A)D_nGq`U4!=)ZJ%73d({kUHae%Ggt;Vzo11MpreuI zo5SMn8IAc2G@8p*6SdO?(_N<@JLi2{7MEOI)AxOYa_YEF8;&q4bdI2&t~eIcPmLhN z68C-9xlufHLM~D391=BuZZsxCSg~ZHU0{c??_9Xu*M+oJ2QYiXs6W(s(>RnJ@Te;n z9+$|zM+6BOM)XV}dmLcU^+HiIk1o1gHR!HSE`6EG@+^$y_)*Ogzp}-_9hGf2x;J6{ zm&*rC*p7INQ(X)0AuE(gfOZ)LoQIqJvh35l0Bs*lcsMOz*w;( z_x*h7yvMP$%#6mFZe0w)osxu(I>SCo)E+e3CsLC3z(C#+f}M}72yJ*Q)b$4;GKdm# zJ5qZv>S3#q&G4$m}JM+?-sYh7SQ z$iAc+xg_)t-S7MxVBO_chYkXo#SrVguF-waM;4IcGa$uhK#I@6pl411 zA;-}G#i}mkp6~bD^pO1JbqaxZ1d-T^gu6jyo**wxz8?jdI2@i zhQ|Ez07mkR8n*m1K#}>d~wUE*>77@w-;fR9A$!)j$uEEo)(o)GuFUIqv*Go${ zU?{&`sqN9Edn)fh%=Clu-&d|D81JPO)~0Oz}EZJA6=D6T4s6%?&M+80WOz= z&^j*y+w|5oOyHlQk9^`4>}qEi4{($aF*CU+)Zz<&d(aDWAUbJ4F0{XY?<=r62Mdt? 
zEL}0~s=eO7*dHLn&Qu4$^x-XeA3%_J(;WEqz@dtmCgIbC1#wmfhLhho zl0BPgbEo>-lQUtiaHk)Hn8j~0yuBFh0c4!M1>xaLvb-)>MwaK?Vw4CKL`Ril{qGpZ z<-PLTa_v_y#G(8~h~P2F*Rha@&owBHq7 z{qo!<^i4%S-KSK$$3swi2G@gxHMo@QcUnUvfC7pVo}NLRz+G_zq))r~w10_F!Z&uh zhh0%rMW%>8uWtU-#Tk&ocly`&o^3OK21G^RLhOFOiwKeMeXf$rCL|U7=Fjc#+x`wR ztb&{Y&Ke>xJ7`t$W%!js;h_mh;RSE-DOcoQMjih)!ajX{d(~@|;Q9Cqn>Yhxmaf^N zk|RiGeQ0`p^ofd_tSZv{Eu%-a`(1b;W%jIrRvB;9Q8u^^BSK_72O&^2$Z1cDo zp*hTi7tcR~m;0BpN6`$#KK=yj@xa%COIcEJJRyAyNtDQ3{S)>T2zHZ}!o=<8GjN0P zmt6dnaFlps2OS4;f%)-hbqnHQo9g5Sw^~E&7VlI$&(}wWp!TJI`<9HpDm0M<3R$y; zB|Su`%(*>Sl^yDtRz&VT11AL8lw_sjy?Y{JiB6Yb?>ZIOmdz&UNmR?pbHwO$T%;uh8{JnO8jdq&<8|y2mwf^kh8T(I(yg zz;#Bm-@ueQ@9EKn^^_Yl=l7`NQ& zML9auiND8`*H({)5v8yAvYQHF6<7Ii7((1V{BDIofR(I&L)}!-W4zpKovDtdv=ChQ zi{yZ^U_Ug_`Pkw1^s^{#64`}7(8Lv()~iwsw|Jb{3J~@N*S@Zx>KX8b!9j2nrcT&#gDG^fbHY)s|$fYlv;M zVac=l_=wJb2QH}+kzN3tjhkhgkQzzlFm9-3sz*Dm;VV>4WqXMH;v-o5#P^bGUTm3q z?gSaWC(iJBKMWNdLv>QQ+hBo}bg9>i>PyleNcYXsh_(dlZz&%Rp_*Yex24e}k79pg z>8`0H|0kL*=35&|RK&zudk{KG+U#;5!r@*+8b+pn-wAs8qH(!Snu|LGWZbe7pv*5@ z=b-CEZrt%}YNu~6is(+osrn4FuywZ6eIiKiZm46K zwev4R7|DCE+|k$GL8mAlu1a!&$9l?X%@wZ)W$V`Zj;6X&J7=7(;dY)sAy~ixH~Psr=rL@(HLyzLY859$-+&sgfkF^Znikf*=V+3km1k8PnthP72*? zcUH_|aE-!AEEwzcm7D_^HedueNO8A3{wsX$bFL&*`}sPfjXZnKy>16gR*Q-G+P z+xm^NRCWHg`>G|6HZu~;b%L%&oKrh?n8KZF%lc9~6)mH4M9ixAt#g!EPCli9To<^1 zA!Zfj8^>qR-(z~aDRe1js|<13O5H}KjU}CM{!VrA`{#NU+&|ZZxVm$A_4fg0r&?WS z_Z;DrSl3|fV0SR2TJlb+-fHog!&XZ;6CFHu)J?KTDld;5Yme~ISe6DoLe{O}6a=uX zWs)eFVcnAvp)&Rzl!}azmQ7FE(HUNUq^&12DzaBmzn(eiLB)B4>lcLG+VnagRrU`? zE6Apwtv_GiUEQc}u2rk6&)3Mlt?z8E?ylCWyF2S!&&l6yt!})09zC(Mw*7Kvw{EY7 z1bvXhU~7ACx4sRr_Z|m(bq$gAEfH?X-QBevJo?Q~-_p3D`b%|pZ4+`5JVx$+f9+ZQ z$=>?Lu5__`_Wn;akiesBuTZen9X{#qkvVYD26Kz0F|0%_#C=!7#)QPw;f)d&WtKCTf`*VC{H25^)#pWU~ZK4K9*<%A{nn2YLj zF2)Cy?S;mR2H-TAV!xHS)qZn-)b3!)x9A6-k~$puF#)I$i#VhL5sAI$7#0zf|3#{q z%_+K$VzA)_n|JoFooMr8ED|Nc563T|-^9dP#bm(xW28nd@ z%f|GgPjKv~|N9JNX@Iq^S=L7$c0s*<$sFwI{1v&(R&C zi?RHsbmzKfy5eJu!TC3ly#GE-iJ*IS(-{s#vu<3=;TM187z{IiZt6KoJRo=1cQ@9w zhQ4WhmQ+|xPgbapBkETWDRPx}x?NB%lwHQO9=z(7Q|qA;iyW8pCDjwtJ;-$1T2M2R zt%eki9K*YMa*^QPr`k8BcBChqR84TBptQe4b;!kfh38lp5LIdpIR!e?=5buMDZJ58 zAkidHjs?3IMEBf(+fKFrdKZrJ>~=&B74)=7>B-J5lr^g^8dlV5N2Sk|)q-vjHpYYU zj;axg5+V9u_F1mZss;h-R0tx@dZp8NP^vUp>*$ni28}9ZQy_d2sio|A(98s|B-BDkpM}v`ch|@^H#neXx97f82uchYkGipG$cVIJEdZ9W| z6KrcD%hw)d(T+|%Nu>Qu4!nONp6yi84llJqN1bRQjXIXo2pW^FXMHBvLnV9DD80b^meGNL3_;9f zA?2^wwmQ15XjCenzB-00U|cjA+95VTu7rEE*>r_}xy`Y7)f=*%u7p^S2V^02Z2FZm zt-Tr^!7>zT2sns5#q5<)bs}cC=q~BNqf^!N!Gq39mR~RnS@01UsnYq%Z^bnG`Mcxx zF~|Km7+7KZ-cfSG0PR<)vx4(UkxrB0a3uZ!T3pf#5KXHFhqMaTCDI!#_IrKs@Ec%h z=tj|h`ZJZF82BsAZB^go=$+o@beG)&YO#_oQfR7dx99OO zJo%N?T{TSXIY=m6C8%U1+LmGjEe*!jF#S+}iugqpmtq3#H@nlMJ+WW=*e#}1dkm#i zm5I0|fu{k&e9MpVYnW=5QwO3VnRTaf+yjjx*=E?oN2kJB;|-DQaUSZofnmokbyoFO|xi=&{bM*Vr?}gS<4%ONZmjwC>C*?R3E?4yd_V6 zi8amnG_h0rCo-+WvAw5EqLcaSJE$wQtg*j~3C{t4Kmf^(N!1*uQ1r?(ZCBATEO?13 z8H$~tP$}yMl3ws<59CpxFfKtnIZ_8C3^Z{7N?!&Bz5=O{n2Mm3|KAu){Y%WGCe3&X zEv7u}dT7=F(j(V@F8}o-x`J98Gn=Y^Kh&nmw|D95Lm2wp1(8S`GBd=|pk!WOm35S% z=PfAMXmAVW5>}o%6&hkon)+^kY6eqe-v#|HU!UCTIU|~&L~9yOj%o1bDBh|hvNTbW z-`S*?iHhk?n}tgKiCVwZaq6n4Fm)cIswqjGM?KD{qyDUQ1VQte5SuO{1gDGq8>OHw z0z=8ix%mYBk1i2+OI z?0&F*;W!OT6=)VwH$)aVf6P?VFarmkflLCHS-YR9x?$$j1~tGZ5(UF3mfrL4O{yDW zm5fy$@mVtpFo>;SvXY5^0Y~Q5(h(nIG~)ma7g|FMHRLx_TMe{V`wT5NIOR`95ea^J zXn4FQW)>NZEHYDD#}%sn8I3CV!v{riANXHXbwQJg4TW4`L4m8-Pv9z=0Jw_n1g@eQ zkE`U%E2%_W#byFmQJBY7?IkkPrN>oo(0U>hi-}AuCHRYGkt&&g8A7H84sto~$)$J6 zi}Vc)8-NaX!Iovc%$9-IL8eur)dGJqXSa}=cxxjRq2%(U(gAhx1I=z|$9qi8bDaZMFDtBZr+6z!F<^wSAK`A{W zU(DzGvwGp@mC=dsgMr 
zmw1N{n>v5oN2H%NrPAAol9YUpMLn@Lj9H#1vKuF|p{PdK0QloqBVng2(^5;voYcnU zAoQCwAXHbq)wwpN$SI5&nv~(h@);riV!@29)Ss5nM5YTz3bg~NO(KL5HZtyxSS$lJ z))BRCG!&@sc!`UNSe&#(a={!icZJbdD3X$htYUvcSOU8!cDU$xI~xjlq?9N3pg9NJHs0It>X*5k@cLm+{}L~cnf%EQ zc-=^s@@Vqh;zpCTxj7J?JS6!7ZuJ+(`<;LJdykN|e?KPT?&r_R(oF6?j!qCKXw18F zb9YJk{2KMudHLECsXSqYE8v^#8Dr~Azo9mGT##rO1)y-dh9DVk-Uy2*D+`~7nqI$nd>;3yOMZxF46W`(T?n-~{ z6?EdyQN>j_tu*^*omgj!f8Tq#yOyLir=19uFQY84M9uy)vW$#Ui^%0%cORI+CM@p4 zL>|Ef^zAExmW3vf&CCaZ^qa1`03|E#d02?>?hbk-5PziybII`)=J4QNnLlaz{Jn?J za^By^1U|q~JY@=2g2(ZL$&}8q315FET!K&#U&ed@ywkg;B6Aj>>PJctP77ix!xxY1 zX7>m-ZAELQWK_ie&Fz2q^V2UXm2su=Fj@g0ETPiY(pV;_YrziRSl?Pp`+M3jm~tjN zhgOknQKvuVAeLuGprP@wh3sTj$Oal4!6bPDBR4-tRJA3dtzsTBX6R$SeXxIAzENvN zc2Cus;qWV0KdaR|zYP6S7?nSTqoW}D|V#(HKm(ijEV0|s`t6Y?P#S)vzPmjsOC-Vx_#oE9?35Oo? zBOD41Da;C`hGLvbM<3_cfpC9E;L-wTnbo>2B+U#tnS-1xI4DyPcGN66=0_H-4H&-m zi}U!Fgq+QK{r5BM%)%TXy6)>;dU|DL5|(SWk8LCkWuz~$y&nH26~`S@$a6UFN4^bh zRFyGv8(zPD5`bPN>&j7r6M?YGW!3fhWD}QJ?fPV|y5QXA=k)Dv0=+uBzMbGjXqK;* zx66bagl zcpo7lVPf(X{8(>k;Neb~W0Nb}XoiOKv@p{39l%Ej=f{E5FRag^E=IU}`SM999!X}D z6aIDa;_v%K5@Irw^=7dwT?(EDp@YB^IDGyTKoB+^;nu+0e(*$S~2t?|7Gw(N+|#lL1{mI=|oq0CCkCw}M zRWERde%S*(h7<&h>jSuAD#$OldZ=RY!suSN8snK<&e43U#BCy%Tvq%xK2^psyrABK z^9|S94q9;*9+nREp_XjqyMyVuybnmsT30fU7D4^Ymu}0NfxAa1LBwp+N=wP*mo- zA)UVEu6sJ#5|%Y-V`E%c`u*BsQJ85?xF11xDb-%QNwZ9SKLFnv}oRgkYd0v zX70=l%6 z@aAf6SzW!RWPgZKuTr+MQZEXY8Lj~{2)ea_L4U-pd969=iRkws7DjN=XE3qkz~-hR z^8}zdn3wC!d=7cQYG%=RCYT$*=o@=SU=2fnj2Z-TCO8@3Gwove@Dr>Q;y)mA09CYM zQ*tc4dLY-8Gq*%mRJdCXIi~k0&y8Z_yCzP zvfLwMbR?1fPGGsLt$JzXGlf^nLjf}$acpPjzFn8vLll!uVD}oynuGh8S3Ki7h@xn)I12^ z;H^E_NW+$FhL5)Ju`jOx#|k(7HH8L@xR&HIBA*ysj_b#dEb&|#lndK#2yDUvr7ytf zJs7!#fqoTbzQt_UAJ9y#ge{=>K+;l?JTfJDgc2oVDKf}i;x4e$95#?7`xGU5OznP)&H17gDtMvB*bOBiR*W5@Uhqf?1jzZ!t(B#^I;(wtQT!mPR5U(veDw zmGiF@G=E6SY}g&*HrK;B(ip28D#=mAsZ^s_S%ieWi@c)@CTX`XhFro#j}I<=W9EYy zm7<#|Tbi1x9%&vOkCQAPEcPl;%8|^QW)L&bg5cS7VIj#o9bc!~9LG9&@eFQp)GA1& zn-$9;Jf3=>BQt-G@?0nL)}{ul#Q9~WWQ`^wkpLBi()dBma*l@a;v#S<5{C}ExNLdg zNc7*4oV`qr9fC>7e~!g<`mbNSjQ5~@38V8WT|Dz70?xAPd6KjAdke!r#I|!v?k%1S z1%~gy9lFoZoy_}`z|#k1lkNDSP6{Daxc>+qfyV2$Q&ed?Nt*5(Gd{8$oZ1*Z57h+} z7BUg?11CpD-{c$LwxOSd!g58HkX`N4+cp=g_k=0TpU2P8#c|92$&bDKXMt3`@*%&fug< z&cE!fMm_ZzdaU4pLZQ2Mq$_wWIL&ce*M<6*M}KBF6nC8Dhnuy_c3>cr`JwKujHRbT zAWT`o1Z@hE0f}yDlyK46^U_R+)3q7T0wR8$5SFh@u(ojH=h-`WUo`C~DY~ zto|8mVknEE`zW+76$h=j3t8Ay;t0GbIWyft_{+fA&eZrb*2i!Sas`vMumPo}30hqh zKfH552tu(@p_#XIV;;0QQ-|niB7&VIP3d!fA|%#?XqPW_S% zH0*kpqETSpcr7{M9E3H5trBTy6IcUm{H;p}$U)r{inq=3 zoecostjce>?#}P!@I#QIbWV@#gAId40a{||q&6oBuKEPQiGI>eq0zzX7qYa2VtOXK z`;_`=-ijP8&iKOertz1rNP1%T$4m(yVkuS(WfH9l&>QR&l)GuQWQ>_Lnq@a_nX=^F zRe$>p9_yN9iViKOh*+!^50=^3Lo2C{=T;4%4RaK)RDYqIbC*otA0wAB34~yK7AGe9 zV97$}h&&d4?E^?m7E*Bx=R`(XR5a1#Uhna47k=x z91MC^#0(cArFrNHWErA82La};9+{IQ(bQ_y9nb{n@yL{Tics;rl0T`7&0H|@6I`@_ zr*S9??n&$+mf^vh`@j6S6Mgq}noWdaaa>Nt5H{mY=3=NrF+p@y*NqnoN@2We;qC>~ z(TA_rU7=m^%FF9HhDFguCB-_|)S?9)8+^2MMidQjEg88nJf*^i@Kb8|!?^M8t3so~ z0(Arbkzl(cV!zeiQ>zwWsUuef_uNMTM~-k)m02qn#zxq-B#Yg`Y1YT%-=n~vd^bFh z#d8mqih16=Wl`_Bw^tv|E4_9&C_UicW+@!?RTenfl3z92zB_^)+S4eG*rpf*nAV}H zqdGS@?^zo6jfKNt&#QwsxmN$ zI`yJnCF8k&6pz}bH3!W`>ST^@vj}$MtRpMsCEfx}D6+~JB^O1bm{VkxO`HmN*Y6HVYVO`NAhlH1Hf~wBv2nFcqOJBX)VjCSjAS9~l;? 
zJ7&NRx^w4~G0AlIAheYNI0X@`BN90lLL?7agZ$0K&h$D0`OWL=R@4)KM;c_t#E*WD z94ZCaq&Ll!6mK=N3p)1B2xw^2ViRip_W*RZUE<~ zLR_*}%#ZH){s-5*gBA9OfKhON1^D4`eBML?iS{Jg-j4foF1GuLAa^k}EK1YxCT%J^ z+^cxyu-BtALWm)g?NM(aK(YX;V8zZ+fU!_HMr-;}>RvR_uF+X$8iyWRkylj_4kKpx z(2*L}kfx?VLWM&1f=>PElR`8BAFY!vEw?T*K~U&&d?9}C-&twts#Ux!Gm-`!T-Q;L zyIz6;PE{6Q^0j@}MLh8bLEQ({mtrTYP{ZAUm^y_0odz?|@$wyb@t3?Wyme*fLe7~lI$GA|uV`#;^HxkOn++5p8h)&R#mEZTyGJ`N}A z(lE6&Md<5l$T;SpT)`zLQA-?^2WYvJcWF*JYgyo)$pxu2S+e+K@&XM93aO{MxAOS! zY*br}C}iCRnF<`>c#Re|2BCL10bR0pL7+q+gjgY$T}8kUQM2llDphTS0I}=xNbP@6 zllvU*H$P5Dkf%8VA-<-1x${ET*ILAnk88}Q#I6TSJW-UTc)1M_z7S3uD1DqWj{o$G z96GvQ2-?8MQ7C&9Oum0X4`Tg!nak#q2JIvO>+xZ>o*Ffv+uz{|xUq`n)KEO$t$PQB zZu=2Hn|w>k-S!v13Z$~Fuf%}+W-5e^oCrFK%$5XKdcRn@&&I!;#RXa@aZ5%jy{BWO zY?Tv@adi^OG}I6xr3s<=xlr97jR&M3H-vt9_+&|t(Birltg^EO?BC%Dbh*n^G3}|w zV)qYv4j| z$M)F2;S1_r*c!>-tN!_I88m0O<(*LJaSP=@u22i}BDRY*)AqGu@j_9|{?9v$lO2W7 zFh4*iqRQ2iK#et3z6x3u`kyJ-p-2#pg#2p(LohKQ+f^c<5qzk){cPZonUy*!^0a}R z<51L$j_=isw~RyG;bfRn1ggF5g6SimzWcb7C{;`nr}UW(tMGUU*%C*dxJ8`$U3X3k zF02QKiHiKT5dblYlsM$L>W6DyGZciN3)nmF=pALR zV4CD~O*@Sem=B2pAVY;&hDNFVx2qvVMTpjo>M7z;#&k=-U^wy9M1~MYb$t%M+lUjA z|B>FCMck+yoz1%}N<$uVcw+Q=!slgAKSlv>&9)}GRH>0oRyjSJkuLYuMfgmXE%&z& z8o)tFe{-DS=bTlP5*!_LfsL^e%5wTU(p2Xca8_nH_ISA<*wd<=HILfzbw8vLTt^ie zSKbICR!w1)(a}rKALvVp-IGL)Njsqa(6>*5w@+SyT$&)?(~eX0&PVS*s6L&F(RgYlLHC2aw*LNlkH(QmXKPG)(>5HgVWgY$I7uuI$o zJJFGo-O@gRr?^z1)`|FAc0`@$yzlJ)0greZ#7rfq)`P5AzWO$wJ<-m3#%p{1-<`VN z@te~8$Uk1Mwo(G#)z2b_*>x(uX(EB3ec40<5NBNx`X4iI-dEC|fz)}%OmQJ--MJ}t zuTb+-)P3wfXXr*3y~)`a0-vS7_fx0uHKt=`-xNGBUBeB*P+haS2Xp(HyA$=dom%@6 zv>|m3r?qggT0cHk5BUc&JqIm#C6zmBwizsgM6fXsu1hwyw!xX2?!-P8F=Byc60n18m2ncAL{i>t7 zX{x^|Xsj&^{1hgnkFM_$uN+co1F^Pw9I&ZNq0Ps2+kr#(RC@vui<3fl;rUEEE;`2+=u(#w?rM9h;#z^KUOI{z|yJ15h9`GXeXDUoX@Vr_x_1$ zaG$IZO^p)`317A^Vs<$K1TilG++KvV0Ii#ykRcRTf;{bWyf8BNJsBVt0g?(4A_fQHiej}B?5P4%91Oe^N#k&K*63^M5o+OlX|mC7p=nfznvRl0XzcAh%c4w_G zE9V_?5$Ps>*s!csp!dlmI167Iz5Uta(chwipba74bYix99!7E&h#g3Lw)9mVBuA>2xepsXBt!;EGEh8Hu zib1NTcxdY|cmm5!usnNN8G~0E`hO^J&Ni7OG`} z7z>K5MC;fNE*#DKHAYlSoMtPy>lKrry?dM(sPeTAA+ zV;f1a)PwhJINc6jNT}a3Upsy0v@=TLkHh+sl z;Y_$bKp8s9d7xH@jz7$w%KmNRtd7za#1B9K28aQLE)$=2V_SKEL!ZlcGY)4`p2EKP zt`8AG{11iB1(zDA1 zy{DHtBGgpOW0q%>Qw|No{W>e!fC;{aO=`8=J`f%y8t{=hB>mi)TXUt6*Sn=#<$WuH z77*H&ymXc&Ym|L=M2}_oqaHPU)@ph;76TMUP>0uz5vX2yhT?L_nRoY3I)X$D2Y_~< zJ^0QyY5!7bD=NjeS`&1A(QGhwnP$R|O8eV6!@kW4?=B}krvYr zl$tLuB~HJ%qka7|GQhC^nF)EJsZ%0Tp7ozvvX(;L{jtG-{V(}-h-z?9p287|;30;A z)jVeiVSL90mpDYQ$`p7pgMdWP>HyLanJ;iWvCImIf^+(*QOht_GOzD#?i3|?DwL@wKjV9=6oQxuxUTxic=$zg^Grl-VD?t>C55&2uHm__oO z&h3f51yu`wp<8B_pVSzq@~g!s;Vg<_MaI%82N)!j5k4zmepNtl%2a~#u~K-LtO16Q zC#hd^U#{vs1J=WHqno8G)qx)Q3(Q0JEoT=@!uWpKMxK_lpY4MW(IH0v`8o@d#pI(F ze5oDB{cyn+u@BTg4;b!_9nZ(^oAy7k%_eD1H+K9}Bf>%;uWQGD5iy`LVpUQh2kt>k zTl?Y;qDKm)5~&>ztI9zlcWl$5b+_vykKKED*TZN1ol4{38oY1V*s4&TLGQN2$;;@9;!nqn;RTAEyyFrD#7zp_O~opajx0XEmMaTn ze_T|l(Ob_Eld9mH?q>WTRe84Zav1L*e|JR%rp&n3Jst|k*Yg^@8LP_B6OgIDf@jPw zk?p;G!o#<=CV^z;{Qxhj6ss5}-=7|%MZ`t9PU61i(_*7~2LIYdFOKK|?3GrgA@Rl= zB!AoSwS5}Lh_O-6Iw~2Q_J5@Fl*w8lQ3x5nqPz{3K&^0hrIng63U=c9_o z9YG_MKnp(LYVVG51v!@E&aPjrLU6W;6$cqR$^z2lIwxSN)-NjSLg>ibBTNx8#hz=3*l zRU9e=f4|t9x%ZmIw_J0?Q(bK8)?%WXcW9og&tnO!HJA8#~ z#q3S(s(Cj0z|MVWA6J2d`9NEXpmh9rp#**2wEGJ4zyWFQn^0zT&KO;{_ug@+;;TE73OIZzzL~ms2ec)fJ|HU}(bS17 zl6jg7Vfj!pVs`LjJjtsxynelbDF`PcYS%)JVB=-+OsJuH93j-G;LiTb+uypp-U2n8 zmU}ex$spJpzg{=MD&c&?cx;R5xU@nydN%j2F(w2ctdjOD?!t(XM-Xe+r>CpA1(+7z?>80R2 z&)qJSm|#OXnld~~MNF@G?uiL>ncjzi$CnBESRLnc?Q6O-OoSm2` zKm!J|SEvIqECxBLEUdF_I7(=Jqs^r#X~t#`oek{rp>p+Y0vXcFFn(>99*j)k|DiDV 
zI>s+=g$8_$i>)V|1cAy*q~6y4%$zQ#%{Ke(hE=-K69N_Er`$_8M*K zZhCdLZV{POxT4?7#}@I=xfi-hie+j7b-WuppSq8*Y6h(Ssjyfs=kd1VutLIup8)k$ z&=5UF-USx&`D6*^nu^p*=Yu?a3!6}0zyu^tviUCeTJ}5-Y*7#MJG(lYqC`o>L96GC zBLWpbj^VF+V7MnRCP*Su?UT|oZ}rXdMl{F-{VuBojl5d~D;iQ?!c+;dc-v80c#!6@ zLblMmO&a`t@d#yMEm4x4UB%oN2Z73(Y2PZ9lAJq?Q}-CY5*beCIT3WN*vE7wR?G*h zeMGk4c+UvQYpIkRVa2ebk!JJp3~=d<6Xp}g7e>y1G(9{bE>@_KlqDqc26Eyi$5|s^ z7b`RvW=o$4#T$l+xW57+=@oInZ$S&aPY(kM&+=qtKda^&OfS>-76Pv(TeRZ_TJ1)dTkF|eUj;=*XU z%BIqaDTjKBb6XWg2US^Jm{pd6%d1H`jr-gCC&3|R9WE-w>g?P8arrx9;M2m{l~H@G zGlMF&B1f~Qu*CS_#gvs{7kZ9m{-@JuN~LRh0N0`!Dj)^=0`3N<-_P~{_D$$dktbAe z`)B&I_{v%Tt!64x1Mpq^S4BfUa((e10xLg2$gxO_9cYs{20cT01`8*y@1Dqn|8W^O zx8}^%9s$xzz#s^|ej{w7rP;+w+RSZ$0~4;EVfGvp`7rey*<|Gs>16JK49n2EMk}am z(X^5>J8#EiZiJiaGy30xCczb(DN&|QOc|{{g_T|fEg@aSGN3-Jnxq6$( z>08L8Ho7xecTLfHIs;?KBS(gam`ql1hICHAM=iyd`D%MoDNZkt!T804QrzlXOwE; znxp9iWycr_NTm|3k@T;-P3unyxd)L?>>6KSa*HHXmT>PH(cUm)>h*-VpAodz(>qhg zE%IgOYy-y^$2A-gs|qH@$xEc-+3VG~R5gr&qLgVjTAJxX1uximJg29aW)YU^R$pnx zTtNXTZJzfW9XVw2POLqXS)WMwpqMmt%Aq_a^kf1wR7^zTZFr*GQDmh^r4(IwJgf=V zR;B`L%qgdJ#<6IeG92)q1vHHkWrZILm^@Ew@4#bh`65vu>_J0`7Btgq_M*{hv;8+e z%t`*wi|vV)^NH32vP;#Q+y8W5+LwKJovYsRmTFGaU8~-->O&0!{a^k58`T%DKXa;~ zoZGRS-Ld?Me6ID768-mQcgwmr{r`vli~es7_`gQ~FTu~#ZR^yR6|Gz&9-eq@><4b1 z9Rqn=n!JdvnU%>MJcKQ(Usi)X7JrN$+s?I;c;NnkPz#je0(r$n8o>@Uz;WbV3OqrE zs9$gaEV!SOh%|sc!A$)%-SnVMBm-j;?np!CKAIa$vj)?f-f)i6ms`5qLHpaQ)d-&t z-@Mo7Ajc%`f}rClQkHx*!@6#K1=_#rO-r?SdA?XKT>f;|VC99nf%}^R012v3XTIUc zB~Xf~lPfH0;HElb@bG9KxNuYjb!Oaa^o!`5ji*NBH1sKvu+(PT)JtFP>~8ow4&>-e z{#snDx4QS&$=*&8ce`s5v*HAZo3oabfqf&uL^UXGy4w|2np*SnbtxDzunFX{#>IXZ z`QdBBKe0RLEy@_LJj5f3cWz0|Q=pz{G)4Le{}qY`2=bBtp|ZxlWm~1O2a0Wh!RI10 zRd609>z=3w_h5=U0CHk|gUDo!taW`v;29c0R{jeFH`^`;o?M~D2sf`SpB&K!4r~=^ z%*_e1t4vKYpq1_+F8g$I0B7^HkxdZXzzawE);-a|3x$usG?8mI<+DgbY#x9dCH zgc_X}sPi!FESS6Ji7{~Bx$@{*!$WSg{ZpwIuOH`|TS9=DG4Oe?TmfMcZbcJv&n4;&|LZ;HpVli+1CNs_ z141@poMj59-kRi28yH9hhTlOW9mNgP*z|L46BX*3HM z(3+kltj0K$)u|t*H0wnHi3g+E@p5~3{a286q+%4kgU&M(9LlI#gIzJ9yUK zI2t}oL&G+L7;GOS3hE) z!Q2nRKC{0eDcr3b>vMqa06Eer5m?wiU=9Ku@fG{XSU4!z#cb32#)`7;7j-i7ugv8H z1UzhQ9!3Yzg@N`x_}|e3g$g>r;4qyt*$r0TyM#GlXp z69Lm^Nm76%-ZA@s#M6O}M^!sOKyjp~GlziHBQosbJO*X4LUMluwG)E3lA*XKa3g;I z8QL;RO5N|s6RJ8M&LP*eKLqSEDDJsfiHvjRJqF%#s|XX7!~m8_f`OIOhs3L%1%fpJ z`4CSCp-F&MPoiH)QuaUP?Id6w`U}ojg@;3xxxCMRz>?HHJ+X!DZXjM&-J4^4JP70- zf-wA}2nkt<-Ep+PGuCVFc+6oQ2$8&xF>Yf;PC^zT3yM0AD&~VlGFqTfOXfkeNAYX) z*R{+-8ESs2@P*5{e)sl_1Thli2n8<9|+u`5gV_6Hq_>XM+HqQ5o@!{u=gF z!Ut%`_8-?!i+?`i2AC&|4G2c|z93wYHgYatT`)6)S_R5BPK)LDyb>%K!Sh^}mP>K; zP}k!nPoPwPhPzlFanotufS}$hLdo=S%-GR$;Lm}llanfd1-Or--vf}B*80wX*Iqe-~Y5b*HXE}1eS$|*pc z9Oyu5KDa){#&p>JnzD6jz2Mx-qljT_zO>p|JxPH=`pbT=!}Fqnke-b+C=zcTW5&Nz)H6z znBg(H`q`8M3dddhT_|u1+XcTDgQP!hCwqa)b6ZX*qE=U_-LsX-{xb+8Q|F5H{1CUa z6`uSF&D5erbwi=lz;WShX*@uNOYNGA+Lu_DN_ZW4*4sF;tCRyamQtdGYAZy(>-ctB z8mK@m$iE1$c((lpd_6QV4jGAX4SlG64RpMo?7A5Cj18Uaqe|~BN%18MHh_;wZ$8&#vpa^rJnf6*0 zAp`{DXyl66(C+J?Uci4{MFbqw58(Jih6OcqZWCPoSCsr%AgC9rdv^~&Iw@SKPV`@` z6T6|Q%^PoVTDfY{_gJK6AcB-%?qh8wYf|MJU9(kqW%?fxB5^|qYCDj{FGc7dfVjVq zwlO(cuN|!)NK^teYBv9H;#n8eWCcNv&OOt5?(<|qHD zxqYR@z3w;&>E8J)5BW~v077y+k1`8q6-0sugnoV6zEMqowUim z9FY4H#^jo}Jf9QuQ=_G0AWLqasi@fyQ1%A_|4vZbYR5n#_&HOrR)f=p7nPBmE4Vou z$C6~Dy?y?eU(0tD%B;ArsSszoIhA2x4x zW>LhouuOJac3jica8Y?BrKF?;?hw3z(x-PqJeQrTT3mhCAe$B9&J^hunA69uB4_cy zArjt64k)u%8>l$kBf97N;I#Pveo61k< z5ZqldZn-IL-(5yxr(Nmk0`!p4{Z$+lFGu!pz^7}(zGC~R6JX3=#)=hlLO zJtM_HZtXcC(J0`tr!g5ayVI$?tE2+Z!t74G#e`~{!cfqmgN22sq#OT(jr~nNJMsXt z_P!w}@668ocPo`G8pZDloV^bIEXtjb>+mk>G_3k=suM#lbv;e+@3NY~9g4A1k<@#vNgE}6Kd4W>Qz)|B-4F=8wH4R|u+b$0&pffe 

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 41804f3a2bb1a..61ebe6e5e7750 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -2590,6 +2590,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.OUTPUT, ], MODEL_ARCH.SMALLTHINKER: [ MODEL_TENSOR.TOKEN_EMBD, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 18dcc6ddfe567..c759a9c6d9e05 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -2010,6 +2010,7 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" }, { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, } }, { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index c4f0b12f247ee..3c8440a8f653c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -5474,8 +5474,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_LFM2: { - tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; @@ -17787,8 +17792,7 @@ struct llm_build_lfm2 : public llm_graph_context { cb(cur, "model.embedding_norm", -1); res->t_embd = cur; - // lm_head is tied with embeddings - cur = build_lora_mm(model.tok_embd, cur); + cur = build_lora_mm(model.output, cur); cb(cur, "lm_head", -1); res->t_logits = cur; diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 1676c328364f5..b3628db64f886 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3513,7 +3513,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str const int height = img->ny; const int total_factor = params.patch_size * params.proj_scale_factor; constexpr int min_image_tokens = 64; - constexpr int max_image_tokens = 256; + constexpr int max_image_tokens = 1024; const float min_pixels = min_image_tokens * total_factor * total_factor; const float max_pixels = max_image_tokens * total_factor * total_factor; From 4afb0a746f22abaa545b3ebdb76a400d7da3a713 Mon Sep 17 00:00:00 2001 From: 65a <10104049+65a@users.noreply.github.com> Date: Fri, 22 Aug 2025 08:10:14 +0000 Subject: [PATCH 120/140] server : Support multimodal completion and embeddings prompts in JSON format (#15108) - Use server_tokens in more places in server and util.cpp - Convert most functions that used llama_tokens to server_tokens - Modify input tokenizer to handle JSON objects as subprompts - Break out MTMD prompt parsing into utility function - Support JSON objects with multimodal_data arrays for MTMD prompts along with other existing types - Add capability to model endpoint to indicate if client can send multimodal data - Add tests. 
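For reviewers, a minimal sketch of the prompt shape this change accepts on the non-OAI `/completions` endpoint (the same JSON object also works per-item for `/embeddings`). The `prompt_string`/`multimodal_data` keys and the `<__media__>` default marker come from this patch; the server URL and image file below are illustrative assumptions, not part of the change:

```python
# Minimal sketch of the new JSON prompt shape (assumes a multimodal-capable
# llama-server at http://localhost:8080; cat.png is an illustrative input).
import base64
import requests

with open("cat.png", "rb") as f:
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

res = requests.post("http://localhost:8080/completions", json={
    "prompt": {
        # exactly one <__media__> marker per entry in multimodal_data
        "prompt_string": "What is this: <__media__>\n",
        "multimodal_data": [img_b64],
    },
    "temperature": 0.0,
})
res.raise_for_status()
print(res.json()["content"])
```

A client should first confirm the server reports the `multimodal` capability on `/models` (or `/v1/models`) before sending such a request.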
--- tools/server/README.md | 17 +- tools/server/server.cpp | 77 ++----- tools/server/tests/unit/test_completion.py | 38 ++++ tools/server/tests/unit/test_vision_api.py | 93 +++++++- tools/server/utils.hpp | 236 ++++++++++++++------- 5 files changed, 323 insertions(+), 138 deletions(-) diff --git a/tools/server/README.md b/tools/server/README.md index af9264ddd38e4..86844225ff309 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -226,6 +226,10 @@ services: ### Multimodal support Multimodal support was added in [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) and is currently an experimental feature. +It is currently available in the following endpoints: +- The OAI-compatible chat endpoint. +- The non-OAI-compatible completions endpoint. +- The non-OAI-compatible embeddings endpoint. For more details, please refer to [multimodal documentation](../../docs/multimodal.md) @@ -400,12 +404,15 @@ These input shapes and data type are allowed for `prompt`: - Single string: `"string"` - Single sequence of tokens: `[12, 34, 56]` - Mixed tokens and strings: `[12, 34, "string", 56, 78]` + - A JSON object which optionally contains multimodal data: `{ "prompt_string": "string", "multimodal_data": ["base64"] }` Multiple prompts are also supported. In this case, the completion result will be an array. - Only strings: `["string1", "string2"]` - - Strings and sequences of tokens: `["string1", [12, 34, 56]]` - - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string"]` + - Strings, JSON objects, and sequences of tokens: `["string1", [12, 34, 56], { "prompt_string": "string", "multimodal_data": ["base64"]}]` + - Mixed types: `[[12, 34, "string", 56, 78], [12, 34, 56], "string", { "prompt_string": "string" }]` + +Note for `multimodal_data` in JSON object prompts. This should be an array of strings, containing base64 encoded multimodal data such as images and audio. There must be an identical number of MTMD media markers in the string prompt element which act as placeholders for the data provided to this parameter. The multimodal data files will be substituted in order. The marker string (e.g. `<__media__>`) can be found by calling `mtmd_default_marker()` defined in [the MTMD C API](https://github.com/ggml-org/llama.cpp/blob/5fd160bbd9d70b94b5b11b0001fd7f477005e4a0/tools/mtmd/mtmd.h#L87). A client *must not* specify this field unless the server has the multimodal capability. Clients should check `/models` or `/v1/models` for the `multimodal` capability before a multimodal request. `temperature`: Adjust the randomness of the generated text. Default: `0.8` @@ -477,8 +484,6 @@ These words will not be included in the completion, so make sure to add them to `t_max_predict_ms`: Set a time limit in milliseconds for the prediction (a.k.a. text-generation) phase. The timeout will trigger if the generation takes more than the specified time (measured since the first token was generated) and if a new-line character has already been generated. Useful for FIM applications. Default: `0`, which is disabled. -`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `prompt`. You can determine the place of the image in the prompt as in the following: `USER:[img-12]Describe the image in detail.\nASSISTANT:`. In this case, `[img-12]` will be replaced by the embeddings of the image with id `12` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 12}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. 
- `id_slot`: Assign the completion task to an specific slot. If is -1 the task will be assigned to a Idle slot. Default: `-1` `cache_prompt`: Re-use KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation) enabling this option can cause nondeterministic results. Default: `true` @@ -638,12 +643,12 @@ Returns a JSON object with a field `prompt` containing a string of the input mes The same as [the embedding example](../embedding) does. +This endpoint also supports multimodal embeddings. See the documentation for the `/completions` endpoint for details on how to send a multimodal prompt. + *Options:* `content`: Set the text to process. -`image_data`: An array of objects to hold base64-encoded image `data` and its `id`s to be reference in `content`. You can determine the place of the image in the content as in the following: `Image: [img-21].\nCaption: This is a picture of a house`. In this case, `[img-21]` will be replaced by the embeddings of the image with id `21` in the following `image_data` array: `{..., "image_data": [{"data": "", "id": 21}]}`. Use `image_data` only with multimodal models, e.g., LLaVA. - `embd_normalize`: Normalization for pooled embeddings. Can be one of the following values: ``` -1: No normalization diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 35b060674bbcb..6eb5aeb582b3a 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4309,6 +4309,7 @@ int main(int argc, char ** argv) { }; const auto handle_api_show = [&ctx_server, &res_ok](const httplib::Request &, httplib::Response & res) { + bool has_mtmd = ctx_server.mctx != nullptr; json data = { { "template", common_chat_templates_source(ctx_server.chat_templates.get()), @@ -4330,7 +4331,7 @@ int main(int argc, char ** argv) { {"quantization_level", ""} }}, {"model_info", ""}, - {"capabilities", {"completion"}} + {"capabilities", has_mtmd ? json({"completion","multimodal"}) : json({"completion"})} }; res_ok(res, data); @@ -4356,56 +4357,15 @@ int main(int argc, char ** argv) { // TODO: this log can become very long, put it behind a flag or think about a more compact format //SRV_DBG("Prompt: %s\n", prompt.is_string() ? 
prompt.get().c_str() : prompt.dump(2).c_str()); - // process files - mtmd::bitmaps bitmaps; - const bool has_mtmd = ctx_server.mctx != nullptr; - { - if (!has_mtmd && !files.empty()) { - throw std::runtime_error("This server does not support multimodal"); - } - for (auto & file : files) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size())); - if (!bmp.ptr) { - throw std::runtime_error("Failed to load image or audio file"); - } - // calculate bitmap hash (for KV caching) - std::string hash = fnv_hash(bmp.data(), bmp.n_bytes()); - bmp.set_id(hash.c_str()); - bitmaps.entries.push_back(std::move(bmp)); - } - } - // process prompt std::vector inputs; - if (oaicompat && has_mtmd) { - // multimodal - std::string prompt_str = prompt.get(); - mtmd_input_text inp_txt = { - prompt_str.c_str(), - /* add_special */ true, - /* parse_special */ true, - }; - mtmd::input_chunks chunks(mtmd_input_chunks_init()); - auto bitmaps_c_ptr = bitmaps.c_ptr(); - int32_t tokenized = mtmd_tokenize(ctx_server.mctx, - chunks.ptr.get(), - &inp_txt, - bitmaps_c_ptr.data(), - bitmaps_c_ptr.size()); - if (tokenized != 0) { - throw std::runtime_error("Failed to tokenize prompt"); - } - - server_tokens tmp(chunks, true); - inputs.push_back(std::move(tmp)); + if (oaicompat && ctx_server.mctx != nullptr) { + // This is the case used by OAI compatible chat path with MTMD. TODO It can be moved to the path below. + inputs.push_back(process_mtmd_prompt(ctx_server.mctx, prompt.get(), files)); } else { - // non-multimodal version - auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); - for (auto & p : tokenized_prompts) { - auto tmp = server_tokens(p, ctx_server.mctx != nullptr); - inputs.push_back(std::move(tmp)); - } + // Everything else, including multimodal completions. + inputs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); } tasks.reserve(inputs.size()); @@ -4574,7 +4534,7 @@ int main(int argc, char ** argv) { data["input_extra"] = input_extra; // default to empty array if it's not exist std::string prompt = json_value(data, "prompt", std::string()); - std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, false, true); + std::vector tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, false, true); SRV_DBG("creating infill tasks, n_prompts = %d\n", (int) tokenized_prompts.size()); data["prompt"] = format_infill( ctx_server.vocab, @@ -4585,7 +4545,7 @@ int main(int argc, char ** argv) { ctx_server.params_base.n_predict, ctx_server.slots[0].n_ctx, // TODO: there should be a better way ctx_server.params_base.spm_infill, - tokenized_prompts[0] + tokenized_prompts[0].get_text_tokens() // TODO: this could maybe be multimodal. ); std::vector files; // dummy @@ -4634,7 +4594,7 @@ int main(int argc, char ** argv) { if (current_state == SERVER_STATE_READY) { model_meta = ctx_server.model_meta(); } - + bool has_mtmd = ctx_server.mctx != nullptr; json models = { {"models", { { @@ -4646,7 +4606,7 @@ int main(int argc, char ** argv) { {"type", "model"}, {"description", ""}, {"tags", {""}}, - {"capabilities", {"completion"}}, + {"capabilities", has_mtmd ? 
json({"completion","multimodal"}) : json({"completion"})}, {"parameters", ""}, {"details", { {"parent_model", ""}, @@ -4763,7 +4723,7 @@ int main(int argc, char ** argv) { } } - auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, prompt, true, true); + auto tokenized_prompts = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, prompt, true, true); for (const auto & tokens : tokenized_prompts) { // this check is necessary for models that do not add BOS token to the input if (tokens.empty()) { @@ -4791,7 +4751,7 @@ int main(int argc, char ** argv) { task.id = ctx_server.queue_tasks.get_new_id(); task.index = i; - task.prompt_tokens = server_tokens(tokenized_prompts[i], ctx_server.mctx != nullptr); + task.prompt_tokens = std::move(tokenized_prompts[i]); // OAI-compat task.params.oaicompat = oaicompat; @@ -4878,7 +4838,10 @@ int main(int argc, char ** argv) { return; } - llama_tokens tokenized_query = tokenize_input_prompts(ctx_server.vocab, query, /* add_special */ false, true)[0]; + std::vector tokenized_queries = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, query, /* add_special */ false, true); + if (tokenized_queries.size() != 1) { + res_error(res, format_error_response("\"query\" must contain only a single prompt", ERROR_TYPE_INVALID_REQUEST)); + } // create and queue the task json responses = json::array(); @@ -4886,14 +4849,14 @@ int main(int argc, char ** argv) { std::unordered_set task_ids; { std::vector tasks; - auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, documents, /* add_special */ false, true); + auto tokenized_docs = tokenize_input_prompts(ctx_server.vocab, ctx_server.mctx, documents, /* add_special */ false, true); tasks.reserve(tokenized_docs.size()); for (size_t i = 0; i < tokenized_docs.size(); i++) { - auto tmp = format_rerank(ctx_server.vocab, tokenized_query, tokenized_docs[i]); + auto tmp = format_rerank(ctx_server.vocab, tokenized_queries[0], tokenized_docs[i]); server_task task = server_task(SERVER_TASK_TYPE_RERANK); task.id = ctx_server.queue_tasks.get_new_id(); task.index = i; - task.prompt_tokens = server_tokens(tmp, ctx_server.mctx != nullptr); + task.prompt_tokens = std::move(tmp); tasks.push_back(std::move(task)); } diff --git a/tools/server/tests/unit/test_completion.py b/tools/server/tests/unit/test_completion.py index adb6f27864ef9..11483e679a505 100644 --- a/tools/server/tests/unit/test_completion.py +++ b/tools/server/tests/unit/test_completion.py @@ -6,6 +6,8 @@ server = ServerPreset.tinyllama2() +JSON_MULTIMODAL_KEY = "multimodal_data" +JSON_PROMPT_STRING_KEY = "prompt_string" @pytest.fixture(autouse=True) def create_server(): @@ -231,6 +233,28 @@ def test_nocache_long_input_prompt(): }) assert res.status_code == 400 +def test_json_prompt_no_mtmd(): + global server + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": { JSON_PROMPT_STRING_KEY: "I believe the meaning of life is" }, + "seed": 42, + "temperature": 1.0, + "cache_prompt": False, + }) + assert res.status_code == 200 + +def test_json_prompt_mtm_error_when_not_supported(): + global server + server.start() + res = server.make_request("POST", "/completion", data={ + "prompt": { JSON_PROMPT_STRING_KEY: "I believe the meaning of life is <__media__>", JSON_MULTIMODAL_KEY: "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNk+A8AAQUBAScY42YAAAAASUVORK5CYII=" }, + "seed": 42, + "temperature": 1.0, + "cache_prompt": False, + }) + # MTMD is disabled on this model, so this should fail. 
+ assert res.status_code != 200 def test_completion_with_tokens_input(): global server @@ -269,6 +293,20 @@ def test_completion_with_tokens_input(): assert len(res.body) == 2 assert res.body[0]["content"] == res.body[1]["content"] + # mixed JSON and tokens + res = server.make_request("POST", "/completion", data={ + "prompt": [ + tokens, + { + JSON_PROMPT_STRING_KEY: "I believe the meaning of life is", + }, + ], + }) + assert res.status_code == 200 + assert type(res.body) == list + assert len(res.body) == 2 + assert res.body[0]["content"] == res.body[1]["content"] + # mixed string and tokens in one sequence res = server.make_request("POST", "/completion", data={ "prompt": [1, 2, 3, 4, 5, 6, prompt_str, 7, 8, 9, 10, prompt_str], diff --git a/tools/server/tests/unit/test_vision_api.py b/tools/server/tests/unit/test_vision_api.py index fc63caa134293..36d14b3885175 100644 --- a/tools/server/tests/unit/test_vision_api.py +++ b/tools/server/tests/unit/test_vision_api.py @@ -10,21 +10,48 @@ response = requests.get(IMG_URL_0) response.raise_for_status() # Raise an exception for bad status codes -IMG_BASE64_0 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8") +IMG_BASE64_URI_0 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8") +IMG_BASE64_0 = base64.b64encode(response.content).decode("utf-8") +response = requests.get(IMG_URL_1) +response.raise_for_status() # Raise an exception for bad status codes +IMG_BASE64_URI_1 = "data:image/png;base64," + base64.b64encode(response.content).decode("utf-8") +IMG_BASE64_1 = base64.b64encode(response.content).decode("utf-8") + +JSON_MULTIMODAL_KEY = "multimodal_data" +JSON_PROMPT_STRING_KEY = "prompt_string" @pytest.fixture(autouse=True) def create_server(): global server server = ServerPreset.tinygemma3() +def test_models_supports_multimodal_capability(): + global server + server.start() # vision model may take longer to load due to download size + res = server.make_request("GET", "/models", data={}) + assert res.status_code == 200 + model_info = res.body["models"][0] + print(model_info) + assert "completion" in model_info["capabilities"] + assert "multimodal" in model_info["capabilities"] + +def test_v1_models_supports_multimodal_capability(): + global server + server.start() # vision model may take longer to load due to download size + res = server.make_request("GET", "/v1/models", data={}) + assert res.status_code == 200 + model_info = res.body["models"][0] + print(model_info) + assert "completion" in model_info["capabilities"] + assert "multimodal" in model_info["capabilities"] @pytest.mark.parametrize( "prompt, image_url, success, re_content", [ # test model is trained on CIFAR-10, but it's quite dumb due to small size ("What is this:\n", IMG_URL_0, True, "(cat)+"), - ("What is this:\n", "IMG_BASE64_0", True, "(cat)+"), # exceptional, so that we don't cog up the log + ("What is this:\n", "IMG_BASE64_URI_0", True, "(cat)+"), # exceptional, so that we don't cog up the log ("What is this:\n", IMG_URL_1, True, "(frog)+"), ("Test test\n", IMG_URL_1, True, "(frog)+"), # test invalidate cache ("What is this:\n", "malformed", False, None), @@ -36,8 +63,8 @@ def create_server(): def test_vision_chat_completion(prompt, image_url, success, re_content): global server server.start(timeout_seconds=60) # vision model may take longer to load due to download size - if image_url == "IMG_BASE64_0": - image_url = IMG_BASE64_0 + if image_url == "IMG_BASE64_URI_0": + image_url = IMG_BASE64_URI_0 res = 
server.make_request("POST", "/chat/completions", data={ "temperature": 0.0, "top_k": 1, @@ -58,3 +85,61 @@ def test_vision_chat_completion(prompt, image_url, success, re_content): else: assert res.status_code != 200 + +@pytest.mark.parametrize( + "prompt, image_data, success, re_content", + [ + # test model is trained on CIFAR-10, but it's quite dumb due to small size + ("What is this: <__media__>\n", IMG_BASE64_0, True, "(cat)+"), + ("What is this: <__media__>\n", IMG_BASE64_1, True, "(frog)+"), + ("What is this: <__media__>\n", "malformed", False, None), # non-image data + ("What is this:\n", "", False, None), # empty string + ] +) +def test_vision_completion(prompt, image_data, success, re_content): + global server + server.start() # vision model may take longer to load due to download size + res = server.make_request("POST", "/completions", data={ + "temperature": 0.0, + "top_k": 1, + "prompt": { JSON_PROMPT_STRING_KEY: prompt, JSON_MULTIMODAL_KEY: [ image_data ] }, + }) + if success: + assert res.status_code == 200 + content = res.body["content"] + assert match_regex(re_content, content) + else: + assert res.status_code != 200 + + +@pytest.mark.parametrize( + "prompt, image_data, success", + [ + # test model is trained on CIFAR-10, but it's quite dumb due to small size + ("What is this: <__media__>\n", IMG_BASE64_0, True), # exceptional, so that we don't cog up the log + ("What is this: <__media__>\n", IMG_BASE64_1, True), + ("What is this: <__media__>\n", "malformed", False), # non-image data + ("What is this:\n", "base64", False), # non-image data + ] +) +def test_vision_embeddings(prompt, image_data, success): + global server + server.server_embeddings=True + server.n_batch=512 + server.start() # vision model may take longer to load due to download size + res = server.make_request("POST", "/embeddings", data={ + "content": [ + { JSON_PROMPT_STRING_KEY: prompt, JSON_MULTIMODAL_KEY: [ image_data ] }, + { JSON_PROMPT_STRING_KEY: prompt, JSON_MULTIMODAL_KEY: [ image_data ] }, + { JSON_PROMPT_STRING_KEY: prompt, }, + ], + }) + if success: + assert res.status_code == 200 + content = res.body + # Ensure embeddings are stable when multimodal. + assert content[0]['embedding'] == content[1]['embedding'] + # Ensure embeddings without multimodal but same prompt do not match multimodal embeddings. + assert content[0]['embedding'] != content[2]['embedding'] + else: + assert res.status_code != 200 diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index f3dfc8225da4d..036060bb3e9c7 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -123,6 +123,19 @@ static bool json_is_array_of_mixed_numbers_strings(const json & data) { return false; } +// does array have any individual integers/tokens? 
+static bool json_is_array_and_contains_numbers(const json & data) { + if (data.is_array()) { + for (const auto & e : data) { + if (e.is_number_integer()) { + return true; + } + } + return false; + } + return false; +} + // get value by path(key1 / key2) static json json_get_nested_values(const std::vector & paths, const json & js) { json result = json::object(); @@ -186,48 +199,6 @@ static llama_tokens tokenize_mixed(const llama_vocab * vocab, const json & json_ return prompt_tokens; } -/** - * break the input "prompt" object into multiple prompt if needed, then tokenize them - * this supports these cases: - * - "prompt": "string" - * - "prompt": [12, 34, 56] - * - "prompt": [12, 34, "string", 56, 78] - * and multiple prompts (multi-tasks): - * - "prompt": ["string1", "string2"] - * - "prompt": ["string1", [12, 34, 56]] - * - "prompt": [[12, 34, 56], [78, 90, 12]] - * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56]] - */ -static std::vector tokenize_input_prompts(const llama_vocab * vocab, const json & json_prompt, bool add_special, bool parse_special) { - std::vector result; - if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) { - // string or mixed - result.push_back(tokenize_mixed(vocab, json_prompt, add_special, parse_special)); - } else if (json_is_array_of_numbers(json_prompt)) { - // array of tokens - result.push_back(json_prompt.get()); - } else if (json_prompt.is_array()) { - // array of prompts - result.reserve(json_prompt.size()); - for (const auto & p : json_prompt) { - if (p.is_string() || json_is_array_of_mixed_numbers_strings(p)) { - result.push_back(tokenize_mixed(vocab, p, add_special, parse_special)); - } else if (json_is_array_of_numbers(p)) { - // array of tokens - result.push_back(p.get()); - } else { - throw std::runtime_error("element of \"prompt\" must be a string, an list of tokens, or a list of mixed strings & tokens"); - } - } - } else { - throw std::runtime_error("\"prompt\" must be a string, an list of tokens, a list of mixed strings & tokens, or a list of prompts"); - } - if (result.empty()) { - throw std::runtime_error("\"prompt\" must not be empty"); - } - return result; -} - // return the last index of character that can form a valid string // if the last character is potentially cut in half, return the index before the cut // if validate_utf8(text) == text.size(), then the whole text is valid utf8 @@ -262,35 +233,6 @@ static size_t validate_utf8(const std::string& text) { // template utils // -// format rerank task: [BOS]query[EOS][SEP]doc[EOS] -static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_tokens & query, const llama_tokens & doc) { - llama_tokens result; - - // Get EOS token - use SEP token as fallback if EOS is not available - llama_token eos_token = llama_vocab_eos(vocab); - if (eos_token == LLAMA_TOKEN_NULL) { - eos_token = llama_vocab_sep(vocab); - } - - result.reserve(doc.size() + query.size() + 4); - if (llama_vocab_get_add_bos(vocab)) { - result.push_back(llama_vocab_bos(vocab)); - } - result.insert(result.end(), query.begin(), query.end()); - if (llama_vocab_get_add_eos(vocab)) { - result.push_back(eos_token); - } - if (llama_vocab_get_add_sep(vocab)) { - result.push_back(llama_vocab_sep(vocab)); - } - result.insert(result.end(), doc.begin(), doc.end()); - if (llama_vocab_get_add_eos(vocab)) { - result.push_back(eos_token); - } - - return result; -} - // format infill task static llama_tokens format_infill( const llama_vocab * vocab, @@ -1186,6 +1128,24 @@ struct 
server_tokens { } } + // appends server tokens, updates the media map. copies media chunks. + void push_back(server_tokens & tokens) { + size_t start_pos = size(); + for (size_t i = 0; i < tokens.size(); i++) { + push_back(tokens[i]); + } + if (tokens.has_mtmd) { + // Assert if we are copying MTMD chunks to a server_tokens that does not have mtmd. + // We could also just check, but this will prevent silently dropping MTMD data. + GGML_ASSERT(has_mtmd); + for (auto it = tokens.map_pos_to_media.begin(); it != tokens.map_pos_to_media.end(); ) { + auto chunk = tokens.map_pos_to_media[it->first].get(); + mtmd::input_chunk_ptr new_chunk(mtmd_input_chunk_copy(chunk)); + map_pos_to_media[start_pos+it->first] = std::move(new_chunk); + } + } + } + // for compatibility with context shift and prompt truncation void insert(const llama_tokens & inp_tokens) { GGML_ASSERT(!has_mtmd); // only allow this if mtmd is disabled @@ -1356,3 +1316,137 @@ static std::string fnv_hash(const uint8_t * data, size_t len) { } return std::to_string(hash); } + + +// format rerank task: [BOS]query[EOS][SEP]doc[EOS]. +static server_tokens format_rerank(const struct llama_vocab * vocab, server_tokens & query, server_tokens & doc) { + server_tokens result = {}; + + // Get EOS token - use SEP token as fallback if EOS is not available + llama_token eos_token = llama_vocab_eos(vocab); + if (eos_token == LLAMA_TOKEN_NULL) { + eos_token = llama_vocab_sep(vocab); + } + if (llama_vocab_get_add_bos(vocab)) { + result.push_back(llama_vocab_bos(vocab)); + } + result.push_back(query); + if (llama_vocab_get_add_eos(vocab)) { + result.push_back(eos_token); + } + if (llama_vocab_get_add_sep(vocab)) { + result.push_back(llama_vocab_sep(vocab)); + } + result.push_back(doc); + if (llama_vocab_get_add_eos(vocab)) { + result.push_back(eos_token); + } + return result; +} + + +static server_tokens process_mtmd_prompt(mtmd_context * mctx, std::string prompt, std::vector files) { + mtmd::bitmaps bitmaps; + for (auto & file : files) { + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(mctx, file.data(), file.size())); + if (!bmp.ptr) { + throw std::runtime_error("Failed to load image or audio file"); + } + // calculate bitmap hash (for KV caching) + std::string hash = fnv_hash(bmp.data(), bmp.n_bytes()); + bmp.set_id(hash.c_str()); + bitmaps.entries.push_back(std::move(bmp)); + } + // process prompt + std::vector inputs; + // multimodal + mtmd_input_text inp_txt = { + prompt.c_str(), + /* add_special */ true, + /* parse_special */ true, + }; + mtmd::input_chunks chunks(mtmd_input_chunks_init()); + auto bitmaps_c_ptr = bitmaps.c_ptr(); + int32_t tokenized = mtmd_tokenize(mctx, + chunks.ptr.get(), + &inp_txt, + bitmaps_c_ptr.data(), + bitmaps_c_ptr.size()); + if (tokenized != 0) { + throw std::runtime_error("Failed to tokenize prompt"); + } + auto result = server_tokens(chunks, true); + return result; +} + +/** + * break the input "prompt" object into multiple prompt if needed, then tokenize them + * use tokenize_input_prompts() if the input could be an array. 
+ * this supports these cases: + * - "prompt": "string" + * - "prompt": [12, 34, 56] + * - "prompt": [12, 34, "string", 56, 78] + * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] } + */ +static server_tokens tokenize_input_subprompt(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) { + constexpr char JSON_STRING_PROMPT_KEY[] = "prompt_string"; + constexpr char JSON_MTMD_DATA_KEY[] = "multimodal_data"; + const bool has_mtmd = mctx != nullptr; + if (json_prompt.is_string() || json_is_array_of_mixed_numbers_strings(json_prompt)) { + // string or mixed + llama_tokens tmp = tokenize_mixed(vocab, json_prompt, add_special, parse_special); + return server_tokens(tmp, false); + } else if (json_is_array_of_numbers(json_prompt)) { + // array of tokens + llama_tokens tmp = json_prompt.get(); + return server_tokens(tmp, false); + } else if (json_prompt.contains(JSON_STRING_PROMPT_KEY)) { + // JSON object with prompt key. + if (json_prompt.contains(JSON_MTMD_DATA_KEY)) { + if (!has_mtmd) + throw std::runtime_error("Multimodal data provided, but model does not support multimodal requests."); + + // JSON object with prompt and multimodal key. + std::vector files; + for (const auto & entry : json_prompt.at(JSON_MTMD_DATA_KEY)) { + files.push_back(base64_decode(entry)); + } + return process_mtmd_prompt(mctx, json_prompt.at(JSON_STRING_PROMPT_KEY), files); + } else { + // Not multimodal, but contains a subobject. + llama_tokens tmp = tokenize_mixed(vocab, json_prompt.at(JSON_STRING_PROMPT_KEY), add_special, parse_special); + return server_tokens(tmp, false); + } + } else { + throw std::runtime_error("\"prompt\" elements must be a string, a list of tokens, a JSON object containing a prompt string, or a list of mixed strings & tokens."); + } +} + +/** + * break the input "prompt" object into multiple prompt if needed, then tokenize them + * this supports these cases: + * - "prompt": "string" + * - "prompt": [12, 34, 56] + * - "prompt": [12, 34, "string", 56, 78] + * - "prompt": { "prompt_string": "string", "multimodal_data": [ "base64" ] } + * and multiple prompts (multi-tasks): + * - "prompt": ["string1", "string2"] + * - "prompt": ["string1", [12, 34, 56]] + * - "prompt": [[12, 34, 56], [78, 90, 12]] + * - "prompt": [[12, 34, "string", 56, 78], [12, 34, 56], { "prompt_string": "string", "multimodal_data": [ "base64" ]}] + */ +static std::vector tokenize_input_prompts(const llama_vocab * vocab, mtmd_context * mctx, const json & json_prompt, bool add_special, bool parse_special) { + std::vector result; + if (json_prompt.is_array() && !json_is_array_and_contains_numbers(json_prompt)) { + result.reserve(json_prompt.size()); + for (const auto & p : json_prompt) { + result.push_back(tokenize_input_subprompt(vocab, mctx, p,add_special, parse_special)); + } + } else { + result.push_back(tokenize_input_subprompt(vocab, mctx, json_prompt, add_special, parse_special)); + } + if (result.empty()) { + throw std::runtime_error("\"prompt\" must not be empty"); + } + return result; +} From ad5c975c2d0297124fad210776ef8eed6b90d578 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Fri, 22 Aug 2025 16:11:04 +0800 Subject: [PATCH 121/140] ggml-cpu: Support Q5_0 and Q5_1 on s390x (#15486) * ggml-cpu: initial q5_0 impl for s390x Signed-off-by: Aaron Teo * ggml-cpu: updated q5_0 code for better performance Signed-off-by: Aaron Teo * ggml-cpu: use optimised hsum for better performance Signed-off-by: Aaron Teo * ggml-cpu: introduce q5_1 simd + 
refactor q5_0 Signed-off-by: Aaron Teo * ggml-cpu: fix incorrect return type vec_hsum Signed-off-by: Aaron Teo * ggml-cpu: q5_0 incomplete refactor + table_b2b_0 activation Signed-off-by: Aaron Teo * ggml-cpu: refactor q5_1 Signed-off-by: Aaron Teo * ggml-cpu: q5_1 update loop unroll to 4 Signed-off-by: Aaron Teo * ggml-cpu: update q5_0 unroll to 4 Signed-off-by: Aaron Teo * ggml-cpu: update build-s390x docs Signed-off-by: Aaron Teo * ggml-cpu: update unused variables q5_0 Signed-off-by: Aaron Teo * docs: update the last update date Signed-off-by: Aaron Teo --------- Signed-off-by: Aaron Teo --- docs/build-s390x.md | 7 +- ggml/src/ggml-cpu/arch-fallback.h | 2 - ggml/src/ggml-cpu/arch/s390/quants.c | 316 +++++++++++++++++++++++++++ ggml/src/ggml-cpu/ggml-cpu-impl.h | 8 + 4 files changed, 328 insertions(+), 5 deletions(-) diff --git a/docs/build-s390x.md b/docs/build-s390x.md index b36a1998144a1..f3cdd63be3ece 100644 --- a/docs/build-s390x.md +++ b/docs/build-s390x.md @@ -265,8 +265,9 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl | BF16 | 🚫 | 🚫 | ❓ | ❓ | | Q4_0 | ✅ | ✅ | ❓ | ❓ | | Q4_1 | ✅ | ✅ | ❓ | ❓ | -| Q5_0 | 🚫 | 🚫 | ❓ | ❓ | -| Q5_1 | 🚫 | 🚫 | ❓ | ❓ | +| MXFP4 | 🚫 | 🚫 | ❓ | ❓ | +| Q5_0 | ✅ | ✅ | ❓ | ❓ | +| Q5_1 | ✅ | ✅ | ❓ | ❓ | | Q8_0 | ✅ | ✅ | ❓ | ❓ | | Q2_K | 🚫 | 🚫 | ❓ | ❓ | | Q3_K | ✅ | ✅ | ❓ | ❓ | @@ -291,4 +292,4 @@ IBM VXE/VXE2 SIMD acceleration depends on the BLAS implementation. It is strongl - 🚫 - acceleration unavailable, will still run using scalar implementation - ❓ - acceleration unknown, please contribute if you can test it yourself -Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on July 31, 2025. +Last Updated by **Aaron Teo (aaron.teo1@ibm.com)** on Aug 22, 2025. diff --git a/ggml/src/ggml-cpu/arch-fallback.h b/ggml/src/ggml-cpu/arch-fallback.h index 0bfb92df17909..373408a9c0955 100644 --- a/ggml/src/ggml-cpu/arch-fallback.h +++ b/ggml/src/ggml-cpu/arch-fallback.h @@ -150,8 +150,6 @@ #elif defined(__s390x__) // quants.c #define quantize_row_q8_K_generic quantize_row_q8_K -#define ggml_vec_dot_q5_0_q8_0_generic ggml_vec_dot_q5_0_q8_0 -#define ggml_vec_dot_q5_1_q8_1_generic ggml_vec_dot_q5_1_q8_1 #define ggml_vec_dot_tq1_0_q8_K_generic ggml_vec_dot_tq1_0_q8_K #define ggml_vec_dot_tq2_0_q8_K_generic ggml_vec_dot_tq2_0_q8_K #define ggml_vec_dot_q2_K_q8_K_generic ggml_vec_dot_q2_K_q8_K diff --git a/ggml/src/ggml-cpu/arch/s390/quants.c b/ggml/src/ggml-cpu/arch/s390/quants.c index 7e4229d0e46a9..1c8176fb4d91f 100644 --- a/ggml/src/ggml-cpu/arch/s390/quants.c +++ b/ggml/src/ggml-cpu/arch/s390/quants.c @@ -23,6 +23,27 @@ #define UNUSED GGML_UNUSED +#if defined(__VXE__) || defined(__VXE2__) +#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s +#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) +#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) +#define B4(c,s,n) B3(c,s,n ## c), B3(c,s,n ## s) +#define B5(c,s,n) B4(c,s,n ## c), B4(c,s,n ## s) +#define B6(c,s,n) B5(c,s,n ## c), B5(c,s,n ## s) +#define B7(c,s,n) B6(c,s,n ## c), B6(c,s,n ## s) +#define B8(c,s ) B7(c,s, c), B7(c,s, s) + +// precomputed tables for expanding 8bits to 8 bytes: +static const __attribute__((aligned(16))) uint64_t table_b2b_0[1 << 8] = { B8(00, 10) }; // ( b ) << 4 +static const __attribute__((aligned(16))) uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 + +// permute mask for byteswapping +static const uint8x16_t v_kperm = (const uint8x16_t){ + 7, 6, 5, 4, 3, 2, 1, 0, + 15, 14, 13, 12, 11, 10, 9, 8 +}; +#endif + void quantize_row_q8_0(const float * GGML_RESTRICT 
x, void * GGML_RESTRICT vy, int64_t k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); @@ -241,6 +262,301 @@ void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const voi #endif } +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_0; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0.0f; + +#if defined(__VXE__) || defined(__VXE2__) + float32x4_t v_sum0 = vec_splats(0.0f); + float32x4_t v_sum1 = vec_splats(0.0f); + + uint32_t qh0, qh1; + uint64_t tmp0[4], tmp1[4]; + + const uint8x16_t v_m = vec_splats((uint8_t)0x0F); + + #pragma GCC unroll 4 + for (; ib + 1 < nb; ib += 2) { + const block_q5_0 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q5_0 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[ib + 1]; + + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_1[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_1[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_1[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_1[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_1[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_1[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_1[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_1[(qh1 >> 24) ]; + + int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0)); + int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2)); + int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0)); + int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2)); + + // required for fixing the byteorder + v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm); + v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm); + v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm); + v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm); + + const uint8x16_t v_x0 = vec_xl(0, (const uint8_t *)x0->qs); + const uint8x16_t v_x1 = vec_xl(0, (const uint8_t *)x1->qs); + + int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m); + int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4); + int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m); + int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4); + + const int8x16_t v_x0lf = vec_sub(v_x0l, v_qh0l); + const int8x16_t v_x0hf = vec_sub(v_x0h, v_qh0h); + const int8x16_t v_x1lf = vec_sub(v_x1l, v_qh1l); + const int8x16_t v_x1hf = vec_sub(v_x1h, v_qh1h); + + const int8x16_t v_y0l = vec_xl(0, (const int8_t *)y0->qs); + const int8x16_t v_y0h = vec_xl(QK8_0/2, (const int8_t *)y0->qs); + const int8x16_t v_y1l = vec_xl(0, (const int8_t *)y1->qs); + const int8x16_t v_y1h = vec_xl(QK8_0/2, (const int8_t *)y1->qs); + + const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h); + const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h); + + const float32x4_t v_xy0f = vec_float(v_xy0); + const float32x4_t v_xy1f = vec_float(v_xy1); + + const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); + const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d)); + + v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0); + v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1); + } + + sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1); + + #pragma GCC unroll 4 + for (; ib < nb; ++ib) { + const block_q5_0 * 
GGML_RESTRICT x0 = &x[ib]; + const block_q8_0 * GGML_RESTRICT y0 = &y[ib]; + + uint32_t qh; + memcpy(&qh, x0->qh, sizeof(qh)); + + uint64_t tmp[4]; + tmp[0] = table_b2b_1[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_1[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_1[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_1[(qh >> 24) ]; + + int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0)); + int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2)); + + // required for fixing the byteorder + v_qhl = vec_perm(v_qhl, v_qhl, v_kperm); + v_qhh = vec_perm(v_qhh, v_qhh, v_kperm); + + const uint8x16_t v_x = vec_xl(0, (const uint8_t *)x0->qs); + int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m); + int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4); + + const int8x16_t v_xlf = vec_sub(v_xl, v_qhl); + const int8x16_t v_xhf = vec_sub(v_xh, v_qhh); + + const int8x16_t v_yl = vec_xl(0, (const int8_t *)y0->qs); + const int8x16_t v_yh = vec_xl(QK8_0/2, (const int8_t *)y0->qs); + + const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh); + const float32x4_t v_xyf = vec_float(v_xy); + + const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d)); + const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f)); + + sumf += vec_hsum(v_acc); + } + + *s = sumf; +#else + UNUSED(nb); + UNUSED(x); + UNUSED(y); + UNUSED(ib); + UNUSED(sumf); + ggml_vec_dot_q5_0_q8_0_generic(n, s, bs, vx, bx, vy, by, nrc); +#endif +} + +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + const int qk = QK8_1; + const int nb = n / qk; + + assert(n % qk == 0); + assert(qk == QK5_1); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; + + int ib = 0; + float sumf = 0.0f; + +#if defined(__VXE__) || defined(__VXE2__) + float32x4_t v_sum0 = vec_splats(0.0f); + float32x4_t v_sum1 = vec_splats(0.0f); + + float summs0 = 0.0f; + float summs1 = 0.0f; + + uint32_t qh0; + uint32_t qh1; + + uint64_t tmp0[4]; + uint64_t tmp1[4]; + + const uint8x16_t v_m = vec_splats((uint8_t)0x0F); + + #pragma GCC unroll 4 + for (; ib + 1 < nb; ib += 2) { + const block_q5_1 * GGML_RESTRICT x0 = &x[ib + 0]; + const block_q5_1 * GGML_RESTRICT x1 = &x[ib + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[ib + 0]; + const block_q8_1 * GGML_RESTRICT y1 = &y[ib + 1]; + + summs0 += GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s); + summs1 += GGML_CPU_FP16_TO_FP32(x1->m) * GGML_CPU_FP16_TO_FP32(y1->s); + + memcpy(&qh0, x0->qh, sizeof(qh0)); + memcpy(&qh1, x1->qh, sizeof(qh1)); + + tmp0[0] = table_b2b_0[(qh0 >> 0) & 0xFF]; + tmp0[1] = table_b2b_0[(qh0 >> 8) & 0xFF]; + tmp0[2] = table_b2b_0[(qh0 >> 16) & 0xFF]; + tmp0[3] = table_b2b_0[(qh0 >> 24) ]; + + tmp1[0] = table_b2b_0[(qh1 >> 0) & 0xFF]; + tmp1[1] = table_b2b_0[(qh1 >> 8) & 0xFF]; + tmp1[2] = table_b2b_0[(qh1 >> 16) & 0xFF]; + tmp1[3] = table_b2b_0[(qh1 >> 24) ]; + + int8x16_t v_qh0l = vec_xl(0, (const int8_t *)(tmp0 + 0)); + int8x16_t v_qh0h = vec_xl(0, (const int8_t *)(tmp0 + 2)); + int8x16_t v_qh1l = vec_xl(0, (const int8_t *)(tmp1 + 0)); + int8x16_t v_qh1h = vec_xl(0, (const int8_t *)(tmp1 + 2)); + + // required for fixing the byteorder + v_qh0l = vec_perm(v_qh0l, v_qh0l, v_kperm); + v_qh0h = vec_perm(v_qh0h, v_qh0h, v_kperm); + v_qh1l = vec_perm(v_qh1l, v_qh1l, v_kperm); + v_qh1h = vec_perm(v_qh1h, v_qh1h, v_kperm); + + const uint8x16_t 
v_x0 = vec_xl(0, x0->qs);
+        const uint8x16_t v_x1 = vec_xl(0, x1->qs);
+
+        const int8x16_t v_x0l = (int8x16_t)vec_and(v_x0, v_m);
+        const int8x16_t v_x0h = (int8x16_t)vec_sr(v_x0, 4);
+        const int8x16_t v_x1l = (int8x16_t)vec_and(v_x1, v_m);
+        const int8x16_t v_x1h = (int8x16_t)vec_sr(v_x1, 4);
+
+        const int8x16_t v_x0lf = vec_or(v_x0l, v_qh0l);
+        const int8x16_t v_x0hf = vec_or(v_x0h, v_qh0h);
+        const int8x16_t v_x1lf = vec_or(v_x1l, v_qh1l);
+        const int8x16_t v_x1hf = vec_or(v_x1h, v_qh1h);
+
+        const int8x16_t v_y0l = vec_xl(0      , y0->qs);
+        const int8x16_t v_y0h = vec_xl(QK8_1/2, y0->qs);
+        const int8x16_t v_y1l = vec_xl(0      , y1->qs);
+        const int8x16_t v_y1h = vec_xl(QK8_1/2, y1->qs);
+
+        const int32x4_t v_xy0 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x0lf, v_y0l), v_x0hf, v_y0h);
+        const int32x4_t v_xy1 = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_x1lf, v_y1l), v_x1hf, v_y1h);
+
+        const float32x4_t v_xy0f = vec_float(v_xy0);
+        const float32x4_t v_xy1f = vec_float(v_xy1);
+
+        const float32x4_t v_d0 = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
+        const float32x4_t v_d1 = vec_splats(GGML_CPU_FP16_TO_FP32(x1->d) * GGML_CPU_FP16_TO_FP32(y1->d));
+
+        v_sum0 = vec_madd(v_xy0f, v_d0, v_sum0);
+        v_sum1 = vec_madd(v_xy1f, v_d1, v_sum1);
+    }
+
+    sumf += vec_hsum(v_sum0) + vec_hsum(v_sum1) + summs0 + summs1;
+
+    #pragma GCC unroll 4
+    for (; ib < nb; ++ib) {
+        const block_q5_1 * GGML_RESTRICT x0 = &x[ib];
+        const block_q8_1 * GGML_RESTRICT y0 = &y[ib];
+
+        float summs = GGML_CPU_FP16_TO_FP32(x0->m) * GGML_CPU_FP16_TO_FP32(y0->s);
+
+        uint32_t qh;
+        memcpy(&qh, x0->qh, sizeof(qh));
+
+        uint64_t tmp[4];
+        tmp[0] = table_b2b_0[(qh >>  0) & 0xFF];
+        tmp[1] = table_b2b_0[(qh >>  8) & 0xFF];
+        tmp[2] = table_b2b_0[(qh >> 16) & 0xFF];
+        tmp[3] = table_b2b_0[(qh >> 24)       ];
+
+        int8x16_t v_qhl = vec_xl(0, (const int8_t *)(tmp + 0));
+        int8x16_t v_qhh = vec_xl(0, (const int8_t *)(tmp + 2));
+
+        // required for fixing the byteorder
+        v_qhl = vec_perm(v_qhl, v_qhl, v_kperm);
+        v_qhh = vec_perm(v_qhh, v_qhh, v_kperm);
+
+        const uint8x16_t v_x = vec_xl(0, x0->qs);
+        const int8x16_t v_xl = (int8x16_t)vec_and(v_x, v_m);
+        const int8x16_t v_xh = (int8x16_t)vec_sr(v_x, 4);
+
+        const int8x16_t v_xlf = vec_or(v_xl, v_qhl);
+        const int8x16_t v_xhf = vec_or(v_xh, v_qhh);
+
+        const int8x16_t v_yl = vec_xl(0      , y0->qs);
+        const int8x16_t v_yh = vec_xl(QK8_1/2, y0->qs);
+
+        const int32x4_t v_xy = ggml_vec_dot(ggml_vec_dot(vec_splats(0), v_xlf, v_yl), v_xhf, v_yh);
+        const float32x4_t v_xyf = vec_float(v_xy);
+
+        const float32x4_t v_d = vec_splats(GGML_CPU_FP16_TO_FP32(x0->d) * GGML_CPU_FP16_TO_FP32(y0->d));
+        const float32x4_t v_acc = vec_madd(v_xyf, v_d, vec_splats(0.0f));
+
+        sumf += vec_hsum(v_acc) + summs;
+    }
+
+    *s = sumf;
+#else
+    UNUSED(nb);
+    UNUSED(x);
+    UNUSED(y);
+    UNUSED(ib);
+    UNUSED(sumf);
+    ggml_vec_dot_q5_1_q8_1_generic(n, s, bs, vx, bx, vy, by, nrc);
+#endif
+}
+
 void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) {
     const int qk = QK8_0;
     const int nb = n / qk;
diff --git a/ggml/src/ggml-cpu/ggml-cpu-impl.h b/ggml/src/ggml-cpu/ggml-cpu-impl.h
index d839cf5c55e81..1f6844e16cd34 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-impl.h
+++ b/ggml/src/ggml-cpu/ggml-cpu-impl.h
@@ -486,6 +486,14 @@ inline static int16x8_t vec_padd_s16(int16x8_t a, int16x8_t b) {
     return v_abo + v_abe;
 }
 
+/**
+ * @see https://github.com/ggml-org/llama.cpp/pull/14037
+ */
+inline static float vec_hsum(float32x4_t v) {
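+    // horizontal sum of all four f32 lanes: v + vec_reve(v) leaves a+d in
+    // lane 0 and b+c in lane 1, so lane 0 + lane 1 is the full reduction;
+    // e.g. v = {1, 2, 3, 4}: v_temp = {5, 5, 5, 5} and the result is 10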
float32x4_t v_temp = v + vec_reve(v); + return v_temp[0] + v_temp[1]; +} + inline static int32x4_t ggml_vec_dot(int32x4_t acc, int8x16_t a, int8x16_t b) { const int16x8_t p = vec_mule(a, b) + vec_mulo(a, b); return acc + (vec_unpackh(p) + vec_unpackl(p)); From 9ebebef62fd0adf8685874f154e227ea87b7c6f4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 22 Aug 2025 12:22:13 +0300 Subject: [PATCH 122/140] llama : remove KV cache defragmentation logic (#15473) ggml-ci --- common/arg.cpp | 6 +- common/common.cpp | 1 - common/common.h | 1 - examples/llama.vim | 2 +- include/llama.h | 2 +- scripts/compare-llama-bench.py | 1 - src/llama-context.cpp | 3 +- src/llama-cparams.h | 1 - src/llama-kv-cache.cpp | 367 +----------------------------- src/llama-kv-cache.h | 25 +- src/llama-kv-cells.h | 28 +-- src/llama-memory.h | 2 +- tools/llama-bench/README.md | 1 - tools/llama-bench/llama-bench.cpp | 29 +-- tools/server/README.md | 2 +- tools/server/bench/bench.py | 1 - 16 files changed, 32 insertions(+), 440 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 1227aeb2a3915..81c4005c5e7fc 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2254,9 +2254,11 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_PERPLEXITY})); add_opt(common_arg( {"-dt", "--defrag-thold"}, "N", - string_format("KV cache defragmentation threshold (default: %.1f, < 0 - disabled)", (double)params.defrag_thold), + string_format("KV cache defragmentation threshold (DEPRECATED)"), [](common_params & params, const std::string & value) { - params.defrag_thold = std::stof(value); + GGML_UNUSED(params); + GGML_UNUSED(value); + LOG_WRN("DEPRECATED: --defrag-thold is deprecated and no longer necessary to specify\n"); } ).set_env("LLAMA_ARG_DEFRAG_THOLD")); add_opt(common_arg( diff --git a/common/common.cpp b/common/common.cpp index decabcc2ed327..fdce1dcdec19b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1152,7 +1152,6 @@ struct llama_context_params common_context_params_to_llama(const common_params & cparams.yarn_orig_ctx = params.yarn_orig_ctx; cparams.pooling_type = params.pooling_type; cparams.attention_type = params.attention_type; - cparams.defrag_thold = params.defrag_thold; cparams.cb_eval = params.cb_eval; cparams.cb_eval_user_data = params.cb_eval_user_data; cparams.offload_kqv = !params.no_kv_offload; diff --git a/common/common.h b/common/common.h index 614e41a2461e7..390dda5e531be 100644 --- a/common/common.h +++ b/common/common.h @@ -288,7 +288,6 @@ struct common_params { float yarn_beta_fast = 32.0f; // YaRN low correction dim float yarn_beta_slow = 1.0f; // YaRN high correction dim int32_t yarn_orig_ctx = 0; // YaRN original context length - float defrag_thold = 0.1f; // KV cache defragmentation threshold // offload params std::vector devices; // devices to use for offloading diff --git a/examples/llama.vim b/examples/llama.vim index af3fd3935d765..736802d365541 100644 --- a/examples/llama.vim +++ b/examples/llama.vim @@ -17,7 +17,7 @@ " " start the llama.cpp server with a FIM-compatible model. 
for example: " -" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa -dt 0.1 --ubatch-size 512 --batch-size 1024 --cache-reuse 256 +" $ llama-server -m {model.gguf} --port 8012 -ngl 99 -fa --ubatch-size 512 --batch-size 1024 --cache-reuse 256 " " --batch-size [512, model max context] " diff --git a/include/llama.h b/include/llama.h index 662e0971dff2f..c5622cc16b4c2 100644 --- a/include/llama.h +++ b/include/llama.h @@ -312,7 +312,7 @@ extern "C" { float yarn_beta_fast; // YaRN low correction dim float yarn_beta_slow; // YaRN high correction dim uint32_t yarn_orig_ctx; // YaRN original context size - float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default) + float defrag_thold; // [DEPRECATED] defragment the KV cache if holes/size > thold, <= 0 disabled (default) ggml_backend_sched_eval_callback cb_eval; void * cb_eval_user_data; diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 8366f89a08076..0141e0a350dc9 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -28,7 +28,6 @@ "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", - "defrag_thold", "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", ] diff --git a/src/llama-context.cpp b/src/llama-context.cpp index e8e8b3450a5d2..18cf25079d283 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -39,7 +39,6 @@ llama_context::llama_context( cparams.yarn_attn_factor = params.yarn_attn_factor; cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; - cparams.defrag_thold = params.defrag_thold; cparams.embeddings = params.embeddings; cparams.offload_kqv = params.offload_kqv; cparams.flash_attn = params.flash_attn; @@ -978,7 +977,7 @@ int llama_context::decode(const llama_batch & batch_inp) { bool did_optimize = false; - // handle any pending defrags/shifts + // handle any pending shifts/copies memory_update(false); llama_memory_context_ptr mctx; diff --git a/src/llama-cparams.h b/src/llama-cparams.h index 38750affc500b..dbbaba9f6274c 100644 --- a/src/llama-cparams.h +++ b/src/llama-cparams.h @@ -24,7 +24,6 @@ struct llama_cparams { float yarn_attn_factor; float yarn_beta_fast; float yarn_beta_slow; - float defrag_thold; bool embeddings; bool causal_attn; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index bb490cf9e82a2..70ddd5f4b952c 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -525,39 +525,11 @@ llama_memory_context_ptr llama_kv_cache::init_full() { } llama_memory_context_ptr llama_kv_cache::init_update(llama_context * lctx, bool optimize) { - bool do_shift = get_has_shift(); - - defrag_info dinfo; - - // see if we need to defrag - if (n_stream == 1) { - // note : for now do not consider defrag for n_stream > 1 - const auto & cells = v_cells[seq_to_stream[0]]; - - bool do_defrag = optimize; - - const auto thold = lctx->get_cparams().defrag_thold; - - if (!do_defrag && thold > 0.0f) { - const auto n_kv = cells.used_max_p1(); - - // - do not defrag small contexts (i.e. < 2048 tokens) - // - count the padding towards the number of used tokens - const float fragmentation = n_kv >= 2048 ? 
std::max(0.0f, 1.0f - (float(cells.get_used() + n_pad)/n_kv)) : 0.0f; - - if (fragmentation > thold) { - LLAMA_LOG_DEBUG("%s: fragmentation: %.2f - requesting defrag\n", __func__, fragmentation); - - do_defrag = true; - } - } + GGML_UNUSED(optimize); - if (do_defrag) { - dinfo = defrag_prepare(lctx->graph_max_nodes()); - } - } + bool do_shift = get_has_shift(); - return std::make_unique(this, lctx, do_shift, std::move(dinfo), std::move(sc_info)); + return std::make_unique(this, lctx, do_shift, std::move(sc_info)); } llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector & ubatches) { @@ -629,7 +601,7 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vectorget_sched(); @@ -699,53 +671,6 @@ bool llama_kv_cache::update(llama_context * lctx, bool do_shift, const defrag_in } } - if (!dinfo.empty()) { - LLAMA_LOG_DEBUG("%s: defragmenting KV cache\n", __func__); - - // note: for now do not consider defrag for n_stream > 1 - auto & cells = v_cells[seq_to_stream[0]]; - auto & head = v_heads[seq_to_stream[0]]; - - // apply moves: - { - const auto n_kv = dinfo.ids.size(); - - for (uint32_t i = 0; i < n_kv; ++i) { - assert(dinfo.ids[i] <= n_kv); - - if (dinfo.ids[i] == n_kv || dinfo.ids[i] == i) { - continue; - } - - cells.mv(i, dinfo.ids[i]); - } - - // reset the head so we can find the first free slot during the next ubatch - head = 0; - } - - ggml_backend_sched_reset(sched); - - auto * res = lctx->get_gf_res_reserve(); - - res->reset(); - - auto * gf = build_graph_defrag(res, lctx, dinfo); - if (!ggml_backend_sched_alloc_graph(sched, gf)) { - LLAMA_LOG_ERROR("%s: failed to allocate compute graph for defrag\n", __func__); - return updated; - } - - res->set_inputs(nullptr); - - if (lctx->graph_compute(gf, false) != GGML_STATUS_SUCCESS) { - LLAMA_LOG_ERROR("%s: failed to compute defrag\n", __func__); - return updated; - } - - updated = true; - } - return updated; } @@ -1525,283 +1450,6 @@ ggml_cgraph * llama_kv_cache::build_graph_shift(llm_graph_result * res, llama_co return gf; } -ggml_cgraph * llama_kv_cache::build_graph_defrag( - llm_graph_result * res, - llama_context * lctx, - const defrag_info & dinfo) const { - auto * ctx = res->get_ctx(); - auto * gf = res->get_gf(); - - GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag"); - - const auto & cells = v_cells[0]; - - const auto & ids = dinfo.ids; - - const auto & cparams = lctx->get_cparams(); - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(v_l[il]->type); - const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to [id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id 
|| id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); - } -#else - for (uint32_t i = 0; i < ids.size(); ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == ids.size()) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < ids.size() && ids[i + nm] == id + nm) { - nm++; - } - - for (const auto & layer : layers) { - const uint32_t il = layer.il; - - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - ggml_tensor * view_k_src = ggml_view_2d(ctx, layer.k, - n_embd_k_gqa, nm, - ggml_row_size(layer.k->type, n_embd_k_gqa), - ggml_row_size(layer.k->type, n_embd_k_gqa*i)); - - ggml_tensor * view_k_dst = ggml_view_2d(ctx, layer.k, - n_embd_k_gqa, nm, - ggml_row_size(layer.k->type, n_embd_k_gqa), - ggml_row_size(layer.k->type, n_embd_k_gqa*id)); - - ggml_tensor * view_v_src; - ggml_tensor * view_v_dst; - - if (cparams.flash_attn) { - // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx, layer.v, - n_embd_v_gqa, nm, - ggml_row_size(layer.v->type, n_embd_v_gqa), - ggml_row_size(layer.v->type, n_embd_v_gqa*i)); - - view_v_dst = ggml_view_2d(ctx, layer.v, - n_embd_v_gqa, nm, - ggml_row_size(layer.v->type, n_embd_v_gqa), - ggml_row_size(layer.v->type, n_embd_v_gqa*id)); - } else { - view_v_src = ggml_view_2d(ctx, layer.v, - nm, n_embd_v_gqa, - ggml_row_size(layer.v->type, cells.size()), - ggml_row_size(layer.v->type, i)); - - view_v_dst = ggml_view_2d(ctx, layer.v, - nm, n_embd_v_gqa, - ggml_row_size(layer.v->type, cells.size()), - ggml_row_size(layer.v->type, id)); - } - - ggml_build_forward_expand(gf, ggml_cpy(ctx, view_k_src, view_k_dst)); - ggml_build_forward_expand(gf, ggml_cpy(ctx, view_v_src, view_v_dst)); - } - - i += nm - 1; - } - - //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); -#endif - - return gf; -} - -llama_kv_cache::defrag_info llama_kv_cache::defrag_prepare(int32_t n_max_nodes) const { - GGML_ASSERT(n_stream == 1 && "n_stream > 1 does not support defrag"); - - const auto & cells = v_cells[0]; - - const uint32_t n_layer = layers.size(); - - const uint32_t n_kv = cells.used_max_p1(); - const uint32_t n_used = cells.get_used(); - - assert(n_used <= n_kv); - - //const int64_t t_start = ggml_time_us(); - - // number of cells moved - uint32_t n_moves = 0; - - // each move requires 6*n_layer tensors (see graph_build_kv_self_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - //const uint32_t max_moves = max_nodes()/(6*n_layer); - // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (n_max_nodes - 2*n_layer)/(6*n_layer); - - // determine which KV cells to move where - defrag_info res; - auto & ids = res.ids; - - ids.resize(n_kv, n_kv); - - for (uint32_t i0 = 0; i0 < n_used; ++i0) { - if (!cells.is_empty(i0)) { - ids[i0] = 
i0; - - continue; - } - - // found a hole - fill it with data from the end of the cache - - uint32_t nh = 1; - - // determine the size of the hole - while (i0 + nh < n_used && cells.is_empty(i0 + nh)) { - nh++; - } - - uint32_t nf = 0; - uint32_t is = n_kv - 1; - - // starting from the end, find nh non-empty cells - for (; is > i0; --is) { - if (cells.is_empty(is) || ids[is] != n_kv) { - continue; - } - - // non-empty cell which is not yet moved - nf++; - - if (nf == nh) { - break; - } - } - - // this can only happen if `n_used` is not accurate, which would be a bug - GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); - - nf = 0; - - uint32_t i1 = is; - - // are we moving a continuous block of memory? - bool cont = false; - - // should we stop searching for the next move? - bool stop = false; - - // go back and move the nf cells to the hole - for (; i1 < n_kv; ++i1) { - if (cells.is_empty(i1) || ids[i1] != n_kv) { - if (n_moves == max_moves) { - stop = true; - break; - } - - cont = false; - continue; - } - - // this cell goes to (i0 + nf) - ids[i1] = i0 + nf; - - if (!cont) { - n_moves++; - cont = true; - } - - nf++; - - if (nf == nh) { - break; - } - } - - if (stop || n_moves == max_moves) { - break; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); - - i0 += nh - 1; - } - - if (n_moves == 0) { - return {}; - } - - LLAMA_LOG_DEBUG("%s: (tmp log) KV defrag cell moves: %u\n", __func__, n_moves); - - LLAMA_LOG_DEBUG("%s: expected gf nodes: %u\n", __func__, 6*n_moves*n_layer); - - return res; -} - bool llama_kv_cache::is_masked_swa(llama_pos p0, llama_pos p1) const { assert(p0 >= 0 && p1 >= 0); @@ -2300,9 +1948,8 @@ llama_kv_cache_context::llama_kv_cache_context( llama_kv_cache * kv, llama_context * lctx, bool do_shift, - defrag_info dinfo, - stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), dinfo(std::move(dinfo)), sc_info(std::move(sc_info)) { - if (!do_shift && this->dinfo.empty() && this->sc_info.empty()) { + stream_copy_info sc_info) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), lctx(lctx), do_shift(do_shift), sc_info(std::move(sc_info)) { + if (!do_shift && this->sc_info.empty()) { status = LLAMA_MEMORY_STATUS_NO_UPDATE; } } @@ -2330,7 +1977,7 @@ bool llama_kv_cache_context::apply() { // no ubatches -> this is a KV cache update if (ubatches.empty()) { - kv->update(lctx, do_shift, dinfo, sc_info); + kv->update(lctx, do_shift, sc_info); return true; } diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 5ca618e1b82e1..297a0973dd467 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -24,17 +24,6 @@ class llama_kv_cache : public llama_memory_i { // this callback is used to filter out layers that should not be included in the cache using layer_filter_cb = std::function; - struct defrag_info { - bool empty() const { - return ids.empty(); - } - - // contains information about which cell moves where: - // - cell i moves to ids[i] - // - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved - std::vector ids; - }; - struct stream_copy_info { bool empty() const { assert(ssrc.size() == sdst.size()); @@ -173,7 +162,7 @@ class llama_kv_cache : public llama_memory_i { // return empty vector on failure slot_info_vec_t prepare(const std::vector & ubatches); - bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info); + bool update(llama_context * lctx, bool do_shift, const stream_copy_info & sc_info); // find 
a slot of kv cells that can hold the ubatch // if cont == true, then the slot must be continuous @@ -254,9 +243,6 @@ class llama_kv_cache : public llama_memory_i { // model layer id -> KV cache layer id std::unordered_map map_layer_ids; - // return non-empty vector if cells have been moved - defrag_info defrag_prepare(int32_t n_max_nodes) const; - size_t total_size() const; size_t size_k_bytes() const; @@ -277,11 +263,6 @@ class llama_kv_cache : public llama_memory_i { llm_graph_result * res, llama_context * lctx) const; - ggml_cgraph * build_graph_defrag( - llm_graph_result * res, - llama_context * lctx, - const defrag_info & dinfo) const; - struct cell_ranges_t { uint32_t strm; @@ -299,7 +280,6 @@ class llama_kv_cache_context : public llama_memory_context_i { public: // some shorthands using slot_info_vec_t = llama_kv_cache::slot_info_vec_t; - using defrag_info = llama_kv_cache::defrag_info; using stream_copy_info = llama_kv_cache::stream_copy_info; // used for errors @@ -314,7 +294,6 @@ class llama_kv_cache_context : public llama_memory_context_i { llama_kv_cache * kv, llama_context * lctx, bool do_shift, - defrag_info dinfo, stream_copy_info sc_info); // used to create a batch procesing context from a batch @@ -374,8 +353,6 @@ class llama_kv_cache_context : public llama_memory_context_i { bool do_shift = false; - defrag_info dinfo; - stream_copy_info sc_info; // diff --git a/src/llama-kv-cells.h b/src/llama-kv-cells.h index 2651e30331fd6..8f6bf01456c8f 100644 --- a/src/llama-kv-cells.h +++ b/src/llama-kv-cells.h @@ -77,24 +77,24 @@ class llama_kv_cells { } // move cell isrc to idst (used during defrag) - void mv(uint32_t isrc, uint32_t idst) { - assert(isrc < pos.size()); - assert(idst < pos.size()); + //void mv(uint32_t isrc, uint32_t idst) { + // assert(isrc < pos.size()); + // assert(idst < pos.size()); - assert(pos[idst] == -1); - assert(pos[isrc] != -1); + // assert(pos[idst] == -1); + // assert(pos[isrc] != -1); - pos [idst] = pos [isrc]; - shift[idst] = shift[isrc]; - seq [idst] = seq [isrc]; + // pos [idst] = pos [isrc]; + // shift[idst] = shift[isrc]; + // seq [idst] = seq [isrc]; - pos [isrc] = -1; - shift[isrc] = 0; - seq [isrc].reset(); + // pos [isrc] = -1; + // shift[isrc] = 0; + // seq [isrc].reset(); - used.erase (isrc); - used.insert(idst); - } + // used.erase (isrc); + // used.insert(idst); + //} // copy the state of cells [i, i + n) (used for save/restore the state of the cells) llama_kv_cells cp(uint32_t i, uint32_t n) const { diff --git a/src/llama-memory.h b/src/llama-memory.h index 42a7145c2f387..94d858bccc2e0 100644 --- a/src/llama-memory.h +++ b/src/llama-memory.h @@ -77,7 +77,7 @@ struct llama_memory_i { // simulate full cache, used for allocating worst-case compute buffers virtual llama_memory_context_ptr init_full() = 0; - // prepare for any pending memory updates, such as shifts, defrags, etc. + // prepare for any pending memory updates, such as shifts, copies, etc. 
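+    // (shifts come from position edits such as llama_memory_seq_add; copies are
+    // pending cross-stream KV buffer copies tracked in llama_kv_cache::stream_copy_info)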
// status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update virtual llama_memory_context_ptr init_update(llama_context * lctx, bool optimize) = 0; diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md index 31a2730874346..bf7fd29c8c55f 100644 --- a/tools/llama-bench/README.md +++ b/tools/llama-bench/README.md @@ -43,7 +43,6 @@ test parameters: -ub, --ubatch-size (default: 512) -ctk, --cache-type-k (default: f16) -ctv, --cache-type-v (default: f16) - -dt, --defrag-thold (default: -1) -t, --threads (default: system dependent) -C, --cpu-mask (default: 0x0) --cpu-strict <0|1> (default: 0) diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 10b48c5568612..9378706a12a7c 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -245,7 +245,6 @@ struct cmd_params { std::vector n_ubatch; std::vector type_k; std::vector type_v; - std::vector defrag_thold; std::vector n_threads; std::vector cpu_mask; std::vector cpu_strict; @@ -282,7 +281,6 @@ static const cmd_params cmd_params_defaults = { /* n_ubatch */ { 512 }, /* type_k */ { GGML_TYPE_F16 }, /* type_v */ { GGML_TYPE_F16 }, - /* defrag_thold */ { -1.0f }, /* n_threads */ { cpu_get_num_math() }, /* cpu_mask */ { "0x0" }, /* cpu_strict */ { false }, @@ -346,8 +344,6 @@ static void print_usage(int /* argc */, char ** argv) { join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); printf(" -ctv, --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); - printf(" -dt, --defrag-thold (default: %s)\n", - join(cmd_params_defaults.defrag_thold, ",").c_str()); printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); printf(" -C, --cpu-mask (default: %s)\n", @@ -533,13 +529,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.type_v.insert(params.type_v.end(), types.begin(), types.end()); - } else if (arg == "-dt" || arg == "--defrag-thold") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = string_split(argv[i], split_delim); - params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end()); } else if (arg == "-t" || arg == "--threads") { if (++i >= argc) { invalid_param = true; @@ -849,9 +838,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } - if (params.defrag_thold.empty()) { - params.defrag_thold = cmd_params_defaults.defrag_thold; - } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } @@ -910,7 +896,6 @@ struct cmd_params_instance { int n_ubatch; ggml_type type_k; ggml_type type_v; - float defrag_thold; int n_threads; std::string cpu_mask; bool cpu_strict; @@ -1007,7 +992,6 @@ struct cmd_params_instance { cparams.n_ubatch = n_ubatch; cparams.type_k = type_k; cparams.type_v = type_v; - cparams.defrag_thold = defrag_thold; cparams.offload_kqv = !no_kv_offload; cparams.flash_attn = flash_attn; cparams.embeddings = embeddings; @@ -1037,7 +1021,6 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & nub : params.n_ubatch) for (const auto & tk : params.type_k) for (const auto & tv : params.type_v) - for (const auto & defrag_thold : params.defrag_thold) for (const auto & nkvo : params.no_kv_offload) for (const auto & fa : params.flash_attn) for (const auto & nt : params.n_threads) @@ -1058,7 +1041,6 @@ static std::vector 
get_cmd_params_instances(const cmd_param /* .n_ubatch = */ nub, /* .type_k = */ tk, /* .type_v = */ tv, - /* .defrag_thold = */ defrag_thold, /* .n_threads = */ nt, /* .cpu_mask = */ cm, /* .cpu_strict = */ cs, @@ -1091,7 +1073,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_ubatch = */ nub, /* .type_k = */ tk, /* .type_v = */ tv, - /* .defrag_thold = */ defrag_thold, /* .n_threads = */ nt, /* .cpu_mask = */ cm, /* .cpu_strict = */ cs, @@ -1124,7 +1105,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .n_ubatch = */ nub, /* .type_k = */ tk, /* .type_v = */ tv, - /* .defrag_thold = */ defrag_thold, /* .n_threads = */ nt, /* .cpu_mask = */ cm, /* .cpu_strict = */ cs, @@ -1166,7 +1146,6 @@ struct test { int poll; ggml_type type_k; ggml_type type_v; - float defrag_thold; int n_gpu_layers; llama_split_mode split_mode; int main_gpu; @@ -1201,7 +1180,6 @@ struct test { poll = inst.poll; type_k = inst.type_k; type_v = inst.type_v; - defrag_thold = inst.defrag_thold; n_gpu_layers = inst.n_gpu_layers; split_mode = inst.split_mode; main_gpu = inst.main_gpu; @@ -1257,7 +1235,6 @@ struct test { "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", - "defrag_thold", "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", }; @@ -1277,7 +1254,7 @@ struct test { field == "use_mmap" || field == "embeddings") { return BOOL; } - if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") { + if (field == "avg_ts" || field == "stddev_ts") { return FLOAT; } return STRING; @@ -1344,7 +1321,6 @@ struct test { std::to_string(flash_attn), tensor_split_str, tensor_buft_overrides_str, - std::to_string(defrag_thold), std::to_string(use_mmap), std::to_string(embeddings), std::to_string(no_op_offload), @@ -1611,9 +1587,6 @@ struct markdown_printer : public printer { if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) { fields.emplace_back("type_v"); } - if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) { - fields.emplace_back("defrag_thold"); - } if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { fields.emplace_back("main_gpu"); } diff --git a/tools/server/README.md b/tools/server/README.md index 86844225ff309..baf3730add67c 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -66,7 +66,7 @@ The project is under active development, and we are [looking for feedback and co | `-nkvo, --no-kv-offload` | disable KV offload
(env: LLAMA_ARG_NO_KV_OFFLOAD) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
-| `-dt, --defrag-thold N` | KV cache defragmentation threshold (default: 0.1, < 0 - disabled)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
+| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)
(env: LLAMA_ARG_NO_MMAP) | diff --git a/tools/server/bench/bench.py b/tools/server/bench/bench.py index 5cc6f92ab6c53..0c57a2df04a60 100644 --- a/tools/server/bench/bench.py +++ b/tools/server/bench/bench.py @@ -274,7 +274,6 @@ def start_server_background(args): server_args.extend(['--batch-size', args.batch_size]) server_args.extend(['--ubatch-size', args.ubatch_size]) server_args.extend(['--n-predict', args.max_tokens * 2]) - server_args.extend(['--defrag-thold', "0.1"]) server_args.append('--cont-batching') server_args.append('--metrics') server_args.append('--flash-attn') From b1ab91821f980f8993423c3f2a82a0a0f60c09d2 Mon Sep 17 00:00:00 2001 From: Yavor Ivanov Date: Fri, 22 Aug 2025 14:06:29 +0300 Subject: [PATCH 123/140] cuda : add Pad Reflect 1D support (#14659) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add Pad Reflect 1D CUDA support * Update ggml/src/ggml-cuda/pad_reflect_1d.cu Co-authored-by: Johannes Gäßler --------- Co-authored-by: Johannes Gäßler --- ggml/src/ggml-cuda/ggml-cuda.cu | 5 ++ ggml/src/ggml-cuda/pad_reflect_1d.cu | 82 +++++++++++++++++++++++++++ ggml/src/ggml-cuda/pad_reflect_1d.cuh | 5 ++ 3 files changed, 92 insertions(+) create mode 100644 ggml/src/ggml-cuda/pad_reflect_1d.cu create mode 100644 ggml/src/ggml-cuda/pad_reflect_1d.cuh diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 4e17fd211e1bb..d29a0b573f193 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -49,6 +49,7 @@ #include "ggml-cuda/wkv.cuh" #include "ggml-cuda/gla.cuh" #include "ggml-cuda/set-rows.cuh" +#include "ggml-cuda/pad_reflect_1d.cuh" #include "ggml.h" #include @@ -2352,6 +2353,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg case GGML_OP_PAD: ggml_cuda_op_pad(ctx, dst); break; + case GGML_OP_PAD_REFLECT_1D: + ggml_cuda_op_pad_reflect_1d(ctx, dst); + break; case GGML_OP_ARANGE: ggml_cuda_op_arange(ctx, dst); break; @@ -3490,6 +3494,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g return ggml_is_contiguous(op->src[0]); case GGML_OP_UPSCALE: case GGML_OP_PAD: + case GGML_OP_PAD_REFLECT_1D: case GGML_OP_ARANGE: case GGML_OP_TIMESTEP_EMBEDDING: case GGML_OP_LEAKY_RELU: diff --git a/ggml/src/ggml-cuda/pad_reflect_1d.cu b/ggml/src/ggml-cuda/pad_reflect_1d.cu new file mode 100644 index 0000000000000..4ed34aec3d331 --- /dev/null +++ b/ggml/src/ggml-cuda/pad_reflect_1d.cu @@ -0,0 +1,82 @@ +#include "pad_reflect_1d.cuh" + +static __global__ void pad_reflect_1d_kernel_f32( + const void * __restrict__ src0, + void * __restrict__ dst, + const int64_t ne0, + const int64_t ne00, + const int64_t ne01, + const int64_t ne02, + const int64_t ne03, + const int64_t nb00, + const int64_t nb01, + const int64_t nb02, + const int64_t nb03, + const int64_t nb0, + const int64_t nb1, + const int64_t nb2, + const int64_t nb3, + const int p0, + const int p1) { + + const int64_t i3 = blockIdx.z; + const int64_t i2 = blockIdx.y; + const int64_t i1 = blockIdx.x; + + if (i1 >= ne01 || i2 >= ne02 || i3 >= ne03) { + return; + } + + const char * src0_ptr = (const char *)src0 + i3*nb03 + i2*nb02 + i1*nb01; + char * dst_ptr = (char *)dst + i3*nb3 + i2*nb2 + i1*nb1; + + for (int64_t i0 = threadIdx.x; i0 < ne0; i0 += blockDim.x) { + float value; + + if (i0 < p0) { + // Left padding - reflect + value = *(const float *)(src0_ptr + (p0 - i0) * nb00); + } else if (i0 < ne0 - p1) { + // Middle - copy + value = *(const float *)(src0_ptr + (i0 - p0) * 
nb00);
+        } else {
+            // Right padding - reflect
+            int64_t src_idx = (ne0 - p1 - p0) - (p1 + 1 - (ne0 - i0)) - 1;
+            value = *(const float *)(src0_ptr + src_idx * nb00);
+        }
+
+        *(float *)(dst_ptr + i0 * nb0) = value;
+    }
+}
+
+void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    const int32_t * opts = (const int32_t *) dst->op_params;
+    const int p0 = opts[0];
+    const int p1 = opts[1];
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne0 = dst->ne[0];
+
+    GGML_ASSERT(ne0 == ne00 + p0 + p1);
+
+    const dim3 block_dims(CUDA_PAD_REFLECT_1D_BLOCK_SIZE, 1, 1);
+    const dim3 grid_dims(ne01, ne02, ne03);
+
+    pad_reflect_1d_kernel_f32<<<grid_dims, block_dims, 0, stream>>>(
+        src0->data, dst->data,
+        ne0, ne00, ne01, ne02, ne03,
+        src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3],
+        dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3],
+        p0, p1
+    );
+}
diff --git a/ggml/src/ggml-cuda/pad_reflect_1d.cuh b/ggml/src/ggml-cuda/pad_reflect_1d.cuh
new file mode 100644
index 0000000000000..15f2ed1737b1a
--- /dev/null
+++ b/ggml/src/ggml-cuda/pad_reflect_1d.cuh
@@ -0,0 +1,5 @@
+#include "common.cuh"
+
+#define CUDA_PAD_REFLECT_1D_BLOCK_SIZE 256
+
+void ggml_cuda_op_pad_reflect_1d(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

From 92f7f0a53cf6484e16a8084ed90807c35a164809 Mon Sep 17 00:00:00 2001
From: rmatif
Date: Fri, 22 Aug 2025 15:33:15 +0200
Subject: [PATCH 124/140] ggml: add `conv3d` op (#15182)

* add conv3d

* bump GGML_OP_COUNT
---
 ggml/include/ggml.h          |  18 +++++
 ggml/src/ggml-cpu/ggml-cpu.c |   6 ++
 ggml/src/ggml-cpu/ops.cpp    | 142 +++++++++++++++++++++++++++++++++++
 ggml/src/ggml-cpu/ops.h      |   1 +
 ggml/src/ggml.c              |  56 +++++++++++++-
 tests/test-backend-ops.cpp   | 124 ++++++++++++++++++++++++++++++
 6 files changed, 345 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index b8b82e11c86f5..7e9c3c8c7a096 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -512,6 +512,7 @@ extern "C" {
         GGML_OP_IM2COL,
         GGML_OP_IM2COL_BACK,
         GGML_OP_CONV_2D,
+        GGML_OP_CONV_3D,
         GGML_OP_CONV_2D_DW,
         GGML_OP_CONV_TRANSPOSE_2D,
         GGML_OP_POOL_1D,
@@ -1940,6 +1941,23 @@ extern "C" {
             int                   d0,  // dilation dimension 0
             int                   d1); // dilation dimension 1
 
+    GGML_API struct ggml_tensor * ggml_conv_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,   // kernel [KW, KH, KD, IC * OC]
+            struct ggml_tensor  * b,   // input  [W, H, D, C * N]
+            int                   s0,  // stride
+            int                   s1,
+            int                   s2,
+            int                   p0,  // padding
+            int                   p1,
+            int                   p2,
+            int                   d0,  // dilation
+            int                   d1,
+            int                   d2,
+            int                   n_channels,
+            int                   n_batch,
+            int                   n_channels_out);
+
     enum ggml_op_pool {
         GGML_OP_POOL_MAX,
         GGML_OP_POOL_AVG,
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c
index f6bea3df34a0b..0d5d3a3440aaf 100644
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -1880,6 +1880,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 ggml_compute_forward_conv_2d(params, tensor);
             } break;
+        case GGML_OP_CONV_3D:
+            {
+                ggml_compute_forward_conv_3d(params, tensor);
+            } break;
         case GGML_OP_CONV_2D_DW:
             {
                 ggml_compute_forward_conv_2d_dw(params, tensor);
             } break;
@@ -2252,6 +2256,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
         case GGML_OP_IM2COL:
         case GGML_OP_IM2COL_BACK:
         case
GGML_OP_CONV_2D: + case GGML_OP_CONV_3D: case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_CONV_TRANSPOSE_2D: @@ -2773,6 +2778,7 @@ struct ggml_cplan ggml_graph_plan( } } break; case GGML_OP_CONV_2D: + case GGML_OP_CONV_3D: { cur = GGML_IM2COL_WORK_SIZE; } break; diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index b72a2556a5fc9..460367cca09e9 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -7207,6 +7207,148 @@ void ggml_compute_forward_conv_2d( ggml_compute_forward_conv_2d_impl(params, src0, src1, dst, src0->type); } +// ggml_compute_forward_conv_3d + +static void ggml_compute_forward_conv_3d_impl(const ggml_compute_params * params, + const ggml_tensor * kernel, + const ggml_tensor * src, + ggml_tensor * dst, + ggml_type kernel_type) { + + GGML_ASSERT(ggml_is_contiguous(kernel)); + GGML_ASSERT(kernel_type == GGML_TYPE_F16 || kernel_type == GGML_TYPE_F32); + GGML_ASSERT(kernel->type == kernel_type); + + const ggml_type_traits * traits = ggml_get_type_traits(kernel_type); + + const int32_t s0 = dst->op_params[0]; + const int32_t s1 = dst->op_params[1]; + const int32_t s2 = dst->op_params[2]; + const int32_t p0 = dst->op_params[3]; + const int32_t p1 = dst->op_params[4]; + const int32_t p2 = dst->op_params[5]; + const int32_t d0 = dst->op_params[6]; + const int32_t d1 = dst->op_params[7]; + const int32_t d2 = dst->op_params[8]; + const int32_t c = dst->op_params[9]; + const int32_t n = dst->op_params[10]; + const int32_t oc = dst->op_params[11]; + + const int64_t src_w = src->ne[0]; + const int64_t src_h = src->ne[1]; + const int64_t src_d = src->ne[2]; + const int64_t knl_w = kernel->ne[0]; + const int64_t knl_h = kernel->ne[1]; + const int64_t knl_d = kernel->ne[2]; + const int64_t dst_w = dst->ne[0]; + const int64_t dst_h = dst->ne[1]; + const int64_t dst_d = dst->ne[2]; + + const float * src_data = (float *) src->data; + void * knl_data = kernel->data; + float * dst_data = (float *) dst->data; + + const int64_t knl_n_per_channel = knl_w * knl_h * knl_d; + const int64_t knl_n_total = knl_n_per_channel * c; + const int64_t patch_total = n * dst_w * dst_h * dst_d; + + const int64_t space_per_patch = knl_n_total * traits->type_size + oc * sizeof(float); + const int64_t batch_size = params->wsize / space_per_patch; + const int64_t patches_per_batch = batch_size > 8 ? 
(batch_size / 8) * 8 : batch_size; + const int64_t batch_n = (patch_total + patches_per_batch - 1) / patches_per_batch; + + GGML_ASSERT(patches_per_batch > 0 && batch_size >= 1); + + void * tmp = params->wdata; + + for (int64_t batch_i = 0; batch_i < batch_n; ++batch_i) { + const int64_t patch_start_batch = batch_i * patches_per_batch; + const int64_t patch_end_batch = std::min(patch_start_batch + patches_per_batch, patch_total); + const int64_t patch_n_in_batch = patch_end_batch - patch_start_batch; + + const int64_t patch_per_thread = (patch_n_in_batch + params->nth - 1) / params->nth; + const int64_t patch_start = patch_start_batch + params->ith * patch_per_thread; + const int64_t patch_end = std::min(patch_start + patch_per_thread, patch_end_batch); + + for (int64_t p = patch_start; p < patch_end; ++p) { + const int64_t p_in_batch = p % (dst_w * dst_h * dst_d); + const int64_t p_in_depth = p_in_batch % (dst_w * dst_h); + const int64_t batch_idx = p / (dst_w * dst_h * dst_d); + const int64_t dst_z = p_in_batch / (dst_w * dst_h); + const int64_t dst_y = p_in_depth / dst_w; + const int64_t dst_x = p_in_depth % dst_w; + + char * dst_row = (char *) tmp + (p % patches_per_batch) * knl_n_total * traits->type_size; + + for (int64_t ic = 0; ic < c; ++ic) { + for (int64_t kz = 0; kz < knl_d; ++kz) { + for (int64_t ky = 0; ky < knl_h; ++ky) { + for (int64_t kx = 0; kx < knl_w; ++kx) { + const int64_t sz = dst_z * s2 + kz * d2 - p2; + const int64_t sy = dst_y * s1 + ky * d1 - p1; + const int64_t sx = dst_x * s0 + kx * d0 - p0; + + int64_t dst_idx = ic * knl_n_per_channel + kz * (knl_h * knl_w) + ky * knl_w + kx; + + float src_val; + if (sz < 0 || sz >= src_d || sy < 0 || sy >= src_h || sx < 0 || sx >= src_w) { + src_val = 0.0f; + } else { + const int64_t cn_idx = batch_idx * c + ic; + const float * src_ptr = (const float *)((const char *)src_data + sx*src->nb[0] + sy*src->nb[1] + sz*src->nb[2] + cn_idx*src->nb[3]); + src_val = *src_ptr; + } + + char * element_ptr = dst_row + dst_idx * traits->type_size; + if (kernel_type == GGML_TYPE_F32) { + *(float *)element_ptr = src_val; + } else if (kernel_type == GGML_TYPE_F16) { + *(ggml_fp16_t *)element_ptr = GGML_CPU_FP32_TO_FP16(src_val); + } + } + } + } + } + } + + ggml_barrier(params->threadpool); + + float * gemm_output = (float *) ((char *) tmp + patches_per_batch * knl_n_total * traits->type_size); + ggml_call_mul_mat(kernel_type, params, patch_n_in_batch, oc, knl_n_total, tmp, knl_data, gemm_output); + + ggml_barrier(params->threadpool); + + const int64_t permute_per_thread = (patch_n_in_batch + params->nth - 1) / params->nth; + const int64_t permute_start = params->ith * permute_per_thread; + const int64_t permute_end = std::min(permute_start + permute_per_thread, patch_n_in_batch); + + for (int64_t i = permute_start; i < permute_end; ++i) { + const int64_t p = patch_start_batch + i; + const int64_t p_in_batch = p % (dst_w * dst_h * dst_d); + const int64_t p_in_depth = p_in_batch % (dst_w * dst_h); + const int64_t batch_idx = p / (dst_w * dst_h * dst_d); + const int64_t dst_z = p_in_batch / (dst_w * dst_h); + const int64_t dst_y = p_in_depth / dst_w; + const int64_t dst_x = p_in_depth % dst_w; + + for (int64_t ioc = 0; ioc < oc; ++ioc) { + const float value = gemm_output[i * oc + ioc]; + const int64_t ocn_idx = batch_idx * oc + ioc; + float * dst_ptr = (float *)((char *)dst_data + dst_x*dst->nb[0] + dst_y*dst->nb[1] + dst_z*dst->nb[2] + ocn_idx*dst->nb[3]); + *dst_ptr = value; + } + } + } +} + +void ggml_compute_forward_conv_3d( + const 
ggml_compute_params * params, + ggml_tensor * dst) { + const ggml_tensor * src0 = dst->src[0]; + const ggml_tensor * src1 = dst->src[1]; + ggml_compute_forward_conv_3d_impl(params, src0, src1, dst, src0->type); +} + // ggml_compute_forward_conv_transpose_2d void ggml_compute_forward_conv_transpose_2d( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 82ea79eaa51cc..d0ea83843b544 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -70,6 +70,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_conv_3d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index a4417f1a17ef4..d76ea58f789e2 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -975,6 +975,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "IM2COL", "IM2COL_BACK", "CONV_2D", + "CONV_3D", "CONV_2D_DW", "CONV_TRANSPOSE_2D", "POOL_1D", @@ -1017,7 +1018,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "GLU", }; -static_assert(GGML_OP_COUNT == 88, "GGML_OP_COUNT != 88"); +static_assert(GGML_OP_COUNT == 89, "GGML_OP_COUNT != 89"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1077,6 +1078,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "im2col(x)", "im2col_back(x)", "conv_2d(x)", + "conv_3d(x)", "conv_2d_dw(x)", "conv_transpose_2d(x)", "pool_1d(x)", @@ -1119,7 +1121,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "glu(x)", }; -static_assert(GGML_OP_COUNT == 88, "GGML_OP_COUNT != 88"); +static_assert(GGML_OP_COUNT == 89, "GGML_OP_COUNT != 89"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4480,6 +4482,56 @@ struct ggml_tensor * ggml_conv_2d_direct( return result; } +// ggml_conv_3d + +struct ggml_tensor * ggml_conv_3d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int s2, + int p0, + int p1, + int p2, + int d0, + int d1, + int d2, + int c, + int n, + int oc) { + + GGML_ASSERT(a->ne[3] == (int64_t) c * oc); + GGML_ASSERT(b->ne[3] == (int64_t) c * n); + + int64_t ne[4]; + ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + ne[2] = ggml_calc_conv_output_size(b->ne[2], a->ne[2], s2, p2, d2); + ne[3] = (int64_t) oc * n; + + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + ggml_set_op_params_i32(result, 0, s0); + ggml_set_op_params_i32(result, 1, s1); + ggml_set_op_params_i32(result, 2, s2); + ggml_set_op_params_i32(result, 3, p0); + ggml_set_op_params_i32(result, 4, p1); + ggml_set_op_params_i32(result, 5, p2); + ggml_set_op_params_i32(result, 6, d0); + ggml_set_op_params_i32(result, 7, d1); + ggml_set_op_params_i32(result, 8, d2); + ggml_set_op_params_i32(result, 9, c); + 
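+    // together with c above, n and oc complete the channel/batch split: the
+    // packed 4D shapes [KW, KH, KD, IC*OC] and [W, H, D, C*N] cannot be
+    // separated without them, and ggml_compute_forward_conv_3d reads all
+    // twelve op_params slots back on the CPU side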
ggml_set_op_params_i32(result, 10, n); + ggml_set_op_params_i32(result, 11, oc); + + result->op = GGML_OP_CONV_3D; + result->src[0] = a; + result->src[1] = b; + + return result; +} + // ggml_conv_transpose_2d_p0 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index e21e9042781e4..a51527ca55c23 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4091,6 +4091,75 @@ struct test_conv_2d_dw : public test_case { } }; +// GGML_OP_CONV_3D +struct test_conv_3d : public test_case { + // Logical 5D dimensions + const int64_t N, IC, ID, IH, IW; + const int64_t OC, KD, KH, KW; + // Conv params + const int s0, s1, s2; + const int p0, p1, p2; + const int d0, d1, d2; + // Types + const ggml_type type_kernel; + + std::string op_desc(ggml_tensor * t) override { + GGML_UNUSED(t); + return "CONV_3D"; + } + + std::string vars() override { + return VARS_TO_STR11(N, IC, ID, IH, IW, OC, KD, KH, KW, s0, s1) + "," + + VARS_TO_STR8(s2, p0, p1, p2, d0, d1, d2, type_kernel); + } + + double max_nmse_err() override { + return 5e-4; + } + + uint64_t op_flops(ggml_tensor * t) override { + GGML_UNUSED(t); + auto calc_conv_output_size = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; + }; + const int64_t OD = calc_conv_output_size(ID, KD, s2, p2, d2); + const int64_t OH = calc_conv_output_size(IH, KH, s1, p1, d1); + const int64_t OW = calc_conv_output_size(IW, KW, s0, p0, d0); + + return (uint64_t)N * OC * OD * OH * OW * (2 * IC * KD * KH * KW - 1); + } + + test_conv_3d( + int64_t N, int64_t IC, int64_t ID, int64_t IH, int64_t IW, + int64_t OC, int64_t KD, int64_t KH, int64_t KW, + int s0, int s1, int s2, + int p0, int p1, int p2, + int d0, int d1, int d2, + ggml_type type_kernel + ) : N(N), IC(IC), ID(ID), IH(IH), IW(IW), + OC(OC), KD(KD), KH(KH), KW(KW), + s0(s0), s1(s1), s2(s2), + p0(p0), p1(p1), p2(p2), + d0(d0), d1(d1), d2(d2), + type_kernel(type_kernel) {} + + ggml_tensor * build_graph(ggml_context * ctx) override { + // GGML input tensor is packed as [W, H, D, C*N] + const int64_t ne_input[] = {IW, IH, ID, IC * N}; + ggml_tensor * input = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne_input); + ggml_set_name(input, "input"); + + // GGML kernel tensor is packed as [KW, KH, KD, IC*OC] + const int64_t ne_kernel[] = {KW, KH, KD, IC * OC}; + ggml_tensor * kernel = ggml_new_tensor(ctx, type_kernel, 4, ne_kernel); + ggml_set_name(kernel, "kernel"); + + ggml_tensor * out = ggml_conv_3d(ctx, kernel, input, s0, s1, s2, p0, p1, p2, d0, d1, d2, (int)IC, (int)N, (int)OC); + ggml_set_name(out, "out"); + return out; + } +}; + // GGML_OP_CONCAT struct test_concat : public test_case { const ggml_type type; @@ -5528,6 +5597,61 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, false)); test_cases.emplace_back(new test_conv_2d_dw({32, 8, 64, 1}, {3, 3, 1, 64}, 2, 1, 1, true)); + // CONV_3D + auto calc_conv_output_size_3d = [](int64_t ins, int64_t ks, int s, int p, int d) -> int64_t { + return (ins + 2 * p - d * (ks - 1) - 1) / s + 1; + }; + + for (ggml_type kernel_type : {GGML_TYPE_F32, GGML_TYPE_F16}) { + for (int N : {1, 2}) { + for (int IC : {1, 3}) { + for (int OC : {1, 4}) { + for (int s0 : {1, 2}) { + for (int p1 : {0, 1}) { + for (int d2 : {1, 2}) { + int64_t IW = 20, IH = 22, ID = 18; + int64_t KW = 3, KH = 3, KD = 3; + int s1 = s0, s2 = s0; + int p0 = 
p1, p2 = p1; + int d0 = d2, d1 = d2; + + if (calc_conv_output_size_3d(IW, KW, s0, p0, d0) <= 0 || + calc_conv_output_size_3d(IH, KH, s1, p1, d1) <= 0 || + calc_conv_output_size_3d(ID, KD, s2, p2, d2) <= 0) { + continue; + } + test_cases.emplace_back(new test_conv_3d( + N, IC, ID, IH, IW, + OC, KD, KH, KW, + s0, s1, s2, p0, p1, p2, d0, d1, d2, + kernel_type)); + + // Asymmetric kernel and params + int64_t asym_KW = 5, asym_KH = 1, asym_KD = 3; + int asym_s0 = 2, asym_s1 = 1, asym_s2 = 1; + int asym_p0 = 2, asym_p1 = 0, asym_p2 = 1; + int asym_d0 = 1, asym_d1 = 1, asym_d2 = 2; + + if (calc_conv_output_size_3d(IW, asym_KW, asym_s0, asym_p0, asym_d0) <= 0 || + calc_conv_output_size_3d(IH, asym_KH, asym_s1, asym_p1, asym_d1) <= 0 || + calc_conv_output_size_3d(ID, asym_KD, asym_s2, asym_p2, asym_d2) <= 0) { + continue; + } + test_cases.emplace_back(new test_conv_3d( + N, IC, ID, IH, IW, + OC, asym_KD, asym_KH, asym_KW, + asym_s0, asym_s1, asym_s2, asym_p0, asym_p1, asym_p2, asym_d0, asym_d1, asym_d2, + kernel_type)); + } + } + } + } + } + } + // Case with kernel size 1 + test_cases.emplace_back(new test_conv_3d(1, 4, 8, 8, 8, 8, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, kernel_type)); + } + for(uint32_t Cout : {1, 9}){ for(uint32_t Cin : {1, 7}){ for(uint32_t K : {1, 3, 1337}){ From 32732f2459a598606055f0403f0e4ec148d06d68 Mon Sep 17 00:00:00 2001 From: Aldehir Rojas Date: Fri, 22 Aug 2025 11:04:08 -0500 Subject: [PATCH 125/140] model : gpt-oss add response_format support (#15494) --- common/chat.cpp | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/common/chat.cpp b/common/chat.cpp index 7f6809a4edc41..111b4a21b368c 100644 --- a/common/chat.cpp +++ b/common/chat.cpp @@ -1361,6 +1361,26 @@ static common_chat_params common_chat_params_init_gpt_oss(const common_chat_temp "<|end|>", }; + if (!inputs.json_schema.is_null()) { + data.grammar_lazy = false; + data.grammar = build_grammar([&](const common_grammar_builder & builder) { + auto schema = inputs.json_schema; + builder.resolve_refs(schema); + + auto not_end = builder.add_rule("not-end", + "[^<] | \"<\" [^|] | \"<|\" [^e] | \"<|e\" [^n] | \"<|en\" [^d] | \"<|end\" [^|] | \"<|end|\" [^>]"); + auto analysis = builder.add_rule("analysis", + "\"<|channel|>analysis<|message|>\" ( " + not_end + " )* \"<|end|>\""); + auto constraint = builder.add_rule("constraint", "\"<|constrain|>\"? [a-zA-Z0-9_-]+"); + auto final = builder.add_rule("final", + "\"<|channel|>final\" ( \" \" " + constraint + " )? \"<|message|>\" " + + builder.add_schema("response", schema) + ); + + builder.add_rule("root", "( " + analysis + " \"<|start|>assistant\" )? 
" + final); + }); + } + if (inputs.tools.is_array() && !inputs.tools.empty()) { data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED; data.grammar = build_grammar([&](const common_grammar_builder & builder) { @@ -2121,7 +2141,7 @@ static common_chat_params common_chat_templates_apply_jinja( } // GPT-OSS - if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) { + if (src.find("<|channel|>") != std::string::npos) { return common_chat_params_init_gpt_oss(tmpl, params); } From 45363632cbd593537d541e81b600242e0b3d47fc Mon Sep 17 00:00:00 2001 From: Reese Levine Date: Fri, 22 Aug 2025 11:28:03 -0700 Subject: [PATCH 126/140] ggml WebGPU: add support for quantization types (#15440) * Begin work on set_rows * Work on set rows * Add error buffers for reporting unsupported SET_ROWS indices * Remove extra comments * Work on templating for different types in shaders * Work on shader type generation * Working q4_0 mul_mat and some templating for different types * Add q4_0_f16 matmul and fix device init * Add matmul support for basic quantization types * Add q2_k and q3_k quantization * Add rest of k-quants * Get firt i-quant working * Closer to supporting all i-quants * Support rest of i-quants * Cleanup code * Fix python formatting * debug * Bugfix for memset * Add padding to end of buffers on creation * Simplify bit-shifting * Update usage of StringView --- ggml/src/ggml-webgpu/CMakeLists.txt | 4 +- ggml/src/ggml-webgpu/ggml-webgpu.cpp | 426 ++-- .../ggml-webgpu/wgsl-shaders/embed_wgsl.py | 94 +- ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl | 16 +- .../wgsl-shaders/mul_mat.tmpl.wgsl | 1794 +++++++++++++++++ .../src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl | 56 - 6 files changed, 2145 insertions(+), 245 deletions(-) create mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl delete mode 100644 ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl diff --git a/ggml/src/ggml-webgpu/CMakeLists.txt b/ggml/src/ggml-webgpu/CMakeLists.txt index 79ef68b85a477..78a985a4d167a 100644 --- a/ggml/src/ggml-webgpu/CMakeLists.txt +++ b/ggml/src/ggml-webgpu/CMakeLists.txt @@ -20,8 +20,8 @@ add_custom_command( COMMAND ${CMAKE_COMMAND} -E make_directory ${SHADER_OUTPUT_DIR} COMMAND ${CMAKE_COMMAND} -E env PYTHONIOENCODING=utf-8 ${Python3_EXECUTABLE} ${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders/embed_wgsl.py - --input "${SHADER_DIR}" - --output "${SHADER_HEADER}" + --input_dir "${SHADER_DIR}" + --output_file "${SHADER_HEADER}" DEPENDS ${WGSL_SHADER_FILES} ${CMAKE_CURRENT_SOURCE_DIR}/wgsl-shaders/embed_wgsl.py VERBATIM ) diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp index ba1addc8d9f29..32f1e304e1e63 100644 --- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp +++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp @@ -118,13 +118,11 @@ struct webgpu_context_struct { std::recursive_mutex mutex; - bool device_init = false; - webgpu_buf_pool param_buf_pool; webgpu_buf_pool set_rows_error_buf_pool; wgpu::ComputePipeline memset_pipeline; - wgpu::ComputePipeline mul_mat_pipeline; + wgpu::ComputePipeline mul_mat_pipeline[30][2]; wgpu::ComputePipeline set_rows_pipeline; wgpu::ComputePipeline cpy_pipeline; @@ -238,7 +236,7 @@ static void ggml_backend_webgpu_wait_on_submission(webgpu_context & ctx) { wgpu::CallbackMode::AllowSpontaneous, [](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { if (status != wgpu::QueueWorkDoneStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data); + GGML_LOG_ERROR("ggml_webgpu: 
Failed to submit commands: %s\n", std::string(message).c_str()); } }), UINT64_MAX); @@ -278,7 +276,7 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { wgpu::CallbackMode::AllowSpontaneous, [ctx, staged_param_bufs](wgpu::QueueWorkDoneStatus status, wgpu::StringView message) { if (status != wgpu::QueueWorkDoneStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", message.data); + GGML_LOG_ERROR("ggml_webgpu: Failed to submit commands: %s\n", std::string(message).c_str()); } // Free the staged buffers ctx->param_buf_pool.free_bufs(staged_param_bufs); @@ -294,7 +292,7 @@ static void ggml_backend_webgpu_submit_queue(webgpu_context & ctx) { wgpu::CallbackMode::AllowSpontaneous, [ctx, error_bufs](wgpu::MapAsyncStatus status, wgpu::StringView message) { if (status != wgpu::MapAsyncStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", message.data); + GGML_LOG_ERROR("ggml_webgpu: Failed to map error buffer: %s\n", std::string(message).c_str()); } else { const uint32_t * error_data = (const uint32_t *) error_bufs.host_buf.GetConstMappedRange(); if (*error_data) { @@ -331,6 +329,7 @@ static void ggml_backend_webgpu_map_buffer(webgpu_context & ctx, // To use, add a bind group entry to the setup for the shader you are debugging, add the buffer and // debug statements in the shader, and then call this function after encoding the commands and submitting them. static void ggml_backend_webgpu_debug(webgpu_context & ctx) { + ggml_backend_webgpu_submit_queue(ctx); wgpu::CommandEncoder encoder = ctx->device.CreateCommandEncoder(); encoder.CopyBufferToBuffer(ctx->debug_dev_buf, 0, ctx->debug_host_buf, 0, ctx->debug_host_buf.GetSize()); wgpu::CommandBuffer commands = encoder.Finish(); @@ -421,15 +420,6 @@ static void ggml_backend_webgpu_buffer_memset(webgpu_context & ctx, ggml_backend_webgpu_build_and_enqueue(ctx, ctx->memset_pipeline, params, entries, wg_x, true); } -static size_t ggml_backend_webgpu_tensor_offset(const ggml_tensor * tensor) { - return webgpu_tensor_offset(tensor) + tensor->view_offs; -} - -static wgpu::Buffer ggml_backend_webgpu_tensor_buf(const ggml_tensor * tensor) { - ggml_backend_webgpu_buffer_context * ctx = (ggml_backend_webgpu_buffer_context *) tensor->buffer->context; - return ctx->buffer; -} - /** End WebGPU Actions */ /** GGML Backend Interface */ @@ -447,19 +437,36 @@ static void ggml_backend_webgpu_free(ggml_backend_t backend) { GGML_UNUSED(ctx); } +static size_t ggml_webgpu_tensor_offset(const ggml_tensor * tensor) { + return webgpu_tensor_offset(tensor) + tensor->view_offs; +} + +static wgpu::Buffer ggml_webgpu_tensor_buf(const ggml_tensor * tensor) { + ggml_backend_webgpu_buffer_context * ctx = (ggml_backend_webgpu_buffer_context *) tensor->buffer->context; + return ctx->buffer; +} + +static size_t ggml_webgpu_tensor_misalignment(webgpu_context & ctx, ggml_tensor * t) { + size_t offset = ggml_webgpu_tensor_offset(t); + return offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); +} + +static size_t ggml_webgpu_tensor_align_offset(webgpu_context & ctx, ggml_tensor * t) { + size_t offset = ggml_webgpu_tensor_offset(t); + return offset & ~(ctx->limits.minStorageBufferOffsetAlignment - 1); +} + +static size_t ggml_webgpu_tensor_binding_size(webgpu_context & ctx, ggml_tensor * t) { + return (ggml_nbytes(t) + ggml_webgpu_tensor_misalignment(ctx, t) + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & + ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1); +} + static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * 
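Factoring the offset math into these three helpers replaces the open-coded bit twiddling that was previously repeated in the cpy/set_rows/mul_mat paths below. A worked example with assumed numbers (not from the patch):

    #include <cassert>
    #include <cstddef>

    int main() {
        const size_t align  = 256;  // minStorageBufferOffsetAlignment, a power of two
        const size_t offset = 1000; // raw tensor offset inside the buffer

        const size_t misalign = offset & (align - 1);  // 232: skipped in-shader, passed as a param
        const size_t aligned  = offset & ~(align - 1); // 768: legal wgpu binding offset

        assert(aligned + misalign == offset);
    }
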
src, ggml_tensor * dst) { - size_t src_offset = ggml_backend_webgpu_tensor_offset(src); - // assumes power of 2 offset alignment - size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - // align to minimum offset alignment - src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst); - size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - uint32_t ne = (uint32_t) ggml_nelements(dst); + uint32_t ne = (uint32_t) ggml_nelements(dst); + std::vector params = { ne, - (uint32_t) (src_misalignment / ggml_type_size(src->type)), - (uint32_t) (dst_misalignment / ggml_type_size(dst->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), // Convert byte-strides to element-strides (uint32_t) (src->nb[0] / ggml_type_size(src->type)), (uint32_t) (src->nb[1] / ggml_type_size(src->type)), @@ -477,15 +484,13 @@ static void ggml_webgpu_cpy(webgpu_context & ctx, ggml_tensor * src, ggml_tensor std::vector entries = { { .binding = 0, - .buffer = ggml_backend_webgpu_tensor_buf(src), - .offset = src_offset, - .size = (ggml_nbytes(src) + src_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & - ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) }, + .buffer = ggml_webgpu_tensor_buf(src), + .offset = ggml_webgpu_tensor_align_offset(ctx, src), + .size = ggml_webgpu_tensor_binding_size(ctx, src) }, { .binding = 1, - .buffer = ggml_backend_webgpu_tensor_buf(dst), - .offset = dst_offset, - .size = (ggml_nbytes(dst) + dst_misalignment + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & - ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1) } + .buffer = ggml_webgpu_tensor_buf(dst), + .offset = ggml_webgpu_tensor_align_offset(ctx, dst), + .size = ggml_webgpu_tensor_binding_size(ctx, dst) } }; size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; @@ -504,21 +509,9 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t error_bufs.host_buf.Unmap(); } - size_t src_offset = ggml_backend_webgpu_tensor_offset(src); - // assumes power of 2 offset alignment - size_t src_misalignment = src_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - // align to minimum offset alignment - src_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - size_t idx_offset = ggml_backend_webgpu_tensor_offset(idx); - size_t idx_misalignment = idx_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - idx_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - size_t dst_offset = ggml_backend_webgpu_tensor_offset(dst); - size_t dst_misalignment = dst_offset & (ctx->limits.minStorageBufferOffsetAlignment - 1); - dst_offset &= ~(ctx->limits.minStorageBufferOffsetAlignment - 1); - - std::vector params = { (uint32_t) (src_misalignment / ggml_type_size(src->type)), - (uint32_t) (idx_misalignment / ggml_type_size(idx->type)), - (uint32_t) (dst_misalignment / ggml_type_size(dst->type)), + std::vector params = { (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src) / ggml_type_size(src->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, idx) / ggml_type_size(idx->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), // Convert byte-strides to element-strides (uint32_t) (src->nb[1] / ggml_type_size(src->type)), (uint32_t) (src->nb[2] / 
ggml_type_size(src->type)), @@ -540,18 +533,18 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t std::vector entries = { { .binding = 0, - .buffer = ggml_backend_webgpu_tensor_buf(src), - .offset = ggml_backend_webgpu_tensor_offset(src), - .size = ggml_nbytes(src) }, + .buffer = ggml_webgpu_tensor_buf(src), + .offset = ggml_webgpu_tensor_align_offset(ctx, src), + .size = ggml_webgpu_tensor_binding_size(ctx, src) }, { .binding = 1, - .buffer = ggml_backend_webgpu_tensor_buf(idx), - .offset = ggml_backend_webgpu_tensor_offset(idx), - .size = ggml_nbytes(idx) }, + .buffer = ggml_webgpu_tensor_buf(idx), + .offset = ggml_webgpu_tensor_align_offset(ctx, idx), + .size = ggml_webgpu_tensor_binding_size(ctx, idx) }, { .binding = 2, - .buffer = ggml_backend_webgpu_tensor_buf(dst), - .offset = ggml_backend_webgpu_tensor_offset(dst), - .size = ggml_nbytes(dst) }, - { .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() } + .buffer = ggml_webgpu_tensor_buf(dst), + .offset = ggml_webgpu_tensor_align_offset(ctx, dst), + .size = ggml_webgpu_tensor_binding_size(ctx, dst) }, + { .binding = 3, .buffer = error_bufs.dev_buf, .offset = 0, .size = error_bufs.dev_buf.GetSize() } }; size_t max_wg_size = ctx->limits.maxComputeWorkgroupSizeX; @@ -565,15 +558,18 @@ static void ggml_webgpu_set_rows(webgpu_context & ctx, ggml_tensor * src, ggml_t static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_tensor * src1, ggml_tensor * dst) { std::vector params = { + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src0) / ggml_type_size(src0->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, src1) / ggml_type_size(src1->type)), + (uint32_t) (ggml_webgpu_tensor_misalignment(ctx, dst) / ggml_type_size(dst->type)), (uint32_t) dst->ne[1], // number of rows in result (M) (uint32_t) dst->ne[0], // number of columns in result (N) (uint32_t) src0->ne[0], // number of columns in src0/src1 (K) - (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 1 - (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 1 - (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 2 - (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 2 - (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)), // stride (elements) of src0 in dimension 3 - (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)), // stride (elements) of src1 in dimension 3 + (uint32_t) (src0->nb[1] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 1 + (uint32_t) (src1->nb[1] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 1 + (uint32_t) (src0->nb[2] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 2 + (uint32_t) (src1->nb[2] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 2 + (uint32_t) (src0->nb[3] / ggml_type_size(src0->type)), // stride (elements/blocks) of src0 in dimension 3 + (uint32_t) (src1->nb[3] / ggml_type_size(src1->type)), // stride (elements/blocks) of src1 in dimension 3 (uint32_t) src0->ne[2], // batch size in dimension 2 (uint32_t) src0->ne[3], // batch size in dimension 3 (uint32_t) (src1->ne[2] / src0->ne[2]), // broadcast in dimension 2 @@ -582,22 +578,22 @@ static void ggml_webgpu_mul_mat(webgpu_context & ctx, ggml_tensor * src0, ggml_t std::vector entries = { { .binding 
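The reworded stride comments matter for quantized types: ggml's nb[] strides are in bytes, and dividing by ggml_type_size yields a count of type-sized units, which for q4_0 and friends is blocks rather than scalar elements. A sketch with assumed shapes:

    #include <cassert>
    #include <cstdint>

    int main() {
        const uint32_t QK4_0     = 32;   // elements per q4_0 block
        const uint32_t type_size = 18;   // bytes per q4_0 block: f16 scale + 16 nibble bytes
        const uint32_t ne0       = 4096; // assumed row length in elements

        const uint32_t nb1 = ne0 / QK4_0 * type_size; // 2304 bytes per row
        assert(nb1 / type_size == 128);               // stride(1) = 128 blocks, not 4096 elements
    }
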
= 0, - .buffer = ggml_backend_webgpu_tensor_buf(src0), - .offset = ggml_backend_webgpu_tensor_offset(src0), - .size = ggml_nbytes(src0) }, + .buffer = ggml_webgpu_tensor_buf(src0), + .offset = ggml_webgpu_tensor_align_offset(ctx, src0), + .size = ggml_webgpu_tensor_binding_size(ctx, src0) }, { .binding = 1, - .buffer = ggml_backend_webgpu_tensor_buf(src1), - .offset = ggml_backend_webgpu_tensor_offset(src1), - .size = ggml_nbytes(src1) }, + .buffer = ggml_webgpu_tensor_buf(src1), + .offset = ggml_webgpu_tensor_align_offset(ctx, src1), + .size = ggml_webgpu_tensor_binding_size(ctx, src1) }, { .binding = 2, - .buffer = ggml_backend_webgpu_tensor_buf(dst), - .offset = ggml_backend_webgpu_tensor_offset(dst), - .size = ggml_nbytes(dst) } + .buffer = ggml_webgpu_tensor_buf(dst), + .offset = ggml_webgpu_tensor_align_offset(ctx, dst), + .size = ggml_webgpu_tensor_binding_size(ctx, dst) }, }; uint32_t wg_x = (dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3] + WEBGPU_MUL_MAT_WG_SIZE - 1) / WEBGPU_MUL_MAT_WG_SIZE; - ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline, params, entries, wg_x); + ggml_backend_webgpu_build_and_enqueue(ctx, ctx->mul_mat_pipeline[src0->type][src1->type], params, entries, wg_x); } // Returns true if node has enqueued work into the queue, false otherwise @@ -827,7 +823,7 @@ static ggml_backend_buffer_t ggml_backend_webgpu_buffer_type_alloc_buffer(ggml_b wgpu::Buffer buf; ggml_webgpu_create_buffer(ctx->webgpu_ctx->device, buf, - size, + (size + WEBGPU_STORAGE_BUF_BINDING_MULT - 1) & ~(WEBGPU_STORAGE_BUF_BINDING_MULT - 1), wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::CopyDst, "allocated_buffer"); @@ -907,7 +903,94 @@ static void ggml_webgpu_init_memset_pipeline(webgpu_context & webgpu_ctx) { } static void ggml_webgpu_init_mul_mat_pipeline(webgpu_context & webgpu_ctx) { - ggml_webgpu_create_pipeline(webgpu_ctx->device, webgpu_ctx->mul_mat_pipeline, wgsl_mul_mat, "mul_mat"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F32][GGML_TYPE_F32], + wgsl_mul_mat_f32_f32, + "mul_mat_f32_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F16], + wgsl_mul_mat_f16_f16, + "mul_mat_f16_f16"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_F16][GGML_TYPE_F32], + wgsl_mul_mat_f16_f32, + "mul_mat_f16_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_0][GGML_TYPE_F32], + wgsl_mul_mat_q4_0_f32, + "mul_mat_q4_0_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_1][GGML_TYPE_F32], + wgsl_mul_mat_q4_1_f32, + "mul_mat_q4_1_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_0][GGML_TYPE_F32], + wgsl_mul_mat_q5_0_f32, + "mul_mat_q5_0_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_1][GGML_TYPE_F32], + wgsl_mul_mat_q5_1_f32, + "mul_mat_q5_1_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q8_0][GGML_TYPE_F32], + wgsl_mul_mat_q8_0_f32, + "mul_mat_q8_0_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q2_K][GGML_TYPE_F32], + wgsl_mul_mat_q2_k_f32, + "mul_mat_q2_k_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q3_K][GGML_TYPE_F32], + wgsl_mul_mat_q3_k_f32, + 
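This run of create-pipeline calls (continuing below) is a static mapping from (src0 type, src1 type) to an embedded shader string; an equivalent table-driven formulation would be (a sketch only, not the committed code):

    struct mul_mat_shader_entry {
        ggml_type    src0;
        ggml_type    src1;
        const char * code; // one of the generated wgsl_mul_mat_* strings
        const char * name;
    };
    // static const mul_mat_shader_entry entries[] = {
    //     { GGML_TYPE_Q4_K, GGML_TYPE_F32, wgsl_mul_mat_q4_k_f32, "mul_mat_q4_k_f32" }, ...
    // };
    // for (const auto & e : entries)
    //     ggml_webgpu_create_pipeline(ctx->device, ctx->mul_mat_pipeline[e.src0][e.src1], e.code, e.name);
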
"mul_mat_q3_k_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q4_K][GGML_TYPE_F32], + wgsl_mul_mat_q4_k_f32, + "mul_mat_q4_k_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q5_K][GGML_TYPE_F32], + wgsl_mul_mat_q5_k_f32, + "mul_mat_q5_k_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_Q6_K][GGML_TYPE_F32], + wgsl_mul_mat_q6_k_f32, + "mul_mat_q6_k_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XXS][GGML_TYPE_F32], + wgsl_mul_mat_iq2_xxs_f32, + "mul_mat_iq2_xxs_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_XS][GGML_TYPE_F32], + wgsl_mul_mat_iq2_xs_f32, + "mul_mat_iq2_xs_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ2_S][GGML_TYPE_F32], + wgsl_mul_mat_iq2_s_f32, + "mul_mat_iq2_s_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_XXS][GGML_TYPE_F32], + wgsl_mul_mat_iq3_xxs_f32, + "mul_mat_iq3_xxs_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ3_S][GGML_TYPE_F32], + wgsl_mul_mat_iq3_s_f32, + "mul_mat_iq3_s_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_S][GGML_TYPE_F32], + wgsl_mul_mat_iq1_s_f32, + "mul_mat_iq1_s_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ1_M][GGML_TYPE_F32], + wgsl_mul_mat_iq1_m_f32, + "mul_mat_iq1_m_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_NL][GGML_TYPE_F32], + wgsl_mul_mat_iq4_nl_f32, + "mul_mat_iq4_nl_f32"); + ggml_webgpu_create_pipeline(webgpu_ctx->device, + webgpu_ctx->mul_mat_pipeline[GGML_TYPE_IQ4_XS][GGML_TYPE_F32], + wgsl_mul_mat_iq4_xs_f32, + "mul_mat_iq4_xs_f32"); } static void ggml_webgpu_init_set_rows_pipeline(webgpu_context & webgpu_ctx) { @@ -933,79 +1016,6 @@ static ggml_backend_t ggml_backend_webgpu_device_init(ggml_backend_dev_t dev, co ggml_backend_webgpu_device_context * dev_ctx = static_cast(dev->context); webgpu_context webgpu_ctx = dev_ctx->webgpu_ctx; - // Multiple threads may try to initialize the device - std::lock_guard lock(webgpu_ctx->mutex); - if (!webgpu_ctx->device_init) { - // Initialize device - std::vector required_features = { wgpu::FeatureName::ShaderF16, - wgpu::FeatureName::ImplicitDeviceSynchronization }; - wgpu::DeviceDescriptor dev_desc; - dev_desc.requiredLimits = &webgpu_ctx->limits; - dev_desc.requiredFeatures = required_features.data(); - dev_desc.requiredFeatureCount = required_features.size(); - dev_desc.SetDeviceLostCallback( - wgpu::CallbackMode::AllowSpontaneous, - [](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) { - GGML_UNUSED(device); - GGML_LOG_ERROR( - "ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), message.data); - }); - dev_desc.SetUncapturedErrorCallback( - [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) { - GGML_UNUSED(device); - GGML_LOG_ERROR( - "ggml_webgpu: Device error! 
Reason: %d, Message: %s\n", static_cast(reason), message.data); - }); - webgpu_ctx->instance.WaitAny( - webgpu_ctx->adapter.RequestDevice( - &dev_desc, - wgpu::CallbackMode::AllowSpontaneous, - [webgpu_ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { - if (status != wgpu::RequestDeviceStatus::Success) { - GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", message.data); - return; - } - webgpu_ctx->device = std::move(device); - }), - UINT64_MAX); - GGML_ASSERT(webgpu_ctx->device != nullptr); - - // Initialize (compute) queue - webgpu_ctx->queue = webgpu_ctx->device.GetQueue(); - - // Create buffer pool for shader parameters - webgpu_ctx->param_buf_pool.init(webgpu_ctx->device, - WEBGPU_NUM_PARAM_BUFS, - WEBGPU_PARAMS_BUF_SIZE_BYTES, - wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, - wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite); - webgpu_ctx->set_rows_error_buf_pool.init(webgpu_ctx->device, - WEBGPU_NUM_SET_ROWS_ERROR_BUFS, - WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES, - wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage, - wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); - - ggml_webgpu_init_memset_pipeline(webgpu_ctx); - ggml_webgpu_init_mul_mat_pipeline(webgpu_ctx); - ggml_webgpu_init_set_rows_pipeline(webgpu_ctx); - ggml_webgpu_init_cpy_pipeline(webgpu_ctx); - -#ifdef GGML_WEBGPU_DEBUG - // Initialize debug buffers - ggml_webgpu_create_buffer(webgpu_ctx->device, - webgpu_ctx->debug_host_buf, - WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t), - wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, - "debug_host_buf"); - ggml_webgpu_create_buffer(webgpu_ctx->device, - webgpu_ctx->debug_dev_buf, - WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t), - wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, - "debug_dev_buf"); -#endif - webgpu_ctx->device_init = true; - } - static ggml_backend_webgpu_context backend_ctx; backend_ctx.name = GGML_WEBGPU_NAME + std::string(": ") + dev_ctx->device_name; backend_ctx.webgpu_ctx = webgpu_ctx; @@ -1053,10 +1063,45 @@ static bool ggml_backend_webgpu_device_supports_op(ggml_backend_dev_t dev, const case GGML_OP_VIEW: case GGML_OP_PERMUTE: return true; - case GGML_OP_CPY | GGML_OP_SET_ROWS: + case GGML_OP_CPY: + case GGML_OP_SET_ROWS: return op->type == GGML_TYPE_F16 && op->src[0]->type == GGML_TYPE_F32; case GGML_OP_MUL_MAT: - return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32; + { + switch (op->src[1]->type) { + case GGML_TYPE_F16: + return op->src[0]->type == GGML_TYPE_F16; + case GGML_TYPE_F32: + switch (op->src[0]->type) { + case GGML_TYPE_F32: + case GGML_TYPE_F16: + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + return true; + default: + return false; + } + default: + return false; + } + } default: return false; } @@ -1123,20 +1168,87 @@ static ggml_backend_dev_t ggml_backend_webgpu_reg_get_device(ggml_backend_reg_t wgpu::AdapterInfo info{}; ctx->adapter.GetInfo(&info); + // Initialize device + std::vector required_features = { wgpu::FeatureName::ShaderF16, + wgpu::FeatureName::ImplicitDeviceSynchronization }; + wgpu::DeviceDescriptor dev_desc; + 
dev_desc.requiredLimits = &ctx->limits; + dev_desc.requiredFeatures = required_features.data(); + dev_desc.requiredFeatureCount = required_features.size(); + dev_desc.SetDeviceLostCallback( + wgpu::CallbackMode::AllowSpontaneous, + [](const wgpu::Device & device, wgpu::DeviceLostReason reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR( + "ggml_webgpu: Device lost! Reason: %d, Message: %s\n", static_cast(reason), std::string(message).c_str()); + }); + dev_desc.SetUncapturedErrorCallback( + [](const wgpu::Device & device, wgpu::ErrorType reason, wgpu::StringView message) { + GGML_UNUSED(device); + GGML_LOG_ERROR( + "ggml_webgpu: Device error! Reason: %d, Message: %s\n", static_cast(reason), std::string(message).c_str()); + }); + ctx->instance.WaitAny(ctx->adapter.RequestDevice( + &dev_desc, + wgpu::CallbackMode::AllowSpontaneous, + [ctx](wgpu::RequestDeviceStatus status, wgpu::Device device, wgpu::StringView message) { + if (status != wgpu::RequestDeviceStatus::Success) { + GGML_LOG_ERROR("ggml_webgpu: Failed to get a device: %s\n", std::string(message).c_str()); + return; + } + ctx->device = std::move(device); + }), + UINT64_MAX); + GGML_ASSERT(ctx->device != nullptr); + + // Initialize (compute) queue + ctx->queue = ctx->device.GetQueue(); + + // Create buffer pool for shader parameters + ctx->param_buf_pool.init(ctx->device, + WEBGPU_NUM_PARAM_BUFS, + WEBGPU_PARAMS_BUF_SIZE_BYTES, + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::Uniform, + wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::MapWrite); + ctx->set_rows_error_buf_pool.init(ctx->device, + WEBGPU_NUM_SET_ROWS_ERROR_BUFS, + WEBGPU_SET_ROWS_ERROR_BUF_SIZE_BYTES, + wgpu::BufferUsage::CopySrc | wgpu::BufferUsage::Storage, + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead); + + ggml_webgpu_init_memset_pipeline(ctx); + ggml_webgpu_init_mul_mat_pipeline(ctx); + ggml_webgpu_init_set_rows_pipeline(ctx); + ggml_webgpu_init_cpy_pipeline(ctx); + +#ifdef GGML_WEBGPU_DEBUG + // Initialize debug buffers + ggml_webgpu_create_buffer(ctx->device, + ctx->debug_host_buf, + WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t), + wgpu::BufferUsage::CopyDst | wgpu::BufferUsage::MapRead, + "debug_host_buf"); + ggml_webgpu_create_buffer(ctx->device, + ctx->debug_dev_buf, + WEBGPU_DEBUG_BUF_ELEMS * sizeof(uint32_t), + wgpu::BufferUsage::Storage | wgpu::BufferUsage::CopySrc, + "debug_dev_buf"); +#endif + static ggml_backend_webgpu_device_context device_ctx; device_ctx.webgpu_ctx = ctx; device_ctx.device_name = GGML_WEBGPU_NAME; - device_ctx.device_desc = std::string(info.description.data); + device_ctx.device_desc = info.description; GGML_LOG_INFO( "ggml_webgpu: adapter_info: vendor_id: %u | vendor: %s | architecture: %s | device_id: %u | name: %s | " "device_desc: %s\n", info.vendorID, - info.vendor.data, - info.architecture.data, + std::string(info.vendor).c_str(), + std::string(info.architecture).c_str(), info.deviceID, - info.device.data, - info.description.data); + std::string(info.device).c_str(), + std::string(info.description).c_str()); // See GGML Backend Device Interface section static ggml_backend_device device = { diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py index 962dcd6b170ed..cc8def7f13ea4 100755 --- a/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py +++ b/ggml/src/ggml-webgpu/wgsl-shaders/embed_wgsl.py @@ -1,35 +1,85 @@ import os +import re +import ast import argparse -def escape_triple_quotes(wgsl): - # Simple defense in case of embedded """ - return 
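The embed_wgsl.py rewrite below turns the embedder from a file-to-string copier into a small template expander: a shader file may declare VARIANTS (parameter sets), DECLS (named snippet blocks), and a SHADER body with {{KEY}} placeholders. The substitution step, mirrored in C++ purely for illustration:

    #include <regex>
    #include <string>

    // expand one {{KEY}} placeholder; assumes key contains no regex metacharacters
    static std::string replace_placeholder(const std::string & shader,
                                           const std::string & key,
                                           const std::string & val) {
        const std::regex pattern("\\{\\{\\s*" + key + "\\s*\\}\\}");
        return std::regex_replace(shader, pattern, val);
    }
    // replace_placeholder("fn f(x: {{SRC0_TYPE}})", "SRC0_TYPE", "q4_0")
    //   -> "fn f(x: q4_0)"
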
wgsl.replace('"""', '\\"""') +def extract_block(text, name): + pattern = rf'#define\({name}\)\s*(.*?)#end\({name}\)' + match = re.search(pattern, text, re.DOTALL) + if not match: + raise ValueError(f"Missing block: {name}") + return match.group(1).strip() -def to_cpp_string_literal(varname, content): - return f'const char* wgsl_{varname} = R"({content})";\n' +def parse_decls(decls_text): + decls = {} + for name, code in re.findall(r'#decl\((.*?)\)\s*(.*?)#enddecl\(\1\)', decls_text, re.DOTALL): + decls[name.strip()] = code.strip() + return decls + + +def replace_placeholders(shader_text, replacements): + for key, val in replacements.items(): + # Match {{KEY}} literally, where KEY is escaped + pattern = r'{{\s*' + re.escape(key) + r'\s*}}' + shader_text = re.sub(pattern, str(val), shader_text) + return shader_text + + +def write_shader(shader_name, shader_code, output_dir, outfile): + if output_dir: + wgsl_filename = os.path.join(output_dir, f"{shader_name}.wgsl") + with open(wgsl_filename, "w", encoding="utf-8") as f_out: + f_out.write(shader_code) + outfile.write(f'const char* wgsl_{shader_name} = R"({shader_code})";\n\n') + + +def generate_variants(shader_path, output_dir, outfile): + shader_base_name = shader_path.split("/")[-1].split(".")[0] + + with open(shader_path, "r", encoding="utf-8") as f: + text = f.read() + + try: + variants = ast.literal_eval(extract_block(text, "VARIANTS")) + except ValueError: + write_shader(shader_base_name, text, output_dir, outfile) + else: + decls_map = parse_decls(extract_block(text, "DECLS")) + shader_template = extract_block(text, "SHADER") + + for variant in variants: + decls = variant["DECLS"] + decls_code = "" + for key in decls: + if key not in decls_map: + raise ValueError(f"DECLS key '{key}' not found.") + decls_code += decls_map[key] + "\n\n" + + shader_variant = replace_placeholders(shader_template, variant["REPLS"]) + final_shader = re.sub(r'\bDECLS\b', decls_code, shader_variant) + + output_name = f"{shader_base_name}_" + "_".join([variant["REPLS"]["SRC0_TYPE"], variant["REPLS"]["SRC1_TYPE"]]) + write_shader(output_name, final_shader, output_dir, outfile) def main(): parser = argparse.ArgumentParser() - parser.add_argument('--input', required=True) - parser.add_argument('--output', required=True) + parser.add_argument("--input_dir", required=True) + parser.add_argument("--output_file", required=True) + parser.add_argument("--output_dir") args = parser.parse_args() - with open(args.output, 'w', encoding='utf-8') as out: - out.write("// Auto-generated shader embedding \n\n") - for fname in sorted(os.listdir(args.input)): - if not fname.endswith('.wgsl'): - continue - shader_path = os.path.join(args.input, fname) - varname = os.path.splitext(fname)[0] - with open(shader_path, 'r', encoding='utf-8') as f: - content = f.read() - content = escape_triple_quotes(content) - out.write(to_cpp_string_literal(varname, content)) - out.write('\n') - - -if __name__ == '__main__': + if args.output_dir: + os.makedirs(args.output_dir, exist_ok=True) + + with open(args.output_file, "w", encoding="utf-8") as out: + out.write("// Auto-generated shader embedding\n\n") + for fname in sorted(os.listdir(args.input_dir)): + if fname.endswith(".wgsl"): + generate_variants(os.path.join(args.input_dir, fname), args.output_dir, out) + + +if __name__ == "__main__": main() diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl index cb7c8c3e09e91..194d2d6f58c77 100644 --- a/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl 
+++ b/ggml/src/ggml-webgpu/wgsl-shaders/memset.wgsl @@ -19,20 +19,20 @@ fn main(@builtin(global_invocation_id) gid: vec3) { let start = params.offset; let end = params.offset + params.size; - for (var j: u32 = 0u; j < bytes_per_thread; j = j + 1u) { + for (var j: u32 = 0u; j < bytes_per_thread; j += 4) { let byte_index = start + i + j; - if (byte_index + 4u <= end) { - output_buffer[(byte_index >> 2u)] = params.value; + if (byte_index + 4 <= end) { + output_buffer[byte_index >> 2] = params.value; } else { // Handle tail (unaligned) - for (var k: u32 = 0u; k < 4u; k = k + 1u) { + for (var k: u32 = 0; k < 4; k++) { let idx = byte_index + k; if (idx < end) { - let word_idx = idx >> 2u; - let byte_offset = (idx & 3u) * 8u; - let mask = ~(0xffu << byte_offset); + let word_idx = idx >> 2; + let bit_offset = (idx & 3) * 8u; + let mask = ~(0xffu << bit_offset); let existing = output_buffer[word_idx]; - output_buffer[word_idx] = (existing & mask) | ((params.value & 0xffu) << byte_offset); + output_buffer[word_idx] = (existing & mask) | (params.value & (0xffu << bit_offset)); } } } diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl new file mode 100644 index 0000000000000..79465c298d726 --- /dev/null +++ b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.tmpl.wgsl @@ -0,0 +1,1794 @@ +#define(VARIANTS) + +[ + { + "REPLS": { + "SRC0_TYPE" : "f32", + "SRC1_TYPE" : "f32", + "BLOCK_SIZE" : 1 + }, + "DECLS" : ["FLOAT"] + }, + { + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f16", + "BLOCK_SIZE" : 1 + }, + "DECLS" : ["FLOAT"] + }, + { + "REPLS": { + "SRC0_TYPE" : "f16", + "SRC1_TYPE" : "f32", + "BLOCK_SIZE" : 1 + }, + "DECLS" : ["FLOAT"] + }, + { + "REPLS": { + "SRC0_TYPE": "q4_0", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q4_0"] + }, + { + "REPLS": { + "SRC0_TYPE": "q4_1", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q4_1"] + }, + { + "REPLS": { + "SRC0_TYPE": "q5_0", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q5_0"] + }, + { + "REPLS": { + "SRC0_TYPE": "q5_1", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q5_1"] + }, + { + "REPLS": { + "SRC0_TYPE": "q8_0", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32 + }, + "DECLS": ["BYTE_HELPERS", "Q8_0"] + }, + { + "REPLS": { + "SRC0_TYPE": "q2_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "Q2_K"] + }, + { + "REPLS": { + "SRC0_TYPE": "q3_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "Q3_K"] + }, + { + "REPLS": { + "SRC0_TYPE": "q4_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["Q45_K_SCALE_MIN", "BYTE_HELPERS", "Q4_K"] + }, + { + "REPLS": { + "SRC0_TYPE": "q5_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["Q45_K_SCALE_MIN", "BYTE_HELPERS", "Q5_K"] + }, + { + "REPLS": { + "SRC0_TYPE": "q6_k", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "Q6_K"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq2_xxs", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_XXS"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq2_xs", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_XS"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq2_s", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ2_S"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq3_xxs", + "SRC1_TYPE": "f32", + 
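The memset tail branch above now masks in the byte of the 4-byte fill pattern that belongs at that position, rather than always reusing the low byte. The same update in C++, with assumed values:

    #include <cassert>
    #include <cstdint>

    int main() {
        const uint32_t value = 0xDDCCBBAAu; // repeating 4-byte fill pattern (assumed)
        const uint32_t k     = 2;           // byte index within the word
        uint32_t       word  = 0x44332211u; // existing destination word

        const uint32_t bits = k * 8;
        const uint32_t mask = ~(0xFFu << bits);
        word = (word & mask) | (value & (0xFFu << bits));

        assert(word == 0x44CC2211u); // only byte 2 was rewritten
    }
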
"BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ3_XSS"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq3_s", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ23_TABLES", "IQ3_S"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq1_s", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ1_TABLE","IQ1_S"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq1_m", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256 + }, + "DECLS": ["BYTE_HELPERS", "IQ1_TABLE","IQ1_M"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq4_nl", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 32, + }, + "DECLS": ["BYTE_HELPERS", "IQ4_TABLE", "IQ4_NL"] + }, + { + "REPLS": { + "SRC0_TYPE": "iq4_xs", + "SRC1_TYPE": "f32", + "BLOCK_SIZE": 256, + }, + "DECLS": ["BYTE_HELPERS", "IQ4_TABLE", "IQ4_XS"] + } +] + +#end(VARIANTS) + +#define(DECLS) + +#decl(BYTE_HELPERS) + +fn get_byte(value: u32, index: u32) -> u32 { + return (value >> (index * 8)) & 0xFF; +} + +fn get_byte_i32(value: u32, index: u32) -> i32 { + return bitcast(((value >> (index * 8)) & 0xFF) << 24) >> 24; +} + +#enddecl(BYTE_HELPERS) + +#decl(FLOAT) +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + return f32(src0[src0_idx_base + offset]) * f32(src1[src1_idx_base + offset]); +} +#enddecl(FLOAT) + +#decl(Q4_0) +struct q4_0 { + d: f16, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q4_0 = src0[src0_idx_base + offset]; + let d = f32(block_q4_0.d); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 4; j++) { + let q_packed = bitcast(vec2(block_q4_0.qs[2 * j], block_q4_0.qs[2 * j + 1])); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let q_hi = (f32((q_byte >> 4) & 0xF) - 8.0f) * d; + let q_lo = (f32(q_byte & 0xF) - 8.0f) * d; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_lo * f32(src1[src1_offset]); + sum += q_hi * f32(src1[src1_offset + 16]); + } + } + return sum; +} +#enddecl(Q4_0) + +#decl(Q4_1) +struct q4_1 { + d: f16, + m: f16, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q4_1 = src0[src0_idx_base + offset]; + let d = f32(block_q4_1.d); + let m = f32(block_q4_1.m); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 4; j++) { + let q_packed = block_q4_1.qs[j]; + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let q_hi = f32((q_byte >> 4) & 0xF) * d + m; + let q_lo = f32(q_byte & 0xF) * d + m; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_lo * f32(src1[src1_offset]); + sum += q_hi * f32(src1[src1_offset + 16]); + } + } + return sum; +} +#enddecl(Q4_1) + +#decl(Q5_0) +struct q5_0 { + d: f16, + qh: array, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q5_0 = src0[src0_idx_base + offset]; + let d = f32(block_q5_0.d); + var sum: f32 = 0.0; + let qh_packed = bitcast(vec2(block_q5_0.qh[0], block_q5_0.qh[1])); + for (var j: u32 = 0; j < 4; j++) { + let q_packed = bitcast(vec2(block_q5_0.qs[2 * j], block_q5_0.qs[2 * j + 1])); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let qh_hi = (qh_packed >> (j * 4 + k + 12)) & 0x10; + let q_hi = (f32(((q_byte >> 4) & 0xF) | qh_hi) - 16.0) * d; + let qh_lo = ((qh_packed >> (j * 4 + k)) << 4) & 0x10; + let q_lo = (f32((q_byte & 0xF) | qh_lo) - 16.0) * d; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_lo * f32(src1[src1_offset]); + sum += 
q_hi * f32(src1[src1_offset + 16]); + } + } + return sum; +} +#enddecl(Q5_0) + +#decl(Q5_1) +struct q5_1 { + d: f16, + m: f16, + qh: u32, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q5_1 = src0[src0_idx_base + offset]; + let d = f32(block_q5_1.d); + let m = f32(block_q5_1.m); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 4; j++) { + let q_packed = block_q5_1.qs[j]; + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte(q_packed, k); + let qh_hi = (block_q5_1.qh >> (j * 4 + k + 12)) & 0x10; + let q_hi = f32(((q_byte >> 4) & 0xF) | qh_hi) * d + m; + let qh_lo = ((block_q5_1.qh >> (j * 4 + k)) << 4) & 0x10; + let q_lo = f32((q_byte & 0xF) | qh_lo) * d + m; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_lo * f32(src1[src1_offset]); + sum += q_hi * f32(src1[src1_offset + 16]); + } + } + return sum; +} +#enddecl(Q5_1) + +#decl(Q8_0) +struct q8_0 { + d: f16, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q8_0 = src0[src0_idx_base + offset]; + let d = f32(block_q8_0.d); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 8; j++) { + let q_packed = bitcast(vec2(block_q8_0.qs[2 * j], block_q8_0.qs[2 * j + 1])); + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte_i32(q_packed, k); + let q_val = f32(q_byte) * d; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_val * f32(src1[src1_offset]); + } + } + return sum; +} +#enddecl(Q8_0) + +#decl(Q8_1) +struct q8_1 { + d: f16, + m: f16, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block_q8_1 = src0[src0_idx_base + offset]; + let d = f32(block_q8_1.d); + let m = f32(block_q8_1.m); + var sum: f32 = 0.0; + for (var j: u32 = 0; j < 8; j++) { + let q_packed = block_q8_1.qs[j]; + for (var k: u32 = 0; k < 4; k++) { + let q_byte = get_byte_i32(q_packed, k); + let q_val = f32(q_byte) * d + m; + let src1_offset = src1_idx_base + offset * 32 + j * 4 + k; + sum += q_val * f32(src1[src1_offset]); + } + } + return sum; +} +#enddecl(Q8_1) + +#decl(Q2_K) +// 16 blocks of 16 elements each +struct q2_k { + scales: array, + qs: array, + d: f16, + dmin: f16 +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + let m = f32(block.dmin); + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var is: u32 = 0; + // 2 halves of the block (128 elements each) + for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) { + // 4 groups (each group has 2 blocks of 16 elements) + for (var shift: u32 = 0; shift < 8; shift += 2) { + // 2 blocks + for (var k: u32 = 0; k < 32; k += 16) { + let sc = get_byte(block.scales[is / 4], is % 4); + is++; + let dl = d * f32(sc & 0xF); + let ml = m * f32(sc >> 4); + for (var l: u32 = 0u; l < 16; l++) { + let q_idx = q_b_idx + k + l; + let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); + let qs_val = (q_byte >> shift) & 3; + sum += (f32(qs_val) * dl - ml) * src1[src1_i]; + src1_i++; + } + } + } + } + return sum; +} + +#enddecl(Q2_K) + +#decl(Q3_K) +// 16 blocks of 16 elements each +struct q3_k { + hmask: array, + qs: array, + scales: array, // 6-bit quantized values + d: f16 +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + + // extract 6-bit scales, which consist of 4-bits from first 8 bytes of 
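The q8_0 path above relies on get_byte_i32 because its weights are signed bytes packed four to a u32; the shift-up/arithmetic-shift-down pair sign-extends the selected byte. The identical trick in C++:

    #include <cassert>
    #include <cstdint>

    static int32_t get_byte_i32(uint32_t value, uint32_t index) {
        // move the byte to the top, then arithmetic-shift it back down
        return (int32_t)(((value >> (index * 8)) & 0xFFu) << 24) >> 24;
    }

    int main() {
        assert(get_byte_i32(0x000000F0u, 0) == -16);
        assert(get_byte_i32(0x00007F00u, 1) == 127);
    }
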
scale, + // and 2-bits from the last 4 bytes + let kmask1: u32 = 0x03030303; + let kmask2: u32 = 0x0f0f0f0f; + var scale_vals: array; + for (var i: u32 = 0; i < 4; i++) { + scale_vals[i] = bitcast(vec2(block.scales[2 * i], block.scales[2 * i + 1])); + } + var tmp: u32 = scale_vals[2]; + scale_vals[2] = ((scale_vals[0] >> 4) & kmask2) | (((tmp >> 4) & kmask1) << 4); + scale_vals[3] = ((scale_vals[1] >> 4) & kmask2) | (((tmp >> 6) & kmask1) << 4); + scale_vals[0] = (scale_vals[0] & kmask2) | ((tmp & kmask1) << 4); + scale_vals[1] = (scale_vals[1] & kmask2) | (((tmp >> 2) & kmask1) << 4); + + // convert arrays of f16 -> u32 + var hmask_vals: array; + for (var i: u32 = 0; i < 8; i++) { + hmask_vals[i] = bitcast(vec2(block.hmask[2 * i], block.hmask[2 * i + 1])); + } + var qs_vals: array; + for (var i: u32 = 0; i < 16; i++) { + qs_vals[i] = bitcast(vec2(block.qs[2 * i], block.qs[2 * i + 1])); + } + + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var is: u32 = 0; + var m: u32 = 1; + // 2 halves of the block (128 elements each) + for (var q_b_idx: u32 = 0; q_b_idx < 64; q_b_idx += 32) { + // 4 groups (each group has 2 blocks of 16 elements) + for (var shift: u32 = 0; shift < 8; shift += 2) { + // 2 blocks + for (var k: u32 = 0; k < 32; k += 16) { + let sc = get_byte(scale_vals[is / 4], is % 4); + is++; + let dl = d * (f32(sc) - 32.0); + for (var l: u32 = 0u; l < 16u; l++) { + let q_idx = q_b_idx + k + l; + let hm_idx = k + l; + let q_byte = get_byte(qs_vals[q_idx / 4], q_idx % 4); + let hmask_byte = get_byte(hmask_vals[hm_idx / 4], hm_idx % 4); + let hm = select(4.0, 0.0, (hmask_byte & m) != 0); + let qs_val = (q_byte >> shift) & 3; + sum += ((f32(qs_val) - hm) * dl) * src1[src1_i]; + src1_i++; + } + } + m <<= 1; + } + } + return sum; +} + +#enddecl(Q3_K) + +#decl(Q45_K_SCALE_MIN) + +fn get_scale_min(is: u32, scales: array) -> vec2 { + if (is < 4) { + let sc_byte = get_byte(scales[is / 4], is % 4); + let min_byte = get_byte(scales[(is + 4) / 4], is % 4); + return vec2(f32(sc_byte & 63), f32(min_byte & 63)); + } else { + let sc_min_lo = get_byte(scales[(is + 4) / 4], (is + 4) % 4); + let sc_hi = get_byte(scales[(is - 4) / 4], (is - 4) % 4); + let min_hi = get_byte(scales[is / 4], is % 4); + let sc = (sc_min_lo & 0xF) | ((sc_hi >> 6) << 4); + let m = (sc_min_lo >> 4) | ((min_hi >> 6) << 4); + return vec2(f32(sc), f32(m)); + } +} + +#enddecl(Q45_K_SCALE_MIN) + +#decl(Q4_K) +// 8 blocks of 32 elements each +struct q4_k { + d: f16, + dmin: f16, + scales: array, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + let m = f32(block.dmin); + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var is: u32 = 0; + // 2 blocks each iteration + for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) { + for (var shift: u32 = 0; shift < 8; shift += 4) { + let scale_min = get_scale_min(is, block.scales); + is++; + let dl = d * scale_min.x; + let ml = m * scale_min.y; + for (var l: u32 = 0; l < 32; l++) { + let q_idx = q_b_idx + l; + let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); + let qs_val = (q_byte >> shift) & 0xF; + sum += (f32(qs_val) * dl - ml) * src1[src1_i]; + src1_i++; + } + } + } + return sum; +} + +#enddecl(Q4_K) + +#decl(Q5_K) +// 8 blocks of 32 elements each +struct q5_k { + d: f16, + dmin: f16, + scales: array, + qh: array, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base 
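get_scale_min above unpacks the 12-byte K-quant scale block: eight 6-bit scales and eight 6-bit mins, with the high two bits of entries 4..7 spilled into the upper bits of earlier bytes. The same logic in scalar C++ (a sketch mirroring the WGSL):

    #include <cstdint>

    static void get_scale_min_k4(uint32_t j, const uint8_t q[12], uint8_t * sc, uint8_t * m) {
        if (j < 4) {
            *sc = q[j]     & 63;
            *m  = q[j + 4] & 63;
        } else {
            *sc = (q[j + 4] & 0x0F) | ((q[j - 4] >> 6) << 4);
            *m  = (q[j + 4] >>   4) | ((q[j    ] >> 6) << 4);
        }
    }
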
+ offset]; + let d = f32(block.d); + let m = f32(block.dmin); + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var is: u32 = 0; + var u: u32 = 1; + // 2 blocks each iteration + for (var q_b_idx: u32 = 0; q_b_idx < 128; q_b_idx += 32) { + for (var shift: u32 = 0; shift < 8; shift += 4) { + let scale_min = get_scale_min(is, block.scales); + is++; + let dl = d * scale_min.x; + let ml = m * scale_min.y; + for (var l: u32 = 0; l < 32; l++) { + let q_idx = q_b_idx + l; + let q_byte = get_byte(block.qs[q_idx / 4], q_idx % 4); + let qh_byte = get_byte(block.qh[l / 4], l % 4); + let qs_val = (q_byte >> shift) & 0xF; + let qh_val = select(0.0, 16.0, (qh_byte & u) != 0); + sum += ((f32(qs_val) + qh_val) * dl - ml) * src1[src1_i]; + src1_i++; + } + u <<= 1; + } + } + return sum; +} + +#enddecl(Q5_K) + +#decl(Q6_K) +// 16 blocks of 16 elements each +struct q6_k { + ql: array, + qh: array, + scales: array, + d: f16 +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + + // convert arrays of f16 -> u32 + var ql_vals: array; + for (var i: u32 = 0; i < 32; i++) { + ql_vals[i] = bitcast(vec2(block.ql[2 * i], block.ql[2 * i + 1])); + } + var qh_vals: array; + for (var i: u32 = 0; i < 16; i++) { + qh_vals[i] = bitcast(vec2(block.qh[2 * i], block.qh[2 * i + 1])); + } + var scale_vals: array; + for (var i: u32 = 0; i < 4; i++) { + scale_vals[i] = bitcast(vec2(block.scales[2 * i], block.scales[2 * i + 1])); + } + + var sum = 0.0; + var src1_i = src1_idx_base + offset * 256; + var qh_b_idx: u32 = 0; + var sc_b_idx: u32 = 0; + for (var ql_b_idx: u32 = 0; ql_b_idx < 128; ql_b_idx += 64) { + for (var l: u32 = 0; l < 32; l++) { + let ql13_b = get_byte(ql_vals[(ql_b_idx + l) / 4], (ql_b_idx + l) % 4); + let ql24_b = get_byte(ql_vals[(ql_b_idx + l + 32) / 4], (ql_b_idx + l + 32) % 4); + let qh_b = get_byte(qh_vals[(qh_b_idx + l) / 4], (qh_b_idx + l) % 4); + + let q1 = f32((ql13_b & 0xF) | ((qh_b & 3) << 4)) - 32.0; + let q2 = f32((ql24_b & 0xF) | (((qh_b >> 2) & 3) << 4)) - 32.0; + let q3 = f32((ql13_b >> 4) | (((qh_b >> 4) & 3) << 4)) - 32.0; + let q4 = f32((ql24_b >> 4) | (((qh_b >> 6) & 3) << 4)) - 32.0; + + let is = l/16; + let is1 = sc_b_idx + is; + let sc1 = get_byte_i32(scale_vals[is1 / 4], is1 % 4); + let is2 = sc_b_idx + is + 2; + let sc2 = get_byte_i32(scale_vals[is2 / 4], is2 % 4); + let is3 = sc_b_idx + is + 4; + let sc3 = get_byte_i32(scale_vals[is3 / 4], is3 % 4); + let is4 = sc_b_idx + is + 6; + let sc4 = get_byte_i32(scale_vals[is4 / 4], is4 % 4); + + sum += d * f32(sc1) * q1 * src1[src1_i + l]; + sum += d * f32(sc2) * q2 * src1[src1_i + l + 32]; + sum += d * f32(sc3) * q3 * src1[src1_i + l + 64]; + sum += d * f32(sc4) * q4 * src1[src1_i + l + 96]; + } + src1_i += 128; + qh_b_idx += 32; + sc_b_idx += 8; + } + return sum; +} + +#enddecl(Q6_K) + +#decl(IQ23_TABLES) +const kmask_iq2xs : array = array( + 0x08040201u, // 1, 2, 4, 8 + 0x80402010u // 16, 32, 64, 128 +); + +const ksigns_iq2xs: array = array( + 0x03828100,0x87060584,0x8b0a0988,0x0f8e8d0c, + 0x93121190,0x17969514,0x1b9a9918,0x9f1e1d9c, + 0xa32221a0,0x27a6a524,0x2baaa928,0xaf2e2dac, + 0x33b2b130,0xb73635b4,0xbb3a39b8,0x3fbebd3c, + 0xc34241c0,0x47c6c544,0x4bcac948,0xcf4e4dcc, + 0x53d2d150,0xd75655d4,0xdb5a59d8,0x5fdedd5c, + 0x63e2e160,0xe76665e4,0xeb6a69e8,0x6feeed6c, + 0xf37271f0,0x77f6f574,0x7bfaf978,0xff7e7dfc +); +#enddecl(IQ23_TABLES) + +#decl(IQ2_XXS) + +const iq2xxs_grid = array( + 0x08080808, 0x08080808, 0x0808082b, 
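For the q6_k decode above: each 128-element half interleaves four quadrants, combining low/high nibbles of ql with 2-bit segments of qh and centering by subtracting 32. One element, in C++ (sketch):

    #include <cstdint>

    // element l (0..31) of quadrant n (0..3) within one 128-element half
    static int decode_q6_k(const uint8_t * ql, const uint8_t * qh, int l, int n) {
        const uint8_t lo = (n < 2) ? (ql[l + 32 * n] & 0x0F)      // quadrants 0,1: low nibbles
                                   : (ql[l + 32 * (n - 2)] >> 4); // quadrants 2,3: high nibbles
        const uint8_t hi = (qh[l] >> (2 * n)) & 3;                // two high bits from qh
        return int(lo | (hi << 4)) - 32;
    }
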
0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808, + 0x08082b2b, 0x08080808, 0x08190819, 0x08080808, 0x08191908, 0x08080808, 0x082b0808, 0x08080808, + 0x082b082b, 0x08080808, 0x082b2b08, 0x08080808, 0x082b2b2b, 0x08080808, 0x19080819, 0x08080808, + 0x19081908, 0x08080808, 0x19190808, 0x08080808, 0x19192b08, 0x08080808, 0x192b0819, 0x08080808, + 0x192b1908, 0x08080808, 0x2b080808, 0x08080808, 0x2b08082b, 0x08080808, 0x2b082b2b, 0x08080808, + 0x2b2b082b, 0x08080808, 0x08080819, 0x08080819, 0x08081908, 0x08080819, 0x08190808, 0x08080819, + 0x08191919, 0x08080819, 0x19080808, 0x08080819, 0x2b081908, 0x08080819, 0x2b192b08, 0x08080819, + 0x08080808, 0x0808082b, 0x0808082b, 0x0808082b, 0x082b082b, 0x0808082b, 0x2b08082b, 0x0808082b, + 0x08080819, 0x08081908, 0x08081908, 0x08081908, 0x08190808, 0x08081908, 0x082b0819, 0x08081908, + 0x082b1908, 0x08081908, 0x19080808, 0x08081908, 0x1908082b, 0x08081908, 0x19082b08, 0x08081908, + 0x192b0808, 0x08081908, 0x2b080819, 0x08081908, 0x2b081908, 0x08081908, 0x2b190808, 0x08081908, + 0x2b2b1908, 0x08081908, 0x08080808, 0x08081919, 0x0808082b, 0x08081919, 0x08082b08, 0x08081919, + 0x082b0808, 0x08081919, 0x1908192b, 0x08081919, 0x192b2b19, 0x08081919, 0x2b080808, 0x08081919, + 0x2b190819, 0x08081919, 0x08082b19, 0x0808192b, 0x08190808, 0x0808192b, 0x19080808, 0x0808192b, + 0x2b081908, 0x0808192b, 0x2b2b1908, 0x0808192b, 0x08080808, 0x08082b08, 0x08081919, 0x08082b08, + 0x08082b08, 0x08082b08, 0x08191908, 0x08082b08, 0x082b2b08, 0x08082b08, 0x19080819, 0x08082b08, + 0x19081908, 0x08082b08, 0x19190808, 0x08082b08, 0x1919082b, 0x08082b08, 0x2b082b08, 0x08082b08, + 0x08081908, 0x08082b19, 0x19080808, 0x08082b19, 0x0808082b, 0x08082b2b, 0x08191908, 0x08082b2b, + 0x08080819, 0x08190808, 0x08081908, 0x08190808, 0x08190808, 0x08190808, 0x082b0819, 0x08190808, + 0x19080808, 0x08190808, 0x192b0808, 0x08190808, 0x2b081908, 0x08190808, 0x2b190808, 0x08190808, + 0x2b191919, 0x08190808, 0x08080808, 0x08190819, 0x08082b08, 0x08190819, 0x082b0808, 0x08190819, + 0x19190808, 0x08190819, 0x19192b2b, 0x08190819, 0x2b080808, 0x08190819, 0x082b1908, 0x0819082b, + 0x19081919, 0x0819082b, 0x08080808, 0x08191908, 0x08082b08, 0x08191908, 0x082b0808, 0x08191908, + 0x082b1919, 0x08191908, 0x19082b19, 0x08191908, 0x2b080808, 0x08191908, 0x08192b08, 0x08191919, + 0x192b082b, 0x08191919, 0x08080808, 0x0819192b, 0x0819192b, 0x0819192b, 0x08080819, 0x08192b08, + 0x08081908, 0x08192b08, 0x08190808, 0x08192b08, 0x19080808, 0x08192b08, 0x2b080819, 0x08192b08, + 0x08080808, 0x08192b19, 0x08081919, 0x08192b19, 0x2b2b0808, 0x08192b19, 0x19190819, 0x08192b2b, + 0x08080808, 0x082b0808, 0x0808082b, 0x082b0808, 0x08082b2b, 0x082b0808, 0x19081908, 0x082b0808, + 0x192b0819, 0x082b0808, 0x2b080808, 0x082b0808, 0x2b08082b, 0x082b0808, 0x082b2b19, 0x082b0819, + 0x19082b08, 0x082b0819, 0x08080808, 0x082b082b, 0x0808082b, 0x082b082b, 0x08080819, 0x082b1908, + 0x08081908, 0x082b1908, 0x08190808, 0x082b1908, 0x19080808, 0x082b1908, 0x1919192b, 0x082b1908, + 0x08080808, 0x082b1919, 0x19080819, 0x082b1919, 0x192b1908, 0x082b1919, 0x2b190808, 0x082b192b, + 0x08082b08, 0x082b2b08, 0x082b0808, 0x082b2b08, 0x2b191908, 0x082b2b08, 0x19081908, 0x082b2b2b, + 0x08080819, 0x19080808, 0x08081908, 0x19080808, 0x08190808, 0x19080808, 0x08192b08, 0x19080808, + 0x082b0819, 0x19080808, 0x082b1908, 0x19080808, 0x19080808, 0x19080808, 0x19082b08, 0x19080808, + 0x1919192b, 0x19080808, 0x192b0808, 0x19080808, 0x2b080819, 0x19080808, 0x2b081908, 0x19080808, + 0x2b190808, 0x19080808, 0x08080808, 0x19080819, 0x082b0808, 
0x19080819, 0x192b0819, 0x19080819, + 0x2b080808, 0x19080819, 0x2b081919, 0x19080819, 0x08080819, 0x1908082b, 0x08190808, 0x1908082b, + 0x19082b08, 0x1908082b, 0x1919192b, 0x1908082b, 0x192b2b08, 0x1908082b, 0x08080808, 0x19081908, + 0x08082b08, 0x19081908, 0x082b0808, 0x19081908, 0x2b080808, 0x19081908, 0x2b192b19, 0x19081908, + 0x0819082b, 0x19081919, 0x082b1908, 0x19081919, 0x08080808, 0x1908192b, 0x08080819, 0x19082b08, + 0x08081908, 0x19082b08, 0x08190808, 0x19082b08, 0x19080808, 0x19082b08, 0x19081919, 0x19082b08, + 0x08080808, 0x19082b19, 0x19192b08, 0x19082b19, 0x192b0819, 0x19082b19, 0x2b08082b, 0x19082b19, + 0x19081919, 0x19082b2b, 0x2b190808, 0x19082b2b, 0x08080808, 0x19190808, 0x08082b08, 0x19190808, + 0x08190819, 0x19190808, 0x08192b19, 0x19190808, 0x082b0808, 0x19190808, 0x2b080808, 0x19190808, + 0x2b082b08, 0x19190808, 0x08081908, 0x19190819, 0x1908082b, 0x19190819, 0x2b2b1908, 0x19190819, + 0x2b190819, 0x1919082b, 0x2b190808, 0x19191908, 0x2b19082b, 0x19191908, 0x08082b2b, 0x19191919, + 0x08080819, 0x1919192b, 0x19191908, 0x1919192b, 0x08080808, 0x19192b08, 0x08190819, 0x19192b08, + 0x08192b19, 0x19192b08, 0x192b1908, 0x19192b08, 0x19080808, 0x19192b19, 0x08082b08, 0x19192b2b, + 0x08081908, 0x192b0808, 0x08190808, 0x192b0808, 0x19080808, 0x192b0808, 0x192b2b08, 0x192b0808, + 0x08080808, 0x192b0819, 0x19191919, 0x192b0819, 0x08192b08, 0x192b082b, 0x192b0808, 0x192b082b, + 0x08080808, 0x192b1908, 0x08081919, 0x192b1908, 0x08190808, 0x192b1919, 0x0819082b, 0x192b1919, + 0x2b081908, 0x192b1919, 0x1908082b, 0x192b2b08, 0x08080808, 0x2b080808, 0x0808082b, 0x2b080808, + 0x08082b2b, 0x2b080808, 0x19080819, 0x2b080808, 0x2b08082b, 0x2b080808, 0x08081908, 0x2b080819, + 0x08192b08, 0x2b080819, 0x19080808, 0x2b080819, 0x08190819, 0x2b08082b, 0x08080819, 0x2b081908, + 0x08081908, 0x2b081908, 0x08190808, 0x2b081908, 0x08191919, 0x2b081908, 0x19080808, 0x2b081908, + 0x192b0808, 0x2b081908, 0x08080808, 0x2b081919, 0x1908192b, 0x2b081919, 0x2b191908, 0x2b081919, + 0x08082b19, 0x2b08192b, 0x19080808, 0x2b08192b, 0x192b0808, 0x2b08192b, 0x0808082b, 0x2b082b08, + 0x08081908, 0x2b082b19, 0x08190819, 0x2b082b2b, 0x08081908, 0x2b190808, 0x08190808, 0x2b190808, + 0x082b1908, 0x2b190808, 0x19080808, 0x2b190808, 0x2b2b0819, 0x2b190808, 0x0819192b, 0x2b190819, + 0x2b080808, 0x2b190819, 0x19081919, 0x2b19082b, 0x08080808, 0x2b191908, 0x082b082b, 0x2b191908, + 0x19081908, 0x2b191908, 0x19190819, 0x2b191919, 0x2b080819, 0x2b192b08, 0x082b0808, 0x2b192b19, + 0x0808082b, 0x2b2b0808, 0x19190808, 0x2b2b0808, 0x2b081919, 0x2b2b0808, 0x08082b19, 0x2b2b0819, + 0x08080808, 0x2b2b082b, 0x08192b08, 0x2b2b1908, 0x19190808, 0x2b2b2b08, 0x08081908, 0x2b2b2b19 +); + +struct iq2_xxs { + d: f16, + qs: array +}; + +fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 { + let block = src0[src0_idx_base + offset]; + let d = f32(block.d); + var src1_i = src1_idx_base + offset * 256; + var sum = 0.0; + for (var ib: u32 = 0; ib < 32; ib += 4) { + let aux0 = bitcast(vec2(block.qs[ib], block.qs[ib + 1])); + let aux1 = bitcast(vec2(block.qs[ib + 2], block.qs[ib + 3])); + let db = d * (0.5 + f32(aux1 >> 28)) * 0.25; + for (var l: u32 = 0; l < 4; l++) { + let ig = get_byte(aux0, l) * 8; + let is = (aux1 >> (7 * l)) & 127; + let signs = get_byte(ksigns_iq2xs[is / 4], is % 4); + for (var j: u32 = 0; j < 8; j++) { + let g = get_byte(iq2xxs_grid[(ig + j) / 4], (ig + j) % 4); + let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0); + sum += db * f32(g) * m * src1[src1_i]; + src1_i++; + } 
+ } + } + return sum; +} + +#enddecl(IQ2_XXS) + +#decl(IQ2_XS) +const iq2xs_grid = array( + 0x08080808, 0x08080808, 0x0808082b, 0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808, + 0x08082b2b, 0x08080808, 0x08190819, 0x08080808, 0x08191908, 0x08080808, 0x0819192b, 0x08080808, + 0x08192b19, 0x08080808, 0x082b0808, 0x08080808, 0x082b082b, 0x08080808, 0x082b1919, 0x08080808, + 0x082b2b08, 0x08080808, 0x19080819, 0x08080808, 0x19081908, 0x08080808, 0x1908192b, 0x08080808, + 0x19082b19, 0x08080808, 0x19190808, 0x08080808, 0x1919082b, 0x08080808, 0x19191919, 0x08080808, + 0x19192b08, 0x08080808, 0x192b0819, 0x08080808, 0x192b1908, 0x08080808, 0x2b080808, 0x08080808, + 0x2b08082b, 0x08080808, 0x2b081919, 0x08080808, 0x2b082b08, 0x08080808, 0x2b190819, 0x08080808, + 0x2b191908, 0x08080808, 0x2b192b19, 0x08080808, 0x2b2b0808, 0x08080808, 0x08080819, 0x08080819, + 0x08081908, 0x08080819, 0x0808192b, 0x08080819, 0x08082b19, 0x08080819, 0x08190808, 0x08080819, + 0x0819082b, 0x08080819, 0x08191919, 0x08080819, 0x08192b08, 0x08080819, 0x08192b2b, 0x08080819, + 0x082b0819, 0x08080819, 0x082b1908, 0x08080819, 0x19080808, 0x08080819, 0x1908082b, 0x08080819, + 0x19081919, 0x08080819, 0x19082b08, 0x08080819, 0x19190819, 0x08080819, 0x19191908, 0x08080819, + 0x192b0808, 0x08080819, 0x192b2b08, 0x08080819, 0x2b080819, 0x08080819, 0x2b081908, 0x08080819, + 0x2b190808, 0x08080819, 0x08080808, 0x0808082b, 0x0808082b, 0x0808082b, 0x08081919, 0x0808082b, + 0x08082b08, 0x0808082b, 0x08190819, 0x0808082b, 0x08191908, 0x0808082b, 0x082b0808, 0x0808082b, + 0x19080819, 0x0808082b, 0x19081908, 0x0808082b, 0x19190808, 0x0808082b, 0x19191919, 0x0808082b, + 0x2b080808, 0x0808082b, 0x2b082b2b, 0x0808082b, 0x08080819, 0x08081908, 0x08081908, 0x08081908, + 0x0808192b, 0x08081908, 0x08082b19, 0x08081908, 0x08190808, 0x08081908, 0x0819082b, 0x08081908, + 0x08191919, 0x08081908, 0x08192b08, 0x08081908, 0x082b0819, 0x08081908, 0x082b1908, 0x08081908, + 0x19080808, 0x08081908, 0x1908082b, 0x08081908, 0x19081919, 0x08081908, 0x19082b08, 0x08081908, + 0x19190819, 0x08081908, 0x19191908, 0x08081908, 0x1919192b, 0x08081908, 0x192b0808, 0x08081908, + 0x2b080819, 0x08081908, 0x2b081908, 0x08081908, 0x2b190808, 0x08081908, 0x08080808, 0x08081919, + 0x0808082b, 0x08081919, 0x08081919, 0x08081919, 0x08082b08, 0x08081919, 0x08190819, 0x08081919, + 0x08191908, 0x08081919, 0x082b0808, 0x08081919, 0x19080819, 0x08081919, 0x19081908, 0x08081919, + 0x19190808, 0x08081919, 0x192b0819, 0x08081919, 0x2b080808, 0x08081919, 0x08080819, 0x0808192b, + 0x08081908, 0x0808192b, 0x08190808, 0x0808192b, 0x082b192b, 0x0808192b, 0x19080808, 0x0808192b, + 0x1908082b, 0x0808192b, 0x2b081908, 0x0808192b, 0x08080808, 0x08082b08, 0x0808082b, 0x08082b08, + 0x08081919, 0x08082b08, 0x08082b08, 0x08082b08, 0x08082b2b, 0x08082b08, 0x08190819, 0x08082b08, + 0x08191908, 0x08082b08, 0x082b0808, 0x08082b08, 0x082b1919, 0x08082b08, 0x19080819, 0x08082b08, + 0x19081908, 0x08082b08, 0x19190808, 0x08082b08, 0x19192b08, 0x08082b08, 0x2b080808, 0x08082b08, + 0x2b2b0808, 0x08082b08, 0x2b2b2b2b, 0x08082b08, 0x08080819, 0x08082b19, 0x08081908, 0x08082b19, + 0x08190808, 0x08082b19, 0x19080808, 0x08082b19, 0x2b080819, 0x08082b19, 0x2b082b19, 0x08082b19, + 0x08080808, 0x08082b2b, 0x082b0808, 0x08082b2b, 0x082b2b08, 0x08082b2b, 0x2b19192b, 0x08082b2b, + 0x2b2b0808, 0x08082b2b, 0x08080819, 0x08190808, 0x08081908, 0x08190808, 0x0808192b, 0x08190808, + 0x08082b19, 0x08190808, 0x08190808, 0x08190808, 0x0819082b, 0x08190808, 0x08191919, 0x08190808, + 0x08192b08, 0x08190808, 
0x082b0819, 0x08190808, 0x082b1908, 0x08190808, 0x19080808, 0x08190808, + 0x1908082b, 0x08190808, 0x19081919, 0x08190808, 0x19082b08, 0x08190808, 0x19190819, 0x08190808, + 0x19191908, 0x08190808, 0x192b0808, 0x08190808, 0x192b2b2b, 0x08190808, 0x2b080819, 0x08190808, + 0x2b081908, 0x08190808, 0x2b190808, 0x08190808, 0x08080808, 0x08190819, 0x0808082b, 0x08190819, + 0x08081919, 0x08190819, 0x08082b08, 0x08190819, 0x08190819, 0x08190819, 0x08191908, 0x08190819, + 0x082b0808, 0x08190819, 0x19080819, 0x08190819, 0x19081908, 0x08190819, 0x19190808, 0x08190819, + 0x2b080808, 0x08190819, 0x2b191908, 0x08190819, 0x2b19192b, 0x08190819, 0x08080819, 0x0819082b, + 0x08081908, 0x0819082b, 0x0808192b, 0x0819082b, 0x08190808, 0x0819082b, 0x19080808, 0x0819082b, + 0x192b0808, 0x0819082b, 0x08080808, 0x08191908, 0x0808082b, 0x08191908, 0x08081919, 0x08191908, + 0x08082b08, 0x08191908, 0x08190819, 0x08191908, 0x08191908, 0x08191908, 0x082b0808, 0x08191908, + 0x19080819, 0x08191908, 0x19081908, 0x08191908, 0x19082b19, 0x08191908, 0x19190808, 0x08191908, + 0x192b1908, 0x08191908, 0x2b080808, 0x08191908, 0x08080819, 0x08191919, 0x08081908, 0x08191919, + 0x08190808, 0x08191919, 0x19080808, 0x08191919, 0x08080808, 0x0819192b, 0x08191908, 0x0819192b, + 0x19082b19, 0x0819192b, 0x08080819, 0x08192b08, 0x08081908, 0x08192b08, 0x08190808, 0x08192b08, + 0x0819082b, 0x08192b08, 0x19080808, 0x08192b08, 0x19191908, 0x08192b08, 0x2b08192b, 0x08192b08, + 0x08080808, 0x08192b19, 0x08081919, 0x08192b19, 0x192b192b, 0x08192b19, 0x19190819, 0x08192b2b, + 0x2b2b2b19, 0x08192b2b, 0x08080808, 0x082b0808, 0x0808082b, 0x082b0808, 0x08081919, 0x082b0808, + 0x08082b08, 0x082b0808, 0x08082b2b, 0x082b0808, 0x08190819, 0x082b0808, 0x08191908, 0x082b0808, + 0x082b0808, 0x082b0808, 0x19080819, 0x082b0808, 0x19081908, 0x082b0808, 0x19190808, 0x082b0808, + 0x2b080808, 0x082b0808, 0x2b2b0808, 0x082b0808, 0x08080819, 0x082b0819, 0x08081908, 0x082b0819, + 0x08190808, 0x082b0819, 0x19080808, 0x082b0819, 0x19082b08, 0x082b0819, 0x192b1919, 0x082b0819, + 0x08080808, 0x082b082b, 0x082b082b, 0x082b082b, 0x2b080808, 0x082b082b, 0x2b2b2b08, 0x082b082b, + 0x08080819, 0x082b1908, 0x08081908, 0x082b1908, 0x08190808, 0x082b1908, 0x082b2b19, 0x082b1908, + 0x19080808, 0x082b1908, 0x08080808, 0x082b1919, 0x19080819, 0x082b1919, 0x1919082b, 0x082b1919, + 0x2b192b19, 0x082b1919, 0x08080819, 0x082b192b, 0x08192b2b, 0x082b192b, 0x2b2b192b, 0x082b192b, + 0x08080808, 0x082b2b08, 0x08082b08, 0x082b2b08, 0x08082b2b, 0x082b2b08, 0x082b0808, 0x082b2b08, + 0x19191919, 0x082b2b08, 0x2b082b08, 0x082b2b08, 0x2b2b082b, 0x082b2b08, 0x192b2b08, 0x082b2b19, + 0x2b190808, 0x082b2b19, 0x08082b08, 0x082b2b2b, 0x082b0808, 0x082b2b2b, 0x2b08082b, 0x082b2b2b, + 0x2b082b08, 0x082b2b2b, 0x2b082b2b, 0x082b2b2b, 0x08080819, 0x19080808, 0x08081908, 0x19080808, + 0x0808192b, 0x19080808, 0x08082b19, 0x19080808, 0x08190808, 0x19080808, 0x0819082b, 0x19080808, + 0x08191919, 0x19080808, 0x08192b08, 0x19080808, 0x082b0819, 0x19080808, 0x082b1908, 0x19080808, + 0x19080808, 0x19080808, 0x1908082b, 0x19080808, 0x19081919, 0x19080808, 0x19082b08, 0x19080808, + 0x19082b2b, 0x19080808, 0x19190819, 0x19080808, 0x19191908, 0x19080808, 0x192b0808, 0x19080808, + 0x192b1919, 0x19080808, 0x2b080819, 0x19080808, 0x2b081908, 0x19080808, 0x2b190808, 0x19080808, + 0x08080808, 0x19080819, 0x0808082b, 0x19080819, 0x08081919, 0x19080819, 0x08082b08, 0x19080819, + 0x08190819, 0x19080819, 0x08191908, 0x19080819, 0x082b0808, 0x19080819, 0x19080819, 0x19080819, + 0x19081908, 0x19080819, 0x19190808, 0x19080819, 
0x2b080808, 0x19080819, 0x2b081919, 0x19080819, + 0x2b2b082b, 0x19080819, 0x08080819, 0x1908082b, 0x08081908, 0x1908082b, 0x08190808, 0x1908082b, + 0x0819082b, 0x1908082b, 0x082b2b19, 0x1908082b, 0x19080808, 0x1908082b, 0x08080808, 0x19081908, + 0x0808082b, 0x19081908, 0x08081919, 0x19081908, 0x08082b08, 0x19081908, 0x08190819, 0x19081908, + 0x08191908, 0x19081908, 0x08192b19, 0x19081908, 0x082b0808, 0x19081908, 0x19080819, 0x19081908, + 0x19081908, 0x19081908, 0x19190808, 0x19081908, 0x2b080808, 0x19081908, 0x2b191908, 0x19081908, + 0x08080819, 0x19081919, 0x08081908, 0x19081919, 0x08190808, 0x19081919, 0x082b1908, 0x19081919, + 0x19080808, 0x19081919, 0x2b192b2b, 0x19081919, 0x08080808, 0x1908192b, 0x08082b2b, 0x1908192b, + 0x19081908, 0x1908192b, 0x19190808, 0x1908192b, 0x08080819, 0x19082b08, 0x08081908, 0x19082b08, + 0x08190808, 0x19082b08, 0x19080808, 0x19082b08, 0x19081919, 0x19082b08, 0x19191908, 0x19082b08, + 0x192b082b, 0x19082b08, 0x08080808, 0x19082b19, 0x08190819, 0x19082b19, 0x19081908, 0x19082b19, + 0x19190808, 0x19082b19, 0x192b2b19, 0x19082b19, 0x08081908, 0x19082b2b, 0x08080808, 0x19190808, + 0x0808082b, 0x19190808, 0x08081919, 0x19190808, 0x08082b08, 0x19190808, 0x08190819, 0x19190808, + 0x08191908, 0x19190808, 0x082b0808, 0x19190808, 0x082b2b08, 0x19190808, 0x19080819, 0x19190808, + 0x19081908, 0x19190808, 0x19190808, 0x19190808, 0x2b080808, 0x19190808, 0x08080819, 0x19190819, + 0x08081908, 0x19190819, 0x08190808, 0x19190819, 0x08191919, 0x19190819, 0x19080808, 0x19190819, + 0x1908082b, 0x19190819, 0x08080808, 0x1919082b, 0x19081908, 0x1919082b, 0x2b2b2b2b, 0x1919082b, + 0x08080819, 0x19191908, 0x08081908, 0x19191908, 0x08190808, 0x19191908, 0x082b0819, 0x19191908, + 0x19080808, 0x19191908, 0x192b0808, 0x19191908, 0x2b080819, 0x19191908, 0x2b2b0819, 0x19191908, + 0x08080808, 0x19191919, 0x08082b08, 0x19191919, 0x2b080808, 0x19191919, 0x2b082b08, 0x19191919, + 0x082b0819, 0x1919192b, 0x192b2b08, 0x1919192b, 0x2b2b0819, 0x1919192b, 0x08080808, 0x19192b08, + 0x08191908, 0x19192b08, 0x19080819, 0x19192b08, 0x19190808, 0x19192b08, 0x2b192b19, 0x19192b08, + 0x08192b2b, 0x19192b19, 0x19080808, 0x19192b19, 0x1908082b, 0x19192b19, 0x2b081919, 0x19192b2b, + 0x08080819, 0x192b0808, 0x08081908, 0x192b0808, 0x08190808, 0x192b0808, 0x19080808, 0x192b0808, + 0x19191908, 0x192b0808, 0x192b082b, 0x192b0808, 0x2b08192b, 0x192b0808, 0x2b2b2b19, 0x192b0808, + 0x08080808, 0x192b0819, 0x082b1908, 0x192b082b, 0x19082b2b, 0x192b082b, 0x2b19082b, 0x192b082b, + 0x08080808, 0x192b1908, 0x0819192b, 0x192b1908, 0x08190808, 0x192b1919, 0x19080808, 0x192b1919, + 0x19081919, 0x192b1919, 0x2b2b1908, 0x192b1919, 0x08080819, 0x192b2b08, 0x192b2b2b, 0x192b2b08, + 0x082b1919, 0x192b2b19, 0x0808192b, 0x192b2b2b, 0x19191908, 0x192b2b2b, 0x192b082b, 0x192b2b2b, + 0x08080808, 0x2b080808, 0x0808082b, 0x2b080808, 0x08081919, 0x2b080808, 0x08082b08, 0x2b080808, + 0x08190819, 0x2b080808, 0x08191908, 0x2b080808, 0x082b0808, 0x2b080808, 0x082b2b2b, 0x2b080808, + 0x19080819, 0x2b080808, 0x19081908, 0x2b080808, 0x19190808, 0x2b080808, 0x2b080808, 0x2b080808, + 0x2b08082b, 0x2b080808, 0x2b2b2b08, 0x2b080808, 0x2b2b2b2b, 0x2b080808, 0x08080819, 0x2b080819, + 0x08081908, 0x2b080819, 0x0808192b, 0x2b080819, 0x08190808, 0x2b080819, 0x19080808, 0x2b080819, + 0x19190819, 0x2b080819, 0x19192b19, 0x2b080819, 0x08080808, 0x2b08082b, 0x082b0808, 0x2b08082b, + 0x2b080808, 0x2b08082b, 0x2b08082b, 0x2b08082b, 0x2b2b0808, 0x2b08082b, 0x2b2b2b08, 0x2b08082b, + 0x08080819, 0x2b081908, 0x08081908, 0x2b081908, 0x08190808, 0x2b081908, 
0x0819082b, 0x2b081908,
+    0x08191919, 0x2b081908, 0x19080808, 0x2b081908, 0x192b0808, 0x2b081908, 0x2b082b19, 0x2b081908,
+    0x08080808, 0x2b081919, 0x19081908, 0x2b081919, 0x2b2b1919, 0x2b081919, 0x08192b08, 0x2b08192b,
+    0x192b2b2b, 0x2b08192b, 0x08080808, 0x2b082b08, 0x08082b08, 0x2b082b08, 0x082b1919, 0x2b082b08,
+    0x19192b2b, 0x2b082b08, 0x2b080808, 0x2b082b08, 0x2b08082b, 0x2b082b08, 0x2b2b2b08, 0x2b082b08,
+    0x0808192b, 0x2b082b19, 0x082b082b, 0x2b082b2b, 0x2b080808, 0x2b082b2b, 0x2b082b08, 0x2b082b2b,
+    0x2b19192b, 0x2b082b2b, 0x2b2b2b08, 0x2b082b2b, 0x08080819, 0x2b190808, 0x08081908, 0x2b190808,
+    0x08190808, 0x2b190808, 0x19080808, 0x2b190808, 0x1919192b, 0x2b190808, 0x2b081908, 0x2b190808,
+    0x08080808, 0x2b190819, 0x082b082b, 0x2b190819, 0x192b1908, 0x2b190819, 0x1919192b, 0x2b19082b,
+    0x2b082b19, 0x2b19082b, 0x08080808, 0x2b191908, 0x08081919, 0x2b191908, 0x19081908, 0x2b191908,
+    0x19190808, 0x2b191908, 0x19192b08, 0x2b191908, 0x082b2b19, 0x2b191919, 0x2b190808, 0x2b191919,
+    0x2b19082b, 0x2b191919, 0x19080819, 0x2b19192b, 0x19190819, 0x2b192b08, 0x2b2b192b, 0x2b192b08,
+    0x19082b19, 0x2b192b19, 0x08191919, 0x2b192b2b, 0x192b0808, 0x2b192b2b, 0x08080808, 0x2b2b0808,
+    0x0808082b, 0x2b2b0808, 0x08082b08, 0x2b2b0808, 0x08082b2b, 0x2b2b0808, 0x082b0808, 0x2b2b0808,
+    0x082b2b2b, 0x2b2b0808, 0x2b2b0808, 0x2b2b0808, 0x19190819, 0x2b2b0819, 0x19192b19, 0x2b2b0819,
+    0x2b2b192b, 0x2b2b0819, 0x08080808, 0x2b2b082b, 0x0808082b, 0x2b2b082b, 0x08082b08, 0x2b2b082b,
+    0x082b2b2b, 0x2b2b082b, 0x2b080808, 0x2b2b082b, 0x2b2b0808, 0x2b2b082b, 0x19080808, 0x2b2b1908,
+    0x2b191919, 0x2b2b1908, 0x192b1919, 0x2b2b192b, 0x2b192b08, 0x2b2b192b, 0x08082b2b, 0x2b2b2b08,
+    0x082b0808, 0x2b2b2b08, 0x082b082b, 0x2b2b2b08, 0x082b2b08, 0x2b2b2b08, 0x2b2b0808, 0x2b2b2b08,
+    0x2b2b2b08, 0x2b2b2b08, 0x08081908, 0x2b2b2b19, 0x2b081908, 0x2b2b2b19, 0x2b08192b, 0x2b2b2b19,
+    0x082b2b08, 0x2b2b2b2b, 0x082b2b2b, 0x2b2b2b2b, 0x2b190819, 0x2b2b2b2b, 0x2b2b2b2b, 0x2b2b2b2b
+);
+
+struct iq2_xs {
+    d: f16,
+    qs: array<f16, 32>,     // raw storage for 32 16-bit codes: 9-bit grid index + 7-bit sign index
+    scales: array<f16, 4>   // raw storage for 8 scale bytes, two 4-bit scales each
+};
+
+fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
+    let block = src0[src0_idx_base + offset];
+    let d = f32(block.d);
+    var src1_i = src1_idx_base + offset * 256;
+    var scale_vals = array<u32, 2>(
+        bitcast<u32>(vec2<f16>(block.scales[0], block.scales[1])),
+        bitcast<u32>(vec2<f16>(block.scales[2], block.scales[3]))
+    );
+    var sum = 0.0;
+    for (var ib: u32 = 0; ib < 32; ib += 4) {
+        // one scale byte per 32 values: low nibble scales the first 16, high nibble the next 16
+        let s = get_byte(scale_vals[ib / 16], (ib % 16) / 4);
+        let db = array<f32, 2>(
+            d * (0.5 + f32(s & 0xF)) * 0.25,
+            d * (0.5 + f32(s >> 4)) * 0.25
+        );
+        for (var l: u32 = 0; l < 4; l++) {
+            let qs_val = bitcast<u32>(vec2<f16>(block.qs[ib + l], 0.0));
+            let ig = (qs_val & 511) * 8;
+            let is = qs_val >> 9;
+            let signs = get_byte(ksigns_iq2xs[is / 4], is % 4);
+            let dl = db[l / 2];
+            for (var j: u32 = 0; j < 8; j++) {
+                let g = get_byte(iq2xs_grid[(ig + j) / 4], (ig + j) % 4);
+                let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0);
+                sum += dl * f32(g) * m * src1[src1_i];
+                src1_i++;
+            }
+        }
+    }
+    return sum;
+}
+
+#enddecl(IQ2_XS)
+
+#decl(IQ2_S)
+
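+// IQ2_S is the 2.5625 bit-per-weight i-quant: each 256-value block carries an
+// f16 scale d, one 8-bit grid index per group of 8 values (extended to 10 bits
+// by qh), explicit per-group sign bytes in the second half of qs (unlike
+// IQ2_XXS/IQ2_XS, which index the shared ksigns_iq2xs table), and a 4-bit
+// scale per 16 values. iq2s_grid mirrors the 1024-entry table in
+// ggml-common.h, stored here as two u32 words per 8-byte entry.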
+const iq2s_grid = array(
+    0x08080808, 0x08080808, 0x0808082b, 0x08080808, 0x08081919, 0x08080808, 0x08082b08, 0x08080808,
+    0x08082b2b, 0x08080808, 0x08190819, 0x08080808, 0x08191908, 0x08080808, 0x0819192b, 0x08080808,
+    0x08192b19, 0x08080808, 0x082b0808, 0x08080808, 0x082b082b, 0x08080808, 0x082b1919, 0x08080808,
+    0x082b2b08, 0x08080808, 0x19080819, 0x08080808, 0x19081908, 0x08080808, 0x1908192b, 0x08080808,
+    0x19082b19, 0x08080808, 0x19190808, 0x08080808, 0x1919082b, 0x08080808, 0x19191919, 0x08080808,
+    0x19192b08, 0x08080808, 0x192b0819, 0x08080808, 0x192b1908, 0x08080808, 0x192b192b, 0x08080808,
+    0x192b2b19, 0x08080808, 0x2b080808, 0x08080808, 0x2b08082b, 0x08080808, 0x2b081919, 0x08080808,
+    0x2b082b08, 0x08080808, 0x2b190819, 0x08080808, 0x2b191908, 0x08080808, 0x2b2b0808, 0x08080808,
+    0x2b2b1919, 0x08080808, 0x2b2b2b2b, 0x08080808, 0x08080819, 0x08080819, 0x08081908, 0x08080819,
+    0x0808192b, 0x08080819, 0x08082b19, 0x08080819, 0x08190808, 0x08080819, 0x0819082b, 0x08080819,
+    0x08191919, 0x08080819, 0x08192b08, 0x08080819, 0x082b0819, 0x08080819, 0x082b1908, 0x08080819,
+    0x19080808, 0x08080819, 0x1908082b, 0x08080819, 0x19081919, 0x08080819, 0x19082b08, 0x08080819,
+    0x19190819, 0x08080819, 0x19191908, 0x08080819, 0x1919192b, 0x08080819, 0x19192b19, 0x08080819,
+    0x192b0808, 0x08080819, 0x192b1919, 0x08080819, 0x192b2b08, 0x08080819, 0x2b080819, 0x08080819,
+    0x2b081908, 0x08080819, 0x2b190808, 0x08080819, 0x2b19082b, 0x08080819, 0x2b191919, 0x08080819,
+    0x2b2b0819, 0x08080819, 0x2b2b1908, 0x08080819, 0x08080808, 0x0808082b, 0x0808082b, 0x0808082b,
+    0x08081919, 0x0808082b, 0x08082b08, 0x0808082b, 0x08190819, 0x0808082b, 0x08191908, 0x0808082b,
+    0x082b0808, 0x0808082b, 0x082b2b2b, 0x0808082b, 0x19080819, 0x0808082b, 0x19081908, 0x0808082b,
+    0x1908192b, 0x0808082b, 0x19082b19, 0x0808082b, 0x19190808, 0x0808082b, 0x19191919, 0x0808082b,
+    0x2b080808, 0x0808082b, 0x2b081919, 0x0808082b, 0x2b082b2b, 0x0808082b, 0x2b191908, 0x0808082b,
+    0x2b2b082b, 0x0808082b, 0x08080819, 0x08081908, 0x08081908, 0x08081908, 0x0808192b, 0x08081908,
+    0x08082b19, 0x08081908, 0x08190808, 0x08081908, 0x0819082b, 0x08081908, 0x08191919, 0x08081908,
+    0x08192b08, 0x08081908, 0x082b0819, 0x08081908, 0x082b1908, 0x08081908, 0x082b192b, 0x08081908,
+    0x082b2b19, 0x08081908, 0x19080808, 0x08081908, 0x1908082b, 0x08081908, 0x19081919, 0x08081908,
+    0x19082b08, 0x08081908, 0x19082b2b, 0x08081908, 0x19190819, 0x08081908, 0x19191908, 0x08081908,
+    0x1919192b, 0x08081908, 0x19192b19, 0x08081908, 0x192b0808, 0x08081908, 0x192b082b, 0x08081908,
+    0x192b1919, 0x08081908, 0x2b080819, 0x08081908, 0x2b081908, 0x08081908, 0x2b08192b, 0x08081908,
+    0x2b082b19, 0x08081908, 0x2b190808, 0x08081908, 0x2b191919, 0x08081908, 0x2b192b08, 0x08081908,
+    0x2b2b0819, 0x08081908, 0x2b2b1908, 0x08081908, 0x08080808, 0x08081919, 0x0808082b, 0x08081919,
+    0x08081919, 0x08081919, 0x08082b08, 0x08081919, 0x08082b2b, 0x08081919, 0x08190819, 0x08081919,
+    0x08191908, 0x08081919, 0x0819192b, 0x08081919, 0x08192b19, 0x08081919, 0x082b0808, 0x08081919,
+    0x082b1919, 0x08081919, 0x082b2b08, 0x08081919, 0x19080819, 0x08081919, 0x19081908, 0x08081919,
+    0x1908192b, 0x08081919, 0x19082b19, 0x08081919, 0x19190808, 0x08081919, 0x1919082b, 0x08081919,
+    0x19191919, 0x08081919, 0x19192b08, 0x08081919, 0x192b0819, 0x08081919, 0x192b1908, 0x08081919,
+    0x2b080808, 0x08081919, 0x2b08082b, 0x08081919, 0x2b081919, 0x08081919, 0x2b082b08, 0x08081919,
+    0x2b190819, 0x08081919, 0x2b191908, 0x08081919, 0x2b2b0808, 0x08081919, 0x08080819, 0x0808192b,
+    0x08081908, 0x0808192b, 0x0808192b, 0x0808192b, 0x08082b19, 0x0808192b, 0x08190808, 0x0808192b,
+    0x08191919, 0x0808192b, 0x19080808, 0x0808192b, 0x19081919, 0x0808192b, 0x19082b08, 0x0808192b,
+    0x19190819, 0x0808192b, 0x19191908, 0x0808192b, 0x192b0808, 0x0808192b, 0x2b080819, 0x0808192b,
+    0x2b081908, 0x0808192b, 0x2b190808, 0x0808192b, 0x08080808, 0x08082b08, 0x0808082b, 0x08082b08,
+    0x08081919, 0x08082b08, 0x08082b08, 0x08082b08,
0x08190819, 0x08082b08, 0x08191908, 0x08082b08, + 0x0819192b, 0x08082b08, 0x08192b19, 0x08082b08, 0x082b0808, 0x08082b08, 0x082b1919, 0x08082b08, + 0x082b2b2b, 0x08082b08, 0x19080819, 0x08082b08, 0x19081908, 0x08082b08, 0x1908192b, 0x08082b08, + 0x19082b19, 0x08082b08, 0x19190808, 0x08082b08, 0x1919082b, 0x08082b08, 0x19191919, 0x08082b08, + 0x19192b08, 0x08082b08, 0x192b0819, 0x08082b08, 0x192b1908, 0x08082b08, 0x2b080808, 0x08082b08, + 0x2b081919, 0x08082b08, 0x2b191908, 0x08082b08, 0x2b2b2b2b, 0x08082b08, 0x08080819, 0x08082b19, + 0x08081908, 0x08082b19, 0x08190808, 0x08082b19, 0x0819082b, 0x08082b19, 0x08191919, 0x08082b19, + 0x08192b08, 0x08082b19, 0x082b0819, 0x08082b19, 0x19080808, 0x08082b19, 0x19081919, 0x08082b19, + 0x19082b08, 0x08082b19, 0x19190819, 0x08082b19, 0x19191908, 0x08082b19, 0x192b0808, 0x08082b19, + 0x2b080819, 0x08082b19, 0x2b190808, 0x08082b19, 0x08080808, 0x08082b2b, 0x08190819, 0x08082b2b, + 0x08191908, 0x08082b2b, 0x082b082b, 0x08082b2b, 0x082b2b08, 0x08082b2b, 0x082b2b2b, 0x08082b2b, + 0x19190808, 0x08082b2b, 0x2b192b19, 0x08082b2b, 0x08080819, 0x08190808, 0x08081908, 0x08190808, + 0x0808192b, 0x08190808, 0x08082b19, 0x08190808, 0x08190808, 0x08190808, 0x0819082b, 0x08190808, + 0x08191919, 0x08190808, 0x08192b08, 0x08190808, 0x082b0819, 0x08190808, 0x082b1908, 0x08190808, + 0x082b192b, 0x08190808, 0x19080808, 0x08190808, 0x1908082b, 0x08190808, 0x19081919, 0x08190808, + 0x19082b08, 0x08190808, 0x19190819, 0x08190808, 0x19191908, 0x08190808, 0x1919192b, 0x08190808, + 0x19192b19, 0x08190808, 0x192b0808, 0x08190808, 0x192b082b, 0x08190808, 0x192b1919, 0x08190808, + 0x192b2b08, 0x08190808, 0x2b080819, 0x08190808, 0x2b081908, 0x08190808, 0x2b08192b, 0x08190808, + 0x2b190808, 0x08190808, 0x2b191919, 0x08190808, 0x2b192b08, 0x08190808, 0x2b2b0819, 0x08190808, + 0x2b2b1908, 0x08190808, 0x08080808, 0x08190819, 0x0808082b, 0x08190819, 0x08081919, 0x08190819, + 0x08082b08, 0x08190819, 0x08082b2b, 0x08190819, 0x08190819, 0x08190819, 0x08191908, 0x08190819, + 0x0819192b, 0x08190819, 0x08192b19, 0x08190819, 0x082b0808, 0x08190819, 0x082b082b, 0x08190819, + 0x082b1919, 0x08190819, 0x082b2b08, 0x08190819, 0x19080819, 0x08190819, 0x19081908, 0x08190819, + 0x1908192b, 0x08190819, 0x19082b19, 0x08190819, 0x19190808, 0x08190819, 0x1919082b, 0x08190819, + 0x19191919, 0x08190819, 0x19192b08, 0x08190819, 0x192b0819, 0x08190819, 0x192b1908, 0x08190819, + 0x2b080808, 0x08190819, 0x2b08082b, 0x08190819, 0x2b081919, 0x08190819, 0x2b082b08, 0x08190819, + 0x2b190819, 0x08190819, 0x2b191908, 0x08190819, 0x08080819, 0x0819082b, 0x08081908, 0x0819082b, + 0x08082b19, 0x0819082b, 0x08190808, 0x0819082b, 0x08191919, 0x0819082b, 0x082b0819, 0x0819082b, + 0x082b1908, 0x0819082b, 0x19080808, 0x0819082b, 0x19081919, 0x0819082b, 0x19190819, 0x0819082b, + 0x19191908, 0x0819082b, 0x2b080819, 0x0819082b, 0x2b081908, 0x0819082b, 0x2b190808, 0x0819082b, + 0x08080808, 0x08191908, 0x0808082b, 0x08191908, 0x08081919, 0x08191908, 0x08082b08, 0x08191908, + 0x08190819, 0x08191908, 0x08191908, 0x08191908, 0x0819192b, 0x08191908, 0x08192b19, 0x08191908, + 0x082b0808, 0x08191908, 0x082b1919, 0x08191908, 0x082b2b08, 0x08191908, 0x19080819, 0x08191908, + 0x19081908, 0x08191908, 0x1908192b, 0x08191908, 0x19082b19, 0x08191908, 0x19190808, 0x08191908, + 0x1919082b, 0x08191908, 0x19191919, 0x08191908, 0x19192b08, 0x08191908, 0x192b0819, 0x08191908, + 0x192b1908, 0x08191908, 0x2b080808, 0x08191908, 0x2b08082b, 0x08191908, 0x2b081919, 0x08191908, + 0x2b082b08, 0x08191908, 0x2b190819, 0x08191908, 0x2b191908, 0x08191908, 
0x2b2b0808, 0x08191908, + 0x08080819, 0x08191919, 0x08081908, 0x08191919, 0x0808192b, 0x08191919, 0x08082b19, 0x08191919, + 0x08190808, 0x08191919, 0x0819082b, 0x08191919, 0x08191919, 0x08191919, 0x08192b08, 0x08191919, + 0x082b0819, 0x08191919, 0x082b1908, 0x08191919, 0x19080808, 0x08191919, 0x1908082b, 0x08191919, + 0x19081919, 0x08191919, 0x19082b08, 0x08191919, 0x19190819, 0x08191919, 0x19191908, 0x08191919, + 0x192b0808, 0x08191919, 0x2b080819, 0x08191919, 0x2b081908, 0x08191919, 0x2b190808, 0x08191919, + 0x08080808, 0x0819192b, 0x08081919, 0x0819192b, 0x08082b08, 0x0819192b, 0x08190819, 0x0819192b, + 0x08191908, 0x0819192b, 0x082b0808, 0x0819192b, 0x19080819, 0x0819192b, 0x19081908, 0x0819192b, + 0x19190808, 0x0819192b, 0x2b080808, 0x0819192b, 0x2b2b2b2b, 0x0819192b, 0x08080819, 0x08192b08, + 0x08081908, 0x08192b08, 0x0808192b, 0x08192b08, 0x08082b19, 0x08192b08, 0x08190808, 0x08192b08, + 0x08191919, 0x08192b08, 0x08192b08, 0x08192b08, 0x082b0819, 0x08192b08, 0x19080808, 0x08192b08, + 0x1908082b, 0x08192b08, 0x19081919, 0x08192b08, 0x19082b08, 0x08192b08, 0x19190819, 0x08192b08, + 0x19191908, 0x08192b08, 0x192b0808, 0x08192b08, 0x2b080819, 0x08192b08, 0x2b081908, 0x08192b08, + 0x08080808, 0x08192b19, 0x0808082b, 0x08192b19, 0x08081919, 0x08192b19, 0x08082b08, 0x08192b19, + 0x08190819, 0x08192b19, 0x08191908, 0x08192b19, 0x082b0808, 0x08192b19, 0x19080819, 0x08192b19, + 0x19081908, 0x08192b19, 0x19190808, 0x08192b19, 0x192b2b19, 0x08192b19, 0x2b2b082b, 0x08192b19, + 0x08081908, 0x08192b2b, 0x08190808, 0x08192b2b, 0x19080808, 0x08192b2b, 0x1919192b, 0x08192b2b, + 0x08080808, 0x082b0808, 0x0808082b, 0x082b0808, 0x08081919, 0x082b0808, 0x08082b08, 0x082b0808, + 0x08190819, 0x082b0808, 0x08191908, 0x082b0808, 0x0819192b, 0x082b0808, 0x08192b19, 0x082b0808, + 0x082b0808, 0x082b0808, 0x082b1919, 0x082b0808, 0x082b2b2b, 0x082b0808, 0x19080819, 0x082b0808, + 0x19081908, 0x082b0808, 0x19190808, 0x082b0808, 0x1919082b, 0x082b0808, 0x19191919, 0x082b0808, + 0x192b1908, 0x082b0808, 0x2b080808, 0x082b0808, 0x2b082b2b, 0x082b0808, 0x2b191908, 0x082b0808, + 0x2b2b2b2b, 0x082b0808, 0x08080819, 0x082b0819, 0x08081908, 0x082b0819, 0x08190808, 0x082b0819, + 0x0819082b, 0x082b0819, 0x08191919, 0x082b0819, 0x082b0819, 0x082b0819, 0x19080808, 0x082b0819, + 0x1908082b, 0x082b0819, 0x19081919, 0x082b0819, 0x19190819, 0x082b0819, 0x19191908, 0x082b0819, + 0x192b0808, 0x082b0819, 0x2b080819, 0x082b0819, 0x2b081908, 0x082b0819, 0x2b190808, 0x082b0819, + 0x08080808, 0x082b082b, 0x08082b2b, 0x082b082b, 0x082b082b, 0x082b082b, 0x082b2b08, 0x082b082b, + 0x082b2b2b, 0x082b082b, 0x19081908, 0x082b082b, 0x19190808, 0x082b082b, 0x2b082b08, 0x082b082b, + 0x2b082b2b, 0x082b082b, 0x2b2b2b08, 0x082b082b, 0x08080819, 0x082b1908, 0x08081908, 0x082b1908, + 0x0808192b, 0x082b1908, 0x08082b19, 0x082b1908, 0x08190808, 0x082b1908, 0x08191919, 0x082b1908, + 0x08192b08, 0x082b1908, 0x082b0819, 0x082b1908, 0x082b1908, 0x082b1908, 0x19080808, 0x082b1908, + 0x1908082b, 0x082b1908, 0x19081919, 0x082b1908, 0x19082b08, 0x082b1908, 0x19190819, 0x082b1908, + 0x19191908, 0x082b1908, 0x192b0808, 0x082b1908, 0x2b080819, 0x082b1908, 0x2b081908, 0x082b1908, + 0x2b190808, 0x082b1908, 0x08080808, 0x082b1919, 0x08081919, 0x082b1919, 0x08082b08, 0x082b1919, + 0x08190819, 0x082b1919, 0x08191908, 0x082b1919, 0x082b0808, 0x082b1919, 0x19080819, 0x082b1919, + 0x19081908, 0x082b1919, 0x19190808, 0x082b1919, 0x192b192b, 0x082b1919, 0x2b080808, 0x082b1919, + 0x08080819, 0x082b192b, 0x08081908, 0x082b192b, 0x08190808, 0x082b192b, 0x19080808, 0x082b192b, + 
0x19192b19, 0x082b192b, 0x08080808, 0x082b2b08, 0x08081919, 0x082b2b08, 0x08190819, 0x082b2b08, + 0x08191908, 0x082b2b08, 0x19080819, 0x082b2b08, 0x19081908, 0x082b2b08, 0x19190808, 0x082b2b08, + 0x2b082b2b, 0x082b2b08, 0x2b2b2b2b, 0x082b2b08, 0x08080819, 0x082b2b19, 0x08081908, 0x082b2b19, + 0x08190808, 0x082b2b19, 0x2b191919, 0x082b2b19, 0x08082b2b, 0x082b2b2b, 0x082b082b, 0x082b2b2b, + 0x192b1908, 0x082b2b2b, 0x2b082b08, 0x082b2b2b, 0x2b082b2b, 0x082b2b2b, 0x08080819, 0x19080808, + 0x08081908, 0x19080808, 0x0808192b, 0x19080808, 0x08082b19, 0x19080808, 0x08190808, 0x19080808, + 0x0819082b, 0x19080808, 0x08191919, 0x19080808, 0x08192b08, 0x19080808, 0x08192b2b, 0x19080808, + 0x082b0819, 0x19080808, 0x082b1908, 0x19080808, 0x082b192b, 0x19080808, 0x19080808, 0x19080808, + 0x1908082b, 0x19080808, 0x19081919, 0x19080808, 0x19082b08, 0x19080808, 0x19082b2b, 0x19080808, + 0x19190819, 0x19080808, 0x19191908, 0x19080808, 0x1919192b, 0x19080808, 0x19192b19, 0x19080808, + 0x192b0808, 0x19080808, 0x192b082b, 0x19080808, 0x192b1919, 0x19080808, 0x2b080819, 0x19080808, + 0x2b081908, 0x19080808, 0x2b190808, 0x19080808, 0x2b191919, 0x19080808, 0x2b192b08, 0x19080808, + 0x2b2b0819, 0x19080808, 0x2b2b1908, 0x19080808, 0x08080808, 0x19080819, 0x0808082b, 0x19080819, + 0x08081919, 0x19080819, 0x08082b08, 0x19080819, 0x08190819, 0x19080819, 0x08191908, 0x19080819, + 0x0819192b, 0x19080819, 0x08192b19, 0x19080819, 0x082b0808, 0x19080819, 0x082b082b, 0x19080819, + 0x082b1919, 0x19080819, 0x19080819, 0x19080819, 0x19081908, 0x19080819, 0x1908192b, 0x19080819, + 0x19082b19, 0x19080819, 0x19190808, 0x19080819, 0x1919082b, 0x19080819, 0x19191919, 0x19080819, + 0x19192b08, 0x19080819, 0x192b0819, 0x19080819, 0x192b1908, 0x19080819, 0x2b080808, 0x19080819, + 0x2b08082b, 0x19080819, 0x2b081919, 0x19080819, 0x2b082b08, 0x19080819, 0x2b190819, 0x19080819, + 0x2b191908, 0x19080819, 0x2b2b0808, 0x19080819, 0x08080819, 0x1908082b, 0x08081908, 0x1908082b, + 0x08190808, 0x1908082b, 0x0819082b, 0x1908082b, 0x08191919, 0x1908082b, 0x08192b08, 0x1908082b, + 0x082b1908, 0x1908082b, 0x19080808, 0x1908082b, 0x19081919, 0x1908082b, 0x19082b08, 0x1908082b, + 0x19190819, 0x1908082b, 0x19191908, 0x1908082b, 0x192b0808, 0x1908082b, 0x2b080819, 0x1908082b, + 0x2b081908, 0x1908082b, 0x08080808, 0x19081908, 0x0808082b, 0x19081908, 0x08081919, 0x19081908, + 0x08082b08, 0x19081908, 0x08082b2b, 0x19081908, 0x08190819, 0x19081908, 0x08191908, 0x19081908, + 0x0819192b, 0x19081908, 0x08192b19, 0x19081908, 0x082b0808, 0x19081908, 0x082b082b, 0x19081908, + 0x082b1919, 0x19081908, 0x082b2b08, 0x19081908, 0x19080819, 0x19081908, 0x19081908, 0x19081908, + 0x1908192b, 0x19081908, 0x19082b19, 0x19081908, 0x19190808, 0x19081908, 0x1919082b, 0x19081908, + 0x19191919, 0x19081908, 0x19192b08, 0x19081908, 0x192b0819, 0x19081908, 0x192b1908, 0x19081908, + 0x2b080808, 0x19081908, 0x2b08082b, 0x19081908, 0x2b081919, 0x19081908, 0x2b082b08, 0x19081908, + 0x2b190819, 0x19081908, 0x2b191908, 0x19081908, 0x2b2b0808, 0x19081908, 0x08080819, 0x19081919, + 0x08081908, 0x19081919, 0x0808192b, 0x19081919, 0x08082b19, 0x19081919, 0x08190808, 0x19081919, + 0x0819082b, 0x19081919, 0x08191919, 0x19081919, 0x08192b08, 0x19081919, 0x082b0819, 0x19081919, + 0x082b1908, 0x19081919, 0x19080808, 0x19081919, 0x1908082b, 0x19081919, 0x19081919, 0x19081919, + 0x19082b08, 0x19081919, 0x19190819, 0x19081919, 0x19191908, 0x19081919, 0x192b0808, 0x19081919, + 0x192b2b2b, 0x19081919, 0x2b080819, 0x19081919, 0x2b081908, 0x19081919, 0x2b190808, 0x19081919, + 0x08080808, 0x1908192b, 
0x0808082b, 0x1908192b, 0x08081919, 0x1908192b, 0x08082b08, 0x1908192b, + 0x08190819, 0x1908192b, 0x08191908, 0x1908192b, 0x082b0808, 0x1908192b, 0x19080819, 0x1908192b, + 0x19081908, 0x1908192b, 0x19190808, 0x1908192b, 0x2b080808, 0x1908192b, 0x2b2b1919, 0x1908192b, + 0x08080819, 0x19082b08, 0x08081908, 0x19082b08, 0x08082b19, 0x19082b08, 0x08190808, 0x19082b08, + 0x0819082b, 0x19082b08, 0x08191919, 0x19082b08, 0x08192b08, 0x19082b08, 0x082b0819, 0x19082b08, + 0x082b1908, 0x19082b08, 0x19080808, 0x19082b08, 0x1908082b, 0x19082b08, 0x19081919, 0x19082b08, + 0x19082b08, 0x19082b08, 0x19190819, 0x19082b08, 0x19191908, 0x19082b08, 0x192b0808, 0x19082b08, + 0x2b081908, 0x19082b08, 0x2b190808, 0x19082b08, 0x08080808, 0x19082b19, 0x0808082b, 0x19082b19, + 0x08081919, 0x19082b19, 0x08082b08, 0x19082b19, 0x08190819, 0x19082b19, 0x08191908, 0x19082b19, + 0x082b0808, 0x19082b19, 0x19080819, 0x19082b19, 0x19081908, 0x19082b19, 0x19190808, 0x19082b19, + 0x2b080808, 0x19082b19, 0x2b19192b, 0x19082b19, 0x08080819, 0x19082b2b, 0x08081908, 0x19082b2b, + 0x08190808, 0x19082b2b, 0x19080808, 0x19082b2b, 0x08080808, 0x19190808, 0x0808082b, 0x19190808, + 0x08081919, 0x19190808, 0x08082b08, 0x19190808, 0x08190819, 0x19190808, 0x08191908, 0x19190808, + 0x0819192b, 0x19190808, 0x08192b19, 0x19190808, 0x082b0808, 0x19190808, 0x082b082b, 0x19190808, + 0x082b1919, 0x19190808, 0x082b2b08, 0x19190808, 0x19080819, 0x19190808, 0x19081908, 0x19190808, + 0x1908192b, 0x19190808, 0x19082b19, 0x19190808, 0x19190808, 0x19190808, 0x1919082b, 0x19190808, + 0x19191919, 0x19190808, 0x19192b08, 0x19190808, 0x192b0819, 0x19190808, 0x192b1908, 0x19190808, + 0x2b080808, 0x19190808, 0x2b08082b, 0x19190808, 0x2b081919, 0x19190808, 0x2b082b08, 0x19190808, + 0x2b190819, 0x19190808, 0x2b191908, 0x19190808, 0x08080819, 0x19190819, 0x08081908, 0x19190819, + 0x0808192b, 0x19190819, 0x08082b19, 0x19190819, 0x08190808, 0x19190819, 0x0819082b, 0x19190819, + 0x08191919, 0x19190819, 0x08192b08, 0x19190819, 0x082b0819, 0x19190819, 0x082b1908, 0x19190819, + 0x19080808, 0x19190819, 0x1908082b, 0x19190819, 0x19081919, 0x19190819, 0x19082b08, 0x19190819, + 0x19190819, 0x19190819, 0x19191908, 0x19190819, 0x192b0808, 0x19190819, 0x2b080819, 0x19190819, + 0x2b081908, 0x19190819, 0x2b190808, 0x19190819, 0x08080808, 0x1919082b, 0x08081919, 0x1919082b, + 0x08082b08, 0x1919082b, 0x08190819, 0x1919082b, 0x08191908, 0x1919082b, 0x082b0808, 0x1919082b, + 0x19080819, 0x1919082b, 0x19081908, 0x1919082b, 0x19190808, 0x1919082b, 0x192b2b19, 0x1919082b, + 0x2b080808, 0x1919082b, 0x08080819, 0x19191908, 0x08081908, 0x19191908, 0x0808192b, 0x19191908, + 0x08082b19, 0x19191908, 0x08190808, 0x19191908, 0x0819082b, 0x19191908, 0x08191919, 0x19191908, + 0x08192b08, 0x19191908, 0x082b0819, 0x19191908, 0x082b1908, 0x19191908, 0x19080808, 0x19191908, + 0x1908082b, 0x19191908, 0x19081919, 0x19191908, 0x19082b08, 0x19191908, 0x19190819, 0x19191908, + 0x19191908, 0x19191908, 0x192b0808, 0x19191908, 0x2b080819, 0x19191908, 0x2b081908, 0x19191908, + 0x2b190808, 0x19191908, 0x08080808, 0x19191919, 0x0808082b, 0x19191919, 0x08081919, 0x19191919, + 0x08082b08, 0x19191919, 0x08190819, 0x19191919, 0x08191908, 0x19191919, 0x082b0808, 0x19191919, + 0x19080819, 0x19191919, 0x19081908, 0x19191919, 0x19190808, 0x19191919, 0x2b080808, 0x19191919, + 0x08080819, 0x1919192b, 0x08081908, 0x1919192b, 0x08190808, 0x1919192b, 0x082b192b, 0x1919192b, + 0x19080808, 0x1919192b, 0x08080808, 0x19192b08, 0x0808082b, 0x19192b08, 0x08081919, 0x19192b08, + 0x08082b08, 0x19192b08, 0x08190819, 0x19192b08, 
0x08191908, 0x19192b08, 0x082b0808, 0x19192b08, + 0x19080819, 0x19192b08, 0x19081908, 0x19192b08, 0x19190808, 0x19192b08, 0x19192b2b, 0x19192b08, + 0x2b080808, 0x19192b08, 0x08080819, 0x19192b19, 0x08081908, 0x19192b19, 0x08190808, 0x19192b19, + 0x19080808, 0x19192b19, 0x08080808, 0x19192b2b, 0x08192b19, 0x19192b2b, 0x2b081919, 0x19192b2b, + 0x2b2b2b08, 0x19192b2b, 0x08080819, 0x192b0808, 0x08081908, 0x192b0808, 0x0808192b, 0x192b0808, + 0x08190808, 0x192b0808, 0x0819082b, 0x192b0808, 0x08191919, 0x192b0808, 0x08192b08, 0x192b0808, + 0x082b0819, 0x192b0808, 0x082b1908, 0x192b0808, 0x19080808, 0x192b0808, 0x19081919, 0x192b0808, + 0x19082b08, 0x192b0808, 0x19190819, 0x192b0808, 0x19191908, 0x192b0808, 0x192b0808, 0x192b0808, + 0x2b081908, 0x192b0808, 0x2b190808, 0x192b0808, 0x08080808, 0x192b0819, 0x0808082b, 0x192b0819, + 0x08081919, 0x192b0819, 0x08082b08, 0x192b0819, 0x08190819, 0x192b0819, 0x08191908, 0x192b0819, + 0x082b0808, 0x192b0819, 0x19080819, 0x192b0819, 0x19081908, 0x192b0819, 0x19190808, 0x192b0819, + 0x2b080808, 0x192b0819, 0x2b192b19, 0x192b0819, 0x08081908, 0x192b082b, 0x08190808, 0x192b082b, + 0x19080808, 0x192b082b, 0x1919192b, 0x192b082b, 0x2b2b0819, 0x192b082b, 0x08080808, 0x192b1908, + 0x08081919, 0x192b1908, 0x08082b08, 0x192b1908, 0x08190819, 0x192b1908, 0x08191908, 0x192b1908, + 0x082b0808, 0x192b1908, 0x19080819, 0x192b1908, 0x19081908, 0x192b1908, 0x19190808, 0x192b1908, + 0x2b080808, 0x192b1908, 0x08080819, 0x192b1919, 0x08081908, 0x192b1919, 0x08190808, 0x192b1919, + 0x19080808, 0x192b1919, 0x19082b2b, 0x192b1919, 0x192b2b08, 0x192b1919, 0x2b19082b, 0x192b1919, + 0x08080808, 0x192b192b, 0x2b191908, 0x192b192b, 0x08080819, 0x192b2b08, 0x08081908, 0x192b2b08, + 0x08190808, 0x192b2b08, 0x192b1919, 0x192b2b08, 0x2b192b08, 0x192b2b08, 0x08080808, 0x192b2b19, + 0x082b2b2b, 0x192b2b19, 0x1908082b, 0x192b2b2b, 0x2b2b0819, 0x192b2b2b, 0x08080808, 0x2b080808, + 0x0808082b, 0x2b080808, 0x08081919, 0x2b080808, 0x08082b08, 0x2b080808, 0x08190819, 0x2b080808, + 0x08191908, 0x2b080808, 0x08192b19, 0x2b080808, 0x082b0808, 0x2b080808, 0x082b1919, 0x2b080808, + 0x19080819, 0x2b080808, 0x19081908, 0x2b080808, 0x19190808, 0x2b080808, 0x1919082b, 0x2b080808, + 0x19191919, 0x2b080808, 0x19192b08, 0x2b080808, 0x192b0819, 0x2b080808, 0x2b080808, 0x2b080808, + 0x2b081919, 0x2b080808, 0x2b190819, 0x2b080808, 0x2b191908, 0x2b080808, 0x08080819, 0x2b080819, + 0x08081908, 0x2b080819, 0x08082b19, 0x2b080819, 0x08190808, 0x2b080819, 0x0819082b, 0x2b080819, + 0x08191919, 0x2b080819, 0x08192b08, 0x2b080819, 0x082b0819, 0x2b080819, 0x082b1908, 0x2b080819, + 0x19080808, 0x2b080819, 0x1908082b, 0x2b080819, 0x19081919, 0x2b080819, 0x19082b08, 0x2b080819, + 0x19190819, 0x2b080819, 0x19191908, 0x2b080819, 0x2b080819, 0x2b080819, 0x2b081908, 0x2b080819, + 0x2b190808, 0x2b080819, 0x2b2b2b19, 0x2b080819, 0x08080808, 0x2b08082b, 0x08081919, 0x2b08082b, + 0x08082b2b, 0x2b08082b, 0x08190819, 0x2b08082b, 0x08191908, 0x2b08082b, 0x19080819, 0x2b08082b, + 0x19081908, 0x2b08082b, 0x19190808, 0x2b08082b, 0x08080819, 0x2b081908, 0x08081908, 0x2b081908, + 0x0808192b, 0x2b081908, 0x08082b19, 0x2b081908, 0x08190808, 0x2b081908, 0x0819082b, 0x2b081908, + 0x08191919, 0x2b081908, 0x08192b08, 0x2b081908, 0x082b0819, 0x2b081908, 0x19080808, 0x2b081908, + 0x1908082b, 0x2b081908, 0x19081919, 0x2b081908, 0x19082b08, 0x2b081908, 0x19190819, 0x2b081908, + 0x19191908, 0x2b081908, 0x192b0808, 0x2b081908, 0x2b080819, 0x2b081908, 0x2b081908, 0x2b081908, + 0x2b190808, 0x2b081908, 0x08080808, 0x2b081919, 0x0808082b, 0x2b081919, 
0x08081919, 0x2b081919,
+    0x08082b08, 0x2b081919, 0x08190819, 0x2b081919, 0x08191908, 0x2b081919, 0x082b0808, 0x2b081919,
+    0x19080819, 0x2b081919, 0x19081908, 0x2b081919, 0x19190808, 0x2b081919, 0x2b080808, 0x2b081919,
+    0x2b082b2b, 0x2b081919, 0x08080819, 0x2b08192b, 0x08081908, 0x2b08192b, 0x08190808, 0x2b08192b,
+    0x082b2b19, 0x2b08192b, 0x19080808, 0x2b08192b, 0x08080808, 0x2b082b08, 0x08081919, 0x2b082b08,
+    0x08190819, 0x2b082b08, 0x08191908, 0x2b082b08, 0x19080819, 0x2b082b08, 0x19081908, 0x2b082b08,
+    0x19190808, 0x2b082b08, 0x2b2b082b, 0x2b082b08, 0x08080819, 0x2b082b19, 0x08081908, 0x2b082b19,
+    0x19080808, 0x2b082b19, 0x192b1919, 0x2b082b19, 0x082b082b, 0x2b082b2b, 0x19192b08, 0x2b082b2b,
+    0x19192b2b, 0x2b082b2b, 0x2b08082b, 0x2b082b2b, 0x2b2b082b, 0x2b082b2b, 0x08080819, 0x2b190808,
+    0x08081908, 0x2b190808, 0x08082b19, 0x2b190808, 0x08190808, 0x2b190808, 0x0819082b, 0x2b190808,
+    0x08191919, 0x2b190808, 0x08192b08, 0x2b190808, 0x082b1908, 0x2b190808, 0x19080808, 0x2b190808,
+    0x1908082b, 0x2b190808, 0x19081919, 0x2b190808, 0x19082b08, 0x2b190808, 0x19190819, 0x2b190808,
+    0x19191908, 0x2b190808, 0x192b0808, 0x2b190808, 0x2b080819, 0x2b190808, 0x2b081908, 0x2b190808,
+    0x2b190808, 0x2b190808, 0x08080808, 0x2b190819, 0x08081919, 0x2b190819, 0x08190819, 0x2b190819,
+    0x08191908, 0x2b190819, 0x19080819, 0x2b190819, 0x19081908, 0x2b190819, 0x19190808, 0x2b190819,
+    0x19192b2b, 0x2b190819, 0x08080819, 0x2b19082b, 0x08081908, 0x2b19082b, 0x08190808, 0x2b19082b,
+    0x19080808, 0x2b19082b, 0x2b2b192b, 0x2b19082b, 0x08080808, 0x2b191908, 0x0808082b, 0x2b191908,
+    0x08081919, 0x2b191908, 0x08082b08, 0x2b191908, 0x08190819, 0x2b191908, 0x08191908, 0x2b191908,
+    0x082b0808, 0x2b191908, 0x19080819, 0x2b191908, 0x19081908, 0x2b191908, 0x19190808, 0x2b191908,
+    0x2b080808, 0x2b191908, 0x2b19192b, 0x2b191908, 0x08080819, 0x2b191919, 0x08081908, 0x2b191919,
+    0x08190808, 0x2b191919, 0x19080808, 0x2b191919, 0x2b192b08, 0x2b191919, 0x2b2b0819, 0x2b191919,
+    0x08080808, 0x2b19192b, 0x1908192b, 0x2b19192b, 0x192b1908, 0x2b19192b, 0x08080819, 0x2b192b08,
+    0x08081908, 0x2b192b08, 0x08190808, 0x2b192b08, 0x082b192b, 0x2b192b08, 0x19080808, 0x2b192b08,
+    0x2b2b2b19, 0x2b192b08, 0x08080808, 0x2b192b19, 0x19082b19, 0x2b192b19, 0x1919082b, 0x2b192b19,
+    0x2b190808, 0x2b192b2b, 0x08080808, 0x2b2b0808, 0x08081919, 0x2b2b0808, 0x08082b2b, 0x2b2b0808,
+    0x08191908, 0x2b2b0808, 0x082b082b, 0x2b2b0808, 0x082b2b2b, 0x2b2b0808, 0x19080819, 0x2b2b0808,
+    0x19081908, 0x2b2b0808, 0x19190808, 0x2b2b0808, 0x2b2b082b, 0x2b2b0808, 0x2b2b2b2b, 0x2b2b0808,
+    0x19080808, 0x2b2b0819, 0x192b1919, 0x2b2b0819, 0x0808082b, 0x2b2b082b, 0x08082b2b, 0x2b2b082b,
+    0x082b082b, 0x2b2b082b, 0x082b2b08, 0x2b2b082b, 0x082b2b2b, 0x2b2b082b, 0x2b08082b, 0x2b2b082b,
+    0x2b082b08, 0x2b2b082b, 0x2b082b2b, 0x2b2b082b, 0x2b2b2b08, 0x2b2b082b, 0x08080819, 0x2b2b1908,
+    0x08081908, 0x2b2b1908, 0x08190808, 0x2b2b1908, 0x19080808, 0x2b2b1908, 0x2b082b19, 0x2b2b1908,
+    0x2b2b1908, 0x2b2b1908, 0x08080808, 0x2b2b1919, 0x08192b19, 0x2b2b1919, 0x19190819, 0x2b2b192b,
+    0x08082b2b, 0x2b2b2b08, 0x082b2b08, 0x2b2b2b08, 0x2b2b082b, 0x2b2b2b08, 0x19191908, 0x2b2b2b19,
+    0x2b08192b, 0x2b2b2b19, 0x08082b08, 0x2b2b2b2b, 0x08082b2b, 0x2b2b2b2b, 0x082b0808, 0x2b2b2b2b,
+    0x082b082b, 0x2b2b2b2b, 0x082b2b08, 0x2b2b2b2b, 0x2b082b08, 0x2b2b2b2b, 0x2b2b2b2b, 0x2b2b2b2b
+);
+
+struct iq2_s {
+    d: f16,
+    qs: array<f16, 32>,     // 64 bytes: 32 grid-index bytes, then 32 sign bytes
+    qh: array<f16, 4>,      // 8 bytes: 2 extra index bits per group of 8 values
+    scales: array<f16, 4>   // 8 bytes: one 4-bit scale per 16 values
+};
+
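+// The f16 members above are only raw storage: WGSL storage buffers have no
+// byte type, so the block bytes are kept as f16 halves, bitcast in pairs to
+// u32 words, and picked apart with get_byte below.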
+fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
+    let block = src0[src0_idx_base + offset];
+    let d = f32(block.d);
+    var src1_i = src1_idx_base + offset * 256;
+    var qs_vals: array<u32, 16>;
+    for (var i: u32 = 0; i < 16; i++) {
+        qs_vals[i] = bitcast<u32>(vec2<f16>(block.qs[i * 2], block.qs[i * 2 + 1]));
+    }
+    var qh_vals = array<u32, 2>(
+        bitcast<u32>(vec2<f16>(block.qh[0], block.qh[1])),
+        bitcast<u32>(vec2<f16>(block.qh[2], block.qh[3]))
+    );
+    var scale_vals = array<u32, 2>(
+        bitcast<u32>(vec2<f16>(block.scales[0], block.scales[1])),
+        bitcast<u32>(vec2<f16>(block.scales[2], block.scales[3]))
+    );
+    var sum = 0.0;
+    for (var ib: u32 = 0; ib < 8; ib++) {
+        let s = get_byte(scale_vals[ib / 4], ib % 4);
+        let db = array<f32, 2>(
+            d * (0.5 + f32(s & 0xF)) * 0.25,
+            d * (0.5 + f32(s >> 4)) * 0.25
+        );
+        let qs_w = qs_vals[ib];
+        for (var l: u32 = 0; l < 4; l++) {
+            // 8-bit grid index from qs, extended by 2 high bits from qh
+            let qh_b = (get_byte(qh_vals[ib / 4], ib % 4) << (8 - 2 * l)) & 0x300;
+            let ig = (get_byte(qs_w, l) | qh_b) * 8;
+            // explicit sign byte from the second half of qs
+            let signs = get_byte(qs_vals[ib + 8], l);
+            let dl = db[l / 2];
+            for (var j: u32 = 0; j < 8; j++) {
+                let g = get_byte(iq2s_grid[(ig + j) / 4], (ig + j) % 4);
+                let m = select(1.0, -1.0, (get_byte(kmask_iq2xs[j / 4], j % 4) & signs) != 0);
+                sum += dl * f32(g) * m * src1[src1_i];
+                src1_i++;
+            }
+        }
+    }
+    return sum;
+}
+
+#enddecl(IQ2_S)
+
+#decl(IQ3_XSS)
+
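+// IQ3_XXS is the 3.0625 bit-per-weight i-quant: per 256-value block, an f16
+// scale d and 96 bytes of qs, laid out as 64 one-byte indices into the
+// 256-entry iq3xxs_grid (4 packed byte values per entry) followed by 8 u32
+// words, each holding four 7-bit ksigns_iq2xs indices plus a 4-bit scale in
+// the top bits.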
+const iq3xxs_grid = array(
+    0x04040404, 0x04040414, 0x04040424, 0x04040c0c, 0x04040c1c, 0x04040c3e, 0x04041404, 0x04041414,
+    0x04041c0c, 0x04042414, 0x04043e1c, 0x04043e2c, 0x040c040c, 0x040c041c, 0x040c0c04, 0x040c0c14,
+    0x040c140c, 0x040c142c, 0x040c1c04, 0x040c1c14, 0x040c240c, 0x040c2c24, 0x040c3e04, 0x04140404,
+    0x04140414, 0x04140424, 0x04140c0c, 0x04141404, 0x04141414, 0x04141c0c, 0x04141c1c, 0x04141c3e,
+    0x04142c0c, 0x04142c3e, 0x04143e2c, 0x041c040c, 0x041c043e, 0x041c0c04, 0x041c0c14, 0x041c142c,
+    0x041c3e04, 0x04240c1c, 0x04241c3e, 0x04242424, 0x04242c3e, 0x04243e1c, 0x04243e2c, 0x042c040c,
+    0x042c043e, 0x042c1c14, 0x042c2c14, 0x04341c2c, 0x04343424, 0x043e0c04, 0x043e0c24, 0x043e0c34,
+    0x043e241c, 0x043e340c, 0x0c04040c, 0x0c04041c, 0x0c040c04, 0x0c040c14, 0x0c04140c, 0x0c04141c,
+    0x0c041c04, 0x0c041c14, 0x0c041c24, 0x0c04243e, 0x0c042c04, 0x0c0c0404, 0x0c0c0414, 0x0c0c0c0c,
+    0x0c0c1404, 0x0c0c1414, 0x0c14040c, 0x0c14041c, 0x0c140c04, 0x0c140c14, 0x0c14140c, 0x0c141c04,
+    0x0c143e14, 0x0c1c0404, 0x0c1c0414, 0x0c1c1404, 0x0c1c1c0c, 0x0c1c2434, 0x0c1c3434, 0x0c24040c,
+    0x0c24042c, 0x0c242c04, 0x0c2c1404, 0x0c2c1424, 0x0c2c2434, 0x0c2c3e0c, 0x0c34042c, 0x0c3e1414,
+    0x0c3e2404, 0x14040404, 0x14040414, 0x14040c0c, 0x14040c1c, 0x14041404, 0x14041414, 0x14041434,
+    0x14041c0c, 0x14042414, 0x140c040c, 0x140c041c, 0x140c042c, 0x140c0c04, 0x140c0c14, 0x140c140c,
+    0x140c1c04, 0x140c341c, 0x140c343e, 0x140c3e04, 0x14140404, 0x14140414, 0x14140c0c, 0x14140c3e,
+    0x14141404, 0x14141414, 0x14141c3e, 0x14142404, 0x14142c2c, 0x141c040c, 0x141c0c04, 0x141c0c24,
+    0x141c3e04, 0x141c3e24, 0x14241c2c, 0x14242c1c, 0x142c041c, 0x142c143e, 0x142c240c, 0x142c3e24,
+    0x143e040c, 0x143e041c, 0x143e0c34, 0x143e242c, 0x1c04040c, 0x1c040c04, 0x1c040c14, 0x1c04140c,
+    0x1c04141c, 0x1c042c04, 0x1c04342c, 0x1c043e14, 0x1c0c0404, 0x1c0c0414, 0x1c0c1404, 0x1c0c1c0c,
+    0x1c0c2424, 0x1c0c2434, 0x1c14040c, 0x1c14041c, 0x1c140c04, 0x1c14142c, 0x1c142c14, 0x1c143e14,
+    0x1c1c0c0c, 0x1c1c1c1c, 0x1c241c04, 0x1c24243e, 0x1c243e14, 0x1c2c0404, 0x1c2c0434, 0x1c2c1414,
+    0x1c2c2c2c, 0x1c340c24, 0x1c341c34, 0x1c34341c, 0x1c3e1c1c, 0x1c3e3404, 0x24040424, 0x24040c3e,
+    0x24041c2c, 0x24041c3e, 0x24042c1c, 0x24042c3e, 0x240c3e24, 0x24141404, 0x24141c3e, 0x24142404,
+    0x24143404, 0x24143434, 0x241c043e, 0x241c242c, 0x24240424, 0x24242c0c, 0x24243424, 0x242c142c,
+    0x242c241c, 0x242c3e04, 0x243e042c, 0x243e0c04, 0x243e0c14, 0x243e1c04, 0x2c040c14, 0x2c04240c,
+    0x2c043e04, 0x2c0c0404, 0x2c0c0434, 0x2c0c1434, 0x2c0c2c2c, 0x2c140c24, 0x2c141c14, 0x2c143e14,
+    0x2c1c0414, 0x2c1c2c1c, 0x2c240c04, 0x2c24141c, 0x2c24143e, 0x2c243e14, 0x2c2c0414, 0x2c2c1c0c,
+    0x2c342c04, 0x2c3e1424, 0x2c3e2414, 0x34041424, 0x34042424, 0x34042434, 0x34043424, 0x340c140c,
+    0x340c340c, 0x34140c3e, 0x34143424, 0x341c1c04, 0x341c1c34, 0x34242424, 0x342c042c, 0x342c2c14,
+    0x34341c1c, 0x343e041c, 0x343e140c, 0x3e04041c, 0x3e04042c, 0x3e04043e, 0x3e040c04, 0x3e041c14,
+    0x3e042c14, 0x3e0c1434, 0x3e0c2404, 0x3e140c14, 0x3e14242c, 0x3e142c14, 0x3e1c0404, 0x3e1c0c2c,
+    0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04
+);
+
+struct iq3_xxs {
+    d: f16,
+    qs: array<f16, 48>   // 96 bytes: 64 grid-index bytes, then 8 u32 scale/sign words
+};
+
+fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
+    let block = src0[src0_idx_base + offset];
+    let d = f32(block.d);
+    var src1_i = src1_idx_base + offset * 256;
+    var sum = 0.0;
+    for (var ib: u32 = 0; ib < 16; ib += 2) {
+        let sc_sign = bitcast<u32>(vec2<f16>(block.qs[ib + 32], block.qs[ib + 33]));
+        let db = d * (0.5 + f32(sc_sign >> 28)) * 0.5;
+        for (var l: u32 = 0; l < 4; l++) {
+            let is = (sc_sign >> (7 * l)) & 127;
+            let signs = get_byte(ksigns_iq2xs[is / 4], is % 4);
+            let ig_val = bitcast<u32>(vec2<f16>(block.qs[ib * 2 + l], 0.0));
+            let ig1 = get_byte(ig_val, 0);
+            let ig2 = get_byte(ig_val, 1);
+            for (var j: u32 = 0; j < 4; j++) {
+                let g1 = get_byte(iq3xxs_grid[ig1], j);
+                let g2 = get_byte(iq3xxs_grid[ig2], j);
+                let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0);
+                let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0);
+                sum += db * f32(g1) * m1 * src1[src1_i];
+                sum += db * f32(g2) * m2 * src1[src1_i + 4];
+                src1_i++;
+            }
+            src1_i += 4;
+        }
+    }
+    return sum;
+}
+
+#enddecl(IQ3_XSS)
+
+#decl(IQ3_S)
+
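+// IQ3_S (3.4375 bits per weight) refines IQ3_XXS: 512 grid entries (8 index
+// bits from each qs byte plus one high bit from qh), explicit sign bytes
+// instead of ksigns indices, and one 4-bit scale per 32 values packed into
+// four scale bytes; values dequantize as d * (1 + 2 * scale) * grid * sign.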
+const iq3s_grid = array(
+    0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305,
+    0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905,
+    0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09,
+    0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b,
+    0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b,
+    0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d,
+    0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03,
+    0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505,
+    0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03,
+    0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901,
+    0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d,
+    0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303,
+    0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501,
+    0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105,
+    0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505,
+    0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101,
+    0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707,
+    0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b,
+    0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01,
+    0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f,
+    0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305,
+    0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103,
+    0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509,
+    0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503,
+    0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b,
+    0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f,
+    0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f,
+    0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f,
+    0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109,
+    0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f,
+    0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509,
+    0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501,
+    0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303,
+    0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f,
+    0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907,
+    0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703,
+    0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03,
+    0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01,
+    0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01,
+    0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903,
+    0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505,
+    0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b,
+    0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107,
+    0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509,
+    0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303,
+    0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103,
+    0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05,
+    0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b,
+    0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f,
+    0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701,
+    0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909,
+    0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305,
+    0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d,
+    0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b,
+    0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d,
+    0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307,
+    0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09,
+    0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309,
+    0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709,
+    0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f,
+    0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303,
+    0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503,
+    0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b,
+    0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101
+);
+
+struct iq3_s {
+    d: f16,
+    qs: array<f16, 32>,     // 64 bytes: 8-bit grid indices
+    qh: array<f16, 4>,      // 8 bytes: 1 extra index bit per group of 8 values
+    signs: array<f16, 16>,  // 32 bytes: explicit sign bytes
+    scales: array<f16, 2>   // 4 bytes: one 4-bit scale per 32 values
+};
+
+fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
+    let block = src0[src0_idx_base + offset];
+    let d = f32(block.d);
+    var src1_i = src1_idx_base + offset * 256;
+    var qh_vals = array<u32, 2>(
+        bitcast<u32>(vec2<f16>(block.qh[0], block.qh[1])),
+        bitcast<u32>(vec2<f16>(block.qh[2], block.qh[3]))
+    );
+    var sign_vals: array<u32, 8>;
+    for (var i: u32 = 0; i < 8; i++) {
+        sign_vals[i] = bitcast<u32>(vec2<f16>(block.signs[i * 2], block.signs[i * 2 + 1]));
+    }
+    var scale_vals = bitcast<u32>(vec2<f16>(block.scales[0], block.scales[1]));
+    var sum = 0.0;
+    for (var ib: u32 = 0; ib < 4; ib++) {
+        let s = get_byte(scale_vals, ib);
+        let db = array<f32, 2>(
+            d * (1.0 + 2.0 * f32(s & 0xF)),
+            d * (1.0 + 2.0 * f32(s >> 4))
+        );
+        for (var k: u32 = 0; k < 2; k++) {
+            let dl = db[k];
+            let qh_byte = get_byte(qh_vals[ib / 2], (ib % 2) * 2 + k);
+            let sign_w = sign_vals[ib * 2 + k];
+            for (var l: u32 = 0; l < 4; l++) {
+                let signs = get_byte(sign_w, l);
+                let ig_val = bitcast<u32>(vec2<f16>(block.qs[ib * 8 + k * 4 + l], 0.0));
+                let ig1 = get_byte(ig_val, 0) | ((qh_byte << (8 - (2 * l))) & 256);
+                let ig2 = get_byte(ig_val, 1) | ((qh_byte << (7 - (2 * l))) & 256);
+                for (var j: u32 = 0; j < 4; j++) {
+                    let g1 = get_byte(iq3s_grid[ig1], j);
+                    let g2 = get_byte(iq3s_grid[ig2], j);
+                    let m1 = select(1.0, -1.0, (get_byte(kmask_iq2xs[0], j) & signs) != 0);
+                    let m2 = select(1.0, -1.0, (get_byte(kmask_iq2xs[1], j) & signs) != 0);
+                    sum += dl * f32(g1) * m1 * src1[src1_i];
+                    sum += dl * f32(g2) * m2 * src1[src1_i + 4];
+                    src1_i++;
+                }
+                src1_i += 4;
+            }
+        }
+    }
+    return sum;
+}
+
+#enddecl(IQ3_S)
+
+#decl(IQ1_TABLE)
+
+const IQ1_DELTA: f32 = 0.125;
+
+const iq1_grid = array(
+    0xfffdffff, 0xfff7fff0, 0xffccfff5, 0xffdfffc0, 0xffd7ffdd, 0xff30ffd5, 0xff03ff0c, 0xff10ff01,
+    0xff7dff7f, 0xff75ff77, 0xff5fff40, 0xff57ff5d, 0xfcf3ff55, 0xfcccfcf0, 0xfcc1fcc3, 0xfcc5fcc4,
+    0xfc3cfcd0, 0xfc34fc31, 0xfc00fc0d, 0xfc1cfc05, 0xfc11fc13, 0xfc70fc17, 0xfc43fc4c, 0xfc50fc41,
+    0xfdfdfdff, 0xfdf5fdf7, 0xfddffdc0, 0xfdd7fddd, 0xfd30fdd5, 0xfd04fd0c, 0xfd14fd13, 0xfd7dfd7f,
+    0xfd75fd77, 0xfd40fd4c, 0xfd5ffd44, 0xfd57fd5d, 0xf3ccfd55, 0xf3c1f3c3, 0xf33cf3d0, 0xf300f334,
+    0xf313f305, 0xf34cf310, 0xf350f344, 0xf0f3f0fc, 0xf0f1f0f0, 0xf0c7f0c0, 0xf0d4f0c5, 0xf030f03f,
+    0xf00ff035, 0xf003f00c, 0xf001f000, 0xf01ff004, 0xf010f01d, 0xf015f017, 0xf04cf07c,
0xf047f040, + 0xf05cf045, 0xf050f053, 0xf054f051, 0xf1c4f1c3, 0xf133f13c, 0xf10df10f, 0xf107f100, 0xf11cf11f, + 0xf114f111, 0xf14cf170, 0xf144f143, 0xf7fdf7ff, 0xf7f5f7f7, 0xf7dff7c0, 0xf7d7f7dd, 0xf730f7d5, + 0xf701f70c, 0xf77ff710, 0xf777f77d, 0xf740f775, 0xf75df75f, 0xf755f757, 0xf4ccf4f0, 0xf4c4f4c3, + 0xf4d0f4d3, 0xf40ff43c, 0xf400f40c, 0xf413f41c, 0xf44cf414, 0xf441f443, 0xf450f444, 0xf5fdf5ff, + 0xf5f5f5f7, 0xf5dff5c0, 0xf5d7f5dd, 0xf530f5d5, 0xf504f50c, 0xf510f51c, 0xf57df57f, 0xf577f570, + 0xf540f575, 0xf55df55f, 0xf555f557, 0xcfcccfcf, 0xcfc4cfc3, 0xcfd0cfd3, 0xcf33cf3c, 0xcf00cf0f, + 0xcf1ccf07, 0xcf10cf13, 0xcf4ccf14, 0xcf41cf43, 0xcf50cf5c, 0xccf3ccfc, 0xccf4ccf1, 0xcccdcccf, + 0xccc7ccc0, 0xccd3ccdc, 0xcc30ccd4, 0xcc0fcc35, 0xcc0dcc0c, 0xcc00cc03, 0xcc04cc01, 0xcc10cc1f, + 0xcc4dcc73, 0xcc5ccc40, 0xcdcccc53, 0xcdc1cdc3, 0xcd3fcdd0, 0xcd34cd31, 0xcd00cd0d, 0xcd05cd07, + 0xcd11cd13, 0xcd4ccd70, 0xcd41cd43, 0xc3fccd50, 0xc3f4c3f1, 0xc3c0c3c3, 0xc3c4c3c7, 0xc3d1c3dc, + 0xc330c33c, 0xc337c331, 0xc30cc335, 0xc300c303, 0xc304c301, 0xc310c31d, 0xc373c317, 0xc34fc374, + 0xc340c343, 0xc344c347, 0xc35cc345, 0xc350c353, 0xc0fdc354, 0xc0f5c0f0, 0xc0c3c0cc, 0xc0c1c0c0, + 0xc0dfc0c4, 0xc0d0c0dd, 0xc0d5c0d7, 0xc033c03c, 0xc031c030, 0xc00dc00c, 0xc000c003, 0xc004c001, + 0xc01cc005, 0xc010c013, 0xc014c011, 0xc07dc07f, 0xc070c073, 0xc075c077, 0xc04cc04f, 0xc040c043, + 0xc044c041, 0xc05fc045, 0xc050c05d, 0xc1f3c1fc, 0xc1f1c1f0, 0xc1c1c1c0, 0xc1c5c1c7, 0xc1d1c1dc, + 0xc13dc13f, 0xc130c133, 0xc135c137, 0xc100c10c, 0xc107c101, 0xc11cc104, 0xc110c113, 0xc114c117, + 0xc171c115, 0xc14dc175, 0xc153c140, 0xc7ccc154, 0xc7d0c7c1, 0xc733c73c, 0xc734c731, 0xc700c70f, + 0xc705c707, 0xc71cc71f, 0xc711c713, 0xc770c714, 0xc743c74c, 0xc4cfc750, 0xc4c0c4cd, 0xc4dcc4c5, + 0xc43dc4d0, 0xc430c433, 0xc40cc437, 0xc400c403, 0xc404c401, 0xc41fc405, 0xc415c410, 0xc44cc474, + 0xc440c44d, 0xc45cc447, 0xc454c451, 0xc5c1c5f4, 0xc5d1c5d3, 0xc531c533, 0xc50fc534, 0xc500c50d, + 0xc51cc507, 0xc514c511, 0xc54cc570, 0xc545c541, 0xdffddfff, 0xdff5dff7, 0xdfdfdfc0, 0xdfd0dfdd, + 0xdfd5dfd7, 0xdf0cdf30, 0xdf1cdf04, 0xdf7fdf10, 0xdf77df7d, 0xdf40df75, 0xdf5ddf5f, 0xdf57df50, + 0xdcf0df55, 0xdcc3dccc, 0xdcd0dcc4, 0xdc33dc3d, 0xdc00dc34, 0xdc05dc07, 0xdc13dc1c, 0xdc11dc10, + 0xdc4fdc70, 0xdc44dc41, 0xddfcdc50, 0xddf5ddf7, 0xddc0ddcc, 0xdddddddf, 0xddd5ddd7, 0xdd0cdd30, + 0xdd04dd01, 0xdd7cdd10, 0xdd75dd77, 0xdd40dd4c, 0xdd5ddd5f, 0xdd55dd57, 0xd3c3d3f0, 0xd3c4d3c1, + 0xd333d3d0, 0xd331d330, 0xd30dd334, 0xd307d300, 0xd311d305, 0xd34cd370, 0xd344d343, 0xd350d35c, + 0xd0c0d0f4, 0xd0d4d0dc, 0xd030d03f, 0xd00cd037, 0xd000d003, 0xd01dd004, 0xd017d010, 0xd04fd074, + 0xd040d043, 0xd045d047, 0xd053d05c, 0xd054d051, 0xd1cfd1f0, 0xd1c4d1cd, 0xd13cd1d0, 0xd100d134, + 0xd11cd11f, 0xd173d114, 0xd14fd171, 0xd7ffd145, 0xd7f7d7fd, 0xd7c0d7f5, 0xd7ddd7df, 0xd7d5d7d7, + 0xd70cd730, 0xd710d703, 0xd77dd77f, 0xd775d777, 0xd75dd75f, 0xd755d757, 0xd4ccd4f4, 0xd4c4d4c3, + 0xd431d4d0, 0xd40dd434, 0xd41cd400, 0xd411d413, 0xd470d414, 0xd441d44f, 0xd453d444, 0xd5ffd450, + 0xd5f7d5fd, 0xd5dfd5f5, 0xd5d7d5dd, 0xd530d5d5, 0xd501d50c, 0xd510d504, 0xd57dd57f, 0xd575d577, + 0xd55fd540, 0xd557d55d, 0x3ff0d555, 0x3fc13fcc, 0x3f343fd0, 0x3f003f0d, 0x3f053f07, 0x3f133f1c, + 0x3f433f11, 0x3f5c3f44, 0x3cff3f51, 0x3cf33cfc, 0x3cf43cf1, 0x3cc03ccd, 0x3cc73cc1, 0x3cdc3cc5, + 0x3cd43cd1, 0x3c373c30, 0x3c0c3c35, 0x3c003c03, 0x3c043c01, 0x3c103c05, 0x3c153c17, 0x3c733c7c, + 0x3c4f3c71, 0x3c403c4d, 0x3c5c3c5f, 0x3df03c5d, 0x3dc33dcc, 0x3dd03dc1, 0x3d0d3d3c, 0x3d053d00, + 0x3d143d13, 
0x3d433d74, 0x33fc3d50, 0x33c433c0, 0x333033d4, 0x33353337, 0x3303330c, 0x33013300, + 0x331d331c, 0x33173310, 0x337c3315, 0x33743371, 0x334d334f, 0x335f3340, 0x3354335c, 0x30fd30fc, + 0x30f530f0, 0x30c330cc, 0x30c130c0, 0x30df30c4, 0x30d530d0, 0x3033303c, 0x30313030, 0x300f3034, + 0x3003300c, 0x30013000, 0x30043007, 0x3013301c, 0x30113010, 0x307d3014, 0x30703073, 0x304c3077, + 0x30403043, 0x30443041, 0x30503045, 0x30553057, 0x31f031fc, 0x31c331f4, 0x31c731c0, 0x31dc31c5, + 0x31d431d3, 0x313d313f, 0x31373130, 0x310c310f, 0x3100310d, 0x31043101, 0x3110311d, 0x317c3117, + 0x31753170, 0x31403143, 0x3153315c, 0x37f03151, 0x37c037cc, 0x37d037c5, 0x3734373d, 0x3700370f, + 0x371c3707, 0x37113713, 0x37703714, 0x3743374c, 0x37443741, 0x34fc3750, 0x34f134f0, 0x34cf34f5, + 0x34c034c3, 0x34dc34c7, 0x34d134d3, 0x3430343f, 0x340c3435, 0x3403340d, 0x34013400, 0x341f3404, + 0x3410341d, 0x34153411, 0x34743471, 0x3440344d, 0x34473441, 0x3453345c, 0x34543451, 0x353335c1, + 0x35343531, 0x35073500, 0x35133505, 0x35433514, 0x0ffc3550, 0x0ff00ff3, 0x0ff40ff1, 0x0fc00fcd, + 0x0fdc0fc5, 0x0fd40fd3, 0x0f300f3f, 0x0f0c0f37, 0x0f000f03, 0x0f040f01, 0x0f170f10, 0x0f740f71, + 0x0f470f40, 0x0f5c0f5f, 0x0f540f51, 0x0cf70cf0, 0x0cf50cf4, 0x0cc30ccc, 0x0cc10cc0, 0x0cc40cc7, + 0x0cd00cdf, 0x0cd70cd1, 0x0c3c0cd5, 0x0c300c33, 0x0c340c31, 0x0c0c0c0f, 0x0c030c0d, 0x0c010c00, + 0x0c040c07, 0x0c1c0c05, 0x0c100c13, 0x0c140c11, 0x0c700c7d, 0x0c430c4c, 0x0c410c40, 0x0c5f0c44, + 0x0c550c50, 0x0df10dfc, 0x0dc00dcd, 0x0ddc0dc5, 0x0d3d0dd3, 0x0d350d30, 0x0d030d0c, 0x0d010d00, + 0x0d1d0d04, 0x0d700d10, 0x0d4d0d4f, 0x0d440d40, 0x0d530d45, 0x03f003f3, 0x03c303cc, 0x03c103c0, + 0x03c403c7, 0x03d003dc, 0x03d503d7, 0x0333033c, 0x03310330, 0x03350334, 0x030c030f, 0x03000303, + 0x03070301, 0x03050304, 0x031d031c, 0x03100313, 0x03140311, 0x0377037f, 0x034c0375, 0x03400343, + 0x03440341, 0x0353035c, 0x03550350, 0x00fd00fc, 0x00f000f3, 0x00f400f1, 0x00cc00cf, 0x00c300cd, + 0x00c100c0, 0x00c500c4, 0x00d300dc, 0x00d100d0, 0x003f00d4, 0x003d003c, 0x00300033, 0x00370031, + 0x000f0034, 0x000d000c, 0x00000003, 0x00070001, 0x00050004, 0x001c001f, 0x00100013, 0x00170011, + 0x00150014, 0x0073007c, 0x00740070, 0x004f0075, 0x0043004c, 0x00410040, 0x00440047, 0x0053005c, + 0x00510050, 0x01ff0054, 0x01fd01fc, 0x01f101f3, 0x01f401f7, 0x01c301cc, 0x01c701c0, 0x01df01c4, + 0x01dd01dc, 0x01d001d3, 0x01d701d1, 0x013c01d4, 0x01310130, 0x01340137, 0x010f0135, 0x010d010c, + 0x01000103, 0x01070101, 0x01050104, 0x0113011c, 0x01140110, 0x0170017d, 0x01770171, 0x01750174, + 0x0140014c, 0x015d0145, 0x01510150, 0x01540157, 0x07f007f3, 0x07f407f1, 0x07c007cf, 0x07dc07c7, + 0x073007d5, 0x07350737, 0x0703070c, 0x07010700, 0x07040707, 0x071d071f, 0x07100713, 0x0774077d, + 0x074d074f, 0x07470740, 0x0754075c, 0x04fd04fc, 0x04f504f0, 0x04c304cc, 0x04c104c0, 0x04d004c4, + 0x0433043c, 0x04310430, 0x040f0434, 0x040d040c, 0x04000403, 0x04070401, 0x04050404, 0x0413041c, + 0x04110410, 0x047c0414, 0x04740470, 0x0443044c, 0x04410440, 0x04440447, 0x05f30450, 0x05c005f7, + 0x05df05c5, 0x05d105d0, 0x053005d4, 0x05340537, 0x0500050c, 0x05070501, 0x051d0504, 0x05170510, + 0x057c0515, 0x054d0575, 0x05410540, 0x05450547, 0x1ff0055c, 0x1fc11fc3, 0x1fd01fc4, 0x1f0f1f33, + 0x1f011f00, 0x1f051f07, 0x1f131f1c, 0x1f141f11, 0x1f411f7c, 0x1cfc1f50, 0x1cf11cf3, 0x1ccd1cf4, + 0x1cdc1cc0, 0x1cd11cdd, 0x1c301cd4, 0x1c0c1c34, 0x1c011c00, 0x1c101c04, 0x1c151c11, 0x1c751c73, + 0x1c401c4d, 0x1c511c5c, 0x1dcc1c54, 0x1dc41dc1, 0x1d3c1d3f, 0x1d001d31, 0x1d071d01, 0x1d701d1f, + 0x1d411d4c, 0x13cc1d50, 0x13c013cd, 
0x13c513c1, 0x13d113dc, 0x133f13d4, 0x1330133d, 0x13351337, + 0x1303130c, 0x13011300, 0x13051304, 0x131d131f, 0x13731310, 0x13741370, 0x134d134f, 0x13401343, + 0x13471341, 0x135c1345, 0x13541353, 0x10f710f0, 0x10cc10f5, 0x10c110c0, 0x103310c4, 0x10311030, + 0x100f1034, 0x1003100c, 0x10011000, 0x101c1004, 0x10101013, 0x10141011, 0x10741071, 0x104c1075, + 0x10411040, 0x10451044, 0x1050105d, 0x10571051, 0x11f411fd, 0x11df11c0, 0x11d711d1, 0x113f11d4, + 0x11371130, 0x110c1135, 0x11001103, 0x11071101, 0x111f1105, 0x11171110, 0x117d117f, 0x11751170, + 0x11411143, 0x11441147, 0x1153115f, 0x11551151, 0x17c417c1, 0x173c17d0, 0x1700170d, 0x171c1705, + 0x17701714, 0x1747174c, 0x14fc1751, 0x14cf14f3, 0x14dc14c0, 0x14d114d3, 0x143f14d4, 0x1430143c, + 0x14371431, 0x1403140c, 0x14011400, 0x141f1404, 0x14151410, 0x1473147d, 0x14401475, 0x1453145c, + 0x14541450, 0x15c115cc, 0x153c15c7, 0x15341533, 0x1500150f, 0x15051507, 0x15101513, 0x15711514, + 0x15471543, 0x15511545, 0x7ffd7fff, 0x7ff57ff7, 0x7fdd7fdf, 0x7fd57fd7, 0x7f0f7f30, 0x7f037f0c, + 0x7f047f01, 0x7f7f7f10, 0x7f777f7d, 0x7f407f75, 0x7f5d7f5f, 0x7f557f57, 0x7ccc7cf0, 0x7cc17cc3, + 0x7cd07cc4, 0x7c337c3c, 0x7c0f7c34, 0x7c007c0d, 0x7c077c01, 0x7c137c04, 0x7c147c11, 0x7c747c70, + 0x7c417c43, 0x7c507c44, 0x7dfd7dff, 0x7df57df7, 0x7ddf7dc0, 0x7dd77ddd, 0x7d0c7dd5, 0x7d047d03, + 0x7d7f7d10, 0x7d777d7d, 0x7d407d75, 0x7d5d7d5f, 0x7d557d57, 0x73c473c3, 0x7333733c, 0x7300730c, + 0x731c7305, 0x73147313, 0x73447343, 0x70f470fc, 0x70c070cd, 0x70d170c5, 0x703f70d4, 0x7030703c, + 0x700c7037, 0x70007003, 0x70047001, 0x70107005, 0x70177011, 0x707c7015, 0x70717073, 0x704f7074, + 0x7040704d, 0x70517047, 0x71c171cc, 0x71d071c4, 0x7133713c, 0x71357134, 0x7100710f, 0x71057104, + 0x7111711c, 0x71707115, 0x7145714c, 0x77ff7153, 0x77f777fd, 0x77c077f5, 0x77dd77df, 0x77d577d7, + 0x7730773c, 0x7703770c, 0x77107704, 0x777f7714, 0x7777777d, 0x77407775, 0x775d775f, 0x77557757, + 0x74f174f0, 0x74c374cc, 0x74d074c1, 0x7433743c, 0x74347431, 0x740d740f, 0x74057400, 0x7413741c, + 0x74417470, 0x74507444, 0x75fd75ff, 0x75f575f7, 0x75df75c0, 0x75d775dd, 0x753075d5, 0x7503750c, + 0x757f7501, 0x7577757d, 0x75407575, 0x755d755f, 0x75557557, 0x4fcc4ff0, 0x4fc74fc1, 0x4fd04fc4, + 0x4f314f3c, 0x4f004f34, 0x4f054f07, 0x4f154f14, 0x4f4c4f70, 0x4f414f43, 0x4f504f44, 0x4cf34cfc, + 0x4cf44cf1, 0x4cc04ccf, 0x4cc54cc7, 0x4cd34cdc, 0x4cd44cd1, 0x4c304c3f, 0x4c0c4c0f, 0x4c004c03, + 0x4c044c01, 0x4c104c1d, 0x4c714c73, 0x4c404c4d, 0x4c5c4c47, 0x4c514c53, 0x4df04c54, 0x4dc34dcc, + 0x4dd04dc4, 0x4d314d33, 0x4d0f4d34, 0x4d004d0d, 0x4d114d07, 0x4d704d14, 0x4d414d43, 0x43fc4d54, + 0x43f143f3, 0x43c043cf, 0x43d143c7, 0x4335433f, 0x4303430c, 0x43014300, 0x43044307, 0x431c431f, + 0x4310431d, 0x43714373, 0x4343434d, 0x43474340, 0x4354435c, 0x40f040ff, 0x40f540f7, 0x40cc40cf, + 0x40c040c3, 0x40c440c1, 0x40d040dc, 0x40d540d4, 0x4033403c, 0x40314030, 0x400f4034, 0x400d400c, + 0x40004003, 0x40074001, 0x40054004, 0x4013401c, 0x40114010, 0x407c4014, 0x40774070, 0x404d404c, + 0x40404043, 0x40444041, 0x405f4045, 0x4050405d, 0x40554057, 0x41f341fc, 0x41c041cf, 0x41df41c4, + 0x41d441d1, 0x41374130, 0x410c4134, 0x4100410d, 0x41044101, 0x41174110, 0x4173417d, 0x41754174, + 0x4143414d, 0x41534140, 0x41544151, 0x47c147f0, 0x47d047c4, 0x4731473c, 0x470d470f, 0x47014700, + 0x47134705, 0x47704710, 0x4741474c, 0x47504744, 0x44f144f3, 0x44cf44f4, 0x44c044cd, 0x44c544c7, + 0x44dc44df, 0x44d144d3, 0x443d443f, 0x44374430, 0x440c4435, 0x44004403, 0x44044401, 0x4410441d, + 0x44154411, 0x4473447c, 0x444d444f, 0x44454440, 0x4451445c, 
0x45c045f0, 0x453345d0, 0x45344531,
+    0x4500450f, 0x451c4507, 0x454c4570, 0x45404543, 0x5fff4541, 0x5ff75ffd, 0x5fc05ff5, 0x5fdd5fdf,
+    0x5fd55fd7, 0x5f0c5f30, 0x5f015f03, 0x5f7f5f04, 0x5f775f7d, 0x5f405f75, 0x5f5d5f5f, 0x5f555f57,
+    0x5cf45cf0, 0x5cc35ccc, 0x5cc45cc1, 0x5c315cc5, 0x5c0c5c34, 0x5c075c00, 0x5c1c5c05, 0x5c705c13,
+    0x5c4d5c4f, 0x5c445c41, 0x5df75dfd, 0x5dcf5df5, 0x5ddd5dc4, 0x5dd55dd7, 0x5d0c5d30, 0x5d045d01,
+    0x5d7f5d10, 0x5d775d7d, 0x5d405d75, 0x5d5d5d5f, 0x5d555d57, 0x53d053c4, 0x5333533c, 0x5303530f,
+    0x53075300, 0x531c5305, 0x53115310, 0x53145317, 0x50f15370, 0x50cf50f4, 0x50c050cd, 0x50d150c7,
+    0x503d50d4, 0x500c5030, 0x50005003, 0x50045001, 0x50155010, 0x5073507c, 0x50715070, 0x504d5074,
+    0x50475040, 0x51cc51f0, 0x51c551c1, 0x51d051dc, 0x51315133, 0x510d5135, 0x51015100, 0x511f5107,
+    0x5171511d, 0x5140514f, 0x51445141, 0x5153515c, 0x57ff5151, 0x57f757fd, 0x57df57f5, 0x57d757dd,
+    0x570c57d5, 0x57015703, 0x577f5704, 0x5777577d, 0x57405775, 0x575d575f, 0x57555757, 0x54c354f0,
+    0x54dc54c4, 0x543c54d0, 0x5400540f, 0x541c5405, 0x54145411, 0x5441544f, 0x55fd55ff, 0x55f555f7,
+    0x55dd55df, 0x55d555d7, 0x5503550c, 0x557f5501, 0x5577557d, 0x55405575, 0x555d555f, 0x55555557
+);
+
+#enddecl(IQ1_TABLE)
+
+#decl(IQ1_S)
+
+struct iq1_s {
+    d: f16,
+    qs: array<f16, 16>,   // 32 bytes: 8-bit grid indices
+    qh: array<f16, 8>     // 8 u16 words: 3 high index bits per group, scale, delta sign
+};
+
+fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
+    let block = src0[src0_idx_base + offset];
+    let d = f32(block.d);
+    var src1_i = src1_idx_base + offset * 256;
+    var sum = 0.0;
+    for (var ib: u32 = 0; ib < 8; ib++) {
+        let qh = bitcast<u32>(vec2<f16>(block.qh[ib], 0.0));
+        let dl = d * (2 * f32((qh >> 12) & 7) + 1);
+        let delta = select(IQ1_DELTA, -IQ1_DELTA, (qh & 0x8000) != 0);
+        let qs_w = bitcast<u32>(vec2<f16>(block.qs[ib * 2], block.qs[ib * 2 + 1]));
+        for (var l: u32 = 0; l < 4; l++) {
+            let ig = (get_byte(qs_w, l) | (((qh >> (3 * l)) & 7) << 8)) * 8;
+            for (var j: u32 = 0; j < 8; j++) {
+                // iq1_grid packs sixteen 2-bit grid codes per u32
+                let gw = iq1_grid[(ig + j) / 16];
+                let g = (gw >> (((ig + j) % 16) * 2)) & 3;
+                // sign-extend the 2-bit code to i32
+                let gs = bitcast<i32>(g << 30) >> 30;
+                sum += dl * (f32(gs) + delta) * src1[src1_i];
+                src1_i++;
+            }
+        }
+    }
+    return sum;
+}
+
+#enddecl(IQ1_S)
+
+#decl(IQ1_M)
+
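+// IQ1_M packs 1.75 bits per weight. Unlike IQ1_S there is no f16 d member:
+// the block scale is scattered over the top nibble of each 16-bit half of the
+// scales words and reassembled below, each group of 16 values gets a 3-bit
+// sub-scale, and the qh bits supply both the high grid-index bits and the
+// sign of the per-half IQ1_DELTA shift.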
+
+#decl(IQ4_TABLE)
+
+const kvalues_iq4nl = array<i32, 16>(
+    -127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113
+);
+
+#enddecl(IQ4_TABLE)
+
+#decl(IQ4_NL)
+
+struct iq4_nl {
+    d: f16,
+    qs: array<f16, 8>,
+}
+
+fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
+    let block = src0[src0_idx_base + offset];
+    let d = f32(block.d);
+    var src1_i = src1_idx_base + offset * 32;
+    var sum = 0.0;
+    var qs: array<u32, 4>;
+    for (var i: u32 = 0; i < 4; i++) {
+        qs[i] = bitcast<u32>(vec2<f16>(block.qs[i * 2], block.qs[i * 2 + 1]));
+    }
+    for (var j: u32 = 0; j < 16; j++) {
+        let qsb = get_byte(qs[j / 4], j % 4);
+        sum += d * f32(kvalues_iq4nl[qsb & 0xF]) * src1[src1_i];
+        sum += d * f32(kvalues_iq4nl[qsb >> 4]) * src1[src1_i + 16];
+        src1_i++;
+    }
+    return sum;
+}
+
+#enddecl(IQ4_NL)
+
+#decl(IQ4_XS)
+
+struct iq4_xs {
+    d: f16,
+    scales_h: f16,
+    scales_l: u32,
+    qs: array<u32, 32>
+};
+
+fn multiply_add(src0_idx_base: u32, src1_idx_base: u32, offset: u32) -> f32 {
+    let block = src0[src0_idx_base + offset];
+    let d = f32(block.d);
+    let scales_h = bitcast<u32>(vec2<f16>(block.scales_h, 0.0));
+    var src1_i = src1_idx_base + offset * 256;
+    var sum = 0.0;
+    for (var ib: u32 = 0; ib < 8; ib++) {
+        let ls = ((get_byte(block.scales_l, ib / 2) >> (4 * (ib % 2))) & 0xF) | (((scales_h >> (2 * ib)) & 3) << 4);
+        let dl = d * (f32(ls) - 32.0);
+        for (var j: u32 = 0; j < 16; j++) {
+            let iqs = ib * 16 + j;
+            let qsb = get_byte(block.qs[iqs / 4], iqs % 4);
+            sum += dl * f32(kvalues_iq4nl[qsb & 0xF]) * src1[src1_i];
+            sum += dl * f32(kvalues_iq4nl[qsb >> 4]) * src1[src1_i + 16];
+            src1_i++;
+        }
+        src1_i += 16;
+    }
+    return sum;
+}
+
+#enddecl(IQ4_XS)
+
+#end(DECLS)
+
+#define(SHADER)
+
+enable f16;
+
+DECLS
+
+struct MulMatParams {
+    offset_src0: u32, // in elements/blocks
+    offset_src1: u32, // in elements/blocks
+    offset_dst: u32, // in elements/blocks
+    m: u32,
+    n: u32,
+    k: u32,
+    // all strides are in elements/blocks
+    stride_01: u32,
+    stride_11: u32,
+    stride_02: u32,
+    stride_12: u32,
+    stride_03: u32,
+    stride_13: u32,
+
+    bs02: u32,
+    bs03: u32,
+    broadcast2: u32,
+    broadcast3: u32
+};
+
+@group(0) @binding(0) var<storage, read_write> src0: array<{{SRC0_TYPE}}>; // N rows, K columns
+@group(0) @binding(1) var<storage, read_write> src1: array<{{SRC1_TYPE}}>; // M rows, K columns (transposed)
+@group(0) @binding(2) var<storage, read_write> dst: array<f32>; // M rows, N columns
+
+@group(0) @binding(3) var<uniform> params: MulMatParams;
+
+@compute @workgroup_size(64)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
+    let total = params.m * params.n * params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3;
+    if (global_id.x >= total) {
+        return;
+    }
+
+    let dst2_stride = params.m * params.n;
+    let dst3_stride = dst2_stride * params.bs02 * params.broadcast2;
+
+    let dst3_idx = global_id.x / dst3_stride;
+    let src03_idx = dst3_idx / params.broadcast3; // src0 may be broadcast along the third dimension
+    let src13_idx = dst3_idx; // src1 is not broadcast
+    let dst3_rem = global_id.x % dst3_stride;
+
+    let dst2_idx = dst3_rem / dst2_stride;
+    let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension
+    let src12_idx = dst2_idx; // src1 is not broadcast
+
+    let dst2_rem = dst3_rem % dst2_stride;
+
+    let row = dst2_rem / params.n; // output row
+    let col = dst2_rem % params.n; // output column
+
+    let src0_idx_base = params.offset_src0 + src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01;
+    let src1_idx_base = params.offset_src1 + src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11;
+
+    var sum = 0.0;
+    for (var i: u32 = 0u; i < params.k/{{BLOCK_SIZE}}; i = i + 1u) {
+        sum += multiply_add(src0_idx_base, src1_idx_base, i);
+    }
+    dst[params.offset_dst + dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum;
+}
+
+#end(SHADER)
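The IQ1 paths above recover signed 2-bit grid codes with a shift pair, `bitcast<i32>(g << 30) >> 30`: the field is moved into the top two bits of the word and an arithmetic shift brings it back down, replicating the sign bit on the way. A standalone sketch of the same trick (a hypothetical C++ illustration, not part of the shader or of this patch):

    #include <cassert>
    #include <cstdint>

    // Sign-extend a 2-bit field the way the WGSL does with
    // bitcast<i32>(g << 30) >> 30. Right-shifting a negative signed value
    // is an arithmetic shift on the targets ggml cares about (and is
    // guaranteed by the standard from C++20 onward).
    static int32_t sign_extend_2bit(uint32_t g) {
        return (int32_t)(g << 30) >> 30;
    }

    int main() {
        assert(sign_extend_2bit(0u) ==  0);
        assert(sign_extend_2bit(1u) ==  1);
        assert(sign_extend_2bit(2u) == -2);
        assert(sign_extend_2bit(3u) == -1);
        return 0;
    }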
diff --git a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl b/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl
deleted file mode 100644
index 054aab566f96b..0000000000000
--- a/ggml/src/ggml-webgpu/wgsl-shaders/mul_mat.wgsl
+++ /dev/null
@@ -1,56 +0,0 @@
-struct MulMatParams {
-    m: u32,
-    n: u32,
-    k: u32,
-    // all strides are in elements
-    stride_01: u32,
-    stride_11: u32,
-    stride_02: u32,
-    stride_12: u32,
-    stride_03: u32,
-    stride_13: u32,
-
-    bs02: u32,
-    bs03: u32,
-    broadcast2: u32,
-    broadcast3: u32
-};
-
-@group(0) @binding(0) var<storage, read_write> src0: array<f32>; // N rows, K columns
-@group(0) @binding(1) var<storage, read_write> src1: array<f32>; // M rows, K columns (transposed)
-@group(0) @binding(2) var<storage, read_write> dst: array<f32>; // M rows, N columns
-
-@group(0) @binding(3) var<uniform> params: MulMatParams;
-
-@compute @workgroup_size(64)
-fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
-    let total = params.m * params.n * params.bs02 * params.broadcast2 * params.bs03 * params.broadcast3;
-    if (global_id.x >= total) {
-        return;
-    }
-
-    let dst2_stride = params.m * params.n;
-    let dst3_stride = dst2_stride * params.bs02 * params.broadcast2;
-
-    let dst3_idx = global_id.x / dst3_stride;
-    let src03_idx = dst3_idx / params.broadcast3; // src0 may be broadcast along the third dimension
-    let src13_idx = dst3_idx; // src1 is not broadcast
-    let dst3_rem = global_id.x % dst3_stride;
-
-    let dst2_idx = dst3_rem / dst2_stride;
-    let src02_idx = dst2_idx / params.broadcast2; // src0 may also be broadcast along the second dimension
-    let src12_idx = dst2_idx; // src1 is not broadcast
-
-    let dst2_rem = dst3_rem % dst2_stride;
-
-    let row = dst2_rem / params.n; // output row
-    let col = dst2_rem % params.n; // output column
-
-    var sum = 0.0;
-    for (var i: u32 = 0u; i < params.k; i = i + 1u) {
-        let src0_idx = src03_idx * params.stride_03 + src02_idx * params.stride_02 + col * params.stride_01 + i;
-        let src1_idx = src13_idx * params.stride_13 + src12_idx * params.stride_12 + row * params.stride_11 + i;
-        sum = sum + src0[src0_idx] * src1[src1_idx];
-    }
-    dst[dst3_idx * dst3_stride + dst2_idx * dst2_stride + row * params.n + col] = sum;
-}

From e92734d51bcb82cc35f0a6b5a14928f0036b2c90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Fri, 22 Aug 2025 23:47:01 +0200
Subject: [PATCH 127/140] test-opt: allow slight imprecision (#15503)
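For context: the hunks below replace exact floating-point comparisons such as `loss == 33.0` with calls to the tests' `almost_equal(observed, expected, atol)` helper, since losses accumulated in a different order can differ in the last bits. A minimal sketch of an absolute-tolerance comparison in that spirit (an assumed shape of the helper; the actual definition in test-opt.cpp may differ):

    #include <cmath>

    // Absolute-tolerance comparison; NaN compares unequal, like operator==.
    static bool almost_equal(double a, double b, double atol) {
        return std::fabs(a - b) <= atol;
    }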
---
 tests/test-opt.cpp | 32 ++++++++++++++++++++------------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp
index f02b4cad8c674..18d3fcf2cb948 100644
--- a/tests/test-opt.cpp
+++ b/tests/test-opt.cpp
@@ -358,7 +358,7 @@ static std::pair<int, int> test_forward_backward(
     double accuracy;
     double accuracy_unc;
     ggml_opt_result_accuracy(cd.result, &accuracy, &accuracy_unc);
-    const bool subtest_ok = ndata == 0 && loss == 0.0 && std::isnan(loss_unc) && std::isnan(accuracy) && std::isnan(accuracy_unc);
+    const bool subtest_ok = ndata == 0 && almost_equal(loss, 0.0, 1e-6) && std::isnan(loss_unc) && std::isnan(accuracy) && std::isnan(accuracy_unc);
 
     helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "results_initial", subtest_ok, ntest, npass);
 }
@@ -381,10 +381,12 @@ static std::pair<int, int> test_forward_backward(
     {
         float weights;
         ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
-        const bool subtest_ok = weights == ndata/2;
+        const bool subtest_ok = almost_equal(weights, ndata/2, 1e-10);
         helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward", subtest_ok, ntest, npass);
     }
     {
+        constexpr double atol = 1e-10;
+
         int64_t ndata;
         ggml_opt_result_ndata(cd.result, &ndata);
         bool subtest_ok = ndata == 6;
@@ -392,7 +394,7 @@
         double loss;
         double loss_unc;
         ggml_opt_result_loss(cd.result, &loss, &loss_unc);
-        subtest_ok = subtest_ok && loss == 33.0 && almost_equal(loss_unc, sqrt(3.5), 1e-10);
+        subtest_ok = subtest_ok && almost_equal(loss, 33.0, atol) && almost_equal(loss_unc, sqrt(3.5), atol);
 
         double accuracy;
         double accuracy_unc;
@@ -437,7 +439,7 @@
     {
         float weights;
         ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
-        const bool subtest_ok = weights == -ndata * .5;
+        const bool subtest_ok = almost_equal(weights, -ndata * 0.5, 1e-10);
         helper_after_test_forward_backward(optim, __func__, high_level, shuffle, "weights_after_forward_backward", subtest_ok, ntest, npass);
     }
     {
@@ -448,7 +450,7 @@
         double loss;
         double loss_unc;
         ggml_opt_result_loss(cd.result, &loss, &loss_unc);
-        subtest_ok = subtest_ok && loss == 18.0 && (shuffle || loss_unc == 0.0);
+        subtest_ok = subtest_ok && almost_equal(loss, 18.0, 1e-10) && (shuffle || loss_unc == 0.0);
 
         double accuracy;
         double accuracy_unc;
@@ -550,10 +552,12 @@ static std::pair<int, int> test_idata_split(
     if (adamw) {
         float weights;
         ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float));
-        const bool subtest_ok = weights == ndata/2 - epoch*idata_split;
+        const bool subtest_ok = almost_equal(weights, ndata/2 - epoch*idata_split, 1e-10);
         helper_after_test_idata_split(optim, __func__, high_level, epoch, "weights", subtest_ok, ntest, npass);
     }
     if (adamw) {
+        constexpr double atol = 1e-10;
+
         int64_t ndata_result;
         ggml_opt_result_ndata(cd.result, &ndata_result);
         bool subtest_ok = ndata_result == idata_split;
@@ -561,7 +565,7 @@
         double loss;
         double loss_unc;
         ggml_opt_result_loss(cd.result, &loss, &loss_unc);
-        subtest_ok = subtest_ok && loss == 28.0 - epoch*16.0 && loss_unc == 0.0;
+        subtest_ok = subtest_ok && almost_equal(loss, 28.0 - epoch*16.0, atol) && almost_equal(loss_unc, 0.0, atol);
 
         double accuracy;
         double accuracy_unc;
@@ -571,6 +575,8 @@
         helper_after_test_idata_split(optim, __func__, high_level, epoch, "results_backward", subtest_ok, ntest, npass);
     }
     if (adamw) {
+        constexpr double atol = 1e-10;
+
         int64_t ndata_result;
         ggml_opt_result_ndata(cd.result2, &ndata_result);
         bool subtest_ok = ndata_result == ndata - idata_split;
@@ -578,7 +584,7 @@
         double loss;
         double loss_unc;
         ggml_opt_result_loss(cd.result2, &loss, &loss_unc);
-        subtest_ok = subtest_ok && loss == 15.0 - epoch*8 && almost_equal(loss_unc, sqrt(0.5), 1e-10);
+        subtest_ok = subtest_ok && almost_equal(loss, 15.0 - epoch*8, atol) && almost_equal(loss_unc, sqrt(0.5), atol);
 
         double accuracy;
         double accuracy_unc;
@@ -687,22 +693,24 @@ static std::pair<int, int> test_gradient_accumulation(
     }
     bool const adamw = optim == GGML_OPT_OPTIMIZER_TYPE_ADAMW;
     if (adamw) {
+        constexpr double atol = 1e-6;
        float weights;
ggml_backend_tensor_get(cd.weights, &weights, 0, sizeof(float)); - const bool subtest_ok = weights == (ndata/2) - epoch; + const bool subtest_ok = almost_equal(weights, (ndata/2) - epoch, atol); helper_after_test_gradient_accumulation(optim, __func__, nbatch_physical, loss_type, epoch, "weights", subtest_ok, ntest, npass); } { + constexpr double atol = 1e-6; int64_t ndata_result; ggml_opt_result_ndata(cd.result, &ndata_result); - bool subtest_ok = ndata_result == ndata/nbatch_physical; + bool subtest_ok = almost_equal(ndata_result, ndata/nbatch_physical, atol); double loss; ggml_opt_result_loss(cd.result, &loss, /*loss_unc =*/ nullptr); if (loss_type == GGML_OPT_LOSS_TYPE_SUM) { - subtest_ok = subtest_ok && loss == (39.0 - epoch*6.0); + subtest_ok = subtest_ok && almost_equal(loss, (39.0 - epoch*6.0), atol); } else if (loss_type == GGML_OPT_LOSS_TYPE_MEAN) { - subtest_ok = subtest_ok && almost_equal(loss, (39.0 - epoch*6.0) / ndata, 1e-6); + subtest_ok = subtest_ok && almost_equal(loss, (39.0 - epoch*6.0) / ndata, atol); } else { GGML_ASSERT(false); } From 330c3d2d21b55bca5517db7d2eea2ea8f131df4a Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sat, 23 Aug 2025 01:31:54 -0500 Subject: [PATCH 128/140] vulkan: optimize mul_mat_id loading row ids into shared memory (#15427) - Spread the work across the whole workgroup. Using more threads seems to far outweigh the synchronization overhead. - Specialize the code for when the division is by a power of two. --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 6 +- .../ggml-vulkan/vulkan-shaders/mul_mm.comp | 105 +++++++++++------- .../vulkan-shaders/mul_mm_cm2.comp | 103 ++++++++++------- 3 files changed, 133 insertions(+), 81 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index fb18a55cdad2c..2c5678f4884cf 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -2168,9 +2168,9 @@ static void ggml_vk_load_shaders(vk_device& device) { s_mmq_wg_denoms_k = { 32, 64, 1 }; // spec constants and tile sizes for quant matmul_id - l_warptile_mmqid = { 256, 128, 128, 16, 0 }; - m_warptile_mmqid = { 256, 128, 64, 16, 0 }; - s_warptile_mmqid = { 256, 128, 64, 16, 0 }; + l_warptile_mmqid = { 256, 128, 128, 16, 0, device->subgroup_size }; + m_warptile_mmqid = { 256, 128, 64, 16, 0, device->subgroup_size }; + s_warptile_mmqid = { 256, 128, 64, 16, 0, device->subgroup_size }; l_mmqid_wg_denoms = { 128, 128, 1 }; m_mmqid_wg_denoms = { 128, 64, 1 }; s_mmqid_wg_denoms = { 128, 64, 1 }; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp index a61a464c7bef8..d57cc6bdec5df 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm.comp @@ -103,16 +103,74 @@ layout (constant_id = 10) const uint WARP = 32; shared FLOAT_TYPE buf_a[BM * SHMEM_STRIDE]; shared FLOAT_TYPE buf_b[BN * SHMEM_STRIDE]; +#define NUM_WARPS (BLOCK_SIZE / WARP) + #ifdef MUL_MAT_ID shared u16vec2 row_ids[4096]; uint _ne1; #ifdef COOPMAT -shared uint _ne1_sh; +shared uvec4 ballots_sh[NUM_WARPS]; +void load_row_ids(uint expert_idx, bool nei0_is_pow2) { + _ne1 = 0; + uint num_elements = p.nei1 * p.nei0; + uint nei0shift = findLSB(p.nei0); + + uint ids[16]; + uint iter = 0; + + for (uint j = 0; j < num_elements; j += BLOCK_SIZE) { + // prefetch up to 16 elements + if (iter == 0) { + [[unroll]] for (uint k = 0; k < 16; ++k) { + uint i = j + gl_LocalInvocationIndex + k*BLOCK_SIZE; + bool in_range = i < num_elements; 
+ uint ii1; + if (nei0_is_pow2) { + ii1 = i >> nei0shift; + } else { + ii1 = i / p.nei0; + } + uint ii0 = i - ii1 * p.nei0; + ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0; + } + } + uint i = j + gl_LocalInvocationIndex; + bool in_range = i < num_elements; + uint ii1; + if (nei0_is_pow2) { + ii1 = i >> nei0shift; + } else { + ii1 = i / p.nei0; + } + uint ii0 = i - ii1 * p.nei0; + uint id = ids[iter++]; + uvec4 ballot = subgroupBallot(in_range && id == expert_idx); + + ballots_sh[gl_SubgroupID] = ballot; + barrier(); + + uint subgroup_base = 0; + uint total = 0; + for (uint k = 0; k < gl_NumSubgroups; ++k) { + if (k == gl_SubgroupID) { + subgroup_base = total; + } + total += subgroupBallotBitCount(ballots_sh[k]); + } + barrier(); + + uint idx = subgroup_base + subgroupBallotExclusiveBitCount(ballot); + if (in_range && id == expert_idx) { + row_ids[_ne1 + idx] = u16vec2(ii0, ii1); + } + _ne1 += total; + iter &= 15; + } + barrier(); +} #endif #endif // MUL_MAT_ID -#define NUM_WARPS (BLOCK_SIZE / WARP) - #ifdef COOPMAT shared ACC_TYPE coopmat_stage[TM * TN * NUM_WARPS]; #endif @@ -178,44 +236,11 @@ void main() { #ifdef MUL_MAT_ID #ifdef COOPMAT - // Spread the search across all elements in the first subgroup - if (gl_SubgroupID == 0) { - _ne1 = 0; - uint num_elements = p.nei1 * p.nei0; - - uint ids[16]; - uint iter = 0; - - for (uint j = 0; j < num_elements; j += gl_SubgroupSize) { - // prefetch up to 16 elements - if (iter == 0) { - [[unroll]] for (uint k = 0; k < 16; ++k) { - uint i = j + gl_SubgroupInvocationID + k*gl_SubgroupSize; - bool in_range = i < num_elements; - uint ii1 = i / p.nei0; - uint ii0 = i % p.nei0; - ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0; - } - } - uint i = j + gl_SubgroupInvocationID; - bool in_range = i < num_elements; - uint ii1 = i / p.nei0; - uint ii0 = i % p.nei0; - uint id = ids[iter++]; - uvec4 ballot = subgroupBallot(in_range && id == expert_idx); - uint idx = subgroupBallotExclusiveBitCount(ballot); - if (in_range && id == expert_idx) { - row_ids[_ne1 + idx] = u16vec2(ii0, ii1); - } - _ne1 += subgroupBallotBitCount(ballot); - iter &= 15; - } - _ne1_sh = _ne1; + if (bitCount(p.nei0) == 1) { + load_row_ids(expert_idx, true); + } else { + load_row_ids(expert_idx, false); } - - barrier(); - - _ne1 = _ne1_sh; #else _ne1 = 0; for (uint ii1 = 0; ii1 < p.nei1; ii1++) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp index 29e4b5c9ce2d4..4d16eb0791ddc 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mm_cm2.comp @@ -19,6 +19,7 @@ #endif #include "types.comp" +#include "utils.comp" layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; @@ -99,7 +100,8 @@ layout(buffer_reference, std430, buffer_reference_align = 2) buffer decodeBufB { }; uint _ne1; -shared uint _ne1_sh; +layout (constant_id = 5) const uint subgroup_size = 32; +shared uvec4 ballots_sh[BLOCK_SIZE / subgroup_size]; B_TYPE decodeFuncB(const in decodeBufB bl, const in uint blockCoords[2], const in uint coordInBlock[2]) { @@ -128,6 +130,64 @@ D_TYPE perElemOpD(const in uint32_t r, const in uint32_t c, const in D_TYPE elem return elem; } +void load_row_ids(uint expert_idx, bool nei0_is_pow2) { + _ne1 = 0; + uint num_elements = p.nei1 * p.nei0; + uint nei0shift = findLSB(p.nei0); + + uint ids[16]; + uint iter = 0; + + for (uint j = 0; j < num_elements; j += BLOCK_SIZE) { + // prefetch up to 16 elements + if (iter == 0) { + [[unroll]] for (uint k = 0; k < 
16; ++k) { + uint i = j + gl_LocalInvocationIndex + k*BLOCK_SIZE; + bool in_range = i < num_elements; + uint ii1; + if (nei0_is_pow2) { + ii1 = i >> nei0shift; + } else { + ii1 = i / p.nei0; + } + uint ii0 = i - ii1 * p.nei0; + ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0; + } + } + uint i = j + gl_LocalInvocationIndex; + bool in_range = i < num_elements; + uint ii1; + if (nei0_is_pow2) { + ii1 = i >> nei0shift; + } else { + ii1 = i / p.nei0; + } + uint ii0 = i - ii1 * p.nei0; + uint id = ids[iter++]; + uvec4 ballot = subgroupBallot(in_range && id == expert_idx); + + ballots_sh[gl_SubgroupID] = ballot; + barrier(); + + uint subgroup_base = 0; + uint total = 0; + for (uint k = 0; k < gl_NumSubgroups; ++k) { + if (k == gl_SubgroupID) { + subgroup_base = total; + } + total += subgroupBallotBitCount(ballots_sh[k]); + } + barrier(); + + uint idx = subgroup_base + subgroupBallotExclusiveBitCount(ballot); + if (in_range && id == expert_idx) { + row_ids[_ne1 + idx] = u16vec4(fastmod(ii0, p.ne11), ii1, ii0, 0); + } + _ne1 += total; + iter &= 15; + } + barrier(); +} #endif void main() { @@ -157,45 +217,12 @@ void main() { const uint ic = gl_WorkGroupID.y; #ifdef MUL_MAT_ID - // Spread the search across all elements in the first subgroup - if (gl_SubgroupID == 0) { - _ne1 = 0; - uint num_elements = p.nei1 * p.nei0; - - uint ids[16]; - uint iter = 0; - - for (uint j = 0; j < num_elements; j += gl_SubgroupSize) { - // prefetch up to 16 elements - if (iter == 0) { - [[unroll]] for (uint k = 0; k < 16; ++k) { - uint i = j + gl_SubgroupInvocationID + k*gl_SubgroupSize; - bool in_range = i < num_elements; - uint ii1 = i / p.nei0; - uint ii0 = i % p.nei0; - ids[k] = in_range ? data_ids[ii1*p.nbi1 + ii0] : 0; - } - } - uint i = j + gl_SubgroupInvocationID; - bool in_range = i < num_elements; - uint ii1 = i / p.nei0; - uint ii0 = i % p.nei0; - uint id = ids[iter++]; - uvec4 ballot = subgroupBallot(in_range && id == expert_idx); - uint idx = subgroupBallotExclusiveBitCount(ballot); - if (in_range && id == expert_idx) { - row_ids[_ne1 + idx] = u16vec4(ii0 % p.ne11, ii1, ii0, 0); - } - _ne1 += subgroupBallotBitCount(ballot); - iter &= 15; - } - _ne1_sh = _ne1; + if (bitCount(p.nei0) == 1) { + load_row_ids(expert_idx, true); + } else { + load_row_ids(expert_idx, false); } - barrier(); - - _ne1 = _ne1_sh; - // Workgroup has no work if (ic * BN >= _ne1) return; #endif From 0a9b43e507a359ca392c037cf341f55137ad0b69 Mon Sep 17 00:00:00 2001 From: Acly Date: Sat, 23 Aug 2025 08:35:21 +0200 Subject: [PATCH 129/140] vulkan : support ggml_mean (#15393) * vulkan : support ggml_mean * vulkan : support sum, sum_rows and mean with non-contiguous tensors * vulkan : fix subbuffer size not accounting for misalign offset * tests : add backend-op tests for non-contiguous sum_rows * cuda : require contiguous src for SUM_ROWS, MEAN support * sycl : require contiguous src for SUM, SUM_ROWS, ARGSORT support * require ggml_contiguous_rows in supports_op and expect nb00=1 in the shader --- ggml/src/ggml-cuda/ggml-cuda.cu | 4 +- ggml/src/ggml-sycl/ggml-sycl.cpp | 3 +- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 82 +++++++++++++++++-- .../ggml-vulkan/vulkan-shaders/sum_rows.comp | 43 ++++++++-- tests/test-backend-ops.cpp | 21 ++++- 5 files changed, 135 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index d29a0b573f193..aa45ab39ed89e 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3485,11 +3485,11 @@ static bool 
ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_OP_CONV_TRANSPOSE_2D: case GGML_OP_POOL_2D: case GGML_OP_SUM: - case GGML_OP_SUM_ROWS: - case GGML_OP_MEAN: case GGML_OP_ARGSORT: case GGML_OP_ACC: return true; + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: case GGML_OP_GROUP_NORM: return ggml_is_contiguous(op->src[0]); case GGML_OP_UPSCALE: diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index a0a650e92e442..12dd5dd2e6287 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4391,10 +4391,11 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g return true; case GGML_OP_UPSCALE: return op->src[0]->type == GGML_TYPE_F32 && op->op_params[0] == GGML_SCALE_MODE_NEAREST; - case GGML_OP_POOL_2D: case GGML_OP_SUM: case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: + return ggml_is_contiguous(op->src[0]); + case GGML_OP_POOL_2D: case GGML_OP_ACC: case GGML_OP_PAD: case GGML_OP_LEAKY_RELU: diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 2c5678f4884cf..007556cf4f403 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -1015,6 +1015,39 @@ struct vk_op_upscale_push_constants { float sf0; float sf1; float sf2; float sf3; }; +struct vk_op_sum_rows_push_constants +{ + uint32_t n_cols; + uint32_t ne01, ne02; + uint32_t nb01, nb02, nb03; + uint32_t nb11, nb12, nb13; + float weight; + uint32_t misalign_offsets; + uint32_t ne0_12mp, ne0_12L; + uint32_t ne0_1mp, ne0_1L; +}; + +vk_op_sum_rows_push_constants vk_op_sum_rows_push_constants_init(const ggml_tensor * src, const ggml_tensor * dst, int64_t n_cols) { + uint32_t type_size = (uint32_t)ggml_type_size(src->type); + vk_op_sum_rows_push_constants p = {}; + p.n_cols = (uint32_t)n_cols; + p.ne01 = (uint32_t)src->ne[1]; + p.ne02 = (uint32_t)src->ne[2]; + p.nb01 = (uint32_t)src->nb[1] / type_size; + p.nb02 = (uint32_t)src->nb[2] / type_size; + p.nb03 = (uint32_t)src->nb[3] / type_size; + p.nb11 = (uint32_t)dst->nb[1] / type_size; + p.nb12 = (uint32_t)dst->nb[2] / type_size; + p.nb13 = (uint32_t)dst->nb[3] / type_size; + p.weight = 1.0f; + return p; +} + +template <> void init_pushconst_fastdiv(vk_op_sum_rows_push_constants &p) { + init_fastdiv_values(p.ne01*p.ne02, p.ne0_12mp, p.ne0_12L); + init_fastdiv_values(p.ne01, p.ne0_1mp, p.ne0_1L); +} + // Allow pre-recording command buffers struct vk_staging_memcpy { vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {} @@ -3128,7 +3161,7 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_argmax_f32, "argmax_f32", argmax_f32_len, argmax_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(device, device->pipeline_sum_rows_f32, "sum_rows_f32", sum_rows_f32_len, sum_rows_f32_data, "main", 2, sizeof(vk_op_sum_rows_push_constants), {1, 1, 1}, { device->subgroup_size }, 1); ggml_vk_create_pipeline(device, device->pipeline_count_equal_i32, "count_equal_i32", count_equal_i32_len, count_equal_i32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, { device->subgroup_size }, 1); @@ -7249,6 +7282,7 @@ static vk_pipeline 
ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return nullptr; case GGML_OP_SUM: case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_sum_rows_f32; } @@ -7387,6 +7421,9 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { case GGML_OP_CONV_2D_DW: case GGML_OP_IM2COL: case GGML_OP_SET_ROWS: + case GGML_OP_SUM: + case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: return true; default: return false; @@ -7421,6 +7458,16 @@ template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk GGML_UNUSED(src2); } +template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_sum_rows_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { + const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); + const uint32_t d_offset = get_misalign_bytes(ctx, dst) / ggml_type_size(dst->type); + + p.misalign_offsets = (a_offset << 16) | d_offset; + + GGML_UNUSED(src1); + GGML_UNUSED(src2); +} + template <> void init_pushconst_tensor_offsets(ggml_backend_vk_context * ctx, vk_op_binary_push_constants &p, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst) { const uint32_t a_offset = get_misalign_bytes(ctx, src0) / ggml_type_size(src0->type); const uint32_t b_offset = get_misalign_bytes(ctx, src1) / ggml_type_size(src1->type); @@ -7571,10 +7618,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co d_buf_offset &= ~(ctx->device->properties.limits.minStorageBufferOffsetAlignment - 1); if (op_supports_incontiguous) { - x_sz = ggml_nbytes(src0); - y_sz = use_src1 ? ggml_nbytes(src1) : 0; - z_sz = use_src2 ? ggml_nbytes(src2) : 0; - d_sz = ggml_nbytes(dst); + x_sz = ggml_nbytes(src0) + get_misalign_bytes(ctx, src0); + y_sz = use_src1 ? ggml_nbytes(src1) + get_misalign_bytes(ctx, src1) : 0; + z_sz = use_src2 ? 
ggml_nbytes(src2) + get_misalign_bytes(ctx, src2) : 0; + d_sz = ggml_nbytes(dst) + get_misalign_bytes(ctx, dst); if (x_buf_offset + x_sz >= d_X->size) { x_sz = VK_WHOLE_SIZE; @@ -7602,6 +7649,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: case GGML_OP_ARGMAX: { const uint32_t nr = ggml_nrows(src0); @@ -8588,11 +8636,19 @@ static void ggml_vk_argsort(ggml_backend_vk_context * ctx, vk_context& subctx, c } static void ggml_vk_sum(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM, { (uint32_t)ggml_nelements(src0), 0, 0.0f, 0.0f }, dryrun); + vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, ggml_nelements(src0)); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM, p, dryrun); } static void ggml_vk_sum_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, { (uint32_t)src0->ne[0], 0, 0.0f, 0.0f }, dryrun); + vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SUM_ROWS, p, dryrun); +} + +static void ggml_vk_mean(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { + vk_op_sum_rows_push_constants p = vk_op_sum_rows_push_constants_init(src0, dst, src0->ne[0]); + p.weight = 1.0f / (float)src0->ne[0]; + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_MEAN, p, dryrun); } static void ggml_vk_argmax(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { @@ -9815,6 +9871,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_ARGSORT: case GGML_OP_SUM: case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: case GGML_OP_ARGMAX: case GGML_OP_COUNT_EQUAL: case GGML_OP_IM2COL: @@ -9884,6 +9941,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_ARGSORT: case GGML_OP_SUM: case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: case GGML_OP_ARGMAX: case GGML_OP_COUNT_EQUAL: case GGML_OP_IM2COL: @@ -10087,6 +10145,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr case GGML_OP_SUM_ROWS: ggml_vk_sum_rows(ctx, compute_ctx, src0, node, dryrun); + break; + case GGML_OP_MEAN: + ggml_vk_mean(ctx, compute_ctx, src0, node, dryrun); + break; case GGML_OP_ARGMAX: ggml_vk_argmax(ctx, compute_ctx, src0, node, dryrun); @@ -10246,6 +10308,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph * case GGML_OP_ARGSORT: case GGML_OP_SUM: case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: case GGML_OP_ARGMAX: case GGML_OP_COUNT_EQUAL: case GGML_OP_IM2COL: @@ -11483,8 +11546,11 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: + return true; case GGML_OP_SUM: case GGML_OP_SUM_ROWS: + case GGML_OP_MEAN: + return op->src[0]->type == GGML_TYPE_F32 && ggml_is_contiguous_rows(op->src[0]); case GGML_OP_ARGMAX: case GGML_OP_COUNT_EQUAL: case GGML_OP_IM2COL: @@ -12043,6 +12109,8 @@ static 
void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_cgraph *
         tensor_clone = ggml_sum(ggml_ctx, src_clone[0]);
     } else if (tensor->op == GGML_OP_SUM_ROWS) {
         tensor_clone = ggml_sum_rows(ggml_ctx, src_clone[0]);
+    } else if (tensor->op == GGML_OP_MEAN) {
+        tensor_clone = ggml_mean(ggml_ctx, src_clone[0]);
     } else if (tensor->op == GGML_OP_ARGMAX) {
         tensor_clone = ggml_argmax(ggml_ctx, src_clone[0]);
     } else if (tensor->op == GGML_OP_COUNT_EQUAL) {
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp b/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp
index 961e5ffa1f56f..759204afaf9aa 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/sum_rows.comp
@@ -1,9 +1,9 @@
 #version 450
 
-#include "generic_head.comp"
 #include "types.comp"
 
 #extension GL_EXT_control_flow_attributes : enable
+
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@@ -11,16 +11,49 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 
 layout (constant_id = 0) const uint BLOCK_SIZE = 32;
 
+layout (push_constant) uniform parameter
+{
+    uint n_cols;
+    uint ne01, ne02;
+    uint nb01, nb02, nb03;
+    uint nb11, nb12, nb13;
+    float weight;
+    uint misalign_offsets;
+    uint ne0_12mp, ne0_12L;
+    uint ne0_1mp, ne0_1L;
+} p;
+
+uint get_aoffset() { return p.misalign_offsets >> 16; }
+uint get_doffset() { return p.misalign_offsets & 0xFFFF; }
+
+// see init_fastdiv_values in ggml-vulkan.cpp
+uint fastdiv(uint n, uint mp, uint L) {
+    uint msbs, lsbs;
+    // msbs = mulhi(n, mp)
+    umulExtended(n, mp, msbs, lsbs);
+    return (msbs + n) >> L;
+}
+
+
 shared FLOAT_TYPE tmp[BLOCK_SIZE];
 
 void main() {
     const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
     const uint col = gl_LocalInvocationID.x;
+    const float weight = p.weight;
+
+    const uint i03 = fastdiv(row, p.ne0_12mp, p.ne0_12L);
+    const uint i03_offset = i03 * p.ne01*p.ne02;
+    const uint i02 = fastdiv(row - i03_offset, p.ne0_1mp, p.ne0_1L);
+    const uint i01 = row - i03_offset - i02*p.ne01;
+
+    const uint src_idx = get_aoffset() + i01 * p.nb01 + i02 * p.nb02 + i03 * p.nb03;
+    const uint dst_idx = get_doffset() + i01 * p.nb11 + i02 * p.nb12 + i03 * p.nb13;
 
-    tmp[col] = FLOAT_TYPE(0.0f);
+    tmp[col] = FLOAT_TYPE(0.0);
 
-    for (uint i = col; i < p.KX; i += BLOCK_SIZE) {
-        tmp[col] += FLOAT_TYPE(data_a[row*p.KX + i]);
+    for (uint i = col; i < p.n_cols; i += BLOCK_SIZE) {
+        tmp[col] += FLOAT_TYPE(data_a[src_idx + i]);
     }
 
     barrier();
@@ -32,6 +65,6 @@ void main() {
     }
 
     if (col == 0) {
-        data_d[row] = D_TYPE(tmp[0]);
+        data_d[dst_idx] = D_TYPE(tmp[0] * weight);
     }
 }
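The shader's `fastdiv` above divides by a runtime-invariant value using a precomputed magic multiplier `mp` and shift `L` (see `init_fastdiv_values` in ggml-vulkan.cpp). A hedged host-side sketch of how such constants are typically derived and verified, following the classic division-by-invariant-integers construction; the exact ggml implementation may differ in details:

    #include <cassert>
    #include <cstdint>

    // Compute mp and L so that n / d == (mulhi(n, mp) + n) >> L for all 32-bit n.
    static void init_fastdiv_values(uint32_t d, uint32_t & mp, uint32_t & L) {
        L = 0;
        while (L < 32 && (uint32_t(1) << L) < d) {
            L++;                                   // L = ceil(log2(d))
        }
        mp = (uint32_t) (((uint64_t(1) << 32) * ((uint64_t(1) << L) - d)) / d + 1);
    }

    // Mirrors the shader: umulExtended yields the high word of n * mp.
    static uint32_t fastdiv(uint32_t n, uint32_t mp, uint32_t L) {
        uint32_t msbs = (uint32_t) (((uint64_t) n * mp) >> 32);
        return (msbs + n) >> L;
    }

    int main() {
        uint32_t mp, L;
        init_fastdiv_values(7, mp, L);
        for (uint32_t n = 0; n < 1000000; n++) {
            assert(fastdiv(n, mp, L) == n / 7);
        }
        return 0;
    }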
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index a51527ca55c23..2e53f8e21a5a2 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -4300,20 +4300,32 @@ struct test_sum : public test_case {
 struct test_sum_rows : public test_case {
     const ggml_type type;
     const std::array<int64_t, 4> ne;
+    const bool permute;
+    const bool slice;
 
     std::string vars() override {
-        return VARS_TO_STR2(type, ne);
+        return VARS_TO_STR4(type, ne, permute, slice);
    }
 
     test_sum_rows(ggml_type type = GGML_TYPE_F32,
-            std::array<int64_t, 4> ne = {10, 5, 4, 3})
-        : type(type), ne(ne) {}
+            std::array<int64_t, 4> ne = {10, 5, 4, 3},
+            bool permute = false, bool slice = false)
+        : type(type), ne(ne), permute(permute), slice(slice) {}
 
     ggml_tensor * build_graph(ggml_context * ctx) override {
         ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
         ggml_set_param(a);
         ggml_set_name(a, "a");
 
+        if (slice) {
+            a = ggml_view_4d(ctx, a,
+                ne[0], ne[1], ne[2] / 2, ne[3] - 1,
+                a->nb[1], a->nb[2] * 2, a->nb[3], /*offset=*/a->nb[3]);
+        }
+        if (permute) {
+            a = ggml_permute(ctx, a, 0, 2, 3, 1);
+        }
+
         ggml_tensor * out = ggml_sum_rows(ctx, a);
         ggml_set_name(out, "out");
 
@@ -6195,6 +6207,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
 
     test_cases.emplace_back(new test_sum());
     test_cases.emplace_back(new test_sum_rows());
+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 11, 5, 6, 3 }, true, false));
+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 11, 5, 6, 3 }, false, true));
+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 11, 5, 6, 3 }, true, true));
     test_cases.emplace_back(new test_mean());
     test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1, 1, 1 }));
     test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1, 1, 1 }));

From b55f06e1aa67fb10e89f53e31bbccf37eb2678ea Mon Sep 17 00:00:00 2001
From: R0CKSTAR
Date: Sat, 23 Aug 2025 14:58:57 +0800
Subject: [PATCH 130/140] vulkan.Dockerfile: install vulkan SDK using tarball (#15282)

Signed-off-by: Xiaodong Ye

---
 .devops/vulkan.Dockerfile | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile
index fcd81ffa1e94e..6cf87c67e8553 100644
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -2,14 +2,30 @@ ARG UBUNTU_VERSION=24.04
 
 FROM ubuntu:$UBUNTU_VERSION AS build
 
-# Install build tools
-RUN apt update && apt install -y git build-essential cmake wget
+# Ref: https://vulkan.lunarg.com/doc/sdk/latest/linux/getting_started.html
 
-# Install Vulkan SDK and cURL
-RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
-    apt update -y && \
-    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget xz-utils
+
+# Install Vulkan SDK
+ARG VULKAN_VERSION=1.4.321.1
+RUN ARCH=$(uname -m) && \
+    wget -qO /tmp/vulkan-sdk.tar.xz https://sdk.lunarg.com/sdk/download/${VULKAN_VERSION}/linux/vulkan-sdk-linux-${ARCH}-${VULKAN_VERSION}.tar.xz && \
+    mkdir -p /opt/vulkan && \
+    tar -xf /tmp/vulkan-sdk.tar.xz -C /tmp --strip-components=1 && \
+    mv /tmp/${ARCH}/* /opt/vulkan/ && \
+    rm -rf /tmp/*
+
+# Install cURL and Vulkan SDK dependencies
+RUN apt install -y libcurl4-openssl-dev curl \
+    libxcb-xinput0 libxcb-xinerama0 libxcb-cursor-dev
+
+# Set environment variables
+ENV VULKAN_SDK=/opt/vulkan
+ENV PATH=$VULKAN_SDK/bin:$PATH
+ENV LD_LIBRARY_PATH=$VULKAN_SDK/lib:$LD_LIBRARY_PATH
+ENV CMAKE_PREFIX_PATH=$VULKAN_SDK:$CMAKE_PREFIX_PATH
+ENV PKG_CONFIG_PATH=$VULKAN_SDK/lib/pkgconfig:$PKG_CONFIG_PATH
 
 # Build it
 WORKDIR /app

From 289bf4113ef5c02d8f5eb0cf2d86683d8b8bc4d9 Mon Sep 17 00:00:00 2001
From: Jeff Bolz
Date: Sat, 23 Aug 2025 02:31:54 -0500
Subject: [PATCH 131/140] vulkan: Rewrite synchronization to allow some overlap between nodes (#15489)

Track a list of nodes that need synchronization, and only sync if the new
node depends on them (or overwrites them). This allows some overlap, which
can improve performance, and centralizes a big chunk of the synchronization
logic.

The remaining synchronization logic involves writes to memory other than the
nodes, e.g. for dequantization or split_k. Each of these allocations has a
bool indicating whether it is in use and needs to be synced. The flag should
be checked before the buffer is written to, and set to true after its
contents are consumed.
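A rough standalone illustration of that tracking scheme (hypothetical types and names, not the actual ggml-vulkan code): memory spans read or written since the last barrier are remembered, and a barrier is emitted only when a new node would conflict with one of them.

    #include <cstdint>
    #include <vector>

    struct Span { uint64_t buf, base, size; };  // buffer handle plus byte range

    static bool overlaps(const Span & a, const Span & b) {
        return a.buf == b.buf &&
               ((b.base <= a.base && a.base < b.base + b.size) ||
                (a.base <= b.base && b.base < a.base + a.size));
    }

    struct SyncTracker {
        std::vector<Span> written, read;

        // A barrier is needed if the node writes something that was read or
        // written, or reads something that was written; read-after-read is safe.
        bool needs_sync(const Span & dst, const std::vector<Span> & srcs) const {
            for (const Span & w : written) {
                if (overlaps(dst, w)) { return true; }
                for (const Span & s : srcs) {
                    if (overlaps(s, w)) { return true; }
                }
            }
            for (const Span & r : read) {
                if (overlaps(dst, r)) { return true; }  // write-after-read
            }
            return false;
        }

        void record(const Span & dst, const std::vector<Span> & srcs) {
            written.push_back(dst);
            read.insert(read.end(), srcs.begin(), srcs.end());
        }

        void barrier() { written.clear(); read.clear(); }
    };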
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 233 ++++++++++++++++++++++-----
 1 file changed, 193 insertions(+), 40 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index 007556cf4f403..c7cfb6473e37d 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -1231,6 +1231,14 @@ struct ggml_backend_vk_context {
     vk_pipeline_struct * prealloc_y_last_pipeline_used {};
     const ggml_tensor * prealloc_y_last_tensor_used {};
 
+    // Track which nodes have been used since the last sync, and whether they were written to
+    std::vector<const ggml_tensor *> unsynced_nodes_written;
+    std::vector<const ggml_tensor *> unsynced_nodes_read;
+    // Track which prealloc buffers have pending reads that need to be synchronized.
+    // These are checked before writing to the buffer (and call ggml_vk_sync_buffers if set),
+    // and set to true after the buffer contents are consumed.
+    bool prealloc_x_need_sync, prealloc_y_need_sync, prealloc_split_k_need_sync;
+
     vk_buffer buffer_pool[MAX_VK_BUFFERS];
 
     vk_context_ref compute_ctx;
@@ -1906,14 +1914,18 @@ static vk_subbuffer ggml_vk_subbuffer(vk_buffer& buf) {
     return { buf, 0, VK_WHOLE_SIZE };
 }
 
-static void ggml_vk_sync_buffers(vk_context& ctx) {
+static void ggml_vk_sync_buffers(ggml_backend_vk_context* ctx, vk_context& subctx) {
     VK_LOG_DEBUG("ggml_vk_sync_buffers()");
 
-    const bool transfer_queue = ctx->p->q->transfer_only;
+    const bool transfer_queue = subctx->p->q->transfer_only;
 
-    ctx->s->buffer.pipelineBarrier(
-        ctx->p->q->stage_flags,
-        ctx->p->q->stage_flags,
+    if (ctx) {
+        ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false;
+    }
+
+    subctx->s->buffer.pipelineBarrier(
+        subctx->p->q->stage_flags,
+        subctx->p->q->stage_flags,
         {},
         { {
           { !transfer_queue ?
(vk::AccessFlagBits::eShaderRead | vk::AccessFlagBits::eShaderWrite | vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) : (vk::AccessFlagBits::eTransferRead | vk::AccessFlagBits::eTransferWrite) }, @@ -4898,7 +4910,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont } } - ggml_vk_sync_buffers(subctx); + ggml_vk_sync_buffers(ctx, subctx); subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices); return; } @@ -4913,7 +4925,7 @@ static void ggml_vk_buffer_write_nc_async(ggml_backend_vk_context * ctx, vk_cont ggml_vk_ensure_sync_staging_buffer(ctx->device, copy_size); VkBufferCopy buf_copy{ 0, offset, copy_size }; - ggml_vk_sync_buffers(subctx); + ggml_vk_sync_buffers(ctx, subctx); vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging->buffer, (VkBuffer)dst->buffer, 1, &buf_copy); for (uint64_t i3 = 0; i3 < ne3; i3++) { @@ -4967,7 +4979,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz } } - ggml_vk_sync_buffers(subctx); + ggml_vk_sync_buffers(nullptr, subctx); subctx->s->buffer.copyBuffer(buf->buffer, dst->buffer, slices); return; } @@ -4988,7 +5000,7 @@ static void ggml_vk_buffer_write_2d_async(vk_context subctx, vk_buffer& dst, siz offset, copy_size}; - ggml_vk_sync_buffers(subctx); + ggml_vk_sync_buffers(nullptr, subctx); vkCmdCopyBuffer(subctx->s->buffer, (VkBuffer)staging_buffer->buffer, (VkBuffer)dst->buffer, 1, &buf_copy); if (width == spitch) { @@ -5068,7 +5080,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size if (buf != nullptr) { // Memory is pinned, use as staging buffer - ggml_vk_sync_buffers(subctx); + ggml_vk_sync_buffers(nullptr, subctx); subctx->s->buffer.copyBuffer(src->buffer, buf->buffer, slices); return; @@ -5085,7 +5097,7 @@ static void ggml_vk_buffer_read_2d_async(vk_context subctx, vk_buffer& src, size vk_buffer& staging_buffer = src->device->sync_staging; - ggml_vk_sync_buffers(subctx); + ggml_vk_sync_buffers(nullptr, subctx); subctx->s->buffer.copyBuffer(src->buffer, staging_buffer->buffer, slices); deferred_memcpy(dst, staging_buffer->ptr, copy_size, &subctx->out_memcpys); @@ -5275,13 +5287,16 @@ static void ggml_vk_matmul( uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t padded_n) { VK_LOG_DEBUG("ggml_vk_matmul(a: (" << a.buffer->buffer << ", " << a.offset << ", " << a.size << "), b: (" << b.buffer->buffer << ", " << b.offset << ", " << b.size << "), d: (" << d.buffer->buffer << ", " << d.offset << ", " << d.size << "), split_k: (" << (split_k_buffer.buffer != nullptr ? 
split_k_buffer.buffer->buffer : VK_NULL_HANDLE) << ", " << split_k_buffer.offset << ", " << split_k_buffer.size << "), m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", split_k: " << split_k << ", batch: " << batch << ", ne02: " << ne02 << ", ne12: " << ne12 << ", broadcast2: " << broadcast2 << ", broadcast3: " << broadcast3 << ", padded_n: " << padded_n << ")"); - ggml_vk_sync_buffers(subctx); if (split_k == 1) { const vk_mat_mat_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k, ne02, ne12, broadcast2, broadcast3, padded_n }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d }, pc, { m, n, batch }); return; } + if (ctx->prealloc_split_k_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + GGML_ASSERT(batch_stride_d == m * n); // Round the split size up to a multiple of 256 (k-quant alignment) @@ -5291,9 +5306,10 @@ static void ggml_vk_matmul( const vk_mat_mat_push_constants pc1 = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, k_split, ne02, ne12, broadcast2, broadcast3, padded_n }; // Make sure enough workgroups get assigned for split k to work ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1, { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch }); - ggml_vk_sync_buffers(subctx); + ggml_vk_sync_buffers(ctx, subctx); const std::array pc2 = { (uint32_t)(m * n * batch), split_k }; ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2, { m * n * batch, 1, 1 }); + ctx->prealloc_split_k_need_sync = true; } static vk_pipeline ggml_vk_guess_matmul_id_pipeline(ggml_backend_vk_context * ctx, vk_matmul_pipeline& mmp, uint32_t m, uint32_t n, bool aligned, ggml_type src0_type) { @@ -5338,7 +5354,6 @@ static void ggml_vk_matmul_id( "m: " << m << ", n: " << n << ", k: " << k << ", stride_a: " << stride_a << ", stride_b: " << stride_b << ", stride_d: " << stride_d << ", " << "batch_stride_a: " << batch_stride_a << ", batch_stride_b: " << batch_stride_b << ", batch_stride_d: " << batch_stride_d << ", " << "n_as: " << n_as << ", nei0: " << nei0 << ", nei1: " << nei1 << ", nbi1: " << nbi1 << ", ne11: " << ne11 << ")"); - ggml_vk_sync_buffers(subctx); const vk_mat_mat_id_push_constants pc = { m, n, k, stride_a, stride_b, stride_d, batch_stride_a, batch_stride_b, batch_stride_d, nei0, nei1, nbi1, ne11, padded_n }; ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, d, ids }, pc, { m, nei1, n_as }); @@ -5469,8 +5484,8 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; init_pushconst_fastdiv(pc); - ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, pc, elements); + ggml_vk_sync_buffers(ctx, subctx); } static vk_pipeline ggml_vk_get_quantize_pipeline(ggml_backend_vk_context * ctx, ggml_type type) { @@ -5488,8 +5503,8 @@ static void ggml_vk_quantize_q8_1(ggml_backend_vk_context * ctx, vk_context& sub vk_pipeline pipeline = ggml_vk_get_quantize_pipeline(ctx, GGML_TYPE_Q8_1); - ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, std::array{ne}, { ne, 1, 1 }); + ggml_vk_sync_buffers(ctx, subctx); } static void 
ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -5684,12 +5699,23 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub GGML_ASSERT(qy_sz == y_sz); } + if (x_non_contig || qx_needs_dequant) { + if (ctx->prealloc_x_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + } + if (y_non_contig || quantize_y) { + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + } + if (x_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); } else if (qx_needs_dequant) { const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; - ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_sync_buffers(ctx, subctx); } if (y_non_contig) { if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || @@ -5728,6 +5754,13 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub ne10, ne10, ne01, stride_batch_x, stride_batch_y, ne20*ne21, split_k, ne12*ne13, ne02, ne12, r2, r3, padded_n ); // NOLINT + + if (x_non_contig || qx_needs_dequant) { + ctx->prealloc_x_need_sync = true; + } + if (y_non_contig || quantize_y) { + ctx->prealloc_y_need_sync = true; + } } static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -5874,6 +5907,17 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& GGML_ASSERT(qy_sz == y_sz); } + if (x_non_contig) { + if (ctx->prealloc_x_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + } + if (y_non_contig) { + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + } + if (x_non_contig) { GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); @@ -5917,10 +5961,16 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& stride_batch_x, stride_batch_y, stride_batch_d, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)r2, (uint32_t)r3, }; - ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23} }, pc, { groups_x, (uint32_t)(ne12 * ne13), groups_z }); + + if (x_non_contig) { + ctx->prealloc_x_need_sync = true; + } + if (y_non_contig) { + ctx->prealloc_y_need_sync = true; + } } static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -6007,7 +6057,6 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c workgroups_z /= gqa_ratio; } - ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32[gqa_ratio - 1], { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, 
vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { 1, (uint32_t)ne01, workgroups_z }); } @@ -6094,7 +6143,6 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, channel_stride_y, (uint32_t)(ne12 / ne02), (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)), nb03, nb13, nb23 }; - ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz }, vk_subbuffer{ d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, vk_subbuffer{ d_D, d_buffer_offset, d_sz + d_shader_offset } }, pc, { (uint32_t)ne03, (uint32_t)ne01, (uint32_t)ne12 }); } @@ -6306,13 +6354,24 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& GGML_ASSERT(qy_sz == y_sz); } + if (x_non_contig || qx_needs_dequant) { + if (ctx->prealloc_x_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + } + if (y_non_contig) { + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + } + if (x_non_contig) { ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); } else if (qx_needs_dequant) { const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; - ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { vk_subbuffer{ d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, vk_subbuffer{ d_X, 0, x_sz * ne02 * ne03 } }, pc, { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_sync_buffers(ctx, subctx); } if (y_non_contig) { if (ctx->prealloc_y_last_pipeline_used != to_fp16_vk_1.get() || @@ -6343,6 +6402,13 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& stride_batch_x, stride_batch_y, ne20*ne21, n_as, nei0, nei1, nbi1 / ggml_type_size(ids->type), ne11, padded_n ); // NOLINT + + if (x_non_contig || qx_needs_dequant) { + ctx->prealloc_x_need_sync = true; + } + if (y_non_contig) { + ctx->prealloc_y_need_sync = true; + } } static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst, bool dryrun = false) { @@ -6502,6 +6568,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte GGML_ASSERT(qy_sz == y_sz); } + if (x_non_contig) { + if (ctx->prealloc_x_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + } + if (y_non_contig) { + if (ctx->prealloc_y_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + } + if (x_non_contig) { GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }); @@ -6538,11 +6615,17 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte (uint32_t)x_ne, stride_batch_y, (uint32_t)(ne20*ne21), (uint32_t)nei0, (uint32_t)ne11, }; - ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { vk_subbuffer{ d_X, x_buf_offset, x_sz * ne02 * ne03 }, vk_subbuffer{ d_Y, y_buf_offset, y_sz * ne12 * ne13 }, 
vk_subbuffer{ d_D, d_buf_offset, d_sz * ne22 * ne23}, vk_subbuffer{ d_ids, ids_buf_offset, ids_sz } }, pc, { groups_x, (uint32_t)nei0, groups_z }); + + if (x_non_contig) { + ctx->prealloc_x_need_sync = true; + } + if (y_non_contig) { + ctx->prealloc_y_need_sync = true; + } } static void ggml_vk_mul_mat_id(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, bool dryrun = false) { @@ -6925,9 +7008,11 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx mask_n_head_log2, m0, m1, gqa_ratio, split_kv, split_k }; - ggml_vk_sync_buffers(subctx); - if (split_k > 1) { + if (ctx->prealloc_split_k_need_sync) { + ggml_vk_sync_buffers(ctx, subctx); + } + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{d_Q, q_buf_offset, VK_WHOLE_SIZE}, @@ -6943,7 +7028,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx // cancel out the divide by wg_denoms[0]. pc, { workgroups_x * pipeline->wg_denoms[0], workgroups_y, workgroups_z }); - ggml_vk_sync_buffers(subctx); + ggml_vk_sync_buffers(ctx, subctx); const std::array pc2 = { HSV, (uint32_t)ne1, (uint32_t)ne3, split_k, (sinks != nullptr) }; ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_flash_attn_split_k_reduce, { @@ -6952,6 +7037,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx vk_subbuffer{d_D, d_buf_offset, VK_WHOLE_SIZE}, }, pc2, { (uint32_t)ne1, HSV, (uint32_t)ne3 }); + ctx->prealloc_split_k_need_sync = true; } else { ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { @@ -7820,7 +7906,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co subbuf_y = { d_X, 0, x_sz }; } - ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_SOFT_MAX) { // Empty src1 and src2 is possible in soft_max, but the shader needs a buffer @@ -7838,7 +7923,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co subbuf_z = { d_X, 0, x_sz }; } - ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, subbuf_y, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_ROPE || op == GGML_OP_ROPE_BACK) { // Empty src2 is possible in rope, but the shader needs a buffer @@ -7849,30 +7933,23 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co subbuf_z = { d_X, 0, x_sz }; } - ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, subbuf_z, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_IM2COL) { // im2col uses only src1 and dst buffers - ggml_vk_sync_buffers(subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements); } else if (op == GGML_OP_COUNT_EQUAL) { - ggml_vk_sync_buffers(subctx); // count_equal assumes that destination buffer is initialized with zeroes ggml_vk_buffer_memset_async(subctx, d_D, d_buf_offset, 0, d_sz); - ggml_vk_sync_buffers(subctx); + ggml_vk_sync_buffers(ctx, subctx); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, 
vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (op == GGML_OP_OPT_STEP_SGD) {
         // OPT_STEP_SGD works on src0, it does not need dst
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz } }, pc, elements);
     } else if (use_src2) {
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_Z, z_buf_offset, z_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else if (use_src1) {
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_Y, y_buf_offset, y_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     } else {
-        ggml_vk_sync_buffers(subctx);
         ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ d_X, x_buf_offset, x_sz }, vk_subbuffer{ d_D, d_buf_offset, d_sz } }, pc, elements);
     }
 }
@@ -7999,7 +8076,6 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx,
         elements = { ne, 1, 1 };
     }
 
-    ggml_vk_sync_buffers(subctx);
     ggml_vk_dispatch_pipeline(ctx, subctx, pipeline,
         {
             vk_subbuffer{ buf[0], offset[0], VK_WHOLE_SIZE },
@@ -8112,8 +8188,6 @@ static void ggml_vk_op_f32_wkv(ggml_backend_vk_context * ctx, vk_context& subctx
         src_buf_ctxs[i] = (ggml_backend_vk_buffer_context *)dst->src[i]->buffer->context;
     }
 
-    ggml_vk_sync_buffers(subctx);
-
     vk_buffer d_D = nullptr, d_srcs[7] = { nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr };
     size_t dst_offset = 0, src_offsets[7] = { 0, 0, 0, 0, 0, 0, 0 };
     bool dst_uma = false, srcs_uma[7] = { false, false, false, false, false, false, false };
@@ -8251,8 +8325,6 @@ static void ggml_vk_op_f32_opt_step_adamw(ggml_backend_vk_context * ctx, vk_cont
     ggml_backend_vk_buffer_context * gv_buf_ctx = (ggml_backend_vk_buffer_context *)gv->buffer->context;
     ggml_backend_vk_buffer_context * p_buf_ctx = (ggml_backend_vk_buffer_context *)p->buffer->context;
 
-    ggml_vk_sync_buffers(subctx);
-
     vk_buffer d_X = nullptr, d_G = nullptr, d_GM = nullptr, d_GV = nullptr, d_P = nullptr;
     size_t x_offset = 0, g_offset = 0, gm_offset = 0, gv_offset = 0, p_offset = 0;
     bool X_uma = false, G_uma = false, GM_uma = false, GV_uma = false, P_uma = false;
@@ -9964,6 +10036,83 @@
         }
     }
 
+    if (!dryrun) {
+        // This logic detects dependencies between nodes in the graph and calls ggml_vk_sync_buffers
+        // to synchronize them. This handles most "normal" synchronization when computing the graph, and when
+        // there is no auxiliary memory use, it shouldn't be necessary to call ggml_vk_sync_buffers
+        // outside of this logic. When a node uses one of the prealloc buffers for something like
+        // dequantization or split_k, additional synchronization is needed between those passes.
+        bool need_sync = false;
+
+        // Check whether "node" requires synchronization. The node requires synchronization if it
+        // overlaps in memory with another unsynchronized node and at least one of them is a write.
+        // Destination nodes are checked against both the written/read lists. Source nodes are only
+        // checked against the written list. Two nodes overlap in memory if they come from the same
+        // buffer and the tensor or view ranges overlap.
+        auto const &overlaps_unsynced = [&](const ggml_tensor *node, const std::vector<const ggml_tensor *> &unsynced_nodes) -> bool {
+            if (unsynced_nodes.size() == 0) {
+                return false;
+            }
+            auto n_base = vk_tensor_offset(node) + node->view_offs;
+            auto n_size = ggml_nbytes(node);
+            ggml_backend_vk_buffer_context * a_buf_ctx = (ggml_backend_vk_buffer_context *)node->buffer->context;
+            vk_buffer a_buf = a_buf_ctx->dev_buffer;
+            for (auto &other : unsynced_nodes) {
+                ggml_backend_vk_buffer_context * o_buf_ctx = (ggml_backend_vk_buffer_context *)other->buffer->context;
+                vk_buffer o_buf = o_buf_ctx->dev_buffer;
+                if (a_buf == o_buf) {
+                    auto o_base = vk_tensor_offset(other) + other->view_offs;
+                    auto o_size = ggml_nbytes(other);
+
+                    if ((o_base <= n_base && n_base < o_base + o_size) ||
+                        (n_base <= o_base && o_base < n_base + n_size)) {
+                        return true;
+                    }
+                }
+            }
+            return false;
+        };
+
+        // For all fused ops, check if the destination node or any of the source
+        // nodes require synchronization.
+        for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1 && !need_sync; ++i) {
+            const ggml_tensor *cur_node = cgraph->nodes[node_idx + i];
+            if (overlaps_unsynced(cur_node, ctx->unsynced_nodes_read) || overlaps_unsynced(cur_node, ctx->unsynced_nodes_written)) {
+                need_sync = true;
+                break;
+            }
+            for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) {
+                if (!cur_node->src[j]) {
+                    continue;
+                }
+                if (overlaps_unsynced(cur_node->src[j], ctx->unsynced_nodes_written)) {
+                    need_sync = true;
+                    break;
+                }
+            }
+        }
+        if (need_sync) {
+            VK_LOG_DEBUG("node_idx=" << node_idx << " sync");
+            ctx->unsynced_nodes_written.clear();
+            ctx->unsynced_nodes_read.clear();
+            ggml_vk_sync_buffers(ctx, compute_ctx);
+        } else {
+            VK_LOG_DEBUG("node_idx=" << node_idx << " unsynced");
+        }
+        // Add the last fused node and all fused source nodes to the unsynchronized list.
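+        // Only the final fused node is recorded as written: the fused pipeline stores
+        // just the final result to memory, so intermediate fused outputs cannot create
+        // new hazards.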
+        const ggml_tensor * last_node = cgraph->nodes[node_idx + ctx->num_additional_fused_ops];
+        ctx->unsynced_nodes_written.push_back(last_node);
+        for (int32_t i = 0; i < ctx->num_additional_fused_ops + 1; ++i) {
+            const ggml_tensor *cur_node = cgraph->nodes[node_idx + i];
+            for (uint32_t j = 0; j < GGML_MAX_SRC; ++j) {
+                if (!cur_node->src[j]) {
+                    continue;
+                }
+                ctx->unsynced_nodes_read.push_back(cur_node->src[j]);
+            }
+        }
+    }
+
     switch (node->op) {
     case GGML_OP_REPEAT:
         ggml_vk_repeat(ctx, compute_ctx, src0, node, dryrun);
@@ -10427,6 +10576,10 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
     ctx->gc.temp_buffers.clear();
     ctx->prealloc_y_last_pipeline_used = {};
 
+    ctx->unsynced_nodes_written.clear();
+    ctx->unsynced_nodes_read.clear();
+    ctx->prealloc_x_need_sync = ctx->prealloc_y_need_sync = ctx->prealloc_split_k_need_sync = false;
+
     ggml_vk_command_pool_cleanup(ctx->device, ctx->compute_cmd_pool);
     ggml_vk_command_pool_cleanup(ctx->device, ctx->transfer_cmd_pool);
 

From 21dc4ddaf21b8ed551d717e7606abd2cffbacdbf Mon Sep 17 00:00:00 2001
From: LaffeyNyaa <112215776+LaffeyNyaa@users.noreply.github.com>
Date: Sat, 23 Aug 2025 16:38:30 +0800
Subject: [PATCH 132/140] chat : fix debug build assertion in trim function
 (#15520)

---
 src/llama-chat.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
index 0a96a9a579e26..4d6fdf822619b 100644
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -16,10 +16,10 @@ static std::string trim(const std::string & str) {
     size_t start = 0;
     size_t end = str.size();
-    while (start < end && isspace(str[start])) {
+    while (start < end && isspace(static_cast<unsigned char>(str[start]))) {
         start += 1;
     }
-    while (end > start && isspace(str[end - 1])) {
+    while (end > start && isspace(static_cast<unsigned char>(str[end - 1]))) {
         end -= 1;
     }
     return str.substr(start, end - start);

From 9ef536907de1b50c30e0369284898d30472a755a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?=
Date: Sat, 23 Aug 2025 12:58:58 +0200
Subject: [PATCH 133/140] scripts: fix compare-llama-bench.py (#15521)

---
 scripts/compare-llama-bench.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py
index 0141e0a350dc9..2cfca66e0938c 100755
--- a/scripts/compare-llama-bench.py
+++ b/scripts/compare-llama-bench.py
@@ -37,7 +37,6 @@
     "TEXT", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
     "TEXT", "INTEGER", "INTEGER", "TEXT", "TEXT", "INTEGER", "TEXT",
     "INTEGER", "INTEGER", "INTEGER", "TEXT", "TEXT",
-    "REAL",
     "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER", "INTEGER",
     "TEXT", "INTEGER", "INTEGER", "REAL", "REAL",
 ]

From b1afcab804e3281867a5471fbd701e32eb32e512 Mon Sep 17 00:00:00 2001
From: "Piotr Wilkin (ilintar)"
Date: Sat, 23 Aug 2025 15:21:52 +0200
Subject: [PATCH 134/140] model : add support for Seed-OSS (#15490)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* First draft

* Fix linter errors

* Added missing sinks nullptr

* Don't forget the llama-arch!

* We're through to the generation stage.
* Fix post-attention norm * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret * Fix RoPE type * Fix tensor name and reorder llm_types * Update gguf-py/gguf/constants.py Remove nonexistent FFN_POST_NORM tensor Co-authored-by: Sigbjørn Skjæret * Update src/llama-model.h Co-authored-by: Sigbjørn Skjæret * Add basic chat template * Add chat template tests * Remake chat template test * Apply suggestions from code review Co-authored-by: Sigbjørn Skjæret * Update src/llama-chat.cpp Co-authored-by: Sigbjørn Skjæret * Reorder llm type descriptions * Update src/llama-model.cpp Co-authored-by: Sigbjørn Skjæret --------- Co-authored-by: Sigbjørn Skjæret --- convert_hf_to_gguf.py | 5 + gguf-py/gguf/constants.py | 16 +++ src/llama-arch.cpp | 18 ++++ src/llama-arch.h | 1 + src/llama-chat.cpp | 11 +++ src/llama-chat.h | 1 + src/llama-model.cpp | 183 +++++++++++++++++++++++++++++++++++ src/llama-model.h | 1 + tests/test-chat-template.cpp | 8 ++ 9 files changed, 244 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 42bf10d2169e2..35fadbc83ea1b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5854,6 +5854,11 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] +@ModelBase.register("SeedOssForCausalLM") +class SeedOssModel(TextModel): + model_arch = gguf.MODEL_ARCH.SEED_OSS + + @ModelBase.register("Olmo2ForCausalLM") class Olmo2Model(TextModel): model_arch = gguf.MODEL_ARCH.OLMO2 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 61ebe6e5e7750..d03a02c7bf921 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -385,6 +385,7 @@ class MODEL_ARCH(IntEnum): DREAM = auto() SMALLTHINKER = auto() LLADA = auto() + SEED_OSS = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -717,6 +718,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.DREAM: "dream", MODEL_ARCH.SMALLTHINKER: "smallthinker", MODEL_ARCH.LLADA: "llada", + MODEL_ARCH.SEED_OSS: "seed_oss", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -1973,6 +1975,20 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.SEED_OSS: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + ], MODEL_ARCH.OLMOE: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index c759a9c6d9e05..0ca0a4c22f814 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -93,6 +93,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DREAM, "dream" }, { LLM_ARCH_SMALLTHINKER, "smallthinker" }, { LLM_ARCH_LLADA, "llada" }, + { LLM_ARCH_SEED_OSS, "seed_oss" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -2068,6 +2069,23 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_SEED_OSS, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { 
LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
     {
         LLM_ARCH_UNKNOWN,
         {
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 7af587e7951bc..7008c2514c5d4 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -97,6 +97,7 @@ enum llm_arch {
     LLM_ARCH_DREAM,
     LLM_ARCH_SMALLTHINKER,
     LLM_ARCH_LLADA,
+    LLM_ARCH_SEED_OSS,
     LLM_ARCH_UNKNOWN,
 };
 
diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp
index 4d6fdf822619b..9d8e57eac1f69 100644
--- a/src/llama-chat.cpp
+++ b/src/llama-chat.cpp
@@ -69,6 +69,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "gpt-oss",            LLM_CHAT_TEMPLATE_OPENAI_MOE },
     { "hunyuan-dense",      LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
     { "kimi-k2",            LLM_CHAT_TEMPLATE_KIMI_K2 },
+    { "seed_oss",           LLM_CHAT_TEMPLATE_SEED_OSS },
 };
 
 llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -201,6 +202,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
         return LLM_CHAT_TEMPLATE_KIMI_K2;
+    } else if (tmpl_contains("<seed:bos>")) {
+        return LLM_CHAT_TEMPLATE_SEED_OSS;
     }
     return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@@ -752,6 +755,14 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|im_assistant|>assistant<|im_middle|>";
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_SEED_OSS) {
+        for (auto message: chat) {
+            std::string role(message->role);
+            ss << "<seed:bos>" << role << "\n" << (role == "assistant" ? trim(message->content) : message->content) << "<seed:eos>";
+        }
+        if (add_ass) {
+            ss << "<seed:bos>assistant\n";
+        }
     } else {
         // template not supported
         return -1;
diff --git a/src/llama-chat.h b/src/llama-chat.h
index 35a943856fa52..21d53ed08b4c3 100644
--- a/src/llama-chat.h
+++ b/src/llama-chat.h
@@ -49,6 +49,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_OPENAI_MOE,
     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
     LLM_CHAT_TEMPLATE_KIMI_K2,
+    LLM_CHAT_TEMPLATE_SEED_OSS,
     LLM_CHAT_TEMPLATE_UNKNOWN,
 };
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 3c8440a8f653c..d5148f7df36ed 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -83,6 +83,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_32B:           return "32B";
         case LLM_TYPE_34B:           return "34B";
         case LLM_TYPE_35B:           return "35B";
+        case LLM_TYPE_36B:           return "36B";
         case LLM_TYPE_40B:           return "40B";
         case LLM_TYPE_65B:           return "65B";
         case LLM_TYPE_70B:           return "70B";
@@ -1288,6 +1289,14 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_SEED_OSS:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+                switch (hparams.n_layer) {
+                    case 64: type = LLM_TYPE_36B; break;
+                    default: type = LLM_TYPE_UNKNOWN;
+                }
+            } break;
         case LLM_ARCH_OLMOE:
             {
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
@@ -3967,6 +3976,43 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
                     }
                 } break;
+            case LLM_ARCH_SEED_OSS:
+                {
+                    const uint32_t head_dim = hparams.n_embd_head_k;
+                    const int64_t n_qo_dim = n_head * head_dim;
+                    const int64_t n_kv_dim = n_head_kv * head_dim;
+
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+                    // output
+                    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+                    output      = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+                    // if output
is NULL, init from the input tok embed + if (output == NULL) { + output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED); + } + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_qo_dim}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_kv_dim}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_kv_dim}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_qo_dim, n_embd}, 0); + + layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_qo_dim}, TENSOR_NOT_REQUIRED); + layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED); + layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_kv_dim}, TENSOR_NOT_REQUIRED); + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0); + + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + } + } break; + case LLM_ARCH_OLMOE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -17934,6 +17980,137 @@ struct llm_build_lfm2 : public llm_graph_context { } }; +struct llm_build_seed_oss : public llm_graph_context { + llm_build_seed_oss(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv(); + + const float kq_scale = hparams.f_attention_scale == 0.0f ? 
1.0f/sqrtf(float(n_embd_head)) : hparams.f_attention_scale;
+
+        ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * inpSA = inpL;
+
+            // norm
+            cur = build_norm(inpL,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                // compute Q and K and RoPE them
+                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
+                cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
+
+                ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
+                cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
+
+                ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
+                cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+                Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+                Qcur = ggml_rope_ext(
+                        ctx0, Qcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                Kcur = ggml_rope_ext(
+                        ctx0, Kcur, inp_pos, nullptr,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow
+                        );
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                cur = build_attn(inp_attn,
+                        model.layers[il].wo, model.layers[il].bo,
+                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, kq_scale, il);
+                cb(cur, "attn_out", il);
+            }
+
+            if (il == n_layer - 1 && inp_out_ids) {
+                cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+                inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+            }
+
+            ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward network
+            cur = build_norm(ffn_inp,
+                    model.layers[il].attn_post_norm, NULL,
+                    LLM_NORM_RMS, il);
+            cb(cur, "attn_post_norm", il);
+
+            cur = build_ffn(cur,
+                    model.layers[il].ffn_up,   NULL, NULL,
+                    model.layers[il].ffn_gate, NULL, NULL,
+                    model.layers[il].ffn_down, NULL, NULL,
+                    NULL,
+                    LLM_FFN_SILU, LLM_FFN_PAR, il);
+            cb(cur, "ffn_out", il);
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "ffn_out", il);
+
+            cur = build_cvec(cur, il);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = build_norm(cur,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, -1);
+
+        cb(cur, "result_norm", -1);
+        res->t_embd = cur;
+
+        // lm_head
+        cur = build_lora_mm(model.output, cur);
+
+        cb(cur, "result_output", -1);
+        res->t_logits = cur;
+
+        ggml_build_forward_expand(gf, cur);
+    }
+};
+
 template struct llm_build_smallthinker : public llm_graph_context{
     llm_build_smallthinker(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params){
@@ -18472,6 +18649,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
             {
                 llm = std::make_unique(*this, params);
             } break;
+        case LLM_ARCH_SEED_OSS:
+            {
+                llm = std::make_unique<llm_build_seed_oss>(*this, params);
+            } break;
         case LLM_ARCH_DOTS1:
             {
                 llm = std::make_unique<llm_build_dots1>(*this, params);
             } break;
@@ -18530,6 +18711,7 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
 
     return llm->res->get_gf();
 }
+
 //
 // interface implementation
 //
@@ -18724,6 +18906,7 @@ llama_rope_type llama_model_rope_type(const llama_model *
model) {
         case LLM_ARCH_LFM2:
         case LLM_ARCH_SMALLTHINKER:
         case LLM_ARCH_GLM4_MOE:
+        case LLM_ARCH_SEED_OSS:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
diff --git a/src/llama-model.h b/src/llama-model.h
index f639fa139811a..af4460cc01eb0 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -76,6 +76,7 @@ enum llm_type {
     LLM_TYPE_32B,
     LLM_TYPE_34B,
     LLM_TYPE_35B,
+    LLM_TYPE_36B,
     LLM_TYPE_40B,
     LLM_TYPE_65B,
     LLM_TYPE_70B,
diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp
index edfac3b08bb3f..b863367db6c99 100644
--- a/tests/test-chat-template.cpp
+++ b/tests/test-chat-template.cpp
@@ -290,6 +290,14 @@ int main(void) {
         /* .bos_token= */ "",
         /* .eos_token= */ "",
     },
+    {
+        /* .name= */ "ByteDance-Seed/Seed-OSS-36B-Instruct",
+        /* .template_str */ "{# #}{%- for message in messages %}{%- if message.role in [\"user\", \"system\"] %}{{ bos_token + message.role + \"\\n\" + message.content + eos_token }}{%- elif message.role == \"assistant\" %}{{ bos_token + message.role }}{%- if message.content is defined and message.content is string and message.content|trim|length > 0 %}{{ \"\\n\" + message.content|trim + eos_token }}{%- endif %}{%- else %}{{ bos_token + message.role + \"\\n\" + message.content + eos_token }}{%- endif %}{%- endfor %}{%- if add_generation_prompt %}{{ bos_token + \"assistant\\n\" }}{%- endif %}",
+        /* .expected_output= */ "<seed:bos>system\nYou are a helpful assistant<seed:eos><seed:bos>user\nHello<seed:eos><seed:bos>assistant\nHi there<seed:eos><seed:bos>user\nWho are you<seed:eos><seed:bos>assistant\nI am an assistant<seed:eos><seed:bos>user\nAnother question<seed:eos><seed:bos>assistant\n",
+        /* .expected_output_jinja= */ "<seed:bos>system\nYou are a helpful assistant<seed:eos><seed:bos>user\nHello<seed:eos><seed:bos>assistant\nHi there<seed:eos><seed:bos>user\nWho are you<seed:eos><seed:bos>assistant\nI am an assistant<seed:eos><seed:bos>user\nAnother question<seed:eos><seed:bos>assistant\n",
+        /* .bos_token= */ "<seed:bos>",
+        /* .eos_token= */ "<seed:eos>",
+    }
 };
 
 std::vector<char> formatted_chat(1024);
 int32_t res;

From 611f419cff11e4952228162a1c44cb35dff2274a Mon Sep 17 00:00:00 2001
From: Jeff Bolz
Date: Sat, 23 Aug 2025 13:16:17 -0500
Subject: [PATCH 135/140] vulkan: optimize rms_norm, and allow the work to
 spread across multiple SMs (#15281)

* vulkan: optimize rms_norm, and allow the work to spread across multiple SMs

There are really two parts to this change:

(1) Some optimizations similar to what we have in soft_max, to unroll with
different numbers of iterations.

(2) A fusion optimization where we detect add followed by rms_norm, and
make the add shader atomically accumulate the values^2 into memory. Then the
rms_norm shader can just load that sum. This allows the rms_norm to be
parallelized across multiple workgroups, it just becomes a simple per-element
multiply.

The fusion optimization is currently only applied when the rms_norm is on a
single vector. This previously always ran on a single SM. It could apply more
broadly, but when there are other dimensions the work can already spread across
SMs, and there would be some complexity to tracking multiple atomic sums.

* Change add+rms_norm optimization to write out an array of partial sums
rather than using atomic add, to make it deterministic. The rms_norm
shader fetches a subgroup's worth in parallel and uses subgroupAdd to
add them up.
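In sketch form (illustrative GLSL only; identifiers such as partial_sums and
wg_index are placeholders, not the exact shader code):

    // add shader, per workgroup: do the add, accumulate the sum of squares,
    // and publish one deterministic partial sum per workgroup
    FLOAT_TYPE v = a + b;
    sum_sq += v * v;                       // per-invocation accumulator
    sum_sq = subgroupAdd(sum_sq);          // reduce within the subgroup
    // ...cross-subgroup reduction through shared memory...
    if (gl_SubgroupID == 0 && gl_SubgroupInvocationID == 0) {
        partial_sums[wg_index] = sum_sq;   // one partial per workgroup
    }

    // rms_norm shader: gather the partials with a strided loop, then a
    // single subgroupAdd yields the total sum of squares for the row
    for (uint i = gl_SubgroupInvocationID; i < num_partials; i += gl_SubgroupSize) {
        sum += partial_sums[i];
    }
    sum = subgroupAdd(sum);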
* complete rebase against fused adds - multi_add shader can also compute partial sums * fix validation errors * disable add_rms_fusion for Intel due to possible driver bug * resolve against #15489, sync after clearing partial sums --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 195 +++++++++++++++--- ggml/src/ggml-vulkan/vulkan-shaders/add.comp | 42 +++- .../ggml-vulkan/vulkan-shaders/multi_add.comp | 42 +++- .../ggml-vulkan/vulkan-shaders/rms_norm.comp | 60 +++++- .../vulkan-shaders/rms_norm_partials.comp | 65 ++++++ .../vulkan-shaders/vulkan-shaders-gen.cpp | 12 +- tests/test-backend-ops.cpp | 15 +- 7 files changed, 380 insertions(+), 51 deletions(-) create mode 100644 ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index c7cfb6473e37d..2c8d9ecaa0a03 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -102,9 +102,9 @@ static bool is_pow2(uint32_t x) { return x > 1 && (x & (x-1)) == 0; } struct ggml_backend_vk_context; -#define MAX_PARAMETER_COUNT 8 +#define MAX_PARAMETER_COUNT 12 // Max number of adds that can be fused without exceeding MAX_PARAMETER_COUNT. -#define MAX_FUSED_ADDS (MAX_PARAMETER_COUNT - 2) +#define MAX_FUSED_ADDS (MAX_PARAMETER_COUNT - 3) struct vk_pipeline_struct { std::string name; @@ -381,6 +381,9 @@ struct vk_device_struct { bool subgroup_shuffle; bool multi_add; + bool add_rms_fusion; + uint32_t partials_binding_alignment; + bool integer_dot_product; bool subgroup_size_control; @@ -460,9 +463,12 @@ struct vk_device_struct { vk_pipeline pipeline_mul_norepeat[2][2][2]; vk_pipeline pipeline_div[2][2][2]; vk_pipeline pipeline_div_norepeat[2][2][2]; + vk_pipeline pipeline_add_rms[2][2][2]; + vk_pipeline pipeline_add_rms_norepeat[2][2][2]; // indexed by num_additional_fused_ops == num_adds - 1 vk_pipeline pipeline_multi_add[MAX_FUSED_ADDS]; + vk_pipeline pipeline_multi_add_rms[MAX_FUSED_ADDS]; vk_pipeline pipeline_add_id_f32; @@ -486,6 +492,8 @@ struct vk_device_struct { vk_pipeline pipeline_group_norm_f32; vk_pipeline pipeline_rms_norm_f32; vk_pipeline pipeline_rms_norm_mul_f32; + vk_pipeline pipeline_rms_norm_partials_f32; + vk_pipeline pipeline_rms_norm_mul_partials_f32; vk_pipeline pipeline_rms_norm_back_f32; vk_pipeline pipeline_l2_norm_f32; @@ -823,8 +831,13 @@ struct vk_op_multi_add_push_constants { uint32_t ne20; uint32_t ne21; uint32_t ne22; uint32_t ne23; // strides for srcs+dst - uint32_t nb[8][4]; + uint32_t nb[MAX_PARAMETER_COUNT][4]; + + uint32_t rms_partials; }; +// update multi_add.comp if this changes +static_assert(MAX_PARAMETER_COUNT == 12); +static_assert(sizeof(vk_op_multi_add_push_constants) <= 256); struct vk_op_add_id_push_constants { uint32_t ne0; @@ -1208,6 +1221,12 @@ class vk_perf_logger { timings[name].push_back(time); return; } + if (node->op == GGML_OP_RMS_NORM) { + std::string name = ggml_op_name(node->op); + name += "(" + std::to_string(node->ne[0]) + "," + std::to_string(node->ne[1]) + "," + std::to_string(node->ne[2]) + "," + std::to_string(node->ne[3]) + ")"; + timings[name].push_back(time); + return; + } timings[ggml_op_name(node->op)].push_back(time); } private: @@ -1222,10 +1241,13 @@ struct ggml_backend_vk_context { size_t semaphore_idx, event_idx; ggml_vk_garbage_collector gc; - size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k; - vk_buffer prealloc_x, prealloc_y, prealloc_split_k; + size_t prealloc_size_x, prealloc_size_y, prealloc_size_split_k, prealloc_size_add_rms_partials, 
prealloc_size_add_rms_partials_offset; + vk_buffer prealloc_x, prealloc_y, prealloc_split_k, prealloc_add_rms_partials; vk::Fence fence, almost_ready_fence; bool almost_ready_fence_pending {}; + // Set before op_add and unset after op_rms_norm to indicate that the add should + // write partial sums to accumulate the square of the vector components + bool do_add_rms_partials; // Cache most recent tensor that was converted into prealloc_y, and what pipeline it used to convert. vk_pipeline_struct * prealloc_y_last_pipeline_used {}; @@ -2987,8 +3009,12 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_f32, "rms_norm_mul_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1); + + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_f32, "rms_norm_mul_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_partials_f32, "rms_norm_partials_f32", rms_norm_partials_f32_len, rms_norm_partials_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 0}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_mul_partials_f32, "rms_norm_mul_partials_f32", rms_norm_partials_f32_len, rms_norm_partials_f32_data, "main", 4, sizeof(vk_op_binary_push_constants), {1, 1, 1}, {0, 1}, 1, true); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); @@ -3058,25 +3084,28 @@ static void ggml_vk_load_shaders(vk_device& device) { }; bool rte = device->float_controls_rte_fp16; -#define CREATE_BINARY(name, namemod, spec) \ +#define CREATE_BINARY(name, namemod, spec, bindings) \ for (int s0 : {0,1}) for (int s1 : {0,1}) for (int d : {0,1}) \ ggml_vk_create_pipeline(device, device->pipeline_ ## name ## namemod[s0][s1][d], \ #name + get_suffix(s0, s1, d) + #namemod, name ## _len[s0][s1][d][rte], name ## _data[s0][s1][d][rte], \ - "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, spec, 1); - - CREATE_BINARY(add, , {0}) - CREATE_BINARY(add, _norepeat, {1}) - CREATE_BINARY(sub, , {0}) - CREATE_BINARY(sub, _norepeat, {1}) - CREATE_BINARY(mul, , {0}) - CREATE_BINARY(mul, _norepeat, {1}) - CREATE_BINARY(div, , {0}) - CREATE_BINARY(div, _norepeat, {1}) + "main", (bindings), sizeof(vk_op_binary_push_constants), {512, 1, 1}, spec, 1); + + CREATE_BINARY(add, , {0}, 4) + CREATE_BINARY(add, _norepeat, {1}, 4) + 
CREATE_BINARY(sub, , {0}, 3) + CREATE_BINARY(sub, _norepeat, {1}, 3) + CREATE_BINARY(mul, , {0}, 3) + CREATE_BINARY(mul, _norepeat, {1}, 3) + CREATE_BINARY(div, , {0}, 3) + CREATE_BINARY(div, _norepeat, {1}, 3) + CREATE_BINARY(add_rms, , {0}, 4) + CREATE_BINARY(add_rms, _norepeat, {1}, 4) #undef CREATE_BINARY if (device->multi_add) { for (uint32_t i = 0; i < MAX_FUSED_ADDS; ++i) { - ggml_vk_create_pipeline(device, device->pipeline_multi_add[i], "multi_add_f32_" + std::to_string(i+1), multi_add_f32_len, multi_add_f32_data, "main", MAX_PARAMETER_COUNT, sizeof(vk_op_multi_add_push_constants), {512, 1, 1}, {i+2}, 1); + ggml_vk_create_pipeline(device, device->pipeline_multi_add[i], "multi_add_f32_" + std::to_string(i+1), multi_add_f32_len, multi_add_f32_data, "main", MAX_PARAMETER_COUNT, sizeof(vk_op_multi_add_push_constants), {512, 1, 1}, {i+2}, 1); + ggml_vk_create_pipeline(device, device->pipeline_multi_add_rms[i], "multi_add_rms_f32_" + std::to_string(i+1), multi_add_rms_f32_len, multi_add_rms_f32_data, "main", MAX_PARAMETER_COUNT, sizeof(vk_op_multi_add_push_constants), {512, 1, 1}, {i+2}, 1); } } @@ -3944,6 +3973,12 @@ static vk_device ggml_vk_get_device(size_t idx) { device->disable_fusion = getenv("GGML_VK_DISABLE_FUSION") != nullptr; + device->add_rms_fusion = !device->disable_fusion && + device->subgroup_add && + device->vendor_id != VK_VENDOR_ID_INTEL; + device->partials_binding_alignment = + std::max(4u, (uint32_t)device->properties.limits.minStorageBufferOffsetAlignment); + return device; } @@ -7080,7 +7115,7 @@ static std::array ggml_vk_get_conv_elements(const ggml_tensor *dst) return elements; } -static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, ggml_tensor * dst, ggml_op op) { +static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * src2, const ggml_tensor * dst, ggml_op op) { switch (op) { case GGML_OP_GET_ROWS: GGML_ASSERT(src1->type == GGML_TYPE_I32); @@ -7109,10 +7144,19 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const case GGML_OP_ADD: { if (ctx->num_additional_fused_ops > 0) { - return ctx->device->pipeline_multi_add[ctx->num_additional_fused_ops]; + if (ctx->do_add_rms_partials) { + return ctx->device->pipeline_multi_add_rms[ctx->num_additional_fused_ops]; + } else { + return ctx->device->pipeline_multi_add[ctx->num_additional_fused_ops]; + } + } + if (ctx->do_add_rms_partials) { + auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_rms_norepeat : ctx->device->pipeline_add_rms; + return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; + } else { + auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_norepeat : ctx->device->pipeline_add; + return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; } - auto pipelines = ggml_are_same_shape(src0, src1) ? ctx->device->pipeline_add_norepeat : ctx->device->pipeline_add; - return pipelines[src0->type == GGML_TYPE_F16][src1->type == GGML_TYPE_F16][dst->type == GGML_TYPE_F16]; } case GGML_OP_SUB: { @@ -7235,7 +7279,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return nullptr; case GGML_OP_RMS_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return ctx->num_additional_fused_ops > 0 ? 
ctx->device->pipeline_rms_norm_mul_f32 : ctx->device->pipeline_rms_norm_f32; + if (ctx->do_add_rms_partials) { + return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_partials_f32 : ctx->device->pipeline_rms_norm_partials_f32; + } else { + return ctx->num_additional_fused_ops > 0 ? ctx->device->pipeline_rms_norm_mul_f32 : ctx->device->pipeline_rms_norm_f32; + } } return nullptr; case GGML_OP_RMS_NORM_BACK: @@ -7748,7 +7796,12 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } } break; case GGML_OP_RMS_NORM: - elements = { (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne03 }; + if (ctx->do_add_rms_partials) { + // Run one element per thread, 128 threads per workgroup + elements = { (uint32_t)CEIL_DIV(ne00, 128), 1, 1 }; + } else { + elements = { (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne03 }; + } break; case GGML_OP_SUM: @@ -7897,7 +7950,16 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co } } - if (op == GGML_OP_GLU) { + if (op == GGML_OP_ADD || op == GGML_OP_RMS_NORM) { + vk_buffer d_A = ctx->do_add_rms_partials ? ctx->prealloc_add_rms_partials : d_X; + size_t a_buf_offset = ctx->do_add_rms_partials ? ctx->prealloc_size_add_rms_partials_offset : 0; + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, + { vk_subbuffer{ d_X, x_buf_offset, x_sz }, + vk_subbuffer{ d_Y, y_buf_offset, y_sz }, + vk_subbuffer{ d_D, d_buf_offset, d_sz }, + vk_subbuffer{ d_A, a_buf_offset, VK_WHOLE_SIZE }, + }, pc, elements); + } else if (op == GGML_OP_GLU) { // Empty src1 is possible in glu, but the shader needs a buffer vk_subbuffer subbuf_y; if (use_src1) { @@ -7998,7 +8060,7 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor *tensors[MAX_PARAMETER_COUNT]; uint32_t num_srcs = ctx->num_additional_fused_ops + 2; uint32_t num_tensors = num_srcs + 1; - GGML_ASSERT(num_tensors <= MAX_PARAMETER_COUNT); + GGML_ASSERT(num_tensors + ctx->do_add_rms_partials <= MAX_PARAMETER_COUNT); tensors[0] = first_node->src[0]; tensors[1] = first_node->src[1]; @@ -8025,8 +8087,9 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, pc.nb[i][2] = (uint32_t)t->nb[2] / sizeof(float); pc.nb[i][3] = (uint32_t)t->nb[3] / sizeof(float); } + pc.rms_partials = ctx->do_add_rms_partials; - vk_pipeline pipeline = ctx->device->pipeline_multi_add[ctx->num_additional_fused_ops]; + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, tensors[0], tensors[1], nullptr, dst, dst->op); if (pipeline == nullptr) { std::cerr << "ggml_vulkan: Error: Missing multi_add"; @@ -8064,6 +8127,10 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, buf[i] = buf[0]; offset[i] = 0; } + if (ctx->do_add_rms_partials) { + buf[num_tensors] = ctx->prealloc_add_rms_partials; + offset[num_tensors] = ctx->prealloc_size_add_rms_partials_offset; + } std::array elements; @@ -8076,6 +8143,7 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, elements = { ne, 1, 1 }; } + static_assert(MAX_PARAMETER_COUNT == 12); ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { vk_subbuffer{ buf[0], offset[0], VK_WHOLE_SIZE }, @@ -8086,6 +8154,10 @@ static void ggml_vk_multi_add(ggml_backend_vk_context * ctx, vk_context& subctx, vk_subbuffer{ buf[5], offset[5], VK_WHOLE_SIZE }, vk_subbuffer{ buf[6], offset[6], VK_WHOLE_SIZE }, vk_subbuffer{ buf[7], offset[7], VK_WHOLE_SIZE }, + vk_subbuffer{ buf[8], offset[8], VK_WHOLE_SIZE }, + vk_subbuffer{ buf[9], offset[9], 
VK_WHOLE_SIZE }, + vk_subbuffer{ buf[10], offset[10], VK_WHOLE_SIZE }, + vk_subbuffer{ buf[11], offset[11], VK_WHOLE_SIZE }, }, pc, elements); } @@ -8100,7 +8172,7 @@ static void ggml_vk_add(ggml_backend_vk_context * ctx, vk_context& subctx, const (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size, (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, 0, - 0.0f, 0.0f, 0, + 0.0f, 0.0f, ctx->do_add_rms_partials, }, dryrun); } @@ -8558,19 +8630,39 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_GROUP_NORM, { group_size, 0, eps, 0.0f }, dryrun); } +static uint32_t ggml_vk_rms_num_partials(ggml_backend_vk_context * ctx, const ggml_tensor *node) { + const uint32_t ne = (uint32_t)node->ne[0]; + const uint32_t denom = ctx->device->pipeline_add_rms[0][0][0]->wg_denoms[0]; + const uint32_t num_partials = CEIL_DIV(ne, denom); + return num_partials; +} + +static uint32_t ggml_vk_rms_partials_size(ggml_backend_vk_context * ctx, const ggml_tensor *node) { + const uint32_t num_partials = ggml_vk_rms_num_partials(ctx, node); + const uint32_t num_bytes = ROUNDUP_POW2(num_partials * sizeof(uint32_t), ctx->device->partials_binding_alignment); + return num_bytes; +} + static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, float * op_params, bool dryrun = false) { const uint32_t src0_type_size = ggml_type_size(src0->type); const uint32_t src1_type_size = ggml_type_size(src1->type); const uint32_t dst_type_size = ggml_type_size(dst->type); + uint32_t param3 = ctx->do_add_rms_partials ? 
ggml_vk_rms_num_partials(ctx, dst) : 0;
+
     ggml_vk_op_f32(ctx, subctx, src0, src1, nullptr, dst, GGML_OP_RMS_NORM, {
         (uint32_t)ggml_nelements(src0),
         (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2],(uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
         (uint32_t)src1->ne[0], (uint32_t)src1->ne[1], (uint32_t)src1->ne[2],(uint32_t)src1->ne[3], (uint32_t)src1->nb[0] / src1_type_size, (uint32_t)src1->nb[1] / src1_type_size, (uint32_t)src1->nb[2] / src1_type_size, (uint32_t)src1->nb[3] / src1_type_size,
         (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2],(uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
         0,
-        op_params[0], 0.0f, 0,
+        op_params[0], 0.0f, (int32_t)param3,
     }, dryrun);
+
+    if (ctx->do_add_rms_partials) {
+        ctx->prealloc_size_add_rms_partials_offset += ggml_vk_rms_partials_size(ctx, src0);
+        ctx->do_add_rms_partials = false;
+    }
 }
 
 static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -9848,6 +9940,14 @@ static void ggml_vk_preallocate_buffers(ggml_backend_vk_context * ctx) {
         }
         ctx->prealloc_split_k = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_split_k);
     }
+    if (ctx->prealloc_add_rms_partials == nullptr || (ctx->prealloc_size_add_rms_partials > 0 && ctx->prealloc_add_rms_partials->size < ctx->prealloc_size_add_rms_partials)) {
+        VK_LOG_MEMORY("ggml_vk_preallocate_buffers(add_partials_size: " << ctx->prealloc_size_add_rms_partials << ")");
+        // Resize buffer
+        if (ctx->prealloc_add_rms_partials != nullptr) {
+            ggml_vk_destroy_buffer(ctx->prealloc_add_rms_partials);
+        }
+        ctx->prealloc_add_rms_partials = ggml_vk_create_buffer_device(ctx->device, ctx->prealloc_size_add_rms_partials);
+    }
 }
 
 static bool ggml_vk_compute_forward(ggml_backend_vk_context* ctx, ggml_cgraph * cgraph, ggml_tensor* tensor, int tensor_idx, bool use_fence, bool almost_ready);
@@ -9904,10 +10004,23 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
                 return false;
             }
             break;
+    case GGML_OP_ADD:
+        {
+            int next_node_idx = node_idx + 1 + ctx->num_additional_fused_ops;
+            if (next_node_idx < cgraph->n_nodes &&
+                cgraph->nodes[next_node_idx]->op == GGML_OP_RMS_NORM &&
+                cgraph->nodes[next_node_idx]->src[0] == cgraph->nodes[next_node_idx - 1] &&
+                ggml_nrows(cgraph->nodes[next_node_idx]) == 1 &&
+                ctx->device->add_rms_fusion) {
+                if (dryrun) {
+                    ctx->prealloc_size_add_rms_partials += ggml_vk_rms_partials_size(ctx, cgraph->nodes[node_idx]);
+                }
+                ctx->do_add_rms_partials = true;
+            }
+        } break;
     case GGML_OP_REPEAT:
     case GGML_OP_REPEAT_BACK:
     case GGML_OP_GET_ROWS:
-    case GGML_OP_ADD:
     case GGML_OP_ADD_ID:
     case GGML_OP_ACC:
     case GGML_OP_SUB:
@@ -10029,6 +10142,9 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
             // do the only thing needed for the dryrun.
vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, src2, node, node->op);
             ggml_pipeline_request_descriptor_sets(ctx, pipeline, 1);
+            if (node->op == GGML_OP_RMS_NORM) {
+                ctx->do_add_rms_partials = false;
+            }
             return false;
         }
     default:
@@ -11098,6 +11214,10 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
         vk_instance.pfn_vkQueueBeginDebugUtilsLabelEXT(ctx->device->compute_queue.queue, reinterpret_cast<VkDebugUtilsLabelEXT *>(&dul));
     }
 
+    ctx->prealloc_size_add_rms_partials = 0;
+    ctx->prealloc_size_add_rms_partials_offset = 0;
+    ctx->do_add_rms_partials = false;
+
     uint64_t total_mat_mul_bytes = 0;
     for (int i = 0; i < cgraph->n_nodes; i++) {
         if (!ctx->device->disable_fusion) {
@@ -11166,6 +11286,19 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
     ctx->prealloc_y_last_pipeline_used = nullptr;
     ctx->prealloc_y_last_tensor_used = nullptr;
 
+    if (ctx->prealloc_size_add_rms_partials) {
+        if (ctx->compute_ctx.expired()) {
+            compute_ctx = ggml_vk_create_context(ctx, ctx->compute_cmd_pool);
+            ctx->compute_ctx = compute_ctx;
+            ggml_vk_ctx_begin(ctx->device, compute_ctx);
+        } else {
+            compute_ctx = ctx->compute_ctx.lock();
+        }
+        // initialize partial sums to zero.
+        ggml_vk_buffer_memset_async(compute_ctx, ctx->prealloc_add_rms_partials, 0, 0, ctx->prealloc_size_add_rms_partials);
+        ggml_vk_sync_buffers(ctx, compute_ctx);
+    }
+
     // Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
     // Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
     // (and scaled down based on model size, so smaller models submit earlier).
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/add.comp b/ggml/src/ggml-vulkan/vulkan-shaders/add.comp
index 2b4085c4f82d5..00cf2dd62fddb 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/add.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/add.comp
@@ -1,20 +1,34 @@
 #version 450
 
 #extension GL_EXT_shader_16bit_storage : require
+#if ADD_RMS
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#extension GL_KHR_shader_subgroup_basic : enable
+#endif
 
 #include "types.comp"
 #include "generic_binary_head.comp"
 
 const uint num_threads = 256;
 
+layout (binding = 3, std430) buffer PartialBuf {float partial_sums[];};
+
 layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
 
+#if ADD_RMS
+// XXX TODO this could be sized based on number of subgroups, but that's not considered a constant
+shared FLOAT_TYPE sumsh[num_threads];
+#endif
+
 void main() {
     uint idx = get_idx();
+    uint orig_idx = idx;
 
     // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
     const uint num_iter = 2;
 
+    FLOAT_TYPE sum_sq = 0;
+
     [[unroll]] for (uint i = 0; i < num_iter; ++i) {
         if (idx >= p.ne) {
             continue;
@@ -22,8 +36,34 @@ void main() {
         uint i00, i01, i02, i03;
         get_indices(idx, i00, i01, i02, i03);
 
-        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]));
+        FLOAT_TYPE sum = FLOAT_TYPE(data_a[get_aoffset() + src0_idx(i00, i01, i02, i03)]) + FLOAT_TYPE(data_b[get_boffset() + src1_idx(i00, i01, i02, i03)]);
+        sum_sq += sum*sum;
+
+        data_d[get_doffset() + dst_idx(i00, i01, i02, i03)] = D_TYPE(sum);
 
         idx += num_threads;
     }
+
+#if ADD_RMS
+    if (p.param3 != 0) {
+        // reduce the sum within each subgroup, then across subgroups
+        const uint NumSubgroups = num_threads / gl_SubgroupSize;
+        sum_sq = subgroupAdd(sum_sq);
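+        // each subgroup leaves one partial in shared memory; the tree reduction
+        // below then combines the per-subgroup partials before the single
+        // per-workgroup result is written out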
+        if (gl_SubgroupInvocationID == 0) {
+            sumsh[gl_SubgroupID] = sum_sq;
+        }
+        barrier();
+        [[unroll]] for (uint s = NumSubgroups / 2; s > 0; s >>= 1) {
+            if (gl_SubgroupID < s && gl_SubgroupInvocationID == 0) {
+                sum_sq += sumsh[gl_SubgroupID + s];
+                sumsh[gl_SubgroupID] = sum_sq;
+            }
+            barrier();
+        }
+
+        if (gl_SubgroupID == 0 && gl_SubgroupInvocationID == 0) {
+            partial_sums[orig_idx / (num_iter * num_threads)] = sum_sq;
+        }
+    }
+#endif
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp b/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp
index 0c7acb7060f07..f2f218b04ac34 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp
@@ -3,6 +3,10 @@
 #extension GL_EXT_shader_16bit_storage : require
 #extension GL_EXT_nonuniform_qualifier : enable
 #extension GL_EXT_control_flow_attributes : require
+#if ADD_RMS
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#extension GL_KHR_shader_subgroup_basic : enable
+#endif
 
 #include "rte.comp"
 #include "types.comp"
@@ -14,12 +18,16 @@ layout (push_constant) uniform parameter2
     uint ne20; uint ne21; uint ne22; uint ne23;
 
     // strides for srcs+dst
-    uint nb[8][4];
+    uint nb[12][4];
+
+    uint rms_partials;
 } p;
 
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];} a[];
 layout (binding = 0) writeonly buffer D {D_TYPE data_d[];} d[];
 
+layout (binding = 0, std430) buffer PartialBuf {float partial_sums[];} partials[];
+
 layout(constant_id = 0) const uint num_srcs = 2;
 
 uint src_idx(uint s, uint i00, uint i01, uint i02, uint i03) {
@@ -42,14 +50,22 @@ const uint num_threads = 256;
 
 layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in;
 
+#if ADD_RMS
+// XXX TODO this could be sized based on number of subgroups, but that's not considered a constant
+shared FLOAT_TYPE sumsh[num_threads];
+#endif
+
 void main() {
     uint idx = get_idx();
+    uint orig_idx = idx;
     uint ne = p.ne20 * p.ne21 * p.ne22 * p.ne23;
 
     // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation
     const uint num_iter = 2;
 
+    FLOAT_TYPE sum_sq = 0;
+
     [[unroll]] for (uint i = 0; i < num_iter; ++i) {
         if (idx >= ne) {
             continue;
@@ -61,8 +77,32 @@ void main() {
         [[unroll]] for (uint s = 0; s < num_srcs; ++s) {
             sum += FLOAT_TYPE(a[s].data_a[src_idx(s, i00, i01, i02, i03)]);
         }
+        sum_sq += sum*sum;
 
         d[num_srcs].data_d[dst_idx(i00, i01, i02, i03)] = D_TYPE(sum);
 
         idx += num_threads;
     }
+
+#if ADD_RMS
+    if (p.rms_partials != 0) {
+        // reduce the sum within each subgroup, then across subgroups
+        const uint NumSubgroups = num_threads / gl_SubgroupSize;
+        sum_sq = subgroupAdd(sum_sq);
+        if (gl_SubgroupInvocationID == 0) {
+            sumsh[gl_SubgroupID] = sum_sq;
+        }
+        barrier();
+        [[unroll]] for (uint s = NumSubgroups / 2; s > 0; s >>= 1) {
+            if (gl_SubgroupID < s && gl_SubgroupInvocationID == 0) {
+                sum_sq += sumsh[gl_SubgroupID + s];
+                sumsh[gl_SubgroupID] = sum_sq;
+            }
+            barrier();
+        }
+
+        if (gl_SubgroupID == 0 && gl_SubgroupInvocationID == 0) {
+            partials[num_srcs + 1].partial_sums[orig_idx / (num_iter * num_threads)] = sum_sq;
+        }
+    }
+#endif
 }
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp
index bdd7db2d6987a..41197e9301ad8 100644
--- a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp
@@ -10,9 +10,9 @@ layout (constant_id = 1) const bool do_multiply = false;
 
 layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
 
-shared FLOAT_TYPE sum[BLOCK_SIZE];
+shared FLOAT_TYPE sumsh[BLOCK_SIZE]; -void main() { +void rms_norm(uint num_iters) { const uint ncols = p.ne00; const uint nrows = gl_NumWorkGroups.x; const uint nchannels = gl_NumWorkGroups.y; @@ -30,38 +30,76 @@ void main() { uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset(); uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset(); - sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp + FLOAT_TYPE sum = FLOAT_TYPE(0.0f); // partial sum for thread in warp - [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { - const FLOAT_TYPE xi = FLOAT_TYPE(data_a[a_offset + col]); - sum[tid] += xi * xi; + [[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) { + FLOAT_TYPE xi = FLOAT_TYPE(0); + if (col < ncols) { + xi = FLOAT_TYPE(data_a[a_offset + col]); + } + sum += xi * xi; } + sumsh[tid] = sum; // sum up partial sums and write back result barrier(); [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) { if (tid < s) { - sum[tid] += sum[tid + s]; + sum += sumsh[tid + s]; + sumsh[tid] = sum; } barrier(); } + sum = sumsh[0]; - const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(ncols); + const FLOAT_TYPE mean = sum / FLOAT_TYPE(ncols); const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1)); if (do_multiply) { if (ncols > p.ne10) { - [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { + [[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) { + if (col >= ncols) { + continue; + } data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + fastmod(col, p.ne10)])); } } else { - [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { + [[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) { + if (col >= ncols) { + continue; + } data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col])); } } } else { - [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { + [[unroll]] for (uint col = tid, idx = 0; idx < num_iters; col += BLOCK_SIZE, ++idx) { + if (col >= ncols) { + continue; + } data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col])); } } } + +void main() { + // instantiate the rms_norm function for several different + // dimensions, to allow loop unrolling + uint num_blocks = (p.ne00 + BLOCK_SIZE - 1) / BLOCK_SIZE; + if (num_blocks > 32) { + rms_norm(num_blocks); + } else if (num_blocks > 16) { + rms_norm(32); + } else if (num_blocks > 8) { + rms_norm(16); + } else if (num_blocks > 4) { + rms_norm(8); + } else if (num_blocks == 4) { + rms_norm(4); + } else if (num_blocks == 3) { + rms_norm(3); + } else if (num_blocks == 2) { + rms_norm(2); + } else if (num_blocks == 1) { + rms_norm(1); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp new file mode 100644 index 0000000000000..ba4677c293392 --- /dev/null +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm_partials.comp @@ -0,0 +1,65 @@ +#version 450 + +#include "generic_binary_head.comp" +#include "types.comp" + +#extension GL_EXT_control_flow_attributes : enable +#extension GL_KHR_shader_subgroup_arithmetic : enable +#extension GL_KHR_shader_subgroup_basic : enable + +#define BLOCK_SIZE 128 + +layout (constant_id = 1) const bool do_multiply = false; + +layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; + +layout (binding = 3, 
std430) readonly buffer PartialsBuf {float partial_sums[];}; + +shared FLOAT_TYPE sumsh[BLOCK_SIZE]; + +void main() { + const uint ncols = p.ne00; + const uint nrows = gl_NumWorkGroups.x; + const uint nchannels = gl_NumWorkGroups.y; + + const uint row = 0; + const uint channel = gl_WorkGroupID.y; + const uint samp = gl_WorkGroupID.z; + // The work is split across multiple workgroups in the x dimension. Each invocation + // processes one element + const uint tid = gl_GlobalInvocationID.x; + + const uint stride_row = p.nb01; + const uint stride_channel = p.nb02; + const uint stride_sample = p.nb03; + + uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset(); + uint32_t b_offset = src1_idx(0, row, channel, samp) + get_boffset(); + uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset(); + + FLOAT_TYPE sum = FLOAT_TYPE(0.0f); // partial sum for thread in warp + + uint32_t num_partials = p.param3; + for (uint32_t i = gl_SubgroupInvocationID; i < num_partials; i += gl_SubgroupSize) { + sum += partial_sums[i]; + } + sum = subgroupAdd(sum); + + uint col = tid; + if (col >= ncols) { + return; + } + + const FLOAT_TYPE mean = sum / FLOAT_TYPE(ncols); + const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1)); + + if (do_multiply) { + if (ncols > p.ne10) { + data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + fastmod(col, p.ne10)])); + } else { + data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col]) * FLOAT_TYPE(data_b[b_offset + col])); + } + } else { + data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col])); + } +} diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp index 123ae044914ed..50a27748317be 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/vulkan-shaders-gen.cpp @@ -503,6 +503,7 @@ void process_shaders() { string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); + string_to_spv("rms_norm_partials_f32", "rms_norm_partials.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("rms_norm_back_f32", "rms_norm_back.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); string_to_spv("l2_norm_f32", "l2_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}})); @@ -538,13 +539,15 @@ void process_shaders() { s += std::string(dst_f16 ? "_f16" : "_f32"); return s; }; - for (std::string op : {"add", "sub", "mul", "div"}) { + for (std::string op : {"add", "sub", "mul", "div", "add_rms", }) { for (auto src0_f16 : {false, true}) { for (auto src1_f16 : {false, true}) { for (auto dst_f16 : {false, true}) { for (auto rte : {false, true}) { + auto source = op == "add_rms" ? std::string("add") : op; auto name = op + get_suffix(src0_f16, src1_f16, dst_f16) + (rte ? "_rte" : ""); - string_to_spv(name.c_str(), op + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}, {"RTE16", rte ? 
"1" : "0"}}); + auto add_rms = op == "add_rms" ? "1" : "0"; + string_to_spv(name.c_str(), source + ".comp", {{"A_TYPE", get_type_str(src0_f16)}, {"B_TYPE", get_type_str(src1_f16)}, {"D_TYPE", get_type_str(dst_f16)}, {"FLOAT_TYPE", "float"}, {"RTE16", rte ? "1" : "0"}, {"ADD_RMS" , add_rms}}); } } } @@ -687,7 +690,8 @@ void process_shaders() { string_to_spv("add_id_f32", "add_id.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}})); - string_to_spv("multi_add_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}}); + string_to_spv("multi_add_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}, {"ADD_RMS" , "0"}}); + string_to_spv("multi_add_rms_f32", "multi_add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}, {"RTE16", "1"}, {"ADD_RMS" , "1"}}); for (auto &c : compiles) { c.wait(); @@ -745,7 +749,7 @@ void write_output_files() { } std::string suffixes[2] = {"_f32", "_f16"}; - for (const char *op : {"add", "sub", "mul", "div"}) { + for (const char *op : {"add", "sub", "mul", "div", "add_rms"}) { fprintf(hdr, "extern unsigned char *%s_data[2][2][2][2];\n", op); fprintf(hdr, "extern uint64_t %s_len[2][2][2][2];\n", op); std::string data = "unsigned char *" + std::string(op) + "_data[2][2][2][2] = "; diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 2e53f8e21a5a2..1e1e43f50594d 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2858,6 +2858,7 @@ struct test_rms_norm_mul_add : public test_case { const std::array ne; const float eps; const bool broadcast; + const bool multi_add; // test a sequence of adds feeding into rms_norm std::string op_desc(ggml_tensor * t) override { GGML_UNUSED(t); @@ -2867,13 +2868,13 @@ struct test_rms_norm_mul_add : public test_case { bool run_whole_graph() override { return true; } std::string vars() override { - return VARS_TO_STR4(type, ne, eps, broadcast); + return VARS_TO_STR5(type, ne, eps, broadcast, multi_add); } test_rms_norm_mul_add(ggml_type type = GGML_TYPE_F32, std::array ne = {64, 5, 4, 3}, - float eps = 1e-6f, bool broadcast = false) - : type(type), ne(ne), eps(eps), broadcast(broadcast) {} + float eps = 1e-6f, bool broadcast = false, bool multi_add = false) + : type(type), ne(ne), eps(eps), broadcast(broadcast), multi_add(multi_add) {} ggml_tensor * build_graph(ggml_context * ctx) override { std::array broadcast_dims = {ne[0]*2, ne[1]*3, ne[2]*3, ne[3]*4}; @@ -2891,6 +2892,9 @@ struct test_rms_norm_mul_add : public test_case { // Use a, b and c early, so we don't end up with an OP_NONE between rms_norm and mul a = ggml_add(ctx, ggml_add(ctx, a, b), c); + if (multi_add) { + a = ggml_add(ctx, ggml_add(ctx, a, b), c); + } ggml_tensor * out = ggml_add(ctx, ggml_mul(ctx, ggml_rms_norm(ctx, a, eps), b), c); ggml_set_name(out, "out"); @@ -5842,6 +5846,11 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps)); test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {64, 5, 4, 3}, eps, true)); } + for (uint32_t n : {1, 511, 1025, 8192, 33*512}) { + for (bool multi_add : {false, true}) { + test_cases.emplace_back(new test_rms_norm_mul_add(GGML_TYPE_F32, {n, 1, 1, 1}, 1e-6f, false, multi_add)); + } + } test_cases.emplace_back(new test_l2_norm(GGML_TYPE_F32, {64, 5, 4, 3}, 1e-12f)); From 
710dfc465a68f7443b87d9f792cffba00ed739fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 23 Aug 2025 21:37:06 +0200 Subject: [PATCH 136/140] CUDA: fix half2 -> half conversion for HIP (#15529) --- ggml/src/ggml-cuda/fattn-tile-f16.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/fattn-tile-f16.cu b/ggml/src/ggml-cuda/fattn-tile-f16.cu index 6239d184d0a67..a900799a991c0 100644 --- a/ggml/src/ggml-cuda/fattn-tile-f16.cu +++ b/ggml/src/ggml-cuda/fattn-tile-f16.cu @@ -258,7 +258,7 @@ static __global__ void flash_attn_tile_ext_f16( const half val = hexp(sink - kqmax[j0/nwarps]); kqsum[j0/nwarps] = kqsum[j0/nwarps] * KQ_max_scale; if (threadIdx.x == 0) { - kqsum[j0/nwarps].x = __hadd(kqsum[j0/nwarps].x, val); + kqsum[j0/nwarps].x = __hadd(__low2half(kqsum[j0/nwarps]), val); } #pragma unroll From e78cf0d4b1bdbbc2479f11d58ce0c8f51f755875 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 24 Aug 2025 03:48:21 -0500 Subject: [PATCH 137/140] vulkan: workaround MoltenVK compile failure in multi_add (#15506) * vulkan: workaround MoltenVK compile failure in multi_add * Update ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp Co-authored-by: 0cc4m --- ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp b/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp index f2f218b04ac34..854a2ad818760 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/multi_add.comp @@ -23,8 +23,11 @@ layout (push_constant) uniform parameter2 uint rms_partials; } p; -layout (binding = 0) readonly buffer A {A_TYPE data_a[];} a[]; -layout (binding = 0) writeonly buffer D {D_TYPE data_d[];} d[]; +// Workaround for MoltenVK Bug, see https://github.com/ggml-org/llama.cpp/issues/15498 +// layout (binding = 0) readonly buffer A {A_TYPE data_a[];} a[]; +// layout (binding = 0) writeonly buffer D {D_TYPE data_d[];} d[]; +layout (binding = 0) buffer A {A_TYPE data_a[];} a[]; +layout (binding = 0) buffer D {D_TYPE data_d[];} d[]; layout (binding = 0, std430) buffer PartialBuf {float partial_sums[];} partials[]; From a9c6ffcbfacee092bfaaa400306fceda18199737 Mon Sep 17 00:00:00 2001 From: Ruben Ortlam Date: Sun, 24 Aug 2025 10:48:53 +0200 Subject: [PATCH 138/140] vulkan: enable Conv2D for Apple after MoltenVK fixed the bug (#15526) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 2c8d9ecaa0a03..c77d1d32a0695 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -11853,14 +11853,13 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm // Op is disabled for Apple because it segfaults at pipeline create time on MoltenVK ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; const vk_device& device = ggml_vk_get_device(ctx->device); - bool is_Apple = ggml_vk_get_device(ctx->device)->vendor_id == VK_VENDOR_ID_APPLE; // Channel-contiguous format is not supported yet. 
return ((op->src[0]->type == GGML_TYPE_F32 || op->src[0]->type == GGML_TYPE_F16) &&
                   op->src[1]->type == GGML_TYPE_F32 &&
                   op->type == GGML_TYPE_F32 &&
                   ggml_is_contiguous(op->src[0]) &&
                   ggml_is_contiguous(op->src[1]) &&
-                   ggml_is_contiguous(op)) && !is_Apple;
+                   ggml_is_contiguous(op));
        }
        default:
            return false;

From c9a24fb93208fbbd3da6d903eb75431bfa97e59e Mon Sep 17 00:00:00 2001
From: Jeff Bolz
Date: Sun, 24 Aug 2025 04:24:25 -0500
Subject: [PATCH 139/140] vulkan: Support FA with any multiple of 8 head sizes
 (#15537)

The scalar FA shader already handled multiples of 8. The coopmat1 FA
shader assumed 16x16x16 and the shared memory allocations need the HSK
dimensions padded to a multiple of 16. NVIDIA's coopmat2 implementation
requires multiples of 16 for N and K, and needs the matrix dimensions
padded and loads clamped.

Store the FA pipelines in a map, indexed by the pipeline state.
---
 ggml/src/ggml-vulkan/ggml-vulkan.cpp | 213 ++++++++----------
 .../vulkan-shaders/flash_attn_base.comp | 4 +
 .../vulkan-shaders/flash_attn_cm1.comp | 23 +-
 .../vulkan-shaders/flash_attn_cm2.comp | 36 +--
 tests/test-backend-ops.cpp | 4 +-
 5 files changed, 143 insertions(+), 137 deletions(-)

diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
index c77d1d32a0695..a5406f761274d 100644
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -115,6 +115,8 @@ struct vk_pipeline_struct {
    uint32_t parameter_count;
    std::array<uint32_t, 3> wg_denoms;
    uint32_t align;
+    // true if fields have been set by ggml_vk_create_pipeline
+    bool initialized {};
    // set to true to request the pipeline is compiled after the dryrun
    bool needed {};
    // set to true when the shader has been compiled
@@ -227,21 +229,6 @@ enum vk_device_architecture {
    NVIDIA_PRE_TURING,
 };

-// HSK x HSV
-enum FaHeadSizes {
-    FA_HEAD_SIZE_64,
-    FA_HEAD_SIZE_80,
-    FA_HEAD_SIZE_96,
-    FA_HEAD_SIZE_112,
-    FA_HEAD_SIZE_128,
-    FA_HEAD_SIZE_192,
-    FA_HEAD_SIZE_192_128,
-    FA_HEAD_SIZE_256,
-    FA_HEAD_SIZE_576_512,
-    FA_HEAD_SIZE_UNSUPPORTED,
-    FA_HEAD_SIZE_COUNT = FA_HEAD_SIZE_UNSUPPORTED,
-};
-
 static vk_device_architecture get_device_architecture(const vk::PhysicalDevice& device) {
    vk::PhysicalDeviceProperties props = device.getProperties();

@@ -351,6 +338,28 @@ enum dmmv_wg_sizes {
    DMMV_WG_SIZE_COUNT,
 };

+enum FaCodePath {
+    FA_SCALAR,
+    FA_COOPMAT1,
+    FA_COOPMAT2,
+};
+
+struct vk_fa_pipeline_state {
+    vk_fa_pipeline_state(uint32_t HSK, uint32_t HSV, bool small_rows, FaCodePath path, bool aligned, bool f32acc)
+        : HSK(HSK), HSV(HSV), small_rows(small_rows), path(path), aligned(aligned), f32acc(f32acc) {}
+
+    uint32_t HSK, HSV;
+    bool small_rows;
+    FaCodePath path;
+    bool aligned;
+    bool f32acc;
+
+    bool operator<(const vk_fa_pipeline_state &b) const {
+        return std::tie(HSK, HSV, small_rows, path, aligned, f32acc) <
+               std::tie(b.HSK, b.HSV, b.small_rows, b.path, b.aligned, b.f32acc);
+    }
+};
+
 static constexpr uint32_t num_argsort_pipelines = 11;
 static constexpr uint32_t max_argsort_cols = 1 << (num_argsort_pipelines-1);

@@ -541,16 +550,11 @@ struct vk_device_struct {
    vk_pipeline pipeline_conv2d_dw_whcn_f32, pipeline_conv2d_dw_whcn_f16_f32;
    vk_pipeline pipeline_conv2d_dw_cwhn_f32, pipeline_conv2d_dw_cwhn_f16_f32;

-    // [2][2][2] is for {f16acc,f32acc}x{large,small_rows}x{unaligned, aligned}
-    vk_pipeline pipeline_flash_attn_f32_f16_cm2[GGML_TYPE_COUNT][FA_HEAD_SIZE_COUNT][2][2][2];
-
-    vk_pipeline pipeline_flash_attn_f32_f16_cm1[GGML_TYPE_COUNT][FA_HEAD_SIZE_COUNT][2][2][2];
-
-    vk_pipeline pipeline_flash_attn_f32_f16[GGML_TYPE_COUNT][FA_HEAD_SIZE_COUNT][2][2][2];
+    std::map<vk_fa_pipeline_state, vk_pipeline> pipeline_flash_attn_f32_f16[GGML_TYPE_COUNT];

    vk_pipeline pipeline_flash_attn_split_k_reduce;

-    std::unordered_map<std::string, vk_pipeline_ref> pipelines;
+    std::vector<vk_pipeline_ref> all_pipelines;

    std::vector<std::tuple<void*, size_t, vk_buffer>> pinned_memory;

@@ -581,15 +585,15 @@ struct vk_device_struct {
        compute_queue.cmd_pool.destroy(device);
        transfer_queue.cmd_pool.destroy(device);

-        for (auto& pipeline : pipelines) {
-            if (pipeline.second.expired()) {
+        for (auto& pipeline : all_pipelines) {
+            if (pipeline.expired()) {
                continue;
            }

-            vk_pipeline pl = pipeline.second.lock();
+            vk_pipeline pl = pipeline.lock();
            ggml_vk_destroy_pipeline(device, pl);
        }
-        pipelines.clear();
+        all_pipelines.clear();

        device.destroyDescriptorSetLayout(dsl);

@@ -1499,7 +1503,7 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin

    {
        std::lock_guard<std::mutex> guard(device->mutex);
-        device->pipelines.insert({ pipeline->name, pipeline });
+        device->all_pipelines.push_back(pipeline);
    }

    {
@@ -1974,47 +1978,12 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector<vk::Event>&& events
    );
 }

-enum FaCodePath {
-    FA_SCALAR,
-    FA_COOPMAT1,
-    FA_COOPMAT2,
-};
-
-static FaHeadSizes fa_get_head_sizes(uint32_t hsk, uint32_t hsv) {
-    if (hsk != 192 && hsk != 576 && hsk != hsv) {
-        return FA_HEAD_SIZE_UNSUPPORTED;
-    }
-    switch (hsk) {
-    case 64: return FA_HEAD_SIZE_64;
-    case 80: return FA_HEAD_SIZE_80;
-    case 96: return FA_HEAD_SIZE_96;
-    case 112: return FA_HEAD_SIZE_112;
-    case 128: return FA_HEAD_SIZE_128;
-    case 192:
-        if (hsv == 192) {
-            return FA_HEAD_SIZE_192;
-        } else if (hsv == 128) {
-            return FA_HEAD_SIZE_192_128;
-        } else {
-            return FA_HEAD_SIZE_UNSUPPORTED;
-        }
-    case 256: return FA_HEAD_SIZE_256;
-    case 576:
-        if (hsv == 512) {
-            return FA_HEAD_SIZE_576_512;
-        } else {
-            return FA_HEAD_SIZE_UNSUPPORTED;
-        }
-    default: return FA_HEAD_SIZE_UNSUPPORTED;
-    }
-}
-
 // number of rows/cols for flash attention shader
 static constexpr uint32_t flash_attention_num_small_rows = 32;
 static constexpr uint32_t scalar_flash_attention_num_small_rows = 1;

 static uint32_t get_fa_scalar_num_large_rows(uint32_t hsv) {
-    if (hsv >= 512) {
+    if (hsv >= 192) {
        return 2;
    } else {
        return 8;
    }
@@ -2044,7 +2013,13 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t hsk, uint3
        if (small_rows) {
            return {scalar_flash_attention_num_small_rows, 64};
        } else {
-            return {get_fa_scalar_num_large_rows(hsv), 32};
+            if ((hsv | hsk) & 8) {
+                // HSV/HSK not being a multiple of 16 makes D_split smaller, which makes cols_per_iter
+                // larger, and Bc needs to be >= cols_per_thread. 64 is large enough, 32 is not.
+                return {get_fa_scalar_num_large_rows(hsv), 64};
+            } else {
+                return {get_fa_scalar_num_large_rows(hsv), 32};
+            }
        }
    }

@@ -2062,8 +2037,8 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t hsk, uint3
    }

    // small cols to reduce register count
-    if (ggml_is_quantized(type) || hsk >= 256) {
-        if (hsk >= 512) {
+    if (ggml_is_quantized(type) || hsk >= 256 || hsv >= 256) {
+        if (hsk >= 512 || hsv >= 512) {
            return {32, 32};
        } else {
            return {64, 32};
        }
@@ -2072,6 +2047,10 @@ static std::array<uint32_t, 2> fa_rows_cols(FaCodePath path, uint32_t hsk, uint3
    return {64, 64};
 }

+static uint32_t fa_align(FaCodePath path, uint32_t hsk, uint32_t hsv, ggml_type type, bool small_rows) {
+    return fa_rows_cols(path, hsk, hsv, 0, type, small_rows)[1];
+}
+
 static bool ggml_vk_matmul_shmem_support(const vk_device& device, const std::vector<uint32_t>& warptile, bool mul_mat_id, ggml_type src0_type) {

    uint32_t lut_size = 0;
@@ -2337,11 +2316,14 @@ static void ggml_vk_load_shaders(vk_device& device) {

        if (!pipeline) {
            pipeline = std::make_shared<vk_pipeline_struct>();
+        }
+        if (!pipeline->initialized) {
            pipeline->name = name;
            pipeline->parameter_count = parameter_count;
            pipeline->push_constant_size = push_constant_size;
            pipeline->wg_denoms = wg_denoms;
            pipeline->align = align;
+            pipeline->initialized = true;
        }

        if (!pipeline->needed || pipeline->compiled) {
@@ -2387,26 +2369,30 @@ static void ggml_vk_load_shaders(vk_device& device) {
        return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split};
    };

-#define CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, HSK, HSV, HEAD_SIZES) \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,false), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \
-        ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 
32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][0][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,false), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,false), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,false)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][0][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f16acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][0], "flash_attn_f32_f16_" #HEAD_SIZES "_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,true), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ - ggml_vk_create_pipeline(device, device->pipeline_flash_attn_f32_f16 ## SUFFIX[TYPE][FA_HEAD_SIZE_##HEAD_SIZES][1][1][1], "flash_attn_f32_f16_" #HEAD_SIZES "_aligned_f32acc_smallrows" #NAMELC #SUFFIX, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,true), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,true), fa_rows_cols(FAPATH,HSK,HSV,0,TYPE,true)[1], true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 
32 : 0)); \ - #define CREATE_FA(TYPE, NAMELC, FAPATH, SUFFIX) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 64, 64, 64) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 80, 80, 80) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 96, 96, 96) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 112, 112, 112) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 128, 128, 128) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 192, 192, 192) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 192, 128, 192_128) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 256, 256, 256) \ - CREATE_FA2(TYPE, NAMELC, FAPATH, SUFFIX, 576, 512, 576_512) + for (auto &fa : device->pipeline_flash_attn_f32_f16[TYPE]) { \ + uint32_t HSK = fa.first.HSK; \ + uint32_t HSV = fa.first.HSV; \ + bool small_rows = fa.first.small_rows; \ + FaCodePath path = fa.first.path; \ + bool aligned = fa.first.aligned; \ + bool f32acc = fa.first.f32acc; \ + if (path == FAPATH) { \ + if (aligned) { \ + if (f32acc) { \ + ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows), fa_align(FAPATH,HSK,HSV,TYPE,small_rows), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + } else { \ + ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_aligned_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,0,TYPE,small_rows), fa_spec_constants(FAPATH, HSK,HSV,0,TYPE,small_rows), fa_align(FAPATH,HSK,HSV,TYPE,small_rows), true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + } \ + } else { \ + if (f32acc) { \ + ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f32acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + } else { \ + ggml_vk_create_pipeline(device, fa.second, "flash_attn_f32_f16_f16acc" #NAMELC, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _len, flash_attn_f32_f16_ ## NAMELC ## _f16acc ## SUFFIX ## _data, "main", 6, sizeof(vk_flash_attn_push_constants), fa_wg_denoms(FAPATH, HSK,HSV,1,TYPE,small_rows), fa_spec_constants(FAPATH, HSK,HSV,1,TYPE,small_rows), 1, true, FAPATH==FA_COOPMAT1, (FAPATH==FA_COOPMAT1 ? 32 : 0)); \ + } \ + } \ + } \ + } CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, ) CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, ) @@ -2429,7 +2415,6 @@ static void ggml_vk_load_shaders(vk_device& device) { CREATE_FA(GGML_TYPE_IQ4_NL, iq4_nl, FA_COOPMAT2, _cm2) } #endif -#undef CREATE_FA2 #undef CREATE_FA #if defined(VK_NV_cooperative_matrix2) && defined(GGML_VULKAN_COOPMAT2_GLSLC_SUPPORT) @@ -6731,18 +6716,21 @@ static bool ggml_vk_flash_attn_coopmat_shmem_support(const vk_device& device, co const uint32_t Br = coopmat1_flash_attention_num_large_rows; const uint32_t Bc = scalar_flash_attention_Bc; + const uint32_t hsk_pad = ROUNDUP_POW2(hsk, 16); + const uint32_t acctype = f32acc ? 
4 : 2;
    const uint32_t f16vec4 = 8;

    const uint32_t tmpsh = wg_size * sizeof(float);
    const uint32_t tmpshv4 = wg_size * 4 * acctype;

-    const uint32_t Qf = Br * (hsk / 4 + 2) * f16vec4;
+    const uint32_t qstride = hsk_pad / 4 + 2;
+    const uint32_t Qf = Br * qstride * f16vec4;

    const uint32_t sfshstride = (hsk <= 128) ? (Br + 8) : Br;
    const uint32_t sfsh = Bc * sfshstride * acctype;

-    const uint32_t kshstride = hsk / 4 + 2;
+    const uint32_t kshstride = hsk_pad / 4 + 2;
    const uint32_t ksh = Bc * kshstride * f16vec4;

    const uint32_t slope = Br * sizeof(float);
@@ -6853,7 +6841,6 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
        workgroups_y /= N;
    }

-    vk_pipeline *pipelines;
    bool small_rows = N <= get_fa_num_small_rows(path);

    // coopmat1 does not actually support "small rows" (it needs 16 rows).
@@ -6873,37 +6860,36 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
        small_rows = true;
    }

-    bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;
-
-    FaHeadSizes head_sizes = fa_get_head_sizes(k->ne[0], v->ne[0]);
-
-    switch (path) {
-    case FA_SCALAR:
-        pipelines = &ctx->device->pipeline_flash_attn_f32_f16[k->type][head_sizes][f32acc][small_rows][0];
-        break;
-    case FA_COOPMAT1:
-        pipelines = &ctx->device->pipeline_flash_attn_f32_f16_cm1[k->type][head_sizes][f32acc][small_rows][0];
-        break;
-    case FA_COOPMAT2:
-        pipelines = &ctx->device->pipeline_flash_attn_f32_f16_cm2[k->type][head_sizes][f32acc][small_rows][0];
-        break;
-    default:
-        GGML_ASSERT(0);
-    }
-    assert(pipelines);
-
    const uint32_t q_stride = (uint32_t)(nbq1 / ggml_type_size(q->type));
    const uint32_t k_stride = (uint32_t)(nbk1 / ggml_type_size(k->type));
    const uint32_t v_stride = (uint32_t)(nbv1 / ggml_type_size(v->type));

-    bool aligned = (KV % pipelines[1]->align) == 0 &&
+    uint32_t alignment = fa_align(path, HSK, HSV, k->type, small_rows);
+    bool aligned = (KV % alignment) == 0 &&
                   // the "aligned" shader variant will forcibly align strides, for performance
                   (q_stride & 7) == 0 && (k_stride & 7) == 0 && (v_stride & 7) == 0;

+    // Need to use the coopmat2 variant that clamps loads when HSK/HSV aren't sufficiently aligned.
+    if (((HSK | HSV) % 16) != 0 && path == FA_COOPMAT2) {
+        aligned = false;
+    }
+
    // mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
    GGML_ASSERT((nem1 % GGML_KQ_MASK_PAD) == 0);

-    vk_pipeline pipeline = pipelines[aligned];
+    bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;
+
+    vk_fa_pipeline_state fa_pipeline_state(HSK, HSV, small_rows, path, aligned, f32acc);
+
+    vk_pipeline pipeline = nullptr;
+
+    auto &pipelines = ctx->device->pipeline_flash_attn_f32_f16[k->type];
+    auto it = pipelines.find(fa_pipeline_state);
+    if (it != pipelines.end()) {
+        pipeline = it->second;
+    } else {
+        pipelines[fa_pipeline_state] = pipeline = std::make_shared<vk_pipeline_struct>();
+    }
+    assert(pipeline);

    uint32_t split_kv = KV;
@@ -6919,7 +6905,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
    if (split_k > 1) {
        // Try to evenly split KV into split_k chunks, but it needs to be a multiple
        // of "align", so recompute split_k based on that.
- split_kv = ROUNDUP_POW2(std::max(1u, KV / split_k), pipelines[1]->align); + split_kv = ROUNDUP_POW2(std::max(1u, KV / split_k), alignment); split_k = CEIL_DIV(KV, split_kv); workgroups_x = split_k; } @@ -11629,8 +11615,9 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context; auto device = ggml_vk_get_device(ctx->device); bool coopmat2 = device->coopmat2; - FaHeadSizes head_sizes = fa_get_head_sizes(op->src[1]->ne[0], op->src[2]->ne[0]); - if (head_sizes == FA_HEAD_SIZE_UNSUPPORTED) { + uint32_t HSK = op->src[1]->ne[0]; + uint32_t HSV = op->src[2]->ne[0]; + if ((HSK % 8) != 0 || (HSV % 8) != 0) { return false; } if (op->src[4] && op->src[4]->type != GGML_TYPE_F32) { diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp index b57c9dcfc4ee5..f73e17e1fa8d9 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.comp @@ -9,6 +9,10 @@ layout (constant_id = 4) const uint32_t HSV = 32; layout (constant_id = 5) const uint32_t Clamp = 0; layout (constant_id = 6) const uint32_t D_split = 16; +// Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths +const uint32_t HSK_pad = (HSK + 15) & ~15; +const uint32_t HSV_pad = (HSV + 15) & ~15; + layout (push_constant) uniform parameter { uint32_t N; uint32_t KV; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp index 81cc3f81fce77..97c2a54129709 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm1.comp @@ -46,14 +46,14 @@ const uint32_t MatBc = 16; shared FLOAT_TYPE tmpsh[gl_WorkGroupSize.x]; shared ACC_TYPEV4 tmpshv4[gl_WorkGroupSize.x]; -const uint32_t qstride = HSK / 4 + 2; // in units of f16vec4 +const uint32_t qstride = HSK_pad / 4 + 2; // in units of f16vec4 shared f16vec4 Qf[Br * qstride]; // Avoid padding for hsk==256 to make it fit in 48KB shmem. const uint32_t sfshstride = (HSK <= 128) ? (Br + 8) : Br; shared ACC_TYPE sfsh[Bc * sfshstride]; -const uint32_t kshstride = HSK / 4 + 2; // in units of f16vec4 +const uint32_t kshstride = HSK_pad / 4 + 2; // in units of f16vec4 shared f16vec4 ksh[Bc * kshstride]; shared float slope[Br]; @@ -74,6 +74,21 @@ void main() { #define tile_row(r) (row_tid * rows_per_thread + (r)) + // Zero-initialize shared memory for Q/K when HSK is not a multiple of 16 (HSK_pad > HSK). 
+ if ((HSK % 16) != 0) { + [[unroll]] for (uint i = 0; i < Br * qstride; i += gl_WorkGroupSize.x) { + if (i + tid < Br * qstride) { + Qf[i + tid] = f16vec4(0); + } + } + [[unroll]] for (uint i = 0; i < Bc * kshstride; i += gl_WorkGroupSize.x) { + if (i + tid < Bc * kshstride) { + ksh[i + tid] = f16vec4(0); + } + } + barrier(); + } + uint32_t q_offset = (iq2*p.nb02+iq3*p.nb03) / 4; [[unroll]] for (uint32_t idx = 0; idx < Br * HSK / 4; idx += gl_WorkGroupSize.x) { @@ -151,14 +166,14 @@ void main() { } barrier(); - // K * Q^T -> S^T: Bc x HSK * HSK x Br -> Bc x Br + // K * Q^T -> S^T: Bc x HSK_pad * HSK_pad x Br -> Bc x Br // Bc split across workgroup (four subgroups), loop over HSK in chunks of 16: 16 x 16 * 16 x 16 -> 16 x 16 // This is written transposed in order to allow for N being 8 if implementations need it coopmat SfMat = coopmat(0); coopmat KMat; coopmat QMat; - for (uint32_t d = 0; d < HSK / 16; ++d) { + for (uint32_t d = 0; d < HSK_pad / 16; ++d) { coopMatLoad(QMat, Qf, d * 16 / 4, qstride, gl_CooperativeMatrixLayoutColumnMajor); uint coord = (gl_SubgroupID * MatBc) * kshstride + d * 16 / 4; diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp index b0564ca0bfc83..77ae5ff01d03e 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp @@ -104,16 +104,16 @@ void main() { tensorLayoutK = setTensorLayoutStrideNV(tensorLayoutK, k_stride, 1); tensorLayoutV = setTensorLayoutStrideNV(tensorLayoutV, v_stride, 1); - coopmat Q; - coopmat Qf16; + coopmat Q; + coopmat Qf16; uint32_t q_offset = iq2*p.nb02+iq3*p.nb03; - coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK)); + coopMatLoadTensorNV(Q, data_q, q_offset, sliceTensorLayoutNV(tensorLayoutQ, i * Br, Br, 0, HSK_pad)); - Qf16 = coopmat(Q); + Qf16 = coopmat(Q); Qf16 *= float16_t(p.scale); - coopmat O = coopmat(0); + coopmat O = coopmat(0); coopmat L, M; @@ -140,10 +140,10 @@ void main() { coopmat S = coopmat(0); - coopmat K_T; + coopmat K_T; uint32_t k_offset = ik2*p.nb12 + ik3*p.nb13; - coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK), tensorViewTranspose DECODEFUNC); + coopMatLoadTensorNV(K_T, data_k, k_offset, sliceTensorLayoutNV(tensorLayoutK, j * Bc, Bc, 0, HSK_pad), tensorViewTranspose DECODEFUNC); S = coopMatMulAdd(Qf16, K_T, S); if (p.logit_softcap != 0.0f) { @@ -208,31 +208,31 @@ void main() { rowsum = coopmat(0.0); rowsum = coopMatMulAdd(P_A, One, rowsum); - coopmat V; + coopmat V; uint32_t v_offset = iv2*p.nb22 + iv3*p.nb23; - coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV) DECODEFUNC); + coopMatLoadTensorNV(V, data_v, v_offset, sliceTensorLayoutNV(tensorLayoutV, j * Bc, Bc, 0, HSV_pad) DECODEFUNC); L = eM*L + rowsum; // This is the "diagonal" matrix in the paper, but since we do componentwise // multiply rather than matrix multiply it has the diagonal element smeared // across the row - coopmat eMdiag; + coopmat eMdiag; // resize eM by using smear/reduce coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce); // multiply with fp16 accumulation, then add to O. - coopmat PV = coopmat(0); + coopmat PV = coopmat(0); PV = coopMatMulAdd(P_A, V, PV); - O = eMdiag * O + coopmat(PV); + O = eMdiag * O + coopmat(PV); } // If there is split_k, then the split_k resolve shader does the final // division by L. 
Store the intermediate O value and per-row m and L values. if (p.k_num > 1) { - coopmat O_D = coopmat(O); + coopmat O_D = coopmat(O); uint32_t o_offset = HSV * p.ne1 * (split_k_index + iq3 * p.k_num); coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N); @@ -243,16 +243,16 @@ void main() { return; } - coopmat Ldiag; + coopmat Ldiag; // resize L by using smear/reduce coopMatReduceNV(Ldiag, L, gl_CooperativeMatrixReduceRowNV, smearReduce); if ((p.mask_n_head_log2 & SINK_ENABLE_BIT) != 0) { - coopmat S; + coopmat S; coopMatPerElementNV(S, S, perElemOpGetSink, iq2); - coopmat Mr; + coopmat Mr; // resize M by using smear/reduce coopMatReduceNV(Mr, M, gl_CooperativeMatrixReduceRowNV, smearReduce); @@ -285,7 +285,7 @@ void main() { uint32_t o_offset = iq3*p.ne2*p.ne1*HSV; - coopmat O_D = coopmat(O); + coopmat O_D = coopmat(O); if (p.gqa_ratio > 1) { coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N); } else { @@ -295,6 +295,6 @@ void main() { // permute dimensions tensorViewNV<3, false, 1, 0, 2> tensorViewPermute = createTensorViewNV(3, false, 1, 0, 2); - coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, HSV), tensorViewPermute); + coopMatStoreTensorNV(O_D, data_o, o_offset, sliceTensorLayoutNV(tensorLayoutD, i * Br, Br, iq2, N, 0, HSV_pad), tensorViewPermute); } } diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 1e1e43f50594d..74886b4549056 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -6239,8 +6239,8 @@ static std::vector> make_test_cases_eval() { test_cases.emplace_back(new test_timestep_embedding()); test_cases.emplace_back(new test_leaky_relu()); - for (int hsk : { 64, 80, 128, 192, 256, 576 }) { - for (int hsv : { 64, 80, 128, 192, 256, 512 }) { + for (int hsk : { 40, 64, 80, 128, 192, 256, 576 }) { + for (int hsv : { 40, 64, 80, 128, 192, 256, 512 }) { if (hsk != 192 && hsk != 576 && hsk != hsv) continue; if (hsk == 192 && (hsv != 128 && hsv != 192)) continue; if (hsk == 576 && hsv != 512) continue; // DeepSeek MLA From b730706a49e576fb882dc34d9966345778b3ab0b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 24 Aug 2025 13:07:07 +0300 Subject: [PATCH 140/140] kv-cache : support layer reuse (#15504) * kv-cache : support layer reuse ggml-ci * cont : update comments [no ci] --- src/llama-hparams.cpp | 25 +++++++++++++ src/llama-hparams.h | 6 +++ src/llama-kv-cache-iswa.cpp | 31 ++++++++++++---- src/llama-kv-cache-iswa.h | 6 ++- src/llama-kv-cache.cpp | 68 +++++++++++++++++----------------- src/llama-kv-cache.h | 28 +++++++------- src/llama-memory-hybrid.cpp | 57 ++++++++++++++-------------- src/llama-memory-hybrid.h | 40 +++++++++----------- src/llama-memory-recurrent.cpp | 14 +++---- src/llama-memory-recurrent.h | 18 ++++----- src/llama-memory.h | 8 ++++ src/llama-model.cpp | 38 +++++++++++++------ 12 files changed, 203 insertions(+), 136 deletions(-) diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 7a06368dcda68..91636572da8b2 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -153,3 +153,28 @@ bool llama_hparams::is_swa(uint32_t il) const { GGML_ABORT("fatal error"); } + +bool llama_hparams::has_kv(uint32_t il) const { + if (n_layer_kv_from_start >= 0) { + if (il < (uint32_t) n_layer_kv_from_start) { + return true; + } + + return false; + } + + // by default, all layers have kv + return true; +} + +uint32_t llama_hparams::n_layer_kv() const { + uint32_t res = 0; + + for (uint32_t il = 0; il < n_layer; ++il) { + if 
(has_kv(il)) {
+            res++;
+        }
+    }
+
+    return res;
+}
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index bd23122443271..60415f0c202a4 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -41,6 +41,7 @@ struct llama_hparams {
    uint32_t n_embd;
    uint32_t n_embd_features = 0;
    uint32_t n_layer;
+    int32_t n_layer_kv_from_start = -1; // if non-negative, the first n_layer_kv_from_start layers have KV cache
    uint32_t n_rot;
    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
@@ -221,6 +222,11 @@ struct llama_hparams {
    uint32_t n_pos_per_embd() const;

    bool is_swa(uint32_t il) const;
+
+    bool has_kv(uint32_t il) const;
+
+    // number of layers for which has_kv() returns true
+    uint32_t n_layer_kv() const;
 };

 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp
index a11ee5a5b185d..d7342914c6b7c 100644
--- a/src/llama-kv-cache-iswa.cpp
+++ b/src/llama-kv-cache-iswa.cpp
@@ -22,9 +22,26 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
        uint32_t kv_size,
        uint32_t n_seq_max,
        uint32_t n_ubatch,
-        uint32_t n_pad) : hparams(model.hparams), unified(unified) {
-    llama_kv_cache::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
-    llama_kv_cache::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
+        uint32_t n_pad,
+        const layer_filter_cb & filter,
+        const layer_reuse_cb & reuse) : hparams(model.hparams), unified(unified) {
+
+    // chain filters
+    const layer_filter_cb filter_base = [&](int32_t il) {
+        if (filter && !filter(il)) {
+            return false;
+        }
+
+        return !model.hparams.is_swa(il);
+    };
+
+    const layer_filter_cb filter_swa = [&](int32_t il) {
+        if (filter && !filter(il)) {
+            return false;
+        }
+
+        return model.hparams.is_swa(il);
+    };

    const uint32_t size_base = kv_size;

@@ -41,16 +58,16 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
    LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);

    kv_base = std::make_unique<llama_kv_cache>(
-            model, std::move(filter_base), type_k, type_v,
+            model, type_k, type_v,
            v_trans, offload, unified, size_base, n_seq_max, n_pad,
-            0, LLAMA_SWA_TYPE_NONE);
+            0, LLAMA_SWA_TYPE_NONE, filter_base, reuse);

    LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);

    kv_swa = std::make_unique<llama_kv_cache>(
-            model, std::move(filter_swa), type_k, type_v,
+            model, type_k, type_v,
            v_trans, offload, unified, size_swa, n_seq_max, n_pad,
-            hparams.n_swa, hparams.swa_type);
+            hparams.n_swa, hparams.swa_type, filter_swa, reuse);
 }

 void llama_kv_cache_iswa::clear(bool data) {
diff --git a/src/llama-kv-cache-iswa.h b/src/llama-kv-cache-iswa.h
index dd673f18e7e08..5ed134b795800 100644
--- a/src/llama-kv-cache-iswa.h
+++ b/src/llama-kv-cache-iswa.h
@@ -20,11 +20,13 @@ class llama_kv_cache_iswa : public llama_memory_i {
            bool v_trans,
            bool offload,
            bool swa_full,
-            bool ,
+            bool unified,
            uint32_t kv_size,
            uint32_t n_seq_max,
            uint32_t n_ubatch,
-            uint32_t n_pad);
+            uint32_t n_pad,
+            const layer_filter_cb & filter,
+            const layer_reuse_cb & reuse);

    ~llama_kv_cache_iswa() = default;
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 70ddd5f4b952c..d7ab56ccd9aac 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -17,32 +17,25 @@
 //

 llama_kv_cache::llama_kv_cache(
-        const llama_model & model,
-        layer_filter_cb && filter,
-        ggml_type type_k,
-        ggml_type type_v,
-        bool v_trans,
-        bool offload,
-        bool unified,
-        uint32_t kv_size,
-        uint32_t n_seq_max,
-        uint32_t n_pad,
-        uint32_t n_swa,
-        llama_swa_type swa_type) :
+        const llama_model & model,
+        ggml_type type_k,
+        ggml_type type_v,
+        bool v_trans,
+        bool offload,
+        bool unified,
+        uint32_t kv_size,
+        uint32_t n_seq_max,
+        uint32_t n_pad,
+        uint32_t n_swa,
+        llama_swa_type swa_type,
+        const layer_filter_cb & filter,
+        const layer_reuse_cb & reuse) :
    model(model), hparams(model.hparams), v_trans(v_trans),
    n_seq_max(n_seq_max), n_stream(unified ? 1 : n_seq_max), n_pad(n_pad),
    n_swa(n_swa), swa_type(swa_type) {

    GGML_ASSERT(kv_size % n_pad == 0);

-    // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
-    auto n_layer_cache = hparams.n_layer;
-    if (model.arch == LLM_ARCH_GEMMA3N) {
-        n_layer_cache = 20;
-    }
-    if (model.arch == LLM_ARCH_GLM4_MOE) {
-        // GLM-4.5: Only process up to last layer, skip final NextN layer
-        n_layer_cache = hparams.n_layer - hparams.nextn_predict_layers;
-    }
+    const uint32_t n_layer_kv = hparams.n_layer_kv();

    // create a context for each buffer type
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
@@ -50,7 +43,7 @@ llama_kv_cache::llama_kv_cache(
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            ggml_init_params params = {
-                /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_cache*ggml_tensor_overhead()),
+                /*.mem_size =*/ size_t(2u*(1 + n_stream)*n_layer_kv*ggml_tensor_overhead()),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc =*/ true,
            };
@@ -97,9 +90,14 @@ llama_kv_cache::llama_kv_cache(
                __func__, hparams.n_embd_v_gqa_max());
    }

-    for (uint32_t il = 0; il < n_layer_cache; il++) {
+    for (uint32_t il = 0; il < hparams.n_layer; il++) {
+        if (!hparams.has_kv(il)) {
+            LLAMA_LOG_DEBUG("%s: layer %3d: does not have KV cache\n", __func__, il);
+            continue;
+        }
+
        if (filter && !filter(il)) {
-            LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+            LLAMA_LOG_DEBUG("%s: layer %3d: filtered\n", __func__, il);
            continue;
        }

@@ -147,23 +145,27 @@ llama_kv_cache::llama_kv_cache(
        layers.push_back({ il, k, v, k_stream, v_stream, });
    }

-    // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
-    if (model.arch == LLM_ARCH_GEMMA3N) {
-        LLAMA_LOG_DEBUG("%s: GEMMA3N: reuse layers [%d, %d]\n", __func__, n_layer_cache, hparams.n_layer - 1);
+    if (reuse) {
+        LLAMA_LOG_DEBUG("%s: reusing layers:\n", __func__);

-        for (uint32_t il = n_layer_cache; il < hparams.n_layer; il++) {
-            if (filter && !filter(il)) {
-                LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+        for (uint32_t il = 0; il < hparams.n_layer; il++) {
+            const int32_t il_reuse = reuse(il);
+
+            if (il_reuse < 0) {
+                LLAMA_LOG_DEBUG("%s: - layer %3d: no reuse\n", __func__, il);
                continue;
            }

-            const bool is_swa = hparams.is_swa(il);
-            const uint32_t il_reuse = n_layer_cache - (is_swa ? 2 : 1);
+            if (filter && !filter(il)) {
+                LLAMA_LOG_DEBUG("%s: - layer %3d: filtered\n", __func__, il);
+                continue;
+            }

            GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end());
+
            map_layer_ids[il] = map_layer_ids[il_reuse];

-            LLAMA_LOG_DEBUG("%s: layer %3d: reuse layer %d, isw = %d\n", __func__, il, il_reuse, is_swa);
+            LLAMA_LOG_DEBUG("%s: - layer %3d: reuse layer %d, is_swa = %d\n", __func__, il, il_reuse, hparams.is_swa(il));
        }
    }
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 297a0973dd467..76a5cb1e28e7e 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -21,9 +21,6 @@ class llama_kv_cache : public llama_memory_i {
 public:
    static uint32_t get_padding(const llama_cparams & cparams);

-    // this callback is used to filter out layers that should not be included in the cache
-    using layer_filter_cb = std::function<bool(int32_t il)>;
-
    struct stream_copy_info {
        bool empty() const {
            assert(ssrc.size() == sdst.size());
@@ -82,18 +79,19 @@ class llama_kv_cache : public llama_memory_i {
    using slot_info_vec_t = std::vector<slot_info>;

    llama_kv_cache(
-            const llama_model & model,
-            layer_filter_cb && filter,
-            ggml_type type_k,
-            ggml_type type_v,
-            bool v_trans,
-            bool offload,
-            bool unified,
-            uint32_t kv_size,
-            uint32_t n_seq_max,
-            uint32_t n_pad,
-            uint32_t n_swa,
-            llama_swa_type swa_type);
+            const llama_model & model,
+            ggml_type type_k,
+            ggml_type type_v,
+            bool v_trans,
+            bool offload,
+            bool unified,
+            uint32_t kv_size,
+            uint32_t n_seq_max,
+            uint32_t n_pad,
+            uint32_t n_swa,
+            llama_swa_type swa_type,
+            const layer_filter_cb & filter,
+            const layer_reuse_cb & reuse);

    ~llama_kv_cache() = default;
diff --git a/src/llama-memory-hybrid.cpp b/src/llama-memory-hybrid.cpp
index f8303dacbf8ad..ba61ebaa885fe 100644
--- a/src/llama-memory-hybrid.cpp
+++ b/src/llama-memory-hybrid.cpp
@@ -9,32 +9,29 @@
 //

 llama_memory_hybrid::llama_memory_hybrid(
-        const llama_model & model,
-        /* attn */
-        ggml_type type_k,
-        ggml_type type_v,
-        bool v_trans,
-        uint32_t kv_size,
-        uint32_t n_pad,
-        uint32_t n_swa,
-        llama_swa_type swa_type,
-        /* recurrent */
-        ggml_type type_r,
-        ggml_type type_s,
-        uint32_t rs_size,
-        /* common */
-        uint32_t n_seq_max,
-        bool offload,
-        bool unified,
-        /* layer filters */
-        layer_filter_cb && filter_attn,
-        layer_filter_cb && filter_recr) :
+        const llama_model & model,
+        /* attn */
+        ggml_type type_k,
+        ggml_type type_v,
+        bool v_trans,
+        uint32_t kv_size,
+        uint32_t n_pad,
+        uint32_t n_swa,
+        llama_swa_type swa_type,
+        /* recurrent */
+        ggml_type type_r,
+        ggml_type type_s,
+        uint32_t rs_size,
+        /* common */
+        uint32_t n_seq_max,
+        bool offload,
+        bool unified,
+        /* layer filters */
+        const layer_filter_cb & filter_attn,
+        const layer_filter_cb & filter_recr) :
    hparams(model.hparams),
    mem_attn(new llama_kv_cache(
        model,
-        filter_attn == nullptr ?
-            [&](int32_t il) { return !hparams.is_recurrent(il); }
-            : filter_attn,
        type_k,
        type_v,
        v_trans,
@@ -44,18 +41,22 @@ llama_memory_hybrid::llama_memory_hybrid(
        n_seq_max,
        n_pad,
        n_swa,
-        swa_type
+        swa_type,
+        filter_attn == nullptr ?
+            [&](int32_t il) { return !hparams.is_recurrent(il); }
+            : filter_attn,
+        nullptr
    )),
    mem_recr(new llama_memory_recurrent(
        model,
-        filter_recr == nullptr ?
-            [&](int32_t il) { return hparams.is_recurrent(il); }
-            : filter_recr,
        type_r,
        type_s,
        offload,
        rs_size,
-        n_seq_max
+        n_seq_max,
+        filter_recr == nullptr ?
+            [&](int32_t il) { return hparams.is_recurrent(il); }
+            : filter_recr
    )) {}

 llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
diff --git a/src/llama-memory-hybrid.h b/src/llama-memory-hybrid.h
index e9c64ee40aae4..11a3565178297 100644
--- a/src/llama-memory-hybrid.h
+++ b/src/llama-memory-hybrid.h
@@ -18,31 +18,27 @@

 class llama_memory_hybrid : public llama_memory_i {
 public:
-
-    // this callback is used to filter out layers that should not be included in the cache
-    using layer_filter_cb = std::function<bool(int32_t il)>;
-
    llama_memory_hybrid(
        const llama_model & model,
        /* attn */
-            ggml_type type_k,
-            ggml_type type_v,
-            bool v_trans,
-            uint32_t kv_size,
-            uint32_t n_pad,
-            uint32_t n_swa,
-            llama_swa_type swa_type,
-            /* recurrent */
-            ggml_type type_r,
-            ggml_type type_s,
-            uint32_t rs_size,
-            /* common */
-            uint32_t n_seq_max,
-            bool offload,
-            bool unified,
-            /* layer filters */
-            layer_filter_cb && filter_attn = nullptr,
-            layer_filter_cb && filter_recr = nullptr);
+            ggml_type type_k,
+            ggml_type type_v,
+            bool v_trans,
+            uint32_t kv_size,
+            uint32_t n_pad,
+            uint32_t n_swa,
+            llama_swa_type swa_type,
+            /* recurrent */
+            ggml_type type_r,
+            ggml_type type_s,
+            uint32_t rs_size,
+            /* common */
+            uint32_t n_seq_max,
+            bool offload,
+            bool unified,
+            /* layer filters */
+            const layer_filter_cb & filter_attn = nullptr,
+            const layer_filter_cb & filter_recr = nullptr);

    ~llama_memory_hybrid() = default;
diff --git a/src/llama-memory-recurrent.cpp b/src/llama-memory-recurrent.cpp
index 849675c418891..08716ed91aed1 100644
--- a/src/llama-memory-recurrent.cpp
+++ b/src/llama-memory-recurrent.cpp
@@ -16,13 +16,13 @@
 //

 llama_memory_recurrent::llama_memory_recurrent(
-        const llama_model & model,
-        layer_filter_cb && filter,
-        ggml_type type_r,
-        ggml_type type_s,
-        bool offload,
-        uint32_t mem_size,
-        uint32_t n_seq_max) : hparams(model.hparams), n_seq_max(n_seq_max) {
+        const llama_model & model,
+        ggml_type type_r,
+        ggml_type type_s,
+        bool offload,
+        uint32_t mem_size,
+        uint32_t n_seq_max,
+        const layer_filter_cb & filter) : hparams(model.hparams), n_seq_max(n_seq_max) {
    const int32_t n_layer = hparams.n_layer;

    head = 0;
diff --git a/src/llama-memory-recurrent.h b/src/llama-memory-recurrent.h
index c8e8623602f78..c4daf00495bc2 100644
--- a/src/llama-memory-recurrent.h
+++ b/src/llama-memory-recurrent.h
@@ -15,18 +15,14 @@
 // see the implementation of llama_kv_cache_context_i for an example how to do it
 class llama_memory_recurrent : public llama_memory_i {
 public:
-
-    // this callback is used to filter out layers that should not be included in the cache
-    using layer_filter_cb = std::function<bool(int32_t il)>;
-
    llama_memory_recurrent(
-            const llama_model & model,
-            layer_filter_cb && filter,
-            ggml_type type_r,
-            ggml_type type_s,
-            bool offload,
-            uint32_t mem_size,
-            uint32_t n_seq_max);
+            const llama_model & model,
+            ggml_type type_r,
+            ggml_type type_s,
+            bool offload,
+            uint32_t mem_size,
+            uint32_t n_seq_max,
+            const layer_filter_cb & filter);

    ~llama_memory_recurrent() = default;
diff --git a/src/llama-memory.h b/src/llama-memory.h
index 94d858bccc2e0..ccd1f073b0848 100644
--- a/src/llama-memory.h
+++ b/src/llama-memory.h
@@ -3,6 +3,7 @@
 #include "llama.h"

 #include <memory>
+#include <functional>

 struct llama_ubatch;

@@ -64,6 +65,13 @@ using llama_memory_context_ptr = std::unique_ptr<llama_memory_context_i>;
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
 struct llama_memory_i {
+    // this callback is used to filter out layers that should not be included in the cache
+    using layer_filter_cb = std::function<bool(int32_t il)>;
+
+    // this callback is used to specify which layers should reuse memory from other layers
+    // return negative value to indicate that the layer il should not reuse memory
+    using layer_reuse_cb = std::function<int32_t(int32_t il)>;
+
    virtual ~llama_memory_i() = default;

    // split the input batch into a set of ubatches and verify that they can fit into the cache
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index d5148f7df36ed..7d3429617bef9 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1115,6 +1115,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {

                hparams.swa_type = LLAMA_SWA_TYPE_STANDARD;
                hparams.set_swa_pattern(5);
+                hparams.n_layer_kv_from_start = 20;

                hparams.rope_freq_base_train_swa = 10000.0f;
                hparams.rope_freq_scale_train_swa = 1.0f;
                hparams.f_attention_scale = 1.0f;
@@ -1474,12 +1475,15 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                // Expert gating function (GLM-4.5 uses sigmoid)
                ml.get_key(LLM_KV_EXPERT_GATING_FUNC, hparams.expert_gating_func, false);
                if (hparams.expert_gating_func == LLAMA_EXPERT_GATING_FUNC_TYPE_NONE) {
-                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
+                    hparams.expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID;
                }

                // NextN/MTP parameters
                ml.get_key(LLM_KV_NEXTN_PREDICT_LAYERS, hparams.nextn_predict_layers, false);

+                // TODO: when MTP is implemented, this should probably be updated if needed
+                hparams.n_layer_kv_from_start = hparams.n_layer - hparams.nextn_predict_layers;
+
                switch (hparams.n_layer) {
                    case 47: type = LLM_TYPE_106B_A12B; break; // GLM-4.5-Air (46 layers + 1 NextN layer)
                    case 93: type = LLM_TYPE_355B_A32B; break; // GLM-4.5 (92 layers + 1 NextN layer)
@@ -10524,7 +10528,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
    const int64_t n_embd_altup;
    const int64_t n_altup;
    const int i_altup_act;
-    const int n_layer_kv = 20; // number of layers having KV [KV_REUSE]
    const int n_layer_sparsity = 10; // number of layers using activation sparsity

    const float f_sparsity_std_mul = 1.6448533535003662f; // std_multiplier = normal_dist.icdf(0.95)
@@ -10574,8 +10577,6 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
        for (int il = 0; il < n_layer; ++il) {
            // this block is made to be closely resemble Gemma3p5DecoderLayer on python code
-            const bool has_kv = (il < n_layer_kv);
-
            const float freq_base_l = model.get_rope_freq_base (cparams, il);
            const float freq_scale_l = model.get_rope_freq_scale(cparams, il);

@@ -10595,7 +10596,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
            ggml_tensor * laurel_out = laurel(cur, il); // [n_embd, n_tokens]

            // self-attention
-            if (has_kv) {
+            if (hparams.has_kv(il)) {
                // compute Q and K and RoPE them
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
@@ -10635,7 +10636,7 @@ struct llm_build_gemma3n_iswa : public llm_graph_context {
                        model.layers[il].wo, NULL,
                        Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, hparams.f_attention_scale, il);
            } else {
-                // no KV layers
+                // reuse KV cache of earlier layers
                ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
                cb(Qcur, "Qcur", il);
                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
@@ -18256,12 +18257,12 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
    if (llm_arch_is_recurrent(arch)) {
        res = new llama_memory_recurrent(
                *this,
-                nullptr,
                GGML_TYPE_F32,
                GGML_TYPE_F32,
                cparams.offload_kqv,
                std::max((uint32_t) 1, 
cparams.n_seq_max), - cparams.n_seq_max); + cparams.n_seq_max, + nullptr); } else if (llm_arch_is_hybrid(arch)) { const auto padding = llama_kv_cache::get_padding(cparams); @@ -18302,6 +18303,18 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx); + llama_memory_i::layer_reuse_cb reuse = nullptr; + + if (arch == LLM_ARCH_GEMMA3N) { + reuse = [&](int32_t il) { + if (il >= (int32_t) hparams.n_layer_kv_from_start) { + return (int32_t) hparams.n_layer_kv_from_start - (hparams.is_swa(il) ? 2 : 1); + } + + return -1; + }; + } + if (hparams.swa_type != LLAMA_SWA_TYPE_NONE) { GGML_ASSERT(hparams.is_swa_any()); @@ -18316,13 +18329,14 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, n_ctx_per_stream, cparams.n_seq_max, cparams.n_ubatch, - padding); + padding, + nullptr, + reuse); } else { GGML_ASSERT(!hparams.is_swa_any()); res = new llama_kv_cache( *this, - nullptr, params.type_k, params.type_v, !cparams.flash_attn, @@ -18332,7 +18346,9 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, cparams.n_seq_max, padding, hparams.n_swa, - hparams.swa_type); + hparams.swa_type, + nullptr, + nullptr); } } }
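
A note on the layer-reuse callback introduced in this last patch: layer_reuse_cb, added in llama-memory.h, maps a layer index il to the index of an earlier layer whose KV cache it should share, with a negative return value meaning the layer keeps its own cache. The following self-contained C++ sketch replays the Gemma3n-style wiring from llama-model.cpp above as a reading aid; the 24-layer depth and the toy is_swa() pattern (mirroring set_swa_pattern(5)) are illustrative assumptions, not values taken verbatim from the patch.

#include <cstdint>
#include <cstdio>
#include <functional>

// Same shape as the callback in llama-memory.h: map layer il to the layer
// whose KV cache it reuses, or return a negative value for "no reuse".
using layer_reuse_cb = std::function<int32_t(int32_t il)>;

int main() {
    const int32_t n_layer               = 24; // toy depth (assumption)
    const int32_t n_layer_kv_from_start = 20; // as set for Gemma3n in the patch

    // Toy stand-in for hparams.is_swa(il), mirroring set_swa_pattern(5):
    // four sliding-window layers followed by one full-attention layer.
    auto is_swa = [](int32_t il) { return il % 5 < 4; };

    // Gemma3n-style mapping from the patch: layers past the KV range borrow
    // the cache of layer 18 (last SWA layer) or layer 19 (last full-attention
    // layer), matching their own attention type.
    const layer_reuse_cb reuse = [&](int32_t il) -> int32_t {
        if (il >= n_layer_kv_from_start) {
            return n_layer_kv_from_start - (is_swa(il) ? 2 : 1);
        }
        return -1; // negative: this layer keeps its own KV cache
    };

    for (int32_t il = 0; il < n_layer; ++il) {
        const int32_t il_reuse = reuse(il);
        if (il_reuse < 0) {
            std::printf("layer %2d: own KV cache\n", (int) il);
        } else {
            std::printf("layer %2d: reuses KV cache of layer %d (is_swa = %d)\n",
                        (int) il, (int) il_reuse, is_swa(il) ? 1 : 0);
        }
    }
    return 0;
}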