From bcfa458230f9da0cddda141cdb7ad592cfd94a4d Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 27 Oct 2025 20:49:52 -0400 Subject: [PATCH 1/5] update gguf and support for CompressedLinear Signed-off-by: n1ck-guo --- .../export_to_gguf/convert_hf_to_gguf.py | 1683 ++++++++++++++--- auto_round/export/export_to_gguf/export.py | 3 + auto_round/utils.py | 6 +- 3 files changed, 1408 insertions(+), 284 deletions(-) diff --git a/auto_round/export/export_to_gguf/convert_hf_to_gguf.py b/auto_round/export/export_to_gguf/convert_hf_to_gguf.py index a35d02447..e81f9593a 100644 --- a/auto_round/export/export_to_gguf/convert_hf_to_gguf.py +++ b/auto_round/export/export_to_gguf/convert_hf_to_gguf.py @@ -41,17 +41,39 @@ if "NO_LOCAL_GGUF" not in os.environ: sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) +import gguf +from gguf.vocab import MistralTokenizerType, MistralVocab -from auto_round.utils import LazyImport +try: + from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports] + from mistral_common.tokens.tokenizers.multimodal import ( + DATASET_MEAN as _MISTRAL_COMMON_DATASET_MEAN, # pyright: ignore[reportMissingImports] + ) + from mistral_common.tokens.tokenizers.multimodal import DATASET_STD as _MISTRAL_COMMON_DATASET_STD + from mistral_common.tokens.tokenizers.sentencepiece import ( # pyright: ignore[reportMissingImports] + SentencePieceTokenizer, + ) + from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pyright: ignore[reportMissingImports] + + _mistral_common_installed = True + _mistral_import_error_msg = "" +except ImportError: + _MISTRAL_COMMON_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) + _MISTRAL_COMMON_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) + + _mistral_common_installed = False + TokenizerVersion = None + Tekkenizer = None + SentencePieceTokenizer = None + _mistral_import_error_msg = ( + "Mistral format requires `mistral-common` to be installed. Please run " + "`pip install mistral-common[image,audio]` to install it." 
+ ) -gguf = LazyImport("gguf") -MistralTokenizerType = LazyImport("gguf.vocab.MistralTokenizerType") -MistralVocab = LazyImport("gguf.vocab.MistralVocab") -DATASET_MEAN = LazyImport("mistral_common.tokens.tokenizers.multimodal.DATASET_MEAN") -DATASET_STD = LazyImport("mistral_common.tokens.tokenizers.multimodal.DATASET_STD") logger = logging.getLogger("hf-to-gguf") + ###### MODEL DEFINITIONS ###### @@ -85,10 +107,9 @@ class ModelBase: endianess: gguf.GGUFEndian use_temp_file: bool lazy: bool - part_names: list[str] - is_safetensors: bool + dry_run: bool hparams: dict[str, Any] - tensor_names: set[str] | None + model_tensors: dict[str, Callable[[], Tensor]] gguf_writer: gguf.GGUFWriter model_name: str | None metadata_override: Path | None @@ -102,7 +123,10 @@ class ModelBase: block_count: int tensor_map: gguf.TensorNameMap + # Mistral format specifics is_mistral_format: bool = False + disable_mistral_community_chat_template: bool = False + sentence_transformers_dense_modules: bool = False def __init__( self, @@ -121,10 +145,15 @@ def __init__( small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None, + disable_mistral_community_chat_template: bool = False, + sentence_transformers_dense_modules: bool = False, ): if type(self) is ModelBase or type(self) is TextModel or type(self) is MmprojModel: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") + if self.is_mistral_format and not _mistral_common_installed: + raise ImportError(_mistral_import_error_msg) + self.dir_model = dir_model self.ftype = ftype self.fname_out = fname_out @@ -132,26 +161,11 @@ def __init__( self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE self.use_temp_file = use_temp_file self.lazy = not eager or (remote_hf_model_id is not None) + self.dry_run = dry_run self.remote_hf_model_id = remote_hf_model_id - if remote_hf_model_id is not None: - self.is_safetensors = True - - def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: - logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") - remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) - self.tensor_names = set(name for name in remote_tensors.keys()) - for name, remote_tensor in remote_tensors.items(): - yield (name, LazyTorchTensor.from_remote_tensor(remote_tensor)) - - self.get_tensors = get_remote_tensors - else: - prefix = "model" if not self.is_mistral_format else "consolidated" - self.part_names = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors") - self.is_safetensors = len(self.part_names) > 0 - if not self.is_safetensors: - self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") + self.sentence_transformers_dense_modules = sentence_transformers_dense_modules self.hparams = ModelBase.load_hparams(self.dir_model, self.is_mistral_format) if hparams is None else hparams - self.tensor_names = None + self.model_tensors = self.index_tensors(remote_hf_model_id=remote_hf_model_id) self.metadata_override = metadata_override self.model_name = model_name self.dir_model_card = dir_model # overridden in convert_lora_to_gguf.py @@ -167,6 +181,8 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})") self.ftype = gguf.LlamaFileType.MOSTLY_BF16 + self.dequant_model() + # Configure GGUF Writer self.gguf_writer = gguf.GGUFWriter( path=None, @@ -179,6 +195,9 @@ def 
get_remote_tensors() -> Iterator[tuple[str, Tensor]]: small_first_shard=small_first_shard, ) + # Mistral specific + self.disable_mistral_community_chat_template = disable_mistral_community_chat_template + @classmethod def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: stem, suffix = path.stem, path.suffix @@ -193,34 +212,49 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: return None raise KeyError(f"could not find any of: {keys}") - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: # pylint: disable=E0202 - tensor_names_from_parts: set[str] = set() + def index_tensors(self, remote_hf_model_id: str | None = None) -> dict[str, Callable[[], Tensor]]: + tensors: dict[str, Callable[[], Tensor]] = {} + + if remote_hf_model_id is not None: + is_safetensors = True + + logger.info(f"Using remote model with HuggingFace id: {remote_hf_model_id}") + remote_tensors = gguf.utility.SafetensorRemote.get_list_tensors_hf_model(remote_hf_model_id) + for name, remote_tensor in remote_tensors.items(): + tensors[name] = lambda r=remote_tensor: LazyTorchTensor.from_remote_tensor(r) + + return tensors + + prefix = "model" if not self.is_mistral_format else "consolidated" + part_names: list[str] = ModelBase.get_model_part_names(self.dir_model, prefix, ".safetensors") + is_safetensors: bool = len(part_names) > 0 + if not is_safetensors: + part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") + + tensor_names_from_index: set[str] = set() if not self.is_mistral_format: - index_name = "model.safetensors" if self.is_safetensors else "pytorch_model.bin" + index_name = "model.safetensors" if is_safetensors else "pytorch_model.bin" index_name += ".index.json" index_file = self.dir_model / index_name if index_file.is_file(): - self.tensor_names = set() logger.info(f"gguf: loading model weight map from '{index_name}'") with open(index_file, "r", encoding="utf-8") as f: index: dict[str, Any] = json.load(f) weight_map = index.get("weight_map") if weight_map is None or not isinstance(weight_map, dict): raise ValueError(f"Can't load 'weight_map' from {index_name!r}") - self.tensor_names.update(weight_map.keys()) + tensor_names_from_index.update(weight_map.keys()) else: - self.tensor_names = tensor_names_from_parts weight_map = {} else: - self.tensor_names = tensor_names_from_parts weight_map = {} - for part_name in self.part_names: - logger.info(f"gguf: loading model part '{part_name}'") + for part_name in part_names: + logger.info(f"gguf: indexing model part '{part_name}'") ctx: ContextManager[Any] - if self.is_safetensors: + if is_safetensors: from safetensors import safe_open ctx = cast(ContextManager[Any], safe_open(self.dir_model / part_name, framework="pt", device="cpu")) @@ -230,34 +264,164 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: # pylint: disable=E0202 ) with ctx as model_part: - tensor_names_from_parts.update(model_part.keys()) + assert model_part is not None for name in model_part.keys(): - if self.is_safetensors: + if is_safetensors: if self.lazy: data = model_part.get_slice(name) - data = LazyTorchTensor.from_safetensors_slice(data) + data_gen = lambda data=data: LazyTorchTensor.from_safetensors_slice(data) # noqa: E731 else: data = model_part.get_tensor(name) + data_gen = lambda data=data: data # noqa: E731 else: data = model_part[name] if self.lazy: - data = LazyTorchTensor.from_eager(data) - yield name, data + data_gen = lambda data=data: LazyTorchTensor.from_eager(data) # noqa: E731 + else: + data_gen = lambda 
data=data: data # noqa: E731 + tensors[name] = data_gen # verify tensor name presence and identify potentially missing files - if len(tensor_names_from_parts.symmetric_difference(self.tensor_names)) > 0: - missing = sorted(self.tensor_names.difference(tensor_names_from_parts)) - extra = sorted(tensor_names_from_parts.difference(self.tensor_names)) - missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) - if len(extra) == 0 and len(missing_files) > 0: - raise ValueError(f"Missing or incomplete model files: {missing_files}\n" f"Missing tensors: {missing}") + if len(tensor_names_from_index) > 0: + tensor_names_from_parts = set(tensors.keys()) + if len(tensor_names_from_parts.symmetric_difference(tensor_names_from_index)) > 0: + missing = sorted(tensor_names_from_index.difference(tensor_names_from_parts)) + extra = sorted(tensor_names_from_parts.difference(tensor_names_from_index)) + missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map)) + if len(extra) == 0 and len(missing_files) > 0: + raise ValueError( + f"Missing or incomplete model files: {missing_files}\n" f"Missing tensors: {missing}" + ) + else: + raise ValueError( + "Mismatch between weight map and model parts for tensor names:\n" + f"Missing tensors: {missing}\n" + f"Extra tensors: {extra}" + ) + + return tensors + + def dequant_model(self): + tensors_to_remove: list[str] = [] + new_tensors: dict[str, Callable[[], Tensor]] = {} + + if (quant_config := self.hparams.get("quantization_config")) and isinstance(quant_config, dict): + quant_method = quant_config.get("quant_method") + + def dequant_bitnet(weight: Tensor, scale: Tensor) -> Tensor: + weight = weight.view(torch.uint8) + orig_shape = weight.shape + + shift = torch.tensor([0, 2, 4, 6], dtype=torch.uint8).reshape((4, *(1 for _ in range(len(orig_shape))))) + data = weight.unsqueeze(0).expand((4, *orig_shape)) >> shift + data = data & 3 + data = (data.float() - 1).reshape((orig_shape[0] * 4, *orig_shape[1:])) + + # The scale is inverted + return data / scale.float() + + def dequant_simple(weight: Tensor, scale: Tensor) -> Tensor: + scale = scale.float() + + if weight_block_size := quant_config.get("weight_block_size"): + # TODO: make sure it's a list of integers + for i, size in enumerate(weight_block_size): + scale = scale.repeat_interleave(size, i) + # unpad the scale (e.g. 
when the tensor size isn't a multiple of the block size) + scale = scale[tuple(slice(0, size) for size in weight.shape)] + + return weight.float() * scale + + def dequant_gptq(g_idx: Tensor, qweight: Tensor, qzeros: Tensor, scales: Tensor) -> Tensor: + bits = quant_config["bits"] + assert bits in (2, 3, 4, 8) + assert qweight.dtype == qzeros.dtype + maxq = (2**bits) - 1 + weight = None + zeros = None + pack_dtype_bits = qweight.dtype.itemsize * 8 + + if bits in [2, 4, 8]: + pack_factor = pack_dtype_bits // bits + wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0) + if self.lazy: + wf = LazyTorchTensor.from_eager(wf) + + zeros = torch.bitwise_right_shift( + qzeros.unsqueeze(2).expand(-1, -1, pack_factor), wf.unsqueeze(0) + ).to(torch.int16 if bits == 8 else torch.int8) + zeros = torch.bitwise_and(zeros, maxq).reshape(scales.shape) + + weight = torch.bitwise_and( + torch.bitwise_right_shift( + qweight.unsqueeze(1).expand(-1, pack_factor, -1), wf.unsqueeze(-1) + ).to(torch.int16 if bits == 8 else torch.int8), + maxq, + ) + elif bits == 3: + raise NotImplementedError("3-bit gptq dequantization is not yet implemented") + + assert weight is not None + assert zeros is not None + + weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2]) + + # gptq_v2 doesn't need to offset zeros + if quant_config.get("checkpoint_format", "gptq") == "gptq": + zeros += 1 + + return (scales[g_idx].float() * (weight - zeros[g_idx]).float()).T + + if quant_method == "bitnet": + for name in self.model_tensors.keys(): + if name.endswith(".weight_scale"): + weight_name = name.removesuffix("_scale") + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s: dequant_bitnet(w(), s()) + tensors_to_remove.append(name) + elif quant_method == "fp8": + for name in self.model_tensors.keys(): + if name.endswith(".weight_scale_inv"): + weight_name = name.removesuffix("_scale_inv") + w = self.model_tensors[weight_name] + s = self.model_tensors[name] + self.model_tensors[weight_name] = lambda w=w, s=s: dequant_simple(w(), s()) + tensors_to_remove.append(name) + elif quant_method == "gptq": + for name in self.model_tensors.keys(): + if name.endswith(".qweight"): + base_name = name.removesuffix(".qweight") + g_idx = self.model_tensors[base_name + ".g_idx"] + qweight = self.model_tensors[base_name + ".qweight"] + qzeros = self.model_tensors[base_name + ".qzeros"] + scales = self.model_tensors[base_name + ".scales"] + new_tensors[base_name + ".weight"] = ( + lambda g=g_idx, z=qzeros, w=qweight, s=scales: dequant_gptq(g(), w(), z(), s()) + ) + tensors_to_remove += [ + base_name + n + for n in ( + ".g_idx", + ".qzeros", + ".qweight", + ".scales", + ) + ] else: - raise ValueError( - "Mismatch between weight map and model parts for tensor names:\n" - f"Missing tensors: {missing}\n" - f"Extra tensors: {extra}" - ) + raise NotImplementedError(f"Quant method is not yet supported: {quant_method!r}") + + for name in tensors_to_remove: + if name in self.model_tensors: + del self.model_tensors[name] + + for name, value in new_tensors.items(): + self.model_tensors[name] = value + + def get_tensors(self) -> Iterator[tuple[str, Tensor]]: + for name, gen in self.model_tensors.items(): + yield name, gen() def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str: if key not in gguf.MODEL_TENSORS[self.model_arch]: @@ -334,10 +498,6 @@ def prepare_tensors(self): # data = 
data_torch.squeeze().numpy() data = data_torch.numpy() - # if data ends up empty, it means data_torch was a scalar tensor -> restore - if len(data.shape) == 0: - data = data_torch.numpy() - n_dims = len(data.shape) data_qtype: gguf.GGMLQuantizationType | bool = self.tensor_force_quant(name, new_name, bid, n_dims) @@ -649,6 +809,12 @@ def set_gguf_parameters(self): if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: self.gguf_writer.add_expert_used_count(n_experts_used) logger.info(f"gguf: experts used count = {n_experts_used}") + if (n_expert_groups := self.hparams.get("n_group")) is not None: + self.gguf_writer.add_expert_group_count(n_expert_groups) + logger.info(f"gguf: expert groups count = {n_expert_groups}") + if (n_group_used := self.hparams.get("topk_group")) is not None: + self.gguf_writer.add_expert_group_used_count(n_group_used) + logger.info(f"gguf: expert groups used count = {n_group_used}") if (head_dim := self.hparams.get("head_dim")) is not None: self.gguf_writer.add_key_length(head_dim) @@ -723,8 +889,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) if previous_token != token: logger.info( - f"{repr(previous_token)} is encoded and decoded back to " - f"{repr(token)} using AutoTokenizer" + f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer" ) if added_tokens_decoder[i].special or self.does_token_look_special(token): @@ -750,7 +915,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can # use in llama.cpp to implement the same pre-tokenizer - chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL" # pylint: disable=C0301 + chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? 
We'Ve a'lL" chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() @@ -802,6 +967,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "d4540891389ea895b53b399da6ac824becc30f2fba0e9ddbb98f92e55ca0e97c": # ref: https://huggingface.co/Qwen/Qwen3-Embedding-0.6B res = "qwen2" + if chkhsh == "66b8d4e19ab16c3bfd89bce5d785fb7e0155e8648708a1f42077cb9fe002c273": + # ref: https://huggingface.co/alvarobartt/grok-2-tokenizer + res = "grok-2" if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5": # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B res = "llama-bpe" @@ -952,6 +1120,12 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756": # ref: https://huggingface.co/JetBrains/Mellum-4b-base res = "mellum" + if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206": + # ref: https://huggingface.co/inclusionAI/Ling-mini-base-2.0 + res = "bailingmoe2" + if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e": + # ref: https://huggingface.co/ibm-granite/granite-docling-258M + res = "granite-docling" if res is None: logger.warning("\n") @@ -1292,6 +1466,58 @@ def _try_set_pooling_type(self) -> None: raise NotImplementedError("Only MEAN, CLS, and LAST pooling types supported") self.gguf_writer.add_pooling_type(pooling_type) + def _set_vocab_interns1(self): + tokens: list[str] = [] + toktypes: list[int] = [] + + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + vocab = getattr(tokenizer, "vocab", tokenizer.get_vocab()) + vocab_size = self.hparams.get("vocab_size", len(vocab)) + assert max(vocab.values()) < vocab_size + + tokpre = self.get_vocab_base_pre(tokenizer) + + reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()} + added_vocab = tokenizer.get_added_vocab() + + added_tokens_decoder = tokenizer.added_tokens_decoder + + for i in range(vocab_size): + if i not in reverse_vocab: + tokens.append(f"[PAD{i}]") + toktypes.append(gguf.TokenType.UNUSED) + else: + token: str = reverse_vocab[i] + if token in added_vocab: + # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized. 
+ # To avoid unexpected issues - we make sure to normalize non-normalized tokens + if not added_tokens_decoder[i].normalized: + previous_token = token + token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) + if previous_token != token: + logger.info( + f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer" + ) + + if added_tokens_decoder[i].special or self.does_token_look_special(token): + toktypes.append(gguf.TokenType.CONTROL) + else: + toktypes.append(gguf.TokenType.USER_DEFINED) + else: + toktypes.append(gguf.TokenType.NORMAL) + tokens.append(token) + + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab._set_special_token("bos", 151643) + special_vocab.add_to_gguf(self.gguf_writer) + class MmprojModel(ModelBase): model_type = ModelType.MMPROJ @@ -1347,6 +1573,7 @@ def __init__(self, *args, **kwargs): self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count) # load preprocessor config + self.preprocessor_config = {} if not self.is_mistral_format: with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f: self.preprocessor_config = json.load(f) @@ -1361,8 +1588,26 @@ def get_audio_config(self) -> dict[str, Any] | None: def set_type(self): self.gguf_writer.add_type(gguf.GGUFType.MMPROJ) - def set_gguf_parameters(self): + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) + + output_type: str = self.ftype.name.partition("_")[2] + + if self.fname_out.is_dir(): + fname_default: str = gguf.naming_convention( + self.metadata.name, + self.metadata.basename, + self.metadata.finetune, + self.metadata.version, + size_label=None, + output_type=output_type, + model_type=None, + ) + self.fname_out = self.fname_out / f"mmproj-{fname_default}.gguf" + else: + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) + def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) if self.has_vision_encoder: @@ -1370,7 +1615,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_projection_dim(self.n_embd_text) # vision config - self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"])) + self.image_size = self.find_vparam(["image_size"]) + self.gguf_writer.add_vision_image_size(self.image_size) self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"])) self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"])) self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"])) @@ -1378,8 +1624,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_head_count(self.find_vparam(["num_attention_heads"])) # preprocessor config - image_mean = DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"] - image_std = DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"] + image_mean = ( + _MISTRAL_COMMON_DATASET_MEAN if self.is_mistral_format else self.preprocessor_config["image_mean"] + ) + image_std = _MISTRAL_COMMON_DATASET_STD if self.is_mistral_format else self.preprocessor_config["image_std"] self.gguf_writer.add_vision_image_mean(image_mean) self.gguf_writer.add_vision_image_std(image_std) @@ -1416,6 +1664,16 @@ def _find_param(self, obj: dict[str, 
Any], keys: Iterable[str], optional: bool = return None raise KeyError(f"could not find any of: {keys}") + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, name, n_dims # unused + if ".patch_embd.weight" in new_name: + return ( + gguf.GGMLQuantizationType.F16 + if self.ftype == gguf.LlamaFileType.MOSTLY_F16 + else gguf.GGMLQuantizationType.F32 + ) + return False + @ModelBase.register("GPTNeoXForCausalLM") class GPTNeoXModel(TextModel): @@ -1444,6 +1702,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter tensors: list[tuple[str, Tensor]] = [] if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) data_torch = torch.cat( ( @@ -1498,6 +1759,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter tensors: list[tuple[str, Tensor]] = [] if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): + # Map bloom-style qkv_linear to gpt-style qkv_linear + # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa + # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) data_torch = torch.cat( ( @@ -1599,6 +1863,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) + # note: config provides rms norm but it is actually layer norm + # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) @@ -2050,6 +2316,9 @@ def __init__(self, *args, **kwargs): self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) def _set_vocab_mistral(self): + if not _mistral_common_installed: + raise ImportError(_mistral_import_error_msg) + vocab = MistralVocab(self.dir_model) logger.info(f"Converting tokenizer {vocab.tokenizer_type} of size {vocab.vocab_size}.") @@ -2071,8 +2340,7 @@ def _set_vocab_mistral(self): self.gguf_writer.add_token_merges(vocab.extract_vocab_merges_from_model()) logger.info( - f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, " - f"{vocab.unk_id}, {vocab.pad_id}." + f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}." ) self.gguf_writer.add_bos_token_id(vocab.bos_id) @@ -2090,8 +2358,19 @@ def _set_vocab_mistral(self): template_dir = Path(__file__).parent / "models/templates/" - template = MistralModel.get_community_chat_template(vocab, template_dir) - self.gguf_writer.add_chat_template(template) + if not self.is_mistral_format or not self.disable_mistral_community_chat_template: + # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`. + if self.is_mistral_format: + logger.info( + "Using a Mistral community chat template. 
These templates can be subject to errors in early days or weeks after a release. " + "Mistral recommends to use `mistral-common` to perform tokenization and detokenization." + ) + template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format) + self.gguf_writer.add_chat_template(template) + else: + logger.info( + "Not using a Mistral community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`." + ) def set_vocab(self): if self.is_mistral_format: @@ -2295,18 +2574,21 @@ def set_gguf_parameters(self): ) class LlavaVisionModel(MmprojModel): img_break_tok_id = -1 + use_break_tok = True def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if self.hparams.get("model_type") == "pixtral": # layer_norm_eps is not in config.json, it is hard-coded in modeling_pixtral.py self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5) - self.img_break_tok_id = self.get_token_id("[IMG_BREAK]") + if self.use_break_tok: + self.img_break_tok_id = self.get_token_id("[IMG_BREAK]") elif self.is_mistral_format: # hparams is already vision config here so norm_eps is only defined in global_config. self.hparams["norm_eps"] = self.global_config.get("norm_eps", None) assert self.hparams["norm_eps"] is not None, "norm_eps not found in params.json" - self.img_break_tok_id = self.find_vparam(["image_break_token_id"]) + if self.use_break_tok: + self.img_break_tok_id = self.find_vparam(["image_break_token_id"]) else: raise ValueError(f"Unsupported model type: {self.hparams['model_type']}") logger.info(f"Image break token id: {self.img_break_tok_id}") @@ -2378,7 +2660,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") class SmolVLMModel(MmprojModel): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) if self.hparams["model_type"] == "smolvlm_vision": @@ -2395,11 +2676,14 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2)) self.gguf_writer.add_vision_use_gelu(True) + # Add the preprocessor longest edge size + preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size) + self.gguf_writer.add_vision_preproc_image_size(preproc_image_size) + def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, new_name, n_dims # unused if ".embeddings." 
in name: return gguf.GGMLQuantizationType.F32 - return False + return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -2411,7 +2695,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors -@ModelBase.register("Llama4ForConditionalGeneration") +@ModelBase.register( + "Llama4ForConditionalGeneration", + "Llama4ForCausalLM", +) class Llama4Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA4 undo_permute = False @@ -2429,6 +2716,10 @@ def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_interleave_moe_layer_step(self.hparams["interleave_moe_layer_step"]) self.gguf_writer.add_expert_feed_forward_length(self.hparams["intermediate_size_moe"]) + if "layer_types" in self.hparams: + if all(lt == "full_attention" for lt in self.hparams["layer_types"]): + # all layers are full attention (for MobileLLM), disable swa + self.gguf_writer.add_sliding_window(0) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): if name.startswith("language_model."): @@ -2456,7 +2747,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): @ModelBase.register("Llama4ForConditionalGeneration") class Llama4VisionModel(MmprojModel): - def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LLAMA4) @@ -2690,6 +2980,9 @@ def weight_quant(self, weight: Tensor) -> Tensor: weight = weight.float() scale = weight.abs().mean().clamp(min=1e-5) iscale = 1 / scale + # TODO: multiply by the scale directly instead of inverting it twice + # (this is also unnecessarily doubly inverted upstream) + # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 result = (weight * iscale).round().clamp(-1, 1) / iscale return result.type(dtype) @@ -2714,12 +3007,22 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data_torch) -@ModelBase.register("GrokForCausalLM") +@ModelBase.register("GrokForCausalLM", "Grok1ForCausalLM") class GrokModel(TextModel): model_arch = gguf.MODEL_ARCH.GROK def set_vocab(self): - self._set_vocab_sentencepiece() + if (self.dir_model / "tokenizer.model").is_file(): + self._set_vocab_sentencepiece() + return + + if not (self.dir_model / "tokenizer.json").is_file() or not (self.dir_model / "chat_template.jinja").is_file(): + logger.error( + "Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer" + ) + sys.exit(1) + + self._set_vocab_gpt2() def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -2727,11 +3030,46 @@ def __init__(self, *args, **kwargs): def set_gguf_parameters(self): super().set_gguf_parameters() - _experts: list[dict[str, Tensor]] | None = None + self.gguf_writer.add_attn_logit_softcapping(self.hparams.get("attn_logit_softcapping", 30.0)) + self.gguf_writer.add_router_logit_softcapping(self.hparams.get("router_logit_softcapping", 30.0)) + if final_logit_softcap := self.hparams.get("final_logit_softcapping"): + self.gguf_writer.add_final_logit_softcapping(final_logit_softcap) + + if (rope_dim := self.hparams.get("head_dim")) is None: + rope_dim = self.hparams["hidden_size"] // self.hparams["num_attention_heads"] + + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is 
not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + + # Treat "original" as "yarn", seems to have been a mistake + if self.hparams.get("rope_type") in ("yarn", "original"): + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(self.hparams["scaling_factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["original_max_position_embeddings"]) + self.gguf_writer.add_rope_scaling_yarn_ext_factor(self.hparams["extrapolation_factor"]) + self.gguf_writer.add_rope_scaling_yarn_attn_factor(self.hparams["attn_factor"]) + self.gguf_writer.add_rope_scaling_yarn_beta_fast(self.hparams["beta_fast"]) + self.gguf_writer.add_rope_scaling_yarn_beta_slow(self.hparams["beta_slow"]) + + if temp_len := self.hparams.get("attn_temperature_len"): + self.gguf_writer.add_attn_temperature_length(temp_len) + + self.gguf_writer.add_attn_output_scale(self.hparams.get("attn_output_multiplier", rope_dim**-0.5)) + self.gguf_writer.add_embedding_scale(self.hparams["embedding_multiplier_scale"]) + self.gguf_writer.add_logit_scale(self.hparams["output_multiplier_scale"]) + + _experts: list[dict[str, list[Tensor]]] | None = None + _cur_expert = "" def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + tensors: list[tuple[str, Tensor]] = [] + is_expert = ".moe." in name or ".block_sparse_moe.experts." in name + + if not is_expert: + tensors.append((self.map_tensor_name(name), data_torch)) + # process the experts separately - if name.find(".moe.") != -1: + if is_expert or self._cur_expert: n_experts = self.hparams["num_local_experts"] assert bid is not None @@ -2739,32 +3077,41 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if self._experts is None: self._experts = [{} for _ in range(self.block_count)] - self._experts[bid][name] = data_torch - - if len(self._experts[bid]) >= n_experts * 3: - tensors: list[tuple[str, Tensor]] = [] + # concatenate split tensors + if name in self._experts[bid]: + self._cur_expert = name + self._experts[bid][name].append(data_torch) + return [] + elif is_expert: + self._cur_expert = name + self._experts[bid][name] = [data_torch] + return [] + else: + self._cur_expert = "" - # merge the experts into a single 3d tensor - for wid in ["linear", "linear_1", "linear_v"]: - data: list[Tensor] = [] + for bid in range(self.block_count): + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for wid in [("linear", "w1", 0), ("linear_1", "w2", 1), ("linear_v", "w3", 0)]: + data: list[Tensor] = [] - for xid in range(n_experts): - ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight" - data.append(self._experts[bid][ename]) - del self._experts[bid][ename] + for xid in range(n_experts): + ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid[0]}.weight" + if ename not in self._experts[bid]: + ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.{wid[1]}.weight" + tensor_list = self._experts[bid][ename] + data.append(torch.cat(tensor_list, dim=wid[2]) if len(tensor_list) > 1 else tensor_list[0]) + del self._experts[bid][ename] - data_torch = torch.stack(data, dim=0) + data_torch = torch.stack(data, dim=0) - merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight" + merged_name = f"transformer.decoder_layer.{bid}.moe.{wid[0]}.weight" - new_name = self.map_tensor_name(merged_name) + new_name = self.map_tensor_name(merged_name) - tensors.append((new_name, 
data_torch)) - return tensors - else: - return [] + yield (new_name, data_torch) - return [(self.map_tensor_name(name), data_torch)] + yield from tensors @ModelBase.register("DbrxForCausalLM") @@ -3032,6 +3379,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter or name.startswith("multi_modal_projector") or name.startswith("vision_model") or name.startswith("audio_tower") + or name.startswith("model.vision_tower") + or name.startswith("model.multi_modal_projector") ): # skip vision and audio tensors return [] @@ -3213,7 +3562,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Ernie4_5_ForCausalLM") +@ModelBase.register("Ernie4_5_ForCausalLM", "Ernie4_5ForCausalLM") class Ernie4_5Model(TextModel): model_arch = gguf.MODEL_ARCH.ERNIE4_5 @@ -3393,7 +3742,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("Qwen2VLModel", "Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") class Qwen2VLVisionModel(MmprojModel): - def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) assert self.hparams_vision is not None @@ -3433,12 +3781,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6)) def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, name, n_dims # unused - if ".patch_embd." in new_name: - return gguf.GGMLQuantizationType.F16 if ".position_embd." in new_name: return gguf.GGMLQuantizationType.F32 - return False + return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -3511,10 +3856,9 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield ("audio_tower.embed_positions.weight", pos_embd) def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, new_name, n_dims # unused if ".conv" in name and ".weight" in name: return gguf.GGMLQuantizationType.F16 - return False + return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.startswith("thinker."): @@ -3536,7 +3880,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("InternVisionModel") class InternVisionModel(MmprojModel): - def set_gguf_parameters(self): assert self.hparams_vision is not None if isinstance(self.hparams_vision["image_size"], list): @@ -3561,12 +3904,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio)) def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, name, n_dims # unused - if ".patch_embd." in new_name: - return gguf.GGMLQuantizationType.F16 if ".position_embd." 
in new_name:
             return gguf.GGMLQuantizationType.F32
-        return False
+        return super().tensor_force_quant(name, new_name, bid, n_dims)

     def _mapping_interns1_name(self, name):
         names_map = {
@@ -3735,95 +4075,110 @@ def prepare_tensors(self):
 class Qwen3Model(Qwen2Model):
     model_arch = gguf.MODEL_ARCH.QWEN3

-
-@ModelBase.register("Qwen3MoeForCausalLM")
-class Qwen3MoeModel(Qwen2MoeModel):
-    model_arch = gguf.MODEL_ARCH.QWEN3MOE
+    # extra logic for rerank models
+    is_rerank: bool = False
+    is_tied_embeddings: bool = False
+    token_false_id: int | None = None
+    token_true_id: int | None = None

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
-        hparams = ModelBase.load_hparams(self.dir_model, False)
+
+        # track for intern-s1-mini
+        hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
         self.origin_hf_arch = hparams.get("architectures", [None])[0]

+        # a bit hacky, but currently the only way to detect if this is a rerank model
+        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
+        readme_path = self.dir_model / "README.md"
+        readme_text = ""
+        if readme_path.exists():
+            with readme_path.open("r", encoding="utf-8") as f:
+                readme_text = f.read()
+        if "# Qwen3-Reranker" in readme_text:
+            self._find_rerank_config()
+
     def set_vocab(self):
-        # deal with intern-s1
+        # deal with intern-s1-mini
         if self.origin_hf_arch == "InternS1ForConditionalGeneration":
             self._set_vocab_interns1()
             return

-        try:
-            self._set_vocab_sentencepiece()
-        except FileNotFoundError:
-            self._set_vocab_gpt2()
-
-    def _set_vocab_interns1(self):
-        tokens: list[str] = []
-        toktypes: list[int] = []
+        super().set_vocab()

+    def _find_rerank_config(self):
         from transformers import AutoTokenizer

-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
-        vocab = getattr(tokenizer, "vocab", tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)

-        tokpre = self.get_vocab_base_pre(tokenizer)
+        self.is_rerank = True
+        self.is_tied_embeddings = self.hparams.get("tie_word_embeddings", False)
+        self.token_false_id = tokenizer.convert_tokens_to_ids("no")
+        self.token_true_id = tokenizer.convert_tokens_to_ids("yes")
+        self.sep_token_id = tokenizer.convert_tokens_to_ids("|")

-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
-        added_vocab = tokenizer.get_added_vocab()
+        assert self.token_false_id is not None and self.token_true_id is not None

-        added_tokens_decoder = tokenizer.added_tokens_decoder
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        if self.is_rerank:
+            self.gguf_writer.add_pooling_type(gguf.PoolingType.RANK)
+            self.gguf_writer.add_classifier_output_labels(["yes", "no"])
+            self.gguf_writer.add_chat_template(
+                [
+                    {
+                        "name": "rerank",
+                        "template": '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n'
+                        "<|im_start|>user\n<Instruct>: Given a web search query, retrieve relevant passages that answer the query\n<Query>: {query}\n<Document>: {document}<|im_end|>\n"
+                        "<|im_start|>assistant\n<think>\n\n</think>\n\n",
+                    }
+                ]
+            )

-        for i in range(vocab_size):
-            if i not in reverse_vocab:
-                tokens.append(f"[PAD{i}]")
-                toktypes.append(gguf.TokenType.UNUSED)
-            else:
-                token: str = reverse_vocab[i]
-                if token in added_vocab:
-                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
- # To avoid unexpected issues - we make sure to normalize non-normalized tokens - if not added_tokens_decoder[i].normalized: - previous_token = token - token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) - if previous_token != token: - logger.info( - f"{repr(previous_token)} is encoded and decoded back to {repr(token)} " - "using AutoTokenizer" - ) + def _get_cls_out_tensor(self, data_torch: Tensor) -> Tensor: + # extract "yes" and "no" tokens from the output lm_head tensor + false_row = data_torch[self.token_false_id] + true_row = data_torch[self.token_true_id] + return torch.stack([true_row, false_row], dim=0) - if added_tokens_decoder[i].special or self.does_token_look_special(token): - toktypes.append(gguf.TokenType.CONTROL) - else: - toktypes.append(gguf.TokenType.USER_DEFINED) - else: - toktypes.append(gguf.TokenType.NORMAL) - tokens.append(token) + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "model.vision_" in name: + # skip multimodal tensors + return [] - self.gguf_writer.add_tokenizer_model("gpt2") - self.gguf_writer.add_tokenizer_pre(tokpre) - self.gguf_writer.add_token_list(tokens) - self.gguf_writer.add_token_types(toktypes) + if self.is_rerank: + is_tied_head = self.is_tied_embeddings and "embed_tokens" in name + is_real_head = not self.is_tied_embeddings and "lm_head" in name + if is_tied_head or is_real_head: + cls_out_head = ( + gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.CLS_OUT] + ".weight", + self._get_cls_out_tensor(data_torch), + ) + if is_tied_head: + embed = (self.map_tensor_name(name), data_torch) + return [cls_out_head, embed] + if is_real_head: + return [cls_out_head] - special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) - special_tokens_map_file = self.dir_model / "special_tokens_map.json" - additional_special_tokens = [] - if special_tokens_map_file.is_file(): - with open(special_tokens_map_file, encoding="utf-8") as f: - additional_special_tokens = json.load(f).get("additional_special_tokens", []) - tokenizer_cfg_file = self.dir_model / "special_tokens_map.json" - if tokenizer_cfg_file.is_file(): - with open(tokenizer_cfg_file, encoding="utf-8") as f: - added_tokens_decoder = json.load(f).get("added_tokens_decoder", {}) - token2ids_map = { - data["content"]: int(token) for token, data in added_tokens_decoder.items() if data["special"] - } - for token in additional_special_tokens: - if token in token2ids_map: - special_vocab._set_special_token(token, token2ids_map[token]) - special_vocab._set_special_token("eos", 151645) - special_vocab._set_special_token("bos", 151643) - special_vocab.add_to_gguf(self.gguf_writer) + return super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("Qwen3MoeForCausalLM") +class Qwen3MoeModel(Qwen2MoeModel): + model_arch = gguf.MODEL_ARCH.QWEN3MOE + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + hparams = ModelBase.load_hparams(self.dir_model, False) + self.origin_hf_arch = hparams.get("architectures", [None])[0] + + def set_vocab(self): + # deal with intern-s1 + if self.origin_hf_arch == "InternS1ForConditionalGeneration": + self._set_vocab_interns1() + return + + super().set_vocab() @ModelBase.register("GPT2LMHeadModel") @@ -3958,8 +4313,7 @@ def set_vocab(self): if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} ' - f'-> {token.decode("utf-8")!r}' + f'replacing token 
{token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' ) tokens[token_id] = token scores[token_id] = -1000.0 @@ -3978,8 +4332,7 @@ def set_vocab(self): if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} ' - f'-> {token.decode("utf-8")!r}' + f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' ) tokens[token_id] = token scores[token_id] = -1000.0 @@ -4061,8 +4414,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: raise ValueError( - f"The length of rope long and short factors must be {rope_dims / 2}. " - f"long_factors = {len(long_factors)}, short_factors = {len(short_factors)}." + f"The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}." ) yield ( @@ -4277,7 +4629,8 @@ def set_gguf_parameters(self): # This logic matches modeling_plamo.py's is_mamba function mamba_step = hparams.get("mamba_step", 2) mamba_enabled = hparams.get("mamba_enabled", True) - mamba_layers = [] + num_key_value_heads = [] + num_attention_heads = [] if mamba_enabled: for i in range(block_count): @@ -4287,17 +4640,21 @@ def set_gguf_parameters(self): else: is_mamba = (i % mamba_step) != (mamba_step // 2) if is_mamba: - mamba_layers.append(0) + num_key_value_heads.append(0) + num_attention_heads.append(0) else: - mamba_layers.append(hparams.get("num_key_value_heads", 4)) + num_key_value_heads.append(hparams.get("num_key_value_heads", 4)) + num_attention_heads.append(hparams.get("num_attention_heads", 32)) - if mamba_layers: - self.gguf_writer.add_head_count_kv(mamba_layers) + if num_key_value_heads and num_attention_heads: + self.gguf_writer.add_head_count_kv(num_key_value_heads) + self.gguf_writer.add_head_count(num_attention_heads) self.gguf_writer.add_context_length(hparams.get("max_position_embeddings", 2048)) self.gguf_writer.add_embedding_length(hparams.get("hidden_size", 4096)) + self.gguf_writer.add_key_length(hparams.get("hidden_size_per_head", 128)) + self.gguf_writer.add_value_length(hparams.get("hidden_size_per_head", 128)) self.gguf_writer.add_block_count(block_count) - self.gguf_writer.add_head_count(hparams.get("num_attention_heads", 32)) self.gguf_writer.add_layer_norm_rms_eps(hparams.get("rms_norm_eps", 1e-06)) self.gguf_writer.add_rope_freq_base(hparams.get("rope_theta", 10000)) @@ -4368,31 +4725,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR) self.gguf_writer.add_rope_scaling_factor(1.0) - _has_tok_embd = False - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - del bid # unused - - output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT) - tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD) - - new_name = self.map_tensor_name(name) - - # assuming token_embd.weight is seen before output.weight - if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT): - # even though the tensor file(s) does not contain the word embeddings they are still in the weight map - if self.tensor_names and "transformer.wte.weight" in self.tensor_names: - logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied") - 
self.tensor_names.remove("transformer.wte.weight") - elif new_name == tok_embd_name: - self._has_tok_embd = True - - return [(new_name, data_torch)] - - -@ModelBase.register("InternLM2ForCausalLM") -class InternLM2Model(TextModel): - model_arch = gguf.MODEL_ARCH.INTERNLM2 +@ModelBase.register("InternLM2ForCausalLM") +class InternLM2Model(TextModel): + model_arch = gguf.MODEL_ARCH.INTERNLM2 def set_vocab(self): # (TODO): Is there a better way? @@ -4475,8 +4811,7 @@ def set_vocab(self): if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} ' - f'-> {token.decode("utf-8")!r}' + f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' ) tokens[token_id] = token scores[token_id] = -1000.0 @@ -4498,8 +4833,7 @@ def set_vocab(self): if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r}' - f' -> {token.decode("utf-8")!r}' + f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' ) tokens[token_id] = token scores[token_id] = -1000.0 @@ -5035,11 +5369,41 @@ def modify_tensors(self, data_torch, name, bid): @ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT + _lora_files = {} + _lora_names = [] - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): + hparams = kwargs.pop("hparams", None) + if hparams is None: + hparams = ModelBase.load_hparams(dir_model, False) + + if lora_names := hparams.get("lora_adaptations"): + self._lora_names = lora_names + self.model_arch = gguf.MODEL_ARCH.JINA_BERT_V3 + + super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) self._xlmroberta_tokenizer_init() + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + if self._lora_names: + for name in self._lora_names: + fname = self.add_prefix_to_filename(self.fname_out, f"lora-{name}-") + self._lora_files[name] = gguf.GGUFWriter( + fname, + arch=gguf.MODEL_ARCH_NAMES[self.model_arch], + endianess=self.endianess, + use_temp_file=self.use_temp_file, + dry_run=self.dry_run, + ) + + return super().generate_extra_tensors() + + def set_type(self): + for lora_writer in self._lora_files.values(): + lora_writer.add_type(gguf.GGUFType.ADAPTER) + lora_writer.add_string(gguf.Keys.Adapter.TYPE, "lora") + super().set_type() + def set_vocab(self): self._xlmroberta_set_vocab() @@ -5048,13 +5412,61 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main name = name.removeprefix("roberta.") + # jina-embeddings-v3 + if ".parametrizations." 
in name: + name = name.replace(".parametrizations.", ".") + name = name.removesuffix(".original") + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor if name == "embeddings.position_embeddings.weight": if self._position_offset is not None: data_torch = data_torch[self._position_offset :, :] + if name.endswith(".0.lora_A") or name.endswith(".0.lora_B"): + if name.startswith("pooler.dense"): + return [] + + num_loras = data_torch.size(0) + assert num_loras == len(self._lora_names) + + # Split out each LoRA in their own GGUF + for i, lora_writer in enumerate(self._lora_files.values()): + new_name = self.map_tensor_name(name[:-9]) + name[-7:].lower() + data = data_torch[i, :, :] + # Transpose/flip token_embd/types into correct shape + if new_name == "token_embd.weight.lora_b": + data = data.T + elif new_name.startswith("token_types.weight."): + new_name = new_name[:-1] + ("a" if new_name[-1:] == "b" else "b") + lora_writer.add_tensor(new_name, data.float().numpy(), raw_dtype=gguf.GGMLQuantizationType.F32) + + return [] + return super().modify_tensors(data_torch, name, bid) + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # jina-embeddings-v3 + if rotary_emb_base := self.hparams.get("rotary_emb_base"): + self.gguf_writer.add_rope_freq_base(rotary_emb_base) + lora_alpha = self.hparams.get("lora_alpha") + if lora_prompt_prefixes := self.hparams.get("task_instructions"): + assert self._lora_files and all(lora_name in lora_prompt_prefixes for lora_name in self._lora_files.keys()) + for lora_name, lora_writer in self._lora_files.items(): + lora_writer.add_float32(gguf.Keys.Adapter.LORA_ALPHA, lora_alpha if lora_alpha is not None else 1.0) + lora_writer.add_string(gguf.Keys.Adapter.LORA_TASK_NAME, lora_name) + if lora_prompt_prefixes: + lora_writer.add_string(gguf.Keys.Adapter.LORA_PROMPT_PREFIX, lora_prompt_prefixes[lora_name]) + + def write(self): + super().write() + for lora_writer in self._lora_files.values(): + lora_writer.write_header_to_file() + lora_writer.write_kv_data_to_file() + lora_writer.write_tensors_to_file(progress=True) + lora_writer.close() + @ModelBase.register("GemmaForCausalLM") class GemmaModel(TextModel): @@ -5103,6 +5515,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") return [] + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -5147,6 +5560,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") return [] + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -5217,9 +5631,88 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] +@ModelBase.register("Gemma3TextModel") +class EmbeddingGemma(Gemma3Model): + model_arch = gguf.MODEL_ARCH.GEMMA_EMBEDDING + module_paths = [] + dense_features_dims = {} + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.sentence_transformers_dense_modules: + # read modules.json to determine if model has Dense 
layers + modules_file = self.dir_model / "modules.json" + if modules_file.is_file(): + with open(modules_file, encoding="utf-8") as modules_json_file: + mods = json.load(modules_json_file) + for mod in mods: + if mod["type"] == "sentence_transformers.models.Dense": + mod_path = mod["path"] + # check if model.safetensors file for Dense layer exists + model_tensors_file = self.dir_model / mod_path / "model.safetensors" + if model_tensors_file.is_file(): + self.module_paths.append(mod_path) + # read config.json of the Dense layer to get in/out features + mod_conf_file = self.dir_model / mod_path / "config.json" + if mod_conf_file.is_file(): + with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file: + mod_conf = json.load(mod_conf_json_file) + # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights + prefix = self._get_dense_prefix(mod_path) + if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None: + self.dense_features_dims[prefix] = ( + mod_conf["in_features"], + mod_conf["out_features"], + ) + + def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: + from safetensors.torch import load_file + + module_paths = list(self.module_paths) + for i, module_path in enumerate(module_paths): + tensors_file = self.dir_model / module_path / "model.safetensors" + local_tensors = load_file(tensors_file) + tensor_name = self._get_dense_prefix(module_path) + for name, local_tensor in local_tensors.items(): + if not name.endswith(".weight"): + continue + orig_name = name.replace("linear", tensor_name) + name = self.map_tensor_name(orig_name) + yield name, local_tensor.clone() + + @staticmethod + def _get_dense_prefix(module_path) -> str: + """Get the tensor name prefix for the Dense layer from module path.""" + tensor_name = "dense_2" if module_path == "2_Dense" else "dense_3" + return tensor_name + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + # Override the sliding window size as it gets adjusted by the Gemma3TextConfig + # constructor. We want to use the value from the original model's config.json. + # ref: https://github.com/huggingface/transformers/pull/40700 + with open(self.dir_model / "config.json", "r", encoding="utf-8") as f: + config = json.load(f) + orig_sliding_window = config.get("sliding_window") + if orig_sliding_window is None: + raise ValueError("sliding_window not found in model config - this is required for the model") + + logger.info( + f"Using original sliding_window from config: {orig_sliding_window} " + f"instead of {self.hparams['sliding_window']}" + ) + self.gguf_writer.add_sliding_window(orig_sliding_window) + if self.sentence_transformers_dense_modules: + for dense, dims in self.dense_features_dims.items(): + logger.info(f"Setting dense layer {dense} in/out features to {dims}") + self.gguf_writer.add_dense_features_dims(dense, dims[0], dims[1]) + + self._try_set_pooling_type() + + @ModelBase.register("Gemma3ForConditionalGeneration") class Gemma3VisionModel(MmprojModel): - def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams @@ -5239,13 +5732,12 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_projector_scale_factor(proj_scale_factor) def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, new_name, n_dims # unused # related to https://github.com/ggml-org/llama.cpp/issues/13025 if "input_projection" in name: return gguf.GGMLQuantizationType.F16 if ".embeddings." 
in name: return gguf.GGMLQuantizationType.F32 - return False + return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -5755,6 +6247,9 @@ def set_gguf_parameters(self): d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 use_dt_b_c_norm = False @@ -5910,26 +6405,21 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter class JambaModel(TextModel): model_arch = gguf.MODEL_ARCH.JAMBA - def get_vocab_base_pre(self, tokenizer) -> str: - del tokenizer # unused - - return "gpt-2" - def set_vocab(self): if (self.dir_model / "tokenizer.model").is_file(): - # Using Jamba's tokenizer.json causes errors on model load - # (something about "byte not found in vocab"), - # but there's a working tokenizer.model self._set_vocab_sentencepiece() else: - # Some Jamba models only have a tokenizer.json, which works. - self._set_vocab_gpt2() + self._set_vocab_llama_hf() + self.gguf_writer.add_add_space_prefix(False) def set_gguf_parameters(self): d_model = self.find_hparam(["hidden_size", "mamba_d_model"]) d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4 d_inner = self.hparams["mamba_expand"] * d_model d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16 + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16) rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6 n_kv_head = self.hparams["num_key_value_heads"] @@ -6084,10 +6574,40 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] +@ModelBase.register("SeedOssForCausalLM") +class SeedOssModel(TextModel): + model_arch = gguf.MODEL_ARCH.SEED_OSS + + @ModelBase.register("Olmo2ForCausalLM") +@ModelBase.register("Olmo3ForCausalLM") class Olmo2Model(TextModel): model_arch = gguf.MODEL_ARCH.OLMO2 + def set_gguf_parameters(self): + super().set_gguf_parameters() + + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_attn_factors(rope_scaling["attention_factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + + if "sliding_window" in self.hparams: + self.gguf_writer.add_sliding_window(self.hparams["sliding_window"]) + + sliding_window_pattern = [] + if "layer_types" in self.hparams: + sliding_window_pattern = [t == "sliding_attention" for t 
in self.hparams["layer_types"]] + else: + # Olmo2 does not use sliding window attention. + # Olmo3 defaults to using sliding window for all layers except every 4th. + for i in range(self.hparams["num_hidden_layers"]): + sliding_window_pattern.append((i + 1) % 4 != 0) + + self.gguf_writer.add_sliding_window_pattern(sliding_window_pattern) + @ModelBase.register("OlmoeForCausalLM") class OlmoeModel(TextModel): @@ -6173,6 +6693,7 @@ class OpenELMModel(TextModel): @staticmethod def _make_divisible(v: float | int, divisor: int) -> int: + # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: @@ -6315,8 +6836,7 @@ def set_vocab(self): token_score = 0.0 logger.info( - f"Setting added token {token_id} to '{token_content}' " - "(type: {token_type}, score: {token_score:.2f})" + f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})" ) tokens[token_id] = token_content.encode("utf-8") toktypes[token_id] = token_type @@ -6485,9 +7005,11 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@ModelBase.register("DeepseekV2ForCausalLM") -@ModelBase.register("DeepseekV3ForCausalLM") -@ModelBase.register("KimiVLForConditionalGeneration") +@ModelBase.register( + "DeepseekV2ForCausalLM", + "DeepseekV3ForCausalLM", + "KimiVLForConditionalGeneration", +) class DeepseekV2Model(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 @@ -6834,6 +7356,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_embedding_length(self.hparams["d_model"]) self.gguf_writer.add_feed_forward_length(self.hparams["d_ff"]) self.gguf_writer.add_block_count(self.hparams["num_layers"]) + if (dec_n_layer := self.hparams.get("num_decoder_layers")) is not None: + self.gguf_writer.add_decoder_block_count(dec_n_layer) self.gguf_writer.add_head_count(self.hparams["num_heads"]) self.gguf_writer.add_key_length(self.hparams["d_kv"]) self.gguf_writer.add_value_length(self.hparams["d_kv"]) @@ -6846,6 +7370,10 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: if not self.shared_token_embeddings_found: name = "shared.weight" @@ -6978,6 +7506,10 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused + # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", + # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored + # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder + # and decoder and ignore the remaining ones. 
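+        # (e.g. a checkpoint carrying both "shared.weight" and "encoder.embed_tokens.weight" exports only
+        # whichever of the two is encountered first; later duplicates are dropped by the branch below.)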
if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: if not self.shared_token_embeddings_found: name = "shared.weight" @@ -7146,8 +7678,8 @@ def set_vocab(self): and "visible_text(m.content).endswith" in special_vocab.chat_template ): special_vocab.chat_template = special_vocab.chat_template.replace( - """{{ visible_text(m.content) }}\n{{- '/nothing' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothing")) else '' -}}""", # pylint: disable=C0301 - """{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothing' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothing")) else '' -}}""", # pylint: disable=C0301 + """{{ visible_text(m.content) }}\n{{- '/nothing' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothing")) else '' -}}""", + """{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothing' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothing")) else '' -}}""", ) special_vocab.add_to_gguf(self.gguf_writer) @@ -7272,6 +7804,8 @@ def set_vocab_chatglm3(self): text = piece.encode("utf-8") score = 0.0 + # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py), + # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size() if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): score = tokenizer.tokenizer.sp_model.get_score(token_id) @@ -7699,15 +8233,30 @@ def __init__(self, *args, **kwargs): self._attn_layers = self.get_attn_layers() self._ssm_layers = [i for i in range(self.block_count) if i not in self._attn_layers] + # There are some models in this family that are non-hybrid, but keep the + # same parent class by setting all layers to "attention." 
If this is the + # case, the model architecture needs to be updated to a standard + # "granite" or "granitemoe" model + if not self._ssm_layers: + has_experts = self.find_hparam(["num_experts_per_tok"], optional=True) + new_arch = gguf.MODEL_ARCH.GRANITE_MOE if has_experts else gguf.MODEL_ARCH.GRANITE + self.model_arch = new_arch + self.gguf_writer.arch = gguf.MODEL_ARCH_NAMES[new_arch] + self.gguf_writer.add_architecture() + # n_group and d_inner are used during reshape_tensors for mamba2 - self.d_model = self.find_hparam(["hidden_size", "d_model"]) - self.n_group = self.find_hparam(["n_groups"]) - self.d_inner = self.find_hparam(["expand"]) * self.d_model + # NOTE: Explicitly include hparam prefix prefix for d_model to + # disambiguate with top-level head_dim + # NOTE 2: If needed for future models, this can be isolated in a method + # to separate the prefix setting and the keys used + self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"]) + self.n_group = self.find_hparam(["n_groups", "num_groups"]) + self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model def get_attn_layers(self): # Explicit list of layer type names if layer_types := self.hparams.get("layer_types"): - return [i for i, layer_type in enumerate(layer_types) if layer_type == "attention"] + return [i for i, typ in enumerate(layer_types) if typ == "attention"] # Layer types indicated by index or period attn_layers = self.hparams.get("attn_layer_indices", []) @@ -7749,12 +8298,12 @@ def set_gguf_parameters(self): ## Mamba mixer params ## self.gguf_writer.add_ssm_conv_kernel(self.find_hparam(["conv_kernel", "d_conv"])) - self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state"])) + self.gguf_writer.add_ssm_state_size(self.find_hparam(["state_size", "d_state", "state_dim", "ssm_state_size"])) self.gguf_writer.add_ssm_group_count(self.n_group) self.gguf_writer.add_ssm_inner_size(self.d_inner) # NOTE: The mamba_dt_rank is _not_ the right field for how this is used # in llama.cpp - self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads"])) + self.gguf_writer.add_ssm_time_step_rank(self.find_hparam(["n_heads", "num_heads"])) ## Attention params ## head_count_kv = self.find_hparam(["num_key_value_heads", "n_head_kv"]) @@ -7763,8 +8312,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_dimension_count(rope_dim) self.gguf_writer.add_head_count_kv(head_count_kv_vec) - ## If Bamba, use rope, otherwise don't - use_rope = "BambaForCausalLM" in self.hparams["architectures"] + ## If Bamba or non-hybrid, use rope, otherwise don't + use_rope = "BambaForCausalLM" in self.hparams["architectures"] or not self._ssm_layers self.gguf_writer.add_rope_scaling_finetuned(use_rope) if not use_rope: self.gguf_writer.add_context_length(2**20) @@ -7779,6 +8328,58 @@ def set_vocab(self): Mamba2Model.set_vocab(self) +@ModelBase.register("NemotronHForCausalLM") +class NemotronHModel(GraniteHybridModel): + """Hybrid mamba2/attention model from NVIDIA""" + + model_arch = gguf.MODEL_ARCH.NEMOTRON_H + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # Save the top-level head_dim for later + self.head_dim = self.hparams.get("head_dim", self.hparams.get("attention_head_dim")) + assert self.head_dim is not None, "Could not find the attention head dim in config" + + # Don't use expand to calculate d_inner + self.d_inner = self.find_hparam(["num_heads"]) * self.d_model + + # Update the ssm / attn / mlp layers + # M: Mamba2, *: 
Attention, -: MLP + hybrid_override_pattern = self.hparams["hybrid_override_pattern"] + self._ssm_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "M"] + self._mlp_layers = [i for i, val in enumerate(hybrid_override_pattern) if val == "-"] + + def get_attn_layers(self): + hybrid_override_pattern = self.hparams["hybrid_override_pattern"] + assert ( + len(hybrid_override_pattern) == self.block_count + ), "Mismatch between hybrid override and num_hidden_layers!" + return [i for i, val in enumerate(hybrid_override_pattern) if val == "*"] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + self.gguf_writer.add_key_length(self.head_dim) + self.gguf_writer.add_value_length(self.head_dim) + + # Set feed_forward_length + # NOTE: This will trigger an override warning. This is preferable to + # duplicating all the parent logic + n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"]) + self.gguf_writer.add_feed_forward_length( + [n_ff if i in self._mlp_layers else 0 for i in range(self.block_count)] + ) + + def set_vocab(self): + super().set_vocab() + + # The tokenizer _does_ add a BOS token (via post_processor type + # TemplateProcessing) but does not set add_bos_token to true in the + # config, so we need to explicitly override it here. + self.gguf_writer.add_add_bos_token(True) + + @ModelBase.register("BailingMoeForCausalLM") class BailingMoeModel(TextModel): model_arch = gguf.MODEL_ARCH.BAILINGMOE @@ -7888,6 +8489,220 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("BailingMoeV2ForCausalLM") +class BailingMoeV2Model(TextModel): + model_arch = gguf.MODEL_ARCH.BAILINGMOE2 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if nextn_layers := self.hparams.get("num_nextn_predict_layers", 0): + self.block_count = self.hparams["num_hidden_layers"] + nextn_layers + self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) + + def set_vocab(self): + self._set_vocab_gpt2() + + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if (rope_dim := hparams.get("head_dim")) is None: + rope_dim = hparams["hidden_size"] // hparams["num_attention_heads"] + + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + else: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) + self.gguf_writer.add_leading_dense_block_count(hparams["first_k_dense_replace"]) + self.gguf_writer.add_vocab_size(hparams["vocab_size"]) + self.gguf_writer.add_expert_feed_forward_length(hparams["moe_intermediate_size"]) + self.gguf_writer.add_expert_shared_feed_forward_length( + hparams.get( + "moe_shared_expert_intermediate_size", hparams["moe_intermediate_size"] * hparams["num_shared_experts"] + ) + ) + self.gguf_writer.add_expert_weights_scale(hparams["routed_scaling_factor"]) + self.gguf_writer.add_expert_count(hparams["num_experts"]) + self.gguf_writer.add_expert_shared_count(hparams["num_shared_experts"]) + self.gguf_writer.add_expert_weights_norm(hparams["norm_topk_prob"]) + + if 
hparams["score_function"] == "sigmoid": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + elif hparams["score_function"] == "softmax": + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) + else: + raise ValueError(f"Unsupported score_function value: {hparams['score_function']}") + + if (nextn_layers := self.hparams.get("num_nextn_predict_layers")) is not None: + self.gguf_writer.add_nextn_predict_layers(nextn_layers) + + _experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if "mlp.experts" in name: + n_experts = self.hparams["num_experts"] + assert bid is not None + + tensors: list[tuple[str, Tensor]] = [] + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + data: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + data.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(data, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + + return tensors + + if name.endswith(".expert_bias"): + name = name.replace(".expert_bias", ".expert_bias.bias") + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + +@ModelBase.register("GroveMoeForCausalLM", "modeling_grove_moe.GroveMoeForCausalLM") +class GroveMoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.GROVEMOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) + logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") + # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299 + self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128) + # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298 + self.gguf_writer.add_experts_per_group(2) + # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376 + self.gguf_writer.add_expert_group_scale(0.05) + # YaRN is not enabled by default + # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts + rope_scaling = self.hparams.get("rope_scaling") or {} + if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: + self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) + self.gguf_writer.add_rope_scaling_factor(rope_scaling["factor"]) + 
self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling["original_max_position_embeddings"]) + + _experts: list[dict[str, Tensor]] | None = None + _chunk_experts: list[dict[str, Tensor]] | None = None + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.endswith(".expert_bias"): + # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303 + return [] + + # process the experts separately + if name.find("chunk_experts") != -1: + n_experts = self.hparams["num_experts"] // 2 # see add_experts_per_group + assert bid is not None + + if self._chunk_experts is None: + self._chunk_experts = [{} for _ in range(self.block_count)] + + self._chunk_experts[bid][name] = data_torch + + if len(self._chunk_experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + data: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.chunk_experts.{xid}.{w_name}.weight" + data.append(self._chunk_experts[bid][ename]) + del self._chunk_experts[bid][ename] + + data_torch = torch.stack(data, dim=0) + + merged_name = f"model.layers.{bid}.mlp.chunk_experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + elif name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + data: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + data.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(data, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + + if self._chunk_experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + chunk_experts = [k for d in self._chunk_experts for k in d.keys()] + if len(chunk_experts) > 0: + raise ValueError(f"Unprocessed adjugate experts: {chunk_experts}") + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @ModelBase.register("ChameleonForConditionalGeneration") @ModelBase.register("ChameleonForCausalLM") # obsolete class ChameleonModel(TextModel): @@ -7921,6 +8736,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] + # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 @staticmethod def _reverse_hf_permute(data_torch, n_heads, hidden_dim): head_dim = hidden_dim // n_heads @@ -7936,8 +8752,7 @@ 
class UltravoxModel(TextModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) raise NotImplementedError( - "Ultravox does not have text decoder. Instead, it uses Llama or other models for text. " - "If you want to get the audio encoder, please use --mmproj argument" + "Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument" ) @@ -7960,10 +8775,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_audio_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) def tensor_force_quant(self, name, new_name, bid, n_dims): - del bid, new_name, n_dims # unused if ".conv" in name and ".weight" in name: return gguf.GGMLQuantizationType.F16 - return False + return super().tensor_force_quant(name, new_name, bid, n_dims) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -8191,6 +9005,8 @@ def set_gguf_parameters(self): # Rope rope_scaling = hparams.get("rope_scaling", {}) if rope_scaling.get("type") == "dynamic": + # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ + # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) alpha = rope_scaling.get("alpha", 1000) base = hparams.get("rope_theta", 10000.0) dim = hparams["hidden_size"] // hparams["num_attention_heads"] # 128 @@ -8202,6 +9018,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length self.gguf_writer.add_context_length(256 * 1024) # 256k context length + # if any of our assumptions about the values are wrong, something has changed and this may need to be updated assert ( alpha == 1000 and base == 10000.0 @@ -8256,6 +9073,76 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("LLaDAMoEModel", "LLaDAMoEModelLM") +class LLaDAMoEModel(TextModel): + model_arch = gguf.MODEL_ARCH.LLADA_MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + if (n_experts := self.hparams.get("num_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + + if (expert_intermediate_size := self.hparams.get("expert_intermediate_size")) is not None: + self.gguf_writer.add_expert_feed_forward_length(expert_intermediate_size) + + # number of experts used per token (top-k) + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + + self.gguf_writer.add_mask_token_id(156895) + self.gguf_writer.add_causal_attention(False) + self.gguf_writer.add_diffusion_shift_logits(False) + + _experts: list[dict[str, Tensor]] | None = None + + # Copied from: Qwen2MoeModel + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # process the experts separately + if name.find("experts") != -1: + n_experts = self.hparams["num_experts"] + assert bid is not None + + if self._experts is None: + self._experts = [{} for _ in range(self.block_count)] + + self._experts[bid][name] = data_torch + + if len(self._experts[bid]) >= n_experts * 3: + tensors: list[tuple[str, Tensor]] = [] + + # merge the experts into a single 3d tensor + for w_name in ["down_proj", "gate_proj", "up_proj"]: + data: list[Tensor] = [] 
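+                    # Collect this projection's 2D weight from every expert in index order; torch.stack
+                    # below fuses them into a single [n_experts, n_out, n_in] tensor under the merged name.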
+ + for xid in range(n_experts): + ename = f"model.layers.{bid}.mlp.experts.{xid}.{w_name}.weight" + data.append(self._experts[bid][ename]) + del self._experts[bid][ename] + + data_torch = torch.stack(data, dim=0) + + merged_name = f"model.layers.{bid}.mlp.experts.{w_name}.weight" + + new_name = self.map_tensor_name(merged_name) + + tensors.append((new_name, data_torch)) + return tensors + else: + return [] + + return [(self.map_tensor_name(name), data_torch)] + + # Copied from: Qwen2MoeModel + def prepare_tensors(self): + super().prepare_tensors() + + if self._experts is not None: + # flatten `list[dict[str, Tensor]]` into `list[str]` + experts = [k for d in self._experts for k in d.keys()] + if len(experts) > 0: + raise ValueError(f"Unprocessed experts: {experts}") + + @ModelBase.register("HunYuanDenseV1ForCausalLM") class HunYuanModel(TextModel): model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE @@ -8323,6 +9210,8 @@ def set_gguf_parameters(self): # Rope rope_scaling = hparams.get("rope_scaling", {}) if rope_scaling.get("type") == "dynamic": + # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ + # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) alpha = rope_scaling.get("alpha", 50) base = hparams.get("rope_theta", 10000.0) dim = hparams["head_dim"] @@ -8334,6 +9223,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length self.gguf_writer.add_context_length(256 * 1024) # 256k context length + # if any of our assumptions about the values are wrong, something has changed and this may need to be updated assert base == 10000.0 and self.hparams["max_position_embeddings"] in [ 32 * 1024, 256 * 1024, @@ -8368,6 +9258,13 @@ def set_vocab(self): class GptOssModel(TextModel): model_arch = gguf.MODEL_ARCH.GPT_OSS + # TODO: remove once MXFP4 is supported more generally + def dequant_model(self): + quant_config = self.hparams.get("quantization_config") + if quant_config is not None and quant_config.get("quant_method") == "mxfp4": + return + return super().dequant_model() + def transform_nibble_layout(self, tensor): assert tensor.dtype == torch.uint8 assert tensor.shape[-1] == 16 @@ -8488,8 +9385,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_orig_ctx_len(rope_scaling.get("original_max_position_embeddings", 4096)) -@ModelBase.register("Lfm2ForCausalLM") -@ModelBase.register("LFM2ForCausalLM") +@ModelBase.register("Lfm2ForCausalLM", "LFM2ForCausalLM") class LFM2Model(TextModel): model_arch = gguf.MODEL_ARCH.LFM2 @@ -8524,6 +9420,13 @@ def set_gguf_parameters(self): self._add_feed_forward_length() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name + if is_vision_tensor: + # skip vision tensors + return [] + + name = name.replace("language_model.", "") + # conv op requires 2d tensor if "conv.conv" in name: data_torch = data_torch.squeeze(1) @@ -8531,6 +9434,110 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] +@ModelBase.register("Lfm2MoeForCausalLM") +class LFM2MoeModel(TextModel): + model_arch = gguf.MODEL_ARCH.LFM2MOE + + def set_gguf_parameters(self): + # set num_key_value_heads only for 
attention layers + self.hparams["num_key_value_heads"] = [ + self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0 + for layer_type in self.hparams["layer_types"] + ] + + super().set_gguf_parameters() + + self.gguf_writer.add_expert_count(self.hparams["num_experts"]) + self.gguf_writer.add_expert_feed_forward_length(self.hparams["moe_intermediate_size"]) + self.gguf_writer.add_leading_dense_block_count(self.hparams["num_dense_layers"]) + self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) + + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"]) + + # cache for experts weights for merging + _experts_cache: dict[int, dict[str, Tensor]] = {} + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # conv op requires 2d tensor + if "conv.conv" in name: + data_torch = data_torch.squeeze(1) + + if name.endswith(".expert_bias"): + name = name.replace(".expert_bias", ".expert_bias.bias") + + # merge expert weights + if "experts" in name: + n_experts = self.hparams["num_experts"] + assert bid is not None + + expert_cache = self._experts_cache.setdefault(bid, {}) + expert_cache[name] = data_torch + expert_weights = ["w1", "w2", "w3"] + + # not enough expert weights to merge + if len(expert_cache) < n_experts * len(expert_weights): + return [] + + tensors: list[tuple[str, Tensor]] = [] + for w_name in expert_weights: + data: list[Tensor] = [] + + for xid in range(n_experts): + ename = f"model.layers.{bid}.feed_forward.experts.{xid}.{w_name}.weight" + data.append(expert_cache[ename]) + del expert_cache[ename] + + data_torch = torch.stack(data, dim=0) + merged_name = f"layers.{bid}.feed_forward.experts.{w_name}.weight" + new_name = self.map_tensor_name(merged_name) + tensors.append((new_name, data_torch)) + + del self._experts_cache[bid] + return tensors + + return [(self.map_tensor_name(name), data_torch)] + + def prepare_tensors(self): + super().prepare_tensors() + assert not self._experts_cache + + +@ModelBase.register("Lfm2VlForConditionalGeneration") +class LFM2VLModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + # TODO(tarek): for dynamic resolution image_size is not specified, setting here for compatibility + self.hparams_vision["image_size"] = 256 + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LFM2) + self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["layer_norm_eps"])) + self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2)) + self.gguf_writer.add_vision_use_gelu(True) + # python notation, e.g. for vision_feature_layer == -1, we pick last layer -> vision_feature_layers_to_drop = 0 + vision_feature_layers_to_drop = -(self.global_config.get("vision_feature_layer", -1) + 1) + self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name + + if is_vision_tensor: + # remove "model." 
prefix + name = name.replace("model.vision_tower.", "vision_tower.") + name = name.replace("model.multi_modal_projector.", "multi_modal_projector.") + + if "patch_embedding.weight" in name: + data_torch = data_torch.view(data_torch.shape[0], 16, 16, 3).permute(0, 3, 1, 2) + + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + @ModelBase.register("SmallThinkerForCausalLM") class SmallThinkerModel(TextModel): model_arch = gguf.MODEL_ARCH.SMALLTHINKER @@ -8618,6 +9625,43 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("ApertusForCausalLM") +class ApertusModel(LlamaModel): + model_arch = gguf.MODEL_ARCH.APERTUS + undo_permute = False + + _alpha_n = {} + _alpha_p = {} + _beta = {} + _eps = {} + + def modify_tensors(self, data_torch, name, bid): + # Handle xIELU activation parameters + n_layers = self.hparams["num_hidden_layers"] + if name.endswith(".act_fn.alpha_n"): + self._alpha_n[bid] = data_torch.to("cpu").float().item() + if len(self._alpha_n) == n_layers: + self.gguf_writer.add_xielu_alpha_n([self._alpha_n[k] for k in sorted(self._alpha_n)]) + return [] + if name.endswith(".act_fn.alpha_p"): + self._alpha_p[bid] = data_torch.to("cpu").float().item() + if len(self._alpha_p) == n_layers: + self.gguf_writer.add_xielu_alpha_p([self._alpha_p[k] for k in sorted(self._alpha_p)]) + return [] + if name.endswith(".act_fn.beta"): + self._beta[bid] = data_torch.to("cpu").float().item() + if len(self._beta) == n_layers: + self.gguf_writer.add_xielu_beta([self._beta[k] for k in sorted(self._beta)]) + return [] + if name.endswith(".act_fn.eps"): + self._eps[bid] = data_torch.to("cpu").float().item() + if len(self._eps) == n_layers: + self.gguf_writer.add_xielu_eps([self._eps[k] for k in sorted(self._eps)]) + return [] + + return super().modify_tensors(data_torch, name, bid) + + class MistralModel(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA model_name = "Mistral" @@ -8626,12 +9670,10 @@ class MistralModel(LlamaModel): undo_permute = False @staticmethod - def get_community_chat_template(vocab, templates_dir: Path): - from mistral_common.tokens.tokenizers.base import TokenizerVersion # pylint: disable=E0401 - from mistral_common.tokens.tokenizers.sentencepiece import SentencePieceTokenizer # pylint: disable=E0401 - from mistral_common.tokens.tokenizers.tekken import Tekkenizer # pylint: disable=E0401 - - assert TokenizerVersion is not None, "mistral_common is not installed" + def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mistral_format: bool): + assert ( + TokenizerVersion is not None and Tekkenizer is not None and SentencePieceTokenizer is not None + ), _mistral_import_error_msg assert isinstance( vocab.tokenizer, (Tekkenizer, SentencePieceTokenizer) ), f"Expected Tekkenizer or SentencePieceTokenizer, got {type(vocab.tokenizer)}" @@ -8651,7 +9693,13 @@ def get_community_chat_template(vocab, templates_dir: Path): elif vocab.tokenizer.version == TokenizerVersion.v13: template_file = "unsloth-mistral-Devstral-Small-2507.jinja" else: - raise ValueError(f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}") + err_message = f"Unknown tokenizer type: {vocab.tokenizer_type} and version {vocab.tokenizer.version}" + if is_mistral_format: + err_message += ( + " . Please pass --disable-mistral-community-chat-template argument to the CLI " + "if you want to skip this error and use the Mistral official `mistral-common` pre-processing library." 
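+                # (Unknown tokenizer versions have no community template; with that flag the converter
+                # relies on `mistral-common` for tokenization instead of failing here.)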
+ ) + raise ValueError(err_message) template_path = templates_dir / template_file if not template_path.exists(): @@ -8689,6 +9737,58 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", " return super().map_tensor_name(name, try_suffixes) +@ModelBase.register("LightOnOCRForConditionalGeneration") +class LightOnOCRVisionModel(LlavaVisionModel): + is_mistral_format = False + use_break_tok = False + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.LIGHTONOCR) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): + name = name.replace("model.vision_encoder.", "vision_tower.") + name = name.replace("model.vision_projection.", "multi_modal_projector.") + return super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("KimiVLForConditionalGeneration") +class KimiVLModel(MmprojModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + assert self.hparams_vision is not None + self.hparams_vision["image_size"] = 64 * 14 # for compatibility + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.KIMIVL) + self.gguf_writer.add_vision_use_gelu(True) + self.gguf_writer.add_vision_projector_scale_factor(2) + # eps is the same as pytorch's default value + assert self.hparams_vision is not None + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-5)) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + is_vision_tensor = "vision_tower" in name or "multi_modal_projector" in name + + if is_vision_tensor: + if "pos_emb.weight" in name: + data_torch = data_torch.view(data_torch.shape[0] * data_torch.shape[1], data_torch.shape[2]) + elif "wqkv" in name: + split_dim = 0 if "weight" in name else -1 + wq, wk, wv = data_torch.chunk(3, dim=split_dim) + return [ + (self.map_tensor_name(name.replace("wqkv", "wq")), wq), + (self.map_tensor_name(name.replace("wqkv", "wk")), wk), + (self.map_tensor_name(name.replace("wqkv", "wv")), wv), + ] + + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + ###### CONVERSION LOGIC ###### @@ -8706,6 +9806,9 @@ class LazyTorchTensor(gguf.LazyBase): torch.uint8: np.uint8, } + # used for safetensors slices + # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 + # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 _dtype_str_map: dict[str, torch.dtype] = { "F64": torch.float64, "F32": torch.float32, @@ -8740,7 +9843,11 @@ def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) - def from_safetensors_slice(cls, st_slice: Any) -> Tensor: dtype = cls._dtype_str_map[st_slice.get_dtype()] shape: tuple[int, ...] = tuple(st_slice.get_shape()) - lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:]) + lazy = cls( + meta=cls.meta_with_dtype_and_shape(dtype, shape), + args=(st_slice,), + func=lambda s: s[...] 
if len(s.get_shape()) == 0 else s[:], + ) return cast(torch.Tensor, lazy) @classmethod @@ -8783,9 +9890,7 @@ def parse_args() -> argparse.Namespace: type=str, choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], default="f16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, " - "q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity " - "16-bit float type depending on the first loaded tensor type", + help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", ) parser.add_argument( "--bigendian", @@ -8846,23 +9951,36 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--remote", action="store_true", - help="(Experimental) Read safetensors file remotely without downloading to disk. " - "Config and tokenizer files will still be downloaded. To use this feature, " - "you need to specify Hugging Face model repo name instead of a local directory. " - "For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, " - "set HF_TOKEN environment variable to your Hugging Face token.", + help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.", ) parser.add_argument( "--mmproj", action="store_true", - help="(Experimental) Export multimodal projector (mmproj) for vision models. " - "This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.", + help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.", ) parser.add_argument( "--mistral-format", action="store_true", help="Whether the model is stored following the Mistral format.", ) + parser.add_argument( + "--disable-mistral-community-chat-template", + action="store_true", + help=( + "Whether to disable usage of Mistral community chat templates. If set, use the Mistral official `mistral-common` library for tokenization and detokenization of Mistral models. " + "Using `mistral-common` ensure correctness and zero-day support of tokenization for models converted from the Mistral format but requires to manually setup the tokenization server." + ), + ) + + parser.add_argument( + "--sentence-transformers-dense-modules", + action="store_true", + help=( + "Whether to include sentence-transformers dense modules." + "It can be used for sentence-transformers models, like google/embeddinggemma-300m" + "Default these modules are not included." 
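+            # Illustrative: with this flag, converting e.g. google/embeddinggemma-300m also exports the
+            # Dense projection weights from the module folders listed in modules.json (such as "2_Dense").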
+ ), + ) args = parser.parse_args() if not args.print_supported_models and args.model is None: @@ -8927,9 +10045,11 @@ def main() -> None: hf_repo_id = args.model from huggingface_hub import snapshot_download - local_dir = snapshot_download( - repo_id=hf_repo_id, allow_patterns=["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"] - ) + allowed_patterns = ["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"] + if args.sentence_transformers_dense_modules: + # include sentence-transformers dense modules safetensors files + allowed_patterns.append("*.safetensors") + local_dir = snapshot_download(repo_id=hf_repo_id, allow_patterns=allowed_patterns) dir_model = Path(local_dir) logger.info(f"Downloaded config and tokenizer to {local_dir}") else: @@ -8965,11 +10085,10 @@ def main() -> None: logger.info(f"Loading model: {dir_model.name}") - if args.mmproj: - if "mmproj" not in fname_out.name: - fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-") - is_mistral_format = args.mistral_format + if is_mistral_format and not _mistral_common_installed: + raise ImportError(_mistral_import_error_msg) + disable_mistral_community_chat_template = args.disable_mistral_community_chat_template with torch.inference_mode(): output_type = ftype_map[args.outtype] @@ -9003,6 +10122,8 @@ def main() -> None: dry_run=args.dry_run, small_first_shard=args.no_tensor_first_split, remote_hf_model_id=hf_repo_id, + disable_mistral_community_chat_template=disable_mistral_community_chat_template, + sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, ) if args.vocab_only: diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 88a73095b..2e9d76ead 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -98,9 +98,12 @@ def create_model_class( raise TypeError(f"{output_type} type is not supported") output_type = FTYPE_MAP.get(output_type.lower()) + hparams = convert_hf_to_gguf.ModelBase.load_hparams(Path(tmp_work_dir), "mistral" in model.config.model_type) + hparams.pop("quantization_config") model_instance = model_class( dir_model=Path(tmp_work_dir), ftype=output_type, + hparams=hparams, fname_out=Path(output_dir), is_big_endian=False, model_name=model_name, diff --git a/auto_round/utils.py b/auto_round/utils.py index afb7b2940..8897dfaf3 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -80,7 +80,7 @@ def __getitem__(self, key): SUPPORTED_LAYER_TYPES = (torch.nn.Linear, transformers.pytorch_utils.Conv1D) # Changed to str as it relies on triton or others lib to load this -INNER_SUPPORTED_LAYER_TYPES = ("FP8Linear",) +INNER_SUPPORTED_LAYER_TYPES = ("FP8Linear", "CompressedLinear") # transformers.integrations.finegrained_fp8.FP8Linear if deepspeed_exists: from deepspeed.module_inject import LinearAllreduce, LinearLayer @@ -1388,7 +1388,7 @@ def _is_fp8_model(model: torch.nn.Module) -> bool: def _is_fp8_linear(module: torch.nn.Module) -> bool: if hasattr(module, "is_fp8_linear"): return module.is_fp8_linear - if not (type(module) == torch.nn.Linear or module.__class__.__name__ == "FP8Linear"): + if type(module) != torch.nn.Linear and module.__class__.__name__ not in ["FP8Linear", "CompressedLinear"]: return False if module.weight is None: return False @@ -2340,7 +2340,7 @@ def convert_fp8_model_to_16b_model(model, dtype=torch.bfloat16): """ cnt = 0 for n, m in model.named_modules(): - if m.__class__.__name__ == "FP8Linear": + if m.__class__.__name__ in ["FP8Linear", 
"CompressedLinear"]: new_module = convert_fp8_layer_to_linear(m, dtype=dtype) set_module(model, n, new_module) cnt += 1 From 2783507c1aa065b41907eb7461d7b9f386214623 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 27 Oct 2025 20:55:52 -0400 Subject: [PATCH 2/5] fix Signed-off-by: n1ck-guo --- auto_round/export/export_to_gguf/export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/export.py b/auto_round/export/export_to_gguf/export.py index 2e9d76ead..526056263 100644 --- a/auto_round/export/export_to_gguf/export.py +++ b/auto_round/export/export_to_gguf/export.py @@ -99,7 +99,7 @@ def create_model_class( output_type = FTYPE_MAP.get(output_type.lower()) hparams = convert_hf_to_gguf.ModelBase.load_hparams(Path(tmp_work_dir), "mistral" in model.config.model_type) - hparams.pop("quantization_config") + hparams.pop("quantization_config", None) model_instance = model_class( dir_model=Path(tmp_work_dir), ftype=output_type, From 11da716e7c26c6969141f9752f1f3800aa9e5466 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 27 Oct 2025 21:16:17 -0400 Subject: [PATCH 3/5] update Signed-off-by: n1ck-guo --- auto_round/export/export_to_gguf/convert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/export/export_to_gguf/convert.py b/auto_round/export/export_to_gguf/convert.py index 91f206667..370623a58 100644 --- a/auto_round/export/export_to_gguf/convert.py +++ b/auto_round/export/export_to_gguf/convert.py @@ -401,7 +401,7 @@ def prepare_tensors(cls): skip = False for tensor_info in cls.gguf_writer.tensors: if new_name in tensor_info: - print("new_name already add to gguf_writer, skip") + logger.warning(f"{new_name} already add to gguf_writer, skip") skip = True break if skip: From 63944b11fde019d3cf7423ae5a0ece5177d7c6a0 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Mon, 27 Oct 2025 23:18:53 -0400 Subject: [PATCH 4/5] fix Signed-off-by: n1ck-guo --- auto_round/__main__.py | 2 +- .../export_to_gguf/convert_hf_to_gguf.py | 341 ++---------------- test/test_cpu/test_gguf_format.py | 4 +- 3 files changed, 39 insertions(+), 308 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index a3ab6f2e5..c403ee863 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -789,7 +789,7 @@ def run_eval(): if args.eval_task_by_task: eval_task_by_task( model=args.model, - device=args.device, + device=args.device_map, tasks=args.tasks, batch_size=args.eval_bs, trust_remote_code=not args.disable_trust_remote_code, diff --git a/auto_round/export/export_to_gguf/convert_hf_to_gguf.py b/auto_round/export/export_to_gguf/convert_hf_to_gguf.py index e81f9593a..0ce6a7cec 100644 --- a/auto_round/export/export_to_gguf/convert_hf_to_gguf.py +++ b/auto_round/export/export_to_gguf/convert_hf_to_gguf.py @@ -41,8 +41,8 @@ if "NO_LOCAL_GGUF" not in os.environ: sys.path.insert(1, str(Path(__file__).parent / "gguf-py")) -import gguf -from gguf.vocab import MistralTokenizerType, MistralVocab +import gguf # pylint: disable=E0401 +from gguf.vocab import MistralTokenizerType, MistralVocab # pylint: disable=E0401 try: from mistral_common.tokens.tokenizers.base import TokenizerVersion # pyright: ignore[reportMissingImports] @@ -889,7 +889,8 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) if previous_token != token: logger.info( - f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer" + 
f"{repr(previous_token)} is encoded and decoded back to {repr(token)} " + f"using AutoTokenizer" ) if added_tokens_decoder[i].special or self.does_token_look_special(token): @@ -897,7 +898,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]: else: # NOTE: this was added for Gemma. # Encoding and decoding the tokens above isn't sufficient for this case. - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") toktypes.append(gguf.TokenType.USER_DEFINED) else: toktypes.append(gguf.TokenType.NORMAL) @@ -915,7 +916,7 @@ def get_vocab_base_pre(self, tokenizer) -> str: # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can # use in llama.cpp to implement the same pre-tokenizer - chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL" + chktxt = "\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶\u200d🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````\"\"\"\"......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL" # pylint: disable=C0301 chktok = tokenizer.encode(chktxt) chkhsh = sha256(str(chktok).encode()).hexdigest() @@ -1312,7 +1313,7 @@ def _create_vocab_sentencepiece(self): if token_data.get("special") or self.does_token_look_special(token): toktypes[token_id] = SentencePieceTokenTypes.CONTROL else: - token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") # pre-normalize user-defined spaces + token = token.replace(b"\xe2\x96\x81".decode("utf-8"), " ") toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED scores[token_id] = -1000.0 @@ -1498,7 +1499,8 @@ def _set_vocab_interns1(self): token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False)) if previous_token != token: logger.info( - f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer" + f"{repr(previous_token)} is encoded and decoded back to {repr(token)} " + "using AutoTokenizer" ) if added_tokens_decoder[i].special or self.does_token_look_special(token): @@ -1702,9 +1704,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter tensors: list[tuple[str, Tensor]] = [] if re.match(r"gpt_neox\.layers\.\d+\.attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) data_torch = torch.cat( ( @@ -1759,9 +1758,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter tensors: list[tuple[str, Tensor]] = [] if re.match(r"h\.\d+\.self_attention\.query_key_value\.weight", name): - # Map bloom-style qkv_linear to gpt-style qkv_linear - # bloom: 
https://github.com/huggingface/transformers/blob/main/src/transformers/models/bloom/modeling_bloom.py#L238-L252 # noqa - # gpt-2: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py#L312 # noqa qkv_weights = data_torch.reshape((n_head, 3, n_embed // n_head, n_embed)) data_torch = torch.cat( ( @@ -1863,8 +1859,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) - # note: config provides rms norm but it is actually layer norm - # ref: https://huggingface.co/OrionStarAI/Orion-14B-Chat/blob/276a17221ce42beb45f66fac657a41540e71f4f5/modeling_orion.py#L570-L571 self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) @@ -2340,7 +2334,8 @@ def _set_vocab_mistral(self): self.gguf_writer.add_token_merges(vocab.extract_vocab_merges_from_model()) logger.info( - f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}, {vocab.pad_id}." + f"Setting bos, eos, unk and pad token IDs to {vocab.bos_id}, {vocab.eos_id}, {vocab.unk_id}," + f" {vocab.pad_id}." ) self.gguf_writer.add_bos_token_id(vocab.bos_id) @@ -2362,14 +2357,16 @@ def _set_vocab_mistral(self): # Log only for Mistral format that the official tokenization and detokenization is via `mistral-common`. if self.is_mistral_format: logger.info( - "Using a Mistral community chat template. These templates can be subject to errors in early days or weeks after a release. " + "Using a Mistral community chat template. These templates can be subject to " + "errors in early days or weeks after a release. " "Mistral recommends to use `mistral-common` to perform tokenization and detokenization." ) template = MistralModel.get_community_chat_template(vocab, template_dir, self.is_mistral_format) self.gguf_writer.add_chat_template(template) else: logger.info( - "Not using a Mistral community chat template. Ensure to perform the tokenization and detokenization via `mistral-common`." + "Not using a Mistral community chat template. Ensure to perform the tokenization and" + " detokenization via `mistral-common`." 
) def set_vocab(self): @@ -2980,9 +2977,6 @@ def weight_quant(self, weight: Tensor) -> Tensor: weight = weight.float() scale = weight.abs().mean().clamp(min=1e-5) iscale = 1 / scale - # TODO: multiply by the scale directly instead of inverting it twice - # (this is also unnecessarily doubly inverted upstream) - # ref: https://huggingface.co/1bitLLM/bitnet_b1_58-3B/blob/af89e318d78a70802061246bf037199d2fb97020/utils_quant.py#L10 result = (weight * iscale).round().clamp(-1, 1) / iscale return result.type(dtype) @@ -3018,7 +3012,8 @@ def set_vocab(self): if not (self.dir_model / "tokenizer.json").is_file() or not (self.dir_model / "chat_template.jinja").is_file(): logger.error( - "Error: Missing vocab and chat template, download files from https://huggingface.co/alvarobartt/grok-2-tokenizer" + "Error: Missing vocab and chat template, download files from" + " https://huggingface.co/alvarobartt/grok-2-tokenizer" ) sys.exit(1) @@ -3870,8 +3865,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter # transpose conv1 and conv2 bias data_torch = data_torch.unsqueeze(-1) if "audio_bos_eos_token" in name: - # this tensor is left unused in transformers code - # https://github.com/huggingface/transformers/blob/6e3063422c4b1c014aa60c32b9254fd2902f0f28/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py#L1809 return [] return [(self.map_tensor_name(name), data_torch)] @@ -4007,7 +4000,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_expert_shared_feed_forward_length(shared_expert_intermediate_size) logger.info(f"gguf: expert shared feed forward length = {shared_expert_intermediate_size}") # YaRN is not enabled by default - # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts rope_scaling = self.hparams.get("rope_scaling") or {} if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) @@ -4128,8 +4120,8 @@ def set_gguf_parameters(self): [ { "name": "rerank", - "template": '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n' - "<|im_start|>user\n: Given a web search query, retrieve relevant passages that answer the query\n: {query}\n: {document}<|im_end|>\n" + "template": '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. 
Note that the answer can only be "yes" or "no".<|im_end|>\n' # pylint: disable=C0301 + "<|im_start|>user\n: Given a web search query, retrieve relevant passages that answer the query\n: {query}\n: {document}<|im_end|>\n" # pylint: disable=C0301 "<|im_start|>assistant\n\n\n\n\n", } ] @@ -4313,7 +4305,8 @@ def set_vocab(self): if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' + f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} ' + f' -> {token.decode("utf-8")!r}' ) tokens[token_id] = token scores[token_id] = -1000.0 @@ -4332,7 +4325,8 @@ def set_vocab(self): if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' + f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} ' + f'-> {token.decode("utf-8")!r}' ) tokens[token_id] = token scores[token_id] = -1000.0 @@ -4414,7 +4408,8 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: if len(long_factors) != len(short_factors) or len(long_factors) != rope_dims / 2: raise ValueError( - f"The length of rope long and short factors must be {rope_dims / 2}. long_factors = {len(long_factors)}, short_factors = {len(short_factors)}." + f"The length of rope long and short factors must be {rope_dims / 2}. " + "long_factors = {len(long_factors)}, short_factors = {len(short_factors)}." ) yield ( @@ -4811,7 +4806,8 @@ def set_vocab(self): if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' + f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} ' + f'-> {token.decode("utf-8")!r}' ) tokens[token_id] = token scores[token_id] = -1000.0 @@ -4833,7 +4829,8 @@ def set_vocab(self): if toktypes[token_id] != SentencePieceTokenTypes.UNUSED: if tokens[token_id] != token: logger.warning( - f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} -> {token.decode("utf-8")!r}' + f'replacing token {token_id}: {tokens[token_id].decode("utf-8")!r} ' + f'-> {token.decode("utf-8")!r}' ) tokens[token_id] = token scores[token_id] = -1000.0 @@ -5355,7 +5352,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) # https://huggingface.co/chandar-lab/NeoBERT#how-to-use + self.gguf_writer.add_pooling_type(gguf.PoolingType.CLS) def modify_tensors(self, data_torch, name, bid): if name.startswith("decoder."): @@ -5515,7 +5512,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") return [] - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -5560,7 +5556,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter logger.debug(f"Skipping get tensor {name!r} in safetensors so that convert can end normally.") return [] - # ref: 
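# A short sketch of how implicit string concatenation interacts with f-strings, which matters
# for the reflowed ValueError message above: only fragments carrying their own f prefix are
# interpolated, while a plain fragment keeps text like "{len(long_factors)}" literally.
# The values below are made-up examples.
long_factors, short_factors, rope_dims = [1.0] * 4, [1.0] * 3, 16

partly_literal = (
    f"The length of rope long and short factors must be {rope_dims / 2}. "
    "long_factors = {len(long_factors)}, short_factors = {len(short_factors)}."
)
fully_interpolated = (
    f"The length of rope long and short factors must be {rope_dims / 2}. "
    f"long_factors = {len(long_factors)}, short_factors = {len(short_factors)}."
)
assert "{len" in partly_literal and "{len" not in fully_interpolated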
https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -5657,7 +5652,6 @@ def __init__(self, *args, **kwargs): if mod_conf_file.is_file(): with open(mod_conf_file, encoding="utf-8") as mod_conf_json_file: mod_conf = json.load(mod_conf_json_file) - # hparams dense_2_feat_out and dense_3_feat_in are required when loading model's dense weights prefix = self._get_dense_prefix(mod_path) if mod_conf["in_features"] is not None and mod_conf["out_features"] is not None: self.dense_features_dims[prefix] = ( @@ -6247,9 +6241,6 @@ def set_gguf_parameters(self): d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 - # ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 use_dt_b_c_norm = False @@ -6417,9 +6408,6 @@ def set_gguf_parameters(self): d_conv = self.find_hparam(["mamba_d_conv"], optional=True) or 4 d_inner = self.hparams["mamba_expand"] * d_model d_state = self.find_hparam(["mamba_d_state"], optional=True) or 16 - # ceiling division - # ref: https://stackoverflow.com/a/17511341/22827863 - # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 dt_rank = self.find_hparam(["mamba_dt_rank"], optional=True) or -(d_model // -16) rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-6 n_kv_head = self.hparams["num_key_value_heads"] @@ -6693,7 +6681,6 @@ class OpenELMModel(TextModel): @staticmethod def _make_divisible(v: float | int, divisor: int) -> int: - # ref: https://huggingface.co/apple/OpenELM-270M-Instruct/blob/eb111ff2e6724348e5b905984063d4064d4bc579/configuration_openelm.py#L34-L38 new_v = max(divisor, int(v + divisor / 2) // divisor * divisor) # Make sure that round down does not go down by more than 10%. if new_v < 0.9 * v: @@ -6836,7 +6823,8 @@ def set_vocab(self): token_score = 0.0 logger.info( - f"Setting added token {token_id} to '{token_content}' (type: {token_type}, score: {token_score:.2f})" + f"Setting added token {token_id} to '{token_content}' (type: {token_type}, " + f"score: {token_score:.2f})" ) tokens[token_id] = token_content.encode("utf-8") toktypes[token_id] = token_type @@ -7370,10 +7358,6 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", - # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored - # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder - # and decoder and ignore the remaining ones. 
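# A minimal sketch of the ceiling-division idiom behind the dt_rank fallback above:
# -(a // -b) rounds up using only integer arithmetic, so a hidden size that is not a multiple
# of 16 still yields the next-larger rank. The values are made-up examples.
def ceil_div(a: int, b: int) -> int:
    return -(a // -b)

assert ceil_div(4096, 16) == 256     # exact multiple
assert ceil_div(2570, 16) == 161     # rounds up, unlike 2570 // 16 == 160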
if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: if not self.shared_token_embeddings_found: name = "shared.weight" @@ -7506,10 +7490,6 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused - # T5 based models contain shared token embeddings tensors saved randomly as either "encoder.embed_tokens.weight", - # "decoder.embed_tokens.weight" or "shared.weight" tensor. In some models there are even multiple of them stored - # in the safetensors files. We use the first tensor from these three as the token embeddings for both encoder - # and decoder and ignore the remaining ones. if name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"]: if not self.shared_token_embeddings_found: name = "shared.weight" @@ -7678,8 +7658,8 @@ def set_vocab(self): and "visible_text(m.content).endswith" in special_vocab.chat_template ): special_vocab.chat_template = special_vocab.chat_template.replace( - """{{ visible_text(m.content) }}\n{{- '/nothing' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothing")) else '' -}}""", - """{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothing' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothing")) else '' -}}""", + """{{ visible_text(m.content) }}\n{{- '/nothing' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothing")) else '' -}}""", # pylint: disable=C0301 + """{% set content = visible_text(m.content) %}{{ content }}\n{{- '/nothing' if (enable_thinking is defined and not enable_thinking and not content.endswith("/nothing")) else '' -}}""", # pylint: disable=C0301 ) special_vocab.add_to_gguf(self.gguf_writer) @@ -7966,10 +7946,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_factor(self.hparams["factor"]) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # * Adding +1 to LayerNorm's weights here to implement layernorm1p w/o changing anything on the GGML engine side - # model.layers.{l}.input_layernorm.weight - # model.layers.{l}.post_attention_layernorm.weight - # model.norm.weight if name.endswith("norm.weight"): data_torch = data_torch + 1 @@ -8599,14 +8575,10 @@ def set_gguf_parameters(self): if (moe_intermediate_size := self.hparams.get("moe_intermediate_size")) is not None: self.gguf_writer.add_expert_feed_forward_length(moe_intermediate_size) logger.info(f"gguf: expert feed forward length = {moe_intermediate_size}") - # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L299 self.gguf_writer.add_expert_chunk_feed_forward_length(self.hparams.get("head_dim") or 128) - # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L298 self.gguf_writer.add_experts_per_group(2) - # FIXME?: Hardcoded https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L376 self.gguf_writer.add_expert_group_scale(0.05) # YaRN is not enabled by default - # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts rope_scaling = self.hparams.get("rope_scaling") or {} if rope_scaling.get("rope_type", 
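# A minimal sketch (illustrative class, not the converter's) of the de-duplication applied
# above for T5-style checkpoints: the first of the three embedding aliases is exported once
# under the canonical name "shared.weight", and returning None here stands in for skipping
# the later duplicate copies.
class SharedEmbeddingDedup:
    def __init__(self):
        self.shared_token_embeddings_found = False

    def resolve(self, name: str):
        if name in ("decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "shared.weight"):
            if self.shared_token_embeddings_found:
                return None                    # duplicate copy: skip
            self.shared_token_embeddings_found = True
            return "shared.weight"             # first copy wins
        return name

dedup = SharedEmbeddingDedup()
assert dedup.resolve("encoder.embed_tokens.weight") == "shared.weight"
assert dedup.resolve("shared.weight") is None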
rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) @@ -8618,7 +8590,6 @@ def set_gguf_parameters(self): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: if name.endswith(".expert_bias"): - # FIXME?: Unused https://huggingface.co/inclusionAI/GroveMoE-Inst/blob/c4c69e5970d18907b5e6ddccdfd55176fe292df1/modeling_grove_moe.py#L303 return [] # process the experts separately @@ -8736,7 +8707,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] - # see: https://github.com/huggingface/transformers/blob/72fb02c47dbbe1999ae105319f24631cad6e2e00/src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py#L176-L203 @staticmethod def _reverse_hf_permute(data_torch, n_heads, hidden_dim): head_dim = hidden_dim // n_heads @@ -9005,8 +8975,6 @@ def set_gguf_parameters(self): # Rope rope_scaling = hparams.get("rope_scaling", {}) if rope_scaling.get("type") == "dynamic": - # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ - # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) alpha = rope_scaling.get("alpha", 1000) base = hparams.get("rope_theta", 10000.0) dim = hparams["hidden_size"] // hparams["num_attention_heads"] # 128 @@ -9018,7 +8986,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length self.gguf_writer.add_context_length(256 * 1024) # 256k context length - # if any of our assumptions about the values are wrong, something has changed and this may need to be updated assert ( alpha == 1000 and base == 10000.0 @@ -9210,8 +9177,6 @@ def set_gguf_parameters(self): # Rope rope_scaling = hparams.get("rope_scaling", {}) if rope_scaling.get("type") == "dynamic": - # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/ - # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf) alpha = rope_scaling.get("alpha", 50) base = hparams.get("rope_theta", 10000.0) dim = hparams["head_dim"] @@ -9223,7 +9188,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length self.gguf_writer.add_context_length(256 * 1024) # 256k context length - # if any of our assumptions about the values are wrong, something has changed and this may need to be updated assert base == 10000.0 and self.hparams["max_position_embeddings"] in [ 32 * 1024, 256 * 1024, @@ -9517,7 +9481,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_vision_attention_layernorm_eps(self.find_vparam(["layer_norm_eps"])) self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("downsample_factor", 2)) self.gguf_writer.add_vision_use_gelu(True) - # python notation, e.g. 
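# The HunYuan hunks above read the dynamic NTK "alpha" together with rope_theta and the head
# dimension. The exact scaled base the converter writes is not visible in this hunk, so the
# helper below uses the commonly cited NTK-aware form, base * alpha ** (dim / (dim - 2)),
# purely as an illustrative assumption.
def ntk_alpha_scaled_base(base: float, alpha: float, dim: int) -> float:
    return base * alpha ** (dim / (dim - 2))

print(ntk_alpha_scaled_base(10000.0, 1000, 128))   # roughly 1.1e7 for the defaults seen above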
for vision_feature_layer == -1, we pick last layer -> vision_feature_layers_to_drop = 0 vision_feature_layers_to_drop = -(self.global_config.get("vision_feature_layer", -1) + 1) self.gguf_writer.add_vision_block_count(self.find_vparam(self.n_block_keys) - vision_feature_layers_to_drop) @@ -9560,8 +9523,6 @@ def set_gguf_parameters(self): self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SOFTMAX) else: self.gguf_writer.add_expert_gating_func(gguf.ExpertGatingFuncType.SIGMOID) - # YaRN is not enabled by default - # To enable it, please refer to this guide: https://huggingface.co/Qwen/Qwen3-30B-A3B#processing-long-texts rope_scaling = self.hparams.get("rope_scaling") or {} if rope_scaling.get("rope_type", rope_scaling.get("type")) == "yarn" and "factor" in rope_scaling: self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) @@ -9697,7 +9658,8 @@ def get_community_chat_template(vocab: MistralVocab, templates_dir: Path, is_mis if is_mistral_format: err_message += ( " . Please pass --disable-mistral-community-chat-template argument to the CLI " - "if you want to skip this error and use the Mistral official `mistral-common` pre-processing library." + "if you want to skip this error and use the Mistral official `mistral-common` " + "pre-processing library." ) raise ValueError(err_message) @@ -9806,9 +9768,6 @@ class LazyTorchTensor(gguf.LazyBase): torch.uint8: np.uint8, } - # used for safetensors slices - # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 - # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 _dtype_str_map: dict[str, torch.dtype] = { "F64": torch.float64, "F32": torch.float32, @@ -9873,121 +9832,6 @@ def __torch_function__(cls, func, types, args=(), kwargs=None): return cls._wrap_fn(func)(*args, **kwargs) -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser(description="Convert a huggingface model to a GGML compatible file") - parser.add_argument( - "--vocab-only", - action="store_true", - help="extract only the vocab", - ) - parser.add_argument( - "--outfile", - type=Path, - help="path to write to; default: based on input. 
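# A minimal sketch of the index arithmetic above: vision_feature_layer uses Python-style
# negative indexing, so -1 selects the last encoder block (drop nothing) and -2 drops the
# final block. The inputs are made-up examples.
def vision_layers_to_drop(vision_feature_layer: int) -> int:
    return -(vision_feature_layer + 1)

assert vision_layers_to_drop(-1) == 0    # keep every block
assert vision_layers_to_drop(-2) == 1    # drop the last block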
{ftype} will be replaced by the outtype.", - ) - parser.add_argument( - "--outtype", - type=str, - choices=["f32", "f16", "bf16", "q8_0", "tq1_0", "tq2_0", "auto"], - default="f16", - help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, q8_0 for Q8_0, tq1_0 or tq2_0 for ternary, and auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type", - ) - parser.add_argument( - "--bigendian", - action="store_true", - help="model is executed on big endian machine", - ) - parser.add_argument( - "model", - type=str, - help="directory containing model file or huggingface repository ID (if --remote)", - nargs="?", - ) - parser.add_argument( - "--use-temp-file", - action="store_true", - help="use the tempfile library while processing (helpful when running out of memory, process killed)", - ) - parser.add_argument( - "--no-lazy", - action="store_true", - help="use more RAM by computing all outputs before writing (use in case lazy evaluation is broken)", - ) - parser.add_argument( - "--model-name", - type=str, - default=None, - help="name of the model", - ) - parser.add_argument( - "--verbose", - action="store_true", - help="increase output verbosity", - ) - parser.add_argument( - "--split-max-tensors", - type=int, - default=0, - help="max tensors in each split", - ) - parser.add_argument( - "--split-max-size", - type=str, - default="0", - help="max size per split N(M|G)", - ) - parser.add_argument( - "--dry-run", - action="store_true", - help="only print out a split plan and exit, without writing any new files", - ) - parser.add_argument( - "--no-tensor-first-split", - action="store_true", - help="do not add tensors to the first split (disabled by default)", - ) - parser.add_argument("--metadata", type=Path, help="Specify the path for an authorship metadata override file") - parser.add_argument("--print-supported-models", action="store_true", help="Print the supported models") - parser.add_argument( - "--remote", - action="store_true", - help="(Experimental) Read safetensors file remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.", - ) - parser.add_argument( - "--mmproj", - action="store_true", - help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.", - ) - parser.add_argument( - "--mistral-format", - action="store_true", - help="Whether the model is stored following the Mistral format.", - ) - parser.add_argument( - "--disable-mistral-community-chat-template", - action="store_true", - help=( - "Whether to disable usage of Mistral community chat templates. If set, use the Mistral official `mistral-common` library for tokenization and detokenization of Mistral models. " - "Using `mistral-common` ensure correctness and zero-day support of tokenization for models converted from the Mistral format but requires to manually setup the tokenization server." - ), - ) - - parser.add_argument( - "--sentence-transformers-dense-modules", - action="store_true", - help=( - "Whether to include sentence-transformers dense modules." - "It can be used for sentence-transformers models, like google/embeddinggemma-300m" - "Default these modules are not included." 
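# The --outfile help above mentions a "{ftype}" placeholder in the output path. The helper
# that performs the substitution is not shown in this hunk, so the snippet below only
# illustrates the idea with plain str.format(); the file name is a made-up example.
fname_out = "SmolLM2-1.7B-Instruct-{ftype}.gguf"
print(fname_out.format(ftype="Q8_0"))    # SmolLM2-1.7B-Instruct-Q8_0.gguf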
- ), - ) - - args = parser.parse_args() - if not args.print_supported_models and args.model is None: - parser.error("the following arguments are required: model") - return args - - def split_str_to_n_bytes(split_str: str) -> int: if split_str.endswith("K"): n = int(split_str[:-1]) * 1000 @@ -10026,116 +9870,3 @@ def get_model_architecture(hparams: dict[str, Any], model_type: ModelType) -> st if arch is None: raise ValueError("Failed to detect model architecture") return arch - - -def main() -> None: - args = parse_args() - - if args.print_supported_models: - logger.error("Supported models:") - ModelBase.print_registered_models() - sys.exit(0) - - if args.verbose: - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - - if args.remote: - hf_repo_id = args.model - from huggingface_hub import snapshot_download - - allowed_patterns = ["LICENSE", "*.json", "*.md", "*.txt", "tokenizer.model"] - if args.sentence_transformers_dense_modules: - # include sentence-transformers dense modules safetensors files - allowed_patterns.append("*.safetensors") - local_dir = snapshot_download(repo_id=hf_repo_id, allow_patterns=allowed_patterns) - dir_model = Path(local_dir) - logger.info(f"Downloaded config and tokenizer to {local_dir}") - else: - hf_repo_id = None - dir_model = Path(args.model) - - if not dir_model.is_dir(): - logger.error(f"Error: {dir_model} is not a directory") - sys.exit(1) - - ftype_map: dict[str, gguf.LlamaFileType] = { - "f32": gguf.LlamaFileType.ALL_F32, - "f16": gguf.LlamaFileType.MOSTLY_F16, - "bf16": gguf.LlamaFileType.MOSTLY_BF16, - "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - "tq1_0": gguf.LlamaFileType.MOSTLY_TQ1_0, - "tq2_0": gguf.LlamaFileType.MOSTLY_TQ2_0, - "auto": gguf.LlamaFileType.GUESSED, - } - - is_split = args.split_max_tensors > 0 or args.split_max_size != "0" - if args.use_temp_file and is_split: - logger.error("Error: Cannot use temp file when splitting") - sys.exit(1) - - if args.outfile is not None: - fname_out = args.outfile - elif hf_repo_id: - # if remote, use the model ID as the output file name - fname_out = Path("./" + hf_repo_id.replace("/", "-") + "-{ftype}.gguf") - else: - fname_out = dir_model - - logger.info(f"Loading model: {dir_model.name}") - - is_mistral_format = args.mistral_format - if is_mistral_format and not _mistral_common_installed: - raise ImportError(_mistral_import_error_msg) - disable_mistral_community_chat_template = args.disable_mistral_community_chat_template - - with torch.inference_mode(): - output_type = ftype_map[args.outtype] - model_type = ModelType.MMPROJ if args.mmproj else ModelType.TEXT - hparams = ModelBase.load_hparams(dir_model, is_mistral_format) - if not is_mistral_format: - model_architecture = get_model_architecture(hparams, model_type) - logger.info(f"Model architecture: {model_architecture}") - try: - model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) - except NotImplementedError: - logger.error(f"Model {model_architecture} is not supported") - sys.exit(1) - elif args.mmproj: - assert hparams.get("vision_encoder") is not None, "This model does not support multimodal" - model_class = PixtralModel - else: - model_class = MistralModel - - model_instance = model_class( - dir_model, - output_type, - fname_out, - is_big_endian=args.bigendian, - use_temp_file=args.use_temp_file, - eager=args.no_lazy, - metadata_override=args.metadata, - model_name=args.model_name, - split_max_tensors=args.split_max_tensors, - 
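# A sketch of the split-size parsing whose "K" branch is visible above; the M and G branches
# follow the "N(M|G)" convention from the removed --split-max-size help and reuse the same
# decimal multipliers, but they are assumptions rather than the patch's own code.
def parse_split_size(split_str: str) -> int:
    if split_str.endswith("K"):
        return int(split_str[:-1]) * 1000
    if split_str.endswith("M"):
        return int(split_str[:-1]) * 1000 * 1000
    if split_str.endswith("G"):
        return int(split_str[:-1]) * 1000 * 1000 * 1000
    if split_str.isnumeric():
        return int(split_str)
    raise ValueError(f"Invalid split size: {split_str}")

assert parse_split_size("2G") == 2_000_000_000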
split_max_size=split_str_to_n_bytes(args.split_max_size), - dry_run=args.dry_run, - small_first_shard=args.no_tensor_first_split, - remote_hf_model_id=hf_repo_id, - disable_mistral_community_chat_template=disable_mistral_community_chat_template, - sentence_transformers_dense_modules=args.sentence_transformers_dense_modules, - ) - - if args.vocab_only: - logger.info("Exporting model vocab...") - model_instance.write_vocab() - logger.info(f"Model vocab successfully exported to {model_instance.fname_out}") - else: - logger.info("Exporting model...") - model_instance.write() - out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out - logger.info(f"Model successfully exported to {out_path}") - - -if __name__ == "__main__": - main() diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index c366e264a..3a5cb3d43 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -329,9 +329,9 @@ def test_vlm_gguf(self): for file_name in os.listdir(quantized_model_path): file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2 if file_name == "mmproj-model.gguf": - self.assertAlmostEqual(file_size, 2535, delta=1.0) + self.assertAlmostEqual(file_size, 2537, delta=5.0) else: - self.assertAlmostEqual(file_size, 892, delta=1.0) + self.assertAlmostEqual(file_size, 892, delta=5.0) shutil.rmtree("./saved", ignore_errors=True) def test_qtype_setting(self): From 0d553dde2812622cb8dcba3be638f1d9f370d4d3 Mon Sep 17 00:00:00 2001 From: n1ck-guo Date: Tue, 28 Oct 2025 01:47:27 -0400 Subject: [PATCH 5/5] codescan Signed-off-by: n1ck-guo --- auto_round/export/export_to_gguf/convert_hf_to_gguf.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/auto_round/export/export_to_gguf/convert_hf_to_gguf.py b/auto_round/export/export_to_gguf/convert_hf_to_gguf.py index 0ce6a7cec..c7c327f1b 100644 --- a/auto_round/export/export_to_gguf/convert_hf_to_gguf.py +++ b/auto_round/export/export_to_gguf/convert_hf_to_gguf.py @@ -7784,8 +7784,6 @@ def set_vocab_chatglm3(self): text = piece.encode("utf-8") score = 0.0 - # Referencing the tokenizer Python implementation(https://huggingface.co/THUDM/chatglm3-6b/blob/main/tokenization_chatglm.py), - # it is only valid if it is less than tokenizer.tokenizer.sp_model.vocab_size() if len(piece) != 0 and token_id < tokenizer.tokenizer.sp_model.vocab_size(): score = tokenizer.tokenizer.sp_model.get_score(token_id) @@ -8722,7 +8720,8 @@ class UltravoxModel(TextModel): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) raise NotImplementedError( - "Ultravox does not have text decoder. Instead, it uses Llama or other models for text. If you want to get the audio encoder, please use --mmproj argument" + "Ultravox does not have text decoder. Instead, it uses Llama or other models for text." + " If you want to get the audio encoder, please use --mmproj argument" )
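# A small sketch of the tolerance used in the updated size checks in the test hunk above:
# assertAlmostEqual with delta passes when abs(first - second) <= delta, so delta=5.0 accepts
# the ~2 MB growth of mmproj-model.gguf. The measurement below is a made-up example.
import unittest

check = unittest.TestCase()
file_size = 2535.8                                    # MB, made-up
check.assertAlmostEqual(file_size, 2537, delta=5.0)   # equivalent to abs(file_size - 2537) <= 5.0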