Merged
Changes from all commits
22 commits
ac0ad86
add transformers class
Kaihui-intel Jan 21, 2025
097128a
add ut
Kaihui-intel Feb 9, 2025
634b14f
rm breakpoint
Kaihui-intel Feb 10, 2025
d249af4
modify ut name
Kaihui-intel Feb 10, 2025
f46e72e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2025
406f93b
add Mllama/Llava class
Kaihui-intel Feb 10, 2025
4763dff
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2025
d1f6463
Merge branch 'kaihui/vllm' of https://github.com/intel/neural-compres…
Kaihui-intel Feb 10, 2025
cf9ad2f
clean code
Kaihui-intel Feb 10, 2025
56e2caf
fix auto_round.export & trust_remote_code
Kaihui-intel Feb 10, 2025
d12a9f5
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2025
ba46b21
add quant_nontext_module
Kaihui-intel Feb 10, 2025
82c00f1
Merge branch 'kaihui/vllm' of https://github.com/intel/neural-compres…
Kaihui-intel Feb 10, 2025
1383165
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2025
b25f8f5
add torchvision into ut req
Kaihui-intel Feb 10, 2025
a57af91
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 10, 2025
b239521
disable phi3 ut
Kaihui-intel Feb 10, 2025
daec2cd
Merge branch 'kaihui/vllm' of https://github.com/intel/neural-compres…
Kaihui-intel Feb 10, 2025
7aeeeca
reduce vlm ut
Kaihui-intel Feb 10, 2025
5a33c16
remove specific index-url for torch installation to avoid conflict
XuehaoSun Feb 11, 2025
95bac7c
update torch installation
XuehaoSun Feb 11, 2025
9cd265e
fix torchvision==0.20.1
XuehaoSun Feb 11, 2025
1 change: 1 addition & 0 deletions .azure-pipelines/scripts/ut/3x/run_3x_pt.sh
@@ -13,6 +13,7 @@ echo "##[section]import check pass"
echo "##[group]set up UT env..."
export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
pip install -r /neural-compressor/test/3x/torch/requirements.txt
pip install torch==2.5.1 torchvision==0.20.1 # For auto-round
pip install pytest-cov
pip install pytest-html
echo "##[endgroup]"
10 changes: 6 additions & 4 deletions neural_compressor/torch/algorithms/weight_only/autoround.py
@@ -84,7 +84,7 @@ def __init__(
enable_torch_compile: bool = None,
# mllm
is_mllm: bool = False,
quant_nontext_module: Union[str, list] = None,
quant_nontext_module: bool = False,
extra_data_dir: str = None,
image_processor=None,
processor=None,
@@ -150,7 +150,7 @@ def __init__(
act_dynamic (bool): Whether to use dynamic activation quantization. Default is True.
enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning.
enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer, torch>=2.6 True.
quant_nontext_module (Union[str, list]): Whether to quantize nontext module.
quant_nontext_module (bool): Whether to quantize nontext module.
is_mllm (bool): Indicates whether the model to be quantized is a multi-modal model (MLLM).
extra_data_dir (str): The path for extra data such as images, audio or videos.
processor (transformers.AutoProcessor): Any multi-modal model will require an object to encode or
@@ -383,7 +383,9 @@ def get_mllm_dataloader(
template, model=model, tokenizer=tokenizer, processor=processor, image_processor=image_processor
)
dataset = template.default_dataset if dataset is None else dataset
if quant_nontext_module or (dataset in CALIB_DATASETS.keys() and not _only_text_test(model, tokenizer)):
if quant_nontext_module or (
dataset in CALIB_DATASETS.keys() and not _only_text_test(model, tokenizer, "cpu", template.model_type)
):
if quant_nontext_module:
logger.warning(
"Quantitative nontext module is not supported for plain text datasets,"
Expand All @@ -399,7 +401,7 @@ def get_mllm_dataloader(
truncation = False
gradient_accumulate_steps = batch_size * gradient_accumulate_steps
batch_size = 1

seed = 42 # The seed is fixed to 42 in transformers
seqlen = 2048 if seqlen is None else seqlen # set text only calibration default args
truncation = True if truncation is None else truncation
dataset = dataset.replace(" ", "")
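The second hunk above widens the text-only fallback in get_mllm_dataloader (it now passes the device and model type to _only_text_test); on that path the original batch size is folded into gradient_accumulate_steps and batch_size is forced to 1. Below is a minimal sketch of calling the loader with quant_nontext_module in its new bool form, mirroring the call site added in neural_compressor/transformers/quantization/utils.py further down; the checkpoint name is an illustrative assumption, not part of this PR.

from transformers import AutoProcessor, AutoTokenizer, Qwen2VLForConditionalGeneration

from neural_compressor.torch.algorithms.weight_only.autoround import get_mllm_dataloader

model_name = "Qwen/Qwen2-VL-2B-Instruct"  # assumption: any HF multi-modal checkpoint
model = Qwen2VLForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

# Returns the same tuple that convert_to_quantized_model unpacks in this PR.
dataloader, template, truncation, batch_size, gradient_accumulate_steps, seqlen, nsamples = get_mllm_dataloader(
    template=None,
    model=model,
    tokenizer=tokenizer,
    processor=processor,
    image_processor=None,
    dataset="liuhaotian/llava_conv_58k",
    extra_data_dir=None,
    seqlen=32,
    batch_size=1,
    split=None,
    apply_template=None,
    truncation=False,
    nsamples=1,
    seed=42,
    gradient_accumulate_steps=1,
    quant_nontext_module=False,  # bool after this PR (was Union[str, list])
)
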
4 changes: 2 additions & 2 deletions neural_compressor/torch/quantization/config.py
@@ -950,7 +950,7 @@ def __init__(
enable_torch_compile: bool = None,
# mllm
is_mllm: bool = False,
quant_nontext_module: Union[str, list] = None,
quant_nontext_module: bool = False,
extra_data_dir: str = None,
processor=None,
image_processor=None,
Expand Down Expand Up @@ -994,7 +994,7 @@ def __init__(
export_format (str, optional): The format used for exporting the quantized model. Defaults to "itrex".
enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning.
enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer, torch>=2.6 True.
quant_nontext_module (Union[str, list]): Whether to quantize nontext module.
quant_nontext_module (bool): Whether to quantize nontext module.
extra_data_dir (str): The path for extra data such as images, audio or videos.
is_mllm (bool): Indicates whether the model to be quantized is a multi-modal model (MLLM).
processor (transformers.AutoProcessor): Any multi-modal model will require an object to encode or
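For context, this config feeds the standard prepare/calibrate/convert flow, which is the same path convert_to_quantized_model drives further down. A minimal text-only sketch, under stated assumptions: the small OPT checkpoint and the calibration loop are illustrative, and keywords such as bits/group_size follow the existing AutoRoundConfig docstring rather than this diff.

from transformers import AutoModelForCausalLM, AutoTokenizer

from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader
from neural_compressor.torch.quantization import AutoRoundConfig, convert, prepare

model_name = "facebook/opt-125m"  # assumption: any small causal LM for illustration
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

dataloader = get_dataloader(
    tokenizer=tokenizer, seqlen=32, dataset_name="NeelNanda/pile-10k", seed=42, bs=1, nsamples=2
)

quant_config = AutoRoundConfig(
    bits=4,
    group_size=128,
    iters=2,
    seqlen=32,
    nsamples=2,
    is_mllm=False,               # set True together with the MLLM dataloader for a multi-modal model
    quant_nontext_module=False,  # plain bool after this PR (was Union[str, list])
    export_format="itrex",
)

model = prepare(model=model, quant_config=quant_config)
for batch in dataloader:  # calibration forward passes
    if isinstance(batch, dict):
        model(**batch)
    else:
        model(batch)
model = convert(model)
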
1 change: 1 addition & 0 deletions neural_compressor/transformers/__init__.py
@@ -23,4 +23,5 @@
AutoModelForCausalLM,
AutoModel,
AutoModelForSeq2SeqLM,
Qwen2VLForConditionalGeneration,
)
9 changes: 8 additions & 1 deletion neural_compressor/transformers/models/__init__.py
@@ -13,4 +13,11 @@
# limitations under the License.

from .modeling_auto import _BaseINCAutoModelClass
from .modeling_auto import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from .modeling_auto import (
AutoModel,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
Qwen2VLForConditionalGeneration,
MllamaForConditionalGeneration,
LlavaForConditionalGeneration,
)
51 changes: 33 additions & 18 deletions neural_compressor/transformers/models/modeling_auto.py
@@ -354,24 +354,27 @@ def load_low_bit(cls, pretrained_model_name_or_path, *model_args, **kwargs):
else:
commit_hash = getattr(config, "_commit_hash", None)

has_remote_code = hasattr(config, "auto_map") and cls.ORIG_MODEL.__name__ in config.auto_map

has_local_code = type(config) in cls.ORIG_MODEL._model_mapping.keys()
trust_remote_code = resolve_trust_remote_code(
trust_remote_code,
pretrained_model_name_or_path,
has_local_code,
has_remote_code,
)
if has_remote_code and trust_remote_code:
class_ref = config.auto_map[cls.ORIG_MODEL.__name__]
model_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs_orig)
if os.path.isdir(pretrained_model_name_or_path):
model_class.register_for_auto_class(cls.ORIG_MODEL.__name__)
else:
cls.ORIG_MODEL.register(config.__class__, model_class, exist_ok=True)
elif type(config) in cls.ORIG_MODEL._model_mapping.keys():
model_class = _get_model_class(config, cls.ORIG_MODEL._model_mapping)
if "AutoModel" in cls.ORIG_MODEL.__name__:
has_remote_code = hasattr(config, "auto_map") and cls.ORIG_MODEL.__name__ in config.auto_map
has_local_code = type(config) in cls.ORIG_MODEL._model_mapping.keys()

trust_remote_code = resolve_trust_remote_code(
trust_remote_code,
pretrained_model_name_or_path,
has_local_code,
has_remote_code,
)
if has_remote_code and trust_remote_code:
class_ref = config.auto_map[cls.ORIG_MODEL.__name__]
model_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs_orig)
if os.path.isdir(pretrained_model_name_or_path):
model_class.register_for_auto_class(cls.ORIG_MODEL.__name__)
else:
cls.ORIG_MODEL.register(config.__class__, model_class, exist_ok=True)
elif type(config) in cls.ORIG_MODEL._model_mapping.keys():
model_class = _get_model_class(config, cls.ORIG_MODEL._model_mapping)
else:
model_class = cls.ORIG_MODEL

# This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the
# index of the files.
@@ -747,3 +750,15 @@ class AutoModel(_BaseINCAutoModelClass):

class AutoModelForSeq2SeqLM(_BaseINCAutoModelClass):
ORIG_MODEL = transformers.AutoModelForSeq2SeqLM


class Qwen2VLForConditionalGeneration(_BaseINCAutoModelClass):
ORIG_MODEL = transformers.Qwen2VLForConditionalGeneration


class MllamaForConditionalGeneration(_BaseINCAutoModelClass):
ORIG_MODEL = transformers.MllamaForConditionalGeneration


class LlavaForConditionalGeneration(_BaseINCAutoModelClass):
ORIG_MODEL = transformers.LlavaForConditionalGeneration
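With these wrappers in place, a vision-language checkpoint can be quantized through the same transformers-style entry point as the existing AutoModelForCausalLM. The following is a hypothetical sketch: the checkpoint name, the dataset keyword, and the assumption that AutoRoundConfig is re-exported from neural_compressor.transformers are illustrative rather than taken from this PR.

from neural_compressor.transformers import AutoRoundConfig, Qwen2VLForConditionalGeneration

quantization_config = AutoRoundConfig(
    bits=4,
    group_size=128,
    iters=1,
    # VLM-specific arguments introduced by this PR
    is_vlm=True,
    quant_nontext_module=False,
    truncation=True,
    gradient_accumulate_steps=1,
    export_format="itrex",
    dataset="liuhaotian/llava_conv_58k",
)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct",            # assumption: any Qwen2-VL checkpoint
    quantization_config=quantization_config,
    trust_remote_code=True,
)
model.save_pretrained("./qwen2-vl-int4-autoround")  # assumption: wrapper keeps transformers' save_pretrained

Reloading the saved low-bit model would then presumably go through the load_low_bit path patched above, which now resolves remote code only for the AutoModel* classes and falls back to cls.ORIG_MODEL for these concrete wrappers.
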
115 changes: 103 additions & 12 deletions neural_compressor/transformers/quantization/utils.py
@@ -17,6 +17,7 @@
import json
import math
import os
import re
import types

from datasets import load_dataset
@@ -33,11 +34,16 @@
convert,
prepare,
)
from neural_compressor.torch.utils import is_ipex_available
from neural_compressor.torch.utils import is_ipex_available, is_package_available

if is_ipex_available():
import intel_extension_for_pytorch as ipex

if is_package_available("auto_round"):
import auto_round
import transformers
from auto_round.export.export_to_itrex.model_wrapper import WeightOnlyLinear as auto_round_woq_linear

from typing import Union

torch = LazyImport("torch")
@@ -126,10 +132,12 @@ def _replace_linear(
if (
isinstance(module, torch.nn.Linear)
or isinstance(module, INCWeightOnlyLinear)
or (is_ipex_available() and isinstance(module, ipex.nn.utils._weight_prepack._IPEXLinear))
or (is_package_available("auto_round") and isinstance(module, auto_round_woq_linear))
) and (name not in modules_to_not_convert):
# Check if the current key is not in the `modules_to_not_convert`
if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
if not any(key in ".".join(current_key_name) for key in modules_to_not_convert) and not any(
re.match(pattern, ".".join(current_key_name)) for pattern in modules_to_not_convert
):
in_features = module.in_features
out_features = module.out_features
if device == "cpu" or device == torch.device("cpu") or device == "auto":
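The exclusion test above now skips a module if any entry of modules_to_not_convert is either a plain substring of its dotted path or matches it as a regular expression anchored at the start (re.match), which is what lets the wildcard-style patterns appended further down in convert_to_quantized_model (e.g. "model.mm_projector*") take effect. A standalone sketch of that predicate with hypothetical module names:

import re

# Hypothetical exclusion list; the regex-style entries mirror those appended for
# vision towers/projectors in convert_to_quantized_model below.
modules_to_not_convert = ["lm_head", "model.mm_projector*", "visual.merger"]


def is_excluded(module_path: str) -> bool:
    # Excluded if any entry is a plain substring of the path ...
    substring_hit = any(key in module_path for key in modules_to_not_convert)
    # ... or matches the path as a regex from the beginning (re.match).
    regex_hit = any(re.match(pattern, module_path) for pattern in modules_to_not_convert)
    return substring_hit or regex_hit


print(is_excluded("model.mm_projector.linear_1"))      # True  (regex prefix match)
print(is_excluded("visual.merger.mlp.0"))              # True  (substring match)
print(is_excluded("model.layers.0.self_attn.q_proj"))  # False (stays on the quantization path)
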
@@ -475,6 +483,54 @@ def convert_to_quantized_model(model, config, device="cpu"):
run_fn(model, *run_args)
model = convert(model)
elif config.quant_method.value == "autoround":
if config.is_vlm is True:
from transformers import AutoProcessor, AutoTokenizer

from neural_compressor.torch.algorithms.weight_only.autoround import (
get_mllm_dataloader as get_autoround_dataloader,
)

tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path)
processor = AutoProcessor.from_pretrained(model.config._name_or_path, trust_remote_code=True)
(
dataloader,
template,
config.truncation,
config.batch_size,
config.gradient_accumulate_steps,
config.seq_len,
config.n_samples,
) = get_autoround_dataloader(
template=None,
model=model,
tokenizer=tokenizer,
image_processor=None,
dataset=config.dataset,
extra_data_dir=None,
seqlen=config.seq_len,
batch_size=config.batch_size,
split=None,
apply_template=None,
truncation=False,
nsamples=config.n_samples,
seed=42,
gradient_accumulate_steps=config.gradient_accumulate_steps,
quant_nontext_module=config.quant_nontext_module,
processor=processor,
)
else:
from neural_compressor.torch.algorithms.weight_only.autoround import (
get_dataloader as get_autoround_dataloader,
)

dataloader = get_autoround_dataloader(
tokenizer=config.tokenizer,
seqlen=config.seq_len,
dataset_name=config.dataset,
seed=42,
bs=config.batch_size,
nsamples=config.n_samples,
)
quant_config = AutoRoundConfig(
dtype=dtype,
bits=config.bits,
Expand All @@ -486,24 +542,59 @@ def convert_to_quantized_model(model, config, device="cpu"):
seqlen=config.seq_len,
nsamples=config.n_samples,
iters=config.iters,
batch_size=config.batch_size,
scale_dtype=config.scale_dtype,
use_layer_wise=config.use_layer_wise,
# vlm arguments
is_mllm=config.is_vlm,
quant_nontext_module=config.quant_nontext_module,
truncation=config.truncation,
gradient_accumulate_steps=config.gradient_accumulate_steps,
export_format=config.export_format,
)

# vlm set non-text module config
if config.is_vlm is True:
from neural_compressor.torch.utils.utility import (
find_matching_blocks,
get_layer_names_in_block,
get_multimodal_block_names,
)

def set_nontext_module_config(model, to_quant_block_names, config):
all_block_list = get_multimodal_block_names(model, quant_vision=True)
all_block_set = set(tuple(block) for block in all_block_list)
quant_block_set = set(tuple(block) for block in to_quant_block_names)
set_to_full_prec = list(all_block_set - quant_block_set)
set_to_full_prec = get_layer_names_in_block(model, to_quant_block_names=set_to_full_prec)
for name in set_to_full_prec:
config.modules_to_not_convert.append(name)

# skip layers not in blocks
config.modules_to_not_convert.append("model.vision_embed_tokens.img_projection*")
config.modules_to_not_convert.append("transformer.visual.attn_pool.*_proj")
config.modules_to_not_convert.append("model.mm_projector*")
config.modules_to_not_convert.append("multi_modal_projector")
config.modules_to_not_convert.append("visual.merger")

all_blocks = get_multimodal_block_names(model, quant_config.quant_nontext_module)
to_quant_block_names = find_matching_blocks(model, all_blocks, quant_config.to_quant_block_names)
set_nontext_module_config(model, to_quant_block_names, config)

for n, m in model.named_modules():
if isinstance(m, torch.nn.Linear) or isinstance(m, transformers.modeling_utils.Conv1D):
if m.weight.shape[0] % 32 != 0 or m.weight.shape[1] % 32 != 0:
config.modules_to_not_convert.append(n)
print(
f"{n} will not be quantized due to its shape not being divisible by 32,"
" resulting in an exporting issue to autogptq"
)
if config.modules_to_not_convert != []:
for module in config.modules_to_not_convert:
module_name = ".*" + module
quant_config.set_local(module_name, AutoRoundConfig(dtype="fp32"))
logger.info(f"Do AutoRound algorithm with config {quant_config}")
from neural_compressor.torch.algorithms.weight_only.autoround import get_dataloader as get_autoround_dataloader

dataloader = get_autoround_dataloader(
tokenizer=config.tokenizer,
seqlen=config.seq_len,
dataset_name=config.dataset,
seed=42,
bs=config.batch_size,
nsamples=config.n_samples,
)
run_fn = run_fn_for_autoround
run_args = (dataloader,)
model = prepare(model=model, quant_config=quant_config)
13 changes: 13 additions & 0 deletions neural_compressor/transformers/utils/quantization_config.py
@@ -543,6 +543,12 @@ def __init__(
iters: int = 200,
use_layer_wise: bool = None,
quant_lm_head: bool = False,
# vlm arguments
is_vlm: bool = False,
quant_nontext_module: bool = False,
truncation: bool = False,
gradient_accumulate_steps: int = 1,
export_format="itrex",
**kwargs,
):

@@ -594,6 +600,13 @@ def __init__(
self.use_layer_wise = use_layer_wise
self.model_path = kwargs.get("model_path", "")

# vlm arguments
self.is_vlm = is_vlm
self.quant_nontext_module = quant_nontext_module
self.truncation = truncation
self.gradient_accumulate_steps = gradient_accumulate_steps
self.export_format = export_format

def to_diff_dict(self) -> Dict[str, Any]:
"""Removes all attributes from config which correspond to the default config attributes
for better readability and serializes to a Python dictionary.
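Assuming the class extended here is the transformers-level AutoRoundConfig and its earlier constructor arguments keep their existing defaults, the five new fields are stored as plain attributes; a small sketch of the defaults:

from neural_compressor.transformers.utils.quantization_config import AutoRoundConfig

cfg = AutoRoundConfig(is_vlm=True, quant_nontext_module=True, gradient_accumulate_steps=2)

# New attributes introduced by this PR and their fallbacks.
print(cfg.is_vlm)                     # True
print(cfg.quant_nontext_module)       # True
print(cfg.truncation)                 # False (default)
print(cfg.gradient_accumulate_steps)  # 2
print(cfg.export_format)              # "itrex" (default)
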
8 changes: 4 additions & 4 deletions test/3x/torch/quantization/weight_only/test_autoround.py
@@ -238,13 +238,13 @@ def test_mllm(self):
image_processor=None,
dataset="liuhaotian/llava_conv_58k",
extra_data_dir=None,
seqlen=512,
seqlen=32,
batch_size=1,
split=None,
apply_template=None,
truncation=False,
seed=42,
nsamples=5,
nsamples=1,
gradient_accumulate_steps=1,
quant_nontext_module=False,
processor=processor,
@@ -253,9 +253,9 @@
bits=4,
group_size=128,
is_mllm=True,
nsamples=5,
nsamples=1,
batch_size=batch_size,
iters=2,
iters=1,
seqlen=seqlen,
quant_nontext_module=False,
truncation=truncation,
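The reduced seqlen/nsamples/iters values above only keep CI time down; the test still exercises the MLLM dataloader and the AutoRound quantization path. A sketch of running just this test in isolation (repo-root-relative path assumed):

import pytest

# Select only the multi-modal AutoRound test from this file.
pytest.main(["-q", "test/3x/torch/quantization/weight_only/test_autoround.py", "-k", "test_mllm"])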