2 changes: 1 addition & 1 deletion auto_round/compressors/mllm/dataset.py
@@ -92,7 +92,7 @@ def __init__(
dataset_path = dataset_path.split("/")[-1]
dataset_name = dataset_path.split("/")[-1]
if dataset_name in self.LLAVA_DATASET:
logger.info(f"use dataset {dataset_name}, downloading ...")
logger.info(f"use dataset {dataset_name}, downloading...")
self.questions = requests.get(self.LLAVA_DATASET[dataset_name], stream=True).json()
else:
raise KeyError(f"{dataset_path} is not support, we support {self.LLAVA_DATASET.keys()}.")
37 changes: 14 additions & 23 deletions auto_round/inference/backend.py
@@ -158,7 +158,7 @@ def fp8_static_scheme_checker(
GPTQ_FORMAT_NO_ZP = ["auto_round", "auto_round:gptqmodel"]
AWQ_FORMAT = ["auto_round:auto_awq"]
LLM_COMPRESSOR_FORMAT = ["auto_round:llm_compressor"]
WOQ_DEFAULT_ACT_BITS = [16, 32]
WOQ_DEFAULT_ACT_BITS = [None, 16, 32]

BackendInfos["auto_gptq:exllamav2"] = BackendInfo(
device=["cuda"],
@@ -173,7 +173,7 @@ def fp8_static_scheme_checker(
group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048],
checkers=[exllamav2_feature_checker],
alias=["gptq", "auto_gptq", "exllamav2", "gptq:exllamav2", "auto_gptq:exllamav2"],
requirements=["torch<2.6.0", "auto-gptq>=0.7.1"],
requirements=["auto-gptq>=0.7.1"],
)

BackendInfos["auto_gptq:tritonv2"] = BackendInfo(
@@ -188,7 +188,7 @@ def fp8_static_scheme_checker(
priority=0,
checkers=[exllamav2_feature_checker],
alias=["auto_gptq:tritonv2"],
requirements=["torch<2.6.0", "auto-gptq>=0.7.1", "triton>=2.0"],
requirements=["auto-gptq>=0.7.1", "triton>=2.0"],
)

BackendInfos["auto_gptq:cuda"] = BackendInfo(
@@ -204,7 +204,6 @@ def fp8_static_scheme_checker(
act_bits=WOQ_DEFAULT_ACT_BITS,
alias=["auto_gptq:cuda"],
requirements=[
"torch<2.6.0",
"auto-gptq>=0.7.1",
],
)
@@ -374,7 +373,7 @@ def fp8_static_scheme_checker(
BackendInfos["gptqmodel:exllamav2"] = BackendInfo(
device=["cuda"],
sym=[True, False],
packing_format=GPTQ_FORMAT,
packing_format=GPTQ_FORMAT_NO_ZP,
bits=[4],
group_size=[-1, 32, 64, 128], ##16 seems has accuracy issue
compute_dtype=["float16", "bfloat16"],
@@ -534,28 +533,20 @@ def check_compatible(
- If the packing format does not match, it must be convertible.
"""
backend = BackendInfos[backend_name]
bits, group_size, sym = config["bits"], config["group_size"], config["sym"]
# Check if device is supported by the backend
if device not in backend.device:
return False

# Check if bit-width is supported
if bits not in backend.bits:
return False

# Check if group_size is valid (if required by backend)
if backend.group_size is not None and group_size not in backend.group_size:
return False

# Check if symmetric/asymmetric quantization is supported
if sym not in backend.sym:
return False

# Check if the format is convertible when packing formats differ
if packing_format in backend.packing_format:
pass
else:
return False
# Check scheme
for key, value in config.items():
backend_value = getattr(backend, key, None)
if backend_value is not None and value not in backend_value:
return False

# Check if device is supported by the backend
if device not in backend.device:
return False

for check in backend.checkers:
if not check(in_features, out_features, config):
@@ -980,7 +971,7 @@ def build_pip_commands(gptq_req, other_reqs):
commands = []

if gptq_req:
commands.append(f"pip install -v '{gptq_req}' --no-build-isolation")
commands.append(f"pip install -v {gptq_req} --no-build-isolation")
try:
require_version("numpy<2.0")
except:
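Note on the `check_compatible` change above: the per-field comparisons (bits, group_size, sym, packing_format) are replaced by one loop that looks up each config key on the `BackendInfo` entry with `getattr` and rejects the backend when the value is not in the allowed list; fields the backend does not define are left unconstrained, and the allowed lists now include `None` (see `WOQ_DEFAULT_ACT_BITS`), which appears to be so that unset values such as `act_bits=None` still pass the membership test. A minimal sketch of that pattern, using a made-up `FakeBackendInfo` dataclass rather than auto_round's real classes:

```python
# Minimal sketch of the generic scheme check; FakeBackendInfo and
# is_scheme_compatible are illustrative names, not auto_round's API.
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class FakeBackendInfo:
    device: list = field(default_factory=lambda: ["cuda"])
    bits: list = field(default_factory=lambda: [4])
    group_size: Optional[list] = field(default_factory=lambda: [-1, 32, 64, 128])
    sym: list = field(default_factory=lambda: [True, False])
    act_bits: list = field(default_factory=lambda: [None, 16, 32])

def is_scheme_compatible(backend: FakeBackendInfo, config: dict, device: str) -> bool:
    # Every key in the layer config must be allowed by the backend; fields the
    # backend does not define (or sets to None) are left unconstrained.
    for key, value in config.items():
        allowed = getattr(backend, key, None)
        if allowed is not None and value not in allowed:
            return False
    return device in backend.device

backend = FakeBackendInfo()
print(is_scheme_compatible(backend, {"bits": 4, "group_size": 128, "sym": True, "act_bits": None}, "cuda"))  # True
print(is_scheme_compatible(backend, {"bits": 8, "group_size": 128, "sym": True}, "cuda"))  # False: 8-bit not allowed
```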
2 changes: 2 additions & 0 deletions auto_round/utils.py
@@ -1190,6 +1190,8 @@ def get_layer_features(layer):
return layer.num_embeddings, layer.embedding_dim
elif deepspeed_exists and type(layer) in (LinearLayer, LinearAllreduce):
return layer.weight.shape[1], layer.weight.shape[0] # (input_dim, output_dim)
elif "FP8Linear" in layer.__class__.__name__:
return layer.in_features, layer.out_features
return None, None # Unsupported layer type


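The `get_layer_features` addition above matches FP8 linear layers by class name rather than by type, so no import of the package defining them is required. A rough illustration, with `DummyFP8Linear` as a hypothetical stand-in for a real FP8 linear module:

```python
# Rough illustration of the class-name dispatch; DummyFP8Linear is a stand-in,
# not the real FP8Linear class the check targets.
import torch

class DummyFP8Linear(torch.nn.Module):
    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features  # real FP8 linears also carry scales, omitted here

def get_layer_features_sketch(layer):
    if isinstance(layer, torch.nn.Linear):
        return layer.in_features, layer.out_features
    if isinstance(layer, torch.nn.Embedding):
        return layer.num_embeddings, layer.embedding_dim
    # Match by class name so FP8 wrappers are handled without a hard dependency
    # on the package that defines them.
    if "FP8Linear" in layer.__class__.__name__:
        return layer.in_features, layer.out_features
    return None, None  # unsupported layer type

print(get_layer_features_sketch(DummyFP8Linear(4096, 11008)))  # (4096, 11008)
```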
2 changes: 1 addition & 1 deletion auto_round_extension/torch/qlinear_torch.py
@@ -132,7 +132,7 @@ def pack_248_bits(self, linear, scales, zeros, g_idx=None, device=None):
i = 0
col = 0
while col < qzeros.shape[1]:
packed_zeros = torch.tensor(zeros[:, i : i + (32 // self.bits)]).to(dtype=torch.int32)
packed_zeros = zeros[:, i : i + (32 // self.bits)].clone().to(dtype=torch.int32)
shifts = torch.arange(0, (32 // self.bits)) * self.bits
shifted = packed_zeros << shifts
qzeros[:, col] |= shifted.sum(dim=-1)
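The `pack_248_bits` change swaps `torch.tensor(existing_tensor)` for `.clone().to(...)`; both produce a copy, but constructing a tensor from an existing tensor triggers a UserWarning recommending exactly this form. The same fix is applied in `qlinear_torch_zp.py` below. A small self-contained demo of the packing pattern involved, with invented shapes and values:

```python
# Self-contained demo of the zero-point packing touched here; shapes and values
# are invented. zeros.clone().to(torch.int32) copies like torch.tensor(zeros)
# did, but without the "copy construct from a tensor" UserWarning.
import torch

bits = 4
per_word = 32 // bits                                  # 8 zero points per int32 word
zeros = torch.randint(0, 2**bits, (2, per_word))       # 2 rows of 4-bit zero points

packed_zeros = zeros.clone().to(dtype=torch.int32)     # preferred over torch.tensor(zeros)
shifts = torch.arange(0, per_word) * bits              # 0, 4, 8, ..., 28
qzeros_col = (packed_zeros << shifts).sum(dim=-1)      # one packed word per row

# Round-trip check: unpack and compare against the originals.
unpacked = (qzeros_col.unsqueeze(-1) >> shifts) & (2**bits - 1)
assert bool((unpacked == zeros).all())
print(qzeros_col)
```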
2 changes: 1 addition & 1 deletion auto_round_extension/torch/qlinear_torch_zp.py
@@ -132,7 +132,7 @@ def pack_248_bits(self, linear, scales, zeros, g_idx=None, device=None):
i = 0
col = 0
while col < qzeros.shape[1]:
packed_zeros = torch.tensor(zeros[:, i : i + (32 // self.bits)]).to(dtype=torch.int32)
packed_zeros = (zeros[:, i : i + (32 // self.bits)]).clone().to(dtype=torch.int32)
shifts = torch.arange(0, (32 // self.bits)) * self.bits
shifted = packed_zeros << shifts
qzeros[:, col] |= shifted.sum(dim=-1)
106 changes: 53 additions & 53 deletions test/test_cuda/test_vllm.py
@@ -21,56 +21,56 @@
]


@pytest.mark.skipif(
not current_platform.is_cpu() and not current_platform.is_xpu() and not current_platform.is_cuda(),
reason="only supports CPU/XPU/CUDA backend.",
)
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(model):
# Sample prompts.
prompts = [
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
QUANTIZATION = "auto-round"
llm = LLM(model=model, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1)
# Generate texts from the prompts.
# The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
if "France" in prompt:
assert "Paris" in generated_text


@pytest.mark.parametrize("model", MODELS)
def test_vllm_lm_eval(model):
if shutil.which("auto-round") is None:
pytest.skip("auto-round CLI not available")

env = os.environ.copy()
env["VLLM_SKIP_WARMUP"] = "true"
env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

cmd = [
"auto-round",
"--model",
model,
"--eval",
"--tasks",
"lambada_openai",
"--eval_bs",
"8",
"--limit",
"10",
"--vllm",
]

proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
assert proc.returncode == 0, f"auto-round failed (rc={proc.returncode}):\n{proc.stdout}"
# @pytest.mark.skipif(
# not current_platform.is_cpu() and not current_platform.is_xpu() and not current_platform.is_cuda(),
# reason="only supports CPU/XPU/CUDA backend.",
# )
# @pytest.mark.parametrize("model", MODELS)
# def test_auto_round(model):
# # Sample prompts.
# prompts = [
# "The capital of France is",
# "The future of AI is",
# ]
# # Create a sampling params object.
# sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# # Create an LLM.
# QUANTIZATION = "auto-round"
# llm = LLM(model=model, quantization=QUANTIZATION, trust_remote_code=True, tensor_parallel_size=1)
# # Generate texts from the prompts.
# # The output is a list of RequestOutput objects
# # that contain the prompt, generated text, and other information.
# outputs = llm.generate(prompts, sampling_params)
# # Print the outputs.
# for output in outputs:
# prompt = output.prompt
# generated_text = output.outputs[0].text
# if "France" in prompt:
# assert "Paris" in generated_text
#
#
# @pytest.mark.parametrize("model", MODELS)
# def test_vllm_lm_eval(model):
# if shutil.which("auto-round") is None:
# pytest.skip("auto-round CLI not available")
#
# env = os.environ.copy()
# env["VLLM_SKIP_WARMUP"] = "true"
# env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
#
# cmd = [
# "auto-round",
# "--model",
# model,
# "--eval",
# "--tasks",
# "lambada_openai",
# "--eval_bs",
# "8",
# "--limit",
# "10",
# "--vllm",
# ]
#
# proc = subprocess.run(cmd, env=env, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
# assert proc.returncode == 0, f"auto-round failed (rc={proc.returncode}):\n{proc.stdout}"