19 changes: 2 additions & 17 deletions auto_round/export/export_to_autogptq/export.py
@@ -16,12 +16,9 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from typing import Any, Callable, Dict, Union
 
-import threadpoolctl as tctl
-
 # MIT License
 #
 # Copyright (c) 2023 潘其威(William)
@@ -295,21 +292,9 @@ def save_quantized_as_autogptq(
     model = copy.deepcopy(model.to("cpu"))
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, model, backend, device)
-                    pbar.update(1)
-
-                for _ in executor.map(wrapper, names):
-                    pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, backend, device)
     if output_dir is None:
         return model
     quantization_config["lm_head"] = lm_head_quantized
18 changes: 2 additions & 16 deletions auto_round/export/export_to_autoround/export.py
@@ -18,12 +18,10 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from enum import Enum
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 import transformers
@@ -309,21 +307,9 @@ def save_quantized_as_autoround(
         quantization_config["extra_config"] = extra_config
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, model, backend, device)
-                    pbar.update(1)
-
-                for _ in executor.map(wrapper, names):
-                    pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, backend, device)
     filter_quantization_config(quantization_config)
     if hasattr(model, "config"):
         model.config.quantization_config = quantization_config
20 changes: 3 additions & 17 deletions auto_round/export/export_to_autoround/export_to_fp8.py
@@ -15,11 +15,9 @@
 import copy
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import transformers
 from tqdm import tqdm
@@ -175,7 +173,7 @@ def pack_layer(layer_name, model, data_type, device=None, unsqueeze=False):
     )
     if (
         unsqueeze
-        and isinstance(linear_cls, FP8QLinear)
+        and isinstance(my_linear, FP8QLinear)
         and len(my_linear.weight_scale.shape)
         and my_linear.weight_scale.shape[0] != 1
     ):
@@ -240,20 +238,8 @@ def save_quantized_as_autoround(
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        with tqdm(total=len(names), leave=True) as pbar:
-
-            def wrapper(name):
-                pbar.set_description(f"packing {name}")
-                with tctl.threadpool_limits(limits=1):
-                    pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
-                pbar.update(1)
-
-            for _ in executor.map(wrapper, names):
-                pass
+    for name in tqdm(names, desc="packing", leave=True):
+        pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
     regex_config = quantization_config.pop("regex_config")
     if regex_config is not None:
         for name in regex_config.keys():
18 changes: 2 additions & 16 deletions auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py
@@ -16,11 +16,9 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from dataclasses import fields
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 import transformers
@@ -235,20 +233,8 @@ def save_quantized_as_fp(
     if len(extra_config) > 0:
         quantization_config["extra_config"] = extra_config
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
-    with ThreadPoolExecutor(max_workers=max_workers) as executor:
-        with tqdm(total=len(names), leave=True) as pbar:
-
-            def wrapper(name):
-                pbar.set_description(f"packing {name}")
-                with tctl.threadpool_limits(limits=1):
-                    pack_layer(name, model, backend, device)
-                pbar.update(1)
-
-            for _ in executor.map(wrapper, names):
-                pass
+    for name in tqdm(names, desc="packing", leave=True):
+        pack_layer(name, model, backend, device)
     filter_quantization_config(quantization_config)
 
     if hasattr(model, "config"):
18 changes: 2 additions & 16 deletions auto_round/export/export_to_awq/export.py
@@ -23,10 +23,8 @@
 import copy
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 from tqdm import tqdm
@@ -179,21 +177,9 @@ def save_quantized_as_autoawq(
     names = list(layer_config.keys())
 
     backend = None
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, compressed_model, backend, device)
-                    pbar.update(1)
-
-                for _ in executor.map(wrapper, names):
-                    pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, compressed_model, backend, device)
     if output_dir is None:
         return model
 
18 changes: 2 additions & 16 deletions auto_round/export/export_to_llmcompressor/export_to_fp.py
@@ -16,10 +16,8 @@
 import inspect
 import json
 import os
-from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import torch.nn as nn
 import transformers
@@ -190,21 +188,9 @@ def save_quantized_as_fp(
         update_fused_layer_global_scales(module, base_name="input")
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() or not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            with tqdm(total=len(names), leave=True) as pbar:
-
-                def wrapper(name):
-                    pbar.set_description(f"packing {name}")
-                    with tctl.threadpool_limits(limits=1):
-                        pack_layer(name, model, device)
-                    pbar.update(1)
-
-            for _ in executor.map(wrapper, names):
-                pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, device)
 
     ignore = generate_ignore_regex_list(regex_config=regex_config, layer_config=layer_config)
 
24 changes: 2 additions & 22 deletions auto_round/export/export_to_llmcompressor/export_to_static_fp.py
@@ -16,10 +16,8 @@
 import json
 import os
 import sys
-from concurrent.futures import ThreadPoolExecutor
 from typing import Callable, Union
 
-import threadpoolctl as tctl
 import torch
 import transformers
 from tqdm import tqdm
@@ -36,7 +34,6 @@
     get_module,
     get_packing_device,
    is_gaudi2,
-    is_hpex_available,
     logger,
     set_module,
     unsupported_meta_device,
@@ -151,26 +148,9 @@ def save_quantized_as_static_fp(
     image_processor = kwargs.get("image_processor", None)
 
     names = list(layer_config.keys())
-    max_workers = 1
-    if not torch.cuda.is_available() and not torch.xpu.is_available():
-        max_workers = 2  ## 2 with cuda packing will cause hang occasionally
     if not unsupported_meta_device(model):
-
-        if is_hpex_available():  # packing will cause hang occasionally on hpu
-            for name in tqdm(names, total=len(names), leave=True, desc="packing"):
-                pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
-        else:
-            with ThreadPoolExecutor(max_workers=max_workers) as executor:
-                with tqdm(total=len(names), leave=True) as pbar:
-
-                    def wrapper(name):
-                        pbar.set_description(f"packing {name}")
-                        with tctl.threadpool_limits(limits=1):
-                            pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
-                        pbar.update(1)
-
-                    for _ in executor.map(wrapper, names):
-                        pass
+        for name in tqdm(names, desc="packing", leave=True):
+            pack_layer(name, model, serialization_dict.get("data_type", "fp8"), device)
 
     # Get llm-compressor format config
     check_compressed_tensors_supported()
1 change: 0 additions & 1 deletion requirements-hpu.txt
@@ -7,4 +7,3 @@ tqdm
 packaging
 pillow
 transformers
-threadpoolctl
1 change: 0 additions & 1 deletion requirements.txt
@@ -3,7 +3,6 @@ accelerate
 datasets
 numpy
 py-cpuinfo
-threadpoolctl
 torch
 tqdm
 transformers>=4.38
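
All seven export modules above converge on the same change: the threadpoolctl-limited ThreadPoolExecutor is replaced by a plain serial loop over the layer names. As a minimal standalone sketch of that loop (pack_all_layers and its argument-forwarding signature are illustrative only, not part of the codebase; each real module calls its own pack_layer with its own arguments):

from typing import Any, Callable, Sequence

from tqdm import tqdm


def pack_all_layers(names: Sequence[str], pack_layer: Callable[..., None], *pack_args: Any) -> None:
    # Serial packing on the calling thread: one pack_layer call per layer,
    # with tqdm reporting progress. The removed executor path (max_workers=2
    # plus tctl.threadpool_limits) could hang occasionally when packing with
    # CUDA, which is why the loop is now single-threaded.
    for name in tqdm(names, desc="packing", leave=True):
        pack_layer(name, *pack_args)

For the weight-only exporters this would be called as pack_all_layers(names, pack_layer, model, backend, device), matching the loops in the diffs above.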
13 changes: 13 additions & 0 deletions test/helpers.py
@@ -1,3 +1,4 @@
+import concurrent.futures
 import copy
 import os
 import re
@@ -14,6 +15,18 @@
 transformers_version = version.parse(transformers.__version__)
 
 
+def _raise_threaded_packing(*args, **kwargs):
+    raise AssertionError("Packing should not create a thread pool or call threadpoolctl.")
+
+
+def forbid_threaded_packing(monkeypatch, module):
+    monkeypatch.setattr(concurrent.futures, "ThreadPoolExecutor", _raise_threaded_packing)
+    monkeypatch.setattr(module, "ThreadPoolExecutor", _raise_threaded_packing, raising=False)
+    tctl = getattr(module, "tctl", None)
+    if tctl is not None:
+        monkeypatch.setattr(tctl, "threadpool_limits", _raise_threaded_packing)
+
+
 def generate_prompt(model_obj_or_str, tokenizer=None, text="The capital of France is,", max_new_tokens=10, device=None):
     """Generate text using a model and tokenizer.
 
49 changes: 48 additions & 1 deletion test/test_cpu/export/test_export.py
@@ -8,8 +8,12 @@
 from transformers import AutoModelForCausalLM, AutoRoundConfig, AutoTokenizer
 
 from auto_round import AutoRound
+from auto_round.export.export_to_autogptq import export as autogptq_export
+from auto_round.export.export_to_autoround import export as autoround_export
+from auto_round.export.export_to_autoround import export_to_fp8 as autoround_fp8_export
+from auto_round.export.export_to_awq import export as awq_export
 
-from ...helpers import get_model_path, opt_name_or_path, transformers_version
+from ...helpers import forbid_threaded_packing, get_model_path, opt_name_or_path, transformers_version
 
 
 def _get_folder_size(path: str) -> float:
@@ -496,3 +500,46 @@ def test_llmc_dynamic_wint8aint8_export_with_tuning(self, dataloader):
         with safe_open(os.path.join(quantized_model_path, "model.safetensors"), framework="pt") as f:
             assert "model.decoder.layers.8.self_attn.k_proj.weight_scale" in f.keys()
             assert f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype == torch.int8
+
+
+@pytest.mark.parametrize(
+    "format_name,export_module,sym",
+    [
+        ("auto_gptq", autogptq_export, False),
+        ("auto_awq", awq_export, False),
+        ("auto_round", autoround_export, True),
+    ],
+)
+def test_weight_only_exports_pack_serially(tiny_opt_model_path, tmp_path, monkeypatch, format_name, export_module, sym):
+    autoround = AutoRound(
+        tiny_opt_model_path,
+        bits=4,
+        group_size=128,
+        sym=sym,
+        iters=0,
+        disable_opt_rtn=True,
+    )
+    autoround.quantize()
+    forbid_threaded_packing(monkeypatch, export_module)
+    autoround.save_quantized(output_dir=tmp_path, inplace=False, format=format_name)
+    assert os.path.exists(os.path.join(tmp_path, "config.json"))
+
+
+def test_fp8_autoround_export_packs_serially(tiny_opt_model_path, tmp_path, monkeypatch):
+    from safetensors import safe_open
+
+    autoround = AutoRound(
+        tiny_opt_model_path,
+        bits=8,
+        group_size=-1,
+        iters=0,
+        scheme="FP8_STATIC",
+        nsamples=2,
+        seqlen=2,
+        static_kv_dtype="fp8",
+    )
+    autoround.quantize()
+    forbid_threaded_packing(monkeypatch, autoround_fp8_export)
+    autoround.save_quantized(output_dir=tmp_path, format="auto_round")
+    with safe_open(os.path.join(tmp_path, "model.safetensors"), framework="pt") as f:
+        assert "model.decoder.layers.0.self_attn.k_proj.weight_scale" in f.keys()