From de3fe48324789475038fbae21bc864d6636a6900 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Fri, 12 Jan 2024 02:29:47 +0800 Subject: [PATCH 01/14] add export funcs for autoround Signed-off-by: Zhang, Weiwei1 --- .../quantization/autoround/main.py | 30 +- .../adaptor/torch_utils/autoround/__init__.py | 3 +- .../torch_utils/autoround/autoround.py | 158 +++++++- .../torch_utils/autoround/model_wrapper.py | 344 ++++++++++++++++++ .../adaptor/pytorch_adaptor/test_autoround.py | 89 +++++ 5 files changed, 602 insertions(+), 22 deletions(-) create mode 100644 neural_compressor/adaptor/torch_utils/autoround/model_wrapper.py create mode 100644 test/adaptor/pytorch_adaptor/test_autoround.py diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/autoround/main.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/autoround/main.py index 4217ada9976..9a54b3fb458 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/autoround/main.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/autoround/main.py @@ -1,21 +1,21 @@ import argparse - -from neural_compressor.adaptor.torch_utils.autoround import AutoRound, AutoOPTRound, AutoAdamRound +import sys +from neural_compressor.adaptor.torch_utils.autoround import (AutoRound, + AutoOPTRound, + AutoAdamRound) parser = argparse.ArgumentParser() import torch import os +import re +import json os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" torch.use_deterministic_algorithms(True, warn_only=True) from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel - from transformers import set_seed - from eval import eval_model -import re - os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -83,6 +83,9 @@ parser.add_argument("--enable_minmax_tuning", action='store_true', help="whether enable weight minmax tuning") + + parser.add_argument("--use_optimum_format", default=True, + help="whether use HuggingFace format.") # parser.add_argument("--tasks", default=["lambada_openai", "hellaswag", "winogrande", "piqa"], # help="lm-eval tasks") @@ -186,9 +189,17 @@ optq = round(model, tokenizer, args.num_bits, args.group_size, scheme, bs=args.train_bs, seqlen=seqlen, n_blocks=args.n_blocks, iters=args.iters, lr=args.lr, - minmax_lr=args.minmax_lr, use_quant_input=args.use_quant_input, - amp=args.amp, n_samples=args.n_samples, low_gpu_mem_usage=args.low_gpu_mem_usage, seed=args.seed, gradient_accumulate_steps=args.gradient_accumulate_steps) ##TODO args pass - optq.quantize() + use_quant_input=args.use_quant_input, amp=args.amp, n_samples=args.n_samples, + low_gpu_mem_usage=args.low_gpu_mem_usage, minmax_lr=args.minmax_lr, + seed=args.seed, gradient_accumulate_steps=args.gradient_accumulate_steps) ##TODO args pass + q_model, q_config = optq.quantize() + if args.use_optimum_format: + output_dir = args.output_dir + "_" + args.model_name.split('/')[-1] + "/" + if not os.path.exists(output_dir): + os.makedirs(output_dir) + q_config_path = os.path.join(output_dir, "qconfig.json") + with open(q_config_path, "w") as f: + json.dump(q_config, f, indent=4) torch.cuda.empty_cache() model.eval() @@ -202,3 +213,4 @@ eval_model(output_dir=output_dir, model=model, tokenizer=tokenizer, tasks=args.tasks, \ eval_bs=args.eval_bs, use_accelerate=args.low_gpu_mem_usage, device=cuda_device, excel_file=excel_name, limit=None) + diff --git a/neural_compressor/adaptor/torch_utils/autoround/__init__.py b/neural_compressor/adaptor/torch_utils/autoround/__init__.py index 
96c727d97ef..6567bdbb92d 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/__init__.py +++ b/neural_compressor/adaptor/torch_utils/autoround/__init__.py @@ -11,4 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .autoround import AutoRound, AutoOPTRound, AutoAdamRound +from .autoround import AutoRound, AutoOPTRound, AutoAdamRound, export_compressed_model + diff --git a/neural_compressor/adaptor/torch_utils/autoround/autoround.py b/neural_compressor/adaptor/torch_utils/autoround/autoround.py index fa66a25e0ea..ac5c0d49af5 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/autoround.py +++ b/neural_compressor/adaptor/torch_utils/autoround/autoround.py @@ -19,18 +19,17 @@ from neural_compressor.utils import logger except: # pragma: no cover import logging - import torch - logger = logging.getLogger() import copy import time from collections import UserDict from functools import partial - +from .model_wrapper import WeightOnlyLinear from torch.amp import autocast from torch.functional import F +from typing import Union def quant_weight_asym(weight, num_bits=4, v=0, min_scale=0, max_scale=0): @@ -136,17 +135,31 @@ def quant_weight(weight, num_bits=4, group_size=-1, scheme="asym", v=0, min_scal Quantized and dequantized weight, scale, zero-point """ if group_size == -1 or weight.shape[1] < group_size: - return quant_weight_actor(weight, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale) + return quant_weight_actor( + weight, + num_bits, + scheme=scheme, + v=v, + min_scale=min_scale, + max_scale=max_scale + ) orig_shape = weight.shape if weight.shape[1] % group_size == 0: weight = weight.reshape(-1, group_size) if isinstance(v, torch.Tensor): v = v.reshape(-1, group_size) - weight, scale, zp = quant_weight_actor( - weight, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale + weight, + num_bits, + scheme=scheme, + v=v, + min_scale=min_scale, + max_scale=max_scale ) weight = weight.reshape(orig_shape) + scale = scale.reshape(orig_shape[0], -1) #TODO validating the feasibility on conv1d + if zp is not None: + zp = zp.reshape(orig_shape[0], -1) return weight, scale, zp else: @@ -157,14 +170,125 @@ def quant_weight(weight, num_bits=4, group_size=-1, scheme="asym", v=0, min_scal if isinstance(v, torch.Tensor): v = v.reshape(-1, group_size) weight_new, scale, zp = quant_weight_actor( - weight_new, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale + weight_new, + num_bits, + scheme=scheme, + v=v, + min_scale=min_scale, + max_scale=max_scale ) weight_new = weight_new.reshape(orig_shape[0], -1) - + scale = scale.reshape(orig_shape[0], -1) + if zp is not None: + zp = zp.reshape(orig_shape[0], -1) weight_new = weight_new[:, :-pad_len] + scale = scale[:, :-pad_len] + zp = zp[:, :-pad_len] return weight_new, scale, zp +def quant_weight_w_scale(weight, scale, zp, group_size=-1): + """Quant and dequant tensor with group size. + + Args: + weight: input weight + scale: scale + zp: zero point + group_size (int, optional): how many elements share one scale/zp. Defaults to -1. + + Returns: + output: int weight. 
+ """ + device = weight.device + scale = scale.to(device) + if zp is not None: + zp = zp.to(device) + if group_size == -1: + return torch.round(weight / scale) if zp is None else torch.round(weight / scale + zp) + int_weight = torch.zeros(weight.shape).to(device) + leng = weight.shape[1] // group_size + tail_flag = False if weight.shape[1] % group_size == 0 else True + for i in range(leng): + int_weight_tmp = weight[:, i * group_size : (i + 1) * group_size] / scale[:, i].unsqueeze(1) + if zp is not None: + int_weight_tmp += zp[:, i].unsqueeze(1) + int_weight[:, i * group_size : (i + 1) * group_size] = torch.round(int_weight_tmp) + if tail_flag: + int_weight_tmp = weight[:, leng * group_size :] / scale[:, -1].unsqueeze(1) + if zp is not None: + int_weight_tmp += zp[:, -1].unsqueeze(1) + int_weight[:, leng * group_size :] = torch.round(int_weight_tmp) + return int_weight + + +def export_compressed_model( + model, + weight_config:Union[str, dict], + enable_full_range=False, + compression_dtype=torch.int32, + compression_dim=1, + scale_dtype=torch.float32, + device="cpu", + use_optimum_format=True, + ): + """Convert Linear to WeightOnlyLinear for low memory inference. + + Args: + weight_config (str|dict): qconfig dict or Path of qconfig.json. + enable_full_range (bool, optional): Whether to leverage the full compression range + under symmetric quantization. Defaults to False. + compression_dtype (torch.Tensor, optional): The target dtype after comoression. + Defaults to torch.int32. + compression_dim (int, optional): Select from [0, 1], 0 is output channel, + 1 is input channel. Defaults to 1. + scale_dtype (torch.Tensor, optional): Use float32 or float16. + Defaults to torch.float32. + device (str, optional): choose device for compression. Defaults to cpu. + use_optimum_format (bool, optional): use the popular huggingface compression format. + 1: compression_dim: weight = 1, zeros = 0 and both are transposed. + 2: zeros -= 1 before compression. Why we need it? + 3: g_idx: use same number for one group instead of recording the channel order. + 4. parameter name changed, such as 'packed_weight' -> 'qweight'. + 5. zeros is always needed even for sym. + """ + from .model_wrapper import WeightOnlyLinear + compressed_model = copy.deepcopy(model) + if isinstance(weight_config, str): + with open(weight_config, "r") as f: + q_config = json.load(f) + else: + q_config = weight_config + for k, v in q_config.items(): + print(f"Compressing {k} on device {device}") + if v["data_type"] == "float": + continue + else: + dtype = v["data_type"] + num_bits = v["bits"] + group_size = v["group_size"] + scheme = v["scheme"] + m = get_module(compressed_model, k) + fp_weight = m.weight.data + scale = torch.tensor(v["scale"], dtype=torch.float32) # may exist dtype dismatch problem + zp = None if scheme == "sym" else torch.tensor(v["zp"], dtype=torch.int32) + int_weight = quant_weight_w_scale(fp_weight, scale, zp, group_size) + int_weight = int_weight.type(torch.int32) + new_module = WeightOnlyLinear( + m.in_features, + m.out_features, + num_bits, + group_size, + dtype=dtype, + zp=zp is not None, + bias=m.bias is not None, + device=device, + use_optimum_format=True, + ) + new_module.pack(int_weight, scale, zp, m.bias) + set_module(compressed_model, k, new_module) + return compressed_model + + def round_ste(x: torch.Tensor): """Straight-Through Estimator for rounding. This function is adapted from omniquant. 
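For reference, a straight-through estimator keeps the forward value equal to torch.round(x) while letting gradients pass through unchanged, which is what allows the rounding offsets above to be tuned. A minimal stand-alone sketch of the idea (the patch's own function body is elided here, so this is illustrative, not the exact code):

    import torch

    def round_ste(x: torch.Tensor) -> torch.Tensor:
        # Forward: round(x); backward: identity gradient (straight-through).
        return (x.round() - x).detach() + x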
@@ -976,7 +1100,6 @@ def __init__( self.amp = amp self.use_quant_input = use_quant_input self.enable_minmax_tuning = enable_minmax_tuning - self.n_samples = n_samples self.n_blocks = n_blocks self.bits = bits self.group_size = group_size @@ -997,6 +1120,7 @@ def __init__( self.tokenizer = tokenizer self.seqlen = seqlen self.train_bs = bs + self.n_samples = bs*(n_samples//bs) self.n_blocks = n_blocks self.device = device self.amp_dtype = torch.float16 @@ -1393,20 +1517,29 @@ def quantize(self): if n in self.weight_config.keys(): if hasattr(m, "scale"): self.weight_config[n]["scale"] = m.scale + # self.weight_config[n]["scale_dtype"] = m.scale.dtype self.weight_config[n]["zp"] = m.zp + # self.weight_config[n]["zp_dtype"] = m.zp.dtype delattr(m, "scale") delattr(m, "zp") else: self.weight_config[n]["data_type"] = "float" - if self.amp_dtype == torch.bfloat16: - self.weight_config[n]["data_type"] = "bfloat" - self.weight_config[n]["bits"] = 16 + self.weight_config[n]["bits"] = 32 + if self.amp: + self.weight_config[n]["bits"] = 16 + if self.amp_dtype == torch.bfloat16: + self.weight_config[n]["data_type"] = "bfloat" self.weight_config[n]["group_size"] = None self.weight_config[n]["sym"] = None + for k, v in self.weight_config.items(): + for m, n in v.items(): + if isinstance(n, torch.Tensor): + self.weight_config[k][m] = n.tolist() end_time = time.time() cost_time = end_time - start_time logger.info(f"quantization runtime {cost_time}") + return self.model, self.weight_config @@ -1665,3 +1798,4 @@ def __init__( optimizer, **kwargs, ) + diff --git a/neural_compressor/adaptor/torch_utils/autoround/model_wrapper.py b/neural_compressor/adaptor/torch_utils/autoround/model_wrapper.py new file mode 100644 index 00000000000..2ed950a802c --- /dev/null +++ b/neural_compressor/adaptor/torch_utils/autoround/model_wrapper.py @@ -0,0 +1,344 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Torch.nn.Module Class Definition.""" +# Note: Do not import this file unless you have already imported torch, +# since the model classes inherit torch.nn.Module. +import math + +import torch +from packaging.version import Version +from torch.autograd import Function +from torch.nn import functional as F +import logging +logger = logging.getLogger() + + +NF4 = [ + -1.0, + -0.6961928009986877, + -0.5250730514526367, + -0.39491748809814453, + -0.28444138169288635, + -0.18477343022823334, + -0.09105003625154495, + 0.0, + 0.07958029955625534, + 0.16093020141124725, + 0.24611230194568634, + 0.33791524171829224, + 0.44070982933044434, + 0.5626170039176941, + 0.7229568362236023, + 1.0, +] +FP4_BNB = [-12.0, -8.0, -6.0, -4.0, -3.0, -2.0, -0.0625, 0, 0.0625, 2.0, 3.0, 4.0, 6.0, 8.0, 12.0] +FP4_E2M1 = [-6.0, -4.0, -3.0, -2.0, -1.5, -1.0, -0.0625, 0, 0.0625, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0] + +# the order is the same as float list, bit value range is [-7, 7] +# 1111 = -1, 1110 = -2, 1101= -3, ... 
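+# e.g. for nf4: signed 4-bit code 7 (0b0111) decodes to -1.0, code 0 to 0.0,
+# and code -1 (0b1111) to 1.0, i.e. FLOAT_MAPPING[dtype][i] pairs with
+# INT_MAPPING[dtype][i] below.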
+ +NF4_BIT = [7, 1, 2, 3, 4, 5, 6, 0, -8, -7, -6, -5, -4, -3, -2, -1] +FP4_BNB_BIT = [-5, -6, -3, -4, -1, -2, -7, 0, 1, 6, 7, 4, 5, 2, 3] +FP4_E2M1_BIT = [-1, -2, -3, -4, -5, -6, -7, 0, 1, 2, 3, 4, 5, 6, 7] + +FLOAT_MAPPING = {"nf4": NF4, "fp4": FP4_BNB, "fp4_e2m1_bnb": FP4_BNB, "fp4_e2m1": FP4_E2M1} +INT_MAPPING = {"nf4": NF4_BIT, "fp4": FP4_BNB_BIT, "fp4_e2m1_bnb": FP4_BNB_BIT, "fp4_e2m1": FP4_E2M1_BIT} + +def get_torch_version(): + try: + torch_version = torch.__version__.split("+")[0] + except ValueError as e: # pragma: no cover + assert False, "Got an unknown version of torch: {}".format(e) + version = Version(torch_version) + return version + + +PT_VERSION = get_torch_version().release + +class WeightOnlyLinear(torch.nn.Module): + def __init__( + self, + in_features, + out_features, + bits, + groupsize, + dtype="int", + zp=False, + bias=False, + scale_dtype=torch.float32, + compression_dtype=torch.int32, + compression_dim=1, + device="cpu", + use_optimum_format=True, + ): + super().__init__() + self.use_optimum_format = use_optimum_format + self.dtype = dtype + if "int" not in self.dtype: # for nf4, fp4 + + float_list = FLOAT_MAPPING[self.dtype] + int_list = INT_MAPPING[self.dtype] + self.int2float_mapping = {} + for k, v in zip(int_list, float_list): + self.int2float_mapping[k] = v + self.device = device + self.in_features = in_features + self.out_features = out_features + self.bits = bits + self.groupsize = groupsize if groupsize != -1 else in_features + self.compression_dim = compression_dim + assert compression_dtype in [ + torch.int8, + torch.int16, + torch.int32, + torch.int64, + ], "Only support torch.int8|16|32|64 as compressed dtype." + dtype_bits_mapping = {torch.int8: 8, torch.int16: 16, torch.int32: 32, torch.int64: 64} + self.compress_bits = dtype_bits_mapping[compression_dtype] + self.n_pack = self.compress_bits // self.bits + # K is input channel, N is output channel + assert compression_dim in [0, 1], ( + "Only support 0 or 1 as compression dimension, " + "0 is output channel, 1 is input channel." 
+ ) + if self.use_optimum_format: + self.float_type = torch.float16 + self.compression_dtype = torch.int32 + self.register_buffer( + "scales", + torch.zeros( + (math.ceil(in_features / self.groupsize), out_features), + dtype=self.float_type, + ).to(device), + ) + self.scales = self.scales.T + self.register_buffer( + "qweight", + torch.zeros( + (math.ceil(in_features / self.n_pack), out_features), + dtype=self.compression_dtype, + ).to(device), + ) + self.qweight = self.qweight.T + self.register_buffer( + "qzeros", + torch.zeros( + (math.ceil(self.in_features / self.groupsize), math.ceil(self.out_features / self.n_pack)), + dtype=self.compression_dtype, + ).to(device), + ) + self.qzeros = self.qzeros.T + self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device)) + else: + self.compression_dtype = compression_dtype + self.float_type = scale_dtype + self.register_buffer( + "scales", + torch.zeros( + (out_features, math.ceil(in_features / self.groupsize)), + dtype=self.float_type, + ).to(device), + ) + if compression_dim == 1: + self.register_buffer( + "qweight", + torch.zeros( + (out_features, math.ceil(in_features / self.n_pack)), + dtype=self.compression_dtype, + ).to(device), + ) + if zp: + self.register_buffer( + "qzeros", + torch.zeros( + (self.out_features, math.ceil(self.in_features / self.groupsize / self.n_pack)), + dtype=self.compression_dtype, + ).to(device), + ) + else: + self.register_buffer( + "qweight", + torch.zeros( + (math.ceil(out_features / self.n_pack), in_features), + dtype=self.compression_dtype, + ).to(device), + ) + if zp: + self.register_buffer( + "qzeros", + torch.zeros( + (math.ceil(self.out_features / self.n_pack), math.ceil(self.in_features / self.groupsize)), + dtype=self.compression_dtype, + ).to(device), + ) + if bias: + self.register_buffer("bias", torch.zeros(self.out_features, dtype=self.float_type).to(device)) + else: + self.bias = None + + def pack(self, int_weight, scale, zp, bias): + int_weight = int_weight.to(self.device) + if self.use_optimum_format and zp is None: + # to avoid overflow + int_weight = int_weight.type(torch.int32) + shift_bias = 2 ** (self.bits - 1) + int_weight += shift_bias + zp = torch.zeros_like(scale, dtype=torch.uint8) + shift_bias + if bias is not None: + assert hasattr(self, "bias"), "bias is not set when initializing." + self.bias = bias.type(self.float_type).to(self.device) + assert scale.shape == self.scales.shape, "Scale shape is mismatched." + self.scales = scale.type(self.float_type).to(self.device) + if not self.use_optimum_format and self.compression_dim == 0: + int_weight = int_weight.T + self.qweight = self.qweight.T + origin_shape = int_weight.shape + target_shape = self.qweight.shape + assert origin_shape[0] == target_shape[0], "output channels mismatch, please check." 
+ mask = torch.tensor(2**self.bits - 1, dtype=self.compression_dtype).to(self.device) + + # pack weight + for j in range(target_shape[1]): + start = self.n_pack * j + end = self.n_pack * (j + 1) + tmp = int_weight[:, start:end].type(self.compression_dtype) + for e in range(tmp.shape[1]): + tmp[:, e] &= mask + tmp[:, e] = tmp[:, e] << (self.bits * e) + self.qweight[:, j] |= tmp[:, e] + if not self.use_optimum_format and self.compression_dim == 0: + self.qweight = self.qweight.T + + if zp is not None: + zp = zp.to(self.device) + if self.use_optimum_format: + zp -= 1 + if self.use_optimum_format or self.compression_dim == 0: + zp = zp.T + self.qzeros = self.qzeros.T + assert hasattr(self, "qzeros"), "zp is not set when initializing." + target_shape = self.qzeros.shape + for j in range(target_shape[1]): + start = self.n_pack * j + end = self.n_pack * (j + 1) + tmp = zp[:, start:end].type(self.compression_dtype) + for e in range(tmp.shape[1]): + tmp[:, e] &= mask + tmp[:, e] = tmp[:, e] << (self.bits * e) + self.qzeros[:, j] |= tmp[:, e] + if self.use_optimum_format or self.compression_dim == 0: + self.qzeros = self.qzeros.T + if self.use_optimum_format: + self.scales = self.scales.T + self.qweight = self.qweight.T + self.qzeros = self.qzeros.T + + def recover(self): + logger.debug(f"Recovering {self} weight") + scales = self.scales.T if self.use_optimum_format else self.scales + qweight = self.qweight.T if self.use_optimum_format else self.qweight + + device = scales.device + fp32_weight = torch.zeros(self.out_features, self.in_features, dtype=self.float_type).to(device) + mask = torch.tensor(2**self.bits - 1, dtype=self.compression_dtype).to(device) + if hasattr(self, "qzeros"): + weight_dtype = torch.uint8 + else: + weight_dtype = torch.int8 + # unpack weight + weight = torch.zeros(self.out_features, self.in_features, dtype=weight_dtype).to(device) + if not self.use_optimum_format and self.compression_dim == 0: + weight = weight.T + qweight = qweight.T + origin_shape = weight.shape + target_shape = qweight.shape + for j in range(target_shape[1]): + for e in range(self.n_pack): + index = j * self.n_pack + e + if index >= origin_shape[1]: + continue + tmp = qweight[:, j] + tmp = tmp << (self.compress_bits - self.bits * (e + 1)) + tmp = tmp >> self.compress_bits - self.bits + if weight_dtype == torch.uint8: + tmp &= mask # remove sign bit + weight[:, index] = tmp.type(weight_dtype) + if not self.use_optimum_format and self.compression_dim == 0: + weight = weight.T + if "int" not in self.dtype: + new_weight = torch.zeros(self.out_features, self.in_features).to(device) + for k, v in self.int2float_mapping.items(): + new_weight += torch.where(weight == k, v, 0) + weight = new_weight + # unpack zero_point + if hasattr(self, "qzeros"): + zp_dtype = self.compression_dtype # to avoid overflow when weight-zp + zp = torch.zeros(scales.shape, dtype=zp_dtype).to(device) + qzeros = self.qzeros.T if self.use_optimum_format else self.qzeros + if self.use_optimum_format or self.compression_dim == 0: + zp = zp.T + qzeros = qzeros.T + origin_shape = zp.shape + target_shape = qzeros.shape + for j in range(target_shape[1]): + for e in range(self.n_pack): + index = j * self.n_pack + e + if index >= origin_shape[1]: + continue + tmp = qzeros[:, j] + tmp = tmp << (self.compress_bits - self.bits * (e + 1)) + tmp = tmp >> self.compress_bits - self.bits + tmp &= mask + zp[:, index] = tmp.type(zp_dtype) + if self.use_optimum_format or self.compression_dim == 0: + zp = zp.T + if self.use_optimum_format: + # zp -= 1 may 
cause zp == -1, after recover it becomes 2**self.bits - 1 + zp += 1 + zp = torch.where(zp > (2**self.bits - 1), 0, zp) + # recover fp32 weight with int_weight, scale, and zero_point + for idx in range(self.in_features): + g_idx = idx // self.groupsize + fp32_weight[:, idx] = (weight[:, idx] - zp[:, g_idx]) * scales[:, g_idx] + else: + # recover fp32 weight with int_weight, scale + for idx in range(self.in_features): + g_idx = idx // self.groupsize + fp32_weight[:, idx] = weight[:, idx] * scales[:, g_idx] + return fp32_weight + + def forward(self, input): + weight = self.recover() + device = self.scales.device + if weight.dtype == torch.float16 and device.type == "cpu": + weight = weight.float() + self.bias = self.bias.float() if self.bias is not None else None + input = input.type(weight.dtype) + return F.linear(input, weight, self.bias) + + def extra_repr(self) -> str: + tmp_str = "in_features={}, out_features={}, bits={}, group_size={}, bias={}".format( + self.in_features, + self.out_features, + self.bits, + self.groupsize, + self.bias is not None, + ) + if self.use_optimum_format: + tmp_str += ", use_optimum_format=True" + return tmp_str + + diff --git a/test/adaptor/pytorch_adaptor/test_autoround.py b/test/adaptor/pytorch_adaptor/test_autoround.py new file mode 100644 index 00000000000..4936765fe36 --- /dev/null +++ b/test/adaptor/pytorch_adaptor/test_autoround.py @@ -0,0 +1,89 @@ +import copy +import os +import shutil +import unittest + +import torch +import transformers +import sys + +from transformers import AutoModelForCausalLM, AutoTokenizer +from neural_compressor.adaptor.torch_utils.autoround import (AutoRound, + AutoOPTRound, + AutoAdamRound, + export_compressed_model) + + +class SimpleDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.randn([1, 30]) + + +class LLMDataLoader: + def __init__(self): + self.batch_size = 1 + + def __iter__(self): + for i in range(2): + yield torch.ones([1, 10], dtype=torch.long) + + +class TestPytorchWeightOnlyAdaptor(unittest.TestCase): + approach = "weight_only" + + @classmethod + def setUpClass(self): + self.dataloader = SimpleDataLoader() + self.gptj = transformers.AutoModelForCausalLM.from_pretrained( + "hf-internal-testing/tiny-random-GPTJForCausalLM", + torchscript=True, + ) + self.tokenizer = transformers.AutoTokenizer.from_pretrained( + "hf-internal-testing/tiny-random-GPTJForCausalLM", + trust_remote_code=True + ) + self.gptj_no_jit = transformers.AutoModelForCausalLM.from_pretrained( + "hf-internal-testing/tiny-random-GPTJForCausalLM", + ) + self.llm_dataloader = LLMDataLoader() + self.lm_input = torch.ones([1, 10], dtype=torch.long) + + @classmethod + def tearDownClass(self): + shutil.rmtree("./saved", ignore_errors=True) + shutil.rmtree("runs", ignore_errors=True) + + def test_RTN_int_quant(self): + model = copy.deepcopy(self.gptj) + out1 = model(self.lm_input) + round = AutoRound + optq_1 = round(model, self.tokenizer, n_samples=20, amp=False, seqlen=10) + q_model, weight_config1 = optq_1.quantize() + compressed_model = export_compressed_model(q_model, weight_config1) + out2 = model(self.lm_input) + out3 = compressed_model(self.lm_input) + self.assertTrue(torch.all(torch.isclose(out1[0], out2[0], atol=1e-1))) + self.assertFalse(torch.all(out1[0] == out2[0])) + self.assertTrue(torch.all(torch.isclose(out2[0], out3[0], atol=1e-3))) + self.assertTrue("transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys()) + + # model = copy.deepcopy(self.gptj) + # out6 
= model(self.lm_input) + # optq_2 = round(model, self.tokenizer, n_samples=20, amp=False, seqlen=10) + # q_model, weight_config2 = optq_2.quantize() + # out4 = q_model(self.lm_input) + # out5 = model(self.lm_input) + + # self.assertTrue(torch.all(out1[0] == out6[0])) + # self.assertTrue(torch.all(out4[0] == out5[0])) + # self.assertTrue(torch.all(torch.isclose(out6[0], out5[0], atol=1e-1))) + + + +if __name__ == "__main__": + unittest.main() + From 1454c37002b0f84a203112260ed8473a832e4713 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jan 2024 01:15:50 +0000 Subject: [PATCH 02/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../adaptor/torch_utils/autoround/__init__.py | 1 - .../torch_utils/autoround/autoround.py | 59 ++++++++----------- .../torch_utils/autoround/model_wrapper.py | 9 +-- .../adaptor/pytorch_adaptor/test_autoround.py | 27 ++++----- 4 files changed, 41 insertions(+), 55 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/autoround/__init__.py b/neural_compressor/adaptor/torch_utils/autoround/__init__.py index 6567bdbb92d..24067829635 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/__init__.py +++ b/neural_compressor/adaptor/torch_utils/autoround/__init__.py @@ -12,4 +12,3 @@ # See the License for the specific language governing permissions and # limitations under the License. from .autoround import AutoRound, AutoOPTRound, AutoAdamRound, export_compressed_model - diff --git a/neural_compressor/adaptor/torch_utils/autoround/autoround.py b/neural_compressor/adaptor/torch_utils/autoround/autoround.py index ac5c0d49af5..4dd215dc887 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/autoround.py +++ b/neural_compressor/adaptor/torch_utils/autoround/autoround.py @@ -19,17 +19,21 @@ from neural_compressor.utils import logger except: # pragma: no cover import logging + import torch + logger = logging.getLogger() import copy import time from collections import UserDict from functools import partial -from .model_wrapper import WeightOnlyLinear +from typing import Union + from torch.amp import autocast from torch.functional import F -from typing import Union + +from .model_wrapper import WeightOnlyLinear def quant_weight_asym(weight, num_bits=4, v=0, min_scale=0, max_scale=0): @@ -135,29 +139,17 @@ def quant_weight(weight, num_bits=4, group_size=-1, scheme="asym", v=0, min_scal Quantized and dequantized weight, scale, zero-point """ if group_size == -1 or weight.shape[1] < group_size: - return quant_weight_actor( - weight, - num_bits, - scheme=scheme, - v=v, - min_scale=min_scale, - max_scale=max_scale - ) + return quant_weight_actor(weight, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale) orig_shape = weight.shape if weight.shape[1] % group_size == 0: weight = weight.reshape(-1, group_size) if isinstance(v, torch.Tensor): v = v.reshape(-1, group_size) weight, scale, zp = quant_weight_actor( - weight, - num_bits, - scheme=scheme, - v=v, - min_scale=min_scale, - max_scale=max_scale + weight, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale ) weight = weight.reshape(orig_shape) - scale = scale.reshape(orig_shape[0], -1) #TODO validating the feasibility on conv1d + scale = scale.reshape(orig_shape[0], -1) # TODO validating the feasibility on conv1d if zp is not None: zp = zp.reshape(orig_shape[0], -1) return weight, scale, zp @@ -170,12 +162,7 @@ def quant_weight(weight, 
num_bits=4, group_size=-1, scheme="asym", v=0, min_scal if isinstance(v, torch.Tensor): v = v.reshape(-1, group_size) weight_new, scale, zp = quant_weight_actor( - weight_new, - num_bits, - scheme=scheme, - v=v, - min_scale=min_scale, - max_scale=max_scale + weight_new, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale ) weight_new = weight_new.reshape(orig_shape[0], -1) scale = scale.reshape(orig_shape[0], -1) @@ -222,15 +209,15 @@ def quant_weight_w_scale(weight, scale, zp, group_size=-1): def export_compressed_model( - model, - weight_config:Union[str, dict], - enable_full_range=False, - compression_dtype=torch.int32, - compression_dim=1, - scale_dtype=torch.float32, - device="cpu", - use_optimum_format=True, - ): + model, + weight_config: Union[str, dict], + enable_full_range=False, + compression_dtype=torch.int32, + compression_dim=1, + scale_dtype=torch.float32, + device="cpu", + use_optimum_format=True, +): """Convert Linear to WeightOnlyLinear for low memory inference. Args: @@ -252,6 +239,7 @@ def export_compressed_model( 5. zeros is always needed even for sym. """ from .model_wrapper import WeightOnlyLinear + compressed_model = copy.deepcopy(model) if isinstance(weight_config, str): with open(weight_config, "r") as f: @@ -269,7 +257,7 @@ def export_compressed_model( scheme = v["scheme"] m = get_module(compressed_model, k) fp_weight = m.weight.data - scale = torch.tensor(v["scale"], dtype=torch.float32) # may exist dtype dismatch problem + scale = torch.tensor(v["scale"], dtype=torch.float32) # may exist dtype dismatch problem zp = None if scheme == "sym" else torch.tensor(v["zp"], dtype=torch.int32) int_weight = quant_weight_w_scale(fp_weight, scale, zp, group_size) int_weight = int_weight.type(torch.int32) @@ -1120,7 +1108,7 @@ def __init__( self.tokenizer = tokenizer self.seqlen = seqlen self.train_bs = bs - self.n_samples = bs*(n_samples//bs) + self.n_samples = bs * (n_samples // bs) self.n_blocks = n_blocks self.device = device self.amp_dtype = torch.float16 @@ -1539,7 +1527,7 @@ def quantize(self): end_time = time.time() cost_time = end_time - start_time logger.info(f"quantization runtime {cost_time}") - + return self.model, self.weight_config @@ -1798,4 +1786,3 @@ def __init__( optimizer, **kwargs, ) - diff --git a/neural_compressor/adaptor/torch_utils/autoround/model_wrapper.py b/neural_compressor/adaptor/torch_utils/autoround/model_wrapper.py index 2ed950a802c..bd73fddd94d 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/model_wrapper.py +++ b/neural_compressor/adaptor/torch_utils/autoround/model_wrapper.py @@ -15,6 +15,8 @@ # See the License for the specific language governing permissions and # limitations under the License. """Torch.nn.Module Class Definition.""" +import logging + # Note: Do not import this file unless you have already imported torch, # since the model classes inherit torch.nn.Module. 
import math @@ -23,7 +25,7 @@ from packaging.version import Version from torch.autograd import Function from torch.nn import functional as F -import logging + logger = logging.getLogger() @@ -58,6 +60,7 @@ FLOAT_MAPPING = {"nf4": NF4, "fp4": FP4_BNB, "fp4_e2m1_bnb": FP4_BNB, "fp4_e2m1": FP4_E2M1} INT_MAPPING = {"nf4": NF4_BIT, "fp4": FP4_BNB_BIT, "fp4_e2m1_bnb": FP4_BNB_BIT, "fp4_e2m1": FP4_E2M1_BIT} + def get_torch_version(): try: torch_version = torch.__version__.split("+")[0] @@ -69,6 +72,7 @@ def get_torch_version(): PT_VERSION = get_torch_version().release + class WeightOnlyLinear(torch.nn.Module): def __init__( self, @@ -89,7 +93,6 @@ def __init__( self.use_optimum_format = use_optimum_format self.dtype = dtype if "int" not in self.dtype: # for nf4, fp4 - float_list = FLOAT_MAPPING[self.dtype] int_list = INT_MAPPING[self.dtype] self.int2float_mapping = {} @@ -340,5 +343,3 @@ def extra_repr(self) -> str: if self.use_optimum_format: tmp_str += ", use_optimum_format=True" return tmp_str - - diff --git a/test/adaptor/pytorch_adaptor/test_autoround.py b/test/adaptor/pytorch_adaptor/test_autoround.py index 4936765fe36..415c16bf688 100644 --- a/test/adaptor/pytorch_adaptor/test_autoround.py +++ b/test/adaptor/pytorch_adaptor/test_autoround.py @@ -1,17 +1,19 @@ import copy import os import shutil +import sys import unittest import torch import transformers -import sys - from transformers import AutoModelForCausalLM, AutoTokenizer -from neural_compressor.adaptor.torch_utils.autoround import (AutoRound, - AutoOPTRound, - AutoAdamRound, - export_compressed_model) + +from neural_compressor.adaptor.torch_utils.autoround import ( + AutoAdamRound, + AutoOPTRound, + AutoRound, + export_compressed_model, +) class SimpleDataLoader: @@ -43,8 +45,7 @@ def setUpClass(self): torchscript=True, ) self.tokenizer = transformers.AutoTokenizer.from_pretrained( - "hf-internal-testing/tiny-random-GPTJForCausalLM", - trust_remote_code=True + "hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True ) self.gptj_no_jit = transformers.AutoModelForCausalLM.from_pretrained( "hf-internal-testing/tiny-random-GPTJForCausalLM", @@ -70,20 +71,18 @@ def test_RTN_int_quant(self): self.assertFalse(torch.all(out1[0] == out2[0])) self.assertTrue(torch.all(torch.isclose(out2[0], out3[0], atol=1e-3))) self.assertTrue("transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys()) - + # model = copy.deepcopy(self.gptj) # out6 = model(self.lm_input) # optq_2 = round(model, self.tokenizer, n_samples=20, amp=False, seqlen=10) # q_model, weight_config2 = optq_2.quantize() # out4 = q_model(self.lm_input) # out5 = model(self.lm_input) - + # self.assertTrue(torch.all(out1[0] == out6[0])) # self.assertTrue(torch.all(out4[0] == out5[0])) # self.assertTrue(torch.all(torch.isclose(out6[0], out5[0], atol=1e-1))) - - - + + if __name__ == "__main__": unittest.main() - From fbfd8c27339fd11de4a900bd07cd70926be24947 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Fri, 12 Jan 2024 10:21:39 +0800 Subject: [PATCH 03/14] fixtypo Signed-off-by: Zhang, Weiwei1 --- neural_compressor/adaptor/torch_utils/autoround/autoround.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neural_compressor/adaptor/torch_utils/autoround/autoround.py b/neural_compressor/adaptor/torch_utils/autoround/autoround.py index ac5c0d49af5..2e7f51510cb 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/autoround.py +++ b/neural_compressor/adaptor/torch_utils/autoround/autoround.py @@ -24,6 +24,7 @@ import copy import time +import json 
from collections import UserDict from functools import partial from .model_wrapper import WeightOnlyLinear From 10c80b7f6ef7e51fb457c4d96418359a8648d58e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 12 Jan 2024 02:24:19 +0000 Subject: [PATCH 04/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- neural_compressor/adaptor/torch_utils/autoround/autoround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neural_compressor/adaptor/torch_utils/autoround/autoround.py b/neural_compressor/adaptor/torch_utils/autoround/autoround.py index bf4fb63ccd2..a0a4f3c564e 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/autoround.py +++ b/neural_compressor/adaptor/torch_utils/autoround/autoround.py @@ -25,8 +25,8 @@ logger = logging.getLogger() import copy -import time import json +import time from collections import UserDict from functools import partial from typing import Union From b7164648e52e1d7166500ff251dd6a3aa819b3f4 Mon Sep 17 00:00:00 2001 From: wenhuach21 <108330088+wenhuach21@users.noreply.github.com> Date: Fri, 12 Jan 2024 18:12:56 +0800 Subject: [PATCH 05/14] fixed incorrect default hyperparameters --- .../language-modeling/quantization/autoround/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/autoround/main.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/autoround/main.py index 9a54b3fb458..f699d3e298c 100644 --- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/autoround/main.py +++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/autoround/main.py @@ -44,13 +44,13 @@ parser.add_argument("--sym", action='store_true', help=" sym quantization") - parser.add_argument("--iters", default=400, type=int, + parser.add_argument("--iters", default=200, type=int, help=" iters") parser.add_argument("--use_quant_input", action='store_true', help="whether to use the output of quantized block to tune the next block") - parser.add_argument("--lr", default=0.05, type=float, + parser.add_argument("--lr", default=0.005, type=float, help="step size") parser.add_argument("--minmax_lr", default=None, type=float, From 8a248a64f2a497682451b13354d0db9a92074bfe Mon Sep 17 00:00:00 2001 From: wenhuach21 <108330088+wenhuach21@users.noreply.github.com> Date: Fri, 12 Jan 2024 19:14:28 +0800 Subject: [PATCH 06/14] fixed mixstral7b*8 issue --- neural_compressor/adaptor/torch_utils/autoround/autoround.py | 1 + 1 file changed, 1 insertion(+) diff --git a/neural_compressor/adaptor/torch_utils/autoround/autoround.py b/neural_compressor/adaptor/torch_utils/autoround/autoround.py index a0a4f3c564e..348e6ec61e2 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/autoround.py +++ b/neural_compressor/adaptor/torch_utils/autoround/autoround.py @@ -932,6 +932,7 @@ def get_block_names(model): for n, m in model.named_modules(): if hasattr(type(m), "__name__") and "ModuleList" in type(m).__name__: target_m = (n, m) + break for n, m in target_m[1].named_children(): block_names.append(target_m[0] + "." 
+ n) return block_names From d0f645c7469a9ca9848cb8d5f0bcbb1c840cb7e5 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Mon, 15 Jan 2024 16:26:19 +0800 Subject: [PATCH 07/14] port export func to file Signed-off-by: Zhang, Weiwei1 --- .../adaptor/torch_utils/autoround/__init__.py | 4 +- .../torch_utils/autoround/autoround.py | 106 ++++-------------- 2 files changed, 27 insertions(+), 83 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/autoround/__init__.py b/neural_compressor/adaptor/torch_utils/autoround/__init__.py index 24067829635..c1c06c56b38 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/__init__.py +++ b/neural_compressor/adaptor/torch_utils/autoround/__init__.py @@ -11,4 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .autoround import AutoRound, AutoOPTRound, AutoAdamRound, export_compressed_model +from .autoround import AutoRound, AutoOPTRound, AutoAdamRound +from .export import export_compressed_model + diff --git a/neural_compressor/adaptor/torch_utils/autoround/autoround.py b/neural_compressor/adaptor/torch_utils/autoround/autoround.py index 348e6ec61e2..57e4a24723c 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/autoround.py +++ b/neural_compressor/adaptor/torch_utils/autoround/autoround.py @@ -14,27 +14,20 @@ try: from neural_compressor.utils.utility import LazyImport - torch = LazyImport("torch") from neural_compressor.utils import logger except: # pragma: no cover import logging - import torch - logger = logging.getLogger() import copy -import json import time from collections import UserDict from functools import partial -from typing import Union - from torch.amp import autocast from torch.functional import F -from .model_wrapper import WeightOnlyLinear def quant_weight_asym(weight, num_bits=4, v=0, min_scale=0, max_scale=0): @@ -140,17 +133,29 @@ def quant_weight(weight, num_bits=4, group_size=-1, scheme="asym", v=0, min_scal Quantized and dequantized weight, scale, zero-point """ if group_size == -1 or weight.shape[1] < group_size: - return quant_weight_actor(weight, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale) + return quant_weight_actor( + weight, + num_bits, + scheme=scheme, + v=v, + min_scale=min_scale, + max_scale=max_scale + ) orig_shape = weight.shape if weight.shape[1] % group_size == 0: weight = weight.reshape(-1, group_size) if isinstance(v, torch.Tensor): v = v.reshape(-1, group_size) weight, scale, zp = quant_weight_actor( - weight, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale + weight, + num_bits, + scheme=scheme, + v=v, + min_scale=min_scale, + max_scale=max_scale ) weight = weight.reshape(orig_shape) - scale = scale.reshape(orig_shape[0], -1) # TODO validating the feasibility on conv1d + scale = scale.reshape(orig_shape[0], -1) #TODO validating the feasibility on conv1d if zp is not None: zp = zp.reshape(orig_shape[0], -1) return weight, scale, zp @@ -163,7 +168,12 @@ def quant_weight(weight, num_bits=4, group_size=-1, scheme="asym", v=0, min_scal if isinstance(v, torch.Tensor): v = v.reshape(-1, group_size) weight_new, scale, zp = quant_weight_actor( - weight_new, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale + weight_new, + num_bits, + scheme=scheme, + v=v, + min_scale=min_scale, + max_scale=max_scale ) weight_new = weight_new.reshape(orig_shape[0], -1) scale = scale.reshape(orig_shape[0], 
-1) @@ -209,75 +219,6 @@ def quant_weight_w_scale(weight, scale, zp, group_size=-1): return int_weight -def export_compressed_model( - model, - weight_config: Union[str, dict], - enable_full_range=False, - compression_dtype=torch.int32, - compression_dim=1, - scale_dtype=torch.float32, - device="cpu", - use_optimum_format=True, -): - """Convert Linear to WeightOnlyLinear for low memory inference. - - Args: - weight_config (str|dict): qconfig dict or Path of qconfig.json. - enable_full_range (bool, optional): Whether to leverage the full compression range - under symmetric quantization. Defaults to False. - compression_dtype (torch.Tensor, optional): The target dtype after comoression. - Defaults to torch.int32. - compression_dim (int, optional): Select from [0, 1], 0 is output channel, - 1 is input channel. Defaults to 1. - scale_dtype (torch.Tensor, optional): Use float32 or float16. - Defaults to torch.float32. - device (str, optional): choose device for compression. Defaults to cpu. - use_optimum_format (bool, optional): use the popular huggingface compression format. - 1: compression_dim: weight = 1, zeros = 0 and both are transposed. - 2: zeros -= 1 before compression. Why we need it? - 3: g_idx: use same number for one group instead of recording the channel order. - 4. parameter name changed, such as 'packed_weight' -> 'qweight'. - 5. zeros is always needed even for sym. - """ - from .model_wrapper import WeightOnlyLinear - - compressed_model = copy.deepcopy(model) - if isinstance(weight_config, str): - with open(weight_config, "r") as f: - q_config = json.load(f) - else: - q_config = weight_config - for k, v in q_config.items(): - print(f"Compressing {k} on device {device}") - if v["data_type"] == "float": - continue - else: - dtype = v["data_type"] - num_bits = v["bits"] - group_size = v["group_size"] - scheme = v["scheme"] - m = get_module(compressed_model, k) - fp_weight = m.weight.data - scale = torch.tensor(v["scale"], dtype=torch.float32) # may exist dtype dismatch problem - zp = None if scheme == "sym" else torch.tensor(v["zp"], dtype=torch.int32) - int_weight = quant_weight_w_scale(fp_weight, scale, zp, group_size) - int_weight = int_weight.type(torch.int32) - new_module = WeightOnlyLinear( - m.in_features, - m.out_features, - num_bits, - group_size, - dtype=dtype, - zp=zp is not None, - bias=m.bias is not None, - device=device, - use_optimum_format=True, - ) - new_module.pack(int_weight, scale, zp, m.bias) - set_module(compressed_model, k, new_module) - return compressed_model - - def round_ste(x: torch.Tensor): """Straight-Through Estimator for rounding. This function is adapted from omniquant. 
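The group-wise branch above flattens the weight to (-1, group_size), quantizes each group, then reshapes scale and zp back to one column per group of input channels. A small self-contained sketch of that shape bookkeeping (sizes and values are illustrative, not code from the patch):

    import torch

    out_f, in_f, group = 2, 8, 4
    weight = torch.randn(out_f, in_f)
    scale = torch.full((out_f, in_f // group), 0.1)  # one scale per group
    zp = torch.full((out_f, in_f // group), 8.0)     # one zero-point per group
    q = torch.round(weight.reshape(-1, group) / scale.reshape(-1, 1) + zp.reshape(-1, 1))
    q = q.reshape(out_f, in_f)  # restore the original (out_features, in_features) shape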
@@ -1110,7 +1051,7 @@ def __init__( self.tokenizer = tokenizer self.seqlen = seqlen self.train_bs = bs - self.n_samples = bs * (n_samples // bs) + self.n_samples = bs*(n_samples//bs) self.n_blocks = n_blocks self.device = device self.amp_dtype = torch.float16 @@ -1529,7 +1470,7 @@ def quantize(self): end_time = time.time() cost_time = end_time - start_time logger.info(f"quantization runtime {cost_time}") - + return self.model, self.weight_config @@ -1788,3 +1729,4 @@ def __init__( optimizer, **kwargs, ) + From ab12cf14230c5d5f04a45ca894cc8f4a905f595f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Jan 2024 08:27:41 +0000 Subject: [PATCH 08/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../adaptor/torch_utils/autoround/__init__.py | 1 - .../torch_utils/autoround/autoround.py | 35 ++++++------------- 2 files changed, 10 insertions(+), 26 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/autoround/__init__.py b/neural_compressor/adaptor/torch_utils/autoround/__init__.py index c1c06c56b38..060e02e45c9 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/__init__.py +++ b/neural_compressor/adaptor/torch_utils/autoround/__init__.py @@ -13,4 +13,3 @@ # limitations under the License. from .autoround import AutoRound, AutoOPTRound, AutoAdamRound from .export import export_compressed_model - diff --git a/neural_compressor/adaptor/torch_utils/autoround/autoround.py b/neural_compressor/adaptor/torch_utils/autoround/autoround.py index 57e4a24723c..f225f9c332e 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/autoround.py +++ b/neural_compressor/adaptor/torch_utils/autoround/autoround.py @@ -14,22 +14,25 @@ try: from neural_compressor.utils.utility import LazyImport + torch = LazyImport("torch") from neural_compressor.utils import logger except: # pragma: no cover import logging + import torch + logger = logging.getLogger() import copy import time from collections import UserDict from functools import partial + from torch.amp import autocast from torch.functional import F - def quant_weight_asym(weight, num_bits=4, v=0, min_scale=0, max_scale=0): """Quantizes and dequantizes weight asymmetrically. 
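For reference, per-row asymmetric quantization maps each row's [min, max] range onto [0, 2**num_bits - 1]. A simplified sketch of the scheme, omitting the tunable v/min_scale/max_scale terms that appear in the signature above (so not the patch's exact body):

    import torch

    def quant_dequant_asym(weight: torch.Tensor, num_bits: int = 4):
        maxq = 2**num_bits - 1
        wmin = weight.min(dim=1, keepdim=True).values.clamp(max=0)
        wmax = weight.max(dim=1, keepdim=True).values.clamp(min=0)
        scale = ((wmax - wmin) / maxq).clamp(min=1e-5)
        zp = torch.round(-wmin / scale)
        q = torch.clamp(torch.round(weight / scale) + zp, 0, maxq)
        return scale * (q - zp), scale, zp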
@@ -133,29 +136,17 @@ def quant_weight(weight, num_bits=4, group_size=-1, scheme="asym", v=0, min_scal Quantized and dequantized weight, scale, zero-point """ if group_size == -1 or weight.shape[1] < group_size: - return quant_weight_actor( - weight, - num_bits, - scheme=scheme, - v=v, - min_scale=min_scale, - max_scale=max_scale - ) + return quant_weight_actor(weight, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale) orig_shape = weight.shape if weight.shape[1] % group_size == 0: weight = weight.reshape(-1, group_size) if isinstance(v, torch.Tensor): v = v.reshape(-1, group_size) weight, scale, zp = quant_weight_actor( - weight, - num_bits, - scheme=scheme, - v=v, - min_scale=min_scale, - max_scale=max_scale + weight, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale ) weight = weight.reshape(orig_shape) - scale = scale.reshape(orig_shape[0], -1) #TODO validating the feasibility on conv1d + scale = scale.reshape(orig_shape[0], -1) # TODO validating the feasibility on conv1d if zp is not None: zp = zp.reshape(orig_shape[0], -1) return weight, scale, zp @@ -168,12 +159,7 @@ def quant_weight(weight, num_bits=4, group_size=-1, scheme="asym", v=0, min_scal if isinstance(v, torch.Tensor): v = v.reshape(-1, group_size) weight_new, scale, zp = quant_weight_actor( - weight_new, - num_bits, - scheme=scheme, - v=v, - min_scale=min_scale, - max_scale=max_scale + weight_new, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale ) weight_new = weight_new.reshape(orig_shape[0], -1) scale = scale.reshape(orig_shape[0], -1) @@ -1051,7 +1037,7 @@ def __init__( self.tokenizer = tokenizer self.seqlen = seqlen self.train_bs = bs - self.n_samples = bs*(n_samples//bs) + self.n_samples = bs * (n_samples // bs) self.n_blocks = n_blocks self.device = device self.amp_dtype = torch.float16 @@ -1470,7 +1456,7 @@ def quantize(self): end_time = time.time() cost_time = end_time - start_time logger.info(f"quantization runtime {cost_time}") - + return self.model, self.weight_config @@ -1729,4 +1715,3 @@ def __init__( optimizer, **kwargs, ) - From 0794fe091356a130063d50cfc56f5f3e7bec7110 Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Mon, 15 Jan 2024 16:45:00 +0800 Subject: [PATCH 09/14] add export Signed-off-by: Zhang, Weiwei1 --- .../adaptor/torch_utils/autoround/export.py | 80 +++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 neural_compressor/adaptor/torch_utils/autoround/export.py diff --git a/neural_compressor/adaptor/torch_utils/autoround/export.py b/neural_compressor/adaptor/torch_utils/autoround/export.py new file mode 100644 index 00000000000..0fc027e3245 --- /dev/null +++ b/neural_compressor/adaptor/torch_utils/autoround/export.py @@ -0,0 +1,80 @@ + +import copy +from typing import Union +import json +try: + from neural_compressor.utils.utility import LazyImport + torch = LazyImport("torch") + from neural_compressor.utils import logger +except: # pragma: no cover + import torch + import logging + logger = logging.getLogger() + +def export_compressed_model( + model, + weight_config:Union[str, dict], + enable_full_range=False, + compression_dtype=torch.int32, + compression_dim=1, + scale_dtype=torch.float32, + device="cpu", + use_optimum_format=True, + ): + """Convert Linear to WeightOnlyLinear for low memory inference. + + Args: + weight_config (str|dict): qconfig dict or Path of qconfig.json. + enable_full_range (bool, optional): Whether to leverage the full compression range + under symmetric quantization. 
Defaults to False. + compression_dtype (torch.Tensor, optional): The target dtype after comoression. + Defaults to torch.int32. + compression_dim (int, optional): Select from [0, 1], 0 is output channel, + 1 is input channel. Defaults to 1. + scale_dtype (torch.Tensor, optional): Use float32 or float16. + Defaults to torch.float32. + device (str, optional): choose device for compression. Defaults to cpu. + use_optimum_format (bool, optional): use the popular huggingface compression format. + 1: compression_dim: weight = 1, zeros = 0 and both are transposed. + 2: zeros -= 1 before compression. Why we need it? + 3: g_idx: use same number for one group instead of recording the channel order. + 4. parameter name changed, such as 'packed_weight' -> 'qweight'. + 5. zeros is always needed even for sym. + """ + from .model_wrapper import WeightOnlyLinear + from .autoround import quant_weight_w_scale, get_module, set_module + compressed_model = copy.deepcopy(model) + if isinstance(weight_config, str): + with open(weight_config, "r") as f: + q_config = json.load(f) + else: + q_config = weight_config + for k, v in q_config.items(): + logger.info(f"Compressing {k} on device {device}") + if v["data_type"] == "float": + continue + else: + dtype = v["data_type"] + num_bits = v["bits"] + group_size = v["group_size"] + scheme = v["scheme"] + m = get_module(compressed_model, k) + fp_weight = m.weight.data + scale = torch.tensor(v["scale"], dtype=torch.float32) # may exist dtype dismatch problem + zp = None if scheme == "sym" else torch.tensor(v["zp"], dtype=torch.int32) + int_weight = quant_weight_w_scale(fp_weight, scale, zp, group_size) + int_weight = int_weight.type(torch.int32) + new_module = WeightOnlyLinear( + m.in_features, + m.out_features, + num_bits, + group_size, + dtype=dtype, + zp=zp is not None, + bias=m.bias is not None, + device=device, + use_optimum_format=True, + ) + new_module.pack(int_weight, scale, zp, m.bias) + set_module(compressed_model, k, new_module) + return compressed_model From bbe009a0356af8dc4995db754e06481488001b73 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 15 Jan 2024 08:47:29 +0000 Subject: [PATCH 10/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../adaptor/torch_utils/autoround/export.py | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/autoround/export.py b/neural_compressor/adaptor/torch_utils/autoround/export.py index 0fc027e3245..d47ed343022 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/export.py +++ b/neural_compressor/adaptor/torch_utils/autoround/export.py @@ -1,26 +1,44 @@ +# Copyright (c) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
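+# Example usage (illustrative; model and tokenizer setup omitted):
+#   q_model, q_config = AutoRound(model, tokenizer, n_samples=128, seqlen=512).quantize()
+#   compressed = export_compressed_model(q_model, q_config, device="cpu")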
import copy -from typing import Union import json +from typing import Union + try: from neural_compressor.utils.utility import LazyImport + torch = LazyImport("torch") from neural_compressor.utils import logger except: # pragma: no cover - import torch import logging + + import torch + logger = logging.getLogger() + def export_compressed_model( - model, - weight_config:Union[str, dict], - enable_full_range=False, - compression_dtype=torch.int32, - compression_dim=1, - scale_dtype=torch.float32, - device="cpu", - use_optimum_format=True, - ): + model, + weight_config: Union[str, dict], + enable_full_range=False, + compression_dtype=torch.int32, + compression_dim=1, + scale_dtype=torch.float32, + device="cpu", + use_optimum_format=True, +): """Convert Linear to WeightOnlyLinear for low memory inference. Args: @@ -41,8 +59,9 @@ def export_compressed_model( 4. parameter name changed, such as 'packed_weight' -> 'qweight'. 5. zeros is always needed even for sym. """ + from .autoround import get_module, quant_weight_w_scale, set_module from .model_wrapper import WeightOnlyLinear - from .autoround import quant_weight_w_scale, get_module, set_module + compressed_model = copy.deepcopy(model) if isinstance(weight_config, str): with open(weight_config, "r") as f: @@ -60,7 +79,7 @@ def export_compressed_model( scheme = v["scheme"] m = get_module(compressed_model, k) fp_weight = m.weight.data - scale = torch.tensor(v["scale"], dtype=torch.float32) # may exist dtype dismatch problem + scale = torch.tensor(v["scale"], dtype=torch.float32) # may exist dtype dismatch problem zp = None if scheme == "sym" else torch.tensor(v["zp"], dtype=torch.int32) int_weight = quant_weight_w_scale(fp_weight, scale, zp, group_size) int_weight = int_weight.type(torch.int32) From 7c482265cf4e84ee17c62c7edb78519217c59dfd Mon Sep 17 00:00:00 2001 From: "Zhang, Weiwei1" Date: Mon, 15 Jan 2024 23:06:39 +0800 Subject: [PATCH 11/14] fixtypo Signed-off-by: Zhang, Weiwei1 --- .../torch_utils/autoround/autoround.py | 36 +++++++++++++------ test/algorithm/test_autoround.py | 10 +++--- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/neural_compressor/adaptor/torch_utils/autoround/autoround.py b/neural_compressor/adaptor/torch_utils/autoround/autoround.py index f225f9c332e..c7119b0258f 100644 --- a/neural_compressor/adaptor/torch_utils/autoround/autoround.py +++ b/neural_compressor/adaptor/torch_utils/autoround/autoround.py @@ -14,25 +14,22 @@ try: from neural_compressor.utils.utility import LazyImport - torch = LazyImport("torch") from neural_compressor.utils import logger except: # pragma: no cover import logging - import torch - logger = logging.getLogger() import copy import time from collections import UserDict from functools import partial - from torch.amp import autocast from torch.functional import F + def quant_weight_asym(weight, num_bits=4, v=0, min_scale=0, max_scale=0): """Quantizes and dequantizes weight asymmetrically. 
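The weight_config consumed by export_compressed_model above maps each layer name to its quantization record. A hypothetical entry for reference: the field names follow this patch's quantize()/export loop, the layer name matches the tiny GPT-J model used in the tests, and the numeric values are purely illustrative:

    q_config = {
        "transformer.h.0.attn.k_proj": {
            "data_type": "int",          # "float" entries are skipped during export
            "bits": 4,
            "group_size": 128,
            "scheme": "asym",            # "sym" entries carry no usable "zp"
            "scale": [[0.011, 0.013]],   # one value per group, per output channel
            "zp": [[8, 7]],
        },
    }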
@@ -136,17 +133,29 @@ def quant_weight(weight, num_bits=4, group_size=-1, scheme="asym", v=0, min_scal
         Quantized and dequantized weight, scale, zero-point
     """
     if group_size == -1 or weight.shape[1] < group_size:
-        return quant_weight_actor(weight, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale)
+        return quant_weight_actor(
+            weight,
+            num_bits,
+            scheme=scheme,
+            v=v,
+            min_scale=min_scale,
+            max_scale=max_scale
+        )
     orig_shape = weight.shape
     if weight.shape[1] % group_size == 0:
         weight = weight.reshape(-1, group_size)
         if isinstance(v, torch.Tensor):
             v = v.reshape(-1, group_size)
         weight, scale, zp = quant_weight_actor(
-            weight, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale
+            weight,
+            num_bits,
+            scheme=scheme,
+            v=v,
+            min_scale=min_scale,
+            max_scale=max_scale
         )
         weight = weight.reshape(orig_shape)
-        scale = scale.reshape(orig_shape[0], -1)  # TODO validating the feasibility on conv1d
+        scale = scale.reshape(orig_shape[0], -1) #TODO validating the feasibility on conv1d
         if zp is not None:
             zp = zp.reshape(orig_shape[0], -1)
         return weight, scale, zp
@@ -159,7 +168,12 @@ def quant_weight(weight, num_bits=4, group_size=-1, scheme="asym", v=0, min_scal
         if isinstance(v, torch.Tensor):
             v = v.reshape(-1, group_size)
         weight_new, scale, zp = quant_weight_actor(
-            weight_new, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale
+            weight_new,
+            num_bits,
+            scheme=scheme,
+            v=v,
+            min_scale=min_scale,
+            max_scale=max_scale
         )
         weight_new = weight_new.reshape(orig_shape[0], -1)
         scale = scale.reshape(orig_shape[0], -1)
@@ -1037,7 +1051,8 @@ def __init__(
         self.tokenizer = tokenizer
         self.seqlen = seqlen
         self.train_bs = bs
-        self.n_samples = bs * (n_samples // bs)
+        self.n_samples = bs*(n_samples//bs)
+        assert self.n_samples > 0, f"Recommend setting an n_samples that is divisible by batch size {self.train_bs}"
         self.n_blocks = n_blocks
         self.device = device
         self.amp_dtype = torch.float16
@@ -1456,7 +1471,7 @@ def quantize(self):
         end_time = time.time()
         cost_time = end_time - start_time
         logger.info(f"quantization runtime {cost_time}")
-
+        
         return self.model, self.weight_config
@@ -1715,3 +1730,4 @@ def __init__(
             optimizer,
             **kwargs,
         )
+

diff --git a/test/algorithm/test_autoround.py b/test/algorithm/test_autoround.py
index 499ce0b282e..5458fefcce9 100644
--- a/test/algorithm/test_autoround.py
+++ b/test/algorithm/test_autoround.py
@@ -2,7 +2,6 @@

 import torch
 from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
-
 from neural_compressor.adaptor.torch_utils.autoround.autoround import AutoAdamRound, AutoOPTRound, AutoRound

@@ -23,12 +22,12 @@ def setUpClass(self):

     @classmethod
     def test_signround(self):
-        round = AutoRound(self.model, self.tokenizer, device="cpu", iters=5, seqlen=8, n_samples=1, group_size=7)
+        round = AutoRound(self.model, self.tokenizer, device="cpu", iters=5, seqlen=8, n_samples=8, group_size=7)
         round.quantize()

     @classmethod
     def test_Adamround(self):
-        round = AutoOPTRound(self.model, self.tokenizer, device="cpu", iters=2, seqlen=8, n_samples=1, scheme="sym")
+        round = AutoOPTRound(self.model, self.tokenizer, device="cpu", iters=2, seqlen=8, n_samples=8, scheme="sym")
         round.quantize()

@@ -49,14 +48,15 @@ def setUpClass(self):

     @classmethod
     def test_signround(self):
-        round = AutoRound(self.model, self.tokenizer, device="cpu", iters=5, seqlen=8, n_samples=1, n_blocks=2)
+        round = AutoRound(self.model, self.tokenizer, device="cpu", iters=5, seqlen=8, n_samples=8, n_blocks=2)
         round.quantize()

     @classmethod
     def test_Adamround(self):
-        round = AutoAdamRound(self.model, self.tokenizer, device="cpu", iters=5, seqlen=8, n_samples=1)
+        round = AutoAdamRound(self.model, self.tokenizer, device="cpu", iters=5, seqlen=8, n_samples=8)
         round.quantize()

 if __name__ == "__main__":
     unittest.main()
+

From 3276e5bae275626ad1ee21dba0c1af3f0d8f291a Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 15 Jan 2024 15:08:02 +0000
Subject: [PATCH 12/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../torch_utils/autoround/autoround.py        | 35 ++++++-------------
 test/algorithm/test_autoround.py              |  2 +-
 2 files changed, 11 insertions(+), 26 deletions(-)

diff --git a/neural_compressor/adaptor/torch_utils/autoround/autoround.py b/neural_compressor/adaptor/torch_utils/autoround/autoround.py
index c7119b0258f..0c9b55906a1 100644
--- a/neural_compressor/adaptor/torch_utils/autoround/autoround.py
+++ b/neural_compressor/adaptor/torch_utils/autoround/autoround.py
@@ -14,22 +14,25 @@
 try:
     from neural_compressor.utils.utility import LazyImport
+
     torch = LazyImport("torch")
     from neural_compressor.utils import logger
 except:  # pragma: no cover
     import logging
+
     import torch
+
     logger = logging.getLogger()

 import copy
 import time
 from collections import UserDict
 from functools import partial
+
 from torch.amp import autocast
 from torch.functional import F

-
 def quant_weight_asym(weight, num_bits=4, v=0, min_scale=0, max_scale=0):
     """Quantizes and dequantizes weight asymmetrically.

@@ -133,29 +136,17 @@ def quant_weight(weight, num_bits=4, group_size=-1, scheme="asym", v=0, min_scal
         Quantized and dequantized weight, scale, zero-point
     """
     if group_size == -1 or weight.shape[1] < group_size:
-        return quant_weight_actor(
-            weight,
-            num_bits,
-            scheme=scheme,
-            v=v,
-            min_scale=min_scale,
-            max_scale=max_scale
-        )
+        return quant_weight_actor(weight, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale)
     orig_shape = weight.shape
     if weight.shape[1] % group_size == 0:
         weight = weight.reshape(-1, group_size)
         if isinstance(v, torch.Tensor):
             v = v.reshape(-1, group_size)
         weight, scale, zp = quant_weight_actor(
-            weight,
-            num_bits,
-            scheme=scheme,
-            v=v,
-            min_scale=min_scale,
-            max_scale=max_scale
+            weight, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale
         )
         weight = weight.reshape(orig_shape)
-        scale = scale.reshape(orig_shape[0], -1) #TODO validating the feasibility on conv1d
+        scale = scale.reshape(orig_shape[0], -1)  # TODO validating the feasibility on conv1d
         if zp is not None:
             zp = zp.reshape(orig_shape[0], -1)
         return weight, scale, zp
@@ -168,12 +159,7 @@ def quant_weight(weight, num_bits=4, group_size=-1, scheme="asym", v=0, min_scal
         if isinstance(v, torch.Tensor):
             v = v.reshape(-1, group_size)
         weight_new, scale, zp = quant_weight_actor(
-            weight_new,
-            num_bits,
-            scheme=scheme,
-            v=v,
-            min_scale=min_scale,
-            max_scale=max_scale
+            weight_new, num_bits, scheme=scheme, v=v, min_scale=min_scale, max_scale=max_scale
         )
         weight_new = weight_new.reshape(orig_shape[0], -1)
         scale = scale.reshape(orig_shape[0], -1)
@@ -1051,7 +1037,7 @@ def __init__(
         self.tokenizer = tokenizer
         self.seqlen = seqlen
         self.train_bs = bs
-        self.n_samples = bs*(n_samples//bs)
+        self.n_samples = bs * (n_samples // bs)
         assert self.n_samples > 0, f"Recommend setting an n_samples that is divisible by batch size {self.train_bs}"
         self.n_blocks = n_blocks
         self.device = device
@@ -1471,7 +1457,7 @@ def quantize(self):
         end_time = time.time()
         cost_time = end_time - start_time
         logger.info(f"quantization runtime {cost_time}")
-        
+
         return self.model, self.weight_config
@@ -1730,4 +1716,3 @@ def __init__(
             optimizer,
             **kwargs,
         )
-

diff --git a/test/algorithm/test_autoround.py b/test/algorithm/test_autoround.py
index 5458fefcce9..dc78186af08 100644
--- a/test/algorithm/test_autoround.py
+++ b/test/algorithm/test_autoround.py
@@ -2,6 +2,7 @@

 import torch
 from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
+
 from neural_compressor.adaptor.torch_utils.autoround.autoround import AutoAdamRound, AutoOPTRound, AutoRound

@@ -59,4 +60,3 @@ def test_Adamround(self):

 if __name__ == "__main__":
     unittest.main()
-

From 97773ee5552a9a64e8084993c15573044b2d5c78 Mon Sep 17 00:00:00 2001
From: "Zhang, Weiwei1"
Date: Tue, 16 Jan 2024 09:13:41 +0800
Subject: [PATCH 13/14] change device type

Signed-off-by: Zhang, Weiwei1
---
 .../adaptor/pytorch_adaptor/test_autoround.py | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/test/adaptor/pytorch_adaptor/test_autoround.py b/test/adaptor/pytorch_adaptor/test_autoround.py
index 415c16bf688..b92e1dfb973 100644
--- a/test/adaptor/pytorch_adaptor/test_autoround.py
+++ b/test/adaptor/pytorch_adaptor/test_autoround.py
@@ -1,19 +1,15 @@
 import copy
 import os
 import shutil
-import sys
 import unittest

 import torch
 import transformers
 from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from neural_compressor.adaptor.torch_utils.autoround import (
-    AutoAdamRound,
-    AutoOPTRound,
-    AutoRound,
-    export_compressed_model,
-)
+from neural_compressor.adaptor.torch_utils.autoround import (AutoRound,
+                                                             AutoOPTRound,
+                                                             AutoAdamRound,
+                                                             export_compressed_model)


 class SimpleDataLoader:
@@ -45,7 +41,8 @@ def setUpClass(self):
             torchscript=True,
         )
         self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-            "hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True
+            "hf-internal-testing/tiny-random-GPTJForCausalLM",
+            trust_remote_code=True
         )
         self.gptj_no_jit = transformers.AutoModelForCausalLM.from_pretrained(
             "hf-internal-testing/tiny-random-GPTJForCausalLM",
@@ -62,7 +59,7 @@ def test_RTN_int_quant(self):
         model = copy.deepcopy(self.gptj)
         out1 = model(self.lm_input)
         round = AutoRound
-        optq_1 = round(model, self.tokenizer, n_samples=20, amp=False, seqlen=10)
+        optq_1 = round(model, self.tokenizer, n_samples=20, device="cpu", amp=False, seqlen=10, iters=50)
         q_model, weight_config1 = optq_1.quantize()
         compressed_model = export_compressed_model(q_model, weight_config1)
         out2 = model(self.lm_input)
@@ -71,18 +68,21 @@ def test_RTN_int_quant(self):
         self.assertFalse(torch.all(out1[0] == out2[0]))
         self.assertTrue(torch.all(torch.isclose(out2[0], out3[0], atol=1e-3)))
         self.assertTrue("transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys())
-
+        
         # model = copy.deepcopy(self.gptj)
         # out6 = model(self.lm_input)
         # optq_2 = round(model, self.tokenizer, n_samples=20, amp=False, seqlen=10)
         # q_model, weight_config2 = optq_2.quantize()
         # out4 = q_model(self.lm_input)
         # out5 = model(self.lm_input)
-
+        
         # self.assertTrue(torch.all(out1[0] == out6[0]))
         # self.assertTrue(torch.all(out4[0] == out5[0]))
         # self.assertTrue(torch.all(torch.isclose(out6[0], out5[0], atol=1e-1)))
-
-
+        
+        
+        
 if __name__ == "__main__":
     unittest.main()
+
+
From c5e848ec7a569e1ff0bee4f0c6d18c933eb29af2 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 16 Jan 2024 01:17:44 +0000
Subject: [PATCH 14/14] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 .../adaptor/pytorch_adaptor/test_autoround.py | 25 +++++++++----------
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/test/adaptor/pytorch_adaptor/test_autoround.py b/test/adaptor/pytorch_adaptor/test_autoround.py
index b92e1dfb973..2081729f7ad 100644
--- a/test/adaptor/pytorch_adaptor/test_autoround.py
+++ b/test/adaptor/pytorch_adaptor/test_autoround.py
@@ -6,10 +6,13 @@
 import torch
 import transformers
 from transformers import AutoModelForCausalLM, AutoTokenizer
-from neural_compressor.adaptor.torch_utils.autoround import (AutoRound,
-                                                             AutoOPTRound,
-                                                             AutoAdamRound,
-                                                             export_compressed_model)
+
+from neural_compressor.adaptor.torch_utils.autoround import (
+    AutoAdamRound,
+    AutoOPTRound,
+    AutoRound,
+    export_compressed_model,
+)


 class SimpleDataLoader:
@@ -41,8 +44,7 @@ def setUpClass(self):
             torchscript=True,
         )
         self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-            "hf-internal-testing/tiny-random-GPTJForCausalLM",
-            trust_remote_code=True
+            "hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True
         )
         self.gptj_no_jit = transformers.AutoModelForCausalLM.from_pretrained(
             "hf-internal-testing/tiny-random-GPTJForCausalLM",
@@ -68,21 +70,18 @@ def test_RTN_int_quant(self):
         self.assertFalse(torch.all(out1[0] == out2[0]))
         self.assertTrue(torch.all(torch.isclose(out2[0], out3[0], atol=1e-3)))
         self.assertTrue("transformer.h.0.attn.k_proj.qzeros" in compressed_model.state_dict().keys())
-        
+
         # model = copy.deepcopy(self.gptj)
         # out6 = model(self.lm_input)
         # optq_2 = round(model, self.tokenizer, n_samples=20, amp=False, seqlen=10)
         # q_model, weight_config2 = optq_2.quantize()
         # out4 = q_model(self.lm_input)
         # out5 = model(self.lm_input)
-        
+
         # self.assertTrue(torch.all(out1[0] == out6[0]))
         # self.assertTrue(torch.all(out4[0] == out5[0]))
         # self.assertTrue(torch.all(torch.isclose(out6[0], out5[0], atol=1e-1)))
-        
-        
-        
-if __name__ == "__main__":
-    unittest.main()
+if __name__ == "__main__":
+    unittest.main()
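Taken together, the series settles on a calibrate-quantize-export flow. The sketch below is a minimal reconstruction of that flow based on the tests above; the checkpoint name and argument values mirror test_autoround.py, but the snippet itself is an illustration, not part of any patch:

import transformers

from neural_compressor.adaptor.torch_utils.autoround import AutoRound, export_compressed_model

# Tiny GPT-J checkpoint used by the tests; any causal LM should work the same way.
model = transformers.AutoModelForCausalLM.from_pretrained(
    "hf-internal-testing/tiny-random-GPTJForCausalLM"
)
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "hf-internal-testing/tiny-random-GPTJForCausalLM", trust_remote_code=True
)

# Calibrate and tune the rounding; quantize() now returns the model together
# with its per-layer q_config (bits, group_size, scheme, scale, zp).
optq = AutoRound(model, tokenizer, device="cpu", n_samples=20, seqlen=10, iters=50, amp=False)
q_model, weight_config = optq.quantize()

# Swap each quantized Linear for a packed WeightOnlyLinear; optimum-format
# parameter names such as "transformer.h.0.attn.k_proj.qweight"/".qzeros"
# then appear in the compressed state_dict.
compressed_model = export_compressed_model(q_model, weight_config)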