From f0cc13150a4656a82020c9966163120df7defe41 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 29 Oct 2025 03:01:53 -0400 Subject: [PATCH 01/13] enhance auto device map and support XPU Signed-off-by: He, Xin3 --- auto_round/utils/device.py | 169 +++++++++++++++++++++++++------------ 1 file changed, 113 insertions(+), 56 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 850c95343..21e375408 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -458,34 +458,64 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): return False, seqlen, bs -def estimate_tuning_block_mem(block: torch.nn.Module, input_ids: list[torch.Tensor]) -> tuple[float, float]: +def estimate_tuning_block_mem( + block: torch.nn.Module, input_ids: list[torch.Tensor], pick_samples: int +) -> tuple[dict, float]: """ Calculates the memory consumption of a specific block in the model. Args: block (torch.nn.Module): The block of the model to analyze. input_ids (list[torch.Tensor]): A list of input tensors for the block. + pick_samples (int): Number of samples to consider for memory estimation. Returns: tuple: A tuple containing the following: - - block_memory (float): The memory consumption (in GB) of the block's linear layers. + - layer_memory_dict (dict): A dictionary mapping layer names to their memory consumption (in GB). + Format: {layer_name: {"param_memory": float, "output_memory": float}} + SDPA layers are represented with a fixed 1GB output memory. - input_output_memory (float): The memory consumption (in GB) for input and output tensors of the block. """ - # Calculate all block parameters memory - from auto_round.utils.model import check_to_quantized + # Calculate all block parameters memory and build layer-wise memory dict + from auto_round.utils.model import get_layer_features + layer_memory_dict = {} total_param_mem = 0 + + # Calculate batch_size and sequence_length from input_ids for output memory estimation + seq_len = input_ids[0].shape[1] if input_ids and len(input_ids[0].shape) >= 2 else 1 + element_size = input_ids[0].element_size() if input_ids else 2 # Default to 2 bytes (fp16/bf16) + for name, module in block.named_modules(): if check_to_quantized(module): + layer_name = name param_size = module.weight.nbytes total_param_mem += param_size - block_memory = total_param_mem / 1024**3 # Convert to GB + param_memory_gb = param_size / 1024**3 + + # Estimate output memory based on input_features and out_features + in_features, out_features = get_layer_features(module) + if in_features is not None and out_features is not None: + # Output tensor size: batch_size * seq_len * out_features * element_size + output_size = pick_samples * seq_len * out_features * element_size + output_memory_gb = output_size / 1024**3 + else: + output_memory_gb = 0.0 + + # memory * 2, because it contains grad tensor. + layer_memory_dict[layer_name] = {"param_memory": param_memory_gb * 2, "output_memory": output_memory_gb * 2} # Assuming bfloat16 or float32, input and output input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 + if torch.xpu.is_available(): + # https://github.com/intel/torch-xpu-ops/issues/2232 + # sdpa on XPU takes more memory than expected. 
+ additional_memory = 12 + else: + additional_memory = 1 # sdpa usage and loss calculation usage - return block_memory, input_output_memory + return layer_memory_dict, input_output_memory, additional_memory def out_of_vram(error_msg): @@ -538,7 +568,7 @@ def get_device_memory(i: int = 0) -> int: if torch.cuda.is_available(): total_memory = bytes_to_gigabytes(torch.cuda.get_device_properties(i).total_memory) elif torch.xpu.is_available(): - raise RuntimeError("XPU does not support device_map='auto' currently.") + total_memory = bytes_to_gigabytes(torch.xpu.get_device_properties(i).total_memory) else: raise RuntimeError("No supported device found (CUDA or XPU).") return total_memory @@ -629,7 +659,11 @@ def set_non_auto_device_map( def set_auto_device_map_for_block_with_tuning( - block: torch.nn.Module, device_map, input_ids: list[torch.Tensor], low_gpu_mem_usage=False, mem_per_param_scale=13.0 + block: torch.nn.Module, + device_map, + input_ids: list[torch.Tensor], + low_gpu_mem_usage=False, + pick_samples=8, ): """ Automatically sets the device map for the block based on available GPUs and memory constraints. @@ -639,9 +673,7 @@ def set_auto_device_map_for_block_with_tuning( device_map (str | int | dict): Specifies the device mapping. input_ids (list[torch.Tensor]): List of input tensors used for estimating memory requirements. low_gpu_mem_usage (bool, optional): If True, ignoring input/output memory. Defaults to False. - mem_per_param_scale (float, optional): Scaling factor for estimating memory usage per parameter in the block. - Typical values range from 10.0 to 20.0 depending on model size and GPU memory characteristics. - Higher values are more conservative and help avoid out-of-memory errors. Defaults to 13.0. + pick_samples (int, optional): Number of samples to consider for memory estimation. Defaults to 8. Returns: None @@ -654,10 +686,11 @@ def set_auto_device_map_for_block_with_tuning( The mem_per_param_scale parameter should be adjusted based on empirical memory usage observations. 
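
        A rough sketch of the budget arithmetic applied below (all numbers and the
        variable names are invented for illustration: two 80 GB cards, 3 GB of
        doubled layer weights, 0.5 GB of cached block inputs/outputs, 1 GB of
        per-layer outputs kept on card 0 and 1 GB of additional overhead):

            card_0_left   = 80 - 0.5 - 1 - 1               # 77.5 GB
            available     = card_0_left + 80               # 157.5 GB across both cards
            mem_per_param = available / 3.0                # 52.5x budget per GB of weights
            layer_budget  = layer_param_gb * mem_per_param

        Layers are then handed out against layer_budget, filling card 0 first and
        spilling onto the next card once its budget is exhausted.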
""" if torch.cuda.is_available(): - num_gpus = torch.cuda.device_count() + num_devices = torch.cuda.device_count() + device_name = "cuda" elif torch.xpu.is_available(): - logger.warning_once("XPU does not support auto device map yet, using device 0 for tuning.") - return + num_devices = torch.xpu.device_count() + device_name = "xpu" else: raise RuntimeError("No CUDA or XPU devices found.") device_list = None @@ -665,52 +698,75 @@ def set_auto_device_map_for_block_with_tuning( device_list = [int(dev) for dev in device_map.split(",") if dev.isdigit()] if device_list: - cuda_devices = [f"cuda:{i}" for i in device_list] - device_0 = cuda_devices[0] + cuda_xpu_devices = [f"{device_name}:{i}" for i in device_list] + device_0 = cuda_xpu_devices[0] else: - cuda_devices = [f"cuda:{i}" for i in range(num_gpus)] - device_0 = "cuda:0" + cuda_xpu_devices = [f"{device_name}:{i}" for i in range(num_devices)] + device_0 = f"{device_name}:0" device_0_memory = get_device_memory(device_list[0] if device_list else 0) - block_memory, input_output_memory = estimate_tuning_block_mem(block, input_ids) + layer_memory_dict, input_output_memory, additional_memory = estimate_tuning_block_mem( + block, input_ids, pick_samples + ) if low_gpu_mem_usage: input_output_memory = 0 - if (block_memory * mem_per_param_scale + input_output_memory) < device_0_memory: - return # fit in one GPU - + # Calculate total block memory from layer memory dict (including both param and output memory) + total_block_param_memory = sum(info["param_memory"] for info in layer_memory_dict.values()) + total_block_output_memory = sum(info["output_memory"] for info in layer_memory_dict.values()) + + # Average dispatch strategy + # card_0_left_memory = card_0_mem - input_output_memory - additional_memory - layer_outputs_memory + card_0_left_memory = device_0_memory - input_output_memory - additional_memory - total_block_output_memory + + # Calculate total available memory across all devices + total_available_memory = card_0_left_memory + for i in range(1, len(cuda_xpu_devices)): + device_idx = device_list[i] if device_list else i + total_available_memory += get_device_memory(device_idx) + + # Calculate total params (in GB, considering param_memory only for calculation) + total_params = total_block_param_memory + mem_per_param = total_available_memory / total_params + + # Initialize device memory tracking + device_memory = {} + device_memory[device_0] = card_0_left_memory + for i in range(1, len(cuda_xpu_devices)): + device_idx = device_list[i] if device_list else i + device_memory[cuda_xpu_devices[i]] = get_device_memory(device_idx) + + # Dispatch layers to devices based on mem_per_param + # Use devices in order, switch to next device when current one is full device_map = {} - device_memory = {device: get_device_memory(int(device.split(":")[1])) for device in cuda_devices} - device_memory[device_0] = device_0_memory - input_output_memory - - device_idx = 0 names = [] - # First, fill device 0 to its maximum capacity, then distribute the remaining layers evenly across other devices - for n, m in block.named_modules(): - if check_to_quantized(m): - layer_name = m.tmp_name - names.append(layer_name) - layer_memory = m.weight.nbytes / 1024**3 - if device_idx == 0 and layer_memory * mem_per_param_scale < device_memory[cuda_devices[device_idx]]: - device_map[layer_name] = cuda_devices[device_idx] - device_memory[cuda_devices[device_idx]] -= layer_memory * mem_per_param_scale - elif device_idx == 0: - device_idx += 1 # Move to the next device once device 0 
is full - device_map[layer_name] = cuda_devices[device_idx] - device_memory[cuda_devices[device_idx]] -= layer_memory * mem_per_param_scale - else: - # Calculate the target device index based on even distribution - sorted_devices = sorted(cuda_devices, key=lambda d: device_memory[d], reverse=True) - device_idx = sorted_devices[0] - if layer_memory * mem_per_param_scale < device_memory[device_idx]: - device_map[layer_name] = device_idx - device_memory[device_idx] -= layer_memory * mem_per_param_scale - else: - logger.warning_once( - f"Block {block.tmp_name} not fit in available GPU memory. " - "Consider using more GPUs or reducing mem_per_param_scale if OOM occurs." - ) + current_device_idx = 0 + current_device = cuda_xpu_devices[current_device_idx] + + for layer_name, mem_info in layer_memory_dict.items(): + names.append(layer_name) + # Calculate estimated memory for this layer + layer_param_memory = mem_info["param_memory"] + + # All layer outputs are on card_0, so all cards only need to store parameters + estimated_memory = layer_param_memory * mem_per_param + + # Try to fit in current device + if estimated_memory <= device_memory[current_device]: + device_map[layer_name] = current_device + device_memory[current_device] -= estimated_memory + else: + # Current device is full, try to switch to next device + if current_device_idx < len(cuda_xpu_devices) - 1: + current_device_idx += 1 + current_device = cuda_xpu_devices[current_device_idx] + + # Place on current device (either new device or last device) + device_map[layer_name] = current_device + device_memory[current_device] -= estimated_memory + + print(device_map) set_non_auto_device_map(block, device_map, names) @@ -789,16 +845,17 @@ def set_avg_auto_device_map(model: torch.nn.Module, device_map): else: if torch.cuda.is_available(): num_devices = torch.cuda.device_count() + device_name = "cuda" elif torch.xpu.is_available(): - logger.warning_once("XPU does not support auto device map yet, using device 0 for tuning.") - return + num_devices = torch.xpu.device_count() + device_name = "xpu" else: return if device_list: - cuda_devices = [f"cuda:{i}" for i in device_list] + cuda_xpu_devices = [f"{device_name}:{i}" for i in device_list] else: - cuda_devices = [f"cuda:{i}" for i in range(num_devices)] + cuda_xpu_devices = [f"{device_name}:{i}" for i in range(num_devices)] for block_names in block_name_list: for block_name in block_names: @@ -814,7 +871,7 @@ def set_avg_auto_device_map(model: torch.nn.Module, device_map): device_index = 0 for res in res_list: for key in res.keys(): - set_tuning_device_for_layer(block_module, key, cuda_devices[device_index]) + set_tuning_device_for_layer(block_module, key, cuda_xpu_devices[device_index]) device_index += 1 From a582d4666f56fa90bcd7986034790ff9d1b76028 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 29 Oct 2025 03:08:59 -0400 Subject: [PATCH 02/13] remove mem_per_param_scale Signed-off-by: He, Xin3 --- auto_round/__main__.py | 9 --------- auto_round/compressors/base.py | 14 ++------------ auto_round/compressors/config.py | 5 ----- auto_round/utils/device.py | 1 - 4 files changed, 2 insertions(+), 27 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index c403ee863..2b68475e8 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -157,14 +157,6 @@ def __init__(self, *args, **kwargs): type=float, help="Learning rate specifically for min-max tuning. " "If None, uses the same value as --lr. 
", ) - tuning.add_argument( - "--mem_per_param_scale", - default=13, - type=float, - help="Memory scaling factor for parameter memory estimation. " - "Adjust this if you need to control memory usage during tuning. " - "Lower values reduce memory usage but may affect accuracy.", - ) tuning.add_argument( "--gradient_accumulate_steps", default=1, @@ -522,7 +514,6 @@ def tune(args): enable_deterministic_algorithms=args.enable_deterministic_algorithms, lr=args.lr, minmax_lr=args.minmax_lr, - mem_per_param_scale=args.mem_per_param_scale, nblocks=args.nblocks, to_quant_block_names=args.to_quant_block_names, scale_dtype=args.scale_dtype, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c148164ae..921a5763f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -230,8 +230,6 @@ def __init__( enable_deterministic_algorithms = kwargs.pop("enable_deterministic_algorithms", False) static_kv_dtype = kwargs.pop("static_kv_dtype", None) device = kwargs.pop("device", None) - # Scale factor for RAM usage per parameter. - mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) self.quant_lm_head = kwargs.pop("quant_lm_head", False) self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False @@ -332,10 +330,6 @@ def __init__( self.optimizer = self._get_optimizer(None) self.disable_opt_rtn = disable_opt_rtn self.is_packing_immediate = False # whether to pack the layer immediately after tuning - if mem_per_param_scale is None: - self.mem_per_param_scale = 13 if self.iters != 0 else 1 - else: - self.mem_per_param_scale = mem_per_param_scale # KV cache, this one does not affect tuning but will collect some infos during tuning self.static_kv_dtype = static_kv_dtype @@ -1428,9 +1422,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) convert_fp8_model_to_16b_model(block, dtype=self.amp_dtype) if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): - set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.mem_per_param_scale - ) + set_auto_device_map_for_block_with_tuning(block, self.device_map, input_ids, self.low_gpu_mem_usage) # Dispatch model if needed if self.device_map is not None: from accelerate.hooks import AlignDevicesHook, add_hook_to_module @@ -2444,9 +2436,7 @@ def _quantize_block( set_module(block, n, new_layer) if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): - set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.mem_per_param_scale - ) + set_auto_device_map_for_block_with_tuning(block, self.device_map, input_ids, self.low_gpu_mem_usage) if self.device_map is not None: for n, m in block.named_modules(): diff --git a/auto_round/compressors/config.py b/auto_round/compressors/config.py index d42e13427..b2bb61409 100644 --- a/auto_round/compressors/config.py +++ b/auto_round/compressors/config.py @@ -41,7 +41,6 @@ def __init__( lr: float = None, lr_scheduler: Callable = None, minmax_lr: float = None, - mem_per_param_scale: int = None, nblocks: int = 1, to_quant_block_names: Union[str, list, None] = None, scale_dtype: str = "fp16", @@ -84,8 +83,6 @@ def __init__( lr (float): The learning rate (default is 0.005). lr_scheduler: The learning rate scheduler to be used. minmax_lr (float): The learning rate for min-max tuning (default is None). 
- mem_per_param_scale (int): Scale factor for memory per parameter, - used to adjust memory usage estimation for tuning. nblocks (int): Number of blocks (default is 1). quant_lm_head (bool): Whether to quant lm_head. to_quant_block_names (str|list): Names of quantitative blocks, please use commas to separate them. @@ -124,7 +121,6 @@ def __init__( lr=lr, lr_scheduler=lr_scheduler, minmax_lr=minmax_lr, - mem_per_param_scale=mem_per_param_scale, nblocks=nblocks, to_quant_block_names=to_quant_block_names, scale_dtype=scale_dtype, @@ -260,7 +256,6 @@ class TuningExtraConfig(BaseExtraConfig): lr: float = None lr_scheduler: Callable = None minmax_lr: float = None - mem_per_param_scale: int = None nblocks: int = 1 to_quant_block_names: Union[str, list, None] = None scale_dtype: str = "fp16" diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 21e375408..821cba15a 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -683,7 +683,6 @@ def set_auto_device_map_for_block_with_tuning( Note: This function is intended for internal use in device memory management and tuning. - The mem_per_param_scale parameter should be adjusted based on empirical memory usage observations. """ if torch.cuda.is_available(): num_devices = torch.cuda.device_count() From 7c598baf47a2906e88ac42686b226f7adc4f86ac Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 29 Oct 2025 03:52:11 -0400 Subject: [PATCH 03/13] consider enable_act_quant and optimize device map logic Signed-off-by: He, Xin3 --- auto_round/utils/device.py | 201 +++++++++++++++++++++++++++++++------ 1 file changed, 171 insertions(+), 30 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 821cba15a..5ad711477 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -489,6 +489,7 @@ def estimate_tuning_block_mem( for name, module in block.named_modules(): if check_to_quantized(module): + enable_act_quant = module.act_bits <= 8 layer_name = name param_size = module.weight.nbytes total_param_mem += param_size @@ -500,6 +501,12 @@ def estimate_tuning_block_mem( # Output tensor size: batch_size * seq_len * out_features * element_size output_size = pick_samples * seq_len * out_features * element_size output_memory_gb = output_size / 1024**3 + + # If enable_act_quant, add input tensor memory to param_memory + if enable_act_quant: + input_size = pick_samples * seq_len * in_features * element_size + input_memory_gb = input_size / 1024**3 + param_memory_gb += input_memory_gb else: output_memory_gb = 0.0 @@ -658,6 +665,168 @@ def set_non_auto_device_map( logger.warning(f"{key} in `device_map` dose not match any modules, please have a check") +def _allocate_layers_to_devices( + layer_memory_dict: dict, device_memory: dict, cuda_xpu_devices: list, mem_per_param: float +) -> tuple[dict, list]: + """ + Allocates layers to devices using a load-balancing strategy. + + Strategy: + 1. Sort layers by memory size (descending) to prioritize large operations + 2. Allocate largest layers to later devices first (to keep device 0 free for I/O) + 3. For each layer, find the best device considering: + - Sufficient remaining memory (preferred) + - Continuity (prefer same device as neighboring layers in original model order) + - Load balancing (minimize wasted space) + 4. 
Fallback to device with most remaining space if all devices are over capacity + + Args: + layer_memory_dict (dict): Mapping of layer names to their memory info (order preserved) + Format: {layer_name: {"param_memory": float, "output_memory": float}} + device_memory (dict): Available memory for each device (will be modified) + Format: {device_name: available_memory_gb} + cuda_xpu_devices (list): List of available device names (e.g., ["cuda:0", "cuda:1"]) + mem_per_param (float): Memory multiplier per parameter GB + + Returns: + tuple[dict, list]: + - device_map: Mapping of layer names to assigned devices + - names: List of layer names in processing order + + Examples: + Example - Distribution with 3 devices: + Given layers [huge: 20GB, big: 15GB, large: 10GB, medium: 5GB, small: 2GB] + and 3 devices [device 0, device 1, device 2]: + - 'huge' → tries device 2 first (last device, preferred for largest layer) + - 'big' → tries device 1 (second-to-last, preferred for second largest) + - 'large' → tries device 0, but prioritizes neighbor's device if applicable + - 'medium' and 'small' → assigned based on remaining capacity and neighbors + + Result: Device 0 has lightest load, suitable for handling I/O overhead + """ + device_map = {} + names = [] + + # Build layer order map from original dict order (preserves model structure) + layer_names_in_order = list(layer_memory_dict.keys()) + layer_order = {name: idx for idx, name in enumerate(layer_names_in_order)} + + # Sort layers by memory size (descending) to handle large layers first + # This prevents large layers from being stuck without suitable devices later + sorted_layers = sorted(layer_memory_dict.items(), key=lambda x: x[1]["param_memory"], reverse=True) + + # Track assigned layers to avoid duplicates + assigned_layers = set() + + # Track preferred starting device for large layers (start from last device) + num_devices = len(cuda_xpu_devices) + preferred_device_idx = num_devices - 1 # Start from last device + + # Process each layer in sorted order (large to small) + for layer_idx, (layer_name, mem_info) in enumerate(sorted_layers): + if layer_name in assigned_layers: + continue + + names.append(layer_name) + layer_param_memory = mem_info["param_memory"] + estimated_memory = layer_param_memory * mem_per_param + + # Find neighboring layers in original model order for continuity evaluation + current_layer_idx = layer_order[layer_name] + neighbor_devices = set() + + # Check previous and next layers in original order + for offset in [-1, 1]: + neighbor_idx = current_layer_idx + offset + if 0 <= neighbor_idx < len(layer_names_in_order): + neighbor_name = layer_names_in_order[neighbor_idx] + if neighbor_name in device_map: + neighbor_devices.add(device_map[neighbor_name]) + + # Phase 1: Try to find a device with sufficient space + best_device = None + best_device_idx = None + min_score = float("inf") + + for dev_idx in range(num_devices): + dev = cuda_xpu_devices[dev_idx] + remaining = device_memory[dev] - estimated_memory + + # Only consider devices with enough space + if remaining >= 0: + # Continuity bonus: strongly prefer device used by neighboring layers + # This keeps adjacent layers in the model on the same device, reducing communication + if dev in neighbor_devices: + continuity_bonus = -2000 # Very strong preference for neighbor's device + else: + continuity_bonus = 0 + + # For large layers (early in sorted order), prefer later devices + # This keeps device 0 lighter as it handles I/O overhead + if layer_idx < num_devices and dev_idx >= 
preferred_device_idx: + large_layer_bonus = -500 # Moderate preference for later devices + else: + large_layer_bonus = 0 + + # Score = remaining memory waste + continuity penalty + large layer penalty (lower is better) + score = abs(remaining) + continuity_bonus + large_layer_bonus + if score < min_score: + min_score = score + best_device = dev + best_device_idx = dev_idx + + # Phase 2: Fallback - if no device has enough space, prefer neighbor's device + if best_device is None: + if neighbor_devices: + # Prefer neighbor's device even if over capacity + for dev in neighbor_devices: + if best_device is None or device_memory[dev] > device_memory[best_device]: + best_device = dev + best_device_idx = cuda_xpu_devices.index(dev) + else: + # No neighbors assigned yet, prefer later devices for large layers + if layer_idx < num_devices: + # Try from last device backwards + max_remaining = float("-inf") + for dev_idx in range(num_devices - 1, -1, -1): + dev = cuda_xpu_devices[dev_idx] + remaining = device_memory[dev] - estimated_memory + if remaining > max_remaining: + max_remaining = remaining + best_device = dev + best_device_idx = dev_idx + else: + # For smaller layers, use device with most remaining space + max_remaining = float("-inf") + for dev_idx in range(num_devices): + dev = cuda_xpu_devices[dev_idx] + remaining = device_memory[dev] - estimated_memory + if remaining > max_remaining: + max_remaining = remaining + best_device = dev + best_device_idx = dev_idx + + # Phase 3: Final safety fallback - use last device if still None + # Use last device to keep device 0 lighter + if best_device is None: + best_device = cuda_xpu_devices[-1] + best_device_idx = num_devices - 1 + + # Assign layer to the selected device + device_map[layer_name] = best_device + device_memory[best_device] -= estimated_memory + assigned_layers.add(layer_name) + + # Update preferred device index for next large layer + # Move backwards through devices to distribute large layers + if layer_idx < num_devices and preferred_device_idx > 0: + preferred_device_idx -= 1 + + # Restore device_map to original layer order for printing + ordered_device_map = {name: device_map[name] for name in layer_memory_dict.keys() if name in device_map} + return ordered_device_map, names + + def set_auto_device_map_for_block_with_tuning( block: torch.nn.Module, device_map, @@ -735,36 +904,8 @@ def set_auto_device_map_for_block_with_tuning( device_idx = device_list[i] if device_list else i device_memory[cuda_xpu_devices[i]] = get_device_memory(device_idx) - # Dispatch layers to devices based on mem_per_param - # Use devices in order, switch to next device when current one is full - device_map = {} - names = [] - - current_device_idx = 0 - current_device = cuda_xpu_devices[current_device_idx] - - for layer_name, mem_info in layer_memory_dict.items(): - names.append(layer_name) - # Calculate estimated memory for this layer - layer_param_memory = mem_info["param_memory"] - - # All layer outputs are on card_0, so all cards only need to store parameters - estimated_memory = layer_param_memory * mem_per_param - - # Try to fit in current device - if estimated_memory <= device_memory[current_device]: - device_map[layer_name] = current_device - device_memory[current_device] -= estimated_memory - else: - # Current device is full, try to switch to next device - if current_device_idx < len(cuda_xpu_devices) - 1: - current_device_idx += 1 - current_device = cuda_xpu_devices[current_device_idx] - - # Place on current device (either new device or last device) - 
device_map[layer_name] = current_device - device_memory[current_device] -= estimated_memory - + # Allocate layers to devices using load-balancing strategy + device_map, names = _allocate_layers_to_devices(layer_memory_dict, device_memory, cuda_xpu_devices, mem_per_param) print(device_map) set_non_auto_device_map(block, device_map, names) From 6508b74acdbda91b790eb14b385f0d8d220dd085 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 29 Oct 2025 04:45:13 -0400 Subject: [PATCH 04/13] clear_memory for XPU Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 5 +- auto_round/utils/device.py | 159 ++++++++++----------------------- 2 files changed, 52 insertions(+), 112 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 921a5763f..c395bef68 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2478,9 +2478,8 @@ def _quantize_block( if q_input is not None: if input_ids is not q_input: clear_memory(input_ids) - else: - clear_memory() input_ids = q_input + clear_memory() quantized_layer_names, unquantized_layer_names = wrapper_block( block, @@ -2567,6 +2566,7 @@ def _quantize_block( current_output = to_device(current_output, device) output_q = self._get_current_q_output(block, input_ids, input_others, indices, device) + clear_memory() # clean cached memory after getting output_q if self.attention_mask: tmp_attention_mask = [self.attention_mask[i] for i in indices] tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) @@ -2586,6 +2586,7 @@ def _quantize_block( total_loss += loss.item() / num_elm self._scale_loss_and_backward(scaler, loss) + clear_memory() # clean cached memory after backward if i == 0: init_loss = total_loss diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 5ad711477..3a71ec377 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -517,8 +517,8 @@ def estimate_tuning_block_mem( input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 if torch.xpu.is_available(): # https://github.com/intel/torch-xpu-ops/issues/2232 - # sdpa on XPU takes more memory than expected. - additional_memory = 12 + # sdpa on XPU takes more memory than expected. 2 from grad tensor + additional_memory = 9 * 2 + 1 else: additional_memory = 1 # sdpa usage and loss calculation usage @@ -674,68 +674,52 @@ def _allocate_layers_to_devices( Strategy: 1. Sort layers by memory size (descending) to prioritize large operations 2. Allocate largest layers to later devices first (to keep device 0 free for I/O) - 3. For each layer, find the best device considering: - - Sufficient remaining memory (preferred) - - Continuity (prefer same device as neighboring layers in original model order) - - Load balancing (minimize wasted space) - 4. Fallback to device with most remaining space if all devices are over capacity + 3. 
For each layer, prefer: neighbor's device > devices with more space > later devices for large layers Args: - layer_memory_dict (dict): Mapping of layer names to their memory info (order preserved) - Format: {layer_name: {"param_memory": float, "output_memory": float}} + layer_memory_dict (dict): Mapping of layer names to memory info (order preserved) device_memory (dict): Available memory for each device (will be modified) - Format: {device_name: available_memory_gb} - cuda_xpu_devices (list): List of available device names (e.g., ["cuda:0", "cuda:1"]) + cuda_xpu_devices (list): List of device names (e.g., ["cuda:0", "cuda:1"]) mem_per_param (float): Memory multiplier per parameter GB Returns: - tuple[dict, list]: - - device_map: Mapping of layer names to assigned devices - - names: List of layer names in processing order - - Examples: - Example - Distribution with 3 devices: - Given layers [huge: 20GB, big: 15GB, large: 10GB, medium: 5GB, small: 2GB] - and 3 devices [device 0, device 1, device 2]: - - 'huge' → tries device 2 first (last device, preferred for largest layer) - - 'big' → tries device 1 (second-to-last, preferred for second largest) - - 'large' → tries device 0, but prioritizes neighbor's device if applicable - - 'medium' and 'small' → assigned based on remaining capacity and neighbors - - Result: Device 0 has lightest load, suitable for handling I/O overhead + tuple[dict, list]: (device_map, names) + + Example (LLaMA block with 7 layers on 3 devices): + Input: + # cuda:0 already occupied by input/output tensors and activations + device_memory = {"cuda:0": 15.0, "cuda:1": 40.0, "cuda:2": 40.0} + cuda_xpu_devices = ["cuda:0", "cuda:1", "cuda:2"] + mem_per_param = 2.0 + + Processing order (sorted by param_memory, descending): + 1. gate_proj (11.0 * 2 = 22.0 GB) -> cuda:2 (largest layer, prefer last device) + 2. up_proj (11.0 * 2 = 22.0 GB) -> cuda:1 (largest layer, prefer 2nd last device) + 3. down_proj (11.0 * 2 = 22.0 GB) -> cuda:1 (cuda:0 only has 15GB, insufficient) + 4. q_proj (4.0 * 2 = 8.0 GB) -> cuda:2 (neighbor of gate_proj, continuity bonus) + 5. o_proj (4.0 * 2 = 8.0 GB) -> cuda:2 (neighbor of q_proj, continuity bonus) + 6. k_proj (1.0 * 2 = 2.0 GB) -> cuda:1 (neighbor of q_proj, continuity bonus) + 7. 
v_proj (1.0 * 2 = 2.0 GB) -> cuda:1 (neighbor of k_proj, continuity bonus) + """ device_map = {} names = [] - # Build layer order map from original dict order (preserves model structure) layer_names_in_order = list(layer_memory_dict.keys()) layer_order = {name: idx for idx, name in enumerate(layer_names_in_order)} - # Sort layers by memory size (descending) to handle large layers first - # This prevents large layers from being stuck without suitable devices later sorted_layers = sorted(layer_memory_dict.items(), key=lambda x: x[1]["param_memory"], reverse=True) - # Track assigned layers to avoid duplicates - assigned_layers = set() - - # Track preferred starting device for large layers (start from last device) num_devices = len(cuda_xpu_devices) - preferred_device_idx = num_devices - 1 # Start from last device + preferred_device_idx = num_devices - 1 # Start from last device for large layers - # Process each layer in sorted order (large to small) for layer_idx, (layer_name, mem_info) in enumerate(sorted_layers): - if layer_name in assigned_layers: - continue - names.append(layer_name) - layer_param_memory = mem_info["param_memory"] - estimated_memory = layer_param_memory * mem_per_param + estimated_memory = mem_info["param_memory"] * mem_per_param - # Find neighboring layers in original model order for continuity evaluation + # Find neighbor devices current_layer_idx = layer_order[layer_name] neighbor_devices = set() - - # Check previous and next layers in original order for offset in [-1, 1]: neighbor_idx = current_layer_idx + offset if 0 <= neighbor_idx < len(layer_names_in_order): @@ -743,87 +727,42 @@ def _allocate_layers_to_devices( if neighbor_name in device_map: neighbor_devices.add(device_map[neighbor_name]) - # Phase 1: Try to find a device with sufficient space + # Find best device best_device = None - best_device_idx = None - min_score = float("inf") + best_score = float("inf") - for dev_idx in range(num_devices): - dev = cuda_xpu_devices[dev_idx] + for dev_idx, dev in enumerate(cuda_xpu_devices): remaining = device_memory[dev] - estimated_memory + if remaining < 0: + continue # Skip devices without enough space - # Only consider devices with enough space - if remaining >= 0: - # Continuity bonus: strongly prefer device used by neighboring layers - # This keeps adjacent layers in the model on the same device, reducing communication - if dev in neighbor_devices: - continuity_bonus = -2000 # Very strong preference for neighbor's device - else: - continuity_bonus = 0 - - # For large layers (early in sorted order), prefer later devices - # This keeps device 0 lighter as it handles I/O overhead - if layer_idx < num_devices and dev_idx >= preferred_device_idx: - large_layer_bonus = -500 # Moderate preference for later devices - else: - large_layer_bonus = 0 - - # Score = remaining memory waste + continuity penalty + large layer penalty (lower is better) - score = abs(remaining) + continuity_bonus + large_layer_bonus - if score < min_score: - min_score = score - best_device = dev - best_device_idx = dev_idx - - # Phase 2: Fallback - if no device has enough space, prefer neighbor's device + # Score components (lower is better) + continuity_bonus = -2000 if dev in neighbor_devices else 0 + large_layer_bonus = -500 if layer_idx < num_devices and dev_idx >= preferred_device_idx else 0 + load_balance_penalty = -remaining # More space = lower penalty + + score = load_balance_penalty + continuity_bonus + large_layer_bonus + if score < best_score: + best_score = score + best_device = dev + + # 
Fallback: if no device has space, use neighbor's or last device if best_device is None: if neighbor_devices: - # Prefer neighbor's device even if over capacity - for dev in neighbor_devices: - if best_device is None or device_memory[dev] > device_memory[best_device]: - best_device = dev - best_device_idx = cuda_xpu_devices.index(dev) + best_device = max(neighbor_devices, key=lambda d: device_memory[d]) else: - # No neighbors assigned yet, prefer later devices for large layers - if layer_idx < num_devices: - # Try from last device backwards - max_remaining = float("-inf") - for dev_idx in range(num_devices - 1, -1, -1): - dev = cuda_xpu_devices[dev_idx] - remaining = device_memory[dev] - estimated_memory - if remaining > max_remaining: - max_remaining = remaining - best_device = dev - best_device_idx = dev_idx - else: - # For smaller layers, use device with most remaining space - max_remaining = float("-inf") - for dev_idx in range(num_devices): - dev = cuda_xpu_devices[dev_idx] - remaining = device_memory[dev] - estimated_memory - if remaining > max_remaining: - max_remaining = remaining - best_device = dev - best_device_idx = dev_idx - - # Phase 3: Final safety fallback - use last device if still None - # Use last device to keep device 0 lighter - if best_device is None: - best_device = cuda_xpu_devices[-1] - best_device_idx = num_devices - 1 + best_device = max(cuda_xpu_devices, key=lambda d: device_memory[d]) - # Assign layer to the selected device + # Assign layer device_map[layer_name] = best_device device_memory[best_device] -= estimated_memory - assigned_layers.add(layer_name) - # Update preferred device index for next large layer - # Move backwards through devices to distribute large layers + # Update preferred device for next large layer if layer_idx < num_devices and preferred_device_idx > 0: preferred_device_idx -= 1 - # Restore device_map to original layer order for printing - ordered_device_map = {name: device_map[name] for name in layer_memory_dict.keys() if name in device_map} + # Restore original order + ordered_device_map = {name: device_map[name] for name in layer_names_in_order if name in device_map} return ordered_device_map, names @@ -906,7 +845,7 @@ def set_auto_device_map_for_block_with_tuning( # Allocate layers to devices using load-balancing strategy device_map, names = _allocate_layers_to_devices(layer_memory_dict, device_memory, cuda_xpu_devices, mem_per_param) - print(device_map) + set_non_auto_device_map(block, device_map, names) From cd5f68516c50363ecf5ba38873d82ec03e844514 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 29 Oct 2025 06:02:15 -0400 Subject: [PATCH 05/13] refine device map logic Signed-off-by: He, Xin3 --- auto_round/utils/device.py | 121 ++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 63 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 3a71ec377..46ce76dd0 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -672,9 +672,9 @@ def _allocate_layers_to_devices( Allocates layers to devices using a load-balancing strategy. Strategy: - 1. Sort layers by memory size (descending) to prioritize large operations - 2. Allocate largest layers to later devices first (to keep device 0 free for I/O) - 3. For each layer, prefer: neighbor's device > devices with more space > later devices for large layers + 1. Sort layers by memory size (descending), preserve order for equal sizes + 2. Assign largest N layers to higher-index devices (N = num_devices) + 3. 
Remaining layers use memory availability + layer continuity scorings Args: layer_memory_dict (dict): Mapping of layer names to memory info (order preserved) @@ -685,82 +685,76 @@ def _allocate_layers_to_devices( Returns: tuple[dict, list]: (device_map, names) - Example (LLaMA block with 7 layers on 3 devices): + Example: Input: - # cuda:0 already occupied by input/output tensors and activations - device_memory = {"cuda:0": 15.0, "cuda:1": 40.0, "cuda:2": 40.0} - cuda_xpu_devices = ["cuda:0", "cuda:1", "cuda:2"] + device_memory = {"cuda:0": 30.0, "cuda:1": 40.0, "cuda:2": 40.0} + layer_memory_dict = { + "q_proj": {"param_memory": 4.0}, "k_proj": {"param_memory": 1.0}, + "v_proj": {"param_memory": 1.0}, "o_proj": {"param_memory": 4.0}, + "gate_proj": {"param_memory": 11.0}, "up_proj": {"param_memory": 11.0}, + "down_proj": {"param_memory": 11.0} + } mem_per_param = 2.0 - Processing order (sorted by param_memory, descending): - 1. gate_proj (11.0 * 2 = 22.0 GB) -> cuda:2 (largest layer, prefer last device) - 2. up_proj (11.0 * 2 = 22.0 GB) -> cuda:1 (largest layer, prefer 2nd last device) - 3. down_proj (11.0 * 2 = 22.0 GB) -> cuda:1 (cuda:0 only has 15GB, insufficient) - 4. q_proj (4.0 * 2 = 8.0 GB) -> cuda:2 (neighbor of gate_proj, continuity bonus) - 5. o_proj (4.0 * 2 = 8.0 GB) -> cuda:2 (neighbor of q_proj, continuity bonus) - 6. k_proj (1.0 * 2 = 2.0 GB) -> cuda:1 (neighbor of q_proj, continuity bonus) - 7. v_proj (1.0 * 2 = 2.0 GB) -> cuda:1 (neighbor of k_proj, continuity bonus) - + Result (allocation order by size): + 1. gate_proj (22GB) -> cuda:2 (largest, prefer last device) + 2. up_proj (22GB) -> cuda:1 (2nd largest, prefer 2nd last device) + 3. down_proj (22GB) -> cuda:0 (3rd largest, cuda:0 has 30GB available) + 4. q_proj (8GB) -> cuda:2 (neighbor of gate_proj, continuity bonus) + 5. o_proj (8GB) -> cuda:2 (neighbor of q_proj, continuity bonus) + 6. k_proj (2GB) -> cuda:1 (neighbor of q_proj via original order) + 7. 
v_proj (2GB) -> cuda:1 (neighbor of k_proj, continuity bonus) """ device_map = {} names = [] - layer_names_in_order = list(layer_memory_dict.keys()) layer_order = {name: idx for idx, name in enumerate(layer_names_in_order)} - - sorted_layers = sorted(layer_memory_dict.items(), key=lambda x: x[1]["param_memory"], reverse=True) - + sorted_layers = sorted(layer_memory_dict.items(), key=lambda x: (-x[1]["param_memory"], -layer_order[x[0]])) num_devices = len(cuda_xpu_devices) - preferred_device_idx = num_devices - 1 # Start from last device for large layers + def find_best_device(layer_name, estimated_memory, layer_idx): + """Find the best device for a layer.""" + # Phase 1: Direct assign largest layers to higher-index devices first + if layer_idx < num_devices - 1: + return cuda_xpu_devices[-(layer_idx + 1)] + + # Phase 2: Choose device with best score (memory + continuity) + best_device = None + best_score = float("-inf") + current_layer_order = layer_order[layer_name] + + for device in cuda_xpu_devices: + if device_memory[device] < estimated_memory: + continue + + # Memory score (normalized) + memory_score = device_memory[device] / estimated_memory + + # Continuity bonus for adjacent layers + continuity_bonus = 0 + for offset in [-1, 1]: # Check previous and next neighbors + neighbor_idx = current_layer_order + offset + if 0 <= neighbor_idx < len(layer_names_in_order): + neighbor_name = layer_names_in_order[neighbor_idx] + if neighbor_name in device_map and device_map[neighbor_name] == device: + continuity_bonus += 1.0 + + total_score = memory_score + continuity_bonus + if total_score > best_score: + best_score = total_score + best_device = device + + # Fallback: device with most available memory + return best_device or max(cuda_xpu_devices, key=lambda d: device_memory[d]) + + # Allocate layers for layer_idx, (layer_name, mem_info) in enumerate(sorted_layers): names.append(layer_name) estimated_memory = mem_info["param_memory"] * mem_per_param - - # Find neighbor devices - current_layer_idx = layer_order[layer_name] - neighbor_devices = set() - for offset in [-1, 1]: - neighbor_idx = current_layer_idx + offset - if 0 <= neighbor_idx < len(layer_names_in_order): - neighbor_name = layer_names_in_order[neighbor_idx] - if neighbor_name in device_map: - neighbor_devices.add(device_map[neighbor_name]) - - # Find best device - best_device = None - best_score = float("inf") - - for dev_idx, dev in enumerate(cuda_xpu_devices): - remaining = device_memory[dev] - estimated_memory - if remaining < 0: - continue # Skip devices without enough space - - # Score components (lower is better) - continuity_bonus = -2000 if dev in neighbor_devices else 0 - large_layer_bonus = -500 if layer_idx < num_devices and dev_idx >= preferred_device_idx else 0 - load_balance_penalty = -remaining # More space = lower penalty - - score = load_balance_penalty + continuity_bonus + large_layer_bonus - if score < best_score: - best_score = score - best_device = dev - - # Fallback: if no device has space, use neighbor's or last device - if best_device is None: - if neighbor_devices: - best_device = max(neighbor_devices, key=lambda d: device_memory[d]) - else: - best_device = max(cuda_xpu_devices, key=lambda d: device_memory[d]) - - # Assign layer + best_device = find_best_device(layer_name, estimated_memory, layer_idx) device_map[layer_name] = best_device device_memory[best_device] -= estimated_memory - # Update preferred device for next large layer - if layer_idx < num_devices and preferred_device_idx > 0: - 
preferred_device_idx -= 1 - # Restore original order ordered_device_map = {name: device_map[name] for name in layer_names_in_order if name in device_map} return ordered_device_map, names @@ -845,6 +839,7 @@ def set_auto_device_map_for_block_with_tuning( # Allocate layers to devices using load-balancing strategy device_map, names = _allocate_layers_to_devices(layer_memory_dict, device_memory, cuda_xpu_devices, mem_per_param) + logger.debug(f"Auto device map for block: {device_map}") set_non_auto_device_map(block, device_map, names) From 649ffee7c078e4c9cb66667c779d7600709e1779 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 29 Oct 2025 22:16:58 -0400 Subject: [PATCH 06/13] update per review comments Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 24 +++++++++++--------- auto_round/utils/device.py | 41 ++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c395bef68..0f21850f3 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -311,6 +311,7 @@ def __init__( self.low_gpu_mem_usage = low_gpu_mem_usage self.seqlen = seqlen self.batch_size, self.gradient_accumulate_steps = batch_size, gradient_accumulate_steps + self.pick_samples = self.batch_size * self.gradient_accumulate_steps self.nblocks = nblocks self.dataset = dataset self.iters = iters @@ -1422,7 +1423,9 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) convert_fp8_model_to_16b_model(block, dtype=self.amp_dtype) if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): - set_auto_device_map_for_block_with_tuning(block, self.device_map, input_ids, self.low_gpu_mem_usage) + set_auto_device_map_for_block_with_tuning( + block, self.device_map, input_ids, self.low_gpu_mem_usage, self.pick_samples + ) # Dispatch model if needed if self.device_map is not None: from accelerate.hooks import AlignDevicesHook, add_hook_to_module @@ -2247,10 +2250,10 @@ def _quantize_layer( init_loss = None gradient_accumulate_steps = self.batch_size # Force to low gpu batch_size = 1 # Force to low gpu - pick_samples = batch_size * gradient_accumulate_steps - pick_samples = min(nsamples, pick_samples) + self.pick_samples = batch_size * gradient_accumulate_steps + self.pick_samples = min(nsamples, self.pick_samples) if self.sampler != "rand": - whole_indices = torch.randperm(nsamples)[:pick_samples] + whole_indices = torch.randperm(nsamples)[: self.pick_samples] total_loss = 0 num_elm = 1 mse_reduction = "mean" @@ -2261,7 +2264,7 @@ def _quantize_layer( for i in range(self.iters): total_loss = 0 if self.sampler == "rand": - whole_indices = torch.randperm(nsamples)[:pick_samples] + whole_indices = torch.randperm(nsamples)[: self.pick_samples] if gradient_accumulate_steps != 1: if q_inputs is not None: num_elm = self._get_current_num_elm(q_inputs, whole_indices) @@ -2436,7 +2439,9 @@ def _quantize_block( set_module(block, n, new_layer) if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): - set_auto_device_map_for_block_with_tuning(block, self.device_map, input_ids, self.low_gpu_mem_usage) + set_auto_device_map_for_block_with_tuning( + block, self.device_map, input_ids, self.low_gpu_mem_usage, self.pick_samples + ) if self.device_map is not None: for n, m in block.named_modules(): @@ -2535,10 +2540,9 @@ def _quantize_block( else: nsamples = len(input_ids) - pick_samples = self.batch_size * 
self.gradient_accumulate_steps - pick_samples = min(nsamples, pick_samples) + self.pick_samples = min(nsamples, self.pick_samples) if self.sampler != "rand": - whole_indices = torch.randperm(nsamples)[:pick_samples] + whole_indices = torch.randperm(nsamples)[: self.pick_samples] last_best_iter = 0 best_loss = torch.finfo(torch.float).max num_elm = 1 @@ -2553,7 +2557,7 @@ def _quantize_block( for i in range(self.iters): total_loss = 0 if self.sampler == "rand": - whole_indices = torch.randperm(nsamples)[:pick_samples] + whole_indices = torch.randperm(nsamples)[: self.pick_samples] # We assume the block input and output shape is same if self.gradient_accumulate_steps != 1: num_elm = self._get_current_num_elm(input_ids, whole_indices) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 46ce76dd0..4551020e9 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -494,6 +494,7 @@ def estimate_tuning_block_mem( param_size = module.weight.nbytes total_param_mem += param_size param_memory_gb = param_size / 1024**3 + param_memory_gb *= 2 # considering the v tensor for weight rounding # Estimate output memory based on input_features and out_features in_features, out_features = get_layer_features(module) @@ -515,12 +516,14 @@ def estimate_tuning_block_mem( # Assuming bfloat16 or float32, input and output input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 + + # considering sdpa (attention activation) memory and reference_output memory for loss calculation + additional_memory = 1 if torch.xpu.is_available(): # https://github.com/intel/torch-xpu-ops/issues/2232 # sdpa on XPU takes more memory than expected. 2 from grad tensor - additional_memory = 9 * 2 + 1 - else: - additional_memory = 1 # sdpa usage and loss calculation usage + xpu_sdpa_additional_memory = 9 # GB + additional_memory += xpu_sdpa_additional_memory * 2 return layer_memory_dict, input_output_memory, additional_memory @@ -666,7 +669,7 @@ def set_non_auto_device_map( def _allocate_layers_to_devices( - layer_memory_dict: dict, device_memory: dict, cuda_xpu_devices: list, mem_per_param: float + layer_memory_dict: dict, device_memory: dict, gpu_devices: list, mem_per_param: float ) -> tuple[dict, list]: """ Allocates layers to devices using a load-balancing strategy. 
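
For readers skimming the diff, the per-layer device choice above boils down to the
condensed sketch below; pick_device and its arguments are illustrative names only,
and the real function additionally pins the largest layers straight onto the
higher-index devices before this scoring kicks in.

def pick_device(free_gb: dict, need_gb: float, neighbor_devs: set) -> str:
    best, best_score = None, float("-inf")
    for dev, free in free_gb.items():
        if free < need_gb:
            continue  # skip devices that cannot hold the layer at all
        score = free / need_gb + (1.0 if dev in neighbor_devs else 0.0)
        if score > best_score:
            best, best_score = dev, score
    # fall back to the emptiest device when nothing has enough room
    return best if best is not None else max(free_gb, key=free_gb.get)

# pick_device({"xpu:0": 10.0, "xpu:1": 30.0}, 8.0, {"xpu:0"}) -> "xpu:1"
# (memory score 3.75 beats 1.25 + 1.0 continuity bonus on xpu:0)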
@@ -679,7 +682,7 @@ def _allocate_layers_to_devices( Args: layer_memory_dict (dict): Mapping of layer names to memory info (order preserved) device_memory (dict): Available memory for each device (will be modified) - cuda_xpu_devices (list): List of device names (e.g., ["cuda:0", "cuda:1"]) + gpu_devices (list): List of device names (e.g., ["cuda:0", "cuda:1"]) mem_per_param (float): Memory multiplier per parameter GB Returns: @@ -710,20 +713,20 @@ def _allocate_layers_to_devices( layer_names_in_order = list(layer_memory_dict.keys()) layer_order = {name: idx for idx, name in enumerate(layer_names_in_order)} sorted_layers = sorted(layer_memory_dict.items(), key=lambda x: (-x[1]["param_memory"], -layer_order[x[0]])) - num_devices = len(cuda_xpu_devices) + num_devices = len(gpu_devices) def find_best_device(layer_name, estimated_memory, layer_idx): """Find the best device for a layer.""" # Phase 1: Direct assign largest layers to higher-index devices first if layer_idx < num_devices - 1: - return cuda_xpu_devices[-(layer_idx + 1)] + return gpu_devices[-(layer_idx + 1)] # Phase 2: Choose device with best score (memory + continuity) best_device = None best_score = float("-inf") current_layer_order = layer_order[layer_name] - for device in cuda_xpu_devices: + for device in gpu_devices: if device_memory[device] < estimated_memory: continue @@ -745,7 +748,7 @@ def find_best_device(layer_name, estimated_memory, layer_idx): best_device = device # Fallback: device with most available memory - return best_device or max(cuda_xpu_devices, key=lambda d: device_memory[d]) + return best_device or max(gpu_devices, key=lambda d: device_memory[d]) # Allocate layers for layer_idx, (layer_name, mem_info) in enumerate(sorted_layers): @@ -799,10 +802,10 @@ def set_auto_device_map_for_block_with_tuning( device_list = [int(dev) for dev in device_map.split(",") if dev.isdigit()] if device_list: - cuda_xpu_devices = [f"{device_name}:{i}" for i in device_list] - device_0 = cuda_xpu_devices[0] + gpu_devices = [f"{device_name}:{i}" for i in device_list] + device_0 = gpu_devices[0] else: - cuda_xpu_devices = [f"{device_name}:{i}" for i in range(num_devices)] + gpu_devices = [f"{device_name}:{i}" for i in range(num_devices)] device_0 = f"{device_name}:0" device_0_memory = get_device_memory(device_list[0] if device_list else 0) @@ -822,7 +825,7 @@ def set_auto_device_map_for_block_with_tuning( # Calculate total available memory across all devices total_available_memory = card_0_left_memory - for i in range(1, len(cuda_xpu_devices)): + for i in range(1, len(gpu_devices)): device_idx = device_list[i] if device_list else i total_available_memory += get_device_memory(device_idx) @@ -833,12 +836,12 @@ def set_auto_device_map_for_block_with_tuning( # Initialize device memory tracking device_memory = {} device_memory[device_0] = card_0_left_memory - for i in range(1, len(cuda_xpu_devices)): + for i in range(1, len(gpu_devices)): device_idx = device_list[i] if device_list else i - device_memory[cuda_xpu_devices[i]] = get_device_memory(device_idx) + device_memory[gpu_devices[i]] = get_device_memory(device_idx) # Allocate layers to devices using load-balancing strategy - device_map, names = _allocate_layers_to_devices(layer_memory_dict, device_memory, cuda_xpu_devices, mem_per_param) + device_map, names = _allocate_layers_to_devices(layer_memory_dict, device_memory, gpu_devices, mem_per_param) logger.debug(f"Auto device map for block: {device_map}") set_non_auto_device_map(block, device_map, names) @@ -927,9 +930,9 @@ def 
set_avg_auto_device_map(model: torch.nn.Module, device_map): return if device_list: - cuda_xpu_devices = [f"{device_name}:{i}" for i in device_list] + gpu_devices = [f"{device_name}:{i}" for i in device_list] else: - cuda_xpu_devices = [f"{device_name}:{i}" for i in range(num_devices)] + gpu_devices = [f"{device_name}:{i}" for i in range(num_devices)] for block_names in block_name_list: for block_name in block_names: @@ -945,7 +948,7 @@ def set_avg_auto_device_map(model: torch.nn.Module, device_map): device_index = 0 for res in res_list: for key in res.keys(): - set_tuning_device_for_layer(block_module, key, cuda_xpu_devices[device_index]) + set_tuning_device_for_layer(block_module, key, gpu_devices[device_index]) device_index += 1 From 40d634cb3d7d0672bfb7e5f9490f9012e6dc6e6a Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 30 Oct 2025 01:39:10 -0400 Subject: [PATCH 07/13] clear_memory only xpu and remove block.to(device) Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 32 ++++++++++++++++++-------------- auto_round/utils/device.py | 20 +++++++++++++++----- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 0f21850f3..e234d6802 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -311,7 +311,6 @@ def __init__( self.low_gpu_mem_usage = low_gpu_mem_usage self.seqlen = seqlen self.batch_size, self.gradient_accumulate_steps = batch_size, gradient_accumulate_steps - self.pick_samples = self.batch_size * self.gradient_accumulate_steps self.nblocks = nblocks self.dataset = dataset self.iters = iters @@ -1424,7 +1423,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.pick_samples + block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size ) # Dispatch model if needed if self.device_map is not None: @@ -2250,10 +2249,10 @@ def _quantize_layer( init_loss = None gradient_accumulate_steps = self.batch_size # Force to low gpu batch_size = 1 # Force to low gpu - self.pick_samples = batch_size * gradient_accumulate_steps - self.pick_samples = min(nsamples, self.pick_samples) + pick_samples = batch_size * gradient_accumulate_steps + pick_samples = min(nsamples, pick_samples) if self.sampler != "rand": - whole_indices = torch.randperm(nsamples)[: self.pick_samples] + whole_indices = torch.randperm(nsamples)[:pick_samples] total_loss = 0 num_elm = 1 mse_reduction = "mean" @@ -2264,7 +2263,7 @@ def _quantize_layer( for i in range(self.iters): total_loss = 0 if self.sampler == "rand": - whole_indices = torch.randperm(nsamples)[: self.pick_samples] + whole_indices = torch.randperm(nsamples)[:pick_samples] if gradient_accumulate_steps != 1: if q_inputs is not None: num_elm = self._get_current_num_elm(q_inputs, whole_indices) @@ -2440,7 +2439,7 @@ def _quantize_block( if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.pick_samples + block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size ) if self.device_map is not None: @@ -2483,8 +2482,9 @@ def _quantize_block( if q_input is not None: if input_ids is not q_input: clear_memory(input_ids) + else: + clear_memory() 
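
The tuning loop above leans on clear_memory(); conceptually it is a cache flush of
the following shape (a sketch assuming it wraps the standard allocator hooks; the
project's own helper may do more):

import gc

import torch


def flush_accelerator_cache() -> None:
    """Release cached allocator blocks on whichever accelerator is present."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()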
input_ids = q_input - clear_memory() quantized_layer_names, unquantized_layer_names = wrapper_block( block, @@ -2540,9 +2540,10 @@ def _quantize_block( else: nsamples = len(input_ids) - self.pick_samples = min(nsamples, self.pick_samples) + pick_samples = self.batch_size * self.gradient_accumulate_steps + pick_samples = min(nsamples, pick_samples) if self.sampler != "rand": - whole_indices = torch.randperm(nsamples)[: self.pick_samples] + whole_indices = torch.randperm(nsamples)[:pick_samples] last_best_iter = 0 best_loss = torch.finfo(torch.float).max num_elm = 1 @@ -2557,7 +2558,7 @@ def _quantize_block( for i in range(self.iters): total_loss = 0 if self.sampler == "rand": - whole_indices = torch.randperm(nsamples)[: self.pick_samples] + whole_indices = torch.randperm(nsamples)[:pick_samples] # We assume the block input and output shape is same if self.gradient_accumulate_steps != 1: num_elm = self._get_current_num_elm(input_ids, whole_indices) @@ -2570,7 +2571,7 @@ def _quantize_block( current_output = to_device(current_output, device) output_q = self._get_current_q_output(block, input_ids, input_others, indices, device) - clear_memory() # clean cached memory after getting output_q + if self.attention_mask: tmp_attention_mask = [self.attention_mask[i] for i in indices] tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) @@ -2590,7 +2591,11 @@ def _quantize_block( total_loss += loss.item() / num_elm self._scale_loss_and_backward(scaler, loss) - clear_memory() # clean cached memory after backward + + # Temporary change for 70B model OOM issue on XPU + # TODO: Remove after https://github.com/intel/torch-xpu-ops/issues/2232 is fixed + if torch.xpu.is_available(): + clear_memory() # clean cached memory after backward if i == 0: init_loss = total_loss @@ -2745,7 +2750,6 @@ def _quantize_blocks( modules = [get_module(model, n) for n in names] m = WrapperMultiblock(modules) - m = m.to(device) q_input, input_ids = quantize_block( m, input_ids, diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 4551020e9..b9290b04c 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -473,9 +473,9 @@ def estimate_tuning_block_mem( tuple: A tuple containing the following: - layer_memory_dict (dict): A dictionary mapping layer names to their memory consumption (in GB). Format: {layer_name: {"param_memory": float, "output_memory": float}} - SDPA layers are represented with a fixed 1GB output memory. - input_output_memory (float): The memory consumption (in GB) for input and output tensors of the block. + - additional_memory (float): Additional memory overhead (in GB) for operations like attention. """ # Calculate all block parameters memory and build layer-wise memory dict from auto_round.utils.model import get_layer_features @@ -517,13 +517,13 @@ def estimate_tuning_block_mem( # Assuming bfloat16 or float32, input and output input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 - # considering sdpa (attention activation) memory and reference_output memory for loss calculation + # Considering norm, sdpa, reference_output, etc. additional_memory = 1 if torch.xpu.is_available(): # https://github.com/intel/torch-xpu-ops/issues/2232 - # sdpa on XPU takes more memory than expected. 2 from grad tensor - xpu_sdpa_additional_memory = 9 # GB - additional_memory += xpu_sdpa_additional_memory * 2 + # TODO: XPU takes more memory than expected. 
for llama 8B, it's 9*2 GB + xpu_additional_memory = 9 # GB + additional_memory += xpu_additional_memory * 2 return layer_memory_dict, input_output_memory, additional_memory @@ -846,6 +846,16 @@ def set_auto_device_map_for_block_with_tuning( set_non_auto_device_map(block, device_map, names) + # Ensure all remaining modules with params/buffers are moved to device_0 + # This prevents mixed CPU/GPU execution within the same block + for name, module in block.named_modules(): + if name not in names: # This module wasn't assigned a device + # Check if module has any parameters or buffers + has_params = any(True for _ in module.parameters(recurse=False)) + has_buffers = any(True for _ in module.buffers(recurse=False)) + if has_params or has_buffers: + set_tuning_device_for_layer(block, name, device_0) + def partition_dict_numbers(number_dict, n): """ From 135339dc66f09179e006236694d23467382214e6 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 30 Oct 2025 02:50:26 -0400 Subject: [PATCH 08/13] fix bug and refine additional memory calcu logic Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 2 +- auto_round/utils/device.py | 159 +++++++++++++++++---------------- 2 files changed, 83 insertions(+), 78 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index e234d6802..fa0653327 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2594,7 +2594,7 @@ def _quantize_block( # Temporary change for 70B model OOM issue on XPU # TODO: Remove after https://github.com/intel/torch-xpu-ops/issues/2232 is fixed - if torch.xpu.is_available(): + if torch.xpu.is_available() and self.low_gpu_mem_usage: clear_memory() # clean cached memory after backward if i == 0: diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index b9290b04c..f4c367881 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -458,76 +458,6 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): return False, seqlen, bs -def estimate_tuning_block_mem( - block: torch.nn.Module, input_ids: list[torch.Tensor], pick_samples: int -) -> tuple[dict, float]: - """ - Calculates the memory consumption of a specific block in the model. - - Args: - block (torch.nn.Module): The block of the model to analyze. - input_ids (list[torch.Tensor]): A list of input tensors for the block. - pick_samples (int): Number of samples to consider for memory estimation. - - Returns: - tuple: A tuple containing the following: - - layer_memory_dict (dict): A dictionary mapping layer names to their memory consumption (in GB). - Format: {layer_name: {"param_memory": float, "output_memory": float}} - - input_output_memory (float): The memory consumption (in GB) for input and output - tensors of the block. - - additional_memory (float): Additional memory overhead (in GB) for operations like attention. 
- """ - # Calculate all block parameters memory and build layer-wise memory dict - from auto_round.utils.model import get_layer_features - - layer_memory_dict = {} - total_param_mem = 0 - - # Calculate batch_size and sequence_length from input_ids for output memory estimation - seq_len = input_ids[0].shape[1] if input_ids and len(input_ids[0].shape) >= 2 else 1 - element_size = input_ids[0].element_size() if input_ids else 2 # Default to 2 bytes (fp16/bf16) - - for name, module in block.named_modules(): - if check_to_quantized(module): - enable_act_quant = module.act_bits <= 8 - layer_name = name - param_size = module.weight.nbytes - total_param_mem += param_size - param_memory_gb = param_size / 1024**3 - param_memory_gb *= 2 # considering the v tensor for weight rounding - - # Estimate output memory based on input_features and out_features - in_features, out_features = get_layer_features(module) - if in_features is not None and out_features is not None: - # Output tensor size: batch_size * seq_len * out_features * element_size - output_size = pick_samples * seq_len * out_features * element_size - output_memory_gb = output_size / 1024**3 - - # If enable_act_quant, add input tensor memory to param_memory - if enable_act_quant: - input_size = pick_samples * seq_len * in_features * element_size - input_memory_gb = input_size / 1024**3 - param_memory_gb += input_memory_gb - else: - output_memory_gb = 0.0 - - # memory * 2, because it contains grad tensor. - layer_memory_dict[layer_name] = {"param_memory": param_memory_gb * 2, "output_memory": output_memory_gb * 2} - - # Assuming bfloat16 or float32, input and output - input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 - - # Considering norm, sdpa, reference_output, etc. - additional_memory = 1 - if torch.xpu.is_available(): - # https://github.com/intel/torch-xpu-ops/issues/2232 - # TODO: XPU takes more memory than expected. for llama 8B, it's 9*2 GB - xpu_additional_memory = 9 # GB - additional_memory += xpu_additional_memory * 2 - - return layer_memory_dict, input_output_memory, additional_memory - - def out_of_vram(error_msg): error_msg = str(error_msg) # CUDA @@ -763,6 +693,77 @@ def find_best_device(layer_name, estimated_memory, layer_idx): return ordered_device_map, names +def estimate_tuning_block_mem( + block: torch.nn.Module, input_ids: list[torch.Tensor], pick_samples: int +) -> tuple[dict, float]: + """ + Calculates the memory consumption of a specific block in the model. + + Args: + block (torch.nn.Module): The block of the model to analyze. + input_ids (list[torch.Tensor]): A list of input tensors for the block. + pick_samples (int): Number of samples to consider for memory estimation. + + Returns: + tuple: A tuple containing the following: + - layer_memory_dict (dict): A dictionary mapping layer names to their memory consumption (in GB). + Format: {layer_name: {"param_memory": float, "output_memory": float}} + - input_output_memory (float): The memory consumption (in GB) for input and output + tensors of the block. + - additional_memory (float): Additional memory overhead (in GB) for operations like attention. 
+ """ + # Calculate all block parameters memory and build layer-wise memory dict + from auto_round.utils.model import get_layer_features + + layer_memory_dict = {} + total_param_mem = 0 + + # Calculate batch_size and sequence_length from input_ids for output memory estimation + seq_len = input_ids[0].shape[1] if input_ids and len(input_ids[0].shape) >= 2 else 1 + element_size = input_ids[0].element_size() if input_ids else 2 # Default to 2 bytes (fp16/bf16) + + for name, module in block.named_modules(): + if check_to_quantized(module): + enable_act_quant = module.act_bits <= 8 + layer_name = name + param_size = module.weight.nbytes + param_memory_gb = param_size / 1024**3 + param_memory_gb *= 2 # considering the v tensor for weight rounding + + # Estimate output memory based on input_features and out_features + in_features, out_features = get_layer_features(module) + if in_features is not None and out_features is not None: + # Output tensor size: batch_size * seq_len * out_features * element_size + output_size = pick_samples * seq_len * out_features * element_size + output_memory_gb = output_size / 1024**3 + + # If enable_act_quant, add input tensor memory to param_memory + if enable_act_quant: + input_size = pick_samples * seq_len * in_features * element_size + input_memory_gb = input_size / 1024**3 + param_memory_gb += input_memory_gb + else: + output_memory_gb = 0.0 + + # memory * 2, because it contains grad tensor. + layer_memory_dict[layer_name] = {"param_memory": param_memory_gb * 2, "output_memory": output_memory_gb * 2} + + # Assuming bfloat16 or float32, input and output + block_input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 + + # Roughly estimate additional memory for attention and other operations + additional_activation_memory = sum(info["output_memory"] for info in layer_memory_dict.values()) + # 1GB considers norm weight, sdpa, reference_output, etc. + additional_memory = additional_activation_memory + 1 # GB + if torch.xpu.is_available(): + # https://github.com/intel/torch-xpu-ops/issues/2232 + # TODO: XPU takes more memory than expected. 
for llama 8B, it's about 12 GB + xpu_additional_memory = 12 # GB + additional_memory += xpu_additional_memory + + return layer_memory_dict, block_input_output_memory, additional_memory + + def set_auto_device_map_for_block_with_tuning( block: torch.nn.Module, device_map, @@ -809,19 +810,23 @@ def set_auto_device_map_for_block_with_tuning( device_0 = f"{device_name}:0" device_0_memory = get_device_memory(device_list[0] if device_list else 0) - layer_memory_dict, input_output_memory, additional_memory = estimate_tuning_block_mem( + layer_memory_dict, block_input_output_memory, additional_memory = estimate_tuning_block_mem( block, input_ids, pick_samples ) if low_gpu_mem_usage: - input_output_memory = 0 + block_input_output_memory = 0 # Calculate total block memory from layer memory dict (including both param and output memory) total_block_param_memory = sum(info["param_memory"] for info in layer_memory_dict.values()) total_block_output_memory = sum(info["output_memory"] for info in layer_memory_dict.values()) # Average dispatch strategy - # card_0_left_memory = card_0_mem - input_output_memory - additional_memory - layer_outputs_memory - card_0_left_memory = device_0_memory - input_output_memory - additional_memory - total_block_output_memory + # card_0_left_memory = card_0_mem - block_input_output_memory - additional_memory - layer_outputs_memory + logger.debug("Card 0 used memory details:") + logger.debug(f" Block input output cache memory: {block_input_output_memory} GB") + logger.debug(f" Quantized layer outputs memory: {total_block_output_memory} GB") + logger.debug(f" Additional_memory from other ops: {additional_memory} GB") + card_0_left_memory = device_0_memory - block_input_output_memory - total_block_output_memory - additional_memory # Calculate total available memory across all devices total_available_memory = card_0_left_memory @@ -842,11 +847,11 @@ def set_auto_device_map_for_block_with_tuning( # Allocate layers to devices using load-balancing strategy device_map, names = _allocate_layers_to_devices(layer_memory_dict, device_memory, gpu_devices, mem_per_param) - logger.debug(f"Auto device map for block: {device_map}") + logger.debug(f"Auto device map for block: {device_map}") set_non_auto_device_map(block, device_map, names) - # Ensure all remaining modules with params/buffers are moved to device_0 + # Ensure all remaining modules with parameters/buffers are moved to device_0 # This prevents mixed CPU/GPU execution within the same block for name, module in block.named_modules(): if name not in names: # This module wasn't assigned a device @@ -854,7 +859,7 @@ def set_auto_device_map_for_block_with_tuning( has_params = any(True for _ in module.parameters(recurse=False)) has_buffers = any(True for _ in module.buffers(recurse=False)) if has_params or has_buffers: - set_tuning_device_for_layer(block, name, device_0) + module = module.to(device_0) def partition_dict_numbers(number_dict, n): From 488eec1a498bb8a47a875c1787803f655c9d97aa Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 30 Oct 2025 03:36:15 -0400 Subject: [PATCH 09/13] consider output_device when setting device_map Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 4 ++-- auto_round/utils/device.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index fa0653327..5272e1db3 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2439,7 +2439,7 @@ def _quantize_block( if self.device_map == 
"auto" or (isinstance(self.device_map, str) and "," in self.device_map): set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size + block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, device ) if self.device_map is not None: @@ -2491,7 +2491,7 @@ def _quantize_block( self.enable_minmax_tuning, self.enable_norm_bias_tuning, enable_torch_compile=self.enable_torch_compile, - device=self.device, + device=device, ) if is_nv_fp(self.data_type): # enable qkv and moe structure global_scale fuse from auto_round.data_type.utils import update_fused_layer_global_scales diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index f4c367881..ccad2cef0 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -770,6 +770,7 @@ def set_auto_device_map_for_block_with_tuning( input_ids: list[torch.Tensor], low_gpu_mem_usage=False, pick_samples=8, + output_device=None, ): """ Automatically sets the device map for the block based on available GPUs and memory constraints. @@ -780,6 +781,7 @@ def set_auto_device_map_for_block_with_tuning( input_ids (list[torch.Tensor]): List of input tensors used for estimating memory requirements. low_gpu_mem_usage (bool, optional): If True, ignoring input/output memory. Defaults to False. pick_samples (int, optional): Number of samples to consider for memory estimation. Defaults to 8. + output_device (str | torch.device, optional): Device to move unassigned modules to. Defaults to None. Returns: None @@ -851,15 +853,15 @@ def set_auto_device_map_for_block_with_tuning( logger.debug(f"Auto device map for block: {device_map}") set_non_auto_device_map(block, device_map, names) - # Ensure all remaining modules with parameters/buffers are moved to device_0 - # This prevents mixed CPU/GPU execution within the same block + # Ensure all remaining modules with parameters/buffers are moved to expected device, by default device_0 + output_device = device_0 if output_device is None else output_device for name, module in block.named_modules(): if name not in names: # This module wasn't assigned a device # Check if module has any parameters or buffers has_params = any(True for _ in module.parameters(recurse=False)) has_buffers = any(True for _ in module.buffers(recurse=False)) if has_params or has_buffers: - module = module.to(device_0) + module = module.to(output_device) def partition_dict_numbers(number_dict, n): From 42b1453a10f883b7eb6e175117ecdf3be72bfb7e Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 30 Oct 2025 04:47:05 -0400 Subject: [PATCH 10/13] fix bug Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 5272e1db3..eb8f634f7 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2437,10 +2437,12 @@ def _quantize_block( new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to(device) set_module(block, n, new_layer) - if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): + if self.device_map == "auto" or ((isinstance(self.device_map, str) and "," in self.device_map)): set_auto_device_map_for_block_with_tuning( block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, device ) + else: + block = block.to(device) if self.device_map is not None: for n, m in block.named_modules(): From 194ac2749c6f720ee50c9da4c5ff59a8df18c776 Mon 
Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 31 Oct 2025 03:20:27 -0400 Subject: [PATCH 11/13] clear_memory_if_reached_threshold Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 7 ++-- auto_round/utils/device.py | 63 ++++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index eb8f634f7..330003954 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -91,6 +91,7 @@ unsupported_meta_device, ) from auto_round.utils.device import ( + clear_memory_if_reached_threshold, get_major_device, set_auto_device_map_for_block_with_tuning, set_non_auto_device_map, @@ -2593,11 +2594,7 @@ def _quantize_block( total_loss += loss.item() / num_elm self._scale_loss_and_backward(scaler, loss) - - # Temporary change for 70B model OOM issue on XPU - # TODO: Remove after https://github.com/intel/torch-xpu-ops/issues/2232 is fixed - if torch.xpu.is_available() and self.low_gpu_mem_usage: - clear_memory() # clean cached memory after backward + clear_memory_if_reached_threshold(threshold=0.85) if i == 0: init_loss = total_loss diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index ccad2cef0..f78114410 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -415,6 +415,62 @@ def clear_memory(tensor=None): _clear_memory_for_cpu_and_cuda(tensor) +def clear_memory_if_reached_threshold(threshold=0.85): + """Check all available devices and clear memory if any device is using close to the threshold. + + Args: + threshold (float): Memory usage threshold (default: 0.85 for 85%). + If any device exceeds this percentage, clear_memory() will be called. + + Returns: + bool: True if memory was cleared, False otherwise. + """ + # Check CUDA devices + if torch.cuda.is_available(): + num_devices = torch.cuda.device_count() + for i in range(num_devices): + try: + total_memory = torch.cuda.get_device_properties(i).total_memory + allocated_memory = torch.cuda.memory_reserved(i) + memory_usage_ratio = allocated_memory / total_memory + + if memory_usage_ratio >= threshold: + logger.info( + f"CUDA device {i}: Memory usage {memory_usage_ratio*100:.2f}% " + f"exceeds threshold {threshold*100:.2f}%. Clearing memory..." + ) + clear_memory() + allocated_memory = torch.cuda.memory_reserved(i) + memory_usage_ratio = allocated_memory / total_memory + logger.info(f"Cleared memory. CUDA device {i}: Memory usage {memory_usage_ratio*100:.2f}%") + break + except Exception as e: + logger.warning(f"Failed to check memory for CUDA device {i}: {e}") + + # Check XPU devices if memory not yet cleared + if hasattr(torch, "xpu") and torch.xpu.is_available(): + num_devices = torch.xpu.device_count() + for i in range(num_devices): + try: + total_memory = torch.xpu.get_device_properties(i).total_memory + allocated_memory = torch.xpu.memory_allocated(i) + memory_usage_ratio = allocated_memory / total_memory + + if memory_usage_ratio >= threshold: + logger.info( + f"XPU device {i}: Memory usage {memory_usage_ratio*100:.2f}% " + f"exceeds threshold {threshold*100:.2f}%. Clearing memory..." + ) + clear_memory() + allocated_memory = torch.xpu.memory_reserved(i) + memory_usage_ratio = allocated_memory / total_memory + logger.info(f"Cleared memory. 
XPU device {i}: Memory usage {memory_usage_ratio*100:.2f}%") + break + except Exception as e: + logger.warning(f"Failed to check memory for XPU device {i}: {e}") + return False + + def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): """Checks the availability of memory on the specified device for processing inputs using a given weight tensor. @@ -824,11 +880,14 @@ def set_auto_device_map_for_block_with_tuning( # Average dispatch strategy # card_0_left_memory = card_0_mem - block_input_output_memory - additional_memory - layer_outputs_memory - logger.debug("Card 0 used memory details:") + logger.debug("Card 0 used memory details [Estimated]:") logger.debug(f" Block input output cache memory: {block_input_output_memory} GB") logger.debug(f" Quantized layer outputs memory: {total_block_output_memory} GB") logger.debug(f" Additional_memory from other ops: {additional_memory} GB") - card_0_left_memory = device_0_memory - block_input_output_memory - total_block_output_memory - additional_memory + + card_0_left_memory = max( + 0, device_0_memory - block_input_output_memory - total_block_output_memory - additional_memory + ) # Calculate total available memory across all devices total_available_memory = card_0_left_memory From ddb20ed13eddc1462f317058370fae5e3e12a65c Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 31 Oct 2025 04:40:17 -0400 Subject: [PATCH 12/13] add warning for XPU additional memory Signed-off-by: He, Xin3 --- auto_round/utils/device.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index f78114410..fd0ad29ea 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -816,6 +816,8 @@ def estimate_tuning_block_mem( # TODO: XPU takes more memory than expected. for llama 8B, it's about 12 GB xpu_additional_memory = 12 # GB additional_memory += xpu_additional_memory + logger.warning_once("XPU additional memory usage of SDPA is estimated to be 12 GB.") + logger.warning_once("Remove it after https://github.com/intel/torch-xpu-ops/issues/2232 is fixed.") return layer_memory_dict, block_input_output_memory, additional_memory From 015331b288cbaef331e9caccbfdea5d6f3816345 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 31 Oct 2025 05:44:36 -0400 Subject: [PATCH 13/13] unify code Signed-off-by: He, Xin3 --- auto_round/utils/device.py | 65 ++++++++++++++------------------------ 1 file changed, 24 insertions(+), 41 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index fd0ad29ea..3ea3e63e0 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -425,50 +425,33 @@ def clear_memory_if_reached_threshold(threshold=0.85): Returns: bool: True if memory was cleared, False otherwise. """ - # Check CUDA devices + # Detect CUDA/XPU devices if torch.cuda.is_available(): - num_devices = torch.cuda.device_count() - for i in range(num_devices): - try: - total_memory = torch.cuda.get_device_properties(i).total_memory - allocated_memory = torch.cuda.memory_reserved(i) - memory_usage_ratio = allocated_memory / total_memory + name, device_api = "CUDA", torch.cuda + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + name, device_api = "XPU", torch.xpu + else: + return - if memory_usage_ratio >= threshold: - logger.info( - f"CUDA device {i}: Memory usage {memory_usage_ratio*100:.2f}% " - f"exceeds threshold {threshold*100:.2f}%. Clearing memory..." 
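[Note] A usage sketch of the idea behind the unified helper: check one device and release cached blocks only when the 0.85 threshold (the same value passed from compressors/base.py) is crossed. cache_usage_ratio is a hypothetical name, and the direct empty_cache() calls stand in for whatever clear_memory() does internally:

    import torch

    def cache_usage_ratio(i: int = 0) -> float:
        # Reserved memory on CUDA, allocated memory on XPU, mirroring the helper above.
        if torch.cuda.is_available():
            total = torch.cuda.get_device_properties(i).total_memory
            return torch.cuda.memory_reserved(i) / total
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            total = torch.xpu.get_device_properties(i).total_memory
            return torch.xpu.memory_allocated(i) / total
        return 0.0

    if cache_usage_ratio(0) >= 0.85:    # same threshold as the call site in base.py
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            torch.xpu.empty_cache()

Running such a check after every backward pass trades a little synchronization overhead for headroom on devices where the allocator cache grows quickly, e.g. the XPU case tracked in intel/torch-xpu-ops#2232.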
- ) - clear_memory() - allocated_memory = torch.cuda.memory_reserved(i) - memory_usage_ratio = allocated_memory / total_memory - logger.info(f"Cleared memory. CUDA device {i}: Memory usage {memory_usage_ratio*100:.2f}%") - break - except Exception as e: - logger.warning(f"Failed to check memory for CUDA device {i}: {e}") - - # Check XPU devices if memory not yet cleared - if hasattr(torch, "xpu") and torch.xpu.is_available(): - num_devices = torch.xpu.device_count() - for i in range(num_devices): - try: - total_memory = torch.xpu.get_device_properties(i).total_memory - allocated_memory = torch.xpu.memory_allocated(i) + num_devices = device_api.device_count() + for i in range(num_devices): + try: + total_memory = device_api.get_device_properties(i).total_memory + allocated_memory = device_api.memory_reserved(i) if name == "CUDA" else device_api.memory_allocated(i) + memory_usage_ratio = allocated_memory / total_memory + + if memory_usage_ratio >= threshold: + logger.warning_once( + f"{name} device {i}: Memory usage {memory_usage_ratio*100:.2f}% " + f"exceeds threshold {threshold*100:.2f}%. Clearing memory..." + ) + clear_memory() + allocated_memory = device_api.memory_reserved(i) if name == "CUDA" else device_api.memory_allocated(i) memory_usage_ratio = allocated_memory / total_memory - - if memory_usage_ratio >= threshold: - logger.info( - f"XPU device {i}: Memory usage {memory_usage_ratio*100:.2f}% " - f"exceeds threshold {threshold*100:.2f}%. Clearing memory..." - ) - clear_memory() - allocated_memory = torch.xpu.memory_reserved(i) - memory_usage_ratio = allocated_memory / total_memory - logger.info(f"Cleared memory. XPU device {i}: Memory usage {memory_usage_ratio*100:.2f}%") - break - except Exception as e: - logger.warning(f"Failed to check memory for XPU device {i}: {e}") - return False + logger.warning_once(f"Cleared memory. {name} device {i}: Memory usage {memory_usage_ratio*100:.2f}%") + return True + except Exception as e: + logger.warning_once(f"Failed to check memory for {name} device {i}: {e}") def check_memory_availability(device, inputs, weight, org_seqlen, org_bs):