From f0cc13150a4656a82020c9966163120df7defe41 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 29 Oct 2025 03:01:53 -0400 Subject: [PATCH 01/13] enhance auto device map and support XPU Signed-off-by: He, Xin3 --- auto_round/utils/device.py | 169 +++++++++++++++++++++++++------------ 1 file changed, 113 insertions(+), 56 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 850c95343..21e375408 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -458,34 +458,64 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): return False, seqlen, bs -def estimate_tuning_block_mem(block: torch.nn.Module, input_ids: list[torch.Tensor]) -> tuple[float, float]: +def estimate_tuning_block_mem( + block: torch.nn.Module, input_ids: list[torch.Tensor], pick_samples: int +) -> tuple[dict, float]: """ Calculates the memory consumption of a specific block in the model. Args: block (torch.nn.Module): The block of the model to analyze. input_ids (list[torch.Tensor]): A list of input tensors for the block. + pick_samples (int): Number of samples to consider for memory estimation. Returns: tuple: A tuple containing the following: - - block_memory (float): The memory consumption (in GB) of the block's linear layers. + - layer_memory_dict (dict): A dictionary mapping layer names to their memory consumption (in GB). + Format: {layer_name: {"param_memory": float, "output_memory": float}} + SDPA layers are represented with a fixed 1GB output memory. - input_output_memory (float): The memory consumption (in GB) for input and output tensors of the block. """ - # Calculate all block parameters memory - from auto_round.utils.model import check_to_quantized + # Calculate all block parameters memory and build layer-wise memory dict + from auto_round.utils.model import get_layer_features + layer_memory_dict = {} total_param_mem = 0 + + # Calculate batch_size and sequence_length from input_ids for output memory estimation + seq_len = input_ids[0].shape[1] if input_ids and len(input_ids[0].shape) >= 2 else 1 + element_size = input_ids[0].element_size() if input_ids else 2 # Default to 2 bytes (fp16/bf16) + for name, module in block.named_modules(): if check_to_quantized(module): + layer_name = name param_size = module.weight.nbytes total_param_mem += param_size - block_memory = total_param_mem / 1024**3 # Convert to GB + param_memory_gb = param_size / 1024**3 + + # Estimate output memory based on input_features and out_features + in_features, out_features = get_layer_features(module) + if in_features is not None and out_features is not None: + # Output tensor size: batch_size * seq_len * out_features * element_size + output_size = pick_samples * seq_len * out_features * element_size + output_memory_gb = output_size / 1024**3 + else: + output_memory_gb = 0.0 + + # memory * 2, because it contains grad tensor. + layer_memory_dict[layer_name] = {"param_memory": param_memory_gb * 2, "output_memory": output_memory_gb * 2} # Assuming bfloat16 or float32, input and output input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 + if torch.xpu.is_available(): + # https://github.com/intel/torch-xpu-ops/issues/2232 + # sdpa on XPU takes more memory than expected. 
+ additional_memory = 12 + else: + additional_memory = 1 # sdpa usage and loss calculation usage - return block_memory, input_output_memory + return layer_memory_dict, input_output_memory, additional_memory def out_of_vram(error_msg): @@ -538,7 +568,7 @@ def get_device_memory(i: int = 0) -> int: if torch.cuda.is_available(): total_memory = bytes_to_gigabytes(torch.cuda.get_device_properties(i).total_memory) elif torch.xpu.is_available(): - raise RuntimeError("XPU does not support device_map='auto' currently.") + total_memory = bytes_to_gigabytes(torch.xpu.get_device_properties(i).total_memory) else: raise RuntimeError("No supported device found (CUDA or XPU).") return total_memory @@ -629,7 +659,11 @@ def set_non_auto_device_map( def set_auto_device_map_for_block_with_tuning( - block: torch.nn.Module, device_map, input_ids: list[torch.Tensor], low_gpu_mem_usage=False, mem_per_param_scale=13.0 + block: torch.nn.Module, + device_map, + input_ids: list[torch.Tensor], + low_gpu_mem_usage=False, + pick_samples=8, ): """ Automatically sets the device map for the block based on available GPUs and memory constraints. @@ -639,9 +673,7 @@ def set_auto_device_map_for_block_with_tuning( device_map (str | int | dict): Specifies the device mapping. input_ids (list[torch.Tensor]): List of input tensors used for estimating memory requirements. low_gpu_mem_usage (bool, optional): If True, ignoring input/output memory. Defaults to False. - mem_per_param_scale (float, optional): Scaling factor for estimating memory usage per parameter in the block. - Typical values range from 10.0 to 20.0 depending on model size and GPU memory characteristics. - Higher values are more conservative and help avoid out-of-memory errors. Defaults to 13.0. + pick_samples (int, optional): Number of samples to consider for memory estimation. Defaults to 8. Returns: None @@ -654,10 +686,11 @@ def set_auto_device_map_for_block_with_tuning( The mem_per_param_scale parameter should be adjusted based on empirical memory usage observations. 
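
        A rough sketch of the budget arithmetic applied below (all numbers and the
        variable names are invented for illustration: two 80 GB cards, 3 GB of
        doubled layer weights, 0.5 GB of cached block inputs/outputs, 1 GB of
        per-layer outputs kept on card 0 and 1 GB of additional overhead):

            card_0_left   = 80 - 0.5 - 1 - 1               # 77.5 GB
            available     = card_0_left + 80               # 157.5 GB across both cards
            mem_per_param = available / 3.0                # 52.5x budget per GB of weights
            layer_budget  = layer_param_gb * mem_per_param

        Layers are then handed out against layer_budget, filling card 0 first and
        spilling onto the next card once its budget is exhausted.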
""" if torch.cuda.is_available(): - num_gpus = torch.cuda.device_count() + num_devices = torch.cuda.device_count() + device_name = "cuda" elif torch.xpu.is_available(): - logger.warning_once("XPU does not support auto device map yet, using device 0 for tuning.") - return + num_devices = torch.xpu.device_count() + device_name = "xpu" else: raise RuntimeError("No CUDA or XPU devices found.") device_list = None @@ -665,52 +698,75 @@ def set_auto_device_map_for_block_with_tuning( device_list = [int(dev) for dev in device_map.split(",") if dev.isdigit()] if device_list: - cuda_devices = [f"cuda:{i}" for i in device_list] - device_0 = cuda_devices[0] + cuda_xpu_devices = [f"{device_name}:{i}" for i in device_list] + device_0 = cuda_xpu_devices[0] else: - cuda_devices = [f"cuda:{i}" for i in range(num_gpus)] - device_0 = "cuda:0" + cuda_xpu_devices = [f"{device_name}:{i}" for i in range(num_devices)] + device_0 = f"{device_name}:0" device_0_memory = get_device_memory(device_list[0] if device_list else 0) - block_memory, input_output_memory = estimate_tuning_block_mem(block, input_ids) + layer_memory_dict, input_output_memory, additional_memory = estimate_tuning_block_mem( + block, input_ids, pick_samples + ) if low_gpu_mem_usage: input_output_memory = 0 - if (block_memory * mem_per_param_scale + input_output_memory) < device_0_memory: - return # fit in one GPU - + # Calculate total block memory from layer memory dict (including both param and output memory) + total_block_param_memory = sum(info["param_memory"] for info in layer_memory_dict.values()) + total_block_output_memory = sum(info["output_memory"] for info in layer_memory_dict.values()) + + # Average dispatch strategy + # card_0_left_memory = card_0_mem - input_output_memory - additional_memory - layer_outputs_memory + card_0_left_memory = device_0_memory - input_output_memory - additional_memory - total_block_output_memory + + # Calculate total available memory across all devices + total_available_memory = card_0_left_memory + for i in range(1, len(cuda_xpu_devices)): + device_idx = device_list[i] if device_list else i + total_available_memory += get_device_memory(device_idx) + + # Calculate total params (in GB, considering param_memory only for calculation) + total_params = total_block_param_memory + mem_per_param = total_available_memory / total_params + + # Initialize device memory tracking + device_memory = {} + device_memory[device_0] = card_0_left_memory + for i in range(1, len(cuda_xpu_devices)): + device_idx = device_list[i] if device_list else i + device_memory[cuda_xpu_devices[i]] = get_device_memory(device_idx) + + # Dispatch layers to devices based on mem_per_param + # Use devices in order, switch to next device when current one is full device_map = {} - device_memory = {device: get_device_memory(int(device.split(":")[1])) for device in cuda_devices} - device_memory[device_0] = device_0_memory - input_output_memory - - device_idx = 0 names = [] - # First, fill device 0 to its maximum capacity, then distribute the remaining layers evenly across other devices - for n, m in block.named_modules(): - if check_to_quantized(m): - layer_name = m.tmp_name - names.append(layer_name) - layer_memory = m.weight.nbytes / 1024**3 - if device_idx == 0 and layer_memory * mem_per_param_scale < device_memory[cuda_devices[device_idx]]: - device_map[layer_name] = cuda_devices[device_idx] - device_memory[cuda_devices[device_idx]] -= layer_memory * mem_per_param_scale - elif device_idx == 0: - device_idx += 1 # Move to the next device once device 0 
is full - device_map[layer_name] = cuda_devices[device_idx] - device_memory[cuda_devices[device_idx]] -= layer_memory * mem_per_param_scale - else: - # Calculate the target device index based on even distribution - sorted_devices = sorted(cuda_devices, key=lambda d: device_memory[d], reverse=True) - device_idx = sorted_devices[0] - if layer_memory * mem_per_param_scale < device_memory[device_idx]: - device_map[layer_name] = device_idx - device_memory[device_idx] -= layer_memory * mem_per_param_scale - else: - logger.warning_once( - f"Block {block.tmp_name} not fit in available GPU memory. " - "Consider using more GPUs or reducing mem_per_param_scale if OOM occurs." - ) + current_device_idx = 0 + current_device = cuda_xpu_devices[current_device_idx] + + for layer_name, mem_info in layer_memory_dict.items(): + names.append(layer_name) + # Calculate estimated memory for this layer + layer_param_memory = mem_info["param_memory"] + + # All layer outputs are on card_0, so all cards only need to store parameters + estimated_memory = layer_param_memory * mem_per_param + + # Try to fit in current device + if estimated_memory <= device_memory[current_device]: + device_map[layer_name] = current_device + device_memory[current_device] -= estimated_memory + else: + # Current device is full, try to switch to next device + if current_device_idx < len(cuda_xpu_devices) - 1: + current_device_idx += 1 + current_device = cuda_xpu_devices[current_device_idx] + + # Place on current device (either new device or last device) + device_map[layer_name] = current_device + device_memory[current_device] -= estimated_memory + + print(device_map) set_non_auto_device_map(block, device_map, names) @@ -789,16 +845,17 @@ def set_avg_auto_device_map(model: torch.nn.Module, device_map): else: if torch.cuda.is_available(): num_devices = torch.cuda.device_count() + device_name = "cuda" elif torch.xpu.is_available(): - logger.warning_once("XPU does not support auto device map yet, using device 0 for tuning.") - return + num_devices = torch.xpu.device_count() + device_name = "xpu" else: return if device_list: - cuda_devices = [f"cuda:{i}" for i in device_list] + cuda_xpu_devices = [f"{device_name}:{i}" for i in device_list] else: - cuda_devices = [f"cuda:{i}" for i in range(num_devices)] + cuda_xpu_devices = [f"{device_name}:{i}" for i in range(num_devices)] for block_names in block_name_list: for block_name in block_names: @@ -814,7 +871,7 @@ def set_avg_auto_device_map(model: torch.nn.Module, device_map): device_index = 0 for res in res_list: for key in res.keys(): - set_tuning_device_for_layer(block_module, key, cuda_devices[device_index]) + set_tuning_device_for_layer(block_module, key, cuda_xpu_devices[device_index]) device_index += 1 From a582d4666f56fa90bcd7986034790ff9d1b76028 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 29 Oct 2025 03:08:59 -0400 Subject: [PATCH 02/13] remove mem_per_param_scale Signed-off-by: He, Xin3 --- auto_round/__main__.py | 9 --------- auto_round/compressors/base.py | 14 ++------------ auto_round/compressors/config.py | 5 ----- auto_round/utils/device.py | 1 - 4 files changed, 2 insertions(+), 27 deletions(-) diff --git a/auto_round/__main__.py b/auto_round/__main__.py index c403ee863..2b68475e8 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -157,14 +157,6 @@ def __init__(self, *args, **kwargs): type=float, help="Learning rate specifically for min-max tuning. " "If None, uses the same value as --lr. 
", ) - tuning.add_argument( - "--mem_per_param_scale", - default=13, - type=float, - help="Memory scaling factor for parameter memory estimation. " - "Adjust this if you need to control memory usage during tuning. " - "Lower values reduce memory usage but may affect accuracy.", - ) tuning.add_argument( "--gradient_accumulate_steps", default=1, @@ -522,7 +514,6 @@ def tune(args): enable_deterministic_algorithms=args.enable_deterministic_algorithms, lr=args.lr, minmax_lr=args.minmax_lr, - mem_per_param_scale=args.mem_per_param_scale, nblocks=args.nblocks, to_quant_block_names=args.to_quant_block_names, scale_dtype=args.scale_dtype, diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c148164ae..921a5763f 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -230,8 +230,6 @@ def __init__( enable_deterministic_algorithms = kwargs.pop("enable_deterministic_algorithms", False) static_kv_dtype = kwargs.pop("static_kv_dtype", None) device = kwargs.pop("device", None) - # Scale factor for RAM usage per parameter. - mem_per_param_scale = kwargs.pop("mem_per_param_scale", None) self.quant_lm_head = kwargs.pop("quant_lm_head", False) self.mllm = kwargs.pop("mllm") if "mllm" in kwargs else False self.diffusion = kwargs.pop("diffusion") if "diffusion" in kwargs else False @@ -332,10 +330,6 @@ def __init__( self.optimizer = self._get_optimizer(None) self.disable_opt_rtn = disable_opt_rtn self.is_packing_immediate = False # whether to pack the layer immediately after tuning - if mem_per_param_scale is None: - self.mem_per_param_scale = 13 if self.iters != 0 else 1 - else: - self.mem_per_param_scale = mem_per_param_scale # KV cache, this one does not affect tuning but will collect some infos during tuning self.static_kv_dtype = static_kv_dtype @@ -1428,9 +1422,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) convert_fp8_model_to_16b_model(block, dtype=self.amp_dtype) if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): - set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.mem_per_param_scale - ) + set_auto_device_map_for_block_with_tuning(block, self.device_map, input_ids, self.low_gpu_mem_usage) # Dispatch model if needed if self.device_map is not None: from accelerate.hooks import AlignDevicesHook, add_hook_to_module @@ -2444,9 +2436,7 @@ def _quantize_block( set_module(block, n, new_layer) if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): - set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.mem_per_param_scale - ) + set_auto_device_map_for_block_with_tuning(block, self.device_map, input_ids, self.low_gpu_mem_usage) if self.device_map is not None: for n, m in block.named_modules(): diff --git a/auto_round/compressors/config.py b/auto_round/compressors/config.py index d42e13427..b2bb61409 100644 --- a/auto_round/compressors/config.py +++ b/auto_round/compressors/config.py @@ -41,7 +41,6 @@ def __init__( lr: float = None, lr_scheduler: Callable = None, minmax_lr: float = None, - mem_per_param_scale: int = None, nblocks: int = 1, to_quant_block_names: Union[str, list, None] = None, scale_dtype: str = "fp16", @@ -84,8 +83,6 @@ def __init__( lr (float): The learning rate (default is 0.005). lr_scheduler: The learning rate scheduler to be used. minmax_lr (float): The learning rate for min-max tuning (default is None). 
- mem_per_param_scale (int): Scale factor for memory per parameter, - used to adjust memory usage estimation for tuning. nblocks (int): Number of blocks (default is 1). quant_lm_head (bool): Whether to quant lm_head. to_quant_block_names (str|list): Names of quantitative blocks, please use commas to separate them. @@ -124,7 +121,6 @@ def __init__( lr=lr, lr_scheduler=lr_scheduler, minmax_lr=minmax_lr, - mem_per_param_scale=mem_per_param_scale, nblocks=nblocks, to_quant_block_names=to_quant_block_names, scale_dtype=scale_dtype, @@ -260,7 +256,6 @@ class TuningExtraConfig(BaseExtraConfig): lr: float = None lr_scheduler: Callable = None minmax_lr: float = None - mem_per_param_scale: int = None nblocks: int = 1 to_quant_block_names: Union[str, list, None] = None scale_dtype: str = "fp16" diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 21e375408..821cba15a 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -683,7 +683,6 @@ def set_auto_device_map_for_block_with_tuning( Note: This function is intended for internal use in device memory management and tuning. - The mem_per_param_scale parameter should be adjusted based on empirical memory usage observations. """ if torch.cuda.is_available(): num_devices = torch.cuda.device_count() From 7c598baf47a2906e88ac42686b226f7adc4f86ac Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 29 Oct 2025 03:52:11 -0400 Subject: [PATCH 03/13] consider enable_act_quant and optimize device map logic Signed-off-by: He, Xin3 --- auto_round/utils/device.py | 201 +++++++++++++++++++++++++++++++------ 1 file changed, 171 insertions(+), 30 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 821cba15a..5ad711477 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -489,6 +489,7 @@ def estimate_tuning_block_mem( for name, module in block.named_modules(): if check_to_quantized(module): + enable_act_quant = module.act_bits <= 8 layer_name = name param_size = module.weight.nbytes total_param_mem += param_size @@ -500,6 +501,12 @@ def estimate_tuning_block_mem( # Output tensor size: batch_size * seq_len * out_features * element_size output_size = pick_samples * seq_len * out_features * element_size output_memory_gb = output_size / 1024**3 + + # If enable_act_quant, add input tensor memory to param_memory + if enable_act_quant: + input_size = pick_samples * seq_len * in_features * element_size + input_memory_gb = input_size / 1024**3 + param_memory_gb += input_memory_gb else: output_memory_gb = 0.0 @@ -658,6 +665,168 @@ def set_non_auto_device_map( logger.warning(f"{key} in `device_map` dose not match any modules, please have a check") +def _allocate_layers_to_devices( + layer_memory_dict: dict, device_memory: dict, cuda_xpu_devices: list, mem_per_param: float +) -> tuple[dict, list]: + """ + Allocates layers to devices using a load-balancing strategy. + + Strategy: + 1. Sort layers by memory size (descending) to prioritize large operations + 2. Allocate largest layers to later devices first (to keep device 0 free for I/O) + 3. For each layer, find the best device considering: + - Sufficient remaining memory (preferred) + - Continuity (prefer same device as neighboring layers in original model order) + - Load balancing (minimize wasted space) + 4. 
Fallback to device with most remaining space if all devices are over capacity + + Args: + layer_memory_dict (dict): Mapping of layer names to their memory info (order preserved) + Format: {layer_name: {"param_memory": float, "output_memory": float}} + device_memory (dict): Available memory for each device (will be modified) + Format: {device_name: available_memory_gb} + cuda_xpu_devices (list): List of available device names (e.g., ["cuda:0", "cuda:1"]) + mem_per_param (float): Memory multiplier per parameter GB + + Returns: + tuple[dict, list]: + - device_map: Mapping of layer names to assigned devices + - names: List of layer names in processing order + + Examples: + Example - Distribution with 3 devices: + Given layers [huge: 20GB, big: 15GB, large: 10GB, medium: 5GB, small: 2GB] + and 3 devices [device 0, device 1, device 2]: + - 'huge' → tries device 2 first (last device, preferred for largest layer) + - 'big' → tries device 1 (second-to-last, preferred for second largest) + - 'large' → tries device 0, but prioritizes neighbor's device if applicable + - 'medium' and 'small' → assigned based on remaining capacity and neighbors + + Result: Device 0 has lightest load, suitable for handling I/O overhead + """ + device_map = {} + names = [] + + # Build layer order map from original dict order (preserves model structure) + layer_names_in_order = list(layer_memory_dict.keys()) + layer_order = {name: idx for idx, name in enumerate(layer_names_in_order)} + + # Sort layers by memory size (descending) to handle large layers first + # This prevents large layers from being stuck without suitable devices later + sorted_layers = sorted(layer_memory_dict.items(), key=lambda x: x[1]["param_memory"], reverse=True) + + # Track assigned layers to avoid duplicates + assigned_layers = set() + + # Track preferred starting device for large layers (start from last device) + num_devices = len(cuda_xpu_devices) + preferred_device_idx = num_devices - 1 # Start from last device + + # Process each layer in sorted order (large to small) + for layer_idx, (layer_name, mem_info) in enumerate(sorted_layers): + if layer_name in assigned_layers: + continue + + names.append(layer_name) + layer_param_memory = mem_info["param_memory"] + estimated_memory = layer_param_memory * mem_per_param + + # Find neighboring layers in original model order for continuity evaluation + current_layer_idx = layer_order[layer_name] + neighbor_devices = set() + + # Check previous and next layers in original order + for offset in [-1, 1]: + neighbor_idx = current_layer_idx + offset + if 0 <= neighbor_idx < len(layer_names_in_order): + neighbor_name = layer_names_in_order[neighbor_idx] + if neighbor_name in device_map: + neighbor_devices.add(device_map[neighbor_name]) + + # Phase 1: Try to find a device with sufficient space + best_device = None + best_device_idx = None + min_score = float("inf") + + for dev_idx in range(num_devices): + dev = cuda_xpu_devices[dev_idx] + remaining = device_memory[dev] - estimated_memory + + # Only consider devices with enough space + if remaining >= 0: + # Continuity bonus: strongly prefer device used by neighboring layers + # This keeps adjacent layers in the model on the same device, reducing communication + if dev in neighbor_devices: + continuity_bonus = -2000 # Very strong preference for neighbor's device + else: + continuity_bonus = 0 + + # For large layers (early in sorted order), prefer later devices + # This keeps device 0 lighter as it handles I/O overhead + if layer_idx < num_devices and dev_idx >= 
preferred_device_idx: + large_layer_bonus = -500 # Moderate preference for later devices + else: + large_layer_bonus = 0 + + # Score = remaining memory waste + continuity penalty + large layer penalty (lower is better) + score = abs(remaining) + continuity_bonus + large_layer_bonus + if score < min_score: + min_score = score + best_device = dev + best_device_idx = dev_idx + + # Phase 2: Fallback - if no device has enough space, prefer neighbor's device + if best_device is None: + if neighbor_devices: + # Prefer neighbor's device even if over capacity + for dev in neighbor_devices: + if best_device is None or device_memory[dev] > device_memory[best_device]: + best_device = dev + best_device_idx = cuda_xpu_devices.index(dev) + else: + # No neighbors assigned yet, prefer later devices for large layers + if layer_idx < num_devices: + # Try from last device backwards + max_remaining = float("-inf") + for dev_idx in range(num_devices - 1, -1, -1): + dev = cuda_xpu_devices[dev_idx] + remaining = device_memory[dev] - estimated_memory + if remaining > max_remaining: + max_remaining = remaining + best_device = dev + best_device_idx = dev_idx + else: + # For smaller layers, use device with most remaining space + max_remaining = float("-inf") + for dev_idx in range(num_devices): + dev = cuda_xpu_devices[dev_idx] + remaining = device_memory[dev] - estimated_memory + if remaining > max_remaining: + max_remaining = remaining + best_device = dev + best_device_idx = dev_idx + + # Phase 3: Final safety fallback - use last device if still None + # Use last device to keep device 0 lighter + if best_device is None: + best_device = cuda_xpu_devices[-1] + best_device_idx = num_devices - 1 + + # Assign layer to the selected device + device_map[layer_name] = best_device + device_memory[best_device] -= estimated_memory + assigned_layers.add(layer_name) + + # Update preferred device index for next large layer + # Move backwards through devices to distribute large layers + if layer_idx < num_devices and preferred_device_idx > 0: + preferred_device_idx -= 1 + + # Restore device_map to original layer order for printing + ordered_device_map = {name: device_map[name] for name in layer_memory_dict.keys() if name in device_map} + return ordered_device_map, names + + def set_auto_device_map_for_block_with_tuning( block: torch.nn.Module, device_map, @@ -735,36 +904,8 @@ def set_auto_device_map_for_block_with_tuning( device_idx = device_list[i] if device_list else i device_memory[cuda_xpu_devices[i]] = get_device_memory(device_idx) - # Dispatch layers to devices based on mem_per_param - # Use devices in order, switch to next device when current one is full - device_map = {} - names = [] - - current_device_idx = 0 - current_device = cuda_xpu_devices[current_device_idx] - - for layer_name, mem_info in layer_memory_dict.items(): - names.append(layer_name) - # Calculate estimated memory for this layer - layer_param_memory = mem_info["param_memory"] - - # All layer outputs are on card_0, so all cards only need to store parameters - estimated_memory = layer_param_memory * mem_per_param - - # Try to fit in current device - if estimated_memory <= device_memory[current_device]: - device_map[layer_name] = current_device - device_memory[current_device] -= estimated_memory - else: - # Current device is full, try to switch to next device - if current_device_idx < len(cuda_xpu_devices) - 1: - current_device_idx += 1 - current_device = cuda_xpu_devices[current_device_idx] - - # Place on current device (either new device or last device) - 
device_map[layer_name] = current_device - device_memory[current_device] -= estimated_memory - + # Allocate layers to devices using load-balancing strategy + device_map, names = _allocate_layers_to_devices(layer_memory_dict, device_memory, cuda_xpu_devices, mem_per_param) print(device_map) set_non_auto_device_map(block, device_map, names) From 6508b74acdbda91b790eb14b385f0d8d220dd085 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 29 Oct 2025 04:45:13 -0400 Subject: [PATCH 04/13] clear_memory for XPU Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 5 +- auto_round/utils/device.py | 159 ++++++++++----------------------- 2 files changed, 52 insertions(+), 112 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 921a5763f..c395bef68 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2478,9 +2478,8 @@ def _quantize_block( if q_input is not None: if input_ids is not q_input: clear_memory(input_ids) - else: - clear_memory() input_ids = q_input + clear_memory() quantized_layer_names, unquantized_layer_names = wrapper_block( block, @@ -2567,6 +2566,7 @@ def _quantize_block( current_output = to_device(current_output, device) output_q = self._get_current_q_output(block, input_ids, input_others, indices, device) + clear_memory() # clean cached memory after getting output_q if self.attention_mask: tmp_attention_mask = [self.attention_mask[i] for i in indices] tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) @@ -2586,6 +2586,7 @@ def _quantize_block( total_loss += loss.item() / num_elm self._scale_loss_and_backward(scaler, loss) + clear_memory() # clean cached memory after backward if i == 0: init_loss = total_loss diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 5ad711477..3a71ec377 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -517,8 +517,8 @@ def estimate_tuning_block_mem( input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 if torch.xpu.is_available(): # https://github.com/intel/torch-xpu-ops/issues/2232 - # sdpa on XPU takes more memory than expected. - additional_memory = 12 + # sdpa on XPU takes more memory than expected. 2 from grad tensor + additional_memory = 9 * 2 + 1 else: additional_memory = 1 # sdpa usage and loss calculation usage @@ -674,68 +674,52 @@ def _allocate_layers_to_devices( Strategy: 1. Sort layers by memory size (descending) to prioritize large operations 2. Allocate largest layers to later devices first (to keep device 0 free for I/O) - 3. For each layer, find the best device considering: - - Sufficient remaining memory (preferred) - - Continuity (prefer same device as neighboring layers in original model order) - - Load balancing (minimize wasted space) - 4. Fallback to device with most remaining space if all devices are over capacity + 3. 
For each layer, prefer: neighbor's device > devices with more space > later devices for large layers Args: - layer_memory_dict (dict): Mapping of layer names to their memory info (order preserved) - Format: {layer_name: {"param_memory": float, "output_memory": float}} + layer_memory_dict (dict): Mapping of layer names to memory info (order preserved) device_memory (dict): Available memory for each device (will be modified) - Format: {device_name: available_memory_gb} - cuda_xpu_devices (list): List of available device names (e.g., ["cuda:0", "cuda:1"]) + cuda_xpu_devices (list): List of device names (e.g., ["cuda:0", "cuda:1"]) mem_per_param (float): Memory multiplier per parameter GB Returns: - tuple[dict, list]: - - device_map: Mapping of layer names to assigned devices - - names: List of layer names in processing order - - Examples: - Example - Distribution with 3 devices: - Given layers [huge: 20GB, big: 15GB, large: 10GB, medium: 5GB, small: 2GB] - and 3 devices [device 0, device 1, device 2]: - - 'huge' → tries device 2 first (last device, preferred for largest layer) - - 'big' → tries device 1 (second-to-last, preferred for second largest) - - 'large' → tries device 0, but prioritizes neighbor's device if applicable - - 'medium' and 'small' → assigned based on remaining capacity and neighbors - - Result: Device 0 has lightest load, suitable for handling I/O overhead + tuple[dict, list]: (device_map, names) + + Example (LLaMA block with 7 layers on 3 devices): + Input: + # cuda:0 already occupied by input/output tensors and activations + device_memory = {"cuda:0": 15.0, "cuda:1": 40.0, "cuda:2": 40.0} + cuda_xpu_devices = ["cuda:0", "cuda:1", "cuda:2"] + mem_per_param = 2.0 + + Processing order (sorted by param_memory, descending): + 1. gate_proj (11.0 * 2 = 22.0 GB) -> cuda:2 (largest layer, prefer last device) + 2. up_proj (11.0 * 2 = 22.0 GB) -> cuda:1 (largest layer, prefer 2nd last device) + 3. down_proj (11.0 * 2 = 22.0 GB) -> cuda:1 (cuda:0 only has 15GB, insufficient) + 4. q_proj (4.0 * 2 = 8.0 GB) -> cuda:2 (neighbor of gate_proj, continuity bonus) + 5. o_proj (4.0 * 2 = 8.0 GB) -> cuda:2 (neighbor of q_proj, continuity bonus) + 6. k_proj (1.0 * 2 = 2.0 GB) -> cuda:1 (neighbor of q_proj, continuity bonus) + 7. 
v_proj (1.0 * 2 = 2.0 GB) -> cuda:1 (neighbor of k_proj, continuity bonus) + """ device_map = {} names = [] - # Build layer order map from original dict order (preserves model structure) layer_names_in_order = list(layer_memory_dict.keys()) layer_order = {name: idx for idx, name in enumerate(layer_names_in_order)} - # Sort layers by memory size (descending) to handle large layers first - # This prevents large layers from being stuck without suitable devices later sorted_layers = sorted(layer_memory_dict.items(), key=lambda x: x[1]["param_memory"], reverse=True) - # Track assigned layers to avoid duplicates - assigned_layers = set() - - # Track preferred starting device for large layers (start from last device) num_devices = len(cuda_xpu_devices) - preferred_device_idx = num_devices - 1 # Start from last device + preferred_device_idx = num_devices - 1 # Start from last device for large layers - # Process each layer in sorted order (large to small) for layer_idx, (layer_name, mem_info) in enumerate(sorted_layers): - if layer_name in assigned_layers: - continue - names.append(layer_name) - layer_param_memory = mem_info["param_memory"] - estimated_memory = layer_param_memory * mem_per_param + estimated_memory = mem_info["param_memory"] * mem_per_param - # Find neighboring layers in original model order for continuity evaluation + # Find neighbor devices current_layer_idx = layer_order[layer_name] neighbor_devices = set() - - # Check previous and next layers in original order for offset in [-1, 1]: neighbor_idx = current_layer_idx + offset if 0 <= neighbor_idx < len(layer_names_in_order): @@ -743,87 +727,42 @@ def _allocate_layers_to_devices( if neighbor_name in device_map: neighbor_devices.add(device_map[neighbor_name]) - # Phase 1: Try to find a device with sufficient space + # Find best device best_device = None - best_device_idx = None - min_score = float("inf") + best_score = float("inf") - for dev_idx in range(num_devices): - dev = cuda_xpu_devices[dev_idx] + for dev_idx, dev in enumerate(cuda_xpu_devices): remaining = device_memory[dev] - estimated_memory + if remaining < 0: + continue # Skip devices without enough space - # Only consider devices with enough space - if remaining >= 0: - # Continuity bonus: strongly prefer device used by neighboring layers - # This keeps adjacent layers in the model on the same device, reducing communication - if dev in neighbor_devices: - continuity_bonus = -2000 # Very strong preference for neighbor's device - else: - continuity_bonus = 0 - - # For large layers (early in sorted order), prefer later devices - # This keeps device 0 lighter as it handles I/O overhead - if layer_idx < num_devices and dev_idx >= preferred_device_idx: - large_layer_bonus = -500 # Moderate preference for later devices - else: - large_layer_bonus = 0 - - # Score = remaining memory waste + continuity penalty + large layer penalty (lower is better) - score = abs(remaining) + continuity_bonus + large_layer_bonus - if score < min_score: - min_score = score - best_device = dev - best_device_idx = dev_idx - - # Phase 2: Fallback - if no device has enough space, prefer neighbor's device + # Score components (lower is better) + continuity_bonus = -2000 if dev in neighbor_devices else 0 + large_layer_bonus = -500 if layer_idx < num_devices and dev_idx >= preferred_device_idx else 0 + load_balance_penalty = -remaining # More space = lower penalty + + score = load_balance_penalty + continuity_bonus + large_layer_bonus + if score < best_score: + best_score = score + best_device = dev + + # 
Fallback: if no device has space, use neighbor's or last device if best_device is None: if neighbor_devices: - # Prefer neighbor's device even if over capacity - for dev in neighbor_devices: - if best_device is None or device_memory[dev] > device_memory[best_device]: - best_device = dev - best_device_idx = cuda_xpu_devices.index(dev) + best_device = max(neighbor_devices, key=lambda d: device_memory[d]) else: - # No neighbors assigned yet, prefer later devices for large layers - if layer_idx < num_devices: - # Try from last device backwards - max_remaining = float("-inf") - for dev_idx in range(num_devices - 1, -1, -1): - dev = cuda_xpu_devices[dev_idx] - remaining = device_memory[dev] - estimated_memory - if remaining > max_remaining: - max_remaining = remaining - best_device = dev - best_device_idx = dev_idx - else: - # For smaller layers, use device with most remaining space - max_remaining = float("-inf") - for dev_idx in range(num_devices): - dev = cuda_xpu_devices[dev_idx] - remaining = device_memory[dev] - estimated_memory - if remaining > max_remaining: - max_remaining = remaining - best_device = dev - best_device_idx = dev_idx - - # Phase 3: Final safety fallback - use last device if still None - # Use last device to keep device 0 lighter - if best_device is None: - best_device = cuda_xpu_devices[-1] - best_device_idx = num_devices - 1 + best_device = max(cuda_xpu_devices, key=lambda d: device_memory[d]) - # Assign layer to the selected device + # Assign layer device_map[layer_name] = best_device device_memory[best_device] -= estimated_memory - assigned_layers.add(layer_name) - # Update preferred device index for next large layer - # Move backwards through devices to distribute large layers + # Update preferred device for next large layer if layer_idx < num_devices and preferred_device_idx > 0: preferred_device_idx -= 1 - # Restore device_map to original layer order for printing - ordered_device_map = {name: device_map[name] for name in layer_memory_dict.keys() if name in device_map} + # Restore original order + ordered_device_map = {name: device_map[name] for name in layer_names_in_order if name in device_map} return ordered_device_map, names @@ -906,7 +845,7 @@ def set_auto_device_map_for_block_with_tuning( # Allocate layers to devices using load-balancing strategy device_map, names = _allocate_layers_to_devices(layer_memory_dict, device_memory, cuda_xpu_devices, mem_per_param) - print(device_map) + set_non_auto_device_map(block, device_map, names) From cd5f68516c50363ecf5ba38873d82ec03e844514 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 29 Oct 2025 06:02:15 -0400 Subject: [PATCH 05/13] refine device map logic Signed-off-by: He, Xin3 --- auto_round/utils/device.py | 121 ++++++++++++++++++------------------- 1 file changed, 58 insertions(+), 63 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 3a71ec377..46ce76dd0 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -672,9 +672,9 @@ def _allocate_layers_to_devices( Allocates layers to devices using a load-balancing strategy. Strategy: - 1. Sort layers by memory size (descending) to prioritize large operations - 2. Allocate largest layers to later devices first (to keep device 0 free for I/O) - 3. For each layer, prefer: neighbor's device > devices with more space > later devices for large layers + 1. Sort layers by memory size (descending), preserve order for equal sizes + 2. Assign largest N layers to higher-index devices (N = num_devices) + 3. 
Remaining layers use memory availability + layer continuity scorings Args: layer_memory_dict (dict): Mapping of layer names to memory info (order preserved) @@ -685,82 +685,76 @@ def _allocate_layers_to_devices( Returns: tuple[dict, list]: (device_map, names) - Example (LLaMA block with 7 layers on 3 devices): + Example: Input: - # cuda:0 already occupied by input/output tensors and activations - device_memory = {"cuda:0": 15.0, "cuda:1": 40.0, "cuda:2": 40.0} - cuda_xpu_devices = ["cuda:0", "cuda:1", "cuda:2"] + device_memory = {"cuda:0": 30.0, "cuda:1": 40.0, "cuda:2": 40.0} + layer_memory_dict = { + "q_proj": {"param_memory": 4.0}, "k_proj": {"param_memory": 1.0}, + "v_proj": {"param_memory": 1.0}, "o_proj": {"param_memory": 4.0}, + "gate_proj": {"param_memory": 11.0}, "up_proj": {"param_memory": 11.0}, + "down_proj": {"param_memory": 11.0} + } mem_per_param = 2.0 - Processing order (sorted by param_memory, descending): - 1. gate_proj (11.0 * 2 = 22.0 GB) -> cuda:2 (largest layer, prefer last device) - 2. up_proj (11.0 * 2 = 22.0 GB) -> cuda:1 (largest layer, prefer 2nd last device) - 3. down_proj (11.0 * 2 = 22.0 GB) -> cuda:1 (cuda:0 only has 15GB, insufficient) - 4. q_proj (4.0 * 2 = 8.0 GB) -> cuda:2 (neighbor of gate_proj, continuity bonus) - 5. o_proj (4.0 * 2 = 8.0 GB) -> cuda:2 (neighbor of q_proj, continuity bonus) - 6. k_proj (1.0 * 2 = 2.0 GB) -> cuda:1 (neighbor of q_proj, continuity bonus) - 7. v_proj (1.0 * 2 = 2.0 GB) -> cuda:1 (neighbor of k_proj, continuity bonus) - + Result (allocation order by size): + 1. gate_proj (22GB) -> cuda:2 (largest, prefer last device) + 2. up_proj (22GB) -> cuda:1 (2nd largest, prefer 2nd last device) + 3. down_proj (22GB) -> cuda:0 (3rd largest, cuda:0 has 30GB available) + 4. q_proj (8GB) -> cuda:2 (neighbor of gate_proj, continuity bonus) + 5. o_proj (8GB) -> cuda:2 (neighbor of q_proj, continuity bonus) + 6. k_proj (2GB) -> cuda:1 (neighbor of q_proj via original order) + 7. 
v_proj (2GB) -> cuda:1 (neighbor of k_proj, continuity bonus) """ device_map = {} names = [] - layer_names_in_order = list(layer_memory_dict.keys()) layer_order = {name: idx for idx, name in enumerate(layer_names_in_order)} - - sorted_layers = sorted(layer_memory_dict.items(), key=lambda x: x[1]["param_memory"], reverse=True) - + sorted_layers = sorted(layer_memory_dict.items(), key=lambda x: (-x[1]["param_memory"], -layer_order[x[0]])) num_devices = len(cuda_xpu_devices) - preferred_device_idx = num_devices - 1 # Start from last device for large layers + def find_best_device(layer_name, estimated_memory, layer_idx): + """Find the best device for a layer.""" + # Phase 1: Direct assign largest layers to higher-index devices first + if layer_idx < num_devices - 1: + return cuda_xpu_devices[-(layer_idx + 1)] + + # Phase 2: Choose device with best score (memory + continuity) + best_device = None + best_score = float("-inf") + current_layer_order = layer_order[layer_name] + + for device in cuda_xpu_devices: + if device_memory[device] < estimated_memory: + continue + + # Memory score (normalized) + memory_score = device_memory[device] / estimated_memory + + # Continuity bonus for adjacent layers + continuity_bonus = 0 + for offset in [-1, 1]: # Check previous and next neighbors + neighbor_idx = current_layer_order + offset + if 0 <= neighbor_idx < len(layer_names_in_order): + neighbor_name = layer_names_in_order[neighbor_idx] + if neighbor_name in device_map and device_map[neighbor_name] == device: + continuity_bonus += 1.0 + + total_score = memory_score + continuity_bonus + if total_score > best_score: + best_score = total_score + best_device = device + + # Fallback: device with most available memory + return best_device or max(cuda_xpu_devices, key=lambda d: device_memory[d]) + + # Allocate layers for layer_idx, (layer_name, mem_info) in enumerate(sorted_layers): names.append(layer_name) estimated_memory = mem_info["param_memory"] * mem_per_param - - # Find neighbor devices - current_layer_idx = layer_order[layer_name] - neighbor_devices = set() - for offset in [-1, 1]: - neighbor_idx = current_layer_idx + offset - if 0 <= neighbor_idx < len(layer_names_in_order): - neighbor_name = layer_names_in_order[neighbor_idx] - if neighbor_name in device_map: - neighbor_devices.add(device_map[neighbor_name]) - - # Find best device - best_device = None - best_score = float("inf") - - for dev_idx, dev in enumerate(cuda_xpu_devices): - remaining = device_memory[dev] - estimated_memory - if remaining < 0: - continue # Skip devices without enough space - - # Score components (lower is better) - continuity_bonus = -2000 if dev in neighbor_devices else 0 - large_layer_bonus = -500 if layer_idx < num_devices and dev_idx >= preferred_device_idx else 0 - load_balance_penalty = -remaining # More space = lower penalty - - score = load_balance_penalty + continuity_bonus + large_layer_bonus - if score < best_score: - best_score = score - best_device = dev - - # Fallback: if no device has space, use neighbor's or last device - if best_device is None: - if neighbor_devices: - best_device = max(neighbor_devices, key=lambda d: device_memory[d]) - else: - best_device = max(cuda_xpu_devices, key=lambda d: device_memory[d]) - - # Assign layer + best_device = find_best_device(layer_name, estimated_memory, layer_idx) device_map[layer_name] = best_device device_memory[best_device] -= estimated_memory - # Update preferred device for next large layer - if layer_idx < num_devices and preferred_device_idx > 0: - 
preferred_device_idx -= 1 - # Restore original order ordered_device_map = {name: device_map[name] for name in layer_names_in_order if name in device_map} return ordered_device_map, names @@ -845,6 +839,7 @@ def set_auto_device_map_for_block_with_tuning( # Allocate layers to devices using load-balancing strategy device_map, names = _allocate_layers_to_devices(layer_memory_dict, device_memory, cuda_xpu_devices, mem_per_param) + logger.debug(f"Auto device map for block: {device_map}") set_non_auto_device_map(block, device_map, names) From 649ffee7c078e4c9cb66667c779d7600709e1779 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Wed, 29 Oct 2025 22:16:58 -0400 Subject: [PATCH 06/13] update per review comments Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 24 +++++++++++--------- auto_round/utils/device.py | 41 ++++++++++++++++++---------------- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index c395bef68..0f21850f3 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -311,6 +311,7 @@ def __init__( self.low_gpu_mem_usage = low_gpu_mem_usage self.seqlen = seqlen self.batch_size, self.gradient_accumulate_steps = batch_size, gradient_accumulate_steps + self.pick_samples = self.batch_size * self.gradient_accumulate_steps self.nblocks = nblocks self.dataset = dataset self.iters = iters @@ -1422,7 +1423,9 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) convert_fp8_model_to_16b_model(block, dtype=self.amp_dtype) if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): - set_auto_device_map_for_block_with_tuning(block, self.device_map, input_ids, self.low_gpu_mem_usage) + set_auto_device_map_for_block_with_tuning( + block, self.device_map, input_ids, self.low_gpu_mem_usage, self.pick_samples + ) # Dispatch model if needed if self.device_map is not None: from accelerate.hooks import AlignDevicesHook, add_hook_to_module @@ -2247,10 +2250,10 @@ def _quantize_layer( init_loss = None gradient_accumulate_steps = self.batch_size # Force to low gpu batch_size = 1 # Force to low gpu - pick_samples = batch_size * gradient_accumulate_steps - pick_samples = min(nsamples, pick_samples) + self.pick_samples = batch_size * gradient_accumulate_steps + self.pick_samples = min(nsamples, self.pick_samples) if self.sampler != "rand": - whole_indices = torch.randperm(nsamples)[:pick_samples] + whole_indices = torch.randperm(nsamples)[: self.pick_samples] total_loss = 0 num_elm = 1 mse_reduction = "mean" @@ -2261,7 +2264,7 @@ def _quantize_layer( for i in range(self.iters): total_loss = 0 if self.sampler == "rand": - whole_indices = torch.randperm(nsamples)[:pick_samples] + whole_indices = torch.randperm(nsamples)[: self.pick_samples] if gradient_accumulate_steps != 1: if q_inputs is not None: num_elm = self._get_current_num_elm(q_inputs, whole_indices) @@ -2436,7 +2439,9 @@ def _quantize_block( set_module(block, n, new_layer) if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): - set_auto_device_map_for_block_with_tuning(block, self.device_map, input_ids, self.low_gpu_mem_usage) + set_auto_device_map_for_block_with_tuning( + block, self.device_map, input_ids, self.low_gpu_mem_usage, self.pick_samples + ) if self.device_map is not None: for n, m in block.named_modules(): @@ -2535,10 +2540,9 @@ def _quantize_block( else: nsamples = len(input_ids) - pick_samples = self.batch_size * 
self.gradient_accumulate_steps - pick_samples = min(nsamples, pick_samples) + self.pick_samples = min(nsamples, self.pick_samples) if self.sampler != "rand": - whole_indices = torch.randperm(nsamples)[:pick_samples] + whole_indices = torch.randperm(nsamples)[: self.pick_samples] last_best_iter = 0 best_loss = torch.finfo(torch.float).max num_elm = 1 @@ -2553,7 +2557,7 @@ def _quantize_block( for i in range(self.iters): total_loss = 0 if self.sampler == "rand": - whole_indices = torch.randperm(nsamples)[:pick_samples] + whole_indices = torch.randperm(nsamples)[: self.pick_samples] # We assume the block input and output shape is same if self.gradient_accumulate_steps != 1: num_elm = self._get_current_num_elm(input_ids, whole_indices) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 46ce76dd0..4551020e9 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -494,6 +494,7 @@ def estimate_tuning_block_mem( param_size = module.weight.nbytes total_param_mem += param_size param_memory_gb = param_size / 1024**3 + param_memory_gb *= 2 # considering the v tensor for weight rounding # Estimate output memory based on input_features and out_features in_features, out_features = get_layer_features(module) @@ -515,12 +516,14 @@ def estimate_tuning_block_mem( # Assuming bfloat16 or float32, input and output input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 + + # considering sdpa (attention activation) memory and reference_output memory for loss calculation + additional_memory = 1 if torch.xpu.is_available(): # https://github.com/intel/torch-xpu-ops/issues/2232 # sdpa on XPU takes more memory than expected. 2 from grad tensor - additional_memory = 9 * 2 + 1 - else: - additional_memory = 1 # sdpa usage and loss calculation usage + xpu_sdpa_additional_memory = 9 # GB + additional_memory += xpu_sdpa_additional_memory * 2 return layer_memory_dict, input_output_memory, additional_memory @@ -666,7 +669,7 @@ def set_non_auto_device_map( def _allocate_layers_to_devices( - layer_memory_dict: dict, device_memory: dict, cuda_xpu_devices: list, mem_per_param: float + layer_memory_dict: dict, device_memory: dict, gpu_devices: list, mem_per_param: float ) -> tuple[dict, list]: """ Allocates layers to devices using a load-balancing strategy. 
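
For readers skimming the diff, the per-layer device choice above boils down to the
condensed sketch below; pick_device and its arguments are illustrative names only,
and the real function additionally pins the largest layers straight onto the
higher-index devices before this scoring kicks in.

def pick_device(free_gb: dict, need_gb: float, neighbor_devs: set) -> str:
    best, best_score = None, float("-inf")
    for dev, free in free_gb.items():
        if free < need_gb:
            continue  # skip devices that cannot hold the layer at all
        score = free / need_gb + (1.0 if dev in neighbor_devs else 0.0)
        if score > best_score:
            best, best_score = dev, score
    # fall back to the emptiest device when nothing has enough room
    return best if best is not None else max(free_gb, key=free_gb.get)

# pick_device({"xpu:0": 10.0, "xpu:1": 30.0}, 8.0, {"xpu:0"}) -> "xpu:1"
# (memory score 3.75 beats 1.25 + 1.0 continuity bonus on xpu:0)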
@@ -679,7 +682,7 @@ def _allocate_layers_to_devices( Args: layer_memory_dict (dict): Mapping of layer names to memory info (order preserved) device_memory (dict): Available memory for each device (will be modified) - cuda_xpu_devices (list): List of device names (e.g., ["cuda:0", "cuda:1"]) + gpu_devices (list): List of device names (e.g., ["cuda:0", "cuda:1"]) mem_per_param (float): Memory multiplier per parameter GB Returns: @@ -710,20 +713,20 @@ def _allocate_layers_to_devices( layer_names_in_order = list(layer_memory_dict.keys()) layer_order = {name: idx for idx, name in enumerate(layer_names_in_order)} sorted_layers = sorted(layer_memory_dict.items(), key=lambda x: (-x[1]["param_memory"], -layer_order[x[0]])) - num_devices = len(cuda_xpu_devices) + num_devices = len(gpu_devices) def find_best_device(layer_name, estimated_memory, layer_idx): """Find the best device for a layer.""" # Phase 1: Direct assign largest layers to higher-index devices first if layer_idx < num_devices - 1: - return cuda_xpu_devices[-(layer_idx + 1)] + return gpu_devices[-(layer_idx + 1)] # Phase 2: Choose device with best score (memory + continuity) best_device = None best_score = float("-inf") current_layer_order = layer_order[layer_name] - for device in cuda_xpu_devices: + for device in gpu_devices: if device_memory[device] < estimated_memory: continue @@ -745,7 +748,7 @@ def find_best_device(layer_name, estimated_memory, layer_idx): best_device = device # Fallback: device with most available memory - return best_device or max(cuda_xpu_devices, key=lambda d: device_memory[d]) + return best_device or max(gpu_devices, key=lambda d: device_memory[d]) # Allocate layers for layer_idx, (layer_name, mem_info) in enumerate(sorted_layers): @@ -799,10 +802,10 @@ def set_auto_device_map_for_block_with_tuning( device_list = [int(dev) for dev in device_map.split(",") if dev.isdigit()] if device_list: - cuda_xpu_devices = [f"{device_name}:{i}" for i in device_list] - device_0 = cuda_xpu_devices[0] + gpu_devices = [f"{device_name}:{i}" for i in device_list] + device_0 = gpu_devices[0] else: - cuda_xpu_devices = [f"{device_name}:{i}" for i in range(num_devices)] + gpu_devices = [f"{device_name}:{i}" for i in range(num_devices)] device_0 = f"{device_name}:0" device_0_memory = get_device_memory(device_list[0] if device_list else 0) @@ -822,7 +825,7 @@ def set_auto_device_map_for_block_with_tuning( # Calculate total available memory across all devices total_available_memory = card_0_left_memory - for i in range(1, len(cuda_xpu_devices)): + for i in range(1, len(gpu_devices)): device_idx = device_list[i] if device_list else i total_available_memory += get_device_memory(device_idx) @@ -833,12 +836,12 @@ def set_auto_device_map_for_block_with_tuning( # Initialize device memory tracking device_memory = {} device_memory[device_0] = card_0_left_memory - for i in range(1, len(cuda_xpu_devices)): + for i in range(1, len(gpu_devices)): device_idx = device_list[i] if device_list else i - device_memory[cuda_xpu_devices[i]] = get_device_memory(device_idx) + device_memory[gpu_devices[i]] = get_device_memory(device_idx) # Allocate layers to devices using load-balancing strategy - device_map, names = _allocate_layers_to_devices(layer_memory_dict, device_memory, cuda_xpu_devices, mem_per_param) + device_map, names = _allocate_layers_to_devices(layer_memory_dict, device_memory, gpu_devices, mem_per_param) logger.debug(f"Auto device map for block: {device_map}") set_non_auto_device_map(block, device_map, names) @@ -927,9 +930,9 @@ def 
set_avg_auto_device_map(model: torch.nn.Module, device_map): return if device_list: - cuda_xpu_devices = [f"{device_name}:{i}" for i in device_list] + gpu_devices = [f"{device_name}:{i}" for i in device_list] else: - cuda_xpu_devices = [f"{device_name}:{i}" for i in range(num_devices)] + gpu_devices = [f"{device_name}:{i}" for i in range(num_devices)] for block_names in block_name_list: for block_name in block_names: @@ -945,7 +948,7 @@ def set_avg_auto_device_map(model: torch.nn.Module, device_map): device_index = 0 for res in res_list: for key in res.keys(): - set_tuning_device_for_layer(block_module, key, cuda_xpu_devices[device_index]) + set_tuning_device_for_layer(block_module, key, gpu_devices[device_index]) device_index += 1 From 40d634cb3d7d0672bfb7e5f9490f9012e6dc6e6a Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 30 Oct 2025 01:39:10 -0400 Subject: [PATCH 07/13] clear_memory only xpu and remove block.to(device) Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 32 ++++++++++++++++++-------------- auto_round/utils/device.py | 20 +++++++++++++++----- 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 0f21850f3..e234d6802 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -311,7 +311,6 @@ def __init__( self.low_gpu_mem_usage = low_gpu_mem_usage self.seqlen = seqlen self.batch_size, self.gradient_accumulate_steps = batch_size, gradient_accumulate_steps - self.pick_samples = self.batch_size * self.gradient_accumulate_steps self.nblocks = nblocks self.dataset = dataset self.iters = iters @@ -1424,7 +1423,7 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str]) if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.pick_samples + block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size ) # Dispatch model if needed if self.device_map is not None: @@ -2250,10 +2249,10 @@ def _quantize_layer( init_loss = None gradient_accumulate_steps = self.batch_size # Force to low gpu batch_size = 1 # Force to low gpu - self.pick_samples = batch_size * gradient_accumulate_steps - self.pick_samples = min(nsamples, self.pick_samples) + pick_samples = batch_size * gradient_accumulate_steps + pick_samples = min(nsamples, pick_samples) if self.sampler != "rand": - whole_indices = torch.randperm(nsamples)[: self.pick_samples] + whole_indices = torch.randperm(nsamples)[:pick_samples] total_loss = 0 num_elm = 1 mse_reduction = "mean" @@ -2264,7 +2263,7 @@ def _quantize_layer( for i in range(self.iters): total_loss = 0 if self.sampler == "rand": - whole_indices = torch.randperm(nsamples)[: self.pick_samples] + whole_indices = torch.randperm(nsamples)[:pick_samples] if gradient_accumulate_steps != 1: if q_inputs is not None: num_elm = self._get_current_num_elm(q_inputs, whole_indices) @@ -2440,7 +2439,7 @@ def _quantize_block( if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.pick_samples + block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size ) if self.device_map is not None: @@ -2483,8 +2482,9 @@ def _quantize_block( if q_input is not None: if input_ids is not q_input: clear_memory(input_ids) + else: + clear_memory() 
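
The tuning loop above leans on clear_memory(); conceptually it is a cache flush of
the following shape (a sketch assuming it wraps the standard allocator hooks; the
project's own helper may do more):

import gc

import torch


def flush_accelerator_cache() -> None:
    """Release cached allocator blocks on whichever accelerator is present."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        torch.xpu.empty_cache()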
input_ids = q_input - clear_memory() quantized_layer_names, unquantized_layer_names = wrapper_block( block, @@ -2540,9 +2540,10 @@ def _quantize_block( else: nsamples = len(input_ids) - self.pick_samples = min(nsamples, self.pick_samples) + pick_samples = self.batch_size * self.gradient_accumulate_steps + pick_samples = min(nsamples, pick_samples) if self.sampler != "rand": - whole_indices = torch.randperm(nsamples)[: self.pick_samples] + whole_indices = torch.randperm(nsamples)[:pick_samples] last_best_iter = 0 best_loss = torch.finfo(torch.float).max num_elm = 1 @@ -2557,7 +2558,7 @@ def _quantize_block( for i in range(self.iters): total_loss = 0 if self.sampler == "rand": - whole_indices = torch.randperm(nsamples)[: self.pick_samples] + whole_indices = torch.randperm(nsamples)[:pick_samples] # We assume the block input and output shape is same if self.gradient_accumulate_steps != 1: num_elm = self._get_current_num_elm(input_ids, whole_indices) @@ -2570,7 +2571,7 @@ def _quantize_block( current_output = to_device(current_output, device) output_q = self._get_current_q_output(block, input_ids, input_others, indices, device) - clear_memory() # clean cached memory after getting output_q + if self.attention_mask: tmp_attention_mask = [self.attention_mask[i] for i in indices] tmp_attention_mask = torch.cat(tmp_attention_mask, dim=0).to(device) @@ -2590,7 +2591,11 @@ def _quantize_block( total_loss += loss.item() / num_elm self._scale_loss_and_backward(scaler, loss) - clear_memory() # clean cached memory after backward + + # Temporary change for 70B model OOM issue on XPU + # TODO: Remove after https://github.com/intel/torch-xpu-ops/issues/2232 is fixed + if torch.xpu.is_available(): + clear_memory() # clean cached memory after backward if i == 0: init_loss = total_loss @@ -2745,7 +2750,6 @@ def _quantize_blocks( modules = [get_module(model, n) for n in names] m = WrapperMultiblock(modules) - m = m.to(device) q_input, input_ids = quantize_block( m, input_ids, diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index 4551020e9..b9290b04c 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -473,9 +473,9 @@ def estimate_tuning_block_mem( tuple: A tuple containing the following: - layer_memory_dict (dict): A dictionary mapping layer names to their memory consumption (in GB). Format: {layer_name: {"param_memory": float, "output_memory": float}} - SDPA layers are represented with a fixed 1GB output memory. - input_output_memory (float): The memory consumption (in GB) for input and output tensors of the block. + - additional_memory (float): Additional memory overhead (in GB) for operations like attention. """ # Calculate all block parameters memory and build layer-wise memory dict from auto_round.utils.model import get_layer_features @@ -517,13 +517,13 @@ def estimate_tuning_block_mem( # Assuming bfloat16 or float32, input and output input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 - # considering sdpa (attention activation) memory and reference_output memory for loss calculation + # Considering norm, sdpa, reference_output, etc. additional_memory = 1 if torch.xpu.is_available(): # https://github.com/intel/torch-xpu-ops/issues/2232 - # sdpa on XPU takes more memory than expected. 2 from grad tensor - xpu_sdpa_additional_memory = 9 # GB - additional_memory += xpu_sdpa_additional_memory * 2 + # TODO: XPU takes more memory than expected. 
for llama 8B, it's 9*2 GB + xpu_additional_memory = 9 # GB + additional_memory += xpu_additional_memory * 2 return layer_memory_dict, input_output_memory, additional_memory @@ -846,6 +846,16 @@ def set_auto_device_map_for_block_with_tuning( set_non_auto_device_map(block, device_map, names) + # Ensure all remaining modules with params/buffers are moved to device_0 + # This prevents mixed CPU/GPU execution within the same block + for name, module in block.named_modules(): + if name not in names: # This module wasn't assigned a device + # Check if module has any parameters or buffers + has_params = any(True for _ in module.parameters(recurse=False)) + has_buffers = any(True for _ in module.buffers(recurse=False)) + if has_params or has_buffers: + set_tuning_device_for_layer(block, name, device_0) + def partition_dict_numbers(number_dict, n): """ From 135339dc66f09179e006236694d23467382214e6 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 30 Oct 2025 02:50:26 -0400 Subject: [PATCH 08/13] fix bug and refine additional memory calcu logic Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 2 +- auto_round/utils/device.py | 159 +++++++++++++++++---------------- 2 files changed, 83 insertions(+), 78 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index e234d6802..fa0653327 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2594,7 +2594,7 @@ def _quantize_block( # Temporary change for 70B model OOM issue on XPU # TODO: Remove after https://github.com/intel/torch-xpu-ops/issues/2232 is fixed - if torch.xpu.is_available(): + if torch.xpu.is_available() and self.low_gpu_mem_usage: clear_memory() # clean cached memory after backward if i == 0: diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index b9290b04c..f4c367881 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -458,76 +458,6 @@ def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): return False, seqlen, bs -def estimate_tuning_block_mem( - block: torch.nn.Module, input_ids: list[torch.Tensor], pick_samples: int -) -> tuple[dict, float]: - """ - Calculates the memory consumption of a specific block in the model. - - Args: - block (torch.nn.Module): The block of the model to analyze. - input_ids (list[torch.Tensor]): A list of input tensors for the block. - pick_samples (int): Number of samples to consider for memory estimation. - - Returns: - tuple: A tuple containing the following: - - layer_memory_dict (dict): A dictionary mapping layer names to their memory consumption (in GB). - Format: {layer_name: {"param_memory": float, "output_memory": float}} - - input_output_memory (float): The memory consumption (in GB) for input and output - tensors of the block. - - additional_memory (float): Additional memory overhead (in GB) for operations like attention. 
- """ - # Calculate all block parameters memory and build layer-wise memory dict - from auto_round.utils.model import get_layer_features - - layer_memory_dict = {} - total_param_mem = 0 - - # Calculate batch_size and sequence_length from input_ids for output memory estimation - seq_len = input_ids[0].shape[1] if input_ids and len(input_ids[0].shape) >= 2 else 1 - element_size = input_ids[0].element_size() if input_ids else 2 # Default to 2 bytes (fp16/bf16) - - for name, module in block.named_modules(): - if check_to_quantized(module): - enable_act_quant = module.act_bits <= 8 - layer_name = name - param_size = module.weight.nbytes - total_param_mem += param_size - param_memory_gb = param_size / 1024**3 - param_memory_gb *= 2 # considering the v tensor for weight rounding - - # Estimate output memory based on input_features and out_features - in_features, out_features = get_layer_features(module) - if in_features is not None and out_features is not None: - # Output tensor size: batch_size * seq_len * out_features * element_size - output_size = pick_samples * seq_len * out_features * element_size - output_memory_gb = output_size / 1024**3 - - # If enable_act_quant, add input tensor memory to param_memory - if enable_act_quant: - input_size = pick_samples * seq_len * in_features * element_size - input_memory_gb = input_size / 1024**3 - param_memory_gb += input_memory_gb - else: - output_memory_gb = 0.0 - - # memory * 2, because it contains grad tensor. - layer_memory_dict[layer_name] = {"param_memory": param_memory_gb * 2, "output_memory": output_memory_gb * 2} - - # Assuming bfloat16 or float32, input and output - input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 - - # Considering norm, sdpa, reference_output, etc. - additional_memory = 1 - if torch.xpu.is_available(): - # https://github.com/intel/torch-xpu-ops/issues/2232 - # TODO: XPU takes more memory than expected. for llama 8B, it's 9*2 GB - xpu_additional_memory = 9 # GB - additional_memory += xpu_additional_memory * 2 - - return layer_memory_dict, input_output_memory, additional_memory - - def out_of_vram(error_msg): error_msg = str(error_msg) # CUDA @@ -763,6 +693,77 @@ def find_best_device(layer_name, estimated_memory, layer_idx): return ordered_device_map, names +def estimate_tuning_block_mem( + block: torch.nn.Module, input_ids: list[torch.Tensor], pick_samples: int +) -> tuple[dict, float]: + """ + Calculates the memory consumption of a specific block in the model. + + Args: + block (torch.nn.Module): The block of the model to analyze. + input_ids (list[torch.Tensor]): A list of input tensors for the block. + pick_samples (int): Number of samples to consider for memory estimation. + + Returns: + tuple: A tuple containing the following: + - layer_memory_dict (dict): A dictionary mapping layer names to their memory consumption (in GB). + Format: {layer_name: {"param_memory": float, "output_memory": float}} + - input_output_memory (float): The memory consumption (in GB) for input and output + tensors of the block. + - additional_memory (float): Additional memory overhead (in GB) for operations like attention. 
+ """ + # Calculate all block parameters memory and build layer-wise memory dict + from auto_round.utils.model import get_layer_features + + layer_memory_dict = {} + total_param_mem = 0 + + # Calculate batch_size and sequence_length from input_ids for output memory estimation + seq_len = input_ids[0].shape[1] if input_ids and len(input_ids[0].shape) >= 2 else 1 + element_size = input_ids[0].element_size() if input_ids else 2 # Default to 2 bytes (fp16/bf16) + + for name, module in block.named_modules(): + if check_to_quantized(module): + enable_act_quant = module.act_bits <= 8 + layer_name = name + param_size = module.weight.nbytes + param_memory_gb = param_size / 1024**3 + param_memory_gb *= 2 # considering the v tensor for weight rounding + + # Estimate output memory based on input_features and out_features + in_features, out_features = get_layer_features(module) + if in_features is not None and out_features is not None: + # Output tensor size: batch_size * seq_len * out_features * element_size + output_size = pick_samples * seq_len * out_features * element_size + output_memory_gb = output_size / 1024**3 + + # If enable_act_quant, add input tensor memory to param_memory + if enable_act_quant: + input_size = pick_samples * seq_len * in_features * element_size + input_memory_gb = input_size / 1024**3 + param_memory_gb += input_memory_gb + else: + output_memory_gb = 0.0 + + # memory * 2, because it contains grad tensor. + layer_memory_dict[layer_name] = {"param_memory": param_memory_gb * 2, "output_memory": output_memory_gb * 2} + + # Assuming bfloat16 or float32, input and output + block_input_output_memory = 2 * sum(tensor.nbytes for tensor in input_ids) / 1024**3 + + # Roughly estimate additional memory for attention and other operations + additional_activation_memory = sum(info["output_memory"] for info in layer_memory_dict.values()) + # 1GB considers norm weight, sdpa, reference_output, etc. + additional_memory = additional_activation_memory + 1 # GB + if torch.xpu.is_available(): + # https://github.com/intel/torch-xpu-ops/issues/2232 + # TODO: XPU takes more memory than expected. 
for llama 8B, it's about 12 GB + xpu_additional_memory = 12 # GB + additional_memory += xpu_additional_memory + + return layer_memory_dict, block_input_output_memory, additional_memory + + def set_auto_device_map_for_block_with_tuning( block: torch.nn.Module, device_map, @@ -809,19 +810,23 @@ def set_auto_device_map_for_block_with_tuning( device_0 = f"{device_name}:0" device_0_memory = get_device_memory(device_list[0] if device_list else 0) - layer_memory_dict, input_output_memory, additional_memory = estimate_tuning_block_mem( + layer_memory_dict, block_input_output_memory, additional_memory = estimate_tuning_block_mem( block, input_ids, pick_samples ) if low_gpu_mem_usage: - input_output_memory = 0 + block_input_output_memory = 0 # Calculate total block memory from layer memory dict (including both param and output memory) total_block_param_memory = sum(info["param_memory"] for info in layer_memory_dict.values()) total_block_output_memory = sum(info["output_memory"] for info in layer_memory_dict.values()) # Average dispatch strategy - # card_0_left_memory = card_0_mem - input_output_memory - additional_memory - layer_outputs_memory - card_0_left_memory = device_0_memory - input_output_memory - additional_memory - total_block_output_memory + # card_0_left_memory = card_0_mem - block_input_output_memory - additional_memory - layer_outputs_memory + logger.debug("Card 0 used memory details:") + logger.debug(f" Block input output cache memory: {block_input_output_memory} GB") + logger.debug(f" Quantized layer outputs memory: {total_block_output_memory} GB") + logger.debug(f" Additional_memory from other ops: {additional_memory} GB") + card_0_left_memory = device_0_memory - block_input_output_memory - total_block_output_memory - additional_memory # Calculate total available memory across all devices total_available_memory = card_0_left_memory @@ -842,11 +847,11 @@ def set_auto_device_map_for_block_with_tuning( # Allocate layers to devices using load-balancing strategy device_map, names = _allocate_layers_to_devices(layer_memory_dict, device_memory, gpu_devices, mem_per_param) - logger.debug(f"Auto device map for block: {device_map}") + logger.debug(f"Auto device map for block: {device_map}") set_non_auto_device_map(block, device_map, names) - # Ensure all remaining modules with params/buffers are moved to device_0 + # Ensure all remaining modules with parameters/buffers are moved to device_0 # This prevents mixed CPU/GPU execution within the same block for name, module in block.named_modules(): if name not in names: # This module wasn't assigned a device @@ -854,7 +859,7 @@ def set_auto_device_map_for_block_with_tuning( has_params = any(True for _ in module.parameters(recurse=False)) has_buffers = any(True for _ in module.buffers(recurse=False)) if has_params or has_buffers: - set_tuning_device_for_layer(block, name, device_0) + module = module.to(device_0) def partition_dict_numbers(number_dict, n): From 488eec1a498bb8a47a875c1787803f655c9d97aa Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 30 Oct 2025 03:36:15 -0400 Subject: [PATCH 09/13] consider output_device when setting device_map Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 4 ++-- auto_round/utils/device.py | 8 +++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index fa0653327..5272e1db3 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2439,7 +2439,7 @@ def _quantize_block( if self.device_map == 
"auto" or (isinstance(self.device_map, str) and "," in self.device_map): set_auto_device_map_for_block_with_tuning( - block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size + block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, device ) if self.device_map is not None: @@ -2491,7 +2491,7 @@ def _quantize_block( self.enable_minmax_tuning, self.enable_norm_bias_tuning, enable_torch_compile=self.enable_torch_compile, - device=self.device, + device=device, ) if is_nv_fp(self.data_type): # enable qkv and moe structure global_scale fuse from auto_round.data_type.utils import update_fused_layer_global_scales diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index f4c367881..ccad2cef0 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -770,6 +770,7 @@ def set_auto_device_map_for_block_with_tuning( input_ids: list[torch.Tensor], low_gpu_mem_usage=False, pick_samples=8, + output_device=None, ): """ Automatically sets the device map for the block based on available GPUs and memory constraints. @@ -780,6 +781,7 @@ def set_auto_device_map_for_block_with_tuning( input_ids (list[torch.Tensor]): List of input tensors used for estimating memory requirements. low_gpu_mem_usage (bool, optional): If True, ignoring input/output memory. Defaults to False. pick_samples (int, optional): Number of samples to consider for memory estimation. Defaults to 8. + output_device (str | torch.device, optional): Device to move unassigned modules to. Defaults to None. Returns: None @@ -851,15 +853,15 @@ def set_auto_device_map_for_block_with_tuning( logger.debug(f"Auto device map for block: {device_map}") set_non_auto_device_map(block, device_map, names) - # Ensure all remaining modules with parameters/buffers are moved to device_0 - # This prevents mixed CPU/GPU execution within the same block + # Ensure all remaining modules with parameters/buffers are moved to expected device, by default device_0 + output_device = device_0 if output_device is None else output_device for name, module in block.named_modules(): if name not in names: # This module wasn't assigned a device # Check if module has any parameters or buffers has_params = any(True for _ in module.parameters(recurse=False)) has_buffers = any(True for _ in module.buffers(recurse=False)) if has_params or has_buffers: - module = module.to(device_0) + module = module.to(output_device) def partition_dict_numbers(number_dict, n): From 42b1453a10f883b7eb6e175117ecdf3be72bfb7e Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Thu, 30 Oct 2025 04:47:05 -0400 Subject: [PATCH 10/13] fix bug Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index 5272e1db3..eb8f634f7 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -2437,10 +2437,12 @@ def _quantize_block( new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to(device) set_module(block, n, new_layer) - if self.device_map == "auto" or (isinstance(self.device_map, str) and "," in self.device_map): + if self.device_map == "auto" or ((isinstance(self.device_map, str) and "," in self.device_map)): set_auto_device_map_for_block_with_tuning( block, self.device_map, input_ids, self.low_gpu_mem_usage, self.batch_size, device ) + else: + block = block.to(device) if self.device_map is not None: for n, m in block.named_modules(): From 194ac2749c6f720ee50c9da4c5ff59a8df18c776 Mon 
Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 31 Oct 2025 03:20:27 -0400 Subject: [PATCH 11/13] clear_memory_if_reached_threshold Signed-off-by: He, Xin3 --- auto_round/compressors/base.py | 7 ++-- auto_round/utils/device.py | 63 ++++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py index eb8f634f7..330003954 100644 --- a/auto_round/compressors/base.py +++ b/auto_round/compressors/base.py @@ -91,6 +91,7 @@ unsupported_meta_device, ) from auto_round.utils.device import ( + clear_memory_if_reached_threshold, get_major_device, set_auto_device_map_for_block_with_tuning, set_non_auto_device_map, @@ -2593,11 +2594,7 @@ def _quantize_block( total_loss += loss.item() / num_elm self._scale_loss_and_backward(scaler, loss) - - # Temporary change for 70B model OOM issue on XPU - # TODO: Remove after https://github.com/intel/torch-xpu-ops/issues/2232 is fixed - if torch.xpu.is_available() and self.low_gpu_mem_usage: - clear_memory() # clean cached memory after backward + clear_memory_if_reached_threshold(threshold=0.85) if i == 0: init_loss = total_loss diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index ccad2cef0..f78114410 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -415,6 +415,62 @@ def clear_memory(tensor=None): _clear_memory_for_cpu_and_cuda(tensor) +def clear_memory_if_reached_threshold(threshold=0.85): + """Check all available devices and clear memory if any device is using close to the threshold. + + Args: + threshold (float): Memory usage threshold (default: 0.85 for 85%). + If any device exceeds this percentage, clear_memory() will be called. + + Returns: + bool: True if memory was cleared, False otherwise. + """ + # Check CUDA devices + if torch.cuda.is_available(): + num_devices = torch.cuda.device_count() + for i in range(num_devices): + try: + total_memory = torch.cuda.get_device_properties(i).total_memory + allocated_memory = torch.cuda.memory_reserved(i) + memory_usage_ratio = allocated_memory / total_memory + + if memory_usage_ratio >= threshold: + logger.info( + f"CUDA device {i}: Memory usage {memory_usage_ratio*100:.2f}% " + f"exceeds threshold {threshold*100:.2f}%. Clearing memory..." + ) + clear_memory() + allocated_memory = torch.cuda.memory_reserved(i) + memory_usage_ratio = allocated_memory / total_memory + logger.info(f"Cleared memory. CUDA device {i}: Memory usage {memory_usage_ratio*100:.2f}%") + break + except Exception as e: + logger.warning(f"Failed to check memory for CUDA device {i}: {e}") + + # Check XPU devices if memory not yet cleared + if hasattr(torch, "xpu") and torch.xpu.is_available(): + num_devices = torch.xpu.device_count() + for i in range(num_devices): + try: + total_memory = torch.xpu.get_device_properties(i).total_memory + allocated_memory = torch.xpu.memory_allocated(i) + memory_usage_ratio = allocated_memory / total_memory + + if memory_usage_ratio >= threshold: + logger.info( + f"XPU device {i}: Memory usage {memory_usage_ratio*100:.2f}% " + f"exceeds threshold {threshold*100:.2f}%. Clearing memory..." + ) + clear_memory() + allocated_memory = torch.xpu.memory_reserved(i) + memory_usage_ratio = allocated_memory / total_memory + logger.info(f"Cleared memory. 
XPU device {i}: Memory usage {memory_usage_ratio*100:.2f}%") + break + except Exception as e: + logger.warning(f"Failed to check memory for XPU device {i}: {e}") + return False + + def check_memory_availability(device, inputs, weight, org_seqlen, org_bs): """Checks the availability of memory on the specified device for processing inputs using a given weight tensor. @@ -824,11 +880,14 @@ def set_auto_device_map_for_block_with_tuning( # Average dispatch strategy # card_0_left_memory = card_0_mem - block_input_output_memory - additional_memory - layer_outputs_memory - logger.debug("Card 0 used memory details:") + logger.debug("Card 0 used memory details [Estimated]:") logger.debug(f" Block input output cache memory: {block_input_output_memory} GB") logger.debug(f" Quantized layer outputs memory: {total_block_output_memory} GB") logger.debug(f" Additional_memory from other ops: {additional_memory} GB") - card_0_left_memory = device_0_memory - block_input_output_memory - total_block_output_memory - additional_memory + + card_0_left_memory = max( + 0, device_0_memory - block_input_output_memory - total_block_output_memory - additional_memory + ) # Calculate total available memory across all devices total_available_memory = card_0_left_memory From ddb20ed13eddc1462f317058370fae5e3e12a65c Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 31 Oct 2025 04:40:17 -0400 Subject: [PATCH 12/13] add warning for XPU additional memory Signed-off-by: He, Xin3 --- auto_round/utils/device.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index f78114410..fd0ad29ea 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -816,6 +816,8 @@ def estimate_tuning_block_mem( # TODO: XPU takes more memory than expected. for llama 8B, it's about 12 GB xpu_additional_memory = 12 # GB additional_memory += xpu_additional_memory + logger.warning_once("XPU additional memory usage of SDPA is estimated to be 12 GB.") + logger.warning_once("Remove it after https://github.com/intel/torch-xpu-ops/issues/2232 is fixed.") return layer_memory_dict, block_input_output_memory, additional_memory From 015331b288cbaef331e9caccbfdea5d6f3816345 Mon Sep 17 00:00:00 2001 From: "He, Xin3" Date: Fri, 31 Oct 2025 05:44:36 -0400 Subject: [PATCH 13/13] unify code Signed-off-by: He, Xin3 --- auto_round/utils/device.py | 65 ++++++++++++++------------------------ 1 file changed, 24 insertions(+), 41 deletions(-) diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py index fd0ad29ea..3ea3e63e0 100644 --- a/auto_round/utils/device.py +++ b/auto_round/utils/device.py @@ -425,50 +425,33 @@ def clear_memory_if_reached_threshold(threshold=0.85): Returns: bool: True if memory was cleared, False otherwise. """ - # Check CUDA devices + # Detect CUDA/XPU devices if torch.cuda.is_available(): - num_devices = torch.cuda.device_count() - for i in range(num_devices): - try: - total_memory = torch.cuda.get_device_properties(i).total_memory - allocated_memory = torch.cuda.memory_reserved(i) - memory_usage_ratio = allocated_memory / total_memory + name, device_api = "CUDA", torch.cuda + elif hasattr(torch, "xpu") and torch.xpu.is_available(): + name, device_api = "XPU", torch.xpu + else: + return - if memory_usage_ratio >= threshold: - logger.info( - f"CUDA device {i}: Memory usage {memory_usage_ratio*100:.2f}% " - f"exceeds threshold {threshold*100:.2f}%. Clearing memory..." 
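[Note] A usage sketch of the idea behind the unified helper: check one device and release cached blocks only when the 0.85 threshold (the same value passed from compressors/base.py) is crossed. cache_usage_ratio is a hypothetical name, and the direct empty_cache() calls stand in for whatever clear_memory() does internally:

    import torch

    def cache_usage_ratio(i: int = 0) -> float:
        # Reserved memory on CUDA, allocated memory on XPU, mirroring the helper above.
        if torch.cuda.is_available():
            total = torch.cuda.get_device_properties(i).total_memory
            return torch.cuda.memory_reserved(i) / total
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            total = torch.xpu.get_device_properties(i).total_memory
            return torch.xpu.memory_allocated(i) / total
        return 0.0

    if cache_usage_ratio(0) >= 0.85:    # same threshold as the call site in base.py
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            torch.xpu.empty_cache()

Running such a check after every backward pass trades a little synchronization overhead for headroom on devices where the allocator cache grows quickly, e.g. the XPU case tracked in intel/torch-xpu-ops#2232.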
- ) - clear_memory() - allocated_memory = torch.cuda.memory_reserved(i) - memory_usage_ratio = allocated_memory / total_memory - logger.info(f"Cleared memory. CUDA device {i}: Memory usage {memory_usage_ratio*100:.2f}%") - break - except Exception as e: - logger.warning(f"Failed to check memory for CUDA device {i}: {e}") - - # Check XPU devices if memory not yet cleared - if hasattr(torch, "xpu") and torch.xpu.is_available(): - num_devices = torch.xpu.device_count() - for i in range(num_devices): - try: - total_memory = torch.xpu.get_device_properties(i).total_memory - allocated_memory = torch.xpu.memory_allocated(i) + num_devices = device_api.device_count() + for i in range(num_devices): + try: + total_memory = device_api.get_device_properties(i).total_memory + allocated_memory = device_api.memory_reserved(i) if name == "CUDA" else device_api.memory_allocated(i) + memory_usage_ratio = allocated_memory / total_memory + + if memory_usage_ratio >= threshold: + logger.warning_once( + f"{name} device {i}: Memory usage {memory_usage_ratio*100:.2f}% " + f"exceeds threshold {threshold*100:.2f}%. Clearing memory..." + ) + clear_memory() + allocated_memory = device_api.memory_reserved(i) if name == "CUDA" else device_api.memory_allocated(i) memory_usage_ratio = allocated_memory / total_memory - - if memory_usage_ratio >= threshold: - logger.info( - f"XPU device {i}: Memory usage {memory_usage_ratio*100:.2f}% " - f"exceeds threshold {threshold*100:.2f}%. Clearing memory..." - ) - clear_memory() - allocated_memory = torch.xpu.memory_reserved(i) - memory_usage_ratio = allocated_memory / total_memory - logger.info(f"Cleared memory. XPU device {i}: Memory usage {memory_usage_ratio*100:.2f}%") - break - except Exception as e: - logger.warning(f"Failed to check memory for XPU device {i}: {e}") - return False + logger.warning_once(f"Cleared memory. {name} device {i}: Memory usage {memory_usage_ratio*100:.2f}%") + return True + except Exception as e: + logger.warning_once(f"Failed to check memory for {name} device {i}: {e}") def check_memory_availability(device, inputs, weight, org_seqlen, org_bs):