Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
df5f17e
enable auto device map
Kaihui-intel Sep 2, 2025
379da1f
mv get_block_info to utils
Kaihui-intel Sep 2, 2025
b9306c0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 2, 2025
c367b61
enable low_gpu_usage_mem
Kaihui-intel Sep 2, 2025
a6af52d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 2, 2025
78858ab
support Conv1D and XPU
Kaihui-intel Sep 2, 2025
3e3a2e9
Merge branch 'kaihui/auto_device' of https://github.com/intel/auto-ro…
Kaihui-intel Sep 2, 2025
b892960
use get_block_names
Kaihui-intel Sep 2, 2025
d32978e
FCFS for block layers
Kaihui-intel Sep 3, 2025
7710686
update comments&First-Fill
Kaihui-intel Sep 4, 2025
09d6d2d
merge main
Kaihui-intel Sep 4, 2025
33c7ff8
update comments
Kaihui-intel Sep 4, 2025
dd6e3ce
support 0,1,2 & rtn
Kaihui-intel Sep 4, 2025
871c3b4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 4, 2025
f7ab027
use nbytes
Kaihui-intel Sep 5, 2025
f04ff10
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 5, 2025
f028c96
Merge branch 'main' into kaihui/auto_device
wenhuach21 Sep 5, 2025
efc1900
fix rtn ut
Kaihui-intel Sep 5, 2025
09c54ee
Merge branch 'main' into kaihui/auto_device
Kaihui-intel Sep 5, 2025
b036d7f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 5, 2025
977a18b
update args type
Kaihui-intel Sep 5, 2025
ef23872
Merge branch 'kaihui/auto_device' of https://github.com/intel/auto-ro…
Kaihui-intel Sep 5, 2025
c7c0b04
disable CUDA_VISIBLE_DEVICE setting
Kaihui-intel Sep 5, 2025
a1806a1
Merge branch 'kaihui/auto_device' of https://github.com/intel/auto-ro…
Kaihui-intel Sep 5, 2025
bcc574a
fix ds
Kaihui-intel Sep 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 84 additions & 5 deletions auto_round/autoround.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,11 @@
convert_fp8_layer_to_linear,
convert_fp8_model_to_16b_model,
detect_device,
estimate_tuning_block_mem,
find_matching_blocks,
flatten_list,
get_block_names,
get_device_memory,
get_layer_config_by_gguf_format,
get_layer_features,
get_layer_names_in_block,
Expand Down Expand Up @@ -228,20 +230,19 @@ def __init__(
logger.warning("`device` is deprecated, please use `device_map` instead")

self.vlm = kwargs.pop("vlm") if "vlm" in kwargs else False
# Scale factor for RAM usage per parameter.
self.mem_per_param_scale = kwargs.pop("mem_per_param_scale", None)

if kwargs:
logger.warning(f"unrecognized keys {list(kwargs.keys())} were passed. Please check them.")

if device_map is not None and "," in str(device_map):
raise ValueError(
"API does not support explicit set multiple devices," " please set CUDA_VISIBLE_DEVICES=0,1 yourself"
)
if device_map is None:
device_map = 0

# Set device, must place after model loading
if isinstance(device_map, (str, torch.device, int)):
self.device = detect_device(device_map)

elif isinstance(device_map, dict) and device_map:
tmp_devices = []
for val in device_map.values():
Expand All @@ -258,8 +259,12 @@ def __init__(

self.device = tmp_devices[0]

if isinstance(device_map, dict) and device_map:
if (isinstance(device_map, dict) and device_map) or device_map == "auto":
self.device_map = device_map
elif isinstance(device_map, str) and "," in device_map:
device_map = device_map.replace(" ", "") # Remove any spaces
self.device_list = [int(dev) for dev in device_map.split(",") if dev.isdigit()]
self.device_map = "auto"
else:
self.device_map = None
self._set_device_map_in_blocks(self.device_map)
Expand Down Expand Up @@ -543,6 +548,8 @@ def _set_device_map_in_blocks(self, device_map: Union[str, dict, None]) -> None:
self.device_map = None
if not device_map:
return
if self.device_map == "auto" and device_map == "auto":
return
if isinstance(device_map, str):
device_map = device_map.replace(" ", "")
infos = device_map.split(",")
Expand Down Expand Up @@ -583,6 +590,71 @@ def _set_device_for_matching_module(self, name: str, device: str) -> None:
else:
module.tuning_device = device

def _set_auto_device_map_in_block(self, block: torch.nn.Module, input_ids: list[torch.Tensor]) -> None:
    """Spread the quantizable layers of ``block`` over several CUDA devices when it does not fit on one.

    The estimated tuning footprint is ``block weights * mem_per_param_scale`` plus the
    cached inputs/outputs. If that fits on the first device nothing is changed. Otherwise
    layers are placed on the first device until one no longer fits, the first overflowing
    layer goes to the second device, and every later layer goes to whichever device has
    the most estimated memory left. The resulting ``{layer_name: device}`` mapping is
    handed to ``self._set_device_map_in_blocks``.

    Args:
        block: The block whose layers are about to be tuned. Must carry a ``tmp_name``
            attribute, which is used as the prefix of the layer names.
        input_ids: Cached input tensors of the block, used to estimate activation memory.

    Raises:
        RuntimeError: If neither CUDA nor XPU is available.
    """
    if torch.cuda.is_available():
        num_gpus = torch.cuda.device_count()
    elif hasattr(torch, "xpu") and torch.xpu.is_available():
        # ``torch.xpu`` is missing on some torch builds, hence the hasattr guard.
        logger.warning_once("XPU does not support auto device map yet, using device 0 for tuning.")
        return
    else:
        raise RuntimeError("No CUDA or XPU devices found.")
    if num_gpus <= 1:
        self.device_map = None
        return

    # ``device_list`` only exists when the user passed an explicit "0,1,2"-style device map.
    device_list = getattr(self, "device_list", None)
    if device_list:
        cuda_devices = [f"cuda:{i}" for i in device_list]
    else:
        cuda_devices = [f"cuda:{i}" for i in range(num_gpus)]

    if len(cuda_devices) <= 1:
        # A single selected device leaves nowhere to spill to; without this guard the
        # overflow branch below would index ``cuda_devices[1]`` and raise IndexError.
        self.device_map = None
        return

    device_0 = cuda_devices[0]
    device_0_memory = get_device_memory(device_list[0] if device_list else 0)

    block_memory, input_output_memory = estimate_tuning_block_mem(block, input_ids)
    if self.low_gpu_mem_usage:
        # Activation memory is not counted against the device in low-GPU-memory mode.
        input_output_memory = 0

    mem_per_param_scale = 13 if self.mem_per_param_scale is None else self.mem_per_param_scale
    if self.iters == 0:
        mem_per_param_scale = 1  # for rtn

    if (block_memory * mem_per_param_scale + input_output_memory) < device_0_memory:
        return  # fit in one GPU

    device_map = {}
    # Remaining estimated budget (GB) per device; the first device also hosts inputs/outputs.
    device_memory = {device: get_device_memory(int(device.split(":")[1])) for device in cuda_devices}
    device_memory[device_0] = device_0_memory - input_output_memory

    # First, fill device 0 to its maximum capacity, then distribute the remaining layers
    # to whichever device currently has the most budget left.
    first_device_full = False
    for n, m in block.named_modules():
        if not check_to_quantized(m):
            continue
        layer_name = block.tmp_name + "." + n
        layer_memory = m.weight.nbytes / 1024**3
        required_memory = layer_memory * mem_per_param_scale

        if not first_device_full:
            if required_memory < device_memory[device_0]:
                target_device = device_0
            else:
                # Device 0 is full: this layer goes to the next device without a fit check.
                first_device_full = True
                target_device = cuda_devices[1]
        else:
            # ``max`` returns the first device among ties, like a stable descending sort.
            target_device = max(cuda_devices, key=lambda d: device_memory[d])
            if required_memory >= device_memory[target_device]:
                # The layer is intentionally left out of the map; only a warning is emitted.
                logger.warning_once(
                    f"Block {block.tmp_name} not fit in available GPU memory. "
                    "Consider using more GPUs or reducing mem_per_param_scale if OOM occurs."
                )
                continue

        device_map[layer_name] = target_device
        device_memory[target_device] -= required_memory

    self._set_device_map_in_blocks(device_map)

def _dq_check(self) -> None:
"""Reset the default value of super_bits and super_group_size"""
if self.data_type.endswith("_dq"):
Expand Down Expand Up @@ -1488,6 +1560,10 @@ def _quantize_via_rtn_blockwise(self, all_to_quantized_module_names: list[str])
block = block.to(self.device)
if _is_fp8_model(self.model):
convert_fp8_model_to_16b_model(block, dtype=self.amp_dtype)

if self.device_map == "auto":
self._set_auto_device_map_in_block(block, input_ids)

# Dispatch model if needed
if self.device_map is not None:
from accelerate.hooks import AlignDevicesHook, add_hook_to_module
Expand Down Expand Up @@ -2551,6 +2627,9 @@ def _quantize_block(
new_layer = convert_fp8_layer_to_linear(m, self.amp_dtype).to(device)
set_module(block, n, new_layer)

if self.device_map == "auto":
self._set_auto_device_map_in_block(block, input_ids)

if self.device_map is not None:
for n, m in block.named_modules():
if len(list(m.children())) != 0 or not hasattr(m, "tuning_device"):
Expand Down
9 changes: 8 additions & 1 deletion auto_round/script/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,13 @@ def __init__(self, *args, **kwargs):
help="minmax learning rate, if None, it will beset to be the same with lr",
)

self.add_argument(
"--mem_per_param_scale",
default=13,
type=float,
Comment thread
Kaihui-intel marked this conversation as resolved.
help="Scale factor for memory per parameter, used to adjust memory usage estimation for tuning",
)

self.add_argument("--seed", default=42, type=int, help="random seed")

self.add_argument("--adam", action="store_true", help="whether to use adam optimizer instead of SignSGD")
Expand Down Expand Up @@ -436,7 +443,7 @@ def tune(args):
raise RuntimeError("marlin backend only supports sym quantization, please remove --asym")

# Must set this before import torch
set_cuda_visible_devices(args.device_map)
# set_cuda_visible_devices(args.device_map)
device_str, use_auto_mapping = get_device_and_parallelism(args.device_map)

import torch
Expand Down
2 changes: 1 addition & 1 deletion auto_round/script/mllm.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,7 @@ def tune(args):
raise ValueError(f"{format} is not supported, we only support {SUPPORTED_FORMATS}")

# Must set this before import torch
set_cuda_visible_devices(args.device_map)
# set_cuda_visible_devices(args.device_map)
device_str, use_auto_mapping = get_device_and_parallelism(args.device_map)

import torch
Expand Down
66 changes: 66 additions & 0 deletions auto_round/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -578,6 +578,10 @@ def is_valid_digit(s):
if is_valid_digit(device):
dev_idx = int(device)
device = "auto"
if isinstance(device, str) and "," in device: # device is "0,1,2"
device_list = [int(dev) for dev in device.split(",") if dev.isdigit()]
dev_idx = device_list[0] if device_list else None
device = "auto"
if device is None or device == "auto":
if torch.cuda.is_available():
device = torch.device("cuda")
Expand Down Expand Up @@ -1426,6 +1430,8 @@ def llm_load_model(
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, trust_remote_code=trust_remote_code)

model_cls = AutoModel if is_glm else AutoModelForCausalLM
if "deepseek" in pretrained_model_name_or_path.lower() and trust_remote_code:
logger.warning("trust_remote_code is enabled by default, please ensure its correctness.")

if low_cpu_mem_tmp_dir is None:
low_cpu_mem_tmp_dir = "low_cpu_mem_tmp"
Expand Down Expand Up @@ -2563,6 +2569,66 @@ def is_static_wfp8afp8(ar):
return False


def bytes_to_gigabytes(bytes: int) -> float:
    """Convert a byte count to gigabytes (1 GB = 1024**3 bytes).

    Args:
        bytes (int): The number of bytes. The parameter name shadows the builtin
            but is kept so existing keyword callers keep working.

    Returns:
        float: The equivalent number of gigabytes. True division is used, so the
            result is fractional rather than truncated to an integer.
    """
    return bytes / 1024**3


def get_device_memory(i: int = 0) -> float:
    """Return the total memory of a CUDA device in gigabytes.

    This reports the device's total capacity (``total_memory`` from the device
    properties), not the amount that is currently free.

    Args:
        i (int, optional): Device index. Defaults to 0.

    Returns:
        float: Total device memory in gigabytes.

    Raises:
        RuntimeError: If CUDA is unavailable, whether or not an XPU is present.
    """
    if torch.cuda.is_available():
        return bytes_to_gigabytes(torch.cuda.get_device_properties(i).total_memory)
    # ``torch.xpu`` is missing on some torch builds; guard so callers always get RuntimeError.
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        raise RuntimeError("XPU does not support device_map='auto' currently.")
    raise RuntimeError("No supported device found (CUDA or XPU).")


def estimate_tuning_block_mem(block: torch.nn.Module, input_ids: list[torch.Tensor]) -> tuple[float, float]:
    """Estimate the weight and activation memory needed to tune one block.

    Args:
        block (torch.nn.Module): The block to inspect.
        input_ids (list[torch.Tensor]): Cached input tensors for the block.

    Returns:
        tuple: ``(block_memory, input_output_memory)``, both in GB.
            - block_memory (float): Summed ``weight.nbytes`` of every submodule
              accepted by ``check_to_quantized``.
            - input_output_memory (float): Twice the summed ``nbytes`` of
              ``input_ids``, covering the inputs plus equally sized outputs.
    """
    bytes_per_gb = 1024**3

    # Only the weights of layers selected for quantization are counted.
    weight_bytes = sum(
        submodule.weight.nbytes for _, submodule in block.named_modules() if check_to_quantized(submodule)
    )

    # Outputs are assumed to take as much space as the inputs, hence the factor of two.
    activation_bytes = sum(tensor.nbytes for tensor in input_ids)

    block_memory = weight_bytes / bytes_per_gb
    input_output_memory = 2 * activation_bytes / bytes_per_gb
    return block_memory, input_output_memory


def get_max_vram(ratio: float = 0.9) -> dict:
max_memory = {}
if torch.cuda.is_available(): # NVIDIA CUDA
Expand Down