
Commit

make more cuda-only tests device-agnostic (#2876)
* enable 3 cases

* add tests

* add 2 more

* revert 1 back

* revert 1 more

* enable on xpu
faaany committed Jul 3, 2024
1 parent 6882ff2 commit eac206f
Showing 5 changed files with 53 additions and 33 deletions.
10 changes: 5 additions & 5 deletions tests/test_accelerator.py
@@ -298,7 +298,7 @@ def test_save_model_offload(self, use_safetensors):
assert torch.allclose(expected, output, atol=1e-5)

@parameterized.expand([True, False], name_func=parameterized_custom_name_func)
- @require_cuda
+ @require_non_cpu
def test_get_state_dict_from_offload(self, use_safetensors):
accelerator = Accelerator()

@@ -312,18 +312,18 @@ def test_get_state_dict_from_offload(self, use_safetensors):
cpu_onloaded_layer = get_state_dict_from_offload(
model.linear2, "linear2.weight", {"linear2.weight": ""}, device_to_put_offload="cpu"
)
- cuda_onloaded_layer = get_state_dict_from_offload(
+ device_onloaded_layer = get_state_dict_from_offload(
model.linear2, "linear2.weight", {"linear2.weight": ""}, device_to_put_offload=0
)
cpu_onloaded_layer_weight = cpu_onloaded_layer["linear2.weight"]
- cuda_onloaded_layer_weight = cuda_onloaded_layer["linear2.weight"]
+ device_onloaded_layer_weight = device_onloaded_layer["linear2.weight"]

assert torch.allclose(offloaded_layer_weight, cpu_onloaded_layer_weight)
assert torch.allclose(
- offloaded_layer_weight, cuda_onloaded_layer_weight.to("cpu")
+ offloaded_layer_weight, device_onloaded_layer_weight.to("cpu")
) # must be on the same device for torch.allclose()
assert cpu_onloaded_layer_weight.device.type == "cpu"
- assert cuda_onloaded_layer_weight.device.type == "cuda"
+ assert device_onloaded_layer_weight.device.type == torch_device

@parameterized.expand([True, False], name_func=parameterized_custom_name_func)
def test_save_load_model_with_hooks(self, use_safetensors):
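The change from @require_cuda to @require_non_cpu, and from the literal "cuda" string to torch_device, is what lets this test run on XPU and other accelerators as well as CUDA. As a rough sketch of the pattern only — the real require_non_cpu and torch_device come from accelerate.test_utils, and detect_torch_device below is a hypothetical helper — such a guard amounts to a backend probe plus unittest.skipUnless:

# Hypothetical sketch of a device-agnostic skip decorator; the real
# require_non_cpu / torch_device live in accelerate.test_utils.
import unittest

import torch


def detect_torch_device() -> str:
    """Best-effort accelerator probe, falling back to CPU."""
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"


torch_device = detect_torch_device()


def require_non_cpu(test_case):
    """Skip the decorated test unless some accelerator is present."""
    return unittest.skipUnless(torch_device != "cpu", "test requires an accelerator")(test_case)

With torch_device available, the final assertion can compare the onloaded weight's device type against whatever backend was detected instead of the literal "cuda".
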
4 changes: 2 additions & 2 deletions tests/test_big_modeling.py
@@ -655,7 +655,7 @@ def test_dispatch_model_move_offloaded_model(self):
with self.assertRaises(RuntimeError):
model.to(0)

- @require_multi_gpu
+ @require_multi_device
def test_dispatch_model_move_model_warning(self):
model = ModelForTest()
device_map = {"linear1": 0, "batchnorm": 0, "linear2": 1}
@@ -664,7 +664,7 @@ def test_dispatch_model_move_model_warning(self):
with self.assertLogs("accelerate.big_modeling", level="WARNING"):
model.to("cpu")
with self.assertLogs("accelerate.big_modeling", level="WARNING"):
- model.cuda(0)
+ model.to(torch_device)
with self.assertRaises(RuntimeError):
x = torch.randn(2, 3)
model(x)
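Replacing model.cuda(0) with model.to(torch_device) moves the warning check off the NVIDIA-only code path. A minimal illustration of the difference, with torch_device stubbed here rather than imported from accelerate.test_utils:

import torch
import torch.nn as nn

# Stub for illustration; the test suite resolves this name per backend.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

model = nn.Linear(4, 4)

# Backend-specific: model.cuda(0) only works when an NVIDIA GPU is present.
# Device-agnostic: the same .to() call works for cuda, xpu, npu, mlu, or cpu.
model = model.to(torch_device)
print(next(model.parameters()).device)
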
7 changes: 4 additions & 3 deletions tests/test_examples.py
@@ -28,6 +28,7 @@
TempDirTestCase,
get_launch_command,
require_huggingface_suite,
+ require_multi_device,
require_multi_gpu,
require_pippy,
require_schedulefree,
@@ -253,17 +254,17 @@ def test_profiler(self):
testargs = ["examples/by_feature/profiler.py"]
run_command(self.launch_args + testargs)

- @require_multi_gpu
+ @require_multi_device
def test_ddp_comm_hook(self):
testargs = ["examples/by_feature/ddp_comm_hook.py", "--ddp_comm_hook", "fp16"]
run_command(self.launch_args + testargs)

- @require_multi_gpu
+ @require_multi_device
def test_distributed_inference_examples_stable_diffusion(self):
testargs = ["examples/inference/distributed/stable_diffusion.py"]
run_command(self.launch_args + testargs)

- @require_multi_gpu
+ @require_multi_device
def test_distributed_inference_examples_phi2(self):
testargs = ["examples/inference/distributed/phi2.py"]
run_command(self.launch_args + testargs)
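These example-script tests only need two processes on two devices of whatever backend is present, which is why @require_multi_gpu becomes @require_multi_device. A hypothetical sketch of such a guard — available_device_count is an invented helper, not the library's implementation:

# Hypothetical multi-device guard; accelerate.test_utils provides the real one.
import unittest

import torch


def available_device_count() -> int:
    """Count devices for the first accelerator backend found."""
    if torch.cuda.is_available():
        return torch.cuda.device_count()
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return torch.xpu.device_count()
    return 0


def require_multi_device(test_case):
    """Skip unless at least two accelerator devices are visible."""
    return unittest.skipUnless(available_device_count() > 1, "test requires multiple devices")(test_case)
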
51 changes: 30 additions & 21 deletions tests/test_modeling_utils.py
@@ -26,7 +26,13 @@
from safetensors.torch import save_file

from accelerate import init_empty_weights
- from accelerate.test_utils import require_cuda, require_huggingface_suite, require_multi_gpu
+ from accelerate.test_utils import (
+     require_cuda,
+     require_huggingface_suite,
+     require_multi_device,
+     require_non_cpu,
+     torch_device,
+ )
from accelerate.utils.modeling import (
check_device_map,
clean_device_map,
@@ -44,6 +50,9 @@
)


+ torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"


class ModelForTest(nn.Module):
def __init__(self):
super().__init__()
@@ -150,20 +159,20 @@ def test_set_module_tensor_to_meta_and_cpu(self):
model = ModelForTest()
self.check_set_module_tensor_for_device(model, "cpu", "meta")

- @require_cuda
+ @require_non_cpu
def test_set_module_tensor_to_cpu_and_gpu(self):
model = ModelForTest()
self.check_set_module_tensor_for_device(model, "cpu", 0)
self.check_set_module_tensor_for_device(model, "cpu", torch_device)

- @require_cuda
+ @require_non_cpu
def test_set_module_tensor_to_meta_and_gpu(self):
- model = ModelForTest().to(0)
- self.check_set_module_tensor_for_device(model, 0, "meta")
+ model = ModelForTest().to(torch_device)
+ self.check_set_module_tensor_for_device(model, torch_device, "meta")

- @require_multi_gpu
+ @require_multi_device
def test_set_module_tensor_between_gpus(self):
- model = ModelForTest().to(0)
- self.check_set_module_tensor_for_device(model, 0, 1)
+ model = ModelForTest().to(torch_device)
+ self.check_set_module_tensor_for_device(model, torch_device, torch_device.replace("0", "1"))

def test_set_module_tensor_sets_dtype(self):
model = ModelForTest()
@@ -361,7 +370,7 @@ def test_load_checkpoint_in_model(self):
self.shard_test_model(model, tmp_dir)
load_checkpoint_in_model(model, tmp_dir)

- @require_cuda
+ @require_non_cpu
def test_load_checkpoint_in_model_one_gpu(self):
device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": "cpu"}

@@ -371,7 +380,7 @@ def test_load_checkpoint_in_model_one_gpu(self):
fname = os.path.join(tmp_dir, "pt_model.bin")
torch.save(model.state_dict(), fname)
load_checkpoint_in_model(model, fname, device_map=device_map)
- assert model.linear1.weight.device == torch.device(0)
+ assert model.linear1.weight.device == torch.device(torch_device)
assert model.batchnorm.weight.device == torch.device("cpu")
assert model.linear2.weight.device == torch.device("cpu")

@@ -382,7 +391,7 @@ def test_load_checkpoint_in_model_one_gpu(self):
index_file = os.path.join(tmp_dir, "weight_map.index.json")
load_checkpoint_in_model(model, index_file, device_map=device_map)

- assert model.linear1.weight.device == torch.device(0)
+ assert model.linear1.weight.device == torch.device(torch_device)
assert model.batchnorm.weight.device == torch.device("cpu")
assert model.linear2.weight.device == torch.device("cpu")

@@ -392,11 +401,11 @@ def test_load_checkpoint_in_model_one_gpu(self):
self.shard_test_model(model, tmp_dir)
load_checkpoint_in_model(model, tmp_dir, device_map=device_map)

- assert model.linear1.weight.device == torch.device(0)
+ assert model.linear1.weight.device == torch.device(torch_device)
assert model.batchnorm.weight.device == torch.device("cpu")
assert model.linear2.weight.device == torch.device("cpu")

- @require_cuda
+ @require_non_cpu
def test_load_checkpoint_in_model_disk_offload(self):
device_map = {"linear1": "cpu", "batchnorm": "disk", "linear2": "cpu"}

@@ -421,7 +430,7 @@ def test_load_checkpoint_in_model_disk_offload(self):
assert model.batchnorm.running_mean.device == torch.device("meta")
assert model.linear2.weight.device == torch.device("cpu")

- @require_multi_gpu
+ @require_multi_device
def test_load_checkpoint_in_model_two_gpu(self):
device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": 1}

@@ -431,9 +440,9 @@ def test_load_checkpoint_in_model_two_gpu(self):
fname = os.path.join(tmp_dir, "pt_model.bin")
torch.save(model.state_dict(), fname)
load_checkpoint_in_model(model, fname, device_map=device_map)
- assert model.linear1.weight.device == torch.device(0)
+ assert model.linear1.weight.device == torch.device(torch_device)
assert model.batchnorm.weight.device == torch.device("cpu")
- assert model.linear2.weight.device == torch.device(1)
+ assert model.linear2.weight.device == torch.device(torch_device.replace("0", "1"))

# Check with sharded index
model = ModelForTest()
@@ -442,19 +451,19 @@ def test_load_checkpoint_in_model_two_gpu(self):
index_file = os.path.join(tmp_dir, "weight_map.index.json")
load_checkpoint_in_model(model, index_file, device_map=device_map)

- assert model.linear1.weight.device == torch.device(0)
+ assert model.linear1.weight.device == torch.device(torch_device)
assert model.batchnorm.weight.device == torch.device("cpu")
- assert model.linear2.weight.device == torch.device(1)
+ assert model.linear2.weight.device == torch.device(torch_device.replace("0", "1"))

# Check with sharded checkpoint
model = ModelForTest()
with tempfile.TemporaryDirectory() as tmp_dir:
self.shard_test_model(model, tmp_dir)
load_checkpoint_in_model(model, tmp_dir, device_map=device_map)

- assert model.linear1.weight.device == torch.device(0)
+ assert model.linear1.weight.device == torch.device(torch_device)
assert model.batchnorm.weight.device == torch.device("cpu")
- assert model.linear2.weight.device == torch.device(1)
+ assert model.linear2.weight.device == torch.device(torch_device.replace("0", "1"))

def test_load_checkpoint_in_model_dtype(self):
with tempfile.NamedTemporaryFile(suffix=".pt") as tmpfile:
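The new module-level torch_device = f"{torch_device}:0" line qualifies the backend name with a device index, so the assertions can compare against torch.device(torch_device) directly, and the second device is derived with a plain string substitution. A small illustration, with the backend name stubbed instead of imported from accelerate.test_utils:

import torch

# Stubbed backend name for illustration.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

# Qualify with an index so torch.device() yields e.g. cuda:0 / xpu:0.
torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"

first = torch.device(torch_device)                     # e.g. device(type='cuda', index=0)
second = torch.device(torch_device.replace("0", "1"))  # e.g. device(type='cuda', index=1)
print(first, second)

Building the expected device from the full, backend-qualified string rather than a bare integer ordinal keeps the comparison meaningful on non-CUDA backends.
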
14 changes: 12 additions & 2 deletions tests/test_multigpu.py
@@ -32,6 +32,7 @@
require_non_torch_xla,
require_pippy,
require_torchvision,
torch_device,
)
from accelerate.utils import patch_environment

@@ -72,15 +73,24 @@ def test_multi_device_merge_fsdp_weights(self):
execute_subprocess_async(cmd)

@require_non_torch_xla
- @require_multi_gpu
+ @require_multi_device
def test_distributed_data_loop(self):
"""
This TestCase checks the behaviour that occurs during distributed training or evaluation,
when the batch size does not evenly divide the dataset size.
"""
print(f"Found {device_count} devices, using 2 devices only")
cmd = get_launch_command(num_processes=2) + [self.data_loop_file_path]
- with patch_environment(omp_num_threads=1, cuda_visible_devices="0,1"):
+ env_kwargs = dict(omp_num_threads=1)
+ if torch_device == "xpu":
+     env_kwargs.update(ze_affinity_mask="0,1")
+ elif torch_device == "npu":
+     env_kwargs.update(ascend_rt_visible_devices="0,1")
+ elif torch_device == "mlu":
+     env_kwargs.update(mlu_visible_devices="0,1")
+ else:
+     env_kwargs.update(cuda_visible_devices="0,1")
+ with patch_environment(**env_kwargs):
execute_subprocess_async(cmd)

@require_multi_gpu
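Each backend exposes its own visible-devices environment variable, so the test now assembles env_kwargs per backend before handing them to patch_environment. The sketch below shows an environment-patching context manager in the same spirit; patch_env is a simplified stand-in, not accelerate's actual patch_environment:

# Simplified environment patcher, for illustration only.
import os
from contextlib import contextmanager


@contextmanager
def patch_env(**kwargs):
    """Temporarily set upper-cased environment variables, then restore them."""
    previous = {}
    for key, value in kwargs.items():
        name = key.upper()
        previous[name] = os.environ.get(name)
        os.environ[name] = str(value)
    try:
        yield
    finally:
        for name, value in previous.items():
            if value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = value


# Usage mirroring the test: pick the backend's visibility variable.
env_kwargs = dict(omp_num_threads=1)
backend = "cuda"  # the test uses torch_device here
if backend == "xpu":
    env_kwargs.update(ze_affinity_mask="0,1")
else:
    env_kwargs.update(cuda_visible_devices="0,1")

with patch_env(**env_kwargs):
    print(os.environ["CUDA_VISIBLE_DEVICES"], os.environ["OMP_NUM_THREADS"])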
