
Commit

make more cuda-only tests device-agnostic (#2876)
* enable 3 cases

* add tests

* add 2 more

* revert 1 back

* revert 1 more

* enable on xpu
faaany committed Jul 3, 2024
1 parent 6882ff2 commit eac206f
Showing 5 changed files with 53 additions and 33 deletions.
10 changes: 5 additions & 5 deletions tests/test_accelerator.py
@@ -298,7 +298,7 @@ def test_save_model_offload(self, use_safetensors):
assert torch.allclose(expected, output, atol=1e-5)

@parameterized.expand([True, False], name_func=parameterized_custom_name_func)
- @require_cuda
+ @require_non_cpu
def test_get_state_dict_from_offload(self, use_safetensors):
accelerator = Accelerator()

@@ -312,18 +312,18 @@ def test_get_state_dict_from_offload(self, use_safetensors):
cpu_onloaded_layer = get_state_dict_from_offload(
model.linear2, "linear2.weight", {"linear2.weight": ""}, device_to_put_offload="cpu"
)
- cuda_onloaded_layer = get_state_dict_from_offload(
+ device_onloaded_layer = get_state_dict_from_offload(
model.linear2, "linear2.weight", {"linear2.weight": ""}, device_to_put_offload=0
)
cpu_onloaded_layer_weight = cpu_onloaded_layer["linear2.weight"]
- cuda_onloaded_layer_weight = cuda_onloaded_layer["linear2.weight"]
+ device_onloaded_layer_weight = device_onloaded_layer["linear2.weight"]

assert torch.allclose(offloaded_layer_weight, cpu_onloaded_layer_weight)
assert torch.allclose(
- offloaded_layer_weight, cuda_onloaded_layer_weight.to("cpu")
+ offloaded_layer_weight, device_onloaded_layer_weight.to("cpu")
) # must be on the same device for torch.allclose()
assert cpu_onloaded_layer_weight.device.type == "cpu"
- assert cuda_onloaded_layer_weight.device.type == "cuda"
+ assert device_onloaded_layer_weight.device.type == torch_device

@parameterized.expand([True, False], name_func=parameterized_custom_name_func)
def test_save_load_model_with_hooks(self, use_safetensors):
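The change from @require_cuda to @require_non_cpu, and from the literal "cuda" string to torch_device, is what lets this test run on XPU and other accelerators as well as CUDA. As a rough sketch of the pattern only — the real require_non_cpu and torch_device come from accelerate.test_utils, and detect_torch_device below is a hypothetical helper — such a guard amounts to a backend probe plus unittest.skipUnless:

# Hypothetical sketch of a device-agnostic skip decorator; the real
# require_non_cpu / torch_device live in accelerate.test_utils.
import unittest

import torch


def detect_torch_device() -> str:
    """Best-effort accelerator probe, falling back to CPU."""
    if torch.cuda.is_available():
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"


torch_device = detect_torch_device()


def require_non_cpu(test_case):
    """Skip the decorated test unless some accelerator is present."""
    return unittest.skipUnless(torch_device != "cpu", "test requires an accelerator")(test_case)

With torch_device available, the final assertion can compare the onloaded weight's device type against whatever backend was detected instead of the literal "cuda".
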
4 changes: 2 additions & 2 deletions tests/test_big_modeling.py
@@ -655,7 +655,7 @@ def test_dispatch_model_move_offloaded_model(self):
with self.assertRaises(RuntimeError):
model.to(0)

- @require_multi_gpu
+ @require_multi_device
def test_dispatch_model_move_model_warning(self):
model = ModelForTest()
device_map = {"linear1": 0, "batchnorm": 0, "linear2": 1}
@@ -664,7 +664,7 @@ def test_dispatch_model_move_model_warning(self):
with self.assertLogs("accelerate.big_modeling", level="WARNING"):
model.to("cpu")
with self.assertLogs("accelerate.big_modeling", level="WARNING"):
- model.cuda(0)
+ model.to(torch_device)
with self.assertRaises(RuntimeError):
x = torch.randn(2, 3)
model(x)
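Replacing model.cuda(0) with model.to(torch_device) moves the warning check off the NVIDIA-only code path. A minimal illustration of the difference, with torch_device stubbed here rather than imported from accelerate.test_utils:

import torch
import torch.nn as nn

# Stub for illustration; the test suite resolves this name per backend.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

model = nn.Linear(4, 4)

# Backend-specific: model.cuda(0) only works when an NVIDIA GPU is present.
# Device-agnostic: the same .to() call works for cuda, xpu, npu, mlu, or cpu.
model = model.to(torch_device)
print(next(model.parameters()).device)
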
7 changes: 4 additions & 3 deletions tests/test_examples.py
@@ -28,6 +28,7 @@
TempDirTestCase,
get_launch_command,
require_huggingface_suite,
+ require_multi_device,
require_multi_gpu,
require_pippy,
require_schedulefree,
@@ -253,17 +254,17 @@ def test_profiler(self):
testargs = ["examples/by_feature/profiler.py"]
run_command(self.launch_args + testargs)

- @require_multi_gpu
+ @require_multi_device
def test_ddp_comm_hook(self):
testargs = ["examples/by_feature/ddp_comm_hook.py", "--ddp_comm_hook", "fp16"]
run_command(self.launch_args + testargs)

- @require_multi_gpu
+ @require_multi_device
def test_distributed_inference_examples_stable_diffusion(self):
testargs = ["examples/inference/distributed/stable_diffusion.py"]
run_command(self.launch_args + testargs)

- @require_multi_gpu
+ @require_multi_device
def test_distributed_inference_examples_phi2(self):
testargs = ["examples/inference/distributed/phi2.py"]
run_command(self.launch_args + testargs)
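These example-script tests only need two processes on two devices of whatever backend is present, which is why @require_multi_gpu becomes @require_multi_device. A hypothetical sketch of such a guard — available_device_count is an invented helper, not the library's implementation:

# Hypothetical multi-device guard; accelerate.test_utils provides the real one.
import unittest

import torch


def available_device_count() -> int:
    """Count devices for the first accelerator backend found."""
    if torch.cuda.is_available():
        return torch.cuda.device_count()
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return torch.xpu.device_count()
    return 0


def require_multi_device(test_case):
    """Skip unless at least two accelerator devices are visible."""
    return unittest.skipUnless(available_device_count() > 1, "test requires multiple devices")(test_case)
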
51 changes: 30 additions & 21 deletions tests/test_modeling_utils.py
@@ -26,7 +26,13 @@
from safetensors.torch import save_file

from accelerate import init_empty_weights
- from accelerate.test_utils import require_cuda, require_huggingface_suite, require_multi_gpu
+ from accelerate.test_utils import (
+     require_cuda,
+     require_huggingface_suite,
+     require_multi_device,
+     require_non_cpu,
+     torch_device,
+ )
from accelerate.utils.modeling import (
check_device_map,
clean_device_map,
@@ -44,6 +50,9 @@
)


+ torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"


class ModelForTest(nn.Module):
def __init__(self):
super().__init__()
@@ -150,20 +159,20 @@ def test_set_module_tensor_to_meta_and_cpu(self):
model = ModelForTest()
self.check_set_module_tensor_for_device(model, "cpu", "meta")

- @require_cuda
+ @require_non_cpu
def test_set_module_tensor_to_cpu_and_gpu(self):
model = ModelForTest()
self.check_set_module_tensor_for_device(model, "cpu", 0)
self.check_set_module_tensor_for_device(model, "cpu", torch_device)

- @require_cuda
+ @require_non_cpu
def test_set_module_tensor_to_meta_and_gpu(self):
- model = ModelForTest().to(0)
- self.check_set_module_tensor_for_device(model, 0, "meta")
+ model = ModelForTest().to(torch_device)
+ self.check_set_module_tensor_for_device(model, torch_device, "meta")

- @require_multi_gpu
+ @require_multi_device
def test_set_module_tensor_between_gpus(self):
- model = ModelForTest().to(0)
- self.check_set_module_tensor_for_device(model, 0, 1)
+ model = ModelForTest().to(torch_device)
+ self.check_set_module_tensor_for_device(model, torch_device, torch_device.replace("0", "1"))

def test_set_module_tensor_sets_dtype(self):
model = ModelForTest()
@@ -361,7 +370,7 @@ def test_load_checkpoint_in_model(self):
self.shard_test_model(model, tmp_dir)
load_checkpoint_in_model(model, tmp_dir)

- @require_cuda
+ @require_non_cpu
def test_load_checkpoint_in_model_one_gpu(self):
device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": "cpu"}

@@ -371,7 +380,7 @@ def test_load_checkpoint_in_model_one_gpu(self):
fname = os.path.join(tmp_dir, "pt_model.bin")
torch.save(model.state_dict(), fname)
load_checkpoint_in_model(model, fname, device_map=device_map)
- assert model.linear1.weight.device == torch.device(0)
+ assert model.linear1.weight.device == torch.device(torch_device)
assert model.batchnorm.weight.device == torch.device("cpu")
assert model.linear2.weight.device == torch.device("cpu")

@@ -382,7 +391,7 @@ def test_load_checkpoint_in_model_one_gpu(self):
index_file = os.path.join(tmp_dir, "weight_map.index.json")
load_checkpoint_in_model(model, index_file, device_map=device_map)

- assert model.linear1.weight.device == torch.device(0)
+ assert model.linear1.weight.device == torch.device(torch_device)
assert model.batchnorm.weight.device == torch.device("cpu")
assert model.linear2.weight.device == torch.device("cpu")

@@ -392,11 +401,11 @@ def test_load_checkpoint_in_model_one_gpu(self):
self.shard_test_model(model, tmp_dir)
load_checkpoint_in_model(model, tmp_dir, device_map=device_map)

- assert model.linear1.weight.device == torch.device(0)
+ assert model.linear1.weight.device == torch.device(torch_device)
assert model.batchnorm.weight.device == torch.device("cpu")
assert model.linear2.weight.device == torch.device("cpu")

- @require_cuda
+ @require_non_cpu
def test_load_checkpoint_in_model_disk_offload(self):
device_map = {"linear1": "cpu", "batchnorm": "disk", "linear2": "cpu"}

@@ -421,7 +430,7 @@ def test_load_checkpoint_in_model_disk_offload(self):
assert model.batchnorm.running_mean.device == torch.device("meta")
assert model.linear2.weight.device == torch.device("cpu")

- @require_multi_gpu
+ @require_multi_device
def test_load_checkpoint_in_model_two_gpu(self):
device_map = {"linear1": 0, "batchnorm": "cpu", "linear2": 1}

@@ -431,9 +440,9 @@ def test_load_checkpoint_in_model_two_gpu(self):
fname = os.path.join(tmp_dir, "pt_model.bin")
torch.save(model.state_dict(), fname)
load_checkpoint_in_model(model, fname, device_map=device_map)
- assert model.linear1.weight.device == torch.device(0)
+ assert model.linear1.weight.device == torch.device(torch_device)
assert model.batchnorm.weight.device == torch.device("cpu")
- assert model.linear2.weight.device == torch.device(1)
+ assert model.linear2.weight.device == torch.device(torch_device.replace("0", "1"))

# Check with sharded index
model = ModelForTest()
@@ -442,19 +451,19 @@ def test_load_checkpoint_in_model_two_gpu(self):
index_file = os.path.join(tmp_dir, "weight_map.index.json")
load_checkpoint_in_model(model, index_file, device_map=device_map)

- assert model.linear1.weight.device == torch.device(0)
+ assert model.linear1.weight.device == torch.device(torch_device)
assert model.batchnorm.weight.device == torch.device("cpu")
- assert model.linear2.weight.device == torch.device(1)
+ assert model.linear2.weight.device == torch.device(torch_device.replace("0", "1"))

# Check with sharded checkpoint
model = ModelForTest()
with tempfile.TemporaryDirectory() as tmp_dir:
self.shard_test_model(model, tmp_dir)
load_checkpoint_in_model(model, tmp_dir, device_map=device_map)

- assert model.linear1.weight.device == torch.device(0)
+ assert model.linear1.weight.device == torch.device(torch_device)
assert model.batchnorm.weight.device == torch.device("cpu")
- assert model.linear2.weight.device == torch.device(1)
+ assert model.linear2.weight.device == torch.device(torch_device.replace("0", "1"))

def test_load_checkpoint_in_model_dtype(self):
with tempfile.NamedTemporaryFile(suffix=".pt") as tmpfile:
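The new module-level torch_device = f"{torch_device}:0" line qualifies the backend name with a device index, so the assertions can compare against torch.device(torch_device) directly, and the second device is derived with a plain string substitution. A small illustration, with the backend name stubbed instead of imported from accelerate.test_utils:

import torch

# Stubbed backend name for illustration.
torch_device = "cuda" if torch.cuda.is_available() else "cpu"

# Qualify with an index so torch.device() yields e.g. cuda:0 / xpu:0.
torch_device = f"{torch_device}:0" if torch_device != "cpu" else "cpu"

first = torch.device(torch_device)                     # e.g. device(type='cuda', index=0)
second = torch.device(torch_device.replace("0", "1"))  # e.g. device(type='cuda', index=1)
print(first, second)

Building the expected device from the full, backend-qualified string rather than a bare integer ordinal keeps the comparison meaningful on non-CUDA backends.
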
14 changes: 12 additions & 2 deletions tests/test_multigpu.py
@@ -32,6 +32,7 @@
require_non_torch_xla,
require_pippy,
require_torchvision,
torch_device,
)
from accelerate.utils import patch_environment

@@ -72,15 +73,24 @@ def test_multi_device_merge_fsdp_weights(self):
execute_subprocess_async(cmd)

@require_non_torch_xla
- @require_multi_gpu
+ @require_multi_device
def test_distributed_data_loop(self):
"""
This TestCase checks the behaviour that occurs during distributed training or evaluation,
when the batch size does not evenly divide the dataset size.
"""
print(f"Found {device_count} devices, using 2 devices only")
cmd = get_launch_command(num_processes=2) + [self.data_loop_file_path]
- with patch_environment(omp_num_threads=1, cuda_visible_devices="0,1"):
+ env_kwargs = dict(omp_num_threads=1)
+ if torch_device == "xpu":
+     env_kwargs.update(ze_affinity_mask="0,1")
+ elif torch_device == "npu":
+     env_kwargs.update(ascend_rt_visible_devices="0,1")
+ elif torch_device == "mlu":
+     env_kwargs.update(mlu_visible_devices="0,1")
+ else:
+     env_kwargs.update(cuda_visible_devices="0,1")
+ with patch_environment(**env_kwargs):
execute_subprocess_async(cmd)

@require_multi_gpu
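Each backend exposes its own visible-devices environment variable, so the test now assembles env_kwargs per backend before handing them to patch_environment. The sketch below shows an environment-patching context manager in the same spirit; patch_env is a simplified stand-in, not accelerate's actual patch_environment:

# Simplified environment patcher, for illustration only.
import os
from contextlib import contextmanager


@contextmanager
def patch_env(**kwargs):
    """Temporarily set upper-cased environment variables, then restore them."""
    previous = {}
    for key, value in kwargs.items():
        name = key.upper()
        previous[name] = os.environ.get(name)
        os.environ[name] = str(value)
    try:
        yield
    finally:
        for name, value in previous.items():
            if value is None:
                os.environ.pop(name, None)
            else:
                os.environ[name] = value


# Usage mirroring the test: pick the backend's visibility variable.
env_kwargs = dict(omp_num_threads=1)
backend = "cuda"  # the test uses torch_device here
if backend == "xpu":
    env_kwargs.update(ze_affinity_mask="0,1")
else:
    env_kwargs.update(cuda_visible_devices="0,1")

with patch_env(**env_kwargs):
    print(os.environ["CUDA_VISIBLE_DEVICES"], os.environ["OMP_NUM_THREADS"])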
