Make test_accelerator and test_multigpu device-agnostic (#2343)

huggingface · Jan 18, 2024 · ec4f01a
1 parent f5c01ee
commit ec4f01a
Showing 2 changed files with 32 additions and 27 deletions.
diff --git a/tests/test_accelerator.py b/tests/test_accelerator.py
@@ -11,8 +11,8 @@
 from accelerate import DistributedType, infer_auto_device_map, init_empty_weights, load_checkpoint_and_dispatch
 from accelerate.accelerator import Accelerator
 from accelerate.state import GradientState, PartialState
-from accelerate.test_utils import require_bnb, require_multi_gpu, slow
-from accelerate.test_utils.testing import AccelerateTestCase, require_cuda
+from accelerate.test_utils import require_bnb, require_multi_device, require_non_cpu, slow, torch_device
+from accelerate.test_utils.testing import AccelerateTestCase
 from accelerate.utils import patch_environment
 from accelerate.utils.modeling import load_checkpoint_in_model
 
@@ -55,11 +55,11 @@ def parameterized_custom_name_func(func, param_num, param):
 
 
 class AcceleratorTester(AccelerateTestCase):
-    @require_cuda
+    @require_non_cpu
     def test_accelerator_can_be_reinstantiated(self):
         _ = Accelerator()
         assert PartialState._shared_state["_cpu"] is False
-        assert PartialState._shared_state["device"].type == "cuda"
+        assert PartialState._shared_state["device"].type in ["cuda", "mps", "npu", "xpu"]
         with self.assertRaises(ValueError):
             _ = Accelerator(cpu=True)
 
@@ -326,12 +326,17 @@ def test_accelerator_bnb_cpu_error(self):
 
     @slow
     @require_bnb
-    @require_multi_gpu
-    def test_accelerator_bnb_multi_gpu(self):
+    @require_multi_device
+    def test_accelerator_bnb_multi_device(self):
         """Tests that the accelerator can be used with the BNB library."""
         from transformers import AutoModelForCausalLM
 
-        PartialState._shared_state = {"distributed_type": DistributedType.MULTI_GPU}
+        if torch_device == "cuda":
+            PartialState._shared_state = {"distributed_type": DistributedType.MULTI_GPU}
+        elif torch_device == "npu":
+            PartialState._shared_state = {"distributed_type": DistributedType.MULTI_NPU}
+        else:
+            raise ValueError(f"{torch_device} is not supported in test_accelerator_bnb_multi_device.")
 
         with init_empty_weights():
             model = AutoModelForCausalLM.from_pretrained(
@@ -356,8 +361,8 @@ def test_accelerator_bnb_multi_gpu(self):
 
     @slow
     @require_bnb
-    @require_multi_gpu
-    def test_accelerator_bnb_multi_gpu_no_distributed(self):
+    @require_multi_device
+    def test_accelerator_bnb_multi_device_no_distributed(self):
         """Tests that the accelerator can be used with the BNB library."""
         from transformers import AutoModelForCausalLM
 
@@ -378,21 +383,21 @@ def test_accelerator_bnb_multi_gpu_no_distributed(self):
         # This should work
         _ = accelerator.prepare(model)
 
-    @require_cuda
+    @require_non_cpu
     def test_accelerator_cpu_flag_prepare(self):
         model = torch.nn.Linear(10, 10)
         sgd = torch.optim.SGD(model.parameters(), lr=0.01)
         accelerator = Accelerator(cpu=True)
         _ = accelerator.prepare(sgd)
 
-    @require_cuda
+    @require_non_cpu
     def test_can_unwrap_model_fp16(self):
         # test for a regression introduced in #872
         # before the fix, after unwrapping with keep_fp32_wrapper=False, there would be the following error:
         # Linear.forward() missing 1 required positional argument: 'input'
         model = create_components()[0]
         accelerator = Accelerator(mixed_precision="fp16")
-        inputs = torch.randn(10, 2).cuda()
+        inputs = torch.randn(10, 2).to(torch_device)
         model = accelerator.prepare(model)
         model(inputs)  # sanity check that this works
 

diff --git a/tests/test_multigpu.py b/tests/test_multigpu.py
@@ -21,11 +21,11 @@
 import accelerate
 from accelerate import Accelerator
 from accelerate.big_modeling import dispatch_model
-from accelerate.test_utils import assert_exception, execute_subprocess_async, require_multi_gpu
+from accelerate.test_utils import assert_exception, device_count, execute_subprocess_async, require_multi_device
 from accelerate.utils import patch_environment
 
 
-class MultiGPUTester(unittest.TestCase):
+class MultiDeviceTester(unittest.TestCase):
     def setUp(self):
         mod_file = inspect.getfile(accelerate.test_utils)
         self.test_file_path = os.path.sep.join(mod_file.split(os.path.sep)[:-1] + ["scripts", "test_script.py"])
@@ -34,35 +34,35 @@ def setUp(self):
         )
         self.operation_file_path = os.path.sep.join(mod_file.split(os.path.sep)[:-1] + ["scripts", "test_ops.py"])
 
-    @require_multi_gpu
-    def test_multi_gpu(self):
-        print(f"Found {torch.cuda.device_count()} devices.")
-        cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", self.test_file_path]
+    @require_multi_device
+    def test_multi_device(self):
+        print(f"Found {device_count} devices.")
+        cmd = ["torchrun", f"--nproc_per_node={device_count}", self.test_file_path]
         with patch_environment(omp_num_threads=1):
             execute_subprocess_async(cmd, env=os.environ.copy())
 
-    @require_multi_gpu
-    def test_multi_gpu_ops(self):
-        print(f"Found {torch.cuda.device_count()} devices.")
-        cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", self.operation_file_path]
+    @require_multi_device
+    def test_multi_device_ops(self):
+        print(f"Found {device_count} devices.")
+        cmd = ["torchrun", f"--nproc_per_node={device_count}", self.operation_file_path]
         print(f"Command: {cmd}")
         with patch_environment(omp_num_threads=1):
             execute_subprocess_async(cmd, env=os.environ.copy())
 
-    @require_multi_gpu
+    @require_multi_device
     def test_pad_across_processes(self):
-        cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", inspect.getfile(self.__class__)]
+        cmd = ["torchrun", f"--nproc_per_node={device_count}", inspect.getfile(self.__class__)]
         with patch_environment(omp_num_threads=1):
             execute_subprocess_async(cmd, env=os.environ.copy())
 
-    @require_multi_gpu
+    @require_multi_device
     def test_distributed_data_loop(self):
         """
         This TestCase checks the behaviour that occurs during distributed training or evaluation,
         when the batch size does not evenly divide the dataset size.
         """
-        print(f"Found {torch.cuda.device_count()} devices, using 2 devices only")
-        cmd = ["torchrun", f"--nproc_per_node={torch.cuda.device_count()}", self.data_loop_file_path]
+        print(f"Found {device_count} devices, using 2 devices only")
+        cmd = ["torchrun", f"--nproc_per_node={device_count}", self.data_loop_file_path]
         with patch_environment(omp_num_threads=1, cuda_visible_devices="0,1"):
             execute_subprocess_async(cmd, env=os.environ.copy())