From 1306b7d3ae944eb6f55b9e1e6eed00530069db1a Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Wed, 12 Apr 2023 08:25:45 -0700
Subject: [PATCH] [tests] switch to torchrun (#22712)

---
 tests/extended/test_trainer_ext.py        | 2 +-
 tests/trainer/test_trainer_distributed.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py
index 8953adaa247f0..7fd2fc9389ab2 100644
--- a/tests/extended/test_trainer_ext.py
+++ b/tests/extended/test_trainer_ext.py
@@ -366,7 +366,7 @@ def run_trainer(
             n_gpus_to_use = get_gpu_count()
             master_port = get_torch_dist_unique_port()
             distributed_args = f"""
-                -m torch.distributed.launch
+                -m torch.distributed.run
                 --nproc_per_node={n_gpus_to_use}
                 --master_port={master_port}
                 {self.examples_dir_str}/pytorch/translation/run_translation.py
diff --git a/tests/trainer/test_trainer_distributed.py b/tests/trainer/test_trainer_distributed.py
index 5fa6edb1c88f1..97bca4f9d367f 100644
--- a/tests/trainer/test_trainer_distributed.py
+++ b/tests/trainer/test_trainer_distributed.py
@@ -67,7 +67,7 @@ class TestTrainerDistributedNeuronCore(TestCasePlus):
     @require_torch_neuroncore
     def test_trainer(self):
         distributed_args = f"""
-            -m torch.distributed.launch
+            -m torch.distributed.run
             --nproc_per_node=2
             --master_port={get_torch_dist_unique_port()}
             {self.test_file_dir}/test_trainer_distributed.py
@@ -83,7 +83,7 @@ class TestTrainerDistributed(TestCasePlus):
     @require_torch_multi_gpu
     def test_trainer(self):
         distributed_args = f"""
-            -m torch.distributed.launch
+            -m torch.distributed.run
             --nproc_per_node={torch.cuda.device_count()}
             --master_port={get_torch_dist_unique_port()}
             {self.test_file_dir}/test_trainer_distributed.py
@@ -98,7 +98,7 @@ def test_trainer(self):
 if __name__ == "__main__":
     # The script below is meant to be run under torch.distributed, on a machine with multiple GPUs:
     #
-    # PYTHONPATH="src" python -m torch.distributed.launch --nproc_per_node 2 --output_dir output_dir ./tests/test_trainer_distributed.py
+    # PYTHONPATH="src" python -m torch.distributed.run --nproc_per_node 2 --output_dir output_dir ./tests/test_trainer_distributed.py
 
     parser = HfArgumentParser((TrainingArguments,))
     training_args = parser.parse_args_into_dataclasses()[0]
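
For reference, python -m torch.distributed.run is the module interface behind the
torchrun console script and accepts the same flags these tests pass; unlike the
deprecated torch.distributed.launch, it exports LOCAL_RANK into each worker's
environment instead of injecting a --local_rank argument into the script. A minimal
sketch of the equivalent invocations, assuming a 2-GPU machine and a free port
29555 (both values are placeholders, not taken from the patch):

    # deprecated launcher being replaced in this patch
    python -m torch.distributed.launch --nproc_per_node=2 --master_port=29555 tests/trainer/test_trainer_distributed.py --output_dir output_dir

    # drop-in module replacement the patch switches to
    python -m torch.distributed.run --nproc_per_node=2 --master_port=29555 tests/trainer/test_trainer_distributed.py --output_dir output_dir

    # equivalent console-script form
    torchrun --nproc_per_node=2 --master_port=29555 tests/trainer/test_trainer_distributed.py --output_dir output_dir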