diff --git a/examples/legacy/seq2seq/seq2seq_trainer.py b/examples/legacy/seq2seq/seq2seq_trainer.py
index fb430c00c26b..054ebd63c30a 100644
--- a/examples/legacy/seq2seq/seq2seq_trainer.py
+++ b/examples/legacy/seq2seq/seq2seq_trainer.py
@@ -144,7 +144,7 @@ def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
         return (
             RandomSampler(self.train_dataset)
-            if self.args.local_rank == -1
+            if self.args.local_process_index == -1
             else DistributedSampler(self.train_dataset)
         )
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index cd7489bcbdb9..370191b35d77 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -996,6 +996,12 @@ class TrainingArguments:
             )
         },
     )
+    local_rank: int = field(
+        default=-1,
+        metadata={
+            "help": "When using torch.distributed.launch (Deprecated), it will pass `local_rank` in the script, so we need this for the parser. To get the local rank, prefer using the property `local_process_index`"
+        },
+    )
     ddp_backend: Optional[str] = field(
         default=None,
         metadata={
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index bf3aba7e1a4d..265d4a4462aa 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -518,7 +518,6 @@ def test_hf_ds_config_mismatch(self):
         with mockenv_context(**self.dist_env_1_gpu):
             trainer = get_regression_trainer(
-                local_rank=0,
                 fp16=fp16,
                 deepspeed=ds_config,
                 per_device_train_batch_size=per_device_train_batch_size,
@@ -552,7 +551,7 @@ def test_hf_scheduler_hf_optimizer(self):
         ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
         ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
         trainer = get_regression_trainer(
-            a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
+            a=a, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
         )
         trainer.train()
         new_a = trainer.model.a.item()
@@ -566,7 +565,7 @@ def test_ds_scheduler_hf_optimizer(self):
         ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
         ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
         trainer = get_regression_trainer(
-            a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
+            a=a, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
         )
         trainer.train()
         new_a = trainer.model.a.item()
@@ -580,7 +579,7 @@ def test_hf_scheduler_ds_optimizer(self):
         ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
         ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
         trainer = get_regression_trainer(
-            a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
+            a=a, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
         )
         trainer.train()
         new_a = trainer.model.a.item()
@@ -598,7 +597,7 @@ def test_stage3_nvme_offload(self):
         ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
         ds_config_zero3_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True
         trainer = get_regression_trainer(
-            local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict, output_dir=self.get_auto_remove_tmp_dir()
+            fp16=True, deepspeed=ds_config_zero3_dict, output_dir=self.get_auto_remove_tmp_dir()
         )
         with CaptureLogger(deepspeed_logger) as cl:
             trainer.train()
@@ -616,7 +615,6 @@ def model_init():
                 return model

             trainer = get_regression_trainer(
-                local_rank=0,
                 fp16=True,
                 model_init=model_init,
                 deepspeed=ds_config_zero3_dict,
@@ -642,7 +640,7 @@ def test_hf_optimizer_with_offload(self, stage, dtype):
         ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
         ds_config_dict["zero_force_ds_cpu_optimizer"] = False  # offload is not efficient w/o CPUAdam
         with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = {"local_rank": 0, "deepspeed": ds_config_dict, "output_dir": self.get_auto_remove_tmp_dir()}
+            kwargs = {"deepspeed": ds_config_dict, "output_dir": self.get_auto_remove_tmp_dir()}
             kwargs[dtype] = True
             trainer = get_regression_trainer(**kwargs)
             with CaptureLogger(deepspeed_logger) as cl:
@@ -659,7 +657,6 @@ def test_fake_notebook_no_launcher(self, stage, dtype):
         # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
         with mockenv_context(**self.dist_env_1_gpu):
             kwargs = {
-                "local_rank": 0,
                 "deepspeed": self.get_config_dict(stage),
                 "output_dir": self.get_auto_remove_tmp_dir(),
             }
@@ -683,7 +680,6 @@ def test_early_get_last_lr(self, stage, dtype):
             kwargs = {
                 "a": a,
                 "b": b,
-                "local_rank": 0,
                 "train_len": 8,
                 "deepspeed": self.get_config_dict(stage),
                 "per_device_train_batch_size": 8,
@@ -729,7 +725,6 @@ def test_gradient_accumulation(self, stage, dtype):
             kwargs = {
                 "a": a,
                 "b": b,
-                "local_rank": 0,
                 "train_len": train_len,
                 "deepspeed": self.get_config_dict(stage),
                 "output_dir": self.get_auto_remove_tmp_dir(),
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 49c74f8558c9..b87ff42db7bf 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1437,9 +1437,7 @@ def test_training_arguments_are_left_untouched(self):
             args = TrainingArguments(tmp_dir, report_to=[])
             dict1, dict2 = args.to_dict(), trainer.args.to_dict()
             for key in dict1:
-                # Logging dir can be slightly different as they default to something with the time.
-                if key != "logging_dir":
-                    self.assertEqual(dict1[key], dict2[key])
+                self.assertEqual(dict1[key], dict2[key])

     def test_number_of_steps_in_training(self):
         # Regular training has n_epochs * len(train_dl) steps
@@ -5433,7 +5431,6 @@ def hp_name(trial):
             num_train_epochs=4,
             disable_tqdm=True,
             load_best_model_at_end=True,
-            logging_dir="runs",
             run_name="test",
             model_init=model_init,
         )
@@ -5482,7 +5479,6 @@ def compute_objective(metrics: dict[str, float]) -> list[float]:
             num_train_epochs=10,
             disable_tqdm=True,
             load_best_model_at_end=True,
-            logging_dir="runs",
             run_name="test",
             model_init=model_init,
             compute_metrics=AlmostAccuracy(),
@@ -5572,7 +5568,6 @@ def hp_name(params):
             num_train_epochs=4,
             disable_tqdm=True,
             load_best_model_at_end=True,
-            logging_dir="runs",
             run_name="test",
             model_init=model_init,
         )
@@ -6170,7 +6165,6 @@ def model_init(config):
             num_train_epochs=4,
             disable_tqdm=True,
             load_best_model_at_end=True,
-            logging_dir="runs",
             run_name="test",
             model_init=model_init,
         )
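
Note: the tests above drop `local_rank=0` because, per the `training_args.py` hunk, `TrainingArguments.local_rank` survives only as a parser shim for the `--local_rank` flag injected by the deprecated `torch.distributed.launch`; code should read the rank through the `local_process_index` property instead. A minimal sketch of the preferred pattern, assuming a single-process run (the `output_dir` value here is arbitrary):

```python
from transformers import TrainingArguments

args = TrainingArguments(output_dir="tmp_out")

# Prefer the property over the raw `local_rank` field, which exists only so
# the argument parser can accept `--local_rank` from the deprecated
# torch.distributed.launch launcher. In a non-distributed run this is 0.
if args.local_process_index == 0:
    print("main process on this node")
```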