Skip to content

Commit

Permalink
fix launch when elastic run (PaddlePaddle#61847)
Browse files Browse the repository at this point in the history
  • Loading branch information
hitywt committed Feb 20, 2024
1 parent 12e5c97 commit b64a4bd
Showing 1 changed file with 7 additions and 3 deletions.
10 changes: 7 additions & 3 deletions python/paddle/distributed/launch/controllers/collective.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,10 @@ def build_pod(self):
):
return self._build_pod_with_args()
else:
return self._build_pod_with_master()
if self.ctx.args.auto_parallel_config is None:
skip_run = True
# skip_run is passed as reset_pod: the pod is left un-reset only when skip_run is False
return self._build_pod_with_master(skip_run)

def _build_pod_with_tuner(self):
auto_parallel_config = self.ctx.args.auto_parallel_config
Expand Down Expand Up @@ -148,7 +151,7 @@ def _build_pod_with_args(self):

return True

def _build_pod_with_master(self):
def _build_pod_with_master(self, reset_pod=True):
self.pod.replicas = self.pod_replicas()

# rank will be reset when restart
Expand Down Expand Up @@ -203,7 +206,8 @@ def _build_pod_with_master(self):

job_endpoints = [i['endpoints'] for i in peer_list]

# self.pod.reset()
if reset_pod:
self.pod.reset()
selected_dev_key = self.ctx.node.device.get_selected_device_key()
selected_dev_list = self.ctx.node.device.get_selected_devices(
self.ctx.args.devices
Expand Down

0 comments on commit b64a4bd

Please sign in to comment.