Skip to content

Commit

Permalink
make pending timeout customizable (kubeflow#1268)
Browse files: browse the repository at this point in the history
* make pending timeout customizable

* fix the description of arg
  • Loading branch information
cheyang authored and hamedhsn committed May 5, 2019
1 parent ca7716b commit b029a09
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 7 deletions.
8 changes: 6 additions & 2 deletions components/arena/docker/arena_launcher.py
Expand Up @@ -249,7 +249,10 @@ def main(argv=None):
parser.add_argument('--tensorboard-image', type=str, default='tensorflow/tensorflow:1.12.0')
parser.add_argument('--timeout-hours', type=int,
default=200,
help='Time in hours to wait for the Job submitted by arena to complete')
help='Time in minutes to wait for the Job submitted by arena to complete')
parser.add_argument('--pending-timeout-minutes', type=int,
default=360,
help='Time in minutes to wait for the Job submitted by arena from pending to running')
# parser.add_argument('--command', type=str)
parser.add_argument('--output-dir', type=str, default='')
parser.add_argument('--output-data', type=str, default='None')
Expand Down Expand Up @@ -321,7 +324,8 @@ def main(argv=None):

# wait for job done
# _wait_job_done(fullname, job_type, datetime.timedelta(minutes=timeout_hours))
_wait_job_running(fullname, job_type, datetime.timedelta(minutes=30))
pending_timeout_minutes = args.pending_timeout_minutes
_wait_job_running(fullname, job_type, datetime.timedelta(minutes=pending_timeout_minutes))

rc = _job_logging(fullname, job_type)
logging.info("rc: {0}".format(rc))
Expand Down
6 changes: 3 additions & 3 deletions components/arena/python/arena/_arena_distributed_tf_op.py
Expand Up @@ -29,7 +29,7 @@ def estimator_op(name, image, command,
evaluator=False, evaluator_cpu_limit=0, evaluator_memory_limit=0,
env=[], data=[], sync_source=None,
metrics=['Train-accuracy:PERCENTAGE'],
arena_image='cheyang/arena_launcher:v0.5',
arena_image='cheyang/arena_launcher:v0.6',
timeout_hours=240):

"""This function submits Distributed TFJob in Estimator mode.
Expand Down Expand Up @@ -62,7 +62,7 @@ def parameter_servers_op(name, image, command, env, data, sync_source, annotatio
tensorboard,
worker_port, ps_port,
metrics=['Train-accuracy:PERCENTAGE'],
arena_image='cheyang/arena_launcher:v0.5',
arena_image='cheyang/arena_launcher:v0.6',
timeout_hours=240):

"""This function submits Distributed TFJob in Parameter Servers mode.
Expand Down Expand Up @@ -97,7 +97,7 @@ def distributed_tf_op(name, image, command, env=[], data=[], sync_source=None,
ps_port=22224,
tensorboard=False,
metrics=['Train-accuracy:PERCENTAGE'],
arena_image='cheyang/arena_launcher:v0.5',
arena_image='cheyang/arena_launcher:v0.6',
timeout_hours=240):
"""This function submits Distributed TFJob in Distributed mode.
Expand Down
2 changes: 1 addition & 1 deletion components/arena/python/arena/_arena_mpi_op.py
Expand Up @@ -23,7 +23,7 @@ def mpi_job_op(name, image, command, workers=1, gpus=0, cpu_limit=0, memory_limi
rdma=False,
tensorboard=False, tensorboard_image=None,
metrics=['Train-accuracy:PERCENTAGE'],
arenaImage='cheyang/arena_launcher:v0.5',
arenaImage='cheyang/arena_launcher:v0.6',
timeout_hours=240):
"""This function submits MPI Job, it can run Allreduce-style Distributed Training.
Expand Down
2 changes: 1 addition & 1 deletion components/arena/python/arena/_arena_standalone_op.py
Expand Up @@ -23,7 +23,7 @@ def standalone_job_op(name, image, command, gpus=0, cpu_limit=0, memory_limit=0,
tensorboard=False, tensorboard_image=None,
data=[], sync_source=None, annotations=[],
metrics=['Train-accuracy:PERCENTAGE'],
arena_image='cheyang/arena_launcher:v0.5',
arena_image='cheyang/arena_launcher:v0.6',
timeout_hours=240):

"""This function submits a standalone training Job
Expand Down

0 comments on commit b029a09

Please sign in to comment.