Skip to content

Commit

Permalink
Add limit
Browse files Browse the repository at this point in the history
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
  • Loading branch information
terrytangyuan committed Aug 31, 2023
1 parent 34acef9 commit cb43812
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 8 deletions.
4 changes: 3 additions & 1 deletion pkg/apis/kubeflow/v2beta1/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ const (
// JobRoleLabel represents the label key for the job role, e.g. master.
JobRoleLabel = "training.kubeflow.org/job-role"

// MPIJobRestartCountLabel represents the number of times we have restarted this MPIJob upon failures.
// MPIJobRestartCountLabel represents the number of times we have restarted an MPIJob upon failures.
MPIJobRestartCountLabel = "training.kubeflow.org/restart-count"
// MPIJobRestartCountLimitLabel represents the maximum number of times we can restart an MPIJob upon failures.
MPIJobRestartCountLimitLabel = "training.kubeflow.org/restart-count-limit"
)
24 changes: 17 additions & 7 deletions pkg/controller/mpi_job_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -577,22 +577,32 @@ func (c *MPIJobController) syncHandler(key string) error {
if mpiJob.Status.ReplicaStatuses[kubeflow.MPIReplicaTypeWorker].Failed >= 0 || mpiJob.Status.ReplicaStatuses[kubeflow.MPIReplicaTypeLauncher].Failed >= 0 {
restartCount := 0
maxRestart := 3
if val, ok := mpiJob.GetLabels()[kubeflow.MPIJobRestartCountLabel]; ok {
if val == "" {
val = "0"
if count, ok := mpiJob.GetLabels()[kubeflow.MPIJobRestartCountLabel]; ok {
if count == "" {
count = "0"
}
i, err := strconv.Atoi(val)
countInt, err := strconv.Atoi(count)
if err != nil {
klog.V(4).Infof("failed to convert %s to integer: %s", kubeflow.MPIJobRestartCountLabel, val)
klog.V(4).Infof("failed to convert %s to integer: %s", kubeflow.MPIJobRestartCountLabel, count)
return err
}
restartCount = i + 1
restartCount = countInt + 1
if limit, ok := mpiJob.GetLabels()[kubeflow.MPIJobRestartCountLimitLabel]; ok {
if limit != "" {
limitInt, err := strconv.Atoi(limit)
if err != nil {
klog.V(4).Infof("failed to convert %s to integer: %s", kubeflow.MPIJobRestartCountLimitLabel, count)
return err
}
maxRestart = limitInt
}
}
if restartCount <= maxRestart {
newMPIJob := &kubeflow.MPIJob{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("%s-%d", mpiJob.Name, restartCount),
Namespace: mpiJob.Namespace,
Labels: map[string]string{kubeflow.MPIJobRestartCountLabel: string(restartCount)},
Labels: map[string]string{kubeflow.MPIJobRestartCountLabel: fmt.Sprintf("%d", restartCount)},
},
Spec: mpiJob.Spec,
}
Expand Down

0 comments on commit cb43812

Please sign in to comment.