Skip to content

Commit

Permalink
Fix TFJob status when enableDynamicWorker is set to true (#1455)
Browse files: browse the repository at this point in the history
  • Loading branch information
zw0610 authored Oct 29, 2021
1 parent 4ac55d2 commit 9efffd4
Showing 1 changed file with 5 additions and 0 deletions.
5 changes: 5 additions & 0 deletions pkg/controller.v1/tensorflow/tfjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -513,6 +513,11 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1
// we know it because we update the status condition when reconciling the replicas
trainingoperatorcommon.RestartedJobsCounterInc(tfJob.Namespace, tensorflowv1.FrameworkName)
} else {
if tfJob.Spec.EnableDynamicWorker && rtype == tensorflowv1.TFReplicaTypeWorker {
commonutil.LoggerForJob(tfJob).Infof("TFJob %s/%s continues regardless %d Worker replica(s) failed as enableDynamicWorker is set true.",
tfJob.Namespace, tfJob.Name, failed)
continue
}
msg := fmt.Sprintf("TFJob %s/%s has failed because %d %s replica(s) failed.",
tfJob.Namespace, tfJob.Name, failed, rtype)
r.recorder.Event(tfJob, corev1.EventTypeNormal, tfJobFailedReason, msg)
Expand Down

0 comments on commit 9efffd4

Please sign in to comment.