Skip to content

Commit

Permalink
fix sync PodGroup logic
Browse files Browse the repository at this point in the history
  • Loading branch information
wackxu committed May 27, 2019
1 parent 671cdab commit dd94695
Showing 1 changed file with 8 additions and 10 deletions.
18 changes: 8 additions & 10 deletions pkg/controller.v1/tensorflow/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -306,14 +306,6 @@ func (tc *TFController) syncTFJob(key string) (bool, error) {
tfjob := sharedTFJob.DeepCopy()
tfjobNeedsSync := tc.satisfiedExpectations(tfjob)

if tc.Config.EnableGangScheduling {
minAvailableReplicas := getTotalReplicas(tfjob)
_, err := tc.SyncPodGroup(tfjob, minAvailableReplicas)
if err != nil {
logger.Warnf("Sync PodGroup %v: %v", tfjob.Name, err)
}
}

// Set default for the new tfjob.
scheme.Scheme.Default(tfjob)

Expand Down Expand Up @@ -418,13 +410,11 @@ func (tc *TFController) reconcileTFJobs(tfjob *tfv1.TFJob) error {
}

if tc.Config.EnableGangScheduling {
tc.Recorder.Event(tfjob, v1.EventTypeNormal, "JobTerminated", "Job is terminated, deleting PodGroup")
if err := tc.DeletePodGroup(tfjob); err != nil {
tc.Recorder.Eventf(tfjob, v1.EventTypeWarning, "FailedDeletePodGroup", "Error deleting: %v", err)
return err
} else {
tc.Recorder.Eventf(tfjob, v1.EventTypeNormal, "SuccessfulDeletePodGroup", "Deleted PodGroup: %v", tfjob.Name)

}
}

Expand All @@ -439,6 +429,14 @@ func (tc *TFController) reconcileTFJobs(tfjob *tfv1.TFJob) error {
return tc.updateStatusHandler(tfjob)
}

if tc.Config.EnableGangScheduling {
minAvailableReplicas := getTotalReplicas(tfjob)
_, err := tc.SyncPodGroup(tfjob, minAvailableReplicas)
if err != nil {
logger.Warnf("Sync PodGroup %v: %v", tfjob.Name, err)
}
}

// Save the current state of the replicas
replicasStatus := make(map[string]v1.PodPhase)

Expand Down

0 comments on commit dd94695

Please sign in to comment.