kubelet: stop probing if liveness check fails #22107

Merged · 1 commit · Mar 1, 2016
18 changes: 18 additions & 0 deletions pkg/kubelet/prober/worker.go
@@ -61,6 +61,9 @@ type worker struct {
lastResult results.Result
// How many times in a row the probe has returned the same result.
resultRun int

// If set, skip probing.
onHold bool
}

// Creates and starts a new probe worker.
@@ -165,6 +168,13 @@ func (w *worker) doProbe() (keepGoing bool) {
}
w.containerID = kubecontainer.ParseContainerID(c.ContainerID)
w.resultsManager.Set(w.containerID, w.initialValue, w.pod)
// We've got a new container; resume probing.
w.onHold = false
}

if w.onHold {
// Worker is on hold until there is a new container.
return true
}

if c.State.Running == nil {
@@ -203,5 +213,13 @@ func (w *worker) doProbe() (keepGoing bool) {

w.resultsManager.Set(w.containerID, result, w.pod)

if w.probeType == liveness && result == results.Failure {
// The container failed a liveness check and will need to be restarted.
// Stop probing until we see a new container ID. This is to reduce the
// chance of hitting #21751, where running `docker exec` when a
// container is being stopped may lead to corrupted container state.
w.onHold = true
}

return true
}
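
The control flow this file adds can be distilled into a minimal, runnable Go sketch. The names below (probeWorker, probeResult, runProbe) are illustrative stand-ins, not the kubelet's actual types; only the onHold handling mirrors the diff.

package main

import "fmt"

type probeResult int

const (
	success probeResult = iota
	failure
)

func (r probeResult) String() string {
	if r == success {
		return "success"
	}
	return "failure"
}

// probeWorker stands in for the kubelet's prober worker.
type probeWorker struct {
	probeType   string // "liveness" or "readiness"
	containerID string
	onHold      bool // if set, skip probing until a new container appears
	runProbe    func() probeResult
	lastResult  probeResult
}

// doProbe mirrors the diff: a new container ID lifts the hold, and a
// failed liveness probe sets it.
func (w *probeWorker) doProbe(currentContainerID string) {
	if currentContainerID != w.containerID {
		// We've got a new container; resume probing.
		w.containerID = currentContainerID
		w.onHold = false
	}
	if w.onHold {
		// Worker is on hold until there is a new container.
		return
	}
	w.lastResult = w.runProbe()
	if w.probeType == "liveness" && w.lastResult == failure {
		// The container will be restarted; stop probing until we
		// see a new container ID.
		w.onHold = true
	}
}

func main() {
	w := &probeWorker{probeType: "liveness", containerID: "cont-1"}

	w.runProbe = func() probeResult { return failure }
	w.doProbe("cont-1")
	fmt.Println(w.lastResult, w.onHold) // failure true: worker goes on hold

	w.runProbe = func() probeResult { return success }
	w.doProbe("cont-1")
	fmt.Println(w.lastResult, w.onHold) // failure true: held, probe skipped

	w.doProbe("cont-2") // new container ID lifts the hold
	fmt.Println(w.lastResult, w.onHold) // success false
}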
37 changes: 36 additions & 1 deletion pkg/kubelet/prober/worker_test.go
@@ -275,7 +275,7 @@ func TestHandleCrash(t *testing.T) {
}

func expectResult(t *testing.T, w *worker, expectedResult results.Result, msg string) {
- result, ok := resultsManager(w.probeManager, w.probeType).Get(testContainerID)
+ result, ok := resultsManager(w.probeManager, w.probeType).Get(w.containerID)
if !ok {
t.Errorf("[%s - %s] Expected result to be set, but was not set", w.probeType, msg)
} else if result != expectedResult {
@@ -305,3 +305,38 @@ type crashingExecProber struct{}
func (p crashingExecProber) Probe(_ exec.Cmd) (probe.Result, string, error) {
panic("Intentional Probe crash.")
}

func TestOnHoldOnLivenessCheckFailure(t *testing.T) {
m := newTestManager()
w := newTestWorker(m, liveness, api.Probe{SuccessThreshold: 1, FailureThreshold: 1})
status := getTestRunningStatus()
m.statusManager.SetPodStatus(w.pod, status)

// First probe should fail.
m.prober.exec = fakeExecProber{probe.Failure, nil}
msg := "first probe"
expectContinue(t, w, w.doProbe(), msg)
expectResult(t, w, results.Failure, msg)
if !w.onHold {
t.Errorf("Prober should be on hold due to liveness check failure")
}
// Set fakeExecProber to return success. However, the result will remain
// failure because the worker is on hold and won't probe.
m.prober.exec = fakeExecProber{probe.Success, nil}
msg = "while on hold"
expectContinue(t, w, w.doProbe(), msg)
expectResult(t, w, results.Failure, msg)
if !w.onHold {
t.Errorf("Prober should be on hold due to liveness check failure")
}

// Set a new container ID to lift the hold. The next probe will succeed.
status.ContainerStatuses[0].ContainerID = "test://newCont_ID"
m.statusManager.SetPodStatus(w.pod, status)
msg = "hold lifted"
expectContinue(t, w, w.doProbe(), msg)
expectResult(t, w, results.Success, msg)
if w.onHold {
t.Errorf("Prober should not be on hold anymore")
}
}
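
Design note: the worker goes on hold rather than being stopped outright because the same worker must resume probing once the kubelet restarts the container under a new ID, which is exactly the transition TestOnHoldOnLivenessCheckFailure exercises above. Holding also avoids running docker exec against a container that is being torn down, the failure mode described in #21751.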