Add startupProbe result handling to kuberuntime #84279

Merged
merged 1 commit on Nov 13, 2019
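
For context, this PR wires startup probe results through the kubelet's container runtime manager so that a failed startupProbe gets the container killed and restarted, the same way a failed livenessProbe does. A minimal sketch of the user-facing feature this plumbing supports, assuming the v1.16/v1.17-era Go API where Probe still embeds Handler (the image name and thresholds are illustrative):

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/util/intstr"
)

func main() {
	c := v1.Container{
		Name:  "slow-starting-app",
		Image: "example.com/app:latest", // hypothetical image
		// The kubelet defers liveness/readiness probing until the startup
		// probe succeeds; with this PR, a startup probe that exceeds its
		// failure threshold gets the container killed and restarted.
		StartupProbe: &v1.Probe{
			Handler: v1.Handler{
				HTTPGet: &v1.HTTPGetAction{Path: "/healthz", Port: intstr.FromInt(8080)},
			},
			FailureThreshold: 30,
			PeriodSeconds:    10,
		},
	}
	fmt.Printf("startup probe failure threshold: %d\n", c.StartupProbe.FailureThreshold)
}
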
4 changes: 4 additions & 0 deletions pkg/kubelet/kubelet.go
@@ -583,6 +583,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
imageBackOff := flowcontrol.NewBackOff(backOffPeriod, MaxContainerBackOff)

klet.livenessManager = proberesults.NewManager()
klet.startupManager = proberesults.NewManager()

klet.podCache = kubecontainer.NewCache()
var checkpointManager checkpointmanager.CheckpointManager
@@ -671,6 +672,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
runtime, err := kuberuntime.NewKubeGenericRuntimeManager(
kubecontainer.FilterEventRecorder(kubeDeps.Recorder),
klet.livenessManager,
klet.startupManager,
seccompProfileRoot,
containerRefManager,
machineInfo,
@@ -777,6 +779,7 @@ func NewMainKubelet(kubeCfg *kubeletconfiginternal.KubeletConfiguration,
klet.probeManager = prober.NewManager(
klet.statusManager,
klet.livenessManager,
klet.startupManager,
klet.runner,
containerRefManager,
kubeDeps.Recorder)
@@ -976,6 +979,7 @@ type Kubelet struct {
probeManager prober.Manager
// Manages container health check results.
livenessManager proberesults.Manager
startupManager proberesults.Manager

// How long to keep idle streaming command execution/port forwarding
// connections open before terminating them
1 change: 1 addition & 0 deletions pkg/kubelet/kubelet_test.go
@@ -233,6 +233,7 @@ func newTestKubeletWithImageList(

kubelet.probeManager = probetest.FakeManager{}
kubelet.livenessManager = proberesults.NewManager()
kubelet.startupManager = proberesults.NewManager()

kubelet.containerManager = cm.NewStubContainerManager()
fakeNodeRef := &v1.ObjectReference{
1 change: 1 addition & 0 deletions pkg/kubelet/kuberuntime/BUILD
@@ -107,6 +107,7 @@ go_test(
"//pkg/kubelet/container/testing:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/metrics:go_default_library",
"//pkg/kubelet/prober/results:go_default_library",
"//pkg/kubelet/runtimeclass:go_default_library",
"//pkg/kubelet/runtimeclass/testing:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
1 change: 1 addition & 0 deletions pkg/kubelet/kuberuntime/fake_kuberuntime_manager.go
@@ -78,6 +78,7 @@ func newFakeKubeRuntimeManager(runtimeService internalapi.RuntimeService, imageS
cpuCFSQuota: false,
cpuCFSQuotaPeriod: metav1.Duration{Duration: time.Microsecond * 100},
livenessManager: proberesults.NewManager(),
startupManager: proberesults.NewManager(),
containerRefManager: kubecontainer.NewRefManager(),
machineInfo: machineInfo,
osInterface: osInterface,
6 changes: 6 additions & 0 deletions pkg/kubelet/kuberuntime/kuberuntime_manager.go
@@ -100,6 +100,7 @@ type kubeGenericRuntimeManager struct {

// Health check results.
livenessManager proberesults.Manager
startupManager proberesults.Manager

// If true, enforce container cpu limits with CFS quota support
cpuCFSQuota bool
@@ -150,6 +151,7 @@ type LegacyLogProvider interface {
func NewKubeGenericRuntimeManager(
recorder record.EventRecorder,
livenessManager proberesults.Manager,
startupManager proberesults.Manager,
seccompProfileRoot string,
containerRefManager *kubecontainer.RefManager,
machineInfo *cadvisorapi.MachineInfo,
@@ -175,6 +177,7 @@ func NewKubeGenericRuntimeManager(
cpuCFSQuotaPeriod: cpuCFSQuotaPeriod,
seccompProfileRoot: seccompProfileRoot,
livenessManager: livenessManager,
startupManager: startupManager,
containerRefManager: containerRefManager,
machineInfo: machineInfo,
osInterface: osInterface,
@@ -590,6 +593,9 @@ func (m *kubeGenericRuntimeManager) computePodActions(pod *v1.Pod, podStatus *ku
} else if liveness, found := m.livenessManager.Get(containerStatus.ID); found && liveness == proberesults.Failure {
// If the container failed the liveness probe, we should kill it.
message = fmt.Sprintf("Container %s failed liveness probe", container.Name)
} else if startup, found := m.startupManager.Get(containerStatus.ID); found && startup == proberesults.Failure {
// If the container failed the startup probe, we should kill it.
message = fmt.Sprintf("Container %s failed startup probe", container.Name)
} else {
// Keep the container.
keepCount++
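
The new branch in computePodActions works because the kubelet hands the same proberesults.Manager instance to both the prober (which writes startup results) and the runtime manager (which reads them), as shown in the kubelet.go changes above. A self-contained toy sketch of that read/write pattern (hypothetical types, not the real pkg/kubelet/prober/results package, which is also thread-safe and exposes an updates channel):

package main

import "fmt"

type Result int

const (
	Unknown Result = iota - 1
	Success
	Failure
)

// resultsCache mimics the Get/Set/Remove surface used above: a probe worker
// records the latest result per container ID, and computePodActions consults
// it when deciding whether a container must be killed and restarted.
type resultsCache struct{ byID map[string]Result }

func newResultsCache() *resultsCache { return &resultsCache{byID: map[string]Result{}} }

func (c *resultsCache) Set(id string, r Result) { c.byID[id] = r }
func (c *resultsCache) Remove(id string)        { delete(c.byID, id) }
func (c *resultsCache) Get(id string) (Result, bool) {
	r, ok := c.byID[id]
	return r, ok
}

func main() {
	startupManager := newResultsCache()

	// The prober side records a startup failure for a container...
	startupManager.Set("container-1", Failure)

	// ...and the runtime side reacts, mirroring the new else-if branch above.
	if startup, found := startupManager.Get("container-1"); found && startup == Failure {
		fmt.Println("Container container-1 failed startup probe")
	}
}
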
39 changes: 37 additions & 2 deletions pkg/kubelet/kuberuntime/kuberuntime_manager_test.go
@@ -41,6 +41,7 @@ import (
"k8s.io/kubernetes/pkg/features"
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
containertest "k8s.io/kubernetes/pkg/kubelet/container/testing"
proberesults "k8s.io/kubernetes/pkg/kubelet/prober/results"
)

var (
@@ -732,6 +733,7 @@ func TestComputePodActions(t *testing.T) {
mutatePodFn func(*v1.Pod)
mutateStatusFn func(*kubecontainer.PodStatus)
actions podActions
resetStatusFn func(*kubecontainer.PodStatus)
Review comment (Member): Can we createTestRuntimeManager inside the loop? So that we don't need to reset the status.

Reply (Contributor, PR author): Makes sense... maybe it could go into another PR?

}{
"everying is good; do nothing": {
actions: noAction,
@@ -850,8 +852,38 @@ func TestComputePodActions(t *testing.T) {
ContainersToKill: getKillMap(basePod, baseStatus, []int{1}),
ContainersToStart: []int{1},
},
-// TODO: Add a test case for containers which failed the liveness
-// check. Will need to fake the livessness check result.
},
"Kill and recreate the container if the liveness check has failed": {
mutatePodFn: func(pod *v1.Pod) {
pod.Spec.RestartPolicy = v1.RestartPolicyAlways
},
mutateStatusFn: func(status *kubecontainer.PodStatus) {
m.livenessManager.Set(status.ContainerStatuses[1].ID, proberesults.Failure, basePod)
},
actions: podActions{
SandboxID: baseStatus.SandboxStatuses[0].Id,
ContainersToKill: getKillMap(basePod, baseStatus, []int{1}),
ContainersToStart: []int{1},
},
resetStatusFn: func(status *kubecontainer.PodStatus) {
m.livenessManager.Remove(status.ContainerStatuses[1].ID)
},
},
"Kill and recreate the container if the startup check has failed": {
mutatePodFn: func(pod *v1.Pod) {
pod.Spec.RestartPolicy = v1.RestartPolicyAlways
},
mutateStatusFn: func(status *kubecontainer.PodStatus) {
m.startupManager.Set(status.ContainerStatuses[1].ID, proberesults.Failure, basePod)
},
actions: podActions{
SandboxID: baseStatus.SandboxStatuses[0].Id,
ContainersToKill: getKillMap(basePod, baseStatus, []int{1}),
ContainersToStart: []int{1},
},
resetStatusFn: func(status *kubecontainer.PodStatus) {
m.startupManager.Remove(status.ContainerStatuses[1].ID)
},
},
"Verify we do not create a pod sandbox if no ready sandbox for pod with RestartPolicy=Never and all containers exited": {
mutatePodFn: func(pod *v1.Pod) {
@@ -917,6 +949,9 @@ func TestComputePodActions(t *testing.T) {
}
actions := m.computePodActions(pod, status)
verifyActions(t, &test.actions, &actions, desc)
if test.resetStatusFn != nil {
test.resetStatusFn(status)
}
}
}

1 change: 1 addition & 0 deletions pkg/kubelet/prober/common_test.go
@@ -112,6 +112,7 @@ func newTestManager() *manager {
m := NewManager(
status.NewManager(&fake.Clientset{}, podManager, &statustest.FakePodDeletionSafetyProvider{}),
results.NewManager(),
results.NewManager(),
nil, // runner
refManager,
&record.FakeRecorder{},
2 changes: 1 addition & 1 deletion pkg/kubelet/prober/prober_manager.go
@@ -102,13 +102,13 @@ type manager struct {
func NewManager(
statusManager status.Manager,
livenessManager results.Manager,
startupManager results.Manager,
runner kubecontainer.ContainerCommandRunner,
refManager *kubecontainer.RefManager,
recorder record.EventRecorder) Manager {

prober := newProber(runner, refManager, recorder)
readinessManager := results.NewManager()
-startupManager := results.NewManager()
return &manager{
statusManager: statusManager,
prober: prober,
13 changes: 8 additions & 5 deletions pkg/kubelet/prober/results/results_manager.go
@@ -40,14 +40,17 @@ type Manager interface {
}

// Result is the type for probe results.
-type Result bool
type Result int

const (
-// Success is encoded as "true" (type Result)
-Success Result = true
-// Failure is encoded as "false" (type Result)
-Failure Result = false
// Unknown is encoded as -1 (type Result)
Unknown Result = iota - 1

// Success is encoded as 0 (type Result)
Success

// Failure is encoded as 1 (type Result)
Failure
)

func (r Result) String() string {
Review comment (Member): Should we add the new Result type here (and in ToPrometheusType) as well? I think the current behavior is ok, reporting -1/"UNKNOWN", but it may be nice to be explicit. I'll defer that to the approvers 😄
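
One possible follow-up in the spirit of that comment (a sketch only, not part of this diff; the ToPrometheusType signature and metric values are assumed from context) would handle the new value explicitly:

func (r Result) String() string {
	switch r {
	case Success:
		return "Success"
	case Failure:
		return "Failure"
	default:
		return "UNKNOWN" // covers Unknown and any future values
	}
}

// ToPrometheusType is assumed to map results onto the numeric values
// exported by the prober metrics; -1 keeps Unknown distinguishable.
func (r Result) ToPrometheusType() float64 {
	switch r {
	case Success:
		return 0
	case Failure:
		return 1
	default:
		return -1
	}
}
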
2 changes: 1 addition & 1 deletion pkg/kubelet/prober/worker.go
@@ -101,7 +101,7 @@ func newWorker(
case startup:
w.spec = container.StartupProbe
w.resultsManager = m.startupManager
-w.initialValue = results.Failure
w.initialValue = results.Unknown
}

basicMetricLabels := metrics.Labels{
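
With this change each probe type has a distinct initial value: Success for liveness (a container is presumed healthy until probed), Failure for readiness (not ready until probed), and now Unknown for startup (not yet probed, as opposed to having failed). A sketch of the resulting switch in newWorker, with the liveness and readiness branches assumed from the surrounding, unchanged code:

switch probeType {
case readiness:
	w.spec = container.ReadinessProbe
	w.resultsManager = m.readinessManager
	w.initialValue = results.Failure
case liveness:
	w.spec = container.LivenessProbe
	w.resultsManager = m.livenessManager
	w.initialValue = results.Success
case startup:
	w.spec = container.StartupProbe
	w.resultsManager = m.startupManager
	w.initialValue = results.Unknown
}
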
31 changes: 20 additions & 11 deletions pkg/kubelet/prober/worker_test.go
@@ -79,10 +79,12 @@ func TestDoProbe(t *testing.T) {
podStatus: &pendingStatus,
expectContinue: true,
expectSet: true,
expectedResult: results.Failure,
},
{ // Container terminated
podStatus: &terminatedStatus,
expectSet: true,
expectedResult: results.Failure,
},
{ // Probe successful.
podStatus: &runningStatus,
@@ -134,8 +136,15 @@ func TestInitialDelay(t *testing.T) {
m.statusManager.SetPodStatus(w.pod, getTestRunningStatusWithStarted(probeType != startup))

expectContinue(t, w, w.doProbe(), "during initial delay")
-// Default value depends on probe, true for liveness, otherwise false.
-expectResult(t, w, results.Result(probeType == liveness), "during initial delay")
// Default value depends on probe, Success for liveness, Failure for readiness, Unknown for startup
switch probeType {
case liveness:
expectResult(t, w, results.Success, "during initial delay")
case readiness:
expectResult(t, w, results.Failure, "during initial delay")
case startup:
expectResult(t, w, results.Unknown, "during initial delay")
}

// 100 seconds later...
laterStatus := getTestRunningStatusWithStarted(probeType != startup)
@@ -397,17 +406,17 @@ func TestResultRunOnStartupCheckFailure(t *testing.T) {
// Below FailureThreshold leaves probe state unchanged
// which is failed for startup at first.
m.prober.exec = fakeExecProber{probe.Failure, nil}
msg := "probe failure, result failure"
msg := "probe failure, result unknown"
expectContinue(t, w, w.doProbe(), msg)
-expectResult(t, w, results.Failure, msg)
expectResult(t, w, results.Unknown, msg)
if w.resultRun != 1 {
t.Errorf("Prober resultRun should be 1")
}

m.prober.exec = fakeExecProber{probe.Failure, nil}
msg = "2nd probe failure, result failure"
msg = "2nd probe failure, result unknown"
expectContinue(t, w, w.doProbe(), msg)
-expectResult(t, w, results.Failure, msg)
expectResult(t, w, results.Unknown, msg)
if w.resultRun != 2 {
t.Errorf("Prober resultRun should be 2")
}
@@ -446,11 +455,11 @@ func TestStartupProbeDisabledByStarted(t *testing.T) {
m := newTestManager()
w := newTestWorker(m, startup, v1.Probe{SuccessThreshold: 1, FailureThreshold: 2})
m.statusManager.SetPodStatus(w.pod, getTestRunningStatusWithStarted(false))
-// startupProbe fails
// startupProbe fails < FailureThreshold, stays unknown
m.prober.exec = fakeExecProber{probe.Failure, nil}
msg := "Not started, probe failure, result failure"
msg := "Not started, probe failure, result unknown"
expectContinue(t, w, w.doProbe(), msg)
-expectResult(t, w, results.Failure, msg)
expectResult(t, w, results.Unknown, msg)
// startupProbe succeeds
m.prober.exec = fakeExecProber{probe.Success, nil}
msg = "Started, probe success, result success"