Skip to content

Commit

Permalink
Check the health of PLEG when updating the node status
Browse files Browse the repository at this point in the history
  • Loading branch information
yujuhong committed Jan 11, 2017
1 parent 03106dd commit ec0e99c
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 7 deletions.
15 changes: 8 additions & 7 deletions pkg/kubelet/pleg/generic.go
Expand Up @@ -75,6 +75,11 @@ const (
plegContainerExited plegContainerState = "exited"
plegContainerUnknown plegContainerState = "unknown"
plegContainerNonExistent plegContainerState = "non-existent"

// The threshold needs to be greater than the relisting period + the
// relisting time, which can vary significantly. Set a conservative
// threshold to avoid flipping between healthy and unhealthy.
relistThreshold = 3 * time.Minute
)

func convertState(state kubecontainer.ContainerState) plegContainerState {
Expand Down Expand Up @@ -126,13 +131,9 @@ func (g *GenericPLEG) Start() {

// Healthy reports whether the PLEG has relisted recently. It reads the last
// relist time via getRelistTime and returns false together with a descriptive
// error once the time since that relist exceeds the threshold; otherwise it
// returns true and a nil error.
//
// NOTE(review): this span appears to be a rendered diff hunk with the +/-
// markers stripped, so two versions of the check sit back to back. The lines
// from the TODO through the "last seen active at" return look like the removed
// version (function-local 2-minute threshold); the elapsed/relistThreshold
// lines look like their replacement. Confirm against the real file before
// treating this text as compilable Go.
func (g *GenericPLEG) Healthy() (bool, error) {
relistTime := g.getRelistTime()
// TODO: Evaluate if we can reduce this threshold.
// The threshold needs to be greater than the relisting period + the
// relisting time, which can vary significantly. Set a conservative
// threshold so that we don't cause kubelet to be restarted unnecessarily.
threshold := 2 * time.Minute
if g.clock.Since(relistTime) > threshold {
return false, fmt.Errorf("pleg was last seen active at %v", relistTime)
// Time elapsed since the last recorded relist, measured with the PLEG's clock.
elapsed := g.clock.Since(relistTime)
if elapsed > relistThreshold {
return false, fmt.Errorf("pleg was last seen active %v ago; threshold is %v", elapsed, relistThreshold)
}
return true, nil
}
Expand Down
22 changes: 22 additions & 0 deletions pkg/kubelet/runtime.go
Expand Up @@ -30,6 +30,22 @@ type runtimeState struct {
internalError error
cidr string
initError error
healthChecks []*healthCheck
}

type (
	// healthCheckFnType is the signature of a health probe: it returns
	// whether the probed component is healthy, plus an error giving detail.
	// Implementations should be cheap to call and should not depend on
	// external components (e.g., the container runtime).
	healthCheckFnType func() (bool, error)

	// healthCheck pairs a probe function with the name used to identify it
	// when its result is reported.
	healthCheck struct {
		name string
		fn   healthCheckFnType
	}
)

// addHealthCheck appends a health check built from name and f to the
// runtime state's list of checks. The append is done while holding the
// state's lock.
func (s *runtimeState) addHealthCheck(name string, f healthCheckFnType) {
	check := &healthCheck{fn: f, name: name}

	s.Lock()
	defer s.Unlock()
	s.healthChecks = append(s.healthChecks, check)
}

func (s *runtimeState) setRuntimeSync(t time.Time) {
Expand Down Expand Up @@ -81,6 +97,12 @@ func (s *runtimeState) runtimeErrors() []string {
if s.internalError != nil {
ret = append(ret, s.internalError.Error())
}
for _, hc := range s.healthChecks {
if ok, err := hc.fn(); !ok {
ret = append(ret, fmt.Sprintf("%s is not healthy: %v", hc.name, err))
}
}

return ret
}

Expand Down

0 comments on commit ec0e99c

Please sign in to comment.