NodeLifecycleController treats node lease renewal as a heartbeat signal #69241

Merged 1 commit on Oct 12, 2018

1 change: 1 addition & 0 deletions cmd/kube-controller-manager/app/core.go
@@ -121,6 +121,7 @@ func startNodeIpamController(ctx ControllerContext) (http.Handler, bool, error)

func startNodeLifecycleController(ctx ControllerContext) (http.Handler, bool, error) {
lifecycleController, err := lifecyclecontroller.NewNodeLifecycleController(
ctx.InformerFactory.Coordination().V1beta1().Leases(),
ctx.InformerFactory.Core().V1().Pods(),
ctx.InformerFactory.Core().V1().Nodes(),
ctx.InformerFactory.Extensions().V1beta1().DaemonSets(),
11 changes: 11 additions & 0 deletions pkg/controller/nodelifecycle/BUILD
@@ -13,11 +13,13 @@ go_library(
"//pkg/controller:go_default_library",
"//pkg/controller/nodelifecycle/scheduler:go_default_library",
"//pkg/controller/util/node:go_default_library",
"//pkg/features:go_default_library",
"//pkg/scheduler/algorithm:go_default_library",
"//pkg/util/metrics:go_default_library",
"//pkg/util/node:go_default_library",
"//pkg/util/system:go_default_library",
"//pkg/util/taints:go_default_library",
"//staging/src/k8s.io/api/coordination/v1beta1:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/equality:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/errors:go_default_library",
@@ -26,10 +28,13 @@ go_library(
"//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/runtime:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//staging/src/k8s.io/client-go/informers/coordination/v1beta1:go_default_library",
"//staging/src/k8s.io/client-go/informers/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/informers/extensions/v1beta1:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/scheme:go_default_library",
"//staging/src/k8s.io/client-go/listers/coordination/v1beta1:go_default_library",
"//staging/src/k8s.io/client-go/listers/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/listers/extensions/v1beta1:go_default_library",
"//staging/src/k8s.io/client-go/tools/cache:go_default_library",
@@ -69,10 +74,12 @@ go_test(
"//pkg/controller/nodelifecycle/scheduler:go_default_library",
"//pkg/controller/testutil:go_default_library",
"//pkg/controller/util/node:go_default_library",
"//pkg/features:go_default_library",
"//pkg/kubelet/apis:go_default_library",
"//pkg/scheduler/algorithm:go_default_library",
"//pkg/util/node:go_default_library",
"//pkg/util/taints:go_default_library",
"//staging/src/k8s.io/api/coordination/v1beta1:go_default_library",
"//staging/src/k8s.io/api/core/v1:go_default_library",
"//staging/src/k8s.io/api/extensions/v1beta1:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/api/equality:go_default_library",
@@ -81,12 +88,16 @@ go_test(
"//staging/src/k8s.io/apimachinery/pkg/types:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/diff:go_default_library",
"//staging/src/k8s.io/apimachinery/pkg/util/wait:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/util/feature:go_default_library",
"//staging/src/k8s.io/apiserver/pkg/util/feature/testing:go_default_library",
"//staging/src/k8s.io/client-go/informers:go_default_library",
"//staging/src/k8s.io/client-go/informers/coordination/v1beta1:go_default_library",
"//staging/src/k8s.io/client-go/informers/core/v1:go_default_library",
"//staging/src/k8s.io/client-go/informers/extensions/v1beta1:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes:go_default_library",
"//staging/src/k8s.io/client-go/kubernetes/fake:go_default_library",
"//staging/src/k8s.io/client-go/testing:go_default_library",
"//staging/src/k8s.io/cloud-provider:go_default_library",
"//vendor/k8s.io/utils/pointer:go_default_library",
],
)
78 changes: 57 additions & 21 deletions pkg/controller/nodelifecycle/node_lifecycle_controller.go
@@ -31,6 +31,7 @@ import (

"github.com/golang/glog"

coordv1beta1 "k8s.io/api/coordination/v1beta1"
"k8s.io/api/core/v1"
apiequality "k8s.io/apimachinery/pkg/api/equality"
apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -39,10 +40,13 @@ import (
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
utilfeature "k8s.io/apiserver/pkg/util/feature"
coordinformers "k8s.io/client-go/informers/coordination/v1beta1"
coreinformers "k8s.io/client-go/informers/core/v1"
extensionsinformers "k8s.io/client-go/informers/extensions/v1beta1"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
coordlisters "k8s.io/client-go/listers/coordination/v1beta1"
corelisters "k8s.io/client-go/listers/core/v1"
extensionslisters "k8s.io/client-go/listers/extensions/v1beta1"
"k8s.io/client-go/tools/cache"
@@ -54,6 +58,7 @@ import (
"k8s.io/kubernetes/pkg/controller"
"k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler"
nodeutil "k8s.io/kubernetes/pkg/controller/util/node"
"k8s.io/kubernetes/pkg/features"
"k8s.io/kubernetes/pkg/scheduler/algorithm"
"k8s.io/kubernetes/pkg/util/metrics"
utilnode "k8s.io/kubernetes/pkg/util/node"
@@ -136,6 +141,7 @@ type nodeHealthData struct {
probeTimestamp metav1.Time
readyTransitionTimestamp metav1.Time
status *v1.NodeStatus
lease *coordv1beta1.Lease
}

// Controller is the controller that manages node's life cycle.
@@ -172,6 +178,8 @@ type Controller struct {
daemonSetStore extensionslisters.DaemonSetLister
daemonSetInformerSynced cache.InformerSynced

leaseLister coordlisters.LeaseLister
leaseInformerSynced cache.InformerSynced
nodeLister corelisters.NodeLister
nodeInformerSynced cache.InformerSynced
nodeExistsInCloudProvider func(types.NodeName) (bool, error)
@@ -190,19 +198,23 @@ type Controller struct {
nodeStartupGracePeriod time.Duration

// Controller will not proactively sync node health, but will monitor node
// health signal updated from kubelet. If it doesn't receive update for this
// amount of time, it will start posting "NodeReady==ConditionUnknown". The
// amount of time before which Controller start evicting pods is controlled
// via flag 'pod-eviction-timeout'.
// health signal updated from kubelet. There are 2 kinds of node healthiness
// signals: NodeStatus and NodeLease. NodeLease signal is generated only when
// NodeLease feature is enabled. If it doesn't receive update for this amount
// of time, it will start posting "NodeReady==ConditionUnknown". The amount of
// time before which Controller start evicting pods is controlled via flag
// 'pod-eviction-timeout'.
// Note: be cautious when changing the constant, it must work with
// nodeStatusUpdateFrequency in kubelet. There are several constraints:
// 1. nodeMonitorGracePeriod must be N times more than
// nodeStatusUpdateFrequency, where N means number of retries allowed for
// kubelet to post node health signal. It is pointless to make
// nodeMonitorGracePeriod be less than nodeStatusUpdateFrequency, since
// there will only be fresh values from Kubelet at an interval of
// nodeStatusUpdateFrequency. The constant must be less than
// podEvictionTimeout.
// nodeStatusUpdateFrequency in kubelet and renewInterval in NodeLease
// controller. The node health signal update frequency is the minimal of the
// two.
// There are several constraints:
// 1. nodeMonitorGracePeriod must be N times more than the node health signal
// update frequency, where N means number of retries allowed for kubelet to
// post node status/lease. It is pointless to make nodeMonitorGracePeriod
// be less than the node health signal update frequency, since there will
// only be fresh values from Kubelet at an interval of node health signal
// update frequency. The constant must be less than podEvictionTimeout.
// 2. nodeMonitorGracePeriod can't be too large for user experience - larger
// value takes longer for user to see up-to-date node health.
nodeMonitorGracePeriod time.Duration
@@ -229,7 +241,9 @@ type Controller struct {
}

// NewNodeLifecycleController returns a new taint controller.
func NewNodeLifecycleController(podInformer coreinformers.PodInformer,
func NewNodeLifecycleController(
leaseInformer coordinformers.LeaseInformer,
podInformer coreinformers.PodInformer,
nodeInformer coreinformers.NodeInformer,
daemonSetInformer extensionsinformers.DaemonSetInformer,
cloud cloudprovider.Interface,
@@ -373,6 +387,9 @@ func NewNodeLifecycleController(podInformer coreinformers.PodInformer,
}),
})

nc.leaseLister = leaseInformer.Lister()
nc.leaseInformerSynced = leaseInformer.Informer().HasSynced

nc.nodeLister = nodeInformer.Lister()
nc.nodeInformerSynced = nodeInformer.Informer().HasSynced

@@ -389,7 +406,7 @@ func (nc *Controller) Run(stopCh <-chan struct{}) {
glog.Infof("Starting node controller")
defer glog.Infof("Shutting down node controller")

if !controller.WaitForCacheSync("taint", stopCh, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
if !controller.WaitForCacheSync("taint", stopCh, nc.leaseInformerSynced, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) {
return
}

@@ -811,7 +828,7 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
_, currentReadyCondition := v1node.GetNodeCondition(&node.Status, v1.NodeReady)
if currentReadyCondition == nil {
// If ready condition is nil, then kubelet (or nodecontroller) never posted node status.
// A fake ready condition is created, where LastProbeTime and LastTransitionTime is set
// A fake ready condition is created, where LastHeartbeatTime and LastTransitionTime is set
// to node.CreationTimestamp to avoid handle the corner case.
observedReadyCondition = v1.NodeCondition{
Type: v1.NodeReady,
Expand All @@ -820,10 +837,14 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
LastTransitionTime: node.CreationTimestamp,
}
gracePeriod = nc.nodeStartupGracePeriod
nc.nodeHealthMap[node.Name] = &nodeHealthData{
status: &node.Status,
probeTimestamp: node.CreationTimestamp,
readyTransitionTimestamp: node.CreationTimestamp,
if _, found := nc.nodeHealthMap[node.Name]; found {
Contributor:

When do we expect the node to be in the nodeHealthMap but not have the ready condition set?

Contributor:

Another question is whether it's possible that nc.nodeHealthMap[node.Name] is set to nil?

It's not obvious to me that when the value is set at line 929 below (nc.nodeHealthMap[node.Name] = savedNodeHealth), savedNodeHealth will always be non-nil...

Member Author:

> When do we expect the node to be in the nodeHealthMap but not have the ready condition set?

Look at this example:

  • time t0: Node is created.
  • time t1: Node renews its lease, but NodeStatus does not have a ready condition, and found is false. We create and save a fake ready condition (unknown) here, and we also save the lease below.
  • time t2: NodeStatus still does not have a ready condition, so currentReadyCondition is still nil. We saved the node at t1 with an unknown ready condition, so found is true.

> Another question is whether it's possible that nc.nodeHealthMap[node.Name] is set to nil?

savedNodeHealth is always non-nil at line 929. So once this function has run at least once, nc.nodeHealthMap[node.Name] is always non-nil.

Consider 2 cases:

  • If currentReadyCondition is nil, we either use the existing one or create a fake one. Then savedNodeHealth on line 855 is not nil.
  • If currentReadyCondition is not nil and we do not yet have a saved copy (i.e., nc.nodeHealthMap[node.Name] is still nil), then at line 855 we check again, found is false, so we create a savedNodeHealth at line 879 and assign it to nc.nodeHealthMap[node.Name] at line 929.
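
For illustration, here is a self-contained toy sketch of the bookkeeping described above. The names and types are hypothetical stand-ins, not the controller's actual code; it only shows why the map entry created at t1 is still found at t2 even though the ready condition is missing at both points.

```go
package main

import (
	"fmt"
	"time"
)

// toyHealth stands in for nodeHealthData, just enough for this example.
type toyHealth struct {
	hasReadyCondition bool
	probeTimestamp    time.Time
}

var healthMap = map[string]*toyHealth{}

// observe stands in for one pass of tryUpdateNodeHealth over a node that has
// renewed its lease but may not have posted a ready condition yet.
func observe(node string, hasReadyCondition bool, now time.Time) {
	h, found := healthMap[node]
	if !found {
		// First pass (t1): no entry yet, so create one. The real controller
		// also fabricates an Unknown ready condition at this point.
		h = &toyHealth{}
		healthMap[node] = h
	}
	h.hasReadyCondition = hasReadyCondition
	// A renewed lease alone refreshes the probe timestamp.
	h.probeTimestamp = now
}

func main() {
	t1 := time.Now()
	observe("node-a", false, t1)                     // t1: lease renewed, no ready condition, entry created
	observe("node-a", false, t1.Add(10*time.Second)) // t2: still no ready condition, entry is found
	fmt.Printf("entry after t2: %+v\n", *healthMap["node-a"])
}
```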

Contributor:

Thanks for explaining. I think the code here is not very easy to follow (not this PR's fault).

One case that seems a bit strange to me is that even if the node doesn't have a status, as long as it continues renewing the lease, the controller will take no action. I think this is by design, but it still feels strange :\

Member Author:

In terms of heartbeat, I think renewing the lease satisfies the need of making sure the node is alive.

@wojtek-t, WDYT?

Member:

So the situation where a node does not have a status but keeps renewing its lease depends on the value of node-status-update-period.

So what we have been thinking is to switch:

  • set lease refresh period (however we call it) to how frequently we update node status now
  • decrease frequency of node status updates to something like 1 minute
  • but still compute node statuses with the same frequency (equal to the lease refresh period)

So if we make leases and node statuses independent, then yes - there may be a small period when there is no node status (this can pretty much happen only after node registration, though, because after that it should always be there).
But if we made them somehow correlated (I'm not saying we should do that - I'm saying it's theoretically possible), then it would only happen in race conditions.

But yeah - I agree the latter probably doesn't make much sense.
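
To make the relationship between these periods concrete, here is a small illustrative check of the constraints from the nodeMonitorGracePeriod comment, with lease renewal and status updates decoupled as suggested. All values below are hypothetical examples, not values this PR sets.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// Hypothetical, illustrative values only.
	leaseRenewInterval := 10 * time.Second       // how often kubelet would renew its Lease
	nodeStatusUpdateFrequency := 1 * time.Minute // how often kubelet would post full NodeStatus
	nodeMonitorGracePeriod := 40 * time.Second
	podEvictionTimeout := 5 * time.Minute

	// The node health signal frequency is the smaller of the two update periods.
	signalFrequency := leaseRenewInterval
	if nodeStatusUpdateFrequency < signalFrequency {
		signalFrequency = nodeStatusUpdateFrequency
	}

	// Constraint 1: the grace period must allow N retries of the health signal.
	retries := int(nodeMonitorGracePeriod / signalFrequency)
	fmt.Printf("retries allowed before NodeReady goes Unknown: %d\n", retries) // 4 with these values

	// Constraint 2: the grace period must be smaller than the pod eviction timeout.
	fmt.Println("nodeMonitorGracePeriod < podEvictionTimeout:", nodeMonitorGracePeriod < podEvictionTimeout)
}
```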

Member Author:

@yujuhong, does this small period where there is no node status sound ok?

Contributor:

I was more interested in the pathological case where kubelet could not ever post the status, while it could still be sending heartbeats. I think this could theoretically happen if we decouple heartbeat and status updates in kubelet (which is probably the right thing to do, to not add more complexity and dependency). I don't think we need to deal with the pathological case now, but in case we want to put a safeguard in place in the future, let's settle on leaving a comment saying something like: "If kubelet never posted the node status, but continues renewing the heartbeat leases, the node controller will assume the node is healthy and take no action."
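
A minimal sketch of the resulting behaviour, with a hypothetical helper rather than code from this PR: any fresh signal, whether a status update or a lease renewal, resets the probe timestamp, so a node that only renews leases never crosses the grace period.

```go
package main

import (
	"fmt"
	"time"
)

// shouldMarkUnknown reports whether the controller would flip NodeReady to
// Unknown, given the last time any health signal (status update or lease
// renewal) was observed for the node.
func shouldMarkUnknown(lastProbe, now time.Time, gracePeriod time.Duration) bool {
	return now.After(lastProbe.Add(gracePeriod))
}

func main() {
	grace := 40 * time.Second // illustrative value
	lastLeaseRenewal := time.Now().Add(-10 * time.Second)
	// Prints false: the lease renewal keeps the node within the grace period,
	// so the controller takes no action even if the status is stale.
	fmt.Println(shouldMarkUnknown(lastLeaseRenewal, time.Now(), grace))
}
```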

Member:

+1

Member Author:

Done. Added the comments at line 923 now. PTAL

nc.nodeHealthMap[node.Name].status = &node.Status
} else {
nc.nodeHealthMap[node.Name] = &nodeHealthData{
status: &node.Status,
probeTimestamp: node.CreationTimestamp,
readyTransitionTimestamp: node.CreationTimestamp,
}
}
} else {
// If ready condition is not nil, make a copy of it, since we may modify it in place later.
@@ -847,8 +868,10 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
// - currently only correct Ready State transition outside of Node Controller is marking it ready by Kubelet, we don't check
// if that's the case, but it does not seem necessary.
var savedCondition *v1.NodeCondition
var savedLease *coordv1beta1.Lease
if found {
_, savedCondition = v1node.GetNodeCondition(savedNodeHealth.status, v1.NodeReady)
savedLease = savedNodeHealth.lease
}
_, observedCondition := v1node.GetNodeCondition(&node.Status, v1.NodeReady)
if !found {
Expand Down Expand Up @@ -894,11 +917,23 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
readyTransitionTimestamp: transitionTime,
}
}
var observedLease *coordv1beta1.Lease
if utilfeature.DefaultFeatureGate.Enabled(features.NodeLease) {
// Always update the probe time if node lease is renewed.
// Note: If kubelet never posted the node status, but continues renewing the
// heartbeat leases, the node controller will assume the node is healthy and
// take no action.
observedLease, _ = nc.leaseLister.Leases(v1.NamespaceNodeLease).Get(node.Name)
if observedLease != nil && (savedLease == nil || savedLease.Spec.RenewTime.Before(observedLease.Spec.RenewTime)) {
savedNodeHealth.lease = observedLease
savedNodeHealth.probeTimestamp = nc.now()
}
}
nc.nodeHealthMap[node.Name] = savedNodeHealth

if nc.now().After(savedNodeHealth.probeTimestamp.Add(gracePeriod)) {
// NodeReady condition was last set longer ago than gracePeriod, so update it to Unknown
// (regardless of its current value) in the master.
// NodeReady condition or lease was last set longer ago than gracePeriod, so
// update it to Unknown (regardless of its current value) in the master.
if currentReadyCondition == nil {
glog.V(2).Infof("node %v is never updated by kubelet", node.Name)
node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{
@@ -967,6 +1002,7 @@ func (nc *Controller) tryUpdateNodeHealth(node *v1.Node) (time.Duration, v1.Node
status: &node.Status,
probeTimestamp: nc.nodeHealthMap[node.Name].probeTimestamp,
readyTransitionTimestamp: nc.now(),
lease: observedLease,
Member:

All the changes in this file look reasonable to me.

@gmarek - would you be able to take a look too?

}
return gracePeriod, observedReadyCondition, currentReadyCondition, nil
}