kubernetes · k8s-github-robot · May 20, 2016 · May 16, 2016
diff --git a/pkg/controller/node/nodecontroller.go b/pkg/controller/node/nodecontroller.go
@@ -44,6 +44,7 @@ import (
 	"k8s.io/kubernetes/pkg/util/metrics"
 	utilruntime "k8s.io/kubernetes/pkg/util/runtime"
 	"k8s.io/kubernetes/pkg/util/sets"
+	"k8s.io/kubernetes/pkg/util/system"
 	"k8s.io/kubernetes/pkg/util/wait"
 	"k8s.io/kubernetes/pkg/version"
 	"k8s.io/kubernetes/pkg/watch"
@@ -123,6 +124,11 @@ type NodeController struct {
 
 	forcefullyDeletePod       func(*api.Pod) error
 	nodeExistsInCloudProvider func(string) (bool, error)
+
+	// While in network segmentation mode the NodeController does not evict Pods from unhealthy Nodes.
+	// The mode is enabled when no non-master Node is observed as Ready and disabled again as soon as
+	// the NodeController sees a Ready non-master Node. This is a temporary fix for v1.3.
+	networkSegmentationMode bool
 }
 
 // NewNodeController returns a new node controller to sync instances from cloudprovider.
@@ -141,10 +147,10 @@ func NewNodeController(
 	recorder := eventBroadcaster.NewRecorder(api.EventSource{Component: "controllermanager"})
 	eventBroadcaster.StartLogging(glog.Infof)
 	if kubeClient != nil {
-		glog.Infof("Sending events to api server.")
+		glog.V(0).Infof("Sending events to api server.")
 		eventBroadcaster.StartRecordingToSink(&unversionedcore.EventSinkImpl{Interface: kubeClient.Core().Events("")})
 	} else {
-		glog.Infof("No api server defined - no events will be sent to API server.")
+		glog.V(0).Infof("No api server defined - no events will be sent to API server.")
 	}
 
 	if kubeClient != nil && kubeClient.Core().GetRESTClient().GetRateLimiter() != nil {
@@ -282,7 +288,7 @@ func (nc *NodeController) Run(period time.Duration) {
 			}
 
 			if completed {
-				glog.Infof("All pods terminated on %s", value.Value)
+				glog.V(2).Infof("All pods terminated on %s", value.Value)
 				nc.recordNodeEvent(value.Value, api.EventTypeNormal, "TerminatedAllPods", fmt.Sprintf("Terminated all Pods on Node %s.", value.Value))
 				return true, 0
 			}
@@ -371,7 +377,7 @@ func (nc *NodeController) maybeDeleteTerminatingPod(obj interface{}) {
 	node := nodeObj.(*api.Node)
 	v, err := version.Parse(node.Status.NodeInfo.KubeletVersion)
 	if err != nil {
-		glog.Infof("couldn't parse verions %q of minion: %v", node.Status.NodeInfo.KubeletVersion, err)
+		glog.V(0).Infof("couldn't parse verions %q of minion: %v", node.Status.NodeInfo.KubeletVersion, err)
 		utilruntime.HandleError(nc.forcefullyDeletePod(pod))
 		return
 	}
@@ -407,7 +413,7 @@ func forcefullyDeletePod(c clientset.Interface, pod *api.Pod) error {
 	var zero int64
 	err := c.Core().Pods(pod.Namespace).Delete(pod.Name, &api.DeleteOptions{GracePeriodSeconds: &zero})
 	if err == nil {
-		glog.Infof("forceful deletion of %s succeeded", pod.Name)
+		glog.V(4).Infof("forceful deletion of %s succeeded", pod.Name)
 	}
 	return err
 }
@@ -449,13 +455,14 @@ func (nc *NodeController) monitorNodeStatus() error {
 		// reduce lists/decouple this from monitoring status.
 		nc.reconcileNodeCIDRs(nodes)
 	}
+	seenReady := false
 	for i := range nodes.Items {
 		var gracePeriod time.Duration
-		var lastReadyCondition api.NodeCondition
-		var readyCondition *api.NodeCondition
+		var observedReadyCondition api.NodeCondition
+		var currentReadyCondition *api.NodeCondition
 		node := &nodes.Items[i]
 		for rep := 0; rep < nodeStatusUpdateRetry; rep++ {
-			gracePeriod, lastReadyCondition, readyCondition, err = nc.tryUpdateNodeStatus(node)
+			gracePeriod, observedReadyCondition, currentReadyCondition, err = nc.tryUpdateNodeStatus(node)
 			if err == nil {
 				break
 			}
@@ -474,28 +481,32 @@ func (nc *NodeController) monitorNodeStatus() error {
 
 		decisionTimestamp := nc.now()
 
-		if readyCondition != nil {
+		if currentReadyCondition != nil {
 			// Check eviction timeout against decisionTimestamp
-			if lastReadyCondition.Status == api.ConditionFalse &&
+			if observedReadyCondition.Status == api.ConditionFalse &&
 				decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) {
 				if nc.evictPods(node.Name) {
 					glog.V(4).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout)
 				}
 			}
-			if lastReadyCondition.Status == api.ConditionUnknown &&
+			if observedReadyCondition.Status == api.ConditionUnknown &&
 				decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout)) {
 				if nc.evictPods(node.Name) {
 					glog.V(4).Infof("Evicting pods on node %s: %v is later than %v + %v", node.Name, decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod)
 				}
 			}
-			if lastReadyCondition.Status == api.ConditionTrue {
+			if observedReadyCondition.Status == api.ConditionTrue {
+				// We do not treat a master node as a part of the cluster for network segmentation checking.
+				if !system.IsMasterNode(node) {
+					seenReady = true
+				}
 				if nc.cancelPodEviction(node.Name) {
 					glog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name)
 				}
 			}
 
 			// Report node event.
-			if readyCondition.Status != api.ConditionTrue && lastReadyCondition.Status == api.ConditionTrue {
+			if currentReadyCondition.Status != api.ConditionTrue && observedReadyCondition.Status == api.ConditionTrue {
 				nc.recordNodeStatusChange(node, "NodeNotReady")
 				if err = nc.markAllPodsNotReady(node.Name); err != nil {
 					utilruntime.HandleError(fmt.Errorf("Unable to mark all pods NotReady on node %v: %v", node.Name, err))
@@ -504,14 +515,14 @@ func (nc *NodeController) monitorNodeStatus() error {
 
 			// Check with the cloud provider to see if the node still exists. If it
 			// doesn't, delete the node immediately.
-			if readyCondition.Status != api.ConditionTrue && nc.cloud != nil {
+			if currentReadyCondition.Status != api.ConditionTrue && nc.cloud != nil {
 				exists, err := nc.nodeExistsInCloudProvider(node.Name)
 				if err != nil {
 					glog.Errorf("Error determining if node %v exists in cloud: %v", node.Name, err)
 					continue
 				}
 				if !exists {
-					glog.Infof("Deleting node (no longer present in cloud provider): %s", node.Name)
+					glog.V(2).Infof("Deleting node (no longer present in cloud provider): %s", node.Name)
 					nc.recordNodeEvent(node.Name, api.EventTypeNormal, "DeletingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name))
 					go func(nodeName string) {
 						defer utilruntime.HandleCrash()
@@ -527,6 +538,18 @@ func (nc *NodeController) monitorNodeStatus() error {
 			}
 		}
 	}
+
+	// The NC doesn't see any Ready Node. We assume that the network is segmented and that Nodes cannot connect to the
+	// API server and update their statuses. The NC enters network segmentation mode and cancels all evictions in progress.
+	if !seenReady {
+		nc.networkSegmentationMode = true
+		nc.stopAllPodEvictions()
+	} else {
+		if nc.networkSegmentationMode {
+			nc.forceUpdateAllProbeTimes()
+			nc.networkSegmentationMode = false
+		}
+	}
 	return nil
 }
 
@@ -632,13 +655,13 @@ func (nc *NodeController) recordNodeStatusChange(node *api.Node, new_status stri
 func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, api.NodeCondition, *api.NodeCondition, error) {
 	var err error
 	var gracePeriod time.Duration
-	var lastReadyCondition api.NodeCondition
-	readyCondition := nc.getCondition(&node.Status, api.NodeReady)
-	if readyCondition == nil {
+	var observedReadyCondition api.NodeCondition
+	currentReadyCondition := nc.getCondition(&node.Status, api.NodeReady)
+	if currentReadyCondition == nil {
 		// If ready condition is nil, then kubelet (or nodecontroller) never posted node status.
 		// A fake ready condition is created, where LastProbeTime and LastTransitionTime is set
 		// to node.CreationTimestamp to avoid handle the corner case.
-		lastReadyCondition = api.NodeCondition{
+		observedReadyCondition = api.NodeCondition{
 			Type:               api.NodeReady,
 			Status:             api.ConditionUnknown,
 			LastHeartbeatTime:  node.CreationTimestamp,
@@ -652,7 +675,7 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
 		}
 	} else {
 		// If ready condition is not nil, make a copy of it, since we may modify it in place later.
-		lastReadyCondition = *readyCondition
+		observedReadyCondition = *currentReadyCondition
 		gracePeriod = nc.nodeMonitorGracePeriod
 	}
 
@@ -683,15 +706,13 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
 			probeTimestamp:           nc.now(),
 			readyTransitionTimestamp: nc.now(),
 		}
-		nc.nodeStatusMap[node.Name] = savedNodeStatus
 	} else if savedCondition == nil && observedCondition != nil {
 		glog.V(1).Infof("Creating timestamp entry for newly observed Node %s", node.Name)
 		savedNodeStatus = nodeStatusData{
 			status:                   node.Status,
 			probeTimestamp:           nc.now(),
 			readyTransitionTimestamp: nc.now(),
 		}
-		nc.nodeStatusMap[node.Name] = savedNodeStatus
 	} else if savedCondition != nil && observedCondition == nil {
 		glog.Errorf("ReadyCondition was removed from Status of Node %s", node.Name)
 		// TODO: figure out what to do in this case. For now we do the same thing as above.
@@ -700,7 +721,6 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
 			probeTimestamp:           nc.now(),
 			readyTransitionTimestamp: nc.now(),
 		}
-		nc.nodeStatusMap[node.Name] = savedNodeStatus
 	} else if savedCondition != nil && observedCondition != nil && savedCondition.LastHeartbeatTime != observedCondition.LastHeartbeatTime {
 		var transitionTime unversioned.Time
 		// If ReadyCondition changed since the last time we checked, we update the transition timestamp to "now",
@@ -713,7 +733,7 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
 			transitionTime = savedNodeStatus.readyTransitionTimestamp
 		}
 		if glog.V(5) {
-			glog.Infof("Node %s ReadyCondition updated. Updating timestamp: %+v vs %+v.", node.Name, savedNodeStatus.status, node.Status)
+			glog.V(5).Infof("Node %s ReadyCondition updated. Updating timestamp: %+v vs %+v.", node.Name, savedNodeStatus.status, node.Status)
 		} else {
 			glog.V(3).Infof("Node %s ReadyCondition updated. Updating timestamp.", node.Name)
 		}
@@ -722,13 +742,13 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
 			probeTimestamp:           nc.now(),
 			readyTransitionTimestamp: transitionTime,
 		}
-		nc.nodeStatusMap[node.Name] = savedNodeStatus
 	}
+	nc.nodeStatusMap[node.Name] = savedNodeStatus
 
 	if nc.now().After(savedNodeStatus.probeTimestamp.Add(gracePeriod)) {
 		// NodeReady condition was last set longer ago than gracePeriod, so update it to Unknown
 		// (regardless of its current value) in the master.
-		if readyCondition == nil {
+		if currentReadyCondition == nil {
 			glog.V(2).Infof("node %v is never updated by kubelet", node.Name)
 			node.Status.Conditions = append(node.Status.Conditions, api.NodeCondition{
 				Type:               api.NodeReady,
@@ -740,14 +760,14 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
 			})
 		} else {
 			glog.V(4).Infof("node %v hasn't been updated for %+v. Last ready condition is: %+v",
-				node.Name, nc.now().Time.Sub(savedNodeStatus.probeTimestamp.Time), lastReadyCondition)
-			if lastReadyCondition.Status != api.ConditionUnknown {
-				readyCondition.Status = api.ConditionUnknown
-				readyCondition.Reason = "NodeStatusUnknown"
-				readyCondition.Message = fmt.Sprintf("Kubelet stopped posting node status.")
+				node.Name, nc.now().Time.Sub(savedNodeStatus.probeTimestamp.Time), observedReadyCondition)
+			if observedReadyCondition.Status != api.ConditionUnknown {
+				currentReadyCondition.Status = api.ConditionUnknown
+				currentReadyCondition.Reason = "NodeStatusUnknown"
+				currentReadyCondition.Message = fmt.Sprintf("Kubelet stopped posting node status.")
 				// LastProbeTime is the last time we heard from kubelet.
-				readyCondition.LastHeartbeatTime = lastReadyCondition.LastHeartbeatTime
-				readyCondition.LastTransitionTime = nc.now()
+				currentReadyCondition.LastHeartbeatTime = observedReadyCondition.LastHeartbeatTime
+				currentReadyCondition.LastTransitionTime = nc.now()
 			}
 		}
 
@@ -776,27 +796,41 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap
 			}
 		}
 
-		if !api.Semantic.DeepEqual(nc.getCondition(&node.Status, api.NodeReady), &lastReadyCondition) {
+		if !api.Semantic.DeepEqual(nc.getCondition(&node.Status, api.NodeReady), &observedReadyCondition) {
 			if _, err = nc.kubeClient.Core().Nodes().UpdateStatus(node); err != nil {
 				glog.Errorf("Error updating node %s: %v", node.Name, err)
-				return gracePeriod, lastReadyCondition, readyCondition, err
+				return gracePeriod, observedReadyCondition, currentReadyCondition, err
 			} else {
 				nc.nodeStatusMap[node.Name] = nodeStatusData{
 					status:                   node.Status,
 					probeTimestamp:           nc.nodeStatusMap[node.Name].probeTimestamp,
 					readyTransitionTimestamp: nc.now(),
 				}
-				return gracePeriod, lastReadyCondition, readyCondition, nil
+				return gracePeriod, observedReadyCondition, currentReadyCondition, nil
 			}
 		}
 	}
 
-	return gracePeriod, lastReadyCondition, readyCondition, err
+	return gracePeriod, observedReadyCondition, currentReadyCondition, err
+}
+
+// forceUpdateAllProbeTimes bumps the probe and ready-transition timestamps of every saved node status
+// to now, which resets all eviction timers.
+func (nc *NodeController) forceUpdateAllProbeTimes() {
+	now := nc.now()
+	for k, v := range nc.nodeStatusMap {
+		v.probeTimestamp = now
+		v.readyTransitionTimestamp = now
+		nc.nodeStatusMap[k] = v
+	}
 }
 
 // evictPods queues an eviction for the provided node name, and returns false if the node is already
 // queued for eviction.
 func (nc *NodeController) evictPods(nodeName string) bool {
+	if nc.networkSegmentationMode {
+		return false
+	}
 	nc.evictorLock.Lock()
 	defer nc.evictorLock.Unlock()
 	return nc.podEvictor.Add(nodeName)
@@ -816,6 +850,15 @@ func (nc *NodeController) cancelPodEviction(nodeName string) bool {
 	return false
 }
 
+// stopAllPodEvictions removes any queued evictions for all Nodes.
+func (nc *NodeController) stopAllPodEvictions() {
+	nc.evictorLock.Lock()
+	defer nc.evictorLock.Unlock()
+	glog.V(3).Infof("Cancelling all pod evictions.")
+	nc.podEvictor.Clear()
+	nc.terminationEvictor.Clear()
+}
+
 // deletePods will delete all pods from master running on given node, and return true
 // if any pods were deleted.
 func (nc *NodeController) deletePods(nodeName string) (bool, error) {