This repository was archived by the owner on Apr 17, 2019. It is now read-only.
Merged
41 changes: 33 additions & 8 deletions cluster-autoscaler/cluster_autoscaler.go
@@ -95,14 +95,15 @@ var (
"Node utilization level, defined as sum of requested resources divided by capacity, below which a node can be considered for scale down")
scaleDownTrialInterval = flag.Duration("scale-down-trial-interval", 1*time.Minute,
"How often scale down possiblity is check")
scanInterval = flag.Duration("scan-interval", 10*time.Second, "How often cluster is reevaluated for scale up or down")
maxNodesTotal = flag.Int("max-nodes-total", 0, "Maximum number of nodes in all node groups. Cluster autoscaler will not grow the cluster beyond this number.")
cloudProviderFlag = flag.String("cloud-provider", "gce", "Cloud provider type. Allowed values: gce, aws")
maxEmptyBulkDeleteFlag = flag.Int("max-empty-bulk-delete", 10, "Maximum number of empty nodes that can be deleted at the same time.")
maxGratefulTerminationFlag  = flag.Int("max-grateful-termination-sec", 60, "Maximum number of seconds CA waits for pod termination when trying to scale down a node.")
maxTotalUnreadyPercentage = flag.Float64("max-total-unready-percentage", 33, "Maximum percentage of unready nodes after which CA halts operations")
okTotalUnreadyCount = flag.Int("ok-total-unready-count", 3, "Number of allowed unready nodes, irrespective of max-total-unready-percentage")
maxNodeProvisionTime = flag.Duration("max-node-provision-time", 15*time.Minute, "Maximum time CA waits for node to be provisioned")
unregisteredNodeRemovalTime = flag.Duration("unregistered-node-removal-time", 5*time.Minute, "Time that CA waits before removing nodes that are not registered in Kubernetes")

// AvailableEstimators is a list of available estimators.
AvailableEstimators = []string{BasicEstimatorName, BinpackingEstimatorName}
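
For illustration, the new option would be passed on the command line like the other duration flags above (a hypothetical invocation; the binary name and the other flag values are just examples):

./cluster-autoscaler --cloud-provider=gce --scan-interval=10s --max-node-provision-time=15m --unregistered-node-removal-time=5m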
@@ -234,6 +235,7 @@ func run(_ <-chan struct{}) {
ExpanderStrategy: expanderStrategy,
MaxGratefulTerminationSec: *maxGratefulTerminationFlag,
MaxNodeProvisionTime: *maxNodeProvisionTime,
UnregisteredNodeRemovalTime: *unregisteredNodeRemovalTime,
}

scaleDown := NewScaleDown(&autoscalingContext)
@@ -273,6 +275,29 @@ func run(_ <-chan struct{}) {
continue
}

// Check if there are any nodes that failed to register in the Kubernetes
// master.
unregisteredNodes := autoscalingContext.ClusterStateRegistry.GetUnregisteredNodes()
if len(unregisteredNodes) > 0 {
glog.V(1).Infof("%d unregistered nodes present", len(unregisteredNodes))
removedAny, err := removeOldUnregisteredNodes(unregisteredNodes, &autoscalingContext, time.Now())
// There was a problem with removing unregistered nodes. Retry in the next loop.
if err != nil {
Contributor: You are not handling correctly the case when some nodes were removed but an error was observed afterwards: "Some unregistered nodes were removed" should be printed before printing the error.

Contributor Author: Do you mean the error message itself? Yeah - we can have 2 different ones here.

Contributor Author: Done.
if removedAny {
glog.Warningf("Some unregistered nodes were removed, but got error: %v", err)
} else {
glog.Warningf("Failed to remove unregistered nodes: %v", err)

}
continue
}
// Some nodes were removed. Let's skip this iteration, the next one should be better.
if removedAny {
glog.V(0).Infof("Some unregistered nodes were removed, skipping iteration")
continue
}
}

// TODO: remove once all of the unready node handling elements are in place.
if err := CheckGroupsAndNodes(readyNodes, autoscalingContext.CloudProvider); err != nil {
glog.Warningf("Cluster is not ready for autoscaling: %v", err)
25 changes: 25 additions & 0 deletions cluster-autoscaler/utils.go
@@ -70,6 +70,8 @@ type AutoscalingContext struct {
MaxGratefulTerminationSec int
// Maximum time that CA waits for a node to be provisioned. This is cloud provider specific.
MaxNodeProvisionTime time.Duration
// Time that CA waits before starting to remove nodes that exist in cloud provider but not in Kubernetes.
UnregisteredNodeRemovalTime time.Duration
}

// GetAllNodesAvailableTime returns time when the newest node became available for scheduler.
@@ -219,3 +221,26 @@ func GetNodeInfosForGroups(nodes []*apiv1.Node, cloudProvider cloudprovider.Clou
}
return result, nil
}

// removeOldUnregisteredNodes removes unregistered nodes if they have been unregistered for longer
// than the configured removal time. Returns true if anything was removed and an error if one occurred.
func removeOldUnregisteredNodes(unregisteredNodes []clusterstate.UnregisteredNode, context *AutoscalingContext,
currentTime time.Time) (bool, error) {
removedAny := false
for _, unregisteredNode := range unregisteredNodes {
if unregisteredNode.UnregisteredSince.Add(context.UnregisteredNodeRemovalTime).Before(currentTime) {
glog.V(0).Infof("Removing unregistered node %v", unregisteredNode.Node.Name)
nodeGroup, err := context.CloudProvider.NodeGroupForNode(unregisteredNode.Node)
if err != nil {
glog.Warningf("Failed to get node group for %s: %v", unregisteredNode.Node.Name, err)
return removedAny, err
}
err = nodeGroup.DeleteNodes([]*apiv1.Node{unregisteredNode.Node})
if err != nil {
glog.Warningf("Failed to remove node %s: %v", unregisteredNode.Node.Name, err)
return removedAny, err
}
removedAny = true
}
}
return removedAny, nil
}
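
For context, the clusterstate.UnregisteredNode type consumed above is assumed to look roughly like the sketch below. The function only relies on the Node and UnregisteredSince fields; the exact definition lives in the clusterstate package, and the import path is an assumption:

package clusterstate

import (
	"time"

	apiv1 "k8s.io/kubernetes/pkg/api/v1" // assumed import path for this era of the codebase
)

// UnregisteredNode is a node that exists in the cloud provider but has not
// registered with the Kubernetes master.
type UnregisteredNode struct {
	// Node holds the expected node object (name, provider ID).
	Node *apiv1.Node
	// UnregisteredSince is when the node was first noticed as unregistered.
	UnregisteredSince time.Time
}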
49 changes: 49 additions & 0 deletions cluster-autoscaler/utils_test.go
@@ -17,8 +17,12 @@ limitations under the License.
package main

import (
"fmt"
"testing"
"time"

"k8s.io/contrib/cluster-autoscaler/cloudprovider/test"
"k8s.io/contrib/cluster-autoscaler/clusterstate"
"k8s.io/contrib/cluster-autoscaler/simulator"
. "k8s.io/contrib/cluster-autoscaler/utils/test"

@@ -51,3 +55,48 @@ func TestFilterOutSchedulable(t *testing.T) {
assert.Equal(t, p1, res2[0])
assert.Equal(t, p2, res2[1])
}

func TestRemoveOldUnregisteredNodes(t *testing.T) {
deletedNodes := make(chan string, 10)

now := time.Now()

ng1_1 := BuildTestNode("ng1-1", 1000, 1000)
ng1_1.Spec.ProviderID = "ng1-1"
ng1_2 := BuildTestNode("ng1-2", 1000, 1000)
ng1_2.Spec.ProviderID = "ng1-2"
provider := testprovider.NewTestCloudProvider(nil, func(nodegroup string, node string) error {
deletedNodes <- fmt.Sprintf("%s/%s", nodegroup, node)
return nil
})
provider.AddNodeGroup("ng1", 1, 10, 2)
provider.AddNode("ng1", ng1_1)
provider.AddNode("ng1", ng1_2)

clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{
MaxTotalUnreadyPercentage: 10,
OkTotalUnreadyCount: 1,
})
err := clusterState.UpdateNodes([]*apiv1.Node{ng1_1}, now.Add(-time.Hour))
assert.NoError(t, err)

context := &AutoscalingContext{
CloudProvider: provider,
ClusterStateRegistry: clusterState,
UnregisteredNodeRemovalTime: 45 * time.Minute,
}
unregisteredNodes := clusterState.GetUnregisteredNodes()
assert.Equal(t, 1, len(unregisteredNodes))

// Nothing should be removed. The unregistered node is not old enough.
removed, err := removeOldUnregisteredNodes(unregisteredNodes, context, now.Add(-50*time.Minute))
assert.NoError(t, err)
assert.False(t, removed)

// ng1_2 should be removed.
removed, err = removeOldUnregisteredNodes(unregisteredNodes, context, now)
assert.NoError(t, err)
assert.True(t, removed)
deletedNode := getStringFromChan(deletedNodes)
assert.Equal(t, "ng1/ng1-2", deletedNode)
}
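
The getStringFromChan helper used above is not part of this diff; a minimal sketch of what it is assumed to do (read one value from the channel, with a timeout so a failing test does not block forever):

func getStringFromChan(c chan string) string {
	select {
	case val := <-c:
		return val
	case <-time.After(10 * time.Second):
		// Give up rather than hang the test if nothing was sent.
		return "Nothing returned"
	}
}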
1 change: 1 addition & 0 deletions hack/verify-flags/known-flags.txt
@@ -145,6 +145,7 @@ triager-url
ttl-secs
udp-services-configmap
unit-status-context
unregistered-node-removal-time
url-list
use-cluster-credentials
use-ip