
[hpa] Parameterize tolerance, downscale, and upscale into HPAController, and add corresponding unit test for backsolved tolerance. #18315

Merged 1 commit on Dec 12, 2015
6 changes: 5 additions & 1 deletion cmd/kube-controller-manager/app/controllermanager.go
@@ -362,7 +362,11 @@ func (s *CMServer) Run(_ []string) error {
metrics.DefaultHeapsterService,
metrics.DefaultHeapsterPort,
)
podautoscaler.NewHorizontalController(hpaClient, metricsClient).
// TODO parameterize tolerance/downscale/upscale options.
tolerance := 1.0
Member Author
@wojtek-t @jszczepkowski this is the culprit. I made the default tolerance way too high (off by a decimal)!
I will fix and re-validate this manually. Which specific autoscale tests did it break? Just for good measure I'll paste the results in the PR...
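For context, a minimal sketch (hypothetical numbers, not part of the diff) of why tolerance = 1.0 effectively disables scaling: since usageRatio is never negative, |1 - usageRatio| > 1.0 can only hold when usage exceeds twice the target.

package main

import (
	"fmt"
	"math"
)

func main() {
	// With tolerance = 1.0, |1 - usageRatio| > tolerance requires
	// usageRatio > 2.0 (it can never be negative), so downscaling can
	// never trigger and upscaling needs utilization above 2x the target.
	usageRatio := 1.9 // 190% of target: still no rescale at tolerance 1.0
	fmt.Println(math.Abs(1.0-usageRatio) > 1.0) // false
}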

downScale := 5 * time.Second
upScale := 3 * time.Second
podautoscaler.NewHorizontalController(hpaClient, metricsClient, tolerance, downScale, upScale).
Run(s.HorizontalPodAutoscalerSyncPeriod)
}

48 changes: 29 additions & 19 deletions pkg/controller/podautoscaler/horizontal.go
@@ -31,30 +31,40 @@ import (
"k8s.io/kubernetes/pkg/util"
)

const (
// Usage should exceed the tolerance before we start downscaling or upscaling the pods.
// TODO: make it a flag or HPA spec element.
tolerance = 0.1
)

type HorizontalController struct {
client client.Interface
metricsClient metrics.MetricsClient
eventRecorder record.EventRecorder
client client.Interface
metricsClient metrics.MetricsClient
eventRecorder record.EventRecorder
tolerance float64
downscaleForbiddenWindow time.Duration
upscaleForbiddenWindow time.Duration
}

var downscaleForbiddenWindow = 5 * time.Minute
var upscaleForbiddenWindow = 3 * time.Minute

func NewHorizontalController(client client.Interface, metricsClient metrics.MetricsClient) *HorizontalController {
func NewHorizontalController(client client.Interface, metricsClient metrics.MetricsClient, tol float64, dScale, uScale time.Duration) *HorizontalController {
broadcaster := record.NewBroadcaster()
broadcaster.StartRecordingToSink(client.Events(""))
recorder := broadcaster.NewRecorder(api.EventSource{Component: "horizontal-pod-autoscaler"})

if tol < 0 || tol > 1 {
glog.Warningf("Invalid tolerance %v provided; using default.", tol)
tol = .1
}
if uScale == 0 {
glog.Warningf("Invalid upscale window %v provided; using default.", uScale)
uScale = 3 * time.Minute
}
if dScale == 0 {
glog.Warningf("Invalid downscale window %v provided; using default.", dScale)
dScale = 5 * time.Minute
}
glog.V(2).Infof("Created Horizontal Controller with downscale %v, upscale %v, and tolerance %v", dScale, uScale, tol)
return &HorizontalController{
client: client,
metricsClient: metricsClient,
eventRecorder: recorder,
client: client,
metricsClient: metricsClient,
eventRecorder: recorder,
tolerance: tol,
downscaleForbiddenWindow: dScale,
upscaleForbiddenWindow: uScale,
}
}
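As a usage sketch (hypothetical helper and values, not part of the diff), out-of-range parameters fall back to the previous hard-coded defaults:

package main

import (
	"fmt"
	"time"
)

// clampHPAParams mirrors the constructor's fallback logic
// (a hypothetical standalone helper for illustration).
func clampHPAParams(tol float64, dScale, uScale time.Duration) (float64, time.Duration, time.Duration) {
	if tol < 0 || tol > 1 {
		tol = 0.1 // default tolerance
	}
	if dScale == 0 {
		dScale = 5 * time.Minute // default downscale forbidden window
	}
	if uScale == 0 {
		uScale = 3 * time.Minute // default upscale forbidden window
	}
	return tol, dScale, uScale
}

func main() {
	tol, d, u := clampHPAParams(1.5, 0, 0) // all invalid
	fmt.Println(tol, d, u)                 // 0.1 5m0s 3m0s
}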

@@ -83,7 +93,7 @@ func (a *HorizontalController) computeReplicasForCPUUtilization(hpa extensions.H
}

usageRatio := float64(*currentUtilization) / float64(hpa.Spec.CPUUtilization.TargetPercentage)
if math.Abs(1.0-usageRatio) > tolerance {
if math.Abs(1.0-usageRatio) > a.tolerance {
return int(math.Ceil(usageRatio * float64(currentReplicas))), currentUtilization, timestamp, nil
} else {
return currentReplicas, currentUtilization, timestamp, nil
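To make the tolerance gate concrete, a small worked sketch with hypothetical numbers (not part of the diff):

package main

import (
	"fmt"
	"math"
)

func main() {
	// Hypothetical: pods run at 90% CPU against a 50% target.
	usageRatio := 90.0 / 50.0 // 1.8
	tolerance := 0.1
	currentReplicas := 4
	if math.Abs(1.0-usageRatio) > tolerance { // |1 - 1.8| = 0.8 > 0.1
		desired := int(math.Ceil(usageRatio * float64(currentReplicas)))
		fmt.Println(desired) // ceil(1.8 * 4) = 8
	}
}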
@@ -125,15 +135,15 @@ func (a *HorizontalController) reconcileAutoscaler(hpa extensions.HorizontalPodA
// and there was no rescaling in the last downscaleForbiddenWindow.
if desiredReplicas < currentReplicas &&
(hpa.Status.LastScaleTime == nil ||
hpa.Status.LastScaleTime.Add(downscaleForbiddenWindow).Before(timestamp)) {
hpa.Status.LastScaleTime.Add(a.downscaleForbiddenWindow).Before(timestamp)) {
rescale = true
}

// Going up only if the usage ratio increased significantly above the target
// and there was no rescaling in the last upscaleForbiddenWindow.
if desiredReplicas > currentReplicas &&
(hpa.Status.LastScaleTime == nil ||
hpa.Status.LastScaleTime.Add(upscaleForbiddenWindow).Before(timestamp)) {
hpa.Status.LastScaleTime.Add(a.upscaleForbiddenWindow).Before(timestamp)) {
rescale = true
}
}
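For intuition, a minimal sketch of the forbidden-window check with hypothetical timestamps (not part of the diff):

package main

import (
	"fmt"
	"time"
)

func main() {
	downscaleForbiddenWindow := 5 * time.Minute
	now := time.Now()
	lastScale := now.Add(-4 * time.Minute) // last rescale was 4 minutes ago

	// Mirrors hpa.Status.LastScaleTime.Add(window).Before(timestamp):
	// only 4m have passed, so a downscale is still forbidden.
	fmt.Println(lastScale.Add(downscaleForbiddenWindow).Before(now)) // false
}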
72 changes: 69 additions & 3 deletions pkg/controller/podautoscaler/horizontal_test.go
@@ -20,6 +20,7 @@ import (
"encoding/json"
"fmt"
"io"
"math"
"testing"
"time"

@@ -32,9 +33,14 @@ import (
"k8s.io/kubernetes/pkg/controller/podautoscaler/metrics"
"k8s.io/kubernetes/pkg/runtime"

glog "github.com/golang/glog"
"github.com/stretchr/testify/assert"
heapster "k8s.io/heapster/api/v1/types"
)

"github.com/stretchr/testify/assert"
// The unit tests need to know the tolerance to calibrate their expected scaling results.
const (
tolerance = .1
)

func (w fakeResponseWrapper) DoRaw() ([]byte, error) {
@@ -206,7 +212,7 @@ func (tc *testCase) verifyResults(t *testing.T) {
func (tc *testCase) runTest(t *testing.T) {
testClient := tc.prepareTestClient(t)
metricsClient := metrics.NewHeapsterMetricsClient(testClient, metrics.DefaultHeapsterNamespace, metrics.DefaultHeapsterScheme, metrics.DefaultHeapsterService, metrics.DefaultHeapsterPort)
hpaController := NewHorizontalController(testClient, metricsClient)
hpaController := NewHorizontalController(testClient, metricsClient, tolerance, time.Second, time.Second)
err := hpaController.reconcileAutoscalers()
assert.Equal(t, nil, err)
if tc.verifyEvents {
@@ -360,4 +366,64 @@ func TestEventNotCreated(t *testing.T) {
tc.runTest(t)
}

// TODO: add more tests
// TestComputedToleranceAlgImplementation is a regression test that
// back-solves for the minimal CPU target which, given a small shift in pod
// utilization, just breaches the tolerance and triggers a downscale.
func TestComputedToleranceAlgImplementation(t *testing.T) {

startPods := 10
// 150 mCPU per pod.
totalUsedCPUOfAllPods := uint64(startPods * 150)
// Each pod starts out asking for 2X what is really needed.
// This means we will have a 50% ratio of used/requested
totalRequestedCPUOfAllPods := 2 * totalUsedCPUOfAllPods
requestedToUsed := float64(totalRequestedCPUOfAllPods / totalUsedCPUOfAllPods)
// Spread the amount we ask for over 10 pods; we add some jitter below in reportedCPURequests.
perPodRequested := int(totalRequestedCPUOfAllPods) / startPods

// Force a minimal scaling event by satisfying (tolerance < 1 - resourcesUsedRatio).
target := math.Abs(1/(requestedToUsed*(1-tolerance))) + .01
Member Author
@jszczepkowski 1-tolerance works as well; it just changes the sign, which is a no-op since we're taking the absolute value. I've updated it from tolerance-1 to 1-tolerance here anyway. (I like your idea; it makes the equation a little more readable, and is probably more accurate too.)

finalCpuPercentTarget := int(target * 100)
resourcesUsedRatio := float64(totalUsedCPUOfAllPods) / (float64(totalRequestedCPUOfAllPods) * target)
// The autoscaler will compare the resulting usageRatio against the tolerance; calculate
// the difference that will be compared with it.
usageRatioToleranceValue := 1 - resourcesUsedRatio
// i.e. ceil(0.88 * 10) -> scaled-down expectation of 9 pods.
finalPods := math.Ceil(resourcesUsedRatio * float64(startPods))

glog.Infof("To breach tolerance %f, we will create a utilization ratio difference of %f", tolerance, usageRatioToleranceValue)
tc := testCase{
minReplicas: 0,
maxReplicas: 1000,
initialReplicas: startPods,
desiredReplicas: int(finalPods),
CPUTarget: finalCpuPercentTarget,
reportedLevels: []uint64{
totalUsedCPUOfAllPods / 10,
totalUsedCPUOfAllPods / 10,
totalUsedCPUOfAllPods / 10,
totalUsedCPUOfAllPods / 10,
totalUsedCPUOfAllPods / 10,
totalUsedCPUOfAllPods / 10,
totalUsedCPUOfAllPods / 10,
totalUsedCPUOfAllPods / 10,
totalUsedCPUOfAllPods / 10,
totalUsedCPUOfAllPods / 10,
},
reportedCPURequests: []resource.Quantity{
resource.MustParse(fmt.Sprint(perPodRequested+100) + "m"),
resource.MustParse(fmt.Sprint(perPodRequested-100) + "m"),
resource.MustParse(fmt.Sprint(perPodRequested+10) + "m"),
resource.MustParse(fmt.Sprint(perPodRequested-10) + "m"),
resource.MustParse(fmt.Sprint(perPodRequested+2) + "m"),
resource.MustParse(fmt.Sprint(perPodRequested-2) + "m"),
resource.MustParse(fmt.Sprint(perPodRequested+1) + "m"),
resource.MustParse(fmt.Sprint(perPodRequested-1) + "m"),
resource.MustParse(fmt.Sprint(perPodRequested) + "m"),
resource.MustParse(fmt.Sprint(perPodRequested) + "m"),
},
}
tc.runTest(t)
}
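For reference, a back-of-the-envelope walk-through of the numbers this test back-solves (a sketch assuming tolerance = 0.1; not part of the diff):

package main

import (
	"fmt"
	"math"
)

func main() {
	const tolerance = 0.1
	requestedToUsed := 2.0 // pods request 2x what they use, i.e. 50% utilization

	// Smallest target that still breaches the tolerance, plus a 1% margin:
	// 1/(2*0.9) + 0.01 ~= 0.5656, so CPUTarget becomes 56%.
	target := math.Abs(1/(requestedToUsed*(1-tolerance))) + .01
	fmt.Printf("CPUTarget: %d%%\n", int(target*100))

	// Observed utilization is 50%, so usageRatio ~= 0.50/0.5656 ~= 0.884
	// and |1 - 0.884| = 0.116 > 0.1: the gate fires.
	usageRatio := 0.50 / target
	fmt.Println(math.Abs(1-usageRatio) > tolerance) // true

	// Desired replicas: ceil(0.884 * 10) = 9, matching finalPods.
	fmt.Println(math.Ceil(usageRatio * 10)) // 9
}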

// TODO: add more tests, e.g., enforcement of the upscale/downscale window.