Support specifying custom LB retry period from cloud provider #94021
@@ -0,0 +1,46 @@ (new file)
/*
Copyright 2023 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package api

import (
	"time"
)

// RetryError indicates that a service reconciliation should be retried after a
// fixed duration (as opposed to backing off exponentially).
type RetryError struct {
	msg        string
	retryAfter time.Duration
}

// NewRetryError returns a RetryError.
func NewRetryError(msg string, retryAfter time.Duration) *RetryError {
	return &RetryError{
		msg:        msg,
		retryAfter: retryAfter,
	}
}

// Error shows the details of the retry reason.
func (re *RetryError) Error() string {
	return re.msg
}

// RetryAfter returns the defined retry-after duration.
func (re *RetryError) RetryAfter() time.Duration {
	return re.retryAfter
}
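For illustration only (not part of this PR), here is a minimal sketch of how calling code could construct a RetryError, wrap it, and later detect it with errors.As to honor the requested delay:

package main

import (
	"errors"
	"fmt"
	"time"

	"k8s.io/cloud-provider/api"
)

func main() {
	// A provider signals that reconciliation should be retried in 30s.
	// Wrapping is fine: errors.As walks the unwrap chain.
	err := fmt.Errorf("ensuring load balancer: %w",
		api.NewRetryError("LB create in progress", 30*time.Second))

	var re *api.RetryError
	if errors.As(err, &re) {
		fmt.Printf("will retry (%s) after %s\n", re.Error(), re.RetryAfter())
	}
}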
@@ -131,19 +131,25 @@ func GetInstanceProviderID(ctx context.Context, cloud Interface, nodeName types.NodeName) (string, error) {
 // irrespective of the ImplementedElsewhere error. Additional finalizers for
 // LB services must be managed in the alternate implementation.
 type LoadBalancer interface {
-	// TODO: Break this up into different interfaces (LB, etc) when we have more than one type of service
 	// GetLoadBalancer returns whether the specified load balancer exists, and
 	// if so, what its status is.
 	// Implementations must treat the *v1.Service parameter as read-only and not modify it.
-	// Parameter 'clusterName' is the name of the cluster as presented to kube-controller-manager
+	// Parameter 'clusterName' is the name of the cluster as presented to kube-controller-manager.
+	// TODO: Break this up into different interfaces (LB, etc) when we have more than one type of service
 	GetLoadBalancer(ctx context.Context, clusterName string, service *v1.Service) (status *v1.LoadBalancerStatus, exists bool, err error)
 	// GetLoadBalancerName returns the name of the load balancer. Implementations must treat the
 	// *v1.Service parameter as read-only and not modify it.
 	GetLoadBalancerName(ctx context.Context, clusterName string, service *v1.Service) string
 	// EnsureLoadBalancer creates a new load balancer 'name', or updates the existing one. Returns the status of the balancer
 	// Implementations must treat the *v1.Service and *v1.Node
 	// parameters as read-only and not modify them.
-	// Parameter 'clusterName' is the name of the cluster as presented to kube-controller-manager
+	// Parameter 'clusterName' is the name of the cluster as presented to kube-controller-manager.
+	//
+	// Implementations may return a (possibly wrapped) api.RetryError to enforce
+	// backing off at a fixed duration. This can be used for cases like when the
+	// load balancer is not ready yet (e.g., it is still being provisioned) and
+	// polling at a fixed rate is preferred over backing off exponentially in
+	// order to minimize latency.
 	EnsureLoadBalancer(ctx context.Context, clusterName string, service *v1.Service, nodes []*v1.Node) (*v1.LoadBalancerStatus, error)
 	// UpdateLoadBalancer updates hosts under the specified load balancer.
 	// Implementations must treat the *v1.Service and *v1.Node

Note to reviewers: Moved the TODO comment.
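To make the new contract concrete, here is a sketch of a hypothetical provider honoring it; myCloud, lbClient, and lbState are invented stand-ins for a real provider SDK, not types from this PR:

package mycloud

import (
	"context"
	"time"

	v1 "k8s.io/api/core/v1"
	"k8s.io/cloud-provider/api"
)

// myCloud and lbClient are hypothetical stand-ins for a real provider SDK.
type myCloud struct {
	client lbClient
}

type lbClient interface {
	EnsureLB(ctx context.Context, clusterName string, service *v1.Service, nodes []*v1.Node) (*lbState, error)
}

type lbState struct {
	Provisioning bool
	IP           string
}

// EnsureLoadBalancer returns an api.RetryError while the load balancer is
// still being provisioned, so the service controller polls at a fixed 10s
// interval instead of backing off exponentially.
func (c *myCloud) EnsureLoadBalancer(ctx context.Context, clusterName string, service *v1.Service, nodes []*v1.Node) (*v1.LoadBalancerStatus, error) {
	lb, err := c.client.EnsureLB(ctx, clusterName, service, nodes)
	if err != nil {
		return nil, err // regular errors keep the default exponential backoff
	}
	if lb.Provisioning {
		return nil, api.NewRetryError("load balancer is still provisioning", 10*time.Second)
	}
	return &v1.LoadBalancerStatus{
		Ingress: []v1.LoadBalancerIngress{{IP: lb.IP}},
	}, nil
}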
@@ -41,9 +41,12 @@ import (
 	"k8s.io/client-go/kubernetes/fake"
 	"k8s.io/client-go/kubernetes/scheme"
 	v1core "k8s.io/client-go/kubernetes/typed/core/v1"
+	corelisters "k8s.io/client-go/listers/core/v1"
 	core "k8s.io/client-go/testing"
 	"k8s.io/client-go/tools/cache"
 	"k8s.io/client-go/tools/record"
+	"k8s.io/client-go/util/workqueue"
+	"k8s.io/cloud-provider/api"
 	fakecloud "k8s.io/cloud-provider/fake"
 	servicehelper "k8s.io/cloud-provider/service/helpers"
 	featuregatetesting "k8s.io/component-base/featuregate/testing"

@@ -1093,22 +1096,24 @@ func TestSyncService(t *testing.T) {
 	}

 	for _, tc := range testCases {
-		ctx, cancel := context.WithCancel(context.Background())
-		defer cancel()
+		t.Run(tc.testName, func(t *testing.T) {
+			ctx, cancel := context.WithCancel(context.Background())
+			defer cancel()

-		tc.updateFn()
-		obtainedErr := controller.syncService(ctx, tc.key)
+			tc.updateFn()
+			obtainedErr := controller.syncService(ctx, tc.key)

-		//expected matches obtained ??.
-		if exp := tc.expectedFn(obtainedErr); exp != nil {
-			t.Errorf("%v Error:%v", tc.testName, exp)
-		}
+			//expected matches obtained ??.
+			if exp := tc.expectedFn(obtainedErr); exp != nil {
+				t.Errorf("%v Error:%v", tc.testName, exp)
+			}

-		//Post processing, the element should not be in the sync queue.
-		_, exist := controller.cache.get(tc.key)
-		if exist {
-			t.Fatalf("%v working Queue should be empty, but contains %s", tc.testName, tc.key)
-		}
+			//Post processing, the element should not be in the sync queue.
+			_, exist := controller.cache.get(tc.key)
+			if exist {
+				t.Fatalf("%v working Queue should be empty, but contains %s", tc.testName, tc.key)
+			}
+		})
 	}
 }

Note to reviewers: drive-by refactoring wrapping each test case in a t.Run subtest.

@@ -2253,6 +2258,87 @@ func Test_shouldSyncUpdatedNode_compoundedPredicates(t *testing.T) {
 	}
 }

+func TestServiceQueueDelay(t *testing.T) {
+	const ns = metav1.NamespaceDefault
+
+	tests := []struct {
+		name           string
+		lbCloudErr     error
+		wantRetryDelay time.Duration
+	}{
+		{
+			name:       "processing successful",
+			lbCloudErr: nil,
+		},
+		{
+			name:       "regular error",
+			lbCloudErr: errors.New("something went wrong"),
+		},
+		{
+			name:           "retry error",
+			lbCloudErr:     api.NewRetryError("LB create in progress", 42*time.Second),
+			wantRetryDelay: 42 * time.Second,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			controller, cloud, client := newController()
+			queue := &spyWorkQueue{RateLimitingInterface: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "test-service-queue-delay")}
+			controller.serviceQueue = queue
+			cloud.Err = tc.lbCloudErr
+
+			serviceCache := cache.NewIndexer(cache.MetaNamespaceKeyFunc, cache.Indexers{cache.NamespaceIndex: cache.MetaNamespaceIndexFunc})
+			controller.serviceLister = corelisters.NewServiceLister(serviceCache)
+
+			svc := defaultExternalService()
+			if err := serviceCache.Add(svc); err != nil {
+				t.Fatalf("adding service %s to cache: %s", svc.Name, err)
+			}
+
+			ctx := context.Background()
+			_, err := client.CoreV1().Services(ns).Create(ctx, svc, metav1.CreateOptions{})
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			key, err := cache.MetaNamespaceKeyFunc(svc)
+			if err != nil {
+				t.Fatalf("creating meta namespace key: %s", err)
+			}
+			queue.Add(key)
+
+			done := controller.processNextServiceItem(ctx)
+			if !done {
+				t.Fatal("processNextServiceItem stopped prematurely")
+			}
+
+			// Expect no requeues unless we hit an error that is not a retry
+			// error.
+			wantNumRequeues := 0
+			var re *api.RetryError
+			isRetryError := errors.As(tc.lbCloudErr, &re)
+			if tc.lbCloudErr != nil && !isRetryError {
+				wantNumRequeues = 1
+			}
+
+			if gotNumRequeues := queue.NumRequeues(key); gotNumRequeues != wantNumRequeues {
+				t.Fatalf("got %d requeue(s), want %d", gotNumRequeues, wantNumRequeues)
+			}
+
+			if tc.wantRetryDelay > 0 {
+				items := queue.getItems()
+				if len(items) != 1 {
+					t.Fatalf("got %d item(s), want 1", len(items))
+				}
+				if gotDelay := items[0].Delay; gotDelay != tc.wantRetryDelay {
+					t.Fatalf("got delay %s, want %s", gotDelay, tc.wantRetryDelay)
+				}
+			}
+		})
+	}
+}
+
 type fakeNodeLister struct {
 	cache []*v1.Node
 	err   error

@@ -2281,3 +2367,33 @@ func (l *fakeNodeLister) Get(name string) (*v1.Node, error) {
 	}
 	return nil, nil
 }
+
+// spyWorkQueue implements a work queue and adds the ability to inspect processed
+// items for testing purposes.
+type spyWorkQueue struct {
+	workqueue.RateLimitingInterface
+	items []spyQueueItem
+}
+
+// spyQueueItem represents an item that was being processed.
+type spyQueueItem struct {
+	Key interface{}
+	// Delay represents the delayed duration if and only if AddAfter was invoked.
+	Delay time.Duration
+}
+
+// AddAfter is like workqueue.RateLimitingInterface.AddAfter but records the
+// added key and delay internally.
+func (f *spyWorkQueue) AddAfter(key interface{}, delay time.Duration) {
+	f.items = append(f.items, spyQueueItem{
+		Key:   key,
+		Delay: delay,
+	})
+
+	f.RateLimitingInterface.AddAfter(key, delay)
+}
+
+// getItems returns all items that were recorded.
+func (f *spyWorkQueue) getItems() []spyQueueItem {
+	return f.items
+}
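The test above pins down the controller-side behavior this PR relies on: a RetryError must not go through the rate limiter (NumRequeues stays 0) and is instead re-enqueued via AddAfter with the fixed duration. A simplified sketch of that dispatch logic (not the controller's literal code, and using a hypothetical handleErr helper; it relies only on the errors, workqueue, and api imports added above):

// handleErr is a hypothetical helper illustrating the dispatch the test
// expects; the real logic lives in the service controller's worker loop.
func handleErr(queue workqueue.RateLimitingInterface, key interface{}, err error) {
	if err == nil {
		queue.Forget(key) // success: reset any backoff state
		return
	}
	var re *api.RetryError
	if errors.As(err, &re) {
		// Fixed-interval retry requested by the cloud provider: bypass
		// the rate limiter, so NumRequeues stays at 0.
		queue.AddAfter(key, re.RetryAfter())
		return
	}
	// Any other error: exponential backoff via the rate limiter.
	queue.AddRateLimited(key)
}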
Reviewer: What should be the interpretation of 0?

Author: There'd be no special interpretation. Instead, the retry would be immediate (see also where the value is used). The need to retry right away may be uncommon or even rare, but I personally wouldn't want to disallow it. Maybe a user's network is very slow, or there are already some natural, drive-by delays that don't warrant another extra wait on the client side? I think a zero delay can be legitimate, but let me know if you think differently.

Reviewer: I think it's ok, I just wanted to double-check that we all have the same interpretation.
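For the zero case discussed here, a small runnable sketch: client-go's delaying queue adds a key immediately when the delay is not positive, so a RetryAfter of 0 simply means "retry on the next worker pass" (the service key below is made up):

package main

import (
	"fmt"

	"k8s.io/client-go/util/workqueue"
	"k8s.io/cloud-provider/api"
)

func main() {
	queue := workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "example")
	defer queue.ShutDown()

	// A zero retry-after is legal and means "retry as soon as possible".
	re := api.NewRetryError("retry immediately", 0)
	queue.AddAfter("default/my-service", re.RetryAfter())

	key, _ := queue.Get() // the key is available right away
	fmt.Println(key)      // default/my-service
	queue.Done(key)
}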