Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Promote APIServerIdentity to Beta #113629

Merged
merged 5 commits into from Nov 8, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
19 changes: 4 additions & 15 deletions cmd/kube-apiserver/app/options/options.go
Expand Up @@ -82,9 +82,6 @@ type ServerRunOptions struct {
MasterCount int
EndpointReconcilerType string

IdentityLeaseDurationSeconds int
IdentityLeaseRenewIntervalSeconds int

ServiceAccountSigningKeyFile string
ServiceAccountIssuer serviceaccount.TokenGenerator
ServiceAccountTokenMaxExpiration time.Duration
Expand All @@ -110,12 +107,10 @@ func NewServerRunOptions() *ServerRunOptions {
Logs: logs.NewOptions(),
Traces: genericoptions.NewTracingOptions(),

EnableLogsHandler: true,
EventTTL: 1 * time.Hour,
MasterCount: 1,
EndpointReconcilerType: string(reconcilers.LeaseEndpointReconcilerType),
IdentityLeaseDurationSeconds: 3600,
IdentityLeaseRenewIntervalSeconds: 10,
EnableLogsHandler: true,
EventTTL: 1 * time.Hour,
MasterCount: 1,
EndpointReconcilerType: string(reconcilers.LeaseEndpointReconcilerType),
KubeletConfig: kubeletclient.KubeletClientConfig{
Port: ports.KubeletPort,
ReadOnlyPort: ports.KubeletReadOnlyPort,
Expand Down Expand Up @@ -185,12 +180,6 @@ func (s *ServerRunOptions) Flags() (fss cliflag.NamedFlagSets) {
fs.StringVar(&s.EndpointReconcilerType, "endpoint-reconciler-type", s.EndpointReconcilerType,
"Use an endpoint reconciler ("+strings.Join(reconcilers.AllTypes.Names(), ", ")+") master-count is deprecated, and will be removed in a future version.")

fs.IntVar(&s.IdentityLeaseDurationSeconds, "identity-lease-duration-seconds", s.IdentityLeaseDurationSeconds,
"The duration of kube-apiserver lease in seconds, must be a positive number. (In use when the APIServerIdentity feature gate is enabled.)")

fs.IntVar(&s.IdentityLeaseRenewIntervalSeconds, "identity-lease-renew-interval-seconds", s.IdentityLeaseRenewIntervalSeconds,
"The interval of kube-apiserver renewing its lease in seconds, must be a positive number. (In use when the APIServerIdentity feature gate is enabled.)")

// See #14282 for details on how to test/try this option out.
// TODO: remove this comment once this option is tested in CI.
fs.IntVar(&s.KubernetesServiceNodePort, "kubernetes-service-node-port", s.KubernetesServiceNodePort, ""+
Expand Down
2 changes: 0 additions & 2 deletions cmd/kube-apiserver/app/options/options_test.go
Expand Up @@ -318,8 +318,6 @@ func TestAddFlags(t *testing.T) {
Traces: &apiserveroptions.TracingOptions{
ConfigFile: "/var/run/kubernetes/tracing_config.yaml",
},
IdentityLeaseDurationSeconds: 3600,
IdentityLeaseRenewIntervalSeconds: 10,
AggregatorRejectForwardingRedirects: true,
}

Expand Down
12 changes: 0 additions & 12 deletions cmd/kube-apiserver/app/options/validation.go
Expand Up @@ -142,17 +142,6 @@ func validateAPIPriorityAndFairness(options *ServerRunOptions) []error {
return nil
}

func validateAPIServerIdentity(options *ServerRunOptions) []error {
var errs []error
if options.IdentityLeaseDurationSeconds <= 0 {
errs = append(errs, fmt.Errorf("--identity-lease-duration-seconds should be a positive number, but value '%d' provided", options.IdentityLeaseDurationSeconds))
}
if options.IdentityLeaseRenewIntervalSeconds <= 0 {
errs = append(errs, fmt.Errorf("--identity-lease-renew-interval-seconds should be a positive number, but value '%d' provided", options.IdentityLeaseRenewIntervalSeconds))
}
return errs
}

// Validate checks ServerRunOptions and return a slice of found errs.
func (s *ServerRunOptions) Validate() []error {
var errs []error
Expand All @@ -171,7 +160,6 @@ func (s *ServerRunOptions) Validate() []error {
errs = append(errs, s.APIEnablement.Validate(legacyscheme.Scheme, apiextensionsapiserver.Scheme, aggregatorscheme.Scheme)...)
errs = append(errs, validateTokenRequest(s)...)
errs = append(errs, s.Metrics.Validate()...)
errs = append(errs, validateAPIServerIdentity(s)...)

return errs
}
3 changes: 0 additions & 3 deletions cmd/kube-apiserver/app/server.go
Expand Up @@ -283,9 +283,6 @@ func CreateKubeAPIServerConfig(s completedServerRunOptions) (
ExtendExpiration: s.Authentication.ServiceAccounts.ExtendExpiration,

VersionedInformers: versionedInformers,

IdentityLeaseDurationSeconds: s.IdentityLeaseDurationSeconds,
IdentityLeaseRenewIntervalSeconds: s.IdentityLeaseRenewIntervalSeconds,
},
}

Expand Down
21 changes: 15 additions & 6 deletions pkg/controlplane/instance.go
Expand Up @@ -129,6 +129,18 @@ const (
repairLoopInterval = 3 * time.Minute
)

var (
// IdentityLeaseGCPeriod is the interval which the lease GC controller checks for expired leases
// IdentityLeaseGCPeriod is exposed so integration tests can tune this value.
IdentityLeaseGCPeriod = 3600 * time.Second
// IdentityLeaseDurationSeconds is the duration of kube-apiserver lease in seconds
// IdentityLeaseDurationSeconds is exposed so integration tests can tune this value.
IdentityLeaseDurationSeconds = 3600
// IdentityLeaseRenewIntervalSeconds is the interval of kube-apiserver renewing its lease in seconds
// IdentityLeaseRenewIntervalSeconds is exposed so integration tests can tune this value.
IdentityLeaseRenewIntervalPeriod = 10 * time.Second
)

// ExtraConfig defines extra configuration for the master
type ExtraConfig struct {
ClusterAuthenticationInfo clusterauthenticationtrust.ClusterAuthenticationInfo
Expand Down Expand Up @@ -193,9 +205,6 @@ type ExtraConfig struct {

VersionedInformers informers.SharedInformerFactory

IdentityLeaseDurationSeconds int
IdentityLeaseRenewIntervalSeconds int

// RepairServicesInterval interval used by the repair loops for
// the Services NodePort and ClusterIP resources
RepairServicesInterval time.Duration
Expand Down Expand Up @@ -480,9 +489,9 @@ func (c completedConfig) New(delegationTarget genericapiserver.DelegationTarget)
clock.RealClock{},
kubeClient,
holderIdentity,
int32(c.ExtraConfig.IdentityLeaseDurationSeconds),
int32(IdentityLeaseDurationSeconds),
nil,
time.Duration(c.ExtraConfig.IdentityLeaseRenewIntervalSeconds)*time.Second,
IdentityLeaseRenewIntervalPeriod,
leaseName,
metav1.NamespaceSystem,
labelAPIServerHeartbeat)
Expand All @@ -496,7 +505,7 @@ func (c completedConfig) New(delegationTarget genericapiserver.DelegationTarget)
}
go apiserverleasegc.NewAPIServerLeaseGC(
kubeClient,
time.Duration(c.ExtraConfig.IdentityLeaseDurationSeconds)*time.Second,
IdentityLeaseGCPeriod,
metav1.NamespaceSystem,
KubeAPIServerIdentityLeaseLabelSelector,
).Run(hookContext.StopCh)
Expand Down
2 changes: 1 addition & 1 deletion staging/src/k8s.io/apiserver/pkg/features/kube_features.go
Expand Up @@ -216,7 +216,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS

APIResponseCompression: {Default: true, PreRelease: featuregate.Beta},

APIServerIdentity: {Default: false, PreRelease: featuregate.Alpha},
APIServerIdentity: {Default: true, PreRelease: featuregate.Beta},

APIServerTracing: {Default: false, PreRelease: featuregate.Alpha},

Expand Down
169 changes: 169 additions & 0 deletions test/e2e/apimachinery/apiserver_identity.go
@@ -0,0 +1,169 @@
/*
Copyright 2022 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package apimachinery

import (
"context"
"crypto/sha256"
"encoding/base32"
"errors"
"fmt"
"net"
"strings"
"time"

"github.com/onsi/ginkgo/v2"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/kubernetes/test/e2e/framework"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
e2essh "k8s.io/kubernetes/test/e2e/framework/ssh"
admissionapi "k8s.io/pod-security-admission/api"
)

func getControlPlaneHostname(node *v1.Node) (string, error) {
nodeAddresses := e2enode.GetAddresses(node, v1.NodeExternalIP)
if len(nodeAddresses) == 0 {
return "", errors.New("no valid addresses to use for SSH")
}

controlPlaneAddress := nodeAddresses[0]

host := controlPlaneAddress + ":" + e2essh.SSHPort
result, err := e2essh.SSH("hostname", host, framework.TestContext.Provider)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hah, that is one way to do it :)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😂

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why not using exec on pods?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

apiserver pods generally aren't part of the cluster (static pods managed only by a kubelet)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean, exec a pod on that node as hostNetwork , not execing on the apiserver pod ... but maybe this is restricted, right?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

kubectl debug has tooling for running a more-privileged Pod on a node to learn about it; maybe we could use some of that approach?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just to clarify, I commented because I was curious, we are close to code freeze and I don't mean to delay this PR

if err != nil {
return "", err
}

if result.Code != 0 {
return "", fmt.Errorf("encountered non-zero exit code when running hostname command: %d", result.Code)
}

return strings.TrimSpace(result.Stdout), nil
}

// restartAPIServer attempts to restart the kube-apiserver on a node
func restartAPIServer(node *v1.Node) error {
nodeAddresses := e2enode.GetAddresses(node, v1.NodeExternalIP)
if len(nodeAddresses) == 0 {
return errors.New("no valid addresses to use for SSH")
}

controlPlaneAddress := nodeAddresses[0]
cmd := "pidof kube-apiserver | xargs sudo kill"
framework.Logf("Restarting kube-apiserver via ssh, running: %v", cmd)
result, err := e2essh.SSH(cmd, net.JoinHostPort(controlPlaneAddress, e2essh.SSHPort), framework.TestContext.Provider)
if err != nil || result.Code != 0 {
e2essh.LogResult(result)
return fmt.Errorf("couldn't restart kube-apiserver: %v", err)
}
return nil
}

// This test requires that --feature-gates=APIServerIdentity=true be set on the apiserver
var _ = SIGDescribe("kube-apiserver identity [Feature:APIServerIdentity]", func() {
f := framework.NewDefaultFramework("kube-apiserver-identity")
f.NamespacePodSecurityEnforceLevel = admissionapi.LevelPrivileged

ginkgo.It("kube-apiserver identity should persist after restart [Disruptive]", func() {
e2eskipper.SkipUnlessProviderIs("gce")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this something we do often?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, especially on tests where you have to ssh into the nodes or run some external operation against them


client := f.ClientSet

var controlPlaneNodes []v1.Node
nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{})
framework.ExpectNoError(err)

for _, node := range nodes.Items {
if _, ok := node.Labels["node-role.kubernetes.io/control-plane"]; ok {
controlPlaneNodes = append(controlPlaneNodes, node)
continue
}

if _, ok := node.Labels["node-role.kubernetes.io/master"]; ok {
controlPlaneNodes = append(controlPlaneNodes, node)
continue
}

for _, taint := range node.Spec.Taints {
if taint.Key == "node-role.kubernetes.io/master" {
controlPlaneNodes = append(controlPlaneNodes, node)
break
}

if taint.Key == "node-role.kubernetes.io/control-plane" {
controlPlaneNodes = append(controlPlaneNodes, node)
break
}
}
}

leases, err := client.CoordinationV1().Leases(metav1.NamespaceSystem).List(context.TODO(), metav1.ListOptions{
LabelSelector: "k8s.io/component=kube-apiserver",
})
framework.ExpectNoError(err)
framework.ExpectEqual(len(leases.Items), len(controlPlaneNodes), "unexpected number of leases")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can this flake in CI?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only if you run it on a cluster while it is still being provisioned. Otherwise the node check above should prevent flakes

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it doesn't hurt to add a retry loop here


for _, node := range controlPlaneNodes {
hostname, err := getControlPlaneHostname(&node)
framework.ExpectNoError(err)

hash := sha256.Sum256([]byte(hostname))
leaseName := "kube-apiserver-" + strings.ToLower(base32.StdEncoding.WithPadding(base32.NoPadding).EncodeToString(hash[:16]))

lease, err := client.CoordinationV1().Leases(metav1.NamespaceSystem).Get(context.TODO(), leaseName, metav1.GetOptions{})
framework.ExpectNoError(err)
oldHolderIdentity := lease.Spec.HolderIdentity
lastRenewedTime := lease.Spec.RenewTime

err = restartAPIServer(&node)
framework.ExpectNoError(err)

err = wait.PollImmediate(time.Second, wait.ForeverTestTimeout, func() (bool, error) {
lease, err = client.CoordinationV1().Leases(metav1.NamespaceSystem).Get(context.TODO(), leaseName, metav1.GetOptions{})
if err != nil {
return false, nil
}

// expect only the holder identity to change after a restart
newHolderIdentity := lease.Spec.HolderIdentity
if newHolderIdentity == oldHolderIdentity {
return false, nil
}

// wait for at least one lease heart beat after the holder identity changes
if !lease.Spec.RenewTime.After(lastRenewedTime.Time) {
return false, nil
}

return true, nil

})
framework.ExpectNoError(err, "holder identity did not change after a restart")
}

// As long as the hostname of kube-apiserver is unchanged, a restart should not result in new Lease objects.
// Check that the number of lease objects remains the same after restarting kube-apiserver.
leases, err = client.CoordinationV1().Leases(metav1.NamespaceSystem).List(context.TODO(), metav1.ListOptions{
LabelSelector: "k8s.io/component=kube-apiserver",
})
framework.ExpectNoError(err)
framework.ExpectEqual(len(leases.Items), len(controlPlaneNodes), "unexpected number of leases")
})
})
22 changes: 16 additions & 6 deletions test/integration/controlplane/apiserver_identity_test.go
Expand Up @@ -99,13 +99,23 @@ func TestCreateLeaseOnStart(t *testing.T) {
}

func TestLeaseGarbageCollection(t *testing.T) {
oldIdentityLeaseDurationSeconds := controlplane.IdentityLeaseDurationSeconds
oldIdentityLeaseGCPeriod := controlplane.IdentityLeaseGCPeriod
oldIdentityLeaseRenewIntervalPeriod := controlplane.IdentityLeaseRenewIntervalPeriod
defer func() {
// reset the default values for leases after this test
controlplane.IdentityLeaseDurationSeconds = oldIdentityLeaseDurationSeconds
controlplane.IdentityLeaseGCPeriod = oldIdentityLeaseGCPeriod
controlplane.IdentityLeaseRenewIntervalPeriod = oldIdentityLeaseRenewIntervalPeriod
}()

// Shorten lease parameters so GC behavior can be exercised in integration tests
controlplane.IdentityLeaseDurationSeconds = 1
controlplane.IdentityLeaseGCPeriod = time.Second
controlplane.IdentityLeaseRenewIntervalPeriod = time.Second
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you set them back when you're done to avoid weird things in other tests?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch -- updated


defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.APIServerIdentity, true)()
result := kubeapiservertesting.StartTestServerOrDie(t, nil,
// This shorten the GC check period to make the test run faster.
// Since we are testing GC behavior on leases we create, what happens to
// the real apiserver lease doesn't matter.
[]string{"--identity-lease-duration-seconds=1"},
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to remove this test, because of the dependence to this flag. But I think the unit tests added in #113074 and the additional unit test I added in this PR should make up for the coverage we lose here.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In other places we just let the integration test override the "constant"

// CertCallbackRefreshDuration is exposed so that integration tests can crank up the reload speed.
var CertCallbackRefreshDuration = 5 * time.Minute

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmmm -- updating the lease parameters into global vars seems worthwhile so we can run integration tests.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated to use public vars

framework.SharedEtcd())
result := kubeapiservertesting.StartTestServerOrDie(t, nil, nil, framework.SharedEtcd())
defer result.TearDownFn()

kubeclient, err := kubernetes.NewForConfig(result.ClientConfig)
Expand Down