diff --git a/cluster/addons/volumesnapshots/volume-snapshot-controller/volume-snapshot-controller-deployment.yaml b/cluster/addons/volumesnapshots/volume-snapshot-controller/volume-snapshot-controller-deployment.yaml index c8423d08351d..bad49c60b9c3 100644 --- a/cluster/addons/volumesnapshots/volume-snapshot-controller/volume-snapshot-controller-deployment.yaml +++ b/cluster/addons/volumesnapshots/volume-snapshot-controller/volume-snapshot-controller-deployment.yaml @@ -22,6 +22,6 @@ spec: serviceAccount: volume-snapshot-controller containers: - name: volume-snapshot-controller - image: k8s.gcr.io/sig-storage/snapshot-controller:v3.0.0 + image: k8s.gcr.io/sig-storage/snapshot-controller:v3.0.2 args: - "--v=5" diff --git a/test/e2e/storage/drivers/csi.go b/test/e2e/storage/drivers/csi.go index 5cf578ef124e..0e1565a900cd 100644 --- a/test/e2e/storage/drivers/csi.go +++ b/test/e2e/storage/drivers/csi.go @@ -90,6 +90,14 @@ func initHostPathCSIDriver(name string, capabilities map[testsuites.Capability]b Min: "1Mi", }, Capabilities: capabilities, + StressTestOptions: &testsuites.StressTestOptions{ + NumPods: 10, + NumRestarts: 10, + }, + VolumeSnapshotStressTestOptions: &testsuites.VolumeSnapshotStressTestOptions{ + NumPods: 10, + NumSnapshots: 10, + }, }, manifests: manifests, volumeAttributes: volumeAttributes, @@ -491,6 +499,14 @@ func InitGcePDCSIDriver() testsuites.TestDriver { NumPods: 10, NumRestarts: 10, }, + VolumeSnapshotStressTestOptions: &testsuites.VolumeSnapshotStressTestOptions{ + // GCE only allows for one snapshot per volume to be created at a time, + // which can cause test timeouts. We reduce the likelihood of test timeouts + // by increasing the number of pods (and volumes) and reducing the number + // of snapshots per volume. 
+ NumPods: 20, + NumSnapshots: 2, + }, }, } } diff --git a/test/e2e/storage/testsuites/BUILD b/test/e2e/storage/testsuites/BUILD index c7427df2c81c..127e187b531e 100644 --- a/test/e2e/storage/testsuites/BUILD +++ b/test/e2e/storage/testsuites/BUILD @@ -10,12 +10,13 @@ go_library( "multivolume.go", "provisioning.go", "snapshottable.go", - "stress.go", + "snapshottable_stress.go", "subpath.go", "testdriver.go", "topology.go", "volume_expand.go", "volume_io.go", + "volume_stress.go", "volumelimits.go", "volumemode.go", "volumes.go", diff --git a/test/e2e/storage/testsuites/base.go b/test/e2e/storage/testsuites/base.go index ecbb204f060e..87a2f5c236ff 100644 --- a/test/e2e/storage/testsuites/base.go +++ b/test/e2e/storage/testsuites/base.go @@ -84,13 +84,14 @@ var BaseSuites = []func() TestSuite{ InitDisruptiveTestSuite, InitVolumeLimitsTestSuite, InitTopologyTestSuite, - InitStressTestSuite, + InitVolumeStressTestSuite, } // CSISuites is a list of storage test suites that work only for CSI drivers var CSISuites = append(BaseSuites, InitEphemeralTestSuite, InitSnapshottableTestSuite, + InitSnapshottableStressTestSuite, ) // TestSuite represents an interface for a set of tests which works with TestDriver diff --git a/test/e2e/storage/testsuites/driveroperations.go b/test/e2e/storage/testsuites/driveroperations.go index 5026d8d8f792..e2d41966fafa 100644 --- a/test/e2e/storage/testsuites/driveroperations.go +++ b/test/e2e/storage/testsuites/driveroperations.go @@ -73,6 +73,7 @@ func GetStorageClass( }, ObjectMeta: metav1.ObjectMeta{ // Name must be unique, so let's base it on namespace name and use GenerateName + // TODO(#96234): Remove unnecessary suffix. 
Name: names.SimpleNameGenerator.GenerateName(ns + "-" + suffix), }, Provisioner: provisioner, @@ -94,8 +95,9 @@ func GetSnapshotClass( "kind": "VolumeSnapshotClass", "apiVersion": snapshotAPIVersion, "metadata": map[string]interface{}{ - // Name must be unique, so let's base it on namespace name - "name": ns + "-" + suffix, + // Name must be unique, so let's base it on namespace name and use GenerateName + // TODO(#96234): Remove unnecessary suffix. + "name": names.SimpleNameGenerator.GenerateName(ns + "-" + suffix), }, "driver": snapshotter, "parameters": parameters, diff --git a/test/e2e/storage/testsuites/snapshottable_stress.go b/test/e2e/storage/testsuites/snapshottable_stress.go new file mode 100644 index 000000000000..3de43fefe897 --- /dev/null +++ b/test/e2e/storage/testsuites/snapshottable_stress.go @@ -0,0 +1,289 @@ +/* +Copyright 2020 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// This suite tests volume snapshots under stress conditions. 
+ +package testsuites + +import ( + "context" + "sync" + + "github.com/onsi/ginkgo" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + errors "k8s.io/apimachinery/pkg/util/errors" + clientset "k8s.io/client-go/kubernetes" + "k8s.io/kubernetes/test/e2e/framework" + e2epod "k8s.io/kubernetes/test/e2e/framework/pod" + e2epv "k8s.io/kubernetes/test/e2e/framework/pv" + e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" + e2evolume "k8s.io/kubernetes/test/e2e/framework/volume" + "k8s.io/kubernetes/test/e2e/storage/testpatterns" +) + +type snapshottableStressTestSuite struct { + tsInfo TestSuiteInfo +} + +type snapshottableStressTest struct { + config *PerTestConfig + testOptions VolumeSnapshotStressTestOptions + driverCleanup func() + + pods []*v1.Pod + volumes []*VolumeResource + snapshots []*SnapshotResource + // snapshotsMutex guards snapshots, which parallel goroutines append to. + snapshotsMutex sync.Mutex + + // ctx and cancel signal async routines to stop; wg is used to wait for them to finish. + ctx context.Context + wg sync.WaitGroup + cancel context.CancelFunc +} + +var _ TestSuite = &snapshottableStressTestSuite{} + +// InitSnapshottableStressTestSuite returns snapshottableStressTestSuite that implements TestSuite interface +func InitSnapshottableStressTestSuite() TestSuite { + return &snapshottableStressTestSuite{ + tsInfo: TestSuiteInfo{ + Name: "snapshottable-stress", + TestPatterns: []testpatterns.TestPattern{ + testpatterns.DynamicSnapshotDelete, + testpatterns.DynamicSnapshotRetain, + }, + SupportedSizeRange: e2evolume.SizeRange{ + Min: "1Mi", + }, + FeatureTag: "[Feature:VolumeSnapshotDataSource]", + }, + } +} + +func (t *snapshottableStressTestSuite) GetTestSuiteInfo() TestSuiteInfo { + return t.tsInfo +} + +func (t *snapshottableStressTestSuite) SkipRedundantSuite(driver TestDriver, pattern testpatterns.TestPattern) { +} + +func (t *snapshottableStressTestSuite) DefineTests(driver TestDriver, pattern testpatterns.TestPattern) { + var ( + driverInfo *DriverInfo +
snapshottableDriver SnapshottableTestDriver + cs clientset.Interface + stressTest *snapshottableStressTest + ) + + // Check preconditions before setting up namespace via framework below. + ginkgo.BeforeEach(func() { + driverInfo = driver.GetDriverInfo() + if driverInfo.VolumeSnapshotStressTestOptions == nil { + e2eskipper.Skipf("Driver %s doesn't specify snapshot stress test options -- skipping", driverInfo.Name) + } + if driverInfo.VolumeSnapshotStressTestOptions.NumPods <= 0 { + framework.Failf("NumPods in snapshot stress test options must be a positive integer, received: %d", driverInfo.VolumeSnapshotStressTestOptions.NumPods) + } + if driverInfo.VolumeSnapshotStressTestOptions.NumSnapshots <= 0 { + framework.Failf("NumSnapshots in snapshot stress test options must be a positive integer, received: %d", driverInfo.VolumeSnapshotStressTestOptions.NumSnapshots) + } + + // Pre-declare ok so the type assertion below can assign the outer snapshottableDriver with '=' instead of shadowing it with ':='. + ok := false + + snapshottableDriver, ok = driver.(SnapshottableTestDriver) + if !driverInfo.Capabilities[CapSnapshotDataSource] || !ok { + e2eskipper.Skipf("Driver %q doesn't implement SnapshottableTestDriver - skipping", driverInfo.Name) + } + + _, ok = driver.(DynamicPVTestDriver) + if !ok { + e2eskipper.Skipf("Driver %s doesn't implement DynamicPVTestDriver -- skipping", driverInfo.Name) + } + }) + + // This intentionally comes after checking the preconditions because it + // registers its own BeforeEach which creates the namespace. Beware that it + // also registers an AfterEach which renders f unusable. Any code using + // f must run inside an It or Context callback.
+ f := framework.NewDefaultFramework("snapshottable-stress") + + init := func() { + cs = f.ClientSet + config, driverCleanup := driver.PrepareTest(f) + ctx, cancel := context.WithCancel(context.Background()) + + stressTest = &snapshottableStressTest{ + config: config, + driverCleanup: driverCleanup, + volumes: []*VolumeResource{}, + snapshots: []*SnapshotResource{}, + pods: []*v1.Pod{}, + testOptions: *driverInfo.VolumeSnapshotStressTestOptions, + ctx: ctx, + cancel: cancel, + } + } + + createPodsAndVolumes := func() { + for i := 0; i < stressTest.testOptions.NumPods; i++ { + framework.Logf("Creating resources for pod %d/%d", i, stressTest.testOptions.NumPods-1) + + volume := CreateVolumeResource(driver, stressTest.config, pattern, t.GetTestSuiteInfo().SupportedSizeRange) + stressTest.volumes = append(stressTest.volumes, volume) + + podConfig := e2epod.Config{ + NS: f.Namespace.Name, + PVCs: []*v1.PersistentVolumeClaim{volume.Pvc}, + SeLinuxLabel: e2epv.SELinuxLabel, + } + pod, err := e2epod.MakeSecPod(&podConfig) + framework.ExpectNoError(err) + stressTest.pods = append(stressTest.pods, pod) + + } + + var wg sync.WaitGroup + for i, pod := range stressTest.pods { + wg.Add(1) + + go func(i int, pod *v1.Pod) { + defer ginkgo.GinkgoRecover() + defer wg.Done() + + if _, err := cs.CoreV1().Pods(pod.Namespace).Create(context.TODO(), pod, metav1.CreateOptions{}); err != nil { + stressTest.cancel() + framework.Failf("Failed to create pod-%d [%+v]. Error: %v", i, pod, err) + } + }(i, pod) + } + wg.Wait() + + for i, pod := range stressTest.pods { + if err := e2epod.WaitForPodRunningInNamespace(cs, pod); err != nil { + stressTest.cancel() + framework.Failf("Failed to wait for pod-%d [%+v] turn into running status. 
Error: %v", i, pod, err) + } + } + } + + cleanup := func() { + framework.Logf("Stopping and waiting for all test routines to finish") + stressTest.cancel() + stressTest.wg.Wait() + + var ( + errs []error + mu sync.Mutex + wg sync.WaitGroup + ) + + for i, snapshot := range stressTest.snapshots { + wg.Add(1) + + go func(i int, snapshot *SnapshotResource) { + defer ginkgo.GinkgoRecover() + defer wg.Done() + + framework.Logf("Deleting snapshot %s/%s", snapshot.Vs.GetNamespace(), snapshot.Vs.GetName()) + err := snapshot.CleanupResource() + mu.Lock() + defer mu.Unlock() + errs = append(errs, err) + }(i, snapshot) + } + wg.Wait() + + for i, pod := range stressTest.pods { + wg.Add(1) + + go func(i int, pod *v1.Pod) { + defer ginkgo.GinkgoRecover() + defer wg.Done() + + framework.Logf("Deleting pod %s", pod.Name) + err := e2epod.DeletePodWithWait(cs, pod) + mu.Lock() + defer mu.Unlock() + errs = append(errs, err) + }(i, pod) + } + wg.Wait() + + for i, volume := range stressTest.volumes { + wg.Add(1) + + go func(i int, volume *VolumeResource) { + defer ginkgo.GinkgoRecover() + defer wg.Done() + + framework.Logf("Deleting volume %s", volume.Pvc.GetName()) + err := volume.CleanupResource() + mu.Lock() + defer mu.Unlock() + errs = append(errs, err) + }(i, volume) + } + wg.Wait() + + errs = append(errs, tryFunc(stressTest.driverCleanup)) + + framework.ExpectNoError(errors.NewAggregate(errs), "while cleaning up resources") + } + + ginkgo.BeforeEach(func() { + init() + createPodsAndVolumes() + }) + + // See #96177, this is necessary for cleaning up resources when tests are interrupted. + f.AddAfterEach("cleanup", func(f *framework.Framework, failed bool) { + cleanup() + }) + + ginkgo.It("should support snapshotting of many volumes repeatedly [Slow] [Serial]", func() { + // Concurrently create NumSnapshots snapshots of each volume; the snapshots are deleted later by cleanup.
+ for i := 0; i < stressTest.testOptions.NumPods; i++ { + for j := 0; j < stressTest.testOptions.NumSnapshots; j++ { + stressTest.wg.Add(1) + + go func(podIndex, snapshotIndex int) { + defer ginkgo.GinkgoRecover() + defer stressTest.wg.Done() + + pod := stressTest.pods[podIndex] + volume := stressTest.volumes[podIndex] + + select { + case <-stressTest.ctx.Done(): + return + default: + framework.Logf("Pod-%d [%s], Iteration %d/%d", podIndex, pod.Name, snapshotIndex, stressTest.testOptions.NumSnapshots-1) + snapshot := CreateSnapshotResource(snapshottableDriver, stressTest.config, pattern, volume.Pvc.GetName(), volume.Pvc.GetNamespace()) + stressTest.snapshotsMutex.Lock() + defer stressTest.snapshotsMutex.Unlock() + stressTest.snapshots = append(stressTest.snapshots, snapshot) + } + }(i, j) + } + } + + stressTest.wg.Wait() + }) +} diff --git a/test/e2e/storage/testsuites/testdriver.go b/test/e2e/storage/testsuites/testdriver.go index 6373f2554d45..f1f70d6218a7 100644 --- a/test/e2e/storage/testsuites/testdriver.go +++ b/test/e2e/storage/testsuites/testdriver.go @@ -189,7 +189,10 @@ type DriverInfo struct { // Example: multi-zonal disk requires at least 2 allowed topologies. NumAllowedTopologies int // [Optional] Scale parameters for stress tests. + // TODO(#96241): Rename this field to reflect the tests that consume it. StressTestOptions *StressTestOptions + // [Optional] Scale parameters for volume snapshot stress tests. + VolumeSnapshotStressTestOptions *VolumeSnapshotStressTestOptions } // StressTestOptions contains parameters used for stress tests. @@ -201,6 +204,15 @@ type StressTestOptions struct { NumRestarts int } +// VolumeSnapshotStressTestOptions contains parameters used for volume snapshot stress tests. +type VolumeSnapshotStressTestOptions struct { + // Number of pods to create in the test. This may also create + // up to 1 volume per pod. + NumPods int + // Number of snapshots to create for each volume. 
+ NumSnapshots int +} + // PerTestConfig represents parameters that control test execution. // One instance gets allocated for each test and is then passed // via pointer to functions involved in the test. diff --git a/test/e2e/storage/testsuites/stress.go b/test/e2e/storage/testsuites/volume_stress.go similarity index 82% rename from test/e2e/storage/testsuites/stress.go rename to test/e2e/storage/testsuites/volume_stress.go index 8cdc20e6cd3d..c75225b27536 100644 --- a/test/e2e/storage/testsuites/stress.go +++ b/test/e2e/storage/testsuites/volume_stress.go @@ -35,11 +35,11 @@ import ( "k8s.io/kubernetes/test/e2e/storage/testpatterns" ) -type stressTestSuite struct { +type volumeStressTestSuite struct { tsInfo TestSuiteInfo } -type stressTest struct { +type volumeStressTest struct { config *PerTestConfig driverCleanup func() @@ -55,13 +55,13 @@ type stressTest struct { testOptions StressTestOptions } -var _ TestSuite = &stressTestSuite{} +var _ TestSuite = &volumeStressTestSuite{} -// InitStressTestSuite returns stressTestSuite that implements TestSuite interface -func InitStressTestSuite() TestSuite { - return &stressTestSuite{ +// InitVolumeStressTestSuite returns volumeStressTestSuite that implements TestSuite interface +func InitVolumeStressTestSuite() TestSuite { + return &volumeStressTestSuite{ tsInfo: TestSuiteInfo{ - Name: "stress", + Name: "volume-stress", TestPatterns: []testpatterns.TestPattern{ testpatterns.DefaultFsDynamicPV, testpatterns.BlockVolModeDynamicPV, @@ -70,18 +70,18 @@ func InitStressTestSuite() TestSuite { } } -func (t *stressTestSuite) GetTestSuiteInfo() TestSuiteInfo { +func (t *volumeStressTestSuite) GetTestSuiteInfo() TestSuiteInfo { return t.tsInfo } -func (t *stressTestSuite) SkipRedundantSuite(driver TestDriver, pattern testpatterns.TestPattern) { +func (t *volumeStressTestSuite) SkipRedundantSuite(driver TestDriver, pattern testpatterns.TestPattern) { } -func (t *stressTestSuite) DefineTests(driver TestDriver, pattern 
testpatterns.TestPattern) { +func (t *volumeStressTestSuite) DefineTests(driver TestDriver, pattern testpatterns.TestPattern) { var ( dInfo = driver.GetDriverInfo() cs clientset.Interface - l *stressTest + l *volumeStressTest ) // Check preconditions before setting up namespace via framework below. @@ -89,6 +89,12 @@ func (t *stressTestSuite) DefineTests(driver TestDriver, pattern testpatterns.Te if dInfo.StressTestOptions == nil { e2eskipper.Skipf("Driver %s doesn't specify stress test options -- skipping", dInfo.Name) } + if dInfo.StressTestOptions.NumPods <= 0 { + framework.Failf("NumPods in stress test options must be a positive integer, received: %d", dInfo.StressTestOptions.NumPods) + } + if dInfo.StressTestOptions.NumRestarts <= 0 { + framework.Failf("NumRestarts in stress test options must be a positive integer, received: %d", dInfo.StressTestOptions.NumRestarts) + } if _, ok := driver.(DynamicPVTestDriver); !ok { e2eskipper.Skipf("Driver %s doesn't implement DynamicPVTestDriver -- skipping", dInfo.Name) @@ -103,11 +109,11 @@ func (t *stressTestSuite) DefineTests(driver TestDriver, pattern testpatterns.Te // registers its own BeforeEach which creates the namespace. Beware that it // also registers an AfterEach which renders f unusable. Any code using // f must run inside an It or Context callback. - f := framework.NewDefaultFramework("stress") + f := framework.NewDefaultFramework("volume-stress") init := func() { cs = f.ClientSet - l = &stressTest{} + l = &volumeStressTest{} // Now do the more expensive test initialization. l.config, l.driverCleanup = driver.PrepareTest(f) @@ -162,6 +168,7 @@ func (t *stressTestSuite) DefineTests(driver TestDriver, pattern testpatterns.Te createPodsAndVolumes() }) + // See #96177, this is necessary for cleaning up resources when tests are interrupted. 
f.AddAfterEach("cleanup", func(f *framework.Framework, failed bool) { cleanup() }) diff --git a/test/e2e/testing-manifests/storage-csi/gce-pd/controller_ss.yaml b/test/e2e/testing-manifests/storage-csi/gce-pd/controller_ss.yaml index 07ee47affa76..7eb39fa928be 100644 --- a/test/e2e/testing-manifests/storage-csi/gce-pd/controller_ss.yaml +++ b/test/e2e/testing-manifests/storage-csi/gce-pd/controller_ss.yaml @@ -21,7 +21,7 @@ spec: serviceAccountName: csi-gce-pd-controller-sa containers: - name: csi-snapshotter - image: gcr.io/gke-release/csi-snapshotter:v2.1.1-gke.0 + image: k8s.gcr.io/sig-storage/csi-snapshotter:v3.0.2 args: - "--v=5" - "--csi-address=/csi/csi.sock" @@ -55,7 +55,7 @@ spec: - name: socket-dir mountPath: /csi - name: gce-pd-driver - image: gcr.io/gke-release/gcp-compute-persistent-disk-csi-driver:v0.7.0-gke.0 + image: gcr.io/gke-release/gcp-compute-persistent-disk-csi-driver:v1.0.1-gke.0 args: - "--v=5" - "--endpoint=unix:/csi/csi.sock" diff --git a/test/e2e/testing-manifests/storage-csi/hostpath/hostpath/csi-hostpath-snapshotter.yaml b/test/e2e/testing-manifests/storage-csi/hostpath/hostpath/csi-hostpath-snapshotter.yaml index 693518274c00..66c3882ff640 100644 --- a/test/e2e/testing-manifests/storage-csi/hostpath/hostpath/csi-hostpath-snapshotter.yaml +++ b/test/e2e/testing-manifests/storage-csi/hostpath/hostpath/csi-hostpath-snapshotter.yaml @@ -40,7 +40,7 @@ spec: serviceAccount: csi-snapshotter containers: - name: csi-snapshotter - image: k8s.gcr.io/sig-storage/csi-snapshotter:v2.1.0 + image: k8s.gcr.io/sig-storage/csi-snapshotter:v3.0.2 args: - -v=5 - --csi-address=/csi/csi.sock