Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Scalability tests #1931

Merged
merged 5 commits into from
Apr 17, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ ifdef IMAGE_EXTRA_TAG
IMAGE_BUILD_EXTRA_OPTS += -t $(IMAGE_EXTRA_TAG)
endif

# Absolute path of the directory holding the most recently parsed makefile
# (this file, as long as no include precedes this line). It is computed with
# make's built-in file name functions, so no shell is spawned at parse time;
# patsubst drops the trailing slash left by $(dir ...).
PROJECT_DIR := $(patsubst %/,%,$(dir $(abspath $(lastword $(MAKEFILE_LIST)))))
# Directory receiving the build and test outputs; callers may override it.
ARTIFACTS ?= $(PROJECT_DIR)/bin
# Use distroless as minimal base image to package the manager binary
# Refer to https://github.com/GoogleContainerTools/distroless for more details
Expand Down Expand Up @@ -212,6 +213,51 @@ run-test-multikueue-e2e-%: FORCE
@echo Running multikueue e2e for k8s ${K8S_VERSION}
E2E_KIND_VERSION="kindest/node:v$(K8S_VERSION)" KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) CREATE_KIND_CLUSTER=$(CREATE_KIND_CLUSTER) ARTIFACTS="$(ARTIFACTS)/$@" IMAGE_TAG=$(IMAGE_TAG) GINKGO_ARGS="$(GINKGO_ARGS)" JOBSET_VERSION=$(JOBSET_VERSION) ./hack/multikueue-e2e-test.sh

SCALABILITY_RUNNER := $(ARTIFACTS)/scalability-runner
trasc marked this conversation as resolved.
Show resolved Hide resolved
# Build the scalability runner from test/scalability/runner/main.go and place
# the resulting binary at $(SCALABILITY_RUNNER).
.PHONY: scalability-runner
scalability-runner:
	$(GO_BUILD_ENV) $(GO_CMD) build \
		-ldflags="$(LD_FLAGS)" \
		-o $(SCALABILITY_RUNNER) \
		test/scalability/runner/main.go

# Build minimalkueue from test/scalability/minimalkueue/main.go. The binary is
# written to $(ARTIFACTS) under the name of this target.
.PHONY: minimalkueue
minimalkueue:
	$(GO_BUILD_ENV) $(GO_CMD) build \
		-ldflags="$(LD_FLAGS)" \
		-o $(ARTIFACTS)/$@ \
		test/scalability/minimalkueue/main.go

# When SCALABILITY_CPU_PROFILE is set to a non-empty value, append
# --withCPUProfile=true to SCALABILITY_EXTRA_ARGS, which the run-scalability
# recipe passes to the scalability runner.
ifdef SCALABILITY_CPU_PROFILE
SCALABILITY_EXTRA_ARGS += --withCPUProfile=true
endif

ifdef SCALABILITY_KUEUE_LOGS
SCALABILITY_EXTRA_ARGS += --withLogs=true --logToFile=true
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is the file that it goes to?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bin/run-scalability/minimalkueue.err.log and bin/run-scalability/minimalkueue.out.log

(Just a side note, 1820 was really good, only from log size POV it got the size of bin/run-scalability/minimalkueue.err.log from around 3GB to under 100MB )

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI @gabesaba, as the person who discovered the issue :)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh, actually, @gabesaba's was #1897
It was probably a combination of both.

endif
trasc marked this conversation as resolved.
Show resolved Hide resolved

# Generator configuration handed to the scalability runner; callers may
# override it to run a different scenario.
SCALABILITY_GENERATOR_CONFIG ?= $(PROJECT_DIR)/test/scalability/default_generator_config.yaml

# Directory collecting the output of the run-scalability target.
SCALABILITY_RUN_DIR := $(ARTIFACTS)/run-scalability

# Run the scalability runner together with the minimalkueue binary, using the
# envtest assets located through $(ENVTEST). Optional runner flags are taken
# from SCALABILITY_EXTRA_ARGS.
.PHONY: run-scalability
run-scalability: envtest scalability-runner minimalkueue
	mkdir -p $(SCALABILITY_RUN_DIR)
	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path)" \
	$(SCALABILITY_RUNNER) \
		--o $(SCALABILITY_RUN_DIR) \
		--crds=$(PROJECT_DIR)/config/components/crd/bases \
		--generatorConfig=$(SCALABILITY_GENERATOR_CONFIG) \
		--minimalKueue=$(ARTIFACTS)/minimalkueue \
		$(SCALABILITY_EXTRA_ARGS)

# Run the scalability scenario (run-scalability), then run the checker tests
# on the recorded minimalkueue stats with the default range spec. gotestsum
# writes a JUnit report to $(ARTIFACTS)/junit.xml.
.PHONY: test-scalability
test-scalability: gotestsum run-scalability
	$(GOTESTSUM) --junitfile $(ARTIFACTS)/junit.xml -- \
		$(GO_TEST_FLAGS) \
		./test/scalability/checker \
		--cmdStats=$(SCALABILITY_RUN_DIR)/minimalkueue.stats.yaml \
		--range=$(PROJECT_DIR)/test/scalability/default_rangespec.yaml

# Run the scalability runner without minimalkueue, storing its output in a
# directory of $(ARTIFACTS) named after this target.
.PHONY: run-scalability-in-cluster
run-scalability-in-cluster: envtest scalability-runner
	mkdir -p $(ARTIFACTS)/$@
	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path)" \
	$(SCALABILITY_RUNNER) \
		--o $(ARTIFACTS)/$@ \
		--generatorConfig=$(SCALABILITY_GENERATOR_CONFIG) \
		--qps=1000 \
		--burst=2000 \
		--timeout=15m

# Lint the code base with golangci-lint, allowing the run up to 15 minutes.
.PHONY: ci-lint
ci-lint: golangci-lint
	$(GOLANGCI_LINT) run --timeout 15m0s
Expand Down Expand Up @@ -359,7 +405,6 @@ importer-image: PLATFORMS=linux/amd64
importer-image: PUSH=--load
importer-image: importer-image-build

PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST))))
GOLANGCI_LINT = $(PROJECT_DIR)/bin/golangci-lint
.PHONY: golangci-lint
golangci-lint: ## Download golangci-lint locally if necessary.
Expand Down
50 changes: 50 additions & 0 deletions pkg/util/testing/kubeconfig.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package testing

import (
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
clientcmdapi "k8s.io/client-go/tools/clientcmd/api"
)

// RestConfigToKubeConfig serializes the connection details of restConfig into
// kubeconfig bytes.
//
// The generated kubeconfig holds one cluster, one user and one context tying
// the two together, which is also selected as the current context. Only the
// host, the CA data, the client certificate data and the client key data are
// taken from restConfig.
func RestConfigToKubeConfig(restConfig *rest.Config) ([]byte, error) {
	const (
		clusterName = "default-cluster"
		userName    = "default-user"
		contextName = "default-context"
	)

	cluster := &clientcmdapi.Cluster{
		Server:                   restConfig.Host,
		CertificateAuthorityData: restConfig.CAData,
	}
	user := &clientcmdapi.AuthInfo{
		ClientCertificateData: restConfig.CertData,
		ClientKeyData:         restConfig.KeyData,
	}
	kubeContext := &clientcmdapi.Context{
		Cluster:  clusterName,
		AuthInfo: userName,
	}

	return clientcmd.Write(clientcmdapi.Config{
		Kind:           "config",
		APIVersion:     "v1",
		Clusters:       map[string]*clientcmdapi.Cluster{clusterName: cluster},
		AuthInfos:      map[string]*clientcmdapi.AuthInfo{userName: user},
		Contexts:       map[string]*clientcmdapi.Context{contextName: kubeContext},
		CurrentContext: contextName,
	})
}
8 changes: 8 additions & 0 deletions pkg/util/testing/wrappers.go
Original file line number Diff line number Diff line change
Expand Up @@ -627,6 +627,14 @@ func (c *ClusterQueueWrapper) StopPolicy(p kueue.StopPolicy) *ClusterQueueWrappe
return c
}

// Label sets the label k to the value v on the ClusterQueue, creating the
// label map when it does not exist yet.
func (c *ClusterQueueWrapper) Label(k, v string) *ClusterQueueWrapper {
	if c.Labels == nil {
		c.Labels = map[string]string{k: v}
		return c
	}
	c.Labels[k] = v
	return c
}

// Condition sets a condition on the ClusterQueue.
func (c *ClusterQueueWrapper) Condition(conditionType string, status metav1.ConditionStatus, reason, message string) *ClusterQueueWrapper {
apimeta.SetStatusCondition(&c.Status.Conditions, metav1.Condition{
Expand Down
28 changes: 2 additions & 26 deletions test/integration/multikueue/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/rest"
"k8s.io/client-go/tools/clientcmd"
clientcmdapi "k8s.io/client-go/tools/clientcmd/api"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/manager"

Expand All @@ -41,6 +39,7 @@ import (
workloadjob "sigs.k8s.io/kueue/pkg/controller/jobs/job"
workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset"
"sigs.k8s.io/kueue/pkg/queue"
utiltesting "sigs.k8s.io/kueue/pkg/util/testing"
"sigs.k8s.io/kueue/pkg/webhooks"
"sigs.k8s.io/kueue/test/integration/framework"
// +kubebuilder:scaffold:imports
Expand All @@ -58,30 +57,7 @@ type cluster struct {
}

// kubeConfigBytes returns the kubeconfig bytes built from the cluster's rest
// config (c.cfg). The conversion is delegated to the shared
// utiltesting.RestConfigToKubeConfig helper instead of being duplicated here.
func (c *cluster) kubeConfigBytes() ([]byte, error) {
	return utiltesting.RestConfigToKubeConfig(c.cfg)
}

var (
Expand Down
64 changes: 64 additions & 0 deletions test/scalability/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Scalability test

A test meant to detect regressions in Kueue's overall scheduling capabilities.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe we should put all of this inside test/performance/scheduling?

The existing tests in test/performance are scalability tests as well, just a different level.

Another potential directory structure could be:

test/performance (for this tool) and test/performance_e2e (for the cl2 based things).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can but let's come up with a precise naming scheme, not only for the code location but also the artifacts and make targets, otherwise it will be hard to follow the terminology.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the target names should match. Maybe we can put all the targets for "performance" inside its own Makefile. so it would look like:

make test/performance/jobs test/performance/scheduling

But we can leave them for a follow up.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's go with the follow-up then

trasc marked this conversation as resolved.
Show resolved Hide resolved

# Components
In order to achieve this the following components are used:

## Runner

An application able to:
- generate a set of Kueue specific objects based on a config following the schema of [default_generator_config](./default_generator_config.yaml)
- mimic the execution of the workloads
- monitor the created objects and generate execution statistics based on the received events

Optionally, it is able to run an instance of [minimalkueue](#minimalkueue) in a dedicated [envtest](https://book.kubebuilder.io/reference/envtest.html) environment.

## MinimalKueue

A light version of the Kueue's controller manager consisting only of the core controllers and the scheduler.

It is designed to offer the Kueue scheduling capabilities without any additional components which may flood the optional CPU profiles taken during its execution.


## Checker

Checks the results of a scalability run against a set of expected values defined in [default_rangespec](./default_rangespec.yaml).

# Usage

## Run in an existing cluster

```bash
make run-scalability-in-cluster
```

Will run a scalability scenario against an existing cluster (connectable by the host's default kubeconfig), and store the resulting artifacts in `$(PROJECT_DIR)/bin/run-scalability-in-cluster`.

The generation config to be used can be set in `SCALABILITY_GENERATOR_CONFIG`; by default, `$(PROJECT_DIR)/test/scalability/default_generator_config.yaml` is used.

Check [installation guide](https://kueue.sigs.k8s.io/docs/installation) for cluster and [observability](https://kueue.sigs.k8s.io/docs/installation/#add-metrics-scraping-for-prometheus-operator).

## Run with minimalkueue

```bash
make run-scalability
```

Will run a scalability scenario against an [envtest](https://book.kubebuilder.io/reference/envtest.html) environment
and an instance of minimalkueue.
The resulting artifacts are stored in `$(PROJECT_DIR)/bin/run-scalability`.

The generation config to be used can be set in `SCALABILITY_GENERATOR_CONFIG`; by default, `$(PROJECT_DIR)/test/scalability/default_generator_config.yaml` is used.

Setting `SCALABILITY_CPU_PROFILE=1` will generate a cpuprofile of minimalkueue in `$(PROJECT_DIR)/bin/run-scalability/minimalkueue.cpu.prof`

Setting `SCALABILITY_KUEUE_LOGS=1` will save the logs of minimalkueue in `$(PROJECT_DIR)/bin/run-scalability/minimalkueue.out.log` and `$(PROJECT_DIR)/bin/run-scalability/minimalkueue.err.log`

## Run scalability test
trasc marked this conversation as resolved.
Show resolved Hide resolved

```bash
make test-scalability
```

Runs the scalability scenario with minimalkueue and checks the results against `$(PROJECT_DIR)/test/scalability/default_rangespec.yaml`
80 changes: 80 additions & 0 deletions test/scalability/checker/checker_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package checker

import (
"flag"
"os"
"testing"

"sigs.k8s.io/yaml"

"sigs.k8s.io/kueue/test/scalability/runner/stats"
)

// Command line flags of the checker test.
var (
	// cmdStatsFile is the path of the YAML file holding the command stats to
	// check (flag "cmdStats").
	cmdStatsFile = flag.String("cmdStats", "", "command stats yaml file")
	// rangeFile is the path of the YAML file holding the expected ranges
	// (flag "range").
	rangeFile = flag.String("range", "", "expectations range file")
)

// RangeSpec describes the expectations the stats of a scalability run are
// checked against.
type RangeSpec struct {
	// Cmd holds the upper bounds applied to the command stats.
	Cmd struct {
		// MaxWallMs is the maximum accepted wall time, in milliseconds.
		MaxWallMs int64 `json:"maxWallMs"`
		// MaxUserMs is the maximum accepted user time, in milliseconds.
		MaxUserMs int64 `json:"maxUserMs"`
		// MaxSysMs is the maximum accepted sys time, in milliseconds.
		MaxSysMs int64 `json:"maxSysMs"`
		// Maxrss is the maximum accepted maxrss value; the check reports it
		// in KiB.
		Maxrss uint64 `json:"maxrss"`
	} `json:"cmd"`
}

func TestScalability(t *testing.T) {
cmdStatsBytes, err := os.ReadFile(*cmdStatsFile)
if err != nil {
t.Fatalf("Unable to read command stats: %s", err)
}

cmdStats := stats.CmdStats{}
err = yaml.UnmarshalStrict(cmdStatsBytes, &cmdStats)
if err != nil {
t.Fatalf("Unable to unmarshal command stats: %s", err)
}

rangeBytes, err := os.ReadFile(*rangeFile)
if err != nil {
t.Fatalf("Unable to read range spec: %s", err)
}

rangeSpec := RangeSpec{}
err = yaml.UnmarshalStrict(rangeBytes, &rangeSpec)
if err != nil {
t.Fatalf("Unable to unmarshal range spec: %s", err)
}

t.Run("CommandStats", func(t *testing.T) {
if cmdStats.WallMs > rangeSpec.Cmd.MaxWallMs {
t.Errorf("Wall time %dms is grater than maximum expected %dms", cmdStats.WallMs, rangeSpec.Cmd.MaxWallMs)
}
if cmdStats.UserMs > rangeSpec.Cmd.MaxUserMs {
t.Errorf("User time %dms is grater than maximum expected %dms", cmdStats.UserMs, rangeSpec.Cmd.MaxUserMs)
}
if cmdStats.SysMs > rangeSpec.Cmd.MaxSysMs {
t.Errorf("Sys time %dms is grater than maximum expected %dms", cmdStats.SysMs, rangeSpec.Cmd.MaxSysMs)
}
if cmdStats.Maxrss > int64(rangeSpec.Cmd.Maxrss) {
t.Errorf("Maxrss %dKib is grater than maximum expected %dKib", cmdStats.Maxrss, rangeSpec.Cmd.Maxrss)
trasc marked this conversation as resolved.
Show resolved Hide resolved
}
})
}
31 changes: 31 additions & 0 deletions test/scalability/default_generator_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
- className: cohort
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are no templates for the objects, right? They are all generated in code?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In short yes, we can extend the schema for this file if needed, but to keep it simple for now is better to just "hardcode" the genaration of namespace, LQs, ResourceFlavor ....

count: 5
queuesSets:
- className: cq
count: 6
nominalQuota: 20
borrowingLimit: 100
reclaimWithinCohort: Any
withinClusterQueue: LowerPriority
workloadsSets:
- count: 350
creationIntervalMs: 100
workloads:
- className: small
runtimeMs: 200
priority: 50
request: 1
- count: 100
creationIntervalMs: 500
workloads:
- className: medium
runtimeMs: 500
priority: 100
request: 5
- count: 50
creationIntervalMs: 1200
workloads:
- className: large
runtimeMs: 1000
priority: 200
request: 20
7 changes: 7 additions & 0 deletions test/scalability/default_rangespec.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Until we have a clear picture on how the setup
# performs in CI keep the values "very relaxed"
cmd:
maxWallMs: 3600_000 #1h
trasc marked this conversation as resolved.
Show resolved Hide resolved
maxUserMs: 3600_000
maxSysMs: 3600_000
maxrss: 1024_000 #1000MiB
Loading