Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Kubelet] Improving QOS in kubelet by introducing QoS level Cgroups - --cgroups-per-qos #27853

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ test-e2e: ginkgo generated_files
# Example:
# make test-e2e-node FOCUS=kubelet SKIP=container
# make test-e2e-node REMOTE=true DELETE_INSTANCES=true
# make test-e2e-node TEST_ARGS="--cgroups-per-qos=true"
# Build and run tests.
.PHONY: test-e2e-node
test-e2e-node: ginkgo generated_files
Expand Down
1 change: 1 addition & 0 deletions cmd/kubelet/app/options/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ func (s *KubeletServer) AddFlags(fs *pflag.FlagSet) {
fs.MarkDeprecated("system-container", "Use --system-cgroups instead. Will be removed in a future version.")
fs.StringVar(&s.SystemCgroups, "system-cgroups", s.SystemCgroups, "Optional absolute name of cgroups in which to place all non-kernel processes that are not already inside a cgroup under `/`. Empty for no container. Rolling back the flag requires a reboot. (Default: \"\").")

fs.BoolVar(&s.CgroupsPerQOS, "cgroups-per-qos", s.CgroupsPerQOS, "Enable creation of QoS cgroup hierarchy, if true top level QoS and pod cgroups are created.")
fs.StringVar(&s.CgroupRoot, "cgroup-root", s.CgroupRoot, "Optional root cgroup to use for pods. This is handled by the container runtime on a best effort basis. Default: '', which means use the container runtime default.")
fs.StringVar(&s.ContainerRuntime, "container-runtime", s.ContainerRuntime, "The container runtime to use. Possible values: 'docker', 'rkt'. Default: 'docker'.")
fs.DurationVar(&s.RuntimeRequestTimeout.Duration, "runtime-request-timeout", s.RuntimeRequestTimeout.Duration, "Timeout of all runtime requests except long running request - pull, logs, exec and attach. When timeout exceeded, kubelet will cancel the request, throw out an error and retry later. Default: 2m0s")
Expand Down
7 changes: 6 additions & 1 deletion cmd/kubelet/app/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ func UnsecuredKubeletConfig(s *options.KubeletServer) (*KubeletConfig, error) {
EnableControllerAttachDetach: s.EnableControllerAttachDetach,
EnableCustomMetrics: s.EnableCustomMetrics,
EnableDebuggingHandlers: s.EnableDebuggingHandlers,
CgroupsPerQOS: s.CgroupsPerQOS,
EnableServer: s.EnableServer,
EventBurst: int(s.EventBurst),
EventRecordQPS: float32(s.EventRecordQPS),
Expand Down Expand Up @@ -363,12 +364,13 @@ func run(s *options.KubeletServer, kcfg *KubeletConfig) (err error) {
if kcfg.SystemCgroups != "" && kcfg.CgroupRoot == "" {
return fmt.Errorf("invalid configuration: system container was specified and cgroup root was not specified")
}

kcfg.ContainerManager, err = cm.NewContainerManager(kcfg.Mounter, kcfg.CAdvisorInterface, cm.NodeConfig{
RuntimeCgroupsName: kcfg.RuntimeCgroups,
SystemCgroupsName: kcfg.SystemCgroups,
KubeletCgroupsName: kcfg.KubeletCgroups,
ContainerRuntime: kcfg.ContainerRuntime,
CgroupsPerQOS: kcfg.CgroupsPerQOS,
CgroupRoot: kcfg.CgroupRoot,
})
if err != nil {
return err
Expand Down Expand Up @@ -575,6 +577,7 @@ func SimpleKubelet(client *clientset.Clientset,
EnableCustomMetrics: false,
EnableDebuggingHandlers: true,
EnableServer: true,
CgroupsPerQOS: false,
FileCheckFrequency: fileCheckFrequency,
// Since this kubelet runs with --configure-cbr0=false, it needs to use
// hairpin-veth to allow hairpin packets. Note that this deviates from
Expand Down Expand Up @@ -798,6 +801,7 @@ type KubeletConfig struct {
EnableControllerAttachDetach bool
EnableCustomMetrics bool
EnableDebuggingHandlers bool
CgroupsPerQOS bool
EnableServer bool
EventClient *clientset.Clientset
EventBurst int
Expand Down Expand Up @@ -929,6 +933,7 @@ func CreateAndInitKubelet(kc *KubeletConfig) (k KubeletBootstrap, pc *config.Pod
kc.NodeLabels,
kc.NodeStatusUpdateFrequency,
kc.OSInterface,
kc.CgroupsPerQOS,
kc.CgroupRoot,
kc.ContainerRuntime,
kc.RuntimeRequestTimeout,
Expand Down
8 changes: 8 additions & 0 deletions docs/devel/e2e-node-tests.md
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,14 @@ less useful for catching flakes related creating the instance from an image.**
make test-e2e-node REMOTE=true RUN_UNTIL_FAILURE=true
```

## Additional QoS Cgroups Hierarchy level testing

For testing with the QoS Cgroup Hierarchy enabled, you can pass --cgroups-per-qos flag as an argument into Ginkgo using TEST_ARGS

```sh
make test_e2e_node TEST_ARGS="--cgroups-per-qos=true"
```

# Notes on tests run by the Kubernetes project during pre-, post- submit.

The node e2e tests are run by the PR builder for each Pull Request and the results published at
Expand Down
6 changes: 4 additions & 2 deletions hack/make-rules/test-e2e-node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ cleanup=${CLEANUP:-"true"}
delete_instances=${DELETE_INSTANCES:-"false"}
run_until_failure=${RUN_UNTIL_FAILURE:-"false"}
list_images=${LIST_IMAGES:-"false"}
test_args=${TEST_ARGS:-""}

if [[ $list_images == "true" ]]; then
gcloud compute images list --project="${image_project}" | grep "e2e-node"
Expand Down Expand Up @@ -117,7 +118,7 @@ if [ $remote = true ] ; then
--hosts="$hosts" --images="$images" --cleanup="$cleanup" \
--results-dir="$artifacts" --ginkgo-flags="$ginkgoflags" \
--image-project="$image_project" --instance-name-prefix="$instance_prefix" --setup-node="true" \
--delete-instances="$delete_instances"
--delete-instances="$delete_instances" --test_args="$test_args"
exit $?

else
Expand All @@ -129,6 +130,7 @@ else
# Test using the host the script was run on
# Provided for backwards compatibility
"${ginkgo}" --focus=$focus --skip=$skip "${KUBE_ROOT}/test/e2e_node/" --report-dir=${report} \
-- --alsologtostderr --v 2 --node-name $(hostname) --disable-kubenet=true --build-services=true --start-services=true --stop-services=true
-- --alsologtostderr --v 2 --node-name $(hostname) --disable-kubenet=true --build-services=true \
--start-services=true --stop-services=true "$test_args"
exit $?
fi
1 change: 1 addition & 0 deletions hack/verify-flags/known-flags.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ build-tag
cadvisor-port
cert-dir
certificate-authority
cgroups-per-qos
cgroup-root
chaos-chance
clean-start
Expand Down