Automated cherry pick of #32886 #32885 #32926 #32982

Merged
2 changes: 1 addition & 1 deletion Makefile
@@ -152,7 +152,7 @@ test-e2e: ginkgo generated_files
# DELETE_INSTANCES: For REMOTE=true only. Delete any instances created as
# part of this test run. Defaults to false.
# ARTIFACTS: For REMOTE=true only. Local directory to scp test artifacts into
# from the remote hosts. Defaults to ""/tmp/_artifacts".
# from the remote hosts. Defaults to "/tmp/_artifacts".
# REPORT: For REMOTE=false only. Local directory to write junit xml results
# to. Defaults to "/tmp/".
# CLEANUP: For REMOTE=true only. If false, do not stop processes or delete
4 changes: 2 additions & 2 deletions cluster/aws/templates/configure-vm-aws.sh
@@ -91,7 +91,7 @@ EOF
if [[ ! -z "${KUBELET_APISERVER:-}" ]] && [[ ! -z "${KUBELET_CERT:-}" ]] && [[ ! -z "${KUBELET_KEY:-}" ]]; then
cat <<EOF >>/etc/salt/minion.d/grains.conf
kubelet_api_servers: '${KUBELET_APISERVER}'
cbr-cidr: 10.123.45.0/30
cbr-cidr: 10.123.45.0/29
EOF
else
# If the kubelet is running disconnected from a master, give it a fixed
@@ -110,7 +110,7 @@ salt-node-role() {
grains:
roles:
- kubernetes-pool
cbr-cidr: 10.123.45.0/30
cbr-cidr: 10.123.45.0/29
cloud: aws
api_servers: '${API_SERVERS}'
EOF
4 changes: 2 additions & 2 deletions cluster/gce/configure-vm.sh
@@ -949,7 +949,7 @@ EOF
if [[ ! -z "${KUBELET_APISERVER:-}" ]] && [[ ! -z "${KUBELET_CERT:-}" ]] && [[ ! -z "${KUBELET_KEY:-}" ]]; then
cat <<EOF >>/etc/salt/minion.d/grains.conf
kubelet_api_servers: '${KUBELET_APISERVER}'
cbr-cidr: 10.123.45.0/30
cbr-cidr: 10.123.45.0/29
EOF
else
# If the kubelet is running disconnected from a master, give it a fixed
@@ -968,7 +968,7 @@ function salt-node-role() {
grains:
roles:
- kubernetes-pool
cbr-cidr: 10.123.45.0/30
cbr-cidr: 10.123.45.0/29
cloud: gce
api_servers: '${KUBERNETES_MASTER_NAME}'
EOF
5 changes: 4 additions & 1 deletion cluster/gce/gci/configure-helper.sh
@@ -480,7 +480,10 @@ function start-kubelet {
if [[ ! -z "${KUBELET_APISERVER:-}" && ! -z "${KUBELET_CERT:-}" && ! -z "${KUBELET_KEY:-}" ]]; then
flags+=" --api-servers=https://${KUBELET_APISERVER}"
flags+=" --register-schedulable=false"
flags+=" --pod-cidr=10.123.45.0/30"
# need at least a /29 pod cidr for now due to #32844
# TODO: determine if we still allow non-hostnetwork pods to run on master, clean up master pod setup
# WARNING: potential ip range collision with 10.123.45.0/29
flags+=" --pod-cidr=10.123.45.0/29"
reconcile_cidr="false"
else
flags+=" --pod-cidr=${MASTER_IP_RANGE}"
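A note on the /30 → /29 change above: a /30 covers only four addresses, which is too tight once the master runs more than a couple of non-hostnetwork pods (see #32844 referenced in the comment). The standalone Go sketch below is not part of the patch and only illustrates the arithmetic; `addressesIn` is an invented helper name.

```go
package main

import (
	"fmt"
	"net"
)

// addressesIn returns the total number of IPv4 addresses covered by a CIDR.
func addressesIn(cidr string) (int, error) {
	_, ipnet, err := net.ParseCIDR(cidr)
	if err != nil {
		return 0, err
	}
	ones, bits := ipnet.Mask.Size()
	return 1 << uint(bits-ones), nil
}

func main() {
	// A /30 covers only 4 addresses, while a /29 covers 8, leaving more headroom.
	for _, cidr := range []string{"10.123.45.0/30", "10.123.45.0/29"} {
		n, err := addressesIn(cidr)
		if err != nil {
			panic(err)
		}
		fmt.Printf("%-16s -> %d addresses\n", cidr, n)
	}
}
```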
2 changes: 1 addition & 1 deletion cluster/gce/trusty/configure-helper.sh
@@ -155,7 +155,7 @@ assemble_kubelet_flags() {
if [ ! -z "${KUBELET_APISERVER:-}" ] && \
[ ! -z "${KUBELET_CERT:-}" ] && \
[ ! -z "${KUBELET_KEY:-}" ]; then
KUBELET_CMD_FLAGS="${KUBELET_CMD_FLAGS} --api-servers=https://${KUBELET_APISERVER} --register-schedulable=false --reconcile-cidr=false --pod-cidr=10.123.45.0/30"
KUBELET_CMD_FLAGS="${KUBELET_CMD_FLAGS} --api-servers=https://${KUBELET_APISERVER} --register-schedulable=false --reconcile-cidr=false --pod-cidr=10.123.45.0/29"
else
KUBELET_CMD_FLAGS="${KUBELET_CMD_FLAGS} --pod-cidr=${MASTER_IP_RANGE}"
fi
1 change: 1 addition & 0 deletions docs/design/resource-qos.md
@@ -197,6 +197,7 @@ Pod OOM score configuration

*Pod infra containers* or *Special Pod init process*
- OOM_SCORE_ADJ: -998

*Kubelet, Docker*
- OOM_SCORE_ADJ: -999 (won’t be OOM killed)
- Hack, because these critical tasks might die if they conflict with guaranteed containers. In the future, we should place all user-pods into a separate cgroup, and set a limit on the memory they can consume.
2 changes: 1 addition & 1 deletion docs/proposals/kubelet-systemd.md
@@ -336,7 +336,7 @@ The `kubelet` will set the following:
The `kubelet` at bootstrapping will set the `oom_score_adj` value for Kubernetes
daemons, and any dependent container-runtime daemons.

If `container-runtime` is set to `docker`, then set its `oom_score_adj=-900`
If `container-runtime` is set to `docker`, then set its `oom_score_adj=-999`

## Implementation concerns

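For context on what "set its `oom_score_adj`" means in practice: on Linux the adjustment is applied by writing the value into the process's procfs entry, which is also what the `pkg/util/oom/oom_linux.go` change further down does. The sketch below is an illustration only, not code from this PR; `setOOMScoreAdj` is an invented name and the retry and error handling of the real implementation are omitted.

```go
package main

import (
	"fmt"
	"os"
	"strconv"
)

// setOOMScoreAdj applies an OOM score adjustment to a pid by writing the
// value to /proc/<pid>/oom_score_adj, the same mechanism the kubelet uses.
func setOOMScoreAdj(pid, value int) error {
	path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
	return os.WriteFile(path, []byte(strconv.Itoa(value)), 0700)
}

func main() {
	// Hypothetical usage: pin the current process at -999, the value this
	// PR uses for the docker daemon and the kubelet.
	if err := setOOMScoreAdj(os.Getpid(), -999); err != nil {
		fmt.Fprintln(os.Stderr, err)
	}
}
```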
8 changes: 8 additions & 0 deletions hack/verify-flags/exceptions.txt
@@ -100,6 +100,8 @@ pkg/kubelet/api/v1alpha1/runtime/api.pb.go: ContainerPort *int32 `protobuf:"vari
pkg/kubelet/api/v1alpha1/runtime/api.pb.go: OomScoreAdj *int64 `protobuf:"varint,5,opt,name=oom_score_adj,json=oomScoreAdj" json:"oom_score_adj,omitempty"`
pkg/kubelet/api/v1alpha1/runtime/api.proto: optional int32 container_port = 3;
pkg/kubelet/api/v1alpha1/runtime/api.proto: optional int64 oom_score_adj = 5;
pkg/kubelet/cm/container_manager_linux.go: glog.V(3).Infof("Failed to apply oom_score_adj %d for pid %d: %v", oomScoreAdj, pid, err)
pkg/kubelet/cm/container_manager_linux.go: glog.V(5).Infof("attempting to apply oom_score_adj of %d to pid %d", oomScoreAdj, pid)
pkg/kubelet/network/hairpin/hairpin.go: hairpinModeRelativePath = "hairpin_mode"
pkg/kubelet/qos/policy_test.go: t.Errorf("oom_score_adj should be between %d and %d, but was %d", test.lowOOMScoreAdj, test.highOOMScoreAdj, oomScoreAdj)
pkg/kubelet/qos/policy_test.go: highOOMScoreAdj int // The min oom_score_adj score the container should be assigned.
@@ -117,6 +119,12 @@ test/e2e/common/host_path.go: fmt.Sprintf("--retry_time=%d", retryDuration),
test/e2e/es_cluster_logging.go: framework.Failf("No cluster_name field in Elasticsearch response: %v", esResponse)
test/e2e/es_cluster_logging.go: // Check to see if have a cluster_name field.
test/e2e/es_cluster_logging.go: clusterName, ok := esResponse["cluster_name"]
test/e2e_node/container_manager_test.go: return fmt.Errorf("expected pid %d's oom_score_adj to be %d; found %d", pid, expectedOOMScoreAdj, oomScore)
test/e2e_node/container_manager_test.go: return fmt.Errorf("expected pid %d's oom_score_adj to be < %d; found %d", pid, expectedMaxOOMScoreAdj, oomScore)
test/e2e_node/container_manager_test.go: return fmt.Errorf("expected pid %d's oom_score_adj to be >= %d; found %d", pid, expectedMinOOMScoreAdj, oomScore)
test/e2e_node/container_manager_test.go: return fmt.Errorf("failed to get oom_score_adj for %d", pid)
test/e2e_node/container_manager_test.go: return fmt.Errorf("failed to get oom_score_adj for %d: %v", pid, err)
test/e2e_node/container_manager_test.go: procfsPath := path.Join("/proc", strconv.Itoa(pid), "oom_score_adj")
test/images/mount-tester/mt.go: flag.BoolVar(&breakOnExpectedContent, "break_on_expected_content", true, "Break out of loop on expected content, (use with --file_content_in_loop flag only)")
test/images/mount-tester/mt.go: flag.IntVar(&retryDuration, "retry_time", 180, "Retry time during the loop")
test/images/mount-tester/mt.go: flag.StringVar(&readFileContentInLoopPath, "file_content_in_loop", "", "Path to read the file content in loop from")
57 changes: 36 additions & 21 deletions pkg/kubelet/cm/container_manager_linux.go
@@ -326,6 +326,7 @@ func (cm *containerManagerImpl) setupNode() error {

systemContainers := []*systemContainer{}
if cm.ContainerRuntime == "docker" {
dockerVersion := getDockerVersion(cm.cadvisorInterface)
if cm.RuntimeCgroupsName != "" {
cont := newSystemCgroups(cm.RuntimeCgroupsName)
var capacity = api.ResourceList{}
@@ -351,13 +352,17 @@ func (cm *containerManagerImpl) setupNode() error {
},
},
}
dockerVersion := getDockerVersion(cm.cadvisorInterface)
cont.ensureStateFunc = func(manager *fs.Manager) error {
return ensureDockerInContainer(dockerVersion, -900, dockerContainer)
return ensureDockerInContainer(dockerVersion, qos.DockerOOMScoreAdj, dockerContainer)
}
systemContainers = append(systemContainers, cont)
} else {
cm.periodicTasks = append(cm.periodicTasks, func() {
glog.V(10).Infof("Adding docker daemon periodic tasks")
if err := ensureDockerInContainer(dockerVersion, qos.DockerOOMScoreAdj, nil); err != nil {
glog.Error(err)
return
}
cont, err := getContainerNameForProcess(dockerProcessName, dockerPidFile)
if err != nil {
glog.Error(err)
@@ -401,11 +406,15 @@ func (cm *containerManagerImpl) setupNode() error {
},
}
cont.ensureStateFunc = func(_ *fs.Manager) error {
return manager.Apply(os.Getpid())
return ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, &manager)
}
systemContainers = append(systemContainers, cont)
} else {
cm.periodicTasks = append(cm.periodicTasks, func() {
if err := ensureProcessInContainerWithOOMScore(os.Getpid(), qos.KubeletOOMScoreAdj, nil); err != nil {
glog.Error(err)
return
}
cont, err := getContainer(os.Getpid())
if err != nil {
glog.Errorf("failed to find cgroups of kubelet - %v", err)
@@ -515,16 +524,18 @@ func (cm *containerManagerImpl) SystemCgroupsLimit() api.ResourceList {
}

func isProcessRunningInHost(pid int) (bool, error) {
// Get init mount namespace. Mount namespace is unique for all containers.
initMntNs, err := os.Readlink("/proc/1/ns/mnt")
// Get init pid namespace.
initPidNs, err := os.Readlink("/proc/1/ns/pid")
if err != nil {
return false, fmt.Errorf("failed to find mount namespace of init process")
return false, fmt.Errorf("failed to find pid namespace of init process")
}
processMntNs, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/mnt", pid))
glog.V(10).Infof("init pid ns is %q", initPidNs)
processPidNs, err := os.Readlink(fmt.Sprintf("/proc/%d/ns/pid", pid))
if err != nil {
return false, fmt.Errorf("failed to find mount namespace of process %q", pid)
return false, fmt.Errorf("failed to find pid namespace of process %q", pid)
}
return initMntNs == processMntNs, nil
glog.V(10).Infof("Pid %d pid ns is %q", pid, processPidNs)
return initPidNs == processPidNs, nil
}

func getPidFromPidFile(pidFile string) (int, error) {
@@ -566,7 +577,6 @@ func ensureDockerInContainer(dockerVersion semver.Version, oomScoreAdj int, mana
if dockerVersion.GTE(containerdVersion) {
dockerProcs = append(dockerProcs, process{containerdProcessName, containerdPidFile})
}

var errs []error
for _, proc := range dockerProcs {
pids, err := getPidsForProcess(proc.name, proc.file)
@@ -577,40 +587,45 @@ func ensureDockerInContainer(dockerVersion semver.Version, oomScoreAdj int, mana

// Move if the pid is not already in the desired container.
for _, pid := range pids {
if err := ensureProcessInContainer(pid, oomScoreAdj, manager); err != nil {
if err := ensureProcessInContainerWithOOMScore(pid, oomScoreAdj, manager); err != nil {
errs = append(errs, fmt.Errorf("errors moving %q pid: %v", proc.name, err))
}
}
}
return utilerrors.NewAggregate(errs)
}

func ensureProcessInContainer(pid int, oomScoreAdj int, manager *fs.Manager) error {
func ensureProcessInContainerWithOOMScore(pid int, oomScoreAdj int, manager *fs.Manager) error {
if runningInHost, err := isProcessRunningInHost(pid); err != nil {
// Err on the side of caution. Avoid moving the docker daemon unless we are able to identify its context.
return err
} else if !runningInHost {
// Process is running inside a container. Don't touch that.
glog.V(2).Infof("pid %d is not running in the host namespaces", pid)
return nil
}

var errs []error
cont, err := getContainer(pid)
if err != nil {
errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
}

if cont != manager.Cgroups.Name {
err = manager.Apply(pid)
if manager != nil {
cont, err := getContainer(pid)
if err != nil {
errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q", pid, cont, manager.Cgroups.Name))
errs = append(errs, fmt.Errorf("failed to find container of PID %d: %v", pid, err))
}

if cont != manager.Cgroups.Name {
err = manager.Apply(pid)
if err != nil {
errs = append(errs, fmt.Errorf("failed to move PID %d (in %q) to %q: %v", pid, cont, manager.Cgroups.Name, err))
}
}
}

// Also apply oom-score-adj to processes
oomAdjuster := oom.NewOOMAdjuster()
glog.V(5).Infof("attempting to apply oom_score_adj of %d to pid %d", oomScoreAdj, pid)
if err := oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err != nil {
errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d", oomScoreAdj, pid))
glog.V(3).Infof("Failed to apply oom_score_adj %d for pid %d: %v", oomScoreAdj, pid, err)
errs = append(errs, fmt.Errorf("failed to apply oom score %d to PID %d: %v", oomScoreAdj, pid, err))
}
return utilerrors.NewAggregate(errs)
}
9 changes: 5 additions & 4 deletions pkg/kubelet/qos/policy.go
@@ -21,8 +21,9 @@ import (
)

const (
PodInfraOOMAdj int = -999
PodInfraOOMAdj int = -998
KubeletOOMScoreAdj int = -999
DockerOOMScoreAdj int = -999
KubeProxyOOMScoreAdj int = -999
guaranteedOOMScoreAdj int = -998
besteffortOOMScoreAdj int = 1000
@@ -53,10 +54,10 @@ func GetContainerOOMScoreAdjust(pod *api.Pod, container *api.Container, memoryCa
// Note that this is a heuristic, it won't work if a container has many small processes.
memoryRequest := container.Resources.Requests.Memory().Value()
oomScoreAdjust := 1000 - (1000*memoryRequest)/memoryCapacity
// A guaranteed pod using 100% of memory can have an OOM score of 1. Ensure
// A guaranteed pod using 100% of memory can have an OOM score of 10. Ensure
// that burstable pods have a higher OOM score adjustment.
if oomScoreAdjust < 2 {
return 2
if int(oomScoreAdjust) < (1000 + guaranteedOOMScoreAdj) {
return (1000 + guaranteedOOMScoreAdj)
}
// Give burstable pods a higher chance of survival over besteffort pods.
if int(oomScoreAdjust) == besteffortOOMScoreAdj {
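To make the clamping logic above concrete, here is a standalone sketch that reimplements the burstable heuristic with the constants from this file. `burstableOOMScoreAdj` is a hypothetical helper, not the real `GetContainerOOMScoreAdjust` signature, and it skips the guaranteed/best-effort classification that the real function performs first.

```go
package main

import "fmt"

// Constants mirroring pkg/kubelet/qos/policy.go after this change.
const (
	guaranteedOOMScoreAdj = -998
	besteffortOOMScoreAdj = 1000
)

// burstableOOMScoreAdj reproduces the heuristic: the larger the memory
// request relative to node capacity, the lower (safer) the adjustment,
// clamped to stay strictly between guaranteed and best-effort pods.
func burstableOOMScoreAdj(memoryRequest, memoryCapacity int64) int64 {
	adj := 1000 - (1000*memoryRequest)/memoryCapacity
	if adj < int64(1000+guaranteedOOMScoreAdj) { // floor: 1000 + (-998) = 2
		return int64(1000 + guaranteedOOMScoreAdj)
	}
	if adj == int64(besteffortOOMScoreAdj) { // stay below best-effort's 1000
		return adj - 1
	}
	return adj
}

func main() {
	// A pod requesting 1GiB on a 4GiB node: 1000 - 1000*1/4 = 750.
	fmt.Println(burstableOOMScoreAdj(1<<30, 4<<30))
}
```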
2 changes: 1 addition & 1 deletion pkg/registry/generic/registry/store.go
@@ -268,7 +268,7 @@ func (e *Store) Create(ctx api.Context, obj runtime.Object) (runtime.Object, err
if e.QualifiedResource.Resource != "replicationcontrollers" {
return nil, err
}
*msg = fmt.Sprintf("Note: if you are using \"kubectl rolling-update\" and your kubectl version is older than v1.4.0, your rolling-update has probably failed, though the pods are correctly updated. Please see https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG.md#kubectl-rolling-update for a workaround. : %s", *msg)
*msg = fmt.Sprintf("%s: if you're using \"kubectl rolling-update\" with kubectl version older than v1.4.0, your rolling update has failed, though the pods are correctly updated. Please see https://github.com/kubernetes/kubernetes/blob/master/CHANGELOG.md#kubectl-rolling-update for a workaround", *msg)
}
return nil, err
}
21 changes: 9 additions & 12 deletions pkg/util/oom/oom_linux.go
@@ -20,6 +20,7 @@ package oom

import (
"fmt"
"io/ioutil"
"os"
"path"
"strconv"
@@ -65,28 +66,24 @@ func applyOOMScoreAdj(pid int, oomScoreAdj int) error {
maxTries := 2
oomScoreAdjPath := path.Join("/proc", pidStr, "oom_score_adj")
value := strconv.Itoa(oomScoreAdj)
glog.V(4).Infof("attempting to set %q to %q", oomScoreAdjPath, value)
var err error
for i := 0; i < maxTries; i++ {
f, err := os.Open(oomScoreAdjPath)
err = ioutil.WriteFile(oomScoreAdjPath, []byte(value), 0700)
if err != nil {
if os.IsNotExist(err) {
glog.V(2).Infof("%q does not exist", oomScoreAdjPath)
return os.ErrNotExist
}
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
continue
}
if _, err := f.Write([]byte(value)); err != nil {
// we can ignore the return value of f.Close() here.
f.Close()
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
continue
}
if err = f.Close(); err != nil {
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)

glog.V(3).Info(err)
continue
}
return nil
}
if err != nil {
glog.V(2).Infof("failed to set %q to %q: %v", oomScoreAdjPath, value, err)
}
return err
}

Expand Down