Automated cherry pick of #38836 #39114 #39059 upstream release 1.5 #39562

Merged
8 changes: 8 additions & 0 deletions cluster/saltbase/salt/kube-proxy/kube-proxy.manifest
@@ -38,6 +38,14 @@ kind: Pod
metadata:
name: kube-proxy
namespace: kube-system
# This annotation lowers the chance that kube-proxy is evicted when the node
# is under memory pressure, and prioritizes it for admission even while the
# node is under memory pressure.
# Note that kube-proxy runs as a static pod, so this annotation does NOT have
# any effect on the rescheduler (neither the default scheduler nor the
# rescheduler is involved in scheduling kube-proxy).
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ''
labels:
tier: node
component: kube-proxy
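For context, this annotation is what the kubelet-side changes below key off of. A minimal, self-contained sketch of the check — the names here are illustrative stand-ins; the real helper, IsCriticalPod, is added in pkg/kubelet/types/pod_update.go further down in this diff:

```go
package main

import "fmt"

// Mirrors the constant introduced in pkg/kubelet/types/pod_update.go below;
// duplicated here only so the sketch is self-contained.
const criticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"

// isCritical is a stand-in for kubetypes.IsCriticalPod: only the presence of
// the annotation key matters, its value is ignored.
func isCritical(annotations map[string]string) bool {
	_, ok := annotations[criticalPodAnnotationKey]
	return ok
}

func main() {
	kubeProxy := map[string]string{criticalPodAnnotationKey: ""}
	fmt.Println(isCritical(kubeProxy)) // true
	fmt.Println(isCritical(nil))       // false
}
```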
2 changes: 2 additions & 0 deletions pkg/kubelet/eviction/BUILD
@@ -27,6 +27,7 @@ go_library(
"//pkg/kubelet/api/v1alpha1/stats:go_default_library",
"//pkg/kubelet/cm:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/pod:go_default_library",
"//pkg/kubelet/qos:go_default_library",
"//pkg/kubelet/server/stats:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
@@ -53,6 +54,7 @@ go_test(
"//pkg/client/record:go_default_library",
"//pkg/kubelet/api/v1alpha1/stats:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/quota:go_default_library",
"//pkg/types:go_default_library",
"//pkg/util/clock:go_default_library",
13 changes: 12 additions & 1 deletion pkg/kubelet/eviction/eviction_manager.go
@@ -28,8 +28,10 @@ import (
"k8s.io/kubernetes/pkg/client/record"
"k8s.io/kubernetes/pkg/kubelet/cm"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/server/stats"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/kubelet/util/format"
"k8s.io/kubernetes/pkg/util/clock"
"k8s.io/kubernetes/pkg/util/wait"
@@ -108,7 +110,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
// the node has memory pressure, admit if not best-effort
if hasNodeCondition(m.nodeConditions, api.NodeMemoryPressure) {
notBestEffort := qos.BestEffort != qos.GetPodQOS(attrs.Pod)
if notBestEffort {
if notBestEffort || kubetypes.IsCriticalPod(attrs.Pod) {
return lifecycle.PodAdmitResult{Admit: true}
}
}
@@ -311,6 +313,15 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// we kill at most a single pod during each eviction interval
for i := range activePods {
pod := activePods[i]
if kubepod.IsStaticPod(pod) {
// The eviction manager doesn't evict static pods. To stop a static
// pod, the admin needs to remove the manifest from kubelet's
// --config directory.
// TODO(39124): This is a short-term fix; we can't assume static pods
// are always well behaved.
glog.Infof("eviction manager: NOT evicting static pod %v", pod.Name)
continue
}
status := api.PodStatus{
Phase: api.PodFailed,
Message: fmt.Sprintf(message, resourceToReclaim),
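Taken together, the two hunks above change the eviction manager in two ways: admission under memory pressure now also lets critical pods through, and static pods are never selected as eviction victims. A rough sketch of the combined decision logic, using plain booleans instead of the kubelet's lifecycle/QoS types (illustrative only, not the real API):

```go
package main

import "fmt"

// admitUnderMemoryPressure mirrors the Admit change: under memory pressure,
// only best-effort pods are rejected, unless they carry the critical-pod
// annotation.
func admitUnderMemoryPressure(isBestEffort, isCritical bool) bool {
	return !isBestEffort || isCritical
}

// evictable mirrors the synchronize change: static pods are skipped entirely
// when picking a pod to evict.
func evictable(isStaticPod bool) bool {
	return !isStaticPod
}

func main() {
	fmt.Println(admitUnderMemoryPressure(true, false))  // false: plain best-effort pod is rejected
	fmt.Println(admitUnderMemoryPressure(true, true))   // true: critical best-effort pod is admitted
	fmt.Println(admitUnderMemoryPressure(false, false)) // true: burstable/guaranteed pods are admitted
	fmt.Println(evictable(true))                        // false: static pods are never evicted
}
```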
24 changes: 14 additions & 10 deletions pkg/kubelet/eviction/eviction_manager_test.go
@@ -25,6 +25,7 @@ import (
"k8s.io/kubernetes/pkg/client/record"
statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/types"
"k8s.io/kubernetes/pkg/util/clock"
)
@@ -210,6 +211,8 @@ func TestMemoryPressure(t *testing.T) {
// create a best effort pod to test admission
bestEffortPodToAdmit, _ := podMaker("best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi")
criticalBestEffortPodToAdmit, _ := podMaker("critical-best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
criticalBestEffortPodToAdmit.ObjectMeta.Annotations = map[string]string{kubetypes.CriticalPodAnnotationKey: ""}

// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc)
@@ -220,8 +223,8 @@
}

// try to admit our pods (they should succeed)
expected := []bool{true, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
expected := []bool{true, true, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
}
@@ -296,9 +299,10 @@
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}

// the best-effort pod should not admit, burstable should
expected = []bool{false, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
// the best-effort pod without critical annotation should not admit,
// burstable and critical pods should
expected = []bool{false, true, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
}
@@ -320,9 +324,9 @@
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
}

// the best-effort pod should not admit, burstable should
expected = []bool{false, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
// the best-effort pod should not admit, burstable and critical pods should
expected = []bool{false, true, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
}
@@ -345,8 +349,8 @@
}

// all pods should admit now
expected = []bool{true, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
expected = []bool{true, true, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
}
17 changes: 15 additions & 2 deletions pkg/kubelet/kubelet.go
@@ -1909,8 +1909,21 @@ func (kl *Kubelet) handleMirrorPod(mirrorPod *api.Pod, start time.Time) {
// a config source.
func (kl *Kubelet) HandlePodAdditions(pods []*api.Pod) {
start := kl.clock.Now()
sort.Sort(sliceutils.PodsByCreationTime(pods))
for _, pod := range pods {

// Pass critical pods through admission check first.
var criticalPods []*api.Pod
var nonCriticalPods []*api.Pod
for _, p := range pods {
if kubetypes.IsCriticalPod(p) {
criticalPods = append(criticalPods, p)
} else {
nonCriticalPods = append(nonCriticalPods, p)
}
}
sort.Sort(sliceutils.PodsByCreationTime(criticalPods))
sort.Sort(sliceutils.PodsByCreationTime(nonCriticalPods))

for _, pod := range append(criticalPods, nonCriticalPods...) {
existingPods := kl.podManager.GetPods()
// Always add the pod to the pod manager. Kubelet relies on the pod
// manager as the source of truth for the desired state. If a pod does
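The effect of the HandlePodAdditions change is purely about ordering: pods carrying the critical annotation go through admission before everything else, and each group keeps its creation-time order. A small stand-alone sketch of that partition-then-sort step, using a hypothetical fakePod type in place of *api.Pod (the real code sorts with sliceutils.PodsByCreationTime):

```go
package main

import (
	"fmt"
	"sort"
	"time"
)

// fakePod is an illustrative stand-in for *api.Pod.
type fakePod struct {
	name     string
	created  time.Time
	critical bool
}

// orderForAdmission mirrors the logic added above: split into critical and
// non-critical pods, sort each group by creation time, then admit critical
// pods first.
func orderForAdmission(pods []fakePod) []fakePod {
	var critical, rest []fakePod
	for _, p := range pods {
		if p.critical {
			critical = append(critical, p)
		} else {
			rest = append(rest, p)
		}
	}
	byCreation := func(s []fakePod) {
		sort.Slice(s, func(i, j int) bool { return s[i].created.Before(s[j].created) })
	}
	byCreation(critical)
	byCreation(rest)
	return append(critical, rest...)
}

func main() {
	t0 := time.Now()
	pods := []fakePod{
		{"newpod", t0.Add(10 * time.Second), true},
		{"oldpod", t0, false},
		{"middlepod", t0.Add(time.Second), true},
	}
	for _, p := range orderForAdmission(pods) {
		fmt.Print(p.name, " ") // middlepod newpod oldpod
	}
	fmt.Println()
}
```

This is the same ordering the TestCriticalPrioritySorting test below exercises: the critical pods are considered first, and within that group the older pod wins.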
62 changes: 62 additions & 0 deletions pkg/kubelet/kubelet_test.go
@@ -463,6 +463,68 @@ func TestHandlePortConflicts(t *testing.T) {
require.Equal(t, api.PodPending, status.Phase)
}

// Tests that we sort pods based on criticality.
func TestCriticalPrioritySorting(t *testing.T) {
testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
kl := testKubelet.kubelet
nodes := []api.Node{
{ObjectMeta: api.ObjectMeta{Name: testKubeletHostname},
Status: api.NodeStatus{Capacity: api.ResourceList{}, Allocatable: api.ResourceList{
api.ResourceCPU: *resource.NewMilliQuantity(10, resource.DecimalSI),
api.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI),
api.ResourcePods: *resource.NewQuantity(40, resource.DecimalSI),
}}},
}
kl.nodeLister = testNodeLister{nodes: nodes}
kl.nodeInfo = testNodeInfo{nodes: nodes}
testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorapi.MachineInfo{}, nil)
testKubelet.fakeCadvisor.On("ImagesFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)
testKubelet.fakeCadvisor.On("RootFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)

spec := api.PodSpec{NodeName: string(kl.nodeName),
Containers: []api.Container{{Resources: api.ResourceRequirements{
Requests: api.ResourceList{
"memory": resource.MustParse("90"),
},
}}}}
pods := []*api.Pod{
podWithUidNameNsSpec("000000000", "newpod", "foo", spec),
podWithUidNameNsSpec("987654321", "oldpod", "foo", spec),
podWithUidNameNsSpec("123456789", "middlepod", "foo", spec),
}

// The pods are deliberately not listed in creation-time order.
startTime := time.Now()
pods[0].CreationTimestamp = unversioned.NewTime(startTime.Add(10 * time.Second))
pods[1].CreationTimestamp = unversioned.NewTime(startTime)
pods[2].CreationTimestamp = unversioned.NewTime(startTime.Add(1 * time.Second))

// Make the middle and new pods critical; the middle pod should be admitted
// first even though it comes last in the list.
critical := map[string]string{kubetypes.CriticalPodAnnotationKey: ""}
pods[0].Annotations = critical
pods[1].Annotations = map[string]string{}
pods[2].Annotations = critical

// The remaining critical pod and the non-critical pod no longer fit and should be rejected
notfittingPods := []*api.Pod{pods[0], pods[1]}
fittingPod := pods[2]

kl.HandlePodAdditions(pods)
// Check pod status stored in the status map.
// notfittingPod should be Failed
for _, p := range notfittingPods {
status, found := kl.statusManager.GetPodStatus(p.UID)
require.True(t, found, "Status of pod %q is not found in the status map", p.UID)
require.Equal(t, api.PodFailed, status.Phase)
}

// fittingPod should be Pending
status, found := kl.statusManager.GetPodStatus(fittingPod.UID)
require.True(t, found, "Status of pod %q is not found in the status map", fittingPod.UID)
require.Equal(t, api.PodPending, status.Phase)
}

// Tests that we handle host name conflicts correctly by setting the failed status in status map.
func TestHandleHostNameConflicts(t *testing.T) {
testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
8 changes: 8 additions & 0 deletions pkg/kubelet/qos/policy.go
@@ -18,14 +18,18 @@ package qos

import (
"k8s.io/kubernetes/pkg/api"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)

const (
// PodInfraOOMAdj is very docker specific. For arbitrary runtime, it may not make
// sense to set sandbox level oom score, e.g. a sandbox could only be a namespace
// without a process.
// TODO: Handle infra container oom score adj in a runtime agnostic way.
// TODO: Should handle critical pod oom score adj with a proper preemption priority.
// This is the workaround for https://github.com/kubernetes/kubernetes/issues/38322.
PodInfraOOMAdj int = -998
CriticalPodOOMAdj int = -998
KubeletOOMScoreAdj int = -999
DockerOOMScoreAdj int = -999
KubeProxyOOMScoreAdj int = -999
@@ -40,6 +44,10 @@ const (
// and 1000. Containers with higher OOM scores are killed if the system runs out of memory.
// See https://lwn.net/Articles/391222/ for more information.
func GetContainerOOMScoreAdjust(pod *api.Pod, container *api.Container, memoryCapacity int64) int {
if kubetypes.IsCriticalPod(pod) {
return CriticalPodOOMAdj
}

switch GetPodQOS(pod) {
case Guaranteed:
// Guaranteed containers should be the last to get killed.
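In other words, containers of a critical pod now get the same OOM score adjustment as the pod infra container (-998), just above the kubelet and docker daemons (-999) and below every non-critical workload. A rough stand-alone sketch of the resulting decision, assuming guaranteed containers sit near the floor (-998) and best-effort containers at 1000 as in the kubelet's policy of that era; the burstable branch is a simplified placeholder, not the real computation:

```go
package main

import "fmt"

// oomScoreAdjust is an illustrative stand-in for qos.GetContainerOOMScoreAdjust
// after this change: the critical-pod check short-circuits before any
// QoS-based computation runs.
func oomScoreAdjust(critical bool, qosClass string) int {
	if critical {
		return -998 // CriticalPodOOMAdj
	}
	switch qosClass {
	case "Guaranteed":
		return -998 // guaranteed containers are the last workloads to be killed
	case "BestEffort":
		return 1000 // best-effort containers are the first to be killed
	default: // Burstable
		// Placeholder: the real kubelet scales this with the container's
		// memory request relative to node capacity.
		return 2
	}
}

func main() {
	fmt.Println(oomScoreAdjust(true, "BestEffort"))  // -998: critical wins regardless of QoS
	fmt.Println(oomScoreAdjust(false, "BestEffort")) // 1000
	fmt.Println(oomScoreAdjust(false, "Guaranteed")) // -998
}
```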
26 changes: 26 additions & 0 deletions pkg/kubelet/qos/policy_test.go
@@ -22,6 +22,7 @@ import (

"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/resource"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)

const (
@@ -135,6 +136,25 @@
},
},
}
criticalPodWithNoLimit = api.Pod{
ObjectMeta: api.ObjectMeta{
Annotations: map[string]string{
kubetypes.CriticalPodAnnotationKey: "",
},
},
Spec: api.PodSpec{
Containers: []api.Container{
{
Resources: api.ResourceRequirements{
Requests: api.ResourceList{
api.ResourceName(api.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount - 1)),
api.ResourceName(api.ResourceCPU): resource.MustParse("5m"),
},
},
},
},
},
}
)

type oomTest struct {
@@ -188,6 +208,12 @@ func TestGetContainerOOMScoreAdjust(t *testing.T) {
lowOOMScoreAdj: 2,
highOOMScoreAdj: 2,
},
{
pod: &criticalPodWithNoLimit,
memoryCapacity: standardMemoryAmount,
lowOOMScoreAdj: -998,
highOOMScoreAdj: -998,
},
}
for _, test := range oomTests {
oomScoreAdj := GetContainerOOMScoreAdjust(test.pod, &test.pod.Spec.Containers[0], test.memoryCapacity)
17 changes: 17 additions & 0 deletions pkg/kubelet/types/pod_update.go
@@ -27,6 +27,15 @@ const ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
const ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
const ConfigHashAnnotationKey = "kubernetes.io/config.hash"

// This key needs to stay in sync with the key used by the rescheduler, which
// currently lives in contrib. Its presence indicates two things, as far as the
// kubelet is concerned:
// 1. Resource-related admission checks will prioritize the admission of
//    pods bearing the key over pods without the key, regardless of QoS.
// 2. The OOM score of pods bearing the key will be <= that of pods without
//    the key (where the <= part is determined by QoS).
const CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"

// PodOperation defines what changes will be made on a pod configuration.
type PodOperation int

@@ -131,3 +140,11 @@ func (sp SyncPodType) String() string {
return "unknown"
}
}

// IsCriticalPod returns true if the pod bears the critical pod annotation
// key. Both the rescheduler and the kubelet use this key to make admission
// and scheduling decisions.
func IsCriticalPod(pod *api.Pod) bool {
_, ok := pod.Annotations[CriticalPodAnnotationKey]
return ok
}