From 20ccad27478997c667d0d5bf9200b07db20215c1 Mon Sep 17 00:00:00 2001 From: bprashanth Date: Thu, 15 Dec 2016 12:25:07 -0800 Subject: [PATCH 1/7] Kubelet admits critical pods even under memory pressure --- pkg/kubelet/eviction/BUILD | 2 ++ pkg/kubelet/eviction/eviction_manager.go | 3 ++- pkg/kubelet/eviction/eviction_manager_test.go | 24 +++++++++++-------- pkg/kubelet/pod/pod_manager.go | 9 +++++++ pkg/kubelet/types/pod_update.go | 9 +++++++ 5 files changed, 36 insertions(+), 11 deletions(-) diff --git a/pkg/kubelet/eviction/BUILD b/pkg/kubelet/eviction/BUILD index 0d3bf2663bf5..86863442de63 100644 --- a/pkg/kubelet/eviction/BUILD +++ b/pkg/kubelet/eviction/BUILD @@ -27,6 +27,7 @@ go_library( "//pkg/kubelet/api/v1alpha1/stats:go_default_library", "//pkg/kubelet/cm:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", + "//pkg/kubelet/pod:go_default_library", "//pkg/kubelet/qos:go_default_library", "//pkg/kubelet/server/stats:go_default_library", "//pkg/kubelet/util/format:go_default_library", @@ -53,6 +54,7 @@ go_test( "//pkg/client/record:go_default_library", "//pkg/kubelet/api/v1alpha1/stats:go_default_library", "//pkg/kubelet/lifecycle:go_default_library", + "//pkg/kubelet/types:go_default_library", "//pkg/quota:go_default_library", "//pkg/types:go_default_library", "//pkg/util/clock:go_default_library", diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index b261cfdf129a..b2abe8c591f6 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -28,6 +28,7 @@ import ( "k8s.io/kubernetes/pkg/client/record" "k8s.io/kubernetes/pkg/kubelet/cm" "k8s.io/kubernetes/pkg/kubelet/lifecycle" + kubepod "k8s.io/kubernetes/pkg/kubelet/pod" "k8s.io/kubernetes/pkg/kubelet/qos" "k8s.io/kubernetes/pkg/kubelet/server/stats" "k8s.io/kubernetes/pkg/kubelet/util/format" @@ -108,7 +109,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd // the node has memory pressure, admit if not best-effort if hasNodeCondition(m.nodeConditions, api.NodeMemoryPressure) { notBestEffort := qos.BestEffort != qos.GetPodQOS(attrs.Pod) - if notBestEffort { + if notBestEffort || kubepod.IsCriticalPod(attrs.Pod) { return lifecycle.PodAdmitResult{Admit: true} } } diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index eb40c82c043c..460d46db3261 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -25,6 +25,7 @@ import ( "k8s.io/kubernetes/pkg/client/record" statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats" "k8s.io/kubernetes/pkg/kubelet/lifecycle" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/types" "k8s.io/kubernetes/pkg/util/clock" ) @@ -210,6 +211,8 @@ func TestMemoryPressure(t *testing.T) { // create a best effort pod to test admission bestEffortPodToAdmit, _ := podMaker("best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi") burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi") + criticalBestEffortPodToAdmit, _ := podMaker("critical-best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi") + criticalBestEffortPodToAdmit.ObjectMeta.Annotations = map[string]string{kubetypes.CriticalPodAnnotationKey: ""} // synchronize manager.synchronize(diskInfoProvider, activePodsFunc) @@ -220,8 +223,8 @@ func TestMemoryPressure(t *testing.T) { } // try to 
admit our pods (they should succeed) - expected := []bool{true, true} - for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + expected := []bool{true, true, true} + for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} { if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) } @@ -296,9 +299,10 @@ func TestMemoryPressure(t *testing.T) { t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod) } - // the best-effort pod should not admit, burstable should - expected = []bool{false, true} - for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + // the best-effort pod without critical annotation should not admit, + // burstable and critical pods should + expected = []bool{false, true, true} + for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} { if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) } @@ -320,9 +324,9 @@ func TestMemoryPressure(t *testing.T) { t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name) } - // the best-effort pod should not admit, burstable should - expected = []bool{false, true} - for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + // the best-effort pod should not admit, burstable and critical pods should + expected = []bool{false, true, true} + for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} { if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) } @@ -345,8 +349,8 @@ func TestMemoryPressure(t *testing.T) { } // all pods should admit now - expected = []bool{true, true} - for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit} { + expected = []bool{true, true, true} + for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} { if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit { t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit) } diff --git a/pkg/kubelet/pod/pod_manager.go b/pkg/kubelet/pod/pod_manager.go index 712a37d867bc..874389fb1470 100644 --- a/pkg/kubelet/pod/pod_manager.go +++ b/pkg/kubelet/pod/pod_manager.go @@ -21,6 +21,7 @@ import ( "k8s.io/kubernetes/pkg/api" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/types" ) @@ -306,3 +307,11 @@ func (pm *basicManager) GetPodByMirrorPod(mirrorPod *api.Pod) (*api.Pod, bool) { pod, ok := pm.podByFullName[kubecontainer.GetPodFullName(mirrorPod)] return pod, ok } + +// IsCriticalPod returns true if the pod bears the critical pod annotation +// key. Both the rescheduler and the kubelet use this key to make admission +// and scheduling decisions. 
+func IsCriticalPod(pod *v1.Pod) bool {
+	_, ok := pod.Annotations[kubetypes.CriticalPodAnnotationKey]
+	return ok
+}
diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go
index e98489df6b1d..14ff8c645ed0 100644
--- a/pkg/kubelet/types/pod_update.go
+++ b/pkg/kubelet/types/pod_update.go
@@ -27,6 +27,15 @@ const ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
 const ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
 const ConfigHashAnnotationKey = "kubernetes.io/config.hash"
 
+// This key must stay in sync with the key used by the rescheduler, which
+// currently lives in contrib. Its presence indicates two things, as far as
+// the kubelet is concerned:
+// 1. Resource-related admission checks will prioritize the admission of
+//    pods bearing the key over pods without the key, regardless of QoS.
+// 2. The OOM score of pods bearing the key will be <= that of pods without
+//    the key (where the <= part is determined by QoS).
+const CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"
+
 // PodOperation defines what changes will be made on a pod configuration.
 type PodOperation int
 

From f5b6b17061df23bde11fbbc7f8bdbccf9db563cc Mon Sep 17 00:00:00 2001
From: bprashanth
Date: Thu, 15 Dec 2016 12:28:31 -0800
Subject: [PATCH 2/7] Make kube-proxy a critical pod

---
 cluster/saltbase/salt/kube-proxy/kube-proxy.manifest | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/cluster/saltbase/salt/kube-proxy/kube-proxy.manifest b/cluster/saltbase/salt/kube-proxy/kube-proxy.manifest
index 9b85a671e37c..0d0de575359f 100644
--- a/cluster/saltbase/salt/kube-proxy/kube-proxy.manifest
+++ b/cluster/saltbase/salt/kube-proxy/kube-proxy.manifest
@@ -38,6 +38,14 @@ kind: Pod
 metadata:
   name: kube-proxy
   namespace: kube-system
+  # This annotation lowers the chance that kube-proxy gets evicted when the
+  # node is under memory pressure, and prioritizes it for admission even
+  # while that pressure persists.
+  # Note that kube-proxy runs as a static pod, so this annotation has no
+  # effect on the rescheduler (neither the default scheduler nor the
+  # rescheduler is involved in scheduling kube-proxy).
+  annotations:
+    scheduler.alpha.kubernetes.io/critical-pod: ''
   labels:
     tier: node
     component: kube-proxy

From 97b6aa06f52c3b128088b02c7dfa68a198586373 Mon Sep 17 00:00:00 2001
From: bprashanth
Date: Thu, 15 Dec 2016 13:27:49 -0800
Subject: [PATCH 3/7] Sort critical pods before admission

---
 pkg/kubelet/kubelet.go      | 17 ++++++++--
 pkg/kubelet/kubelet_test.go | 62 +++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 2 deletions(-)

diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go
index 1703c731cd5d..4a052a530edb 100644
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -1909,8 +1909,21 @@ func (kl *Kubelet) handleMirrorPod(mirrorPod *api.Pod, start time.Time) {
 // a config source.
 func (kl *Kubelet) HandlePodAdditions(pods []*api.Pod) {
 	start := kl.clock.Now()
-	sort.Sort(sliceutils.PodsByCreationTime(pods))
-	for _, pod := range pods {
+
+	// Pass critical pods through admission check first.
+	var criticalPods []*v1.Pod
+	var nonCriticalPods []*v1.Pod
+	for _, p := range pods {
+		if kubepod.IsCriticalPod(p) {
+			criticalPods = append(criticalPods, p)
+		} else {
+			nonCriticalPods = append(nonCriticalPods, p)
+		}
+	}
+	sort.Sort(sliceutils.PodsByCreationTime(criticalPods))
+	sort.Sort(sliceutils.PodsByCreationTime(nonCriticalPods))
+
+	for _, pod := range append(criticalPods, nonCriticalPods...) 
{ existingPods := kl.podManager.GetPods() // Always add the pod to the pod manager. Kubelet relies on the pod // manager as the source of truth for the desired state. If a pod does diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index f854fcae5579..791697f9ee8e 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -463,6 +463,68 @@ func TestHandlePortConflicts(t *testing.T) { require.Equal(t, api.PodPending, status.Phase) } +// Tests that we sort pods based on criticality. +func TestCriticalPrioritySorting(t *testing.T) { + testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */) + kl := testKubelet.kubelet + nodes := []v1.Node{ + {ObjectMeta: v1.ObjectMeta{Name: testKubeletHostname}, + Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: v1.ResourceList{ + v1.ResourceCPU: *resource.NewMilliQuantity(10, resource.DecimalSI), + v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI), + v1.ResourcePods: *resource.NewQuantity(40, resource.DecimalSI), + }}}, + } + kl.nodeLister = testNodeLister{nodes: nodes} + kl.nodeInfo = testNodeInfo{nodes: nodes} + testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorapi.MachineInfo{}, nil) + testKubelet.fakeCadvisor.On("ImagesFsInfo").Return(cadvisorapiv2.FsInfo{}, nil) + testKubelet.fakeCadvisor.On("RootFsInfo").Return(cadvisorapiv2.FsInfo{}, nil) + + spec := v1.PodSpec{NodeName: string(kl.nodeName), + Containers: []v1.Container{{Resources: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "memory": resource.MustParse("90"), + }, + }}}} + pods := []*v1.Pod{ + podWithUidNameNsSpec("000000000", "newpod", "foo", spec), + podWithUidNameNsSpec("987654321", "oldpod", "foo", spec), + podWithUidNameNsSpec("123456789", "middlepod", "foo", spec), + } + + // Pods are not sorted by creation time. + startTime := time.Now() + pods[0].CreationTimestamp = metav1.NewTime(startTime.Add(10 * time.Second)) + pods[1].CreationTimestamp = metav1.NewTime(startTime) + pods[2].CreationTimestamp = metav1.NewTime(startTime.Add(1 * time.Second)) + + // Make the middle and new pod critical, the middle pod should win + // even though it comes later in the list + critical := map[string]string{kubetypes.CriticalPodAnnotationKey: ""} + pods[0].Annotations = critical + pods[1].Annotations = map[string]string{} + pods[2].Annotations = critical + + // The non-critical pod should be rejected + notfittingPods := []*v1.Pod{pods[0], pods[1]} + fittingPod := pods[2] + + kl.HandlePodAdditions(pods) + // Check pod status stored in the status map. + // notfittingPod should be Failed + for _, p := range notfittingPods { + status, found := kl.statusManager.GetPodStatus(p.UID) + require.True(t, found, "Status of pod %q is not found in the status map", p.UID) + require.Equal(t, v1.PodFailed, status.Phase) + } + + // fittingPod should be Pending + status, found := kl.statusManager.GetPodStatus(fittingPod.UID) + require.True(t, found, "Status of pod %q is not found in the status map", fittingPod.UID) + require.Equal(t, v1.PodPending, status.Phase) +} + // Tests that we handle host name conflicts correctly by setting the failed status in status map. func TestHandleHostNameConflicts(t *testing.T) { testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */) From d44165986d8618261110bd2c639c53cc99c0c93b Mon Sep 17 00:00:00 2001 From: Dawn Chen Date: Wed, 21 Dec 2016 11:23:41 -0800 Subject: [PATCH 4/7] assign -998 as the oom_score_adj for critical pods. 
--- pkg/kubelet/qos/policy.go | 10 +++++++++- pkg/kubelet/qos/policy_test.go | 26 ++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/pkg/kubelet/qos/policy.go b/pkg/kubelet/qos/policy.go index 7c142f5cd107..05af53071048 100644 --- a/pkg/kubelet/qos/policy.go +++ b/pkg/kubelet/qos/policy.go @@ -18,6 +18,7 @@ package qos import ( "k8s.io/kubernetes/pkg/api" + kubepod "k8s.io/kubernetes/pkg/kubelet/pod" ) const ( @@ -25,7 +26,10 @@ const ( // sense to set sandbox level oom score, e.g. a sandbox could only be a namespace // without a process. // TODO: Handle infra container oom score adj in a runtime agnostic way. + // TODO: Should handle critical pod oom score adj with a proper preemption priority. + // This is the workaround for https://github.com/kubernetes/kubernetes/issues/38322. PodInfraOOMAdj int = -998 + CriticalPodOOMAdj int = -998 KubeletOOMScoreAdj int = -999 DockerOOMScoreAdj int = -999 KubeProxyOOMScoreAdj int = -999 @@ -39,7 +43,11 @@ const ( // multiplied by 10 (barring exceptional cases) + a configurable quantity which is between -1000 // and 1000. Containers with higher OOM scores are killed if the system runs out of memory. // See https://lwn.net/Articles/391222/ for more information. -func GetContainerOOMScoreAdjust(pod *api.Pod, container *api.Container, memoryCapacity int64) int { +func GetContainerOOMScoreAdjust(pod *api.Pod, container *v1.Container, memoryCapacity int64) int { + if kubepod.IsCriticalPod(pod) { + return CriticalPodOOMAdj + } + switch GetPodQOS(pod) { case Guaranteed: // Guaranteed containers should be the last to get killed. diff --git a/pkg/kubelet/qos/policy_test.go b/pkg/kubelet/qos/policy_test.go index 50c9b163497f..3f0264d7ce3d 100644 --- a/pkg/kubelet/qos/policy_test.go +++ b/pkg/kubelet/qos/policy_test.go @@ -22,6 +22,7 @@ import ( "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/resource" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" ) const ( @@ -135,6 +136,25 @@ var ( }, }, } + criticalPodWithNoLimit = api.Pod{ + ObjectMeta: api.ObjectMeta{ + Annotations: map[string]string{ + kubetypes.CriticalPodAnnotationKey: "", + }, + }, + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Resources: api.ResourceRequirements{ + Requests: api.ResourceList{ + api.ResourceName(api.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount - 1)), + api.ResourceName(api.ResourceCPU): resource.MustParse("5m"), + }, + }, + }, + }, + }, + } ) type oomTest struct { @@ -188,6 +208,12 @@ func TestGetContainerOOMScoreAdjust(t *testing.T) { lowOOMScoreAdj: 2, highOOMScoreAdj: 2, }, + { + pod: &criticalPodWithNoLimit, + memoryCapacity: standardMemoryAmount, + lowOOMScoreAdj: -998, + highOOMScoreAdj: -998, + }, } for _, test := range oomTests { oomScoreAdj := GetContainerOOMScoreAdjust(test.pod, &test.pod.Spec.Containers[0], test.memoryCapacity) From 9c7015f510270698754de8ca2655194f7fb25784 Mon Sep 17 00:00:00 2001 From: Dawn Chen Date: Wed, 21 Dec 2016 16:04:23 -0800 Subject: [PATCH 5/7] Fixed an import cycle issue: import cycle not allowed in test package k8s.io/kubernetes/pkg/client/restclient (test) imports k8s.io/kubernetes/pkg/api/testapi imports k8s.io/kubernetes/pkg/apis/componentconfig/install imports k8s.io/kubernetes/pkg/apis/componentconfig/v1alpha1 imports k8s.io/kubernetes/pkg/kubelet/qos imports k8s.io/kubernetes/pkg/kubelet/pod imports k8s.io/kubernetes/pkg/client/clientset_generated/clientset imports k8s.io/kubernetes/pkg/client/clientset_generated/clientset/typed/apps/v1beta1 imports 
k8s.io/kubernetes/pkg/client/restclient --- pkg/kubelet/qos/policy.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/kubelet/qos/policy.go b/pkg/kubelet/qos/policy.go index 05af53071048..60e3617b6ddd 100644 --- a/pkg/kubelet/qos/policy.go +++ b/pkg/kubelet/qos/policy.go @@ -43,7 +43,7 @@ const ( // multiplied by 10 (barring exceptional cases) + a configurable quantity which is between -1000 // and 1000. Containers with higher OOM scores are killed if the system runs out of memory. // See https://lwn.net/Articles/391222/ for more information. -func GetContainerOOMScoreAdjust(pod *api.Pod, container *v1.Container, memoryCapacity int64) int { +func GetContainerOOMScoreAdjust(pod *api.Pod, container *api.Container, memoryCapacity int64) int { if kubepod.IsCriticalPod(pod) { return CriticalPodOOMAdj } From 07f1799c7f756a4fe1d34533a432937987631354 Mon Sep 17 00:00:00 2001 From: bprashanth Date: Tue, 20 Dec 2016 18:36:12 -0800 Subject: [PATCH 6/7] Don't evict static pods --- pkg/kubelet/eviction/eviction_manager.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index b2abe8c591f6..0fe60d3df7f8 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -312,6 +312,15 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act // we kill at most a single pod during each eviction interval for i := range activePods { pod := activePods[i] + if kubepod.IsStaticPod(pod) { + // The eviction manager doesn't evict static pods. To stop a static + // pod, the admin needs to remove the manifest from kubelet's + // --config directory. + // TODO(39124): This is a short term fix, we can't assume static pods + // are always well behaved. + glog.Infof("eviction manager: NOT evicting static pod %v", pod.Name) + continue + } status := api.PodStatus{ Phase: api.PodFailed, Message: fmt.Sprintf(message, resourceToReclaim), From e9ddb4b6615871ee1c5ee1f6669e3f7b9407127e Mon Sep 17 00:00:00 2001 From: Dawn Chen Date: Fri, 6 Jan 2017 18:00:00 -0800 Subject: [PATCH 7/7] Resolve the cherrypick conflict. 
--- pkg/kubelet/eviction/eviction_manager.go | 3 ++- pkg/kubelet/kubelet.go | 6 ++--- pkg/kubelet/kubelet_test.go | 32 ++++++++++++------------ pkg/kubelet/pod/pod_manager.go | 9 ------- pkg/kubelet/qos/policy.go | 4 +-- pkg/kubelet/types/pod_update.go | 8 ++++++ 6 files changed, 31 insertions(+), 31 deletions(-) diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 0fe60d3df7f8..dfcc2d411e39 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -31,6 +31,7 @@ import ( kubepod "k8s.io/kubernetes/pkg/kubelet/pod" "k8s.io/kubernetes/pkg/kubelet/qos" "k8s.io/kubernetes/pkg/kubelet/server/stats" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/kubelet/util/format" "k8s.io/kubernetes/pkg/util/clock" "k8s.io/kubernetes/pkg/util/wait" @@ -109,7 +110,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd // the node has memory pressure, admit if not best-effort if hasNodeCondition(m.nodeConditions, api.NodeMemoryPressure) { notBestEffort := qos.BestEffort != qos.GetPodQOS(attrs.Pod) - if notBestEffort || kubepod.IsCriticalPod(attrs.Pod) { + if notBestEffort || kubetypes.IsCriticalPod(attrs.Pod) { return lifecycle.PodAdmitResult{Admit: true} } } diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 4a052a530edb..9c9bcf651a1f 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1911,10 +1911,10 @@ func (kl *Kubelet) HandlePodAdditions(pods []*api.Pod) { start := kl.clock.Now() // Pass critical pods through admission check first. - var criticalPods []*v1.Pod - var nonCriticalPods []*v1.Pod + var criticalPods []*api.Pod + var nonCriticalPods []*api.Pod for _, p := range pods { - if kubepod.IsCriticalPod(p) { + if kubetypes.IsCriticalPod(p) { criticalPods = append(criticalPods, p) } else { nonCriticalPods = append(nonCriticalPods, p) diff --git a/pkg/kubelet/kubelet_test.go b/pkg/kubelet/kubelet_test.go index 791697f9ee8e..59674e5593c6 100644 --- a/pkg/kubelet/kubelet_test.go +++ b/pkg/kubelet/kubelet_test.go @@ -467,12 +467,12 @@ func TestHandlePortConflicts(t *testing.T) { func TestCriticalPrioritySorting(t *testing.T) { testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */) kl := testKubelet.kubelet - nodes := []v1.Node{ - {ObjectMeta: v1.ObjectMeta{Name: testKubeletHostname}, - Status: v1.NodeStatus{Capacity: v1.ResourceList{}, Allocatable: v1.ResourceList{ - v1.ResourceCPU: *resource.NewMilliQuantity(10, resource.DecimalSI), - v1.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI), - v1.ResourcePods: *resource.NewQuantity(40, resource.DecimalSI), + nodes := []api.Node{ + {ObjectMeta: api.ObjectMeta{Name: testKubeletHostname}, + Status: api.NodeStatus{Capacity: api.ResourceList{}, Allocatable: api.ResourceList{ + api.ResourceCPU: *resource.NewMilliQuantity(10, resource.DecimalSI), + api.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI), + api.ResourcePods: *resource.NewQuantity(40, resource.DecimalSI), }}}, } kl.nodeLister = testNodeLister{nodes: nodes} @@ -481,13 +481,13 @@ func TestCriticalPrioritySorting(t *testing.T) { testKubelet.fakeCadvisor.On("ImagesFsInfo").Return(cadvisorapiv2.FsInfo{}, nil) testKubelet.fakeCadvisor.On("RootFsInfo").Return(cadvisorapiv2.FsInfo{}, nil) - spec := v1.PodSpec{NodeName: string(kl.nodeName), - Containers: []v1.Container{{Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{ + spec := api.PodSpec{NodeName: string(kl.nodeName), + 
Containers: []api.Container{{Resources: api.ResourceRequirements{ + Requests: api.ResourceList{ "memory": resource.MustParse("90"), }, }}}} - pods := []*v1.Pod{ + pods := []*api.Pod{ podWithUidNameNsSpec("000000000", "newpod", "foo", spec), podWithUidNameNsSpec("987654321", "oldpod", "foo", spec), podWithUidNameNsSpec("123456789", "middlepod", "foo", spec), @@ -495,9 +495,9 @@ func TestCriticalPrioritySorting(t *testing.T) { // Pods are not sorted by creation time. startTime := time.Now() - pods[0].CreationTimestamp = metav1.NewTime(startTime.Add(10 * time.Second)) - pods[1].CreationTimestamp = metav1.NewTime(startTime) - pods[2].CreationTimestamp = metav1.NewTime(startTime.Add(1 * time.Second)) + pods[0].CreationTimestamp = unversioned.NewTime(startTime.Add(10 * time.Second)) + pods[1].CreationTimestamp = unversioned.NewTime(startTime) + pods[2].CreationTimestamp = unversioned.NewTime(startTime.Add(1 * time.Second)) // Make the middle and new pod critical, the middle pod should win // even though it comes later in the list @@ -507,7 +507,7 @@ func TestCriticalPrioritySorting(t *testing.T) { pods[2].Annotations = critical // The non-critical pod should be rejected - notfittingPods := []*v1.Pod{pods[0], pods[1]} + notfittingPods := []*api.Pod{pods[0], pods[1]} fittingPod := pods[2] kl.HandlePodAdditions(pods) @@ -516,13 +516,13 @@ func TestCriticalPrioritySorting(t *testing.T) { for _, p := range notfittingPods { status, found := kl.statusManager.GetPodStatus(p.UID) require.True(t, found, "Status of pod %q is not found in the status map", p.UID) - require.Equal(t, v1.PodFailed, status.Phase) + require.Equal(t, api.PodFailed, status.Phase) } // fittingPod should be Pending status, found := kl.statusManager.GetPodStatus(fittingPod.UID) require.True(t, found, "Status of pod %q is not found in the status map", fittingPod.UID) - require.Equal(t, v1.PodPending, status.Phase) + require.Equal(t, api.PodPending, status.Phase) } // Tests that we handle host name conflicts correctly by setting the failed status in status map. diff --git a/pkg/kubelet/pod/pod_manager.go b/pkg/kubelet/pod/pod_manager.go index 874389fb1470..712a37d867bc 100644 --- a/pkg/kubelet/pod/pod_manager.go +++ b/pkg/kubelet/pod/pod_manager.go @@ -21,7 +21,6 @@ import ( "k8s.io/kubernetes/pkg/api" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" - kubetypes "k8s.io/kubernetes/pkg/kubelet/types" "k8s.io/kubernetes/pkg/types" ) @@ -307,11 +306,3 @@ func (pm *basicManager) GetPodByMirrorPod(mirrorPod *api.Pod) (*api.Pod, bool) { pod, ok := pm.podByFullName[kubecontainer.GetPodFullName(mirrorPod)] return pod, ok } - -// IsCriticalPod returns true if the pod bears the critical pod annotation -// key. Both the rescheduler and the kubelet use this key to make admission -// and scheduling decisions. -func IsCriticalPod(pod *v1.Pod) bool { - _, ok := pod.Annotations[kubetypes.CriticalPodAnnotationKey] - return ok -} diff --git a/pkg/kubelet/qos/policy.go b/pkg/kubelet/qos/policy.go index 60e3617b6ddd..818268a41b7b 100644 --- a/pkg/kubelet/qos/policy.go +++ b/pkg/kubelet/qos/policy.go @@ -18,7 +18,7 @@ package qos import ( "k8s.io/kubernetes/pkg/api" - kubepod "k8s.io/kubernetes/pkg/kubelet/pod" + kubetypes "k8s.io/kubernetes/pkg/kubelet/types" ) const ( @@ -44,7 +44,7 @@ const ( // and 1000. Containers with higher OOM scores are killed if the system runs out of memory. // See https://lwn.net/Articles/391222/ for more information. 
func GetContainerOOMScoreAdjust(pod *api.Pod, container *api.Container, memoryCapacity int64) int { - if kubepod.IsCriticalPod(pod) { + if kubetypes.IsCriticalPod(pod) { return CriticalPodOOMAdj } diff --git a/pkg/kubelet/types/pod_update.go b/pkg/kubelet/types/pod_update.go index 14ff8c645ed0..229eefd0a451 100644 --- a/pkg/kubelet/types/pod_update.go +++ b/pkg/kubelet/types/pod_update.go @@ -140,3 +140,11 @@ func (sp SyncPodType) String() string { return "unknown" } } + +// IsCriticalPod returns true if the pod bears the critical pod annotation +// key. Both the rescheduler and the kubelet use this key to make admission +// and scheduling decisions. +func IsCriticalPod(pod *api.Pod) bool { + _, ok := pod.Annotations[CriticalPodAnnotationKey] + return ok +}
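
For reference, the behavior the series converges on can be exercised with a short Go sketch. This is not part of the patches; it assumes a tree with the series applied, where IsCriticalPod and CriticalPodAnnotationKey live in pkg/kubelet/types and GetContainerOOMScoreAdjust in pkg/kubelet/qos takes the internal api types shown above:

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api"
	"k8s.io/kubernetes/pkg/kubelet/qos"
	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)

func main() {
	// A best-effort pod (no requests or limits) carrying the critical-pod
	// annotation, roughly what the kube-proxy static pod manifest produces.
	pod := &api.Pod{
		ObjectMeta: api.ObjectMeta{
			Name:        "kube-proxy",
			Namespace:   "kube-system",
			Annotations: map[string]string{kubetypes.CriticalPodAnnotationKey: ""},
		},
		Spec: api.PodSpec{
			Containers: []api.Container{{Name: "kube-proxy"}},
		},
	}

	// The helper only checks for the presence of the annotation key.
	fmt.Println(kubetypes.IsCriticalPod(pod)) // true

	// With the annotation, the container is assigned CriticalPodOOMAdj (-998)
	// regardless of QoS class; without it, a best-effort container would get
	// the best-effort score (1000).
	fmt.Println(qos.GetContainerOOMScoreAdjust(pod, &pod.Spec.Containers[0], 8*1024*1024*1024))
}

The same annotation is what the eviction manager's Admit path checks under memory pressure, so a pod like this is admitted even though it is best-effort.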