Merge pull request #39562 from dchen1107/automated-cherry-pick-of-#38836-#39114-#39059-upstream-release-1.5

Automated cherry pick of #38836 #39114 #39059 upstream release 1.5
saad-ali committed Jan 10, 2017
2 parents 8e68207 + e9ddb4b commit 2f67f8c
Showing 9 changed files with 164 additions and 13 deletions.
8 changes: 8 additions & 0 deletions cluster/saltbase/salt/kube-proxy/kube-proxy.manifest
@@ -38,6 +38,14 @@ kind: Pod
metadata:
name: kube-proxy
namespace: kube-system
# This annotation lowers the likelihood that kube-proxy gets evicted when the
# node is under memory pressure, and prioritizes it for admission while that
# pressure persists.
# Note that kube-proxy runs as a static pod, so this annotation has NO effect
# on the rescheduler (neither the default scheduler nor the rescheduler is
# involved in scheduling kube-proxy).
annotations:
scheduler.alpha.kubernetes.io/critical-pod: ''
labels:
tier: node
component: kube-proxy
2 changes: 2 additions & 0 deletions pkg/kubelet/eviction/BUILD
@@ -27,6 +27,7 @@ go_library(
"//pkg/kubelet/api/v1alpha1/stats:go_default_library",
"//pkg/kubelet/cm:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/pod:go_default_library",
"//pkg/kubelet/qos:go_default_library",
"//pkg/kubelet/server/stats:go_default_library",
"//pkg/kubelet/util/format:go_default_library",
@@ -53,6 +54,7 @@ go_test(
"//pkg/client/record:go_default_library",
"//pkg/kubelet/api/v1alpha1/stats:go_default_library",
"//pkg/kubelet/lifecycle:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/quota:go_default_library",
"//pkg/types:go_default_library",
"//pkg/util/clock:go_default_library",
13 changes: 12 additions & 1 deletion pkg/kubelet/eviction/eviction_manager.go
@@ -28,8 +28,10 @@ import (
"k8s.io/kubernetes/pkg/client/record"
"k8s.io/kubernetes/pkg/kubelet/cm"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
kubepod "k8s.io/kubernetes/pkg/kubelet/pod"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/server/stats"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/kubelet/util/format"
"k8s.io/kubernetes/pkg/util/clock"
"k8s.io/kubernetes/pkg/util/wait"
@@ -108,7 +110,7 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd
// the node has memory pressure, admit if not best-effort
if hasNodeCondition(m.nodeConditions, api.NodeMemoryPressure) {
notBestEffort := qos.BestEffort != qos.GetPodQOS(attrs.Pod)
if notBestEffort {
if notBestEffort || kubetypes.IsCriticalPod(attrs.Pod) {
return lifecycle.PodAdmitResult{Admit: true}
}
}
@@ -311,6 +313,15 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// we kill at most a single pod during each eviction interval
for i := range activePods {
pod := activePods[i]
if kubepod.IsStaticPod(pod) {
// The eviction manager doesn't evict static pods. To stop a static
// pod, the admin needs to remove the manifest from kubelet's
// --config directory.
// TODO(39124): This is a short-term fix; we can't assume static pods
// are always well behaved.
glog.Infof("eviction manager: NOT evicting static pod %v", pod.Name)
continue
}
status := api.PodStatus{
Phase: api.PodFailed,
Message: fmt.Sprintf(message, resourceToReclaim),
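The two hunks above change the eviction manager in two ways: under memory pressure, admission now lets critical pods through even when they are best-effort, and the eviction loop never selects a static pod as a victim. Below is a minimal, self-contained Go sketch of that decision logic; Pod and its IsBestEffort/IsCritical/IsStatic fields are simplified stand-ins for api.Pod, the qos package, kubetypes.IsCriticalPod, and kubepod.IsStaticPod, not the real kubelet types.

package main

import "fmt"

// Pod is a simplified stand-in for api.Pod: just the fields the two
// decisions above need.
type Pod struct {
	Name         string
	IsBestEffort bool // stand-in for qos.GetPodQOS(pod) == qos.BestEffort
	IsCritical   bool // stand-in for kubetypes.IsCriticalPod(pod)
	IsStatic     bool // stand-in for kubepod.IsStaticPod(pod)
}

// admitUnderMemoryPressure mirrors the updated Admit check: under memory
// pressure, a pod is admitted if it is not best-effort OR if it is critical.
func admitUnderMemoryPressure(p Pod) bool {
	return !p.IsBestEffort || p.IsCritical
}

// pickEvictionVictim mirrors the updated synchronize loop: walk the ranked
// active pods and return the first one that is not a static pod.
func pickEvictionVictim(ranked []Pod) (Pod, bool) {
	for _, p := range ranked {
		if p.IsStatic {
			// Static pods are skipped; removing their manifest from the
			// kubelet's --config directory is the only way to stop them.
			continue
		}
		return p, true
	}
	return Pod{}, false
}

func main() {
	pods := []Pod{
		{Name: "kube-proxy", IsBestEffort: true, IsCritical: true, IsStatic: true},
		{Name: "batch-job", IsBestEffort: true},
	}
	for _, p := range pods {
		fmt.Printf("%s admitted under pressure: %v\n", p.Name, admitUnderMemoryPressure(p))
	}
	if victim, ok := pickEvictionVictim(pods); ok {
		fmt.Println("eviction victim:", victim.Name) // batch-job, not kube-proxy
	}
}

In the real manager, the admission branch only runs when m.nodeConditions reports api.NodeMemoryPressure, and the candidate list is already ranked by the eviction strategy before the static-pod filter is applied.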
24 changes: 14 additions & 10 deletions pkg/kubelet/eviction/eviction_manager_test.go
@@ -25,6 +25,7 @@ import (
"k8s.io/kubernetes/pkg/client/record"
statsapi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/stats"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/types"
"k8s.io/kubernetes/pkg/util/clock"
)
@@ -210,6 +211,8 @@ func TestMemoryPressure(t *testing.T) {
// create a best effort pod to test admission
bestEffortPodToAdmit, _ := podMaker("best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
burstablePodToAdmit, _ := podMaker("burst-admit", newResourceList("100m", "100Mi"), newResourceList("200m", "200Mi"), "0Gi")
criticalBestEffortPodToAdmit, _ := podMaker("critical-best-admit", newResourceList("", ""), newResourceList("", ""), "0Gi")
criticalBestEffortPodToAdmit.ObjectMeta.Annotations = map[string]string{kubetypes.CriticalPodAnnotationKey: ""}

// synchronize
manager.synchronize(diskInfoProvider, activePodsFunc)
@@ -220,8 +223,8 @@
}

// try to admit our pods (they should succeed)
expected := []bool{true, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
expected := []bool{true, true, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
}
@@ -296,9 +299,10 @@
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
}

// the best-effort pod should not admit, burstable should
expected = []bool{false, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
// the best-effort pod without the critical annotation should not admit;
// burstable and critical pods should
expected = []bool{false, true, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
}
@@ -320,9 +324,9 @@
t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod.Name)
}

// the best-effort pod should not admit, burstable should
expected = []bool{false, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
// the best-effort pod should not admit, burstable and critical pods should
expected = []bool{false, true, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
}
@@ -345,8 +349,8 @@
}

// all pods should admit now
expected = []bool{true, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit} {
expected = []bool{true, true, true}
for i, pod := range []*api.Pod{bestEffortPodToAdmit, burstablePodToAdmit, criticalBestEffortPodToAdmit} {
if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: pod}); expected[i] != result.Admit {
t.Errorf("Admit pod: %v, expected: %v, actual: %v", pod, expected[i], result.Admit)
}
17 changes: 15 additions & 2 deletions pkg/kubelet/kubelet.go
@@ -1909,8 +1909,21 @@ func (kl *Kubelet) handleMirrorPod(mirrorPod *api.Pod, start time.Time) {
// a config source.
func (kl *Kubelet) HandlePodAdditions(pods []*api.Pod) {
start := kl.clock.Now()
sort.Sort(sliceutils.PodsByCreationTime(pods))
for _, pod := range pods {

// Pass critical pods through admission check first.
var criticalPods []*api.Pod
var nonCriticalPods []*api.Pod
for _, p := range pods {
if kubetypes.IsCriticalPod(p) {
criticalPods = append(criticalPods, p)
} else {
nonCriticalPods = append(nonCriticalPods, p)
}
}
sort.Sort(sliceutils.PodsByCreationTime(criticalPods))
sort.Sort(sliceutils.PodsByCreationTime(nonCriticalPods))

for _, pod := range append(criticalPods, nonCriticalPods...) {
existingPods := kl.podManager.GetPods()
// Always add the pod to the pod manager. Kubelet relies on the pod
// manager as the source of truth for the desired state. If a pod does
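The HandlePodAdditions change replaces the single creation-time sort with a partition: critical pods are sorted and handled first, then the non-critical ones, so a critical pod such as kube-proxy is admitted before ordinary pods can consume the node's remaining capacity. A hedged, standalone sketch of that ordering follows; it uses a trimmed-down pod struct and sort.Slice instead of api.Pod, unversioned.Time, and sliceutils.PodsByCreationTime.

package main

import (
	"fmt"
	"sort"
	"time"
)

// pod is a trimmed-down stand-in for api.Pod.
type pod struct {
	name     string
	critical bool
	created  time.Time
}

// orderForAdmission mirrors the new HandlePodAdditions ordering: split the
// incoming pods into critical and non-critical groups, sort each group by
// creation time, and process the critical group first.
func orderForAdmission(pods []pod) []pod {
	var critical, nonCritical []pod
	for _, p := range pods {
		if p.critical {
			critical = append(critical, p)
		} else {
			nonCritical = append(nonCritical, p)
		}
	}
	byCreation := func(ps []pod) {
		sort.Slice(ps, func(i, j int) bool { return ps[i].created.Before(ps[j].created) })
	}
	byCreation(critical)
	byCreation(nonCritical)
	return append(critical, nonCritical...)
}

func main() {
	t0 := time.Now()
	pods := []pod{
		{name: "newpod", critical: true, created: t0.Add(10 * time.Second)},
		{name: "oldpod", created: t0},
		{name: "middlepod", critical: true, created: t0.Add(time.Second)},
	}
	for _, p := range orderForAdmission(pods) {
		fmt.Println(p.name) // middlepod, newpod, oldpod
	}
}

With the pods from TestCriticalPrioritySorting below, this ordering yields middlepod, newpod, oldpod, which is why only middlepod fits within the node's 100-unit memory allocatable.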
62 changes: 62 additions & 0 deletions pkg/kubelet/kubelet_test.go
@@ -463,6 +463,68 @@ func TestHandlePortConflicts(t *testing.T) {
require.Equal(t, api.PodPending, status.Phase)
}

// Tests that we sort pods based on criticality.
func TestCriticalPrioritySorting(t *testing.T) {
testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
kl := testKubelet.kubelet
nodes := []api.Node{
{ObjectMeta: api.ObjectMeta{Name: testKubeletHostname},
Status: api.NodeStatus{Capacity: api.ResourceList{}, Allocatable: api.ResourceList{
api.ResourceCPU: *resource.NewMilliQuantity(10, resource.DecimalSI),
api.ResourceMemory: *resource.NewQuantity(100, resource.BinarySI),
api.ResourcePods: *resource.NewQuantity(40, resource.DecimalSI),
}}},
}
kl.nodeLister = testNodeLister{nodes: nodes}
kl.nodeInfo = testNodeInfo{nodes: nodes}
testKubelet.fakeCadvisor.On("MachineInfo").Return(&cadvisorapi.MachineInfo{}, nil)
testKubelet.fakeCadvisor.On("ImagesFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)
testKubelet.fakeCadvisor.On("RootFsInfo").Return(cadvisorapiv2.FsInfo{}, nil)

spec := api.PodSpec{NodeName: string(kl.nodeName),
Containers: []api.Container{{Resources: api.ResourceRequirements{
Requests: api.ResourceList{
"memory": resource.MustParse("90"),
},
}}}}
pods := []*api.Pod{
podWithUidNameNsSpec("000000000", "newpod", "foo", spec),
podWithUidNameNsSpec("987654321", "oldpod", "foo", spec),
podWithUidNameNsSpec("123456789", "middlepod", "foo", spec),
}

// Pods are not sorted by creation time.
startTime := time.Now()
pods[0].CreationTimestamp = unversioned.NewTime(startTime.Add(10 * time.Second))
pods[1].CreationTimestamp = unversioned.NewTime(startTime)
pods[2].CreationTimestamp = unversioned.NewTime(startTime.Add(1 * time.Second))

// Make the middle and new pods critical; the middle pod should win
// even though it comes later in the list.
critical := map[string]string{kubetypes.CriticalPodAnnotationKey: ""}
pods[0].Annotations = critical
pods[1].Annotations = map[string]string{}
pods[2].Annotations = critical

// The non-critical pod and the later-created critical pod should be rejected.
notfittingPods := []*api.Pod{pods[0], pods[1]}
fittingPod := pods[2]

kl.HandlePodAdditions(pods)
// Check pod status stored in the status map.
// notfittingPods should be Failed
for _, p := range notfittingPods {
status, found := kl.statusManager.GetPodStatus(p.UID)
require.True(t, found, "Status of pod %q is not found in the status map", p.UID)
require.Equal(t, api.PodFailed, status.Phase)
}

// fittingPod should be Pending
status, found := kl.statusManager.GetPodStatus(fittingPod.UID)
require.True(t, found, "Status of pod %q is not found in the status map", fittingPod.UID)
require.Equal(t, api.PodPending, status.Phase)
}

// Tests that we handle host name conflicts correctly by setting the failed status in status map.
func TestHandleHostNameConflicts(t *testing.T) {
testKubelet := newTestKubelet(t, false /* controllerAttachDetachEnabled */)
8 changes: 8 additions & 0 deletions pkg/kubelet/qos/policy.go
@@ -18,14 +18,18 @@ package qos

import (
"k8s.io/kubernetes/pkg/api"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)

const (
// PodInfraOOMAdj is very docker specific. For arbitrary runtime, it may not make
// sense to set sandbox level oom score, e.g. a sandbox could only be a namespace
// without a process.
// TODO: Handle infra container oom score adj in a runtime agnostic way.
// TODO: Should handle critical pod oom score adj with a proper preemption priority.
// This is the workaround for https://github.com/kubernetes/kubernetes/issues/38322.
PodInfraOOMAdj int = -998
CriticalPodOOMAdj int = -998
KubeletOOMScoreAdj int = -999
DockerOOMScoreAdj int = -999
KubeProxyOOMScoreAdj int = -999
@@ -40,6 +44,10 @@ const (
// and 1000. Containers with higher OOM scores are killed if the system runs out of memory.
// See https://lwn.net/Articles/391222/ for more information.
func GetContainerOOMScoreAdjust(pod *api.Pod, container *api.Container, memoryCapacity int64) int {
if kubetypes.IsCriticalPod(pod) {
return CriticalPodOOMAdj
}

switch GetPodQOS(pod) {
case Guaranteed:
// Guaranteed containers should be the last to get killed.
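The qos/policy.go change makes criticality take precedence over QoS when computing a container's OOM score adjustment: a pod carrying the critical annotation is pinned to CriticalPodOOMAdj (-998) before the usual QoS-based calculation runs. The sketch below illustrates that precedence only; the qosClass values and the burstable formula are rough stand-ins for the real pkg/kubelet/qos logic, which also clamps the burstable result to its valid range.

package main

import "fmt"

const (
	criticalPodOOMAdj = -998
	guaranteedOOMAdj  = -998
	bestEffortOOMAdj  = 1000
)

type qosClass int

const (
	guaranteed qosClass = iota
	burstable
	bestEffort
)

// containerOOMScoreAdjust mirrors the precedence introduced above: critical
// pods are pinned to -998 regardless of QoS; everything else falls back to a
// QoS-based value. The burstable formula is only a rough stand-in for the
// request-proportional calculation in pkg/kubelet/qos.
func containerOOMScoreAdjust(critical bool, class qosClass, memRequest, memCapacity int64) int {
	if critical {
		return criticalPodOOMAdj
	}
	switch class {
	case guaranteed:
		return guaranteedOOMAdj
	case bestEffort:
		return bestEffortOOMAdj
	default: // burstable: the more memory a pod requests, the safer (lower) its score
		return int(1000 - (1000*memRequest)/memCapacity)
	}
}

func main() {
	fmt.Println(containerOOMScoreAdjust(true, bestEffort, 0, 1000))   // -998, critical wins over QoS
	fmt.Println(containerOOMScoreAdjust(false, bestEffort, 0, 1000))  // 1000
	fmt.Println(containerOOMScoreAdjust(false, burstable, 500, 1000)) // 500
}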
26 changes: 26 additions & 0 deletions pkg/kubelet/qos/policy_test.go
@@ -22,6 +22,7 @@ import (

"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/resource"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)

const (
@@ -135,6 +136,25 @@ var (
},
},
}
criticalPodWithNoLimit = api.Pod{
ObjectMeta: api.ObjectMeta{
Annotations: map[string]string{
kubetypes.CriticalPodAnnotationKey: "",
},
},
Spec: api.PodSpec{
Containers: []api.Container{
{
Resources: api.ResourceRequirements{
Requests: api.ResourceList{
api.ResourceName(api.ResourceMemory): resource.MustParse(strconv.Itoa(standardMemoryAmount - 1)),
api.ResourceName(api.ResourceCPU): resource.MustParse("5m"),
},
},
},
},
},
}
)

type oomTest struct {
@@ -188,6 +208,12 @@ func TestGetContainerOOMScoreAdjust(t *testing.T) {
lowOOMScoreAdj: 2,
highOOMScoreAdj: 2,
},
{
pod: &criticalPodWithNoLimit,
memoryCapacity: standardMemoryAmount,
lowOOMScoreAdj: -998,
highOOMScoreAdj: -998,
},
}
for _, test := range oomTests {
oomScoreAdj := GetContainerOOMScoreAdjust(test.pod, &test.pod.Spec.Containers[0], test.memoryCapacity)
17 changes: 17 additions & 0 deletions pkg/kubelet/types/pod_update.go
@@ -27,6 +27,15 @@ const ConfigMirrorAnnotationKey = "kubernetes.io/config.mirror"
const ConfigFirstSeenAnnotationKey = "kubernetes.io/config.seen"
const ConfigHashAnnotationKey = "kubernetes.io/config.hash"

// This key needs to stay in sync with the key used by the rescheduler, which
// currently lives in contrib. Its presence indicates two things, as far as the
// kubelet is concerned:
// 1. Resource-related admission checks will prioritize the admission of
//    pods bearing the key over pods without the key, regardless of QoS.
// 2. The OOM score of pods bearing the key will be <= that of pods without
//    the key (where the <= part is determined by QoS).
const CriticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"

// PodOperation defines what changes will be made on a pod configuration.
type PodOperation int

@@ -131,3 +140,11 @@ func (sp SyncPodType) String() string {
return "unknown"
}
}

// IsCriticalPod returns true if the pod bears the critical pod annotation
// key. Both the rescheduler and the kubelet use this key to make admission
// and scheduling decisions.
func IsCriticalPod(pod *api.Pod) bool {
_, ok := pod.Annotations[CriticalPodAnnotationKey]
return ok
}
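For completeness, the new helper is purely an annotation-presence check: the value (the manifest above sets it to '') is ignored. A tiny standalone sketch of how a caller would consult it, using a plain annotations map instead of api.Pod:

package main

import "fmt"

// criticalPodAnnotationKey matches the constant added above; it must stay in
// sync with the key the rescheduler uses.
const criticalPodAnnotationKey = "scheduler.alpha.kubernetes.io/critical-pod"

// isCriticalPod mirrors IsCriticalPod: only the key's presence matters, its
// value is ignored.
func isCriticalPod(annotations map[string]string) bool {
	_, ok := annotations[criticalPodAnnotationKey]
	return ok
}

func main() {
	kubeProxy := map[string]string{criticalPodAnnotationKey: ""}
	batchPod := map[string]string{"app": "batch"}
	fmt.Println(isCriticalPod(kubeProxy)) // true
	fmt.Println(isCriticalPod(batchPod))  // false
}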
