generated from kubernetes/kubernetes-template-project
/
filter.go
190 lines (160 loc) · 7.19 KB
/
filter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
/*
Copyright 2021 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package noderesourcetopology
import (
"context"
"fmt"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/klog/v2"
v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper"
v1qos "k8s.io/kubernetes/pkg/apis/core/v1/helper/qos"
bm "k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
"k8s.io/kubernetes/pkg/scheduler/framework"
topologyv1alpha1 "github.com/k8stopologyawareschedwg/noderesourcetopology-api/pkg/apis/topology/v1alpha1"
"sigs.k8s.io/scheduler-plugins/pkg/util"
)
// PolicyHandler evaluates whether a pod can be aligned given a node's
// topology zone list, returning a non-nil Status when alignment fails.
// NOTE(review): the handlers invoked from Filter() take an extra
// *framework.NodeInfo argument — confirm this type is still referenced.
type PolicyHandler func(pod *v1.Pod, zoneMap topologyv1alpha1.ZoneList) *framework.Status
// singleNUMAContainerLevelHandler checks, container by container, whether every
// container's resource requests can be satisfied by at least one NUMA node on
// the candidate node. It returns an Unschedulable status naming the first
// container that cannot be aligned, or nil when all containers fit.
func singleNUMAContainerLevelHandler(pod *v1.Pod, zones topologyv1alpha1.ZoneList, nodeInfo *framework.NodeInfo) *framework.Status {
	klog.V(5).InfoS("Single NUMA node handler")

	// prepare NUMANodes list from zoneMap
	nodes := createNUMANodeList(zones)
	qos := v1qos.GetPodQOS(pod)

	// Node() != nil already verified in Filter(), which is the only public entry point
	logNumaNodes("container handler NUMA resources", nodeInfo.Node().Name, nodes)

	// We count here in the way TopologyManager is doing it, IOW we put InitContainers
	// and normal containers in the one scope.
	// Build a fresh slice rather than append(pod.Spec.InitContainers, pod.Spec.Containers...):
	// append may write into spare capacity of the InitContainers backing array,
	// mutating the shared pod object as a side effect.
	containers := make([]v1.Container, 0, len(pod.Spec.InitContainers)+len(pod.Spec.Containers))
	containers = append(containers, pod.Spec.InitContainers...)
	containers = append(containers, pod.Spec.Containers...)
	for _, container := range containers {
		logKey := fmt.Sprintf("%s/%s/%s", pod.Namespace, pod.Name, container.Name)
		klog.V(6).InfoS("target resources", resourceListToLoggable(logKey, container.Resources.Requests)...)

		if !resourcesAvailableInAnyNUMANodes(logKey, nodes, container.Resources.Requests, qos, nodeInfo) {
			// definitely we can't align container, so we can't align a pod
			return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("cannot align container: %s", container.Name))
		}
	}
	return nil
}
// resourcesAvailableInAnyNUMANodes checks for sufficient resource, this function
// requires NUMANodeList with properly populated NUMANode, NUMAID should be in range 0-63
// (the bitmask has one bit per NUMA node id).
//
// The algorithm: start with all bits set, and for each requested resource AND
// in the set of NUMA nodes that can satisfy it. The result is non-empty iff
// at least one NUMA node can satisfy every requested resource simultaneously.
func resourcesAvailableInAnyNUMANodes(logKey string, numaNodes NUMANodeList, resources v1.ResourceList, qos v1.PodQOSClass, nodeInfo *framework.NodeInfo) bool {
	bitmask := bm.NewEmptyBitMask()
	// set all bits, each bit is a NUMA node, if resources couldn't be aligned
	// on the NUMA node, bit should be unset
	bitmask.Fill()

	// Node() != nil already verified in Filter(), which is the only public entry point
	nodeName := nodeInfo.Node().Name
	nodeResources := util.ResourceList(nodeInfo.Allocatable)

	for resource, quantity := range resources {
		if quantity.IsZero() {
			// why bother? everything's fine from the perspective of this resource
			klog.V(4).InfoS("ignoring zero-qty resource request", "logKey", logKey, "node", nodeName, "resource", resource)
			continue
		}
		if !isNodeSuitable(nodeResources, resource, quantity) {
			// some resources may not expose NUMA affinity (device plugins, extended resources), but all resources
			// must be reported at node level; thus, if they are not present at node level, we can safely assume
			// we don't have the resource at all.
			klog.V(5).InfoS("short circuit", "logKey", logKey, "node", nodeName, "resource", resource)
			// clear the mask so the final verdict below is "not suitable"
			bitmask.Clear()
			break
		}

		// for each requested resource, calculate which NUMA slots are good fits, and then AND with the aggregated bitmask, IOW unset appropriate bit if we can't align resources, or set it
		// obvious, bits which are not in the NUMA id's range would be unset
		resourceBitmask := bm.NewEmptyBitMask()
		for _, numaNode := range numaNodes {
			if !isNUMANodeSuitable(qos, numaNode.Resources, resource, quantity) {
				continue
			}

			resourceBitmask.Add(numaNode.NUMAID)
			klog.V(6).InfoS("feasible", "logKey", logKey, "node", nodeName, "NUMA", numaNode.NUMAID, "resource", resource)
		}
		// narrow the candidate set to nodes that also satisfy this resource
		bitmask.And(resourceBitmask)
		if bitmask.IsEmpty() {
			// no NUMA node satisfies all resources seen so far; stop early
			klog.V(5).InfoS("short circuit", "logKey", logKey, "node", nodeName, "resource", resource)
			bitmask.Clear()
			break
		}
	}
	// any surviving bit means at least one NUMA node fits everything
	ret := !bitmask.IsEmpty()
	klog.V(5).InfoS("final verdict", "logKey", logKey, "node", nodeName, "suitable", ret)
	return ret
}
// isNodeSuitable reports whether the node-level resource list exposes the
// requested resource at all, and in at least the requested quantity.
func isNodeSuitable(nodeResources v1.ResourceList, resource v1.ResourceName, quantity resource.Quantity) bool {
	available, found := nodeResources[resource]
	if !found {
		// the node does not report this resource at all
		return false
	}
	return available.Cmp(quantity) >= 0
}
// isNUMANodeSuitable reports whether a single NUMA zone can host the requested
// quantity of a resource, given the pod's QoS class.
func isNUMANodeSuitable(qos v1.PodQOSClass, numaResources v1.ResourceList, resource v1.ResourceName, quantity resource.Quantity) bool {
	numaQuantity, found := numaResources[resource]
	if !found {
		// resource not reported on this NUMA zone at all
		return false
	}

	// For non-guaranteed pods, CPU, memory and hugepages are accepted on any
	// zone that exposes them, regardless of the amount available there.
	if qos != v1.PodQOSGuaranteed {
		switch {
		case resource == v1.ResourceMemory:
			return true
		case v1helper.IsHugePageResourceName(resource):
			return true
		case resource == v1.ResourceCPU:
			return true
		}
	}

	// Otherwise the zone must actually hold enough of the resource.
	return numaQuantity.Cmp(quantity) >= 0
}
// singleNUMAPodLevelHandler checks whether the pod's aggregate effective
// resource requests can be satisfied by at least one NUMA node on the
// candidate node. It returns an Unschedulable status when no NUMA node fits,
// or nil when the pod can be aligned.
func singleNUMAPodLevelHandler(pod *v1.Pod, zones topologyv1alpha1.ZoneList, nodeInfo *framework.NodeInfo) *framework.Status {
	klog.V(5).InfoS("Pod Level Resource handler")

	resources := util.GetPodEffectiveRequest(pod)
	logKey := fmt.Sprintf("%s/%s", pod.Namespace, pod.Name)

	// prepare NUMANodes list from zoneMap once and reuse it below
	// (the previous code rebuilt the list a second time for the check).
	nodes := createNUMANodeList(zones)

	// Node() != nil already verified in Filter(), which is the only public entry point
	logNumaNodes("pod handler NUMA resources", nodeInfo.Node().Name, nodes)
	klog.V(6).InfoS("target resources", resourceListToLoggable(logKey, resources)...)

	if !resourcesAvailableInAnyNUMANodes(logKey, nodes, resources, v1qos.GetPodQOS(pod), nodeInfo) {
		// definitely we can't align container, so we can't align a pod
		return framework.NewStatus(framework.Unschedulable, fmt.Sprintf("cannot align pod: %s", pod.Name))
	}
	return nil
}
// Filter Now only single-numa-node supported
func (tm *TopologyMatch) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
	if nodeInfo.Node() == nil {
		return framework.NewStatus(framework.Error, "node not found")
	}
	// Best-effort pods are skipped entirely.
	if v1qos.GetPodQOS(pod) == v1.PodQOSBestEffort {
		return nil
	}

	nodeName := nodeInfo.Node().Name
	nodeTopology := findNodeTopology(nodeName, tm.lister)
	if nodeTopology == nil {
		// no topology object for this node: nothing to filter on
		return nil
	}

	klog.V(5).InfoS("Found NodeResourceTopology", "nodeTopology", klog.KObj(nodeTopology))
	for _, policyName := range nodeTopology.TopologyPolicies {
		handler, ok := tm.policyHandlers[topologyv1alpha1.TopologyManagerPolicy(policyName)]
		if !ok {
			klog.V(5).InfoS("Policy handler not found", "policy", policyName)
			continue
		}
		if status := handler.filter(pod, nodeTopology.Zones, nodeInfo); status != nil {
			return status
		}
	}
	return nil
}