all: Node Resource Reservation supports apply policy
Signed-off-by: Joseph <joseph.t.lee@outlook.com>
eahydra committed Apr 17, 2023
1 parent 0af2a00 commit 03d985e
Showing 8 changed files with 303 additions and 127 deletions.
61 changes: 37 additions & 24 deletions apis/extension/node_reservation.go
@@ -30,44 +30,57 @@ const (

 // NodeReservation resource reserved by node.annotation,
 // If node.annotation declares the resources to be reserved, like this:
-//  annotations:
-//    node.koordinator.sh/reservation: >-
-//      {"reservedCPUs":"0-5"}
-
-// In the filter phase it needs to satisfy: node.alloc - node.req - reserved(6c) > pod.req
-// if qos==LSE/LSR: the cores 0-5 are not used in the reserve phase
+//
+//	annotations:
+//	  node.koordinator.sh/reservation: >-
+//	    {"reservedCPUs":"0-5"}
 type NodeReservation struct {
 	// resources need to be reserved. like, {"cpu":"1C", "memory":"2Gi"}
 	Resources corev1.ResourceList `json:"resources,omitempty"`
 	// reserved cpus need to be reserved, such as 1-6, or 2,4,6,8
 	ReservedCPUs string `json:"reservedCPUs,omitempty"`
+	// ApplyPolicy indicates how the reserved resources take effect.
+	ApplyPolicy NodeReservationApplyPolicy `json:"applyPolicy,omitempty"`
 }
 
-func GetReservedCPUs(anno map[string]string) (string, int) {
-	specificCPUsReservedStr := ""
-	numReservedCPUs := 0
-
-	val, ok := anno[AnnotationNodeReservation]
-	if !ok || val == "" {
-		return specificCPUsReservedStr, numReservedCPUs
-	}
-
-	reserved := NodeReservation{}
-	if err := json.Unmarshal([]byte(val), &reserved); err != nil {
-		klog.Errorf("failed to unmarshal reserved resources from node.annotation in nodenumaresource scheduler plugin.err:%v", err)
-		return specificCPUsReservedStr, numReservedCPUs
-	}
-
-	CPUsQuantityReserved, ok := reserved.Resources[corev1.ResourceCPU]
-	if ok && CPUsQuantityReserved.MilliValue() > 0 {
-		reservedCPUsFloat := float64(CPUsQuantityReserved.MilliValue()) / 1000
-		numReservedCPUs = int(math.Ceil(reservedCPUsFloat))
-	}
-
-	if reserved.ReservedCPUs != "" {
-		numReservedCPUs = 0
-	}
-	specificCPUsReservedStr = reserved.ReservedCPUs
-
-	return specificCPUsReservedStr, numReservedCPUs
-}
+type NodeReservationApplyPolicy string
+
+const (
+	// NodeReservationApplyPolicyDefault will affect the total amount of schedulable resources of the node and reserve CPU Cores.
+	// For example, NodeInfo.Allocatable will be modified in the scheduler to deduct the amount of reserved resources
+	NodeReservationApplyPolicyDefault NodeReservationApplyPolicy = "Default"
+	// NodeReservationApplyPolicyReservedCPUsOnly means that only CPU Cores are reserved, but it will
+	// not affect the total amount of schedulable resources of the node.
+	// The total amount of schedulable resources is taken into effect by the kubelet's reservation mechanism.
+	// But koordinator need to exclude reserved CPUs when allocating CPU Cores
+	NodeReservationApplyPolicyReservedCPUsOnly NodeReservationApplyPolicy = "ReservedCPUsOnly"
+)
+
+func GetNodeReservation(annotations map[string]string) (*NodeReservation, error) {
+	reservation := &NodeReservation{}
+	if s := annotations[AnnotationNodeReservation]; s != "" {
+		if err := json.Unmarshal([]byte(s), &reservation); err != nil {
+			return nil, err
+		}
+	}
+	return reservation, nil
+}
+
+func GetReservedCPUs(annotations map[string]string) (reservedCPUs string, numReservedCPUs int) {
+	reservation, err := GetNodeReservation(annotations)
+	if err != nil {
+		klog.ErrorS(err, "failed to GetNodeReservation")
+		return
+	}
+
+	quantity := reservation.Resources[corev1.ResourceCPU]
+	if quantity.MilliValue() > 0 {
+		numReservedCPUs = int(math.Ceil(float64(quantity.MilliValue()) / 1000))
+	}
+
+	if reservation.ReservedCPUs != "" {
+		numReservedCPUs = 0
+	}
+	reservedCPUs = reservation.ReservedCPUs
+	return
+}
111 changes: 70 additions & 41 deletions docs/proposals/scheduling/20221227-node-resource-reservation.md
@@ -10,35 +10,37 @@ reviewers:
- "@zwzhang0107"
- "@jasonliu747"
creation-date: 2022-12-27
last-updated: 2023-04-13
---

# Resource Reservation

## Table of Contents

<!-- TOC -->

- [Resource Reservation](#resource-reservation)
  - [Table of Contents](#table-of-contents)
  - [Summary](#summary)
  - [Motivation](#motivation)
    - [Goals](#goals)
    - [Non-Goals/Future Work](#non-goalsfuture-work)
  - [Proposal](#proposal)
    - [User Stories](#user-stories)
      - [Story 1](#story-1)
      - [Story 2](#story-2)
      - [Story 3](#story-3)
    - [Implementation Details](#implementation-details)
      - [API](#api)
      - [koordlet](#koordlet)
      - [koord-manager](#koord-manager)
      - [koord-scheduler](#koord-scheduler)
      - [Descheduler](#descheduler)
      - [Eviction](#eviction)
      - [Example](#example)
  - [Implementation History](#implementation-history)

<!-- /TOC -->

## Summary
This proposal provides a scheduling mechanism to reserve resources via node.annotation, such as reserving CPU cores 0-6 on a node.
@@ -80,19 +82,44 @@ so for K8S, the 0-11 core is reserved and it can not be used.
#### Story 2
As a K8S cluster administrator, set the total amount of resources to be reserved directly via `node.annotation`, just like: `1C2Gi`.

#### Story 3

The cluster administrator just wants to reserve some CPU cores on the nodes, and also wants to use the kubelet's reservation mechanism to trim the allocatable resources. For example, suppose the CPU capacity of node A is 32000m: the administrator reserves CPU cores `0-3` for processes outside Kubernetes, and the kubelet reserves 8000m of CPU so that `Node.Status.Allocatable` stays at 24000m.

### Implementation Details

#### API

By adding an annotation to the node, we specify the resources to be reserved by Koordinator, such as memory and CPU.
For CPU, we can specify the total amount of CPU to be reserved, or we can specify which cores to reserve explicitly.

If resources are reserved and `ApplyPolicy` is empty or `Default`, the reservation affects `Node.Status.Allocatable`. If `ReservedCPUsOnly` is used, only specific CPU cores are reserved, and `Node.Status.Allocatable` is not affected.

```golang
// resources that koordinator reserved, and you can reserve other resource if needed.
type NodeReservation struct {
// resources need to be reserved. like, {"cpu":"1C", "memory":"2Gi"}
Resources corev1.ResourceList `json:"resources,omitempty"`
// reserved cpus need to be reserved, such as 1-6, or 2,4,6,8
ReservedCPUs string `json:"reservedCPUs,omitempty"`
// ApplyPolicy indicates how the reserved resources take effect.
ApplyPolicy NodeReservationApplyPolicy `json:"applyPolicy,omitempty"`
}

type NodeReservationApplyPolicy string

const (
// NodeReservationApplyPolicyDefault will affect the total amount of schedulable resources of the node and reserve CPU Cores.
// For example, NodeInfo.Allocatable will be modified in the scheduler to deduct the amount of reserved resources
NodeReservationApplyPolicyDefault NodeReservationApplyPolicy = "Default"
// NodeReservationApplyPolicyReservedCPUsOnly means that only CPU Cores are reserved, but it will
// not affect the total amount of schedulable resources of the node.
// The total amount of schedulable resources is taken into effect by the kubelet's reservation mechanism.
// But koordinator need to exclude reserved CPUs when allocating CPU Cores
NodeReservationApplyPolicyReservedCPUsOnly NodeReservationApplyPolicy = "ReservedCPUsOnly"
)
```
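
For illustration, the snippet below shows the JSON string that would be stored under the `node.koordinator.sh/reservation` annotation when only CPU cores are reserved. It is a self-contained sketch: the types mirror the ones above, and the `Resources` field is omitted so the example does not depend on the Kubernetes API packages.

```golang
package main

import (
	"encoding/json"
	"fmt"
)

// Local mirror of the proposal's API types so the sketch compiles on its own.
type NodeReservationApplyPolicy string

const (
	NodeReservationApplyPolicyDefault          NodeReservationApplyPolicy = "Default"
	NodeReservationApplyPolicyReservedCPUsOnly NodeReservationApplyPolicy = "ReservedCPUsOnly"
)

type NodeReservation struct {
	ReservedCPUs string                     `json:"reservedCPUs,omitempty"`
	ApplyPolicy  NodeReservationApplyPolicy `json:"applyPolicy,omitempty"`
}

func main() {
	// Reserve CPUs 0-3 for processes outside Kubernetes without shrinking the
	// node's schedulable total (the kubelet reservation handles that part).
	value, err := json.Marshal(NodeReservation{
		ReservedCPUs: "0-3",
		ApplyPolicy:  NodeReservationApplyPolicyReservedCPUsOnly,
	})
	if err != nil {
		panic(err)
	}
	// This is the string an administrator would put into the annotation value.
	fmt.Println(string(value)) // {"reservedCPUs":"0-3","applyPolicy":"ReservedCPUsOnly"}
}
```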

#### koordlet
1. When koordlet reports `NodeResourceTopology`, it updates the explicitly reserved cores in node.annotation to
`NodeResourceTopology.annotation["node.koordinator.sh/reservation"]`, so that the `runtimehook` can ignore these cores
@@ -150,6 +177,7 @@ LS pod use cpus from the shared pool, so here we can ensure that LS pods do not


#### koord-manager

We should update the batch resources on the node here, such as `batch-memory` and `batch-cpu`.
The implementation of this part depends on the `nodeResource` controller in koord-manager,
so for batch resources the `batchResourceFit` plugin can take the resources reserved by the node into account.
@@ -162,33 +190,35 @@

```
Node(BE).Alloc = Node.Alloc - Node.Reserved - System.Used - Pod(LS).Used
```
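
To make the formula concrete, here is a minimal sketch (not the nodeResource controller's actual code) that applies it to CPU quantities; the `batchAllocatable` helper and the numbers are made up for the example.

```golang
package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// batchAllocatable applies the formula above:
// Node(BE).Alloc = Node.Alloc - Node.Reserved - System.Used - Pod(LS).Used
func batchAllocatable(nodeAlloc, nodeReserved, systemUsed, lsPodUsed resource.Quantity) resource.Quantity {
	result := nodeAlloc.DeepCopy()
	result.Sub(nodeReserved)
	result.Sub(systemUsed)
	result.Sub(lsPodUsed)
	return result
}

func main() {
	// Example: 32 CPUs total, 4 reserved via node.annotation, 2 used by system
	// daemons, 10 used by LS pods -> 16 CPUs left for Batch (BE) pods.
	alloc := batchAllocatable(
		resource.MustParse("32"),
		resource.MustParse("4"),
		resource.MustParse("2"),
		resource.MustParse("10"),
	)
	fmt.Println(alloc.String()) // 16
}
```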

#### koord-scheduler
1. Register the corresponding function via `RegisterNodeInfoTransformer` and subtract the resources reserved in `node.annotation` from `NodeInfo.Allocatable` if the reservation apply policy is empty or `Default`.
This part of the logic is executed before the Filter phase of the scheduling plugins, so the resources reserved on the node are also taken into account by the other scheduler plugins.

2. When allocating CPUs to an LSE/LSR pod in the Reserve phase of the `NodeNUMAResource` plugin, the cores reserved in `nodetopo.annotation` need to be excluded:
```
cpus(alloc) = cpus(total) - cpus(allocated) - cpus(kubeletReserved) - cpus(nodeAnnoReserved)
```
3. The `ElasticQuota` plugin should also remove the reserved resources when calculating the total amount that can be allocated by a Quota, if the apply policy is empty or `Default`.

<font color=Chocolate>*description*</font>:
In summary, the resources reserved in node.annotation are taken into account during the `Filter` phase of scheduling;
when allocating cores to a pod in the `Reserve` phase, the cores already reserved in node.annotation are not allocated (a sketch of this exclusion follows below).
...
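
The sketch below illustrates the `cpus(alloc)` calculation with a toy CPU-set type. The real plugin uses its own cpuset utilities, so this only shows the exclusion order, not the actual implementation.

```golang
package main

import (
	"fmt"
	"sort"
)

// cpuIDSet is a toy stand-in for a real CPU set, used only to illustrate
// cpus(alloc) = cpus(total) - cpus(allocated) - cpus(kubeletReserved) - cpus(nodeAnnoReserved).
type cpuIDSet map[int]struct{}

func newCPUIDSet(ids ...int) cpuIDSet {
	s := cpuIDSet{}
	for _, id := range ids {
		s[id] = struct{}{}
	}
	return s
}

// difference returns the CPUs in s that are not in other.
func (s cpuIDSet) difference(other cpuIDSet) cpuIDSet {
	out := cpuIDSet{}
	for id := range s {
		if _, ok := other[id]; !ok {
			out[id] = struct{}{}
		}
	}
	return out
}

func main() {
	total := newCPUIDSet(0, 1, 2, 3, 4, 5, 6, 7)
	allocated := newCPUIDSet(6, 7)        // already pinned to other LSE/LSR pods
	kubeletReserved := newCPUIDSet(4, 5)  // reserved by kubelet configuration
	nodeAnnoReserved := newCPUIDSet(0, 1) // reserved via node.koordinator.sh/reservation

	alloc := total.difference(allocated).difference(kubeletReserved).difference(nodeAnnoReserved)

	ids := make([]int, 0, len(alloc))
	for id := range alloc {
		ids = append(ids, id)
	}
	sort.Ints(ids)
	fmt.Println("allocatable CPUs:", ids) // allocatable CPUs: [2 3]
}
```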


#### Descheduler

- cpu: evict (LSE/LSR) pods that are using the CPUs reserved in node.annotation.
For example, cores 0-1 are already used by LSE/LSR pods, but at the same time cores 0-3 have been reserved through `node.annotation`, as declared below.
`node.annotation["node.koordinator.sh/reservation"]={"reservedCPUs":"0-3"}`
We should evict the pods that occupy cores 0-3 (a simplified sketch of this check follows this section).
- ...

Other eviction policies also need to take the resources reserved in node.annotation into account,
e.g. `Node.Status.Allocatable` in the `LowNodeLoad` descheduler plugin should have the reserved resources subtracted if the policy is empty or `Default`.
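
As a rough sketch of the check described above (not the descheduler's real code), the following parses the `reservedCPUs` string and tests whether a pod's pinned CPUs overlap it; the parser is simplified and the function names are made up for illustration.

```golang
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseCPUList is a simplified parser for the reservedCPUs format used in the
// annotation, e.g. "0-3" or "2,4,6,8"; error handling is reduced for brevity.
func parseCPUList(s string) (map[int]bool, error) {
	cpus := map[int]bool{}
	for _, part := range strings.Split(s, ",") {
		part = strings.TrimSpace(part)
		if part == "" {
			continue
		}
		if bounds := strings.SplitN(part, "-", 2); len(bounds) == 2 {
			lo, err := strconv.Atoi(bounds[0])
			if err != nil {
				return nil, err
			}
			hi, err := strconv.Atoi(bounds[1])
			if err != nil {
				return nil, err
			}
			for i := lo; i <= hi; i++ {
				cpus[i] = true
			}
			continue
		}
		id, err := strconv.Atoi(part)
		if err != nil {
			return nil, err
		}
		cpus[id] = true
	}
	return cpus, nil
}

// usesReservedCPU reports whether any CPU pinned by a pod overlaps the reserved
// set, i.e. whether the pod is an eviction candidate under this policy.
func usesReservedCPU(podCPUs []int, reserved map[int]bool) bool {
	for _, id := range podCPUs {
		if reserved[id] {
			return true
		}
	}
	return false
}

func main() {
	reserved, err := parseCPUList("0-3")
	if err != nil {
		panic(err)
	}
	fmt.Println(usesReservedCPU([]int{0, 1}, reserved)) // true: the pod occupies reserved cores
	fmt.Println(usesReservedCPU([]int{6, 7}, reserved)) // false
}
```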

#### Eviction

We can do something in the following stages:
- On the agent side,
it is easier to get the resource usage of individual containers, so we can evict pods with lower priority or higher CPU usage.
@@ -198,7 +228,7 @@ We can do something in the following stages:
at this time, we should evict a group of pods according to the job/CRD dimension. The simplest algorithm looks like this:
sort by priority / number of pods in a group / total resource usage of the pods in a group (a minimal sketch of this ordering is given below).
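
A minimal sketch of that group-level ordering follows; the `podGroup` fields and the tie-breaking directions are assumptions for illustration rather than the implemented policy.

```golang
package main

import (
	"fmt"
	"sort"
)

// podGroup is a hypothetical view of a job/CRD-level group of pods considered for eviction.
type podGroup struct {
	Name      string
	Priority  int32 // lower priority is evicted first (assumption)
	PodCount  int   // fewer pods means a cheaper eviction (assumption)
	UsedMilli int64 // total CPU usage of the group's pods, in millicores
}

// sortForEviction orders groups by the heuristic sketched above:
// lowest priority first, then smaller groups, then higher total usage.
func sortForEviction(groups []podGroup) {
	sort.SliceStable(groups, func(i, j int) bool {
		if groups[i].Priority != groups[j].Priority {
			return groups[i].Priority < groups[j].Priority
		}
		if groups[i].PodCount != groups[j].PodCount {
			return groups[i].PodCount < groups[j].PodCount
		}
		return groups[i].UsedMilli > groups[j].UsedMilli
	})
}

func main() {
	groups := []podGroup{
		{Name: "job-a", Priority: 1000, PodCount: 3, UsedMilli: 6000},
		{Name: "job-b", Priority: 500, PodCount: 5, UsedMilli: 2000},
		{Name: "job-c", Priority: 500, PodCount: 2, UsedMilli: 4000},
	}
	sortForEviction(groups)
	for _, g := range groups {
		fmt.Println(g.Name) // job-c, job-b, job-a
	}
}
```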

#### Example
Here is a demo that reserves CPU by quantity:
```yaml
apiVersion: v1
@@ -238,9 +268,8 @@ zones:

```



## Implementation History

- [ ] 12/27/2022: Open PR for initial draft.
- [ ] 04/13/2023: Add ApplyPolicy.

18 changes: 8 additions & 10 deletions pkg/koordlet/statesinformer/states_noderesourcetopology.go
@@ -357,19 +357,17 @@ func removeSystemQOSCPUs(cpuSharePools []extension.CPUSharedPool, sysQOSRes *ext
 	return newCPUSharePools
 }
 
-func getNodeReserved(topo *topology.CPUTopology, anno map[string]string) extension.NodeReservation {
+func getNodeReserved(cpuTopology *topology.CPUTopology, nodeAnnotations map[string]string) extension.NodeReservation {
 	reserved := extension.NodeReservation{}
-	allCPUs := topo.CPUDetails.CPUs()
-	reservedCPUs, numReservedCPUs := extension.GetReservedCPUs(anno)
-	if numReservedCPUs > 0 {
-		cpus, _ := kubelet.TakeByTopology(allCPUs, numReservedCPUs, topo)
-		reserved.ReservedCPUs = cpus.String()
-	}
+	reservedCPUs, numReservedCPUs := extension.GetReservedCPUs(nodeAnnotations)
 	if reservedCPUs != "" {
-		res, _ := cpuset.Parse(reservedCPUs)
-		reserved.ReservedCPUs = res.String()
+		cpus, _ := cpuset.Parse(reservedCPUs)
+		reserved.ReservedCPUs = cpus.String()
+	} else if numReservedCPUs > 0 {
+		allCPUs := cpuTopology.CPUDetails.CPUs()
+		cpus, _ := kubelet.TakeByTopology(allCPUs, numReservedCPUs, cpuTopology)
+		reserved.ReservedCPUs = cpus.String()
 	}
 
 	return reserved
 }

@@ -17,10 +17,8 @@ limitations under the License.
 package sharedlisterext
 
 import (
-	quotav1 "k8s.io/apiserver/pkg/quota/v1"
 	"k8s.io/kubernetes/pkg/scheduler/framework"
 
-	"github.com/koordinator-sh/koordinator/apis/extension"
 	"github.com/koordinator-sh/koordinator/pkg/util"
 )

@@ -35,20 +33,9 @@ func nodeReservationTransformer(nodeInfo *framework.NodeInfo) {
 	}
 
 	node := nodeInfo.Node()
-	resourceListReservedByNode := util.GetNodeReservationFromAnnotation(node.Annotations)
-	if resourceListReservedByNode == nil {
+	trimmedAllocatable, trimmed := util.TrimNodeAllocatableByNodeReservation(node)
+	if !trimmed {
 		return
 	}
-
-	originAlloc := node.Status.Allocatable.DeepCopy()
-	currentAlloc := quotav1.Subtract(originAlloc, resourceListReservedByNode)
-
-	// node.alloc(batch-memory) and node.alloc(batch-memory) have subtracted the reserved resources from the koord-manager,
-	// so we should keep the original data here.
-	currentAlloc[extension.BatchMemory] = originAlloc[extension.BatchMemory]
-	currentAlloc[extension.BatchCPU] = originAlloc[extension.BatchCPU]
-
-	if !quotav1.Equals(originAlloc, currentAlloc) {
-		nodeInfo.Allocatable = framework.NewResource(currentAlloc)
-	}
+	nodeInfo.Allocatable = framework.NewResource(trimmedAllocatable)
 }
