Skip to content

Commit

Permalink
feat(qrm): get native qos class from framework
Browse files Browse the repository at this point in the history
Signed-off-by: caohe <caohe9603@gmail.com>
  • Loading branch information
caohe committed Jul 26, 2023
1 parent 924db6e commit f47f86b
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 78 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ replace (
k8s.io/kube-proxy => k8s.io/kube-proxy v0.24.6
k8s.io/kube-scheduler => k8s.io/kube-scheduler v0.24.6
k8s.io/kubectl => k8s.io/kubectl v0.24.6
k8s.io/kubelet => github.com/kubewharf/kubelet v1.24.6-kubewharf.5
k8s.io/kubelet => github.com/kubewharf/kubelet v1.24.6-kubewharf.6
k8s.io/kubernetes => k8s.io/kubernetes v1.24.6
k8s.io/legacy-cloud-providers => k8s.io/legacy-cloud-providers v0.24.6
k8s.io/metrics => k8s.io/metrics v0.24.6
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -545,8 +545,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kubewharf/katalyst-api v0.1.12 h1:dmfXMzknvgAPL/DI5hUmU9JGbrI6X3TUs4M9a0jZxYg=
github.com/kubewharf/katalyst-api v0.1.12/go.mod h1:iVILS5UL5PRtkUPH2Iu1K/gFGTPMNItnth5fmQ80VGE=
github.com/kubewharf/kubelet v1.24.6-kubewharf.5 h1:i3BcfBY3fFTzPWi5BCYyhkiSZCrIGczaGNAwgUvga6U=
github.com/kubewharf/kubelet v1.24.6-kubewharf.5/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c=
github.com/kubewharf/kubelet v1.24.6-kubewharf.6 h1:36IfOYzDL4Eb8uwJgpq2080lIn04Il+MbmFx5yi46UA=
github.com/kubewharf/kubelet v1.24.6-kubewharf.6/go.mod h1:MxbSZUx3wXztFneeelwWWlX7NAAStJ6expqq7gY2J3c=
github.com/kyoh86/exportloopref v0.1.7/go.mod h1:h1rDl2Kdj97+Kwh4gdz3ujE7XHmH51Q0lUiZ1z4NLj8=
github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/libopenstorage/openstorage v1.0.0/go.mod h1:Sp1sIObHjat1BeXhfMqLZ14wnOzEhNx2YQedreMcUyc=
Expand Down
1 change: 0 additions & 1 deletion pkg/agent/qrm-plugins/cpu/dynamicpolicy/state/state.go
Original file line number Diff line number Diff line change
Expand Up @@ -466,7 +466,6 @@ type reader interface {
GetMachineState() NUMANodeMap
GetPodEntries() PodEntries
GetAllocationInfo(podUID string, containerName string) *AllocationInfo
GetCPUSetOrDefault(podUID string, containerName string) machine.CPUSet
}

// writer is used to store information into local states,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,13 +150,6 @@ func (sc *stateCheckpoint) GetAllocationInfo(podUID string, containerName string
return sc.cache.GetAllocationInfo(podUID, containerName)
}

// GetCPUSetOrDefault returns the CPU set allocated to the given container,
// falling back to the cached default when no allocation exists.
// NOTE(review): this is a deletion hunk in the commit diff — the method is
// removed because callers now derive CPU sets from pod entries instead.
// It delegates to the underlying cache under a read lock.
func (sc *stateCheckpoint) GetCPUSetOrDefault(podUID string, containerName string) machine.CPUSet {
	sc.RLock()
	defer sc.RUnlock()

	return sc.cache.GetCPUSetOrDefault(podUID, containerName)
}

func (sc *stateCheckpoint) GetPodEntries() PodEntries {
sc.RLock()
defer sc.RUnlock()
Expand Down
7 changes: 0 additions & 7 deletions pkg/agent/qrm-plugins/cpu/dynamicpolicy/state/state_mem.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,13 +82,6 @@ func (s *cpuPluginState) GetAllocationInfo(podUID string, containerName string)
return nil
}

// GetCPUSetOrDefault returns the container's allocated CPU set if an
// allocation record exists; otherwise it returns the machine's default
// CPU set.
// NOTE(review): this is a deletion hunk in the commit diff — the method is
// removed from the in-memory state along with its checkpoint counterpart.
func (s *cpuPluginState) GetCPUSetOrDefault(podUID string, containerName string) machine.CPUSet {
	if res := s.GetAllocationInfo(podUID, containerName); res != nil {
		return res.AllocationResult
	}
	return s.machineState.GetDefaultCPUSet()
}

func (s *cpuPluginState) GetPodEntries() PodEntries {
s.RLock()
defer s.RUnlock()
Expand Down
96 changes: 36 additions & 60 deletions pkg/agent/qrm-plugins/cpu/nativepolicy/policy.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ import (
"sync"
"time"

"github.com/pkg/errors"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/wait"
pluginapi "k8s.io/kubelet/pkg/apis/resourceplugin/v1alpha1"
Expand Down Expand Up @@ -236,12 +235,6 @@ func (p *NativePolicy) GetTopologyHints(ctx context.Context,
return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err)
}

pod, err := p.metaServer.GetPod(ctx, req.PodUid)
if err != nil {
return nil, fmt.Errorf("GetPod failed with error: %v", err)
}

qosClass := qos.GetPodQOS(pod)
isInteger := float64(reqInt) == req.ResourceRequests[string(v1.ResourceCPU)]

general.InfoS("called",
Expand All @@ -251,7 +244,7 @@ func (p *NativePolicy) GetTopologyHints(ctx context.Context,
"podType", req.PodType,
"podRole", req.PodRole,
"containerType", req.ContainerType,
"qosClass", qosClass,
"qosClass", req.NativeQosClass,
"numCPUs", reqInt,
"isDebugPod", isDebugPod,
"isInteger", isInteger)
Expand All @@ -272,7 +265,7 @@ func (p *NativePolicy) GetTopologyHints(ctx context.Context,
}
}()

if qosClass != v1.PodQOSGuaranteed || !isInteger {
if req.NativeQosClass != string(v1.PodQOSGuaranteed) || !isInteger {
return p.sharedPoolHintHandler(ctx, req)
}
return p.dedicatedCoresHintHandler(ctx, req)
Expand All @@ -298,12 +291,6 @@ func (p *NativePolicy) Allocate(ctx context.Context,
return nil, fmt.Errorf("getReqQuantityFromResourceReq failed with error: %v", err)
}

pod, err := p.metaServer.GetPod(ctx, req.PodUid)
if err != nil {
return nil, fmt.Errorf("GetPod failed with error: %v", err)
}

qosClass := qos.GetPodQOS(pod)
isInteger := float64(reqInt) == req.ResourceRequests[string(v1.ResourceCPU)]

general.InfoS("called",
Expand All @@ -313,7 +300,7 @@ func (p *NativePolicy) Allocate(ctx context.Context,
"podType", req.PodType,
"podRole", req.PodRole,
"containerType", req.ContainerType,
"qosClass", qosClass,
"qosClass", req.NativeQosClass,
"numCPUs", reqInt,
"isDebugPod", isDebugPod,
"isInteger", isInteger)
Expand Down Expand Up @@ -409,7 +396,7 @@ func (p *NativePolicy) Allocate(ctx context.Context,
}, nil
}

if qosClass != v1.PodQOSGuaranteed || !isInteger {
if req.NativeQosClass != string(v1.PodQOSGuaranteed) || !isInteger {
return p.sharedPoolAllocationHandler(ctx, req)
}
return p.dedicatedCoresAllocationHandler(ctx, req)
Expand All @@ -426,57 +413,46 @@ func (p *NativePolicy) GetResourcesAllocation(_ context.Context,
p.Lock()
defer p.Unlock()

podResources := make(map[string]*pluginapi.ContainerResources)

if p.metaServer == nil {
general.Errorf("nil metaServer")
return nil, errors.New("nil metaServer")
}

podList, err := p.metaServer.GetPodList(context.Background(), nil)
defaultCPUSet := p.state.GetMachineState().GetDefaultCPUSet()
defaultCPUSetTopologyAwareAssignments, err := machine.GetNumaAwareAssignments(p.machineInfo.CPUTopology, defaultCPUSet)
if err != nil {
general.Errorf("get pod list failed, err: %v", err)
return nil, fmt.Errorf("get pod list failed, err: %v", err)
return nil, fmt.Errorf("GetNumaAwareAssignments err: %v", err)
}

for _, pod := range podList {
if pod == nil {
general.Errorf("get nil pod from metaServer")
continue
}
podResources := make(map[string]*pluginapi.ContainerResources)

podUID := string(pod.UID)
for podUID, containerEntries := range p.state.GetPodEntries() {
if podResources[podUID] == nil {
podResources[podUID] = &pluginapi.ContainerResources{}
}

for _, container := range append(pod.Spec.InitContainers, pod.Spec.Containers...) {
containerName := container.Name

containerID, err := p.metaServer.GetContainerID(podUID, containerName)
if err != nil {
general.Errorf("get container id failed, pod: %s, container: %s, err: %v",
podUID, containerName, err)
continue
}

isContainerNotRunning, err := native.CheckContainerNotRunning(pod, containerName)
if err != nil {
general.Errorf("check container not running failed, pod: %s, container: %s(%s), err: %v",
podUID, containerName, containerID, err)
for containerName, allocationInfo := range containerEntries {
if allocationInfo == nil {
continue
}

if isContainerNotRunning {
general.Infof("skip container because it is not running, pod: %s, container: %s(%s), err: %v",
podUID, containerName, containerID, err)
continue
}

containerCPUs := p.state.GetCPUSetOrDefault(string(pod.UID), containerName)
if containerCPUs.IsEmpty() {
general.Errorf("skip container because the cpuset is empty, pod: %s, container: %s(%s), err: %v",
podUID, containerName, containerID, err)
allocationInfo = allocationInfo.Clone()

resultCPUSet := machine.NewCPUSet()
switch allocationInfo.OwnerPoolName {
case state.PoolNameDedicated:
resultCPUSet = allocationInfo.AllocationResult
case state.PoolNameShare:
resultCPUSet = defaultCPUSet

if !allocationInfo.AllocationResult.Equals(defaultCPUSet) {
clonedDefaultCPUSet := defaultCPUSet.Clone()
clonedDefaultCPUSetTopologyAwareAssignments := machine.DeepcopyCPUAssignment(defaultCPUSetTopologyAwareAssignments)

allocationInfo.AllocationResult = clonedDefaultCPUSet
allocationInfo.OriginalAllocationResult = clonedDefaultCPUSet
allocationInfo.TopologyAwareAssignments = clonedDefaultCPUSetTopologyAwareAssignments
allocationInfo.OriginalTopologyAwareAssignments = clonedDefaultCPUSetTopologyAwareAssignments

p.state.SetAllocationInfo(podUID, containerName, allocationInfo)
}
default:
general.Errorf("skip container because the pool name is not supported, pod: %s, container: %s",
podUID, containerName)
continue
}

Expand All @@ -490,8 +466,8 @@ func (p *NativePolicy) GetResourcesAllocation(_ context.Context,
OciPropertyName: util.OCIPropertyNameCPUSetCPUs,
IsNodeResource: false,
IsScalarResource: true,
AllocatedQuantity: float64(containerCPUs.Size()),
AllocationResult: containerCPUs.String(),
AllocatedQuantity: float64(resultCPUSet.Size()),
AllocationResult: resultCPUSet.String(),
},
},
}
Expand Down

0 comments on commit f47f86b

Please sign in to comment.