Skip to content

Commit

Permalink
koordlet: fix RDT cache id issue (koordinator-sh#1789)
Browse files Browse the repository at this point in the history
Signed-off-by: bowen-intel <bowen.song@intel.com>
Signed-off-by: zwzhang <zzw261520@alibaba-inc.com>
Co-authored-by: zwzhang <zzw261520@alibaba-inc.com>
  • Loading branch information
bowen-intel and zwzhang0107 committed Jan 16, 2024
1 parent b512d33 commit fe6dae3
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 66 deletions.
16 changes: 8 additions & 8 deletions pkg/koordlet/qosmanager/plugins/resctrl/resctrl_reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ func (r *resctrlReconcile) getPodCgroupNewTaskIds(podMeta *statesinformer.PodMet
return taskIds
}

func (r *resctrlReconcile) calculateAndApplyCatL3PolicyForGroup(group string, cbm uint, l3Num int,
func (r *resctrlReconcile) calculateAndApplyRDTL3PolicyForGroup(group string, cbm uint, l3Num int,
resourceQoS *slov1alpha1.ResourceQOS) error {
if resourceQoS == nil || resourceQoS.ResctrlQOS == nil || resourceQoS.ResctrlQOS.CATRangeStartPercent == nil ||
resourceQoS.ResctrlQOS.CATRangeEndPercent == nil {
Expand Down Expand Up @@ -326,7 +326,7 @@ func (r *resctrlReconcile) calculateAndApplyCatL3PolicyForGroup(group string, cb
return nil
}

func (r *resctrlReconcile) calculateAndApplyCatMbPolicyForGroup(group string, l3Num int, cpuBasicInfo extension.CPUBasicInfo, resourceQoS *slov1alpha1.ResourceQOS) error {
func (r *resctrlReconcile) calculateAndApplyRDTMbPolicyForGroup(group string, l3Num int, cpuBasicInfo extension.CPUBasicInfo, resourceQoS *slov1alpha1.ResourceQOS) error {
if resourceQoS == nil || resourceQoS.ResctrlQOS == nil {
klog.Warningf("skipped, since resourceQoS or ResctrlQOS is nil for group %v, "+
"resourceQoS %v", resourceQoS, group)
Expand Down Expand Up @@ -356,7 +356,7 @@ func (r *resctrlReconcile) calculateAndApplyCatMbPolicyForGroup(group string, l3
return nil
}

func (r *resctrlReconcile) calculateAndApplyCatL3GroupTasks(group string, taskIds []int32) error {
func (r *resctrlReconcile) calculateAndApplyRDTL3GroupTasks(group string, taskIds []int32) error {
if len(taskIds) <= 0 {
klog.V(6).Infof("apply l3 cat tasks for group %s skipped, no new task id", group)
return nil
Expand Down Expand Up @@ -384,7 +384,7 @@ func (r *resctrlReconcile) calculateAndApplyCatL3GroupTasks(group string, taskId
return nil
}

func (r *resctrlReconcile) reconcileCatResctrlPolicy(qosStrategy *slov1alpha1.ResourceQOSStrategy) {
func (r *resctrlReconcile) reconcileRDTResctrlPolicy(qosStrategy *slov1alpha1.ResourceQOSStrategy) {
// 1. retrieve rdt configs from nodeSLOSpec
// 2.1 get cbm and l3 numbers, which are general for all resctrl groups
// 2.2 calculate and apply the resctrl policies, such as the cat policy, for each rdt config
Expand Down Expand Up @@ -427,11 +427,11 @@ func (r *resctrlReconcile) reconcileCatResctrlPolicy(qosStrategy *slov1alpha1.Re
// calculate and apply l3 cat policy for each group
for _, group := range resctrlGroupList {
resQoSStrategy := getResourceQOSForResctrlGroup(qosStrategy, group)
err = r.calculateAndApplyCatL3PolicyForGroup(group, cbm, l3Num, resQoSStrategy)
err = r.calculateAndApplyRDTL3PolicyForGroup(group, cbm, l3Num, resQoSStrategy)
if err != nil {
klog.Warningf("failed to apply l3 cat policy for group %v, err: %v", group, err)
}
err = r.calculateAndApplyCatMbPolicyForGroup(group, l3Num, nodeCPUInfo.BasicInfo, resQoSStrategy)
err = r.calculateAndApplyRDTMbPolicyForGroup(group, l3Num, nodeCPUInfo.BasicInfo, resQoSStrategy)
if err != nil {
klog.Warningf("failed to apply cat MB policy for group %v, err: %v", group, err)
}
Expand Down Expand Up @@ -481,7 +481,7 @@ func (r *resctrlReconcile) reconcileResctrlGroups(qosStrategy *slov1alpha1.Resou

// write Cat L3 tasks for each resctrl group
for _, group := range resctrlGroupList {
err = r.calculateAndApplyCatL3GroupTasks(group, taskIds[group])
err = r.calculateAndApplyRDTL3GroupTasks(group, taskIds[group])
if err != nil {
klog.Warningf("failed to apply l3 cat tasks for group %s, err %s", group, err)
}
Expand Down Expand Up @@ -518,6 +518,6 @@ func (r *resctrlReconcile) reconcile() {
klog.V(4).Infof("resctrlReconcile failed, cannot initialize cat resctrl group, err: %s", err)
return
}
r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
r.reconcileRDTResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
r.reconcileResctrlGroups(nodeSLO.Spec.ResourceQOSStrategy)
}
63 changes: 43 additions & 20 deletions pkg/koordlet/qosmanager/plugins/resctrl/resctrl_reconcile_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -520,7 +520,7 @@ func Test_getPodCgroupNewTaskIds(t *testing.T) {
}
}

func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) {
func TestResctrlReconcile_calculateAndApplyRDTL3PolicyForGroup(t *testing.T) {
type args struct {
group string
cbm uint
Expand All @@ -532,6 +532,7 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) {
noUpdate bool
cachedMask string
schemataData []string
mockSchemata string
}
tests := []struct {
name string
Expand Down Expand Up @@ -623,6 +624,9 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) {
},
},
},
field: field{
mockSchemata: "L3:0=f;1=f;\n",
},
want: "L3:0=f;1=f;\n",
wantErr: false,
},
Expand Down Expand Up @@ -651,6 +655,9 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) {
},
},
},
field: field{
mockSchemata: "L3:0=7ff;1=7ff;\n",
},
want: "L3:0=3c;1=3c;\n",
wantErr: false,
},
Expand Down Expand Up @@ -679,6 +686,9 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) {
},
},
},
field: field{
mockSchemata: "L3:0=7ff;1=7ff;\n",
},
want: "L3:0=3f;1=3f;\n",
wantErr: false,
},
Expand Down Expand Up @@ -711,20 +721,24 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) {
noUpdate: true,
cachedMask: "3c",
schemataData: []string{"L3:0=f\nMB:0=100"},
mockSchemata: "L3:0=7ff\nMB:0=100\n",
},
want: "L3:0=3c;\n",
wantErr: false,
},
}
system.CacheIdsCacheFunc = system.GetCacheIds
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
helper := system.NewFileTestUtil(t)
sysFSRootDirName := "calculateAndApplyCatL3PolicyForGroup"
sysFSRootDirName := "calculateAndApplyRDTL3PolicyForGroup"
helper.MkDirAll(sysFSRootDirName)
system.Conf.SysFSRootDir = filepath.Join(helper.TempDir, sysFSRootDirName)
validSysFSRootDir := system.Conf.SysFSRootDir
system.CommonRootDir = ""
testingPrepareResctrlL3CatGroups(t, "ff", "", tt.field.schemataData...)
testingPrepareResctrlL3CatGroups(t, "ff", tt.field.mockSchemata, tt.field.schemataData...)

os.ReadFile(filepath.Join(sysFSRootDirName))

opt := &framework.Options{
Config: framework.NewDefaultConfig(),
Expand All @@ -741,7 +755,7 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) {
}

// execute function
err := r.calculateAndApplyCatL3PolicyForGroup(tt.args.group, tt.args.cbm, tt.args.l3Num,
err := r.calculateAndApplyRDTL3PolicyForGroup(tt.args.group, tt.args.cbm, tt.args.l3Num,
getResourceQOSForResctrlGroup(tt.args.qosStrategy, tt.args.group))
assert.Equal(t, tt.wantErr, err != nil, err)

Expand All @@ -760,7 +774,7 @@ func TestResctrlReconcile_calculateAndApplyCatL3PolicyForGroup(t *testing.T) {
}
}

func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) {
func TestResctrlReconcile_calculateAndApplyRDTMbPolicyForGroup(t *testing.T) {
type args struct {
group string
l3Num int
Expand All @@ -772,6 +786,7 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) {
noUpdate bool
cachedPercent string
schemataData []string
mockSchemata string
}
tests := []struct {
name string
Expand Down Expand Up @@ -840,13 +855,17 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) {
},
},
},
field: field{
mockSchemata: "L3:0=ff;1=ff;\nMB:0=90;1=90;\n",
},
want: "MB:0=90;1=90;\n",
wantErr: false,
},
{
name: "apply policy correctly on amd",
field: field{
schemataData: []string{" L3:0=f;1=f;2=f;3=f\n MB:0=2048;1=2048;2=2048;3=2048"},
mockSchemata: " L3:0=f;1=f;2=f;3=f\n MB:0=2048;1=2048;2=2048;3=2048",
},
args: args{
group: BEResctrlGroup,
Expand All @@ -870,6 +889,7 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) {
name: "apply unlimited policy correctly on amd",
field: field{
schemataData: []string{" L3:0=f;1=f;2=f;3=f\n MB:0=100;1=100;2=100;3=100"},
mockSchemata: " L3:0=f;1=f;2=f;3=f\n MB:0=2048;1=2048;2=2048;3=2048",
},
args: args{
group: BEResctrlGroup,
Expand Down Expand Up @@ -914,6 +934,7 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) {
field: field{
noUpdate: true,
cachedPercent: "90",
mockSchemata: "L3:0=ff;1=ff\nMB:0=100;1=100;\n",
},
want: "MB:0=90;1=90;\n",
wantErr: false,
Expand Down Expand Up @@ -945,20 +966,22 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) {
noUpdate: true,
cachedPercent: "80",
schemataData: []string{"L3:0=f\nMB:0=100"},
mockSchemata: "L3:0=f\nMB:0=100\n",
},
want: "MB:0=80;\n",
wantErr: false,
},
}
system.CacheIdsCacheFunc = system.GetCacheIds
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
helper := system.NewFileTestUtil(t)
sysFSRootDirName := "calculateAndApplyCatMbPolicyForGroup"
sysFSRootDirName := "calculateAndApplyRDTMbPolicyForGroup"
helper.MkDirAll(sysFSRootDirName)
system.Conf.SysFSRootDir = filepath.Join(helper.TempDir, sysFSRootDirName)
validSysFSRootDir := system.Conf.SysFSRootDir
system.CommonRootDir = ""
testingPrepareResctrlL3CatGroups(t, "ff", "", tt.field.schemataData...)
testingPrepareResctrlL3CatGroups(t, "ff", tt.field.mockSchemata, tt.field.schemataData...)

opt := &framework.Options{
Config: framework.NewDefaultConfig(),
Expand All @@ -975,7 +998,7 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) {
}

// execute function
err := r.calculateAndApplyCatMbPolicyForGroup(tt.args.group, tt.args.l3Num, tt.args.basicCPUInfo,
err := r.calculateAndApplyRDTMbPolicyForGroup(tt.args.group, tt.args.l3Num, tt.args.basicCPUInfo,
getResourceQOSForResctrlGroup(tt.args.qosStrategy, tt.args.group))
assert.Equal(t, tt.wantErr, err != nil)

Expand All @@ -994,7 +1017,7 @@ func TestResctrlReconcile_calculateAndApplyCatMbPolicyForGroup(t *testing.T) {
}
}

func TestResctrlReconcile_calculateAndApplyCatL3GroupTasks(t *testing.T) {
func TestResctrlReconcile_calculateAndApplyRDTL3GroupTasks(t *testing.T) {
type args struct {
group string
taskIds []int32
Expand Down Expand Up @@ -1069,7 +1092,7 @@ func TestResctrlReconcile_calculateAndApplyCatL3GroupTasks(t *testing.T) {
r.init(stop)
defer func() { stop <- struct{}{} }()

err := r.calculateAndApplyCatL3GroupTasks(tt.args.group, tt.args.taskIds)
err := r.calculateAndApplyRDTL3GroupTasks(tt.args.group, tt.args.taskIds)
assert.Equal(t, tt.wantErr, err != nil, err)

out, err := os.ReadFile(filepath.Join(validSysFSRootDir, system.ResctrlDir, tt.args.group,
Expand All @@ -1080,11 +1103,11 @@ func TestResctrlReconcile_calculateAndApplyCatL3GroupTasks(t *testing.T) {
}
}

func TestResctrlReconcile_reconcileCatResctrlPolicy(t *testing.T) {
func TestResctrlReconcile_reconcileRDTResctrlPolicy(t *testing.T) {
t.Run("test", func(t *testing.T) {
helper := system.NewFileTestUtil(t)

sysFSRootDirName := "reconcileCatResctrlPolicy"
sysFSRootDirName := "reconcileRDTResctrlPolicy"
helper.MkDirAll(sysFSRootDirName)

system.Conf.SysFSRootDir = filepath.Join(helper.TempDir, sysFSRootDirName)
Expand Down Expand Up @@ -1147,7 +1170,7 @@ func TestResctrlReconcile_reconcileCatResctrlPolicy(t *testing.T) {
defer func() { stop <- struct{}{} }()

// reconcile and check if the result is correct
r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
r.reconcileRDTResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)

beSchemataPath := filepath.Join(resctrlDirPath, BEResctrlGroup, system.ResctrlSchemataName)
expectBESchemataStr := "L3:0=3f;1=3f;\n"
Expand All @@ -1162,39 +1185,39 @@ func TestResctrlReconcile_reconcileCatResctrlPolicy(t *testing.T) {
// log error for invalid be resctrl path
err = os.RemoveAll(filepath.Join(resctrlDirPath, BEResctrlGroup))
assert.NoError(t, err)
r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
r.reconcileRDTResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)

// log error for invalid root resctrl path
system.Conf.SysFSRootDir = "invalidPath"
r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
r.reconcileRDTResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
system.Conf.SysFSRootDir = validSysFSRootDir

// log error for invalid l3 number
metricCache.EXPECT().Get(metriccache.NodeCPUInfoKey).Return(&metriccache.NodeCPUInfo{
BasicInfo: extension.CPUBasicInfo{CatL3CbmMask: "7ff"},
TotalInfo: koordletutil.CPUTotalInfo{},
}, true).Times(1)
r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
r.reconcileRDTResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)

// log error for invalid l3 cbm
metricCache.EXPECT().Get(metriccache.NodeCPUInfoKey).Return(&metriccache.NodeCPUInfo{
BasicInfo: extension.CPUBasicInfo{CatL3CbmMask: "invalid"},
TotalInfo: koordletutil.CPUTotalInfo{L3ToCPU: map[int32][]koordletutil.ProcessorInfo{0: {}, 1: {}}},
}, true).Times(1)
r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
r.reconcileRDTResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
metricCache.EXPECT().Get(metriccache.NodeCPUInfoKey).Return(&metriccache.NodeCPUInfo{
BasicInfo: extension.CPUBasicInfo{CatL3CbmMask: ""},
TotalInfo: koordletutil.CPUTotalInfo{L3ToCPU: map[int32][]koordletutil.ProcessorInfo{0: {}, 1: {}}},
}, true).Times(1)
r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
r.reconcileRDTResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)

// log error for invalid nodeCPUInfo
metricCache.EXPECT().Get(metriccache.NodeCPUInfoKey).Return(nil, false)
r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
r.reconcileRDTResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)

// log error for get nodeCPUInfo failed
metricCache.EXPECT().Get(metriccache.NodeCPUInfoKey).Return(nil, false)
r.reconcileCatResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
r.reconcileRDTResctrlPolicy(nodeSLO.Spec.ResourceQOSStrategy)
})
}

Expand Down
12 changes: 10 additions & 2 deletions pkg/koordlet/resourceexecutor/resctrl_updater.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,11 @@ func (r *ResctrlSchemataResourceUpdater) Clone() ResourceUpdater {
func NewResctrlL3SchemataResource(group, schemataDelta string, l3Num int) ResourceUpdater {
schemataFile := sysutil.ResctrlSchemata.Path(group)
l3SchemataKey := sysutil.L3SchemataPrefix + ":" + schemataFile
schemata := sysutil.NewResctrlSchemataRaw().WithL3Num(l3Num).WithL3Mask(schemataDelta)
// We currently assume that obtaining the cache ids through the resctrl
// schemata does not fail. TODO: obtain the cache ids from node info
// instead of the current method.
ids, _ := sysutil.CacheIdsCacheFunc()
schemata := sysutil.NewResctrlSchemataRaw(ids).WithL3Num(l3Num).WithL3Mask(schemataDelta)
klog.V(6).Infof("generate new resctrl l3 schemata resource, file %s, key %s, value %s",
schemataFile, l3SchemataKey, schemata.L3String())

Expand All @@ -72,7 +76,11 @@ func NewResctrlL3SchemataResource(group, schemataDelta string, l3Num int) Resour
func NewResctrlMbSchemataResource(group, schemataDelta string, l3Num int) ResourceUpdater {
schemataFile := sysutil.ResctrlSchemata.Path(group)
mbSchemataKey := sysutil.MbSchemataPrefix + ":" + schemataFile
schemata := sysutil.NewResctrlSchemataRaw().WithL3Num(l3Num).WithMB(schemataDelta)
// We currently assume that obtaining the cache ids through the resctrl
// schemata does not fail. TODO: obtain the cache ids from node info
// instead of the current method.
ids, _ := sysutil.CacheIdsCacheFunc()
schemata := sysutil.NewResctrlSchemataRaw(ids).WithL3Num(l3Num).WithMB(schemataDelta)
klog.V(6).Infof("generate new resctrl mba schemata resource, file %s, key %s, value %s",
schemataFile, mbSchemataKey, schemata.MBString())

Expand Down

0 comments on commit fe6dae3

Please sign in to comment.