Merge pull request #5518 from kawych/metrics

Add "resource_name" to scaled_up_gpu_nodes_total and scaled_down_gpu_nodes_total metrics
kubernetes · Feb 22, 2023 · c611acd · c611acd
2 parents 3922f49 + 2ea2fb6
commit c611acd
Show file tree

Hide file tree

Showing 5 changed files with 35 additions and 36 deletions.
diff --git a/cluster-autoscaler/core/scale_up.go b/cluster-autoscaler/core/scale_up.go
@@ -209,7 +209,6 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
 	}
 
 	now := time.Now()
-	gpuLabel := context.CloudProvider.GPULabel()
 	availableGPUTypes := context.CloudProvider.GetAvailableGPUTypes()
 	expansionOptions := make(map[string]expander.Option, 0)
 	skippedNodeGroups := map[string]status.Reasons{}
@@ -405,7 +404,9 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
 
 		klog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
 		for _, info := range scaleUpInfos {
-			typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(gpuLabel, availableGPUTypes, nodeInfo.Node(), nil), now)
+			gpuConfig := context.CloudProvider.GetNodeGpuConfig(nodeInfo.Node())
+			gpuResourceName, gpuType := gpu.GetGpuInfoForMetrics(gpuConfig, availableGPUTypes, nodeInfo.Node(), nil)
+			typedErr := executeScaleUp(context, clusterStateRegistry, info, gpuResourceName, gpuType, now)
 			if typedErr != nil {
 				return scaleUpError(
 					&status.ScaleUpStatus{
@@ -444,7 +445,6 @@ func ScaleUpToNodeGroupMinSize(context *context.AutoscalingContext, processors *
 	nodes []*apiv1.Node, nodeInfos map[string]*schedulerframework.NodeInfo) (*status.ScaleUpStatus, errors.AutoscalerError) {
 	now := time.Now()
 	nodeGroups := context.CloudProvider.NodeGroups()
-	gpuLabel := context.CloudProvider.GPULabel()
 	availableGPUTypes := context.CloudProvider.GetAvailableGPUTypes()
 	scaleUpInfos := make([]nodegroupset.ScaleUpInfo, 0)
 
@@ -521,9 +521,9 @@ func ScaleUpToNodeGroupMinSize(context *context.AutoscalingContext, processors *
 			klog.Warningf("ScaleUpToNodeGroupMinSize: failed to get node info for node group %s", info.Group.Id())
 			continue
 		}
-
-		gpuType := gpu.GetGpuTypeForMetrics(gpuLabel, availableGPUTypes, nodeInfo.Node(), nil)
-		if err := executeScaleUp(context, clusterStateRegistry, info, gpuType, now); err != nil {
+		gpuConfig := context.CloudProvider.GetNodeGpuConfig(nodeInfo.Node())
+		gpuResourceName, gpuType := gpu.GetGpuInfoForMetrics(gpuConfig, availableGPUTypes, nodeInfo.Node(), nil)
+		if err := executeScaleUp(context, clusterStateRegistry, info, gpuResourceName, gpuType, now); err != nil {
 			return scaleUpError(
 				&status.ScaleUpStatus{
 					FailedResizeNodeGroups: []cloudprovider.NodeGroup{info.Group},
@@ -605,7 +605,7 @@ func filterNodeGroupsByPods(
 	return result
 }
 
-func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuType string, now time.Time) errors.AutoscalerError {
+func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuResourceName, gpuType string, now time.Time) errors.AutoscalerError {
 	klog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
 	context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
 		"Scale-up: setting group %s size to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize)
@@ -620,7 +620,7 @@ func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *c
 		info.Group,
 		increase,
 		time.Now())
-	metrics.RegisterScaleUp(increase, gpuType)
+	metrics.RegisterScaleUp(increase, gpuResourceName, gpuType)
 	context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
 		"Scale-up: group %s size set to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize)
 	return nil

diff --git a/cluster-autoscaler/core/scale_up_test.go b/cluster-autoscaler/core/scale_up_test.go
@@ -1053,7 +1053,7 @@ func TestAuthError(t *testing.T) {
 
 	clusterStateRegistry := clusterstate.NewClusterStateRegistry(nil, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff())
 
-	aerr := executeScaleUp(&context, clusterStateRegistry, info, "", time.Now())
+	aerr := executeScaleUp(&context, clusterStateRegistry, info, "", "", time.Now())
 	assert.Error(t, aerr)
 
 	req, err := http.NewRequest("GET", "/", nil)

diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator.go b/cluster-autoscaler/core/scaledown/actuation/actuator.go
@@ -437,7 +437,9 @@ func RegisterAndRecordSuccessfulScaleDownEvent(ctx *context.AutoscalingContext,
 		Time:               time.Now(),
 		ExpectedDeleteTime: time.Now().Add(MaxCloudProviderNodeDeletionTime),
 	})
-	metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(ctx.CloudProvider.GPULabel(), ctx.CloudProvider.GetAvailableGPUTypes(), node, nodeGroup), nodeScaleDownReason(node, drain))
+	gpuConfig := ctx.CloudProvider.GetNodeGpuConfig(node)
+	metricResourceName, metricGpuType := gpu.GetGpuInfoForMetrics(gpuConfig, ctx.CloudProvider.GetAvailableGPUTypes(), node, nodeGroup)
+	metrics.RegisterScaleDown(1, metricResourceName, metricGpuType, nodeScaleDownReason(node, drain))
 	if drain {
 		ctx.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaleDown", "Scale-down: node %s removed with drain", node.Name)
 	} else {

diff --git a/cluster-autoscaler/metrics/metrics.go b/cluster-autoscaler/metrics/metrics.go
@@ -246,7 +246,7 @@ var (
 			Namespace: caNamespace,
 			Name:      "scaled_up_gpu_nodes_total",
 			Help:      "Number of GPU nodes added by CA, by GPU name.",
-		}, []string{"gpu_name"},
+		}, []string{"gpu_resource_name", "gpu_name"},
 	)
 
 	failedScaleUpCount = k8smetrics.NewCounterVec(
@@ -270,7 +270,7 @@ var (
 			Namespace: caNamespace,
 			Name:      "scaled_down_gpu_nodes_total",
 			Help:      "Number of GPU nodes removed by CA, by reason and GPU name.",
-		}, []string{"reason", "gpu_name"},
+		}, []string{"reason", "gpu_resource_name", "gpu_name"},
 	)
 
 	evictionsCount = k8smetrics.NewCounter(
@@ -490,10 +490,10 @@ func RegisterError(err errors.AutoscalerError) {
 }
 
 // RegisterScaleUp records number of nodes added by scale up
-func RegisterScaleUp(nodesCount int, gpuType string) {
+func RegisterScaleUp(nodesCount int, gpuResourceName, gpuType string) {
 	scaleUpCount.Add(float64(nodesCount))
 	if gpuType != gpu.MetricsNoGPU {
-		gpuScaleUpCount.WithLabelValues(gpuType).Add(float64(nodesCount))
+		gpuScaleUpCount.WithLabelValues(gpuResourceName, gpuType).Add(float64(nodesCount))
 	}
 }
 
@@ -503,10 +503,10 @@ func RegisterFailedScaleUp(reason FailedScaleUpReason) {
 }
 
 // RegisterScaleDown records number of nodes removed by scale down
-func RegisterScaleDown(nodesCount int, gpuType string, reason NodeScaleDownReason) {
+func RegisterScaleDown(nodesCount int, gpuResourceName, gpuType string, reason NodeScaleDownReason) {
 	scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount))
 	if gpuType != gpu.MetricsNoGPU {
-		gpuScaleDownCount.WithLabelValues(string(reason), gpuType).Add(float64(nodesCount))
+		gpuScaleDownCount.WithLabelValues(string(reason), gpuResourceName, gpuType).Add(float64(nodesCount))
 	}
 }
 

diff --git a/cluster-autoscaler/utils/gpu/gpu.go b/cluster-autoscaler/utils/gpu/gpu.go
@@ -47,47 +47,44 @@ const (
 	MetricsNoGPU = ""
 )
 
-// GetGpuTypeForMetrics returns name of the GPU used on the node or empty string if there's no GPU
+// GetGpuInfoForMetrics returns the name of the custom resource and the GPU used on the node or empty string if there's no GPU
 // if the GPU type is unknown, "generic" is returned
 // NOTE: current implementation is GKE/GCE-specific
-func GetGpuTypeForMetrics(GPULabel string, availableGPUTypes map[string]struct{}, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) string {
-	// we use the GKE label if there is one
-	gpuType, labelFound := node.Labels[GPULabel]
-	capacity, capacityFound := node.Status.Capacity[ResourceNvidiaGPU]
-
-	if !labelFound {
-		// no label, fallback to generic solution
-		if capacityFound && !capacity.IsZero() {
-			return MetricsGenericGPU
-		}
-
-		// no signs of GPU
-		return MetricsNoGPU
+func GetGpuInfoForMetrics(gpuConfig *cloudprovider.GpuConfig, availableGPUTypes map[string]struct{}, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) (gpuResource string, gpuType string) {
+	// There is no sign of GPU
+	if gpuConfig == nil {
+		return "", MetricsNoGPU
+	}
+	resourceName := gpuConfig.ResourceName
+	capacity, capacityFound := node.Status.Capacity[resourceName]
+	// There is no label value, fallback to generic solution
+	if gpuConfig.Type == "" && capacityFound && !capacity.IsZero() {
+		return resourceName.String(), MetricsGenericGPU
 	}
 
 	// GKE-specific label & capacity are present - consistent state
 	if capacityFound {
-		return validateGpuType(availableGPUTypes, gpuType)
+		return resourceName.String(), validateGpuType(availableGPUTypes, gpuConfig.Type)
 	}
 
 	// GKE-specific label present but no capacity (yet?) - check the node template
 	if nodeGroup != nil {
 		template, err := nodeGroup.TemplateNodeInfo()
 		if err != nil {
 			klog.Warningf("Failed to build template for getting GPU metrics for node %v: %v", node.Name, err)
-			return MetricsErrorGPU
+			return resourceName.String(), MetricsErrorGPU
 		}
 
-		if _, found := template.Node().Status.Capacity[ResourceNvidiaGPU]; found {
-			return MetricsMissingGPU
+		if _, found := template.Node().Status.Capacity[resourceName]; found {
+			return resourceName.String(), MetricsMissingGPU
 		}
 
 		// if template does not define GPUs we assume node will not have any even if it has gpu label
 		klog.Warningf("Template does not define GPUs even though node from its node group does; node=%v", node.Name)
-		return MetricsUnexpectedLabelGPU
+		return resourceName.String(), MetricsUnexpectedLabelGPU
 	}
 
-	return MetricsUnexpectedLabelGPU
+	return resourceName.String(), MetricsUnexpectedLabelGPU
 }
 
 func validateGpuType(availableGPUTypes map[string]struct{}, gpu string) string {