Skip to content

Commit

Permalink
Merge pull request #5518 from kawych/metrics
Browse files Browse the repository at this point in the history
Add "resource_name" to scaled_up_gpu_nodes_total and scaled_down_gpu_nodes_total metrics
  • Loading branch information
k8s-ci-robot committed Feb 22, 2023
2 parents 3922f49 + 2ea2fb6 commit c611acd
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 36 deletions.
16 changes: 8 additions & 8 deletions cluster-autoscaler/core/scale_up.go
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,6 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
}

now := time.Now()
gpuLabel := context.CloudProvider.GPULabel()
availableGPUTypes := context.CloudProvider.GetAvailableGPUTypes()
expansionOptions := make(map[string]expander.Option, 0)
skippedNodeGroups := map[string]status.Reasons{}
Expand Down Expand Up @@ -405,7 +404,9 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto

klog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
for _, info := range scaleUpInfos {
typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(gpuLabel, availableGPUTypes, nodeInfo.Node(), nil), now)
gpuConfig := context.CloudProvider.GetNodeGpuConfig(nodeInfo.Node())
gpuResourceName, gpuType := gpu.GetGpuInfoForMetrics(gpuConfig, availableGPUTypes, nodeInfo.Node(), nil)
typedErr := executeScaleUp(context, clusterStateRegistry, info, gpuResourceName, gpuType, now)
if typedErr != nil {
return scaleUpError(
&status.ScaleUpStatus{
Expand Down Expand Up @@ -444,7 +445,6 @@ func ScaleUpToNodeGroupMinSize(context *context.AutoscalingContext, processors *
nodes []*apiv1.Node, nodeInfos map[string]*schedulerframework.NodeInfo) (*status.ScaleUpStatus, errors.AutoscalerError) {
now := time.Now()
nodeGroups := context.CloudProvider.NodeGroups()
gpuLabel := context.CloudProvider.GPULabel()
availableGPUTypes := context.CloudProvider.GetAvailableGPUTypes()
scaleUpInfos := make([]nodegroupset.ScaleUpInfo, 0)

Expand Down Expand Up @@ -521,9 +521,9 @@ func ScaleUpToNodeGroupMinSize(context *context.AutoscalingContext, processors *
klog.Warningf("ScaleUpToNodeGroupMinSize: failed to get node info for node group %s", info.Group.Id())
continue
}

gpuType := gpu.GetGpuTypeForMetrics(gpuLabel, availableGPUTypes, nodeInfo.Node(), nil)
if err := executeScaleUp(context, clusterStateRegistry, info, gpuType, now); err != nil {
gpuConfig := context.CloudProvider.GetNodeGpuConfig(nodeInfo.Node())
gpuResourceName, gpuType := gpu.GetGpuInfoForMetrics(gpuConfig, availableGPUTypes, nodeInfo.Node(), nil)
if err := executeScaleUp(context, clusterStateRegistry, info, gpuResourceName, gpuType, now); err != nil {
return scaleUpError(
&status.ScaleUpStatus{
FailedResizeNodeGroups: []cloudprovider.NodeGroup{info.Group},
Expand Down Expand Up @@ -605,7 +605,7 @@ func filterNodeGroupsByPods(
return result
}

func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuType string, now time.Time) errors.AutoscalerError {
func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuResourceName, gpuType string, now time.Time) errors.AutoscalerError {
klog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
"Scale-up: setting group %s size to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize)
Expand All @@ -620,7 +620,7 @@ func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *c
info.Group,
increase,
time.Now())
metrics.RegisterScaleUp(increase, gpuType)
metrics.RegisterScaleUp(increase, gpuResourceName, gpuType)
context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
"Scale-up: group %s size set to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize)
return nil
Expand Down
2 changes: 1 addition & 1 deletion cluster-autoscaler/core/scale_up_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1053,7 +1053,7 @@ func TestAuthError(t *testing.T) {

clusterStateRegistry := clusterstate.NewClusterStateRegistry(nil, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff())

aerr := executeScaleUp(&context, clusterStateRegistry, info, "", time.Now())
aerr := executeScaleUp(&context, clusterStateRegistry, info, "", "", time.Now())
assert.Error(t, aerr)

req, err := http.NewRequest("GET", "/", nil)
Expand Down
4 changes: 3 additions & 1 deletion cluster-autoscaler/core/scaledown/actuation/actuator.go
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,9 @@ func RegisterAndRecordSuccessfulScaleDownEvent(ctx *context.AutoscalingContext,
Time: time.Now(),
ExpectedDeleteTime: time.Now().Add(MaxCloudProviderNodeDeletionTime),
})
metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(ctx.CloudProvider.GPULabel(), ctx.CloudProvider.GetAvailableGPUTypes(), node, nodeGroup), nodeScaleDownReason(node, drain))
gpuConfig := ctx.CloudProvider.GetNodeGpuConfig(node)
metricResourceName, metricGpuType := gpu.GetGpuInfoForMetrics(gpuConfig, ctx.CloudProvider.GetAvailableGPUTypes(), node, nodeGroup)
metrics.RegisterScaleDown(1, metricResourceName, metricGpuType, nodeScaleDownReason(node, drain))
if drain {
ctx.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaleDown", "Scale-down: node %s removed with drain", node.Name)
} else {
Expand Down
12 changes: 6 additions & 6 deletions cluster-autoscaler/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ var (
Namespace: caNamespace,
Name: "scaled_up_gpu_nodes_total",
Help: "Number of GPU nodes added by CA, by GPU name.",
}, []string{"gpu_name"},
}, []string{"gpu_resource_name", "gpu_name"},
)

failedScaleUpCount = k8smetrics.NewCounterVec(
Expand All @@ -270,7 +270,7 @@ var (
Namespace: caNamespace,
Name: "scaled_down_gpu_nodes_total",
Help: "Number of GPU nodes removed by CA, by reason and GPU name.",
}, []string{"reason", "gpu_name"},
}, []string{"reason", "gpu_resource_name", "gpu_name"},
)

evictionsCount = k8smetrics.NewCounter(
Expand Down Expand Up @@ -490,10 +490,10 @@ func RegisterError(err errors.AutoscalerError) {
}

// RegisterScaleUp records number of nodes added by scale up
func RegisterScaleUp(nodesCount int, gpuType string) {
func RegisterScaleUp(nodesCount int, gpuResourceName, gpuType string) {
scaleUpCount.Add(float64(nodesCount))
if gpuType != gpu.MetricsNoGPU {
gpuScaleUpCount.WithLabelValues(gpuType).Add(float64(nodesCount))
gpuScaleUpCount.WithLabelValues(gpuResourceName, gpuType).Add(float64(nodesCount))
}
}

Expand All @@ -503,10 +503,10 @@ func RegisterFailedScaleUp(reason FailedScaleUpReason) {
}

// RegisterScaleDown records number of nodes removed by scale down
func RegisterScaleDown(nodesCount int, gpuType string, reason NodeScaleDownReason) {
func RegisterScaleDown(nodesCount int, gpuResourceName, gpuType string, reason NodeScaleDownReason) {
scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount))
if gpuType != gpu.MetricsNoGPU {
gpuScaleDownCount.WithLabelValues(string(reason), gpuType).Add(float64(nodesCount))
gpuScaleDownCount.WithLabelValues(string(reason), gpuResourceName, gpuType).Add(float64(nodesCount))
}
}

Expand Down
37 changes: 17 additions & 20 deletions cluster-autoscaler/utils/gpu/gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,47 +47,44 @@ const (
MetricsNoGPU = ""
)

// GetGpuTypeForMetrics returns name of the GPU used on the node or empty string if there's no GPU
// GetGpuInfoForMetrics returns the name of the custom resource and the GPU used on the node or empty string if there's no GPU
// if the GPU type is unknown, "generic" is returned
// NOTE: current implementation is GKE/GCE-specific
func GetGpuTypeForMetrics(GPULabel string, availableGPUTypes map[string]struct{}, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) string {
// we use the GKE label if there is one
gpuType, labelFound := node.Labels[GPULabel]
capacity, capacityFound := node.Status.Capacity[ResourceNvidiaGPU]

if !labelFound {
// no label, fallback to generic solution
if capacityFound && !capacity.IsZero() {
return MetricsGenericGPU
}

// no signs of GPU
return MetricsNoGPU
func GetGpuInfoForMetrics(gpuConfig *cloudprovider.GpuConfig, availableGPUTypes map[string]struct{}, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) (gpuResource string, gpuType string) {
// There is no sign of GPU
if gpuConfig == nil {
return "", MetricsNoGPU
}
resourceName := gpuConfig.ResourceName
capacity, capacityFound := node.Status.Capacity[resourceName]
// There is no label value, fallback to generic solution
if gpuConfig.Type == "" && capacityFound && !capacity.IsZero() {
return resourceName.String(), MetricsGenericGPU
}

// GKE-specific label & capacity are present - consistent state
if capacityFound {
return validateGpuType(availableGPUTypes, gpuType)
return resourceName.String(), validateGpuType(availableGPUTypes, gpuConfig.Type)
}

// GKE-specific label present but no capacity (yet?) - check the node template
if nodeGroup != nil {
template, err := nodeGroup.TemplateNodeInfo()
if err != nil {
klog.Warningf("Failed to build template for getting GPU metrics for node %v: %v", node.Name, err)
return MetricsErrorGPU
return resourceName.String(), MetricsErrorGPU
}

if _, found := template.Node().Status.Capacity[ResourceNvidiaGPU]; found {
return MetricsMissingGPU
if _, found := template.Node().Status.Capacity[resourceName]; found {
return resourceName.String(), MetricsMissingGPU
}

// if template does not define GPUs we assume node will not have any even if it has gpu label
klog.Warningf("Template does not define GPUs even though node from its node group does; node=%v", node.Name)
return MetricsUnexpectedLabelGPU
return resourceName.String(), MetricsUnexpectedLabelGPU
}

return MetricsUnexpectedLabelGPU
return resourceName.String(), MetricsUnexpectedLabelGPU
}

func validateGpuType(availableGPUTypes map[string]struct{}, gpu string) string {
Expand Down

0 comments on commit c611acd

Please sign in to comment.