Add "resource_name" to scaled_up_gpu_nodes_total and scaled_down_gpu_nodes_total metrics #5518

Merged: 1 commit, merged Feb 22, 2023
16 changes: 8 additions & 8 deletions cluster-autoscaler/core/scale_up.go
@@ -209,7 +209,6 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto
}

now := time.Now()
gpuLabel := context.CloudProvider.GPULabel()
availableGPUTypes := context.CloudProvider.GetAvailableGPUTypes()
expansionOptions := make(map[string]expander.Option, 0)
skippedNodeGroups := map[string]status.Reasons{}
@@ -405,7 +404,9 @@ func ScaleUp(context *context.AutoscalingContext, processors *ca_processors.Auto

klog.V(1).Infof("Final scale-up plan: %v", scaleUpInfos)
for _, info := range scaleUpInfos {
typedErr := executeScaleUp(context, clusterStateRegistry, info, gpu.GetGpuTypeForMetrics(gpuLabel, availableGPUTypes, nodeInfo.Node(), nil), now)
gpuConfig := context.CloudProvider.GetNodeGpuConfig(nodeInfo.Node())
gpuResourceName, gpuType := gpu.GetGpuInfoForMetrics(gpuConfig, availableGPUTypes, nodeInfo.Node(), nil)
typedErr := executeScaleUp(context, clusterStateRegistry, info, gpuResourceName, gpuType, now)
if typedErr != nil {
return scaleUpError(
&status.ScaleUpStatus{
@@ -444,7 +445,6 @@ func ScaleUpToNodeGroupMinSize(context *context.AutoscalingContext, processors *
nodes []*apiv1.Node, nodeInfos map[string]*schedulerframework.NodeInfo) (*status.ScaleUpStatus, errors.AutoscalerError) {
now := time.Now()
nodeGroups := context.CloudProvider.NodeGroups()
gpuLabel := context.CloudProvider.GPULabel()
availableGPUTypes := context.CloudProvider.GetAvailableGPUTypes()
scaleUpInfos := make([]nodegroupset.ScaleUpInfo, 0)

@@ -521,9 +521,9 @@ func ScaleUpToNodeGroupMinSize(context *context.AutoscalingContext, processors *
klog.Warningf("ScaleUpToNodeGroupMinSize: failed to get node info for node group %s", info.Group.Id())
continue
}

gpuType := gpu.GetGpuTypeForMetrics(gpuLabel, availableGPUTypes, nodeInfo.Node(), nil)
if err := executeScaleUp(context, clusterStateRegistry, info, gpuType, now); err != nil {
gpuConfig := context.CloudProvider.GetNodeGpuConfig(nodeInfo.Node())
gpuResourceName, gpuType := gpu.GetGpuInfoForMetrics(gpuConfig, availableGPUTypes, nodeInfo.Node(), nil)
if err := executeScaleUp(context, clusterStateRegistry, info, gpuResourceName, gpuType, now); err != nil {
return scaleUpError(
&status.ScaleUpStatus{
FailedResizeNodeGroups: []cloudprovider.NodeGroup{info.Group},
@@ -605,7 +605,7 @@ func filterNodeGroupsByPods(
return result
}

func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuType string, now time.Time) errors.AutoscalerError {
func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *clusterstate.ClusterStateRegistry, info nodegroupset.ScaleUpInfo, gpuResourceName, gpuType string, now time.Time) errors.AutoscalerError {
klog.V(0).Infof("Scale-up: setting group %s size to %d", info.Group.Id(), info.NewSize)
context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
"Scale-up: setting group %s size to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize)
@@ -620,7 +620,7 @@ func executeScaleUp(context *context.AutoscalingContext, clusterStateRegistry *c
info.Group,
increase,
time.Now())
metrics.RegisterScaleUp(increase, gpuType)
metrics.RegisterScaleUp(increase, gpuResourceName, gpuType)
context.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaledUpGroup",
"Scale-up: group %s size set to %d instead of %d (max: %d)", info.Group.Id(), info.NewSize, info.CurrentSize, info.MaxSize)
return nil
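Note: the scale-up path above now derives GPU information from the cloud provider's GetNodeGpuConfig instead of the bare GPU label. For readers following along, a minimal sketch of the GpuConfig fields this diff relies on (introduced in #5459; field names are inferred from usage here, so treat this as an assumption rather than the definitive definition):

// Sketch only: the parts of cloudprovider.GpuConfig used in this PR.
// GetNodeGpuConfig(node) is expected to return nil for nodes without a GPU.
type GpuConfig struct {
	Label        string             // node label advertising the GPU, e.g. a GKE accelerator label
	Type         string             // GPU type read from that label, e.g. "nvidia-tesla-t4"
	ResourceName apiv1.ResourceName // extended resource name, e.g. "nvidia.com/gpu"
}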
2 changes: 1 addition & 1 deletion cluster-autoscaler/core/scale_up_test.go
@@ -1053,7 +1053,7 @@ func TestAuthError(t *testing.T) {

clusterStateRegistry := clusterstate.NewClusterStateRegistry(nil, clusterstate.ClusterStateRegistryConfig{}, context.LogRecorder, NewBackoff())

aerr := executeScaleUp(&context, clusterStateRegistry, info, "", time.Now())
aerr := executeScaleUp(&context, clusterStateRegistry, info, "", "", time.Now())
assert.Error(t, aerr)

req, err := http.NewRequest("GET", "/", nil)
4 changes: 3 additions & 1 deletion cluster-autoscaler/core/scaledown/actuation/actuator.go
@@ -437,7 +437,9 @@ func RegisterAndRecordSuccessfulScaleDownEvent(ctx *context.AutoscalingContext,
Time: time.Now(),
ExpectedDeleteTime: time.Now().Add(MaxCloudProviderNodeDeletionTime),
})
metrics.RegisterScaleDown(1, gpu.GetGpuTypeForMetrics(ctx.CloudProvider.GPULabel(), ctx.CloudProvider.GetAvailableGPUTypes(), node, nodeGroup), nodeScaleDownReason(node, drain))
gpuConfig := ctx.CloudProvider.GetNodeGpuConfig(node)
metricResourceName, metricGpuType := gpu.GetGpuInfoForMetrics(gpuConfig, ctx.CloudProvider.GetAvailableGPUTypes(), node, nodeGroup)
metrics.RegisterScaleDown(1, metricResourceName, metricGpuType, nodeScaleDownReason(node, drain))
if drain {
ctx.LogRecorder.Eventf(apiv1.EventTypeNormal, "ScaleDown", "Scale-down: node %s removed with drain", node.Name)
} else {
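With this change the scale-down counter carries both the GPU resource name and the GPU type. A small illustration of the updated call, assuming the existing Underutilized reason constant and hypothetical label values (real callers obtain them from GetGpuInfoForMetrics and nodeScaleDownReason):

// Increments
//   cluster_autoscaler_scaled_down_gpu_nodes_total{reason="underutilized",gpu_resource_name="nvidia.com/gpu",gpu_name="nvidia-tesla-t4"}
metrics.RegisterScaleDown(1, "nvidia.com/gpu", "nvidia-tesla-t4", metrics.Underutilized)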
12 changes: 6 additions & 6 deletions cluster-autoscaler/metrics/metrics.go
@@ -246,7 +246,7 @@ var (
Namespace: caNamespace,
Name: "scaled_up_gpu_nodes_total",
Help: "Number of GPU nodes added by CA, by GPU name.",
}, []string{"gpu_name"},
}, []string{"gpu_resource_name", "gpu_name"},
)

failedScaleUpCount = k8smetrics.NewCounterVec(
@@ -270,7 +270,7 @@ var (
Namespace: caNamespace,
Name: "scaled_down_gpu_nodes_total",
Help: "Number of GPU nodes removed by CA, by reason and GPU name.",
}, []string{"reason", "gpu_name"},
}, []string{"reason", "gpu_resource_name", "gpu_name"},
)

evictionsCount = k8smetrics.NewCounter(
@@ -490,10 +490,10 @@ func RegisterError(err errors.AutoscalerError) {
}

// RegisterScaleUp records number of nodes added by scale up
func RegisterScaleUp(nodesCount int, gpuType string) {
func RegisterScaleUp(nodesCount int, gpuResourceName, gpuType string) {
scaleUpCount.Add(float64(nodesCount))
if gpuType != gpu.MetricsNoGPU {
gpuScaleUpCount.WithLabelValues(gpuType).Add(float64(nodesCount))
gpuScaleUpCount.WithLabelValues(gpuResourceName, gpuType).Add(float64(nodesCount))
}
}

@@ -503,10 +503,10 @@ func RegisterFailedScaleUp(reason FailedScaleUpReason) {
}

// RegisterScaleDown records number of nodes removed by scale down
func RegisterScaleDown(nodesCount int, gpuType string, reason NodeScaleDownReason) {
func RegisterScaleDown(nodesCount int, gpuResourceName, gpuType string, reason NodeScaleDownReason) {
scaleDownCount.WithLabelValues(string(reason)).Add(float64(nodesCount))
if gpuType != gpu.MetricsNoGPU {
gpuScaleDownCount.WithLabelValues(string(reason), gpuType).Add(float64(nodesCount))
gpuScaleDownCount.WithLabelValues(string(reason), gpuResourceName, gpuType).Add(float64(nodesCount))
}
}

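The new gpu_resource_name label changes the identity of the exported series. A brief illustration of the updated RegisterScaleUp call and the sample it increments (label values are hypothetical):

// Increments
//   cluster_autoscaler_scaled_up_gpu_nodes_total{gpu_resource_name="nvidia.com/gpu",gpu_name="nvidia-tesla-t4"}
// by 2; real callers obtain both values from gpu.GetGpuInfoForMetrics.
metrics.RegisterScaleUp(2, "nvidia.com/gpu", "nvidia-tesla-t4")

Queries that aggregate by gpu_name (for example sum by (gpu_name)) keep working; queries that expected exactly one series per GPU type should now aggregate away gpu_resource_name as well.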
37 changes: 17 additions & 20 deletions cluster-autoscaler/utils/gpu/gpu.go
@@ -47,47 +47,44 @@ const (
MetricsNoGPU = ""
)

// GetGpuTypeForMetrics returns name of the GPU used on the node or empty string if there's no GPU
// GetGpuInfoForMetrics returns the name of the custom resource and the GPU used on the node or empty string if there's no GPU
// if the GPU type is unknown, "generic" is returned
// NOTE: current implementation is GKE/GCE-specific
func GetGpuTypeForMetrics(GPULabel string, availableGPUTypes map[string]struct{}, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) string {
// we use the GKE label if there is one
gpuType, labelFound := node.Labels[GPULabel]
capacity, capacityFound := node.Status.Capacity[ResourceNvidiaGPU]

if !labelFound {
// no label, fallback to generic solution
if capacityFound && !capacity.IsZero() {
return MetricsGenericGPU
}

// no signs of GPU
return MetricsNoGPU
func GetGpuInfoForMetrics(gpuConfig *cloudprovider.GpuConfig, availableGPUTypes map[string]struct{}, node *apiv1.Node, nodeGroup cloudprovider.NodeGroup) (gpuResource string, gpuType string) {
// There is no sign of GPU
if gpuConfig == nil {
Collaborator:
nit: It took me a bit to figure out that the PR doesn't change behavior for the existing GPU logic (there's still one difference: this function only looks at capacity, while GetNodeGpuConfig utilizes NodeHasGpu, which looks at allocatable; but capacity and allocatable should be in sync for GPUs, and allocatable is arguably more correct, so it looks fine to me). This function could really use a unit test, if you're up for it.

Contributor Author:
This is a little bit tricky. I started writing a test but found out that the previous PR #5459 introduced a "hidden" import cycle (so far the cycle happens only if you use the cloudprovider test package). If you're OK with it, I'd prefer to follow up in the next PR.

Collaborator:
The cycle will probably have to be solved sooner or later, but a follow-up SGTM.


return "", MetricsNoGPU
}
resourceName := gpuConfig.ResourceName
capacity, capacityFound := node.Status.Capacity[resourceName]
// There is no label value, fallback to generic solution
if gpuConfig.Type == "" && capacityFound && !capacity.IsZero() {
return resourceName.String(), MetricsGenericGPU
}

// GKE-specific label & capacity are present - consistent state
if capacityFound {
return validateGpuType(availableGPUTypes, gpuType)
return resourceName.String(), validateGpuType(availableGPUTypes, gpuConfig.Type)
}

// GKE-specific label present but no capacity (yet?) - check the node template
if nodeGroup != nil {
template, err := nodeGroup.TemplateNodeInfo()
if err != nil {
klog.Warningf("Failed to build template for getting GPU metrics for node %v: %v", node.Name, err)
return MetricsErrorGPU
return resourceName.String(), MetricsErrorGPU
}

if _, found := template.Node().Status.Capacity[ResourceNvidiaGPU]; found {
return MetricsMissingGPU
if _, found := template.Node().Status.Capacity[resourceName]; found {
return resourceName.String(), MetricsMissingGPU
}

// if template does not define GPUs we assume node will not have any even if it has gpu label
klog.Warningf("Template does not define GPUs even though node from its node group does; node=%v", node.Name)
return MetricsUnexpectedLabelGPU
return resourceName.String(), MetricsUnexpectedLabelGPU
}

return MetricsUnexpectedLabelGPU
return resourceName.String(), MetricsUnexpectedLabelGPU
}

func validateGpuType(availableGPUTypes map[string]struct{}, gpu string) string {
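The review thread above asks for a unit test of GetGpuInfoForMetrics but defers it because of an import cycle involving the cloud provider test package. A minimal sketch of such a test, constructing GpuConfig directly so that the test package is not needed; the GpuConfig field names are assumed from their usage in this diff:

package gpu

import (
	"testing"

	apiv1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
)

func TestGetGpuInfoForMetrics(t *testing.T) {
	node := &apiv1.Node{}
	node.Status.Capacity = apiv1.ResourceList{
		ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI),
	}
	gpuConfig := &cloudprovider.GpuConfig{
		Type:         "nvidia-tesla-t4",
		ResourceName: ResourceNvidiaGPU,
	}
	available := map[string]struct{}{"nvidia-tesla-t4": {}}

	// Known GPU type with capacity present: expect the resource name and the type itself.
	resourceName, gpuType := GetGpuInfoForMetrics(gpuConfig, available, node, nil)
	if resourceName != "nvidia.com/gpu" || gpuType != "nvidia-tesla-t4" {
		t.Errorf("unexpected result: (%q, %q)", resourceName, gpuType)
	}

	// Nil config: the node is treated as having no GPU.
	if _, gpuType := GetGpuInfoForMetrics(nil, available, node, nil); gpuType != MetricsNoGPU {
		t.Errorf("expected MetricsNoGPU, got %q", gpuType)
	}
}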