From 9a013cea2570c4ea27b8f45c09b104c154af0f06 Mon Sep 17 00:00:00 2001 From: Jiahui <4543bxy@gmail.com> Date: Mon, 31 Jul 2023 10:23:53 +0800 Subject: [PATCH] feat: adapt for gpu cost (#3596) * add GPU monitoring * add node role * add persistentvolumeclaims role * add payment-secret optional; fix account Kubefile; hide monitor useless log * fix monitoring nil pointer error * delete property string to lower --- controllers/account/deploy/Kubefile | 2 +- .../account/deploy/manifests/deploy.yaml | 1 + controllers/pkg/common/gpu/nvidia.go | 120 ++++++++++++++++++ controllers/pkg/common/resources.go | 9 ++ controllers/pkg/database/mongodb.go | 3 +- controllers/resources/config/rbac/role.yaml | 16 +++ .../controllers/monitor_controller.go | 116 +++++++++++------ .../resources/deploy/manifests/deploy.yaml | 16 +++ 8 files changed, 239 insertions(+), 44 deletions(-) create mode 100644 controllers/pkg/common/gpu/nvidia.go diff --git a/controllers/account/deploy/Kubefile b/controllers/account/deploy/Kubefile index 9e917de2b10..17f63c8677f 100644 --- a/controllers/account/deploy/Kubefile +++ b/controllers/account/deploy/Kubefile @@ -8,4 +8,4 @@ COPY manifests manifests ENV DEFAULT_NAMESPACE account-system ENV MONGO_URI "mongodb://mongo:27017/resources" -CMD ["( kubectl create -f manifests/mongo-secret.yaml -n $DEFAULT_NAMESPACE || true ) && kubectl apply -f manifests/deploy.yaml -n $DEFAULT_NAMESPACE"] +CMD ["( kubectl create ns $DEFAULT_NAMESPACE || true ) && ( kubectl create -f manifests/mongo-secret.yaml -n $DEFAULT_NAMESPACE || true ) && kubectl apply -f manifests/deploy.yaml -n $DEFAULT_NAMESPACE"] diff --git a/controllers/account/deploy/manifests/deploy.yaml b/controllers/account/deploy/manifests/deploy.yaml index 612523ae80b..6e56cb4738f 100644 --- a/controllers/account/deploy/manifests/deploy.yaml +++ b/controllers/account/deploy/manifests/deploy.yaml @@ -1168,6 +1168,7 @@ spec: envFrom: - secretRef: name: payment-secret + optional: true image: 
ghcr.io/labring/sealos-account-controller:latest imagePullPolicy: Always livenessProbe: diff --git a/controllers/pkg/common/gpu/nvidia.go b/controllers/pkg/common/gpu/nvidia.go new file mode 100644 index 00000000000..1b47889af93 --- /dev/null +++ b/controllers/pkg/common/gpu/nvidia.go @@ -0,0 +1,120 @@ +package gpu + +import ( + "context" + + corev1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +// nvidia labels for gpu +const ( + NvidiaGpuKey = "nvidia.com/gpu" + NvidiaCudaDriverMajorKey = "nvidia.com/cuda.driver.major" + NvidiaCudaDriverMinorKey = "nvidia.com/cuda.driver.minor" + NvidiaCudaDriverRevKey = "nvidia.com/cuda.driver.rev" + NvidiaCudaRuntimeMajorKey = "nvidia.com/cuda.runtime.major" + NvidiaCudaRuntimeMinorKey = "nvidia.com/cuda.runtime.minor" + NvidiaGfdTimestampKey = "nvidia.com/gfd.timestamp" + NvidiaGpuComputeMajorKey = "nvidia.com/gpu.compute.major" + NvidiaGpuComputeMinorKey = "nvidia.com/gpu.compute.minor" + NvidiaGpuCountKey = "nvidia.com/gpu.count" + NvidiaGpuDeployContainerToolkitKey = "nvidia.com/gpu.deploy.container-toolkit" + NvidiaGpuDeployDcgmKey = "nvidia.com/gpu.deploy.dcgm" + NvidiaGpuDeployDcgmExporterKey = "nvidia.com/gpu.deploy.dcgm-exporter" + NvidiaGpuDeployDevicePluginKey = "nvidia.com/gpu.deploy.device-plugin" + NvidiaGpuDeployDriverKey = "nvidia.com/gpu.deploy.driver" + NvidiaGpuDeployGpuFeatureDiscoveryKey = "nvidia.com/gpu.deploy.gpu-feature-discovery" + NvidiaGpuDeployNodeStatusExporterKey = "nvidia.com/gpu.deploy.node-status-exporter" + NvidiaGpuDeployOperatorValidatorKey = "nvidia.com/gpu.deploy.operator-validator" + NvidiaGpuFamilyKey = "nvidia.com/gpu.family" + NvidiaGpuMachineKey = "nvidia.com/gpu.machine" + NvidiaGpuMemoryKey = "nvidia.com/gpu.memory" + NvidiaGpuPresentKey = "nvidia.com/gpu.present" + NvidiaGpuProductKey = "nvidia.com/gpu.product" + NvidiaGpuReplicasKey = "nvidia.com/gpu.replicas" + NvidiaMigCapableKey = "nvidia.com/mig.capable" + NvidiaMigStrategyKey = 
"nvidia.com/mig.strategy" +) + +type NvidiaGPU struct { + GpuInfo Information + CudaInfo CudaInformation + GpuDeploy Deployment + GpuDetails DetailInformation + MigInfo MigInformation +} + +type Information struct { + Gpu string + GpuCount string + GpuPresent string + GpuProduct string + GpuReplicas string +} + +type CudaInformation struct { + CudaDriverMajor string + CudaDriverMinor string + CudaDriverRev string + CudaRuntimeMajor string + CudaRuntimeMinor string +} + +type Deployment struct { + GpuDeployContainerToolkit string + GpuDeployDcgm string + GpuDeployDcgmExporter string + GpuDeployDevicePlugin string + GpuDeployDriver string + GpuDeployGpuFeatureDiscovery string + GpuDeployNodeStatusExporter string + GpuDeployOperatorValidator string +} + +type DetailInformation struct { + GpuComputeMajor string + GpuComputeMinor string + GpuFamily string + GpuMachine string + GpuMemory string + GfdTimestamp string +} + +type MigInformation struct { + MigCapable string + MigStrategy string +} + +//nvidia.com/gpu + +func GetNodeGpuModel(c client.Client) (map[string]NvidiaGPU, error) { + nodeList := &corev1.NodeList{} + err := c.List(context.Background(), nodeList) + if err != nil { + return nil, err + } + + gpuModels := make(map[string]NvidiaGPU) + for _, node := range nodeList.Items { + gpu := NvidiaGPU{ + GpuInfo: Information{ + Gpu: node.Labels[NvidiaGpuKey], + GpuCount: node.Labels[NvidiaGpuCountKey], + GpuPresent: node.Labels[NvidiaGpuPresentKey], + GpuProduct: node.Labels[NvidiaGpuProductKey], + GpuReplicas: node.Labels[NvidiaGpuReplicasKey], + }, + CudaInfo: CudaInformation{ + CudaDriverMajor: node.Labels[NvidiaCudaDriverMajorKey], + CudaDriverMinor: node.Labels[NvidiaCudaDriverMinorKey], + CudaDriverRev: node.Labels[NvidiaCudaDriverRevKey], + CudaRuntimeMajor: node.Labels[NvidiaCudaRuntimeMajorKey], + CudaRuntimeMinor: node.Labels[NvidiaCudaRuntimeMinorKey], + }, + // fill in the rest similarly... 
+ } + gpuModels[node.Name] = gpu + } + return gpuModels, nil +} diff --git a/controllers/pkg/common/resources.go b/controllers/pkg/common/resources.go index 71e8a2b9364..8609c04fbe5 100644 --- a/controllers/pkg/common/resources.go +++ b/controllers/pkg/common/resources.go @@ -6,6 +6,8 @@ import ( "math" "time" + "github.com/labring/sealos/controllers/pkg/common/gpu" + "go.mongodb.org/mongo-driver/bson" "go.mongodb.org/mongo-driver/mongo" @@ -112,6 +114,12 @@ const ( PropertyInfraDisk = "infra-disk" ) +const ResourceGPU corev1.ResourceName = gpu.NvidiaGpuKey + +func NewGpuResource(product string) corev1.ResourceName { + return corev1.ResourceName("gpu-" + product) +} + var ( bin1Mi = resource.NewQuantity(1<<20, resource.BinarySI) cpuUnit = resource.MustParse("1m") @@ -119,6 +127,7 @@ var ( var PricesUnit = map[corev1.ResourceName]*resource.Quantity{ corev1.ResourceCPU: &cpuUnit, // 1 m CPU (1000 μ) + ResourceGPU: &cpuUnit, // 1 m GPU (1000 μ) corev1.ResourceMemory: bin1Mi, // 1 MiB corev1.ResourceStorage: bin1Mi, // 1 MiB } diff --git a/controllers/pkg/database/mongodb.go b/controllers/pkg/database/mongodb.go index a81e5d9774c..38199de61a0 100644 --- a/controllers/pkg/database/mongodb.go +++ b/controllers/pkg/database/mongodb.go @@ -4,7 +4,6 @@ import ( "context" "fmt" "math" - "strings" "time" "github.com/labring/sealos/controllers/pkg/crypto" @@ -197,7 +196,7 @@ func (m *MongoDB) GetAllPricesMap() (map[string]common.Price, error) { if err != nil { return nil, fmt.Errorf("decrypt price error: %v", err) } - pricesMap[strings.ToLower(prices[i].Property)] = common.Price{ + pricesMap[prices[i].Property] = common.Price{ Price: price, Detail: prices[i].Detail, Property: prices[i].Property, diff --git a/controllers/resources/config/rbac/role.yaml b/controllers/resources/config/rbac/role.yaml index a0d28a48623..9b1c44d0798 100644 --- a/controllers/resources/config/rbac/role.yaml +++ b/controllers/resources/config/rbac/role.yaml @@ -13,6 +13,22 @@ rules: - get - list - 
watch +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - get + - list + - watch - apiGroups: - "" resources: diff --git a/controllers/resources/controllers/monitor_controller.go b/controllers/resources/controllers/monitor_controller.go index 4ca7c540a68..d82164d1f30 100644 --- a/controllers/resources/controllers/monitor_controller.go +++ b/controllers/resources/controllers/monitor_controller.go @@ -25,6 +25,8 @@ import ( "sync" "time" + "github.com/labring/sealos/controllers/pkg/common/gpu" + "golang.org/x/sync/semaphore" "github.com/labring/sealos/pkg/utils/logger" @@ -34,7 +36,6 @@ import ( meteringv1 "github.com/labring/sealos/controllers/metering/api/v1" "github.com/labring/sealos/controllers/pkg/common" "github.com/labring/sealos/controllers/pkg/database" - v1 "github.com/labring/sealos/controllers/user/api/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/runtime" @@ -52,6 +53,7 @@ type MonitorReconciler struct { stopCh chan struct{} wg sync.WaitGroup periodicReconcile time.Duration + NvidiaGpu map[string]gpu.NvidiaGPU } type quantity struct { @@ -67,6 +69,8 @@ const ( var namespaceMonitorFuncs = make(map[string]func(ctx context.Context, dbClient database.Interface, namespace *corev1.Namespace) error) +//+kubebuilder:rbac:groups=core,resources=nodes,verbs=get;list;watch +//+kubebuilder:rbac:groups=core,resources=persistentvolumeclaims,verbs=get;list;watch //+kubebuilder:rbac:groups=core,resources=namespaces,verbs=get;list;watch //+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch //+kubebuilder:rbac:groups=core,resources=resourcequotas,verbs=get;list;watch @@ -87,9 +91,15 @@ func NewMonitorReconciler(mgr ctrl.Manager) (*MonitorReconciler, error) { return nil, fmt.Errorf("mongo uri is empty") } r.initNamespaceFuncs() - if err := r.preApply(); err != nil { + err := r.preApply() + if err != nil { 
return nil, err } + r.NvidiaGpu, err = gpu.GetNodeGpuModel(mgr.GetClient()) + if err != nil { + return nil, fmt.Errorf("failed to get node gpu model: %v", err) + } + r.Logger.Info("get gpu model", "gpu model", r.NvidiaGpu) r.startPeriodicReconcile() return r, nil } @@ -268,27 +278,24 @@ func (r *MonitorReconciler) podResourceUsage(ctx context.Context, dbClient datab return err } rs := initResources() + hasStorageQuota := false if err := r.Get(ctx, client.ObjectKey{Name: meteringv1.ResourceQuotaPrefix + namespace.Name, Namespace: namespace.Name}, &quota); err != nil { if client.IgnoreNotFound(err) != nil { return err } - if _, ok := namespace.GetAnnotations()[v1.UserAnnotationCreatorKey]; ok { - //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=rolebindings,verbs=get;list;watch;create;update;patch;delete - //+kubebuilder:rbac:groups=rbac.authorization.k8s.io,resources=roles,verbs=get;list;watch;create;update;patch;delete - //if err = r.syncResourceQuota(ctx, namespace.Name); err != nil { - // r.Logger.Error(err, "sync resource quota failed", "namespace", namespace.Name) - //} - r.Logger.Error(fmt.Errorf("resources quota is empty"), "", "namespace", namespace.Name) - } + //if _, ok := namespace.GetAnnotations()[v1.UserAnnotationOwnerKey]; ok { + // r.Logger.Error(fmt.Errorf("resources quota is empty"), "", "namespace", namespace.Name) + //} rs[corev1.ResourceStorage].detail = "no resource quota" } else { + hasStorageQuota = true rs[corev1.ResourceStorage].Add(*quota.Status.Used.Name("requests.storage", resource.BinarySI)) } for _, pod := range podList.Items { // TODO pending status need skip? 
- if pod.Status.Phase != corev1.PodRunning /*&& pod.Status.Phase != corev1.PodPending*/ { - continue - } + //if pod.Status.Phase != corev1.PodRunning /*&& pod.Status.Phase != corev1.PodPending*/ { + // continue + //} for _, container := range pod.Spec.Containers { if cpuRequest, ok := container.Resources.Limits[corev1.ResourceCPU]; ok { rs[corev1.ResourceCPU].Add(cpuRequest) @@ -300,56 +307,83 @@ func (r *MonitorReconciler) podResourceUsage(ctx context.Context, dbClient datab } else { rs[corev1.ResourceMemory].Add(container.Resources.Requests[corev1.ResourceMemory]) } + // gpu only use limit + if gpuRequest, ok := container.Resources.Limits[gpu.NvidiaGpuKey]; ok { + gpuModel, ok := r.NvidiaGpu[pod.Spec.NodeName] + if !ok { + var err error + r.NvidiaGpu, err = gpu.GetNodeGpuModel(r.Client) + if err != nil { + logger.Error(err, "get node gpu model failed") + continue + } + gpuModel, ok = r.NvidiaGpu[pod.Spec.NodeName] + if !ok { + logger.Error(fmt.Errorf("node %s not found gpu model", pod.Spec.NodeName), "") + continue + } + } + if _, ok := rs[common.NewGpuResource(gpuModel.GpuInfo.GpuProduct)]; !ok { + rs[common.NewGpuResource(gpuModel.GpuInfo.GpuProduct)] = initGpuResources() + } + logger.Info("gpu request", "pod", pod.Name, "namespace", pod.Namespace, "gpu req", gpuRequest.String(), "node", pod.Spec.NodeName, "gpu model", gpuModel.GpuInfo.GpuProduct) + rs[common.NewGpuResource(gpuModel.GpuInfo.GpuProduct)].Add(gpuRequest) + } } } - cpuValue, memoryValue, storageValue := getResourceValue(corev1.ResourceCPU, rs), getResourceValue(corev1.ResourceMemory, rs), getResourceValue(corev1.ResourceStorage, rs) - var monitors []*common.Monitor - if cpuValue > 0 { - monitors = append(monitors, &common.Monitor{ - Category: namespace.Name, - Property: corev1.ResourceCPU.String(), - Value: cpuValue, - Time: timeStamp, - Detail: rs[corev1.ResourceCPU].String(), - }) - } - if memoryValue > 0 { - monitors = append(monitors, &common.Monitor{ - Category: namespace.Name, - Property: 
corev1.ResourceMemory.String(), - Value: memoryValue, - Time: timeStamp, - Detail: rs[corev1.ResourceMemory].String(), - }) + if !hasStorageQuota { + pvcList := corev1.PersistentVolumeClaimList{} + if err := r.List(context.Background(), &pvcList, &client.ListOptions{Namespace: namespace.Name}); err != nil { + return err + } + for _, pvc := range pvcList.Items { + if pvc.Status.Phase != corev1.ClaimBound { + continue + } + rs[corev1.ResourceStorage].Add(pvc.Spec.Resources.Requests[corev1.ResourceStorage]) + } } - if storageValue > 0 { - monitors = append(monitors, &common.Monitor{ - Category: namespace.Name, - Property: corev1.ResourceStorage.String(), - Value: storageValue, - Time: timeStamp, - Detail: rs[corev1.ResourceStorage].String(), - }) + var monitors []*common.Monitor + for resour, value := range rs { + v := getResourceValue(resour, rs) + if v > 0 { + monitors = append(monitors, &common.Monitor{ + Category: namespace.Name, + Property: resour.String(), + Value: v, + Time: timeStamp, + Detail: value.detail, + }) + } } return dbClient.InsertMonitor(ctx, monitors...) 
} func getResourceValue(resourceName corev1.ResourceName, res map[corev1.ResourceName]*quantity) int64 { quantity := res[resourceName] + priceUnit := common.PricesUnit[resourceName] + if strings.Contains(resourceName.String(), "gpu") { + priceUnit = common.PricesUnit[common.ResourceGPU] + } if quantity != nil && quantity.MilliValue() != 0 { - return int64(math.Ceil(float64(quantity.MilliValue()) / float64(common.PricesUnit[resourceName].MilliValue()))) + return int64(math.Ceil(float64(quantity.MilliValue()) / float64(priceUnit.MilliValue()))) } return 0 } func initResources() (rs map[corev1.ResourceName]*quantity) { rs = make(map[corev1.ResourceName]*quantity) + rs[common.ResourceGPU] = initGpuResources() rs[corev1.ResourceCPU] = &quantity{Quantity: resource.NewQuantity(0, resource.DecimalSI), detail: ""} rs[corev1.ResourceMemory] = &quantity{Quantity: resource.NewQuantity(0, resource.BinarySI), detail: ""} rs[corev1.ResourceStorage] = &quantity{Quantity: resource.NewQuantity(0, resource.BinarySI), detail: ""} return } +func initGpuResources() *quantity { + return &quantity{Quantity: resource.NewQuantity(0, resource.DecimalSI), detail: ""} +} + func (r *MonitorReconciler) infraResourceUsage(ctx context.Context, dbClient database.Interface, namespace *corev1.Namespace) error { var infraList infrav1.InfraList if err := r.List(ctx, &infraList, client.InNamespace(namespace.Name)); err != nil { diff --git a/controllers/resources/deploy/manifests/deploy.yaml b/controllers/resources/deploy/manifests/deploy.yaml index 94157ee36f1..e7349be104a 100644 --- a/controllers/resources/deploy/manifests/deploy.yaml +++ b/controllers/resources/deploy/manifests/deploy.yaml @@ -63,6 +63,22 @@ rules: - get - list - watch +- apiGroups: + - "" + resources: + - nodes + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - persistentvolumeclaims + verbs: + - get + - list + - watch - apiGroups: - "" resources: