Skip to content

Commit

Permalink
feat: adapt for gpu cost (#3596)
Browse files Browse the repository at this point in the history
* add GPU monitoring

* add node role

* add persistentvolumeclaims role

* make payment-secret optional;
fix account Kubefile;
hide useless monitor logs

* fix monitoring nil pointer error

* stop lower-casing the property string
  • Loading branch information
bxy4543 committed Jul 31, 2023
1 parent 4933287 commit 9a013ce
Show file tree
Hide file tree
Showing 8 changed files with 239 additions and 44 deletions.
2 changes: 1 addition & 1 deletion controllers/account/deploy/Kubefile
Expand Up @@ -8,4 +8,4 @@ COPY manifests manifests
ENV DEFAULT_NAMESPACE account-system
ENV MONGO_URI "mongodb://mongo:27017/resources"

CMD ["( kubectl create -f manifests/mongo-secret.yaml -n $DEFAULT_NAMESPACE || true ) && kubectl apply -f manifests/deploy.yaml -n $DEFAULT_NAMESPACE"]
CMD ["( kubectl create ns $DEFAULT_NAMESPACE || true ) && ( kubectl create -f manifests/mongo-secret.yaml -n $DEFAULT_NAMESPACE || true ) && kubectl apply -f manifests/deploy.yaml -n $DEFAULT_NAMESPACE"]
1 change: 1 addition & 0 deletions controllers/account/deploy/manifests/deploy.yaml
Expand Up @@ -1168,6 +1168,7 @@ spec:
envFrom:
- secretRef:
name: payment-secret
optional: true
image: ghcr.io/labring/sealos-account-controller:latest
imagePullPolicy: Always
livenessProbe:
Expand Down
120 changes: 120 additions & 0 deletions controllers/pkg/common/gpu/nvidia.go
@@ -0,0 +1,120 @@
package gpu

import (
"context"

corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"
)

// Keys under the "nvidia.com/" prefix. GetNodeGpuModel looks several of these
// up in a node's label map (node.Labels).
// NOTE(review): the labels are presumably published by the NVIDIA GPU
// operator / GPU feature discovery — confirm against the cluster setup.
const (
// Base GPU key.
NvidiaGpuKey = "nvidia.com/gpu"
// CUDA driver and runtime version components.
NvidiaCudaDriverMajorKey = "nvidia.com/cuda.driver.major"
NvidiaCudaDriverMinorKey = "nvidia.com/cuda.driver.minor"
NvidiaCudaDriverRevKey = "nvidia.com/cuda.driver.rev"
NvidiaCudaRuntimeMajorKey = "nvidia.com/cuda.runtime.major"
NvidiaCudaRuntimeMinorKey = "nvidia.com/cuda.runtime.minor"
// "gfd.timestamp" key.
NvidiaGfdTimestampKey = "nvidia.com/gfd.timestamp"
// GPU compute version components and GPU count.
NvidiaGpuComputeMajorKey = "nvidia.com/gpu.compute.major"
NvidiaGpuComputeMinorKey = "nvidia.com/gpu.compute.minor"
NvidiaGpuCountKey = "nvidia.com/gpu.count"
// "gpu.deploy.*" keys, one per component.
NvidiaGpuDeployContainerToolkitKey = "nvidia.com/gpu.deploy.container-toolkit"
NvidiaGpuDeployDcgmKey = "nvidia.com/gpu.deploy.dcgm"
NvidiaGpuDeployDcgmExporterKey = "nvidia.com/gpu.deploy.dcgm-exporter"
NvidiaGpuDeployDevicePluginKey = "nvidia.com/gpu.deploy.device-plugin"
NvidiaGpuDeployDriverKey = "nvidia.com/gpu.deploy.driver"
NvidiaGpuDeployGpuFeatureDiscoveryKey = "nvidia.com/gpu.deploy.gpu-feature-discovery"
NvidiaGpuDeployNodeStatusExporterKey = "nvidia.com/gpu.deploy.node-status-exporter"
NvidiaGpuDeployOperatorValidatorKey = "nvidia.com/gpu.deploy.operator-validator"
// Remaining "gpu.*" detail keys.
NvidiaGpuFamilyKey = "nvidia.com/gpu.family"
NvidiaGpuMachineKey = "nvidia.com/gpu.machine"
NvidiaGpuMemoryKey = "nvidia.com/gpu.memory"
NvidiaGpuPresentKey = "nvidia.com/gpu.present"
NvidiaGpuProductKey = "nvidia.com/gpu.product"
NvidiaGpuReplicasKey = "nvidia.com/gpu.replicas"
// "mig.*" keys.
NvidiaMigCapableKey = "nvidia.com/mig.capable"
NvidiaMigStrategyKey = "nvidia.com/mig.strategy"
)

// NvidiaGPU groups the NVIDIA-related values collected for a single node.
// GetNodeGpuModel returns one NvidiaGPU per node, keyed by node name.
type NvidiaGPU struct {
GpuInfo Information // basic GPU values (count, presence, product, replicas)
CudaInfo CudaInformation // CUDA driver and runtime version components
GpuDeploy Deployment // per-component "gpu.deploy.*" values
GpuDetails DetailInformation // compute version, family, machine, memory, gfd timestamp
MigInfo MigInformation // "mig.*" values
}

// Information holds the basic GPU label values of a node. GetNodeGpuModel
// copies each field verbatim from the node label named next to it; a label
// that is missing on the node leaves the field as the empty string.
type Information struct {
Gpu string // label NvidiaGpuKey ("nvidia.com/gpu")
GpuCount string // label NvidiaGpuCountKey ("nvidia.com/gpu.count")
GpuPresent string // label NvidiaGpuPresentKey ("nvidia.com/gpu.present")
GpuProduct string // label NvidiaGpuProductKey ("nvidia.com/gpu.product")
GpuReplicas string // label NvidiaGpuReplicasKey ("nvidia.com/gpu.replicas")
}

// CudaInformation holds the CUDA version label values of a node.
// GetNodeGpuModel copies each field verbatim from the node label named next
// to it; a label that is missing on the node leaves the field as the empty
// string. Values are kept as strings and are not parsed.
type CudaInformation struct {
CudaDriverMajor string // label NvidiaCudaDriverMajorKey ("nvidia.com/cuda.driver.major")
CudaDriverMinor string // label NvidiaCudaDriverMinorKey ("nvidia.com/cuda.driver.minor")
CudaDriverRev string // label NvidiaCudaDriverRevKey ("nvidia.com/cuda.driver.rev")
CudaRuntimeMajor string // label NvidiaCudaRuntimeMajorKey ("nvidia.com/cuda.runtime.major")
CudaRuntimeMinor string // label NvidiaCudaRuntimeMinorKey ("nvidia.com/cuda.runtime.minor")
}

// Deployment has one string field per "nvidia.com/gpu.deploy.*" key declared
// in this package.
// NOTE(review): the field-to-label mapping is inferred from the matching
// names (e.g. GpuDeployDcgm <-> NvidiaGpuDeployDcgmKey) — confirm at the
// place where these fields are populated.
type Deployment struct {
GpuDeployContainerToolkit string // cf. NvidiaGpuDeployContainerToolkitKey
GpuDeployDcgm string // cf. NvidiaGpuDeployDcgmKey
GpuDeployDcgmExporter string // cf. NvidiaGpuDeployDcgmExporterKey
GpuDeployDevicePlugin string // cf. NvidiaGpuDeployDevicePluginKey
GpuDeployDriver string // cf. NvidiaGpuDeployDriverKey
GpuDeployGpuFeatureDiscovery string // cf. NvidiaGpuDeployGpuFeatureDiscoveryKey
GpuDeployNodeStatusExporter string // cf. NvidiaGpuDeployNodeStatusExporterKey
GpuDeployOperatorValidator string // cf. NvidiaGpuDeployOperatorValidatorKey
}

// DetailInformation has string fields for the GPU detail keys declared in
// this package (compute version, family, machine, memory, gfd timestamp).
// NOTE(review): the field-to-label mapping is inferred from the matching
// names — confirm at the place where these fields are populated.
type DetailInformation struct {
GpuComputeMajor string // cf. NvidiaGpuComputeMajorKey
GpuComputeMinor string // cf. NvidiaGpuComputeMinorKey
GpuFamily string // cf. NvidiaGpuFamilyKey
GpuMachine string // cf. NvidiaGpuMachineKey
GpuMemory string // cf. NvidiaGpuMemoryKey
GfdTimestamp string // cf. NvidiaGfdTimestampKey
}

// MigInformation has string fields for the "nvidia.com/mig.*" keys declared
// in this package.
// NOTE(review): the field-to-label mapping is inferred from the matching
// names — confirm at the place where these fields are populated.
type MigInformation struct {
MigCapable string // cf. NvidiaMigCapableKey
MigStrategy string // cf. NvidiaMigStrategyKey
}

// GetNodeGpuModel lists every node visible through c and returns, keyed by
// node name, the NVIDIA values read from that node's "nvidia.com/*" labels.
//
// Every field of NvidiaGPU is populated from the label whose key constant
// matches the field name. Label values are copied verbatim as strings; a label
// that is absent on a node yields the empty string, so nodes without any
// NVIDIA labels still appear in the result with a zero-valued NvidiaGPU.
//
// If listing the nodes fails, the error from c.List is returned unchanged
// together with a nil map.
func GetNodeGpuModel(c client.Client) (map[string]NvidiaGPU, error) {
	nodeList := &corev1.NodeList{}
	if err := c.List(context.Background(), nodeList); err != nil {
		return nil, err
	}

	gpuModels := make(map[string]NvidiaGPU, len(nodeList.Items))
	// Index into the slice instead of ranging by value so each (large)
	// corev1.Node is not copied on every iteration.
	for i := range nodeList.Items {
		node := &nodeList.Items[i]
		// Reading from a nil label map is safe in Go and returns "".
		labels := node.Labels

		gpuModels[node.Name] = NvidiaGPU{
			GpuInfo: Information{
				Gpu:         labels[NvidiaGpuKey],
				GpuCount:    labels[NvidiaGpuCountKey],
				GpuPresent:  labels[NvidiaGpuPresentKey],
				GpuProduct:  labels[NvidiaGpuProductKey],
				GpuReplicas: labels[NvidiaGpuReplicasKey],
			},
			CudaInfo: CudaInformation{
				CudaDriverMajor:  labels[NvidiaCudaDriverMajorKey],
				CudaDriverMinor:  labels[NvidiaCudaDriverMinorKey],
				CudaDriverRev:    labels[NvidiaCudaDriverRevKey],
				CudaRuntimeMajor: labels[NvidiaCudaRuntimeMajorKey],
				CudaRuntimeMinor: labels[NvidiaCudaRuntimeMinorKey],
			},
			GpuDeploy: Deployment{
				GpuDeployContainerToolkit:    labels[NvidiaGpuDeployContainerToolkitKey],
				GpuDeployDcgm:                labels[NvidiaGpuDeployDcgmKey],
				GpuDeployDcgmExporter:        labels[NvidiaGpuDeployDcgmExporterKey],
				GpuDeployDevicePlugin:        labels[NvidiaGpuDeployDevicePluginKey],
				GpuDeployDriver:              labels[NvidiaGpuDeployDriverKey],
				GpuDeployGpuFeatureDiscovery: labels[NvidiaGpuDeployGpuFeatureDiscoveryKey],
				GpuDeployNodeStatusExporter:  labels[NvidiaGpuDeployNodeStatusExporterKey],
				GpuDeployOperatorValidator:   labels[NvidiaGpuDeployOperatorValidatorKey],
			},
			GpuDetails: DetailInformation{
				GpuComputeMajor: labels[NvidiaGpuComputeMajorKey],
				GpuComputeMinor: labels[NvidiaGpuComputeMinorKey],
				GpuFamily:       labels[NvidiaGpuFamilyKey],
				GpuMachine:      labels[NvidiaGpuMachineKey],
				GpuMemory:       labels[NvidiaGpuMemoryKey],
				GfdTimestamp:    labels[NvidiaGfdTimestampKey],
			},
			MigInfo: MigInformation{
				MigCapable:  labels[NvidiaMigCapableKey],
				MigStrategy: labels[NvidiaMigStrategyKey],
			},
		}
	}
	return gpuModels, nil
}
9 changes: 9 additions & 0 deletions controllers/pkg/common/resources.go
Expand Up @@ -6,6 +6,8 @@ import (
"math"
"time"

"github.com/labring/sealos/controllers/pkg/common/gpu"

"go.mongodb.org/mongo-driver/bson"
"go.mongodb.org/mongo-driver/mongo"

Expand Down Expand Up @@ -112,13 +114,20 @@ const (
PropertyInfraDisk = "infra-disk"
)

// ResourceGPU is the resource name used for GPUs: the gpu.NvidiaGpuKey
// constant typed as a corev1.ResourceName.
const ResourceGPU corev1.ResourceName = gpu.NvidiaGpuKey

// NewGpuResource builds the resource name for a specific GPU product by
// prefixing product with "gpu-". The product string is used as given; no
// normalization is applied.
func NewGpuResource(product string) corev1.ResourceName {
	name := "gpu-" + product
	return corev1.ResourceName(name)
}

// Base quantities that the PricesUnit entries point to.
var (
// bin1Mi is the quantity 1<<20 (1Mi) in binary-SI format.
bin1Mi = resource.NewQuantity(1<<20, resource.BinarySI)
// cpuUnit is the quantity parsed from "1m", i.e. one milli-unit.
cpuUnit = resource.MustParse("1m")
)

// PricesUnit maps a resource name to a unit quantity for that resource.
// NOTE(review): presumably this is the amount of the resource that one price
// entry is quoted for — confirm against the billing code that reads it.
// The CPU and GPU entries share the same underlying cpuUnit value.
var PricesUnit = map[corev1.ResourceName]*resource.Quantity{
corev1.ResourceCPU: &cpuUnit, // 1 m CPU (1000 μ)
ResourceGPU: &cpuUnit, // 1 m GPU (reuses the "1m" quantity defined for CPU)
corev1.ResourceMemory: bin1Mi, // 1 MiB
corev1.ResourceStorage: bin1Mi, // 1 MiB
}
Expand Down
3 changes: 1 addition & 2 deletions controllers/pkg/database/mongodb.go
Expand Up @@ -4,7 +4,6 @@ import (
"context"
"fmt"
"math"
"strings"
"time"

"github.com/labring/sealos/controllers/pkg/crypto"
Expand Down Expand Up @@ -197,7 +196,7 @@ func (m *MongoDB) GetAllPricesMap() (map[string]common.Price, error) {
if err != nil {
return nil, fmt.Errorf("decrypt price error: %v", err)
}
pricesMap[strings.ToLower(prices[i].Property)] = common.Price{
pricesMap[prices[i].Property] = common.Price{
Price: price,
Detail: prices[i].Detail,
Property: prices[i].Property,
Expand Down
16 changes: 16 additions & 0 deletions controllers/resources/config/rbac/role.yaml
Expand Up @@ -13,6 +13,22 @@ rules:
- get
- list
- watch
- apiGroups:
- ""
resources:
- nodes
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
- persistentvolumeclaims
verbs:
- get
- list
- watch
- apiGroups:
- ""
resources:
Expand Down

0 comments on commit 9a013ce

Please sign in to comment.