Skip to content

Commit

Permalink
feat: add initGPUInfoCM func (#3633)
Browse files Browse the repository at this point in the history
* fix: add env for Kubefile CMD

* feat: add cm & role & rb config

* feat: watch create event of node-gpu-info configmap to init it

* fix name error

* update yaml

* recover Kubefile

* Remove the watch CM logic and directly access the api-server to init the CM

* fix review

* feat: add alias data to node-gpu-info configmap

* rename gpu short name
  • Loading branch information
nowinkeyy committed Aug 22, 2023
1 parent f09ff82 commit 205a5bb
Show file tree
Hide file tree
Showing 7 changed files with 166 additions and 33 deletions.
9 changes: 9 additions & 0 deletions controllers/node/config/configmap/configmap.yaml
@@ -0,0 +1,9 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: gpu-info
namespace: node-system
data:
gpu: ""
alias: '{"NVIDIA-GeForce-RTX-4090":"GeForce-RTX-4090"}'
2 changes: 2 additions & 0 deletions controllers/node/config/configmap/kustomization.yaml
@@ -0,0 +1,2 @@
resources:
- configmap.yaml
1 change: 1 addition & 0 deletions controllers/node/config/default/kustomization.yaml
Expand Up @@ -18,6 +18,7 @@ resources:
#- ../crd
- ../rbac
- ../manager
- ../configmap
# [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
# crd/kustomization.yaml
#- ../webhook
Expand Down
11 changes: 11 additions & 0 deletions controllers/node/config/rbac/role.yaml
Expand Up @@ -79,3 +79,14 @@ rules:
- get
- patch
- update
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: gpu-info-cm-reader
namespace: node-system
rules:
- apiGroups: [ "" ] # "" indicates the core API group
resources: [ "configmaps" ]
resourceNames: [ "gpu-info" ]
verbs: [ "get", "watch", "list" ]
14 changes: 14 additions & 0 deletions controllers/node/config/rbac/role_binding.yaml
Expand Up @@ -17,3 +17,17 @@ subjects:
- kind: ServiceAccount
name: controller-manager
namespace: system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: gpu-info-cm-reader-rolebinding
namespace: node-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: node-gpu-info-cm-reader
subjects:
- kind: Group
name: system:serviceaccounts
apiGroup: rbac.authorization.k8s.io
123 changes: 90 additions & 33 deletions controllers/node/controllers/gpu_controller.go
Expand Up @@ -25,9 +25,14 @@ import (
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/selection"
"k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/builder"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/event"
"sigs.k8s.io/controller-runtime/pkg/predicate"
Expand All @@ -41,8 +46,8 @@ type GpuReconciler struct {

const (
GPU = "gpu"
GPUInfo = "gpu-info"
GPUInfoNameSpace = "sealos"
GPUInfo = "node-gpu-info"
GPUInfoNameSpace = "node-system"
NvidiaGPUProduct = "nvidia.com/gpu.product"
NvidiaGPUMemory = "nvidia.com/gpu.memory"
NvidiaGPU corev1.ResourceName = "nvidia.com/gpu"
Expand Down Expand Up @@ -73,7 +78,10 @@ func (r *GpuReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Res
r.Logger.Error(err, "failed to get pod list")
return ctrl.Result{}, err
}
return r.applyGPUInfoCM(ctx, nodeList, podList, nil)
}

func (r *GpuReconciler) applyGPUInfoCM(ctx context.Context, nodeList *corev1.NodeList, podList *corev1.PodList, clientSet *kubernetes.Clientset) (ctrl.Result, error) {
/*
"nodeMap": {
"sealos-poc-gpu-master-0":{},
Expand Down Expand Up @@ -137,7 +145,13 @@ func (r *GpuReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Res

// create or update gpu-info configmap
configmap := &corev1.ConfigMap{}
err = r.Get(ctx, types.NamespacedName{Name: GPUInfo, Namespace: GPUInfoNameSpace}, configmap)

if clientSet != nil {
configmap, err = clientSet.CoreV1().ConfigMaps(GPUInfoNameSpace).Get(ctx, GPUInfo, metaV1.GetOptions{})
} else {
err = r.Get(ctx, types.NamespacedName{Name: GPUInfo, Namespace: GPUInfoNameSpace}, configmap)
}

if errors.IsNotFound(err) {
configmap = &corev1.ConfigMap{
ObjectMeta: metaV1.ObjectMeta{
Expand All @@ -152,27 +166,73 @@ func (r *GpuReconciler) Reconcile(ctx context.Context, _ ctrl.Request) (ctrl.Res
r.Logger.Error(err, "failed to create gpu-info configmap")
return ctrl.Result{}, err
}
} else if err == nil {
if configmap.Data[GPU] != nodeMapStr {
configmap.Data[GPU] = nodeMapStr
if err := r.Update(ctx, configmap); err != nil && !errors.IsConflict(err) {
r.Logger.Error(err, "failed to update gpu-info configmap")
return ctrl.Result{}, err
}
}
} else {
} else if err != nil {
r.Logger.Error(err, "failed to get gpu-info configmap")
return ctrl.Result{}, err
}

if configmap.Data == nil {
configmap.Data = map[string]string{}
}
if configmap.Data[GPU] != nodeMapStr {
configmap.Data[GPU] = nodeMapStr
if err := r.Update(ctx, configmap); err != nil && !errors.IsConflict(err) {
r.Logger.Error(err, "failed to update gpu-info configmap")
return ctrl.Result{}, err
}
}

r.Logger.V(1).Info("gpu-info configmap status", "gpu", configmap.Data[GPU])
return ctrl.Result{}, nil
}

// initGPUInfoCM populates the GPU info ConfigMap once, using clientSet to read
// nodes and pods straight from the API server rather than through the
// reconciler's own client.
//
// It lists the nodes that carry both the NvidiaGPUProduct and NvidiaGPUMemory
// labels, collects the pods scheduled on each of those nodes, and hands both
// lists to applyGPUInfoCM. The ctrl.Result returned by applyGPUInfoCM is
// discarded; only its error is propagated.
//
// ctx governs every API call made here, including the per-node pod listing.
func (r *GpuReconciler) initGPUInfoCM(ctx context.Context, clientSet *kubernetes.Clientset) error {
	// Filter for nodes that have a GPU: both labels must exist, whatever their value.
	productReq, err := labels.NewRequirement(NvidiaGPUProduct, selection.Exists, nil)
	if err != nil {
		return err
	}
	memoryReq, err := labels.NewRequirement(NvidiaGPUMemory, selection.Exists, nil)
	if err != nil {
		return err
	}
	selector := labels.NewSelector().Add(*productReq, *memoryReq)

	nodeList, err := clientSet.CoreV1().Nodes().List(ctx, metaV1.ListOptions{
		LabelSelector: selector.String(),
	})
	if err != nil {
		return err
	}

	// Gather the pods bound to each GPU node, across all namespaces.
	podList := &corev1.PodList{}
	for i := range nodeList.Items {
		// Index instead of ranging by value to avoid copying each Node struct.
		nodeName := nodeList.Items[i].Name
		pods, err := clientSet.CoreV1().Pods("").List(ctx, metaV1.ListOptions{
			FieldSelector: fields.OneTermEqualSelector("spec.nodeName", nodeName).String(),
		})
		if err != nil {
			return err
		}
		podList.Items = append(podList.Items, pods.Items...)
	}

	_, err = r.applyGPUInfoCM(ctx, nodeList, podList, clientSet)
	return err
}

// SetupWithManager sets up the controller with the Manager.
func (r *GpuReconciler) SetupWithManager(mgr ctrl.Manager) error {
r.Logger = ctrl.Log.WithName("gpu-controller")
r.Logger.V(1).Info("starting gpu controller")

// use clientSet to get resources from the API Server, not from Informer's cache
clientSet, err := kubernetes.NewForConfig(mgr.GetConfig())
if err != nil {
r.Logger.Error(err, "failed to init")
return nil
}

// init node-gpu-info configmap
r.Logger.V(1).Info("initializing node-gpu-info configmap")
if err := r.initGPUInfoCM(context.Background(), clientSet); err != nil {
return err
}

// build index for node which have GPU
if err := mgr.GetFieldIndexer().IndexField(context.Background(), &corev1.Node{}, NodeIndexKey, func(rawObj client.Object) []string {
node := rawObj.(*corev1.Node)
Expand All @@ -198,27 +258,24 @@ func (r *GpuReconciler) SetupWithManager(mgr ctrl.Manager) error {
}

return ctrl.NewControllerManagedBy(mgr).
For(&corev1.Pod{}).
WithEventFilter(
predicate.Funcs{
CreateFunc: func(event event.CreateEvent) bool {
_, ok := event.Object.(*corev1.Pod).Spec.NodeSelector[NvidiaGPUProduct]
return ok
},
UpdateFunc: func(event event.UpdateEvent) bool {
_, ok := event.ObjectNew.(*corev1.Pod).Spec.NodeSelector[NvidiaGPUProduct]
if !ok {
return false
}
phaseOld := event.ObjectOld.(*corev1.Pod).Status.Phase
phaseNew := event.ObjectNew.(*corev1.Pod).Status.Phase
return phaseOld != phaseNew
},
DeleteFunc: func(event event.DeleteEvent) bool {
_, ok := event.Object.(*corev1.Pod).Spec.NodeSelector[NvidiaGPUProduct]
return ok
},
For(&corev1.Pod{}, builder.WithPredicates(predicate.Funcs{
CreateFunc: func(event event.CreateEvent) bool {
_, ok := event.Object.(*corev1.Pod).Spec.NodeSelector[NvidiaGPUProduct]
return ok
},
UpdateFunc: func(event event.UpdateEvent) bool {
_, ok := event.ObjectNew.(*corev1.Pod).Spec.NodeSelector[NvidiaGPUProduct]
if !ok {
return false
}
phaseOld := event.ObjectOld.(*corev1.Pod).Status.Phase
phaseNew := event.ObjectNew.(*corev1.Pod).Status.Phase
return phaseOld != phaseNew
},
DeleteFunc: func(event event.DeleteEvent) bool {
_, ok := event.Object.(*corev1.Pod).Spec.NodeSelector[NvidiaGPUProduct]
return ok
},
).
})).
Complete(r)
}
39 changes: 39 additions & 0 deletions controllers/node/deploy/manifests/deploy.yaml
Expand Up @@ -26,6 +26,23 @@ metadata:
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: node-gpu-info-cm-reader
namespace: node-system
rules:
- apiGroups:
- ""
resourceNames:
- node-gpu-info
resources:
- configmaps
verbs:
- get
- watch
- list
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
labels:
app.kubernetes.io/component: rbac
Expand Down Expand Up @@ -194,6 +211,20 @@ rules:
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: node-gpu-info-cm-reader-rolebinding
namespace: node-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: node-gpu-info-cm-reader
subjects:
- apiGroup: rbac.authorization.k8s.io
kind: Group
name: system:serviceaccounts
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
labels:
app.kubernetes.io/component: rbac
Expand Down Expand Up @@ -254,6 +285,14 @@ subjects:
namespace: node-system
---
apiVersion: v1
data:
gpu: ""
kind: ConfigMap
metadata:
name: node-gpu-info
namespace: node-system
---
apiVersion: v1
kind: Service
metadata:
labels:
Expand Down

0 comments on commit 205a5bb

Please sign in to comment.