Skip to content

Commit

Permalink
koordlet: introduce Accelerators feature gate for GPU-related features (#393)
Browse files Browse the repository at this point in the history

Signed-off-by: Jason Liu <jasonliu747@gmail.com>
  • Loading branch information
jasonliu747 committed Jul 25, 2022
1 parent 47e7189 commit 779ac80
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 3 deletions.
5 changes: 5 additions & 0 deletions pkg/features/koordlet_features.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ const (

// CgroupReconcile reconciles qos config for resources like cpu, memory, disk, etc.
CgroupReconcile featuregate.Feature = "CgroupReconcile"

// Accelerators enables GPU-related features in koordlet.
// Only NVIDIA GPUs are supported as of v0.6.
Accelerators featuregate.Feature = "Accelerators"
)

func init() {
Expand All @@ -68,5 +72,6 @@ var (
CPUBurst: {Default: false, PreRelease: featuregate.Alpha},
RdtResctrl: {Default: false, PreRelease: featuregate.Alpha},
CgroupReconcile: {Default: false, PreRelease: featuregate.Alpha},
Accelerators: {Default: false, PreRelease: featuregate.Alpha},
}
)
2 changes: 0 additions & 2 deletions pkg/koordlet/metricsadvisor/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,6 @@ func (c *collector) Run(stopCh <-chan struct{}) error {
}

go wait.Until(func() {

// collect gpu metrics.
c.collectGPUUsage()
c.collectNodeResUsed()
// add sync metaService cache check before collect pod information
Expand Down
6 changes: 5 additions & 1 deletion pkg/koordlet/metricsadvisor/collector_gpu.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,13 @@ import (
"time"

"github.com/NVIDIA/go-nvml/pkg/nvml"
"github.com/koordinator-sh/koordinator/pkg/util"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/klog/v2"

"github.com/koordinator-sh/koordinator/pkg/features"
"github.com/koordinator-sh/koordinator/pkg/koordlet/metriccache"
"github.com/koordinator-sh/koordinator/pkg/util"
)

type GPUDeviceManager interface {
Expand Down Expand Up @@ -82,6 +83,9 @@ type device struct {

// initGPUDeviceManager will not retry if init fails.
func initGPUDeviceManager() GPUDeviceManager {
if !features.DefaultKoordletFeatureGate.Enabled(features.Accelerators) {
return &dummyDeviceManager{}
}
if ret := nvml.Init(); ret != nvml.SUCCESS {
if ret == nvml.ERROR_LIBRARY_NOT_FOUND {
klog.Warning("nvml init failed, library not found")
Expand Down

0 comments on commit 779ac80

Please sign in to comment.