Skip to content

Commit

Permalink
Add optional resource prefix support to "fake-mode"
Browse files Browse the repository at this point in the history
Adding a prefix (e.g. "fake_") to resource names has pros and cons.

With a different resource name, real GPU workloads do not end up on faked
devices, so one can easily run both the real GPU plugin and real workloads
in the same cluster alongside the fake ones.

However, GAS hard-codes resource names, i.e. changing the name(s) with
a prefix will break GPU plugin resource management, so one should use a
prefix only when fractional resources are not needed for faked
devices.

Signed-off-by: Eero Tamminen <eero.t.tamminen@intel.com>
  • Loading branch information
eero-t committed Aug 18, 2022
1 parent c011712 commit 291a1bc
Showing 1 changed file with 20 additions and 9 deletions.
29 changes: 20 additions & 9 deletions cmd/gpu_plugin/gpu_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ const (

type cliOptions struct {
preferredAllocationPolicy string
resourcePrefix string
sharedDevNum int
enableMonitoring bool
resourceManagement bool
Expand Down Expand Up @@ -176,7 +177,7 @@ func newDevicePlugin(sysfsDir, devfsDir string, options cliOptions) *devicePlugi
if options.resourceManagement {
var err error

dp.resMan, err = rm.NewResourceManager(monitorID, namespace+"/"+deviceType)
dp.resMan, err = rm.NewResourceManager(monitorID, namespace+"/"+options.resourcePrefix+deviceType)
if err != nil {
klog.Errorf("Failed to create resource manager: %+v", err)
return nil
Expand Down Expand Up @@ -330,7 +331,8 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
}

if dp.options.enableMonitoring {
klog.V(4).Infof("Adding %s to GPU %s/%s", devPath, monitorType, monitorID)
klog.V(4).Infof("Adding %s to GPU resource %s%s/%s", devPath,
dp.options.resourcePrefix, monitorType, monitorID)

monitor = append(monitor, devSpec)
}
Expand All @@ -343,7 +345,7 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
devID := fmt.Sprintf("%s-%d", f.Name(), i)
// Currently only one device type (i915) is supported.
// TODO: check model ID to differentiate device models.
devTree.AddDevice(deviceType, devID, deviceInfo)
devTree.AddDevice(dp.options.resourcePrefix+deviceType, devID, deviceInfo)

rmDevInfos[devID] = rm.NewDeviceInfo(nodes, nil, nil)
}
Expand All @@ -352,7 +354,7 @@ func (dp *devicePlugin) scan() (dpapi.DeviceTree, error) {
// all Intel GPUs are under single monitoring resource
if len(monitor) > 0 {
deviceInfo := dpapi.NewDeviceInfo(pluginapi.Healthy, monitor, nil, nil, nil)
devTree.AddDevice(monitorType, monitorID, deviceInfo)
devTree.AddDevice(dp.options.resourcePrefix+monitorType, monitorID, deviceInfo)
}

if dp.resMan != nil {
Expand All @@ -371,10 +373,10 @@ func (dp *devicePlugin) Allocate(request *pluginapi.AllocateRequest) (*pluginapi
}

func main() {
var prefix string
var faked string
var opts cliOptions

flag.StringVar(&prefix, "fake-mode", "", "Prefix for devfs & sysfs paths")
flag.StringVar(&faked, "fake-mode", "", "Comma separated prefix for devfs & sysfs paths + prefix for resources names")
flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable 'i915_monitoring' (= all GPUs) resource")
flag.BoolVar(&opts.resourceManagement, "resource-manager", false, "fractional GPU resource management")
flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device")
Expand All @@ -400,12 +402,21 @@ func main() {
klog.V(1).Infof("GPU device plugin started with %s preferred allocation policy", opts.preferredAllocationPolicy)

var sysfs, devfs string
if prefix != "" {
sysfs = prefix + sysfsDrmDirectory
devfs = prefix + devfsDriDirectory
if faked != "" {
prefixes := strings.Split(faked, ",")
if len(prefixes) != 2 {
klog.Fatalf("%d commas in fake-mode option value, not one", len(prefixes)-1)
}
sysfs = prefixes[0] + sysfsDrmDirectory
devfs = prefixes[0] + devfsDriDirectory
opts.resourcePrefix = prefixes[1]
if opts.resourceManagement && opts.resourcePrefix != "" {
klog.Warning("Resource name prefix breaks resource management as it hard-codes their names")
}
} else {
sysfs = sysfsDrmDirectory
devfs = devfsDriDirectory
opts.resourcePrefix = ""
}
plugin := newDevicePlugin(sysfs, devfs, opts)
manager := dpapi.NewManager(namespace, plugin)
Expand Down

0 comments on commit 291a1bc

Please sign in to comment.