Skip to content

Commit

Permalink
Add vgpu and mps metrics
Browse files Browse the repository at this point in the history
Signed-off-by: ghokun <gokhun@gmail.com>
  • Loading branch information
ghokun committed Mar 24, 2022
1 parent b7fe821 commit dbdc3de
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 42 deletions.
2 changes: 1 addition & 1 deletion manifests/device-plugin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ spec:
capabilities:
add: ["SYS_ADMIN"]
containers:
- image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.6.2
- image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.6.3
name: kuartis-virtual-gpu-device-plugin-ctr
command:
- /usr/bin/virtual-gpu-device-plugin
Expand Down
74 changes: 41 additions & 33 deletions pkg/gpu/nvidia/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,29 +29,33 @@ var node = os.Getenv("NODE_NAME")
// metricsFormat is the template text for the gpu_memory_usage_per_container
// gauge. It emits the HELP/TYPE header once, then ranges over its input and
// writes one sample per element: the labels come from the element's Pid,
// GpuIndex, GpuUUID, Node, Namespace, Pod, PodUid, Container and ContainerId
// fields, and the sample value is UsedGpuMemory.
// NOTE(review): presumably executed with the []metric built in collectMetrics;
// the call site is not visible here - confirm.
// NOTE(review): this listing is a diff view, so the sample line appears twice:
// first without, then with the vgpucount/mpsactivethread labels. Presumably
// only the second (longer) line exists in the committed file - confirm.
var metricsFormat = `# HELP gpu_memory_usage_per_container Shows the GPU memory usage per container.
# TYPE gpu_memory_usage_per_container gauge
{{- range $m := . }}
gpu_memory_usage_per_container{pid="{{ $m.Pid }}",gpuindex="{{ $m.GpuIndex }}",gpuuuid="{{ $m.GpuUUID }}",node="{{ $m.Node }}",namespace="{{ $m.Namespace }}",pod="{{ $m.Pod }}",poduid="{{ $m.PodUid }}",container="{{ $m.Container }}",containerid="{{ $m.ContainerId }}"} {{ $m.UsedGpuMemory }}
gpu_memory_usage_per_container{pid="{{ $m.Pid }}",gpuindex="{{ $m.GpuIndex }}",gpuuuid="{{ $m.GpuUUID }}",node="{{ $m.Node }}",namespace="{{ $m.Namespace }}",pod="{{ $m.Pod }}",poduid="{{ $m.PodUid }}",container="{{ $m.Container }}",containerid="{{ $m.ContainerId }}",vgpucount="{{ $m.VGpuCount }}",mpsactivethread="{{ $m.MpsActiveThread }}"} {{ $m.UsedGpuMemory }}
{{- end -}}`

// metric holds the values for one gpu_memory_usage_per_container sample.
// collectMetrics fills Pid and UsedGpuMemory from the GPU process, GpuIndex
// from the device loop index, GpuUUID from getDeviceUUID, and the remaining
// fields from the containerInfo matched to the process.
// NOTE(review): this listing is a diff view. The first ten field lines are the
// pre-change lines; they are repeated below (re-aligned) together with the two
// added fields. Presumably each field is declared once in the real file.
type metric struct {
// Pre-change field lines (removed side of the diff).
Pid uint32
UsedGpuMemory uint64
GpuIndex int
GpuUUID string
Node string
Namespace string
Pod string
PodUid string
Container string
ContainerId string
// Post-change field lines (added side of the diff).
Pid uint32
UsedGpuMemory uint64
GpuIndex int
GpuUUID string
Node string
Namespace string
Pod string
PodUid string
Container string
ContainerId string
// VGpuCount and MpsActiveThread are copied from the matching containerInfo
// and rendered as the vgpucount / mpsactivethread labels.
VGpuCount string
MpsActiveThread string
}

// containerInfo is the per-container metadata that collectMetrics stores in
// containerMap, keyed by container ID, and later copies into each metric.
// Node is the package-level node value (read from the NODE_NAME environment
// variable); Namespace, Pod and PodUid come from the io.kubernetes.pod.*
// container labels; Container is the container metadata name.
// NOTE(review): this listing is a diff view. The first six field lines are the
// pre-change lines; they are repeated below (re-aligned) together with the two
// added fields. Presumably each field is declared once in the real file.
type containerInfo struct {
// Pre-change field lines (removed side of the diff).
Node string
Namespace string
Pod string
PodUid string
Container string
ContainerId string
// Post-change field lines (added side of the diff).
Node string
Namespace string
Pod string
PodUid string
Container string
ContainerId string
// Read from the k8s.kuartis.com/vgpu-count and
// k8s.kuartis.com/mps-active-thread container annotations respectively.
VGpuCount string
MpsActiveThread string
}

func MetricServer() {
Expand All @@ -77,12 +81,14 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
containerMap := make(map[string]containerInfo)
for _, container := range containers.GetContainers() {
containerMap[container.GetId()] = containerInfo{
Node: node,
Namespace: container.GetLabels()["io.kubernetes.pod.namespace"],
Pod: container.GetLabels()["io.kubernetes.pod.name"],
PodUid: container.GetLabels()["io.kubernetes.pod.uid"],
Container: container.GetMetadata().GetName(),
ContainerId: container.GetId(),
Node: node,
Namespace: container.GetLabels()["io.kubernetes.pod.namespace"],
Pod: container.GetLabels()["io.kubernetes.pod.name"],
PodUid: container.GetLabels()["io.kubernetes.pod.uid"],
Container: container.GetMetadata().GetName(),
ContainerId: container.GetId(),
VGpuCount: container.GetAnnotations()["k8s.kuartis.com/vgpu-count"],
MpsActiveThread: container.GetAnnotations()["k8s.kuartis.com/mps-active-thread"],
}
}
collected := []metric{}
Expand All @@ -97,16 +103,18 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
if container, ok := containerMap[containerId]; ok {
log.Printf("Using %s Found container %+v for process: %d", containerId, container, process.Pid)
collected = append(collected, metric{
Pid: process.Pid,
UsedGpuMemory: process.UsedGpuMemory,
GpuIndex: i,
GpuUUID: getDeviceUUID(d),
Node: container.Node,
Namespace: container.Namespace,
Pod: container.Pod,
PodUid: container.PodUid,
Container: container.Container,
ContainerId: container.ContainerId,
Pid: process.Pid,
UsedGpuMemory: process.UsedGpuMemory,
GpuIndex: i,
GpuUUID: getDeviceUUID(d),
Node: container.Node,
Namespace: container.Namespace,
Pod: container.Pod,
PodUid: container.PodUid,
Container: container.Container,
ContainerId: container.ContainerId,
VGpuCount: container.VGpuCount,
MpsActiveThread: container.MpsActiveThread,
})
}
}
Expand Down
22 changes: 14 additions & 8 deletions pkg/gpu/nvidia/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,17 +290,23 @@ func (m *NvidiaDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.Alloc

cresp := new(pluginapi.ContainerAllocateResponse)

cudaActiveThread := fmt.Sprintf("%d", 100*len(req.DevicesIDs)/(len(m.devs)/len(m.physicalDevs)*len(visibleDevs)))
visibleDevsStr := strings.Join(visibleDevs, ",")
allocatedDeviceIdsStr := strings.Join(req.DevicesIDs, ",")

cresp.Envs = map[string]string{}
cresp.Envs["NVIDIA_VISIBLE_DEVICES"] = strings.Join(visibleDevs, ",")
cresp.Envs["CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"] = fmt.Sprintf("%d", 100*len(req.DevicesIDs)/len(m.devs))
cresp.Envs["NVIDIA_VISIBLE_DEVICES"] = visibleDevsStr
cresp.Envs["CUDA_MPS_ACTIVE_THREAD_PERCENTAGE"] = cudaActiveThread

cresp.Annotations = map[string]string{}
cresp.Annotations["k8s.kuartis.com/gpu-ids"] = strings.Join(visibleDevs, ",")
cresp.Annotations["k8s.kuartis.com/vgpu-ids"] = strings.Join(req.DevicesIDs, ",")

log.Printf("Allocated physical devices: %s", strings.Join(visibleDevs, ","))
log.Printf("Allocated virtual devices: %s", strings.Join(req.DevicesIDs, ","))
log.Printf("Allocated MPS ACTIVE THREAD PERCENTAGE: %s", fmt.Sprintf("%d", 100*len(req.DevicesIDs)/len(m.devs)))
cresp.Annotations["k8s.kuartis.com/gpu-ids"] = visibleDevsStr
cresp.Annotations["k8s.kuartis.com/vgpu-ids"] = allocatedDeviceIdsStr
cresp.Annotations["k8s.kuartis.com/vgpu-count"] = fmt.Sprintf("%d", len(req.DevicesIDs))
cresp.Annotations["k8s.kuartis.com/mps-active-thread"] = cudaActiveThread

log.Printf("Allocated physical devices: %s", visibleDevsStr)
log.Printf("Allocated virtual devices: %s", allocatedDeviceIdsStr)
log.Printf("Allocated MPS ACTIVE THREAD PERCENTAGE: %s", cudaActiveThread)

response.ContainerResponses = append(response.ContainerResponses, cresp)
}
Expand Down

0 comments on commit dbdc3de

Please sign in to comment.