Add service and service monitor manifests
Signed-off-by: ghokun <gokhun@gmail.com>
ghokun committed Mar 21, 2022
1 parent dd92a41 commit f939e6e
Showing 2 changed files with 43 additions and 9 deletions.
manifests/device-plugin.yml (43 changes: 40 additions & 3 deletions)
@@ -21,13 +21,13 @@ metadata:
 spec:
   selector:
     matchLabels:
-      name: kuartis-virtual-gpu-device-plugin
+      app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
   updateStrategy:
     type: RollingUpdate
   template:
     metadata:
       labels:
-        name: kuartis-virtual-gpu-device-plugin
+        app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
     spec:
       hostIPC: true
       nodeSelector:
@@ -45,7 +45,7 @@ spec:
           capabilities:
             add: ["SYS_ADMIN"]
       containers:
-      - image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.4.10
+      - image: ghcr.io/kuartis/kuartis-virtual-gpu-device-plugin:0.5.0
        name: kuartis-virtual-gpu-device-plugin-ctr
        command:
        - /usr/bin/virtual-gpu-device-plugin
@@ -89,3 +89,40 @@ spec:
       - name: dockershimsock
         hostPath:
           path: /var/run/dockershim.sock
+---
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
+  name: kuartis-virtual-gpu-device-plugin
+  namespace: kube-system
+spec:
+  ports:
+  - name: metrics
+    port: 8080
+    targetPort: 8080
+    protocol: TCP
+  selector:
+    app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
+  sessionAffinity: None
+  type: ClusterIP
+---
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  labels:
+    app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
+  name: kuartis-virtual-gpu-device-plugin
+  namespace: kube-system
+spec:
+  endpoints:
+  - interval: 15s
+    path: /metrics
+    port: metrics
+  namespaceSelector:
+    matchNames:
+    - kube-system
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin
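
Note: the ServiceMonitor added above is only scraped if a Prometheus Operator instance selects it. As a rough sketch (not part of this commit; the resource name, namespace, and service account below are assumptions), a Prometheus resource that would pick up this ServiceMonitor could look like:

apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus           # hypothetical instance name
  namespace: monitoring      # hypothetical namespace
spec:
  serviceAccountName: prometheus        # assumed service account with scrape RBAC
  serviceMonitorNamespaceSelector: {}   # empty selector matches all namespaces, including kube-system
  serviceMonitorSelector:
    matchLabels:
      app.kubernetes.io/name: kuartis-virtual-gpu-device-plugin   # matches the labels on the ServiceMonitor above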
pkg/gpu/nvidia/metrics.go (9 changes: 3 additions & 6 deletions)
@@ -26,8 +26,7 @@ const (
 
 var node = os.Getenv("NODE_NAME")
 
-var metricsFormat = `
-# HELP gpu_memory_usage_per_container Shows the GPU memory usage per container.
+var metricsFormat = `# HELP gpu_memory_usage_per_container Shows the GPU memory usage per container.
 # TYPE gpu_memory_usage_per_container gauge
 {{- range $m := . }}
 gpu_memory_usage_per_container{pid="{{ $m.Pid }}",gpuindex="{{ $m.GpuIndex }}",gpuuuid="{{ $m.GpuUUID }}",node="{{ $m.Node }}",namespace="{{ $m.Namespace }}",pod="{{ $m.Pod }}",poduid="{{ $m.PodUid }}",container="{{ $m.Container }}",containerid="{{ $m.ContainerId }}"} {{ $m.UsedGpuMemory }}
@@ -86,7 +85,6 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
             ContainerId: container.GetId(),
         }
     }
-    log.Printf("Current map %+v", containerMap)
     collected := []metric{}
     for i := 0; i < getDeviceCount(); i++ {
         d, ret := nvml.DeviceGetHandleByIndex(i)
@@ -96,7 +94,7 @@ func collectMetrics(w http.ResponseWriter, r *http.Request) {
         log.Printf("Found %d processes on GPU %d", len(processes), i)
         for _, process := range processes {
             containerId := getContainerId(process.Pid)
-            if container, ok := containerMap[strings.TrimSpace(containerId)]; ok {
+            if container, ok := containerMap[containerId]; ok {
                 log.Printf("Using %s Found container %+v for process: %d", containerId, container, process.Pid)
                 collected = append(collected, metric{
                     Pid: process.Pid,
@@ -145,6 +143,5 @@ func getContainerId(pid uint32) string {
     }
     proc := string(data)
     containerId := proc[strings.LastIndex(proc, "/")+1:]
-    log.Printf("Found container id %s for process: %d", containerId, pid)
-    return containerId
+    return strings.TrimSpace(containerId)
 }
