-
Notifications
You must be signed in to change notification settings - Fork 1.9k
/
client.go
194 lines (173 loc) · 6.4 KB
/
client.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
package nvml
import (
"fmt"
)
// DeviceData represents the fields that are common to every view of an
// Nvidia device. It is embedded by both FingerprintDeviceData and StatsData.
type DeviceData struct {
// UUID is the device UUID as reported by the driver's device info.
UUID string
// DeviceName is the product name from the driver's device info.
// NOTE(review): pointer-typed, presumably nil when the value could not be
// queried - confirm against the NvmlDriver implementation.
DeviceName *string
// MemoryMiB is the total device memory (MiB, per the field name).
MemoryMiB *uint64
// PowerW is the power figure copied from the driver's device info
// (watts, per the field name).
PowerW *uint
// BAR1MiB is the BAR1 memory figure copied from the driver's device info
// (MiB, per the field name).
BAR1MiB *uint64
}
// FingerprintDeviceData is a superset of DeviceData.
// It describes the device-specific fields returned from nvml queries
// during a fingerprinting call.
type FingerprintDeviceData struct {
// DeviceData carries the common device fields (embedded by pointer).
*DeviceData
// PCIBandwidthMBPerS is the PCI bandwidth (MB/s, per the field name).
PCIBandwidthMBPerS *uint
// CoresClockMHz is the cores clock (MHz, per the field name).
CoresClockMHz *uint
// MemoryClockMHz is the memory clock (MHz, per the field name).
MemoryClockMHz *uint
// DisplayState is the display state string taken from the driver's device info.
DisplayState string
// PersistenceMode is the persistence mode string taken from the driver's device info.
PersistenceMode string
// PCIBusID is the PCI bus identifier taken from the driver's device info.
PCIBusID string
}
// FingerprintData represents the attributes of the driver and of the
// devices it exposes.
type FingerprintData struct {
// Devices holds one entry per device index reported by the driver.
Devices []*FingerprintDeviceData
// DriverVersion is the value returned by the driver's SystemDriverVersion call.
DriverVersion string
}
// StatsData is a superset of DeviceData.
// It represents the statistics data returned for every Nvidia device.
// NOTE(review): the pointer-typed fields are presumably nil when the
// corresponding value could not be queried - confirm against NvmlDriver.
type StatsData struct {
// DeviceData carries the common device fields (embedded by pointer).
*DeviceData
// PowerUsageW is the current power draw (watts, per the field name).
PowerUsageW *uint
// GPUUtilization is the GPU utilization reported in the device status.
GPUUtilization *uint
// MemoryUtilization is the memory utilization reported in the device status.
MemoryUtilization *uint
// EncoderUtilization is the encoder utilization reported in the device status.
EncoderUtilization *uint
// DecoderUtilization is the decoder utilization reported in the device status.
DecoderUtilization *uint
// TemperatureC is the current GPU temperature (degrees Celsius, per the field name).
TemperatureC *uint
// UsedMemoryMiB is the used device memory (MiB, per the field name).
UsedMemoryMiB *uint64
// BAR1UsedMiB is the used BAR1 memory (MiB, per the field name).
BAR1UsedMiB *uint64
// ECCErrorsL1Cache is the ECC error counter for the L1 cache.
ECCErrorsL1Cache *uint64
// ECCErrorsL2Cache is the ECC error counter for the L2 cache.
ECCErrorsL2Cache *uint64
// ECCErrorsDevice is the ECC error counter for device memory.
ECCErrorsDevice *uint64
}
// NvmlClient describes how users would use the nvml library.
type NvmlClient interface {
// GetFingerprintData returns the driver version together with the
// fingerprint attributes of every available device.
GetFingerprintData() (*FingerprintData, error)
// GetStatsData returns one StatsData entry per available device.
GetStatsData() ([]*StatsData, error)
}
// nvmlClient implements NvmlClient.
// Users of this lib are expected to obtain this struct via the NewNvmlClient
// func, which supplies an initialized driver.
type nvmlClient struct {
// driver is the NvmlDriver every query is delegated to.
driver NvmlDriver
}
// NewNvmlClient builds an nvmlClient backed by the real NvmlDriver
// implementation. The driver is initialized before the client is handed out;
// when initialization fails the driver's error is returned unchanged and no
// client is created.
func NewNvmlClient() (*nvmlClient, error) {
	realDriver := new(nvmlDriver)
	if initFailure := realDriver.Initialize(); initFailure != nil {
		return nil, initFailure
	}

	client := &nvmlClient{driver: realDriver}
	return client, nil
}
// GetFingerprintData returns FingerprintData for available Nvidia devices.
//
// It queries the driver version and the device count, then builds one
// FingerprintDeviceData per device index. The first driver error aborts the
// call: nil is returned together with an error that names the failing driver
// call and wraps the underlying error, so errors.Is / errors.As still see it.
func (c *nvmlClient) GetFingerprintData() (*FingerprintData, error) {
	/*
		nvml fields to be fingerprinted # nvml_library_call
		1  - Driver Version             # nvmlSystemGetDriverVersion
		2  - Product Name               # nvmlDeviceGetName
		3  - GPU UUID                   # nvmlDeviceGetUUID
		4  - Total Memory               # nvmlDeviceGetMemoryInfo
		5  - Power                      # nvmlDeviceGetPowerManagementLimit
		6  - PCIBusID                   # nvmlDeviceGetPciInfo
		7  - BAR1 Memory                # nvmlDeviceGetBAR1MemoryInfo
		8  - PCI Bandwidth
		9  - Memory, Cores Clock        # nvmlDeviceGetMaxClockInfo
		10 - Display Mode               # nvmlDeviceGetDisplayMode
		11 - Persistence Mode           # nvmlDeviceGetPersistenceMode
	*/

	// The receiver is assumed to come from NewNvmlClient, which initializes
	// the driver; this method performs no initialization of its own.
	driverVersion, err := c.driver.SystemDriverVersion()
	if err != nil {
		return nil, fmt.Errorf("nvidia nvml SystemDriverVersion() error: %w", err)
	}

	numDevices, err := c.driver.DeviceCount()
	if err != nil {
		return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %w", err)
	}

	allNvidiaGPUResources := make([]*FingerprintDeviceData, numDevices)
	for i := range allNvidiaGPUResources {
		deviceInfo, err := c.driver.DeviceInfoByIndex(uint(i))
		if err != nil {
			return nil, fmt.Errorf("nvidia nvml DeviceInfoByIndex() error: %w", err)
		}

		allNvidiaGPUResources[i] = &FingerprintDeviceData{
			DeviceData: &DeviceData{
				DeviceName: deviceInfo.Name,
				UUID:       deviceInfo.UUID,
				MemoryMiB:  deviceInfo.MemoryMiB,
				PowerW:     deviceInfo.PowerW,
				BAR1MiB:    deviceInfo.BAR1MiB,
			},
			PCIBandwidthMBPerS: deviceInfo.PCIBandwidthMBPerS,
			CoresClockMHz:      deviceInfo.CoresClockMHz,
			MemoryClockMHz:     deviceInfo.MemoryClockMHz,
			DisplayState:       deviceInfo.DisplayState,
			PersistenceMode:    deviceInfo.PersistenceMode,
			PCIBusID:           deviceInfo.PCIBusID,
		}
	}

	return &FingerprintData{
		Devices:       allNvidiaGPUResources,
		DriverVersion: driverVersion,
	}, nil
}
// GetStatsData returns statistics data for all devices on this machine.
//
// It asks the driver for the device count and then builds one StatsData per
// device index from the driver's device info and device status. The first
// driver error aborts the call: nil is returned together with an error that
// names the failing driver call and wraps the underlying error, so
// errors.Is / errors.As still see it.
func (c *nvmlClient) GetStatsData() ([]*StatsData, error) {
	/*
		nvml fields to be reported to stats api     # nvml_library_call
		1  - Used Memory                            # nvmlDeviceGetMemoryInfo
		2  - Utilization of GPU                     # nvmlDeviceGetUtilizationRates
		3  - Utilization of Memory                  # nvmlDeviceGetUtilizationRates
		4  - Utilization of Decoder                 # nvmlDeviceGetDecoderUtilization
		5  - Utilization of Encoder                 # nvmlDeviceGetEncoderUtilization
		6  - Current GPU Temperature                # nvmlDeviceGetTemperature
		7  - Power Draw                             # nvmlDeviceGetPowerUsage
		8  - BAR1 Used memory                       # nvmlDeviceGetBAR1MemoryInfo
		9  - ECC Errors on requesting L1Cache       # nvmlDeviceGetMemoryErrorCounter
		10 - ECC Errors on requesting L2Cache       # nvmlDeviceGetMemoryErrorCounter
		11 - ECC Errors on requesting Device memory # nvmlDeviceGetMemoryErrorCounter
	*/

	// The receiver is assumed to come from NewNvmlClient, which initializes
	// the driver; this method performs no initialization of its own.
	numDevices, err := c.driver.DeviceCount()
	if err != nil {
		return nil, fmt.Errorf("nvidia nvml DeviceCount() error: %w", err)
	}

	allNvidiaGPUStats := make([]*StatsData, numDevices)
	for i := range allNvidiaGPUStats {
		deviceInfo, deviceStatus, err := c.driver.DeviceInfoAndStatusByIndex(uint(i))
		if err != nil {
			return nil, fmt.Errorf("nvidia nvml DeviceInfoAndStatusByIndex() error: %w", err)
		}

		allNvidiaGPUStats[i] = &StatsData{
			DeviceData: &DeviceData{
				DeviceName: deviceInfo.Name,
				UUID:       deviceInfo.UUID,
				MemoryMiB:  deviceInfo.MemoryMiB,
				PowerW:     deviceInfo.PowerW,
				BAR1MiB:    deviceInfo.BAR1MiB,
			},
			PowerUsageW:        deviceStatus.PowerUsageW,
			GPUUtilization:     deviceStatus.GPUUtilization,
			MemoryUtilization:  deviceStatus.MemoryUtilization,
			EncoderUtilization: deviceStatus.EncoderUtilization,
			DecoderUtilization: deviceStatus.DecoderUtilization,
			TemperatureC:       deviceStatus.TemperatureC,
			UsedMemoryMiB:      deviceStatus.UsedMemoryMiB,
			BAR1UsedMiB:        deviceStatus.BAR1UsedMiB,
			ECCErrorsL1Cache:   deviceStatus.ECCErrorsL1Cache,
			ECCErrorsL2Cache:   deviceStatus.ECCErrorsL2Cache,
			ECCErrorsDevice:    deviceStatus.ECCErrorsDevice,
		}
	}

	return allNvidiaGPUStats, nil
}