Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions build/docker/intel-gpu-levelzero.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,15 @@ RUN if [ $ROCKYLINUX -eq 0 ]; then \
LATEST_GO=$(curl --no-progress-meter https://go.dev/dl/?mode=json | jq ".[] | select(.version | startswith(\"go${CGO_VERSION}\")).version" | tr -d "\"") && \
wget -q https://go.dev/dl/$LATEST_GO.linux-amd64.tar.gz -O - | tar -xz -C /usr/local && \
cd /runtime && \
wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/intel-level-zero-gpu_1.6.32961.7_amd64.deb && \
wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/intel-opencl-icd_25.09.32961.7_amd64.deb && \
wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/libigdgmm12_22.6.0_amd64.deb && \
wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.20.2/level-zero-devel_1.20.2+u22.04_amd64.deb && \
wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.20.2/level-zero_1.20.2+u22.04_amd64.deb && \
wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.8.3/intel-igc-core-2_2.8.3+18762_amd64.deb && \
wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.8.3/intel-igc-opencl-2_2.8.3+18762_amd64.deb && \
wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-core-2_2.20.3+19972_amd64.deb && \
wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-opencl-2_2.20.3+19972_amd64.deb && \
wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/intel-opencl-icd_25.40.35563.4-0_amd64.deb && \
wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libigdgmm12_22.8.2_amd64.deb && \
wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libze-intel-gpu1_25.40.35563.4-0_amd64.deb && \
wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \
wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero-devel_1.24.3+u22.04_amd64.deb && \
dpkg -i *.deb && \
rm -f *.deb && \
rm -rf /var/lib/apt/lists/\*; \
else \
source /etc/os-release && dnf install -y gcc jq wget 'dnf-command(config-manager)' && \
Expand Down Expand Up @@ -83,9 +84,19 @@ ARG CMD
ARG ROCKYLINUX
COPY --from=builder /runtime /runtime
RUN if [ $ROCKYLINUX -eq 0 ]; then \
apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 && \
rm /runtime/level-zero-devel_*.deb && \
cd /runtime && dpkg -i *.deb && rm -rf /runtime && \
apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 wget ca-certificates && \
cd /runtime && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-core-2_2.20.3+19972_amd64.deb && \
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-opencl-2_2.20.3+19972_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/intel-opencl-icd_25.40.35563.4-0_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libigdgmm12_22.8.2_amd64.deb && \
wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libze-intel-gpu1_25.40.35563.4-0_amd64.deb && \
wget https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \
dpkg -i *.deb && \
apt-get -y remove wget ca-certificates && \
apt-get -y autoremove && \
rm -f *.deb && \
rm -rf /var/lib/apt/lists/\* && \
rm "/lib/x86_64-linux-gnu/libze_validation"* && rm "/lib/x86_64-linux-gnu/libze_tracing_layer"*; \
else \
cp -a /runtime//*.so* /usr/lib64/ && cp -a /runtime/OpenCL /etc/ && cp -a /runtime/licenses/* /usr/share/licenses/; \
Expand Down
31 changes: 21 additions & 10 deletions build/docker/templates/intel-gpu-levelzero.Dockerfile.in
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,15 @@ RUN if [ $ROCKYLINUX -eq 0 ]; then \N
LATEST_GO=$(curl --no-progress-meter https://go.dev/dl/?mode=json | jq ".[] | select(.version | startswith(\"go${CGO_VERSION}\")).version" | tr -d "\"") && \N
wget -q https://go.dev/dl/$LATEST_GO.linux-amd64.tar.gz -O - | tar -xz -C /usr/local && \N
cd /runtime && \N
wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/intel-level-zero-gpu_1.6.32961.7_amd64.deb && \N
wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/intel-opencl-icd_25.09.32961.7_amd64.deb && \N
wget -q https://github.com/intel/compute-runtime/releases/download/25.09.32961.7/libigdgmm12_22.6.0_amd64.deb && \N
wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.20.2/level-zero-devel_1.20.2+u22.04_amd64.deb && \N
wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.20.2/level-zero_1.20.2+u22.04_amd64.deb && \N
wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.8.3/intel-igc-core-2_2.8.3+18762_amd64.deb && \N
wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.8.3/intel-igc-opencl-2_2.8.3+18762_amd64.deb && \N
wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-core-2_2.20.3+19972_amd64.deb && \N
wget -q https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-opencl-2_2.20.3+19972_amd64.deb && \N
wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/intel-opencl-icd_25.40.35563.4-0_amd64.deb && \N
wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libigdgmm12_22.8.2_amd64.deb && \N
wget -q https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libze-intel-gpu1_25.40.35563.4-0_amd64.deb && \N
wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \N
wget -q https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero-devel_1.24.3+u22.04_amd64.deb && \N
dpkg -i *.deb && \N
rm -f *.deb && \N
rm -rf /var/lib/apt/lists/\*; \N
else \N
source /etc/os-release && dnf install -y gcc jq wget 'dnf-command(config-manager)' && \N
Expand Down Expand Up @@ -80,9 +81,19 @@ ARG ROCKYLINUX
COPY --from=builder /runtime /runtime

RUN if [ $ROCKYLINUX -eq 0 ]; then \N
apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 && \N
rm /runtime/level-zero-devel_*.deb && \N
cd /runtime && dpkg -i *.deb && rm -rf /runtime && \N
apt-get update && apt-get install --no-install-recommends -y ocl-icd-libopencl1 wget ca-certificates && \N
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suspect the main size reduction from this duplication actually comes from removing the (large) downloaded deb files that were accidentally left behind, not from dropping l0-dev, wget, certs & their deps. Did you check that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It mainly comes from this:
COPY --from=builder /runtime /runtime
The runtime deb packages are copied from the build phase to the final phase, and while they are removed after the install, the copy creates a large, unnecessary layer. This is evident if you open the container in dive.

It would be nice if one could install packages directly from the build phase.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Docker does not support host volumes for builds (it needs an extension), but Podman does. What if the packages were on a host tmp volume (-v $(mktemp -d):/temporary:rw)? I don't think those would end up in the final image.

cd /runtime && \N
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-core-2_2.20.3+19972_amd64.deb && \N
wget https://github.com/intel/intel-graphics-compiler/releases/download/v2.20.3/intel-igc-opencl-2_2.20.3+19972_amd64.deb && \N
wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/intel-opencl-icd_25.40.35563.4-0_amd64.deb && \N
wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libigdgmm12_22.8.2_amd64.deb && \N
wget https://github.com/intel/compute-runtime/releases/download/25.40.35563.4/libze-intel-gpu1_25.40.35563.4-0_amd64.deb && \N
wget https://github.com/oneapi-src/level-zero/releases/download/v1.24.3/level-zero_1.24.3+u22.04_amd64.deb && \N
dpkg -i *.deb && \N
apt-get -y remove wget ca-certificates && \N
apt-get -y autoremove && \N
rm -f *.deb && \N
rm -rf /var/lib/apt/lists/\* && \N
rm "/lib/x86_64-linux-gnu/libze_validation"* && rm "/lib/x86_64-linux-gnu/libze_tracing_layer"*; \N
else \N
cp -a /runtime//*.so* /usr/lib64/ && cp -a /runtime/OpenCL /etc/ && cp -a /runtime/licenses/* /usr/share/licenses/; \N
Expand Down
15 changes: 11 additions & 4 deletions cmd/gpu_levelzero/zes.c
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,12 @@ static ze_result_t enumerate_zes_devices(void)
for (uint32_t i = 0; i < count; ++i) {
zes_device_handle_t dev_h = zes_handles[i];

zes_pci_properties_t pci_props;
zes_pci_properties_t pci_props = {
.pNext = NULL,
};

if (zesDevicePciGetProperties(dev_h, &pci_props) != ZE_RESULT_SUCCESS) {
print_log(LOG_WARNING, "Failed to get PCI properties for device %d: %X\n", i, res);
continue;
}

Expand Down Expand Up @@ -332,8 +336,9 @@ bool zes_device_bus_is_healthy(char* bdf_address, uint32_t* error)
return true;
}

zes_pci_state_t pci_state;
memset(&pci_state, 0, sizeof(pci_state));
zes_pci_state_t pci_state = {
.pNext = NULL,
};

ze_result_t res = zesDevicePciGetState(handle, &pci_state);
if (res == ZE_RESULT_SUCCESS) {
Expand Down Expand Up @@ -409,7 +414,9 @@ double zes_device_temp_max(char* bdf_address, char* sensor, uint32_t* error)
}

for (uint32_t i = 0; i < count; ++i) {
zes_temp_properties_t props;
zes_temp_properties_t props = {
.pNext = NULL,
};

res = zesTemperatureGetProperties(tempHandles[i], &props);
if (res != ZE_RESULT_SUCCESS) {
Expand Down
16 changes: 10 additions & 6 deletions cmd/gpu_plugin/gpu_plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,6 @@ const (
devfsDriDirectory = "/dev/dri"
wslDxgPath = "/dev/dxg"
wslLibPath = "/usr/lib/wsl"
nfdFeatureDir = "/etc/kubernetes/node-feature-discovery/features.d"
resourceFilename = "intel-gpu-resources.txt"
gpuDeviceRE = `^card[0-9]+$`
controlDeviceRE = `^controlD[0-9]+$`
pciAddressRE = "^[0-9a-f]{4}:[0-9a-f]{2}:[0-9a-f]{2}\\.[0-9a-f]{1}$"
Expand Down Expand Up @@ -72,7 +70,9 @@ type cliOptions struct {
allowIDs string
denyIDs string
sharedDevNum int
temperatureLimit int
globalTempLimit int
memoryTempLimit int
gpuTempLimit int
enableMonitoring bool
wslScan bool
healthManagement bool
Expand Down Expand Up @@ -404,13 +404,15 @@ func (dp *devicePlugin) healthStatusForCard(cardPath string) string {
return health
}

limit := float64(dp.options.temperatureLimit)
globalTempLimit := float64(dp.options.globalTempLimit)
memoryTempLimit := float64(dp.options.memoryTempLimit)
gpuTempLimit := float64(dp.options.gpuTempLimit)

// Temperatures for different areas
klog.V(4).Infof("Temperatures: Memory=%.1fC, GPU=%.1fC, Global=%.1fC",
deviceTemps.Memory, deviceTemps.GPU, deviceTemps.Global)

if deviceTemps.GPU > limit || deviceTemps.Global > limit || deviceTemps.Memory > limit {
if deviceTemps.GPU > gpuTempLimit || deviceTemps.Global > globalTempLimit || deviceTemps.Memory > memoryTempLimit {
health = pluginapi.Unhealthy
}

Expand Down Expand Up @@ -786,7 +788,9 @@ func main() {
flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management")
flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices")
flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device")
flag.IntVar(&opts.temperatureLimit, "temp-limit", 100, "temperature limit at which device is marked unhealthy")
flag.IntVar(&opts.globalTempLimit, "temp-limit", 100, "Global temperature limit at which device is marked unhealthy")
flag.IntVar(&opts.gpuTempLimit, "gpu-temp-limit", 100, "GPU temperature limit at which device is marked unhealthy")
flag.IntVar(&opts.memoryTempLimit, "memory-temp-limit", 100, "Memory temperature limit at which device is marked unhealthy")
flag.StringVar(&opts.preferredAllocationPolicy, "allocation-policy", "none", "modes of allocating GPU devices: balanced, packed and none")
flag.StringVar(&opts.allowIDs, "allow-ids", "", "comma-separated list of device IDs to allow (e.g. 0x49c5,0x49c6)")
flag.StringVar(&opts.denyIDs, "deny-ids", "", "comma-separated list of device IDs to deny (e.g. 0x49c5,0x49c6)")
Expand Down
29 changes: 24 additions & 5 deletions cmd/gpu_plugin/gpu_plugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,11 @@ func (n *mockNotifier) Notify(newDeviceTree dpapi.DeviceTree) {
}

type mockL0Service struct {
indices []uint32
memSize uint64
healthy bool
fail bool
indices []uint32
memSize uint64
healthy bool
failTemp bool
fail bool
}

func (m *mockL0Service) Run(keep bool) {
Expand All @@ -83,7 +84,7 @@ func (m *mockL0Service) GetDeviceHealth(bdfAddress string) (levelzeroservice.Dev
return levelzeroservice.DeviceHealth{Memory: m.healthy, Bus: m.healthy, SoC: m.healthy}, nil
}
func (m *mockL0Service) GetDeviceTemperature(bdfAddress string) (levelzeroservice.DeviceTemperature, error) {
if m.fail {
if m.fail || m.failTemp {
return levelzeroservice.DeviceTemperature{}, errors.Errorf("error, error")
}

Expand Down Expand Up @@ -608,6 +609,24 @@ func TestScanWithHealth(t *testing.T) {
healthy: true,
},
},
{
name: "one device with failure on temp reading",
pciAddresses: map[string]string{"0000:00:00.0": "card0"},
sysfsdirs: []string{"card0/device/drm/card0", "card0/device/drm/controlD64"},
sysfsfiles: map[string][]byte{
"card0/device/vendor": []byte("0x8086"),
},
devfsdirs: []string{
"card0",
"by-path/pci-0000:00:00.0-card",
"by-path/pci-0000:00:00.0-render",
},
expectedI915Devs: 1,
l0mock: &mockL0Service{
healthy: true,
failTemp: true,
},
},
{
name: "one unhealthy device with proper symlink",
pciAddresses: map[string]string{"0000:00:00.0": "card0"},
Expand Down
Loading