Skip to content

Commit

Permalink
Use local daemonset manifest for installing Nvidia drivers
Browse files Browse the repository at this point in the history
Updates sig-scheduling e2e Nvidia GPU tests to install drivers using
local manifest by default. Currently the DaemonSet is fetched from the
GoogleCloudPlatform/container-engine-accelerators repo by default.
Using a local manifest allows for manually specifying the
cos-gpu-installer image rather than always using latest. A remote
manifest can still be fetched by setting
NVIDIA_DRIVER_INSTALLER_DAEMONSET env var.

Signed-off-by: hasheddan <georgedanielmangum@gmail.com>
  • Loading branch information
hasheddan committed Jul 26, 2020
1 parent 79569e2 commit 67508f1
Show file tree
Hide file tree
Showing 4 changed files with 102 additions and 11 deletions.
6 changes: 5 additions & 1 deletion test/e2e/framework/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -1933,7 +1933,6 @@ func DumpDebugInfo(c clientset.Interface, ns string) {

// DsFromManifest reads a .json/yaml file and returns the daemonset in it.
func DsFromManifest(url string) (*appsv1.DaemonSet, error) {
var ds appsv1.DaemonSet
Logf("Parsing ds from %v", url)

var response *http.Response
Expand All @@ -1959,7 +1958,12 @@ func DsFromManifest(url string) (*appsv1.DaemonSet, error) {
if err != nil {
return nil, fmt.Errorf("Failed to read html response body: %v", err)
}
return DsFromData(data)
}

// DsFromData reads a .json/yaml file and returns the daemonset in it.
func DsFromData(data []byte) (*appsv1.DaemonSet, error) {
var ds appsv1.DaemonSet
dataJSON, err := utilyaml.ToJSON(data)
if err != nil {
return nil, fmt.Errorf("Failed to parse data to json: %v", err)
Expand Down
1 change: 1 addition & 0 deletions test/e2e/scheduling/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ go_library(
"//test/e2e/framework/pv:go_default_library",
"//test/e2e/framework/replicaset:go_default_library",
"//test/e2e/framework/service:go_default_library",
"//test/e2e/framework/testfiles:go_default_library",
"//test/utils:go_default_library",
"//test/utils/image:go_default_library",
"//vendor/github.com/onsi/ginkgo:go_default_library",
Expand Down
26 changes: 16 additions & 10 deletions test/e2e/scheduling/nvidia-gpus.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,19 @@ import (
"regexp"
"time"

appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/uuid"
extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
"k8s.io/kubernetes/test/e2e/framework"
"k8s.io/kubernetes/test/e2e/framework/gpu"
e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
jobutil "k8s.io/kubernetes/test/e2e/framework/job"
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
"k8s.io/kubernetes/test/e2e/framework/providers/gce"
e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
imageutils "k8s.io/kubernetes/test/utils/image"

"github.com/onsi/ginkgo"
Expand All @@ -46,7 +48,6 @@ const (

var (
gpuResourceName v1.ResourceName
dsYamlURL string
)

func makeCudaAdditionDevicePluginTestPod() *v1.Pod {
Expand Down Expand Up @@ -124,18 +125,23 @@ func getGPUsAvailable(f *framework.Framework) int64 {
func SetupNVIDIAGPUNode(f *framework.Framework, setupResourceGatherer bool) *framework.ContainerResourceGatherer {
logOSImages(f)

var err error
var ds *appsv1.DaemonSet
dsYamlURLFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
if dsYamlURLFromEnv != "" {
dsYamlURL = dsYamlURLFromEnv
// Using DaemonSet from remote URL
framework.Logf("Using remote nvidia-driver-installer daemonset manifest from %v", dsYamlURLFromEnv)
ds, err = framework.DsFromManifest(dsYamlURLFromEnv)
framework.ExpectNoError(err, "failed get remote")
} else {
dsYamlURL = "https://raw.githubusercontent.com/GoogleCloudPlatform/container-engine-accelerators/master/daemonset.yaml"
// Using default local DaemonSet
framework.Logf("Using default local nvidia-driver-installer daemonset manifest.")
data, err := e2etestfiles.Read("test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml")
framework.ExpectNoError(err, "failed to read local manifest for nvidia-driver-installer daemonset")
ds, err = framework.DsFromData(data)
framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
}
gpuResourceName = gpu.NVIDIAGPUResourceName

framework.Logf("Using %v", dsYamlURL)
// Creates the DaemonSet that installs Nvidia Drivers.
ds, err := framework.DsFromManifest(dsYamlURL)
framework.ExpectNoError(err)
gpuResourceName = e2egpu.NVIDIAGPUResourceName
ds.Namespace = f.Namespace.Name
_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(ds)
framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
Expand Down
80 changes: 80 additions & 0 deletions test/e2e/testing-manifests/scheduling/nvidia-driver-installer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# This DaemonSet was originally referenced from
# https://github.com/GoogleCloudPlatform/container-engine-accelerators/blob/master/daemonset.yaml

# The Dockerfile and other source for this daemonset are in
# https://github.com/GoogleCloudPlatform/cos-gpu-installer

apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: nvidia-driver-installer
  namespace: kube-system
  labels:
    k8s-app: nvidia-driver-installer
spec:
  selector:
    matchLabels:
      k8s-app: nvidia-driver-installer
  updateStrategy:
    type: RollingUpdate
  template:
    metadata:
      labels:
        name: nvidia-driver-installer
        k8s-app: nvidia-driver-installer
    spec:
      affinity:
        nodeAffinity:
          # Only schedule onto nodes that advertise a GKE accelerator.
          requiredDuringSchedulingIgnoredDuringExecution:
            nodeSelectorTerms:
            - matchExpressions:
              - key: cloud.google.com/gke-accelerator
                operator: Exists
      # Tolerate every taint so the installer also lands on tainted GPU nodes.
      tolerations:
      - operator: "Exists"
      # Driver installation needs direct access to the host's network,
      # process namespace, devices, and root filesystem.
      hostNetwork: true
      hostPID: true
      volumes:
      - name: dev
        hostPath:
          path: /dev
      - name: vulkan-icd-mount
        hostPath:
          path: /home/kubernetes/bin/nvidia/vulkan/icd.d
      - name: nvidia-install-dir-host
        hostPath:
          path: /home/kubernetes/bin/nvidia
      - name: root-mount
        hostPath:
          path: /
      initContainers:
      # Pinned installer image (rather than :latest) so e2e runs are reproducible.
      - image: gcr.io/cos-cloud/cos-gpu-installer:v20200701
        name: nvidia-driver-installer
        resources:
          requests:
            cpu: 0.15
        securityContext:
          # Required to load kernel modules and write to host paths.
          privileged: true
        env:
        - name: NVIDIA_INSTALL_DIR_HOST
          value: /home/kubernetes/bin/nvidia
        - name: NVIDIA_INSTALL_DIR_CONTAINER
          value: /usr/local/nvidia
        - name: VULKAN_ICD_DIR_HOST
          value: /home/kubernetes/bin/nvidia/vulkan/icd.d
        - name: VULKAN_ICD_DIR_CONTAINER
          value: /etc/vulkan/icd.d
        - name: ROOT_MOUNT_DIR
          value: /root
        volumeMounts:
        - name: nvidia-install-dir-host
          mountPath: /usr/local/nvidia
        - name: vulkan-icd-mount
          mountPath: /etc/vulkan/icd.d
        - name: dev
          mountPath: /dev
        - name: root-mount
          mountPath: /root
      containers:
      # The real work happens in the init container; pause just keeps the
      # pod Running so the DaemonSet reports Ready after installation.
      - image: "k8s.gcr.io/pause:3.2"
        name: pause

0 comments on commit 67508f1

Please sign in to comment.