From 1e77594958e346e26e4d29fa0d018729fdce1d86 Mon Sep 17 00:00:00 2001 From: Vishnu kannan Date: Fri, 28 Apr 2017 17:48:36 -0700 Subject: [PATCH 1/3] Adding an installer script that installs Nvidia drivers in Container Optimized OS Packaged the script as a docker container stored in gcr.io/google-containers A daemonset deployment is included to make it easy to consume the installer A cluster e2e has been added to test the installation daemonset along with verifying installation by using a sample CUDA application. Node e2e for GPUs updated to avoid running on nodes without GPU devices. Signed-off-by: Vishnu kannan --- cluster/gce/BUILD | 5 +- cluster/gce/config-default.sh | 2 + cluster/gce/gci/configure-helper.sh | 1 + cluster/gce/gci/nvidia-gpus/BUILD | 24 ++ cluster/gce/gci/nvidia-gpus/Dockerfile | 28 +++ cluster/gce/gci/nvidia-gpus/Makefile | 27 +++ .../nvidia-gpus/cos-installer-daemonset.yaml | 57 +++++ cluster/gce/gci/nvidia-gpus/installer.sh | 207 ++++++++++++++++++ hack/generate-bindata.sh | 3 +- test/e2e/BUILD | 1 + test/e2e/generated/BUILD | 1 + test/e2e/nvidia-gpus.go | 178 +++++++++++++++ test/e2e_node/gpus.go | 53 ++++- test/e2e_node/jenkins/gci-init-gpu.yaml | 19 ++ .../e2e_node/jenkins/image-config-serial.yaml | 7 +- test/e2e_node/runner/remote/run_remote.go | 1 + test/images/nvidia-cuda/Dockerfile | 24 ++ test/images/nvidia-cuda/Makefile | 28 +++ test/images/nvidia-cuda/README.md | 13 ++ 19 files changed, 665 insertions(+), 14 deletions(-) create mode 100644 cluster/gce/gci/nvidia-gpus/BUILD create mode 100644 cluster/gce/gci/nvidia-gpus/Dockerfile create mode 100644 cluster/gce/gci/nvidia-gpus/Makefile create mode 100644 cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml create mode 100644 cluster/gce/gci/nvidia-gpus/installer.sh create mode 100644 test/e2e/nvidia-gpus.go create mode 100644 test/e2e_node/jenkins/gci-init-gpu.yaml create mode 100644 test/images/nvidia-cuda/Dockerfile create mode 100644 test/images/nvidia-cuda/Makefile create mode 100644 test/images/nvidia-cuda/README.md diff --git a/cluster/gce/BUILD b/cluster/gce/BUILD index ccc7be8ae25f..66079dc2fcd1 100644 --- a/cluster/gce/BUILD +++ b/cluster/gce/BUILD @@ -32,7 +32,10 @@ filegroup( filegroup( name = "all-srcs", - srcs = [":package-srcs"], + srcs = [ + ":package-srcs", + "//cluster/gce/gci/nvidia-gpus:all-srcs", + ], tags = ["automanaged"], ) diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index 0ca7350af18b..1bcf7b05c17b 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -68,6 +68,8 @@ fi # variable. Also please update corresponding image for node e2e at: # https://github.com/kubernetes/kubernetes/blob/master/test/e2e_node/jenkins/image-config.yaml CVM_VERSION=${CVM_VERSION:-container-vm-v20170214} +# NOTE: Update the kernel commit SHA in cluster/addons/nvidia-gpus/cos-installer-daemonset.yaml +# while updating the COS version here. 
GCI_VERSION=${KUBE_GCI_VERSION:-gci-stable-56-9000-84-2} MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-} MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers} diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh index 3c60b7b4961e..9c0099ed9ca1 100644 --- a/cluster/gce/gci/configure-helper.sh +++ b/cluster/gce/gci/configure-helper.sh @@ -1605,4 +1605,5 @@ else fi reset-motd prepare-mounter-rootfs +modprobe configs echo "Done for the configuration for kubernetes" diff --git a/cluster/gce/gci/nvidia-gpus/BUILD b/cluster/gce/gci/nvidia-gpus/BUILD new file mode 100644 index 000000000000..0f8fa04948e2 --- /dev/null +++ b/cluster/gce/gci/nvidia-gpus/BUILD @@ -0,0 +1,24 @@ +package(default_visibility = ["//visibility:public"]) + +load("@io_bazel//tools/build_defs/pkg:pkg.bzl", "pkg_tar") +load("@io_kubernetes_build//defs:build.bzl", "release_filegroup") + +filegroup( + name = "sources", + srcs = glob([ + "**/*", + ]), +) + +filegroup( + name = "package-srcs", + srcs = glob(["**"]), + tags = ["automanaged"], + visibility = ["//visibility:private"], +) + +filegroup( + name = "all-srcs", + srcs = [":package-srcs"], + tags = ["automanaged"], +) diff --git a/cluster/gce/gci/nvidia-gpus/Dockerfile b/cluster/gce/gci/nvidia-gpus/Dockerfile new file mode 100644 index 000000000000..9cec8ab365d8 --- /dev/null +++ b/cluster/gce/gci/nvidia-gpus/Dockerfile @@ -0,0 +1,28 @@ +# Copyright 2017 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +FROM ubuntu:16.04 + +# Disable prompts from apt +ENV DEBIAN_FRONTEND noninteractive + +RUN apt-get -qq update +RUN apt-get install -qq pciutils gcc g++ git make dpkg-dev bc module-init-tools curl + +RUN mkdir /lakitu-kernel +RUN git clone https://chromium.googlesource.com/chromiumos/third_party/kernel /lakitu-kernel + +ADD installer.sh /usr/bin/nvidia-installer.sh +RUN chmod a+x /usr/bin/nvidia-installer.sh +CMD ["/usr/bin/nvidia-installer.sh"] \ No newline at end of file diff --git a/cluster/gce/gci/nvidia-gpus/Makefile b/cluster/gce/gci/nvidia-gpus/Makefile new file mode 100644 index 000000000000..49a0dfc2c84f --- /dev/null +++ b/cluster/gce/gci/nvidia-gpus/Makefile @@ -0,0 +1,27 @@ +# Copyright 2017 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +TAG?=v0.1 +REGISTRY?=gcr.io/google_containers +IMAGE=cos-nvidia-driver-install + +all: container + +container: + docker build --pull -t ${REGISTRY}/${IMAGE}:${TAG} . 
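+# Typical usage (a sketch; the registry and tag values are illustrative, not CI defaults):
+#   make container push REGISTRY=gcr.io/my-project TAG=v0.2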
+ +push: + gcloud docker -- push ${REGISTRY}/${IMAGE}:${TAG} + +.PHONY: all container push diff --git a/cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml b/cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml new file mode 100644 index 000000000000..a8ecbf249620 --- /dev/null +++ b/cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml @@ -0,0 +1,57 @@ +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: cos-nvidia-installer + namespace: kube-system +spec: + template: + metadata: + labels: + name: cos-nvidia-installer + # Update the version tag here and `LAKITU_KERNEL_SHA1` while using against new COS releases. + cos-version: cos-beta-59-9460-20-0 + spec: + hostNetwork: true + hostPID: true + volumes: + - name: dev + hostPath: + path: /dev + - name: nvidia-overlay + hostPath: + path: /home/kubernetes/bin/nvidia + - name: os-release + hostPath: + path: /etc/os-release + - name: sysrq + hostPath: + path: /proc/sysrq-trigger + containers: + - image: gcr.io/google_containers/cos-nvidia-driver-install@sha256:ad83ede6e0c6d768bf7cf69a7dec972aa5e8f88778142ca46afd3286ad58cfc8 + command: ["/bin/sh", "-c"] + args: ["usr/bin/nvidia-installer.sh && sleep infinity"] + env: + - name: BASE_DIR + value: "/rootfs/nvidia" + name: nvidia-driver-installer + resources: + requests: + cpu: 0.15 + securityContext: + privileged: true + env: + # The kernel SHA1 here should correspond to the GCI_VERSION specified by default under cluster/gce/config-default.sh + - name: LAKITU_KERNEL_SHA1 + value: 26481563cb3788ad254c2bf2126b843c161c7e48 + - name: BASE_DIR + value: "/rootfs/nvidia" + volumeMounts: + - name: nvidia-overlay + mountPath: /rootfs/nvidia + - name: dev + mountPath: /dev + - name: os-release + mountPath: /rootfs/etc/os-release + - name: sysrq + mountPath: /sysrq + diff --git a/cluster/gce/gci/nvidia-gpus/installer.sh b/cluster/gce/gci/nvidia-gpus/installer.sh new file mode 100644 index 000000000000..a950d426e86f --- /dev/null +++ b/cluster/gce/gci/nvidia-gpus/installer.sh @@ -0,0 +1,207 @@ +#!/bin/bash + +# Copyright 2017 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This script is for dynamically installing nvidia kernel drivers in Container Optimized OS + +set -o errexit +set -o pipefail +set -x + +# The script must be run as a root. +# Prerequisites: +# +# LAKITU_KERNEL_SHA1 - The env variable is expected to be set to HEAD of the kernel version used on the host. +# BASE_DIR - Directory that is mapped to a stateful partition on host. Defaults to `/rootfs/nvidia`. +# +# The script will output the following artifacts: +# ${BASE_DIR}/lib* --> Nvidia CUDA libraries +# ${BASE_DIR}/bin/* --> Nvidia debug utilities +# ${BASE_DIR}/.cache/* --> Nvidia driver artifacts cached for idempotency. 
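+#
+# How it works, in terms of the functions below: /usr and /lib are remounted as
+# writable overlayfs layers backed by ${CACHE_DIR}, kernel sources matching the
+# running COS kernel are prepared, the NVIDIA installer builds and loads the
+# driver against them, and the resulting user-space libraries and debug
+# utilities are copied to ${BASE_DIR} on the host for other containers to mount.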
+# + +BASE_DIR=${BASE_DIR:-"/rootfs/nvidia"} +CACHE_DIR="${BASE_DIR}/.cache" +USR_WORK_DIR="${CACHE_DIR}/usr-work" +USR_WRITABLE_DIR="${CACHE_DIR}/usr-writable" +LIB_WORK_DIR="${CACHE_DIR}/lib-work" +LIB_WRITABLE_DIR="${CACHE_DIR}/lib-writable" + +LIB_OUTPUT_DIR="${BASE_DIR}/lib" +BIN_OUTPUT_DIR="${BASE_DIR}/bin" + +KERNEL_SRC_DIR="/lakitu-kernel" +NVIDIA_DRIVER_DIR="/nvidia" +NVIDIA_DRIVER_VERSION="375.26" + +# Source: https://developer.nvidia.com/cuda-downloads +NVIDIA_CUDA_URL="https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda_8.0.61_375.26_linux-run" +NVIDIA_CUDA_MD5SUM="33e1bd980e91af4e55f3ef835c103f9b" +NVIDIA_CUDA_PKG_NAME="cuda_8.0.61_375.26_linux.run" +NVIDIA_DRIVER_PKG_NAME="NVIDIA-Linux-x86_64-375.26.run" + +check_nvidia_device() { + lspci + if ! lspci | grep -i -q NVIDIA; then + echo "No NVIDIA devices attached to this instance." + exit 0 + fi + echo "Found NVIDIA device on this instance." +} + +prepare_kernel_source() { + local kernel_git_repo="https://chromium.googlesource.com/chromiumos/third_party/kernel" + local kernel_version="$(uname -r)" + local kernel_version_stripped="$(echo ${kernel_version} | sed 's/\+//')" + + # Checkout the correct tag. + echo "Downloading kernel source at tag ${kernel_version_stripped} ..." + pushd "${KERNEL_SRC_DIR}" + # TODO: Consume KERNEL SHA1 from COS image directly. + # git checkout "tags/v${kernel_version_stripped}" + git checkout ${LAKITU_KERNEL_SHA1} + + # Prepare kernel configu and source for modules. + echo "Preparing kernel sources ..." + zcat "/proc/config.gz" > ".config" + make olddefconfig + make modules_prepare + # Done. + popd +} + +download_install_nvidia() { + local pkg_name="${NVIDIA_CUDA_PKG_NAME}" + local url="${NVIDIA_CUDA_URL}" + local log_file_name="${NVIDIA_DRIVER_DIR}/nvidia-installer.log" + + mkdir -p "${NVIDIA_DRIVER_DIR}" + pushd "${NVIDIA_DRIVER_DIR}" + + echo "Downloading Nvidia CUDA package from ${url} ..." + curl -L -s "${url}" -o "${pkg_name}" + echo "${NVIDIA_CUDA_MD5SUM} ${pkg_name}" | md5sum --check + + echo "Extracting Nvidia CUDA package ..." + sh ${pkg_name} --extract="$(pwd)" + + echo "Running the Nvidia driver installer ..." + if ! sh "${NVIDIA_DRIVER_PKG_NAME}" --kernel-source-path="${KERNEL_SRC_DIR}" --silent --accept-license --keep --log-file-name="${log_file_name}"; then + echo "Nvidia installer failed, log below:" + echo "===================================" + tail -50 "${log_file_name}" + echo "===================================" + exit 1 + fi + # Create unified memory device file. + nvidia-modprobe -c0 -u + popd +} + +unlock_loadpin_and_reboot_if_needed() { + kernel_cmdline="$(cat /proc/cmdline)" + if echo "${kernel_cmdline}" | grep -q -v "lsm.module_locking=0"; then + local -r esp_partition="/dev/sda12" + local -r mount_path="/tmp/esp" + local -r grub_cfg="efi/boot/grub.cfg" + + mkdir -p "${mount_path}" + mount "${esp_partition}" "${mount_path}" + + pushd "${mount_path}" + cp "${grub_cfg}" "${grub_cfg}.orig" + sed 's/cros_efi/cros_efi lsm.module_locking=0/g' -i "efi/boot/grub.cfg" + cat "${grub_cfg}" + popd + sync + umount "${mount_path}" + # Restart the node for loadpin to be disabled. + echo b > /sysrq + fi +} + +create_uvm_device() { + # Create unified memory device file. 
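+  # nvidia-modprobe -c0 -u loads the nvidia-uvm kernel module if needed and
+  # creates the /dev/nvidia-uvm device node that CUDA applications require.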
+ nvidia-modprobe -c0 -u +} + +verify_base_image() { + mount --bind /rootfs/etc/os-release /etc/os-release + local id="$(grep "^ID=" /etc/os-release)" + if [[ "${id#*=}" != "cos" ]]; then + echo "This installer is designed to run on Container-Optimized OS only" + exit 1 + fi +} + +setup_overlay_mounts() { + mkdir -p ${USR_WRITABLE_DIR} ${USR_WORK_DIR} ${LIB_WRITABLE_DIR} ${LIB_WORK_DIR} + mount -t overlay -o lowerdir=/usr,upperdir=${USR_WRITABLE_DIR},workdir=${USR_WORK_DIR} none /usr + mount -t overlay -o lowerdir=/lib,upperdir=${LIB_WRITABLE_DIR},workdir=${LIB_WORK_DIR} none /lib +} + +exit_if_install_not_needed() { + if nvidia-smi; then + echo "nvidia drivers already installed. Skipping installation" + post_installation_sequence + exit 0 + fi +} + +restart_kubelet() { + echo "Sending SIGTERM to kubelet" + pkill -SIGTERM kubelet +} + +# Copy user space libraries and debug utilities to a special output directory on the host. +# Make these artifacts world readable and executable. +copy_files_to_host() { + mkdir -p ${LIB_OUTPUT_DIR} ${BIN_OUTPUT_DIR} + cp -r ${USR_WRITABLE_DIR}/lib/x86_64-linux-gnu/* ${LIB_OUTPUT_DIR}/ + cp -r ${USR_WRITABLE_DIR}/bin/* ${BIN_OUTPUT_DIR}/ + chmod -R a+rx ${LIB_OUTPUT_DIR} + chmod -R a+rx ${BIN_OUTPUT_DIR} +} + +post_installation_sequence() { + create_uvm_device + # Copy nvidia user space libraries and debug tools to the host for use from other containers. + copy_files_to_host + # Restart the kubelet for it to pick up the GPU devices. + restart_kubelet +} + +main() { + # Do not run the installer unless the base image is Container Optimized OS (COS) + verify_base_image + # Do not run the installer unless a Nvidia device is found on the PCI bus + check_nvidia_device + # Setup overlay mounts to capture nvidia driver artificats in a more permanent storage on the host. + setup_overlay_mounts + # Disable a critical security feature in COS that will allow for dynamically loading Nvidia drivers + unlock_loadpin_and_reboot_if_needed + # Exit if installation is not required (for idempotency) + exit_if_install_not_needed + # Checkout kernel sources appropriate for the base image. + prepare_kernel_source + # Download, compile and install nvidia drivers. + download_install_nvidia + # Verify that the Nvidia drivers have been successfully installed. + nvidia-smi + # Perform post installation steps - copying artifacts, restarting kubelet, etc. + post_installation_sequence +} + +main "$@" diff --git a/hack/generate-bindata.sh b/hack/generate-bindata.sh index 4c843c5cdc83..99f3ec5eb261 100755 --- a/hack/generate-bindata.sh +++ b/hack/generate-bindata.sh @@ -48,7 +48,8 @@ go-bindata -nometadata -o "${BINDATA_OUTPUT}.tmp" -pkg generated \ "examples/..." \ "test/e2e/testing-manifests/..." \ "test/images/..." \ - "test/fixtures/..." + "test/fixtures/..." \ + "cluster/gce/gci/nvidia-gpus/..." 
gofmt -s -w "${BINDATA_OUTPUT}.tmp" diff --git a/test/e2e/BUILD b/test/e2e/BUILD index bb67b533f2ad..ec8df8278d8c 100644 --- a/test/e2e/BUILD +++ b/test/e2e/BUILD @@ -86,6 +86,7 @@ go_library( "networking.go", "networking_perf.go", "nodeoutofdisk.go", + "nvidia-gpus.go", "pod_gc.go", "podpreset.go", "pods.go", diff --git a/test/e2e/generated/BUILD b/test/e2e/generated/BUILD index dc3bd3861419..c8acf6747eb3 100644 --- a/test/e2e/generated/BUILD +++ b/test/e2e/generated/BUILD @@ -23,6 +23,7 @@ genrule( name = "bindata", srcs = [ "//examples:sources", + "//cluster/gce/gci/nvidia-gpus:sources", "//test/images:sources", "//test/fixtures:sources", "//test/e2e/testing-manifests:sources", diff --git a/test/e2e/nvidia-gpus.go b/test/e2e/nvidia-gpus.go new file mode 100644 index 000000000000..93b3a62fa881 --- /dev/null +++ b/test/e2e/nvidia-gpus.go @@ -0,0 +1,178 @@ +/* +Copyright 2017 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2e + +import ( + "strings" + "time" + + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/uuid" + utilyaml "k8s.io/apimachinery/pkg/util/yaml" + "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/api/v1" + extensions "k8s.io/kubernetes/pkg/apis/extensions/v1beta1" + "k8s.io/kubernetes/test/e2e/framework" + "k8s.io/kubernetes/test/e2e/generated" + + . "github.com/onsi/ginkgo" + . "github.com/onsi/gomega" +) + +const ( + testPodNamePrefix = "nvidia-gpu-" + testCUDAImage = "gcr.io/google_containers/cuda-vector-add:v0.1" + cosOSImage = "Container-Optimized OS from Google" + // Nvidia driver installation can take upwards of 5 minutes. + driverInstallTimeout = 10 * time.Minute + // Nvidia COS driver installer daemonset. 
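+	// The manifest is compiled into the test bindata (see hack/generate-bindata.sh)
+	// and read back via generated.ReadOrDie, so the path is relative to the
+	// repository root rather than the test binary's working directory.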
+ cosNvidiaDriverInstallerPath = "cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml" +) + +func makeCudaAdditionTestPod() *v1.Pod { + podName := testPodNamePrefix + string(uuid.NewUUID()) + testPod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + }, + Spec: v1.PodSpec{ + RestartPolicy: v1.RestartPolicyNever, + Containers: []v1.Container{ + { + Name: "vector-addition", + Image: testCUDAImage, + Resources: v1.ResourceRequirements{ + Limits: v1.ResourceList{ + v1.ResourceNvidiaGPU: *resource.NewQuantity(1, resource.DecimalSI), + }, + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: "nvidia-libraries", + MountPath: "/usr/local/nvidia/lib64", + }, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: "nvidia-libraries", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: "/home/kubernetes/bin/nvidia/lib", + }, + }, + }, + }, + }, + } + return testPod +} + +func isClusterRunningCOS(f *framework.Framework) bool { + nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{}) + framework.ExpectNoError(err, "getting node list") + for _, node := range nodeList.Items { + if !strings.Contains(node.Status.NodeInfo.OSImage, cosOSImage) { + return false + } + } + return true +} + +func areGPUsAvailableOnAllSchedulableNodes(f *framework.Framework) bool { + framework.Logf("Getting list of Nodes from API server") + nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{}) + framework.ExpectNoError(err, "getting node list") + for _, node := range nodeList.Items { + if node.Spec.Unschedulable { + continue + } + if node.Status.Capacity.NvidiaGPU().Value() == 0 { + framework.Logf("Nvidia GPUs not available on Node: %q", node.Name) + return false + } + } + framework.Logf("Nvidia GPUs exist on all schedulable nodes") + return true +} + +func getGPUsAvailable(f *framework.Framework) int64 { + nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{}) + framework.ExpectNoError(err, "getting node list") + var gpusAvailable int64 + for _, node := range nodeList.Items { + gpusAvailable += node.Status.Capacity.NvidiaGPU().Value() + } + return gpusAvailable +} + +func testNvidiaGPUsOnCOS(f *framework.Framework) { + // Skip the test if the base image is not COS. + // TODO: Add support for other base images. + // CUDA apps require host mounts which is not portable across base images (yet). + framework.Logf("Checking base image") + if !isClusterRunningCOS(f) { + Skip("Nvidia GPU tests are supproted only on Container Optimized OS image currently") + } + framework.Logf("Cluster is running on COS. Proceeding with test") + // GPU drivers might have already been installed. + if !areGPUsAvailableOnAllSchedulableNodes(f) { + // Install Nvidia Drivers. + ds := dsFromManifest(cosNvidiaDriverInstallerPath) + ds.Namespace = f.Namespace.Name + _, err := f.ClientSet.Extensions().DaemonSets(f.Namespace.Name).Create(ds) + framework.ExpectNoError(err, "failed to create daemonset") + framework.Logf("Successfully created daemonset to install Nvidia drivers. 
Waiting for drivers to be installed and GPUs to be available in Node Capacity...") + // Wait for Nvidia GPUs to be available on nodes + Eventually(func() bool { + return areGPUsAvailableOnAllSchedulableNodes(f) + }, driverInstallTimeout, time.Second).Should(BeTrue()) + } + framework.Logf("Creating as many pods as there are Nvidia GPUs and have the pods run a CUDA app") + podList := []*v1.Pod{} + for i := int64(0); i < getGPUsAvailable(f); i++ { + podList = append(podList, f.PodClient().Create(makeCudaAdditionTestPod())) + } + framework.Logf("Wait for all test pods to succeed") + // Wait for all pods to succeed + for _, po := range podList { + f.PodClient().WaitForSuccess(po.Name, 5*time.Minute) + } +} + +// dsFromManifest reads a .json/yaml file and returns the daemonset in it. +func dsFromManifest(fileName string) *extensions.DaemonSet { + var controller extensions.DaemonSet + framework.Logf("Parsing ds from %v", fileName) + data := generated.ReadOrDie(fileName) + + json, err := utilyaml.ToJSON(data) + Expect(err).NotTo(HaveOccurred()) + + Expect(runtime.DecodeInto(api.Codecs.UniversalDecoder(), json, &controller)).NotTo(HaveOccurred()) + return &controller +} + +var _ = framework.KubeDescribe("[Feature:GPU]", func() { + f := framework.NewDefaultFramework("gpus") + It("run Nvidia GPU tests on Container Optimized OS only", func() { + testNvidiaGPUsOnCOS(f) + }) +}) diff --git a/test/e2e_node/gpus.go b/test/e2e_node/gpus.go index 26d943da4cc4..4baa75f34d55 100644 --- a/test/e2e_node/gpus.go +++ b/test/e2e_node/gpus.go @@ -18,6 +18,7 @@ package e2e_node import ( "fmt" + "os/exec" "time" "k8s.io/apimachinery/pkg/api/resource" @@ -33,11 +34,49 @@ import ( const acceleratorsFeatureGate = "Accelerators=true" +func getGPUsAvailable(f *framework.Framework) int64 { + nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{}) + framework.ExpectNoError(err, "getting node list") + var gpusAvailable int64 + for _, node := range nodeList.Items { + gpusAvailable += node.Status.Capacity.NvidiaGPU().Value() + } + return gpusAvailable +} + +func gpusExistOnAllNodes(f *framework.Framework) bool { + nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{}) + framework.ExpectNoError(err, "getting node list") + for _, node := range nodeList.Items { + if node.Name == "kubernetes-master" { + continue + } + if node.Status.Capacity.NvidiaGPU().Value() == 0 { + return false + } + } + return true +} + +func checkIfNvidiaGPUsExistOnNode() bool { + // Cannot use `lspci` because it is not installed on all distros by default. + err := exec.Command("/bin/sh", "-c", "find /sys/devices/pci* -type f | grep vendor | xargs cat | grep 0x10de").Run() + if err != nil { + framework.Logf("check for nvidia GPUs failed. Got Error: %v", err) + return false + } + return true +} + // Serial because the test updates kubelet configuration. var _ = framework.KubeDescribe("GPU [Serial]", func() { f := framework.NewDefaultFramework("gpu-test") Context("attempt to use GPUs if available", func() { It("setup the node and create pods to test gpus", func() { + By("ensuring that Nvidia GPUs exist on the node") + if !checkIfNvidiaGPUsExistOnNode() { + Skip("Nvidia GPUs do not exist on the node. 
Skipping test.") + } By("ensuring that dynamic kubelet configuration is enabled") enabled, err := isKubeletConfigEnabled(f) framework.ExpectNoError(err) @@ -65,19 +104,11 @@ var _ = framework.KubeDescribe("GPU [Serial]", func() { } framework.ExpectNoError(setKubeletConfiguration(f, newCfg)) - By("Getting the local node object from the api server") - nodeList, err := f.ClientSet.Core().Nodes().List(metav1.ListOptions{}) - framework.ExpectNoError(err, "getting node list") - Expect(len(nodeList.Items)).To(Equal(1)) - node := nodeList.Items[0] - gpusAvailable := node.Status.Capacity.NvidiaGPU() - By("Skipping the test if GPUs aren't available") - if gpusAvailable.IsZero() { - Skip("No GPUs available on local node. Skipping test.") - } + By("Waiting for GPUs to become available on the local node") + Eventually(gpusExistOnAllNodes(f), 10*time.Minute, time.Second).Should(BeTrue()) By("Creating a pod that will consume all GPUs") - podSuccess := makePod(gpusAvailable.Value(), "gpus-success") + podSuccess := makePod(getGPUsAvailable(f), "gpus-success") podSuccess = f.PodClient().CreateSync(podSuccess) By("Checking the containers in the pod had restarted at-least twice successfully thereby ensuring GPUs are reused") diff --git a/test/e2e_node/jenkins/gci-init-gpu.yaml b/test/e2e_node/jenkins/gci-init-gpu.yaml new file mode 100644 index 000000000000..3119e8b6c41a --- /dev/null +++ b/test/e2e_node/jenkins/gci-init-gpu.yaml @@ -0,0 +1,19 @@ +#cloud-config + +runcmd: + - mount /tmp /tmp -o remount,exec,suid + - usermod -a -G docker jenkins + - mkdir -p /var/lib/kubelet + - mkdir -p /home/kubernetes/containerized_mounter/rootfs + - mount --bind /home/kubernetes/containerized_mounter/ /home/kubernetes/containerized_mounter/ + - mount -o remount, exec /home/kubernetes/containerized_mounter/ + - wget https://storage.googleapis.com/kubernetes-release/gci-mounter/mounter.tar -O /tmp/mounter.tar + - tar xvf /tmp/mounter.tar -C /home/kubernetes/containerized_mounter/rootfs + - mkdir -p /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet + - mount --rbind /var/lib/kubelet /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet + - mount --make-rshared /home/kubernetes/containerized_mounter/rootfs/var/lib/kubelet + - mount --bind /proc /home/kubernetes/containerized_mounter/rootfs/proc + - mount --bind /dev /home/kubernetes/containerized_mounter/rootfs/dev + - rm /tmp/mounter.tar + - modprobe configs + - docker run -v /dev:/dev -v /home/kubernetes/bin/nvidia:/rootfs/nvidia -v /etc/os-release:/rootfs/etc/os-release -v /proc/sysrq-trigger:/sysrq -e LAKITU_KERNEL_SHA1=2fdf6034a0fae9794d80e4d218e237771224ba8f -e BASE_DIR=/rootfs/nvidia --privileged gcr.io/google_containers/cos-nvidia-driver-install@sha256:ad83ede6e0c6d768bf7cf69a7dec972aa5e8f88778142ca46afd3286ad58cfc8 diff --git a/test/e2e_node/jenkins/image-config-serial.yaml b/test/e2e_node/jenkins/image-config-serial.yaml index 30baceb13e4d..71ccbcfd8275 100644 --- a/test/e2e_node/jenkins/image-config-serial.yaml +++ b/test/e2e_node/jenkins/image-config-serial.yaml @@ -25,4 +25,9 @@ images: gci: image_regex: gci-stable-56-9000-84-2 # docker 1.11.2 project: google-containers - metadata: "user-data Date: Sat, 20 May 2017 05:23:39 -0700 Subject: [PATCH 2/3] Update COS version to m59 Signed-off-by: Vishnu kannan --- cluster/gce/config-default.sh | 2 +- cluster/gce/config-test.sh | 2 +- cluster/gce/util.sh | 39 ++++++++++++------- cluster/kubemark/gce/config-default.sh | 2 +- .../jenkins/benchmark/benchmark-config.yaml | 6 +-- 
.../e2e_node/jenkins/image-config-serial.yaml | 4 +- test/e2e_node/jenkins/image-config.yaml | 2 +- test/kubemark/gce/util.sh | 9 ++++- 8 files changed, 41 insertions(+), 25 deletions(-) diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index 1bcf7b05c17b..12f6f8fcd77f 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -70,7 +70,7 @@ fi CVM_VERSION=${CVM_VERSION:-container-vm-v20170214} # NOTE: Update the kernel commit SHA in cluster/addons/nvidia-gpus/cos-installer-daemonset.yaml # while updating the COS version here. -GCI_VERSION=${KUBE_GCI_VERSION:-gci-stable-56-9000-84-2} +GCI_VERSION=${KUBE_GCI_VERSION:-cos-beta-59-9460-20-0} MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-} MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers} NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-${CVM_VERSION}} diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index 2ba8163799b8..bcc3c102957c 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -67,7 +67,7 @@ fi # variable. Also please update corresponding image for node e2e at: # https://github.com/kubernetes/kubernetes/blob/master/test/e2e_node/jenkins/image-config.yaml CVM_VERSION=${CVM_VERSION:-container-vm-v20170214} -GCI_VERSION=${KUBE_GCI_VERSION:-gci-stable-56-9000-84-2} +GCI_VERSION=${KUBE_GCI_VERSION:-cos-beta-59-9460-20-0} MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-} MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers} NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-${CVM_VERSION}} diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh index a6d497b73065..173c866d2118 100755 --- a/cluster/gce/util.sh +++ b/cluster/gce/util.sh @@ -38,27 +38,36 @@ else fi if [[ "${MASTER_OS_DISTRIBUTION}" == "gci" ]]; then - # If the master image is not set, we use the latest GCI image. - # Otherwise, we respect whatever is set by the user. - MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-${GCI_VERSION}} - MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers} + DEFAULT_GCI_PROJECT=google-containers + if [[ "${GCI_VERSION}" == "cos"* ]]; then + DEFAULT_GCI_PROJECT=cos-cloud + fi + MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-${DEFAULT_GCI_PROJECT}} + # If the master image is not set, we use the latest GCI image. + # Otherwise, we respect whatever is set by the user. + MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-${GCI_VERSION}} elif [[ "${MASTER_OS_DISTRIBUTION}" == "debian" ]]; then - MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-${CVM_VERSION}} - MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers} + MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-${CVM_VERSION}} + MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers} fi # Sets node image based on the specified os distro. Currently this function only # supports gci and debian. function set-node-image() { - if [[ "${NODE_OS_DISTRIBUTION}" == "gci" ]]; then - # If the node image is not set, we use the latest GCI image. - # Otherwise, we respect whatever is set by the user. - NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-${GCI_VERSION}} - NODE_IMAGE_PROJECT=${KUBE_GCE_NODE_PROJECT:-google-containers} - elif [[ "${NODE_OS_DISTRIBUTION}" == "debian" ]]; then - NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-${CVM_VERSION}} - NODE_IMAGE_PROJECT=${KUBE_GCE_NODE_PROJECT:-google-containers} - fi + if [[ "${NODE_OS_DISTRIBUTION}" == "gci" ]]; then + DEFAULT_GCI_PROJECT=google-containers + if [[ "${GCI_VERSION}" == "cos"* ]]; then + DEFAULT_GCI_PROJECT=cos-cloud + fi + + # If the node image is not set, we use the latest GCI image. 
+ # Otherwise, we respect whatever is set by the user. + NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-${GCI_VERSION}} + NODE_IMAGE_PROJECT=${KUBE_GCE_NODE_PROJECT:-${DEFAULT_GCI_PROJECT}} + elif [[ "${NODE_OS_DISTRIBUTION}" == "debian" ]]; then + NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-${CVM_VERSION}} + NODE_IMAGE_PROJECT=${KUBE_GCE_NODE_PROJECT:-google-containers} + fi } set-node-image diff --git a/cluster/kubemark/gce/config-default.sh b/cluster/kubemark/gce/config-default.sh index 8aee0259322c..1fc342e91853 100644 --- a/cluster/kubemark/gce/config-default.sh +++ b/cluster/kubemark/gce/config-default.sh @@ -36,7 +36,7 @@ PREEMPTIBLE_NODE=${PREEMPTIBLE_NODE:-false} MASTER_OS_DISTRIBUTION=${KUBE_MASTER_OS_DISTRIBUTION:-gci} NODE_OS_DISTRIBUTION=${KUBE_NODE_OS_DISTRIBUTION:-debian} -MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-gci-stable-56-9000-84-2} +MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-cos-beta-59-9460-20-0} MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers} NETWORK=${KUBE_GCE_NETWORK:-default} diff --git a/test/e2e_node/jenkins/benchmark/benchmark-config.yaml b/test/e2e_node/jenkins/benchmark/benchmark-config.yaml index 883f4247e311..95d1ee2f2815 100644 --- a/test/e2e_node/jenkins/benchmark/benchmark-config.yaml +++ b/test/e2e_node/jenkins/benchmark/benchmark-config.yaml @@ -49,21 +49,21 @@ images: tests: - 'resource tracking for 105 pods per node \[Benchmark\]' gci-resource1: - image: gci-stable-56-9000-84-2 + image: cos-beta-59-9460-20-0 project: google-containers machine: n1-standard-1 metadata: "user-data Date: Sat, 20 May 2017 21:21:23 -0700 Subject: [PATCH 3/3] update default project to cos-cloud in gce configs Signed-off-by: Vishnu kannan --- cluster/gce/config-default.sh | 4 ++-- cluster/gce/config-test.sh | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index 12f6f8fcd77f..aee2134ef910 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -72,9 +72,9 @@ CVM_VERSION=${CVM_VERSION:-container-vm-v20170214} # while updating the COS version here. GCI_VERSION=${KUBE_GCI_VERSION:-cos-beta-59-9460-20-0} MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-} -MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers} +MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-cos-cloud} NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-${CVM_VERSION}} -NODE_IMAGE_PROJECT=${KUBE_GCE_NODE_PROJECT:-google-containers} +NODE_IMAGE_PROJECT=${KUBE_GCE_NODE_PROJECT:-cos-cloud} CONTAINER_RUNTIME=${KUBE_CONTAINER_RUNTIME:-docker} RKT_VERSION=${KUBE_RKT_VERSION:-1.23.0} RKT_STAGE1_IMAGE=${KUBE_RKT_STAGE1_IMAGE:-coreos.com/rkt/stage1-coreos} diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index bcc3c102957c..a87f5fe110b4 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -69,9 +69,9 @@ fi CVM_VERSION=${CVM_VERSION:-container-vm-v20170214} GCI_VERSION=${KUBE_GCI_VERSION:-cos-beta-59-9460-20-0} MASTER_IMAGE=${KUBE_GCE_MASTER_IMAGE:-} -MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-google-containers} +MASTER_IMAGE_PROJECT=${KUBE_GCE_MASTER_PROJECT:-cos-cloud} NODE_IMAGE=${KUBE_GCE_NODE_IMAGE:-${CVM_VERSION}} -NODE_IMAGE_PROJECT=${KUBE_GCE_NODE_PROJECT:-google-containers} +NODE_IMAGE_PROJECT=${KUBE_GCE_NODE_PROJECT:-cos-cloud} CONTAINER_RUNTIME=${KUBE_CONTAINER_RUNTIME:-docker} GCI_DOCKER_VERSION=${KUBE_GCI_DOCKER_VERSION:-} RKT_VERSION=${KUBE_RKT_VERSION:-1.23.0}
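
Manual verification sketch (supplementary to the patches above; the pod and file
names are illustrative): on a GCE cluster whose nodes run the COS image
configured here, the installer daemonset and the sample CUDA image can be
exercised directly with kubectl. The pod spec mirrors the hostPath mount used by
the cluster e2e test in test/e2e/nvidia-gpus.go.

  # Deploy the driver installer; the manifest targets the kube-system namespace.
  kubectl create -f cluster/gce/gci/nvidia-gpus/cos-installer-daemonset.yaml

  # Once installation finishes, GPUs should show up in node capacity.
  kubectl get nodes -o yaml | grep "alpha.kubernetes.io/nvidia-gpu"

A minimal pod that consumes one GPU and runs the vector-add sample to completion
(save as cuda-vector-add.yaml, then `kubectl create -f cuda-vector-add.yaml`):

  apiVersion: v1
  kind: Pod
  metadata:
    name: cuda-vector-add
  spec:
    restartPolicy: Never
    containers:
    - name: vector-addition
      image: gcr.io/google_containers/cuda-vector-add:v0.1
      resources:
        limits:
          alpha.kubernetes.io/nvidia-gpu: 1
      volumeMounts:
      - name: nvidia-libraries
        mountPath: /usr/local/nvidia/lib64
    volumes:
    - name: nvidia-libraries
      hostPath:
        path: /home/kubernetes/bin/nvidia/lib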