/
02-nvidia-docker.sh
executable file
·91 lines (78 loc) · 3.48 KB
/
02-nvidia-docker.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# Copyright 2017 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/bin/bash
# Strict mode: abort on any failing command, unset variable, or failing
# pipeline stage; trace every command as it runs.
set -euo pipefail
set -x
#################################################
# Install nvidia-docker2
# This section is somewhat adapted from README at:
# https://github.com/NVIDIA/nvidia-docker
#######################################
# Cleanup old nvidia-docker
# If nvidia-docker 1.0 is installed, it and all existing GPU containers must
# be removed first.
#   stage 1: list the volumes created by the nvidia-docker volume driver
#   stage 2: for each volume, list the containers (running or not) using it
#   stage 3: force-remove those containers
# `xargs -r` skips a stage entirely when its input is empty. `-I{}` already
# runs one command per input line, so the conflicting `-n1` is not passed
# (GNU xargs treats -I and -n as mutually exclusive).
docker volume ls -q -f driver=nvidia-docker \
  | xargs -r -I{} docker ps -q -a -f volume={} \
  | xargs -r docker rm -f
# Remove the old nvidia-docker package if it exists. `|| true` keeps `set -e`
# from aborting the script when apt-get reports an error here (for example,
# when the package is unknown to apt).
apt-get purge -y nvidia-docker || true
#######################################
# Add package repositories
#
# Every download uses `curl -fsSL`:
#   -f  exit non-zero on HTTP errors instead of emitting the error page, so a
#       404/500 body is never fed to apt-key or written into a sources list
#   -sS stay quiet but still print an error message on failure
#   -L  follow redirects
# Combined with `set -o pipefail` at the top of the script, a failed download
# aborts the script.

# Add the package repository for docker-ce
curl -fsSL https://download.docker.com/linux/debian/gpg \
  | apt-key add -
echo 'deb [arch=amd64] https://download.docker.com/linux/debian stretch stable' \
  | tee /etc/apt/sources.list.d/docker-ce.list

# Add the package repository for nvidia-docker
curl -fsSL https://nvidia.github.io/nvidia-docker/gpgkey \
  | apt-key add -
# Build the distribution tag from ID and VERSION_ID in /etc/os-release.
# The file is sourced in the command-substitution subshell so its variables
# do not leak into this script.
distribution=$(. /etc/os-release; echo "${ID}${VERSION_ID}")
curl -fsSL "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list" \
  | tee /etc/apt/sources.list.d/nvidia-docker.list
# Override the default runtime with the one from nvidia
# Also explicitly set the storage-driver to the prior 'overlay'
#
# Make sure the target directory exists first: the redirection below fails if
# /etc/docker is missing, and docker-ce is only (re)installed later in this
# script.
mkdir -p /etc/docker
# The quoted 'EOF' delimiter writes the JSON literally (no shell expansion).
# Any existing daemon.json is overwritten.
cat << 'EOF' > /etc/docker/daemon.json
{
"default-runtime": "nvidia",
"runtimes": {
"nvidia": {
"path": "/usr/bin/nvidia-container-runtime",
"runtimeArgs": []
}
},
"storage-driver": "overlay"
}
EOF
# Install nvidia-docker2 together with a matching docker-ce, with the node
# services stopped for the duration of the install.

# Refresh the package index so the repositories configured earlier are used.
apt-get -y update

# Stop protokube first so that it does not bring kubelet back up.
systemctl stop protokube
# Then stop kubelet so that it does not restart stopped containers and leak
# them as orphan containers.
systemctl stop kubelet

# Pinned versions: the nvidia-docker version must match the docker-ce version.
# See https://github.com/NVIDIA/nvidia-docker/wiki/Frequently-Asked-Questions#how-do-i-install-20-if-im-not-using-the-latest-docker-version
pinned_packages=(
  'nvidia-docker2=2.0.3+docker18.09.4-1'
  'nvidia-container-runtime=2.0.0+docker18.09.4-1'
  'docker-ce=5:18.09.4~3-0~debian-stretch'
)
# --force-confold prevents the prompt for replacement of daemon.json;
# --allow-downgrades lets apt move to the pinned versions even when a newer
# one is installed.
apt-get install -y --allow-downgrades -o Dpkg::Options::="--force-confold" \
  "${pinned_packages[@]}"

# Disable a few things that break docker-ce/gpu support upon reboot:
# Upon boot, the kops-configuration.service systemd unit sets up and starts
# the cloud-init.service which runs nodeup which forces docker-ce to a
# specific version that is a downgrade and incompatible with nvidia-docker2.
# Permanently disable these systemd units via masking.
for unit in cloud-init.service kops-configuration.service; do
  systemctl mask "${unit}"
done

# Bring protokube back; it is expected to start kubelet again.
systemctl start protokube
# protokube does not appear to bring kubelet up, so start kubelet explicitly.
systemctl start kubelet