-
Notifications
You must be signed in to change notification settings - Fork 38.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add options for mounting SCSI or NVMe local SSD though Block or Filesystem and do all of that with UUID #53466
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,6 +25,9 @@ set -o errexit | |
set -o nounset | ||
set -o pipefail | ||
|
||
readonly UUID_MNT_PREFIX="/mnt/disks/by-uuid/google-local-ssds" | ||
readonly UUID_BLOCK_PREFIX="/dev/disk/by-uuid/google-local-ssds" | ||
|
||
function setup-os-params { | ||
# Reset core_pattern. On GCI, the default core_pattern pipes the core dumps to | ||
# /sbin/crash_reporter which is more restrictive in saving crash dumps. So for | ||
|
@@ -85,11 +88,85 @@ function create-dirs { | |
fi | ||
} | ||
|
||
# Formats the given device ($1) if needed and mounts it at given mount point | ||
# Gets the total number of $(1) and $(2) type disks specified | ||
# by the user in ${NODE_LOCAL_SSDS_EXT} | ||
function get-local-disk-num() { | ||
local interface="${1}" | ||
local format="${2}" | ||
|
||
localdisknum=0 | ||
if [[ ! -z "${NODE_LOCAL_SSDS_EXT}" ]]; then | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. add a :- to this, e.g. "${NODE_LOCAL_SSDS_EXT:-}" |
||
IFS=";" read -r -a ssdgroups <<< "${NODE_LOCAL_SSDS_EXT}" | ||
for ssdgroup in "${ssdgroups[@]}"; do | ||
IFS="," read -r -a ssdopts <<< "${ssdgroup}" | ||
local opnum="${ssdopts[0]}" | ||
local opinterface="${ssdopts[1]}" | ||
local opformat="${ssdopts[2]}" | ||
|
||
if [[ "${opformat,,}" == "${format,,}" && "${opinterface,,}" == "${interface,,}" ]]; then | ||
localdisknum=$((localdisknum+opnum)) | ||
fi | ||
done | ||
fi | ||
} | ||
|
||
# Creates a symlink for a ($1) so that it may be used as block storage | ||
function safe-block-symlink(){ | ||
local device="${1}" | ||
local symdir="${2}" | ||
|
||
mkdir -p "${symdir}" | ||
|
||
get-or-generate-uuid "${device}" | ||
local myuuid="${retuuid}" | ||
|
||
local sym="${symdir}/local-ssd-${myuuid}" | ||
# Do not "mkdir -p ${sym}" as that will cause unintended symlink behavior | ||
ln -s "${device}" "${sym}" | ||
echo "Created a symlink for SSD $ssd at ${sym}" | ||
chmod a+w "${sym}" | ||
} | ||
|
||
# Gets a pregenerated UUID from ${ssdmap} if it exists, otherwise generates a new | ||
# UUID and places it inside ${ssdmap} | ||
function get-or-generate-uuid(){ | ||
local device="${1}" | ||
|
||
local ssdmap="/home/kubernetes/localssdmap.txt" | ||
echo "Generating or getting UUID from ${ssdmap}" | ||
|
||
if [[ ! -e "${ssdmap}" ]]; then | ||
touch "${ssdmap}" | ||
chmod +w "${ssdmap}" | ||
fi | ||
|
||
# each line of the ssdmap looks like "${device} persistent-uuid" | ||
if [[ ! -z $(grep ${device} ${ssdmap}) ]]; then | ||
#create symlink based on saved uuid | ||
local myuuid=$(grep ${device} ${ssdmap} | cut -d ' ' -f 2) | ||
else | ||
# generate new uuid and add it to the map | ||
local myuuid=$(uuidgen) | ||
if [[ ! ${?} -eq 0 ]]; then | ||
echo "Failed to generate valid UUID with uuidgen" >&2 | ||
exit 2 | ||
fi | ||
echo "${device} ${myuuid}" >> "${ssdmap}" | ||
fi | ||
|
||
if [[ -z "${myuuid}" ]]; then | ||
echo "Failed to get a uuid for device ${device} when symlinking." >&2 | ||
exit 2 | ||
fi | ||
|
||
retuuid="${myuuid}" | ||
} | ||
|
||
#Formats the given device ($1) if needed and mounts it at given mount point | ||
# ($2). | ||
function safe-format-and-mount() { | ||
device=$1 | ||
mountpoint=$2 | ||
local device="${1}" | ||
local mountpoint="${2}" | ||
|
||
# Format only if the disk is not already formatted. | ||
if ! tune2fs -l "${device}" ; then | ||
|
@@ -102,18 +179,135 @@ function safe-format-and-mount() { | |
mount -o discard,defaults "${device}" "${mountpoint}" | ||
} | ||
|
||
# Local ssds, if present, are mounted at /mnt/disks/ssdN. | ||
# Gets a devices UUID and bind mounts the device to mount location in | ||
# /mnt/disks/by-id/ | ||
function unique-uuid-bind-mount(){ | ||
local mountpoint="${1}" | ||
local actual_device="${2}" | ||
|
||
# Trigger udev refresh so that newly formatted devices are propagated in by-uuid | ||
udevadm control --reload-rules | ||
udevadm trigger | ||
udevadm settle | ||
|
||
# grep the exact match of actual device, prevents substring matching | ||
local myuuid=$(ls -l /dev/disk/by-uuid/ | grep "/${actual_device}$" | tr -s ' ' | cut -d ' ' -f 9) | ||
# myuuid should be the uuid of the device as found in /dev/disk/by-uuid/ | ||
if [[ -z "${myuuid}" ]]; then | ||
echo "Failed to get a uuid for device ${actual_device} when mounting." >&2 | ||
exit 2 | ||
fi | ||
|
||
# bindpoint should be the full path of the to-be-bound device | ||
local bindpoint="${UUID_MNT_PREFIX}-${interface}-fs/local-ssd-${myuuid}" | ||
|
||
safe-bind-mount "${mountpoint}" "${bindpoint}" | ||
} | ||
|
||
# Bind mounts device at mountpoint to bindpoint | ||
function safe-bind-mount(){ | ||
local mountpoint="${1}" | ||
local bindpoint="${2}" | ||
|
||
# Mount device to the mountpoint | ||
mkdir -p "${bindpoint}" | ||
echo "Binding '${mountpoint}' at '${bindpoint}'" | ||
mount --bind "${mountpoint}" "${bindpoint}" | ||
chmod a+w "${bindpoint}" | ||
} | ||
|
||
|
||
# Mounts, bindmounts, or symlinks depending on the interface and format | ||
# of the incoming device | ||
function mount-ext(){ | ||
local ssd="${1}" | ||
local devicenum="${2}" | ||
local interface="${3}" | ||
local format="${4}" | ||
|
||
|
||
if [[ -z "${devicenum}" ]]; then | ||
echo "Failed to get the local disk number for device ${ssd}" >&2 | ||
exit 2 | ||
fi | ||
|
||
# TODO: Handle partitioned disks. Right now this code just ignores partitions | ||
if [[ "${format}" == "fs" ]]; then | ||
if [[ "${interface}" == "scsi" ]]; then | ||
local actual_device=$(readlink -f "${ssd}" | cut -d '/' -f 3) | ||
# Error checking | ||
if [[ "${actual_device}" != sd* ]]; then | ||
echo "'actual_device' is not of the correct format. It must be the kernel name of the device, got ${actual_device} instead" >&2 | ||
exit 1 | ||
fi | ||
local mountpoint="/mnt/disks/ssd${devicenum}" | ||
else | ||
# This path is required because the existing Google images do not | ||
# expose NVMe devices in /dev/disk/by-id so we are using the /dev/nvme instead | ||
local actual_device=$(echo ${ssd} | cut -d '/' -f 3) | ||
# Error checking | ||
if [[ "${actual_device}" != nvme* ]]; then | ||
echo "'actual_device' is not of the correct format. It must be the kernel name of the device, got ${actual_device} instead" >&2 | ||
exit 1 | ||
fi | ||
local mountpoint="/mnt/disks/ssd-nvme${devicenum}" | ||
fi | ||
|
||
safe-format-and-mount "${ssd}" "${mountpoint}" | ||
# We only do the bindmount if users are using the new local ssd request method | ||
# see https://github.com/kubernetes/kubernetes/pull/53466#discussion_r146431894 | ||
if [[ ! -z "${NODE_LOCAL_SSDS_EXT}" ]]; then | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. :- here as well |
||
unique-uuid-bind-mount "${mountpoint}" "${actual_device}" | ||
fi | ||
elif [[ "${format}" == "block" ]]; then | ||
local symdir="${UUID_BLOCK_PREFIX}-${interface}-block" | ||
safe-block-symlink "${ssd}" "${symdir}" | ||
else | ||
echo "Disk format must be either fs or block, got ${format}" | ||
fi | ||
} | ||
|
||
# Local ssds, if present, are mounted or symlinked to their appropriate | ||
# locations | ||
function ensure-local-ssds() { | ||
get-local-disk-num "scsi" "block" | ||
local scsiblocknum="${localdisknum}" | ||
local i=0 | ||
for ssd in /dev/disk/by-id/google-local-ssd-*; do | ||
if [ -e "${ssd}" ]; then | ||
ssdnum=`echo ${ssd} | sed -e 's/\/dev\/disk\/by-id\/google-local-ssd-\([0-9]*\)/\1/'` | ||
ssdmount="/mnt/disks/ssd${ssdnum}/" | ||
mkdir -p ${ssdmount} | ||
safe-format-and-mount "${ssd}" ${ssdmount} | ||
echo "Mounted local SSD $ssd at ${ssdmount}" | ||
chmod a+w ${ssdmount} | ||
local devicenum=`echo ${ssd} | sed -e 's/\/dev\/disk\/by-id\/google-local-ssd-\([0-9]*\)/\1/'` | ||
if [[ "${i}" -lt "${scsiblocknum}" ]]; then | ||
mount-ext "${ssd}" "${devicenum}" "scsi" "block" | ||
else | ||
# GKE does not set NODE_LOCAL_SSDS so all non-block devices | ||
# are assumed to be filesystem devices | ||
mount-ext "${ssd}" "${devicenum}" "scsi" "fs" | ||
fi | ||
i=$((i+1)) | ||
else | ||
echo "No local SCSI SSD disks found." | ||
fi | ||
done | ||
|
||
# The following mounts or symlinks NVMe devices | ||
get-local-disk-num "nvme" "block" | ||
local nvmeblocknum="${localdisknum}" | ||
local i=0 | ||
for ssd in /dev/nvme*; do | ||
if [ -e "${ssd}" ]; then | ||
# This workaround to find if the NVMe device is a disk is required because | ||
# the existing Google images does not expose NVMe devices in /dev/disk/by-id | ||
if [[ `udevadm info --query=property --name=${ssd} | grep DEVTYPE | sed "s/DEVTYPE=//"` == "disk" ]]; then | ||
local devicenum=`echo ${ssd} | sed -e 's/\/dev\/nvme0n\([0-9]*\)/\1/'` | ||
if [[ "${i}" -lt "${nvmeblocknum}" ]]; then | ||
mount-ext "${ssd}" "${devicenum}" "nvme" "block" | ||
else | ||
mount-ext "${ssd}" "${devicenum}" "nvme" "fs" | ||
fi | ||
i=$((i+1)) | ||
fi | ||
else | ||
echo "No local SSD disks found." | ||
echo "No local NVMe SSD disks found." | ||
fi | ||
done | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,6 +18,8 @@ | |
|
||
# Use the config file specified in $KUBE_CONFIG_FILE, or default to | ||
# config-default.sh. | ||
readonly GCE_MAX_LOCAL_SSD=8 | ||
|
||
KUBE_ROOT=$(dirname "${BASH_SOURCE}")/../.. | ||
source "${KUBE_ROOT}/cluster/gce/${KUBE_CONFIG_FILE-"config-default.sh"}" | ||
source "${KUBE_ROOT}/cluster/common.sh" | ||
|
@@ -37,6 +39,11 @@ else | |
exit 1 | ||
fi | ||
|
||
if [[ ${NODE_LOCAL_SSDS} -ge 1 ]] && [[ ! -z ${NODE_LOCAL_SSDS_EXT} ]] ; then | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. :- here as well |
||
echo -e "${color_red}Local SSD: Only one of NODE_LOCAL_SSDS and NODE_LOCAL_SSDS_EXT can be specified at once${color_norm}" >&2 | ||
exit 2 | ||
fi | ||
|
||
if [[ "${MASTER_OS_DISTRIBUTION}" == "gci" ]]; then | ||
DEFAULT_GCI_PROJECT=google-containers | ||
if [[ "${GCI_VERSION}" == "cos"* ]]; then | ||
|
@@ -546,6 +553,29 @@ function get-template-name-from-version() { | |
echo "${NODE_INSTANCE_PREFIX}-template-${1}" | cut -c 1-63 | sed 's/[\.\+]/-/g;s/-*$//g' | ||
} | ||
|
||
# validates the NODE_LOCAL_SSDS_EXT variable | ||
function validate-node-local-ssds-ext(){ | ||
ssdopts="${1}" | ||
|
||
if [[ -z "${ssdopts[0]}" || -z "${ssdopts[1]}" || -z "${ssdopts[2]}" ]]; then | ||
echo -e "${color_red}Local SSD: NODE_LOCAL_SSDS_EXT is malformed, found ${ssdopts[0]-_},${ssdopts[1]-_},${ssdopts[2]-_} ${color_norm}" >&2 | ||
exit 2 | ||
fi | ||
if [[ "${ssdopts[1]}" != "scsi" && "${ssdopts[1]}" != "nvme" ]]; then | ||
echo -e "${color_red}Local SSD: Interface must be scsi or nvme, found: ${ssdopts[1]} ${color_norm}" >&2 | ||
exit 2 | ||
fi | ||
if [[ "${ssdopts[2]}" != "fs" && "${ssdopts[2]}" != "block" ]]; then | ||
echo -e "${color_red}Local SSD: Filesystem type must be fs or block, found: ${ssdopts[2]} ${color_norm}" >&2 | ||
exit 2 | ||
fi | ||
local_ssd_ext_count=$((local_ssd_ext_count+ssdopts[0])) | ||
if [[ "${local_ssd_ext_count}" -gt "${GCE_MAX_LOCAL_SSD}" || "${local_ssd_ext_count}" -lt 1 ]]; then | ||
echo -e "${color_red}Local SSD: Total number of local ssds must range from 1 to 8, found: ${local_ssd_ext_count} ${color_norm}" >&2 | ||
exit 2 | ||
fi | ||
} | ||
|
||
# Robustly try to create an instance template. | ||
# $1: The name of the instance template. | ||
# $2: The scopes flag. | ||
|
@@ -587,6 +617,19 @@ function create-node-template() { | |
fi | ||
|
||
local local_ssds="" | ||
local_ssd_ext_count=0 | ||
if [[ ! -z ${NODE_LOCAL_SSDS_EXT-""} ]]; then | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. instead of |
||
IFS=";" read -r -a ssdgroups <<< ${NODE_LOCAL_SSDS_EXT} | ||
for ssdgroup in "${ssdgroups[@]}" | ||
do | ||
IFS="," read -r -a ssdopts <<< ${ssdgroup} | ||
validate-node-local-ssds-ext ${ssdopts} | ||
for i in $(seq ${ssdopts[0]}); do | ||
local_ssds="$local_ssds--local-ssd=interface=${ssdopts[1]} " | ||
done | ||
done | ||
fi | ||
|
||
if [[ ! -z ${NODE_LOCAL_SSDS+x} ]]; then | ||
# The NODE_LOCAL_SSDS check below fixes issue #49171 | ||
# Some versions of seq will count down from 1 if "seq 0" is specified | ||
|
@@ -596,6 +639,7 @@ function create-node-template() { | |
done | ||
fi | ||
fi | ||
|
||
|
||
local network=$(make-gcloud-network-argument \ | ||
"${NETWORK_PROJECT}" \ | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if it's because on gce it's defaulted to "", but on GKE it's unset.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I found this error in the master logs:
We keep running into this every time a new var is added, because GKE does not use
config-default.sh
at all, andconfigure-helper.sh
usesnounset
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@enisoc how has this been fixed in the past? I was just going to add a 'NODE_LOCAL_SSDS_EXT=${NODE_LOCAL_SSDS_EXT:-""}' to configure-helper.sh but it feels like maybe there might be a better solution