Skip to content

Commit

Permalink
redhat: add udev/systemd/etc infrastructure bits
Browse files Browse the repository at this point in the history
Red Hat has been shipping an "rdma" package, as well as its own systemd
unit files for some daemons for a while now, in both Fedora and Red Hat
Enterprise Linux. Some of these are fairly RH-specific, but might be of
use to others, so we'd like to move them into the upstream source tree.

Most of these were authored by Doug Ledford, though I'm currently the one
that maintains (most of) them in RHEL.

CC: Doug Ledford <dledford@redhat.com>
Signed-off-by: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Jason Gunthorpe <jgunthorpe@obsidianresearch.com>
  • Loading branch information
jarodwilson authored and jgunthorpe committed Oct 27, 2016
1 parent 8df5873 commit 39fa824
Show file tree
Hide file tree
Showing 19 changed files with 1,367 additions and 0 deletions.
12 changes: 12 additions & 0 deletions redhat/ibacm.service
@@ -0,0 +1,12 @@
[Unit]
Description=Starts the InfiniBand Address Cache Manager daemon
Documentation=man:ibacm
Requires=rdma.service
After=rdma.service opensm.service

[Service]
Type=forking
ExecStart=/usr/sbin/ibacm

[Install]
WantedBy=network.target
25 changes: 25 additions & 0 deletions redhat/rdma.conf
@@ -0,0 +1,25 @@
# Load IPoIB
IPOIB_LOAD=yes
# Load SRP (SCSI Remote Protocol initiator support) module
SRP_LOAD=yes
# Load SRPT (SCSI Remote Protocol target support) module
SRPT_LOAD=yes
# Load iSER (iSCSI over RDMA initiator support) module
ISER_LOAD=yes
# Load iSERT (iSCSI over RDMA target support) module
ISERT_LOAD=yes
# Load RDS (Reliable Datagram Service) network protocol
RDS_LOAD=no
# Load NFSoRDMA client transport module
XPRTRDMA_LOAD=yes
# Load NFSoRDMA server transport module
SVCRDMA_LOAD=no
# Load Tech Preview device driver modules
TECH_PREVIEW_LOAD=no
# Should we modify the system MTRR registers? This may be needed if you
# get messages from the ib_ipath driver saying that it couldn't enable
# write combining for the PIO buffers on the card.
#
# Note: recent kernels should do this for us, but in case they don't, this
# option is kept available.
FIXUP_MTRR_REGS=no
1 change: 1 addition & 0 deletions redhat/rdma.cxgb3.sys.modprobe
@@ -0,0 +1 @@
install cxgb3 /sbin/modprobe --ignore-install cxgb3 $CMDLINE_OPTS && /sbin/modprobe iw_cxgb3
1 change: 1 addition & 0 deletions redhat/rdma.cxgb4.sys.modprobe
@@ -0,0 +1 @@
install cxgb4 /sbin/modprobe --ignore-install cxgb4 $CMDLINE_OPTS && /sbin/modprobe iw_cxgb4
160 changes: 160 additions & 0 deletions redhat/rdma.fixup-mtrr.awk
@@ -0,0 +1,160 @@
# This is a simple script that checks the contents of /proc/mtrr to see if
# the BIOS maker for the computer took the easy way out in terms of
# specifying memory regions when there is a hole below 4GB for PCI access
# and the machine has 4GB or more of RAM. When the contents of /proc/mtrr
# show a 4GB mapping of write-back cached RAM, minus punch out hole(s) of
# uncacheable regions (the area reserved for PCI access), then it becomes
# impossible for the ib_ipath driver to set write_combining on its PIO
# buffers. To correct the problem, remap the lower memory region in various
# chunks up to the start of the punch out hole(s), then delete the punch out
# hole(s) entirely as they aren't needed any more. That way, ib_ipath will
# be able to set write_combining on its PIO memory access region.

# Start with an empty register table; regs counts the /proc/mtrr entries
# parsed so far and doubles as the next free index in the per-register arrays.
BEGIN { regs = 0 }

# Decide whether register table entry `mem` is usable as the base memory
# region. The entry is logged to stderr, then accepted (1) only when all of
# the following hold, otherwise rejected (0):
#   - it covers at least 512MB,
#   - it is typed "write-back",
#   - it starts below the 4GB boundary.
function check_base(mem)
{
	printf "Base memory data: base=0x%08x, size=0x%x\n", base[mem], size[mem] > "/dev/stderr"
	return (size[mem] >= (512 * 1024 * 1024) &&
		type[mem] == "write-back" &&
		base[mem] < (4 * 1024 * 1024 * 1024))
}

# Decide whether register table entry `hole` looks like a punch-out hole.
# The entry is logged to stderr, then accepted (1) only when all of the
# following hold, otherwise rejected (0):
#   - it is no larger than 1GB,
#   - it is typed "uncachable",
#   - it ends at or below the 4GB boundary.
function check_hole(hole)
{
	printf "Hole data: base=0x%08x, size=0x%x\n", base[hole], size[hole] > "/dev/stderr"
	return (size[hole] <= (1 * 1024 * 1024 * 1024) &&
		type[hole] == "uncachable" &&
		(base[hole] + size[hole]) <= (4 * 1024 * 1024 * 1024))
}

# Cover the address range [start, end) with new mtrr entries and write each
# one to /proc/mtrr (echoing it to stderr as well).
#
# Parameters:
#   start, end - byte addresses delimiting the range to cover
#   new_base, new_size, tmp_base - awk-style local variables; callers do not
#                                  pass them
# Globals read:
#   type[mem] - the memory type given to every entry; `mem` is the index of
#               the base memory region chosen in the END rule
#
# An empty or inverted range (start >= end) produces no entries. Previously
# the do/while form ran its body once in that case and wrote a bogus
# half-page entry at `start`.
function build_entries(start, end, new_base, new_size, tmp_base)
{
	# mtrr registers require alignment of blocks, so a 256MB chunk must
	# be 256MB aligned. Additionally, all blocks must be a power of 2
	# in size. So, do the largest power of two size that we can and
	# still have start + block <= end, rinse and repeat.
	tmp_base = start
	while (tmp_base < end) {
		new_base = tmp_base
		new_size = 4096
		# Grow the block while it still fits and stays aligned...
		while (((new_base + new_size) < end) &&
		       ((new_base % new_size) == 0))
			new_size = lshift(new_size, 1)
		# ...and step back once if the last doubling overshot the end
		# of the range or broke alignment.
		if (((new_base + new_size) > end) ||
		    ((new_base % new_size) != 0))
			new_size = rshift(new_size, 1)
		printf "base=0x%x size=0x%x type=%s\n",
			new_base, new_size, type[mem] > "/dev/stderr"
		printf "base=0x%x size=0x%x type=%s\n",
			new_base, new_size, type[mem] > "/proc/mtrr"
		# Push each request out immediately so entries are applied
		# one at a time, in order.
		fflush("")
		tmp_base = new_base + new_size
	}
}

# Per-line rule: parse one /proc/mtrr line into the per-register arrays.
#
# Two line layouts are handled:
#   reg00: base=0x00000000 (   0MB), size=2048MB: write-back, count=1
#   reg00: base=0x000000000 (    0MB), size= 2048MB, count=1: write-back
# The substitutions below reduce either one to four whitespace-separated
# fields: "<reg> <base> <size><unit> <type>".
#
# Only the ", count=N" token itself is removed. Removing everything from
# ", count=" to end of line also discarded the type in the second layout,
# leaving type[] empty so that no base region was ever accepted.
{
	gsub("^reg", "")
	gsub(": base=", " ")
	# Drop the human-readable "(...MB)" copy of the base address. The
	# closing parenthesis is written as a bracket expression so that the
	# pattern does not depend on how an unmatched ")" is interpreted.
	gsub(" [(].*[)], size=", " ")
	gsub(": ", " ")
	gsub(", count=[0-9]+", "")
	register[regs] = strtonum($1)
	base[regs] = strtonum($2)
	# strtonum() takes the leading number and ignores the unit suffix.
	size[regs] = strtonum($3)
	human_size[regs] = size[regs]
	# Convert the size to bytes; anything not marked MB is taken as KB.
	if (match($3, "MB")) { size[regs] *= 1024*1024; mult[regs] = "MB" }
	else { size[regs] *= 1024; mult[regs] = "KB" }
	type[regs] = $4
	enabled[regs] = 1
	end[regs] = base[regs] + size[regs]
	regs++
}

# END rule: with every register parsed, locate the base memory region and
# the punch-out holes inside it, disable them all, then re-create the base
# region as a series of entries that skip the holes.
#
# Exit status: 0 when the registers were rewritten, 1 when nothing was
# changed (no usable base region, or no holes found).
END {
# First we need to find our base memory region. We only care about
# the memory register that starts at base 0. This is the only one
# that we can reliably know is our global memory region, and the
# only one that we can reliably check against overlaps. It's entirely
# possible that any memory region not starting at 0 and having an
# overlap with another memory region is in fact intentional and we
# shouldn't touch it.
for(i=0; i<regs; i++)
if (base[i] == 0)
break
# Did we get a valid base register?
if (i == regs)
exit 1
mem = i
if (!check_base(mem))
exit 1

# Collect every other register that starts inside the base region and
# passes check_hole(); holes[] stores their indexes into the register arrays.
cur_hole = 0
for(i=0; i<regs; i++) {
if (i == mem)
continue
if (base[i] < end[mem] && check_hole(i))
holes[cur_hole++] = i
}
if (cur_hole == 0) {
print "Nothing to do" > "/dev/stderr"
exit 1
}
printf "Found %d punch-out holes\n", cur_hole > "/dev/stderr"

# We need to sort the holes according to base address
# (simple in-place exchange sort, ascending by base)
for(j = 0; j < cur_hole - 1; j++) {
for(i = cur_hole - 1; i > j; i--) {
if(base[holes[i]] < base[holes[i-1]]) {
tmp = holes[i]
holes[i] = holes[i-1]
holes[i-1] = tmp
}
}
}
# OK, the common case would be that the BIOS is mapping holes out
# of the 4GB memory range, and that our hole(s) are consecutive and
# that our holes and our memory region end at the same place. However,
# things like machines with 8GB of RAM or more can foul up these
# common traits.
#
# So, our modus operandi is to disable all of the memory/hole regions
# to start, then build new base memory zones that in the end add
# up to the same as our original zone minus the holes. We know that
# we will never have a hole listed here that belongs to a valid
# hole punched in a write-combining memory region because you can't
# overlay write-combining on top of write-back and we know our base
# memory region is write-back, so in order for this hole to overlap
# our base memory region it can't be also overlapping a write-combining
# region.
printf "disable=%d\n", register[mem] > "/dev/stderr"
printf "disable=%d\n", register[mem] > "/proc/mtrr"
fflush("")
enabled[mem] = 0
for(i=0; i < cur_hole; i++) {
printf "disable=%d\n", register[holes[i]] > "/dev/stderr"
printf "disable=%d\n", register[holes[i]] > "/proc/mtrr"
fflush("")
enabled[holes[i]] = 0
}
# Rebuild from the start of the base region up to the first hole.
build_entries(base[mem], base[holes[0]])
# Rebuild any gap left between one hole and the next.
for(i=0; i < cur_hole - 1; i++)
if (base[holes[i+1]] > end[holes[i]])
build_entries(end[holes[i]], base[holes[i+1]])
# The loop above leaves i == cur_hole - 1, so holes[i] is the last
# (highest-based) hole; rebuild whatever part of the base region lies
# beyond it.
if (end[mem] > end[holes[i]])
build_entries(end[holes[i]], end[mem])
# We changed up the mtrr regs, so signal to the rdma script to
# reload modules that need the mtrr regs to be right.
exit 0
}

183 changes: 183 additions & 0 deletions redhat/rdma.ifdown-ib
@@ -0,0 +1,183 @@
#!/bin/bash
# Network Interface Configuration System
# Copyright (c) 1996-2013 Red Hat, Inc. all rights reserved.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License, version 2,
# as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

# Load the shared helper libraries. Everything called below that is not
# defined in this file (source_config, check_device_down, net_log, ...)
# is expected to come from these — their contents are not visible here.
. /etc/init.d/functions

cd /etc/sysconfig/network-scripts
. ./network-functions

# Global network settings, if the file exists (relative to the directory
# entered above).
[ -f ../network ] && . ../network

# First argument: the interface configuration to bring down.
CONFIG=${1}

# NOTE(review): presumably loads the settings for ${CONFIG} (DEVICE,
# PHYSDEV, PKEY, ...) into the environment — confirm against
# network-functions.
source_config

# Allow the user to override the detection of our physical device by passing
# it in. No checking is done, if the user gives us a bogus dev, it's
# their problem.
[ -n "${PHYSDEV}" ] && REALDEVICE="$PHYSDEV"

# NOTE(review): ../network above resolves to this same path; it is sourced
# again here, after source_config and unconditionally — confirm this
# re-read is intentional.
. /etc/sysconfig/network

# Check to make sure the device is actually up
# (if it is already down there is nothing to do, so exit successfully)
check_device_down ${DEVICE} && exit 0

# P_Key child interfaces need extra validation and a different REALDEVICE.
# With PKEY=yes:
#   - PKEY_ID and PHYSDEV are mandatory (exit 1 if either is missing),
#   - DEVICE must equal "<PHYSDEV>.<normalized 4-digit hex pkey>" (exit 1
#     otherwise),
#   - if the child interface no longer exists there is nothing left to take
#     down (exit 0).
if [[ "${PKEY}" = yes ]]; then
    if [[ -z "${PKEY_ID}" ]]; then
        net_log $"InfiniBand IPoIB device: PKEY=yes requires a PKEY_ID"
        exit 1
    fi
    if [[ -z "${PHYSDEV}" ]]; then
        net_log $"InfiniBand IPoIB device: PKEY=yes requires a PHYSDEV"
        exit 1
    fi
    # Force the high bit of the P_Key on, then derive the bare hex form
    # used as the interface name suffix.
    NEW_PKEY_ID=$(printf "0x%04x" $(( 0x8000 | ${PKEY_ID} )))
    NEW_PKEY_NAME=$(printf "%04x" ${NEW_PKEY_ID})
    if [[ "${DEVICE}" != "${PHYSDEV}.${NEW_PKEY_NAME}" ]]; then
        net_log $"Configured DEVICE name does not match what new device name would be. This
is most likely because once the PKEY_ID was normalized, it no longer
resulted in the expected device naming, and so the DEVICE entry in the
config file needs to be updated to match. This can also be caused by
giving PKEY_ID as a hex number but without using the mandatory 0x prefix.
Configured DEVICE=$DEVICE
Configured PHYSDEV=$PHYSDEV
Configured PKEY_ID=$PKEY_ID
Calculated PKEY_ID=$NEW_PKEY_ID
Calculated name=${PHYSDEV}.${NEW_PKEY_NAME}"
        exit 1
    fi
    if [[ ! -d "/sys/class/net/${DEVICE}" ]]; then
        exit 0
    fi
    # From here on, address and link teardown must act on the P_Key child
    # itself rather than on the parent device.
    REALDEVICE="${DEVICE}"
fi


# Hardware-address sanity check, skipped for bonding slaves and whenever
# MACADDR is set or HWADDR is not. If the address found on REALDEVICE
# differs from the configured HWADDR, look for the configuration that does
# own the found address: hand over to ifdown for that configuration when it
# is a different file and its DEVICE equals REALDEVICE, otherwise log the
# mismatch and exit 1.
if [[ "${SLAVE}" != "yes" || -z "${MASTER}" ]] && [[ -n "${HWADDR}" && -z "${MACADDR}" ]]; then
    # Compare only the trailing 24 bytes of each address string.
    HWADDR=$(echo $HWADDR | tail -c 24)
    FOUNDMACADDR=$(get_hwaddr ${REALDEVICE} | tail -c 24)
    if [[ -n "${FOUNDMACADDR}" && "${FOUNDMACADDR}" != "${HWADDR}" ]]; then
        NEWCONFIG=$(get_config_by_hwaddr ${FOUNDMACADDR})
        if [[ -z "${NEWCONFIG}" ]]; then
            net_log $"Device ${DEVICE} has MAC address ${FOUNDMACADDR}, instead of configured address ${HWADDR}. Ignoring."
            exit 1
        fi
        # Adopt the DEVICE setting from the configuration that matched.
        eval $(LANG=C grep -F "DEVICE=" $NEWCONFIG)
        if [[ -n "${NEWCONFIG}" && "${NEWCONFIG##*/}" != "${CONFIG##*/}" && "${DEVICE}" = "${REALDEVICE}" ]]; then
            exec /sbin/ifdown ${NEWCONFIG}
        fi
        net_log $"Device ${DEVICE} has MAC address ${FOUNDMACADDR}, instead of configured address ${HWADDR}. Ignoring."
        exit 1
    fi
fi

#######################################
# Remove ARP monitor targets from a bonding target list.
# Arguments:
#   $1 - value of one arp_ip_target= bonding option: a single address or a
#        comma-separated list; each address may carry a leading "+"
#   $2 - path of the arp_ip_target control file to read and write
# Outputs:
#   writes "-<address>" to $2 for every listed address currently present
#######################################
ifdown_ib_drop_arp_targets() {
    local spec=$1 target_file=$2
    local -a addrs
    local addr

    IFS=',' read -r -a addrs <<< "${spec}"
    for addr in "${addrs[@]}"; do
        # A leading "+" is only the "add" marker from the option syntax;
        # the target list itself holds bare addresses.
        addr=${addr#+}
        [ -n "${addr}" ] || continue
        # Fixed-string, whole-word match: the dots must not act as regex
        # wildcards and 10.0.0.1 must not match 10.0.0.10.
        if grep -qwF -- "${addr}" "${target_file}"; then
            printf '%s\n' "-${addr}" > "${target_file}"
        fi
    done
}

# When DEVICE is a bonding master: first bring down every interface whose
# configuration names it as MASTER, then withdraw the ARP monitor targets
# listed in BONDING_OPTS.
if is_bonding_device ${DEVICE} ; then
    for device in $(LANG=C grep -l "^[[:space:]]*MASTER=\"\?${DEVICE}\"\?\([[:space:]#]\|$\)" /etc/sysconfig/network-scripts/ifcfg-*) ; do
        is_ignored_file "$device" && continue
        /sbin/ifdown ${device##*/}
    done
    # BONDING_OPTS is a whitespace-separated list of key=value options, so
    # word-splitting it here is intentional.
    for arg in $BONDING_OPTS ; do
        key=${arg%%=*}
        [[ "${key}" != "arp_ip_target" ]] && continue
        ifdown_ib_drop_arp_targets "${arg##*=}" \
            "/sys/class/net/${DEVICE}/bonding/arp_ip_target"
    done
fi

# Take down the IPv6 side first.
/etc/sysconfig/network-scripts/ifdown-ipv6 ${CONFIG}

# Stop any dhclient serving this device, for both the IPv4 ("") and IPv6
# ("6") pid files. retcode records the status of the release/kill command
# run for the last pid file found, and stays 0 when there was none.
retcode=0
if [ -n "$(pidof -x dhclient)" ]; then
    for VER in "" 6 ; do
        [ -f "/var/run/dhclient$VER-${DEVICE}.pid" ] || continue
        dhcpid=$(cat /var/run/dhclient$VER-${DEVICE}.pid)
        generate_lease_file_name $VER
        case "$DHCPRELEASE" in
            [yY1]*)
                # Ask dhclient to release the lease.
                /sbin/dhclient -r -lf ${LEASEFILE} -pf /var/run/dhclient$VER-${DEVICE}.pid ${DEVICE} >/dev/null 2>&1
                retcode=$?
                ;;
            *)
                # Stop the client without releasing, then run the STOP hook.
                kill $dhcpid >/dev/null 2>&1
                retcode=$?
                reason=STOP$VER interface=${DEVICE} /sbin/dhclient-script
                ;;
        esac
        # A pid file that is still present is removed here and the
        # recorded pid signalled once more.
        if [ -f "/var/run/dhclient$VER-${DEVICE}.pid" ]; then
            rm -f /var/run/dhclient$VER-${DEVICE}.pid
            kill $dhcpid >/dev/null 2>&1
        fi
    done
fi
# we can't just delete the configured address because that address
# may have been changed in the config file since the device was
# brought up. Flush all addresses associated with this
# instance instead.
#
# link_rc holds the status of the final "ip link set ... down", or 0 when
# that command was not run. It is captured explicitly because "$?" read
# after the closing [ ... ] test below would only ever see that test's own
# (successful) status, so a failure to bring the link down was never
# recorded.
link_rc=0
if [ -d "/sys/class/net/${REALDEVICE}" ]; then
    if [ "${REALDEVICE}" = "${DEVICE}" ]; then
        ip addr flush dev "${REALDEVICE}" scope global 2>/dev/null
    else
        ip addr flush dev "${REALDEVICE}" label "${DEVICE}" scope global 2>/dev/null
    fi

    # Detach from the bonding master if this device is a slave.
    if [ "${SLAVE}" = "yes" ] && [ -n "${MASTER}" ]; then
        echo "-${DEVICE}" > "/sys/class/net/${MASTER}/bonding/slaves" 2>/dev/null
    fi

    if [ "${REALDEVICE}" = "${DEVICE}" ]; then
        ip link set dev "${DEVICE}" down 2>/dev/null || link_rc=$?
    fi
fi
# Keep an earlier dhclient failure; otherwise report the link-down status.
[ "$retcode" = "0" ] && retcode=$link_rc

# Give the device a moment to actually go down: poll at most 50 times,
# pausing 10000 microseconds between checks (about half a second overall).
waited=0
until check_device_down ${DEVICE} || [ "$waited" -ge 50 ] ; do
    usleep 10000
    waited=$((waited + 1))
done

# Run the post-down hook only when the teardown itself succeeded.
if [ "$retcode" = 0 ] ; then
    /etc/sysconfig/network-scripts/ifdown-post "$CONFIG"
    # do NOT use $? because ifdown should return whether or not
    # the interface went down.
fi

# Remove the P_Key child interface from its parent. This must use the same
# PKEY=yes test as the validation earlier in this script: NEW_PKEY_ID is
# only computed in that case, so any other non-empty PKEY value (e.g. "no")
# would otherwise write an empty ID to delete_child.
if [ "${PKEY}" = yes ]; then
    echo "$NEW_PKEY_ID" > "/sys/class/net/${PHYSDEV}/delete_child"
fi

exit $retcode

0 comments on commit 39fa824

Please sign in to comment.