Skip to content

Commit

Permalink
user units: implicitly enable PrivateUsers= when sandboxing options a…
Browse files Browse the repository at this point in the history
…re set

Enabling these options when not running as root requires a user
namespace, so implicitly enable PrivateUsers=.
This has a side effect as it changes which users are visible to the unit.
However until now these options did not work at all for user units, and
in practice just a handful of user units in Fedora, Debian and Ubuntu
mistakenly used them (and they have been all fixed since).

This fixes the long-standing confusing issue that the user and system
units take the same options but the behaviour is wildly (and sometimes
silently) different depending on which is which, with user units
requiring manually specifying PrivateUsers= in order for sandboxing
options to actually work and not be silently ignored.
  • Loading branch information
bluca committed Apr 13, 2023
1 parent ce963a7 commit 6ef721c
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 52 deletions.
8 changes: 6 additions & 2 deletions man/system-or-user-ns.xml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,13 @@
<refsect1>

<para id="singular">This option is only available for system services, or for services running in per-user
instances of the service manager when <varname>PrivateUsers=</varname> is enabled.</para>
instances of the service manager in which case <varname>PrivateUsers=</varname> is implicitly enabled
(requires unprivileged user namespaces support to be enabled in the kernel via the
<literal>kernel.unprivileged_userns_clone=</literal> sysctl).</para>

<para id="plural">These options are only available for system services, or for services running in per-user
instances of the service manager when <varname>PrivateUsers=</varname> is enabled.</para>
instances of the service manager in which case <varname>PrivateUsers=</varname> is implicitly enabled
(requires unprivileged user namespaces support to be enabled in the kernel via the
<literal>kernel.unprivileged_userns_clone=</literal> sysctl).</para>

</refsect1>
49 changes: 46 additions & 3 deletions src/core/execute.c
Original file line number Diff line number Diff line change
Expand Up @@ -4400,6 +4400,44 @@ static void log_command_line(Unit *unit, const char *msg, const char *executable
LOG_UNIT_INVOCATION_ID(unit));
}

/* Tells whether a user namespace must be set up first for this execution context because the manager is
 * not the system manager and at least one option that depends on a user namespace is configured.
 *
 * Returns false unconditionally for the system manager; otherwise returns true if PrivateUsers= itself
 * or any of the sandboxing settings checked below is set, and false if none of them is. */
static bool exec_context_need_unprivileged_private_users(const ExecContext *context, const Manager *manager) {
        assert(context);
        assert(manager);

        /* The system manager has effective CAP_SYS_ADMIN and hence can apply all of the settings below
         * directly, without being inside a user namespace. */
        if (MANAGER_IS_SYSTEM(manager))
                return false;

        /* From here on we are a user manager: every setting below needs a user namespace in order to
         * have the permission to be applied when not running as root. */

        /* Explicitly requested user namespace. */
        if (context->private_users)
                return true;

        /* Private /tmp and /dev. */
        if (context->private_tmp || context->private_devices)
                return true;

        /* Network and IPC namespaces, either freshly created or joined by path. */
        if (context->private_network || context->network_namespace_path)
                return true;
        if (context->private_ipc || context->ipc_namespace_path)
                return true;

        /* Mount namespace and explicit mount table changes. */
        if (context->private_mounts || context->mount_apivfs)
                return true;
        if (context->n_bind_mounts > 0 || context->n_temporary_filesystems > 0)
                return true;

        /* Alternative root directory and extension directories. */
        if (context->root_directory)
                return true;
        if (!strv_isempty(context->extension_directories))
                return true;

        /* Protect*= settings. */
        if (context->protect_system != PROTECT_SYSTEM_NO)
                return true;
        if (context->protect_home != PROTECT_HOME_NO)
                return true;
        if (context->protect_kernel_tunables ||
            context->protect_kernel_modules ||
            context->protect_kernel_logs)
                return true;
        if (context->protect_control_groups ||
            context->protect_clock ||
            context->protect_hostname)
                return true;

        /* Per-path access restrictions. */
        if (!strv_isempty(context->read_write_paths))
                return true;
        if (!strv_isempty(context->read_only_paths))
                return true;
        if (!strv_isempty(context->inaccessible_paths))
                return true;
        if (!strv_isempty(context->exec_paths))
                return true;
        if (!strv_isempty(context->no_exec_paths))
                return true;

        /* Nothing configured that would require a user namespace. */
        return false;
}

static int exec_child(
Unit *unit,
const ExecCommand *command,
Expand Down Expand Up @@ -5032,17 +5070,22 @@ static int exec_child(
}
}

if (needs_sandboxing && context->private_users && have_effective_cap(CAP_SYS_ADMIN) <= 0) {
if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, unit->manager)) {
/* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
* Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
* set up the all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */

userns_set_up = true;
r = setup_private_users(saved_uid, saved_gid, uid, gid);
if (r < 0) {
/* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
* the actual requested operations fail (or silently continue). */
if (r < 0 && context->private_users) {
*exit_status = EXIT_USER;
return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
}
if (r < 0)
log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
else
userns_set_up = true;
}

if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
Expand Down
59 changes: 29 additions & 30 deletions src/test/test-execute.c
Original file line number Diff line number Diff line change
Expand Up @@ -401,9 +401,9 @@ static void test_exec_ignoresigpipe(Manager *m) {
static void test_exec_privatetmp(Manager *m) {
assert_se(touch("/tmp/test-exec_privatetmp") >= 0);

test(m, "exec-privatetmp-yes.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatetmp-yes.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-privatetmp-no.service", 0, CLD_EXITED);
test(m, "exec-privatetmp-disabled-by-prefix.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatetmp-disabled-by-prefix.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);

unlink("/tmp/test-exec_privatetmp");
}
Expand All @@ -420,10 +420,10 @@ static void test_exec_privatedevices(Manager *m) {
return;
}

test(m, "exec-privatedevices-yes.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatedevices-yes.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-privatedevices-no.service", 0, CLD_EXITED);
test(m, "exec-privatedevices-disabled-by-prefix.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatedevices-yes-with-group.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_GROUP, CLD_EXITED);
test(m, "exec-privatedevices-disabled-by-prefix.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-privatedevices-yes-with-group.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);

/* We use capsh to test if the capabilities are
* properly set, so be sure that it exists */
Expand All @@ -433,10 +433,10 @@ static void test_exec_privatedevices(Manager *m) {
return;
}

test(m, "exec-privatedevices-yes-capability-mknod.service", 0, CLD_EXITED);
test(m, "exec-privatedevices-no-capability-mknod.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatedevices-yes-capability-sys-rawio.service", 0, CLD_EXITED);
test(m, "exec-privatedevices-no-capability-sys-rawio.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatedevices-yes-capability-mknod.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-privatedevices-no-capability-mknod.service", 0, CLD_EXITED);
test(m, "exec-privatedevices-yes-capability-sys-rawio.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-privatedevices-no-capability-sys-rawio.service", 0, CLD_EXITED);
}

static void test_exec_protecthome(Manager *m) {
Expand Down Expand Up @@ -466,23 +466,23 @@ static void test_exec_protectkernelmodules(Manager *m) {
return;
}

test(m, "exec-protectkernelmodules-no-capabilities.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-protectkernelmodules-yes-capabilities.service", 0, CLD_EXITED);
test(m, "exec-protectkernelmodules-yes-mount-propagation.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-protectkernelmodules-no-capabilities.service", 0, CLD_EXITED);
test(m, "exec-protectkernelmodules-yes-capabilities.service", MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-protectkernelmodules-yes-mount-propagation.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
}

static void test_exec_readonlypaths(Manager *m) {

test(m, "exec-readonlypaths-simple.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-readonlypaths-simple.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);

if (path_is_read_only_fs("/var") > 0) {
log_notice("Directory /var is readonly, skipping remaining tests in %s", __func__);
return;
}

test(m, "exec-readonlypaths.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-readonlypaths.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-readonlypaths-with-bindpaths.service", can_unshare ? 0 : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-readonlypaths-mount-propagation.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-readonlypaths-mount-propagation.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
}

static void test_exec_readwritepaths(Manager *m) {
Expand All @@ -492,7 +492,7 @@ static void test_exec_readwritepaths(Manager *m) {
return;
}

test(m, "exec-readwritepaths-mount-propagation.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-readwritepaths-mount-propagation.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
}

static void test_exec_inaccessiblepaths(Manager *m) {
Expand All @@ -502,14 +502,14 @@ static void test_exec_inaccessiblepaths(Manager *m) {
return;
}

test(m, "exec-inaccessiblepaths-sys.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-inaccessiblepaths-sys.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);

if (path_is_read_only_fs("/") > 0) {
log_notice("Root directory is readonly, skipping remaining tests in %s", __func__);
return;
}

test(m, "exec-inaccessiblepaths-mount-propagation.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-inaccessiblepaths-mount-propagation.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
}

static int on_spawn_io(sd_event_source *s, int fd, uint32_t revents, void *userdata) {
Expand Down Expand Up @@ -687,14 +687,14 @@ static void test_exec_mount_apivfs(Manager *m) {

assert_se(mkdir_p("/tmp/test-exec-mount-apivfs-no/root", 0755) >= 0);

test(m, "exec-mount-apivfs-no.service", can_unshare ? 0 : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-mount-apivfs-no.service", can_unshare || !MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);

(void) rm_rf("/tmp/test-exec-mount-apivfs-no/root", REMOVE_ROOT|REMOVE_PHYSICAL);
}

static void test_exec_noexecpaths(Manager *m) {

test(m, "exec-noexecpaths-simple.service", can_unshare ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-noexecpaths-simple.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
}

static void test_exec_temporaryfilesystem(Manager *m) {
Expand Down Expand Up @@ -964,8 +964,8 @@ static void test_exec_passenvironment(Manager *m) {
}

static void test_exec_umask(Manager *m) {
test(m, "exec-umask-default.service", 0, CLD_EXITED);
test(m, "exec-umask-0177.service", 0, CLD_EXITED);
test(m, "exec-umask-default.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
test(m, "exec-umask-0177.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
}

static void test_exec_runtimedirectory(Manager *m) {
Expand Down Expand Up @@ -1012,7 +1012,7 @@ static void test_exec_capabilityboundingset(Manager *m) {
}

static void test_exec_basic(Manager *m) {
test(m, "exec-basic.service", 0, CLD_EXITED);
test(m, "exec-basic.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_NAMESPACE, CLD_EXITED);
}

static void test_exec_ambientcapabilities(Manager *m) {
Expand Down Expand Up @@ -1052,17 +1052,16 @@ static void test_exec_ambientcapabilities(Manager *m) {
}

static void test_exec_privatenetwork(Manager *m) {
int r, status;
int r;

r = find_executable("ip", NULL);
if (r < 0) {
log_notice_errno(r, "Skipping %s, could not find ip binary: %m", __func__);
return;
}

status = can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_NETWORK : EXIT_FAILURE;
test(m, "exec-privatenetwork-yes-privatemounts-no.service", status, CLD_EXITED);
test(m, "exec-privatenetwork-yes-privatemounts-yes.service", status, CLD_EXITED);
test(m, "exec-privatenetwork-yes-privatemounts-no.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_NETWORK : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-privatenetwork-yes-privatemounts-yes.service", can_unshare ? 0 : MANAGER_IS_SYSTEM(m) ? EXIT_NETWORK : EXIT_NAMESPACE, CLD_EXITED);
}

static void test_exec_networknamespacepath(Manager *m) {
Expand All @@ -1075,7 +1074,7 @@ static void test_exec_networknamespacepath(Manager *m) {
}

test(m, "exec-networknamespacepath-privatemounts-no.service", MANAGER_IS_SYSTEM(m) ? EXIT_SUCCESS : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-networknamespacepath-privatemounts-yes.service", can_unshare ? EXIT_SUCCESS : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-networknamespacepath-privatemounts-yes.service", can_unshare ? EXIT_SUCCESS : MANAGER_IS_SYSTEM(m) ? EXIT_FAILURE : EXIT_NAMESPACE, CLD_EXITED);
}

static void test_exec_oomscoreadjust(Manager *m) {
Expand Down Expand Up @@ -1105,12 +1104,12 @@ static void test_exec_unsetenvironment(Manager *m) {
}

static void test_exec_specifier(Manager *m) {
test(m, "exec-specifier.service", 0, CLD_EXITED);
test(m, "exec-specifier.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED);
if (MANAGER_IS_SYSTEM(m))
test(m, "exec-specifier-system.service", 0, CLD_EXITED);
else
test(m, "exec-specifier-user.service", 0, CLD_EXITED);
test(m, "exec-specifier@foo-bar.service", 0, CLD_EXITED);
test(m, "exec-specifier@foo-bar.service", can_unshare || MANAGER_IS_SYSTEM(m) ? 0 : EXIT_FAILURE, CLD_EXITED);
test(m, "exec-specifier-interpolation.service", 0, CLD_EXITED);
}

Expand Down
34 changes: 17 additions & 17 deletions test/units/testsuite-43.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ runas testuser systemd-run --wait --user --unit=test-private-users \
runas testuser systemctl --user log-level debug

runas testuser systemd-run --wait --user --unit=test-private-tmp-innerfile \
-p PrivateUsers=yes -p PrivateTmp=yes \
-p PrivateTmp=yes \
-P touch /tmp/innerfile.txt
# File should not exist outside the job's tmp directory.
test ! -e /tmp/innerfile.txt

touch /tmp/outerfile.txt
# File should not appear in unit's private tmp.
runas testuser systemd-run --wait --user --unit=test-private-tmp-outerfile \
-p PrivateUsers=yes -p PrivateTmp=yes \
-p PrivateTmp=yes \
-P test ! -e /tmp/outerfile.txt

# Confirm that creating a file in home works
Expand All @@ -35,7 +35,7 @@ test -e /home/testuser/works.txt

# Confirm that creating a file in home is blocked under read-only
runas testuser systemd-run --wait --user --unit=test-protect-home-read-only \
-p PrivateUsers=yes -p ProtectHome=read-only \
-p ProtectHome=read-only \
-P bash -c '
test -e /home/testuser/works.txt || exit 10
touch /home/testuser/blocked.txt && exit 11
Expand All @@ -45,13 +45,13 @@ test ! -e /home/testuser/blocked.txt

# Check that tmpfs hides the whole directory
runas testuser systemd-run --wait --user --unit=test-protect-home-tmpfs \
-p PrivateUsers=yes -p ProtectHome=tmpfs \
-p ProtectHome=tmpfs \
-P test ! -e /home/testuser

# Confirm that home, /root, and /run/user are inaccessible under "yes"
# shellcheck disable=SC2016
runas testuser systemd-run --wait --user --unit=test-protect-home-yes \
-p PrivateUsers=yes -p ProtectHome=yes \
-p ProtectHome=yes \
-P bash -c '
test "$(stat -c %a /home)" = "0"
test "$(stat -c %a /root)" = "0"
Expand All @@ -70,11 +70,11 @@ runas testuser systemd-run --wait --user --unit=test-group-fail \
# Check that with a new user namespace we can bind mount
# files and use a different root directory
runas testuser systemd-run --wait --user --unit=test-bind-mount \
-p PrivateUsers=yes -p BindPaths=/dev/null:/etc/os-release \
-p BindPaths=/dev/null:/etc/os-release \
test ! -s /etc/os-release

runas testuser systemd-run --wait --user --unit=test-read-write \
-p PrivateUsers=yes -p ReadOnlyPaths=/ \
-p ReadOnlyPaths=/ \
-p ReadWritePaths="/var /run /tmp" \
-p NoExecPaths=/ -p ExecPaths=/usr \
test ! -w /etc/os-release
Expand All @@ -85,50 +85,50 @@ runas testuser systemd-run --wait --user --unit=test-caps \
test -s /etc/os-release

runas testuser systemd-run --wait --user --unit=test-devices \
-p PrivateUsers=yes -p PrivateDevices=yes -p PrivateIPC=yes \
-p PrivateDevices=yes -p PrivateIPC=yes \
sh -c "ls -1 /dev/ | wc -l | grep -q -F 18"

# Same check as test/test-execute/exec-privatenetwork-yes.service
runas testuser systemd-run --wait --user --unit=test-network \
-p PrivateUsers=yes -p PrivateNetwork=yes \
-p PrivateNetwork=yes \
/bin/sh -x -c '! ip link | grep -E "^[0-9]+: " | grep -Ev ": (lo|(erspan|gre|gretap|ip_vti|ip6_vti|ip6gre|ip6tnl|sit|tunl)0@.*):"'

runas testuser systemd-run --wait --user --unit=test-hostname \
-p PrivateUsers=yes -p ProtectHostname=yes \
-p ProtectHostname=yes \
hostnamectl hostname foo \
&& { echo 'unexpected success'; exit 1; }

runas testuser systemd-run --wait --user --unit=test-clock \
-p PrivateUsers=yes -p ProtectClock=yes \
-p ProtectClock=yes \
timedatectl set-time "2012-10-30 18:17:16" \
&& { echo 'unexpected success'; exit 1; }

runas testuser systemd-run --wait --user --unit=test-kernel-tunable \
-p PrivateUsers=yes -p ProtectKernelTunables=yes \
-p ProtectKernelTunables=yes \
sh -c "echo 0 >/proc/sys/user/max_user_namespaces" \
&& { echo 'unexpected success'; exit 1; }

runas testuser systemd-run --wait --user --unit=test-kernel-mod \
-p PrivateUsers=yes -p ProtectKernelModules=yes \
-p ProtectKernelModules=yes \
sh -c "modprobe -r overlay && modprobe overlay" \
&& { echo 'unexpected success'; exit 1; }

if sysctl kernel.dmesg_restrict=0; then
runas testuser systemd-run --wait --user --unit=test-kernel-log \
-p PrivateUsers=yes -p ProtectKernelLogs=yes -p LogNamespace=yes \
-p ProtectKernelLogs=yes -p LogNamespace=yes \
dmesg \
&& { echo 'unexpected success'; exit 1; }
fi

unsquashfs -no-xattrs -d /tmp/img /usr/share/minimal_0.raw
runas testuser systemd-run --wait --user --unit=test-root-dir \
-p PrivateUsers=yes -p RootDirectory=/tmp/img \
-p RootDirectory=/tmp/img \
grep MARKER=1 /etc/os-release

mkdir /tmp/img_bind
mount --bind /tmp/img /tmp/img_bind
runas testuser systemd-run --wait --user --unit=test-root-dir-bind \
-p PrivateUsers=yes -p RootDirectory=/tmp/img_bind -p MountFlags=private \
-p RootDirectory=/tmp/img_bind -p MountFlags=private \
grep MARKER=1 /etc/os-release
umount /tmp/img_bind

Expand All @@ -137,7 +137,7 @@ mkdir -p /tmp/a /tmp/b /tmp/c
if unshare --mount --user --map-root-user mount -t overlay overlay /tmp/c -o lowerdir=/tmp/a:/tmp/b; then
unsquashfs -no-xattrs -d /tmp/app2 /usr/share/app1.raw
runas testuser systemd-run --wait --user --unit=test-extension-dir \
-p PrivateUsers=yes -p ExtensionDirectories=/tmp/app2 \
-p ExtensionDirectories=/tmp/app2 \
-p TemporaryFileSystem=/run -p RootDirectory=/tmp/img \
-p MountAPIVFS=yes \
grep PORTABLE_PREFIXES=app1 /usr/lib/extension-release.d/extension-release.app2
Expand Down

0 comments on commit 6ef721c

Please sign in to comment.