Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CRI: Add security context for sandbox/container #34811

Merged
merged 6 commits into from
Nov 8, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
789 changes: 435 additions & 354 deletions pkg/kubelet/api/v1alpha1/runtime/api.pb.go

Large diffs are not rendered by default.

81 changes: 52 additions & 29 deletions pkg/kubelet/api/v1alpha1/runtime/api.proto
Original file line number Diff line number Diff line change
Expand Up @@ -148,16 +148,35 @@ message NamespaceOption {
optional bool host_ipc = 3;
}

// LinuxSandboxSecurityContext holds linux security configuration that will be
// applied to a sandbox. Note that:
// 1) It does not apply to containers in the pods.
// 2) It may not be applicable to a PodSandbox which does not contain any running
// process.
message LinuxSandboxSecurityContext {
// The configurations for the sandbox's namespaces.
// This will be used only if the PodSandbox uses namespace for isolation.
optional NamespaceOption namespace_options = 1;
// Optional SELinux context to be applied.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should add a sentence about precedence here as well.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ack. thanks.

optional SELinuxOption selinux_options = 2;
// The UID to run the entrypoint of the sandbox process.
optional int64 run_as_user = 3;
// If set, the root filesystem of the sandbox is read-only.
optional bool readonly_rootfs = 4;
// A list of groups applied to the first process run in the sandbox, in addition
// to the sandbox's primary GID.
repeated int64 supplemental_groups = 5;
}

// LinuxPodSandboxConfig holds platform-specific configurations for Linux
// host platforms and Linux-based containers.
message LinuxPodSandboxConfig {
// The parent cgroup of the pod sandbox.
// The cgroupfs style syntax will be used, but the container runtime can
// convert it to systemd semantics if needed.
optional string cgroup_parent = 1;
// The configurations for the sandbox's namespaces.
// This will be used only if the PodSandbox uses namespace for isolation.
optional NamespaceOption namespace_options = 2;
// LinuxSandboxSecurityContext holds sandbox security attributes.
optional LinuxSandboxSecurityContext security_context = 2;
}

// PodSandboxMetadata holds all necessary information for building the sandbox name.
Expand Down Expand Up @@ -409,26 +428,34 @@ message Capability {
repeated string drop_capabilities = 2;
}

// LinuxContainerSecurityContext holds linux security configuration that will be applied to a container.
message LinuxContainerSecurityContext {
// Capabilities to add or drop.
optional Capability capabilities = 1;
// If set, run container in privileged mode.
optional bool privileged = 2;
// The configurations for the container's namespaces.
// This will be used only if the container uses namespace for isolation.
optional NamespaceOption namespace_options = 3;
// Optional SELinux context to be applied.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a sentence about precedence here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ack

optional SELinuxOption selinux_options = 4;
// The UID to run the the container process as.
// Defaults to user specified in image metadata if unspecified.
optional int64 run_as_user = 5;
// If set, the root filesystem of the container is read-only.
optional bool readonly_rootfs = 6;
// A list of groups applied to the first process run in the container, in addition
// to the container's primary GID.
repeated int64 supplemental_groups = 7;
}

// LinuxContainerConfig contains platform-specific configuration for
// Linux-based containers.
message LinuxContainerConfig {
// Resources specification for the container.
optional LinuxContainerResources resources = 1;
// Capabilities to add or drop.
optional Capability capabilities = 2;
// Optional SELinux context to be applied.
optional SELinuxOption selinux_options = 3;
// User contains the user for the container process.
optional LinuxUser user = 4;
}

message LinuxUser {
// uid specifies the user ID the container process has.
optional int64 uid = 1;
// gid specifies the group ID the container process has.
optional int64 gid = 2;
// additional_gids specifies additional GIDs the container process has.
repeated int64 additional_gids = 3;
// LinuxContainerSecurityContext configuration for the container.
optional LinuxContainerSecurityContext security_context = 2;
}

// ContainerMetadata holds all necessary information for building the container
Expand Down Expand Up @@ -488,11 +515,6 @@ message ContainerConfig {
// Annotations is an unstructured key value map that may be set by external
// tools to store and retrieve arbitrary metadata.
map<string, string> annotations = 10;
// If set, run container in privileged mode.
// Processes in privileged containers are essentially equivalent to root on the host.
optional bool privileged = 11;
// If set, the root filesystem of the container is read-only.
optional bool readonly_rootfs = 12;
// Path relative to PodSandboxConfig.LogDirectory for container to store
// the log (STDOUT and STDERR) on the host.
// E.g.,
Expand All @@ -503,19 +525,18 @@ message ContainerConfig {
// container logs are under active discussion in
// https://issues.k8s.io/24677. There *may* be future change of direction
// for logging as the discussion carries on.
optional string log_path = 13;
// The hash of container config
optional string log_path = 11;

// Variables for interactive containers, these have very specialized
// use-cases (e.g. debugging).
// TODO: Determine if we need to continue supporting these fields that are
// part of Kubernetes's Container Spec.
optional bool stdin = 14;
optional bool stdin_once = 15;
optional bool tty = 16;
optional bool stdin = 12;
optional bool stdin_once = 13;
optional bool tty = 14;

// Linux contains configuration specific to Linux containers.
optional LinuxContainerConfig linux = 17;
optional LinuxContainerConfig linux = 15;
}

message CreateContainerRequest {
Expand Down Expand Up @@ -737,6 +758,8 @@ message Image {
repeated string repo_digests = 3;
// The size of the image in bytes.
optional uint64 size = 4;
// The uid that will run the command(s).
optional int64 uid = 5;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@euank @yifan-gu are you ok with this or do you prefer runtime itself checks the run_as_non_root option?

Background: In the initial version, kubelet set the run_as_non_root field in the security context and pass it to the runtime (via CRI). Runtime checks the (docker) image and verify that the user is not set to root. This assumes that the RuntimeService is aware of the images and can perform checks. Since we may want to decouple ImageService and RuntimeService in the future, I suggested move up this logic and let kubelet perform the check. This requires reporting the user name or ID as part of the image for kubelet to check.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we have that image info handy as well.

I'm 👍 for kubelet doing that check.

My nit for this field is that user probably makes more sense since it can be either a name or uid, and the comment needs to clarify that dual-meaning as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, reading more it looks like this is strictly only UID and k8s just doesn't support non_root_user for images that have a non-numeric user (hey, another Dockerfile feature that's not fully supported :).

looks like my concern isn't valid.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@euank

Actually, reading more it looks like this is strictly only UID and k8s just doesn't support non_root_user for images that have a non-numeric user (hey, another Dockerfile feature that's not fully supported :).

IIRC the reason it doesn't support non-numeric users is that it is not trivial to get at an image's /etc/passwd without actually creating a container.

@yujuhong I support Kubelet implementing the run_as_non_root functionality and keeping it out of CRI.

}

message ListImagesResponse {
Expand Down
5 changes: 5 additions & 0 deletions pkg/kubelet/dockershim/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ go_library(
"helpers.go",
"legacy.go",
"naming.go",
"security_context.go",
],
tags = ["automanaged"],
deps = [
Expand All @@ -41,6 +42,7 @@ go_library(
"//pkg/kubelet/server/streaming:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/kubelet/util/ioutils:go_default_library",
"//pkg/securitycontext:go_default_library",
"//pkg/util/term:go_default_library",
"//vendor:github.com/docker/engine-api/types",
"//vendor:github.com/docker/engine-api/types/container",
Expand All @@ -63,6 +65,7 @@ go_test(
"docker_service_test.go",
"helpers_test.go",
"naming_test.go",
"security_context_test.go",
],
library = "go_default_library",
tags = ["automanaged"],
Expand All @@ -76,8 +79,10 @@ go_test(
"//pkg/kubelet/network/mock_network:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/security/apparmor:go_default_library",
"//pkg/securitycontext:go_default_library",
"//pkg/util/clock:go_default_library",
"//vendor:github.com/docker/engine-api/types",
"//vendor:github.com/docker/engine-api/types/container",
"//vendor:github.com/golang/mock/gomock",
"//vendor:github.com/stretchr/testify/assert",
],
Expand Down
15 changes: 14 additions & 1 deletion pkg/kubelet/dockershim/convert.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,14 @@ package dockershim

import (
"fmt"
"strconv"
"strings"
"time"

dockertypes "github.com/docker/engine-api/types"

runtimeApi "k8s.io/kubernetes/pkg/kubelet/api/v1alpha1/runtime"
"k8s.io/kubernetes/pkg/kubelet/dockertools"
)

// This file contains helper functions to convert docker API types to runtime
Expand Down Expand Up @@ -55,14 +57,25 @@ func imageInspectToRuntimeAPIImage(image *dockertypes.ImageInspect) (*runtimeApi
return nil, fmt.Errorf("unable to convert a nil pointer to a runtime API image")
}

var err error
var uid int64
size := uint64(image.VirtualSize)
imageUid := dockertools.GetUidFromUser(image.Config.User)
// Convert image UID to int64 format. Not that it assumes the process in
// the image is running as root if image.Config.User is not set.
if imageUid != "" {
uid, err = strconv.ParseInt(imageUid, 10, 64)
if err != nil {
return nil, fmt.Errorf("non-numeric user (%q)", imageUid)
}
}
return &runtimeApi.Image{
Id: &image.ID,
RepoTags: image.RepoTags,
RepoDigests: image.RepoDigests,
Size_: &size,
Uid: &uid,
}, nil

}

func toPullableImageID(id string, image *dockertypes.ImageInspect) string {
Expand Down
32 changes: 8 additions & 24 deletions pkg/kubelet/dockershim/docker_container.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,34 +120,15 @@ func (ds *dockerService) CreateContainer(podSandboxID string, config *runtimeApi

// Fill the HostConfig.
hc := &dockercontainer.HostConfig{
Binds: generateMountBindings(config.GetMounts()),
ReadonlyRootfs: config.GetReadonlyRootfs(),
Privileged: config.GetPrivileged(),
Binds: generateMountBindings(config.GetMounts()),
}

// Apply options derived from the sandbox config.
// Apply cgroupsParent derived from the sandbox config.
if lc := sandboxConfig.GetLinux(); lc != nil {
// Apply Cgroup options.
// TODO: Check if this works with per-pod cgroups.
// TODO: we need to pass the cgroup in syntax expected by cgroup driver but shim does not use docker info yet...
hc.CgroupParent = lc.GetCgroupParent()

// Apply namespace options.
sandboxNSMode := fmt.Sprintf("container:%v", podSandboxID)
hc.NetworkMode = dockercontainer.NetworkMode(sandboxNSMode)
hc.IpcMode = dockercontainer.IpcMode(sandboxNSMode)
hc.UTSMode = ""
hc.PidMode = ""

nsOpts := lc.GetNamespaceOptions()
if nsOpts != nil {
if nsOpts.GetHostNetwork() {
hc.UTSMode = namespaceModeHost
}
if nsOpts.GetHostPid() {
hc.PidMode = namespaceModeHost
}
}
}

// Apply Linux-specific options if applicable.
Expand All @@ -167,6 +148,9 @@ func (ds *dockerService) CreateContainer(podSandboxID string, config *runtimeApi
hc.OomScoreAdj = int(rOpts.GetOomScoreAdj())
}
// Note: ShmSize is handled in kube_docker_client.go

// Apply security context.
applyContainerSecurityContext(lc, podSandboxID, createConfig.Config, hc)
}

// Set devices for container.
Expand All @@ -180,12 +164,12 @@ func (ds *dockerService) CreateContainer(podSandboxID string, config *runtimeApi
}
hc.Resources.Devices = devices

var err error
hc.SecurityOpt, err = getContainerSecurityOpts(config.Metadata.GetName(), sandboxConfig, ds.seccompProfileRoot)
// Apply appArmor and seccomp options.
securityOpts, err := getContainerSecurityOpts(config.Metadata.GetName(), sandboxConfig, ds.seccompProfileRoot)
if err != nil {
return "", fmt.Errorf("failed to generate container security options for container %q: %v", config.Metadata.GetName(), err)
}
// TODO: Add or drop capabilities.
hc.SecurityOpt = append(hc.SecurityOpt, securityOpts...)

createConfig.HostConfig = hc
createResp, err := ds.client.CreateContainer(createConfig)
Expand Down
41 changes: 18 additions & 23 deletions pkg/kubelet/dockershim/docker_sandbox.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ func (ds *dockerService) RunPodSandbox(config *runtimeApi.PodSandboxConfig) (str
if err != nil {
return createResp.ID, fmt.Errorf("failed to start sandbox container for pod %q: %v", config.Metadata.GetName(), err)
}
if config.GetLinux().GetNamespaceOptions().GetHostNetwork() {
if config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetHostNetwork() {
return createResp.ID, nil
}

Expand Down Expand Up @@ -286,6 +286,18 @@ func (ds *dockerService) ListPodSandbox(filter *runtimeApi.PodSandboxFilter) ([]
return result, nil
}

// applySandboxLinuxOptions applies LinuxPodSandboxConfig to dockercontainer.HostConfig and dockercontainer.ContainerCreateConfig.
func (ds *dockerService) applySandboxLinuxOptions(hc *dockercontainer.HostConfig, lc *runtimeApi.LinuxPodSandboxConfig, createConfig *dockertypes.ContainerCreateConfig, image string) error {
// Apply Cgroup options.
// TODO: Check if this works with per-pod cgroups.
hc.CgroupParent = lc.GetCgroupParent()
// Apply security context.
applySandboxSecurityContext(lc, createConfig.Config, hc)

return nil
}

// makeSandboxDockerConfig returns dockertypes.ContainerCreateConfig based on runtimeApi.PodSandboxConfig.
func (ds *dockerService) makeSandboxDockerConfig(c *runtimeApi.PodSandboxConfig, image string) (*dockertypes.ContainerCreateConfig, error) {
// Merge annotations and labels because docker supports only labels.
labels := makeLabels(c.GetLabels(), c.GetAnnotations())
Expand Down Expand Up @@ -316,29 +328,11 @@ func (ds *dockerService) makeSandboxDockerConfig(c *runtimeApi.PodSandboxConfig,

// Apply linux-specific options.
if lc := c.GetLinux(); lc != nil {
// Apply Cgroup options.
// TODO: Check if this works with per-pod cgroups.
hc.CgroupParent = lc.GetCgroupParent()

// Apply namespace options.
hc.NetworkMode, hc.UTSMode, hc.PidMode = "", "", ""
nsOpts := lc.GetNamespaceOptions()
if nsOpts != nil {
if nsOpts.GetHostNetwork() {
hc.NetworkMode = namespaceModeHost
} else {
// Assume kubelet uses either the cni or the kubenet plugin.
// TODO: support docker networking.
hc.NetworkMode = "none"
}
if nsOpts.GetHostIpc() {
hc.IpcMode = namespaceModeHost
}
if nsOpts.GetHostPid() {
hc.PidMode = namespaceModeHost
}
if err := ds.applySandboxLinuxOptions(hc, lc, createConfig, image); err != nil {
return nil, err
}
}

// Set port mappings.
exposedPorts, portBindings := makePortsAndBindings(c.GetPortMappings())
createConfig.Config.ExposedPorts = exposedPorts
Expand All @@ -355,10 +349,11 @@ func (ds *dockerService) makeSandboxDockerConfig(c *runtimeApi.PodSandboxConfig,
setSandboxResources(hc)

// Set security options.
hc.SecurityOpt, err = getSandboxSecurityOpts(c, ds.seccompProfileRoot)
securityOpts, err := getSandboxSecurityOpts(c, ds.seccompProfileRoot)
if err != nil {
return nil, fmt.Errorf("failed to generate sandbox security options for sandbox %q: %v", c.Metadata.GetName(), err)
}
hc.SecurityOpt = append(hc.SecurityOpt, securityOpts...)
return createConfig, nil
}

Expand Down
8 changes: 7 additions & 1 deletion pkg/kubelet/dockershim/docker_sandbox_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,13 @@ func TestHostNetworkPluginInvocation(t *testing.T) {
map[string]string{"annotation": ns},
)
hostNetwork := true
c.Linux = &runtimeApi.LinuxPodSandboxConfig{NamespaceOptions: &runtimeApi.NamespaceOption{HostNetwork: &hostNetwork}}
c.Linux = &runtimeApi.LinuxPodSandboxConfig{
SecurityContext: &runtimeApi.LinuxSandboxSecurityContext{
NamespaceOptions: &runtimeApi.NamespaceOption{
HostNetwork: &hostNetwork,
},
},
}
cID := kubecontainer.ContainerID{Type: runtimeName, ID: fmt.Sprintf("/%v", makeSandboxName(c))}

// No calls to network plugin are expected
Expand Down
5 changes: 4 additions & 1 deletion pkg/kubelet/dockershim/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,16 @@ func extractLabels(input map[string]string) (map[string]string, map[string]strin
// '<HostPath>:<ContainerPath>:Z', if the volume requires SELinux
// relabeling and the pod provides an SELinux label
func generateMountBindings(mounts []*runtimeApi.Mount) (result []string) {
// TODO: resolve podHasSELinuxLabel
for _, m := range mounts {
bind := fmt.Sprintf("%s:%s", m.GetHostPath(), m.GetContainerPath())
readOnly := m.GetReadonly()
if readOnly {
bind += ":ro"
}
// Only request relabeling if the pod provides an SELinux context. If the pod
// does not provide an SELinux context relabeling will label the volume with
// the container's randomly allocated MCS label. This would restrict access
// to the volume to the container which mounts it first.
if m.GetSelinuxRelabel() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might as well handle this TODO now that #33663 has merged, IMO.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@pmorie Already handled this at here, here's todo should be removed.

if readOnly {
bind += ",Z"
Expand Down