Skip to content

Commit

Permalink
Merge pull request #31169 from pweil-/userns-experimental
Browse files Browse the repository at this point in the history
Automatic merge from submit-queue

Default host user namespace via experimental flag

@vishh @ncdc @pmorie @smarterclayton @thockin 

Initial thought on the implementation #30684 (comment) wasn't quite right.  Since we need to dereference a PVC in some cases the defaulting code didn't fit nicely in the docker manager code (would've coupled it with a kube client and would've been messy).  I think passing this in via the runtime config turned out cleaner.  PTAL
  • Loading branch information
Kubernetes Submit Queue committed Nov 14, 2016
2 parents 03455d0 + d0d78f4 commit c76fe8d
Show file tree
Hide file tree
Showing 7 changed files with 382 additions and 15 deletions.
6 changes: 6 additions & 0 deletions pkg/kubelet/container/runtime.go
Expand Up @@ -434,6 +434,12 @@ type RunContainerOptions struct {
ReadOnly bool
// hostname for pod containers
Hostname string
// EnableHostUserNamespace sets userns=host when the pod requests host namespaces (pid, ipc, net),
// uses non-namespaced capabilities (mknod, sys_time, sys_module), contains a privileged container,
// or uses host path volumes.
// This should only be enabled when the container runtime is performing user remapping AND if the
// experimental behavior is desired.
EnableHostUserNamespace bool
}

// VolumeInfo contains information about the volume.
Expand Down
7 changes: 7 additions & 0 deletions pkg/kubelet/dockershim/docker_service.go
Expand Up @@ -57,6 +57,13 @@ const (
containerTypeLabelContainer = "container"
containerLogPathLabelKey = "io.kubernetes.container.logpath"
sandboxIDLabelKey = "io.kubernetes.sandbox.id"

// TODO: https://github.com/kubernetes/kubernetes/pull/31169 provides experimental
// defaulting of host user namespace that may be enabled when the docker daemon
// is using remapped UIDs.
// Dockershim should provide detection support for a remapping environment.
// This should be included in the feature proposal. Defaulting may still occur according
// to kubelet behavior and system settings in addition to any API flags that may be introduced.
)

// NetworkPluginSettings is the subset of kubelet runtime args we pass
Expand Down
6 changes: 6 additions & 0 deletions pkg/kubelet/dockertools/docker_manager.go
Expand Up @@ -681,12 +681,18 @@ func (dm *DockerManager) runContainer(
}
}

userNsMode := ""
if opts.EnableHostUserNamespace {
userNsMode = "host"
}

hc := &dockercontainer.HostConfig{
Binds: binds,
NetworkMode: dockercontainer.NetworkMode(netMode),
IpcMode: dockercontainer.IpcMode(ipcMode),
UTSMode: dockercontainer.UTSMode(utsMode),
PidMode: dockercontainer.PidMode(pidMode),
UsernsMode: dockercontainer.UsernsMode(userNsMode),
ReadonlyRootfs: readOnlyRootFilesystem(container),
Resources: dockercontainer.Resources{
Memory: memoryLimit,
Expand Down
30 changes: 21 additions & 9 deletions pkg/kubelet/kubelet.go
Expand Up @@ -450,15 +450,20 @@ func NewMainKubelet(kubeCfg *componentconfig.KubeletConfiguration, kubeDeps *Kub
containerManager: kubeDeps.ContainerManager,
nodeIP: net.ParseIP(kubeCfg.NodeIP),
clock: clock.RealClock{},
outOfDiskTransitionFrequency: kubeCfg.OutOfDiskTransitionFrequency.Duration,
reservation: *reservation,
enableCustomMetrics: kubeCfg.EnableCustomMetrics,
babysitDaemons: kubeCfg.BabysitDaemons,
enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach,
iptClient: utilipt.New(utilexec.New(), utildbus.New(), utilipt.ProtocolIpv4),
makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains,
iptablesMasqueradeBit: int(kubeCfg.IPTablesMasqueradeBit),
iptablesDropBit: int(kubeCfg.IPTablesDropBit),
outOfDiskTransitionFrequency: kubeCfg.OutOfDiskTransitionFrequency.Duration,
reservation: *reservation,
enableCustomMetrics: kubeCfg.EnableCustomMetrics,
babysitDaemons: kubeCfg.BabysitDaemons,
enableControllerAttachDetach: kubeCfg.EnableControllerAttachDetach,
iptClient: utilipt.New(utilexec.New(), utildbus.New(), utilipt.ProtocolIpv4),
makeIPTablesUtilChains: kubeCfg.MakeIPTablesUtilChains,
iptablesMasqueradeBit: int(kubeCfg.IPTablesMasqueradeBit),
iptablesDropBit: int(kubeCfg.IPTablesDropBit),
experimentalHostUserNamespaceDefaulting: utilconfig.DefaultFeatureGate.ExperimentalHostUserNamespaceDefaulting(),
}

if klet.experimentalHostUserNamespaceDefaulting {
glog.Infof("Experimental host user namespace defaulting is enabled.")
}

if mode, err := effectiveHairpinMode(componentconfig.HairpinMode(kubeCfg.HairpinMode), kubeCfg.ContainerRuntime, kubeCfg.NetworkPluginName); err != nil {
Expand Down Expand Up @@ -1087,6 +1092,13 @@ type Kubelet struct {

// The handler serving CRI streaming calls (exec/attach/port-forward).
criHandler http.Handler

// experimentalHostUserNamespaceDefaulting sets userns=host when the pod requests host namespaces (pid, ipc, net),
// uses non-namespaced capabilities (mknod, sys_time, sys_module), contains a privileged container,
// or uses host path volumes.
// This should only be enabled when the container runtime is performing user remapping AND if the
// experimental behavior is desired.
experimentalHostUserNamespaceDefaulting bool
}

// setupDataDirs creates:
Expand Down
89 changes: 89 additions & 0 deletions pkg/kubelet/kubelet_pods.go
Expand Up @@ -315,6 +315,11 @@ func (kl *Kubelet) GenerateRunContainerOptions(pod *api.Pod, container *api.Cont
return nil, err
}

// only do this check if the experimental behavior is enabled, otherwise allow it to default to false
if kl.experimentalHostUserNamespaceDefaulting {
opts.EnableHostUserNamespace = kl.enableHostUserNamespace(pod)
}

return opts, nil
}

Expand Down Expand Up @@ -1397,3 +1402,87 @@ func (kl *Kubelet) cleanupOrphanedPodCgroups(
}
return nil
}

// enableHostUserNamespace reports whether the container runtime should use the host user
// namespace for the pod. It returns true when any of the following holds: a container is
// privileged; the pod uses a host pid, ipc, or network namespace; the pod has a host path
// volume; a container adds a non-namespaced capability; or one of the pod's PVCs resolves
// to a host path volume.
//
// NOTE: a container that shares any namespace with another container must also share the
// user namespace, or it will not have the correct capabilities in that namespace. Host user
// namespace is therefore enabled per pod, not per container.
func (kl *Kubelet) enableHostUserNamespace(pod *api.Pod) bool {
	// The checks short-circuit in this order, so the PVC lookup (which goes through
	// kl.kubeClient) only runs when every pod-spec-only check has returned false.
	return hasPrivilegedContainer(pod) ||
		hasHostNamespace(pod) ||
		hasHostVolume(pod) ||
		hasNonNamespacedCapability(pod) ||
		kl.hasHostMountPVC(pod)
}

// hasPrivilegedContainer reports whether at least one entry of pod.Spec.Containers has a
// security context whose Privileged flag is set to true.
func hasPrivilegedContainer(pod *api.Pod) bool {
	containers := pod.Spec.Containers
	for i := range containers {
		sc := containers[i].SecurityContext
		// No security context, or an unset Privileged pointer, counts as not privileged.
		if sc == nil || sc.Privileged == nil {
			continue
		}
		if *sc.Privileged {
			return true
		}
	}
	return false
}

// hasNonNamespacedCapability reports whether any entry of pod.Spec.Containers adds one of
// the capabilities MKNOD, SYS_TIME, or SYS_MODULE through its security context.
func hasNonNamespacedCapability(pod *api.Pod) bool {
	containers := pod.Spec.Containers
	for i := range containers {
		sc := containers[i].SecurityContext
		// Containers without a security context or a capabilities section add nothing.
		if sc == nil || sc.Capabilities == nil {
			continue
		}
		for _, added := range sc.Capabilities.Add {
			switch added {
			case "MKNOD", "SYS_TIME", "SYS_MODULE":
				return true
			}
		}
	}
	return false
}

// hasHostVolume reports whether at least one entry of pod.Spec.Volumes has its HostPath
// source set.
func hasHostVolume(pod *api.Pod) bool {
	volumes := pod.Spec.Volumes
	for i := range volumes {
		if volumes[i].HostPath != nil {
			return true
		}
	}
	return false
}

// hasHostNamespace reports whether the pod-level security context sets HostIPC,
// HostNetwork, or HostPID. A pod without a pod-level security context yields false.
func hasHostNamespace(pod *api.Pod) bool {
	sc := pod.Spec.SecurityContext
	if sc != nil {
		return sc.HostIPC || sc.HostNetwork || sc.HostPID
	}
	return false
}

// hasHostMountPVC reports whether any PersistentVolumeClaim volume of the pod refers to a
// PersistentVolume whose spec has a HostPath source.
//
// Each claim is fetched from the pod's namespace through kl.kubeClient, then the
// PersistentVolume named by the claim's Spec.VolumeName is fetched. A failed lookup is
// logged as a warning and that claim is skipped, so a claim that cannot be resolved never
// makes this function return true.
func (kl *Kubelet) hasHostMountPVC(pod *api.Pod) bool {
	// NOTE(review): kubeClient is presumably nil when the kubelet runs without an API
	// server - confirm. Without a client no claim can be resolved, so report false
	// instead of dereferencing nil.
	if kl.kubeClient == nil {
		return false
	}

	for _, volume := range pod.Spec.Volumes {
		if volume.PersistentVolumeClaim == nil {
			continue
		}
		claimName := volume.PersistentVolumeClaim.ClaimName

		pvc, err := kl.kubeClient.Core().PersistentVolumeClaims(pod.Namespace).Get(claimName)
		if err != nil {
			glog.Warningf("unable to retrieve pvc %s:%s - %v", pod.Namespace, claimName, err)
			continue
		}
		// An empty VolumeName cannot name a PersistentVolume, so skip the second lookup
		// rather than issuing a Get("") that can only fail.
		if pvc == nil || pvc.Spec.VolumeName == "" {
			continue
		}

		referencedVolume, err := kl.kubeClient.Core().PersistentVolumes().Get(pvc.Spec.VolumeName)
		if err != nil {
			// This lookup targets the PersistentVolume, so the message names a pv.
			glog.Warningf("unable to retrieve pv %s - %v", pvc.Spec.VolumeName, err)
			continue
		}
		if referencedVolume != nil && referencedVolume.Spec.HostPath != nil {
			return true
		}
	}
	return false
}

0 comments on commit c76fe8d

Please sign in to comment.