Support total process ID limiting for nodes #73651

Merged 1 commit on Feb 14, 2019
+211 −41
@@ -68,6 +68,7 @@ go_library(
"//pkg/kubelet/kubeletconfig/configfiles:go_default_library",
"//pkg/kubelet/server:go_default_library",
"//pkg/kubelet/server/streaming:go_default_library",
"//pkg/kubelet/stats/pidlimit:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/util/configz:go_default_library",
"//pkg/util/filesystem:go_default_library",
@@ -588,8 +588,8 @@ func AddKubeletConfigFlags(mainfs *pflag.FlagSet, c *kubeletconfig.KubeletConfig
fs.BoolVar(&c.ProtectKernelDefaults, "protect-kernel-defaults", c.ProtectKernelDefaults, "Default kubelet behaviour for kernel tuning. If set, kubelet errors if any of kernel tunables is different than kubelet defaults.")

// Node Allocatable Flags
fs.Var(flag.NewMapStringString(&c.SystemReserved), "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
fs.Var(flag.NewMapStringString(&c.KubeReserved), "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi) pairs that describe resources reserved for kubernetes system components. Currently cpu, memory and local ephemeral storage for root file system are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
fs.Var(flag.NewMapStringString(&c.SystemReserved), "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi,pid=100) pairs that describe resources reserved for non-kubernetes components. Currently only cpu, memory, and pid (process IDs) are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
fs.Var(flag.NewMapStringString(&c.KubeReserved), "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi,pid=100) pairs that describe resources reserved for kubernetes system components. Currently cpu, memory, local ephemeral storage for root file system, and pid (process IDs) are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
fs.StringSliceVar(&c.EnforceNodeAllocatable, "enforce-node-allocatable", c.EnforceNodeAllocatable, "A comma separated list of levels of node allocatable enforcement to be enforced by kubelet. Acceptable options are 'none', 'pods', 'system-reserved', and 'kube-reserved'. If the latter two options are specified, '--system-reserved-cgroup' and '--kube-reserved-cgroup' must also be set, respectively. If 'none' is specified, no additional options should be set. See https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/ for more details.")
fs.StringVar(&c.SystemReservedCgroup, "system-reserved-cgroup", c.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']")
fs.StringVar(&c.KubeReservedCgroup, "kube-reserved-cgroup", c.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. [default='']")
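With these flags an operator can now reserve process IDs the same way cpu and memory are reserved, e.g. `--system-reserved=cpu=500m,memory=1Gi,pid=1000`. Below is a minimal, hypothetical sketch of the comma-separated key=value parsing these flags rely on; the real implementation is `flag.NewMapStringString`, and this only mirrors its observable behavior:

```go
// Hypothetical sketch of the key=value map parsing behind --system-reserved
// and --kube-reserved; mirrors flag.NewMapStringString, does not reproduce it.
package main

import (
	"fmt"
	"strings"
)

func parseMapStringString(val string) (map[string]string, error) {
	m := map[string]string{}
	for _, pair := range strings.Split(val, ",") {
		kv := strings.SplitN(pair, "=", 2)
		if len(kv) != 2 {
			return nil, fmt.Errorf("malformed pair %q", pair)
		}
		m[strings.TrimSpace(kv[0])] = strings.TrimSpace(kv[1])
	}
	return m, nil
}

func main() {
	m, err := parseMapStringString("cpu=500m,memory=1Gi,pid=1000")
	if err != nil {
		panic(err)
	}
	fmt.Println(m) // map[cpu:500m memory:1Gi pid:1000]
}
```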
@@ -83,6 +83,7 @@ import (
"k8s.io/kubernetes/pkg/kubelet/kubeletconfig/configfiles"
"k8s.io/kubernetes/pkg/kubelet/server"
"k8s.io/kubernetes/pkg/kubelet/server/streaming"
"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/util/configz"
utilfs "k8s.io/kubernetes/pkg/util/filesystem"
@@ -1150,16 +1151,18 @@ func parseResourceList(m map[string]string) (v1.ResourceList, error) {
rl := make(v1.ResourceList)
for k, v := range m {
switch v1.ResourceName(k) {
// CPU, memory and local storage resources are supported.
case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceEphemeralStorage:
q, err := resource.ParseQuantity(v)
if err != nil {
return nil, err
}
if q.Sign() == -1 {
return nil, fmt.Errorf("resource quantity for %q cannot be negative: %v", k, v)
// CPU, memory, local storage, and PID resources are supported.
case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceEphemeralStorage, pidlimit.PIDs:
if v1.ResourceName(k) != pidlimit.PIDs || utilfeature.DefaultFeatureGate.Enabled(features.SupportNodePidsLimit) {
q, err := resource.ParseQuantity(v)
if err != nil {
return nil, err
}
if q.Sign() == -1 {
return nil, fmt.Errorf("resource quantity for %q cannot be negative: %v", k, v)
}
rl[v1.ResourceName(k)] = q
}
rl[v1.ResourceName(k)] = q
default:
return nil, fmt.Errorf("cannot reserve %q resource", k)
}
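A subtlety in the new case worth calling out: when the SupportNodePidsLimit gate is off, a pid entry is silently dropped rather than rejected, because it still matches the case but fails the inner gate check. A standalone sketch of that control flow, where nodePidsLimitEnabled stands in for the utilfeature.DefaultFeatureGate.Enabled lookup:

```go
// Standalone sketch of the gating behavior in parseResourceList above:
// with the node gate off, "pid" is neither reserved nor treated as an error.
package main

import "fmt"

func parseReserved(m map[string]string, nodePidsLimitEnabled bool) (map[string]string, error) {
	rl := map[string]string{}
	for k, v := range m {
		switch k {
		case "cpu", "memory", "ephemeral-storage", "pid":
			if k != "pid" || nodePidsLimitEnabled {
				rl[k] = v
			}
		default:
			return nil, fmt.Errorf("cannot reserve %q resource", k)
		}
	}
	return rl, nil
}

func main() {
	in := map[string]string{"cpu": "500m", "pid": "1000"}
	off, _ := parseReserved(in, false)
	on, _ := parseReserved(in, true)
	fmt.Println(off) // map[cpu:500m] (pid dropped, no error)
	fmt.Println(on)  // map[cpu:500m pid:1000]
}
```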
@@ -405,6 +405,12 @@ const (
//
// Enables the AWS EBS in-tree driver to AWS EBS CSI Driver migration feature.
CSIMigrationAWS utilfeature.Feature = "CSIMigrationAWS"

// owner: @RobertKrawitz
// alpha: v1.14
//
// Implement support for limiting pids in nodes
SupportNodePidsLimit utilfeature.Feature = "SupportNodePidsLimit"
)

func init() {
@@ -450,6 +456,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
ResourceLimitsPriorityFunction: {Default: false, PreRelease: utilfeature.Alpha},
SupportIPVSProxyMode: {Default: true, PreRelease: utilfeature.GA},
SupportPodPidsLimit: {Default: true, PreRelease: utilfeature.Beta},
SupportNodePidsLimit: {Default: false, PreRelease: utilfeature.Alpha},
HyperVContainer: {Default: false, PreRelease: utilfeature.Alpha},
ScheduleDaemonSetPods: {Default: true, PreRelease: utilfeature.Beta},
TokenRequest: {Default: true, PreRelease: utilfeature.Beta},
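Worth noting when reading the defaults: SupportPodPidsLimit is already Beta and on by default, while the new SupportNodePidsLimit lands as Alpha and off by default. Node-level PID reservations therefore take effect only when the gate is enabled explicitly, e.g. `--feature-gates=SupportNodePidsLimit=true` on the kubelet.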
@@ -291,12 +291,12 @@ type KubeletConfiguration struct {

/* the following fields are meant for Node Allocatable */

// A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs
// A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G,pid=100) pairs
// that describe resources reserved for non-kubernetes components.
// Currently only cpu and memory are supported.
// See http://kubernetes.io/docs/user-guide/compute-resources for more detail.
SystemReserved map[string]string
// A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs
// A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G,pid=100) pairs
// that describe resources reserved for kubernetes system components.
// Currently cpu, memory and local ephemeral storage for root file system are supported.
// See http://kubernetes.io/docs/user-guide/compute-resources for more detail.
@@ -73,6 +73,7 @@ go_library(
"//pkg/kubelet/events:go_default_library",
"//pkg/kubelet/metrics:go_default_library",
"//pkg/kubelet/qos:go_default_library",
"//pkg/kubelet/stats/pidlimit:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/util/mount:go_default_library",
"//pkg/util/oom:go_default_library",
@@ -257,7 +257,7 @@ func (m *cgroupManagerImpl) Exists(name CgroupName) bool {
// in https://github.com/opencontainers/runc/issues/1440
// once resolved, we can remove this code.
whitelistControllers := sets.NewString("cpu", "cpuacct", "cpuset", "memory", "systemd")
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) {
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) || utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportNodePidsLimit) {
whitelistControllers.Insert("pids")
}
var missingPaths []string
@@ -325,10 +325,11 @@ func getSupportedSubsystems() map[subsystem]bool {
supportedSubsystems := map[subsystem]bool{
&cgroupfs.MemoryGroup{}: true,
&cgroupfs.CpuGroup{}: true,
&cgroupfs.PidsGroup{}: true,
}
// not all hosts support hugetlb cgroup, and in the absent of hugetlb, we will fail silently by reporting no capacity.
supportedSubsystems[&cgroupfs.HugetlbGroup{}] = false
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) {
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) || utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportNodePidsLimit) {
supportedSubsystems[&cgroupfs.PidsGroup{}] = true
}
return supportedSubsystems
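Pod-level and node-level PID limits both ride on the same pids cgroup controller, which is why the whitelist above and the supported-subsystems map now include pids when either gate is enabled.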
@@ -377,9 +378,9 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont
if resourceConfig.CpuPeriod != nil {
resources.CpuPeriod = *resourceConfig.CpuPeriod
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) {
if resourceConfig.PodPidsLimit != nil {
resources.PidsLimit = *resourceConfig.PodPidsLimit
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) || utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportNodePidsLimit) {
if resourceConfig.PidsLimit != nil {
resources.PidsLimit = *resourceConfig.PidsLimit
}
}
// if huge pages are enabled, we set them in libcontainer
@@ -431,8 +432,8 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error {
libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs()
}

if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PodPidsLimit != nil {
libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PodPidsLimit
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PidsLimit != nil {
libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PidsLimit
}

if err := setSupportedSubsystems(libcontainerCgroupConfig); err != nil {
@@ -461,8 +462,8 @@ func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error {
libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs()
}

if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PodPidsLimit != nil {
libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PodPidsLimit
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PidsLimit != nil {
libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PidsLimit
}

// get the manager with the specified cgroup configuration
@@ -53,6 +53,7 @@ import (
kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
"k8s.io/kubernetes/pkg/kubelet/lifecycle"
"k8s.io/kubernetes/pkg/kubelet/qos"
"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
"k8s.io/kubernetes/pkg/kubelet/status"
"k8s.io/kubernetes/pkg/kubelet/util/pluginwatcher"
schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
@@ -123,6 +124,8 @@ type containerManagerImpl struct {
cgroupManager CgroupManager
// Capacity of this node.
capacity v1.ResourceList
// Capacity of this node, including internal resources.
internalCapacity v1.ResourceList
// Absolute cgroupfs path to a cgroup that Kubelet needs to place all pods under.
// This path include a top level container for enforcing Node Allocatable.
cgroupRoot CgroupName
@@ -219,6 +222,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
}

var capacity = v1.ResourceList{}
var internalCapacity = v1.ResourceList{}
// It is safe to invoke `MachineInfo` on cAdvisor before logically initializing cAdvisor here because
// machine info is computed and cached once as part of cAdvisor object creation.
// But `RootFsInfo` and `ImagesFsInfo` are not available at this moment so they will be called later during manager starts
@@ -227,6 +231,15 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
return nil, err
}
capacity = cadvisor.CapacityFromMachineInfo(machineInfo)
for k, v := range capacity {
internalCapacity[k] = v
}
pidlimits, err := pidlimit.Stats()
if err == nil && pidlimits != nil && pidlimits.MaxPID != nil {
internalCapacity[pidlimit.PIDs] = *resource.NewQuantity(
int64(*pidlimits.MaxPID),
resource.DecimalSI)
}

Review comment from @tedyu (Contributor), Feb 14, 2019:
It seems internalCapacity only needs to be allocated when the above if condition is true. Otherwise capacity can be used.

// Turn CgroupRoot from a string (in cgroupfs path format) to internal CgroupName
cgroupRoot := ParseCgroupfsToCgroupName(nodeConfig.CgroupRoot)
@@ -264,6 +277,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
subsystems: subsystems,
cgroupManager: cgroupManager,
capacity: capacity,
internalCapacity: internalCapacity,
cgroupRoot: cgroupRoot,
recorder: recorder,
qosContainerManager: qosContainerManager,
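The new internalCapacity list is the regular capacity plus resources that are tracked internally but never advertised to the scheduler, currently just PIDs. The diff does not show pidlimit.Stats() itself; the sketch below assumes (and this is an assumption, not confirmed by the hunks here) that MaxPID is derived from the kernel's global ceiling in /proc/sys/kernel/pid_max:

```go
// Sketch of where the node's PID capacity plausibly comes from; the
// /proc/sys/kernel/pid_max source for pidlimit.Stats() is an assumption.
package main

import (
	"fmt"
	"io/ioutil"
	"strconv"
	"strings"
)

func maxPID() (int64, error) {
	b, err := ioutil.ReadFile("/proc/sys/kernel/pid_max")
	if err != nil {
		return 0, err
	}
	return strconv.ParseInt(strings.TrimSpace(string(b)), 10, 64)
}

func main() {
	n, err := maxPID()
	if err != nil {
		fmt.Println("no pid_max available:", err)
		return
	}
	fmt.Println("node PID capacity:", n) // commonly 32768, or 4194304 on larger hosts
}
```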
@@ -28,6 +28,7 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog"
"k8s.io/kubernetes/pkg/kubelet/events"
"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
)

@@ -40,7 +41,7 @@ func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
cgroupConfig := &CgroupConfig{
Name: cm.cgroupRoot,
// The default limits for cpu shares can be very low which can lead to CPU starvation for pods.
ResourceParameters: getCgroupConfig(cm.capacity),
ResourceParameters: getCgroupConfig(cm.internalCapacity),
}
if cm.cgroupManager.Exists(cgroupConfig.Name) {
return nil
@@ -58,10 +59,10 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {

// We need to update limits on node allocatable cgroup no matter what because
// default cpu shares on cgroups are low and can cause cpu starvation.
nodeAllocatable := cm.capacity
nodeAllocatable := cm.internalCapacity
// Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable.
if cm.CgroupsPerQOS && nc.EnforceNodeAllocatable.Has(kubetypes.NodeAllocatableEnforcementKey) {
nodeAllocatable = cm.getNodeAllocatableAbsolute()
nodeAllocatable = cm.getNodeAllocatableInternalAbsolute()
}

klog.V(4).Infof("Attempting to enforce Node Allocatable with config: %+v", nc)
@@ -130,7 +131,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
if cgroupConfig.ResourceParameters == nil {
return fmt.Errorf("%q cgroup is not config properly", cgroupConfig.Name)
}
klog.V(4).Infof("Enforcing limits on cgroup %q with %d cpu shares and %d bytes of memory", cName, cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory)
klog.V(4).Infof("Enforcing limits on cgroup %q with %d cpu shares, %d bytes of memory, and %d processes", cName, cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory, cgroupConfig.ResourceParameters.PidsLimit)
if !cgroupManager.Exists(cgroupConfig.Name) {
return fmt.Errorf("%q cgroup does not exist", cgroupConfig.Name)
}
@@ -157,6 +158,10 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
val := MilliCPUToShares(q.MilliValue())
rc.CpuShares = &val
}
if q, exists := rl[pidlimit.PIDs]; exists {
val := q.Value()
rc.PidsLimit = &val
}
rc.HugePageLimit = HugePageLimits(rl)

return &rc
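Here q.Value() extracts the reserved PID count as a plain int64. The cgroup manager's toResources then forwards rc.PidsLimit to libcontainer as PidsLimit, which the pids controller applies via its pids.max file.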
@@ -166,8 +171,12 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
// Note that not all resources that are available on the node are included in the returned list of resources.
// Returns a ResourceList.
func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
return cm.getNodeAllocatableAbsoluteImpl(cm.capacity)
}

func (cm *containerManagerImpl) getNodeAllocatableAbsoluteImpl(capacity v1.ResourceList) v1.ResourceList {
result := make(v1.ResourceList)
for k, v := range cm.capacity {
for k, v := range capacity {
value := *(v.Copy())
if cm.NodeConfig.SystemReserved != nil {
value.Sub(cm.NodeConfig.SystemReserved[k])
@@ -182,7 +191,13 @@ func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
result[k] = value
}
return result
}

// getNodeAllocatableInternalAbsolute is similar to getNodeAllocatableAbsolute except that
// it also includes internal resources (currently process IDs). It is intended for setting
// up top level cgroups only.
func (cm *containerManagerImpl) getNodeAllocatableInternalAbsolute() v1.ResourceList {
return cm.getNodeAllocatableAbsoluteImpl(cm.internalCapacity)
}

// GetNodeAllocatableReservation returns amount of compute or storage resource that have to be reserved on this node from scheduling.
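Putting the pieces together, the node-level ceiling follows the same per-resource subtraction as cpu and memory: allocatable = capacity minus system-reserved minus kube-reserved. A worked example with illustrative numbers:

```go
// Worked example of the per-resource subtraction in
// getNodeAllocatableAbsoluteImpl, applied to the new pid resource.
package main

import "fmt"

func main() {
	capacity := int64(32768)      // e.g. kernel pid_max
	systemReserved := int64(1000) // --system-reserved=pid=1000
	kubeReserved := int64(1000)   // --kube-reserved=pid=1000
	allocatable := capacity - systemReserved - kubeReserved
	// Written to the node allocatable cgroup when 'pods' enforcement is enabled.
	fmt.Println(allocatable) // 30768
}
```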
@@ -87,7 +87,7 @@ func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod),
}
if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && m.podPidsLimit > 0 {
containerConfig.ResourceParameters.PodPidsLimit = &m.podPidsLimit
containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit
}
if err := m.cgroupManager.Create(containerConfig); err != nil {
return fmt.Errorf("failed to create container for %v : %v", podContainerName, err)
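Per-pod cgroups are unchanged in behavior here: they remain gated solely on SupportPodPidsLimit and the configured pod PIDs limit; only the field name changed. The node gate introduced by this PR governs the node allocatable ceiling, not individual pods.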
@@ -34,7 +34,7 @@ type ResourceConfig struct {
// HugePageLimit map from page size (in bytes) to limit (in bytes)
HugePageLimit map[int64]int64
// Maximum number of pids
PodPidsLimit *int64
PidsLimit *int64
}

// CgroupName is the abstract name of a cgroup prior to any driver specific conversion.
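The rename from PodPidsLimit to PidsLimit reflects that the same ResourceConfig field now carries either a pod-level or a node-level PID ceiling, depending on which cgroup is being configured.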
@@ -10,8 +10,6 @@ go_library(
"helper.go",
"log_metrics_provider.go",
"stats_provider.go",
"stats_provider_linux.go",
"stats_provider_unsupported.go",
],
importpath = "k8s.io/kubernetes/pkg/kubelet/stats",
visibility = ["//visibility:public"],
@@ -26,6 +24,7 @@ go_library(
"//pkg/kubelet/leaky:go_default_library",
"//pkg/kubelet/pod:go_default_library",
"//pkg/kubelet/server/stats:go_default_library",
"//pkg/kubelet/stats/pidlimit:go_default_library",
"//pkg/kubelet/status:go_default_library",
"//pkg/kubelet/types:go_default_library",
"//pkg/volume:go_default_library",
@@ -52,7 +51,10 @@ filegroup(

filegroup(
name = "all-srcs",
srcs = [":package-srcs"],
srcs = [
":package-srcs",
"//pkg/kubelet/stats/pidlimit:all-srcs",
],
tags = ["automanaged"],
visibility = ["//visibility:public"],
)