From 1034cd29fac5a044a506be25c056fc857e5f291c Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Thu, 15 Aug 2019 13:23:30 -0700 Subject: [PATCH 1/6] Use syscall.Setrlimit instead of direct RawSyscall6 This syscall is implemented in libraries we use anyway, so we can easily avoid dealing with unsafe pointers etc. --- cmd/chroot/main.go | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/cmd/chroot/main.go b/cmd/chroot/main.go index 457d4683f380..0e552fa2f2a0 100644 --- a/cmd/chroot/main.go +++ b/cmd/chroot/main.go @@ -8,7 +8,6 @@ import ( "strconv" "strings" "syscall" - "unsafe" "github.com/spf13/cobra" "golang.org/x/sys/unix" @@ -61,10 +60,9 @@ func main() { Cur: uint64(cpuTime), Max: uint64(cpuTime), } - // PID 0 means that we assign the limit to us, all commands will inherit it - _, _, e1 := syscall.RawSyscall6(syscall.SYS_PRLIMIT64, uintptr(0), uintptr(unix.RLIMIT_CPU), uintptr(unsafe.Pointer(value)), 0, 0, 0) - if e1 != 0 { - return fmt.Errorf("error setting prlimit on cpu time with value %d: %v", value, e1) + err := syscall.Setrlimit(unix.RLIMIT_CPU, value) + if err != nil { + return fmt.Errorf("error setting prlimit on cpu time with value %d: %v", value, err) } } @@ -73,10 +71,9 @@ func main() { Cur: uint64(megabyte) * 1000000, Max: uint64(megabyte) * 1000000, } - // PID 0 means that we assign the limit to us, all commands will inherit it - _, _, e1 := syscall.RawSyscall6(syscall.SYS_PRLIMIT64, uintptr(0), uintptr(unix.RLIMIT_AS), uintptr(unsafe.Pointer(value)), 0, 0, 0) - if e1 != 0 { - return fmt.Errorf("error setting prlimit on virtual memory with value %d: %v", value, e1) + err := syscall.Setrlimit(unix.RLIMIT_AS, value) + if err != nil { + return fmt.Errorf("error setting prlimit on virtual memory with value %d: %v", value, err) } } From dc7fa755b2f8ba8c5c50a094a2ff632cc6755903 Mon Sep 17 00:00:00 2001 From: Ihar Hrachyshka Date: Tue, 6 Aug 2019 13:47:33 -0700 Subject: [PATCH 2/6] Remove SYS_RESOURCE capability from launcher pod Instead, set the (unlimited) limit for libvirtd from handler pod that is already privileged. --- go.mod | 1 + go.sum | 2 + pkg/virt-controller/services/template.go | 7 - pkg/virt-handler/isolation/BUILD.bazel | 2 + .../isolation/generated_mock_isolation.go | 10 + pkg/virt-handler/isolation/isolation.go | 61 ++++ pkg/virt-handler/vm.go | 5 + pkg/virt-handler/vm_test.go | 22 +- pkg/virt-operator/creation/components/scc.go | 2 +- vendor/github.com/mitchellh/go-ps/.gitignore | 1 + vendor/github.com/mitchellh/go-ps/.travis.yml | 4 + vendor/github.com/mitchellh/go-ps/BUILD.bazel | 17 ++ vendor/github.com/mitchellh/go-ps/LICENSE.md | 21 ++ vendor/github.com/mitchellh/go-ps/README.md | 34 +++ vendor/github.com/mitchellh/go-ps/Vagrantfile | 43 +++ vendor/github.com/mitchellh/go-ps/process.go | 40 +++ .../mitchellh/go-ps/process_darwin.go | 138 ++++++++++ .../mitchellh/go-ps/process_freebsd.go | 260 ++++++++++++++++++ .../mitchellh/go-ps/process_linux.go | 35 +++ .../mitchellh/go-ps/process_solaris.go | 96 +++++++ .../mitchellh/go-ps/process_unix.go | 101 +++++++ .../mitchellh/go-ps/process_windows.go | 119 ++++++++ vendor/modules.txt | 28 +- 23 files changed, 1027 insertions(+), 22 deletions(-) create mode 100644 vendor/github.com/mitchellh/go-ps/.gitignore create mode 100644 vendor/github.com/mitchellh/go-ps/.travis.yml create mode 100644 vendor/github.com/mitchellh/go-ps/BUILD.bazel create mode 100644 vendor/github.com/mitchellh/go-ps/LICENSE.md create mode 100644 vendor/github.com/mitchellh/go-ps/README.md create mode 100644 vendor/github.com/mitchellh/go-ps/Vagrantfile create mode 100644 vendor/github.com/mitchellh/go-ps/process.go create mode 100644 vendor/github.com/mitchellh/go-ps/process_darwin.go create mode 100644 vendor/github.com/mitchellh/go-ps/process_freebsd.go create mode 100644 vendor/github.com/mitchellh/go-ps/process_linux.go create mode 100644 vendor/github.com/mitchellh/go-ps/process_solaris.go create mode 100644 vendor/github.com/mitchellh/go-ps/process_unix.go create mode 100644 vendor/github.com/mitchellh/go-ps/process_windows.go diff --git a/go.mod b/go.mod index b7e7d63c649b..a477f1c4c3d7 100644 --- a/go.mod +++ b/go.mod @@ -39,6 +39,7 @@ require ( github.com/libvirt/libvirt-go v5.0.0+incompatible github.com/mattn/go-colorable v0.1.1 // indirect github.com/mattn/go-runewidth v0.0.0-20181218000649-703b5e6b11ae // indirect + github.com/mitchellh/go-ps v0.0.0-20190716172923-621e5597135b github.com/onsi/ginkgo v1.8.0 github.com/onsi/gomega v1.5.1-0.20190515112211-6a48b4839f85 github.com/openshift/api v3.9.1-0.20190401220125-3a6077f1f910+incompatible diff --git a/go.sum b/go.sum index fab0bea414a3..fa6f25cad09a 100644 --- a/go.sum +++ b/go.sum @@ -194,6 +194,8 @@ github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsO github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/maxbrunsfeld/counterfeiter v0.0.0-20181017030959-1aadac120687/go.mod h1:aoVsckWnsNzazwF2kmD+bzgdr4GBlbK91zsdivQJ2eU= +github.com/mitchellh/go-ps v0.0.0-20190716172923-621e5597135b h1:9+ke9YJ9KGWw5ANXK6ozjoK47uI3uNbXv4YVINBnGm8= +github.com/mitchellh/go-ps v0.0.0-20190716172923-621e5597135b/go.mod h1:r1VsdOzOPt1ZSrGZWFoNhsAedKnEd6r9Np1+5blZCWk= github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQzvN1EDeE= github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= diff --git a/pkg/virt-controller/services/template.go b/pkg/virt-controller/services/template.go index a7a90c40ddc5..8a5707c8a093 100644 --- a/pkg/virt-controller/services/template.go +++ b/pkg/virt-controller/services/template.go @@ -60,7 +60,6 @@ const GenieNetworksAnnotation = "cni" const CAP_NET_ADMIN = "NET_ADMIN" const CAP_SYS_NICE = "SYS_NICE" -const CAP_SYS_RESOURCE = "SYS_RESOURCE" // LibvirtStartupDelay is added to custom liveness and readiness probes initial delay value. // Libvirt needs roughly 10 seconds to start. @@ -945,12 +944,6 @@ func getRequiredCapabilities(vmi *v1.VirtualMachineInstance) []k8sv1.Capability } // add a CAP_SYS_NICE capability to allow setting cpu affinity res = append(res, CAP_SYS_NICE) - - if isSRIOVVmi(vmi) { - // this capability is needed for libvirt to be able to change ulimits for device passthrough: - // "error : cannot limit locked memory to 2098200576: Operation not permitted" - res = append(res, CAP_SYS_RESOURCE) - } return res } diff --git a/pkg/virt-handler/isolation/BUILD.bazel b/pkg/virt-handler/isolation/BUILD.bazel index 4ff5a1bba3c9..6a32631ae04a 100644 --- a/pkg/virt-handler/isolation/BUILD.bazel +++ b/pkg/virt-handler/isolation/BUILD.bazel @@ -15,6 +15,8 @@ go_library( "//staging/src/kubevirt.io/client-go/api/v1:go_default_library", "//staging/src/kubevirt.io/client-go/log:go_default_library", "//vendor/github.com/golang/mock/gomock:go_default_library", + "//vendor/github.com/mitchellh/go-ps:go_default_library", + "//vendor/golang.org/x/sys/unix:go_default_library", ], ) diff --git a/pkg/virt-handler/isolation/generated_mock_isolation.go b/pkg/virt-handler/isolation/generated_mock_isolation.go index ad98ad9bf604..37c5f32b3a9b 100644 --- a/pkg/virt-handler/isolation/generated_mock_isolation.go +++ b/pkg/virt-handler/isolation/generated_mock_isolation.go @@ -61,3 +61,13 @@ func (_m *MockPodIsolationDetector) Whitelist(controller []string) PodIsolationD func (_mr *_MockPodIsolationDetectorRecorder) Whitelist(arg0 interface{}) *gomock.Call { return _mr.mock.ctrl.RecordCall(_mr.mock, "Whitelist", arg0) } + +func (_m *MockPodIsolationDetector) AdjustResources(vm *v1.VirtualMachineInstance) error { + ret := _m.ctrl.Call(_m, "AdjustResources", vm) + ret0, _ := ret[0].(error) + return ret0 +} + +func (_mr *_MockPodIsolationDetectorRecorder) AdjustResources(arg0 interface{}) *gomock.Call { + return _mr.mock.ctrl.RecordCall(_mr.mock, "AdjustResources", arg0) +} diff --git a/pkg/virt-handler/isolation/isolation.go b/pkg/virt-handler/isolation/isolation.go index 2c1d4bb95ef8..399be4a40e72 100644 --- a/pkg/virt-handler/isolation/isolation.go +++ b/pkg/virt-handler/isolation/isolation.go @@ -35,6 +35,11 @@ import ( "path/filepath" "strings" "syscall" + "unsafe" + + "golang.org/x/sys/unix" + + ps "github.com/mitchellh/go-ps" v1 "kubevirt.io/client-go/api/v1" "kubevirt.io/client-go/log" @@ -53,6 +58,9 @@ type PodIsolationDetector interface { // Whitelist allows specifying cgroup controller which should be considered to detect the cgroup slice // It returns a PodIsolationDetector to allow configuring the PodIsolationDetector via the builder pattern. Whitelist(controller []string) PodIsolationDetector + + // Adjust system resources to run the passed VM + AdjustResources(vm *v1.VirtualMachineInstance) error } type MountInfo struct { @@ -120,6 +128,59 @@ func (s *socketBasedIsolationDetector) Detect(vm *v1.VirtualMachineInstance) (*I return NewIsolationResult(pid, slice, controller), nil } +// standard golang libraries don't provide API to set runtime limits +// for other processes, so we have to directly call to kernel +func prLimit(pid int, limit uintptr, rlimit *unix.Rlimit) error { + _, _, errno := unix.RawSyscall6(unix.SYS_PRLIMIT64, + uintptr(pid), + limit, + uintptr(unsafe.Pointer(rlimit)), + 0, 0, 0) + if errno != 0 { + return fmt.Errorf("Error setting prlimit: %v", errno) + } + return nil +} + +func (s *socketBasedIsolationDetector) AdjustResources(vm *v1.VirtualMachineInstance) error { + // bump memlock ulimit for libvirtd + res, err := s.Detect(vm) + if err != nil { + return err + } + launcherPid := res.Pid() + + processes, err := ps.Processes() + if err != nil { + return fmt.Errorf("failed to get all processes: %v", err) + } + + for _, process := range processes { + // consider all processes that are virt-launcher children + if process.PPid() != launcherPid { + continue + } + + // libvirtd process sets the memory lock limit before fork/exec-ing into qemu + if process.Executable() != "libvirtd" { + continue + } + + // TODO estimate actual memory size needed + rLimit := unix.Rlimit{ + Max: unix.RLIM_INFINITY, + Cur: unix.RLIM_INFINITY, + } + err = prLimit(process.Pid(), unix.RLIMIT_MEMLOCK, &rLimit) + if err != nil { + return fmt.Errorf("failed to set rlimit for memory lock: %v", err) + } + // we assume a single process should match + break + } + return nil +} + func NewIsolationResult(pid int, slice string, controller []string) *IsolationResult { return &IsolationResult{pid: pid, slice: slice, controller: controller} } diff --git a/pkg/virt-handler/vm.go b/pkg/virt-handler/vm.go index c01612122e45..0b24019d47e9 100644 --- a/pkg/virt-handler/vm.go +++ b/pkg/virt-handler/vm.go @@ -1414,6 +1414,11 @@ func (d *VirtualMachineController) processVmUpdate(origVMI *v1.VirtualMachineIns } } + err = d.podIsolationDetector.AdjustResources(vmi) + if err != nil { + return fmt.Errorf("failed to adjust resources: %v", err) + } + options := &cmdv1.VirtualMachineOptions{ VirtualMachineSMBios: &cmdv1.SMBios{ Family: d.clusterConfig.GetSMBIOS().Family, diff --git a/pkg/virt-handler/vm_test.go b/pkg/virt-handler/vm_test.go index 0f2628dcd105..bd6c9fd8b75a 100644 --- a/pkg/virt-handler/vm_test.go +++ b/pkg/virt-handler/vm_test.go @@ -73,6 +73,7 @@ var _ = Describe("VirtualMachineInstance", func() { var mockQueue *testutils.MockWorkQueue var mockWatchdog *MockWatchdog var mockGracefulShutdown *MockGracefulShutdown + var mockIsolationDetector *MockIsolationDetector var vmiFeeder *testutils.VirtualMachineFeeder var domainFeeder *testutils.DomainFeeder @@ -127,6 +128,7 @@ var _ = Describe("VirtualMachineInstance", func() { mockGracefulShutdown = &MockGracefulShutdown{shareDir} config, _, _ := testutils.NewFakeClusterConfig(&k8sv1.ConfigMap{}) + mockIsolationDetector = &MockIsolationDetector{} controller = NewController(recorder, virtClient, host, @@ -140,7 +142,7 @@ var _ = Describe("VirtualMachineInstance", func() { 10, config, tlsConfig, - isolation.NewSocketBasedIsolationDetector(shareDir), + mockIsolationDetector, ) testUUID = uuid.NewUUID() @@ -1302,6 +1304,24 @@ func (m *MockGracefulShutdown) TriggerShutdown(vmi *v1.VirtualMachineInstance) { Expect(err).NotTo(HaveOccurred()) } +type MockIsolationDetector struct{} + +func (_m *MockIsolationDetector) Detect(vm *v1.VirtualMachineInstance) (*isolation.IsolationResult, error) { + return &isolation.IsolationResult{}, nil +} + +func (_m *MockIsolationDetector) DetectForSocket(vm *v1.VirtualMachineInstance, socket string) (*isolation.IsolationResult, error) { + return nil, nil +} + +func (_m *MockIsolationDetector) Whitelist(controller []string) isolation.PodIsolationDetector { + return _m +} + +func (_m *MockIsolationDetector) AdjustResources(vm *v1.VirtualMachineInstance) error { + return nil +} + type MockWatchdog struct { baseDir string } diff --git a/pkg/virt-operator/creation/components/scc.go b/pkg/virt-operator/creation/components/scc.go index 3d156771cbb3..51ad11b4e094 100644 --- a/pkg/virt-operator/creation/components/scc.go +++ b/pkg/virt-operator/creation/components/scc.go @@ -74,7 +74,7 @@ func NewKubeVirtControllerSCC(namespace string) *secv1.SecurityContextConstraint scc.SELinuxContext = secv1.SELinuxContextStrategyOptions{ Type: secv1.SELinuxStrategyRunAsAny, } - scc.AllowedCapabilities = []corev1.Capability{"NET_ADMIN", "SYS_NICE", "SYS_RESOURCE"} + scc.AllowedCapabilities = []corev1.Capability{"NET_ADMIN", "SYS_NICE"} scc.AllowHostDirVolumePlugin = true scc.Users = []string{fmt.Sprintf("system:serviceaccount:%s:kubevirt-controller", namespace)} diff --git a/vendor/github.com/mitchellh/go-ps/.gitignore b/vendor/github.com/mitchellh/go-ps/.gitignore new file mode 100644 index 000000000000..a977916f6583 --- /dev/null +++ b/vendor/github.com/mitchellh/go-ps/.gitignore @@ -0,0 +1 @@ +.vagrant/ diff --git a/vendor/github.com/mitchellh/go-ps/.travis.yml b/vendor/github.com/mitchellh/go-ps/.travis.yml new file mode 100644 index 000000000000..8f794f71da43 --- /dev/null +++ b/vendor/github.com/mitchellh/go-ps/.travis.yml @@ -0,0 +1,4 @@ +language: go + +go: + - 1.2.1 diff --git a/vendor/github.com/mitchellh/go-ps/BUILD.bazel b/vendor/github.com/mitchellh/go-ps/BUILD.bazel new file mode 100644 index 000000000000..9e24fb5e23f4 --- /dev/null +++ b/vendor/github.com/mitchellh/go-ps/BUILD.bazel @@ -0,0 +1,17 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "go_default_library", + srcs = [ + "process.go", + "process_darwin.go", + "process_freebsd.go", + "process_linux.go", + "process_solaris.go", + "process_unix.go", + "process_windows.go", + ], + importmap = "kubevirt.io/kubevirt/vendor/github.com/mitchellh/go-ps", + importpath = "github.com/mitchellh/go-ps", + visibility = ["//visibility:public"], +) diff --git a/vendor/github.com/mitchellh/go-ps/LICENSE.md b/vendor/github.com/mitchellh/go-ps/LICENSE.md new file mode 100644 index 000000000000..229851590442 --- /dev/null +++ b/vendor/github.com/mitchellh/go-ps/LICENSE.md @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014 Mitchell Hashimoto + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/vendor/github.com/mitchellh/go-ps/README.md b/vendor/github.com/mitchellh/go-ps/README.md new file mode 100644 index 000000000000..4e3d0e146343 --- /dev/null +++ b/vendor/github.com/mitchellh/go-ps/README.md @@ -0,0 +1,34 @@ +# Process List Library for Go [![GoDoc](https://godoc.org/github.com/mitchellh/go-ps?status.png)](https://godoc.org/github.com/mitchellh/go-ps) + +go-ps is a library for Go that implements OS-specific APIs to list and +manipulate processes in a platform-safe way. The library can find and +list processes on Linux, Mac OS X, Solaris, and Windows. + +If you're new to Go, this library has a good amount of advanced Go educational +value as well. It uses some advanced features of Go: build tags, accessing +DLL methods for Windows, cgo for Darwin, etc. + +How it works: + + * **Darwin** uses the `sysctl` syscall to retrieve the process table. + * **Unix** uses the procfs at `/proc` to inspect the process tree. + * **Windows** uses the Windows API, and methods such as + `CreateToolhelp32Snapshot` to get a point-in-time snapshot of + the process table. + +## Installation + +Install using standard `go get`: + +``` +$ go get github.com/mitchellh/go-ps +... +``` + +## TODO + +Want to contribute? Here is a short TODO list of things that aren't +implemented for this library that would be nice: + + * FreeBSD support + * Plan9 support diff --git a/vendor/github.com/mitchellh/go-ps/Vagrantfile b/vendor/github.com/mitchellh/go-ps/Vagrantfile new file mode 100644 index 000000000000..61662ab1e3e7 --- /dev/null +++ b/vendor/github.com/mitchellh/go-ps/Vagrantfile @@ -0,0 +1,43 @@ +# -*- mode: ruby -*- +# vi: set ft=ruby : + +# Vagrantfile API/syntax version. Don't touch unless you know what you're doing! +VAGRANTFILE_API_VERSION = "2" + +Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| + config.vm.box = "chef/ubuntu-12.04" + + config.vm.provision "shell", inline: $script + + ["vmware_fusion", "vmware_workstation"].each do |p| + config.vm.provider "p" do |v| + v.vmx["memsize"] = "1024" + v.vmx["numvcpus"] = "2" + v.vmx["cpuid.coresPerSocket"] = "1" + end + end +end + +$script = <