Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove SYS_RESOURCE capability from launcher pod #2584

Merged
merged 6 commits into from Aug 30, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 6 additions & 9 deletions cmd/chroot/main.go
Expand Up @@ -8,7 +8,6 @@ import (
"strconv"
"strings"
"syscall"
"unsafe"

"github.com/spf13/cobra"
"golang.org/x/sys/unix"
Expand Down Expand Up @@ -61,10 +60,9 @@ func main() {
Cur: uint64(cpuTime),
Max: uint64(cpuTime),
}
// PID 0 means that we assign the limit to us, all commands will inherit it
_, _, e1 := syscall.RawSyscall6(syscall.SYS_PRLIMIT64, uintptr(0), uintptr(unix.RLIMIT_CPU), uintptr(unsafe.Pointer(value)), 0, 0, 0)
if e1 != 0 {
return fmt.Errorf("error setting prlimit on cpu time with value %d: %v", value, e1)
err := syscall.Setrlimit(unix.RLIMIT_CPU, value)
if err != nil {
return fmt.Errorf("error setting prlimit on cpu time with value %d: %v", value, err)
}
}

Expand All @@ -73,10 +71,9 @@ func main() {
Cur: uint64(megabyte) * 1000000,
Max: uint64(megabyte) * 1000000,
}
// PID 0 means that we assign the limit to us, all commands will inherit it
_, _, e1 := syscall.RawSyscall6(syscall.SYS_PRLIMIT64, uintptr(0), uintptr(unix.RLIMIT_AS), uintptr(unsafe.Pointer(value)), 0, 0, 0)
if e1 != 0 {
return fmt.Errorf("error setting prlimit on virtual memory with value %d: %v", value, e1)
err := syscall.Setrlimit(unix.RLIMIT_AS, value)
if err != nil {
return fmt.Errorf("error setting prlimit on virtual memory with value %d: %v", value, err)
}
}

Expand Down
1 change: 1 addition & 0 deletions go.mod
Expand Up @@ -39,6 +39,7 @@ require (
github.com/libvirt/libvirt-go v5.0.0+incompatible
github.com/mattn/go-colorable v0.1.1 // indirect
github.com/mattn/go-runewidth v0.0.0-20181218000649-703b5e6b11ae // indirect
github.com/mitchellh/go-ps v0.0.0-20190716172923-621e5597135b
github.com/onsi/ginkgo v1.8.0
github.com/onsi/gomega v1.5.1-0.20190515112211-6a48b4839f85
github.com/openshift/api v3.9.1-0.20190401220125-3a6077f1f910+incompatible
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Expand Up @@ -194,6 +194,8 @@ github.com/mattn/go-sqlite3 v1.10.0/go.mod h1:FPy6KqzDD04eiIsT53CuJW3U88zkxoIYsO
github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU=
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/maxbrunsfeld/counterfeiter v0.0.0-20181017030959-1aadac120687/go.mod h1:aoVsckWnsNzazwF2kmD+bzgdr4GBlbK91zsdivQJ2eU=
github.com/mitchellh/go-ps v0.0.0-20190716172923-621e5597135b h1:9+ke9YJ9KGWw5ANXK6ozjoK47uI3uNbXv4YVINBnGm8=
github.com/mitchellh/go-ps v0.0.0-20190716172923-621e5597135b/go.mod h1:r1VsdOzOPt1ZSrGZWFoNhsAedKnEd6r9Np1+5blZCWk=
github.com/mitchellh/mapstructure v1.1.2 h1:fmNYVwqnSfB9mZU6OS2O6GsXM+wcskZDuKQzvN1EDeE=
github.com/mitchellh/mapstructure v1.1.2/go.mod h1:FVVH3fgwuzCH5S8UJGiWEs2h04kUh9fWfEaFds41c1Y=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
Expand Down
1 change: 1 addition & 0 deletions pkg/util/BUILD.bazel
Expand Up @@ -8,4 +8,5 @@ go_library(
],
importpath = "kubevirt.io/kubevirt/pkg/util",
visibility = ["//visibility:public"],
deps = ["//staging/src/kubevirt.io/client-go/api/v1:go_default_library"],
)
13 changes: 13 additions & 0 deletions pkg/util/util.go
@@ -1,6 +1,19 @@
package util

import (
v1 "kubevirt.io/client-go/api/v1"
)

const ExtensionAPIServerAuthenticationConfigMap = "extension-apiserver-authentication"
const RequestHeaderClientCAFileKey = "requestheader-client-ca-file"
const VirtShareDir = "/var/run/kubevirt"
const VirtLibDir = "/var/lib/kubevirt"

func IsSRIOVVmi(vmi *v1.VirtualMachineInstance) bool {
for _, iface := range vmi.Spec.Domain.Devices.Interfaces {
if iface.SRIOV != nil {
return true
}
}
return false
}
1 change: 1 addition & 0 deletions pkg/virt-controller/services/BUILD.bazel
Expand Up @@ -10,6 +10,7 @@ go_library(
"//pkg/container-disk:go_default_library",
"//pkg/hooks:go_default_library",
"//pkg/host-disk:go_default_library",
"//pkg/util:go_default_library",
"//pkg/util/hardware:go_default_library",
"//pkg/util/net/dns:go_default_library",
"//pkg/util/types:go_default_library",
Expand Down
19 changes: 2 additions & 17 deletions pkg/virt-controller/services/template.go
Expand Up @@ -42,6 +42,7 @@ import (
"kubevirt.io/kubevirt/pkg/config"
containerdisk "kubevirt.io/kubevirt/pkg/container-disk"
"kubevirt.io/kubevirt/pkg/hooks"
"kubevirt.io/kubevirt/pkg/util"
"kubevirt.io/kubevirt/pkg/util/hardware"
"kubevirt.io/kubevirt/pkg/util/net/dns"
"kubevirt.io/kubevirt/pkg/util/types"
Expand All @@ -60,7 +61,6 @@ const GenieNetworksAnnotation = "cni"

const CAP_NET_ADMIN = "NET_ADMIN"
const CAP_SYS_NICE = "SYS_NICE"
const CAP_SYS_RESOURCE = "SYS_RESOURCE"

// LibvirtStartupDelay is added to custom liveness and readiness probes initial delay value.
// Libvirt needs roughly 10 seconds to start.
Expand Down Expand Up @@ -98,15 +98,6 @@ type templateService struct {

type PvcNotFoundError error

func isSRIOVVmi(vmi *v1.VirtualMachineInstance) bool {
for _, iface := range vmi.Spec.Domain.Devices.Interfaces {
if iface.SRIOV != nil {
return true
}
}
return false
}

func isFeatureStateEnabled(fs *v1.FeatureState) bool {
return fs != nil && fs.Enabled != nil && *fs.Enabled
}
Expand Down Expand Up @@ -355,7 +346,7 @@ func (t *templateService) RenderLaunchManifest(vmi *v1.VirtualMachineInstance) (
MountPath: "/var/run/libvirt",
})

if isSRIOVVmi(vmi) {
if util.IsSRIOVVmi(vmi) {
// libvirt needs this volume to access PCI device config;
// note that the volume should not be read-only because libvirt
// opens the config for writing
Expand Down Expand Up @@ -945,12 +936,6 @@ func getRequiredCapabilities(vmi *v1.VirtualMachineInstance) []k8sv1.Capability
}
// add a CAP_SYS_NICE capability to allow setting cpu affinity
res = append(res, CAP_SYS_NICE)

if isSRIOVVmi(vmi) {
// this capability is needed for libvirt to be able to change ulimits for device passthrough:
// "error : cannot limit locked memory to 2098200576: Operation not permitted"
res = append(res, CAP_SYS_RESOURCE)
}
return res
}

Expand Down
4 changes: 4 additions & 0 deletions pkg/virt-handler/isolation/BUILD.bazel
Expand Up @@ -11,10 +11,14 @@ go_library(
visibility = ["//visibility:public"],
deps = [
"//pkg/container-disk:go_default_library",
"//pkg/util:go_default_library",
"//pkg/virt-handler/cmd-client:go_default_library",
"//staging/src/kubevirt.io/client-go/api/v1:go_default_library",
"//staging/src/kubevirt.io/client-go/log:go_default_library",
"//vendor/github.com/golang/mock/gomock:go_default_library",
"//vendor/github.com/mitchellh/go-ps:go_default_library",
"//vendor/golang.org/x/sys/unix:go_default_library",
"//vendor/k8s.io/apimachinery/pkg/api/resource:go_default_library",
],
)

Expand Down
10 changes: 10 additions & 0 deletions pkg/virt-handler/isolation/generated_mock_isolation.go
Expand Up @@ -61,3 +61,13 @@ func (_m *MockPodIsolationDetector) Whitelist(controller []string) PodIsolationD
func (_mr *_MockPodIsolationDetectorRecorder) Whitelist(arg0 interface{}) *gomock.Call {
return _mr.mock.ctrl.RecordCall(_mr.mock, "Whitelist", arg0)
}

func (_m *MockPodIsolationDetector) AdjustResources(vm *v1.VirtualMachineInstance) error {
ret := _m.ctrl.Call(_m, "AdjustResources", vm)
ret0, _ := ret[0].(error)
return ret0
}

func (_mr *_MockPodIsolationDetectorRecorder) AdjustResources(arg0 interface{}) *gomock.Call {
return _mr.mock.ctrl.RecordCall(_mr.mock, "AdjustResources", arg0)
}
95 changes: 95 additions & 0 deletions pkg/virt-handler/isolation/isolation.go
Expand Up @@ -35,9 +35,16 @@ import (
"path/filepath"
"strings"
"syscall"
"unsafe"

"golang.org/x/sys/unix"

ps "github.com/mitchellh/go-ps"
"k8s.io/apimachinery/pkg/api/resource"

v1 "kubevirt.io/client-go/api/v1"
"kubevirt.io/client-go/log"
"kubevirt.io/kubevirt/pkg/util"
cmdclient "kubevirt.io/kubevirt/pkg/virt-handler/cmd-client"
)

Expand All @@ -53,6 +60,9 @@ type PodIsolationDetector interface {
// Whitelist allows specifying cgroup controller which should be considered to detect the cgroup slice
// It returns a PodIsolationDetector to allow configuring the PodIsolationDetector via the builder pattern.
Whitelist(controller []string) PodIsolationDetector

// Adjust system resources to run the passed VM
AdjustResources(vm *v1.VirtualMachineInstance) error
}

type MountInfo struct {
Expand Down Expand Up @@ -120,6 +130,91 @@ func (s *socketBasedIsolationDetector) Detect(vm *v1.VirtualMachineInstance) (*I
return NewIsolationResult(pid, slice, controller), nil
}

// standard golang libraries don't provide API to set runtime limits
// for other processes, so we have to directly call to kernel
func prLimit(pid int, limit uintptr, rlimit *unix.Rlimit) error {
_, _, errno := unix.RawSyscall6(unix.SYS_PRLIMIT64,
phoracek marked this conversation as resolved.
Show resolved Hide resolved
uintptr(pid),
limit,
uintptr(unsafe.Pointer(rlimit)),
0, 0, 0)
if errno != 0 {
return fmt.Errorf("Error setting prlimit: %v", errno)
}
return nil
}

func (s *socketBasedIsolationDetector) AdjustResources(vm *v1.VirtualMachineInstance) error {
// only VFIO attached domains require MEMLOCK adjustment
if !util.IsSRIOVVmi(vm) {
return nil
}

// bump memlock ulimit for libvirtd
res, err := s.Detect(vm)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could we use a more specific name than result?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(won't block the PR on this one, if you answer no no no to my comments, this PR is good to go)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it's of "IsolationResult" type so I thought it's a good enough name. I am ok with renaming. What would be a better name?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it is being used only in this block and i don't really know. since the other comments are resolved, i wont bother

if err != nil {
return err
}
launcherPid := res.Pid()

processes, err := ps.Processes()
if err != nil {
return fmt.Errorf("failed to get all processes: %v", err)
}

for _, process := range processes {
// consider all processes that are virt-launcher children
if process.PPid() != launcherPid {
continue
}

// libvirtd process sets the memory lock limit before fork/exec-ing into qemu
if process.Executable() != "libvirtd" {
continue
}

// make the best estimate for memory required by libvirt
memlockSize, err := getMemlockSize(vm)
if err != nil {
return err
}
rLimit := unix.Rlimit{
Max: uint64(memlockSize),
Cur: uint64(memlockSize),
}
err = prLimit(process.Pid(), unix.RLIMIT_MEMLOCK, &rLimit)
if err != nil {
return fmt.Errorf("failed to set rlimit for memory lock: %v", err)
}
// we assume a single process should match
phoracek marked this conversation as resolved.
Show resolved Hide resolved
break
}
return nil
}

// consider reusing getMemoryOverhead()
// This is not scientific, but neither what libvirtd does is. See details in:
// https://www.redhat.com/archives/libvirt-users/2019-August/msg00051.html
func getMemlockSize(vm *v1.VirtualMachineInstance) (int64, error) {
memlockSize := resource.NewQuantity(0, resource.DecimalSI)

// start with base memory requested for the VM
vmiMemoryReq := vm.Spec.Domain.Resources.Requests.Memory()
memlockSize.Add(*resource.NewScaledQuantity(vmiMemoryReq.ScaledValue(resource.Kilo), resource.Kilo))

// allocate 1Gb for VFIO needs
memlockSize.Add(resource.MustParse("1G"))
booxter marked this conversation as resolved.
Show resolved Hide resolved

// add some more memory for NUMA / CPU topology, platform memory alignment and other needs
memlockSize.Add(resource.MustParse("256M"))

bytes_, ok := memlockSize.AsInt64()
if !ok {
return 0, fmt.Errorf("could not calculate memory lock size")
}
return bytes_, nil
}

func NewIsolationResult(pid int, slice string, controller []string) *IsolationResult {
return &IsolationResult{pid: pid, slice: slice, controller: controller}
}
Expand Down
11 changes: 11 additions & 0 deletions pkg/virt-handler/isolation/isolation_test.go
Expand Up @@ -73,3 +73,14 @@ var _ = Describe("Isolation", func() {
})
})
})

var _ = Describe("getMemlockSize", func() {
vm := v1.NewMinimalVMIWithNS("default", "testvm")

It("Should return correct number of bytes for memlock limit", func() {
bytes_, err := getMemlockSize(vm)
Expect(err).ToNot(HaveOccurred())
// 1Gb (static part for vfio VMs) + 256Mb (estimated overhead) + 8 Mb (VM)
Expect(int(bytes_)).To(Equal(1264389000))
})
})
5 changes: 5 additions & 0 deletions pkg/virt-handler/vm.go
Expand Up @@ -1414,6 +1414,11 @@ func (d *VirtualMachineController) processVmUpdate(origVMI *v1.VirtualMachineIns
}
}

err = d.podIsolationDetector.AdjustResources(vmi)
if err != nil {
return fmt.Errorf("failed to adjust resources: %v", err)
}

options := &cmdv1.VirtualMachineOptions{
VirtualMachineSMBios: &cmdv1.SMBios{
Family: d.clusterConfig.GetSMBIOS().Family,
Expand Down
7 changes: 6 additions & 1 deletion pkg/virt-handler/vm_test.go
Expand Up @@ -73,6 +73,7 @@ var _ = Describe("VirtualMachineInstance", func() {
var mockQueue *testutils.MockWorkQueue
var mockWatchdog *MockWatchdog
var mockGracefulShutdown *MockGracefulShutdown
var mockIsolationDetector *isolation.MockPodIsolationDetector

var vmiFeeder *testutils.VirtualMachineFeeder
var domainFeeder *testutils.DomainFeeder
Expand Down Expand Up @@ -127,6 +128,10 @@ var _ = Describe("VirtualMachineInstance", func() {
mockGracefulShutdown = &MockGracefulShutdown{shareDir}
config, _, _ := testutils.NewFakeClusterConfig(&k8sv1.ConfigMap{})

mockIsolationDetector = isolation.NewMockPodIsolationDetector(ctrl)
mockIsolationDetector.EXPECT().Detect(gomock.Any()).Return(&isolation.IsolationResult{}, nil).AnyTimes()
mockIsolationDetector.EXPECT().AdjustResources(gomock.Any()).Return(nil).AnyTimes()

controller = NewController(recorder,
virtClient,
host,
Expand All @@ -140,7 +145,7 @@ var _ = Describe("VirtualMachineInstance", func() {
10,
config,
tlsConfig,
isolation.NewSocketBasedIsolationDetector(shareDir),
mockIsolationDetector,
)

testUUID = uuid.NewUUID()
Expand Down
2 changes: 1 addition & 1 deletion pkg/virt-operator/creation/components/scc.go
Expand Up @@ -74,7 +74,7 @@ func NewKubeVirtControllerSCC(namespace string) *secv1.SecurityContextConstraint
scc.SELinuxContext = secv1.SELinuxContextStrategyOptions{
Type: secv1.SELinuxStrategyRunAsAny,
}
scc.AllowedCapabilities = []corev1.Capability{"NET_ADMIN", "SYS_NICE", "SYS_RESOURCE"}
scc.AllowedCapabilities = []corev1.Capability{"NET_ADMIN", "SYS_NICE"}
scc.AllowHostDirVolumePlugin = true
scc.Users = []string{fmt.Sprintf("system:serviceaccount:%s:kubevirt-controller", namespace)}

Expand Down
1 change: 1 addition & 0 deletions vendor/github.com/mitchellh/go-ps/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions vendor/github.com/mitchellh/go-ps/.travis.yml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions vendor/github.com/mitchellh/go-ps/BUILD.bazel

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 21 additions & 0 deletions vendor/github.com/mitchellh/go-ps/LICENSE.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.