runtime: added vcpus pinning logics

Core VCPU threads pinning logics for issue 4476. Also provided docs. Fixes:#4476 Signed-off-by: LitFlwr0 <861690705@qq.com>
kata-containers · Nov 4, 2022 · 2508d39 · 2508d39
1 parent 288e337
commit 2508d39
Show file tree

Hide file tree

Showing 9 changed files with 150 additions and 0 deletions.
diff --git a/docs/design/README.md b/docs/design/README.md
@@ -7,6 +7,7 @@ Kata Containers design documents:
 - [Design requirements for Kata Containers](kata-design-requirements.md)
 - [VSocks](VSocks.md)
 - [VCPU handling](vcpu-handling.md)
+- [VCPU threads pinning](vcpu-threads-pinning.md)
 - [Host cgroups](host-cgroups.md)
 - [`Inotify` support](inotify.md)
 - [Metrics(Kata 2.0)](kata-2-0-metrics.md)

diff --git a/docs/design/arch-images/vcpus-pinning-process.png b/docs/design/arch-images/vcpus-pinning-process.png
diff --git a/docs/design/vcpu-threads-pinning.md b/docs/design/vcpu-threads-pinning.md
@@ -0,0 +1,37 @@
+# Design Doc for Kata Containers' VCPUs Pinning Feature
+
+## Background
+By default, the vCPU threads of a Kata Containers sandbox are scheduled onto arbitrary host CPUs. Each pod can also request a specific set of CPUs, which we call its CPU set (the same meaning as a CPU set in Linux cgroups).
+
+If the number of vCPU threads is equal to the number of CPUs in the CPU set, we can pin each vCPU thread to one specific CPU, which reduces the cost of random scheduling.
+
+## Detailed Design
+
+### Passing Config Parameters
+There are two ways to enable the vCPU thread pinning feature: through the `QEMU` configuration file or through annotations. Either way, the pinning parameter is ultimately passed to `HypervisorConfig`.
+
+### Related Linux Thread Scheduling API
+
+| API Info          | Value                                                     |
+|-------------------|-----------------------------------------------------------|
+| Package           | `golang.org/x/sys/unix`                                     |
+| Method            | `unix.SchedSetaffinity(thread_id, &unixCPUSet)`             |
+| Official Doc Page | https://pkg.go.dev/golang.org/x/sys/unix#SchedSetaffinity |
+
+### When is VCPUs Pinning Checked?
+
+As described in the Background section, when `num(vCPU threads) == num(CPUs in CPU set)`, each vCPU thread is pinned to a specific CPU. When this condition no longer holds, the original random scheduling pattern is restored.  
+So when may `num(CPUs in CPU set)` change? There are 5 possible scenarios:
+
+| Possible scenarios                | Related Code                                  |
+|-----------------------------------|-----------------------------------------------|
+| when creating a container         | File sandbox.go, in method `CreateContainer`  |
+| when starting a container         | File sandbox.go, in method `StartContainer`   |
+| when deleting a container         | File sandbox.go, in method `DeleteContainer`  |
+| when updating a container         | File sandbox.go, in method `UpdateContainer`  |
+| when creating multiple containers | File sandbox.go, in method `createContainers` |
+
+### Core Pinning Logic
+
+The whole process can be split into the following steps. The related methods are `checkVCPUsPinning` and `resetVCPUsPinning`, in file sandbox.go.
+![vCPUs pinning process](arch-images/vcpus-pinning-process.png)
diff --git a/src/runtime/config/configuration-qemu.toml.in b/src/runtime/config/configuration-qemu.toml.in
@@ -96,6 +96,11 @@ machine_accelerators="@MACHINEACCELERATORS@"
 # For example, `cpu_features = "pmu=off,vmx=off"
 cpu_features="@CPUFEATURES@"
 
+# vCPUs pinning settings
+# If enabled, each vCPU thread is pinned to a fixed host CPU.
+# Pinning is applied only when num(vCPU threads) == num(CPUs in sandbox's CPUSet).
+# enable_vcpus_pinning = false
+
 # Default number of vCPUs per SB/VM:
 # unspecified or 0                --> will be set to @DEFVCPUS@
 # < 0                             --> will be set to the actual number of physical cores

diff --git a/src/runtime/pkg/katautils/config.go b/src/runtime/pkg/katautils/config.go
@@ -155,6 +155,7 @@ type hypervisor struct {
 	DisableSeccomp                 bool     `toml:"disable_seccomp"`
 	DisableSeLinux                 bool     `toml:"disable_selinux"`
 	LegacySerial                   bool     `toml:"use_legacy_serial"`
+	EnableVCPUsPinning             bool     `toml:"enable_vcpus_pinning"`
 }
 
 type runtime struct {
@@ -833,6 +834,7 @@ func newQemuHypervisorConfig(h hypervisor) (vc.HypervisorConfig, error) {
 		Rootless:                h.Rootless,
 		LegacySerial:            h.LegacySerial,
 		DisableSeLinux:          h.DisableSeLinux,
+		EnableVCPUsPinning:      h.EnableVCPUsPinning,
 	}, nil
 }
 

diff --git a/src/runtime/pkg/oci/utils.go b/src/runtime/pkg/oci/utils.go
@@ -651,6 +651,12 @@ func addHypervisorCPUOverrides(ocispec specs.Spec, sbConfig *vc.SandboxConfig) e
 		return err
 	}
 
+	if err := newAnnotationConfiguration(ocispec, vcAnnotations.EnableVCPUsPinning).setBool(func(enableVCPUsPinning bool) {
+		sbConfig.HypervisorConfig.EnableVCPUsPinning = enableVCPUsPinning
+	}); err != nil {
+		return err
+	}
+
 	return newAnnotationConfiguration(ocispec, vcAnnotations.DefaultMaxVCPUs).setUintWithCheck(func(maxVCPUs uint64) error {
 		max := uint32(maxVCPUs)
 

diff --git a/src/runtime/virtcontainers/hypervisor.go b/src/runtime/virtcontainers/hypervisor.go
@@ -559,6 +559,9 @@ type HypervisorConfig struct {
 
 	// Use legacy serial for the guest console
 	LegacySerial bool
+
+	// EnableVCPUsPinning controls whether each vCPU thread should be scheduled to a fixed CPU
+	EnableVCPUsPinning bool
 }
 
 // vcpu mapping from vcpu number to thread number

diff --git a/src/runtime/virtcontainers/pkg/annotations/annotations.go b/src/runtime/virtcontainers/pkg/annotations/annotations.go
@@ -143,6 +143,9 @@ const (
 	// DefaultVCPUs is a sandbox annotation that specifies the maximum number of vCPUs allocated for the VM by the hypervisor.
 	DefaultMaxVCPUs = kataAnnotHypervisorPrefix + "default_max_vcpus"
 
+	// EnableVCPUsPinning is a sandbox annotation that controls whether vCPU threads are pinned to CPUs
+	EnableVCPUsPinning = kataAnnotationsPrefix + "enable_vcpus_pinning"
+
 	//
 	//	Memory related annotations
 	//

diff --git a/src/runtime/virtcontainers/sandbox.go b/src/runtime/virtcontainers/sandbox.go
@@ -44,6 +44,7 @@ import (
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/rootless"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
 	"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/utils"
+	"golang.org/x/sys/unix"
 )
 
 // sandboxTracingTags defines tags for the trace span
@@ -236,6 +237,7 @@ type Sandbox struct {
 	sharePidNs        bool
 	seccompSupported  bool
 	disableVMShutdown bool
+	isVCPUsPinningOn  bool
 }
 
 // ID returns the sandbox identifier string.
@@ -1353,6 +1355,10 @@ func (s *Sandbox) CreateContainer(ctx context.Context, contConfig ContainerConfi
 		return nil, err
 	}
 
+	if err = s.checkVCPUsPinning(ctx); err != nil {
+		return nil, err
+	}
+
 	if err = s.storeSandbox(ctx); err != nil {
 		return nil, err
 	}
@@ -1385,6 +1391,10 @@ func (s *Sandbox) StartContainer(ctx context.Context, containerID string) (VCCon
 		return nil, err
 	}
 
+	if err = s.checkVCPUsPinning(ctx); err != nil {
+		return nil, err
+	}
+
 	return c, nil
 }
 
@@ -1457,6 +1467,10 @@ func (s *Sandbox) DeleteContainer(ctx context.Context, containerID string) (VCCo
 		return nil, err
 	}
 
+	if err = s.checkVCPUsPinning(ctx); err != nil {
+		return nil, err
+	}
+
 	if err = s.storeSandbox(ctx); err != nil {
 		return nil, err
 	}
@@ -1522,6 +1536,10 @@ func (s *Sandbox) UpdateContainer(ctx context.Context, containerID string, resou
 		return err
 	}
 
+	if err = s.checkVCPUsPinning(ctx); err != nil {
+		return err
+	}
+
 	if err = s.storeSandbox(ctx); err != nil {
 		return err
 	}
@@ -1640,6 +1658,11 @@ func (s *Sandbox) createContainers(ctx context.Context) error {
 	if err := s.resourceControllerUpdate(ctx); err != nil {
 		return err
 	}
+
+	if err := s.checkVCPUsPinning(ctx); err != nil {
+		return err
+	}
+
 	if err := s.storeSandbox(ctx); err != nil {
 		return err
 	}
@@ -2459,3 +2482,73 @@ func (s *Sandbox) fetchContainers(ctx context.Context) error {
 
 	return nil
 }
+
+// checkVCPUsPinning applies or removes vCPU thread pinning for the sandbox.
+//
+// It is a no-op unless s.config.HypervisorConfig.EnableVCPUsPinning is true.
+// When enabled, it fetches the hypervisor's vCPU thread IDs and the sandbox
+// CPUSet, then:
+//   - if the number of vCPU threads equals the number of CPUs in the CPUSet,
+//     pins each vCPU thread to exactly one CPU of the set;
+//   - otherwise, if pinning was applied by an earlier call, resets every vCPU
+//     thread's affinity through resetVCPUsPinning.
+//
+// It returns an error when the sandbox has no config, when the thread IDs or
+// the CPUSet cannot be obtained or parsed, or when changing a thread's
+// affinity fails.
+func (s *Sandbox) checkVCPUsPinning(ctx context.Context) error {
+	if s.config == nil {
+		return fmt.Errorf("no hypervisor config found")
+	}
+	if !s.config.HypervisorConfig.EnableVCPUsPinning {
+		return nil
+	}
+
+	// fetch vCPU thread ids and CPUSet
+	vCPUThreadsMap, err := s.hypervisor.GetThreadIDs(ctx)
+	if err != nil {
+		return fmt.Errorf("failed to get vCPU thread ids from hypervisor: %w", err)
+	}
+	cpuSetStr, _, err := s.getSandboxCPUSet()
+	if err != nil {
+		return fmt.Errorf("failed to get CPUSet config: %w", err)
+	}
+	cpuSet, err := cpuset.Parse(cpuSetStr)
+	if err != nil {
+		return fmt.Errorf("failed to parse CPUSet string: %w", err)
+	}
+	cpuSetSlice := cpuSet.ToSlice()
+
+	// Pinning requires exactly one CPU per vCPU thread. If the counts differ,
+	// undo any pinning applied by an earlier call.
+	if len(vCPUThreadsMap.vcpus) != len(cpuSetSlice) {
+		if !s.isVCPUsPinningOn {
+			return nil
+		}
+		if err := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); err != nil {
+			// Keep the flag set so that the reset is retried on the next check.
+			return err
+		}
+		s.isVCPUsPinningOn = false
+		return nil
+	}
+
+	// The counts are equal: pin each vCPU thread to its own CPU.
+	// NOTE(review): vcpus is presumably a map, so which thread is paired with
+	// which CPU may differ between calls - confirm whether that matters.
+	i := 0
+	for _, tid := range vCPUThreadsMap.vcpus {
+		unixCPUSet := unix.CPUSet{}
+		unixCPUSet.Set(cpuSetSlice[i])
+		if pinErr := unix.SchedSetaffinity(tid, &unixCPUSet); pinErr != nil {
+			pinErr = fmt.Errorf("failed to set vcpu thread %d affinity to cpu %d: %w", tid, cpuSetSlice[i], pinErr)
+			// Roll back the threads that were already pinned in this pass.
+			if resetErr := s.resetVCPUsPinning(ctx, vCPUThreadsMap, cpuSetSlice); resetErr != nil {
+				// Report both failures instead of dropping the root cause.
+				return fmt.Errorf("%w; rollback also failed: %v", pinErr, resetErr)
+			}
+			// The rollback succeeded, so no thread is pinned any more.
+			s.isVCPUsPinningOn = false
+			return pinErr
+		}
+		i++
+	}
+	s.isVCPUsPinningOn = true
+	return nil
+}
+
+// resetVCPUsPinning cancels the current pinning by allowing every vCPU thread
+// to run on any CPU listed in cpuSetSlice.
+//
+// vCPUThreadsMap supplies the vCPU thread IDs and cpuSetSlice the CPU IDs that
+// make up the affinity mask; ctx is not used. It returns an error as soon as
+// setting the affinity of one thread fails, leaving the remaining threads
+// untouched.
+func (s *Sandbox) resetVCPUsPinning(ctx context.Context, vCPUThreadsMap VcpuThreadIDs, cpuSetSlice []int) error {
+	unixCPUSet := unix.CPUSet{}
+	// Range over the values, not the indices: the slice holds CPU IDs, and
+	// its indices 0..n-1 are unrelated to them.
+	for _, cpuID := range cpuSetSlice {
+		unixCPUSet.Set(cpuID)
+	}
+	for _, tid := range vCPUThreadsMap.vcpus {
+		if err := unix.SchedSetaffinity(tid, &unixCPUSet); err != nil {
+			return fmt.Errorf("failed to reset vcpu thread %d affinity to default mode: %w", tid, err)
+		}
+	}
+	return nil
+}