Skip to content

Commit

Permalink
Merge pull request #4310 from egernst/core-sched
Browse files Browse the repository at this point in the history
shim: add support for core scheduling
  • Loading branch information
egernst committed Jun 8, 2022
2 parents 5bd81ba + d2df120 commit 4ebf9d3
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 1 deletion.
2 changes: 1 addition & 1 deletion docs/design/README.md
Expand Up @@ -12,7 +12,7 @@ Kata Containers design documents:
- [Metrics(Kata 2.0)](kata-2-0-metrics.md)
- [Design for Kata Containers `Lazyload` ability with `nydus`](kata-nydus-design.md)
- [Design for direct-assigned volume](direct-blk-device-assignment.md)

- [Design for core-scheduling](core-scheduling.md)
---

- [Design proposals](proposals)
12 changes: 12 additions & 0 deletions docs/design/core-scheduling.md
@@ -0,0 +1,12 @@
# Core scheduling

Core scheduling is a Linux kernel feature that allows only trusted tasks to run concurrently on
CPUs sharing compute resources (for example, hyper-threads on a core).

Containerd versions >= 1.6.4 leverage this to treat all of the processes associated with a
given pod or container as a single group of trusted tasks. To indicate that this should be carried
out, containerd sets the `SCHED_CORE` environment variable for each shim it spawns. When this is
set, the Kata Containers shim implementation uses the `prctl` syscall to create a new core scheduling
domain for the shim process itself as well as for any future VMM processes it will start.

For more details on the core scheduling feature, see the [Linux documentation](https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/core-scheduling.html).
12 changes: 12 additions & 0 deletions src/runtime/pkg/containerd-shim-v2/service.go
Expand Up @@ -10,6 +10,7 @@ import (
"io"
"os"
sysexec "os/exec"
goruntime "runtime"
"sync"
"syscall"
"time"
Expand All @@ -31,6 +32,7 @@ import (
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils"
"github.com/kata-containers/kata-containers/src/runtime/pkg/katautils/katatrace"
"github.com/kata-containers/kata-containers/src/runtime/pkg/oci"
"github.com/kata-containers/kata-containers/src/runtime/pkg/utils"
vc "github.com/kata-containers/kata-containers/src/runtime/virtcontainers"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/pkg/compatoci"
"github.com/kata-containers/kata-containers/src/runtime/virtcontainers/types"
Expand Down Expand Up @@ -234,9 +236,19 @@ func (s *service) StartShim(ctx context.Context, opts cdshim.StartOpts) (_ strin

cmd.ExtraFiles = append(cmd.ExtraFiles, f)

goruntime.LockOSThread()
if os.Getenv("SCHED_CORE") != "" {
if err := utils.Create(utils.ProcessGroup); err != nil {
return "", errors.Wrap(err, "enable sched core support")
}
}

if err := cmd.Start(); err != nil {
return "", err
}

goruntime.UnlockOSThread()

defer func() {
if retErr != nil {
cmd.Process.Kill()
Expand Down
36 changes: 36 additions & 0 deletions src/runtime/pkg/utils/schedcore.go
@@ -0,0 +1,36 @@
// Copyright (c) 2022 Apple Inc.
//
// SPDX-License-Identifier: Apache-2.0
//

package utils

import (
"golang.org/x/sys/unix"
)

// PidType describes how a pid handed to the sched core helpers is to be
// interpreted: as a single pid, a thread group, or a process group.
type PidType int

// Raw numeric scope values. They are converted to uintptr and passed as the
// fourth prctl argument by the helpers in this file.
const (
	pidTypePid = iota
	pidTypeThreadGroupId
	pidTypeProcessGroupId
)

// Exported, typed selectors built on top of the raw values above.
const (
	// Pid applies the operation to the given pid only.
	Pid PidType = pidTypePid
	// ThreadGroup applies the operation to every thread in the group.
	ThreadGroup PidType = pidTypeThreadGroupId
	// ProcessGroup applies the operation to every process in the group.
	ProcessGroup PidType = pidTypeProcessGroupId
)

// Create creates a new sched core domain by issuing
// prctl(PR_SCHED_CORE, PR_SCHED_CORE_CREATE, ...).
//
// t selects what the new domain is applied to (see PidType). The pid
// argument of the prctl call is always 0 here.
// NOTE(review): pid 0 presumably targets the calling task — confirm against
// the kernel core-scheduling documentation.
//
// It returns the error reported by the prctl call, if any.
func Create(t PidType) error {
	scope := uintptr(t)
	return unix.Prctl(unix.PR_SCHED_CORE, unix.PR_SCHED_CORE_CREATE, 0, scope, 0)
}

// ShareFrom shares the sched core domain from the provided pid by issuing
// prctl(PR_SCHED_CORE, PR_SCHED_CORE_SHARE_FROM, ...).
//
// pid identifies the task whose domain is shared, and t tells how that pid
// is to be treated (see PidType). Both are converted to uintptr before being
// handed to prctl.
//
// It returns the error reported by the prctl call, if any.
func ShareFrom(pid uint64, t PidType) error {
	var (
		source = uintptr(pid)
		scope  = uintptr(t)
	)
	return unix.Prctl(unix.PR_SCHED_CORE, unix.PR_SCHED_CORE_SHARE_FROM, source, scope, 0)
}

0 comments on commit 4ebf9d3

Please sign in to comment.