Skip to content

Commit

Permalink
sched_ext: Add cgroup support
Browse files Browse the repository at this point in the history
Add sched_ext_ops operations to init/exit cgroups, and track task migrations
and config changes. Because different BPF schedulers may implement different
subsets of CPU control features, allow BPF schedulers to pick which cgroup
interface files to enable using SCX_OPS_CGROUP_KNOB_* flags. For now, only
the weight knobs are supported but adding more should be straightforward.

While a BPF scheduler is being enabled and disabled, relevant cgroup
operations are locked out using scx_cgroup_rwsem. This avoids situations
like task prep taking place while the task is being moved across cgroups,
making things easier for BPF schedulers.

This patch also adds scx_example_pair which implements a variant of core
scheduling where a hyperthread pair only runs tasks from the same cgroup. The
BPF scheduler achieves this by putting tasks into per-cgroup queues,
time-slicing the cgroup to run for each pair first, and then scheduling
within the cgroup. See the header comment in scx_example_pair.bpf.c for more
details.

Note that scx_example_pair's cgroup-boundary guarantee breaks down for tasks
running in higher priority scheduler classes. This will be addressed by a
followup patch which implements a mechanism to track CPU preemption.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Acked-by: Josh Don <joshdon@google.com>
Acked-by: Hao Luo <haoluo@google.com>
Acked-by: Barret Rhoden <brho@google.com>
  • Loading branch information
htejun authored and intel-lab-lkp committed Jan 28, 2023
1 parent ddb561a commit 6f86161
Show file tree
Hide file tree
Showing 11 changed files with 1,239 additions and 24 deletions.
96 changes: 93 additions & 3 deletions include/linux/sched/ext.h
Expand Up @@ -12,6 +12,8 @@
#include <linux/rhashtable.h>
#include <linux/llist.h>

struct cgroup;

enum scx_consts {
SCX_OPS_NAME_LEN = 128,
SCX_EXIT_REASON_LEN = 128,
Expand Down Expand Up @@ -109,14 +111,27 @@ enum scx_ops_flags {
*/
SCX_OPS_ENQ_EXITING = 1LLU << 2,

/*
* CPU cgroup knob enable flags
*/
SCX_OPS_CGROUP_KNOB_WEIGHT = 1LLU << 16, /* cpu.weight */

SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE |
SCX_OPS_ENQ_LAST |
SCX_OPS_ENQ_EXITING,
SCX_OPS_ENQ_EXITING |
SCX_OPS_CGROUP_KNOB_WEIGHT,
};

/* argument container for ops.enable() and friends */
struct scx_enable_args {
	/* the cgroup the task is joining */
	struct cgroup *cgroup;
};

/* argument container for ops->cgroup_init() */
struct scx_cgroup_init_args {
	/* the weight of the cgroup [1..10000], i.e. the cpu.weight range */
	u32 weight;
};

/**
Expand Down Expand Up @@ -325,7 +340,8 @@ struct sched_ext_ops {
* @p: task to enable BPF scheduling for
* @args: enable arguments, see the struct definition
*
* Enable @p for BPF scheduling. @p will start running soon.
* Enable @p for BPF scheduling. @p is now in the cgroup specified for
* the preceding prep_enable() and will start running soon.
*/
void (*enable)(struct task_struct *p, struct scx_enable_args *args);

Expand All @@ -349,6 +365,77 @@ struct sched_ext_ops {
*/
void (*disable)(struct task_struct *p);

/**
* cgroup_init - Initialize a cgroup
* @cgrp: cgroup being initialized
* @args: init arguments, see the struct definition
*
* Either the BPF scheduler is being loaded or @cgrp created, initialize
* @cgrp for sched_ext. This operation may block.
*
* Return 0 for success, -errno for failure. An error return while
* loading will abort loading of the BPF scheduler. During cgroup
* creation, it will abort the specific cgroup creation.
*/
s32 (*cgroup_init)(struct cgroup *cgrp,
struct scx_cgroup_init_args *args);

/**
* cgroup_exit - Exit a cgroup
* @cgrp: cgroup being exited
*
* Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
* @cgrp for sched_ext. This operation may block.
*/
void (*cgroup_exit)(struct cgroup *cgrp);

/**
* cgroup_prep_move - Prepare a task to be moved to a different cgroup
* @p: task being moved
* @from: cgroup @p is being moved from
* @to: cgroup @p is being moved to
*
* Prepare @p for move from cgroup @from to @to. This operation may
* block and can be used for allocations.
*
* Return 0 for success, -errno for failure. An error return aborts the
* migration.
*/
s32 (*cgroup_prep_move)(struct task_struct *p,
struct cgroup *from, struct cgroup *to);

/**
* cgroup_move - Commit cgroup move
* @p: task being moved
* @from: cgroup @p is being moved from
* @to: cgroup @p is being moved to
*
* Commit the move. @p is dequeued during this operation.
*/
void (*cgroup_move)(struct task_struct *p,
struct cgroup *from, struct cgroup *to);

/**
* cgroup_cancel_move - Cancel cgroup move
* @p: task whose cgroup move is being canceled
* @from: cgroup @p was being moved from
* @to: cgroup @p was being moved to
*
* @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
* Undo the preparation.
*/
void (*cgroup_cancel_move)(struct task_struct *p,
struct cgroup *from, struct cgroup *to);

/**
* cgroup_set_weight - A cgroup's weight is being changed
* @cgrp: cgroup whose weight is being updated
* @weight: new weight [1..10000]
*
* Update @cgrp's weight to @weight.
*/
void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);

/*
* All online ops must come before ops.init().
*/
Expand Down Expand Up @@ -483,6 +570,9 @@ struct sched_ext_entity {

/* cold fields */
struct list_head tasks_node;
#ifdef CONFIG_EXT_GROUP_SCHED
struct cgroup *cgrp_moving_from;
#endif
};

void sched_ext_free(struct task_struct *p);
Expand Down
5 changes: 5 additions & 0 deletions init/Kconfig
Expand Up @@ -1043,6 +1043,11 @@ config RT_GROUP_SCHED
realtime bandwidth for them.
See Documentation/scheduler/sched-rt-group.rst for more information.

# Internal (non-prompted) option: cgroup CPU-controller integration for
# sched_ext.  Automatically enabled whenever both the sched_ext scheduler
# class and the cgroup CPU controller are configured.
config EXT_GROUP_SCHED
	bool
	depends on SCHED_CLASS_EXT && CGROUP_SCHED
	default y

endif #CGROUP_SCHED

config UCLAMP_TASK_GROUP
Expand Down
68 changes: 59 additions & 9 deletions kernel/sched/core.c
Expand Up @@ -9843,6 +9843,9 @@ void __init sched_init(void)
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_EXT_GROUP_SCHED
root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
#endif /* CONFIG_EXT_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
Expand Down Expand Up @@ -10299,6 +10302,7 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;

scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
alloc_uclamp_sched_group(tg, parent);

return tg;
Expand Down Expand Up @@ -10402,6 +10406,7 @@ void sched_move_task(struct task_struct *tsk)
SCHED_CHANGE_BLOCK(rq, tsk,
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK) {
sched_change_group(tsk);
scx_move_task(tsk);
}

/*
Expand Down Expand Up @@ -10438,6 +10443,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
struct task_group *parent = css_tg(css->parent);
int ret;

ret = scx_tg_online(tg);
if (ret)
return ret;

if (parent)
sched_online_group(tg, parent);
Expand All @@ -10454,6 +10464,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
return 0;
}

/* A cpu cgroup is going offline; let the BPF scheduler tear down its state. */
static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
{
	scx_tg_offline(css_tg(css));
}

static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
Expand All @@ -10471,17 +10488,19 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_unregister_group(tg);
}

#ifdef CONFIG_RT_GROUP_SCHED
#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED)
/*
 * Veto point for moving tasks into a cpu cgroup.  When RT group scheduling
 * is enabled, verify each task may join the destination group; then give the
 * BPF scheduler a chance to prepare (and possibly reject) the move.
 *
 * Returns 0 to allow the migration, -errno to abort it.
 */
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct task_struct *task;
	struct cgroup_subsys_state *css;

	cgroup_taskset_for_each(task, css, tset) {
		if (!sched_rt_can_attach(css_tg(css), task))
			return -EINVAL;
	}
	/*
	 * No early "return 0" here: scx_cgroup_can_attach() must run
	 * regardless of whether RT group scheduling is enabled, or the BPF
	 * scheduler would never see (or get to veto) the migration.
	 */
#endif
	return scx_cgroup_can_attach(tset);
}
#endif

Expand All @@ -10492,7 +10511,16 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)

cgroup_taskset_for_each(task, css, tset)
sched_move_task(task);

scx_cgroup_finish_attach();
}

#ifdef CONFIG_EXT_GROUP_SCHED
/*
 * A prepared cgroup migration is being aborted; forward to sched_ext so the
 * BPF scheduler can undo any per-task cgroup_prep_move() work.
 */
static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
	scx_cgroup_cancel_attach(tset);
}
#endif

#ifdef CONFIG_UCLAMP_TASK_GROUP
static void cpu_util_update_eff(struct cgroup_subsys_state *css)
Expand Down Expand Up @@ -10675,9 +10703,15 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
/*
 * cgroup v1 "cpu.shares" write handler.  Clamp the raw share value, apply it
 * to the fair-class group, and on success mirror the equivalent cgroup weight
 * to the BPF scheduler via scx_group_set_weight().
 *
 * Returns 0 on success, -errno from sched_group_set_shares() on failure.
 */
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
				struct cftype *cftype, u64 shareval)
{
	int ret;

	if (shareval > scale_load_down(ULONG_MAX))
		shareval = MAX_SHARES;

	/*
	 * The scx update must only happen after the shares update succeeds,
	 * so don't return sched_group_set_shares() directly here.
	 */
	ret = sched_group_set_shares(css_tg(css), scale_load(shareval));
	if (!ret)
		scx_group_set_weight(css_tg(css),
				     sched_weight_to_cgroup(shareval));
	return ret;
}

static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
Expand Down Expand Up @@ -11141,11 +11175,15 @@ static int cpu_extra_stat_show(struct seq_file *sf,
return 0;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED)

/*
 * Current cgroup weight of @tg.  With the fair class built in, derive it from
 * the group's shares; otherwise (sched_ext only) convert the weight that this
 * patch tracks on the task group.
 */
static unsigned long tg_weight(struct task_group *tg)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	return scale_load_down(tg->shares);
#else
	/*
	 * Field name fixed for consistency: the rest of this patch uses
	 * tg->scx_weight (see root_task_group.scx_weight initialization and
	 * scx_group_set_weight()), not "cgrp_weight".
	 */
	return sched_weight_from_cgroup(tg->scx_weight);
#endif
}

static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
Expand All @@ -11158,13 +11196,17 @@ static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
struct cftype *cft, u64 cgrp_weight)
{
unsigned long weight;
int ret;

if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX)
return -ERANGE;

weight = sched_weight_from_cgroup(cgrp_weight);

return sched_group_set_shares(css_tg(css), scale_load(weight));
ret = sched_group_set_shares(css_tg(css), scale_load(weight));
if (!ret)
scx_group_set_weight(css_tg(css), cgrp_weight);
return ret;
}

static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
Expand All @@ -11189,7 +11231,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
struct cftype *cft, s64 nice)
{
unsigned long weight;
int idx;
int idx, ret;

if (nice < MIN_NICE || nice > MAX_NICE)
return -ERANGE;
Expand All @@ -11198,7 +11240,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
idx = array_index_nospec(idx, 40);
weight = sched_prio_to_weight[idx];

return sched_group_set_shares(css_tg(css), scale_load(weight));
ret = sched_group_set_shares(css_tg(css), scale_load(weight));
if (!ret)
scx_group_set_weight(css_tg(css),
sched_weight_to_cgroup(weight));
return ret;
}
#endif

Expand Down Expand Up @@ -11260,7 +11306,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
#endif

struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED)
[CPU_CFTYPE_WEIGHT] = {
.name = "weight",
.flags = CFTYPE_NOT_ON_ROOT,
Expand Down Expand Up @@ -11314,13 +11360,17 @@ struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online,
.css_offline = cpu_cgroup_css_offline,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
#ifdef CONFIG_RT_GROUP_SCHED
#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED)
.can_attach = cpu_cgroup_can_attach,
#endif
.attach = cpu_cgroup_attach,
#ifdef CONFIG_EXT_GROUP_SCHED
.cancel_attach = cpu_cgroup_cancel_attach,
#endif
.legacy_cftypes = cpu_legacy_cftypes,
.dfl_cftypes = cpu_cftypes,
.early_init = true,
Expand Down

0 comments on commit 6f86161

Please sign in to comment.