sched_ext: Implement core-sched support
The core-sched support is composed of the following parts:

* task_struct->scx.core_sched_at is added. This is a timestamp which can be
  used to order tasks. Depending on whether the BPF scheduler implements
  custom ordering, it tracks either global FIFO ordering of all tasks or
  local-DSQ ordering within the dispatched tasks on a CPU.

* prio_less() is updated to call scx_prio_less() when comparing SCX tasks.
  scx_prio_less() calls ops.core_sched_before() if available or uses the
  core_sched_at timestamp. For global FIFO ordering, the BPF scheduler
  doesn't need to do anything. Otherwise, it should implement
  ops.core_sched_before() to reflect its own ordering (see the sketch after
  this list).

* When core-sched is enabled, balance_scx() balances all SMT siblings so
  that they all have tasks dispatched if necessary before pick_task_scx() is
  called. pick_task_scx() picks between the current task and the first
  dispatched task on the local DSQ based on availability and the
  core_sched_at timestamps. Note that FIFO ordering is expected among the
  already dispatched tasks whether running or on the local DSQ, so this path
  always compares core_sched_at instead of calling into
  ops.core_sched_before().
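
As an illustration of the second point above (not part of this patch), a BPF
scheduler that maintains its own per-task virtual time could mirror that
ordering for core-sched roughly as follows. The BPF_STRUCT_OPS() macro,
bpf_task_storage_get() and the scx_common.bpf.h header follow the existing
scx_example_* schedulers; the task_ctx storage map and its vtime field are
hypothetical.

#include "scx_common.bpf.h"

/* hypothetical per-task context holding the scheduler's own virtual time */
struct task_ctx {
        u64 vtime;
};

struct {
        __uint(type, BPF_MAP_TYPE_TASK_STORAGE);
        __uint(map_flags, BPF_F_NO_PREALLOC);
        __type(key, int);
        __type(value, struct task_ctx);
} task_ctx_stor SEC(".maps");

/* %true if @a should run before @b: the task further behind in vtime wins */
bool BPF_STRUCT_OPS(example_core_sched_before, struct task_struct *a,
                    struct task_struct *b)
{
        struct task_ctx *ta = bpf_task_storage_get(&task_ctx_stor, a, 0, 0);
        struct task_ctx *tb = bpf_task_storage_get(&task_ctx_stor, b, 0, 0);

        if (!ta || !tb)
                return false;   /* no required ordering if the lookup fails */

        return (s64)(ta->vtime - tb->vtime) < 0;
}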

qmap_core_sched_before() is added to scx_example_qmap. It scales each task's
distance from the head of its queue so that tasks in different priority
queues can be compared, and it seems to behave as expected.
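
The following is a rough sketch of that distance-scaling comparison,
illustrative rather than the exact code in scx_example_qmap. It assumes a
task_ctx local-storage entry like the one in the earlier sketch, but carrying
queue_idx and queue_seq fields recorded by the enqueue path (not shown), a
per-queue head counter advanced on dispatch, and that each higher-priority
queue is dispatched roughly twice as often as the one below it.

/* illustrative: head sequence number of each of the five queues, advanced
 * by the dispatch path */
static u64 core_sched_head_seqs[5];

/*
 * Scaled distance of a task from the head of its queue. The smaller the
 * value, the sooner the task will be dispatched and the higher its
 * core-sched priority.
 */
static s64 task_qdist(struct task_ctx *tctx)
{
        int idx = tctx->queue_idx;              /* 0-4, higher is higher prio */
        s64 qdist = tctx->queue_seq - core_sched_head_seqs[idx];

        /* lower-priority queues drain more slowly; inflate their distances */
        return qdist * (1LL << (4 - idx));
}

bool BPF_STRUCT_OPS(qmap_core_sched_before, struct task_struct *a,
                    struct task_struct *b)
{
        struct task_ctx *ta = bpf_task_storage_get(&task_ctx_stor, a, 0, 0);
        struct task_ctx *tb = bpf_task_storage_get(&task_ctx_stor, b, 0, 0);

        if (!ta || !tb)
                return false;   /* no required ordering if the lookup fails */

        return task_qdist(ta) < task_qdist(tb);
}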

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
htejun authored and intel-lab-lkp committed Jan 28, 2023
1 parent 7a3d3e9 commit 750f973
Showing 7 changed files with 319 additions and 16 deletions.
21 changes: 21 additions & 0 deletions include/linux/sched/ext.h
@@ -315,6 +315,24 @@ struct sched_ext_ops {
*/
bool (*yield)(struct task_struct *from, struct task_struct *to);

/**
* core_sched_before - Task ordering for core-sched
* @a: task A
* @b: task B
*
* Used by core-sched to determine the ordering between two tasks. See
* Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on
* core-sched.
*
* Both @a and @b are runnable and may or may not currently be queued on
* the BPF scheduler. Should return %true if @a should run before @b, and
* %false if there's no required ordering or @b should run before @a.
*
* If not specified, the default is ordering them according to when they
* became runnable.
*/
bool (*core_sched_before)(struct task_struct *a, struct task_struct *b);

/**
* set_cpumask - Set CPU affinity
* @p: task to set CPU affinity for
@@ -611,6 +629,9 @@ struct sched_ext_entity {
u32 kf_mask; /* see scx_kf_mask above */
atomic64_t ops_state;
unsigned long runnable_at;
#ifdef CONFIG_SCHED_CORE
u64 core_sched_at; /* see scx_prio_less() */
#endif

/* BPF scheduler modifiable fields */

2 changes: 1 addition & 1 deletion kernel/Kconfig.preempt
@@ -135,7 +135,7 @@ config SCHED_CORE

config SCHED_CLASS_EXT
bool "Extensible Scheduling Class"
depends on BPF_SYSCALL && BPF_JIT && !SCHED_CORE
depends on BPF_SYSCALL && BPF_JIT
help
This option enables a new scheduler class sched_ext (SCX), which
allows scheduling policies to be implemented as BPF programs to
12 changes: 11 additions & 1 deletion kernel/sched/core.c
@@ -163,7 +163,12 @@ static inline int __task_prio(struct task_struct *p)
if (p->sched_class == &idle_sched_class)
return MAX_RT_PRIO + NICE_WIDTH; /* 140 */

return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
#ifdef CONFIG_SCHED_CLASS_EXT
if (p->sched_class == &ext_sched_class)
return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */
#endif

return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */
}

/*
@@ -191,6 +196,11 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool
if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
return cfs_prio_less(a, b, in_fi);

#ifdef CONFIG_SCHED_CLASS_EXT
if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */
return scx_prio_less(a, b, in_fi);
#endif

return false;
}

196 changes: 186 additions & 10 deletions kernel/sched/ext.c
@@ -447,6 +447,44 @@ static int ops_sanitize_err(const char *ops_name, s32 err)
return -EPROTO;
}

/**
* touch_core_sched - Update timestamp used for core-sched task ordering
* @rq: rq to read clock from, must be locked
* @p: task to update the timestamp for
*
* Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to
* implement global or local-DSQ FIFO ordering for core-sched. Should be called
* when a task becomes runnable and when its turn on the CPU ends (e.g. slice
* exhaustion).
*/
static void touch_core_sched(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SCHED_CORE
p->scx.core_sched_at = rq_clock_task(rq);
#endif
}

/**
* touch_core_sched_dispatch - Update core-sched timestamp on dispatch
* @rq: rq to read clock from, must be locked
* @p: task being dispatched
*
* If the BPF scheduler implements custom core-sched ordering via
* ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO
* ordering within each local DSQ. This function is called from dispatch paths
* and updates @p->scx.core_sched_at if custom core-sched ordering is in effect.
*/
static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p)
{
lockdep_assert_rq_held(rq);
assert_clock_updated(rq);

#ifdef CONFIG_SCHED_CORE
if (SCX_HAS_OP(core_sched_before))
touch_core_sched(rq, p);
#endif
}

static void update_curr_scx(struct rq *rq)
{
struct task_struct *curr = rq->curr;
@@ -462,8 +500,11 @@ static void update_curr_scx(struct rq *rq)
account_group_exec_runtime(curr, delta_exec);
cgroup_account_cputime(curr, delta_exec);

if (curr->scx.slice != SCX_SLICE_INF)
if (curr->scx.slice != SCX_SLICE_INF) {
curr->scx.slice -= min(curr->scx.slice, delta_exec);
if (!curr->scx.slice)
touch_core_sched(rq, curr);
}
}

static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p,
@@ -619,6 +660,8 @@ static void direct_dispatch(struct task_struct *ddsp_task, struct task_struct *p
return;
}

touch_core_sched_dispatch(task_rq(p), p);

dsq = find_dsq_for_dispatch(task_rq(p), dsq_id, p);
dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS);

@@ -702,12 +745,19 @@ static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags,
return;

local:
/*
* For task-ordering, slice refill must be treated as implying the end
* of the current slice. Otherwise, the longer @p stays on the CPU, the
* higher priority it becomes from scx_prio_less()'s POV.
*/
touch_core_sched(rq, p);
p->scx.slice = SCX_SLICE_DFL;
local_norefill:
dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags);
return;

global:
touch_core_sched(rq, p); /* see the comment in local: */
p->scx.slice = SCX_SLICE_DFL;
dispatch_enqueue(&scx_dsq_global, p, enq_flags);
}
@@ -762,6 +812,9 @@ static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags
if (SCX_HAS_OP(runnable))
scx_ops.runnable(p, enq_flags);

if (enq_flags & SCX_ENQ_WAKEUP)
touch_core_sched(rq, p);

do_enqueue_task(rq, p, enq_flags, sticky_cpu);
}

@@ -1201,6 +1254,7 @@ static void finish_dispatch(struct rq *rq, struct rq_flags *rf,
struct scx_dispatch_q *dsq;
u64 opss;

touch_core_sched_dispatch(rq, p);
retry:
/*
* No need for _acquire here. @p is accessed only after a successful
@@ -1278,8 +1332,8 @@ static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf)
dspc->buf_cursor = 0;
}

static int balance_scx(struct rq *rq, struct task_struct *prev,
struct rq_flags *rf)
static int balance_one(struct rq *rq, struct task_struct *prev,
struct rq_flags *rf, bool local)
{
struct scx_rq *scx_rq = &rq->scx;
struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx);
@@ -1302,7 +1356,7 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,
}

if (prev_on_scx) {
WARN_ON_ONCE(prev->scx.flags & SCX_TASK_BAL_KEEP);
WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP));
update_curr_scx(rq);

/*
@@ -1314,10 +1368,16 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,
*
* See scx_ops_disable_workfn() for the explanation on the
* disabling() test.
*
* When balancing a remote CPU for core-sched, there won't be a
* following put_prev_task_scx() call and we don't own
* %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the
* same conditions later and pick @rq->curr accordingly.
*/
if ((prev->scx.flags & SCX_TASK_QUEUED) &&
prev->scx.slice && !scx_ops_disabling()) {
prev->scx.flags |= SCX_TASK_BAL_KEEP;
if (local)
prev->scx.flags |= SCX_TASK_BAL_KEEP;
return 1;
}
}
@@ -1373,10 +1433,55 @@ static int balance_scx(struct rq *rq, struct task_struct *prev,
return 0;
}

static int balance_scx(struct rq *rq, struct task_struct *prev,
struct rq_flags *rf)
{
int ret;

ret = balance_one(rq, prev, rf, true);

/*
* When core-sched is enabled, this ops.balance() call will be followed
* by put_prev_task_scx() and pick_task_scx() on this CPU and pick_task_scx()
* on the SMT siblings. Balance the siblings too.
*/
if (sched_core_enabled(rq)) {
const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq));
int scpu;

for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) {
struct rq *srq = cpu_rq(scpu);
struct rq_flags srf;
struct task_struct *sprev = srq->curr;

/*
* While core-scheduling, rq lock is shared among
* siblings but the debug annotations and rq clock
* aren't. Do pinning dance to transfer the ownership.
*/
WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq));
rq_unpin_lock(rq, rf);
rq_pin_lock(srq, &srf);

update_rq_clock(srq);
balance_one(srq, sprev, &srf, false);

rq_unpin_lock(srq, &srf);
rq_repin_lock(rq, rf);
}
}

return ret;
}

static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first)
{
if (p->scx.flags & SCX_TASK_QUEUED) {
WARN_ON_ONCE(atomic64_read(&p->scx.ops_state) != SCX_OPSS_NONE);
/*
* Core-sched might decide to execute @p before it is
* dispatched. Call ops_dequeue() to notify the BPF scheduler.
*/
ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC);
dispatch_dequeue(&rq->scx, p);
}

@@ -1516,6 +1621,69 @@ static struct task_struct *pick_next_task_scx(struct rq *rq)
return p;
}

#ifdef CONFIG_SCHED_CORE
/**
* scx_prio_less - Task ordering for core-sched
* @a: task A
* @b: task B
*
* Core-sched is implemented as an additional scheduling layer on top of the
* usual sched_class'es and needs to find out the expected task ordering. For
* SCX, core-sched calls this function to interrogate the task ordering.
*
* Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
* to implement the default task ordering. The older the timestamp, the higher
* the task's priority; this global FIFO ordering matches the default scheduling
* behavior.
*
* When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
* implement FIFO ordering within each local DSQ. See pick_task_scx().
*/
bool scx_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
{
if (SCX_HAS_OP(core_sched_before) && !scx_ops_disabling())
return scx_ops.core_sched_before(a, b);
else
return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
}

/**
* pick_task_scx - Pick a candidate task for core-sched
* @rq: rq to pick the candidate task from
*
* Core-sched calls this function on each SMT sibling to determine the next
* tasks to run on the SMT siblings. balance_one() has been called on all
* siblings and put_prev_task_scx() has been called only for the current CPU.
*
* As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look
* at the first task in the local DSQ. @rq->curr has to be considered explicitly
* to mimic %SCX_TASK_BAL_KEEP.
*/
static struct task_struct *pick_task_scx(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct task_struct *first = first_local_task(rq);

if (curr->scx.flags & SCX_TASK_QUEUED) {
/* is curr the only runnable task? */
if (!first)
return curr;

/*
* Does curr trump first? We can always go by core_sched_at for
* this comparison as it represents global FIFO ordering when
* the default core-sched ordering is in use and local-DSQ FIFO
* ordering otherwise.
*/
if (curr->scx.slice && time_before64(curr->scx.core_sched_at,
first->scx.core_sched_at))
return curr;
}

return first; /* this may be %NULL */
}
#endif /* CONFIG_SCHED_CORE */

static enum scx_cpu_preempt_reason
preempt_reason_from_class(const struct sched_class *class)
{
@@ -1795,11 +1963,13 @@ static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued)
update_curr_scx(rq);

/*
* While disabling, always resched as we can't trust the slice
* management.
* While disabling, always resched and refresh core-sched timestamp as
* we can't trust the slice management or ops.core_sched_before().
*/
if (scx_ops_disabling())
if (scx_ops_disabling()) {
curr->scx.slice = 0;
touch_core_sched(rq, curr);
}

if (!curr->scx.slice)
resched_curr(rq);
@@ -2232,6 +2402,10 @@ DEFINE_SCHED_CLASS(ext) = {
.rq_offline = rq_offline_scx,
#endif

#ifdef CONFIG_SCHED_CORE
.pick_task = pick_task_scx,
#endif

.task_tick = task_tick_scx,

.switching_to = switching_to_scx,
@@ -2560,9 +2734,11 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
*
* b. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value
* can't be trusted. Whenever a tick triggers, the running task is
* rotated to the tail of the queue.
* rotated to the tail of the queue with core_sched_at touched.
*
* c. pick_next_task() suppresses zero slice warning.
*
* d. scx_prio_less() reverts to the default core_sched_at order.
*/
scx_ops.enqueue = scx_ops_fallback_enqueue;
scx_ops.dispatch = scx_ops_fallback_dispatch;