sched/fair: Introduce priority load balance to reduce interference from IDLE tasks

When NORMAL and IDLE tasks are co-located and CFS triggers load balancing,
it is reasonable to prefer migrating NORMAL (latency-sensitive) tasks from
the busy source CPU to the destination CPU, and to migrate IDLE tasks last.

This is important for reducing interference from IDLE tasks, so CFS load
balancing can be optimized as follows:

1. The `cfs_tasks` list of a CPU's rq holds the NORMAL tasks.
2. A new `cfs_idle_tasks` list on each rq holds the IDLE tasks.
3. Prefer migrating NORMAL tasks from cfs_tasks to the dst CPU.
4. Migrate IDLE tasks from cfs_idle_tasks to the dst CPU last.
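
Here an IDLE task means a task running under the SCHED_IDLE policy (the
task_has_idle_policy() check) or one whose task group is marked idle (the
tg_is_idle() check). As a rough userspace illustration of the first case
(a minimal sketch, not part of this patch, using the glibc
sched_setscheduler() wrapper):

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>

  int main(void)
  {
          struct sched_param sp = { .sched_priority = 0 };

          /* Opt this task into SCHED_IDLE; with the patch applied it is
           * then queued on rq->cfs_idle_tasks rather than rq->cfs_tasks. */
          if (sched_setscheduler(0, SCHED_IDLE, &sp)) {
                  perror("sched_setscheduler");
                  return 1;
          }
          /* ... run the low-priority background work here ... */
          return 0;
  }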

This was tested with the following reproduction:
- a small number of NORMAL tasks co-located with a large number of IDLE tasks

With this patch, the latency of the NORMAL tasks is reduced by about 5~10%
compared with the current code.
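
A rough sketch of such a colocation setup (illustration only; the task
counts and the busy-loop workload are assumptions, not taken from the
original test):

  #define _GNU_SOURCE
  #include <sched.h>
  #include <unistd.h>

  static void spawn_spinners(int n, int policy)
  {
          struct sched_param sp = { .sched_priority = 0 };

          for (int i = 0; i < n; i++) {
                  if (fork() == 0) {
                          sched_setscheduler(0, policy, &sp);
                          for (;;)
                                  ;       /* burn CPU */
                  }
          }
  }

  int main(void)
  {
          spawn_spinners(4, SCHED_OTHER); /* a few NORMAL tasks */
          spawn_spinners(64, SCHED_IDLE); /* many IDLE tasks */
          pause();                        /* keep the parent alive */
          return 0;
  }

Confining the whole test to a few CPUs (for example with taskset) makes the
interference from the IDLE tasks, and the effect of this patch, easier to
observe.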

Signed-off-by: zhangsong <zhangsong34@huawei.com>
zhangsong authored and intel-lab-lkp committed Aug 9, 2022
1 parent 8648f92 commit 20ac252
Showing 3 changed files with 41 additions and 4 deletions.
kernel/sched/core.c: 1 addition, 0 deletions
@@ -9684,6 +9684,7 @@ void __init sched_init(void)
rq->max_idle_balance_cost = sysctl_sched_migration_cost;

INIT_LIST_HEAD(&rq->cfs_tasks);
+ INIT_LIST_HEAD(&rq->cfs_idle_tasks);

rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ_COMMON
kernel/sched/fair.c: 39 additions, 4 deletions
@@ -3034,6 +3034,19 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)

#endif /* CONFIG_NUMA_BALANCING */

+ static void
+ adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
+ struct rq *rq,
+ struct sched_entity *se)
+ {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (task_has_idle_policy(task_of(se)) || tg_is_idle(cfs_rq->tg))
+ (*list_op)(&se->group_node, &rq->cfs_idle_tasks);
+ else
+ (*list_op)(&se->group_node, &rq->cfs_tasks);
+ }
+
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -3043,7 +3056,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
struct rq *rq = rq_of(cfs_rq);

account_numa_enqueue(rq, task_of(se));
- list_add(&se->group_node, &rq->cfs_tasks);
+ adjust_rq_cfs_tasks(list_add, rq, se);
}
#endif
cfs_rq->nr_running++;
@@ -7465,7 +7478,7 @@ done: __maybe_unused;
* the list, so our cfs_tasks list becomes MRU
* one.
*/
- list_move(&p->se.group_node, &rq->cfs_tasks);
+ adjust_rq_cfs_tasks(list_move, rq, &p->se);
#endif

if (hrtick_enabled_fair(rq))
@@ -7788,6 +7801,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (unlikely(task_has_idle_policy(p)))
return 0;

+ if (tg_is_idle(cfs_rq_of(&p->se)->tg))
+ return 0;
+
/* SMT siblings share cache */
if (env->sd->flags & SD_SHARE_CPUCAPACITY)
return 0;
@@ -7800,6 +7816,11 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
&p->se == cfs_rq_of(&p->se)->last))
return 1;

+ /* Preempt sched idle cpu do not consider migration cost */
+ if (cpus_share_cache(env->src_cpu, env->dst_cpu) &&
+ sched_idle_cpu(env->dst_cpu))
+ return 0;
+
if (sysctl_sched_migration_cost == -1)
return 1;

@@ -7990,11 +8011,14 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p;
+ struct list_head *tasks = &env->src_rq->cfs_tasks;
+ int loop = 0;

lockdep_assert_rq_held(env->src_rq);

+ again:
list_for_each_entry_reverse(p,
- &env->src_rq->cfs_tasks, se.group_node) {
+ tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;

@@ -8009,6 +8033,10 @@ static struct task_struct *detach_one_task(struct lb_env *env)
schedstat_inc(env->sd->lb_gained[env->idle]);
return p;
}
+ if (++loop == 1) {
+ tasks = &env->src_rq->cfs_idle_tasks;
+ goto again;
+ }
return NULL;
}

@@ -8026,6 +8054,7 @@ static int detach_tasks(struct lb_env *env)
unsigned long util, load;
struct task_struct *p;
int detached = 0;
+ int loop = 0;

lockdep_assert_rq_held(env->src_rq);

@@ -8041,6 +8070,7 @@
if (env->imbalance <= 0)
return 0;

+ again:
while (!list_empty(tasks)) {
/*
* We don't want to steal all, otherwise we may be treated likewise,
@@ -8142,6 +8172,11 @@ static int detach_tasks(struct lb_env *env)
list_move(&p->se.group_node, tasks);
}

+ if (env->imbalance > 0 && ++loop == 1) {
+ tasks = &env->src_rq->cfs_idle_tasks;
+ goto again;
+ }
+
/*
* Right now, this is one of only two places we collect this stat
* so we can safely collect detach_one_task() stats here rather
@@ -11642,7 +11677,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
* Move the next running task to the front of the list, so our
* cfs_tasks list becomes MRU one.
*/
- list_move(&se->group_node, &rq->cfs_tasks);
+ adjust_rq_cfs_tasks(list_move, rq, se);
}
#endif

kernel/sched/sched.h: 1 addition, 0 deletions
@@ -1051,6 +1051,7 @@ struct rq {
int online;

struct list_head cfs_tasks;
+ struct list_head cfs_idle_tasks;

struct sched_avg avg_rt;
struct sched_avg avg_dl;
