mm: mempolicy: N:M interleave policy for tiered memory nodes
Existing interleave policy spreads out pages evenly across a set of
specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
have CPU-less memory nodes with different peak bandwidth and
latency-bandwidth characteristics. In such systems, we will want to
use the additional bandwidth provided by lowtier memory for
bandwidth-intensive applications. However, the default 1:1 interleave
can lead to suboptimal bandwidth distribution.

Introduce an N:M interleave policy, where N pages allocated to the
toptier nodes are followed by M pages allocated to lowtier nodes.
This provides the capability to steer the fraction of memory traffic
that goes to toptier vs. lowtier nodes. For example, 4:1 interleave
leads to an 80%/20% traffic breakdown between toptier and lowtier.
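
As a rough illustration of the resulting placement pattern, here is a
minimal userspace sketch (not part of the patch; the 4:1 ratio, the
page count and the output are made up for demonstration) that mirrors
the il_count cycle used by interleave_nodes() below:

/* nm_interleave.c - emulate the N:M placement cycle */
#include <stdio.h>

int main(void)
{
        int ratio[2] = { 4, 1 };        /* toptier : lowtier */
        int il_count = 0, toptier_pages = 0, total = 20;

        for (int page = 0; page < total; page++) {
                int use_toptier = il_count < ratio[0];

                if (use_toptier)
                        toptier_pages++;
                printf("page %2d -> %s\n", page,
                       use_toptier ? "toptier" : "lowtier");

                if (++il_count >= ratio[0] + ratio[1])
                        il_count = 0;
        }
        /* a 4:1 ratio yields an 80% toptier share */
        printf("toptier share: %d%%\n", 100 * toptier_pages / total);
        return 0;
}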

The ratios are configured through a new sysctl:

	vm.numa_tier_interleave = toptier lowtier
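
For example, with this patch applied, a 4:1 (80%/20%) split could be
requested at runtime with something along the lines of:

	echo "4 1" > /proc/sys/vm/numa_tier_interleave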

We have run experiments on bandwidth-intensive production services on
CXL-based tiered memory systems, where lowtier CXL memory has, when
compared to the toptier memory directly connected to the CPU:

	- ~half of the peak bandwidth
	- ~80ns higher idle latency
	- steeper latency vs. bandwidth curve

Results show that regular interleaving leads to a ~40% performance
regression over baseline; 5:1 interleaving shows an ~8% improvement
over baseline. We have found the optimal distribution changes based on
hardware characteristics: slower CXL memory will shift the optimal
breakdown from 5:1 to (e.g.) 8:1.

The sysctl only applies to processes and vmas with an "interleave"
policy and has no bearing on contexts using prefer or bind policies.

It defaults to a setting of "1 1", which represents even interleaving,
and so is backward compatible with existing setups.
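
For reference, a hedged sketch of how a process opts into the
interleave policy that this sysctl shapes, using libnuma's
set_mempolicy() wrapper (the three-node mask is an illustrative
assumption, not something the patch requires):

/* interleave_optin.c - build with: cc interleave_optin.c -lnuma */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        /* hypothetical mask: nodes 0-2, e.g. with node 2 being lowtier CXL */
        unsigned long nodemask = (1UL << 0) | (1UL << 1) | (1UL << 2);

        if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask))) {
                perror("set_mempolicy");
                return 1;
        }

        /* anonymous memory faulted from now on follows the sysctl ratio */
        size_t sz = 64UL << 20;
        char *buf = malloc(sz);

        if (buf)
                memset(buf, 0, sz);
        free(buf);
        return 0;
}

The same effect is commonly achieved without code changes through
numactl --interleave.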

Signed-off-by: Hasan Al Maruf <hasanalmaruf@fb.com>
Signed-off-by: Hao Wang <haowang3@fb.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Hasan Al Maruf authored and intel-lab-lkp committed Jun 7, 2022
1 parent dbd0d47 commit 876d8da
Showing 5 changed files with 93 additions and 3 deletions.
16 changes: 16 additions & 0 deletions Documentation/admin-guide/sysctl/vm.rst
@@ -55,6 +55,7 @@ Currently, these files are in /proc/sys/vm:
 - nr_hugepages_mempolicy
 - nr_overcommit_hugepages
 - nr_trim_pages (only if CONFIG_MMU=n)
+- numa_tier_interleave
 - numa_zonelist_order
 - oom_dump_tasks
 - oom_kill_allocating_task
@@ -635,6 +636,21 @@ The default value is 1.
 See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
 
 
+numa_tier_interleave
+====================
+
+This sysctl is for tiered NUMA systems. It's a tuple that configures
+an N:M distribution between toptier and lowtier nodes for interleaving
+memory allocation policies.
+
+The first value configures the share of pages allocated on toptier
+nodes. The second value configures the share of lowtier placements.
+
+Allowed values range from 1 up to (and including) 100.
+
+The default value is 1 1, meaning even distribution.
+
+
 numa_zonelist_order
 ===================
 
2 changes: 2 additions & 0 deletions include/linux/mempolicy.h
@@ -54,6 +54,8 @@ struct mempolicy {
 	} w;
 };
 
+extern int numa_tier_interleave[2];
+
 /*
  * Support for managing mempolicy data objects (clone, copy, destroy)
  * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
1 change: 1 addition & 0 deletions include/linux/sched.h
@@ -1236,6 +1236,7 @@ struct task_struct {
 	/* Protected by alloc_lock: */
 	struct mempolicy *mempolicy;
 	short il_prev;
+	short il_count;
 	short pref_node_fork;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
10 changes: 10 additions & 0 deletions kernel/sysctl.c
@@ -21,6 +21,7 @@
 
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/mempolicy.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include <linux/sysctl.h>
@@ -2116,6 +2117,15 @@ static struct ctl_table vm_table[] = {
 		.extra1 = SYSCTL_ZERO,
 		.extra2 = SYSCTL_ONE,
 	},
+	{
+		.procname = "numa_tier_interleave",
+		.data = &numa_tier_interleave,
+		.maxlen = sizeof(numa_tier_interleave),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = SYSCTL_ONE,
+		.extra2 = SYSCTL_ONE_HUNDRED,
+	},
 #endif
 	{
 		.procname = "hugetlb_shm_group",
67 changes: 64 additions & 3 deletions mm/mempolicy.c
@@ -120,6 +120,9 @@ static struct kmem_cache *sn_cache;
    policied. */
 enum zone_type policy_zone = 0;
 
+/* Toptier:lowtier interleaving ratio */
+int numa_tier_interleave[2] = { 1, 1 };
+
 /*
  * run-time system-wide default policy => local allocation
  */
@@ -871,8 +874,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	task_lock(current);
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE)
+	if (new && new->mode == MPOL_INTERLEAVE) {
 		current->il_prev = MAX_NUMNODES-1;
+		current->il_count = 0;
+	}
 	task_unlock(current);
 	mpol_put(old);
 	ret = 0;
@@ -1880,15 +1885,47 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 	return nd;
 }
 
+static unsigned next_node_tier(int nid, struct mempolicy *policy, bool toptier)
+{
+	unsigned next = nid, start = nid;
+
+	do {
+		next = next_node_in(next, policy->nodes);
+		if (next == MAX_NUMNODES)
+			break;
+		if (toptier == node_is_toptier(next))
+			break;
+	} while (next != start);
+	return next;
+}
+
 /* Do dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
 	unsigned next;
 	struct task_struct *me = current;
 
-	next = next_node_in(me->il_prev, policy->nodes);
+	if (numa_tier_interleave[0] > 1 || numa_tier_interleave[1] > 1) {
+		/*
+		 * When N:M interleaving is configured, allocate N
+		 * pages over toptier nodes first, then the remainder
+		 * on lowtier ones.
+		 */
+		if (me->il_count < numa_tier_interleave[0])
+			next = next_node_tier(me->il_prev, policy, true);
+		else
+			next = next_node_tier(me->il_prev, policy, false);
+		me->il_count++;
+		if (me->il_count >=
+		    numa_tier_interleave[0] + numa_tier_interleave[1])
+			me->il_count = 0;
+	} else {
+		next = next_node_in(me->il_prev, policy->nodes);
+	}
+
 	if (next < MAX_NUMNODES)
 		me->il_prev = next;
+
 	return next;
 }
 
@@ -1962,7 +1999,31 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 	nnodes = nodes_weight(nodemask);
 	if (!nnodes)
 		return numa_node_id();
-	target = (unsigned int)n % nnodes;
+
+	if (numa_tier_interleave[0] > 1 || numa_tier_interleave[1] > 1) {
+		unsigned vnnodes = 0;
+		int vtarget;
+
+		/*
+		 * When N:M interleaving is configured, calculate a
+		 * virtual target for @n in an N:M-scaled nodelist...
+		 */
+		for_each_node_mask(nid, nodemask)
+			vnnodes += numa_tier_interleave[!node_is_toptier(nid)];
+		vtarget = (int)((unsigned int)n % vnnodes);
+
+		/* ...then map it back to the physical nodelist */
+		target = 0;
+		for_each_node_mask(nid, nodemask) {
+			vtarget -= numa_tier_interleave[!node_is_toptier(nid)];
+			if (vtarget < 0)
+				break;
+			target++;
+		}
+	} else {
+		target = (unsigned int)n % nnodes;
+	}
+
 	nid = first_node(nodemask);
 	for (i = 0; i < target; i++)
 		nid = next_node(nid, nodemask);
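
For reference, the VMA/offset-based path above (offset_il_node()) can
be emulated in userspace as follows. This is a sketch only; the
three-node mask, the tier assignment of node 2 and the 4:1 ratio are
illustrative assumptions rather than part of the patch:

/* offset_map.c - emulate the N:M-scaled virtual nodelist lookup */
#include <stdio.h>

static int node_is_toptier(int nid)
{
        return nid != 2;        /* assume node 2 is the lowtier (e.g. CXL) node */
}

int main(void)
{
        int nodes[] = { 0, 1, 2 };      /* interleave nodemask */
        int nnodes = 3;
        int ratio[2] = { 4, 1 };        /* vm.numa_tier_interleave = "4 1" */

        for (unsigned long n = 0; n < 10; n++) {
                unsigned vnnodes = 0;
                int vtarget, target = 0;

                /* size of the virtual nodelist: each node counted N or M times */
                for (int i = 0; i < nnodes; i++)
                        vnnodes += ratio[!node_is_toptier(nodes[i])];

                /* virtual target for page offset n... */
                vtarget = (int)(n % vnnodes);

                /* ...mapped back to an index into the physical nodelist */
                for (int i = 0; i < nnodes; i++) {
                        vtarget -= ratio[!node_is_toptier(nodes[i])];
                        if (vtarget < 0)
                                break;
                        target++;
                }
                printf("page offset %lu -> node %d\n", n, nodes[target]);
        }
        return 0;
}

With these inputs the first nine offsets map to four pages on node 0,
four on node 1 and one on node 2, i.e. each toptier node receives N
pages per cycle while the lowtier node receives M.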
