mm: mempolicy: N:M interleave policy for tiered memory nodes
Existing interleave policy spreads out pages evenly across a set of
specified nodes, i.e. 1:1 interleave. Upcoming tiered memory systems
have CPU-less memory nodes with different peak bandwidth and
latency-bandwidth characteristics. In such systems, we will want to
use the additional bandwidth provided by lowtier memory for
bandwidth-intensive applications. However, the default 1:1 interleave
can lead to suboptimal bandwidth distribution.

Introduce an N:M interleave policy, where N pages allocated to the
toptier nodes are followed by M pages allocated to lowtier nodes.
This provides the capability to steer the fraction of memory traffic
that goes to toptier vs. lowtier nodes. For example, 4:1 interleave
leads to an 80%/20% traffic breakdown between toptier and lowtier.
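
As a rough illustration of the resulting placement pattern, here is a
minimal userspace sketch (not part of the patch; the 4:1 ratio, the
page count and the output are made up for demonstration) that mirrors
the il_count cycle used by interleave_nodes() below:

/* nm_interleave.c - emulate the N:M placement cycle */
#include <stdio.h>

int main(void)
{
        int ratio[2] = { 4, 1 };        /* toptier : lowtier */
        int il_count = 0, toptier_pages = 0, total = 20;

        for (int page = 0; page < total; page++) {
                int use_toptier = il_count < ratio[0];

                if (use_toptier)
                        toptier_pages++;
                printf("page %2d -> %s\n", page,
                       use_toptier ? "toptier" : "lowtier");

                if (++il_count >= ratio[0] + ratio[1])
                        il_count = 0;
        }
        /* a 4:1 ratio yields an 80% toptier share */
        printf("toptier share: %d%%\n", 100 * toptier_pages / total);
        return 0;
}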

The ratios are configured through a new sysctl:

	vm.numa_tier_interleave = toptier lowtier
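
For example, with this patch applied, a 4:1 (80%/20%) split could be
requested at runtime with something along the lines of:

	echo "4 1" > /proc/sys/vm/numa_tier_interleave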

We have run experiments on bandwidth-intensive production services on
CXL-based tiered memory systems, where lowtier CXL memory has, when
compared to the toptier memory directly connected to the CPU:

	- ~half of the peak bandwidth
	- ~80ns higher idle latency
	- steeper latency vs. bandwidth curve

Results show that regular interleaving leads to a ~40% performance
regression over baseline; 5:1 interleaving shows an ~8% improvement
over baseline. We have found the optimal distribution changes based on
hardware characteristics: slower CXL memory will shift the optimal
breakdown from 5:1 to (e.g.) 8:1.

The sysctl only applies to processes and vmas with an "interleave"
policy and has no bearing on contexts using prefer or bind policies.

It defaults to a setting of "1 1", which represents even interleaving,
and so is backward compatible with existing setups.
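
For reference, a hedged sketch of how a process opts into the
interleave policy that this sysctl shapes, using libnuma's
set_mempolicy() wrapper (the three-node mask is an illustrative
assumption, not something the patch requires):

/* interleave_optin.c - build with: cc interleave_optin.c -lnuma */
#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        /* hypothetical mask: nodes 0-2, e.g. with node 2 being lowtier CXL */
        unsigned long nodemask = (1UL << 0) | (1UL << 1) | (1UL << 2);

        if (set_mempolicy(MPOL_INTERLEAVE, &nodemask, 8 * sizeof(nodemask))) {
                perror("set_mempolicy");
                return 1;
        }

        /* anonymous memory faulted from now on follows the sysctl ratio */
        size_t sz = 64UL << 20;
        char *buf = malloc(sz);

        if (buf)
                memset(buf, 0, sz);
        free(buf);
        return 0;
}

The same effect is commonly achieved without code changes through
numactl --interleave.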

Signed-off-by: Hasan Al Maruf <hasanalmaruf@fb.com>
Signed-off-by: Hao Wang <haowang3@fb.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Hasan Al Maruf authored and intel-lab-lkp committed Jun 7, 2022
1 parent dbd0d47 commit 876d8da
Showing 5 changed files with 93 additions and 3 deletions.
16 changes: 16 additions & 0 deletions Documentation/admin-guide/sysctl/vm.rst
@@ -55,6 +55,7 @@ Currently, these files are in /proc/sys/vm:
 - nr_hugepages_mempolicy
 - nr_overcommit_hugepages
 - nr_trim_pages (only if CONFIG_MMU=n)
+- numa_tier_interleave
 - numa_zonelist_order
 - oom_dump_tasks
 - oom_kill_allocating_task
@@ -635,6 +636,21 @@ The default value is 1.
 See Documentation/admin-guide/mm/nommu-mmap.rst for more information.
 
 
+numa_tier_interleave
+====================
+
+This sysctl is for tiered NUMA systems. It's a tuple that configures
+an N:M distribution between toptier and lowtier nodes for interleaving
+memory allocation policies.
+
+The first value configures the share of pages allocated on toptier
+nodes. The second value configures the share of lowtier placements.
+
+Allowed values range from 1 up to (and including) 100.
+
+The default value is 1 1, meaning even distribution.
+
+
 numa_zonelist_order
 ===================
 
2 changes: 2 additions & 0 deletions include/linux/mempolicy.h
@@ -54,6 +54,8 @@ struct mempolicy {
 	} w;
 };
 
+extern int numa_tier_interleave[2];
+
 /*
  * Support for managing mempolicy data objects (clone, copy, destroy)
  * The default fast path of a NULL MPOL_DEFAULT policy is always inlined.
1 change: 1 addition & 0 deletions include/linux/sched.h
@@ -1236,6 +1236,7 @@ struct task_struct {
 	/* Protected by alloc_lock: */
 	struct mempolicy *mempolicy;
 	short il_prev;
+	short il_count;
 	short pref_node_fork;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
10 changes: 10 additions & 0 deletions kernel/sysctl.c
@@ -21,6 +21,7 @@
 
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/mempolicy.h>
 #include <linux/swap.h>
 #include <linux/slab.h>
 #include <linux/sysctl.h>
@@ -2116,6 +2117,15 @@ static struct ctl_table vm_table[] = {
 		.extra1 = SYSCTL_ZERO,
 		.extra2 = SYSCTL_ONE,
 	},
+	{
+		.procname = "numa_tier_interleave",
+		.data = &numa_tier_interleave,
+		.maxlen = sizeof(numa_tier_interleave),
+		.mode = 0644,
+		.proc_handler = proc_dointvec_minmax,
+		.extra1 = SYSCTL_ONE,
+		.extra2 = SYSCTL_ONE_HUNDRED,
+	},
 #endif
 	{
 		.procname = "hugetlb_shm_group",
67 changes: 64 additions & 3 deletions mm/mempolicy.c
@@ -120,6 +120,9 @@ static struct kmem_cache *sn_cache;
    policied. */
 enum zone_type policy_zone = 0;
 
+/* Toptier:lowtier interleaving ratio */
+int numa_tier_interleave[2] = { 1, 1 };
+
 /*
  * run-time system-wide default policy => local allocation
  */
@@ -871,8 +874,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 	task_lock(current);
 	old = current->mempolicy;
 	current->mempolicy = new;
-	if (new && new->mode == MPOL_INTERLEAVE)
+	if (new && new->mode == MPOL_INTERLEAVE) {
 		current->il_prev = MAX_NUMNODES-1;
+		current->il_count = 0;
+	}
 	task_unlock(current);
 	mpol_put(old);
 	ret = 0;
@@ -1880,15 +1885,47 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
 	return nd;
 }
 
+static unsigned next_node_tier(int nid, struct mempolicy *policy, bool toptier)
+{
+	unsigned next = nid, start = nid;
+
+	do {
+		next = next_node_in(next, policy->nodes);
+		if (next == MAX_NUMNODES)
+			break;
+		if (toptier == node_is_toptier(next))
+			break;
+	} while (next != start);
+	return next;
+}
+
 /* Do dynamic interleaving for a process */
 static unsigned interleave_nodes(struct mempolicy *policy)
 {
 	unsigned next;
 	struct task_struct *me = current;
 
-	next = next_node_in(me->il_prev, policy->nodes);
+	if (numa_tier_interleave[0] > 1 || numa_tier_interleave[1] > 1) {
+		/*
+		 * When N:M interleaving is configured, allocate N
+		 * pages over toptier nodes first, then the remainder
+		 * on lowtier ones.
+		 */
+		if (me->il_count < numa_tier_interleave[0])
+			next = next_node_tier(me->il_prev, policy, true);
+		else
+			next = next_node_tier(me->il_prev, policy, false);
+		me->il_count++;
+		if (me->il_count >=
+		    numa_tier_interleave[0] + numa_tier_interleave[1])
+			me->il_count = 0;
+	} else {
+		next = next_node_in(me->il_prev, policy->nodes);
+	}
+
 	if (next < MAX_NUMNODES)
 		me->il_prev = next;
+
 	return next;
 }
 
@@ -1962,7 +1999,31 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n)
 	nnodes = nodes_weight(nodemask);
 	if (!nnodes)
 		return numa_node_id();
-	target = (unsigned int)n % nnodes;
+
+	if (numa_tier_interleave[0] > 1 || numa_tier_interleave[1] > 1) {
+		unsigned vnnodes = 0;
+		int vtarget;
+
+		/*
+		 * When N:M interleaving is configured, calculate a
+		 * virtual target for @n in an N:M-scaled nodelist...
+		 */
+		for_each_node_mask(nid, nodemask)
+			vnnodes += numa_tier_interleave[!node_is_toptier(nid)];
+		vtarget = (int)((unsigned int)n % vnnodes);
+
+		/* ...then map it back to the physical nodelist */
+		target = 0;
+		for_each_node_mask(nid, nodemask) {
+			vtarget -= numa_tier_interleave[!node_is_toptier(nid)];
+			if (vtarget < 0)
+				break;
+			target++;
+		}
+	} else {
+		target = (unsigned int)n % nnodes;
+	}
+
 	nid = first_node(nodemask);
 	for (i = 0; i < target; i++)
 		nid = next_node(nid, nodemask);
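
For reference, the VMA/offset-based path above (offset_il_node()) can
be emulated in userspace as follows. This is a sketch only; the
three-node mask, the tier assignment of node 2 and the 4:1 ratio are
illustrative assumptions rather than part of the patch:

/* offset_map.c - emulate the N:M-scaled virtual nodelist lookup */
#include <stdio.h>

static int node_is_toptier(int nid)
{
        return nid != 2;        /* assume node 2 is the lowtier (e.g. CXL) node */
}

int main(void)
{
        int nodes[] = { 0, 1, 2 };      /* interleave nodemask */
        int nnodes = 3;
        int ratio[2] = { 4, 1 };        /* vm.numa_tier_interleave = "4 1" */

        for (unsigned long n = 0; n < 10; n++) {
                unsigned vnnodes = 0;
                int vtarget, target = 0;

                /* size of the virtual nodelist: each node counted N or M times */
                for (int i = 0; i < nnodes; i++)
                        vnnodes += ratio[!node_is_toptier(nodes[i])];

                /* virtual target for page offset n... */
                vtarget = (int)(n % vnnodes);

                /* ...mapped back to an index into the physical nodelist */
                for (int i = 0; i < nnodes; i++) {
                        vtarget -= ratio[!node_is_toptier(nodes[i])];
                        if (vtarget < 0)
                                break;
                        target++;
                }
                printf("page offset %lu -> node %d\n", n, nodes[target]);
        }
        return 0;
}

With these inputs the first nine offsets map to four pages on node 0,
four on node 1 and one on node 2, i.e. each toptier node receives N
pages per cycle while the lowtier node receives M.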
