kprobes,lib: kretprobe scalability improvement
kretprobe is using freelist to manage return-instances, but freelist, as a LIFO queue based on a singly linked list, scales badly and reduces the overall throughput of kretprobed routines, especially under high contention.

Here's a typical throughput test of sys_flock (counts in 10 seconds, measured with perf stat -a -I 10000 -e syscalls:sys_enter_flock):

OS: Debian 10 X86_64, Linux 6.1rc2
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s

      1X       2X       4X       6X       8X      12X      16X
34762430 36546920 17949900 13101899 12569595 12646601 14729195

     24X      32X      48X      64X      72X      96X     128X
19263546 10102064  8985418 11936495 11493980  7127789  9330985

This patch implements a scalable, lock-less and NUMA-aware object pool, which brings near-linear scalability to kretprobed routines. Tests of kretprobe throughput show the biggest ratio as 333.9x of the original freelist. Here's the comparison:

                 1X         2X         4X         8X        16X
freelist:  34762430   36546920   17949900   12569595   14729195
objpool:   35627544   72182095  144068494  287564688  576903916

                32X        48X        64X        96X       128X
freelist:  10102064    8985418   11936495    7127789    9330985
objpool: 1158876372 1737828164 2324371724 2380310472 2463182819

Tests on a 96-core ARM64 system output similar results, with the biggest ratio up to 642.2x:

OS: Debian 10 AARCH64, Linux 6.1rc2
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s

                1X        2X        4X         8X        16X
freelist: 17498299  10887037  10224710   8499132    6421751
objpool:  18715726  35549845  71615884 144258971  283707220

               24X       32X       48X        64X        96X
freelist:  5339868   4819116   3593919    3121575    2687167
objpool: 419830913 571609748 877456139 1143316315 1725668029

The object pool is a scalable implementation of a high performance queue for object allocation and reclamation, such as kretprobe instances. By leveraging a percpu ring-array to mitigate the hot spots of memory contention, it can deliver near-linear scalability for highly parallel scenarios.

The ring-array is compactly managed in a single cacheline for most cases, or in contiguous cachelines if more than 4 instances are pre-allocated for every core.

Changes since V2:
1) the percpu-extended version of the freelist is replaced by the new percpu ring-array. freelist has data contention in freelist_node (refs and next) even after a node is removed from the freelist, and the node can be polluted easily (with freelist_node defined as a union)
2) routines split into objpool.h and objpool.c, the latter moved to lib
3) test module (test_objpool.ko) added to lib for functional testing

Changes since V1:
1) reformatted into a single patch as Masami Hiramatsu suggested
2) use __vmalloc_node to replace vmalloc_node for vmalloc
3) a few minor fixes: typos and coding-style issues

Signed-off-by: wuqiang <wuqiang.matt@bytedance.com>
1 parent 192b494, commit a0deeba: 11 changed files with 1,772 additions and 250 deletions.
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _LINUX_OBJPOOL_H
#define _LINUX_OBJPOOL_H

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>

/*
 * objpool: ring-array based lockless MPMC/FIFO queues
 *
 * Copyright: wuqiang.matt@bytedance.com
 *
 * The object pool is a scalable implementation of a high performance
 * queue for object allocation and reclamation, such as kretprobe
 * instances.
 *
 * Leveraging per-cpu ring-arrays to mitigate the hot spots of memory
 * contention, it delivers near-linear scalability for highly parallel
 * cases. Meanwhile, it also achieves high throughput by benefiting
 * from the warmed cache on each core.
 *
 * The object pool is best suited for the following cases:
 * 1) memory allocation or reclamation is prohibited or too expensive
 * 2) the objects are allocated/used/reclaimed very frequently
 *
 * Before using, you must be aware of its limitations:
 * 1) the maximum number of objects is determined during pool initialization
 * 2) the memory of objects won't be freed until the pool is de-allocated
 * 3) both allocation and reclamation could be nested
 */

/*
 * objpool_slot: per-cpu ring array
 *
 * Represents a cpu-local array-based ring buffer; its size is specified
 * during initialization of the object pool.
 *
 * The objpool_slot is allocated from local memory on NUMA systems, and
 * is kept compact in a single cacheline. ages[] is stored just after
 * the body of objpool_slot, and ents[] after ages[]. ages[] describes
 * the revision of the epoch of each item, solely used to avoid ABA;
 * ents[] contains the object pointers.
 *
 * The default size of objpool_slot is a single cacheline, i.e. 64 bytes.
 *
 * 64bit:
 *        4      8      12     16        32                 64
 * | head | tail | size | mask | ages[4] | ents[4]: (8 * 4) |
 *
 * 32bit:
 *        4      8      12     16        32        48       64
 * | head | tail | size | mask | ages[4] | ents[4] | unused |
 */

struct objpool_slot {
	uint32_t os_head;	/* head of ring array */
	uint32_t os_tail;	/* tail of ring array */
	uint32_t os_size;	/* max item slots, pow of 2 */
	uint32_t os_mask;	/* os_size - 1 */
	/*
	 * uint32_t os_ages[];	// ring epoch id
	 * void    *os_ents[];	// objects array
	 */
};
/* caller-specified object initialization callback, called only once per object */
typedef int (*objpool_init_node_cb)(void *context, void *obj);

/* caller-specified cleanup callback for private objects/pool/context */
typedef int (*objpool_release_cb)(void *context, void *ptr, uint32_t flags);

/* called for object releasing: ptr points to an object */
#define OBJPOOL_FLAG_NODE	(0x00000001)
/* for user pool and context releasing, ptr could be NULL */
#define OBJPOOL_FLAG_POOL	(0x00001000)
/* the object or pool to be released is user-managed */
#define OBJPOOL_FLAG_USER	(0x00008000)

/*
 * objpool_head: object pooling metadata
 */
struct objpool_head {
	uint32_t oh_objsz;		/* object & element size */
	uint32_t oh_nobjs;		/* total objs (pre-allocated) */
	uint32_t oh_nents;		/* max objects per cpuslot */
	uint32_t oh_ncpus;		/* num of possible cpus */
	uint32_t oh_in_user:1;		/* user-specified buffer */
	uint32_t oh_in_slot:1;		/* objs alloced with slots */
	uint32_t oh_vmalloc:1;		/* alloc from vmalloc zone */
	gfp_t oh_gfp;			/* k/vmalloc gfp flags */
	uint32_t oh_sz_pool;		/* user pool size in bytes */
	void *oh_pool;			/* user managed memory pool */
	struct objpool_slot **oh_slots;	/* array of percpu slots */
	uint32_t *oh_sz_slots;		/* size in bytes of slots */
	objpool_release_cb oh_release;	/* resource cleanup callback */
	void *oh_context;		/* caller-provided context */
};

/* initialize object pool and pre-allocate objects */
int objpool_init(struct objpool_head *oh,
		 int nobjs, int max, int objsz,
		 gfp_t gfp, void *context,
		 objpool_init_node_cb objinit,
		 objpool_release_cb release);

/* add objects in batch from a user-provided pool */
int objpool_populate(struct objpool_head *oh, void *buf,
		     int size, int objsz, void *context,
		     objpool_init_node_cb objinit);

/* add a pre-allocated object (managed by the user) to the objpool */
int objpool_add(void *obj, struct objpool_head *oh);

/* allocate an object from the object pool */
void *objpool_pop(struct objpool_head *oh);

/* reclaim an object and return it back to the object pool */
int objpool_push(void *node, struct objpool_head *oh);

/* clean up the whole object pool (including all chained objects) */
void objpool_fini(struct objpool_head *oh);

/* whether the object is pre-allocated with percpu slots */
static inline int objpool_is_inslot(void *obj, struct objpool_head *oh)
{
	void *slot;
	int i;

	if (!obj)
		return 0;

	for (i = 0; i < oh->oh_ncpus; i++) {
		slot = oh->oh_slots[i];
		if (obj >= slot && obj < slot + oh->oh_sz_slots[i])
			return 1;
	}

	return 0;
}

/* whether the object is from the user pool (batched adding) */
static inline int objpool_is_inpool(void *obj, struct objpool_head *oh)
{
	return (obj && oh->oh_pool && obj >= oh->oh_pool &&
		obj < oh->oh_pool + oh->oh_sz_pool);
}

#endif /* _LINUX_OBJPOOL_H */