kprobes,lib: kretprobe scalability improvement
kretprobe uses a freelist to manage its return instances, but the
freelist, a LIFO queue based on a singly linked list, scales badly and
reduces the overall throughput of kretprobed routines, especially in
high-contention scenarios.

Here's a typical throughput test of sys_flock (counts in 10 seconds,
measured with perf stat -a -I 10000 -e syscalls:sys_enter_flock):

OS: Debian 10 X86_64, Linux 6.1rc2
HW: XEON 8336C x 2, 64 cores/128 threads, DDR4 3200MT/s

      1X       2X       4X       6X       8X      12X      16X
34762430 36546920 17949900 13101899 12569595 12646601 14729195
     24X      32X      48X      64X      72X      96X     128X
19263546 10102064  8985418 11936495 11493980  7127789  9330985

This patch implements a scalable, lockless and NUMA-aware object pool,
which brings near-linear scalability to kretprobed routines. Kretprobe
throughput tests show an improvement of up to 333.9x over the original
freelist. Here's the comparison:

                  1X         2X         4X         8X        16X
freelist:   34762430   36546920   17949900   12569595   14729195
objpool:    35627544   72182095  144068494  287564688  576903916
                 32X        48X        64X        96X       128X
freelist:   10102064    8985418   11936495    7127789    9330985
objpool:  1158876372 1737828164 2324371724 2380310472 2463182819

Tests on a 96-core ARM64 system show similar results, with the largest
improvement reaching 642.2x:

OS: Debian 10 AARCH64, Linux 6.1rc2
HW: Kunpeng-920 96 cores/2 sockets/4 NUMA nodes, DDR4 2933 MT/s

                  1X         2X         4X         8X        16X
freelist:   17498299   10887037   10224710    8499132    6421751
objpool:    18715726   35549845   71615884  144258971  283707220
                 24X        32X        48X        64X        96X
freelist:    5339868    4819116    3593919    3121575    2687167
objpool:   419830913  571609748  877456139 1143316315 1725668029

The object pool is a scalable implementation of a high-performance
queue for object allocation and reclamation, such as kretprobe
instances.

By leveraging per-CPU ring arrays to mitigate memory-contention hot
spots, it delivers near-linear scalability in highly parallel
scenarios. Each ring array is kept compact in a single cacheline in
most cases, or spans contiguous cachelines when more than 4 instances
are pre-allocated per core.
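
As an illustration, here is a minimal usage sketch of the new API
(hypothetical object type and callback names; error handling elided,
and objpool_init is assumed to return 0 on success):

  struct my_obj { int val; };               /* hypothetical object */

  static int my_objinit(void *context, void *obj)
  {
          ((struct my_obj *)obj)->val = 0;  /* one-time setup */
          return 0;
  }

  struct objpool_head oh;

  /* pre-allocate 128 objects of sizeof(struct my_obj) bytes each */
  if (!objpool_init(&oh, 128, 128, sizeof(struct my_obj),
                    GFP_KERNEL, NULL, my_objinit, NULL)) {
          struct my_obj *obj = objpool_pop(&oh);  /* take one out */

          if (obj)
                  objpool_push(obj, &oh);         /* put it back */
          objpool_fini(&oh);                      /* tear down */
  }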

Changes since V2:
1) replaced the percpu-extended freelist with the new per-CPU ring
   array: freelist_node suffers data contention (on refs and next)
   even after a node has been removed from the freelist, and the node
   can easily be corrupted since freelist_node is defined as a union
2) split the routines into objpool.h and objpool.c, the latter moved
   to lib
3) added a test module (test_objpool.ko) to lib for functional testing

Changes since V1:
1) reformatted into a single patch, as Masami Hiramatsu suggested
2) replaced vmalloc_node with __vmalloc_node for vmalloc
3) a few minor fixes: typos and coding-style issues

Signed-off-by: wuqiang <wuqiang.matt@bytedance.com>
mattwuq authored and intel-lab-lkp committed Nov 1, 2022
1 parent 192b494 commit a0deeba
Showing 11 changed files with 1,772 additions and 250 deletions.
129 changes: 0 additions & 129 deletions include/linux/freelist.h

This file was deleted.

9 changes: 3 additions & 6 deletions include/linux/kprobes.h
@@ -27,7 +27,7 @@
#include <linux/mutex.h>
#include <linux/ftrace.h>
#include <linux/refcount.h>
-#include <linux/freelist.h>
+#include <linux/objpool.h>
#include <linux/rethook.h>
#include <asm/kprobes.h>

@@ -141,6 +141,7 @@ static inline bool kprobe_ftrace(struct kprobe *p)
*/
struct kretprobe_holder {
	struct kretprobe *rp;
+	struct objpool_head oh;
	refcount_t ref;
};

@@ -154,7 +155,6 @@ struct kretprobe {
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
	struct rethook *rh;
#else
-	struct freelist_head freelist;
	struct kretprobe_holder *rph;
#endif
};
@@ -165,10 +165,7 @@ struct kretprobe_instance {
#ifdef CONFIG_KRETPROBE_ON_RETHOOK
	struct rethook_node node;
#else
-	union {
-		struct freelist_node freelist;
-		struct rcu_head rcu;
-	};
+	struct rcu_head rcu;
	struct llist_node llist;
	struct kretprobe_holder *rph;
	kprobe_opcode_t *ret_addr;
151 changes: 151 additions & 0 deletions include/linux/objpool.h
@@ -0,0 +1,151 @@
/* SPDX-License-Identifier: GPL-2.0 */

#ifndef _LINUX_OBJPOOL_H
#define _LINUX_OBJPOOL_H

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/atomic.h>

/*
* objpool: ring-array based lockless MPMC/FIFO queues
*
* Copyright: wuqiang.matt@bytedance.com
*
* The object pool is a scalable implementation of a high-performance
* queue for object allocation and reclamation, such as kretprobe
* instances.
*
* By leveraging per-cpu ring arrays to mitigate memory-contention hot
* spots, it can deliver near-linear scalability for highly parallel
* cases. Meanwhile, it also achieves high throughput by benefiting
* from the warmed cache on each core.
*
* The object pool is best suited for the following cases:
* 1) memory allocation or reclamation is prohibited or too expensive
* 2) the objects are allocated/used/reclaimed very frequently
*
* Before using, you must be aware of its limitations:
* 1) the maximum number of objects is determined during pool initialization
* 2) the memory of objects won't be freed until the pool is de-allocated
* 3) both allocation and reclamation can be nested
*/

/*
* objpool_slot: per-cpu ring array
*
* Represents a cpu-local array-based ring buffer; its size is specified
* during initialization of the object pool.
*
* On NUMA systems the objpool_slot is allocated from node-local memory,
* and it is kept compact within a single cacheline. ages[] is stored
* right after the body of objpool_slot, and ents[] right after ages[].
* ages[] records the epoch (revision) of each item, used solely to
* avoid ABA; ents[] holds the object pointers.
*
* The default size of objpool_slot is a single cacheline, aka. 64 bytes.
*
* 64bit:
* 4 8 12 16 32 64
* | head | tail | size | mask | ages[4] | ents[4]: (8 * 4) |
*
* 32bit:
* 4 8 12 16 32 48 64
* | head | tail | size | mask | ages[4] | ents[4] | unused |
*
*/

struct objpool_slot {
	uint32_t os_head;	/* head of ring array */
	uint32_t os_tail;	/* tail of ring array */
	uint32_t os_size;	/* max item slots, pow of 2 */
	uint32_t os_mask;	/* os_size - 1 */
	/*
	 * uint32_t os_ages[];	// ring epoch id
	 * void *os_ents[];	// objects array
	 */
};
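
/*
 * Illustrative only (not part of the declarations above): per the
 * layout notes, ages[] begins right after the struct body and ents[]
 * right after ages[]. Hypothetical accessors could be written as:
 *
 *   static inline uint32_t *slot_ages(struct objpool_slot *os)
 *   {
 *           return (uint32_t *)((char *)os + sizeof(*os));
 *   }
 *
 *   static inline void **slot_ents(struct objpool_slot *os)
 *   {
 *           return (void **)((char *)slot_ages(os) +
 *                            sizeof(uint32_t) * os->os_size);
 *   }
 */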

/* caller-specified callback for the initial setup of each object; called only once */
typedef int (*objpool_init_node_cb)(void *context, void *obj);

/* caller-specified cleanup callback for private objects/pool/context */
typedef int (*objpool_release_cb)(void *context, void *ptr, uint32_t flags);

/* called for object releasing: ptr points to an object */
#define OBJPOOL_FLAG_NODE (0x00000001)
/* for user pool and context releasing, ptr could be NULL */
#define OBJPOOL_FLAG_POOL (0x00001000)
/* the object or pool to be released is user-managed */
#define OBJPOOL_FLAG_USER (0x00008000)

/*
* objpool_head: object pooling metadata
*/

struct objpool_head {
	uint32_t oh_objsz;		/* object & element size */
	uint32_t oh_nobjs;		/* total objs (pre-allocated) */
	uint32_t oh_nents;		/* max objects per cpuslot */
	uint32_t oh_ncpus;		/* num of possible cpus */
	uint32_t oh_in_user:1;		/* user-specified buffer */
	uint32_t oh_in_slot:1;		/* objs alloced with slots */
	uint32_t oh_vmalloc:1;		/* alloc from vmalloc zone */
	gfp_t oh_gfp;			/* k/vmalloc gfp flags */
	uint32_t oh_sz_pool;		/* user pool size in bytes */
	void *oh_pool;			/* user managed memory pool */
	struct objpool_slot **oh_slots;	/* array of percpu slots */
	uint32_t *oh_sz_slots;		/* size in bytes of slots */
	objpool_release_cb oh_release;	/* resource cleanup callback */
	void *oh_context;		/* caller-provided context */
};

/* initialize object pool and pre-allocate objects */
int objpool_init(struct objpool_head *oh,
		 int nobjs, int max, int objsz,
		 gfp_t gfp, void *context,
		 objpool_init_node_cb objinit,
		 objpool_release_cb release);

/* add objects in batch from a user-provided pool */
int objpool_populate(struct objpool_head *oh, void *buf,
		     int size, int objsz, void *context,
		     objpool_init_node_cb objinit);

/* add pre-allocated object (managed by user) to objpool */
int objpool_add(void *obj, struct objpool_head *oh);

/* allocate an object from the object pool */
void *objpool_pop(struct objpool_head *oh);

/* reclaim an object and return it to the object pool */
int objpool_push(void *node, struct objpool_head *oh);

/* cleanup the whole object pool (including all chained objects) */
void objpool_fini(struct objpool_head *oh);
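
/*
 * Illustrative only (hypothetical buffer and names, assuming a zero
 * return on success): besides pre-allocation via objpool_init, a
 * caller may carve objects out of its own buffer in one batch:
 *
 *   void *buf = kzalloc(16 * objsz, GFP_KERNEL);
 *   if (buf && !objpool_populate(&oh, buf, 16 * objsz, objsz,
 *                                NULL, my_objinit))
 *           ;  // buf now backs 16 objects owned by the caller
 *
 * A single user-managed object can likewise be registered with
 * objpool_add(obj, &oh).
 */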

/* whether the object is pre-allocated with percpu slots */
static inline int objpool_is_inslot(void *obj, struct objpool_head *oh)
{
	void *slot;
	uint32_t i;

	if (!obj)
		return 0;

	for (i = 0; i < oh->oh_ncpus; i++) {
		slot = oh->oh_slots[i];
		if (obj >= slot && obj < slot + oh->oh_sz_slots[i])
			return 1;
	}

	return 0;
}

/* whether the object is from user pool (batched adding) */
static inline int objpool_is_inpool(void *obj, struct objpool_head *oh)
{
	return (obj && oh->oh_pool && obj >= oh->oh_pool &&
		obj < oh->oh_pool + oh->oh_sz_pool);
}
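
/*
 * Illustrative release callback (hypothetical, based on the flag
 * semantics above): it would be invoked for each user-managed object
 * with OBJPOOL_FLAG_NODE set, and for the user pool/context with
 * OBJPOOL_FLAG_POOL; OBJPOOL_FLAG_USER marks user-managed memory.
 *
 *   static int my_release(void *context, void *ptr, uint32_t flags)
 *   {
 *           if ((flags & OBJPOOL_FLAG_NODE) && (flags & OBJPOOL_FLAG_USER))
 *                   kfree(ptr);     // an object added via objpool_add
 *           else if ((flags & OBJPOOL_FLAG_POOL) && ptr)
 *                   kfree(ptr);     // the buffer given to objpool_populate
 *           return 0;
 *   }
 */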

#endif /* _LINUX_OBJPOOL_H */
15 changes: 5 additions & 10 deletions include/linux/rethook.h
@@ -6,7 +6,7 @@
#define _LINUX_RETHOOK_H

#include <linux/compiler.h>
-#include <linux/freelist.h>
+#include <linux/objpool.h>
#include <linux/kallsyms.h>
#include <linux/llist.h>
#include <linux/rcupdate.h>
@@ -30,14 +30,14 @@ typedef void (*rethook_handler_t) (struct rethook_node *, void *, struct pt_regs
struct rethook {
	void *data;
	rethook_handler_t handler;
-	struct freelist_head pool;
+	struct objpool_head pool;
	refcount_t ref;
	struct rcu_head rcu;
};

/**
* struct rethook_node - The rethook shadow-stack entry node.
- * @freelist: The freelist, linked to struct rethook::pool.
+ * @nod: The objpool node, linked to struct rethook::pool.
* @rcu: The rcu_head for deferred freeing.
* @llist: The llist, linked to a struct task_struct::rethooks.
* @rethook: The pointer to the struct rethook.
@@ -48,19 +48,15 @@ struct rethook {
* on each entry of the shadow stack.
*/
struct rethook_node {
-	union {
-		struct freelist_node freelist;
-		struct rcu_head rcu;
-	};
+	struct rcu_head rcu;
	struct llist_node llist;
	struct rethook *rethook;
	unsigned long ret_addr;
	unsigned long frame;
};

-struct rethook *rethook_alloc(void *data, rethook_handler_t handler);
+struct rethook *rethook_alloc(void *data, rethook_handler_t handler, gfp_t gfp, int size, int max);
void rethook_free(struct rethook *rh);
-void rethook_add_node(struct rethook *rh, struct rethook_node *node);
struct rethook_node *rethook_try_get(struct rethook *rh);
void rethook_recycle(struct rethook_node *node);
void rethook_hook(struct rethook_node *node, struct pt_regs *regs, bool mcount);
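
/*
 * Illustrative only (assumed semantics, names hypothetical): with the
 * objpool-backed pool, a caller now passes allocation details to
 * rethook_alloc rather than adding nodes one by one, e.g.
 *
 *   rh = rethook_alloc(data, handler, GFP_KERNEL,
 *                      sizeof(struct my_node), maxactive);
 *
 * where size is taken to be the per-node object size and max the
 * number of pre-allocated nodes.
 */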
@@ -97,4 +93,3 @@ void rethook_flush_task(struct task_struct *tk);
#endif

#endif
