Skip to content

Commit

Permalink
Merge branch 'Notify user space when a struct_ops object is detached/…
Browse files Browse the repository at this point in the history
…unregistered'

Kui-Feng Lee says:

====================
The subsystems managing struct_ops objects may need to detach a
struct_ops object due to errors or other reasons. It would be useful
to notify user space programs so that error recovery or logging can be
carried out.

This patch set enables the detach feature for struct_ops links and
send an event to epoll when a link is detached.  Subsystems could call
link->ops->detach() to detach a link and notify user space programs
through epoll.

The signatures of callback functions in "struct bpf_struct_ops" have
been changed as well to pass an extra link argument to
subsystems. Subsystems could detach the links received from reg() and
update() callbacks if there is. This also provides a way that
subsystems can distinguish registrations for an object that has been
registered multiple times for several links.

However, bpf struct_ops maps without BPF_F_LINK have no any link.
Subsystems will receive NULL link pointer for this case.
---
Changes from v6:

 - Fix the missing header at patch 5.

 - Move RCU_INIT_POINTER() back to its original position.

Changes from v5:

 - Change the commit title of the patch for bpftool.

Changes from v4:

 - Change error code for bpf_struct_ops_map_link_update()

 - Always return 0 for bpf_struct_ops_map_link_detach()

 - Hold update_mutex in bpf_struct_ops_link_create()

 - Add a separated instance of file_operations for links supporting
    poll.

 - Fix bpftool for bpf_link_fops_poll.

Changes from v3:

 - Add a comment to explain why holding update_mutex is not necessary
    in bpf_struct_ops_link_create()

 - Use rcu_access_pointer() in bpf_struct_ops_map_link_poll().

Changes from v2:

 - Rephrased commit logs and comments.

 - Addressed some mistakes from patch splitting.

 - Replace mutex with spinlock in bpf_testmod.c to address lockdep
    Splat and simplify the implementation.

 - Fix an argument passing to rcu_dereference_protected().

Changes from v1:

 - Pass a link to reg, unreg, and update callbacks.

 - Provide a function to detach a link from underlying subsystems.

 - Add a kfunc to mimic detachments from subsystems, and provide a
    flexible way to control when to do detachments.

 - Add two tests to detach a link from the subsystem after the refcount
    of the link drops to zero.

v6: https://lore.kernel.org/bpf/20240524223036.318800-1-thinker.li@gmail.com/
v5: https://lore.kernel.org/all/20240523230848.2022072-1-thinker.li@gmail.com/
v4: https://lore.kernel.org/all/20240521225121.770930-1-thinker.li@gmail.com/
v3: https://lore.kernel.org/all/20240510002942.1253354-1-thinker.li@gmail.com/
v2: https://lore.kernel.org/all/20240507055600.2382627-1-thinker.li@gmail.com/
v1: https://lore.kernel.org/all/20240429213609.487820-1-thinker.li@gmail.com/
====================

Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
  • Loading branch information
Martin KaFai Lau committed May 30, 2024
2 parents 46253c4 + d14c1fa commit 3f8fde3
Show file tree
Hide file tree
Showing 10 changed files with 186 additions and 30 deletions.
13 changes: 10 additions & 3 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -1612,6 +1612,7 @@ struct bpf_link_ops {
struct bpf_link_info *info);
int (*update_map)(struct bpf_link *link, struct bpf_map *new_map,
struct bpf_map *old_map);
__poll_t (*poll)(struct file *file, struct poll_table_struct *pts);
};

struct bpf_tramp_link {
Expand Down Expand Up @@ -1730,9 +1731,9 @@ struct bpf_struct_ops {
int (*init_member)(const struct btf_type *t,
const struct btf_member *member,
void *kdata, const void *udata);
int (*reg)(void *kdata);
void (*unreg)(void *kdata);
int (*update)(void *kdata, void *old_kdata);
int (*reg)(void *kdata, struct bpf_link *link);
void (*unreg)(void *kdata, struct bpf_link *link);
int (*update)(void *kdata, void *old_kdata, struct bpf_link *link);
int (*validate)(void *kdata);
void *cfi_stubs;
struct module *owner;
Expand Down Expand Up @@ -2333,6 +2334,7 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer);
int bpf_link_settle(struct bpf_link_primer *primer);
void bpf_link_cleanup(struct bpf_link_primer *primer);
void bpf_link_inc(struct bpf_link *link);
struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link);
void bpf_link_put(struct bpf_link *link);
int bpf_link_new_fd(struct bpf_link *link);
struct bpf_link *bpf_link_get_from_fd(u32 ufd);
Expand Down Expand Up @@ -2704,6 +2706,11 @@ static inline void bpf_link_inc(struct bpf_link *link)
{
}

static inline struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
{
return NULL;
}

static inline void bpf_link_put(struct bpf_link *link)
{
}
Expand Down
75 changes: 65 additions & 10 deletions kernel/bpf/bpf_struct_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <linux/mutex.h>
#include <linux/btf_ids.h>
#include <linux/rcupdate_wait.h>
#include <linux/poll.h>

struct bpf_struct_ops_value {
struct bpf_struct_ops_common_value common;
Expand Down Expand Up @@ -56,6 +57,7 @@ struct bpf_struct_ops_map {
struct bpf_struct_ops_link {
struct bpf_link link;
struct bpf_map __rcu *map;
wait_queue_head_t wait_hup;
};

static DEFINE_MUTEX(update_mutex);
Expand Down Expand Up @@ -757,7 +759,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}

err = st_ops->reg(kdata);
err = st_ops->reg(kdata, NULL);
if (likely(!err)) {
/* This refcnt increment on the map here after
* 'st_ops->reg()' is secure since the state of the
Expand Down Expand Up @@ -805,7 +807,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key)
BPF_STRUCT_OPS_STATE_TOBEFREE);
switch (prev_state) {
case BPF_STRUCT_OPS_STATE_INUSE:
st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL);
bpf_map_put(map);
return 0;
case BPF_STRUCT_OPS_STATE_TOBEFREE:
Expand Down Expand Up @@ -1057,10 +1059,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link)
st_map = (struct bpf_struct_ops_map *)
rcu_dereference_protected(st_link->map, true);
if (st_map) {
/* st_link->map can be NULL if
* bpf_struct_ops_link_create() fails to register.
*/
st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data);
st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);
bpf_map_put(&st_map->map);
}
kfree(st_link);
Expand All @@ -1075,7 +1074,8 @@ static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link,
st_link = container_of(link, struct bpf_struct_ops_link, link);
rcu_read_lock();
map = rcu_dereference(st_link->map);
seq_printf(seq, "map_id:\t%d\n", map->id);
if (map)
seq_printf(seq, "map_id:\t%d\n", map->id);
rcu_read_unlock();
}

Expand All @@ -1088,7 +1088,8 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link,
st_link = container_of(link, struct bpf_struct_ops_link, link);
rcu_read_lock();
map = rcu_dereference(st_link->map);
info->struct_ops.map_id = map->id;
if (map)
info->struct_ops.map_id = map->id;
rcu_read_unlock();
return 0;
}
Expand All @@ -1113,6 +1114,10 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
mutex_lock(&update_mutex);

old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
if (!old_map) {
err = -ENOLINK;
goto err_out;
}
if (expected_old_map && old_map != expected_old_map) {
err = -EPERM;
goto err_out;
Expand All @@ -1125,7 +1130,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
goto err_out;
}

err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data);
err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link);
if (err)
goto err_out;

Expand All @@ -1139,11 +1144,53 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map
return err;
}

static int bpf_struct_ops_map_link_detach(struct bpf_link *link)
{
struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link);
struct bpf_struct_ops_map *st_map;
struct bpf_map *map;

mutex_lock(&update_mutex);

map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex));
if (!map) {
mutex_unlock(&update_mutex);
return 0;
}
st_map = container_of(map, struct bpf_struct_ops_map, map);

st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link);

RCU_INIT_POINTER(st_link->map, NULL);
/* Pair with bpf_map_get() in bpf_struct_ops_link_create() or
* bpf_map_inc() in bpf_struct_ops_map_link_update().
*/
bpf_map_put(&st_map->map);

mutex_unlock(&update_mutex);

wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP);

return 0;
}

static __poll_t bpf_struct_ops_map_link_poll(struct file *file,
struct poll_table_struct *pts)
{
struct bpf_struct_ops_link *st_link = file->private_data;

poll_wait(file, &st_link->wait_hup, pts);

return rcu_access_pointer(st_link->map) ? 0 : EPOLLHUP;
}

static const struct bpf_link_ops bpf_struct_ops_map_lops = {
.dealloc = bpf_struct_ops_map_link_dealloc,
.detach = bpf_struct_ops_map_link_detach,
.show_fdinfo = bpf_struct_ops_map_link_show_fdinfo,
.fill_link_info = bpf_struct_ops_map_link_fill_link_info,
.update_map = bpf_struct_ops_map_link_update,
.poll = bpf_struct_ops_map_link_poll,
};

int bpf_struct_ops_link_create(union bpf_attr *attr)
Expand Down Expand Up @@ -1176,13 +1223,21 @@ int bpf_struct_ops_link_create(union bpf_attr *attr)
if (err)
goto err_out;

err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data);
init_waitqueue_head(&link->wait_hup);

/* Hold the update_mutex such that the subsystem cannot
* do link->ops->detach() before the link is fully initialized.
*/
mutex_lock(&update_mutex);
err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link);
if (err) {
mutex_unlock(&update_mutex);
bpf_link_cleanup(&link_primer);
link = NULL;
goto err_out;
}
RCU_INIT_POINTER(link->map, map);
mutex_unlock(&update_mutex);

return bpf_link_settle(&link_primer);

Expand Down
34 changes: 28 additions & 6 deletions kernel/bpf/syscall.c
Original file line number Diff line number Diff line change
Expand Up @@ -3150,6 +3150,13 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp)
}
#endif

static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts)
{
struct bpf_link *link = file->private_data;

return link->ops->poll(file, pts);
}

static const struct file_operations bpf_link_fops = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_link_show_fdinfo,
Expand All @@ -3159,6 +3166,16 @@ static const struct file_operations bpf_link_fops = {
.write = bpf_dummy_write,
};

static const struct file_operations bpf_link_fops_poll = {
#ifdef CONFIG_PROC_FS
.show_fdinfo = bpf_link_show_fdinfo,
#endif
.release = bpf_link_release,
.read = bpf_dummy_read,
.write = bpf_dummy_write,
.poll = bpf_link_poll,
};

static int bpf_link_alloc_id(struct bpf_link *link)
{
int id;
Expand Down Expand Up @@ -3201,7 +3218,9 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer)
return id;
}

file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC);
file = anon_inode_getfile("bpf_link",
link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
link, O_CLOEXEC);
if (IS_ERR(file)) {
bpf_link_free_id(id);
put_unused_fd(fd);
Expand Down Expand Up @@ -3229,7 +3248,9 @@ int bpf_link_settle(struct bpf_link_primer *primer)

int bpf_link_new_fd(struct bpf_link *link)
{
return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC);
return anon_inode_getfd("bpf-link",
link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops,
link, O_CLOEXEC);
}

struct bpf_link *bpf_link_get_from_fd(u32 ufd)
Expand All @@ -3239,7 +3260,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd)

if (!f.file)
return ERR_PTR(-EBADF);
if (f.file->f_op != &bpf_link_fops) {
if (f.file->f_op != &bpf_link_fops && f.file->f_op != &bpf_link_fops_poll) {
fdput(f);
return ERR_PTR(-EINVAL);
}
Expand Down Expand Up @@ -4971,7 +4992,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
uattr);
else if (f.file->f_op == &btf_fops)
err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr);
else if (f.file->f_op == &bpf_link_fops)
else if (f.file->f_op == &bpf_link_fops || f.file->f_op == &bpf_link_fops_poll)
err = bpf_link_get_info_by_fd(f.file, f.file->private_data,
attr, uattr);
else
Expand Down Expand Up @@ -5106,7 +5127,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
if (!file)
return -EBADF;

if (file->f_op == &bpf_link_fops) {
if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) {
struct bpf_link *link = file->private_data;

if (link->ops == &bpf_raw_tp_link_lops) {
Expand Down Expand Up @@ -5416,10 +5437,11 @@ static int link_detach(union bpf_attr *attr)
return ret;
}

static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link)
{
return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT);
}
EXPORT_SYMBOL(bpf_link_inc_not_zero);

struct bpf_link *bpf_link_by_id(u32 id)
{
Expand Down
4 changes: 2 additions & 2 deletions net/bpf/bpf_dummy_struct_ops.c
Original file line number Diff line number Diff line change
Expand Up @@ -272,12 +272,12 @@ static int bpf_dummy_init_member(const struct btf_type *t,
return -EOPNOTSUPP;
}

static int bpf_dummy_reg(void *kdata)
static int bpf_dummy_reg(void *kdata, struct bpf_link *link)
{
return -EOPNOTSUPP;
}

static void bpf_dummy_unreg(void *kdata)
static void bpf_dummy_unreg(void *kdata, struct bpf_link *link)
{
}

Expand Down
6 changes: 3 additions & 3 deletions net/ipv4/bpf_tcp_ca.c
Original file line number Diff line number Diff line change
Expand Up @@ -260,17 +260,17 @@ static int bpf_tcp_ca_check_member(const struct btf_type *t,
return 0;
}

static int bpf_tcp_ca_reg(void *kdata)
static int bpf_tcp_ca_reg(void *kdata, struct bpf_link *link)
{
return tcp_register_congestion_control(kdata);
}

static void bpf_tcp_ca_unreg(void *kdata)
static void bpf_tcp_ca_unreg(void *kdata, struct bpf_link *link)
{
tcp_unregister_congestion_control(kdata);
}

static int bpf_tcp_ca_update(void *kdata, void *old_kdata)
static int bpf_tcp_ca_update(void *kdata, void *old_kdata, struct bpf_link *link)
{
return tcp_update_congestion_control(kdata, old_kdata);
}
Expand Down
7 changes: 6 additions & 1 deletion tools/bpf/bpftool/skeleton/pid_iter.bpf.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ enum bpf_link_type___local {
};

extern const void bpf_link_fops __ksym;
extern const void bpf_link_fops_poll __ksym __weak;
extern const void bpf_map_fops __ksym;
extern const void bpf_prog_fops __ksym;
extern const void btf_fops __ksym;
Expand Down Expand Up @@ -84,7 +85,11 @@ int iter(struct bpf_iter__task_file *ctx)
fops = &btf_fops;
break;
case BPF_OBJ_LINK:
fops = &bpf_link_fops;
if (&bpf_link_fops_poll &&
file->f_op == &bpf_link_fops_poll)
fops = &bpf_link_fops_poll;
else
fops = &bpf_link_fops;
break;
default:
return 0;
Expand Down
4 changes: 2 additions & 2 deletions tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@ static int dummy_init_member(const struct btf_type *t,
return 0;
}

static int dummy_reg(void *kdata)
static int dummy_reg(void *kdata, struct bpf_link *link)
{
return 0;
}

static void dummy_unreg(void *kdata)
static void dummy_unreg(void *kdata, struct bpf_link *link)
{
}

Expand Down
6 changes: 3 additions & 3 deletions tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,7 @@ static const struct bpf_verifier_ops bpf_testmod_verifier_ops = {
.is_valid_access = bpf_testmod_ops_is_valid_access,
};

static int bpf_dummy_reg(void *kdata)
static int bpf_dummy_reg(void *kdata, struct bpf_link *link)
{
struct bpf_testmod_ops *ops = kdata;

Expand All @@ -835,7 +835,7 @@ static int bpf_dummy_reg(void *kdata)
return 0;
}

static void bpf_dummy_unreg(void *kdata)
static void bpf_dummy_unreg(void *kdata, struct bpf_link *link)
{
}

Expand Down Expand Up @@ -871,7 +871,7 @@ struct bpf_struct_ops bpf_bpf_testmod_ops = {
.owner = THIS_MODULE,
};

static int bpf_dummy_reg2(void *kdata)
static int bpf_dummy_reg2(void *kdata, struct bpf_link *link)
{
struct bpf_testmod_ops2 *ops = kdata;

Expand Down
Loading

0 comments on commit 3f8fde3

Please sign in to comment.