Skip to content

Commit

Permalink
vsock: Add lockless sendmsg() support
Browse files Browse the repository at this point in the history
Because the dgram sendmsg() path for AF_VSOCK acquires the socket lock,
it does not scale when many senders share a socket.

Prior to this patch the socket lock is used to protect both reads and
writes to the local_addr, remote_addr, transport, and buffer size
variables of a vsock socket. What follows are the new protection schemes
for these fields that ensure a race-free and usually lock-free
multi-sender sendmsg() path for vsock dgrams.

- local_addr
    local_addr changes as a result of binding a socket. The write path
    for local_addr is bind() and various vsock_auto_bind() call sites.
    After a socket has been bound via vsock_auto_bind() or bind(), subsequent
    calls to bind()/vsock_auto_bind() do not write to local_addr again. bind()
    rejects the user request and vsock_auto_bind() early exits.
    Therefore, the local addr can not change while a parallel thread is
    in sendmsg() and lock-free reads of local addr in sendmsg() are safe.
    Change: only acquire lock for auto-binding as-needed in sendmsg().

- buffer size variables
    Not used by dgram, so they do not need protection. No change.

- remote_addr and transport
    Because a remote_addr update may result in a changed transport, but we
    would like to be able to read these two fields lock-free but coherently
    in the vsock send path, this patch packages these two fields into a new
    struct vsock_remote_info that is referenced by an RCU-protected pointer.

    Writes are synchronized as usual by the socket lock. Reads only take
    place in RCU read-side critical sections. When remote_addr or transport
    is updated, a new remote info is allocated. Old readers still see the
    old coherent remote_addr/transport pair, and new readers will refer to
    the new coherent pair. The coherency between remote_addr and transport
    previously provided by the socket lock alone is now also preserved by
    RCU, but with a highly scalable, lock-free read side.

Helpers are introduced for accessing and updating the new pointer.

The new structure contains an rcu_head so that kfree_rcu() can be
used. This removes the need for writers to call synchronize_rcu() before
freeing old structures, which is simply more efficient and reduces code
churn where remote_addr/transport are already being updated inside RCU
read-side sections.

Only virtio has been tested, but updates were necessary to the VMCI and
hyperv code. Unfortunately the author does not have access to
VMCI/hyperv systems so those changes are untested.

Perf Tests (results from patch v2)
vCPUS: 16
Threads: 16
Payload: 4KB
Test Runs: 5
Type: SOCK_DGRAM

Before: 245.2 MB/s
After: 509.2 MB/s (+107%)

Notably, on the same test system, vsock dgram even outperforms
multi-threaded UDP over virtio-net with vhost and MQ support enabled.

Throughput metrics for single-threaded SOCK_DGRAM and
single/multi-threaded SOCK_STREAM showed no statistically significant
throughput changes (lowest p-value reaching 0.27), with the mean
difference ranging from -5% to +1%.

Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
  • Loading branch information
Bobby Eshleman authored and intel-lab-lkp committed May 31, 2023
1 parent f69f8a1 commit 4e9bf74
Show file tree
Hide file tree
Showing 9 changed files with 518 additions and 150 deletions.
12 changes: 10 additions & 2 deletions drivers/vhost/vsock.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,13 +297,17 @@ static int
vhost_transport_cancel_pkt(struct vsock_sock *vsk)
{
struct vhost_vsock *vsock;
unsigned int cid;
int cnt = 0;
int ret = -ENODEV;

rcu_read_lock();
ret = vsock_remote_addr_cid(vsk, &cid);
if (ret < 0)
goto out;

/* Find the vhost_vsock according to guest context id */
vsock = vhost_vsock_get(vsk->remote_addr.svm_cid);
vsock = vhost_vsock_get(cid);
if (!vsock)
goto out;

Expand Down Expand Up @@ -706,14 +710,18 @@ static void vhost_vsock_flush(struct vhost_vsock *vsock)
static void vhost_vsock_reset_orphans(struct sock *sk)
{
struct vsock_sock *vsk = vsock_sk(sk);
unsigned int cid;

if (vsock_remote_addr_cid(vsk, &cid) < 0)
return;

/* vmci_transport.c doesn't take sk_lock here either. At least we're
* under vsock_table_lock so the sock cannot disappear while we're
* executing.
*/

/* If the peer is still valid, no need to reset connection */
if (vhost_vsock_get(vsk->remote_addr.svm_cid))
if (vhost_vsock_get(cid))
return;

/* If the close timeout is pending, let it expire. This avoids races
Expand Down
3 changes: 2 additions & 1 deletion include/linux/virtio_vsock.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,8 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
struct msghdr *msg,
size_t len);
int
virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
virtio_transport_dgram_enqueue(const struct vsock_transport *transport,
struct vsock_sock *vsk,
struct sockaddr_vm *remote_addr,
struct msghdr *msg,
size_t len);
Expand Down
39 changes: 33 additions & 6 deletions include/net/af_vsock.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,17 @@ extern spinlock_t vsock_table_lock;
#define vsock_sk(__sk) ((struct vsock_sock *)__sk)
#define sk_vsock(__vsk) (&(__vsk)->sk)

/* Remote address and transport of a vsock socket, packaged together so
 * that both can be read as one coherent pair through the RCU-protected
 * pointer vsock_sock::remote_info.
 */
struct vsock_remote_info {
struct sockaddr_vm addr;
/* Allows an old instance to be released with kfree_rcu(). */
struct rcu_head rcu;
const struct vsock_transport *transport;
};

struct vsock_sock {
/* sk must be the first member. */
struct sock sk;
const struct vsock_transport *transport;
struct sockaddr_vm local_addr;
struct sockaddr_vm remote_addr;
struct vsock_remote_info * __rcu remote_info;
/* Links for the global tables of bound and connected sockets. */
struct list_head bound_table;
struct list_head connected_table;
Expand Down Expand Up @@ -120,8 +125,8 @@ struct vsock_transport {

/* DGRAM. */
int (*dgram_bind)(struct vsock_sock *, struct sockaddr_vm *);
int (*dgram_enqueue)(struct vsock_sock *, struct sockaddr_vm *,
struct msghdr *, size_t len);
int (*dgram_enqueue)(const struct vsock_transport *, struct vsock_sock *,
struct sockaddr_vm *, struct msghdr *, size_t len);
bool (*dgram_allow)(u32 cid, u32 port);
int (*dgram_get_cid)(struct sk_buff *skb, unsigned int *cid);
int (*dgram_get_port)(struct sk_buff *skb, unsigned int *port);
Expand Down Expand Up @@ -196,6 +201,17 @@ void vsock_core_unregister(const struct vsock_transport *t);
/* The transport may downcast this to access transport-specific functions */
const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk);

/* Fetch the remote info currently published for @vsk.
 *
 * Callers must either be inside an RCU read-side critical section or hold
 * the socket lock; the lockdep condition passed below covers the latter.
 */
static inline struct vsock_remote_info *
vsock_core_get_remote_info(struct vsock_sock *vsk)
{
	struct vsock_remote_info *info;

	info = rcu_dereference_check(vsk->remote_info,
				     lockdep_sock_is_held(sk_vsock(vsk)));

	return info;
}

/**** UTILS ****/

/* vsock_table_lock must be held */
Expand All @@ -214,7 +230,7 @@ void vsock_release_pending(struct sock *pending);
void vsock_add_pending(struct sock *listener, struct sock *pending);
void vsock_remove_pending(struct sock *listener, struct sock *pending);
void vsock_enqueue_accept(struct sock *listener, struct sock *connected);
void vsock_insert_connected(struct vsock_sock *vsk);
int vsock_insert_connected(struct vsock_sock *vsk);
void vsock_remove_bound(struct vsock_sock *vsk);
void vsock_remove_connected(struct vsock_sock *vsk);
struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr);
Expand All @@ -223,7 +239,8 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
void vsock_remove_sock(struct vsock_sock *vsk);
void vsock_for_each_connected_socket(struct vsock_transport *transport,
void (*fn)(struct sock *sk));
int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk);
int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk,
struct sockaddr_vm *remote_addr);
bool vsock_find_cid(unsigned int cid);
struct sock *vsock_find_bound_dgram_socket(struct sockaddr_vm *addr);

Expand Down Expand Up @@ -253,4 +270,14 @@ static inline void __init vsock_bpf_build_proto(void)
{}
#endif

/* RCU-protected remote addr helpers.
 *
 * Callers of vsock_remote_addr_cid() in this change treat a negative
 * return value as failure and only consume *cid when the call succeeds.
 * NOTE(review): the other int-returning helpers presumably follow the same
 * negative-on-error convention -- confirm against their definitions.
 */
int vsock_remote_addr_cid(struct vsock_sock *vsk, unsigned int *cid);
int vsock_remote_addr_port(struct vsock_sock *vsk, unsigned int *port);
int vsock_remote_addr_cid_port(struct vsock_sock *vsk, unsigned int *cid,
unsigned int *port);
int vsock_remote_addr_copy(struct vsock_sock *vsk, struct sockaddr_vm *dest);
bool vsock_remote_addr_bound(struct vsock_sock *vsk);
bool vsock_remote_addr_equals(struct vsock_sock *vsk, struct sockaddr_vm *other);
int vsock_remote_addr_update_cid_port(struct vsock_sock *vsk, u32 cid, u32 port);

#endif /* __AF_VSOCK_H__ */

0 comments on commit 4e9bf74

Please sign in to comment.