xsk: Support UMEM chunk_size > PAGE_SIZE
Add core AF_XDP support for chunk sizes larger than PAGE_SIZE. This
enables sending/receiving jumbo ethernet frames up to the theoretical
maximum of 64 KiB. For chunk sizes > PAGE_SIZE, the UMEM is required
to consist of HugeTLB VMAs (and be hugepage aligned). Initially, only
XDP_COPY mode is usable pending future driver work.
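For illustration only (not part of this commit), a minimal userspace
sketch of registering such a UMEM could look as follows. It assumes
2 MiB hugepages have been reserved (e.g. via vm.nr_hugepages), omits
all error handling, and the NUM_CHUNKS / CHUNK_SIZE /
create_umem_socket names are made up; only the uapi names come from
<linux/if_xdp.h> and <sys/socket.h>:

    #include <linux/if_xdp.h>
    #include <stdint.h>
    #include <sys/mman.h>
    #include <sys/socket.h>

    #define NUM_CHUNKS 4096
    #define CHUNK_SIZE 16384    /* > 4 KiB; a power of two in aligned mode */

    int create_umem_socket(void)
    {
        int fd = socket(AF_XDP, SOCK_RAW, 0);
        size_t len = (size_t)NUM_CHUNKS * CHUNK_SIZE; /* 64 MiB: hugepage multiple */

        /* chunk_size > PAGE_SIZE needs HugeTLB-backed, hugepage-aligned memory. */
        void *area = mmap(NULL, len, PROT_READ | PROT_WRITE,
                          MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

        struct xdp_umem_reg mr = {
            .addr = (uintptr_t)area,
            .len = len,
            .chunk_size = CHUNK_SIZE,
            .headroom = 0,
            .flags = 0,
        };

        /* Fails with EINVAL if the memory is not hugepage backed. */
        setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));

        /* The socket would then be bound with sxdp_flags = XDP_COPY. */
        return fd;
    }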

For consistency, check for HugeTLB pages during UMEM registration. This
implies that hugepages are required for XDP_COPY mode despite DMA not
being used. This restriction is desirable since it ensures user software
can take advantage of future driver support.

Even in HugeTLB mode, continue to do page accounting using order-0
(4 KiB) pages. This minimizes the size of this change and reduces the
risk of impacting driver code. Taking full advantage of hugepages for
accounting should improve XDP performance in the general case.

No significant change in RX/TX performance was observed with this patch.
A few data points are reproduced below:

Machine : Dell PowerEdge R940
CPU     : Intel(R) Xeon(R) Platinum 8168 CPU @ 2.70GHz
NIC     : MT27700 Family [ConnectX-4]

+-----+------------+-------------+---------------+
|     | frame size | packet size | rxdrop (Mpps) |
+-----+------------+-------------+---------------+
| old |       4000 |         320 |          15.7 |
| new |       4000 |         320 |          15.8 |
+-----+------------+-------------+---------------+
| old |       4096 |         320 |          16.4 |
| new |       4096 |         320 |          16.3 |
+-----+------------+-------------+---------------+
| new |       9000 |         320 |           6.3 |
| new |      10240 |        9000 |           0.4 |
+-----+------------+-------------+---------------+

Signed-off-by: Kal Conley <kal.conley@dectris.com>
kalcutter authored and intel-lab-lkp committed Mar 19, 2023
1 parent c7df481 commit bbcc35c
Showing 6 changed files with 61 additions and 15 deletions.
1 change: 1 addition & 0 deletions include/net/xdp_sock.h
@@ -28,6 +28,7 @@ struct xdp_umem {
struct user_struct *user;
refcount_t users;
u8 flags;
bool hugetlb;
bool zc;
struct page **pgs;
int id;
6 changes: 6 additions & 0 deletions include/net/xdp_sock_drv.h
@@ -12,6 +12,12 @@
#define XDP_UMEM_MIN_CHUNK_SHIFT 11
#define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT)

/* Allow chunk sizes up to the maximum size of an ethernet frame (64 KiB).
* Larger chunks are not guaranteed to fit in a single SKB.
*/
#define XDP_UMEM_MAX_CHUNK_SHIFT 16
#define XDP_UMEM_MAX_CHUNK_SIZE (1 << XDP_UMEM_MAX_CHUNK_SHIFT)

#ifdef CONFIG_XDP_SOCKETS

void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
4 changes: 3 additions & 1 deletion include/net/xsk_buff_pool.h
@@ -78,6 +78,7 @@ struct xsk_buff_pool {
u8 cached_need_wakeup;
bool uses_need_wakeup;
bool dma_need_sync;
bool hugetlb;
bool unaligned;
void *addrs;
/* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
@@ -175,7 +176,8 @@ static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
u64 addr, u32 len)
{
bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
(addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;

if (likely(!cross_pg))
return false;
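As a hedged illustration of the new boundary check (assuming 2 MiB
hugepages, i.e. HPAGE_SIZE = 0x200000; the addresses are made up):

    addr % HPAGE_SIZE = 0x1FE000, len = 9000:
        0x1FE000 + 9000 = 0x200328 > 0x200000  -> crosses a hugepage
        boundary, so DMA contiguity across it must still be verified.
    addr % HPAGE_SIZE = 0x100000, len = 9000:
        0x100000 + 9000 = 0x102328 <= 0x200000 -> no crossing, even
        though the descriptor spans three 4 KiB pages (which the old
        PAGE_SIZE-based check would have flagged).
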
46 changes: 36 additions & 10 deletions net/xdp/xdp_umem.c
@@ -10,6 +10,7 @@
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/hugetlb_inline.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
@@ -19,6 +20,9 @@
#include "xdp_umem.h"
#include "xsk_queue.h"

_Static_assert(XDP_UMEM_MIN_CHUNK_SIZE <= PAGE_SIZE);
_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE <= HPAGE_SIZE);

static DEFINE_IDA(umem_ida);

static void xdp_umem_unpin_pages(struct xdp_umem *umem)
@@ -91,7 +95,26 @@ void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup)
}
}

static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
/* Returns true if the UMEM contains HugeTLB pages exclusively, false otherwise.
*
* The mmap_lock must be held by the caller.
*/
static bool xdp_umem_is_hugetlb(struct xdp_umem *umem, unsigned long address)
{
unsigned long end = address + umem->npgs * PAGE_SIZE;
struct vm_area_struct *vma;
struct vma_iterator vmi;

vma_iter_init(&vmi, current->mm, address);
for_each_vma_range(vmi, vma, end) {
if (!is_vm_hugetlb_page(vma))
return false;
}

return true;
}

static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address, bool hugetlb)
{
unsigned int gup_flags = FOLL_WRITE;
long npgs;
@@ -102,8 +125,17 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
return -ENOMEM;

mmap_read_lock(current->mm);

umem->hugetlb = IS_ALIGNED(address, HPAGE_SIZE) && xdp_umem_is_hugetlb(umem, address);
if (hugetlb && !umem->hugetlb) {
mmap_read_unlock(current->mm);
err = -EINVAL;
goto out_pgs;
}

npgs = pin_user_pages(address, umem->npgs,
gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);

mmap_read_unlock(current->mm);

if (npgs != umem->npgs) {
@@ -152,20 +184,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
bool hugetlb = chunk_size > PAGE_SIZE;
u64 addr = mr->addr, size = mr->len;
u32 chunks_rem, npgs_rem;
u64 chunks, npgs;
int err;

if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
/* Strictly speaking we could support this, if:
* - huge pages, or*
* - using an IOMMU, or
* - making sure the memory area is consecutive
* but for now, we simply say "computer says no".
*/
if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > XDP_UMEM_MAX_CHUNK_SIZE)
return -EINVAL;
}

if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
return -EINVAL;
@@ -215,7 +241,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
if (err)
return err;

err = xdp_umem_pin_pages(umem, (unsigned long)addr);
err = xdp_umem_pin_pages(umem, (unsigned long)addr, hugetlb);
if (err)
goto out_account;

3 changes: 3 additions & 0 deletions net/xdp/xsk.c
@@ -421,6 +421,9 @@ static void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}

/* Chunks must fit in the SKB `frags` array. */
_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE / PAGE_SIZE <= MAX_SKB_FRAGS);

static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
struct xdp_desc *desc)
{
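For reference (an observation, not part of the diff): with 4 KiB pages
the largest chunk spans 64 KiB / 4 KiB = 16 order-0 pages, which fits
within MAX_SKB_FRAGS (17, i.e. 65536 / PAGE_SIZE + 1, at the time of
this commit), so a TX SKB can always carry a full chunk in its frags
array.
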
16 changes: 12 additions & 4 deletions net/xdp/xsk_buff_pool.c
@@ -80,6 +80,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
pool->headroom = umem->headroom;
pool->chunk_size = umem->chunk_size;
pool->chunk_shift = ffs(umem->chunk_size) - 1;
pool->hugetlb = umem->hugetlb;
pool->unaligned = unaligned;
pool->frame_len = umem->chunk_size - umem->headroom -
XDP_PACKET_HEADROOM;
@@ -369,16 +370,23 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
}
EXPORT_SYMBOL(xp_dma_unmap);

static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
/* HugeTLB pools consider contiguity at hugepage granularity only. Hence, all
* order-0 pages within a hugepage have the same contiguity value.
*/
static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map, bool hugetlb)
{
u32 page_size = hugetlb ? HPAGE_SIZE : PAGE_SIZE;
u32 n = page_size >> PAGE_SHIFT;
u32 i;

for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) {
if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
for (i = 0; i + n < dma_map->dma_pages_cnt; i++) {
if (dma_map->dma_pages[i] + page_size == dma_map->dma_pages[i + n])
dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
else
dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
}
for (; i < dma_map->dma_pages_cnt; i++)
dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
}

static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
@@ -441,7 +449,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
}

if (pool->unaligned)
xp_check_dma_contiguity(dma_map);
xp_check_dma_contiguity(dma_map, pool->hugetlb);

err = xp_init_dma_info(pool, dma_map);
if (err) {
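To make the hugepage-granularity marking in xp_check_dma_contiguity
concrete (a hedged illustration; assuming 2 MiB hugepages, so
n = HPAGE_SIZE >> PAGE_SHIFT = 512, and made-up DMA addresses):

    dma_pages[0..511]    map a hugepage at DMA address 0x10000000
    dma_pages[512..1023] map a hugepage at DMA address 0x10200000

    Entries 0..511 get XSK_NEXT_PG_CONTIG_MASK set, because
    dma_pages[i] + 0x200000 == dma_pages[i + 512] for each of them;
    entries 512..1023 have the flag cleared by the trailing loop,
    since no further hugepage follows.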
