diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 3057e1a4a11c63..e562ac3f5295be 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -28,6 +28,7 @@ struct xdp_umem {
 	struct user_struct *user;
 	refcount_t users;
 	u8 flags;
+	bool hugetlb;
 	bool zc;
 	struct page **pgs;
 	int id;
diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
index 9c0d860609ba94..eb630d17f9946a 100644
--- a/include/net/xdp_sock_drv.h
+++ b/include/net/xdp_sock_drv.h
@@ -12,6 +12,12 @@
 #define XDP_UMEM_MIN_CHUNK_SHIFT 11
 #define XDP_UMEM_MIN_CHUNK_SIZE (1 << XDP_UMEM_MIN_CHUNK_SHIFT)
 
+/* Allow chunk sizes up to the maximum size of an ethernet frame (64 KiB).
+ * Larger chunks are not guaranteed to fit in a single SKB.
+ */
+#define XDP_UMEM_MAX_CHUNK_SHIFT 16
+#define XDP_UMEM_MAX_CHUNK_SIZE (1 << XDP_UMEM_MAX_CHUNK_SHIFT)
+
 #ifdef CONFIG_XDP_SOCKETS
 
 void xsk_tx_completed(struct xsk_buff_pool *pool, u32 nb_entries);
diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h
index 3e952e56941885..69e3970da0926a 100644
--- a/include/net/xsk_buff_pool.h
+++ b/include/net/xsk_buff_pool.h
@@ -78,6 +78,7 @@ struct xsk_buff_pool {
 	u8 cached_need_wakeup;
 	bool uses_need_wakeup;
 	bool dma_need_sync;
+	bool hugetlb;
 	bool unaligned;
 	void *addrs;
 	/* Mutual exclusion of the completion ring in the SKB mode. Two cases to protect:
@@ -175,7 +176,8 @@ static inline void xp_dma_sync_for_device(struct xsk_buff_pool *pool,
 static inline bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool,
 						 u64 addr, u32 len)
 {
-	bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
+	bool cross_pg = pool->hugetlb ? (addr & (HPAGE_SIZE - 1)) + len > HPAGE_SIZE :
+			(addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE;
 
 	if (likely(!cross_pg))
 		return false;
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index 02207e852d796d..c96eefb9f5aecf 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -10,6 +10,7 @@
 #include <linux/uaccess.h>
 #include <linux/slab.h>
 #include <linux/bpf.h>
+#include <linux/hugetlb.h>
 #include <linux/mm.h>
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
@@ -19,6 +20,9 @@
 #include "xdp_umem.h"
 #include "xsk_queue.h"
 
+_Static_assert(XDP_UMEM_MIN_CHUNK_SIZE <= PAGE_SIZE);
+_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE <= HPAGE_SIZE);
+
 static DEFINE_IDA(umem_ida);
 
 static void xdp_umem_unpin_pages(struct xdp_umem *umem)
@@ -91,7 +95,26 @@ void xdp_put_umem(struct xdp_umem *umem, bool defer_cleanup)
 	}
 }
 
-static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
+/* Returns true if the UMEM contains HugeTLB pages exclusively, false otherwise.
+ *
+ * The mmap_lock must be held by the caller.
+ */
+static bool xdp_umem_is_hugetlb(struct xdp_umem *umem, unsigned long address)
+{
+	unsigned long end = address + umem->npgs * PAGE_SIZE;
+	struct vm_area_struct *vma;
+	struct vma_iterator vmi;
+
+	vma_iter_init(&vmi, current->mm, address);
+	for_each_vma_range(vmi, vma, end) {
+		if (!is_vm_hugetlb_page(vma))
+			return false;
+	}
+
+	return true;
+}
+
+static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address, bool hugetlb)
 {
 	unsigned int gup_flags = FOLL_WRITE;
 	long npgs;
@@ -102,8 +125,17 @@ static int xdp_umem_pin_pages(struct xdp_umem *umem, unsigned long address)
 		return -ENOMEM;
 
 	mmap_read_lock(current->mm);
+
+	umem->hugetlb = IS_ALIGNED(address, HPAGE_SIZE) && xdp_umem_is_hugetlb(umem, address);
+	if (hugetlb && !umem->hugetlb) {
+		mmap_read_unlock(current->mm);
+		err = -EINVAL;
+		goto out_pgs;
+	}
+
 	npgs = pin_user_pages(address, umem->npgs,
 			      gup_flags | FOLL_LONGTERM, &umem->pgs[0], NULL);
+
 	mmap_read_unlock(current->mm);
 
 	if (npgs != umem->npgs) {
@@ -152,20 +184,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 {
 	bool unaligned_chunks = mr->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG;
 	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
+	bool hugetlb = chunk_size > PAGE_SIZE;
 	u64 addr = mr->addr, size = mr->len;
 	u32 chunks_rem, npgs_rem;
 	u64 chunks, npgs;
 	int err;
 
-	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
-		/* Strictly speaking we could support this, if:
-		 * - huge pages, or*
-		 * - using an IOMMU, or
-		 * - making sure the memory area is consecutive
-		 * but for now, we simply say "computer says no".
-		 */
+	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > XDP_UMEM_MAX_CHUNK_SIZE)
 		return -EINVAL;
-	}
 
 	if (mr->flags & ~XDP_UMEM_UNALIGNED_CHUNK_FLAG)
 		return -EINVAL;
@@ -215,7 +241,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
 	if (err)
 		return err;
 
-	err = xdp_umem_pin_pages(umem, (unsigned long)addr);
+	err = xdp_umem_pin_pages(umem, (unsigned long)addr, hugetlb);
 	if (err)
 		goto out_account;
 
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 2ac58b282b5eb2..3899a2d235bb80 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -421,6 +421,9 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 	sock_wfree(skb);
 }
 
+/* Chunks must fit in the SKB `frags` array. */
+_Static_assert(XDP_UMEM_MAX_CHUNK_SIZE / PAGE_SIZE <= MAX_SKB_FRAGS);
+
 static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 					      struct xdp_desc *desc)
 {
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index b2df1e0f81538a..777e8a38a232ce 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -80,6 +80,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 	pool->headroom = umem->headroom;
 	pool->chunk_size = umem->chunk_size;
 	pool->chunk_shift = ffs(umem->chunk_size) - 1;
+	pool->hugetlb = umem->hugetlb;
 	pool->unaligned = unaligned;
 	pool->frame_len = umem->chunk_size - umem->headroom -
 			  XDP_PACKET_HEADROOM;
@@ -369,16 +370,23 @@ void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs)
 }
 EXPORT_SYMBOL(xp_dma_unmap);
 
-static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map)
+/* HugeTLB pools consider contiguity at hugepage granularity only. Hence, all
+ * order-0 pages within a hugepage have the same contiguity value.
+ */
+static void xp_check_dma_contiguity(struct xsk_dma_map *dma_map, bool hugetlb)
 {
+	u32 page_size = hugetlb ? HPAGE_SIZE : PAGE_SIZE;
+	u32 n = page_size >> PAGE_SHIFT;
 	u32 i;
 
-	for (i = 0; i < dma_map->dma_pages_cnt - 1; i++) {
-		if (dma_map->dma_pages[i] + PAGE_SIZE == dma_map->dma_pages[i + 1])
+	for (i = 0; i + n < dma_map->dma_pages_cnt; i++) {
+		if (dma_map->dma_pages[i] + page_size == dma_map->dma_pages[i + n])
 			dma_map->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK;
 		else
 			dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
 	}
+	for (; i < dma_map->dma_pages_cnt; i++)
+		dma_map->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK;
 }
 
 static int xp_init_dma_info(struct xsk_buff_pool *pool, struct xsk_dma_map *dma_map)
@@ -441,7 +449,7 @@ int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev,
 	}
 
 	if (pool->unaligned)
-		xp_check_dma_contiguity(dma_map);
+		xp_check_dma_contiguity(dma_map, pool->hugetlb);
 
 	err = xp_init_dma_info(pool, dma_map);
 	if (err) {
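
For reviewers, a minimal userspace sketch (not part of the patch) of how the raised limit might be exercised: it registers a HugeTLB-backed UMEM with a 32 KiB chunk size, which xdp_umem_reg() rejects with -EINVAL before this change. The 2 MiB hugepage size, the reserved hugepages, and the omitted ring setup and bind() are assumptions; names such as UMEM_LEN and CHUNK_SIZE are illustrative only.

/* Illustrative only: register a UMEM backed by anonymous HugeTLB memory with
 * chunk_size > PAGE_SIZE. Assumes 2 MiB hugepages have been reserved (e.g. via
 * vm.nr_hugepages) and CAP_NET_RAW; fill/completion ring setup, bind() and
 * error reporting are omitted.
 */
#include <linux/if_xdp.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef AF_XDP
#define AF_XDP 44
#endif
#ifndef SOL_XDP
#define SOL_XDP 283
#endif

#define UMEM_LEN	(16UL * 1024 * 1024)	/* multiple of the 2 MiB hugepage size */
#define CHUNK_SIZE	(32 * 1024)		/* > PAGE_SIZE, <= XDP_UMEM_MAX_CHUNK_SIZE */

int main(void)
{
	struct xdp_umem_reg mr = {0};
	void *area;
	int fd;

	/* MAP_HUGETLB yields a hugepage-aligned, HugeTLB-backed VMA, so the
	 * kernel can set umem->hugetlb during registration.
	 */
	area = mmap(NULL, UMEM_LEN, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (area == MAP_FAILED)
		return 1;

	fd = socket(AF_XDP, SOCK_RAW, 0);
	if (fd < 0)
		return 1;

	mr.addr = (uintptr_t)area;
	mr.len = UMEM_LEN;
	mr.chunk_size = CHUNK_SIZE;	/* rejected with EINVAL before this patch */
	mr.headroom = 0;

	/* With chunk_size > PAGE_SIZE, registration now requires every VMA in
	 * [addr, addr + len) to be a HugeTLB mapping; otherwise it fails.
	 */
	if (setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr)))
		return 1;

	close(fd);
	munmap(area, UMEM_LEN);
	return 0;
}

On kernels without this change, the setsockopt() call is the point of failure: any chunk_size above PAGE_SIZE is rejected with EINVAL.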