Skip to content

Commit

Permalink
userfaultfd: introduce access-likely mode for copy/wp operations
Browse files Browse the repository at this point in the history
On x86, accessing memory through a PTE whose access-bit (aka young-bit) is
cleared takes ~600 cycles more than when the access-bit is set. At the same
time, setting the access-bit for memory that is not used (e.g.,
prefetched) can introduce greater overheads, as the prefetched memory is
reclaimed later than it should be.

Userfaultfd currently does not set the access-bit (excluding the
huge-pages case). Arguably, it is best to let the user control whether
the access bit should be set or not. The expected use is to request
userfaultfd to set the access-bit when the copy/wp operation is done to
resolve a page-fault, and not to set the access-bit when the memory is
prefetched.

Introduce UFFDIO_COPY_MODE_ACCESS_LIKELY and
UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY to enable userspace to request
the young bit to be set. For UFFDIO_CONTINUE and UFFDIO_ZEROPAGE, set the
bit unconditionally, since the former is only used to resolve page-faults
and the latter would not benefit from leaving the access-bit clear.

Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Signed-off-by: Nadav Amit <namit@vmware.com>
  • Loading branch information
anadav authored and intel-lab-lkp committed Jun 20, 2022
1 parent 0b36999 commit ad1b812
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 10 deletions.
23 changes: 16 additions & 7 deletions fs/userfaultfd.c
Expand Up @@ -1700,7 +1700,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
struct uffdio_copy uffdio_copy;
struct uffdio_copy __user *user_uffdio_copy;
struct userfaultfd_wake_range range;
bool mode_wp;
bool mode_wp, mode_access_likely;
uffd_flags_t uffd_flags;

user_uffdio_copy = (struct uffdio_copy __user *) arg;
Expand All @@ -1726,12 +1726,15 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx,
ret = -EINVAL;
if (uffdio_copy.src + uffdio_copy.len <= uffdio_copy.src)
goto out;
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP))
if (uffdio_copy.mode & ~(UFFDIO_COPY_MODE_DONTWAKE|UFFDIO_COPY_MODE_WP|
UFFDIO_COPY_MODE_ACCESS_LIKELY))
goto out;

mode_wp = uffdio_copy.mode & UFFDIO_COPY_MODE_WP;
mode_access_likely = uffdio_copy.mode & UFFDIO_COPY_MODE_ACCESS_LIKELY;

uffd_flags = mode_wp ? UFFD_FLAGS_WP : 0;
uffd_flags = (mode_wp ? UFFD_FLAGS_WP : 0) |
(mode_access_likely ? UFFD_FLAGS_ACCESS_LIKELY : 0);

if (mmget_not_zero(ctx->mm)) {
ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src,
Expand Down Expand Up @@ -1816,7 +1819,7 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
struct uffdio_writeprotect uffdio_wp;
struct uffdio_writeprotect __user *user_uffdio_wp;
struct userfaultfd_wake_range range;
bool mode_wp, mode_dontwake;
bool mode_wp, mode_dontwake, mode_access_likely;
uffd_flags_t uffd_flags;

if (atomic_read(&ctx->mmap_changing))
Expand All @@ -1834,16 +1837,19 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return ret;

if (uffdio_wp.mode & ~(UFFDIO_WRITEPROTECT_MODE_DONTWAKE |
UFFDIO_WRITEPROTECT_MODE_WP))
UFFDIO_WRITEPROTECT_MODE_WP |
UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY))
return -EINVAL;

mode_wp = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_WP;
mode_dontwake = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
mode_access_likely = uffdio_wp.mode & UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY;

if (mode_wp && mode_dontwake)
return -EINVAL;

uffd_flags = (mode_wp ? UFFD_FLAGS_WP : 0);
uffd_flags = (mode_wp ? UFFD_FLAGS_WP : 0) |
(mode_access_likely ? UFFD_FLAGS_ACCESS_LIKELY : 0);

if (mmget_not_zero(ctx->mm)) {
ret = mwriteprotect_range(ctx->mm, uffdio_wp.range.start,
Expand Down Expand Up @@ -1871,6 +1877,7 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
struct uffdio_continue uffdio_continue;
struct uffdio_continue __user *user_uffdio_continue;
struct userfaultfd_wake_range range;
uffd_flags_t uffd_flags;

user_uffdio_continue = (struct uffdio_continue __user *)arg;

Expand Down Expand Up @@ -1898,10 +1905,12 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
goto out;

uffd_flags = UFFD_FLAGS_ACCESS_LIKELY;

if (mmget_not_zero(ctx->mm)) {
ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
uffdio_continue.range.len,
&ctx->mmap_changing, 0);
&ctx->mmap_changing, uffd_flags);
mmput(ctx->mm);
} else {
return -ESRCH;
Expand Down
1 change: 1 addition & 0 deletions include/linux/userfaultfd_k.h
Expand Up @@ -58,6 +58,7 @@ enum mcopy_atomic_mode {
typedef unsigned int __bitwise uffd_flags_t;

#define UFFD_FLAGS_WP ((__force uffd_flags_t)BIT(0))
#define UFFD_FLAGS_ACCESS_LIKELY ((__force uffd_flags_t)BIT(1))

extern int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
Expand Down
20 changes: 19 additions & 1 deletion include/uapi/linux/userfaultfd.h
Expand Up @@ -38,7 +38,8 @@
UFFD_FEATURE_MINOR_HUGETLBFS | \
UFFD_FEATURE_MINOR_SHMEM | \
UFFD_FEATURE_EXACT_ADDRESS | \
UFFD_FEATURE_WP_HUGETLBFS_SHMEM)
UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \
UFFD_FEATURE_ACCESS_HINTS)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
Expand Down Expand Up @@ -203,6 +204,10 @@ struct uffdio_api {
*
* UFFD_FEATURE_WP_HUGETLBFS_SHMEM indicates that userfaultfd
* write-protection mode is supported on both shmem and hugetlbfs.
*
* UFFD_FEATURE_ACCESS_HINTS indicates that UFFDIO_COPY supports
* UFFDIO_COPY_MODE_ACCESS_LIKELY and that UFFDIO_WRITEPROTECT supports
* UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY.
*/
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
Expand All @@ -217,6 +222,7 @@ struct uffdio_api {
#define UFFD_FEATURE_MINOR_SHMEM (1<<10)
#define UFFD_FEATURE_EXACT_ADDRESS (1<<11)
#define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12)
#define UFFD_FEATURE_ACCESS_HINTS (1<<13)
__u64 features;

__u64 ioctls;
Expand Down Expand Up @@ -260,6 +266,13 @@ struct uffdio_copy {
* copy_from_user will not read the last 8 bytes.
*/
__s64 copy;
/*
* UFFDIO_COPY_MODE_ACCESS_LIKELY marks the mapped page as young.
* This can reduce the time that the first access to the page takes.
* Yet, if set opportunistically on memory that ends up unused, it might
* extend the time before the unused memory pages are reclaimed.
*/
#define UFFDIO_COPY_MODE_ACCESS_LIKELY ((__u64)1<<3)
};

struct uffdio_zeropage {
Expand All @@ -284,13 +297,18 @@ struct uffdio_writeprotect {
* UFFDIO_WRITEPROTECT_MODE_DONTWAKE: set the flag to avoid waking up
* any wait thread after the operation succeeds.
*
* UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY: set the flag to mark the modified
* memory as young, which can reduce the time that the first access
* to the page takes.
*
* NOTE: Write protecting a region (WP=1) is unrelated to page faults,
* therefore DONTWAKE flag is meaningless with WP=1. Removing write
* protection (WP=0) in response to a page fault wakes the faulting
* task unless DONTWAKE is set.
*/
#define UFFDIO_WRITEPROTECT_MODE_WP ((__u64)1<<0)
#define UFFDIO_WRITEPROTECT_MODE_DONTWAKE ((__u64)1<<1)
#define UFFDIO_WRITEPROTECT_MODE_ACCESS_LIKELY ((__u64)1<<2)
__u64 mode;
};

Expand Down
18 changes: 16 additions & 2 deletions mm/userfaultfd.c
Expand Up @@ -92,6 +92,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
*/
_dst_pte = pte_wrprotect(_dst_pte);

if (uffd_flags & UFFD_FLAGS_ACCESS_LIKELY)
_dst_pte = pte_mkyoung(_dst_pte);

dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);

if (vma_is_shmem(dst_vma)) {
Expand Down Expand Up @@ -202,7 +205,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
static int mfill_zeropage_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr)
unsigned long dst_addr,
uffd_flags_t uffd_flags)
{
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
Expand All @@ -225,6 +229,10 @@ static int mfill_zeropage_pte(struct mm_struct *dst_mm,
ret = -EEXIST;
if (!pte_none(*dst_pte))
goto out_unlock;

if (uffd_flags & UFFD_FLAGS_ACCESS_LIKELY)
_dst_pte = pte_mkyoung(_dst_pte);

set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
/* No need to invalidate - it was non-present before */
update_mmu_cache(dst_vma, dst_addr, dst_pte);
Expand Down Expand Up @@ -498,7 +506,7 @@ static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
uffd_flags);
else
err = mfill_zeropage_pte(dst_mm, dst_pmd,
dst_vma, dst_addr);
dst_vma, dst_addr, uffd_flags);
} else {
err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
dst_addr, src_addr,
Expand Down Expand Up @@ -691,6 +699,9 @@ ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len, atomic_t *mmap_changing,
		       uffd_flags_t uffd_flags)
{
	/*
	 * Thin wrapper around __mcopy_atomic() in MCOPY_ATOMIC_ZEROPAGE mode.
	 *
	 * There is no cost for setting the access bit of a zeropage, so the
	 * access-likely hint is always added to the caller's flags.
	 */
	uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;

	/*
	 * Forward the accumulated flags.  Passing a literal 0 here would make
	 * the assignment above a dead store and silently drop the hint.
	 */
	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
			      mmap_changing, uffd_flags);
}
Expand All @@ -699,6 +710,9 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
		       unsigned long len, atomic_t *mmap_changing,
		       uffd_flags_t uffd_flags)
{
	/*
	 * Thin wrapper around __mcopy_atomic() in MCOPY_ATOMIC_CONTINUE mode.
	 *
	 * The page is likely to be accessed, so the access-likely hint is
	 * always added to the caller's flags.
	 */
	uffd_flags |= UFFD_FLAGS_ACCESS_LIKELY;

	/*
	 * Forward the accumulated flags.  Passing a literal 0 here would make
	 * the assignment above a dead store and silently drop the hint.
	 */
	return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
			      mmap_changing, uffd_flags);
}
Expand Down

0 comments on commit ad1b812

Please sign in to comment.