mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse
This idea was introduced by David Rientjes[1]; the semantics and
implementation were discussed in a previous PATCH RFC[2].

Introduce a new madvise mode, MADV_COLLAPSE, that allows users to request a
synchronous collapse of memory at their own expense.

The benefits of this approach are:

* CPU is charged to the process that wants to spend the cycles for the
  THP
* avoid unpredictable timing of khugepaged collapse

Immediate users of this new functionality include:

* immediately back executable text by hugepages.  Current support
  provided by CONFIG_READ_ONLY_THP_FOR_FS may take too long on a large
  system.
* malloc implementations that manage memory in hugepage-sized chunks,
  but sometimes subrelease memory back to the system in native-sized
  chunks via MADV_DONTNEED, zapping the pmd.  Later, when the memory
  is hot, the implementation could madvise(MADV_COLLAPSE) to re-back the
  memory by THP to regain TLB performance.

Allocation semantics are the same as khugepaged, and depend on (1) the
active sysfs settings /sys/kernel/mm/transparent_hugepage/enabled and
/sys/kernel/mm/transparent_hugepage/khugepaged/defrag, and (2) the VMA
flags of the memory range being collapsed.

Only privately-mapped anon memory is supported for now.
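
For illustration, a minimal userspace sketch of the malloc-style flow above
(not part of this patch): it assumes the 2MiB PMD-sized THP used on x86_64
(the HPAGE_SIZE constant below is ad hoc) and defines MADV_COLLAPSE to the
value 25 added by this patch in case installed libc headers do not carry it
yet:

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25                /* value added by this patch */
#endif

#define HPAGE_SIZE (2UL << 20)          /* assumed PMD-sized THP (x86_64) */

int main(void)
{
        size_t len = 4 * HPAGE_SIZE;
        /*
         * Privately-mapped anon memory: the only case supported so far.
         * Over-allocate so madvise() can be handed a hugepage-aligned range.
         */
        char *raw = mmap(NULL, len + HPAGE_SIZE, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (raw == MAP_FAILED)
                return 1;
        char *buf = (char *)(((uintptr_t)raw + HPAGE_SIZE - 1) &
                             ~(HPAGE_SIZE - 1));

        memset(buf, 1, len);            /* fault the range in */

        /*
         * Subrelease one native-sized chunk, as a hugepage-aware allocator
         * might; this zaps the pmd mapping that hugepage.
         */
        madvise(buf, 4096, MADV_DONTNEED);

        /*
         * Later, when the memory is hot again, synchronously re-back the
         * range with THPs.  Allocation behaviour follows the same sysfs
         * enabled/defrag settings khugepaged honours.  Returns 0 only if
         * every PMD-sized region collapsed (or was already PMD-mapped);
         * otherwise -1 with errno set (EAGAIN-type failures can be retried).
         */
        if (madvise(buf, len, MADV_COLLAPSE))
                perror("madvise(MADV_COLLAPSE)");

        return 0;
}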

[1] https://lore.kernel.org/linux-mm/d098c392-273a-36a4-1a29-59731cdf5d3d@google.com/
[2] https://lore.kernel.org/linux-mm/20220308213417.1407042-1-zokeefe@google.com/

Suggested-by: David Rientjes <rientjes@google.com>
Signed-off-by: Zach O'Keefe <zokeefe@google.com>
zokeefe authored and intel-lab-lkp committed Apr 10, 2022
1 parent 4e52cda commit 4f4775a
Showing 4 changed files with 157 additions and 13 deletions.
12 changes: 12 additions & 0 deletions include/linux/huge_mm.h
@@ -236,6 +236,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,

int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
                     int advice);
int madvise_collapse(struct vm_area_struct *vma,
                     struct vm_area_struct **prev,
                     unsigned long start, unsigned long end);
void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
                           unsigned long end, long adjust_next);
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
@@ -392,6 +395,15 @@ static inline int hugepage_madvise(struct vm_area_struct *vma,
        BUG();
        return 0;
}

static inline int madvise_collapse(struct vm_area_struct *vma,
                                   struct vm_area_struct **prev,
                                   unsigned long start, unsigned long end)
{
        BUG();
        return 0;
}

static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
                                         unsigned long start,
                                         unsigned long end,
2 changes: 2 additions & 0 deletions include/uapi/asm-generic/mman-common.h
@@ -77,6 +77,8 @@

#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */

#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */

/* compatibility flags */
#define MAP_FILE 0

151 changes: 138 additions & 13 deletions mm/khugepaged.c
@@ -846,7 +846,6 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
        return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
}

#ifdef CONFIG_NUMA
static int khugepaged_find_target_node(struct collapse_control *cc)
{
        int nid, target_node = 0, max_value = 0;
@@ -872,6 +871,24 @@ static int khugepaged_find_target_node(struct collapse_control *cc)
        return target_node;
}

static struct page *alloc_hpage(struct collapse_control *cc, gfp_t gfp,
                                int node)
{
        VM_BUG_ON_PAGE(cc->hpage, cc->hpage);

        cc->hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
        if (unlikely(!cc->hpage)) {
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                cc->hpage = ERR_PTR(-ENOMEM);
                return NULL;
        }

        prep_transhuge_page(cc->hpage);
        count_vm_event(THP_COLLAPSE_ALLOC);
        return cc->hpage;
}

#ifdef CONFIG_NUMA
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
        if (IS_ERR(*hpage)) {
@@ -892,18 +909,7 @@ static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
static struct page *khugepaged_alloc_page(struct collapse_control *cc,
                                          gfp_t gfp, int node)
{
        VM_BUG_ON_PAGE(cc->hpage, cc->hpage);

        cc->hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER);
        if (unlikely(!cc->hpage)) {
                count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
                cc->hpage = ERR_PTR(-ENOMEM);
                return NULL;
        }

        prep_transhuge_page(cc->hpage);
        count_vm_event(THP_COLLAPSE_ALLOC);
        return cc->hpage;
        return alloc_hpage(cc, gfp, node);
}
#else
static int khugepaged_find_target_node(struct collapse_control *cc)
@@ -2456,3 +2462,122 @@ void khugepaged_min_free_kbytes_update(void)
        set_recommended_min_free_kbytes();
        mutex_unlock(&khugepaged_mutex);
}

static void madvise_collapse_cleanup_page(struct page **hpage)
{
        if (!IS_ERR(*hpage) && *hpage)
                put_page(*hpage);
        *hpage = NULL;
}

int madvise_collapse_errno(enum scan_result r)
{
        switch (r) {
        case SCAN_PMD_NULL:
        case SCAN_ADDRESS_RANGE:
        case SCAN_VMA_NULL:
        case SCAN_PTE_NON_PRESENT:
        case SCAN_PAGE_NULL:
                /*
                 * Addresses in the specified range are not currently mapped,
                 * or are outside the AS of the process.
                 */
                return -ENOMEM;
        case SCAN_ALLOC_HUGE_PAGE_FAIL:
        case SCAN_CGROUP_CHARGE_FAIL:
                /* A kernel resource was temporarily unavailable. */
                return -EAGAIN;
        default:
                return -EINVAL;
        }
}

int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
                     unsigned long start, unsigned long end)
{
        struct collapse_control cc = {
                .last_target_node = NUMA_NO_NODE,
                .hpage = NULL,
                .alloc_hpage = &alloc_hpage,
        };
        struct mm_struct *mm = vma->vm_mm;
        struct collapse_result cr;
        unsigned long hstart, hend, addr;
        int thps = 0, nr_hpages = 0;

        BUG_ON(vma->vm_start > start);
        BUG_ON(vma->vm_end < end);

        *prev = vma;

        if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file)
                return -EINVAL;

        hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
        hend = end & HPAGE_PMD_MASK;
        nr_hpages = (hend - hstart) >> HPAGE_PMD_SHIFT;

        if (hstart >= hend || !transparent_hugepage_active(vma))
                return -EINVAL;

        mmgrab(mm);
        lru_add_drain();

        for (addr = hstart; ; ) {
                mmap_assert_locked(mm);
                cond_resched();
                memset(&cr, 0, sizeof(cr));

                if (unlikely(khugepaged_test_exit(mm)))
                        break;

                memset(cc.node_load, 0, sizeof(cc.node_load));
                khugepaged_scan_pmd(mm, vma, addr, &cc, &cr);
                if (cr.dropped_mmap_lock)
                        *prev = NULL; /* tell madvise we dropped mmap_lock */

                switch (cr.result) {
                /* Whitelisted set of results where continuing OK */
                case SCAN_SUCCEED:
                case SCAN_PMD_MAPPED:
                        ++thps;
                        fallthrough;
                case SCAN_PMD_NULL:
                case SCAN_PTE_NON_PRESENT:
                case SCAN_PTE_UFFD_WP:
                case SCAN_PAGE_RO:
                case SCAN_LACK_REFERENCED_PAGE:
                case SCAN_PAGE_NULL:
                case SCAN_PAGE_COUNT:
                case SCAN_PAGE_LOCK:
                case SCAN_PAGE_COMPOUND:
                        break;
                case SCAN_PAGE_LRU:
                        lru_add_drain_all();
                        goto retry;
                default:
                        /* Other error, exit */
                        goto break_loop;
                }
                addr += HPAGE_PMD_SIZE;
                if (addr >= hend)
                        break;
retry:
                if (cr.dropped_mmap_lock) {
                        mmap_read_lock(mm);
                        if (hugepage_vma_revalidate(mm, addr, &vma))
                                goto out;
                }
                madvise_collapse_cleanup_page(&cc.hpage);
        }

break_loop:
        /* madvise_walk_vmas() expects us to hold mmap_lock on return */
        if (cr.dropped_mmap_lock)
                mmap_read_lock(mm);
out:
        mmap_assert_locked(mm);
        madvise_collapse_cleanup_page(&cc.hpage);
        mmdrop(mm);

        return thps == nr_hpages ? 0 : madvise_collapse_errno(cr.result);
}
5 changes: 5 additions & 0 deletions mm/madvise.c
@@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior)
        case MADV_FREE:
        case MADV_POPULATE_READ:
        case MADV_POPULATE_WRITE:
        case MADV_COLLAPSE:
                return 0;
        default:
                /* be safe, default to 1. list exceptions explicitly */
@@ -1051,6 +1052,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
                if (error)
                        goto out;
                break;
        case MADV_COLLAPSE:
                return madvise_collapse(vma, prev, start, end);
        }

anon_name = anon_vma_name(vma);
@@ -1144,6 +1147,7 @@ madvise_behavior_valid(int behavior)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        case MADV_HUGEPAGE:
        case MADV_NOHUGEPAGE:
        case MADV_COLLAPSE:
#endif
        case MADV_DONTDUMP:
        case MADV_DODUMP:
@@ -1333,6 +1337,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 *  MADV_NOHUGEPAGE - mark the given range as not worth being backed by
 *              transparent huge pages so the existing pages will not be
 *              coalesced into THP and new pages will not be allocated as THP.
 *  MADV_COLLAPSE - synchronously coalesce pages into new THP.
 *  MADV_DONTDUMP - the application wants to prevent pages in the given range
 *              from being included in its core dump.
 *  MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
