diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index bf80adca980b96..f45481ab60ba18 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -369,7 +369,7 @@ int folio_mkclean(struct folio *);
 int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
 		      struct vm_area_struct *vma);
 
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);
+void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean);
 
 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 404024486fa539..585609568bafe0 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -104,6 +104,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 		THP_SPLIT_PUD,
 #endif
+		THP_SPLIT_FREE,
+		THP_SPLIT_UNMAP,
+		THP_SPLIT_REMAP_READONLY_ZERO_PAGE,
 		THP_ZERO_PAGE_ALLOC,
 		THP_ZERO_PAGE_ALLOC_FAILED,
 		THP_SWPOUT,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index f7fdf28a296844..0fd968e0716e17 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2420,7 +2420,7 @@ static void unmap_page(struct page *page)
 	try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
 }
 
-static void remap_page(struct folio *folio, unsigned long nr)
+static void remap_page(struct folio *folio, unsigned long nr, bool unmap_clean)
 {
 	int i = 0;
 
@@ -2428,7 +2428,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
 	if (!folio_test_anon(folio))
 		return;
 	for (;;) {
-		remove_migration_ptes(folio, folio, true);
+		remove_migration_ptes(folio, folio, true, unmap_clean);
 		i += folio_nr_pages(folio);
 		if (i >= nr)
 			break;
@@ -2542,6 +2542,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	struct address_space *swap_cache = NULL;
 	unsigned long offset = 0;
 	unsigned int nr = thp_nr_pages(head);
+	LIST_HEAD(pages_to_free);
+	int nr_pages_to_free = 0;
 	int i;
 
 	/* complete memcg works before add pages to LRU */
@@ -2604,7 +2606,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 	}
 	local_irq_enable();
 
-	remap_page(folio, nr);
+	remap_page(folio, nr, PageAnon(head));
 
 	if (PageSwapCache(head)) {
 		swp_entry_t entry = { .val = page_private(head) };
@@ -2618,6 +2620,33 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 			continue;
 		unlock_page(subpage);
 
+		/*
+		 * If a tail page has only two references left, one inherited
+		 * from the isolation of its head and the other from
+		 * lru_add_page_tail() which we are about to drop, it means this
+		 * tail page was concurrently zapped. Then we can safely free it
+		 * and save page reclaim or migration the trouble of trying it.
+		 */
+		if (list && page_ref_freeze(subpage, 2)) {
+			VM_BUG_ON_PAGE(PageLRU(subpage), subpage);
+			VM_BUG_ON_PAGE(PageCompound(subpage), subpage);
+			VM_BUG_ON_PAGE(page_mapped(subpage), subpage);
+
+			ClearPageActive(subpage);
+			ClearPageUnevictable(subpage);
+			list_move(&subpage->lru, &pages_to_free);
+			nr_pages_to_free++;
+			continue;
+		}
+		/*
+		 * If a tail page has only one reference left, it will be freed
+		 * by the call to free_page_and_swap_cache below. Since zero
+		 * subpages are no longer remapped, there will only be one
+		 * reference left in cases outside of reclaim or migration.
+		 */
+		if (page_ref_count(subpage) == 1)
+			nr_pages_to_free++;
+
 		/*
 		 * Subpages may be freed if there wasn't any mapping
 		 * like if add_to_swap() is running on a lru page that
@@ -2627,6 +2656,13 @@ static void __split_huge_page(struct page *page, struct list_head *list,
 		 */
 		free_page_and_swap_cache(subpage);
 	}
+
+	if (!nr_pages_to_free)
+		return;
+
+	mem_cgroup_uncharge_list(&pages_to_free);
+	free_unref_page_list(&pages_to_free);
+	count_vm_events(THP_SPLIT_FREE, nr_pages_to_free);
 }
 
 /* Racy check whether the huge page can be split */
@@ -2789,7 +2825,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 		if (mapping)
 			xas_unlock(&xas);
 		local_irq_enable();
-		remap_page(folio, folio_nr_pages(folio));
+		remap_page(folio, folio_nr_pages(folio), false);
 		ret = -EBUSY;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a1597c92261d3..8da61f900ad9e1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -167,13 +167,62 @@ void putback_movable_pages(struct list_head *l)
 	}
 }
 
+static bool try_to_unmap_clean(struct page_vma_mapped_walk *pvmw, struct page *page)
+{
+	void *addr;
+	bool dirty;
+	pte_t newpte;
+
+	VM_BUG_ON_PAGE(PageCompound(page), page);
+	VM_BUG_ON_PAGE(!PageAnon(page), page);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+	VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+
+	if (PageMlocked(page) || (pvmw->vma->vm_flags & VM_LOCKED))
+		return false;
+
+	/*
+	 * The pmd entry mapping the old thp was flushed and the pte mapping
+	 * this subpage has become non-present. Therefore, this subpage is
+	 * inaccessible. We don't need to remap it if it contains only zeros.
+	 */
+	addr = kmap_local_page(page);
+	dirty = memchr_inv(addr, 0, PAGE_SIZE);
+	kunmap_local(addr);
+
+	if (dirty)
+		return false;
+
+	pte_clear_not_present_full(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, false);
+
+	if (userfaultfd_armed(pvmw->vma)) {
+		newpte = pte_mkspecial(pfn_pte(page_to_pfn(ZERO_PAGE(pvmw->address)),
+					       pvmw->vma->vm_page_prot));
+		ptep_clear_flush(pvmw->vma, pvmw->address, pvmw->pte);
+		set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
+		dec_mm_counter(pvmw->vma->vm_mm, MM_ANONPAGES);
+		count_vm_event(THP_SPLIT_REMAP_READONLY_ZERO_PAGE);
+		return true;
+	}
+
+	dec_mm_counter(pvmw->vma->vm_mm, mm_counter(page));
+	count_vm_event(THP_SPLIT_UNMAP);
+	return true;
+}
+
+struct rmap_walk_arg {
+	struct folio *folio;
+	bool unmap_clean;
+};
+
 /*
  * Restore a potential migration pte to a working pte entry
  */
 static bool remove_migration_pte(struct folio *folio,
-		struct vm_area_struct *vma, unsigned long addr, void *old)
+		struct vm_area_struct *vma, unsigned long addr, void *arg)
 {
-	DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+	struct rmap_walk_arg *rmap_walk_arg = arg;
+	DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
 
 	while (page_vma_mapped_walk(&pvmw)) {
 		rmap_t rmap_flags = RMAP_NONE;
@@ -196,6 +245,8 @@ static bool remove_migration_pte(struct folio *folio,
 			continue;
 		}
 #endif
+		if (rmap_walk_arg->unmap_clean && try_to_unmap_clean(&pvmw, new))
+			continue;
 
 		folio_get(folio);
 		pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
@@ -267,13 +318,20 @@ static bool remove_migration_pte(struct folio *folio,
  * Get rid of all migration entries and replace them by
  * references to the indicated page.
  */
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked, bool unmap_clean)
 {
+	struct rmap_walk_arg rmap_walk_arg = {
+		.folio = src,
+		.unmap_clean = unmap_clean,
+	};
+
 	struct rmap_walk_control rwc = {
 		.rmap_one = remove_migration_pte,
-		.arg = src,
+		.arg = &rmap_walk_arg,
 	};
 
+	VM_BUG_ON_FOLIO(unmap_clean && src != dst, src);
+
 	if (locked)
 		rmap_walk_locked(dst, &rwc);
 	else
@@ -849,7 +907,7 @@ static int writeout(struct address_space *mapping, struct folio *folio)
 	 * At this point we know that the migration attempt cannot
 	 * be successful.
 	 */
-	remove_migration_ptes(folio, folio, false);
+	remove_migration_ptes(folio, folio, false, false);
 
 	rc = mapping->a_ops->writepage(&folio->page, &wbc);
 
@@ -1108,7 +1166,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 
 	if (page_was_mapped)
 		remove_migration_ptes(folio,
-			rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
+			rc == MIGRATEPAGE_SUCCESS ? dst : folio, false, false);
 
 out_unlock_both:
 	unlock_page(newpage);
@@ -1318,7 +1376,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 
 	if (page_was_mapped)
 		remove_migration_ptes(src,
-			rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+			rc == MIGRATEPAGE_SUCCESS ? dst : src, false, false);
 
 unlock_put_anon:
 	unlock_page(new_hpage);
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 27fb37d65476c2..cf5a54715a58be 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -407,7 +407,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
 			continue;
 
 		folio = page_folio(page);
-		remove_migration_ptes(folio, folio, false);
+		remove_migration_ptes(folio, folio, false, false);
 
 		migrate->src[i] = 0;
 		folio_unlock(folio);
@@ -783,7 +783,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
 
 		src = page_folio(page);
 		dst = page_folio(newpage);
-		remove_migration_ptes(src, dst, false);
+		remove_migration_ptes(src, dst, false, false);
 
 		folio_unlock(src);
 
 		if (is_zone_device_page(page))
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 373d2730fcf215..ca2fed7f00419b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1363,6 +1363,9 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 	"thp_split_pud",
 #endif
+	"thp_split_free",
+	"thp_split_unmap",
+	"thp_split_remap_readonly_zero_page",
 	"thp_zero_page_alloc",
 	"thp_zero_page_alloc_failed",
 	"thp_swpout",
diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c
index 6aa2b8253aeda1..2c669aadbfd0e9 100644
--- a/tools/testing/selftests/vm/split_huge_page_test.c
+++ b/tools/testing/selftests/vm/split_huge_page_test.c
@@ -16,6 +16,9 @@
 #include <sys/mount.h>
 #include <malloc.h>
 #include <stdbool.h>
+#include <sys/syscall.h> /* Definition of SYS_* constants */
+#include <linux/userfaultfd.h>
+#include <sys/ioctl.h>
 #include "vm_util.h"
 
 uint64_t pagesize;
@@ -88,6 +91,113 @@ static void write_debugfs(const char *fmt, ...)
 	}
 }
 
+static char *allocate_zero_filled_hugepage(size_t len)
+{
+	char *result;
+	size_t i;
+
+	result = memalign(pmd_pagesize, len);
+	if (!result) {
+		printf("Fail to allocate memory\n");
+		exit(EXIT_FAILURE);
+	}
+	madvise(result, len, MADV_HUGEPAGE);
+
+	for (i = 0; i < len; i++)
+		result[i] = (char)0;
+
+	return result;
+}
+
+static void verify_rss_anon_split_huge_page_all_zeroes(char *one_page, size_t len)
+{
+	uint64_t thp_size, rss_anon_before, rss_anon_after;
+	size_t i;
+
+	thp_size = check_huge(one_page);
+	if (!thp_size) {
+		printf("No THP is allocated\n");
+		exit(EXIT_FAILURE);
+	}
+
+	rss_anon_before = rss_anon();
+	if (!rss_anon_before) {
+		printf("No RssAnon is allocated before split\n");
+		exit(EXIT_FAILURE);
+	}
+	/* split all THPs */
+	write_debugfs(PID_FMT, getpid(), (uint64_t)one_page,
+		      (uint64_t)one_page + len);
+
+	for (i = 0; i < len; i++)
+		if (one_page[i] != (char)0) {
+			printf("%ld byte corrupted\n", i);
+			exit(EXIT_FAILURE);
+		}
+
+	thp_size = check_huge(one_page);
+	if (thp_size) {
+		printf("Still %ld kB AnonHugePages not split\n", thp_size);
+		exit(EXIT_FAILURE);
+	}
+
+	rss_anon_after = rss_anon();
+	if (rss_anon_after >= rss_anon_before) {
+		printf("Incorrect RssAnon value. Before: %ld After: %ld\n",
+		       rss_anon_before, rss_anon_after);
+		exit(EXIT_FAILURE);
+	}
+}
+
+void split_pmd_zero_pages(void)
+{
+	char *one_page;
+	size_t len = 4 * pmd_pagesize;
+
+	one_page = allocate_zero_filled_hugepage(len);
+	verify_rss_anon_split_huge_page_all_zeroes(one_page, len);
+	printf("Split zero filled huge pages successful\n");
+	free(one_page);
+}
+
+void split_pmd_zero_pages_uffd(void)
+{
+	char *one_page;
+	size_t len = 4 * pmd_pagesize;
+	long uffd; /* userfaultfd file descriptor */
+	struct uffdio_api uffdio_api;
+	struct uffdio_register uffdio_register;
+
+	/* Create and enable userfaultfd object */
+
+	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+	if (uffd == -1) {
+		perror("userfaultfd");
+		exit(1);
+	}
+
+	uffdio_api.api = UFFD_API;
+	uffdio_api.features = 0;
+	if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) {
+		perror("ioctl-UFFDIO_API");
+		exit(1);
+	}
+
+	one_page = allocate_zero_filled_hugepage(len);
+
+	uffdio_register.range.start = (unsigned long)one_page;
+	uffdio_register.range.len = len;
+	uffdio_register.mode = UFFDIO_REGISTER_MODE_WP;
+	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) {
+		perror("ioctl-UFFDIO_REGISTER");
+		exit(1);
+	}
+
+	verify_rss_anon_split_huge_page_all_zeroes(one_page, len);
+	printf("Split zero filled huge pages with uffd successful\n");
+	free(one_page);
+}
+
 void split_pmd_thp(void)
 {
 	char *one_page;
@@ -123,7 +233,6 @@ void split_pmd_thp(void)
 		exit(EXIT_FAILURE);
 	}
 
-
 	thp_size = check_huge(one_page);
 	if (thp_size) {
 		printf("Still %ld kB AnonHugePages not split\n", thp_size);
 		exit(EXIT_FAILURE);
 	}
@@ -305,6 +414,8 @@ int main(int argc, char **argv)
 	pageshift = ffs(pagesize) - 1;
 	pmd_pagesize = read_pmd_pagesize();
 
+	split_pmd_zero_pages();
+	split_pmd_zero_pages_uffd();
 	split_pmd_thp();
 	split_pte_mapped_thp();
 	split_file_backed_thp();
diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c
index b58ab11a7a302a..c6a785a67fc9f9 100644
--- a/tools/testing/selftests/vm/vm_util.c
+++ b/tools/testing/selftests/vm/vm_util.c
@@ -6,6 +6,7 @@
 
 #define PMD_SIZE_FILE_PATH "/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"
 #define SMAP_FILE_PATH "/proc/self/smaps"
+#define STATUS_FILE_PATH "/proc/self/status"
 #define MAX_LINE_LENGTH 500
 
 uint64_t pagemap_get_entry(int fd, char *start)
@@ -72,6 +73,27 @@ uint64_t read_pmd_pagesize(void)
 	return strtoul(buf, NULL, 10);
 }
 
+uint64_t rss_anon(void)
+{
+	uint64_t rss_anon = 0;
+	FILE *fp;
+	char buffer[MAX_LINE_LENGTH];
+
+	fp = fopen(STATUS_FILE_PATH, "r");
+	if (!fp)
+		ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, STATUS_FILE_PATH);
+
+	if (!check_for_pattern(fp, "RssAnon:", buffer))
+		goto err_out;
+
+	if (sscanf(buffer, "RssAnon:%10lu kB", &rss_anon) != 1)
+		ksft_exit_fail_msg("Reading status error\n");
+
+err_out:
+	fclose(fp);
+	return rss_anon;
+}
+
 uint64_t check_huge(void *addr)
 {
 	uint64_t thp = 0;
diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h
index 2e512bd57ae141..00b92ccef20d00 100644
--- a/tools/testing/selftests/vm/vm_util.h
+++ b/tools/testing/selftests/vm/vm_util.h
@@ -6,4 +6,5 @@ uint64_t pagemap_get_entry(int fd, char *start);
 bool pagemap_is_softdirty(int fd, char *start);
 void clear_softdirty(void);
 uint64_t read_pmd_pagesize(void);
+uint64_t rss_anon(void);
 uint64_t check_huge(void *addr);
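
---
Not part of the patch itself: a minimal userspace sketch for watching the
three counters this series adds to /proc/vmstat while the selftests run.
The counter names come from the vmstat_text[] hunk above; everything else
in the program is illustrative only.

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		/* Names added to vmstat_text[] by this patch. */
		static const char *keys[] = {
			"thp_split_free",
			"thp_split_unmap",
			"thp_split_remap_readonly_zero_page",
		};
		char line[256];
		size_t i;
		FILE *fp = fopen("/proc/vmstat", "r");

		if (!fp) {
			perror("fopen");
			return 1;
		}
		/* Each /proc/vmstat line is "<name> <count>". */
		while (fgets(line, sizeof(line), fp)) {
			for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
				size_t n = strlen(keys[i]);

				if (!strncmp(line, keys[i], n) && line[n] == ' ')
					fputs(line, stdout);
			}
		}
		fclose(fp);
		return 0;
	}

Sampling these counters before and after split_pmd_zero_pages() should show
thp_split_unmap and thp_split_free growing for the plain case, while
thp_split_remap_readonly_zero_page grows only in the uffd-armed case.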