mm/gup: track gup-pinned pages

Now that all callers of get_user_pages*() have been updated to use
put_user_page() instead of put_page(), add tracking of such
"gup-pinned" pages. The purpose of this tracking is to answer the
question "has this page been pinned by a call to get_user_pages()?"

In order to answer that, refcounting is required. get_user_pages() and all
its variants increment a reference count, and put_user_page() and its
variants decrement that reference count. If the net count is *effectively*
non-zero (see below), then the page is considered gup-pinned.

What to do when such a page is encountered is left to later patchsets.
There is discussion about this in [1], and in an upcoming patch that adds:

   Documentation/vm/get_user_pages.rst

So, this patch simply adds tracking of such pages.  In order to achieve
this without using up any more bits or fields in struct page, the
page->_refcount field is overloaded: each gup pin adds a large bias
(GUP_PIN_COUNTING_BIAS, 1024) to the refcount, instead of 1.  This provides
a way to say, "either this page is gup-pinned, or you have a *lot* of
references on it, and thus this is a false positive".  False positives are
generally OK, as long as they are expected to be rare: taking action for a
page that looks gup-pinned but is not will not cause problems.  It's false
negatives (failing to detect a gup-pinned page) that would be a problem,
and those won't happen with this approach.
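
For illustration only (these helpers are not part of the patch; they
merely restate the arithmetic using page_ref_add(), page_ref_sub_return(),
page_ref_count() and GUP_PIN_COUNTING_BIAS, all of which this series does
provide):

    /* Each gup pin bumps _refcount by the bias, instead of by 1. */
    static inline void sketch_gup_pin(struct page *page)
    {
            page_ref_add(page, GUP_PIN_COUNTING_BIAS);
    }

    /* Each put_user_page() removes exactly one bias worth of refcount. */
    static inline void sketch_gup_unpin(struct page *page)
    {
            page_ref_sub_return(page, GUP_PIN_COUNTING_BIAS);
    }

    /* True: gup-pinned, or a rare false positive (> 1024 ordinary refs). */
    static inline bool sketch_page_maybe_gup_pinned(struct page *page)
    {
            return page_ref_count(page) > GUP_PIN_COUNTING_BIAS;
    }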

This takes advantage of two distinct, pre-existing lock-free algorithms:

a) get_user_pages() and things such as page_mkclean() both operate on
   page table entries without taking locks. This relies partly on letting
   the CPU hardware (which, of course, also never takes locks to use its
   own page tables) simply take page faults if something has changed.

b) page_cache_get_speculative(), called by get_user_pages(), is a way to
   avoid having pages get freed out from under get_user_pages() or other
   things that want to pin pages.
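
In the gup_fast path, (a) and (b) combine into roughly the following
pattern; this is a condensed sketch of what gup_pte_range() does in this
patch, with the surrounding loop and error handling omitted:

    pte_t pte = READ_ONCE(*ptep);
    struct page *head = compound_head(pte_page(pte));

    /* (b) speculative pin: fails if the page is already being freed */
    if (!page_cache_gup_pin_speculative(head))
            goto pte_unmap;

    /*
     * (a) re-check the PTE. If it changed underneath us (for example,
     * page_mkclean() or a fault won the race), undo the pin and back off.
     */
    if (unlikely(pte_val(pte) != pte_val(*ptep))) {
            put_user_page(head);
            goto pte_unmap;
    }
    /* success: the page is now gup-pinned */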

As a result, this patch is not expected to change performance in any
noticeable way.

This includes the following fix from Ira Weiny:

DAX requires detection of a page's refcount crossing down to 1.  Fix this
for GUP pages by introducing put_devmap_managed_user_page(), which
accounts for the GUP_PIN_COUNTING_BIAS now used by GUP.
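
Sketch of the arithmetic behind that fix (not the literal patch code,
which also does vmstat accounting): a devmap page holding one gup pin and
no other references sits at a refcount of GUP_PIN_COUNTING_BIAS + 1, so
the unpin path subtracts the bias and then inspects the resulting value:

    static void sketch_unpin_devmap_page(struct page *page)
    {
            /* drop the pin: e.g. GUP_PIN_COUNTING_BIAS + 1  ->  1 */
            int count = page_ref_sub_return(page, GUP_PIN_COUNTING_BIAS);

            /*
             * A devmap page reaching refcount 1 is idle as far as the
             * device driver is concerned; report that via the callback.
             */
            if (page_is_devmap_managed(page))
                    __put_devmap_managed_page(page, count);
    }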

Tracking:

Add several new /proc/vmstat items to provide some visibility into what
get_user_pages() and put_user_page() are doing.

$ cat /proc/vmstat | grep gup
nr_gup_slow_pages_requested 4842
nr_gup_fast_pages_requested 262718
nr_gup_fast_page_backoffs 0
nr_gup_page_count_overflows 0
nr_gup_page_count_neg_overflows 0
nr_gup_pages_returned 267560

Interpretation of the above:
    Total gup requests (slow + fast): 267560
    Total put_user_page calls:        267560

Normally, those last two numbers should be equal, but a couple of things
may cause them to differ:

1) The inherent race condition in reading /proc/vmstat values.

2) Bugs at any of the get_user_pages*() call sites: each such site must
pair its get_user_pages() calls with matching put_user_page() calls (a
correctly paired call site is sketched below).
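
As an example, a hypothetical call site (not from this patch; uaddr,
npages and pages are made-up names, and error handling is minimal) that
stays balanced under the new rules:

    int got = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);

    if (got <= 0)
            return got ? got : -EFAULT;

    /* ... hand the pinned pages to DMA or similar ... */

    /*
     * Release the pins (and dirty the pages). This is what increments
     * nr_gup_pages_returned, keeping it in step with the request counters.
     */
    put_user_pages_dirty_lock(pages, got, true);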

[1] https://lwn.net/Articles/753027/ "The trouble with get_user_pages()"

Suggested-by: Jan Kara <jack@suse.cz>
Suggested-by: Jérôme Glisse <jglisse@redhat.com>
Tested-by: Ira Weiny <ira.weiny@intel.com>
Signed-off-by: Ira Weiny <ira.weiny@intel.com>

Cc: Christian Benvenuti <benve@cisco.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dennis Dalessandro <dennis.dalessandro@intel.com>
Cc: Doug Ledford <dledford@redhat.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: Jérôme Glisse <jglisse@redhat.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Mike Rapoport <rppt@linux.ibm.com>
Cc: Mike Marciniszyn <mike.marciniszyn@intel.com>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Tom Talpey <tom@talpey.com>
johnhubbard committed Aug 8, 2019
1 parent d5e8e65 commit a0fb73c
Showing 11 changed files with 224 additions and 33 deletions.
4 changes: 3 additions & 1 deletion arch/powerpc/mm/hugetlbpage.c
@@ -512,7 +512,9 @@ struct page *follow_huge_pd(struct vm_area_struct *vma,
page = pte_page(*ptep);
page += ((address & mask) >> PAGE_SHIFT);
if (flags & FOLL_GET)
get_page(page);
if (unlikely(!try_get_gup_pin_page(page,
NR_GUP_SLOW_PAGES_REQUESTED)))
page = NULL;
} else {
if (is_hugetlb_entry_migration(*ptep)) {
spin_unlock(ptl);
81 changes: 61 additions & 20 deletions include/linux/mm.h
@@ -954,9 +954,10 @@ static inline bool is_zone_device_page(const struct page *page)
#endif

#ifdef CONFIG_DEV_PAGEMAP_OPS
void __put_devmap_managed_page(struct page *page);
void __put_devmap_managed_page(struct page *page, int count);
DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
static inline bool put_devmap_managed_page(struct page *page)

static inline bool page_is_devmap_managed(struct page *page)
{
if (!static_branch_unlikely(&devmap_managed_key))
return false;
@@ -965,14 +966,26 @@ static inline bool put_devmap_managed_page(struct page *page)
switch (page->pgmap->type) {
case MEMORY_DEVICE_PRIVATE:
case MEMORY_DEVICE_FS_DAX:
__put_devmap_managed_page(page);
return true;
default:
break;
}
return false;
}

static inline bool put_devmap_managed_page(struct page *page)
{
bool is_devmap = page_is_devmap_managed(page);

if (is_devmap) {
int count = page_ref_dec_return(page);

__put_devmap_managed_page(page, count);
}

return is_devmap;
}

#else /* CONFIG_DEV_PAGEMAP_OPS */
static inline bool put_devmap_managed_page(struct page *page)
{
@@ -1020,6 +1033,9 @@ static inline __must_check bool try_get_page(struct page *page)
return true;
}

__must_check bool try_get_gup_pin_page(struct page *page,
enum node_stat_item gup_type);

static inline void put_page(struct page *page)
{
page = compound_head(page);
@@ -1037,31 +1053,56 @@ static inline void put_page(struct page *page)
__put_page(page);
}

/**
* put_user_page() - release a gup-pinned page
* @page: pointer to page to be released
/*
* GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
* the page's refcount so that two separate items are tracked: the original page
* reference count, and also a new count of how many get_user_pages() calls were
* made against the page. ("gup-pinned" is another term for the latter).
*
* With this scheme, get_user_pages() becomes special: such pages are marked
* as distinct from normal pages. As such, the new put_user_page() call (and
* its variants) must be used in order to release gup-pinned pages.
*
* Choice of value:
*
* Pages that were pinned via get_user_pages*() must be released via
* either put_user_page(), or one of the put_user_pages*() routines
* below. This is so that eventually, pages that are pinned via
* get_user_pages*() can be separately tracked and uniquely handled. In
* particular, interactions with RDMA and filesystems need special
* handling.
* By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference
* counts with respect to get_user_pages() and put_user_page() becomes simpler,
* due to the fact that adding an even power of two to the page refcount has
* the effect of using only the upper N bits, for the code that counts up using
* the bias value. This means that the lower bits are left for the exclusive
* use of the original code that increments and decrements by one (or at least,
* by much smaller values than the bias value).
*
* put_user_page() and put_page() are not interchangeable, despite this early
* implementation that makes them look the same. put_user_page() calls must
* be perfectly matched up with get_user_page() calls.
* Of course, once the lower bits overflow into the upper bits (and this is
* OK, because subtraction recovers the original values), then visual inspection
* no longer suffices to directly view the separate counts. However, for normal
* applications that don't have huge page reference counts, this won't be an
* issue.
*
* Locking: the lockless algorithm described in page_cache_get_speculative()
* and page_cache_gup_pin_speculative() provides safe operation for
* get_user_pages and page_mkclean and other calls that race to set up page
* table entries.
*/
static inline void put_user_page(struct page *page)
{
put_page(page);
}
#define GUP_PIN_COUNTING_BIAS (1UL << 10)

void put_user_page(struct page *page);
void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
bool make_dirty);

void put_user_pages(struct page **pages, unsigned long npages);

/**
* page_gup_pinned() - report if a page is gup-pinned (pinned by a call to
* get_user_pages).
* @page: pointer to page to be queried.
* Return: True, if it is likely that the page has been "gup-pinned".
* False, if the page is definitely not gup-pinned.
*/
static inline bool page_gup_pinned(struct page *page)
{
return page_ref_count(page) > GUP_PIN_COUNTING_BIAS;
}

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define SECTION_IN_PAGE_FLAGS
#endif
11 changes: 11 additions & 0 deletions include/linux/mmdebug.h
@@ -37,6 +37,17 @@ void dump_mm(const struct mm_struct *mm);
BUG(); \
} \
} while (0)
#define VM_WARN_ONCE_PAGE(condition, page) ({ \
static bool __warned; \
int __ret_warn_once = !!(condition); \
\
if (unlikely(__ret_warn_once && !__warned)) { \
__warned = true; \
dump_page(page, \
"VM_WARN_ONCE_PAGE(" __stringify(condition)")");\
} \
unlikely(__ret_warn_once); \
})
#define VM_WARN_ON(cond) (void)WARN_ON(cond)
#define VM_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
#define VM_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format)
6 changes: 6 additions & 0 deletions include/linux/mmzone.h
@@ -241,6 +241,12 @@ enum node_stat_item {
NR_DIRTIED, /* page dirtyings since bootup */
NR_WRITTEN, /* page writings since bootup */
NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */
NR_GUP_SLOW_PAGES_REQUESTED, /* via: get_user_pages() */
NR_GUP_FAST_PAGES_REQUESTED, /* via: get_user_pages_fast() */
NR_GUP_FAST_PAGE_BACKOFFS, /* gup_fast() lost to page_mkclean() */
NR_GUP_PAGE_COUNT_OVERFLOWS, /* gup count overflowed: gup() failed */
NR_GUP_PAGE_COUNT_NEG_OVERFLOWS,/* gup count wrapped negative: gup() failed */
NR_GUP_PAGES_RETURNED, /* via: put_user_page() */
NR_VM_NODE_STAT_ITEMS
};

10 changes: 10 additions & 0 deletions include/linux/page_ref.h
@@ -102,6 +102,16 @@ static inline void page_ref_sub(struct page *page, int nr)
__page_ref_mod(page, -nr);
}

static inline int page_ref_sub_return(struct page *page, int nr)
{
int ret = atomic_sub_return(nr, &page->_refcount);

if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
__page_ref_mod(page, -nr);

return ret;
}

static inline void page_ref_inc(struct page *page)
{
atomic_inc(&page->_refcount);
5 changes: 5 additions & 0 deletions include/linux/pagemap.h
@@ -207,6 +207,11 @@ static inline int page_cache_add_speculative(struct page *page, int count)
return __page_cache_add_speculative(page, count);
}

static inline int page_cache_gup_pin_speculative(struct page *page)
{
return __page_cache_add_speculative(page, GUP_PIN_COUNTING_BIAS);
}

#ifdef CONFIG_NUMA
extern struct page *__page_cache_alloc(gfp_t gfp);
#else
113 changes: 108 additions & 5 deletions mm/gup.c
@@ -29,6 +29,61 @@ struct follow_page_context {
unsigned int page_mask;
};

int __user_page_ref_dec_return(struct page *page)
{
VM_WARN_ONCE_PAGE(page_ref_count(page) < GUP_PIN_COUNTING_BIAS, page);

mod_node_page_state(page_pgdat(page), NR_GUP_PAGES_RETURNED, 1);
return page_ref_sub_return(page, GUP_PIN_COUNTING_BIAS);
}

#ifdef CONFIG_DEV_PAGEMAP_OPS
static bool __put_devmap_managed_user_page(struct page *page)
{
bool is_devmap = page_is_devmap_managed(page);

if (is_devmap) {
int count = __user_page_ref_dec_return(page);
__put_devmap_managed_page(page, count);
}

return is_devmap;
}
#else
static bool __put_devmap_managed_user_page(struct page *page)
{
return false;
}
#endif /* CONFIG_DEV_PAGEMAP_OPS */

/**
* put_user_page() - release a gup-pinned page
* @page: pointer to page to be released
*
* Pages that were pinned via get_user_pages*() must be released via
* either put_user_page(), or one of the put_user_pages*() routines
* below. This is so that eventually, pages that are pinned via
* get_user_pages*() can be separately tracked and uniquely handled. In
* particular, interactions with RDMA and filesystems need special
* handling.
*/
void put_user_page(struct page *page)
{
page = compound_head(page);

/*
* For devmap managed pages we need to catch refcount transition from
* GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the
* page is free and we need to inform the device driver through
* callback. See include/linux/memremap.h and HMM for details.
*/
if (__put_devmap_managed_user_page(page))
return;

__user_page_ref_dec_return(page);
}
EXPORT_SYMBOL(put_user_page);

/**
* put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
* @pages: array of pages to be put
@@ -111,6 +166,36 @@ void put_user_pages(struct page **pages, unsigned long npages)
}
EXPORT_SYMBOL(put_user_pages);

/**
* try_get_gup_pin_page() - mark a page as being used by get_user_pages().
* @page: pointer to page to be marked
* @gup_type: which NR_GUP_* vmstat counter to increment
* Return: true for success, false for failure
*/
__must_check bool try_get_gup_pin_page(struct page *page,
enum node_stat_item gup_type)
{
page = compound_head(page);
if (WARN_ON_ONCE(page_ref_count(page) <= 0)) {
mod_node_page_state(page_pgdat(page),
NR_GUP_PAGE_COUNT_NEG_OVERFLOWS, 1);
WARN_ONCE(1, "get_user_pages pin count wrapped negative");
return false;
}

if (WARN_ON_ONCE(page_ref_count(page) >=
(UINT_MAX - GUP_PIN_COUNTING_BIAS))) {
mod_node_page_state(page_pgdat(page),
NR_GUP_PAGE_COUNT_OVERFLOWS, 1);
WARN_ONCE(1, "get_user_pages pin count overflowed");

return false;
}

page_ref_add(page, GUP_PIN_COUNTING_BIAS);
mod_node_page_state(page_pgdat(page), gup_type, 1);
return true;
}

#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags)
@@ -245,7 +330,9 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
}

if (flags & FOLL_GET) {
if (unlikely(!try_get_page(page))) {
if (unlikely(
!try_get_gup_pin_page(page,
NR_GUP_SLOW_PAGES_REQUESTED))) {
page = ERR_PTR(-ENOMEM);
goto out;
}
@@ -587,7 +674,8 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
goto unmap;
*page = pte_page(*pte);
}
if (unlikely(!try_get_page(*page))) {
if (unlikely(!try_get_gup_pin_page(*page,
NR_GUP_SLOW_PAGES_REQUESTED))) {
ret = -ENOMEM;
goto unmap;
}
@@ -1786,7 +1874,7 @@ static inline struct page *try_get_compound_head(struct page *page, int refs)
struct page *head = compound_head(page);
if (WARN_ON_ONCE(page_ref_count(head) < 0))
return NULL;
if (unlikely(!page_cache_add_speculative(head, refs)))
if (unlikely(!page_cache_gup_pin_speculative(head)))
return NULL;
return head;
}
@@ -1833,8 +1921,13 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
if (!head)
goto pte_unmap;

mod_node_page_state(page_pgdat(head),
NR_GUP_FAST_PAGES_REQUESTED, 1);

if (unlikely(pte_val(pte) != pte_val(*ptep))) {
put_page(head);
mod_node_page_state(page_pgdat(head),
NR_GUP_FAST_PAGE_BACKOFFS, 1);
put_user_page(head);
goto pte_unmap;
}

@@ -1889,7 +1982,11 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
}
SetPageReferenced(page);
pages[*nr] = page;
get_page(page);
if (!try_get_gup_pin_page(page, NR_GUP_FAST_PAGES_REQUESTED)) {
undo_dev_pagemap(nr, nr_start, pages);
return 0;
}

(*nr)++;
pfn++;
} while (addr += PAGE_SIZE, addr != end);
@@ -2062,6 +2159,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
return 0;
}

mod_node_page_state(page_pgdat(head), NR_GUP_FAST_PAGES_REQUESTED, 1);

if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
*nr -= refs;
while (refs--)
@@ -2103,6 +2202,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
return 0;
}

mod_node_page_state(page_pgdat(head), NR_GUP_FAST_PAGES_REQUESTED, 1);

if (unlikely(pud_val(orig) != pud_val(*pudp))) {
*nr -= refs;
while (refs--)
@@ -2140,6 +2241,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 0;
}

mod_node_page_state(page_pgdat(head), NR_GUP_FAST_PAGES_REQUESTED, 1);

if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
*nr -= refs;
while (refs--)
