diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8bbcccbc556543..8c6ec1da2336fd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2352,6 +2352,21 @@ static inline bool ptlock_init(struct page *page) { return true; }
 static inline void ptlock_free(struct page *page) {}
 #endif /* USE_SPLIT_PTE_PTLOCKS */
 
+static inline int pmd_get_pte(pmd_t *pmd)
+{
+	return page_ref_inc_return(pmd_page(*pmd));
+}
+
+static inline bool pmd_put_pte(pmd_t *pmd)
+{
+	return page_ref_add_unless(pmd_page(*pmd), -1, 1);
+}
+
+static inline int cow_pte_count(pmd_t *pmd)
+{
+	return page_count(pmd_page(*pmd));
+}
+
 static inline void pgtable_init(void)
 {
 	ptlock_cache_init();
@@ -2364,6 +2379,7 @@ static inline bool pgtable_pte_page_ctor(struct page *page)
 		return false;
 	__SetPageTable(page);
 	inc_lruvec_page_state(page, NR_PAGETABLE);
+	set_page_count(page, 1);
 	return true;
 }
 
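Not part of the patch: a minimal userspace model of the refcount semantics the helpers above appear to implement. pgtable_pte_page_ctor() starts the PTE page's count at 1, pmd_get_pte() takes an extra reference when another mm attaches to the table, pmd_put_pte() drops a reference but refuses to drop the last one (page_ref_add_unless(page, -1, 1)), and cow_pte_count() reads the current share count. The struct and function bodies below are illustrative stand-ins, not kernel code.

/*
 * Userspace model of the COW-PTE refcount helpers above -- illustrative
 * only. "struct pte_table" stands in for the struct page that backs a
 * PTE table; the function names mirror the helpers in the hunk above.
 */
#include <stdbool.h>
#include <stdio.h>

struct pte_table {
	int refcount;		/* models page_count() of the PTE page */
};

/* pgtable_pte_page_ctor(): a freshly allocated table has one owner. */
static void pte_table_ctor(struct pte_table *t)
{
	t->refcount = 1;	/* models set_page_count(page, 1) */
}

/* pmd_get_pte(): another mm attaches to (shares) this PTE table. */
static int pmd_get_pte(struct pte_table *t)
{
	return ++t->refcount;	/* models page_ref_inc_return() */
}

/*
 * pmd_put_pte(): drop one reference, but never the last one.
 * Models page_ref_add_unless(page, -1, 1).
 */
static bool pmd_put_pte(struct pte_table *t)
{
	if (t->refcount == 1)
		return false;
	t->refcount--;
	return true;
}

/* cow_pte_count(): how many mms currently share this table. */
static int cow_pte_count(struct pte_table *t)
{
	return t->refcount;
}

int main(void)
{
	struct pte_table table;

	pte_table_ctor(&table);		/* parent owns it: count == 1 */
	pmd_get_pte(&table);		/* child shares it at fork: count == 2 */
	printf("shared by %d\n", cow_pte_count(&table));

	pmd_put_pte(&table);		/* child breaks COW: back to 1 */
	/* The last reference is never dropped by this helper. */
	printf("pmd_put_pte on last ref: %d\n", pmd_put_pte(&table));
	return 0;
}

Presumably the final reference is only released through the usual page-table free path, which is why pmd_put_pte() never takes the count below 1.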
diff --git a/mm/memory.c b/mm/memory.c
index 8a6d5c823f91b9..5b474d14a5411b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -786,11 +786,17 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
 		struct vm_area_struct *src_vma, unsigned long addr, int *rss)
 {
+	/* With COW PTE, dst_vma is src_vma. */
 	unsigned long vm_flags = dst_vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
 	swp_entry_t entry = pte_to_swp_entry(pte);
 
+	/*
+	 * If it's COW PTE, the parent shares the PTE with the child, which
+	 * means the following modifications by the child also affect the parent.
+	 */
+
 	if (likely(!non_swap_entry(entry))) {
 		if (swap_duplicate(entry) < 0)
 			return -EIO;
@@ -937,6 +943,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
 /*
  * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
  * is required to copy this pte.
+ * However, if prealloc is NULL, we are sharing the page table (COW PTE).
  */
 static inline int
 copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
@@ -960,6 +967,14 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	if (unlikely(page_try_dup_anon_rmap(page, false, src_vma))) {
 		/* Page may be pinned, we have to copy. */
 		put_page(page);
+		/*
+		 * If prealloc is NULL, we are processing a shared page
+		 * table (COW PTE, in copy_cow_pte_range()). We cannot
+		 * call copy_present_page() right now; instead, we
+		 * should fall back to copy_pte_range().
+		 */
+		if (!prealloc)
+			return -EAGAIN;
 		return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
 					 addr, rss, prealloc, page);
 	}
@@ -980,6 +995,11 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	}
 	VM_BUG_ON(page && PageAnon(page) && PageAnonExclusive(page));
 
+	/*
+	 * If it's COW PTE, the parent shares the PTE with the child,
+	 * which means the following also affects the parent.
+	 */
+
 	/*
 	 * If it's a shared mapping, mark it clean in
 	 * the child
@@ -988,6 +1008,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		pte = pte_mkclean(pte);
 	pte = pte_mkold(pte);
 
+	/* For COW PTE, dst_vma is still src_vma. */
 	if (!userfaultfd_wp(dst_vma))
 		pte = pte_clear_uffd_wp(pte);
 
@@ -1014,6 +1035,8 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
 	return new_page;
 }
 
+
+/* copy_pte_range() will immediately allocate a new page table. */
 static int
 copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	       pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
@@ -1138,6 +1161,199 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 	return ret;
 }
 
+/*
+ * copy_cow_pte_range() will try to share the page table with the child.
+ * The logic of non-present, present, and error handling is the same as in
+ * copy_pte_range(), but dst_vma and dst_pte are src_vma and src_pte.
+ *
+ * We cannot preserve soft-dirty information, because the PTE will be
+ * shared between multiple processes.
+ */
+static int
+copy_cow_pte_range(struct vm_area_struct *dst_vma,
+		   struct vm_area_struct *src_vma,
+		   pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
+		   unsigned long end, unsigned long *recover_end)
+{
+	struct mm_struct *dst_mm = dst_vma->vm_mm;
+	struct mm_struct *src_mm = src_vma->vm_mm;
+	struct vma_iterator vmi;
+	struct vm_area_struct *curr = src_vma;
+	pte_t *src_pte, *orig_src_pte;
+	spinlock_t *src_ptl;
+	int ret = 0;
+	int rss[NR_MM_COUNTERS];
+	swp_entry_t entry = (swp_entry_t){0};
+	unsigned long vm_end, orig_addr = addr;
+	pgtable_t pte_table = pmd_page(*src_pmd);
+
+	end = (addr + PMD_SIZE) & PMD_MASK;
+	addr = addr & PMD_MASK;
+
+	/*
+	 * Increase the refcount to prevent the parent's PTE from being
+	 * dropped/reused. Only increase the refcount the first time it
+	 * is attached.
+	 */
+	src_ptl = pte_lockptr(src_mm, src_pmd);
+	spin_lock(src_ptl);
+	pmd_get_pte(src_pmd);
+	pmd_install(dst_mm, dst_pmd, &pte_table);
+	spin_unlock(src_ptl);
+
+	/*
+	 * We should handle all of the entries in this PTE during this traversal,
+	 * since we cannot promise that the next vma will not do the lazy fork.
+	 */
+	vma_iter_init(&vmi, src_mm, addr);
+	for_each_vma_range(vmi, curr, end) {
+		vm_end = min(end, curr->vm_end);
+		addr = max(addr, curr->vm_start);
+again:
+		init_rss_vec(rss);
+		src_pte = pte_offset_map(src_pmd, addr);
+		src_ptl = pte_lockptr(src_mm, src_pmd);
+		orig_src_pte = src_pte;
+		spin_lock(src_ptl);
+
+		arch_enter_lazy_mmu_mode();
+
+		do {
+			if (pte_none(*src_pte))
+				continue;
+			if (unlikely(!pte_present(*src_pte))) {
+				/*
+				 * Although the parent's PTE is COW-ed, we
+				 * still need to handle all the swap entries.
+				 */
+				ret = copy_nonpresent_pte(dst_mm, src_mm,
+							  src_pte, src_pte,
+							  curr, curr,
+							  addr, rss);
+				if (ret == -EIO) {
+					entry = pte_to_swp_entry(*src_pte);
+					break;
+				} else if (ret == -EBUSY) {
+					break;
+				} else if (!ret)
+					continue;
+				/*
+				 * Device exclusive entry restored, continue by
+				 * copying the now present pte.
+				 */
+				WARN_ON_ONCE(ret != -ENOENT);
+			}
+			/*
+			 * copy_present_pte() will determine whether the
+			 * mapped page should be COW-ed or not.
+			 */
+			ret = copy_present_pte(curr, curr, src_pte, src_pte,
+					       addr, rss, NULL);
+			/*
+			 * If we need a pre-allocated page for this pte,
+			 * drop the lock, recover all the entries, fall
+			 * back to copy_pte_range(), and try again.
+			 */
+			if (unlikely(ret == -EAGAIN))
+				break;
+		} while (src_pte++, addr += PAGE_SIZE, addr != vm_end);
+
+		arch_leave_lazy_mmu_mode();
+		add_mm_rss_vec(dst_mm, rss);
+		spin_unlock(src_ptl);
+		pte_unmap(orig_src_pte);
+		cond_resched();
+
+		if (ret == -EIO) {
+			VM_WARN_ON_ONCE(!entry.val);
+			if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			entry.val = 0;
+		} else if (ret == -EBUSY) {
+			goto out;
+		} else if (ret == -EAGAIN) {
+			/*
+			 * We have to allocate the page immediately, but first
+			 * we should recover the processed entries and fall
+			 * back to copy_pte_range().
+			 */
+			*recover_end = addr;
+			return -EAGAIN;
+		} else if (ret) {
+			VM_WARN_ON_ONCE(1);
+		}
+
+		/* We've captured and resolved the error. Reset, try again. */
+		ret = 0;
+		if (addr != vm_end)
+			goto again;
+	}
+
+out:
+	/*
+	 * All the pte entries are available for COW.
+	 * Now we can share them with the child.
+	 */
+	pmdp_set_wrprotect(src_mm, orig_addr, src_pmd);
+	set_pmd_at(dst_mm, orig_addr, dst_pmd, pmd_wrprotect(*src_pmd));
+
+	return ret;
+}
+
+/* When doing the recovery, we should hold the locks entirely. */
+static int
+recover_pte_range(struct vm_area_struct *dst_vma,
+		  struct vm_area_struct *src_vma,
+		  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long end)
+{
+	struct mm_struct *dst_mm = dst_vma->vm_mm;
+	struct mm_struct *src_mm = src_vma->vm_mm;
+	pte_t *orig_src_pte, *orig_dst_pte;
+	pte_t *src_pte, *dst_pte;
+	spinlock_t *src_ptl, *dst_ptl;
+	unsigned long addr = end & PMD_MASK;
+	int ret = 0;
+
+	/* Before we allocate the new PTE, clear the entry. */
+	pmd_clear(dst_pmd);
+	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
+	if (!dst_pte) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	src_pte = pte_offset_map(src_pmd, addr);
+	src_ptl = pte_lockptr(src_mm, src_pmd);
+	spin_lock(src_ptl);
+
+	orig_src_pte = src_pte;
+	orig_dst_pte = dst_pte;
+	arch_enter_lazy_mmu_mode();
+
+	do {
+		if (pte_none(*src_pte))
+			continue;
+		/* COW mappings are also handled by COW PTE. */
+		set_pte_at(dst_mm, addr, dst_pte, *src_pte);
+	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+
+	arch_leave_lazy_mmu_mode();
+	/*
+	 * Before unlocking src_ptl, release the child's reference.
+	 * The parent may still share with others, so don't make it writable.
+	 */
+	pmd_put_pte(src_pmd);
+	spin_unlock(src_ptl);
+	pte_unmap(orig_src_pte);
+	pte_unmap_unlock(orig_dst_pte, dst_ptl);
+	cond_resched();
+out:
+
+	return ret;
+}
+
 static inline int
 copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		pud_t *dst_pud, pud_t *src_pud, unsigned long addr,
@@ -1166,6 +1382,53 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 			continue;
 		/* fall through */
 		}
+
+		/*
+		 * If MMF_COW_PTE is set, we will try to share the PTE page
+		 * table with the child first (copy_cow_pte_range()). In other
+		 * words, it attempts to do COW on the PTE (and the mapped pages).
+		 * However, if there is any unshareable page (e.g., pinned page,
+		 * device private page), it will fall back to the default path,
+		 * which will copy the page table immediately.
+		 * In such a case, it stores the address of the first unshareable
+		 * page in recover_end, then goes back to the beginning of the PTE
+		 * and recovers the COW-ed PTE entries until it meets the same
+		 * unshareable page again. During the recovery, because the COW-ed
+		 * PTE entries are logically the same as a COW mapping, it only
+		 * needs to allocate the new PTE and set the COW-ed PTE entries
+		 * in the new PTE (which will be the same as a COW mapping).
+		 */
+		if (test_bit(MMF_COW_PTE, &src_mm->flags)) {
+			unsigned long recover_end = 0;
+			int ret;
+
+			/*
+			 * Setting wrprotect on a pmd entry that points to a
+			 * PTE table will trigger pmd_bad(). Skip the bad check here.
+			 */
+			if (pmd_none(*src_pmd))
+				continue;
+			/* Skip if the PTE table already did COW PTE this time. */
+			if (!pmd_none(*dst_pmd) && !pmd_write(*dst_pmd))
+				continue;
+
+			ret = copy_cow_pte_range(dst_vma, src_vma,
+						 dst_pmd, src_pmd,
+						 addr, next, &recover_end);
+			if (!ret) {
+				/* COW PTE succeeded. */
+				continue;
+			} else if (ret == -EAGAIN) {
+				/* Fall back to the normal copy method. */
+				if (recover_pte_range(dst_vma, src_vma,
+						      dst_pmd, src_pmd,
+						      recover_end))
+					return -ENOMEM;
+				addr = recover_end;
+				/* fall through */
+			} else if (ret)
+				return -ENOMEM;
+		}
 		if (pmd_none_or_clear_bad(src_pmd))
 			continue;
 		if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
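Not part of the patch: a compressed userspace sketch of the fork-time flow the copy_pmd_range() hunk adds, assuming a single PMD-sized range that contains one pinned (unshareable) page. The stubs only stand in for copy_cow_pte_range(), recover_pte_range() and copy_pte_range(); the point is the ordering: try to share, on -EAGAIN recover the already COW-ed entries, then resume the normal copy from recover_end.

/*
 * Userspace model of the share / fall back / recover / resume flow --
 * illustrative only; the stub bodies are not the kernel implementation.
 */
#include <errno.h>
#include <stdio.h>

#define PMD_SIZE	0x200000UL	/* range covered by one PTE table */
#define PAGE_SIZE	0x1000UL

/* Pretend an unshareable (e.g. pinned) page sits at this address. */
static const unsigned long pinned_addr = 0x100000UL;

/* Model: try to COW-share the PTE table; stop at the first unshareable page. */
static int copy_cow_pte_range(unsigned long addr, unsigned long end,
			      unsigned long *recover_end)
{
	for (; addr < end; addr += PAGE_SIZE) {
		if (addr == pinned_addr) {
			*recover_end = addr;	/* remember where we stopped */
			return -EAGAIN;		/* caller must recover */
		}
	}
	return 0;	/* whole table shared, pmd write-protected */
}

/* Model: undo the sharing, copy the already COW-ed entries to a new table. */
static int recover_pte_range(unsigned long recover_end)
{
	printf("recovered entries below %#lx, dropped shared table ref\n",
	       recover_end);
	return 0;
}

/* Model: the default path that copies the remaining ptes one by one. */
static int copy_pte_range(unsigned long addr, unsigned long end)
{
	printf("normal copy from %#lx to %#lx\n", addr, end);
	return 0;
}

int main(void)
{
	unsigned long addr = 0, end = PMD_SIZE, recover_end = 0;
	int ret = copy_cow_pte_range(addr, end, &recover_end);

	if (ret == -EAGAIN) {
		/*
		 * Same order as the hunk: recover first, then resume the
		 * normal copy from the unshareable page onwards.
		 */
		if (recover_pte_range(recover_end))
			return 1;	/* -ENOMEM in the kernel */
		addr = recover_end;
		ret = copy_pte_range(addr, end);
	}
	return ret;
}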