
Commit a577509

vipinsh authored and sean-jc committed
KVM: x86/mmu: Recover TDP MMU NX huge pages using MMU read lock
Use MMU read lock to recover TDP MMU NX huge pages. To prevent concurrent
modification of the list of potential huge pages, iterate over the list under
tdp_mmu_pages_lock protection and unaccount the page before dropping the lock.

Zapping under MMU read lock unblocks vCPUs which are waiting for MMU read
lock, which solves a guest jitter issue on Windows VMs which were observing an
increase in network latency.

Do not zap an SPTE if:

- The SPTE is a root page.
- The SPTE does not point at the SP's page table.

If the SPTE does not point at the SP's page table, then something else has
changed the SPTE, so KVM cannot safely zap it.

Warn if zapping an SPTE fails while the current SPTE still points at the same
page table, as it should be impossible for the CMPXCHG to fail given that all
other write scenarios are mutually exclusive.

There is always a race between dirty logging, vCPU faults, and NX huge page
recovery for backing a gfn by an NX huge page or an executable small page.
Unaccounting sooner during the list traversal increases the window of that
race, but functionally, it is okay. Accounting doesn't protect against the
iTLB multi-hit bug; it is there purely to prevent KVM from bouncing a gfn
between two page sizes. The only downside is that a vCPU will end up doing
more work in tearing down all the child SPTEs. This should be a very rare
race.

Suggested-by: Sean Christopherson <seanjc@google.com>
Signed-off-by: Vipin Sharma <vipinsh@google.com>
Co-developed-by: James Houghton <jthoughton@google.com>
Signed-off-by: James Houghton <jthoughton@google.com>
Link: https://lore.kernel.org/r/20250707224720.4016504-4-jthoughton@google.com
[sean: clean up kvm_mmu_sp_dirty_logging_enabled() and the changelog]
Signed-off-by: Sean Christopherson <seanjc@google.com>
1 parent 6210556 commit a577509
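
The locking scheme described in the changelog can be illustrated with a small
userspace analogue (not KVM code): the list of candidate pages is protected by
its own spinlock, a candidate is unaccounted (removed) before that spinlock is
dropped, and the expensive zap work runs while holding only the shared (read)
side of the big lock. This is a minimal sketch using POSIX threads; all names
(recovery_ctx, nx_page, zap(), recover()) are illustrative, not kernel or KVM
identifiers.

    #include <pthread.h>
    #include <stdio.h>

    struct nx_page {
            struct nx_page *next;
            int id;
    };

    struct recovery_ctx {
            pthread_rwlock_t mmu_lock;     /* shared/exclusive, akin to mmu_lock */
            pthread_spinlock_t pages_lock; /* protects the list, akin to tdp_mmu_pages_lock */
            struct nx_page *possible_nx;   /* singly linked list of candidates */
    };

    /* Stand-in for the expensive zap; runs outside the spinlock. */
    static void zap(struct nx_page *page)
    {
            printf("zapped page %d\n", page->id);
    }

    static void recover(struct recovery_ctx *ctx, unsigned long to_zap)
    {
            /* Shared lock only: other readers (vCPUs in the real code) are not blocked. */
            pthread_rwlock_rdlock(&ctx->mmu_lock);

            for ( ; to_zap; --to_zap) {
                    struct nx_page *page;

                    /* Only list manipulation happens under the spinlock. */
                    pthread_spin_lock(&ctx->pages_lock);
                    page = ctx->possible_nx;
                    if (!page) {
                            pthread_spin_unlock(&ctx->pages_lock);
                            break;
                    }
                    ctx->possible_nx = page->next;  /* "unaccount" before unlocking */
                    pthread_spin_unlock(&ctx->pages_lock);

                    zap(page);
            }

            pthread_rwlock_unlock(&ctx->mmu_lock);
    }

    int main(void)
    {
            static struct nx_page pages[3] = {
                    { &pages[1], 0 }, { &pages[2], 1 }, { NULL, 2 },
            };
            struct recovery_ctx ctx = { .possible_nx = &pages[0] };

            pthread_rwlock_init(&ctx.mmu_lock, NULL);
            pthread_spin_init(&ctx.pages_lock, PTHREAD_PROCESS_PRIVATE);
            recover(&ctx, 2);       /* zaps pages 0 and 1, leaves page 2 queued */
            return 0;
    }

The real recovery loop additionally checks need_resched()/rwlock_needbreak()
and yields mmu_lock in the matching (read or write) mode; that is omitted here
for brevity.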

File tree

2 files changed: +98, -44 lines

arch/x86/kvm/mmu/mmu.c

Lines changed: 63 additions & 37 deletions
@@ -7596,17 +7596,43 @@ static unsigned long nx_huge_pages_to_zap(struct kvm *kvm,
         return ratio ? DIV_ROUND_UP(pages, ratio) : 0;
 }
 
+static bool kvm_mmu_sp_dirty_logging_enabled(struct kvm *kvm,
+                                             struct kvm_mmu_page *sp)
+{
+        struct kvm_memory_slot *slot;
+
+        /*
+         * Skip the memslot lookup if dirty tracking can't possibly be enabled,
+         * as memslot lookups are relatively expensive.
+         *
+         * If a memslot update is in progress, reading an incorrect value of
+         * kvm->nr_memslots_dirty_logging is not a problem: if it is becoming
+         * zero, KVM will do an unnecessary memslot lookup; if it is becoming
+         * nonzero, the page will be zapped unnecessarily.  Either way, this
+         * only affects efficiency in racy situations, and not correctness.
+         */
+        if (!atomic_read(&kvm->nr_memslots_dirty_logging))
+                return false;
+
+        slot = __gfn_to_memslot(kvm_memslots_for_spte_role(kvm, sp->role), sp->gfn);
+        if (WARN_ON_ONCE(!slot))
+                return false;
+
+        return kvm_slot_dirty_track_enabled(slot);
+}
+
 static void kvm_recover_nx_huge_pages(struct kvm *kvm,
-                                      enum kvm_mmu_type mmu_type)
+                                      const enum kvm_mmu_type mmu_type)
 {
 #ifdef CONFIG_X86_64
         const bool is_tdp_mmu = mmu_type == KVM_TDP_MMU;
+        spinlock_t *tdp_mmu_pages_lock = &kvm->arch.tdp_mmu_pages_lock;
 #else
         const bool is_tdp_mmu = false;
+        spinlock_t *tdp_mmu_pages_lock = NULL;
 #endif
         unsigned long to_zap = nx_huge_pages_to_zap(kvm, mmu_type);
         struct list_head *nx_huge_pages;
-        struct kvm_memory_slot *slot;
         struct kvm_mmu_page *sp;
         LIST_HEAD(invalid_list);
         bool flush = false;
@@ -7615,7 +7641,10 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
         nx_huge_pages = &kvm->arch.possible_nx_huge_pages[mmu_type].pages;
 
         rcu_idx = srcu_read_lock(&kvm->srcu);
-        write_lock(&kvm->mmu_lock);
+        if (is_tdp_mmu)
+                read_lock(&kvm->mmu_lock);
+        else
+                write_lock(&kvm->mmu_lock);
 
         /*
          * Zapping TDP MMU shadow pages, including the remote TLB flush, must
@@ -7625,8 +7654,14 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
         rcu_read_lock();
 
         for ( ; to_zap; --to_zap) {
-                if (list_empty(nx_huge_pages))
+                if (is_tdp_mmu)
+                        spin_lock(tdp_mmu_pages_lock);
+
+                if (list_empty(nx_huge_pages)) {
+                        if (is_tdp_mmu)
+                                spin_unlock(tdp_mmu_pages_lock);
                         break;
+                }
 
                 /*
                  * We use a separate list instead of just using active_mmu_pages
@@ -7641,58 +7676,49 @@ static void kvm_recover_nx_huge_pages(struct kvm *kvm,
                 WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
                 WARN_ON_ONCE(!sp->role.direct);
 
+                unaccount_nx_huge_page(kvm, sp);
+
+                if (is_tdp_mmu)
+                        spin_unlock(tdp_mmu_pages_lock);
+
                 /*
-                 * Unaccount and do not attempt to recover any NX Huge Pages
-                 * that are being dirty tracked, as they would just be faulted
-                 * back in as 4KiB pages. The NX Huge Pages in this slot will be
-                 * recovered, along with all the other huge pages in the slot,
-                 * when dirty logging is disabled.
-                 *
-                 * Since gfn_to_memslot() is relatively expensive, it helps to
-                 * skip it if it the test cannot possibly return true.  On the
-                 * other hand, if any memslot has logging enabled, chances are
-                 * good that all of them do, in which case unaccount_nx_huge_page()
-                 * is much cheaper than zapping the page.
-                 *
-                 * If a memslot update is in progress, reading an incorrect value
-                 * of kvm->nr_memslots_dirty_logging is not a problem: if it is
-                 * becoming zero, gfn_to_memslot() will be done unnecessarily; if
-                 * it is becoming nonzero, the page will be zapped unnecessarily.
-                 * Either way, this only affects efficiency in racy situations,
-                 * and not correctness.
+                 * Do not attempt to recover any NX Huge Pages that are being
+                 * dirty tracked, as they would just be faulted back in as 4KiB
+                 * pages.  The NX Huge Pages in this slot will be recovered,
+                 * along with all the other huge pages in the slot, when dirty
+                 * logging is disabled.
                  */
-                slot = NULL;
-                if (atomic_read(&kvm->nr_memslots_dirty_logging)) {
-                        struct kvm_memslots *slots;
+                if (!kvm_mmu_sp_dirty_logging_enabled(kvm, sp)) {
+                        if (is_tdp_mmu)
+                                flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
+                        else
+                                kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 
-                        slots = kvm_memslots_for_spte_role(kvm, sp->role);
-                        slot = __gfn_to_memslot(slots, sp->gfn);
-                        WARN_ON_ONCE(!slot);
                 }
 
-                if (slot && kvm_slot_dirty_track_enabled(slot))
-                        unaccount_nx_huge_page(kvm, sp);
-                else if (is_tdp_mmu)
-                        flush |= kvm_tdp_mmu_zap_possible_nx_huge_page(kvm, sp);
-                else
-                        kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
                 WARN_ON_ONCE(sp->nx_huge_page_disallowed);
 
                 if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) {
                         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
                         rcu_read_unlock();
 
-                        cond_resched_rwlock_write(&kvm->mmu_lock);
-                        flush = false;
+                        if (is_tdp_mmu)
+                                cond_resched_rwlock_read(&kvm->mmu_lock);
+                        else
+                                cond_resched_rwlock_write(&kvm->mmu_lock);
 
+                        flush = false;
                         rcu_read_lock();
                 }
         }
         kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush);
 
         rcu_read_unlock();
 
-        write_unlock(&kvm->mmu_lock);
+        if (is_tdp_mmu)
+                read_unlock(&kvm->mmu_lock);
+        else
+                write_unlock(&kvm->mmu_lock);
         srcu_read_unlock(&kvm->srcu, rcu_idx);
 }
 
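
The kvm_mmu_sp_dirty_logging_enabled() helper added above is an instance of a
cheap, intentionally racy fast path guarding an expensive lookup: read an
atomic counter first and only fall back to the memslot walk when it is
nonzero. Below is a minimal standalone C11 sketch of that shape;
expensive_slot_lookup(), struct slot and nr_slots_dirty_logging are made-up
placeholders for the example, not KVM APIs.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    struct slot {
            bool dirty_tracking;
    };

    /* Counts slots with dirty logging enabled, akin to kvm->nr_memslots_dirty_logging. */
    static atomic_int nr_slots_dirty_logging;

    /* Placeholder for the relatively expensive memslot walk. */
    static struct slot *expensive_slot_lookup(unsigned long gfn)
    {
            (void)gfn;
            return NULL;
    }

    static bool dirty_logging_enabled(unsigned long gfn)
    {
            struct slot *slot;

            /*
             * Racy fast path: a stale zero/nonzero read only costs an extra
             * lookup or an unnecessary zap, never a correctness problem.
             */
            if (!atomic_load(&nr_slots_dirty_logging))
                    return false;

            slot = expensive_slot_lookup(gfn);
            return slot && slot->dirty_tracking;
    }

    int main(void)
    {
            printf("%d\n", dirty_logging_enabled(0x1000)); /* fast path, prints 0 */
            atomic_fetch_add(&nr_slots_dirty_logging, 1);
            printf("%d\n", dirty_logging_enabled(0x1000)); /* slow path taken */
            return 0;
    }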

arch/x86/kvm/mmu/tdp_mmu.c

Lines changed: 35 additions & 7 deletions
@@ -928,21 +928,49 @@ static void tdp_mmu_zap_root(struct kvm *kvm, struct kvm_mmu_page *root,
 bool kvm_tdp_mmu_zap_possible_nx_huge_page(struct kvm *kvm,
                                            struct kvm_mmu_page *sp)
 {
-        u64 old_spte;
+        struct tdp_iter iter = {
+                .old_spte = sp->ptep ? kvm_tdp_mmu_read_spte(sp->ptep) : 0,
+                .sptep = sp->ptep,
+                .level = sp->role.level + 1,
+                .gfn = sp->gfn,
+                .as_id = kvm_mmu_page_as_id(sp),
+        };
+
+        lockdep_assert_held_read(&kvm->mmu_lock);
+
+        if (WARN_ON_ONCE(!is_tdp_mmu_page(sp)))
+                return false;
 
         /*
-         * This helper intentionally doesn't allow zapping a root shadow page,
-         * which doesn't have a parent page table and thus no associated entry.
+         * Root shadow pages don't have a parent page table and thus no
+         * associated entry, but they can never be possible NX huge pages.
          */
         if (WARN_ON_ONCE(!sp->ptep))
                 return false;
 
-        old_spte = kvm_tdp_mmu_read_spte(sp->ptep);
-        if (WARN_ON_ONCE(!is_shadow_present_pte(old_spte)))
+        /*
+         * Since mmu_lock is held in read mode, it's possible another task has
+         * already modified the SPTE.  Zap the SPTE if and only if the SPTE
+         * points at the SP's page table, as checking shadow-present isn't
+         * sufficient, e.g. the SPTE could be replaced by a leaf SPTE, or even
+         * another SP.  Note, spte_to_child_pt() also checks that the SPTE is
+         * shadow-present, i.e. guards against zapping a frozen SPTE.
+         */
+        if ((tdp_ptep_t)sp->spt != spte_to_child_pt(iter.old_spte, iter.level))
                 return false;
 
-        tdp_mmu_set_spte(kvm, kvm_mmu_page_as_id(sp), sp->ptep, old_spte,
-                         SHADOW_NONPRESENT_VALUE, sp->gfn, sp->role.level + 1);
+        /*
+         * If a different task modified the SPTE, then it should be impossible
+         * for the SPTE to still be used for the to-be-zapped SP.  Non-leaf
+         * SPTEs don't have Dirty bits, KVM always sets the Accessed bit when
+         * creating non-leaf SPTEs, and all other bits are immutable for non-
+         * leaf SPTEs, i.e. the only legal operations for non-leaf SPTEs are
+         * zapping and replacement.
+         */
+        if (tdp_mmu_set_spte_atomic(kvm, &iter, SHADOW_NONPRESENT_VALUE)) {
+                WARN_ON_ONCE((tdp_ptep_t)sp->spt == spte_to_child_pt(iter.old_spte, iter.level));
+                return false;
+        }
 
         return true;
 }
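
The rewritten kvm_tdp_mmu_zap_possible_nx_huge_page() relies on a
compare-and-exchange so that a zap attempted under the shared lock simply
fails if another task has already changed the SPTE. The following reduced,
standalone C11 sketch shows that zap-if-unchanged shape on a single 64-bit
word; spte_t, child_pt_of(), NONPRESENT and zap_if_points_at() are
illustrative stand-ins rather than KVM definitions, and the "low 12 bits are
flags" encoding is assumed only for the example.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NONPRESENT 0ULL

    typedef _Atomic uint64_t spte_t;

    /* Pretend the non-flag bits of a present SPTE encode the child page table. */
    static uint64_t child_pt_of(uint64_t spte)
    {
            return spte & ~0xfffULL;
    }

    /*
     * Zap iff the SPTE still points at the expected child table.  Because only
     * a shared lock is held, another task may have already replaced the SPTE;
     * the compare-and-exchange fails in that case and the caller skips the page.
     */
    static bool zap_if_points_at(spte_t *sptep, uint64_t expected_child_pt)
    {
            uint64_t old = atomic_load(sptep);

            if (child_pt_of(old) != expected_child_pt)
                    return false;

            if (!atomic_compare_exchange_strong(sptep, &old, NONPRESENT)) {
                    /*
                     * The CMPXCHG lost a race.  If the refreshed value still
                     * points at the same child table, something illegal
                     * happened, since non-leaf SPTEs only change by being
                     * zapped or replaced.
                     */
                    if (child_pt_of(old) == expected_child_pt)
                            fprintf(stderr, "unexpected in-place modification\n");
                    return false;
            }

            return true;
    }

    int main(void)
    {
            spte_t spte = 0x123000ULL | 0x7;        /* "present" SPTE -> table 0x123000 */

            printf("zapped: %d\n", zap_if_points_at(&spte, 0x123000ULL));       /* 1 */
            printf("zapped again: %d\n", zap_if_points_at(&spte, 0x123000ULL)); /* 0 */
            return 0;
    }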
