Skip to content
Permalink
Browse files
KVM, x86/tdp_mmu: optimize remote tlb flush
Implement batched TLB shootdown as an optimization.  The current
implementation zaps multiple EPT entries as follows:

Loop on GFNs
  1) Zap the EPT entry.  Zero the EPT entry.  PFN is saved on stack.
  2) TDX SEAMCALL TDH.MEM.RANGE.BLOCK with GFN.
     This corresponds to clearing the present bit.
  3) TDH.MEM.TRACK. This corresponds to local tlb flush.
  4) Send IPI to remote vcpus. This corresponds to remote tlb flush.
  5) When destructing VM, TDH.MEM.REMOVE with PFN.
     There is no corresponding to the VMX EPT operation.
At the end of zapping:
6) kvm_flush_remote_tlbs_with_address(). This flushes shared EPT pointer.
   No operations on Secure EPT.

The new implementation looks like this.  The number of TLB shootdowns is
reduced from the number of EPT entries to zap down to one.

Loop on GFNs
  1) Zap the EPT entry.
     Clear present/dirty/access bits
     Keep PFN
     Set SPTE_PRIVATE_ZAPPED to indicate valid PFN to unlink from Secure
     EPT.
  2) TDX SEAMCALL TDH.MEM.RANGE.BLOCK with GFN
     This corresponds to clearing the present bit.
3) TDH.MEM.TRACK. corresponds to local tlb flush
4) Send IPI to remote vcpus. This corresponds to remote tlb flush.
5) kvm_flush_remote_tlbs_with_address().  This flushes shared EPT
   pointer. No operations on Secure EPT.  When destructing VM, Check if
   SPTE_PRIVATE_ZAPPED and issue TDH.MEM.REMOVE with PFNs.

Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
  • Loading branch information
yamahata committed Jan 24, 2022
1 parent c0965be commit 6d8632a26d7060510bfe4fbb3011c2d6e2d2f079
Show file tree
Hide file tree
Showing 6 changed files with 234 additions and 34 deletions.
@@ -89,7 +89,9 @@ KVM_X86_OP(set_tss_addr)
KVM_X86_OP(set_identity_map_addr)
KVM_X86_OP(get_mt_mask)
KVM_X86_OP(load_mmu_pgd)
KVM_X86_OP(zap_private_spte)
KVM_X86_OP(free_private_sp)
KVM_X86_OP(handle_zapped_private_spte)
KVM_X86_OP(handle_changed_private_spte)
KVM_X86_OP_NULL(has_wbinvd_exit)
KVM_X86_OP(get_l2_tsc_offset)
@@ -1428,12 +1428,17 @@ struct kvm_x86_ops {
void (*load_mmu_pgd)(struct kvm_vcpu *vcpu, hpa_t root_hpa,
int root_level);

void (*zap_private_spte)(struct kvm *kvm, gfn_t gfn, enum pg_level level);
int (*free_private_sp)(struct kvm *kvm, gfn_t gfn, enum pg_level level,
void *private_sp);
void (*handle_zapped_private_spte)(
struct kvm *kvm, gfn_t gfn, enum pg_level level,
kvm_pfn_t old_pfn, bool is_present);
void (*handle_changed_private_spte)(
struct kvm *kvm, gfn_t gfn, enum pg_level level,
kvm_pfn_t old_pfn, bool was_present, bool was_leaf,
kvm_pfn_t new_pfn, bool is_present, bool is_leaf, void *sept_page);
kvm_pfn_t new_pfn, bool is_present, bool is_leaf, void *sept_page,
bool shared);

bool (*has_wbinvd_exit)(void);

@@ -304,6 +304,8 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
}

void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end);
void kvm_tdp_mmu_drop_zapped_private_gfn(struct kvm *kvm,
struct kvm_gfn_range *range);

int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);

@@ -396,7 +396,8 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
* unreachable.
*/
old_child_spte = READ_ONCE(*sptep);
if (!is_shadow_present_pte(old_child_spte))
if (!is_shadow_present_pte(old_child_spte) &&
!is_zapped_private_pte(old_child_spte))
continue;

/*
@@ -466,10 +467,12 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
kvm_pfn_t old_pfn = spte_to_pfn(old_spte);
kvm_pfn_t new_pfn = spte_to_pfn(new_spte);
bool pfn_changed = old_pfn != new_pfn;
bool was_zapped_private = is_zapped_private_pte(old_spte);

WARN_ON(level > PT64_ROOT_MAX_LEVEL);
WARN_ON(level < PG_LEVEL_4K);
WARN_ON(gfn & (KVM_PAGES_PER_HPAGE(level) - 1));
WARN_ON(was_zapped_private && !private_spte);

/*
* If this warning were to trigger it would indicate that there was a
@@ -499,6 +502,14 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,

trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte);

if (was_zapped_private) {
static_call(kvm_x86_handle_zapped_private_spte)(
kvm, gfn, level, old_pfn, is_present);
/* Temporarily blocked private SPTE can only be leaf. */
WARN_ON(!is_last_spte(old_spte, level));
return;
}

/*
* The only times a SPTE should be changed from a non-present to
* non-present state is when an MMIO entry is installed/modified/
@@ -549,10 +560,12 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn,
WARN_ON(sp->gfn != gfn);
}

WARN_ON(was_zapped_private && is_zapped_private_pte(new_spte));
WARN_ON(!shared && !is_zapped_private_pte(new_spte));
static_call(kvm_x86_handle_changed_private_spte)(
kvm, gfn, level,
old_pfn, was_present, was_leaf,
new_pfn, is_present, is_leaf, sept_page);
new_pfn, is_present, is_leaf, sept_page, shared);
}

/*
@@ -733,9 +746,15 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
#define tdp_root_for_each_pte(_iter, _root, _start, _end) \
for_each_tdp_pte(_iter, _root->spt, _root->role.level, _start, _end)

/*
* Note a temporarily blocked private SPTE is considered a valid leaf,
* although !is_shadow_present_pte() returns true for it, since the
* target page (which the mapping maps to) is still there.
*/
#define tdp_root_for_each_leaf_pte(_iter, _root, _start, _end) \
tdp_root_for_each_pte(_iter, _root, _start, _end) \
if ((!is_shadow_present_pte(_iter.old_spte)) || \
if ((!is_shadow_present_pte(_iter.old_spte) && \
!is_zapped_private_pte(_iter.old_spte)) || \
!is_last_spte(_iter.old_spte, _iter.level)) \
continue; \
else
@@ -745,6 +764,16 @@ static inline void tdp_mmu_set_spte_no_dirty_log(struct kvm *kvm,
__va((_private) ? _mmu->private_root_hpa : _mmu->root_hpa), \
_mmu->shadow_root_level, _start, _end)

/*
 * Compute the SPTE value used to temporarily block a private page.
 *
 * For a TD (protected) VM, keep the old PFN and the large-page bit in the
 * zapped SPTE and set SPTE_PRIVATE_ZAPPED, so the page can later be
 * unlinked from the Secure EPT (TDH.MEM.REMOVE) at VM destruction.
 * For a VM with no stolen GFN bits (no private memory), fall back to the
 * plain non-present init value.
 */
static u64 zapped_private_spte(struct kvm *kvm, const struct tdp_iter *iter)
{
if (!kvm_gfn_stolen_mask(kvm))
return shadow_init_value;

/*
 * The ternary must be parenthesized: ?: binds more loosely than |,
 * so without parentheses the whole OR-chain would become the
 * condition and the PFN/ZAPPED bits would be discarded.
 */
return shadow_init_value | SPTE_PRIVATE_ZAPPED |
(spte_to_pfn(iter->old_spte) << PAGE_SHIFT) |
(is_large_pte(iter->old_spte) ? PT_PAGE_SIZE_MASK : 0);
}

/*
* Yield if the MMU lock is contended or this thread needs to return control
* to the scheduler.
@@ -853,7 +882,12 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,
continue;
}

if (!is_shadow_present_pte(iter.old_spte))
/*
* Skip non-present SPTE, with exception of temporarily
* blocked private SPTE, which also needs to be zapped.
*/
if (!is_shadow_present_pte(iter.old_spte) &&
!is_zapped_private_pte(iter.old_spte))
continue;

/*
@@ -869,7 +903,8 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root,

if (!shared) {
/* see comments in tdp_mmu_zap_spte_atomic() */
tdp_mmu_set_spte(kvm, &iter, shadow_init_value);
tdp_mmu_set_spte(kvm, &iter,
zapped_private_spte(kvm, &iter));
flush = true;
} else if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
/*
@@ -1234,6 +1269,14 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
if (!is_shadow_present_pte(iter.old_spte)) {
bool account_nx;

/*
* TODO: large page support.
* Not expecting blocked private SPTE points to a
* large page now.
*/
WARN_ON(is_zapped_private_pte(iter.old_spte) &&
is_large_pte(iter.old_spte));

/*
* If SPTE has been frozen by another thread, just
* give up and retry, avoiding unnecessary page table
@@ -1319,6 +1362,41 @@ static __always_inline bool kvm_tdp_mmu_handle_gfn(struct kvm *kvm,
return ret;
}

/*
 * Per-SPTE callback for kvm_tdp_mmu_handle_gfn(): finalize one temporarily
 * blocked private SPTE by unlinking the page from the Secure EPT and
 * clearing the SPTE.  Returns true if an SPTE was actually dropped (so the
 * caller knows a flush/accounting action happened), false otherwise.
 */
static bool drop_zapped_private_spte(struct kvm *kvm, struct tdp_iter *iter,
struct kvm_gfn_range *range)
{
/* Yield first if the MMU lock is contended or rescheduling is needed. */
if (tdp_mmu_iter_cond_resched(kvm, iter, false, false))
return false;

/* Only private SPTEs may carry the SPTE_PRIVATE_ZAPPED marker. */
if (WARN_ON(!is_private_spte(iter->sptep)))
return false;
/* Nothing to do unless this SPTE was temporarily blocked. */
if (!is_zapped_private_pte(iter->old_spte))
return false;

/*
 * Unlink the page from the Secure EPT (TDH.MEM.REMOVE path) before
 * clearing the SPTE; order matters so the saved PFN stays visible
 * to the backend until it is consumed.
 */
static_call(kvm_x86_zap_private_spte)(kvm, iter->gfn, iter->level);
WRITE_ONCE(*rcu_dereference(iter->sptep), shadow_init_value);

return true;
}

/*
 * Walk every private TDP MMU root in every address space and drop the
 * temporarily blocked private SPTEs covered by @range, using
 * drop_zapped_private_spte() as the per-SPTE handler.
 */
void kvm_tdp_mmu_drop_zapped_private_gfn(struct kvm *kvm,
struct kvm_gfn_range *range)
{
struct kvm_mmu_page *root;
int i;

for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
for_each_tdp_mmu_root_yield_safe(kvm, root, i, false) {
/* Shared roots never hold zapped private SPTEs. */
if (is_private_sp(root))
kvm_tdp_mmu_handle_gfn(kvm, range,
drop_zapped_private_spte);
}
}
}
EXPORT_SYMBOL_GPL(kvm_tdp_mmu_drop_zapped_private_gfn);

/*
* Mark the SPTEs range of GFNs [start, end) unaccessed and return non-zero
* if any of the GFNs in the range have been accessed.
@@ -1402,7 +1480,7 @@ static bool set_spte_gfn(struct kvm *kvm, struct tdp_iter *iter,
* invariant that the PFN of a present * leaf SPTE can never change.
* See __handle_changed_spte().
*/
tdp_mmu_set_spte(kvm, iter, shadow_init_value);
tdp_mmu_set_spte(kvm, iter, zapped_private_spte(kvm, iter));

if (!pte_write(range->pte)) {
new_spte = kvm_mmu_changed_pte_notifier_make_spte(iter->old_spte,

0 comments on commit 6d8632a

Please sign in to comment.