KVM, x86/mmu: Support TDX private mapping for TDP MMU
Frame in TDX private mapping support so that a TD can run with the TDP MMU.

Similar to the legacy MMU, use the private mapping related kvm_x86_ops hooks
in __handle_changed_spte()/handle_removed_tdp_mmu_page() to support
creating/removing private mappings.  Also support temporarily blocking a
private mapping upon receiving an MMU notifier and unblocking it later upon
EPT violation, rather than completely removing the private page: page
migration for private pages is currently not supported, so the page cannot
be removed from the TD upon an MMU notifier.

Similar to the legacy MMU, zap the aliasing mapping (truly remove the page)
upon EPT violation.  Only the leaf page is zapped, not the intermediate
page tables.
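
A minimal standalone model of the leaf-only zap (not this patch's code; the
shared-bit position and both helper names are assumptions): the alias of a
faulting GFN is the same GFN with the shared/stolen bit flipped, and only
its leaf SPTE is dropped.

#include <stdint.h>

/* Assumed position of the shared/stolen GPA bit, expressed as a GFN bit. */
#define GFN_SHARED_BIT	(1ULL << 39)

/* Stand-in for the real leaf zap: only the leaf SPTE is cleared. */
static void tdp_mmu_zap_leaf(uint64_t gfn)
{
	/* ... clear the leaf SPTE for @gfn, leave the page tables alone ... */
	(void)gfn;
}

/* On a fault, remove only the leaf mapping of the opposite alias of the GFN. */
static void zap_aliasing_leaf(uint64_t fault_gfn)
{
	tdp_mmu_zap_leaf(fault_gfn ^ GFN_SHARED_BIT);
}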

Only 4K pages are supported at this stage.  2M page support can be added in
future patches.

A key change to the TDP MMU to support TDX guests is that page faults on
shared GPAs in direct_page_fault() now take write_lock() instead of
read_lock(), while faults on private GPAs keep using read_lock().  This is
because for a TD guest, at any given time only one type of mapping (either
private or shared) can be valid for a given GFN, but not both, otherwise it
may cause a machine check and data loss.  As a result, the aliasing mapping
is zapped in the TDP MMU fault handler.  In this case, running multiple
fault threads with both private and shared addresses concurrently may end
up with both private and shared mappings for a given GFN.  Consider the
case below: vcpu 0 is accessing using the private GPA, and vcpu 1 is
accessing the shared GPA (i.e. right after MAP_GPA).

	vcpu 0 				vcpu 1

  (fault with private GPA)	(fault with shared GPA)
	zap shared mapping
				zap private mapping
				setup shared mapping
	setup private mapping

This may end up with both private and shared mappings for the same GFN.
Perhaps it is arguable whether the above case is valid, but for security,
just don't allow private and shared faults to run concurrently.
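
Restated as a self-contained sketch, the resulting lock policy looks roughly
like the following (the real check is kvm_is_private_gfn() in the mmu.c hunk
below; the shared-bit mask and the helper names here are assumptions):
private faults take the MMU lock for read and may run concurrently, while
shared faults take it for write and therefore exclude everything else.

#include <stdbool.h>
#include <stdint.h>

/* Assumed: one stolen GPA bit distinguishes the shared alias of a GFN. */
#define GFN_SHARED_MASK	(1ULL << 39)

static bool gfn_is_private(uint64_t gfn)
{
	return !(gfn & GFN_SHARED_MASK);
}

/*
 * true  -> take mmu_lock for read (private faults may run concurrently)
 * false -> take mmu_lock for write (shared faults run exclusively)
 */
static bool fault_takes_read_lock(bool is_tdp_mmu_fault, uint64_t fault_gfn)
{
	return is_tdp_mmu_fault && gfn_is_private(fault_gfn);
}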

Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
kaihuang authored and yamahata committed Dec 16, 2021
1 parent c2120b3 commit 81d423a
Showing 5 changed files with 569 additions and 78 deletions.
63 changes: 57 additions & 6 deletions arch/x86/kvm/mmu/mmu.c
@@ -3698,7 +3698,11 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
goto out_unlock;

if (is_tdp_mmu_enabled(vcpu->kvm)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
if (gfn_shared && !VALID_PAGE(mmu->private_root_hpa)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, true);
mmu->private_root_hpa = root;
}
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, false);
mmu->root_hpa = root;
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
if (gfn_shared && !VALID_PAGE(vcpu->arch.mmu->private_root_hpa)) {
@@ -4368,7 +4372,33 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault

r = RET_PF_RETRY;

if (is_tdp_mmu_fault)
/*
* Unfortunately, when running a TD, the TDP MMU cannot always run
* multiple fault threads concurrently.  TDX has two page tables
* supporting private and shared mappings simultaneously, but at a given
* time only one mapping can be valid for a given GFN, otherwise it may
* cause a machine check and data loss.  Therefore, the TDP MMU fault
* handler zaps the aliasing mapping.
*
* Running fault threads for both private and shared GPAs concurrently
* can potentially end up with both private and shared mappings for one
* GFN.  For instance, vcpu 0 is accessing using the private GPA, and
* vcpu 1 is accessing using the shared GPA (i.e. right after MAP_GPA):
*
*        vcpu 0                        vcpu 1
*  (fault with private GPA)     (fault with shared GPA)
*
*  zap shared mapping
*                               zap private mapping
*                               setup shared mapping
*  setup private mapping
*
* This can be prevented by only allowing one type of fault (private
* or shared) to run concurrently.  Choose to let private faults run
* concurrently, because for a TD most pages should be private.
*/
if (is_tdp_mmu_fault && kvm_is_private_gfn(vcpu->kvm,
fault->addr >> PAGE_SHIFT))
read_lock(&vcpu->kvm->mmu_lock);
else
write_lock(&vcpu->kvm->mmu_lock);
@@ -4386,7 +4416,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
r = __direct_map(vcpu, fault);

out_unlock:
if (is_tdp_mmu_fault)
if (is_tdp_mmu_fault && kvm_is_private_gfn(vcpu->kvm,
fault->addr >> PAGE_SHIFT))
read_unlock(&vcpu->kvm->mmu_lock);
else
write_unlock(&vcpu->kvm->mmu_lock);
@@ -6073,6 +6104,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)

write_unlock(&kvm->mmu_lock);

/*
* For now the private root is never invalidated while the VM is running,
* so this can only happen for shared roots.
*/
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_zap_invalidated_roots(kvm);
@@ -6181,7 +6216,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (is_tdp_mmu_enabled(kvm)) {
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
gfn_end, flush);
gfn_end, flush,
false);
}

if (flush)
@@ -6214,6 +6250,11 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}

/*
* For now this can only happen for a non-TD VM, because TD private
* mapping doesn't support write protection.  kvm_tdp_mmu_wrprot_slot()
* will WARN() if it is reached for a TD.
*/
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
@@ -6293,6 +6334,11 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}

/*
* This should only be reachable in the dirty-logging case, which TD private
* mapping doesn't support so far.  kvm_tdp_mmu_zap_collapsible_sptes()
* WARNs internally when it is hit.
*/
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
@@ -6330,6 +6376,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}

/* See comments in kvm_mmu_slot_remove_write_access() */
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
@@ -6364,8 +6411,12 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, struct list_head *mmu_pages)
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);

if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_all(kvm);
if (is_tdp_mmu_enabled(kvm)) {
bool zap_private = (mmu_pages == &kvm->arch.private_mmu_pages);
kvm_tdp_mmu_zap_all(kvm, zap_private);
}

write_unlock(&kvm->mmu_lock);
}
20 changes: 17 additions & 3 deletions arch/x86/kvm/mmu/spte.h
@@ -172,7 +172,9 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
* If a thread running without exclusive control of the MMU lock must perform a
* multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
* non-present intermediate value. Other threads which encounter this value
* should not modify the SPTE.
* should not modify the SPTE.  When TDX is enabled, shadow_init_value,
* which has the "suppress #VE" bit set, is also set in the removed SPTE,
* because the TDX module always enables "EPT violation #VE".
*
* Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
@@ -182,12 +184,24 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*/
#define REMOVED_SPTE 0x5a0ULL

/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
/*
* Removed SPTEs must not be misconstrued as shadow present PTEs, nor as
* temporarily blocked private PTEs.
*/
static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
static_assert(!(REMOVED_SPTE & SPTE_PRIVATE_ZAPPED));

/*
* See the comment above around REMOVED_SPTE.  SHADOW_REMOVED_SPTE is the
* actual intermediate value installed for a removed SPTE.  When TDX is
* enabled, it also has the "suppress #VE" bit set; otherwise it equals
* REMOVED_SPTE.
*/
extern u64 __read_mostly shadow_init_value;
#define SHADOW_REMOVED_SPTE (shadow_init_value | REMOVED_SPTE)

static inline bool is_removed_spte(u64 spte)
{
return spte == REMOVED_SPTE;
return spte == SHADOW_REMOVED_SPTE;
}

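A small runnable model of the SHADOW_REMOVED_SPTE composition described
above (REMOVED_SPTE's value is from the hunk; the other bit positions are
assumptions, and in the kernel shadow_init_value is set up elsewhere): with
TDX the removed-SPTE marker carries the "suppress #VE" bit, and in either
case it must not look shadow-present.

#include <assert.h>
#include <stdint.h>

#define REMOVED_SPTE		0x5a0ULL	/* from the hunk above */
#define SPTE_MMU_PRESENT_MASK	(1ULL << 11)	/* assumed position */
#define EPT_SUPPRESS_VE		(1ULL << 63)	/* assumed position */

int main(void)
{
	/* shadow_init_value is 0 without TDX, "suppress #VE" with TDX. */
	uint64_t shadow_init_value = EPT_SUPPRESS_VE;
	uint64_t shadow_removed_spte = shadow_init_value | REMOVED_SPTE;

	/* The marker must never be mistaken for a shadow-present SPTE. */
	assert(!(shadow_removed_spte & SPTE_MMU_PRESENT_MASK));
	return 0;
}
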
2 changes: 1 addition & 1 deletion arch/x86/kvm/mmu/tdp_iter.h
@@ -28,7 +28,7 @@ struct tdp_iter {
tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
/* A pointer to the current SPTE */
tdp_ptep_t sptep;
/* The lowest GFN mapped by the current SPTE */
/* The lowest GFN (stolen bits included) mapped by the current SPTE */
gfn_t gfn;
/* The level of the root page given to the iterator */
int root_level;
