KVM, x86/mmu: Support TDX private mapping for TDP MMU
Frame in TDX private mapping support so that a TD can run with the TDP MMU.

Similar to the legacy MMU, use the private mapping related kvm_x86_ops hooks
in __handle_changed_spte()/handle_removed_tdp_mmu_page() to support
creating/removing private mappings.  Also support temporarily blocking a
private mapping upon receiving an MMU notifier and unblocking it later upon
EPT violation, rather than completely removing the private page: page
migration for private pages is currently not supported, so the page cannot
be removed from the TD upon an MMU notifier.

Similar to the legacy MMU, zap the aliasing mapping (truly remove the page)
upon EPT violation.  Only the leaf page is zapped, not the intermediate
page tables.
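
A minimal standalone model of the leaf-only zap (not this patch's code; the
shared-bit position and both helper names are assumptions): the alias of a
faulting GFN is the same GFN with the shared/stolen bit flipped, and only
its leaf SPTE is dropped.

#include <stdint.h>

/* Assumed position of the shared/stolen GPA bit, expressed as a GFN bit. */
#define GFN_SHARED_BIT	(1ULL << 39)

/* Stand-in for the real leaf zap: only the leaf SPTE is cleared. */
static void tdp_mmu_zap_leaf(uint64_t gfn)
{
	/* ... clear the leaf SPTE for @gfn, leave the page tables alone ... */
	(void)gfn;
}

/* On a fault, remove only the leaf mapping of the opposite alias of the GFN. */
static void zap_aliasing_leaf(uint64_t fault_gfn)
{
	tdp_mmu_zap_leaf(fault_gfn ^ GFN_SHARED_BIT);
}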

Only 4K pages are supported at this stage.  2M page support can be added in
future patches.

A key change to the TDP MMU to support TDX guests is that page faults on
shared GPAs in direct_page_fault() now take write_lock() instead of
read_lock(), while faults on private GPAs keep using read_lock().  This is
because for a TD guest, at any given time only one type of mapping (either
private or shared) can be valid for a given GFN, but not both, otherwise it
may cause a machine check and data loss.  As a result, the aliasing mapping
is zapped in the TDP MMU fault handler.  In this case, running multiple
fault threads with both private and shared addresses concurrently may end
up with both private and shared mappings for a given GFN.  Consider the
case below: vcpu 0 is accessing using the private GPA, and vcpu 1 is
accessing the shared GPA (i.e. right after MAP_GPA).

	vcpu 0 				vcpu 1

  (fault with private GPA)	(fault with shared GPA)
	zap shared mapping
				zap private mapping
				setup shared mapping
	setup private mapping

This may end up with both private and shared mappings for the same GFN.
Perhaps it is arguable whether the above case is valid, but for security,
just don't allow private and shared faults to run concurrently.
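
Restated as a self-contained sketch, the resulting lock policy looks roughly
like the following (the real check is kvm_is_private_gfn() in the mmu.c hunk
below; the shared-bit mask and the helper names here are assumptions):
private faults take the MMU lock for read and may run concurrently, while
shared faults take it for write and therefore exclude everything else.

#include <stdbool.h>
#include <stdint.h>

/* Assumed: one stolen GPA bit distinguishes the shared alias of a GFN. */
#define GFN_SHARED_MASK	(1ULL << 39)

static bool gfn_is_private(uint64_t gfn)
{
	return !(gfn & GFN_SHARED_MASK);
}

/*
 * true  -> take mmu_lock for read (private faults may run concurrently)
 * false -> take mmu_lock for write (shared faults run exclusively)
 */
static bool fault_takes_read_lock(bool is_tdp_mmu_fault, uint64_t fault_gfn)
{
	return is_tdp_mmu_fault && gfn_is_private(fault_gfn);
}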

Signed-off-by: Kai Huang <kai.huang@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
kaihuang authored and yamahata committed Dec 16, 2021
1 parent c2120b3 commit 81d423a
Showing 5 changed files with 569 additions and 78 deletions.
63 changes: 57 additions & 6 deletions arch/x86/kvm/mmu/mmu.c
@@ -3698,7 +3698,11 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
goto out_unlock;

if (is_tdp_mmu_enabled(vcpu->kvm)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu);
if (gfn_shared && !VALID_PAGE(mmu->private_root_hpa)) {
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, true);
mmu->private_root_hpa = root;
}
root = kvm_tdp_mmu_get_vcpu_root_hpa(vcpu, false);
mmu->root_hpa = root;
} else if (shadow_root_level >= PT64_ROOT_4LEVEL) {
if (gfn_shared && !VALID_PAGE(vcpu->arch.mmu->private_root_hpa)) {
@@ -4368,7 +4372,33 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault

r = RET_PF_RETRY;

if (is_tdp_mmu_fault)
/*
* Unfortunately, when running a TD, the TDP MMU cannot always run
* multiple fault threads concurrently.  TDX has two page tables
* supporting private and shared mappings simultaneously, but at a given
* time only one mapping can be valid for a given GFN, otherwise it may
* cause a machine check and data loss.  Therefore, the TDP MMU fault
* handler zaps the aliasing mapping.
*
* Running fault threads for both private and shared GPAs concurrently
* can potentially end up with both private and shared mappings for one
* GFN.  For instance, vcpu 0 is accessing using the private GPA, and
* vcpu 1 is accessing using the shared GPA (i.e. right after MAP_GPA):
*
*        vcpu 0                        vcpu 1
*  (fault with private GPA)     (fault with shared GPA)
*
*  zap shared mapping
*                               zap private mapping
*                               setup shared mapping
*  setup private mapping
*
* This can be prevented by only allowing one type of fault (private
* or shared) to run concurrently.  Choose to let private faults run
* concurrently, because for a TD most pages should be private.
*/
if (is_tdp_mmu_fault && kvm_is_private_gfn(vcpu->kvm,
fault->addr >> PAGE_SHIFT))
read_lock(&vcpu->kvm->mmu_lock);
else
write_lock(&vcpu->kvm->mmu_lock);
@@ -4386,7 +4416,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
r = __direct_map(vcpu, fault);

out_unlock:
if (is_tdp_mmu_fault)
if (is_tdp_mmu_fault && kvm_is_private_gfn(vcpu->kvm,
fault->addr >> PAGE_SHIFT))
read_unlock(&vcpu->kvm->mmu_lock);
else
write_unlock(&vcpu->kvm->mmu_lock);
@@ -6073,6 +6104,10 @@ static void kvm_mmu_zap_all_fast(struct kvm *kvm)

write_unlock(&kvm->mmu_lock);

/*
* For now the private root is never invalidated while the VM is running,
* so this can only happen for shared roots.
*/
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_zap_invalidated_roots(kvm);
@@ -6181,7 +6216,8 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
if (is_tdp_mmu_enabled(kvm)) {
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start,
gfn_end, flush);
gfn_end, flush,
false);
}

if (flush)
@@ -6214,6 +6250,11 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}

/*
* For now this can only happen for a non-TD VM, because TD private
* mapping doesn't support write protection.  kvm_tdp_mmu_wrprot_slot()
* will WARN() if it is reached for a TD.
*/
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level);
@@ -6293,6 +6334,11 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}

/*
* This should only be reachable in the dirty-logging case, which TD private
* mapping doesn't support so far.  kvm_tdp_mmu_zap_collapsible_sptes()
* WARNs internally when it is hit.
*/
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
@@ -6330,6 +6376,7 @@ void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
write_unlock(&kvm->mmu_lock);
}

/* See comments in kvm_mmu_slot_remove_write_access() */
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot);
@@ -6364,8 +6411,12 @@ static void __kvm_mmu_zap_all(struct kvm *kvm, struct list_head *mmu_pages)
}
kvm_mmu_commit_zap_page(kvm, &invalid_list);

if (is_tdp_mmu_enabled(kvm))
kvm_tdp_mmu_zap_all(kvm);
if (is_tdp_mmu_enabled(kvm)) {
bool zap_private = (mmu_pages == &kvm->arch.private_mmu_pages);
kvm_tdp_mmu_zap_all(kvm, zap_private);
}

write_unlock(&kvm->mmu_lock);
}
20 changes: 17 additions & 3 deletions arch/x86/kvm/mmu/spte.h
@@ -172,7 +172,9 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
* If a thread running without exclusive control of the MMU lock must perform a
* multi-part operation on an SPTE, it can set the SPTE to REMOVED_SPTE as a
* non-present intermediate value. Other threads which encounter this value
* should not modify the SPTE.
* should not modify the SPTE.  When TDX is enabled, shadow_init_value,
* which has the "suppress #VE" bit set, is also set in the removed SPTE,
* because the TDX module always enables "EPT violation #VE".
*
* Use a semi-arbitrary value that doesn't set RWX bits, i.e. is not-present on
both AMD and Intel CPUs, and doesn't set PFN bits, i.e. doesn't create a L1TF
@@ -182,12 +184,24 @@ extern u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
*/
#define REMOVED_SPTE 0x5a0ULL

/* Removed SPTEs must not be misconstrued as shadow present PTEs. */
/*
* Removed SPTEs must not be misconstrued as shadow present PTEs, nor as
* temporarily blocked private PTEs.
*/
static_assert(!(REMOVED_SPTE & SPTE_MMU_PRESENT_MASK));
static_assert(!(REMOVED_SPTE & SPTE_PRIVATE_ZAPPED));

/*
* See the comment above around REMOVED_SPTE.  SHADOW_REMOVED_SPTE is the
* actual intermediate value installed for a removed SPTE.  When TDX is
* enabled, it also has the "suppress #VE" bit set; otherwise it equals
* REMOVED_SPTE.
*/
extern u64 __read_mostly shadow_init_value;
#define SHADOW_REMOVED_SPTE (shadow_init_value | REMOVED_SPTE)

static inline bool is_removed_spte(u64 spte)
{
return spte == REMOVED_SPTE;
return spte == SHADOW_REMOVED_SPTE;
}

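A small runnable model of the SHADOW_REMOVED_SPTE composition described
above (REMOVED_SPTE's value is from the hunk; the other bit positions are
assumptions, and in the kernel shadow_init_value is set up elsewhere): with
TDX the removed-SPTE marker carries the "suppress #VE" bit, and in either
case it must not look shadow-present.

#include <assert.h>
#include <stdint.h>

#define REMOVED_SPTE		0x5a0ULL	/* from the hunk above */
#define SPTE_MMU_PRESENT_MASK	(1ULL << 11)	/* assumed position */
#define EPT_SUPPRESS_VE		(1ULL << 63)	/* assumed position */

int main(void)
{
	/* shadow_init_value is 0 without TDX, "suppress #VE" with TDX. */
	uint64_t shadow_init_value = EPT_SUPPRESS_VE;
	uint64_t shadow_removed_spte = shadow_init_value | REMOVED_SPTE;

	/* The marker must never be mistaken for a shadow-present SPTE. */
	assert(!(shadow_removed_spte & SPTE_MMU_PRESENT_MASK));
	return 0;
}
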
2 changes: 1 addition & 1 deletion arch/x86/kvm/mmu/tdp_iter.h
@@ -28,7 +28,7 @@ struct tdp_iter {
tdp_ptep_t pt_path[PT64_ROOT_MAX_LEVEL];
/* A pointer to the current SPTE */
tdp_ptep_t sptep;
/* The lowest GFN mapped by the current SPTE */
/* The lowest GFN (stolen bits included) mapped by the current SPTE */
gfn_t gfn;
/* The level of the root page given to the iterator */
int root_level;
