Skip to content

Commit dcab95e

Browse files
committed
KVM: TDX: Add sub-ioctl KVM_TDX_TERMINATE_VM
Add sub-ioctl KVM_TDX_TERMINATE_VM to release the HKID prior to shutdown, which enables more efficient reclaim of private memory.

Private memory is removed from MMU/TDP when guest_memfds are closed. If the HKID has not been released, the TDX VM is still in the RUNNABLE state, and so pages must be removed using the "Dynamic Page Removal" procedure (refer to the TDX Module Base spec), which involves a number of steps:

  - Block further address translation
  - Exit each VCPU
  - Clear Secure EPT entry
  - Flush/write-back/invalidate relevant caches

However, when the HKID is released, the TDX VM moves to the TD_TEARDOWN state, where all TDX VM pages are effectively unmapped, so pages can be reclaimed directly.

Reclaiming TD pages in the TD_TEARDOWN state was seen to decrease the total reclaim time. For example:

  VCPUs    Size (GB)    Before (secs)    After (secs)
      4           18               72              24
     32          107              517             134
     64          400             5539             467

Add kvm_tdx_capabilities.supported_caps along with KVM_TDX_CAP_TERMINATE_VM to advertise support to userspace. Use a new field in kvm_tdx_capabilities instead of adding yet another generic KVM_CAP to avoid bleeding TDX details into common code (and #ifdefs), and so that userspace can query TDX capabilities in one shot. Enumerating capabilities as a mask of bits does limit supported_caps to 64 capabilities, but in the unlikely event KVM needs to enumerate more than 64 TDX capabilities, there are another 249 u64 entries reserved for future expansion.
Link: https://lore.kernel.org/r/Z-V0qyTn2bXdrPF7@google.com
Link: https://lore.kernel.org/r/aAL4dT1pWG5dDDeo@google.com
Co-developed-by: Adrian Hunter <adrian.hunter@intel.com>
Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Acked-by: Vishal Annapurve <vannapurve@google.com>
Tested-by: Vishal Annapurve <vannapurve@google.com>
Tested-by: Xiaoyao Li <xiaoyao.li@intel.com>
Cc: Rick Edgecombe <rick.p.edgecombe@intel.com>
Cc: Nikolay Borisov <nik.borisov@suse.com>
Link: https://lore.kernel.org/r/20250718181541.98146-1-seanjc@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
1 parent 347e9f5 commit dcab95e

File tree

3 files changed

+55
-10
lines changed

3 files changed

+55
-10
lines changed

Documentation/virt/kvm/x86/intel-tdx.rst

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ ioctl with TDX specific sub-ioctl() commands.
3838
KVM_TDX_INIT_MEM_REGION,
3939
KVM_TDX_FINALIZE_VM,
4040
KVM_TDX_GET_CPUID,
41+
KVM_TDX_TERMINATE_VM,
4142

4243
KVM_TDX_CMD_NR_MAX,
4344
};
@@ -92,7 +93,10 @@ to be configured to the TDX guest.
9293
__u64 kernel_tdvmcallinfo_1_r12;
9394
__u64 user_tdvmcallinfo_1_r12;
9495

95-
__u64 reserved[250];
96+
/* Misc capabilities enumerated via the KVM_TDX_CAP_* namespace. */
97+
__u64 supported_caps;
98+
99+
__u64 reserved[249];
96100

97101
/* Configurable CPUID bits for userspace */
98102
struct kvm_cpuid2 cpuid;
@@ -227,6 +231,22 @@ struct kvm_cpuid2.
227231
__u32 padding[3];
228232
};
229233

234+
KVM_TDX_TERMINATE_VM
235+
--------------------
236+
:Capability: KVM_TDX_CAP_TERMINATE_VM
237+
:Type: vm ioctl
238+
:Returns: 0 on success, <0 on error
239+
240+
Release Host Key ID (HKID) to allow more efficient reclaim of private memory.
241+
After this, the TD is no longer in a runnable state.
242+
243+
Using KVM_TDX_TERMINATE_VM is optional.
244+
245+
- id: KVM_TDX_TERMINATE_VM
246+
- flags: must be 0
247+
- data: must be 0
248+
- hw_error: must be 0
249+
230250
KVM TDX creation flow
231251
=====================
232252
In addition to the standard KVM flow, new TDX ioctls need to be called. The

arch/x86/include/uapi/asm/kvm.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -940,6 +940,7 @@ enum kvm_tdx_cmd_id {
940940
KVM_TDX_INIT_MEM_REGION,
941941
KVM_TDX_FINALIZE_VM,
942942
KVM_TDX_GET_CPUID,
943+
KVM_TDX_TERMINATE_VM,
943944

944945
KVM_TDX_CMD_NR_MAX,
945946
};
@@ -962,6 +963,8 @@ struct kvm_tdx_cmd {
962963
__u64 hw_error;
963964
};
964965

966+
#define KVM_TDX_CAP_TERMINATE_VM _BITULL(0)
967+
965968
struct kvm_tdx_capabilities {
966969
__u64 supported_attrs;
967970
__u64 supported_xfam;
@@ -971,7 +974,9 @@ struct kvm_tdx_capabilities {
971974
__u64 kernel_tdvmcallinfo_1_r12;
972975
__u64 user_tdvmcallinfo_1_r12;
973976

974-
__u64 reserved[250];
977+
__u64 supported_caps;
978+
979+
__u64 reserved[249];
975980

976981
/* Configurable CPUID bits for userspace */
977982
struct kvm_cpuid2 cpuid;

arch/x86/kvm/vmx/tdx.c

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,8 @@ static int init_kvm_tdx_caps(const struct tdx_sys_info_td_conf *td_conf,
189189
if (!caps->supported_xfam)
190190
return -EIO;
191191

192+
caps->supported_caps = KVM_TDX_CAP_TERMINATE_VM;
193+
192194
caps->cpuid.nent = td_conf->num_cpuid_config;
193195

194196
caps->user_tdvmcallinfo_1_r11 =
@@ -522,6 +524,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
522524
goto out;
523525
}
524526

527+
write_lock(&kvm->mmu_lock);
525528
for_each_online_cpu(i) {
526529
if (packages_allocated &&
527530
cpumask_test_and_set_cpu(topology_physical_package_id(i),
@@ -546,7 +549,7 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
546549
} else {
547550
tdx_hkid_free(kvm_tdx);
548551
}
549-
552+
write_unlock(&kvm->mmu_lock);
550553
out:
551554
mutex_unlock(&tdx_lock);
552555
cpus_read_unlock();
@@ -1888,13 +1891,13 @@ int tdx_sept_remove_private_spte(struct kvm *kvm, gfn_t gfn,
18881891
struct page *page = pfn_to_page(pfn);
18891892
int ret;
18901893

1891-
/*
1892-
* HKID is released after all private pages have been removed, and set
1893-
* before any might be populated. Warn if zapping is attempted when
1894-
* there can't be anything populated in the private EPT.
1895-
*/
1896-
if (KVM_BUG_ON(!is_hkid_assigned(to_kvm_tdx(kvm)), kvm))
1897-
return -EINVAL;
1894+
if (!is_hkid_assigned(to_kvm_tdx(kvm))) {
1895+
KVM_BUG_ON(!kvm->vm_dead, kvm);
1896+
ret = tdx_reclaim_page(page);
1897+
if (!ret)
1898+
tdx_unpin(kvm, page);
1899+
return ret;
1900+
}
18981901

18991902
ret = tdx_sept_zap_private_spte(kvm, gfn, level, page);
19001903
if (ret <= 0)
@@ -2889,6 +2892,20 @@ static int tdx_td_finalize(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
28892892
return 0;
28902893
}
28912894

2895+
static int tdx_terminate_vm(struct kvm *kvm)
2896+
{
2897+
if (kvm_trylock_all_vcpus(kvm))
2898+
return -EBUSY;
2899+
2900+
kvm_vm_dead(kvm);
2901+
2902+
kvm_unlock_all_vcpus(kvm);
2903+
2904+
tdx_mmu_release_hkid(kvm);
2905+
2906+
return 0;
2907+
}
2908+
28922909
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
28932910
{
28942911
struct kvm_tdx_cmd tdx_cmd;
@@ -2916,6 +2933,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
29162933
case KVM_TDX_FINALIZE_VM:
29172934
r = tdx_td_finalize(kvm, &tdx_cmd);
29182935
break;
2936+
case KVM_TDX_TERMINATE_VM:
2937+
r = tdx_terminate_vm(kvm);
2938+
break;
29192939
default:
29202940
r = -EINVAL;
29212941
goto out;

0 commit comments

Comments
 (0)