Skip to content

Commit 75c69c8

Browse files
committed
KVM: x86: Load guest/host XCR0 and XSS outside of the fastpath run loop
Move KVM's swapping of XFEATURE masks, i.e. XCR0 and XSS, out of the fastpath loop now that the guts of the #MC handler runs in task context, i.e. won't invoke schedule() with preemption disabled and clobber state (or crash the kernel) due to trying to context switch XSTATE with a mix of host and guest state. For all intents and purposes, this reverts commit 1811d97 ("x86/kvm: move kvm_load/put_guest_xcr0 into atomic context"), which papered over an egregious bug/flaw in the #MC handler where it would do schedule() even though IRQs are disabled. E.g. the call stack from the commit: kvm_load_guest_xcr0 ... kvm_x86_ops->run(vcpu) vmx_vcpu_run vmx_complete_atomic_exit kvm_machine_check do_machine_check do_memory_failure memory_failure lock_page Commit 1811d97 "fixed" the immediate issue of XRSTORS exploding, but completely ignored that scheduling out a vCPU task while IRQs and preemption are disabled is wildly broken. Thankfully, commit 5567d11 ("x86/mce: Send #MC singal from task work") (somewhat incidentally?) fixed that flaw by pushing the meat of the work to the user-return path, i.e. to task context. KVM has also hardened itself against #MC goofs by moving #MC forwarding to kvm_x86_ops.handle_exit_irqoff(), i.e. out of the fastpath. While that's by no means a robust fix, restoring as much state as possible before handling the #MC will hopefully provide some measure of protection in the event that #MC handling goes off the rails again. Note, KVM always intercepts XCR0 writes for vCPUs without protected state, e.g. there's no risk of consuming a stale XCR0 when determining if a PKRU update is needed; kvm_load_host_xfeatures() only reads, and never writes, vcpu->arch.xcr0. Deferring the XCR0 and XSS loads shaves ~300 cycles off the fastpath for Intel, and ~500 cycles for AMD. E.g. using INVD in KVM-Unit-Test's vmexit.c, with an extra hack to enable CR4.OSXSAVE, latency numbers for AMD Turin go from ~2000 => ~1500, and for Intel Emerald Rapids, go from ~1300 => ~1000. 
Cc: Jon Kohler <jon@nutanix.com> Reviewed-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Reviewed-by: Jon Kohler <jon@nutanix.com> Link: https://patch.msgid.link/20251118222328.2265758-4-seanjc@google.com Signed-off-by: Sean Christopherson <seanjc@google.com>
1 parent 63669bd commit 75c69c8

File tree

1 file changed

+26
-13
lines changed

1 file changed

+26
-13
lines changed

arch/x86/kvm/x86.c

Lines changed: 26 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1205,20 +1205,40 @@ void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
12051205
}
12061206
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_lmsw);
12071207

1208-
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
1208+
static void kvm_load_guest_xfeatures(struct kvm_vcpu *vcpu)
12091209
{
12101210
if (vcpu->arch.guest_state_protected)
12111211
return;
12121212

12131213
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
1214-
12151214
if (vcpu->arch.xcr0 != kvm_host.xcr0)
12161215
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
12171216

12181217
if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
12191218
vcpu->arch.ia32_xss != kvm_host.xss)
12201219
wrmsrq(MSR_IA32_XSS, vcpu->arch.ia32_xss);
12211220
}
1221+
}
1222+
1223+
static void kvm_load_host_xfeatures(struct kvm_vcpu *vcpu)
1224+
{
1225+
if (vcpu->arch.guest_state_protected)
1226+
return;
1227+
1228+
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
1229+
if (vcpu->arch.xcr0 != kvm_host.xcr0)
1230+
xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
1231+
1232+
if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
1233+
vcpu->arch.ia32_xss != kvm_host.xss)
1234+
wrmsrq(MSR_IA32_XSS, kvm_host.xss);
1235+
}
1236+
}
1237+
1238+
void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
1239+
{
1240+
if (vcpu->arch.guest_state_protected)
1241+
return;
12221242

12231243
if (cpu_feature_enabled(X86_FEATURE_PKU) &&
12241244
vcpu->arch.pkru != vcpu->arch.host_pkru &&
@@ -1240,17 +1260,6 @@ void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
12401260
if (vcpu->arch.pkru != vcpu->arch.host_pkru)
12411261
wrpkru(vcpu->arch.host_pkru);
12421262
}
1243-
1244-
if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {
1245-
1246-
if (vcpu->arch.xcr0 != kvm_host.xcr0)
1247-
xsetbv(XCR_XFEATURE_ENABLED_MASK, kvm_host.xcr0);
1248-
1249-
if (guest_cpu_cap_has(vcpu, X86_FEATURE_XSAVES) &&
1250-
vcpu->arch.ia32_xss != kvm_host.xss)
1251-
wrmsrq(MSR_IA32_XSS, kvm_host.xss);
1252-
}
1253-
12541263
}
12551264
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_load_host_xsave_state);
12561265

@@ -11264,6 +11273,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
1126411273
if (vcpu->arch.guest_fpu.xfd_err)
1126511274
wrmsrq(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
1126611275

11276+
kvm_load_guest_xfeatures(vcpu);
11277+
1126711278
if (unlikely(vcpu->arch.switch_db_regs &&
1126811279
!(vcpu->arch.switch_db_regs & KVM_DEBUGREG_AUTO_SWITCH))) {
1126911280
set_debugreg(DR7_FIXED_1, 7);
@@ -11350,6 +11361,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
1135011361
vcpu->mode = OUTSIDE_GUEST_MODE;
1135111362
smp_wmb();
1135211363

11364+
kvm_load_host_xfeatures(vcpu);
11365+
1135311366
/*
1135411367
* Sync xfd before calling handle_exit_irqoff() which may
1135511368
* rely on the fact that guest_fpu::xfd is up-to-date (e.g.

0 commit comments

Comments
 (0)