From e025537f1da01ae30af975f0fe9d50c534ee44d3 Mon Sep 17 00:00:00 2001
From: Wenchao Wang
Date: Mon, 7 Dec 2020 16:36:20 +0800
Subject: [PATCH 1/6] Reset DR7 before setting DR0

Without resetting DR7, spurious exceptions encountered in unexpected
conditions may lead to privilege escalation. The exploitability depends a
lot on the host system and how it processes #DB exceptions. Typically, if
the #DB handler has certain conditions that make it execute SWAPGS, the ISR
will execute with a guest-controlled TLS (the IA32_KERNEL_GS_BASE MSR
contains a guest-controlled pointer), and a privilege escalation becomes
possible.

Signed-off-by: Wenchao Wang
---
 core/vcpu.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/core/vcpu.c b/core/vcpu.c
index 21561f8d..f1289841 100644
--- a/core/vcpu.c
+++ b/core/vcpu.c
@@ -1172,6 +1172,14 @@ static void load_guest_dr(struct vcpu_t *vcpu)
     if (!(is_guest_dr_dirty(vcpu) || is_host_debug_enabled(vcpu)))
         return;
 
+    // Reset DR7 to zero before setting DR0.
+    // If the host has enabled guest debugging, the guest could trigger
+    // spurious exceptions in the host by setting a kernel address in DR0.
+    // Spurious exceptions encountered in unexpected conditions (such as with
+    // the user GS loaded, though this particular case does not seem to be
+    // triggerable here) can lead to privilege escalation.
+    set_dr7(0);
+
     set_dr0(state->_dr0);
     set_dr1(state->_dr1);
     set_dr2(state->_dr2);
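
The ordering is the whole point of the fix: no breakpoint may stay armed
while DR0-DR3 are reloaded with guest-chosen addresses. A minimal sketch of
the intended sequence (illustrative only, not HAXM code; the set_drN() names
mirror the wrappers used in core/vcpu.c, and guest_dr_state is a
hypothetical stand-in for the relevant vcpu state fields):

    struct guest_dr_state {
        uint64_t dr0, dr1, dr2, dr3, dr6, dr7;
    };

    static void load_guest_debug_regs(const struct guest_dr_state *s)
    {
        set_dr7(0);        // disarm every breakpoint first
        set_dr0(s->dr0);   // a guest-chosen (possibly host-kernel) address
        set_dr1(s->dr1);   // can no longer raise a spurious #DB, because the
        set_dr2(s->dr2);   // stale DR7 enable bits have already been cleared
        set_dr3(s->dr3);
        set_dr6(s->dr6);
        set_dr7(s->dr7);   // finally arm the guest's own configuration
    }
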
From d07855f75a641a38704d02ca93d004d4e74587ee Mon Sep 17 00:00:00 2001
From: Wenchao Wang
Date: Mon, 7 Dec 2020 16:49:42 +0800
Subject: [PATCH 2/6] Check instruction offset is valid before accessing

Check that the bytes used in the insn_fetch_*() functions do not exceed the
15-byte stack buffer where they are stored.

It is easy for the guest to trigger out-of-bounds accesses in the host
simply by racing on the bytes pointed to by the faulting RIP. This is an
information disclosure of the host kernel stack: the out-of-bounds data
directly influences the instruction emulation, and it does not seem
complicated to leak host kernel stack data by observing the side effects of
the altered emulation.

Typically, if the destination of a MOVQ is encoded in 8 bytes located
outside of the 15-byte stack buffer, the MOVQ will be applied to a
destination address encoded by host kernel stack data. By scanning its
pages to see at which address data got written, the guest can trivially
infer the values of those 8 bytes of host kernel stack.

Signed-off-by: Wenchao Wang
---
 core/emulate.c         | 28 ++++++++++++++++++++++++----
 core/include/emulate.h |  4 ++++
 core/vcpu.c            |  4 ----
 3 files changed, 28 insertions(+), 8 deletions(-)

diff --git a/core/emulate.c b/core/emulate.c
index f4084414..576fdef4 100644
--- a/core/emulate.c
+++ b/core/emulate.c
@@ -586,28 +586,48 @@ static void register_add(struct em_context_t *ctxt,
 
 static uint8_t insn_fetch_u8(struct em_context_t *ctxt)
 {
-    uint8_t result = *(uint8_t *)(&ctxt->insn[ctxt->len]);
+    uint8_t result;
+
+    if (ctxt->len >= INSTR_MAX_LEN)
+        return 0;
+
+    result = *(uint8_t *)(&ctxt->insn[ctxt->len]);
     ctxt->len += 1;
     return result;
 }
 
 static uint16_t insn_fetch_u16(struct em_context_t *ctxt)
 {
-    uint16_t result = *(uint16_t *)(&ctxt->insn[ctxt->len]);
+    uint16_t result;
+
+    if (ctxt->len >= INSTR_MAX_LEN)
+        return 0;
+
+    result = *(uint16_t *)(&ctxt->insn[ctxt->len]);
     ctxt->len += 2;
     return result;
 }
 
 static uint32_t insn_fetch_u32(struct em_context_t *ctxt)
 {
-    uint32_t result = *(uint32_t *)(&ctxt->insn[ctxt->len]);
+    uint32_t result;
+
+    if (ctxt->len >= INSTR_MAX_LEN)
+        return 0;
+
+    result = *(uint32_t *)(&ctxt->insn[ctxt->len]);
     ctxt->len += 4;
     return result;
 }
 
 static uint64_t insn_fetch_u64(struct em_context_t *ctxt)
 {
-    uint64_t result = *(uint64_t *)(&ctxt->insn[ctxt->len]);
+    uint64_t result;
+
+    if (ctxt->len >= INSTR_MAX_LEN)
+        return 0;
+
+    result = *(uint64_t *)(&ctxt->insn[ctxt->len]);
     ctxt->len += 8;
     return result;
 }
diff --git a/core/include/emulate.h b/core/include/emulate.h
index ea15f209..8582ede0 100644
--- a/core/include/emulate.h
+++ b/core/include/emulate.h
@@ -99,6 +99,10 @@ struct em_operand_t;
 /* Emulator interface flags */
 #define EM_OPS_NO_TRANSLATION (1 << 0)
 
+// Instructions are never longer than 15 bytes:
+// http://wiki.osdev.org/X86-64_Instruction_Encoding
+#define INSTR_MAX_LEN 15
+
 typedef struct em_vcpu_ops_t {
     uint64_t (*read_gpr)(void *vcpu, uint32_t reg_index);
     void (*write_gpr)(void *vcpu, uint32_t reg_index, uint64_t value);
diff --git a/core/vcpu.c b/core/vcpu.c
index f1289841..e86ab166 100644
--- a/core/vcpu.c
+++ b/core/vcpu.c
@@ -2117,10 +2117,6 @@ static void vcpu_exit_fpu_state(struct vcpu_t *vcpu)
     }
 }
 
-// Instructions are never longer than 15 bytes:
-// http://wiki.osdev.org/X86-64_Instruction_Encoding
-#define INSTR_MAX_LEN 15
-
 static bool qemu_support_fastmmio(struct vcpu_t *vcpu)
 {
     struct vm_t *vm = vcpu->vm;
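
Note that the added check bounds only the starting offset: a multi-byte
fetch that begins just below INSTR_MAX_LEN can still read a few bytes past
the 15-byte buffer (for example, a 4-byte fetch starting at offset 12). A
stricter variant would bound the whole access; the following is a sketch
against the same em_context_t fields, not part of this series:

    static uint32_t insn_fetch_u32_strict(struct em_context_t *ctxt)
    {
        uint32_t result;

        // Reject the fetch unless every byte lies inside insn[0..14].
        if (ctxt->len + sizeof(result) > INSTR_MAX_LEN)
            return 0;

        result = *(uint32_t *)(&ctxt->insn[ctxt->len]);
        ctxt->len += sizeof(result);
        return result;
    }

The other widths would follow the same pattern.
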
From dd0abc084da06062a54867fe8828d3f99b247ba4 Mon Sep 17 00:00:00 2001
From: "Wang, Wenchao"
Date: Fri, 8 Jan 2021 19:47:06 +0800
Subject: [PATCH 3/6] Fix MSR array initialization error

When 32-bit enum values are used to initialize an array of uint64_t,
explicit type casting is required. Otherwise, the upper 32 bits of each
array element will be wrongly filled with 1s due to sign extension.

Signed-off-by: Wenchao Wang
---
 core/vcpu.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/core/vcpu.c b/core/vcpu.c
index e86ab166..09f67fde 100644
--- a/core/vcpu.c
+++ b/core/vcpu.c
@@ -47,29 +47,31 @@
 #include "include/hax_core_interface.h"
 #include "include/hax_driver.h"
 
+// The explicit type casting prevents the upper 32 bits of the array elements
+// from being filled with 1s due to sign extension of the enum type.
 uint64_t gmsr_list[NR_GMSR] = {
-    IA32_STAR,
-    IA32_LSTAR,
-    IA32_CSTAR,
-    IA32_SF_MASK,
-    IA32_KERNEL_GS_BASE
+    (uint32_t)IA32_STAR,
+    (uint32_t)IA32_LSTAR,
+    (uint32_t)IA32_CSTAR,
+    (uint32_t)IA32_SF_MASK,
+    (uint32_t)IA32_KERNEL_GS_BASE
 };
 
 uint64_t hmsr_list[NR_HMSR] = {
-    IA32_EFER,
-    IA32_STAR,
-    IA32_LSTAR,
-    IA32_CSTAR,
-    IA32_SF_MASK,
-    IA32_KERNEL_GS_BASE
+    (uint32_t)IA32_EFER,
+    (uint32_t)IA32_STAR,
+    (uint32_t)IA32_LSTAR,
+    (uint32_t)IA32_CSTAR,
+    (uint32_t)IA32_SF_MASK,
+    (uint32_t)IA32_KERNEL_GS_BASE
 };
 
 uint64_t emt64_msr[NR_EMT64MSR] = {
-    IA32_STAR,
-    IA32_LSTAR,
-    IA32_CSTAR,
-    IA32_SF_MASK,
-    IA32_KERNEL_GS_BASE
+    (uint32_t)IA32_STAR,
+    (uint32_t)IA32_LSTAR,
+    (uint32_t)IA32_CSTAR,
+    (uint32_t)IA32_SF_MASK,
+    (uint32_t)IA32_KERNEL_GS_BASE
 };
 
 static void vcpu_init(struct vcpu_t *vcpu);
@@ -3096,7 +3098,7 @@ static int handle_msr_read(struct vcpu_t *vcpu, uint32_t msr, uint64_t *val)
         case IA32_SF_MASK:
         case IA32_KERNEL_GS_BASE: {
             for (index = 0; index < NR_GMSR; index++) {
-                if ((uint32_t)gstate->gmsr[index].entry == msr) {
+                if (gstate->gmsr[index].entry == msr) {
                     *val = gstate->gmsr[index].value;
                     break;
                 }
@@ -3430,7 +3432,7 @@ static int handle_msr_write(struct vcpu_t *vcpu, uint32_t msr, uint64_t val,
         case IA32_SF_MASK:
         case IA32_KERNEL_GS_BASE: {
             for (index = 0; index < NR_GMSR; index++) {
-                if ((uint32_t)gmsr_list[index] == msr) {
+                if (gmsr_list[index] == msr) {
                     gstate->gmsr[index].value = val;
                     gstate->gmsr[index].entry = msr;
                     break;
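
A standalone illustration of the pitfall the casts avoid. Whether an
enumeration constant with bit 31 set (such as IA32_STAR = 0xC0000081) ends
up with a signed 32-bit type is compiler-dependent, which is why the
explicit casts are the portable fix; the snippet below forces the signed
case with int32_t so the widening behavior is deterministic:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* wraps to a negative value on two's-complement targets */
        int32_t  msr_as_signed = (int32_t)0xC0000081;
        uint64_t sign_extended = msr_as_signed;            /* 0xffffffffc0000081 */
        uint64_t zero_extended = (uint32_t)msr_as_signed;  /* 0x00000000c0000081 */

        printf("sign-extended: 0x%016llx\n", (unsigned long long)sign_extended);
        printf("zero-extended: 0x%016llx\n", (unsigned long long)zero_extended);
        return 0;
    }

The sign-extended form is what made the (uint32_t) comparisons in
handle_msr_read()/handle_msr_write() necessary; with the casts applied at
the definition site, those comparisons can drop their own casts, as the
last two hunks do.
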
From 26d2e899fbc765376ae789ead05679806eb6467c Mon Sep 17 00:00:00 2001
From: "Wang, Wenchao"
Date: Fri, 8 Jan 2021 19:49:06 +0800
Subject: [PATCH 4/6] Load guest MSRs automatically

Use the MSR list to load guest MSRs automatically on VM entries.

The fact that these MSRs get loaded manually in the host means that the
guest can trigger spurious interrupts in the host by enabling "system" PMC
tracking and setting a low value in IA32_PMCx. This can cause two problems:

* Guest DoS of the host: simply setting a very low value in IA32_PMCx will
  flood the host with interrupts.
* Low-intensity information exfiltration: if the emulator process has
  enabled PMCs (via host interfaces), loading IA32_PMCx and IA32_PERFEVTSELx
  turns these "user-wide PMCs" into "system-wide PMCs", because the "OS" bit
  (bit 17) gets set. In that case the emulator can collect PMC events
  related to the kernel and not just to itself, and information can be
  extracted this way (e.g., kernel cache-related events help defeat KASLR).
  This is a small form of "privilege escalation", since kernel PMC events
  generally require higher privileges to be enabled.

Signed-off-by: Wenchao Wang
---
 core/include/vcpu.h |  9 +++++++++
 core/include/vmx.h  |  6 ++++++
 core/vcpu.c         | 27 ++++++++++++++++++---------
 3 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/core/include/vcpu.h b/core/include/vcpu.h
index 3cc8fa3b..ddfcfb10 100644
--- a/core/include/vcpu.h
+++ b/core/include/vcpu.h
@@ -42,9 +42,18 @@
 #define NR_GMSR 5
 #define NR_EMT64MSR 6
 
+// The number of MSRs to be loaded on VM entries
+// Currently the MSR list only supports automatic loading of the MSRs below,
+// whose total count is 14.
+// * IA32_PMCx
+// * IA32_PERFEVTSELx
+// * IA32_TSC_AUX
+// * all MSRs defined in gmsr_list[]
+#define NR_GMSR_AUTOLOAD 14
+
 struct gstate {
     struct vmx_msr gmsr[NR_GMSR];
+    vmx_msr_entry gmsr_autoload[NR_GMSR_AUTOLOAD];
     // IA32_PMCx, since APM v1
     uint64_t apm_pmc_msrs[APM_MAX_GENERAL_COUNT];
     // IA32_PERFEVTSELx, since APM v1
diff --git a/core/include/vmx.h b/core/include/vmx.h
index 22ad4e75..216f63a6 100644
--- a/core/include/vmx.h
+++ b/core/include/vmx.h
@@ -639,6 +639,12 @@ struct invept_desc {
     uint64_t rsvd;
 };
 
+// Intel SDM Vol. 3C: Table 24-12. Format of an MSR Entry
+typedef struct ALIGNED(16) vmx_msr_entry {
+    uint64_t index;
+    uint64_t data;
+} vmx_msr_entry;
+
 struct vcpu_state_t;
 struct vcpu_t;
 
diff --git a/core/vcpu.c b/core/vcpu.c
index 09f67fde..51d1c2df 100644
--- a/core/vcpu.c
+++ b/core/vcpu.c
@@ -987,27 +987,35 @@ void load_guest_msr(struct vcpu_t *vcpu)
     int i;
     struct gstate *gstate = &vcpu->gstate;
     bool em64t_support = cpu_has_feature(X86_FEATURE_EM64T);
+    uint32_t count = 0;
 
-    for (i = 0; i < NR_GMSR; i++) {
+    for (i = 0; i < NR_GMSR; ++i) {
         if (em64t_support || !is_emt64_msr(gstate->gmsr[i].entry)) {
-            ia32_wrmsr(gstate->gmsr[i].entry, gstate->gmsr[i].value);
+            gstate->gmsr_autoload[count].index = gstate->gmsr[i].entry;
+            gstate->gmsr_autoload[count++].data = gstate->gmsr[i].value;
         }
     }
 
     if (cpu_has_feature(X86_FEATURE_RDTSCP)) {
-        ia32_wrmsr(IA32_TSC_AUX, gstate->tsc_aux);
+        gstate->gmsr_autoload[count].index = (uint32_t)IA32_TSC_AUX;
+        gstate->gmsr_autoload[count++].data = gstate->tsc_aux;
     }
 
     if (!hax->apm_version)
         return;
 
     // APM v1: restore IA32_PMCx and IA32_PERFEVTSELx
-    for (i = 0; i < (int)hax->apm_general_count; i++) {
-        uint32_t msr = (uint32_t)(IA32_PMC0 + i);
-        ia32_wrmsr(msr, gstate->apm_pmc_msrs[i]);
-        msr = (uint32_t)(IA32_PERFEVTSEL0 + i);
-        ia32_wrmsr(msr, gstate->apm_pes_msrs[i]);
+    for (i = 0; i < (int)hax->apm_general_count; ++i) {
+        gstate->gmsr_autoload[count].index = (uint32_t)(IA32_PMC0 + i);
+        gstate->gmsr_autoload[count++].data = gstate->apm_pmc_msrs[i];
     }
+
+    for (i = 0; i < (int)hax->apm_general_count; ++i) {
+        gstate->gmsr_autoload[count].index = (uint32_t)(IA32_PERFEVTSEL0 + i);
+        gstate->gmsr_autoload[count++].data = gstate->apm_pes_msrs[i];
+    }
+
+    vmwrite(vcpu, VMX_ENTRY_MSR_LOAD_COUNT, count);
 }
 
 static void save_host_msr(struct vcpu_t *vcpu)
@@ -1510,7 +1518,8 @@ static void fill_common_vmcs(struct vcpu_t *vcpu)
     vmwrite(vcpu, VMX_ENTRY_INTERRUPT_INFO, 0);
     // vmwrite(NULL, VMX_ENTRY_EXCEPTION_ERROR_CODE, 0);
     vmwrite(vcpu, VMX_ENTRY_MSR_LOAD_COUNT, 0);
-    vmwrite(vcpu, VMX_ENTRY_MSR_LOAD_ADDRESS, 0);
+    vmwrite(vcpu, VMX_ENTRY_MSR_LOAD_ADDRESS,
+            (uint64_t)hax_pa(vcpu->gstate.gmsr_autoload));
     vmwrite(vcpu, VMX_ENTRY_INSTRUCTION_LENGTH, 0);
 
     // vmwrite(NULL, VMX_TPR_THRESHOLD, 0);
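
For context, the CPU bounds how long these autoload lists may be. Below is
a small sketch (not in the patch) of checking NR_GMSR_AUTOLOAD against that
limit; ia32_rdmsr() is assumed to be the project's MSR read helper, and the
IA32_VMX_MISC constant (MSR 0x485) may be named differently in the HAXM
headers:

    #include <stdint.h>

    #define IA32_VMX_MISC_ADDR 0x485
    #define NR_GMSR_AUTOLOAD   14

    extern uint64_t ia32_rdmsr(uint32_t msr);   /* assumed project helper */

    static int msr_autoload_count_ok(void)
    {
        uint64_t misc = ia32_rdmsr(IA32_VMX_MISC_ADDR);
        // Bits 27:25 encode N; the SDM recommends at most 512 * (N + 1)
        // entries in each VM-entry/VM-exit MSR load or store list.
        uint32_t n = (uint32_t)((misc >> 25) & 0x7);
        uint32_t max_entries = 512 * (n + 1);

        return NR_GMSR_AUTOLOAD <= max_entries;   /* 14 is far below 512 */
    }
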
From f800982c63a360e1131b9703a73070c5f56e058b Mon Sep 17 00:00:00 2001
From: "Wang, Wenchao"
Date: Fri, 8 Jan 2021 19:50:40 +0800
Subject: [PATCH 5/6] Load host MSRs automatically

Use the MSR list to load host MSRs automatically on VM exits. Mirroring the
automatic loading of guest MSRs, the same approach is applied to host MSRs;
only a few special-case MSRs remain manually loaded.

Signed-off-by: Wenchao Wang
---
 core/include/cpu.h |  7 +++++++
 core/vcpu.c        | 38 +++++++++++++++++++++++++++++++-------
 2 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/core/include/cpu.h b/core/include/cpu.h
index bc91cd58..80b89682 100644
--- a/core/include/cpu.h
+++ b/core/include/cpu.h
@@ -45,6 +45,12 @@ struct vcpu_t;
 struct vcpu_state_t;
 
 #define NR_HMSR 6
+// The number of MSRs to be loaded on VM exits
+// Currently the MSR list only supports automatic loading of the MSRs below,
+// whose total count is 8.
+// * IA32_PMCx
+// * IA32_PERFEVTSELx
+#define NR_HMSR_AUTOLOAD 8
 
 struct hstate {
     /* ldt is not covered by host vmcs area */
@@ -65,6 +71,7 @@ struct hstate {
     uint64_t fs_base;
     uint64_t hcr2;
     struct vmx_msr hmsr[NR_HMSR];
+    vmx_msr_entry hmsr_autoload[NR_HMSR_AUTOLOAD];
     // IA32_PMCx, since APM v1
     uint64_t apm_pmc_msrs[APM_MAX_GENERAL_COUNT];
     // IA32_PERFEVTSELx, since APM v1
diff --git a/core/vcpu.c b/core/vcpu.c
index 51d1c2df..e6c7fac3 100644
--- a/core/vcpu.c
+++ b/core/vcpu.c
@@ -1052,13 +1052,26 @@ static void load_host_msr(struct vcpu_t *vcpu)
     int i;
     struct hstate *hstate = &get_cpu_data(vcpu->cpu_id)->hstate;
     bool em64t_support = cpu_has_feature(X86_FEATURE_EM64T);
+    uint32_t count = 0;
 
-    for (i = 0; i < NR_HMSR; i++) {
+    // Load the below MSR values manually on VM exits.
+
+    // * IA32_STAR, IA32_LSTAR and IA32_SF_MASK
+    //   The host will crash immediately on automatic load. See IA SDM Vol. 3C
+    //   31.10.4.3 (Handling the SYSCALL and SYSRET Instructions).
+    // * IA32_EFER and IA32_CSTAR
+    //   See the same section as above.
+    // * IA32_KERNEL_GS_BASE
+    //   See IA SDM Vol. 3C 31.10.4.4 (Handling the SWAPGS Instruction).
+    for (i = 0; i < NR_HMSR; ++i) {
         if (em64t_support || !is_emt64_msr(hstate->hmsr[i].entry)) {
             ia32_wrmsr(hstate->hmsr[i].entry, hstate->hmsr[i].value);
         }
     }
 
+    // * IA32_TSC_AUX
+    //   A BSOD occurs on the host a while after automatic loading, sometimes
+    //   even after the VM is shut down.
     if (cpu_has_feature(X86_FEATURE_RDTSCP)) {
         ia32_wrmsr(IA32_TSC_AUX, hstate->tsc_aux);
     }
@@ -1066,13 +1079,23 @@
     if (!hax->apm_version)
         return;
 
+    // Load the below MSR values automatically on VM exits.
+
+    // TODO: Trap the IA32_PERFEVTSELx MSRs and automatically load the below
+    // host values only when the IA32_PERFEVTSELx MSRs are changed during
+    // guest runtime.
     // APM v1: restore IA32_PMCx and IA32_PERFEVTSELx
-    for (i = 0; i < (int)hax->apm_general_count; i++) {
-        uint32_t msr = (uint32_t)(IA32_PMC0 + i);
-        ia32_wrmsr(msr, hstate->apm_pmc_msrs[i]);
-        msr = (uint32_t)(IA32_PERFEVTSEL0 + i);
-        ia32_wrmsr(msr, hstate->apm_pes_msrs[i]);
+    for (i = 0; i < (int)hax->apm_general_count; ++i) {
+        hstate->hmsr_autoload[count].index = (uint32_t)(IA32_PMC0 + i);
+        hstate->hmsr_autoload[count++].data = hstate->apm_pmc_msrs[i];
+    }
+
+    for (i = 0; i < (int)hax->apm_general_count; ++i) {
+        hstate->hmsr_autoload[count].index = (uint32_t)(IA32_PERFEVTSEL0 + i);
+        hstate->hmsr_autoload[count++].data = hstate->apm_pes_msrs[i];
     }
+
+    vmwrite(vcpu, VMX_EXIT_MSR_LOAD_COUNT, count);
 }
 
 static inline bool is_host_debug_enabled(struct vcpu_t *vcpu)
@@ -1513,7 +1536,8 @@ static void fill_common_vmcs(struct vcpu_t *vcpu)
 
     vmwrite(vcpu, VMX_EXIT_MSR_STORE_ADDRESS, 0);
     vmwrite(vcpu, VMX_EXIT_MSR_LOAD_COUNT, 0);
-    vmwrite(vcpu, VMX_EXIT_MSR_LOAD_ADDRESS, 0);
+    vmwrite(vcpu, VMX_EXIT_MSR_LOAD_ADDRESS,
+            (uint64_t)hax_pa(cpu_data->hstate.hmsr_autoload));
 
     vmwrite(vcpu, VMX_ENTRY_INTERRUPT_INFO, 0);
     // vmwrite(NULL, VMX_ENTRY_EXCEPTION_ERROR_CODE, 0);
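
Both autoload lists rely on the MSR-entry layout from Intel SDM Vol. 3C,
Table 24-12: bits 31:0 of the first quadword hold the MSR index (the upper
bits are reserved), and the second quadword holds the value to load. A
minimal sketch of filling one entry; vmx_msr_entry_demo stands in for the
vmx_msr_entry type added in patch 4:

    #include <stdint.h>

    typedef struct vmx_msr_entry_demo {
        uint64_t index;   /* MSR index in bits 31:0, bits 63:32 reserved */
        uint64_t data;    /* value the CPU writes to the MSR on entry/exit */
    } vmx_msr_entry_demo;

    static void fill_msr_entry(vmx_msr_entry_demo *e, uint32_t msr,
                               uint64_t value)
    {
        e->index = msr;   /* assigning a uint32_t keeps the reserved bits 0 */
        e->data = value;
    }

The 16-byte alignment on the real structure matches the VMCS requirement
that the MSR-load area address be 16-byte aligned.
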
From 42e349857270906e5f6ae02706bff90b4b4db77a Mon Sep 17 00:00:00 2001
From: "Wang, Wenchao"
Date: Fri, 8 Jan 2021 19:54:03 +0800
Subject: [PATCH 6/6] Add more MSRs to pass-through

Set some of the MSRs loaded on VM entries/exits to pass-through to improve
performance.

Signed-off-by: Wenchao Wang
---
 core/hax.c | 85 ++++++++++++++++++++++++++++++------------------
 1 file changed, 47 insertions(+), 38 deletions(-)

diff --git a/core/hax.c b/core/hax.c
index 9bc5ea79..0f85c14c 100644
--- a/core/hax.c
+++ b/core/hax.c
@@ -267,6 +267,44 @@ static int hax_vmx_enable_check(void)
     return 0;
 }
 
+/*
+ * Allows the guest to read from and/or write to the specified MSRs without
+ * causing a VM exit.
+ * |start| is the start MSR address, |count| the number of MSRs. Together they
+ * specify a range of consecutive MSR addresses.
+ * |read| and |write| determine if each MSR can be read or written freely by the
+ * guest, respectively.
+ */
+static void set_msr_access(uint32_t start, uint32_t count, bool read, bool write)
+{
+    uint32_t end = start + count - 1;
+    uint32_t read_base, write_base, bit;
+    uint8_t *msr_bitmap = hax_page_va(msr_bitmap_page);
+
+    hax_assert(((start ^ (start << 1)) & 0x80000000) == 0);
+    hax_assert((start & 0x3fffe000) == 0);
+    hax_assert(((start ^ end) & 0xffffe000) == 0);
+    hax_assert(msr_bitmap);
+
+    // See IA SDM Vol. 3C 24.6.9 for the layout of the MSR bitmaps page
+    read_base = start & 0x80000000 ? 1024 : 0;
+    write_base = read_base + 2048;
+    for (bit = (start & 0x1fff); bit <= (end & 0x1fff); bit++) {
+        // Bit clear means allowed
+        if (read) {
+            btr(msr_bitmap + read_base, bit);
+        } else {
+            bts(msr_bitmap + read_base, bit);
+        }
+
+        if (write) {
+            btr(msr_bitmap + write_base, bit);
+        } else {
+            bts(msr_bitmap + write_base, bit);
+        }
+    }
+}
+
 static int hax_vmx_init(void)
 {
     int ret = -ENOMEM;
@@ -297,6 +335,15 @@ static int hax_vmx_init(void)
     if ((ret = hax_vmx_enable_check()) < 0)
         goto out_5;
 
+    // Set MSRs loaded on VM entries/exits to pass-through
+    // See Intel SDM Vol. 3C 24.6.9 (MSR-Bitmap Address)
+
+    // 4 consecutive MSRs starting from IA32_STAR:
+    // IA32_STAR, IA32_LSTAR, IA32_CSTAR and IA32_SF_MASK
+    set_msr_access(IA32_STAR, 4, true, true);
+    set_msr_access(IA32_KERNEL_GS_BASE, 1, true, true);
+    set_msr_access(IA32_TSC_AUX, 1, true, true);
+
     return 0;
 out_5:
     hax_disable_vmx();
@@ -393,44 +440,6 @@ int hax_get_capability(void *buf, int bufLeng, int *outLength)
     return 0;
 }
 
-/*
- * Allows the guest to read from and/or write to the specified MSRs without
- * causing a VM exit.
- * |start| is the start MSR address, |count| the number of MSRs. Together they
- * specify a range of consecutive MSR addresses.
- * |read| and |write| determine if each MSR can be read or written freely by the
- * guest, respectively.
- */
-static void set_msr_access(uint32_t start, uint32_t count, bool read, bool write)
-{
-    uint32_t end = start + count - 1;
-    uint32_t read_base, write_base, bit;
-    uint8_t *msr_bitmap = hax_page_va(msr_bitmap_page);
-
-    hax_assert(((start ^ (start << 1)) & 0x80000000) == 0);
-    hax_assert((start & 0x3fffe000) == 0);
-    hax_assert(((start ^ end) & 0xffffe000) == 0);
-    hax_assert(msr_bitmap);
-
-    // See IA SDM Vol. 3C 24.6.9 for the layout of the MSR bitmaps page
-    read_base = start & 0x80000000 ? 1024 : 0;
-    write_base = read_base + 2048;
-    for (bit = (start & 0x1fff); bit <= (end & 0x1fff); bit++) {
-        // Bit clear means allowed
-        if (read) {
-            btr(msr_bitmap + read_base, bit);
-        } else {
-            bts(msr_bitmap + read_base, bit);
-        }
-
-        if (write) {
-            btr(msr_bitmap + write_base, bit);
-        } else {
-            bts(msr_bitmap + write_base, bit);
-        }
-    }
-}
-
 /*
  * Probes the host CPU to determine its performance monitoring capabilities.
  */
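
To make the bitmap layout that set_msr_access() manipulates concrete, the
standalone sketch below (not part of the patch) computes where a high MSR
such as IA32_STAR lands in the 4 KB MSR bitmap page: low MSRs
0x00000000-0x00001FFF use the read bitmap at byte offset 0 and the write
bitmap at offset 2048, while high MSRs 0xC0000000-0xC0001FFF use offsets
1024 and 3072.

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t msr = 0xC0000081;   /* IA32_STAR */
        uint32_t read_base = (msr & 0x80000000) ? 1024 : 0;
        uint32_t write_base = read_base + 2048;
        uint32_t bit = msr & 0x1fff; /* bit position within the chosen bitmap */

        printf("read bitmap:  byte %u, bit %u\n", read_base + bit / 8, bit % 8);
        printf("write bitmap: byte %u, bit %u\n", write_base + bit / 8, bit % 8);
        return 0;
    }

Clearing the corresponding bits, as set_msr_access(IA32_STAR, 4, true, true)
does for IA32_STAR through IA32_SF_MASK, lets the guest access those MSRs
without a VM exit.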