diff --git a/core/emulate.c b/core/emulate.c
index f4084414..576fdef4 100644
--- a/core/emulate.c
+++ b/core/emulate.c
@@ -586,28 +586,48 @@ static void register_add(struct em_context_t *ctxt,
 
 static uint8_t insn_fetch_u8(struct em_context_t *ctxt)
 {
-    uint8_t result = *(uint8_t *)(&ctxt->insn[ctxt->len]);
+    uint8_t result;
+
+    if (ctxt->len >= INSTR_MAX_LEN)
+        return 0;
+
+    result = *(uint8_t *)(&ctxt->insn[ctxt->len]);
     ctxt->len += 1;
     return result;
 }
 
 static uint16_t insn_fetch_u16(struct em_context_t *ctxt)
 {
-    uint16_t result = *(uint16_t *)(&ctxt->insn[ctxt->len]);
+    uint16_t result;
+
+    if (ctxt->len >= INSTR_MAX_LEN)
+        return 0;
+
+    result = *(uint16_t *)(&ctxt->insn[ctxt->len]);
     ctxt->len += 2;
     return result;
 }
 
 static uint32_t insn_fetch_u32(struct em_context_t *ctxt)
 {
-    uint32_t result = *(uint32_t *)(&ctxt->insn[ctxt->len]);
+    uint32_t result;
+
+    if (ctxt->len >= INSTR_MAX_LEN)
+        return 0;
+
+    result = *(uint32_t *)(&ctxt->insn[ctxt->len]);
     ctxt->len += 4;
     return result;
 }
 
 static uint64_t insn_fetch_u64(struct em_context_t *ctxt)
 {
-    uint64_t result = *(uint64_t *)(&ctxt->insn[ctxt->len]);
+    uint64_t result;
+
+    if (ctxt->len >= INSTR_MAX_LEN)
+        return 0;
+
+    result = *(uint64_t *)(&ctxt->insn[ctxt->len]);
     ctxt->len += 8;
     return result;
 }
diff --git a/core/hax.c b/core/hax.c
index 9bc5ea79..0f85c14c 100644
--- a/core/hax.c
+++ b/core/hax.c
@@ -267,6 +267,44 @@ static int hax_vmx_enable_check(void)
     return 0;
 }
 
+/*
+ * Allows the guest to read from and/or write to the specified MSRs without
+ * causing a VM exit.
+ * |start| is the start MSR address, |count| the number of MSRs. Together they
+ * specify a range of consecutive MSR addresses.
+ * |read| and |write| determine if each MSR can be read or written freely by the
+ * guest, respectively.
+ */
+static void set_msr_access(uint32_t start, uint32_t count, bool read, bool write)
+{
+    uint32_t end = start + count - 1;
+    uint32_t read_base, write_base, bit;
+    uint8_t *msr_bitmap = hax_page_va(msr_bitmap_page);
+
+    hax_assert(((start ^ (start << 1)) & 0x80000000) == 0);
+    hax_assert((start & 0x3fffe000) == 0);
+    hax_assert(((start ^ end) & 0xffffe000) == 0);
+    hax_assert(msr_bitmap);
+
+    // See IA SDM Vol. 3C 24.6.9 for the layout of the MSR bitmaps page
+    read_base = start & 0x80000000 ? 1024 : 0;
+    write_base = read_base + 2048;
+    for (bit = (start & 0x1fff); bit <= (end & 0x1fff); bit++) {
+        // Bit clear means allowed
+        if (read) {
+            btr(msr_bitmap + read_base, bit);
+        } else {
+            bts(msr_bitmap + read_base, bit);
+        }
+
+        if (write) {
+            btr(msr_bitmap + write_base, bit);
+        } else {
+            bts(msr_bitmap + write_base, bit);
+        }
+    }
+}
+
 static int hax_vmx_init(void)
 {
     int ret = -ENOMEM;
@@ -297,6 +335,15 @@ static int hax_vmx_init(void)
     if ((ret = hax_vmx_enable_check()) < 0)
         goto out_5;
 
+    // Set MSRs loaded on VM entries/exits to pass-through
+    // See Intel SDM Vol. 3C 24.6.9 (MSR-Bitmap Address)
+
+    // 4 consecutive MSRs starting from IA32_STAR:
+    // IA32_STAR, IA32_LSTAR, IA32_CSTAR and IA32_SF_MASK
+    set_msr_access(IA32_STAR, 4, true, true);
+    set_msr_access(IA32_KERNEL_GS_BASE, 1, true, true);
+    set_msr_access(IA32_TSC_AUX, 1, true, true);
+
     return 0;
 out_5:
     hax_disable_vmx();
@@ -393,44 +440,6 @@ int hax_get_capability(void *buf, int bufLeng, int *outLength)
     return 0;
 }
 
-/*
- * Allows the guest to read from and/or write to the specified MSRs without
- * causing a VM exit.
- * |start| is the start MSR address, |count| the number of MSRs. Together they
- * specify a range of consecutive MSR addresses.
- * |read| and |write| determine if each MSR can be read or written freely by the
- * guest, respectively.
- */
-static void set_msr_access(uint32_t start, uint32_t count, bool read, bool write)
-{
-    uint32_t end = start + count - 1;
-    uint32_t read_base, write_base, bit;
-    uint8_t *msr_bitmap = hax_page_va(msr_bitmap_page);
-
-    hax_assert(((start ^ (start << 1)) & 0x80000000) == 0);
-    hax_assert((start & 0x3fffe000) == 0);
-    hax_assert(((start ^ end) & 0xffffe000) == 0);
-    hax_assert(msr_bitmap);
-
-    // See IA SDM Vol. 3C 24.6.9 for the layout of the MSR bitmaps page
-    read_base = start & 0x80000000 ? 1024 : 0;
-    write_base = read_base + 2048;
-    for (bit = (start & 0x1fff); bit <= (end & 0x1fff); bit++) {
-        // Bit clear means allowed
-        if (read) {
-            btr(msr_bitmap + read_base, bit);
-        } else {
-            bts(msr_bitmap + read_base, bit);
-        }
-
-        if (write) {
-            btr(msr_bitmap + write_base, bit);
-        } else {
-            bts(msr_bitmap + write_base, bit);
-        }
-    }
-}
-
 /*
  * Probes the host CPU to determine its performance monitoring capabilities.
  */
diff --git a/core/include/cpu.h b/core/include/cpu.h
index bc91cd58..80b89682 100644
--- a/core/include/cpu.h
+++ b/core/include/cpu.h
@@ -45,6 +45,12 @@ struct vcpu_t;
 struct vcpu_state_t;
 
 #define NR_HMSR 6
+// The number of MSRs to be loaded on VM exits
+// Currently the MSR list only supports automatic loading of the MSRs below,
+// whose total count is 8:
+// * IA32_PMCx
+// * IA32_PERFEVTSELx
+#define NR_HMSR_AUTOLOAD 8
 
 struct hstate {
     /* ldt is not covered by host vmcs area */
@@ -65,6 +71,7 @@ struct hstate {
     uint64_t fs_base;
     uint64_t hcr2;
     struct vmx_msr hmsr[NR_HMSR];
+    vmx_msr_entry hmsr_autoload[NR_HMSR_AUTOLOAD];
     // IA32_PMCx, since APM v1
     uint64_t apm_pmc_msrs[APM_MAX_GENERAL_COUNT];
     // IA32_PERFEVTSELx, since APM v1
diff --git a/core/include/emulate.h b/core/include/emulate.h
index ea15f209..8582ede0 100644
--- a/core/include/emulate.h
+++ b/core/include/emulate.h
@@ -99,6 +99,10 @@ struct em_operand_t;
 /* Emulator interface flags */
 #define EM_OPS_NO_TRANSLATION (1 << 0)
 
+// Instructions are never longer than 15 bytes:
+// http://wiki.osdev.org/X86-64_Instruction_Encoding
+#define INSTR_MAX_LEN 15
+
 typedef struct em_vcpu_ops_t {
     uint64_t (*read_gpr)(void *vcpu, uint32_t reg_index);
     void (*write_gpr)(void *vcpu, uint32_t reg_index, uint64_t value);
diff --git a/core/include/vcpu.h b/core/include/vcpu.h
index 3cc8fa3b..ddfcfb10 100644
--- a/core/include/vcpu.h
+++ b/core/include/vcpu.h
@@ -42,9 +42,18 @@
 
 #define NR_GMSR 5
 #define NR_EMT64MSR 6
+// The number of MSRs to be loaded on VM entries
+// Currently the MSR list only supports automatic loading of the MSRs below,
+// whose total count is 14:
+// * IA32_PMCx
+// * IA32_PERFEVTSELx
+// * IA32_TSC_AUX
+// * all MSRs defined in gmsr_list[]
+#define NR_GMSR_AUTOLOAD 14
 
 struct gstate {
     struct vmx_msr gmsr[NR_GMSR];
+    vmx_msr_entry gmsr_autoload[NR_GMSR_AUTOLOAD];
     // IA32_PMCx, since APM v1
     uint64_t apm_pmc_msrs[APM_MAX_GENERAL_COUNT];
     // IA32_PERFEVTSELx, since APM v1
diff --git a/core/include/vmx.h b/core/include/vmx.h
index 22ad4e75..216f63a6 100644
--- a/core/include/vmx.h
+++ b/core/include/vmx.h
@@ -639,6 +639,12 @@ struct invept_desc {
     uint64_t rsvd;
 };
 
+// Intel SDM Vol. 3C: Table 24-12. Format of an MSR Entry
+typedef struct ALIGNED(16) vmx_msr_entry {
+    uint64_t index;
+    uint64_t data;
+} vmx_msr_entry;
+
 struct vcpu_state_t;
 struct vcpu_t;
 
diff --git a/core/vcpu.c b/core/vcpu.c
index 21561f8d..e6c7fac3 100644
--- a/core/vcpu.c
+++ b/core/vcpu.c
@@ -47,29 +47,31 @@
 #include "include/hax_core_interface.h"
 #include "include/hax_driver.h"
 
+// The explicit casts prevent the upper 32 bits of the array elements from
+// being filled with 1s due to sign extension of the enum type.
 uint64_t gmsr_list[NR_GMSR] = {
-    IA32_STAR,
-    IA32_LSTAR,
-    IA32_CSTAR,
-    IA32_SF_MASK,
-    IA32_KERNEL_GS_BASE
+    (uint32_t)IA32_STAR,
+    (uint32_t)IA32_LSTAR,
+    (uint32_t)IA32_CSTAR,
+    (uint32_t)IA32_SF_MASK,
+    (uint32_t)IA32_KERNEL_GS_BASE
 };
 
 uint64_t hmsr_list[NR_HMSR] = {
-    IA32_EFER,
-    IA32_STAR,
-    IA32_LSTAR,
-    IA32_CSTAR,
-    IA32_SF_MASK,
-    IA32_KERNEL_GS_BASE
+    (uint32_t)IA32_EFER,
+    (uint32_t)IA32_STAR,
+    (uint32_t)IA32_LSTAR,
+    (uint32_t)IA32_CSTAR,
+    (uint32_t)IA32_SF_MASK,
+    (uint32_t)IA32_KERNEL_GS_BASE
 };
 
 uint64_t emt64_msr[NR_EMT64MSR] = {
-    IA32_STAR,
-    IA32_LSTAR,
-    IA32_CSTAR,
-    IA32_SF_MASK,
-    IA32_KERNEL_GS_BASE
+    (uint32_t)IA32_STAR,
+    (uint32_t)IA32_LSTAR,
+    (uint32_t)IA32_CSTAR,
+    (uint32_t)IA32_SF_MASK,
+    (uint32_t)IA32_KERNEL_GS_BASE
 };
 
 static void vcpu_init(struct vcpu_t *vcpu);
@@ -985,27 +987,35 @@ void load_guest_msr(struct vcpu_t *vcpu)
     int i;
     struct gstate *gstate = &vcpu->gstate;
     bool em64t_support = cpu_has_feature(X86_FEATURE_EM64T);
+    uint32_t count = 0;
 
-    for (i = 0; i < NR_GMSR; i++) {
+    for (i = 0; i < NR_GMSR; ++i) {
         if (em64t_support || !is_emt64_msr(gstate->gmsr[i].entry)) {
-            ia32_wrmsr(gstate->gmsr[i].entry, gstate->gmsr[i].value);
+            gstate->gmsr_autoload[count].index = gstate->gmsr[i].entry;
+            gstate->gmsr_autoload[count++].data = gstate->gmsr[i].value;
         }
     }
 
     if (cpu_has_feature(X86_FEATURE_RDTSCP)) {
-        ia32_wrmsr(IA32_TSC_AUX, gstate->tsc_aux);
+        gstate->gmsr_autoload[count].index = (uint32_t)IA32_TSC_AUX;
+        gstate->gmsr_autoload[count++].data = gstate->tsc_aux;
     }
 
     if (!hax->apm_version)
         return;
 
     // APM v1: restore IA32_PMCx and IA32_PERFEVTSELx
-    for (i = 0; i < (int)hax->apm_general_count; i++) {
-        uint32_t msr = (uint32_t)(IA32_PMC0 + i);
-        ia32_wrmsr(msr, gstate->apm_pmc_msrs[i]);
-        msr = (uint32_t)(IA32_PERFEVTSEL0 + i);
-        ia32_wrmsr(msr, gstate->apm_pes_msrs[i]);
+    for (i = 0; i < (int)hax->apm_general_count; ++i) {
+        gstate->gmsr_autoload[count].index = (uint32_t)(IA32_PMC0 + i);
+        gstate->gmsr_autoload[count++].data = gstate->apm_pmc_msrs[i];
     }
+
+    for (i = 0; i < (int)hax->apm_general_count; ++i) {
+        gstate->gmsr_autoload[count].index = (uint32_t)(IA32_PERFEVTSEL0 + i);
+        gstate->gmsr_autoload[count++].data = gstate->apm_pes_msrs[i];
+    }
+
+    vmwrite(vcpu, VMX_ENTRY_MSR_LOAD_COUNT, count);
 }
 
 static void save_host_msr(struct vcpu_t *vcpu)
@@ -1042,13 +1052,26 @@ static void load_host_msr(struct vcpu_t *vcpu)
     int i;
     struct hstate *hstate = &get_cpu_data(vcpu->cpu_id)->hstate;
     bool em64t_support = cpu_has_feature(X86_FEATURE_EM64T);
-
-    for (i = 0; i < NR_HMSR; i++) {
+    uint32_t count = 0;
+
+    // Load the MSR values below manually on VM exits.
+
+    // * IA32_STAR, IA32_LSTAR and IA32_SF_MASK
+    //   The host will crash immediately on automatic load. See IA SDM Vol. 3C
+    //   31.10.4.3 (Handling the SYSCALL and SYSRET Instructions).
+    // * IA32_EFER and IA32_CSTAR
+    //   See the same section as above.
+    // * IA32_KERNEL_GS_BASE
+    //   See IA SDM Vol. 3C 31.10.4.4 (Handling the SWAPGS Instruction).
+    for (i = 0; i < NR_HMSR; ++i) {
         if (em64t_support || !is_emt64_msr(hstate->hmsr[i].entry)) {
             ia32_wrmsr(hstate->hmsr[i].entry, hstate->hmsr[i].value);
         }
     }
 
+    // * IA32_TSC_AUX
+    //   Automatic loading causes a BSOD on the host after a while, sometimes
+    //   even after the VM has been shut down.
     if (cpu_has_feature(X86_FEATURE_RDTSCP)) {
         ia32_wrmsr(IA32_TSC_AUX, hstate->tsc_aux);
     }
@@ -1056,13 +1079,23 @@ static void load_host_msr(struct vcpu_t *vcpu)
     if (!hax->apm_version)
         return;
 
+    // Load the MSR values below automatically on VM exits.
+
+    // TODO: Trap guest writes to the IA32_PERFEVTSELx MSRs and automatically
+    // load the host values below only when the IA32_PERFEVTSELx MSRs have been
+    // changed during guest runtime.
     // APM v1: restore IA32_PMCx and IA32_PERFEVTSELx
-    for (i = 0; i < (int)hax->apm_general_count; i++) {
-        uint32_t msr = (uint32_t)(IA32_PMC0 + i);
-        ia32_wrmsr(msr, hstate->apm_pmc_msrs[i]);
-        msr = (uint32_t)(IA32_PERFEVTSEL0 + i);
-        ia32_wrmsr(msr, hstate->apm_pes_msrs[i]);
+    for (i = 0; i < (int)hax->apm_general_count; ++i) {
+        hstate->hmsr_autoload[count].index = (uint32_t)(IA32_PMC0 + i);
+        hstate->hmsr_autoload[count++].data = hstate->apm_pmc_msrs[i];
     }
+
+    for (i = 0; i < (int)hax->apm_general_count; ++i) {
+        hstate->hmsr_autoload[count].index = (uint32_t)(IA32_PERFEVTSEL0 + i);
+        hstate->hmsr_autoload[count++].data = hstate->apm_pes_msrs[i];
+    }
+
+    vmwrite(vcpu, VMX_EXIT_MSR_LOAD_COUNT, count);
 }
 
 static inline bool is_host_debug_enabled(struct vcpu_t *vcpu)
@@ -1172,6 +1205,14 @@ static void load_guest_dr(struct vcpu_t *vcpu)
     if (!(is_guest_dr_dirty(vcpu) || is_host_debug_enabled(vcpu)))
         return;
 
+    // Reset DR7 to zero before setting DR0.
+    // If the host has enabled guest debugging, setting a kernel address in
+    // DR0 could otherwise trigger spurious exceptions in the host.
+    // Spurious exceptions encountered in unexpected conditions (such as with
+    // the user GS loaded, though this particular case does not seem to be
+    // triggerable here) can lead to privilege escalation.
+    set_dr7(0);
+
     set_dr0(state->_dr0);
     set_dr1(state->_dr1);
     set_dr2(state->_dr2);
@@ -1495,12 +1536,14 @@ static void fill_common_vmcs(struct vcpu_t *vcpu)
 
     vmwrite(vcpu, VMX_EXIT_MSR_STORE_ADDRESS, 0);
     vmwrite(vcpu, VMX_EXIT_MSR_LOAD_COUNT, 0);
-    vmwrite(vcpu, VMX_EXIT_MSR_LOAD_ADDRESS, 0);
+    vmwrite(vcpu, VMX_EXIT_MSR_LOAD_ADDRESS,
+            (uint64_t)hax_pa(cpu_data->hstate.hmsr_autoload));
 
     vmwrite(vcpu, VMX_ENTRY_INTERRUPT_INFO, 0);
     // vmwrite(NULL, VMX_ENTRY_EXCEPTION_ERROR_CODE, 0);
     vmwrite(vcpu, VMX_ENTRY_MSR_LOAD_COUNT, 0);
-    vmwrite(vcpu, VMX_ENTRY_MSR_LOAD_ADDRESS, 0);
+    vmwrite(vcpu, VMX_ENTRY_MSR_LOAD_ADDRESS,
+            (uint64_t)hax_pa(vcpu->gstate.gmsr_autoload));
     vmwrite(vcpu, VMX_ENTRY_INSTRUCTION_LENGTH, 0);
     // vmwrite(NULL, VMX_TPR_THRESHOLD, 0);
 
@@ -2109,10 +2152,6 @@ static void vcpu_exit_fpu_state(struct vcpu_t *vcpu)
     }
 }
 
-// Instructions are never longer than 15 bytes:
-// http://wiki.osdev.org/X86-64_Instruction_Encoding
-#define INSTR_MAX_LEN 15
-
 static bool qemu_support_fastmmio(struct vcpu_t *vcpu)
 {
     struct vm_t *vm = vcpu->vm;
@@ -3092,7 +3131,7 @@ static int handle_msr_read(struct vcpu_t *vcpu, uint32_t msr, uint64_t *val)
         case IA32_SF_MASK:
         case IA32_KERNEL_GS_BASE: {
             for (index = 0; index < NR_GMSR; index++) {
-                if ((uint32_t)gstate->gmsr[index].entry == msr) {
+                if (gstate->gmsr[index].entry == msr) {
                     *val = gstate->gmsr[index].value;
                     break;
                 }
@@ -3426,7 +3465,7 @@ static int handle_msr_write(struct vcpu_t *vcpu, uint32_t msr, uint64_t val,
         case IA32_SF_MASK:
         case IA32_KERNEL_GS_BASE: {
             for (index = 0; index < NR_GMSR; index++) {
-                if ((uint32_t)gmsr_list[index] == msr) {
+                if (gmsr_list[index] == msr) {
                     gstate->gmsr[index].value = val;
                     gstate->gmsr[index].entry = msr;
                     break;
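
Note (illustrative, not part of the patch): set_msr_access() above relies on the MSR-bitmap layout described in Intel SDM Vol. 3C 24.6.9: the read bitmap for low MSRs starts at byte offset 0, the read bitmap for high MSRs (0xC0000000..0xC0001FFF) at 1024, the two write bitmaps at 2048 and 3072, and the bit index comes from the low 13 bits of the MSR address. The standalone sketch below recomputes that mapping; the helper name and the values printed in main() are hypothetical and exist only for illustration. Clearing the located bit lets the access pass through, while setting it forces a VM exit, which is exactly what the patch toggles with btr()/bts().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Returns the byte offset (within the 4 KiB MSR bitmaps page) and the bit
// position that control interception of |msr| for the given access type.
static void msr_bitmap_locate(uint32_t msr, bool write,
                              uint32_t *byte_offset, uint32_t *bit_in_byte)
{
    // Low MSRs (0x00000000..0x00001FFF) use the first 1 KiB of each half;
    // high MSRs (0xC0000000..0xC0001FFF) use the second 1 KiB.
    uint32_t base = (msr & 0x80000000) ? 1024 : 0;
    uint32_t bit = msr & 0x1fff;

    if (write)
        base += 2048;  // the write bitmaps follow the two read bitmaps

    *byte_offset = base + bit / 8;
    *bit_in_byte = bit % 8;
}

int main(void)
{
    uint32_t off, bit;

    // IA32_LSTAR is 0xC0000082: expect the read-high block, byte 1024 + 0x82 / 8.
    msr_bitmap_locate(0xC0000082, false, &off, &bit);
    printf("IA32_LSTAR read bit:  byte %u, bit %u\n", off, bit);

    msr_bitmap_locate(0xC0000082, true, &off, &bit);
    printf("IA32_LSTAR write bit: byte %u, bit %u\n", off, bit);
    return 0;
}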