diff --git a/src/cli.cpp b/src/cli.cpp
index 53c9f49b..9b23f665 100755
--- a/src/cli.cpp
+++ b/src/cli.cpp
@@ -1160,7 +1160,7 @@ static void general(
             if (char_count <= 60) {
                 continue;
             } else {
-                if ((char_count - 1) >= (static_cast<std::size_t>(max_line_length) + 3)) {
+                if ((static_cast<std::size_t>(char_count) - 1) >= (static_cast<std::size_t>(max_line_length) + 3)) {
                     it = divided_description.insert(it + 1, "\n");
                     char_count = it->length() + 1;
                 } else {
diff --git a/src/vmaware.hpp b/src/vmaware.hpp
index 6ce6c9df..ba977120 100644
--- a/src/vmaware.hpp
+++ b/src/vmaware.hpp
@@ -3564,7 +3564,7 @@ struct VM {
             std::string msg_content = ss.str();
 
             if (printed_messages.find(msg_content) == printed_messages.end()) {
-    #if (LINUX || APPLE)
+            #if (LINUX || APPLE)
                 constexpr const char* black_bg = "\x1B[48;2;0;0;0m";
                 constexpr const char* bold = "\033[1m";
                 constexpr const char* blue = "\x1B[38;2;00;59;193m";
@@ -3578,9 +3578,9 @@ struct VM {
                     << blue << "DEBUG" << ansiexit
                     << bold << black_bg << "]" << ansiexit << " ";
-    #else
+            #else
                 std::cout << "[DEBUG] ";
-    #endif
+            #endif
 
                 std::cout << msg_content;
                 std::cout << std::dec << "\n";
@@ -4555,9 +4555,9 @@ struct VM {
 #endif
 
         auto remove = [&](const enum brand_enum brand) noexcept {
-            for (u8 i = 0; i < active_brands.size(); i++) {
-                if (brand == active_brands.at(i).first) {
-                    active_brands.erase(active_brands.begin() + i);
+            for (auto it = active_brands.begin(); it != active_brands.end(); ++it) {
+                if (it->first == brand) {
+                    active_brands.erase(it);
                     return;
                 }
             }
@@ -5507,19 +5507,53 @@ struct VM {
         };
 
         // we dont use cpu::cpuid on purpose
-        auto trigger_vmexit = [](i32* info, i32 leaf, i32 sub) {
+        auto trigger_vmexit = []() {
+            #if (GCC || CLANG)
+                u32 a = 0, c = 0, d;
+                #if (x86_64)
+                    __asm__ volatile (
+                        "pushq %%rbx\n\t" // better than e.g. "xchgq %%rbx, %%rdi" to stash rbx in rdi, and avoids GCC pushing/popping rbx on the stack itself
+                        "cpuid\n\t"
+                        "popq %%rbx\n\t"
+                        : "+a"(a), "+c"(c), "=d"(d) // the "+" mappings force the compiler to use registers, avoiding stack spills
+                        :
+                        : "cc"
+                    );
+                #else
+                    __asm__ volatile (
+                        "pushl %%ebx\n\t"
+                        "cpuid\n\t"
+                        "popl %%ebx\n\t"
+                        : "+a"(a), "+c"(c), "=d"(d)
+                        :
+                        : "cc"
+                    );
+                #endif
+            #else
+                i32 dummy[4];
+                __cpuidex(dummy, 0x0, 0);
+            #endif
+        };
+
+        auto execute_lfence_8 = []() {
+            // hard-unrolled to prevent GCC from inserting cmp/jl loop checks inside the timed window
             #if (GCC || CLANG)
                 __asm__ volatile (
-                    "cpuid"
-                    : "=a"(info[0]), "=b"(info[1]), "=c"(info[2]), "=d"(info[3])
-                    : "a"(leaf), "c"(sub)
-                    : "cc", "memory"
+                    "lfence\n\t lfence\n\t lfence\n\t lfence\n\t"
+                    "lfence\n\t lfence\n\t lfence\n\t lfence\n\t"
+                    ::: "memory"
                 );
             #else
-                __cpuidex(info, leaf, sub);
+                _mm_lfence(); _mm_lfence(); _mm_lfence(); _mm_lfence();
+                _mm_lfence(); _mm_lfence(); _mm_lfence(); _mm_lfence();
             #endif
         };
 
+        auto execute_lfence_16 = [&]() {
+            execute_lfence_8();
+            execute_lfence_8();
+        };
+
         const DWORD_PTR target_affinity = get_target_mask();
 
         // our software clock, it will count how many cycles a vmexit takes
@@ -5532,23 +5566,17 @@ struct VM {
             while (!state.start_test.load(std::memory_order_acquire)) {}
 
             while (!state.test_done.load(std::memory_order_relaxed)) {
+                const u64 current = state.counter; // to silence warnings about incrementing a volatile variable
+                state.counter = current + 1; // better than an inline-assembly "incq": the plain increment forces the cache behavior we want
+
                 #if (GCC || CLANG)
-                    #if (x86_64)
-                        // A single incq is enough. Unrolling for example 8 times on a volatile memory address
-                        // creates unpredictable store-buffer behavior and we want it stable, latency is aprox 1 cycle in all microarchs
-                        __asm__ volatile ("incq %0\n\t" : "+m" (state.counter) : : "cc", "memory");
-                    #else
-                        __asm__ volatile (
-                            "addl $1, %0; adcl $0, %1\n\t"
-                            : "+m" (((u32*)&state.counter)[0]), "+m" (((u32*)&state.counter)[1])
-                            : : "cc", "memory");
-                    #endif
-                #else
-                    state.counter++;
+                    // prevents aggressive loop unrolling/batching of volatile stores
+                    __asm__ volatile("" ::: "memory");
                 #endif
             }
         };
 
+        // it will execute cpuid and lfence, and compare their latencies
         auto trigger_thread = [&]() {
             auto calculate_latency = [&](const std::vector<u64>& samples_in) -> u64 {
                 if (samples_in.empty()) return 0;
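Aside: the counter-based "software clock" above can be reproduced in isolation. The sketch below is illustrative only and is not VMAware's implementation (it uses a plain std::thread and a volatile counter instead of the library's state machinery, plus GCC/Clang's __get_cpuid wrapper); the point is that an intercepted CPUID shows up as extra counter ticks without ever touching the TSC or any OS clock:

    // standalone sketch of the software-clock idea (not VMAware code)
    #include <atomic>
    #include <cstdint>
    #include <cstdio>
    #include <thread>
    #include <cpuid.h>                               // GCC/Clang CPUID wrapper

    static volatile std::uint64_t counter = 0;       // the "clock": ticks as fast as one core can store
    static std::atomic<bool> done{false};

    int main() {
        std::thread clk([] {
            while (!done.load(std::memory_order_relaxed))
                counter = counter + 1;               // plain volatile increment, as in the patch
        });

        unsigned a, b, c, d;
        std::uint64_t best = ~0ULL;
        for (int i = 0; i < 100000; ++i) {
            const std::uint64_t pre = counter;
            __get_cpuid(0, &a, &b, &c, &d);          // forces a vmexit if CPUID is intercepted
            const std::uint64_t post = counter;
            if (post > pre && post - pre < best) best = post - pre;
        }
        done.store(true, std::memory_order_relaxed);
        clk.join();
        std::printf("min CPUID cost: %llu ticks\n", (unsigned long long)best);
    }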
@@ -5723,7 +5751,6 @@ struct VM {
                 seed ^= static_cast<u64>(reinterpret_cast<uintptr_t>(&local2)) << 2;
                 seed ^= static_cast<u64>(reinterpret_cast<uintptr_t>(&local3)) << 3;
 
-                // splitmix64
                 seed ^= seed >> 33;
                 seed *= 0xff51afd7ed558ccdULL;
                 seed ^= seed >> 33;
@@ -5740,7 +5767,6 @@ struct VM {
             std::mt19937 gen(seq);
             std::uniform_int_distribution<size_t> batch_dist(30000, 70000);
             const size_t BATCH_SIZE = batch_dist(gen);
-            i32 dummy_res[4]{};
             size_t valid = 0;
             i16 invalid = 0;
             bool apply_multiplier = false; // end of setup phase
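Aside: dropping the "// splitmix64" label is arguably a correctness fix in itself, since the constant above (0xff51afd7ed558ccd with 33-bit shifts) belongs to MurmurHash3's 64-bit finalizer (fmix64), not to splitmix64. For reference, the full finalizer, written here with the codebase's u64 alias (the unshown tail of this context presumably matches these steps):

    // MurmurHash3 fmix64: the seed-mixing steps used above
    static inline u64 fmix64(u64 k) {
        k ^= k >> 33;
        k *= 0xff51afd7ed558ccdULL;
        k ^= k >> 33;
        k *= 0xc4ceb9fe1a85ec53ULL;
        k ^= k >> 33;
        return k;
    }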
@@ -5754,47 +5780,65 @@ struct VM {
             state.start_test.store(true, std::memory_order_release); // _mm_pause can be exited conditionally, spam hit L3
 
             // warm-up to settle caches, scheduler and frequency boosts
             for (int i = 0; i < 1000; ++i) {
-                for (int j = 0; j < 2; ++j) trigger_vmexit(dummy_res, 0x0, 0);
+                for (int j = 0; j < 2; ++j) trigger_vmexit();
                 for (int j = 0; j < 16; ++j) _mm_lfence(); // good candidate as a reference with cpuid because it's a serializing instruction AND can't be intercepted in VMCB/VMCS
             }
 
+            // inside the timing windows there must be zero memory output (no stack arrays can be written to), zero conditional branches and zero stack spilling (no register push/pops)
             while (valid < BATCH_SIZE) {
-                // interpolated so that any turbo boost, thermal throttling, speculation (for the loop overhead itself, not for the serializing instructions), etc affects samples equally
+                // cpuid and lfence are interleaved so that any turbo boost, thermal throttling, speculation (for the loop overhead itself, not for the serializing instructions), etc affects samples equally
                 u64 v_pre, v_post, r_pre, r_post, sync;
 
                 sync = state.counter; while (state.counter == sync); // infer if counter got enough quantum momentum (so it's currently scheduled)
                 sync = state.counter; while (state.counter == sync); // fastest busy-waiting strategy, PAUSE affects cache, calling APIs like SwitchToThread() would be even worse
 
-                v_pre = state.counter;
-                std::atomic_signal_fence(std::memory_order_seq_cst); // _ReadWriteBarrier() aka dont emit runtime fences
-                // vmexit here so that the hypervisor is either forced to keep interception and try to bypass latency, or disable interception and try to bypass XSAVE states
+                // tick variables (v_pre, v_post, r_pre and r_post) are repeated inside both branches on purpose
                 if (!apply_multiplier) {
-                    trigger_vmexit(dummy_res, 0x0, 0);
+                    v_pre = state.counter;
+                    std::atomic_signal_fence(std::memory_order_seq_cst); // _ReadWriteBarrier(), i.e. don't emit runtime fences
+
+                    trigger_vmexit(); // this forces the hypervisor to either keep interception and try to hide the latency, or disable interception and try to fake the XSAVE states
+
+                    std::atomic_signal_fence(std::memory_order_seq_cst);
+                    v_post = state.counter;
                 }
                 else { // scaled by 2x if we dynamically detect cache invalidation ping-ponging across distant NUMA nodes, as our core randomizer pins our threads on different CPUs
-                    trigger_vmexit(dummy_res, 0x0, 0);
-                    trigger_vmexit(dummy_res, 0x0, 0);
+                    v_pre = state.counter;
+                    std::atomic_signal_fence(std::memory_order_seq_cst);
+
+                    trigger_vmexit();
+                    trigger_vmexit();
+
+                    std::atomic_signal_fence(std::memory_order_seq_cst);
+                    v_post = state.counter;
                 }
-                std::atomic_signal_fence(std::memory_order_seq_cst);
-                v_post = state.counter;
 
                 sync = state.counter; while (state.counter == sync); // sync to our counter tick again
                 sync = state.counter; while (state.counter == sync);
 
-                r_pre = state.counter;
-                std::atomic_signal_fence(std::memory_order_seq_cst); // ensure compiler-level ordering
                 if (!apply_multiplier) {
-                    for (int i = 0; i < 8; ++i) _mm_lfence(); // 8 LFENCES is enough for the Cross-Core/Cross-CCD MESI RFO cache bounce in the data race (so that the counter thread sees an increment)
+                    r_pre = state.counter;
+                    std::atomic_signal_fence(std::memory_order_seq_cst); // ensure compiler-level ordering
+
+                    // 8 LFENCEs are enough for the Cross-Core/Cross-CCD MESI RFO cache bounce in the data race (so that the counter thread sees an increment)
+                    execute_lfence_8();
+
+                    std::atomic_signal_fence(std::memory_order_seq_cst);
+                    r_post = state.counter;
                 }
                 else {
+                    r_pre = state.counter;
+                    std::atomic_signal_fence(std::memory_order_seq_cst);
+
                     // scaled if the counter thread is not able to increment in time due to CPUID being too fast
-                    for (int i = 0; i < 16; ++i) _mm_lfence();
+                    execute_lfence_16();
+
+                    std::atomic_signal_fence(std::memory_order_seq_cst);
+                    r_post = state.counter;
                 }
-                std::atomic_signal_fence(std::memory_order_seq_cst);
-                r_post = state.counter;
 
-                // we dont filter by cycles spent here (for example by querying thread cycle time) because the point of this function is to not let either the kernel or this app handle a TSC read
+                // we don't filter by cycles spent here (for example by querying thread cycle time) because the point of this function is to not use the TSC or any other clock
                 if (v_post > v_pre && r_post > r_pre) {
                     vm_samples[valid] = v_post - v_pre;
                     ref_samples[valid] = r_post - r_pre;
                     valid++;
                 }
                 else if (v_post <= v_pre && !apply_multiplier) {
                     invalid++;
-                    if (invalid >= 1000) apply_multiplier = true;
+                    if (invalid >= 250) {
+                        debug("TIMER: Detected trigger thread monopolizing cache ownership; unstable path was activated to increase performance");
+                        apply_multiplier = true;
+                    }
                 }
             }
 
             state.test_done.store(true, std::memory_order_release);
 
-            const u64 cpuid_l = calculate_latency(vm_samples); // check for lowest dense cluster with no interrupt spikes, filter noise we can detect (SMIs, etc)
+            const u64 cpuid_l = calculate_latency(vm_samples); // check for the lowest dense cluster with no interrupt spikes, filtering the noise we can't detect (SMIs, NMIs, etc)
             const u64 ref_l = calculate_latency(ref_samples);
 
             const double latency_ratio = ref_l ? (double)cpuid_l / (double)ref_l : 0;
-            // VMM == Time spent in hypervisor; nVMM == Time spent in baremetal
+            // VMM = Time spent in hypervisor; nVMM = Time spent in baremetal
             debug("TIMER: VMM -> ", cpuid_l, " | nVMM -> ", ref_l, " | Ratio -> ", latency_ratio);
 
             if (latency_ratio >= threshold) hypervisor_detected = true;
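Aside: calculate_latency's body sits outside these hunks; per the comments it returns the lowest dense cluster of the sample set. One way such an estimator could look, sketched purely as an assumption for illustration (this is not the project's actual implementation):

    // illustrative: pick a representative value from the tightest window of sorted samples;
    // interrupt/SMI spikes land in loose windows and are therefore discarded
    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    static std::uint64_t lowest_dense_cluster(std::vector<std::uint64_t> s) {
        if (s.empty()) return 0;
        std::sort(s.begin(), s.end());
        const std::size_t win = std::max<std::size_t>(1, s.size() / 10); // densest 10% window
        std::size_t best = 0;
        for (std::size_t i = 0; i + win <= s.size(); ++i)
            if (s[i + win - 1] - s[i] < s[best + win - 1] - s[best]) best = i;
        return s[best + win / 2]; // middle of the tightest cluster
    }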
@@ -5822,10 +5869,10 @@ struct VM {
         // if hypervisor lies about the CPU vendor, it will create 100000 more detectable signals (querying Intel-specific behavior)
         if (cpu::is_amd() && !hypervisor_detected) {
             i32 res_d0[4], res_d1[4], res_d12[4], res_ext[4];
-            trigger_vmexit(res_d0, 0xD, 0); // XCR0 features
-            trigger_vmexit(res_d1, 0xD, 1); // XCR0 + XSS features
-            trigger_vmexit(res_d12, 0xD, 12); // CET State details
-            trigger_vmexit(res_ext, 0x80000008, 0);
+            cpu::cpuid(res_d0, 0xD, 0); // XCR0 features
+            cpu::cpuid(res_d1, 0xD, 1); // XCR0 + XSS features
+            cpu::cpuid(res_d12, 0xD, 12); // CET state details
+            cpu::cpuid(res_ext, 0x80000008, 0);
 
             const bool hardware_supports_cet = (res_d12[0] > 0);
             const u32 active_xcr0_size = (u32)res_d0[1]; // size for features enabled in XCR0
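Aside on the leaf-0xD fields this relies on: CPUID.(EAX=0xD, ECX=0) reports in EBX the XSAVE area size for the state components currently enabled in XCR0, and in ECX the maximum size across all supported components, so EBX should never be zero or exceed ECX on sane hardware. A minimal probe of those two fields (illustrative; it uses GCC/Clang's __get_cpuid_count rather than the library's cpu::cpuid):

    #include <cpuid.h>
    #include <cstdio>

    int main() {
        unsigned a, b, c, d;
        if (!__get_cpuid_count(0xD, 0, &a, &b, &c, &d)) return 0; // leaf not supported
        std::printf("XSAVE size (enabled in XCR0): %u, max size: %u\n", b, c);
        if (b == 0 || b > c) std::puts("inconsistent XSAVE reporting");
    }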
@@ -10367,127 +10414,90 @@ struct VM {
     [[nodiscard]] static bool blockstep() {
         volatile int saw_single_step = 0;
 
-        #if (x86_32)
-            __try
-            {
-                #if (CLANG || GCC)
-                    __asm__ __volatile__(
-                        // set TF in EFLAGS
-                        "pushfd\n\t"
-                        "orl $0x100, (%%esp)\n\t"
-                        "popfd\n\t"
-
-                        // because TF was set, CPUID would normally cause a #DB on the next instruction
-                        // if placed after 'mov ss', it consumes the 1-instruction inhibition window so the check wouldn't work
-                        "xor %%eax, %%eax\n\t"
-
-                        // execute MOV SS,AX (reload SS with itself) to force the interruptible state block
-                        "mov %%ss, %%ax\n\t"
-                        "mov %%ax, %%ss\n\t" // this blocks any debug exception for exactly one instruction
-
-                        "cpuid\n\t"
-
-                        // TF's single-step now fires here on baremetal
-                        "nop\n\t"
-
-                        "pushfd\n\t"
-                        "andl $0xFFFFFEFF, (%%esp)\n\t"
-                        "popfd\n\t"
-                        :
-                        :
-                        : "eax", "ebx", "ecx", "edx", "cc", "memory"
-                    );
-                #else
-                    __asm
-                    {
-                        // same logic as above
+        #if (x86_32) && !(CLANG || GCC)
+            __try {
+                __asm {
                     pushfd
-                        or dword ptr[esp], 0x100
+                    or dword ptr[esp], 0x100 // set TF
                     popfd
                     xor eax, eax
-                        mov ax, ss
-                        mov ss, ax
+                    mov ax, ss
+                    mov ss, ax // this blocks any debug exception for exactly one instruction
                     cpuid
-                        nop
+                    nop // TF's single-step should fire here on baremetal except on a few buggy processors
                     pushfd
                     and dword ptr[esp], 0xFFFFFEFF
                     popfd
                 }
-                #endif
             }
-            __except (GetExceptionCode() == EXCEPTION_SINGLE_STEP
-                ? EXCEPTION_EXECUTE_HANDLER
-                : EXCEPTION_CONTINUE_SEARCH)
-            {
+            __except (GetExceptionCode() == EXCEPTION_SINGLE_STEP ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH) {
                 saw_single_step = 1;
             }
 
-            return (saw_single_step == 0) ? true : false;
-
-        #elif (x86_64)
+            return (saw_single_step == 0);
+        #elif (x86_64) || ((x86_32) && (CLANG || GCC))
             const HMODULE ntdll = util::get_ntdll();
             if (!ntdll) return false;
 
-            const char* names[] = { "NtAllocateVirtualMemory", "NtProtectVirtualMemory", "NtFlushInstructionCache", "NtFreeVirtualMemory" };
+            const char* names[] = {
+                "NtAllocateVirtualMemory", "NtProtectVirtualMemory",
+                "NtFlushInstructionCache", "NtFreeVirtualMemory"
+            };
 
             void* funcs[ARRAYSIZE(names)] = {};
             util::get_function_address(ntdll, names, funcs, ARRAYSIZE(names));
 
-            const auto nt_allocate_virtual_memory = reinterpret_cast(funcs[0]);
-            const auto nt_protect_virtual_memory = reinterpret_cast(funcs[1]);
-            const auto nt_flush_instruction_cache = reinterpret_cast(funcs[2]);
-            const auto nt_free_virtual_memory = reinterpret_cast(funcs[3]);
+            const auto nt_alloc = reinterpret_cast(funcs[0]);
+            const auto nt_protect = reinterpret_cast(funcs[1]);
+            const auto nt_flush = reinterpret_cast(funcs[2]);
+            const auto nt_free = reinterpret_cast(funcs[3]);
 
-            if (!nt_allocate_virtual_memory || !nt_protect_virtual_memory || !nt_flush_instruction_cache || !nt_free_virtual_memory) {
-                return false;
-            }
+            if (!nt_alloc || !nt_protect || !nt_flush || !nt_free) return false;
 
+            // these opcodes are byte-for-byte identical for both x86_32 and x86_64 architectures
+            // e.g. 0x53 maps to push ebx in 32-bit and push rbx in 64-bit
             static constexpr u8 blockstep_opcodes[] = {
-                0x53, // push rbx (to preserve non-volatile register against cpuid)
-                0x9C, // pushfq
-                0x81, 0x0C, 0x24, 0x00, 0x01, 0x00, 0x00, // or dword ptr [rsp], 0x100
-                0x9D, // popfq
-                0x31, 0xC0, // xor eax, eax
-                0x8C, 0xD0, // mov ax, ss
-                0x8E, 0xD0, // mov ss, ax
-                0x0F, 0xA2, // cpuid
-                0x90, // nop
-                0x9C, // pushfq
-                0x81, 0x24, 0x24, 0xFF, 0xFE, 0xFF, 0xFF, // and dword ptr [rsp], 0xFFFFFEFF
-                0x9D, // popfq
-                0x5B, // pop rbx
-                0xC3 // ret
+                0x53,                                     // push rbx/ebx (preserve non-volatile register against cpuid)
+                0x9C,                                     // pushfq/pushfd
+                0x81, 0x0C, 0x24, 0x00, 0x01, 0x00, 0x00, // or dword ptr [rsp/esp], 0x100
+                0x9D,                                     // popfq/popfd
+                0x31, 0xC0,                               // xor eax, eax
+                0x8C, 0xD0,                               // mov ax, ss
+                0x8E, 0xD0,                               // mov ss, ax
+                0x0F, 0xA2,                               // cpuid
+                0x90,                                     // nop
+                0x9C,                                     // pushfq/pushfd
+                0x81, 0x24, 0x24, 0xFF, 0xFE, 0xFF, 0xFF, // and dword ptr [rsp/esp], 0xFFFFFEFF
+                0x9D,                                     // popfq/popfd
+                0x5B,                                     // pop rbx/ebx
+                0xC3                                      // ret
             };
 
             const HANDLE current_process = reinterpret_cast<HANDLE>(-1LL);
             PVOID base = nullptr;
             SIZE_T region_size = sizeof(blockstep_opcodes);
-            NTSTATUS st = nt_allocate_virtual_memory(current_process, &base, 0, &region_size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
-            if (!NT_SUCCESS(st) || !base) {
+
+            if (!NT_SUCCESS(nt_alloc(current_process, &base, 0, &region_size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE)) || !base) {
                 return false;
             }
 
             memcpy(base, blockstep_opcodes, sizeof(blockstep_opcodes));
 
             ULONG old_protection = 0;
-            st = nt_protect_virtual_memory(current_process, &base, &region_size, PAGE_EXECUTE_READ, &old_protection);
-            if (!NT_SUCCESS(st)) {
-                region_size = 0;
-                nt_free_virtual_memory(current_process, &base, &region_size, MEM_RELEASE);
-                return false;
-            }
-
-            nt_flush_instruction_cache(current_process, base, region_size);
+            NTSTATUS st = nt_protect(current_process, &base, &region_size, PAGE_EXECUTE_READ, &old_protection);
 
-            __try {
-                reinterpret_cast<void(*)()>(base)();
-            }
-            __except (GetExceptionCode() == EXCEPTION_SINGLE_STEP ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH) {
-                saw_single_step = 1;
+            if (NT_SUCCESS(st)) {
+                nt_flush(current_process, base, region_size);
+                __try {
+                    reinterpret_cast<void(*)()>(base)();
+                }
+                __except (GetExceptionCode() == EXCEPTION_SINGLE_STEP ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH) {
+                    saw_single_step = 1;
                }
             }
 
             region_size = 0;
-            nt_free_virtual_memory(current_process, &base, &region_size, MEM_RELEASE);
+            nt_free(current_process, &base, &region_size, MEM_RELEASE);
 
-            return (saw_single_step == 0) ? true : false;
+            return NT_SUCCESS(st) && (saw_single_step == 0);
         #else
             return false;
         #endif
     }
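Aside: the Nt* sequence above (commit RW memory, copy the stub, flip it to RX, flush, call) deliberately goes through raw ntdll exports. For readers who want to reproduce the stub-staging pattern outside the library, the same flow with the documented Win32 API looks like this (illustrative only, not what the PR ships):

    #include <windows.h>
    #include <cstring>

    static bool run_stub(const unsigned char* stub, size_t len) {
        void* base = VirtualAlloc(nullptr, len, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
        if (!base) return false;
        std::memcpy(base, stub, len);                    // stage while still writable
        DWORD old = 0;
        const bool ok = VirtualProtect(base, len, PAGE_EXECUTE_READ, &old) != 0;
        if (ok) {
            FlushInstructionCache(GetCurrentProcess(), base, len);
            reinterpret_cast<void(*)()>(base)();         // execute the staged bytes
        }
        VirtualFree(base, 0, MEM_RELEASE);
        return ok;
    }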
@@ -11443,7 +11453,7 @@ struct VM {
     /**
      * @brief Check whether the CPU is genuine and its reported instruction capabilities are not masked
-     * @category Windows
+     * @category Windows, x86
      * @implements VM::CPU_HEURISTIC
      */
     [[nodiscard]] static bool cpu_heuristic() {
@@ -11918,7 +11928,7 @@ struct VM {
     [[nodiscard]] static bool clock() {
         #if (ARM)
             return false; // ARM systems do not have the classic x86 timers
-        #endif
+        #else
 
         if (util::is_running_under_translator()) {
             debug("CLOCK: Running inside an ARM CPU");
             return false;
         }
@@ -12056,12 +12066,13 @@ struct VM {
 
         SetupDiDestroyDeviceInfoList(devs);
         return !found;
+        #endif
     }
 
 
     /**
      * @brief Check whether the hypervisor correctly handles MSR behavior
-     * @category Windows
+     * @category Windows, x86
      * @implements VM::MSR
      */
     [[nodiscard]] static bool msr() {
@@ -12126,7 +12137,7 @@ struct VM {
     /**
      * @brief Check whether KVM attempts to patch a mismatched hypercall instruction
      * @link https://lists.nongnu.org/archive/html/qemu-devel/2025-07/msg05044.html
-     * @category Windows
+     * @category Windows, x86
      * @implements VM::KVM_INTERCEPTION
      */
     [[nodiscard]] static bool kvm_interception() {
@@ -12229,13 +12240,14 @@ struct VM {
     /**
      * @brief Check whether a hypervisor uses EPT/NPT hooking to intercept hardware breakpoints
      * @note This hypervisor detection also affects debuggers
-     * @category Windows
+     * @category Windows, x86
      * @implements VM::HYPERVISOR_HOOK
      */
     [[nodiscard]] static bool hypervisor_hook() {
         #if (!x86)
             return false;
         #else
+            if (util::is_running_under_translator()) return false;
             const HMODULE ntdll = util::get_ntdll();
             if (!ntdll) return false;
 
@@ -12315,7 +12327,7 @@ struct VM {
             auto* dos_header = reinterpret_cast<PIMAGE_DOS_HEADER>(module);
             if (dos_header->e_magic != IMAGE_DOS_SIGNATURE) return nullptr;
 
-            auto* nt_headers = reinterpret_cast<PIMAGE_NT_HEADERS>(reinterpret_cast<uint8_t*>(module) + dos_header->e_lfanew);
+            auto* nt_headers = reinterpret_cast<PIMAGE_NT_HEADERS>(reinterpret_cast<u8*>(module) + dos_header->e_lfanew);
             if (nt_headers->Signature != IMAGE_NT_SIGNATURE) return nullptr;
 
             auto* section = IMAGE_FIRST_SECTION(nt_headers);
@@ -12323,7 +12335,7 @@ struct VM {
 
                 // only scan memory marked as executable
                 if ((section->Characteristics & IMAGE_SCN_MEM_EXECUTE) != 0) {
-                    uint8_t* ptr = reinterpret_cast<uint8_t*>(module) + section->VirtualAddress;
+                    u8* ptr = reinterpret_cast<u8*>(module) + section->VirtualAddress;
                     size_t size = section->Misc.VirtualSize;
 
                     if (size < 2) {
@@ -12345,7 +12357,7 @@ struct VM {
             using find_double_cc_t = void* (*)(void*);
             find_double_cc_t find_double_cc = [](void* pointer_in_page) -> void* {
                 // align down to the start of the 4KB page
-                auto* ptr = reinterpret_cast<uint8_t*>(reinterpret_cast<uintptr_t>(pointer_in_page) & ~0xFFF);
+                auto* ptr = reinterpret_cast<u8*>(reinterpret_cast<uintptr_t>(pointer_in_page) & ~0xFFF);
 
                 for (size_t i = 0; i < (0x1000 - 1); ++i) {
                     if (ptr[i] == 0xCC && ptr[i + 1] == 0xCC) {
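Aside: the `& ~0xFFF` in find_double_cc is the standard align-down-to-page trick; in its general form it looks like this (illustrative helper, not part of the patch):

    #include <cstdint>

    template <typename T>
    static T* page_align_down(T* p) {
        // clear the low 12 bits: 4 KiB page granularity
        return reinterpret_cast<T*>(reinterpret_cast<std::uintptr_t>(p) & ~std::uintptr_t{ 0xFFF });
    }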
@@ -12377,7 +12389,7 @@ struct VM {
             NTSTATUS status = nt_protect_virtual_memory(current_process, &base_address, &prot_region_size, PAGE_EXECUTE_READWRITE, &old_protect);
             if (status < 0) return false;
 
-            *static_cast<uint8_t*>(pointer) = 0xC3;
+            *static_cast<u8*>(pointer) = 0xC3;
 
             base_address = pointer;
             prot_region_size = 1;
@@ -12392,7 +12404,7 @@ struct VM {
                 hook_detected = true;
             }
             else {
-                // total hardware processors
+                // now try on all cores (total hardware processors)
                 struct SYSTEM_BASIC_INFORMATION_LOCAL {
                     ULONG Reserved;
                     ULONG TimerResolution;
@@ -12407,7 +12419,7 @@ struct VM {
                     CCHAR NumberOfProcessors;
                 };
 
-                SYSTEM_BASIC_INFORMATION_LOCAL sys_info = { 0 };
+                SYSTEM_BASIC_INFORMATION_LOCAL sys_info{};
                 ULONG ret_len = 0;
                 ULONG num_processors = 1;
 
@@ -12418,204 +12430,28 @@ struct VM {
                 }
 
                 num_processors = sys_info.NumberOfProcessors;
 
-                // plain struct to pass memory pointers (no destructors)
-                struct thread_context {
-                    void* pointer;
-                    volatile LONG* did_anyone_throw;
-                };
-
                 volatile LONG did_anyone_throw = 0;
-                thread_context t_ctx{};
-                t_ctx.pointer = pointer;
-                t_ctx.did_anyone_throw = &did_anyone_throw;
-
-                struct thread_proc_thunk {
-                    static DWORD __stdcall proc(PVOID param) {
-                        auto* c = static_cast<thread_context*>(param);
-
-                        __try {
-                            using func_t = void(*)();
-                            reinterpret_cast<func_t>(c->pointer)();
-                        }
-                        __except (EXCEPTION_EXECUTE_HANDLER) {
-                            _InterlockedExchange(c->did_anyone_throw, 1);
-                        }
-
-                        return 0;
-                    }
-                };
-
-                using thread_routine_t = DWORD(__stdcall*)(PVOID);
-                thread_routine_t thread_proc = &thread_proc_thunk::proc;
-
-                HANDLE thread_handles[256] = {};
-                ULONG active_threads = 0;
-
-                PVOID src_page = nullptr;
-                PVOID dst_page = nullptr;
-                SIZE_T region_size = 0x2000;
-                PVOID veh_handle = nullptr;
-
-                // allocate source and destination pages
-                const NTSTATUS status_src = nt_allocate_virtual_memory(current_process, &src_page, 0, &region_size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
-                const NTSTATUS status_dst = nt_allocate_virtual_memory(current_process, &dst_page, 0, &region_size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
-
-                if (status_src < 0 || status_dst < 0) {
-                    if (src_page) {
-                        SIZE_T free_size = 0;
-                        nt_free_virtual_memory(current_process, &src_page, &free_size, MEM_RELEASE);
-                    }
-                    if (dst_page) {
-                        SIZE_T free_size = 0;
-                        nt_free_virtual_memory(current_process, &dst_page, &free_size, MEM_RELEASE);
-                    }
-                    return false;
-                }
-
-                auto cleanup_pages = [&]() -> bool {
-                    bool ok = true;
-
-                    if (veh_handle) {
-                        rtl_remove_vectored_exception_handler(veh_handle);
-                        veh_handle = nullptr;
-                    }
-
-                    for (ULONG i = 0; i < active_threads; ++i) {
-                        if (thread_handles[i]) {
-                            ok = NT_SUCCESS(nt_close(thread_handles[i])) && ok;
-                            thread_handles[i] = nullptr;
-                        }
-                    }
-
-                    if (src_page) {
-                        SIZE_T free_size = 0;
-                        ok = NT_SUCCESS(nt_free_virtual_memory(current_process, &src_page, &free_size, MEM_RELEASE)) && ok;
-                    }
-                    if (dst_page) {
-                        SIZE_T free_size = 0;
-                        ok = NT_SUCCESS(nt_free_virtual_memory(current_process, &dst_page, &free_size, MEM_RELEASE)) && ok;
-                    }
-
-                    return ok;
-                };
-
-                if (util::is_running_under_translator())
-                    return cleanup_pages() ? hook_detected : false;
-
-                // initialize src memory
-                __stosb(static_cast<u8*>(src_page), 0xAB, 0x2000);
-
-                thread_local static volatile bool ermsb_trap_detected = false;
-                ermsb_trap_detected = false;
-
-                // capture-less local lambda decays to PVECTORED_EXCEPTION_HANDLER function pointer
-                auto veh_handler = [](PEXCEPTION_POINTERS ctx) -> LONG {
-                    if (ctx->ExceptionRecord->ExceptionCode == EXCEPTION_SINGLE_STEP) {
-                        ermsb_trap_detected = true;
-                        return EXCEPTION_CONTINUE_EXECUTION;
-                    }
-                    return EXCEPTION_CONTINUE_SEARCH;
-                };
-
-                veh_handle = rtl_add_vectored_exception_handler(1, static_cast<PVECTORED_EXCEPTION_HANDLER>(veh_handler));
-                if (!veh_handle) {
-                    cleanup_pages();
-                    return false;
-                }
-
-                CONTEXT ctx{};
-                ctx.ContextFlags = CONTEXT_DEBUG_REGISTERS;
-                status = nt_get_context_thread(current_thread, &ctx);
-                if (status < 0) {
-                    cleanup_pages();
-                    return false;
-                }
-
-                // set hw breakpoint inside the source page
-                ctx.Dr0 = reinterpret_cast<uintptr_t>(src_page) + 0x1000;
-
-                // Dr7 = 0x30001
-                // bit 0 = 1
-                // bits 17:16 = 11b
-                // bits 19:18 = 00b
-                ctx.Dr7 = 0x30001;
-                status = nt_set_context_thread(current_thread, &ctx);
-                if (status < 0) {
-                    cleanup_pages();
-                    return false;
-                }
-
-                __try {
-                    __movsb(static_cast<u8*>(dst_page), static_cast<u8*>(src_page), 0x2000);
-                }
-                __except (EXCEPTION_EXECUTE_HANDLER) {
-                    // veh will already detect if Dr0 fired successfully
-                }
-
-                rtl_remove_vectored_exception_handler(veh_handle);
-                veh_handle = nullptr;
-
-                ctx.Dr0 = 0;
-                ctx.Dr7 = 0;
-                status = nt_set_context_thread(current_thread, &ctx);
-                if (status < 0) {
-                    cleanup_pages();
-                    return false;
-                }
 
                 for (ULONG i = 0; i < num_processors && i < 256; ++i) {
-                    HANDLE h_thread = nullptr;
-
-                    // 0x1FFFFF = THREAD_ALL_ACCESS
-                    NTSTATUS t_status = nt_create_thread_ex(&h_thread, THREAD_ALL_ACCESS, nullptr, current_process,
-                        reinterpret_cast<PVOID>(thread_proc), &t_ctx,
-                        0, 0, 0, 0, nullptr);
-
-                    if (t_status < 0 || !h_thread) {
-                        if (h_thread && nt_close(h_thread) < 0) {
-                            cleanup_pages();
-                            return false;
-                        }
-                        cleanup_pages();
-                        return false;
-                    }
-
-                    // ebind the created thread to physical core i
+                    // pin the current thread to physical core i
                     ULONG_PTR affinity = (ULONG_PTR)1 << i;
-                    status = nt_set_information_thread(h_thread, 4 /* ThreadAffinityMask */, &affinity, sizeof(affinity));
+                    status = nt_set_information_thread(current_thread, 4 /* ThreadAffinityMask */, &affinity, sizeof(affinity));
                     if (status < 0) {
-                        if (nt_close(h_thread) < 0) {
-                            cleanup_pages();
-                            return false;
-                        }
-                        cleanup_pages();
                         return false;
                     }
 
-                    thread_handles[active_threads++] = h_thread;
-                }
-
-                // wait for completion on all cores
-                for (ULONG i = 0; i < active_threads; ++i) {
-                    status = nt_wait_for_single_object(thread_handles[i], FALSE, nullptr);
-                    if (status < 0) {
-                        cleanup_pages();
-                        return false;
+                    __try {
+                        using func_t = void(*)();
+                        reinterpret_cast<func_t>(pointer)();
                     }
-
-                    status = nt_close(thread_handles[i]);
-                    if (status < 0) {
-                        cleanup_pages();
-                        return false;
+                    __except (EXCEPTION_EXECUTE_HANDLER) {
+                        did_anyone_throw = 1;
                     }
-                    thread_handles[i] = nullptr;
                 }
 
                 if (did_anyone_throw != 0) {
                     hook_detected = true;
                 }
-
-                cleanup_pages();
             }
 
             PVOID src_page = nullptr;
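Aside: the Dr7 = 0x30001 constant used by the (now relocated) breakpoint setup decomposes into L0 = 1 (local enable for slot 0), R/W0 = 11b (break on data read/write) and LEN0 = 00b (1-byte watchpoint). A helper showing the encoding, with invented names, purely for illustration:

    #include <cstdint>

    static std::uint32_t dr7_data_rw_1byte(unsigned slot) {           // slot 0..3
        const std::uint32_t enable = 1u << (slot * 2);                // L0..L3 local-enable bits
        const std::uint32_t rw     = 0b11u << (16 + slot * 4);        // 11b = break on read/write
        const std::uint32_t len    = 0b00u << (18 + slot * 4);        // 00b = 1-byte length
        return enable | rw | len;                                     // slot 0 -> 0x30001
    }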
@@ -12638,9 +12474,6 @@ struct VM {
                 return false;
             }
 
-            if (util::is_running_under_translator())
-                return hook_detected;
-
             // initialize src memory
             __stosb(static_cast<u8*>(src_page), 0xAB, 0x2000);
@@ -12720,12 +12553,16 @@ struct VM {
         #endif
     }
 
+
     /**
      * @brief Check whether a hypervisor delays trap flags over exiting instructions
-     * @category Windows
+     * @category Windows, x86
      * @implements VM::POPF
      */
     [[nodiscard]] static bool popf() {
+        #if (!x86)
+            return false;
+        #else
         const HMODULE ntdll = util::get_ntdll();
         if (!ntdll) return false;
@@ -12814,17 +12651,20 @@ struct VM {
 
         nt_free_virtual_memory(current_process, &base_address, &free_size, MEM_RELEASE);
 
         return is_vm;
+        #endif
     }
 
 
     /**
      * @brief Check whether a hypervisor does not correctly emulate instructions in compatibility mode
-     * @category Windows
+     * @category Windows, x86_64
      * @implements VM::EIP_OVERFLOW
      */
     [[nodiscard]] static bool eip_overflow() {
         #if (!x86_64)
-            return false;
+            // this requires mapping executable memory at the end of the 4GB address space (0xFFFF0000) so an instruction can wrap the 32-bit boundary;
+            // because NtAllocateVirtualMemory will always return 0xC0000018 (STATUS_CONFLICTING_ADDRESSES) there, we physically cannot place an instruction at 0xFFFFFFFE
+            return false;
         #else
             #pragma pack(push, 1)
             struct iretq_frame {
@@ -12867,7 +12707,7 @@ struct VM {
             const auto rtl_remove_vectored_exception_handler = reinterpret_cast(funcs[5]);
 
             if (!nt_allocate_virtual_memory || !nt_protect_virtual_memory ||
-                !nt_flush_instruction_cache || !nt_free_virtual_memory || 
+                !nt_flush_instruction_cache || !nt_free_virtual_memory ||
                 !rtl_add_vectored_exception_handler || !rtl_remove_vectored_exception_handler) {
                 return false;
             }
@@ -12888,9 +12728,9 @@ struct VM {
             }
 
             if (g_recovery_pad != 0) {
-                exc_info->ContextRecord->SegCs = 0x33; 
+                exc_info->ContextRecord->SegCs = 0x33;
                 exc_info->ContextRecord->Rip = g_recovery_pad; // cleanup shellcode
-                exc_info->ContextRecord->Rsp = g_saved_rsp; 
+                exc_info->ContextRecord->Rsp = g_saved_rsp;
                 return EXCEPTION_CONTINUE_EXECUTION;
             }
 
@@ -12929,10 +12769,10 @@ struct VM {
             }
 
             // map stub bytes into allocated chunk
-            uint8_t* code_ptr = static_cast<uint8_t*>(shellcode_base);
+            u8* code_ptr = static_cast<u8*>(shellcode_base);
             for (size_t i = 0; i < sizeof(switch_stub); ++i) code_ptr[i] = switch_stub[i];
 
-            uint8_t* rec_ptr = code_ptr + sizeof(switch_stub);
+            u8* rec_ptr = code_ptr + sizeof(switch_stub);
             for (size_t i = 0; i < sizeof(recover_stub); ++i) rec_ptr[i] = recover_stub[i];
 
             // executable memory protection
@@ -12971,7 +12811,7 @@ struct VM {
 
             if (alloc_status >= 0 && boundary_base == reinterpret_cast<PVOID>(0xFFFF0000ULL)) {
                 // inject cpuid at the strict end of the compat-mode space
-                uint8_t* execution_target = reinterpret_cast<uint8_t*>(0xFFFFFFFEULL);
+                u8* execution_target = reinterpret_cast<u8*>(0xFFFFFFFEULL);
                 execution_target[0] = 0x0F;
                 execution_target[1] = 0xA2;
 
@@ -12980,7 +12820,7 @@ struct VM {
                 frame.cs = 0x23;
 
                 // dispatch hardware context switch shellcode
-                auto switch_func = reinterpret_cast(code_ptr);
+                auto switch_func = reinterpret_cast(code_ptr);
                 switch_func(&frame, stack32_ptr, &g_saved_rsp);
 
                 SIZE_T free_size = 0;
@@ -14273,6 +14113,7 @@ std::array VM::core::technique_table = [
     {VM::EIP_OVERFLOW, {100, VM::eip_overflow}},
     {VM::HYPERVISOR_HOOK, {100, VM::hypervisor_hook}},
     {VM::POPF, {100, VM::popf}},
+    {VM::BLOCKSTEP, {100, VM::blockstep}},
     {VM::MSR, {100, VM::msr}},
     {VM::EDID, {100, VM::edid}},
     {VM::VIRTUAL_PROCESSORS, {100, VM::virtual_processors}},
@@ -14286,7 +14127,6 @@ std::array VM::core::technique_table = [
     {VM::DISPLAY, {25, VM::display}},
     {VM::DLL, {50, VM::dll}},
     {VM::UD, {100, VM::ud}},
-    {VM::BLOCKSTEP, {100, VM::blockstep}},
     {VM::VMWARE_BACKDOOR, {100, VM::vmware_backdoor}},
     {VM::VIRTUAL_REGISTRY, {90, VM::virtual_registry}},
     {VM::MUTEX, {100, VM::mutex}},
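For context, the reordered technique table above is what the public entry points walk; a minimal consumer, per the project's documented API (assuming vmaware.hpp is on the include path):

    #include "vmaware.hpp"
    #include <iostream>

    int main() {
        if (VM::detect()) {
            std::cout << "VM detected: " << VM::brand()
                      << " (" << static_cast<int>(VM::percentage()) << "%)\n";
        } else {
            std::cout << "no VM detected\n";
        }
    }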