Skip to content

Commit

Permalink
Improve Windows multiarch support
Browse files Browse the repository at this point in the history
  • Loading branch information
kimwalisch committed Apr 15, 2024
1 parent 34179cc commit 17a49bd
Show file tree
Hide file tree
Showing 10 changed files with 233 additions and 173 deletions.
3 changes: 2 additions & 1 deletion ChangeLog
@@ -1,6 +1,7 @@
Changes in version 12.3, 04/04/2024
Changes in version 12.3, 15/04/2024
===================================

* Improve Windows multiarch support (now works with MinGW64).
* Add runtime POPCNT detection using CPUID for x86 CPUs.
* Improve GCC/Clang multiarch preprocessor logic.
* CMakeLists.txt: Remove POPCNT/BMI check for x86 CPUs.
Expand Down
36 changes: 24 additions & 12 deletions cmake/multiarch_avx512_vbmi2.cmake
@@ -1,14 +1,15 @@
# We use GCC/Clang's function multi-versioning for AVX512
# support. This code will automatically dispatch to the
# AVX512 algorithm if the CPU supports AVX512 and use the
# default (portable) algorithm otherwise.
# AVX512 VBMI2 algorithm if the CPU supports it and use
# the default (portable) algorithm otherwise.

include(CheckCXXSourceCompiles)
include(CMakePushCheckState)

check_cxx_source_compiles("
#include <immintrin.h>
#include <stdint.h>
cmake_push_check_state()
set(CMAKE_REQUIRED_INCLUDES "${PROJECT_SOURCE_DIR}/include")

check_cxx_source_compiles("
// GCC/Clang function multiversioning for AVX512 is not needed if
// the user compiles with -mavx512f -mavx512vbmi -mavx512vbmi2.
// GCC/Clang function multiversioning generally causes a minor
Expand All @@ -19,22 +20,31 @@ check_cxx_source_compiles("
Error: AVX512VBMI2 multiarch not needed!
#endif
#include <primesieve/cpu_supports_avx512_vbmi2.hpp>
#include <immintrin.h>
#include <stdint.h>
class PrimeGenerator {
public:
__attribute__ ((target (\"default\")))
void fillNextPrimes(uint64_t* primes64);
public:
__attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\")))
void fillNextPrimes(uint64_t* primes64);
void fillNextPrimes_avx512(uint64_t* primes64);
void fillNextPrimes_default(uint64_t* primes64);
void fillNextPrimes(uint64_t* primes64)
{
if (cpu_supports_avx512_vbmi2)
fillNextPrimes_avx512(primes64);
else
fillNextPrimes_default(primes64);
}
};
__attribute__ ((target (\"default\")))
void PrimeGenerator::fillNextPrimes(uint64_t* primes64)
void PrimeGenerator::fillNextPrimes_default(uint64_t* primes64)
{
primes64[0] = 2;
}
__attribute__ ((target (\"avx512f,avx512vbmi,avx512vbmi2\")))
void PrimeGenerator::fillNextPrimes(uint64_t* primes64)
void PrimeGenerator::fillNextPrimes_avx512(uint64_t* primes64)
{
__m512i bytes_0_to_7 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
__m512i base = _mm512_set1_epi64(123);
Expand All @@ -56,3 +66,5 @@ check_cxx_source_compiles("
if(multiarch_avx512_vbmi2)
set(ENABLE_MULTIARCH_AVX512 "ENABLE_MULTIARCH_AVX512")
endif()

cmake_pop_check_state()
45 changes: 3 additions & 42 deletions include/primesieve/CPUID.hpp
@@ -1,6 +1,6 @@
///
/// @file CPUID.hpp
/// @brief POPCNT detection fo x86 and x86-64 CPUs.
/// @file cpuid.hpp
/// @brief CPUID for x86 and x86-64 CPUs.
///
/// Copyright (C) 2024 Kim Walisch, <kim.walisch@gmail.com>
///
Expand All @@ -11,32 +11,13 @@
#ifndef CPUID_HPP
#define CPUID_HPP

// Enable on x86 and x86-64 CPUs
#if defined(__x86_64__) || \
defined(__i386__) || \
defined(_M_X64) || \
defined(_M_IX86)

// Both GCC and Clang (even Clang on Windows) define the __POPCNT__
// macro if the user compiles with -mpopcnt. The __POPCNT__
// macro is even defined if the user compiles with other flags
// such as -mavx or -march=native.
#if defined(__POPCNT__)
#define HAS_POPCNT
// The MSVC compiler does not support a POPCNT macro, but if the user
// compiles with e.g. /arch:AVX or /arch:AVX512 then MSVC defines
// the __AVX__ macro and POPCNT is also supported.
#elif defined(_MSC_VER) && defined(__AVX__)
#define HAS_POPCNT
#endif

#if defined(_MSC_VER)
#include <intrin.h>
#endif

namespace {

inline void run_CPUID(int eax, int ecx, int* abcd)
inline void run_cpuid(int eax, int ecx, int* abcd)
{
#if defined(_MSC_VER)
__cpuidex(abcd, eax, ecx);
Expand Down Expand Up @@ -69,26 +50,6 @@ inline void run_CPUID(int eax, int ecx, int* abcd)
#endif
}

#if !defined(HAS_POPCNT)
#define ENABLE_CPUID_POPCNT

inline bool run_CPUID_POPCNT()
{
// %ecx POPCNT bit flag
int bit_POPCNT = 1 << 23;
int abcd[4];

run_CPUID(1, 0, abcd);
return (abcd[2] & bit_POPCNT) == bit_POPCNT;
}

/// Initialized at startup
const bool HAS_CPUID_POPCNT = run_CPUID_POPCNT();

#endif // ENABLE_CPUID_POPCNT

} // namespace

#endif // x86 CPU

#endif
48 changes: 40 additions & 8 deletions include/primesieve/PrimeGenerator.hpp
Expand Up @@ -23,6 +23,20 @@
#include <stdint.h>
#include <cstddef>

#if defined(__AVX512F__) && \
defined(__AVX512VBMI__) && \
defined(__AVX512VBMI2__) && \
__has_include(<immintrin.h>)
#define ENABLE_AVX512

#elif defined(ENABLE_MULTIARCH_AVX512) && \
__has_include(<immintrin.h>)
#include "cpu_supports_avx512_vbmi2.hpp"
#define ENABLE_DEFAULT
#else
#define ENABLE_DEFAULT
#endif

namespace primesieve {

class PreSieve;
Expand All @@ -34,18 +48,36 @@ class PrimeGenerator : public Erat
void fillPrevPrimes(Vector<uint64_t>& primes, std::size_t* size);
static uint64_t maxCachedPrime();

#if defined(ENABLE_MULTIARCH_AVX512)
#define ENABLE_MULTIARCH_DEFAULT
__attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2")))
void fillNextPrimes(Vector<uint64_t>& primes, std::size_t* size);
ALWAYS_INLINE void fillNextPrimes(Vector<uint64_t>& primes, std::size_t* size)
{
#if defined(ENABLE_AVX512)
fillNextPrimes_avx512(primes, size);
#elif defined(ENABLE_MULTIARCH_AVX512)
if (cpu_supports_avx512_vbmi2)
fillNextPrimes_avx512(primes, size);
else
fillNextPrimes_default(primes, size);
#else
fillNextPrimes_default(primes, size);
#endif
}

private:

#if defined(ENABLE_DEFAULT)
void fillNextPrimes_default(Vector<uint64_t>& primes, std::size_t* size);
#endif

#if defined(ENABLE_MULTIARCH_DEFAULT)
__attribute__ ((target ("default")))
#if defined(ENABLE_AVX512) || \
defined(ENABLE_MULTIARCH_AVX512)

#if defined(ENABLE_MULTIARCH_AVX512)
__attribute__ ((target ("avx512f,avx512vbmi,avx512vbmi2")))
#endif
void fillNextPrimes_avx512(Vector<uint64_t>& primes, std::size_t* size);

#endif
void fillNextPrimes(Vector<uint64_t>& primes, std::size_t* size);

private:
bool isInit_ = false;
uint64_t low_ = 0;
uint64_t prime_ = 0;
Expand Down
85 changes: 85 additions & 0 deletions include/primesieve/cpu_supports_avx512_vbmi2.hpp
@@ -0,0 +1,85 @@
///
/// @file cpu_supports_avx512_vbmi2.hpp
/// @brief Detect if the x86 CPU supports AVX512 VBMI2.
///
/// Copyright (C) 2024 Kim Walisch, <kim.walisch@gmail.com>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///

#ifndef CPU_SUPPORTS_AVX512_VBMI2_HPP
#define CPU_SUPPORTS_AVX512_VBMI2_HPP

#include "cpuid.hpp"

#if defined(_MSC_VER)
#include <immintrin.h>
#endif

// %ebx bit flags
#define bit_AVX512F (1 << 16)

// %ecx bit flags
#define bit_AVX512VBMI (1 << 1)
#define bit_AVX512VBMI2 (1 << 6)

// xgetbv bit flags
#define XSTATE_SSE (1 << 1)
#define XSTATE_YMM (1 << 2)
#define XSTATE_ZMM (7 << 5)

namespace {

// Get Value of Extended Control Register
inline int get_xcr0()
{
int xcr0;

#if defined(_MSC_VER)
xcr0 = (int) _xgetbv(0);
#else
__asm__ ("xgetbv" : "=a" (xcr0) : "c" (0) : "%edx" );
#endif

return xcr0;
}

inline bool run_cpuid_avx512_vbmi2()
{
int abcd[4];

run_cpuid(1, 0, abcd);

int osxsave_mask = (1 << 27);

// Ensure OS supports extended processor state management
if ((abcd[2] & osxsave_mask) != osxsave_mask)
return false;

int ymm_mask = XSTATE_SSE | XSTATE_YMM;
int zmm_mask = XSTATE_SSE | XSTATE_YMM | XSTATE_ZMM;

int xcr0 = get_xcr0();

// Check AVX OS support
if ((xcr0 & ymm_mask) != ymm_mask)
return false;

// Check AVX512 OS support
if ((xcr0 & zmm_mask) != zmm_mask)
return false;

run_cpuid(7, 0, abcd);

// PrimeGenerator::fillNextPrimes() requires AVX512F, AVX512VBMI & AVX512VBMI2
return ((abcd[1] & bit_AVX512F) == bit_AVX512F &&
(abcd[2] & (bit_AVX512VBMI | bit_AVX512VBMI2)) == (bit_AVX512VBMI | bit_AVX512VBMI2));
}

/// Initialized at startup
bool cpu_supports_avx512_vbmi2 = run_cpuid_avx512_vbmi2();

} // namespace

#endif
58 changes: 58 additions & 0 deletions include/primesieve/cpu_supports_popcnt.hpp
@@ -0,0 +1,58 @@
///
/// @file cpu_supports_popcnt.hpp
/// @brief POPCNT detection fo x86 and x86-64 CPUs.
///
/// Copyright (C) 2024 Kim Walisch, <kim.walisch@gmail.com>
///
/// This file is distributed under the BSD License. See the COPYING
/// file in the top level directory.
///

#ifndef CPU_SUPPORTS_POPCNT_HPP
#define CPU_SUPPORTS_POPCNT_HPP

// Enable CPUID on x86 and x86-64 CPUs
#if defined(__x86_64__) || \
defined(__i386__) || \
defined(_M_X64) || \
defined(_M_IX86)

// Both GCC and Clang (even Clang on Windows) define the __POPCNT__
// macro if the user compiles with -mpopcnt. The __POPCNT__
// macro is even defined if the user compiles with other flags
// such as -mavx or -march=native.
#if defined(__POPCNT__)
#define HAS_POPCNT
// The MSVC compiler does not support a POPCNT macro, but if the user
// compiles with e.g. /arch:AVX or /arch:AVX512 then MSVC defines
// the __AVX__ macro and POPCNT is also supported.
#elif defined(_MSC_VER) && defined(__AVX__)
#define HAS_POPCNT
#endif

#if !defined(HAS_POPCNT)

#include "cpuid.hpp"
#define ENABLE_CPUID_POPCNT

namespace {

inline bool run_cpuid_supports_popcnt()
{
int abcd[4];
run_cpuid(1, 0, abcd);

// %ecx POPCNT bit flag
int bit_POPCNT = 1 << 23;
return (abcd[2] & bit_POPCNT) == bit_POPCNT;
}

/// Initialized at startup
bool cpu_supports_popcnt = run_cpuid_supports_popcnt();

} // namespace

#endif // !defined(HAS_POPCNT)
#endif // CPUID

#endif

0 comments on commit 17a49bd

Please sign in to comment.