Skip to content

Commit

Permalink
Merge pull request #435 from laurynas-biveinis/n48-avx2-experiment
Browse files Browse the repository at this point in the history
Add AVX2 support, use it for N48 insert position search
  • Loading branch information
laurynas-biveinis committed Apr 27, 2022
2 parents 502863b + a53f02e commit 2240690
Show file tree
Hide file tree
Showing 8 changed files with 79 additions and 17 deletions.
24 changes: 22 additions & 2 deletions .github/workflows/build.yml
Expand Up @@ -11,6 +11,7 @@ env:
DEFAULT_STATIC_ANALYSIS: OFF
DEFAULT_CPPLINT: OFF
DEFAULT_COVERAGE: OFF
DEFAULT_AVX2: ON

jobs:
build:
Expand All @@ -26,6 +27,7 @@ jobs:
COMPILER: ${{matrix.COMPILER}}
CPPLINT: ${{matrix.CPPLINT}}
CPPCHECK: ${{matrix.CPPCHECK}}
AVX2: ${{matrix.AVX2}}

strategy:
fail-fast: false
Expand Down Expand Up @@ -77,6 +79,12 @@ jobs:
COMPILER: gcc
SANITIZE_UB: ON

- name: GCC 11 Debug without AVX2
os: ubuntu-20.04
BUILD_TYPE: Debug
COMPILER: gcc
AVX2: OFF

- name: GCC 11 Release static analysis & cpplint
os: ubuntu-20.04
BUILD_TYPE: Release
Expand Down Expand Up @@ -152,60 +160,70 @@ jobs:
os: macos-latest
BUILD_TYPE: Release
COMPILER: macos-clang
AVX2: OFF

- name: XCode Release with ASan
os: macos-latest
BUILD_TYPE: Release
COMPILER: macos-clang
SANITIZE_ADDRESS: ON
AVX2: OFF

- name: XCode Release with TSan
os: macos-latest
BUILD_TYPE: Release
COMPILER: macos-clang
SANITIZE_THREAD: ON
AVX2: OFF

- name: XCode Release with UBSan
os: macos-latest
BUILD_TYPE: Release
COMPILER: macos-clang
SANITIZE_UB: ON
AVX2: OFF

- name: XCode Debug with cppcheck
os: macos-latest
BUILD_TYPE: Debug
COMPILER: macos-clang
CPPCHECK: ON
AVX2: OFF

- name: XCode Debug with ASan
os: macos-latest
BUILD_TYPE: Debug
COMPILER: macos-clang
SANITIZE_ADDRESS: ON
AVX2: OFF

- name: XCode Debug with TSan
os: macos-latest
BUILD_TYPE: Debug
COMPILER: macos-clang
SANITIZE_THREAD: ON
AVX2: OFF

- name: XCode Debug with UBSan
os: macos-latest
BUILD_TYPE: Debug
COMPILER: macos-clang
SANITIZE_UB: ON
AVX2: OFF

- name: Debug coverage
os: macos-latest
BUILD_TYPE: Debug
COMPILER: gcc
COVERAGE: ON
AVX2: OFF

- name: Release coverage
os: macos-latest
BUILD_TYPE: Release
COMPILER: gcc
COVERAGE: ON
AVX2: OFF

steps:
- uses: actions/checkout@v2
Expand Down Expand Up @@ -290,6 +308,7 @@ jobs:
SANITIZE_UB="${SANITIZE_UB:-$DEFAULT_SANITIZE_UB}"
STATIC_ANALYSIS="${STATIC_ANALYSIS:-$DEFAULT_STATIC_ANALYSIS}"
COVERAGE="${COVERAGE:-$DEFAULT_COVERAGE}"
AVX2="${AVX2:-$DEFAULT_AVX2}"
export PATH="$HOME/.local/bin:$PATH"
if [[ $COMPILER == "gcc" ]]; then
export CC=gcc-11
Expand Down Expand Up @@ -327,7 +346,7 @@ jobs:
"-DSANITIZE_THREAD=${SANITIZE_THREAD}" \
"-DSANITIZE_UB=${SANITIZE_UB}" \
"-DSTATIC_ANALYSIS=${STATIC_ANALYSIS}" "-DCOVERAGE=${COVERAGE}" \
"${EXTRA_CMAKE_ARGS[@]}"
"-DAVX2=${AVX2}" "${EXTRA_CMAKE_ARGS[@]}"
- name: Build
working-directory: ${{github.workspace}}/build
Expand Down Expand Up @@ -367,7 +386,8 @@ jobs:
- name: Valgrind test
working-directory: ${{github.workspace}}/build
run: |
sudo apt-get install -y valgrind
sudo apt-get install -y libc6-dbg
sudo snap install --classic valgrind
make valgrind
if: >
env.SANITIZE_ADDRESS != 'ON' && env.SANITIZE_THREAD != 'ON'
Expand Down
3 changes: 2 additions & 1 deletion .github/workflows/old-compilers.yml
Expand Up @@ -264,7 +264,8 @@ jobs:
- name: Valgrind test
working-directory: ${{github.workspace}}/build
run: |
sudo apt-get install -y valgrind
sudo apt-get install -y libc6-dbg
sudo snap install --classic valgrind
make valgrind
if: >
env.SANITIZE_ADDRESS != 'ON' && env.SANITIZE_THREAD != 'ON'
Expand Down
12 changes: 8 additions & 4 deletions CMakeLists.txt
Expand Up @@ -71,13 +71,16 @@ set(GCC_GE_11_CXX_WARNING_FLAGS

set(UNIX_CXX_FLAGS "-g")

if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
list(APPEND UNIX_CXX_FLAGS "-msse4.1")
option(AVX2 "Enable AVX2 instructions on x86_64" ON)
if(AVX2)
message(STATUS "Using AVX2 instructions on x86_64")
else()
message(STATUS "Using SSE4.1 instructions on x86_64")
endif()

if(MSVC)
# Remove it once CMake minimum is bumped to 3.15 or greater
string(REGEX REPLACE "/W3" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
string(REGEX REPLACE "/W3" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
endif()

# Disable the following warnings for MSVC:
Expand Down Expand Up @@ -463,6 +466,7 @@ function(COMMON_TARGET_PROPERTIES TARGET)
# on 3.14.
"$<$<OR:$<CXX_COMPILER_ID:MSVC>,$<AND:$<PLATFORM_ID:Windows>,$<CXX_COMPILER_ID:Clang>>>:${MSVC_CLANG_CXX_FLAGS}>"
"$<$<NOT:$<PLATFORM_ID:Windows>>:${UNIX_CXX_FLAGS}>"
"$<$<AND:$<NOT:$<PLATFORM_ID:Windows>>,$<STREQUAL:${CMAKE_SYSTEM_PROCESSOR},x86_64>>:$<IF:$<BOOL:${AVX2}>,-mavx2,-msse4.1>>"
"$<$<BOOL:${FATAL_WARNINGS}>:$<IF:$<OR:$<CXX_COMPILER_ID:MSVC>,$<AND:$<PLATFORM_ID:Windows>,$<CXX_COMPILER_ID:Clang>>>,/WX,-Werror>>"
"$<$<OR:$<CXX_COMPILER_ID:MSVC>,$<AND:$<PLATFORM_ID:Windows>,$<CXX_COMPILER_ID:Clang>>>:${MSVC_CXX_WARNING_FLAGS}>"
"$<$<AND:$<PLATFORM_ID:Windows>,$<CXX_COMPILER_ID:Clang>>:${MSVC_CLANGS_ONLY_WARNING_FLAGS}>"
Expand Down Expand Up @@ -563,7 +567,7 @@ if(LIBFUZZER_AVAILABLE)
endif()

set(VALGRIND_COMMAND "valgrind" "--error-exitcode=1" "--leak-check=full"
"--trace-children=yes")
"--trace-children=yes" "-v")

add_custom_target(valgrind DEPENDS valgrind_tests valgrind_benchmarks)

Expand Down
7 changes: 5 additions & 2 deletions README.md
Expand Up @@ -19,8 +19,9 @@ and I am trying to describe some of the things I learned at my [blog](https://of

## Requirements

The source code is C++17, using SSE4.1 intrinsics (Nehalem and higher) or AVX
in the case of MSVC. This is in contrast to the original ART paper needing SSE2
The source code is C++17, using SSE4.1 intrinsics (Nehalem and higher), AVX in
the case of MSVC, with optional AVX2 support, if available. This is in contrast
to the original ART paper needing SSE2
only.

Note: since this is my personal project, it only supports GCC 10, 11, LLVM 11 to
Expand Down Expand Up @@ -112,6 +113,8 @@ clang-tidy, cppcheck, and cpplint will be invoked automatically during build if
found. Currently the diagnostic level for them as well as for compiler warnings
is set very high, and can be relaxed, especially for clang-tidy, as need arises.

To disable AVX2 intrinsics to use SSE4.1/AVX only, add `-DWITH_AVX2=OFF`.

To enable AddressSanitizer and LeakSanitizer (the latter if available), add
`-DSANITIZE_ADDRESS=ON` CMake option. It is incompatible with
`-DSANITIZE_THREAD=ON`.
Expand Down
4 changes: 4 additions & 0 deletions art.cpp
Expand Up @@ -138,7 +138,11 @@ class [[nodiscard]] inode_48 final
}
};

#ifdef UNODB_DETAIL_AVX2
static_assert(sizeof(inode_48) == 672);
#else
static_assert(sizeof(inode_48) == 656);
#endif

class [[nodiscard]] inode_256 final
: public unodb::detail::basic_inode_256<art_policy> {
Expand Down
24 changes: 22 additions & 2 deletions art_internal_impl.hpp
Expand Up @@ -19,6 +19,9 @@
#include <emmintrin.h>
#include <smmintrin.h>
#endif
#ifdef UNODB_DETAIL_AVX2
#include <immintrin.h>
#endif

#include <gsl/util>

Expand Down Expand Up @@ -1529,7 +1532,7 @@ class basic_inode_48 : public basic_inode_48_parent<ArtPolicy> {
const auto key_byte = static_cast<uint8_t>(child->get_key()[depth]);
UNODB_DETAIL_ASSERT(child_indexes[key_byte] == empty_child);
unsigned i{0};
#ifdef UNODB_DETAIL_X86_64
#ifdef UNODB_DETAIL_SSE4_2
const auto nullptr_vector = _mm_setzero_si128();
while (true) {
const auto ptr_vec0 = _mm_load_si128(&children.pointer_vector[i]);
Expand All @@ -1553,6 +1556,19 @@ class basic_inode_48 : public basic_inode_48_parent<ArtPolicy> {
}
i += 4;
}
#elif defined(UNODB_DETAIL_AVX2)
const auto nullptr_vector = _mm256_setzero_si256();
while (true) {
const auto ptr_vec = _mm256_load_si256(&children.pointer_vector[i]);
const auto vec_cmp = _mm256_cmpeq_epi64(ptr_vec, nullptr_vector);
const auto cmp_mask =
static_cast<std::uint64_t>(_mm256_movemask_epi8(vec_cmp));
if (cmp_mask != 0) {
i = (i << 2U) + (detail::ctz64(cmp_mask) >> 3U);
break;
}
++i;
}
#else // #ifdef UNODB_DETAIL_X86_64
// This is also the current best ARM implementation
node_ptr child_ptr;
Expand Down Expand Up @@ -1719,14 +1735,18 @@ class basic_inode_48 : public basic_inode_48_parent<ArtPolicy> {
union children_union {
std::array<critical_section_policy<node_ptr>, basic_inode_48::capacity>
pointer_array;
#ifdef UNODB_DETAIL_X86_64
#ifdef UNODB_DETAIL_SSE4_2
static_assert(basic_inode_48::capacity % 2 == 0);
static_assert((basic_inode_48::capacity / 2) % 4 == 0,
"Node48 capacity must support unrolling without remainder");
// No std::array below because it would ignore the alignment attribute
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
__m128i
pointer_vector[basic_inode_48::capacity / 2]; // NOLINT(runtime/arrays)
#elif defined(UNODB_DETAIL_AVX2)
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
__m256i
pointer_vector[basic_inode_48::capacity / 4]; // NOLINT(runtime/arrays)
#endif

UNODB_DETAIL_DISABLE_MSVC_WARNING(26495)
Expand Down
8 changes: 8 additions & 0 deletions global.hpp
Expand Up @@ -46,6 +46,14 @@
#define UNODB_DETAIL_X86_64
#endif

#ifdef UNODB_DETAIL_X86_64
#ifdef __AVX2__
#define UNODB_DETAIL_AVX2
#else
#define UNODB_DETAIL_SSE4_2
#endif
#endif

#if defined(UNODB_DETAIL_X86_64) || \
defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define UNODB_DETAIL_LITTLE_ENDIAN
Expand Down
14 changes: 8 additions & 6 deletions olc_art.cpp
Expand Up @@ -453,20 +453,22 @@ class [[nodiscard]] olc_inode_48 final
UNODB_DETAIL_RESTORE_MSVC_WARNINGS()
};

// 656 == sizeof(inode_48)
// sizeof(inode_48) == 672 on AVX2, 656 otherwise
#ifdef NDEBUG
#ifdef __aarch64__
static_assert(sizeof(olc_inode_48) == 656 + 8);
#else // #ifdef __aarch64__
static_assert(sizeof(olc_inode_48) == 656 + 16);
#endif // #ifdef __aarch64__
#else
static_assert(sizeof(olc_inode_48) == 656 + 16); // AVX2 too. Padding?
#endif
#else // #ifdef NDEBUG
#ifdef __aarch64__
static_assert(sizeof(olc_inode_48) == 656 + 24);
#else // #ifdef __aarch64__
#elif defined(UNODB_DETAIL_AVX2)
static_assert(sizeof(olc_inode_48) == 672 + 32);
#else
static_assert(sizeof(olc_inode_48) == 656 + 32);
#endif // #ifdef __aarch64__
#endif
#endif // #ifdef NDEBUG

UNODB_DETAIL_DISABLE_MSVC_WARNING(26434)

Expand Down

0 comments on commit 2240690

Please sign in to comment.