From ca9033cd47fef8d7343bc8974aeffd5fa2f492af Mon Sep 17 00:00:00 2001
From: Laurynas Biveinis
Date: Mon, 21 Jun 2021 09:06:59 +0300
Subject: [PATCH] WIP SIMD'ify Node4 insert position search

---
 art_internal_impl.hpp | 39 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/art_internal_impl.hpp b/art_internal_impl.hpp
index 5cd33849..2a667f8a 100644
--- a/art_internal_impl.hpp
+++ b/art_internal_impl.hpp
@@ -82,6 +82,14 @@ inline auto _mm_cmple_epu8(__m128i x, __m128i y) noexcept {
   return _mm_cmpeq_epi8(_mm_max_epu8(y, x), y);
 }
 
+// Per-byte unsigned x < y. SSE2 only provides signed byte comparisons, so
+// both operands are biased by -128 (which flips each byte's top bit) first.
+// Adapted from https://stackoverflow.com/a/24234695/80458
+inline auto _mm_cmplt_epu8(__m128i x, __m128i y) noexcept {
+  return _mm_cmplt_epi8(_mm_add_epi8(x, _mm_set1_epi8(-128)),
+                        _mm_add_epi8(y, _mm_set1_epi8(-128)));
+}
+
 #else // #ifdef __x86_64
 
 // From public domain
@@ -1058,11 +1066,27 @@ class basic_inode_4 : public basic_inode_4_parent {
     const auto key_byte =
         static_cast<std::uint8_t>(leaf_type::key(child.get())[depth]);
 
+#ifdef __x86_64
+    // Compare the packed key bytes with the new key byte in one go; the
+    // number of smaller keys is the insert position.
+    const auto replicated_insert_key =
+        _mm_set1_epi8(static_cast<char>(key_byte));
+    const auto keys_in_sse_reg =
+        _mm_cvtsi32_si128(static_cast<int>(keys.integer.load()));
+    const auto lt_node_key_positions =
+        _mm_cmplt_epu8(keys_in_sse_reg, replicated_insert_key);
+    // Keep only the first three lanes, exactly as the scalar path below does.
+    const auto bit_field =
+        static_cast<unsigned>(_mm_movemask_epi8(lt_node_key_positions)) & 0x7U;
+    const auto insert_pos_index =
+        static_cast<unsigned>(__builtin_popcount(bit_field));
+#else
     const auto first_lt = ((keys.integer & 0xFFU) < key_byte) ? 1 : 0;
     const auto second_lt = (((keys.integer >> 8U) & 0xFFU) < key_byte) ? 1 : 0;
     const auto third_lt = ((keys.integer >> 16U) & 0xFFU) < key_byte ? 1 : 0;
     const auto insert_pos_index =
         static_cast<unsigned>(first_lt + second_lt + third_lt);
+#endif
 
     for (typename decltype(keys.byte_array)::size_type i = children_count;
          i > insert_pos_index; --i) {
@@ -1256,6 +1280,20 @@ class basic_inode_16 : public basic_inode_16_parent {
     const auto key_byte =
         static_cast<std::uint8_t>(leaf_type::key(child.get())[depth]);
 
+#ifdef __x86_64
+    // All four key bytes of the source node take part in the comparison,
+    // matching the four comparisons of the scalar path below.
+    const auto replicated_insert_key =
+        _mm_set1_epi8(static_cast<char>(key_byte));
+    const auto keys_in_sse_reg = _mm_cvtsi32_si128(
+        static_cast<int>(source_node->keys.integer.load()));
+    const auto lt_node_key_positions =
+        _mm_cmplt_epu8(keys_in_sse_reg, replicated_insert_key);
+    const auto bit_field =
+        static_cast<unsigned>(_mm_movemask_epi8(lt_node_key_positions)) & 0xFU;
+    const auto insert_pos_index =
+        static_cast<unsigned>(__builtin_popcount(bit_field));
+#else
     const auto keys_integer = source_node->keys.integer.load();
     const auto first_lt = ((keys_integer & 0xFFU) < key_byte) ? 1 : 0;
     const auto second_lt = (((keys_integer >> 8U) & 0xFFU) < key_byte) ? 1 : 0;
@@ -1263,6 +1301,7 @@ class basic_inode_16 : public basic_inode_16_parent {
     const auto fourth_lt = (((keys_integer >> 24U) & 0xFFU) < key_byte) ? 1 : 0;
     const auto insert_pos_index =
         static_cast<unsigned>(first_lt + second_lt + third_lt + fourth_lt);
+#endif
 
     unsigned i = 0;
     for (; i < insert_pos_index; ++i) {