Skip to content

Commit

Permalink
Uses function multiversioning for popcnt
Browse files Browse the repository at this point in the history
Currently, __builtin_popcount is used if __POPCNT__ is defined.
However, __builtin_popcount cannot be inlined and requires the -march compiler option.
This patch inlines popcount when function multiversioning is enabled.

Classify Benchmark Results
(euclid_lsh, hash_num=8192, haswell@2.4GHz, gcc5.3.0, avx2):

20news[1]:
  before: 18.2289[ms/classify]
   after: 10.2199 (-8.009ms, -43.94%)

dorothea_train[2]:
  before: 38.0193[ms/classify]
   after: 37.8907 (-0.127ms, -0.334%)

iris[3]:
  before: 0.34379[ms/classify]
  after:  0.29355 (-0.050ms, -14.61%)

[1] http://qwone.com/~jason/20Newsgroups/
[2] http://archive.ics.uci.edu/ml/datasets/Dorothea
[3] https://archive.ics.uci.edu/ml/datasets/Iris
  • Loading branch information
Kazuki OIKAWA committed Apr 15, 2016
1 parent a39b2ce commit 1e16228
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 33 deletions.
120 changes: 120 additions & 0 deletions jubatus/core/storage/bit_vector.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// Jubatus: Online machine learning framework for distributed environment
// Copyright (C) 2016 Preferred Networks and Nippon Telegraph and Telephone Corporation.
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License version 2.1 as published by the Free Software Foundation.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#include "bit_vector.hpp"

#ifdef JUBATUS_USE_FMV
#include <nmmintrin.h>
#endif

namespace jubatus {
namespace core {
namespace storage {
namespace detail {
namespace {

// Portable version of the Hamming-distance kernel.
//
// XORs the first `blocks` 64-bit words of `x` and `y` pairwise and sums the
// results of bitcount() (declared in bit_vector.hpp) over those XORed words.
// When JUBATUS_USE_FMV is defined this definition is tagged as the "default"
// target version of the function.
#ifdef JUBATUS_USE_FMV
__attribute__((target("default")))
#endif
size_t calc_hamming_distance_impl(const uint64_t *x,
                                  const uint64_t *y,
                                  size_t blocks) {
  size_t differing_bits = 0;
  const uint64_t *const x_end = x + blocks;
  // Walk both arrays in lock step, one 64-bit word at a time.
  while (x != x_end) {
    const uint64_t diff = *x ^ *y;
    differing_bits += bitcount(diff);
    ++x;
    ++y;
  }
  return differing_bits;
}

// Portable version of the bit-count kernel.
//
// Sums bitcount() (declared in bit_vector.hpp) over the first `blocks`
// 64-bit words of `x`.  When JUBATUS_USE_FMV is defined this definition is
// tagged as the "default" target version of the function.
#ifdef JUBATUS_USE_FMV
__attribute__((target("default")))
#endif
size_t bit_count_impl(const uint64_t *x, size_t blocks) {
  size_t set_bits = 0;
  const uint64_t *const end = x + blocks;
  for (const uint64_t *word = x; word != end; ++word) {
    set_bits += bitcount(*word);
  }
  return set_bits;
}

#ifdef JUBATUS_USE_FMV
// POPCNT-targeted version of the Hamming-distance kernel.
//
// Returns the total number of set bits in x[i] ^ y[i] over the first
// `blocks` 64-bit words of `x` and `y`.
//
// On x86_64 the 64-bit popcount intrinsic is used and the loop is unrolled
// four words per iteration; elsewhere each word is split into its two
// 32-bit halves and counted with the 32-bit intrinsic.
__attribute__((target("popcnt")))
size_t calc_hamming_distance_impl(const uint64_t *x,
                                  const uint64_t *y,
                                  size_t blocks) {
  size_t match_num = 0;
#ifdef __x86_64__
  // Largest multiple of four that does not exceed `blocks`.  Computing the
  // bound up front keeps the index unsigned (no signed/unsigned comparison,
  // no dependency on the non-standard ssize_t) and cannot wrap around.
  const size_t unrolled_end = blocks - (blocks % 4);
  size_t i = 0;
  for (; i < unrolled_end; i += 4) {
    match_num += _mm_popcnt_u64(x[i] ^ y[i]);
    match_num += _mm_popcnt_u64(x[i + 1] ^ y[i + 1]);
    match_num += _mm_popcnt_u64(x[i + 2] ^ y[i + 2]);
    match_num += _mm_popcnt_u64(x[i + 3] ^ y[i + 3]);
  }
  // Remaining zero to three trailing words.
  for (; i < blocks; ++i) {
    match_num += _mm_popcnt_u64(x[i] ^ y[i]);
  }
#else  // #ifdef __x86_64__
  // Count the low and high 32-bit halves of each XORed word separately.
  // The halves are extracted by shifting rather than by reading the
  // uint64_t storage through a uint32_t pointer, which would violate the
  // strict-aliasing rule.
  for (size_t i = 0; i < blocks; ++i) {
    const uint64_t diff = x[i] ^ y[i];
    match_num += _mm_popcnt_u32(static_cast<uint32_t>(diff));
    match_num += _mm_popcnt_u32(static_cast<uint32_t>(diff >> 32));
  }
#endif  // #ifdef __x86_64__
  return match_num;
}

// POPCNT-targeted version of the bit-count kernel.
//
// Returns the total number of set bits in the first `blocks` 64-bit words
// of `x`.
//
// On x86_64 the 64-bit popcount intrinsic is used and the loop is unrolled
// four words per iteration; elsewhere each word is split into its two
// 32-bit halves and counted with the 32-bit intrinsic.
__attribute__((target("popcnt")))
size_t bit_count_impl(const uint64_t *x, size_t blocks) {
  size_t result = 0;
#ifdef __x86_64__
  // Largest multiple of four that does not exceed `blocks`.  Computing the
  // bound up front keeps the index unsigned (no signed/unsigned comparison,
  // no dependency on the non-standard ssize_t) and cannot wrap around.
  const size_t unrolled_end = blocks - (blocks % 4);
  size_t i = 0;
  for (; i < unrolled_end; i += 4) {
    result += _mm_popcnt_u64(x[i]);
    result += _mm_popcnt_u64(x[i + 1]);
    result += _mm_popcnt_u64(x[i + 2]);
    result += _mm_popcnt_u64(x[i + 3]);
  }
  // Remaining zero to three trailing words.
  for (; i < blocks; ++i) {
    result += _mm_popcnt_u64(x[i]);
  }
#else  // #ifdef __x86_64__
  // Count the low and high 32-bit halves of each word separately.  The
  // halves are extracted by shifting rather than by reading the uint64_t
  // storage through a uint32_t pointer, which would violate the
  // strict-aliasing rule.
  for (size_t i = 0; i < blocks; ++i) {
    const uint64_t word = x[i];
    result += _mm_popcnt_u32(static_cast<uint32_t>(word));
    result += _mm_popcnt_u32(static_cast<uint32_t>(word >> 32));
  }
#endif  // #ifdef __x86_64__
  return result;
}
#endif

} // namespace

// Out-of-line entry point declared in bit_vector.hpp.  Forwards its
// arguments unchanged to calc_hamming_distance_impl() from the anonymous
// namespace above and returns that result.
size_t calc_hamming_distance_internal(
    const uint64_t *x, const uint64_t *y, size_t blocks) {
  const size_t distance = calc_hamming_distance_impl(x, y, blocks);
  return distance;
}

// Out-of-line entry point declared in bit_vector.hpp.  Forwards its
// arguments unchanged to bit_count_impl() from the anonymous namespace
// above and returns that result.
size_t bit_count_internal(const uint64_t *x, size_t blocks) {
  const size_t set_bits = bit_count_impl(x, blocks);
  return set_bits;
}

} // namespace detail
} // namespace storage
} // namespace core
} // namespace jubatus
39 changes: 7 additions & 32 deletions jubatus/core/storage/bit_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,29 +74,9 @@ struct bitcount_impl<T, 8> {
}
};

#ifdef __POPCNT__

inline int fast_bitcount(unsigned bits) {
return __builtin_popcount(bits);
}

inline int fast_bitcount(unsigned long bits) { // NOLINT
return __builtin_popcountl(bits);
}

inline int fast_bitcount(unsigned long long bits) { // NOLINT
return __builtin_popcountll(bits);
}

#endif

template <class T>
inline int bitcount_dispatcher(T bits) {
#ifdef __POPCNT__
return fast_bitcount(bits);
#else
return bitcount_impl<T, sizeof(T)>::call(bits);
#endif
}

inline int bitcount(unsigned bits) {
Expand All @@ -114,6 +94,10 @@ inline int bitcount(unsigned long long bits) { // NOLINT
template <class T>
inline int bitcount(T); // = delete;

size_t calc_hamming_distance_internal(
const uint64_t *x, const uint64_t *y, size_t blocks);
size_t bit_count_internal(const uint64_t *x, size_t blocks);

} // namespace detail

class bit_vector_unmatch_exception
Expand Down Expand Up @@ -323,12 +307,8 @@ struct bit_vector_base {
uint64_t calc_hamming_distance_unsafe(const bit_base *bv) const {
if (bits_ == NULL)
return bit_count_unsafe(bv, used_bytes());
size_t match_num = 0;
for (size_t i = 0, blocks = used_bytes() / sizeof(bit_base);
i < blocks; ++i) {
match_num += detail::bitcount(bits_[i] ^ bv[i]);
}
return match_num;
return detail::calc_hamming_distance_internal(
bits_, bv, used_bytes() / sizeof(bit_base));
}
size_t bit_count() const {
return bit_count_unsafe(bits_, used_bytes());
Expand Down Expand Up @@ -460,12 +440,7 @@ struct bit_vector_base {
if (bits == NULL) {
return 0;
}
size_t result = 0;
for (size_t i = 0, blocks = bytes / sizeof(bit_base);
i < blocks; ++i) {
result += detail::bitcount(bits[i]);
}
return result;
return detail::bit_count_internal(bits, bytes / sizeof(bit_base));
}

bit_base* bits_;
Expand Down
3 changes: 2 additions & 1 deletion jubatus/core/storage/wscript
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ def build(bld):
'bit_index_storage.cpp',
'lsh_vector.cpp',
'lsh_util.cpp',
'lsh_index_storage.cpp'
'lsh_index_storage.cpp',
'bit_vector.cpp'
]
headers = [
'abstract_column.hpp',
Expand Down

0 comments on commit 1e16228

Please sign in to comment.