Skip to content

Commit

Permalink
Combined performance patch (5% overall, 15% stage 1) (#317)
Browse files Browse the repository at this point in the history
* Allow -f

* Support parse -s (force sse)

* Simplify flatten_bits

- Add directly to base instead of storing variable
- Don't modify base_ptr after beginning of function
- Eliminate base variable and increment base_ptr instead

* De-unroll the flatten_bits loops

* Decrease dependencies in stage 1

- Do all finalize_structurals work before computing the quote mask; mask
  out the quote mask later
- Join find_whitespace_and_structurals and finalize_structurals into
  single find_structurals call, to reduce variable leakage
- Rework pseudo_pred algorithm to refer to "primitive" for clarity and some
  dependency reduction
- Rename quote_mask to in_string to describe what we're trying to
  achieve ("mask" could mean many things)
- Break up find_quote_mask_and_bits into find_quote_mask and
  invalid_string_bytes to reduce data leakage (i.e. don't expose quote bits
  or odd_ends at all to find_structural_bits)
- Genericize overflow methods "follows" and "follows_odd_sequence" for
  descriptiveness and possible lifting into a generic simd parsing library

* Mark branches as likely/unlikely

* Reorder and unroll+interleave stage 1 loop

* Nest the cnt > 16 branch inside cnt > 8
  • Loading branch information
jkeiser authored and lemire committed Oct 1, 2019
1 parent 53b6dea commit de8df0a
Show file tree
Hide file tree
Showing 13 changed files with 412 additions and 370 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
/jsoncheck
/jsonpointer
/jsonstats
/integer_tests
/libsimdjson.so*
/minify
/numberparsingcheck
Expand Down
34 changes: 24 additions & 10 deletions benchmark/parse.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,18 @@
#include "simdjson/parsedjson.h"
#include "simdjson/stage1_find_marks.h"
#include "simdjson/stage2_build_tape.h"

// Global arguments
// Benchmark settings filled in from the command line by main(). They live at
// file scope so the dispatch helpers in namespace simdjson below can read them.

// -f: makes unified_machine_dispatch() return SUCCESS immediately instead of
// dispatching to an implementation (presumably so that only the stage 1
// "find marks" work is timed -- confirm against the benchmark loop).
bool find_marks_only = false;
// NOTE(review): the option handling for verbose, dump and json_output is not
// visible in this excerpt; their meaning is inferred from the names only.
bool verbose = false;
bool dump = false;
bool json_output = false;
// -1: set when the '1' option is given.
bool force_one_iteration = false;
// -t: set when the 't' option is given.
bool just_data = false;
// -s: stops _find_best_supported_implementation() from returning
// Architecture::HASWELL, so selection falls through to the later checks.
bool force_sse = false;
// -n <count>: parsed with atoi(); -1 means the option was not given.
int32_t iterations = -1;
// -w <count>: parsed with atoi(); -1 means the option was not given.
int32_t warmup_iterations = -1;

namespace simdjson {
Architecture _find_best_supported_implementation() {
constexpr uint32_t haswell_flags =
Expand All @@ -43,7 +55,7 @@ Architecture _find_best_supported_implementation() {
instruction_set::SSE42 | instruction_set::PCLMULQDQ;
uint32_t supports = detect_supported_architectures();
// Order from best to worst (within architecture)
if ((haswell_flags & supports) == haswell_flags) {
if ((haswell_flags & supports) == haswell_flags && !force_sse) {
return Architecture::HASWELL;
}
if ((westmere_flags & supports) == westmere_flags) {
Expand All @@ -63,6 +75,9 @@ extern unified_functype *unified_ptr;
extern stage1_functype *stage1_ptr;

int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
if (find_marks_only) {
return simdjson::SUCCESS;
}
Architecture best_implementation = _find_best_supported_implementation();
// Selecting the best implementation
switch (best_implementation) {
Expand Down Expand Up @@ -118,25 +133,21 @@ unified_functype *unified_ptr = &unified_machine_dispatch;
} // namespace simdjson

int main(int argc, char *argv[]) {
bool verbose = false;
bool dump = false;
bool json_output = false;
bool force_one_iteration = false;
bool just_data = false;
int32_t iterations = -1;
int32_t warmup_iterations = -1;

#ifndef _MSC_VER
int c;

while ((c = getopt(argc, argv, "1vdtn:w:")) != -1) {
while ((c = getopt(argc, argv, "1vdtn:w:fs")) != -1) {
switch (c) {
case 'n':
iterations = atoi(optarg);
break;
case 'w':
warmup_iterations = atoi(optarg);
break;
case 's':
force_sse = true;
break;
case 't':
just_data = true;
break;
Expand All @@ -152,6 +163,9 @@ int main(int argc, char *argv[]) {
case '1':
force_one_iteration = true;
break;
case 'f':
find_marks_only = true;
break;
default:
abort();
}
Expand Down Expand Up @@ -326,7 +340,7 @@ int main(int argc, char *argv[]) {
isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
simdjson::SUCCESS);
isok = isok &&
(simdjson::SUCCESS ==
(simdjson::SUCCESS ==
simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double> secs = end - start;
Expand Down
11 changes: 11 additions & 0 deletions include/simdjson/common_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,17 @@
#define SIMDJSON_PADDING 32
#endif

#if defined(__GNUC__)
// Region markers for llvm-mca: each macro emits an assembly comment of the
// form "# LLVM-MCA-BEGIN <name>" / "# LLVM-MCA-END <name>" so the analyzer can
// attribute the instructions in between to the named block.
//
// NOTE: every macro already ends in a semicolon, so call sites may be written
// with or without one of their own.
#define BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
#define END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
// Runs `block` bracketed by the BEGIN/END markers for `name`.
#define DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name);
#else
// Without GNU-style inline asm the markers disappear entirely...
#define BEGIN_DEBUG_BLOCK(name)
#define END_DEBUG_BLOCK(name)
// ...but the wrapped code must still execute; only the markers are dropped.
#define DEBUG_BLOCK(name, block) block;
#endif

#ifndef _MSC_VER
// Implemented using Labels as Values which works in GCC and CLANG (and maybe
// also in Intel's compiler), but won't work in MSVC.
Expand Down
4 changes: 2 additions & 2 deletions scripts/checkperf.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,5 @@ make parse
make perfdiff

echo "Running perfdiff:"
echo ./perfdiff \"$current/parse -t $perftests\" \"$reference/parse -t $perftests\"
./perfdiff "$current/parse -t $perftests" "$reference/parse -t $perftests"
echo ./perfdiff \"$current/parse -t $perftests $CHECKPERF_ARGS\" \"$reference/parse -t $perftests $CHECKPERF_ARGS\"
./perfdiff "$current/parse -t $perftests $CHECKPERF_ARGS" "$reference/parse -t $perftests $CHECKPERF_ARGS"
48 changes: 27 additions & 21 deletions src/arm64/simd_input.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,33 +40,32 @@ using namespace simdjson::arm64;

template <>
struct simd_input<Architecture::ARM64> {
uint8x16_t chunks[4];
const uint8x16_t chunks[4];

really_inline simd_input(const uint8_t *ptr) {
this->chunks[0] = vld1q_u8(ptr + 0*16);
this->chunks[1] = vld1q_u8(ptr + 1*16);
this->chunks[2] = vld1q_u8(ptr + 2*16);
this->chunks[3] = vld1q_u8(ptr + 3*16);
}
really_inline simd_input()
: chunks{uint8x16_t(), uint8x16_t(), uint8x16_t(), uint8x16_t() } {}

really_inline simd_input(uint8x16_t chunk0, uint8x16_t chunk1, uint8x16_t chunk2, uint8x16_t chunk3) {
this->chunks[0] = chunk0;
this->chunks[1] = chunk1;
this->chunks[2] = chunk2;
this->chunks[3] = chunk3;
}
really_inline simd_input(const uint8x16_t chunk0, const uint8x16_t chunk1, const uint8x16_t chunk2, const uint8x16_t chunk3)
: chunks{chunk0, chunk1, chunk2, chunk3 } {}

really_inline simd_input(const uint8_t *ptr)
: chunks{
vld1q_u8(ptr + 0*16),
vld1q_u8(ptr + 1*16),
vld1q_u8(ptr + 2*16),
vld1q_u8(ptr + 3*16)
} {}

template <typename F>
really_inline void each(F const& each_chunk)
{
really_inline void each(F const& each_chunk) const {
each_chunk(this->chunks[0]);
each_chunk(this->chunks[1]);
each_chunk(this->chunks[2]);
each_chunk(this->chunks[3]);
}

template <typename F>
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) const {
return simd_input<Architecture::ARM64>(
map_chunk(this->chunks[0]),
map_chunk(this->chunks[1]),
Expand All @@ -76,7 +75,7 @@ struct simd_input<Architecture::ARM64> {
}

template <typename F>
really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) {
really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) const {
return simd_input<Architecture::ARM64>(
map_chunk(this->chunks[0], b.chunks[0]),
map_chunk(this->chunks[1], b.chunks[1]),
Expand All @@ -86,24 +85,31 @@ struct simd_input<Architecture::ARM64> {
}

template <typename F>
really_inline uint8x16_t reduce(F const& reduce_pair) {
really_inline uint8x16_t reduce(F const& reduce_pair) const {
uint8x16_t r01 = reduce_pair(this->chunks[0], this->chunks[1]);
uint8x16_t r23 = reduce_pair(this->chunks[2], this->chunks[3]);
return reduce_pair(r01, r23);
}

really_inline uint64_t to_bitmask() {
really_inline uint64_t to_bitmask() const {
return neon_movemask_bulk(this->chunks[0], this->chunks[1], this->chunks[2], this->chunks[3]);
}

really_inline uint64_t eq(uint8_t m) {
really_inline simd_input<Architecture::ARM64> bit_or(const uint8_t m) const {
const uint8x16_t mask = vmovq_n_u8(m);
return this->map( [&](auto a) {
return vorrq_u8(a, mask);
});
}

really_inline uint64_t eq(const uint8_t m) const {
const uint8x16_t mask = vmovq_n_u8(m);
return this->map( [&](auto a) {
return vceqq_u8(a, mask);
}).to_bitmask();
}

really_inline uint64_t lteq(uint8_t m) {
really_inline uint64_t lteq(const uint8_t m) const {
const uint8x16_t mask = vmovq_n_u8(m);
return this->map( [&](auto a) {
return vcleq_u8(a, mask);
Expand Down
14 changes: 7 additions & 7 deletions src/arm64/stage1_find_marks.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@

namespace simdjson::arm64 {

really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {

#ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
return vmull_p64(-1ULL, quote_bits);
Expand All @@ -21,9 +21,9 @@ really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
#endif
}

really_inline void find_whitespace_and_structurals(
simd_input<ARCHITECTURE> in, uint64_t &whitespace,
uint64_t &structurals) {
really_inline void find_whitespace_and_operators(
const simd_input<ARCHITECTURE> in,
uint64_t &whitespace, uint64_t &op) {
const uint8x16_t low_nibble_mask =
(uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
const uint8x16_t high_nibble_mask =
Expand All @@ -38,9 +38,9 @@ really_inline void find_whitespace_and_structurals(
return vandq_u8(shuf_lo, shuf_hi);
});

const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
structurals = v.map([&](auto _v) {
return vtstq_u8(_v, structural_shufti_mask);
const uint8x16_t operator_shufti_mask = vmovq_n_u8(0x7);
op = v.map([&](auto _v) {
return vtstq_u8(_v, operator_shufti_mask);
}).to_bitmask();

const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);
Expand Down
Loading

0 comments on commit de8df0a

Please sign in to comment.