Combined performance patch (5% overall, 15% stage 1) (#317)

* Allow -f
* Support parse -s (force sse)
* Simplify flatten_bits
  - Add directly to base instead of storing variable
  - Don't modify base_ptr after beginning of function
  - Eliminate base variable and increment base_ptr instead
* De-unroll the flatten_bits loops
* Decrease dependencies in stage 1
  - Do all finalize_structurals work before computing the quote mask; mask out the quote mask later
  - Join find_whitespace_and_structurals and finalize_structurals into single find_structurals call, to reduce variable leakage
  - Rework pseudo_pred algorithm to refer to "primitive" for clarity and some dependency reduction
  - Rename quote_mask to in_string to describe what we're trying to achieve ("mask" could mean many things)
  - Break up find_quote_mask_and_bits into find_quote_mask and invalid_string_bytes to reduce data leakage (i.e. don't expose quote bits or odd_ends at all to find_structural_bits)
  - Genericize overflow methods "follows" and "follows_odd_sequence" for descriptiveness and possible lifting into a generic simd parsing library
* Mark branches as likely/unlikely
* Reorder and unroll+interleave stage 1 loop
* Nest the cnt > 16 branch inside cnt > 8
simdjson · Oct 1, 2019 · de8df0a · de8df0a
1 parent 53b6dea
commit de8df0a
Show file tree

Hide file tree

Showing 13 changed files with 412 additions and 370 deletions.
diff --git a/.gitignore b/.gitignore
@@ -14,6 +14,7 @@
 /jsoncheck
 /jsonpointer
 /jsonstats
+/integer_tests
 /libsimdjson.so*
 /minify
 /numberparsingcheck

diff --git a/benchmark/parse.cpp b/benchmark/parse.cpp
@@ -34,6 +34,18 @@
 #include "simdjson/parsedjson.h"
 #include "simdjson/stage1_find_marks.h"
 #include "simdjson/stage2_build_tape.h"
+
+// Global arguments
+bool find_marks_only = false;
+bool verbose = false;
+bool dump = false;
+bool json_output = false;
+bool force_one_iteration = false;
+bool just_data = false;
+bool force_sse = false;
+int32_t iterations = -1;
+int32_t warmup_iterations = -1;
+
 namespace simdjson {
 Architecture _find_best_supported_implementation() {
   constexpr uint32_t haswell_flags =
@@ -43,7 +55,7 @@ Architecture _find_best_supported_implementation() {
       instruction_set::SSE42 | instruction_set::PCLMULQDQ;
   uint32_t supports = detect_supported_architectures();
   // Order from best to worst (within architecture)
-  if ((haswell_flags & supports) == haswell_flags) {
+  if ((haswell_flags & supports) == haswell_flags && !force_sse) {
     return Architecture::HASWELL;
   }
   if ((westmere_flags & supports) == westmere_flags) {
@@ -63,6 +75,9 @@ extern unified_functype *unified_ptr;
 extern stage1_functype *stage1_ptr;
 
 int unified_machine_dispatch(const uint8_t *buf, size_t len, ParsedJson &pj) {
+  if (find_marks_only) {
+    return simdjson::SUCCESS;
+  }
   Architecture best_implementation = _find_best_supported_implementation();
   // Selecting the best implementation
   switch (best_implementation) {
@@ -118,25 +133,21 @@ unified_functype *unified_ptr = &unified_machine_dispatch;
 } // namespace simdjson
 
 int main(int argc, char *argv[]) {
-  bool verbose = false;
-  bool dump = false;
-  bool json_output = false;
-  bool force_one_iteration = false;
-  bool just_data = false;
-  int32_t iterations = -1;
-  int32_t warmup_iterations = -1;
 
 #ifndef _MSC_VER
   int c;
 
-  while ((c = getopt(argc, argv, "1vdtn:w:")) != -1) {
+  while ((c = getopt(argc, argv, "1vdtn:w:fs")) != -1) {
     switch (c) {
     case 'n':
       iterations = atoi(optarg);
       break;
     case 'w':
       warmup_iterations = atoi(optarg);
       break;
+    case 's':
+      force_sse = true;
+      break;
     case 't':
       just_data = true;
       break;
@@ -152,6 +163,9 @@ int main(int argc, char *argv[]) {
     case '1':
       force_one_iteration = true;
       break;
+    case 'f':
+      find_marks_only = true;
+      break;
     default:
       abort();
     }
@@ -326,7 +340,7 @@ int main(int argc, char *argv[]) {
     isok = (simdjson::stage1_ptr((const uint8_t *)p.data(), p.size(), pj) ==
             simdjson::SUCCESS);
     isok = isok &&
-           (simdjson::SUCCESS ==
+          (simdjson::SUCCESS ==
             simdjson::unified_ptr((const uint8_t *)p.data(), p.size(), pj));
     auto end = std::chrono::steady_clock::now();
     std::chrono::duration<double> secs = end - start;

diff --git a/include/simdjson/common_defs.h b/include/simdjson/common_defs.h
@@ -17,6 +17,17 @@
 #define SIMDJSON_PADDING 32
 #endif
 
+#if defined(__GNUC__)
+// Marks a block with a name so that MCA analysis can see it.
+#define BEGIN_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-BEGIN " #name);
+#define END_DEBUG_BLOCK(name) __asm volatile("# LLVM-MCA-END " #name);
+#define DEBUG_BLOCK(name, block) BEGIN_DEBUG_BLOCK(name); block; END_DEBUG_BLOCK(name);
+#else
+#define BEGIN_DEBUG_BLOCK(name)
+#define END_DEBUG_BLOCK(name)
+#define DEBUG_BLOCK(name, block)
+#endif
+
 #ifndef _MSC_VER
 // Implemented using Labels as Values which works in GCC and CLANG (and maybe
 // also in Intel's compiler), but won't work in MSVC.

diff --git a/scripts/checkperf.sh b/scripts/checkperf.sh
@@ -29,5 +29,5 @@ make parse
 make perfdiff
 
 echo "Running perfdiff:"
-echo ./perfdiff \"$current/parse -t $perftests\" \"$reference/parse -t $perftests\"
-./perfdiff "$current/parse -t $perftests" "$reference/parse -t $perftests"
+echo ./perfdiff \"$current/parse -t $perftests $CHECKPERF_ARGS\" \"$reference/parse -t $perftests $CHECKPERF_ARGS\"
+./perfdiff "$current/parse -t $perftests $CHECKPERF_ARGS" "$reference/parse -t $perftests $CHECKPERF_ARGS"
diff --git a/src/arm64/simd_input.h b/src/arm64/simd_input.h
@@ -40,33 +40,32 @@ using namespace simdjson::arm64;
 
 template <>
 struct simd_input<Architecture::ARM64> {
-  uint8x16_t chunks[4];
+  const uint8x16_t chunks[4];
 
-  really_inline simd_input(const uint8_t *ptr) {
-    this->chunks[0] = vld1q_u8(ptr + 0*16);
-    this->chunks[1] = vld1q_u8(ptr + 1*16);
-    this->chunks[2] = vld1q_u8(ptr + 2*16);
-    this->chunks[3] = vld1q_u8(ptr + 3*16);
-  }
+  really_inline simd_input()
+    : chunks{uint8x16_t(), uint8x16_t(), uint8x16_t(), uint8x16_t() } {}
 
-  really_inline simd_input(uint8x16_t chunk0, uint8x16_t chunk1, uint8x16_t chunk2, uint8x16_t chunk3) {
-    this->chunks[0] = chunk0;
-    this->chunks[1] = chunk1;
-    this->chunks[2] = chunk2;
-    this->chunks[3] = chunk3;
-  }
+  really_inline simd_input(const uint8x16_t chunk0, const uint8x16_t chunk1, const uint8x16_t chunk2, const uint8x16_t chunk3)
+    : chunks{chunk0, chunk1, chunk2, chunk3 } {}
+
+  really_inline simd_input(const uint8_t *ptr)
+      : chunks{
+        vld1q_u8(ptr + 0*16),
+        vld1q_u8(ptr + 1*16),
+        vld1q_u8(ptr + 2*16),
+        vld1q_u8(ptr + 3*16)
+       } {}
 
   template <typename F>
-  really_inline void each(F const& each_chunk)
-  {
+  really_inline void each(F const& each_chunk) const {
     each_chunk(this->chunks[0]);
     each_chunk(this->chunks[1]);
     each_chunk(this->chunks[2]);
     each_chunk(this->chunks[3]);
   }
 
   template <typename F>
-  really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) {
+  really_inline simd_input<Architecture::ARM64> map(F const& map_chunk) const {
     return simd_input<Architecture::ARM64>(
       map_chunk(this->chunks[0]),
       map_chunk(this->chunks[1]),
@@ -76,7 +75,7 @@ struct simd_input<Architecture::ARM64> {
   }
 
   template <typename F>
-  really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) {
+  really_inline simd_input<Architecture::ARM64> map(simd_input<Architecture::ARM64> b, F const& map_chunk) const {
     return simd_input<Architecture::ARM64>(
       map_chunk(this->chunks[0], b.chunks[0]),
       map_chunk(this->chunks[1], b.chunks[1]),
@@ -86,24 +85,31 @@ struct simd_input<Architecture::ARM64> {
   }
 
   template <typename F>
-  really_inline uint8x16_t reduce(F const& reduce_pair) {
+  really_inline uint8x16_t reduce(F const& reduce_pair) const {
     uint8x16_t r01 = reduce_pair(this->chunks[0], this->chunks[1]);
     uint8x16_t r23 = reduce_pair(this->chunks[2], this->chunks[3]);
     return reduce_pair(r01, r23);
   }
 
-  really_inline uint64_t to_bitmask() {
+  really_inline uint64_t to_bitmask() const {
     return neon_movemask_bulk(this->chunks[0], this->chunks[1], this->chunks[2], this->chunks[3]);
   }
 
-  really_inline uint64_t eq(uint8_t m) {
+  really_inline simd_input<Architecture::ARM64> bit_or(const uint8_t m) const {
+    const uint8x16_t mask = vmovq_n_u8(m);
+    return this->map( [&](auto a) {
+      return vorrq_u8(a, mask);
+    });
+  }
+
+  really_inline uint64_t eq(const uint8_t m) const {
     const uint8x16_t mask = vmovq_n_u8(m);
     return this->map( [&](auto a) {
       return vceqq_u8(a, mask);
     }).to_bitmask();
   }
 
-  really_inline uint64_t lteq(uint8_t m) {
+  really_inline uint64_t lteq(const uint8_t m) const {
     const uint8x16_t mask = vmovq_n_u8(m);
     return this->map( [&](auto a) {
       return vcleq_u8(a, mask);

diff --git a/src/arm64/stage1_find_marks.h b/src/arm64/stage1_find_marks.h
@@ -12,7 +12,7 @@
 
 namespace simdjson::arm64 {
 
-really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
+really_inline uint64_t compute_quote_mask(const uint64_t quote_bits) {
 
 #ifdef __ARM_FEATURE_CRYPTO // some ARM processors lack this extension
   return vmull_p64(-1ULL, quote_bits);
@@ -21,9 +21,9 @@ really_inline uint64_t compute_quote_mask(uint64_t quote_bits) {
 #endif
 }
 
-really_inline void find_whitespace_and_structurals(
-    simd_input<ARCHITECTURE> in, uint64_t &whitespace,
-    uint64_t &structurals) {
+really_inline void find_whitespace_and_operators(
+    const simd_input<ARCHITECTURE> in,
+    uint64_t &whitespace, uint64_t &op) {
   const uint8x16_t low_nibble_mask =
       (uint8x16_t){16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0};
   const uint8x16_t high_nibble_mask =
@@ -38,9 +38,9 @@ really_inline void find_whitespace_and_structurals(
     return vandq_u8(shuf_lo, shuf_hi);
   });
 
-  const uint8x16_t structural_shufti_mask = vmovq_n_u8(0x7);
-  structurals = v.map([&](auto _v) {
-    return vtstq_u8(_v, structural_shufti_mask);
+  const uint8x16_t operator_shufti_mask = vmovq_n_u8(0x7);
+  op = v.map([&](auto _v) {
+    return vtstq_u8(_v, operator_shufti_mask);
   }).to_bitmask();
 
   const uint8x16_t whitespace_shufti_mask = vmovq_n_u8(0x18);