cpu: second part of sse42 -> sse41: actually change the logic

oneapi-src · Apr 15, 2019 · 6b83d02 · 6b83d02
1 parent 4975951
commit 6b83d02
Show file tree

Hide file tree

Showing 38 changed files with 231 additions and 231 deletions.
diff --git a/README.md b/README.md
@@ -93,8 +93,8 @@ request will be merged to the repository.
 ## System Requirements
 Intel MKL-DNN supports Intel 64 architecture and compatible architectures.
 The library is optimized for the systems based on
-* Intel Atom(R) processor with Intel SSE4.2 support
-* 4th, 5th, 6th, 7th, and 8th generation Intel(R) Core(TM) processor
+* Intel(R) Core(TM) and Intel Atom(R) processors with Intel(R) SSE4.1 or newer instruction set support
+* Intel Atom(R) processor with Intel SSE4.1 support
 * Intel(R) Xeon(R) processor E5 v3 family (formerly Haswell)
 * Intel Xeon processor E5 v4 family (formerly Broadwell)
 * Intel Xeon Platinum processor family (formerly Skylake)

diff --git a/doc/design/understanding_memory_formats.md b/doc/design/understanding_memory_formats.md
@@ -235,7 +235,7 @@ turn out that those layouts are sub-optimal from performance perspective.
 In order to achieve better vectorization and cache re-usage Intel MKL-DNN
 introduces blocked layout that splits one or several dimensions into the
 blocks of fixed size. The most popular Intel MKL-DNN data format is
-**nChw16c** on AVX512+ systems and **nChw8c** on SSE4.2+ systems. As one
+**nChw16c** on AVX512+ systems and **nChw8c** on SSE4.1+ systems. As one
 might guess from the name the only dimension that is blocked is channels and
 the block size is either 16 in the former case or 8 in the latter case.
 

diff --git a/src/common/verbose.cpp b/src/common/verbose.cpp
@@ -41,7 +41,7 @@
 
 /* MKL-DNN CPU ISA info */
 #define ISA_ANY "No instruction set specific optimizations"
-#define SSE42 "Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2)"
+#define SSE41 "Intel(R) Streaming SIMD Extensions 4.1 (Intel(R) SSE4.1)"
 #define AVX "Intel(R) Advanced Vector Extensions (Intel(R) AVX)"
 #define AVX2 "Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2)"
 #define AVX512_COMMON "Intel(R) Advanced Vector Extensions 512 (Intel(R) " \
@@ -109,7 +109,7 @@ const char *get_isa_info() {
     if (mayiuse(avx512_common))    return AVX512_COMMON;
     if (mayiuse(avx2))             return AVX2;
     if (mayiuse(avx))              return AVX;
-    if (mayiuse(sse42))            return SSE42;
+    if (mayiuse(sse41))            return SSE41;
     return ISA_ANY;
 }
 

diff --git a/src/cpu/cpu_engine.cpp b/src/cpu/cpu_engine.cpp
@@ -33,9 +33,9 @@
 #include "cpu/jit_avx512_core_x8s8s32x_convolution.hpp"
 #include "cpu/jit_avx512_common_convolution.hpp"
 #include "cpu/jit_avx2_1x1_convolution.hpp"
-#include "cpu/jit_sse42_1x1_convolution.hpp"
+#include "cpu/jit_sse41_1x1_convolution.hpp"
 #include "cpu/jit_avx2_convolution.hpp"
-#include "cpu/jit_sse42_convolution.hpp"
+#include "cpu/jit_sse41_convolution.hpp"
 #include "cpu/gemm_convolution.hpp"
 #include "cpu/gemm_x8s8s32x_convolution.hpp"
 #include "cpu/ref_convolution.hpp"
@@ -120,14 +120,14 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(jit_avx2_1x1_convolution_fwd_t),
     INSTANCE(jit_avx2_1x1_convolution_bwd_data_t),
     INSTANCE(jit_avx2_1x1_convolution_bwd_weights_t),
-    INSTANCE(jit_sse42_dw_convolution_fwd_t),
-    INSTANCE(jit_sse42_dw_convolution_bwd_data_t),
-    INSTANCE(jit_sse42_dw_convolution_bwd_weights_t),
-    INSTANCE(jit_sse42_1x1_convolution_fwd_t),
+    INSTANCE(jit_sse41_dw_convolution_fwd_t),
+    INSTANCE(jit_sse41_dw_convolution_bwd_data_t),
+    INSTANCE(jit_sse41_dw_convolution_bwd_weights_t),
+    INSTANCE(jit_sse41_1x1_convolution_fwd_t),
     INSTANCE(jit_avx2_convolution_fwd_t),
     INSTANCE(jit_avx2_convolution_bwd_data_t),
     INSTANCE(jit_avx2_convolution_bwd_weights_t),
-    INSTANCE(jit_sse42_convolution_fwd_t),
+    INSTANCE(jit_sse41_convolution_fwd_t),
     INSTANCE(gemm_convolution_fwd_t),
     INSTANCE(gemm_convolution_bwd_data_t),
     INSTANCE(gemm_convolution_bwd_weights_t),
@@ -203,8 +203,8 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(jit_uni_eltwise_bwd_t<avx512_common>),
     INSTANCE(jit_uni_eltwise_fwd_t<avx2>),
     INSTANCE(jit_uni_eltwise_bwd_t<avx2>),
-    INSTANCE(jit_uni_eltwise_fwd_t<sse42>),
-    INSTANCE(jit_uni_eltwise_bwd_t<sse42>),
+    INSTANCE(jit_uni_eltwise_fwd_t<sse41>),
+    INSTANCE(jit_uni_eltwise_bwd_t<sse41>),
     INSTANCE(ref_eltwise_fwd_t<f32>),
     INSTANCE(ref_eltwise_bwd_t<f32>),
     /* eltwise (int) */
@@ -220,8 +220,8 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(jit_uni_pooling_bwd_t<avx512_common>),
     INSTANCE(jit_uni_pooling_fwd_t<avx>),
     INSTANCE(jit_uni_pooling_bwd_t<avx>),
-    INSTANCE(jit_uni_pooling_fwd_t<sse42>),
-    INSTANCE(jit_uni_pooling_bwd_t<sse42>),
+    INSTANCE(jit_uni_pooling_fwd_t<sse41>),
+    INSTANCE(jit_uni_pooling_bwd_t<sse41>),
     INSTANCE(nchw_pooling_fwd_t<f32>),
     INSTANCE(nchw_pooling_bwd_t<f32>),
     INSTANCE(nhwc_pooling_fwd_t<f32>),
@@ -240,16 +240,16 @@ static const pd_create_f cpu_impl_list[] = {
     INSTANCE(jit_avx512_common_lrn_bwd_t),
     INSTANCE(jit_uni_lrn_fwd_t<avx2>),
     INSTANCE(jit_uni_lrn_bwd_t<avx2>),
-    INSTANCE(jit_uni_lrn_fwd_t<sse42>),
+    INSTANCE(jit_uni_lrn_fwd_t<sse41>),
     INSTANCE(ref_lrn_fwd_t<f32>),
     INSTANCE(ref_lrn_bwd_t<f32>),
     /* batch normalization */
     INSTANCE(jit_uni_batch_normalization_fwd_t<avx512_common>),
     INSTANCE(jit_uni_batch_normalization_bwd_t<avx512_common>),
     INSTANCE(jit_uni_batch_normalization_fwd_t<avx2>),
     INSTANCE(jit_uni_batch_normalization_bwd_t<avx2>),
-    INSTANCE(jit_uni_batch_normalization_fwd_t<sse42>),
-    INSTANCE(jit_uni_batch_normalization_bwd_t<sse42>),
+    INSTANCE(jit_uni_batch_normalization_fwd_t<sse41>),
+    INSTANCE(jit_uni_batch_normalization_bwd_t<sse41>),
     INSTANCE(ncsp_batch_normalization_fwd_t),
     INSTANCE(ncsp_batch_normalization_bwd_t),
     INSTANCE(nspc_batch_normalization_fwd_t),

diff --git a/src/cpu/cpu_isa_traits.hpp b/src/cpu/cpu_isa_traits.hpp
@@ -40,7 +40,7 @@ namespace cpu {
 
 typedef enum {
     isa_any,
-    sse42,
+    sse41,
     avx,
     avx2,
     avx512_common,
@@ -52,7 +52,7 @@ typedef enum {
 
 template <cpu_isa_t> struct cpu_isa_traits {}; /* ::vlen -> 32 (for avx2) */
 
-template <> struct cpu_isa_traits<sse42> {
+template <> struct cpu_isa_traits<sse41> {
     typedef Xbyak::Xmm Vmm;
     static constexpr int vlen_shift = 4;
     static constexpr int vlen = 16;
@@ -89,8 +89,8 @@ static inline bool mayiuse(const cpu_isa_t cpu_isa) {
     using namespace Xbyak::util;
 
     switch (cpu_isa) {
-    case sse42:
-        return cpu.has(Cpu::tSSE42);
+    case sse41:
+        return cpu.has(Cpu::tSSE41);
     case avx:
         return cpu.has(Cpu::tAVX);
     case avx2:
@@ -131,7 +131,7 @@ static inline bool mayiuse(const cpu_isa_t cpu_isa) {
 /* whatever is required to generate string literals... */
 #include "z_magic.hpp"
 #define JIT_IMPL_NAME_HELPER(prefix, isa, suffix_if_any) \
-    (isa == sse42 ? prefix STRINGIFY(sse42) : \
+    (isa == sse41 ? prefix STRINGIFY(sse41) : \
     (isa == avx ? prefix STRINGIFY(avx) : \
     (isa == avx2 ? prefix STRINGIFY(avx2) : \
     (isa == avx512_common ? prefix STRINGIFY(avx512_common) : \

diff --git a/src/cpu/gemm_convolution_utils.cpp b/src/cpu/gemm_convolution_utils.cpp
@@ -534,7 +534,7 @@ status_t init_conf(jit_gemm_conv_conf_t &jcp,
         ? cpu_isa_traits<avx512_common>::vlen
         : mayiuse(avx)
             ? cpu_isa_traits<avx>::vlen
-            : mayiuse(sse42) ? cpu_isa_traits<sse42>::vlen : 4;
+            : mayiuse(sse41) ? cpu_isa_traits<sse41>::vlen : 4;
     const int simd_w = vlen / (is_int8_conv ? 1 : 4);
 
     const bool is_bwd_d = jcp.prop_kind == backward_data;

diff --git a/src/cpu/jit_sse41_1x1_conv_kernel_f32.cpp b/src/cpu/jit_sse41_1x1_conv_kernel_f32.cpp
@@ -20,7 +20,7 @@
 #include "utils.hpp"
 #include "memory.hpp"
 
-#include "jit_sse42_1x1_conv_kernel_f32.hpp"
+#include "jit_sse41_1x1_conv_kernel_f32.hpp"
 
 #define GET_OFF(field) offsetof(jit_1x1_conv_call_s, field)
 
@@ -34,7 +34,7 @@ using namespace mkldnn::impl::utils;
 
 using namespace Xbyak;
 
-void jit_sse42_1x1_conv_kernel_f32::generate_bcast_loop(int load_loop_blk)
+void jit_sse41_1x1_conv_kernel_f32::generate_bcast_loop(int load_loop_blk)
 {
     mov(aux1_reg_bcast_data, reg_bcast_data);
     mov(aux_reg_output_data, reg_output_data);
@@ -77,7 +77,7 @@ void jit_sse42_1x1_conv_kernel_f32::generate_bcast_loop(int load_loop_blk)
     }
 }
 
-void jit_sse42_1x1_conv_kernel_f32::generate_reduce_loop(
+void jit_sse41_1x1_conv_kernel_f32::generate_reduce_loop(
         int load_loop_blk, int ur)
 {
     auto reg_load = [=](int i, int n) {
@@ -270,7 +270,7 @@ void jit_sse42_1x1_conv_kernel_f32::generate_reduce_loop(
     store();
 } // reduce_loop()
 
-void jit_sse42_1x1_conv_kernel_f32::generate_diff_bias_loop(int load_loop_blk)
+void jit_sse41_1x1_conv_kernel_f32::generate_diff_bias_loop(int load_loop_blk)
 {
     if (!jcp.with_bias || jcp.prop_kind != backward_weights)
         return;
@@ -336,7 +336,7 @@ void jit_sse42_1x1_conv_kernel_f32::generate_diff_bias_loop(int load_loop_blk)
     L(diff_bias_loop_out);
 }
 
-void jit_sse42_1x1_conv_kernel_f32::generate()
+void jit_sse41_1x1_conv_kernel_f32::generate()
 {
     preamble();
 
@@ -434,7 +434,7 @@ void jit_sse42_1x1_conv_kernel_f32::generate()
         eltwise_injector_->prepare_table();
 }
 
-bool jit_sse42_1x1_conv_kernel_f32::post_ops_ok(
+bool jit_sse41_1x1_conv_kernel_f32::post_ops_ok(
         jit_1x1_conv_conf_t &jcp, const primitive_attr_t &attr) {
     const auto &p = attr.post_ops_;
 
@@ -451,12 +451,12 @@ bool jit_sse42_1x1_conv_kernel_f32::post_ops_ok(
     return false;
 }
 
-status_t jit_sse42_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp,
+status_t jit_sse41_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp,
         const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
         const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d,
         const primitive_attr_t &attr)
 {
-    if (!mayiuse(sse42))
+    if (!mayiuse(sse41))
         return status::unimplemented;
 
     // TODO (Roma): this code is duplicated from the generic kernel; maybe the

diff --git a/src/cpu/jit_sse41_1x1_conv_kernel_f32.hpp b/src/cpu/jit_sse41_1x1_conv_kernel_f32.hpp
@@ -14,8 +14,8 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef JIT_SSE42_1x1_CONV_KERNEL_F32_HPP
-#define JIT_SSE42_1x1_CONV_KERNEL_F32_HPP
+#ifndef JIT_SSE41_1x1_CONV_KERNEL_F32_HPP
+#define JIT_SSE41_1x1_CONV_KERNEL_F32_HPP
 
 #include "c_types_map.hpp"
 #include "memory.hpp"
@@ -27,20 +27,20 @@ namespace mkldnn {
 namespace impl {
 namespace cpu {
 
-struct jit_sse42_1x1_conv_kernel_f32: public jit_generator {
-    jit_sse42_1x1_conv_kernel_f32(jit_1x1_conv_conf_t ajcp,
+struct jit_sse41_1x1_conv_kernel_f32: public jit_generator {
+    jit_sse41_1x1_conv_kernel_f32(jit_1x1_conv_conf_t ajcp,
             const primitive_attr_t &attr)
         : jcp(ajcp), attr_(attr), eltwise_injector_(nullptr)
     {
         if (jcp.with_eltwise)
-            eltwise_injector_ = new jit_uni_eltwise_injector_f32<sse42>(this,
+            eltwise_injector_ = new jit_uni_eltwise_injector_f32<sse41>(this,
                     jcp.eltwise);
 
         this->generate();
         jit_ker = (void (*)(jit_1x1_conv_call_s *))this->getCode();
     }
 
-    ~jit_sse42_1x1_conv_kernel_f32() {
+    ~jit_sse41_1x1_conv_kernel_f32() {
         delete eltwise_injector_;
     }
 
@@ -54,7 +54,7 @@ struct jit_sse42_1x1_conv_kernel_f32: public jit_generator {
             const memory_desc_wrapper &dst_d,
             const primitive_attr_t &attr);
 
-    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse42_1x1_conv_kernel_f32)
+    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_1x1_conv_kernel_f32)
 
     jit_1x1_conv_conf_t jcp;
     const primitive_attr_t &attr_;
@@ -88,7 +88,7 @@ struct jit_sse42_1x1_conv_kernel_f32: public jit_generator {
 
     xmm_t reg_bcast = xmm_t(15);
 
-    jit_uni_eltwise_injector_f32<sse42> *eltwise_injector_;
+    jit_uni_eltwise_injector_f32<sse41> *eltwise_injector_;
 
     void generate_bcast_loop(int load_loop_blk);
     void generate_reduce_loop(int load_loop_blk, int ur);

diff --git a/src/cpu/jit_sse41_1x1_convolution.cpp b/src/cpu/jit_sse41_1x1_convolution.cpp
@@ -17,7 +17,7 @@
 #include "mkldnn_types.h"
 
 #include "c_types_map.hpp"
-#include "jit_sse42_1x1_convolution.hpp"
+#include "jit_sse41_1x1_convolution.hpp"
 #include "utils.hpp"
 #include "mkldnn_thread.hpp"
 #include "type_helpers.hpp"
@@ -34,7 +34,7 @@ namespace cpu {
 using namespace mkldnn::impl::status;
 using namespace mkldnn::impl::utils;
 
-void jit_sse42_1x1_convolution_fwd_t::execute_forward(
+void jit_sse41_1x1_convolution_fwd_t::execute_forward(
         const exec_ctx_t &ctx) const {
     auto src = CTX_IN_MEM(const data_t *, MKLDNN_ARG_SRC);
     auto weights = CTX_IN_MEM(const data_t *, MKLDNN_ARG_WEIGHTS);

diff --git a/src/cpu/jit_sse41_1x1_convolution.hpp b/src/cpu/jit_sse41_1x1_convolution.hpp
@@ -14,22 +14,22 @@
 * limitations under the License.
 *******************************************************************************/
 
-#ifndef CPU_JIT_SSE42_1x1_CONVOLUTION_HPP
-#define CPU_JIT_SSE42_1x1_CONVOLUTION_HPP
+#ifndef CPU_JIT_SSE41_1x1_CONVOLUTION_HPP
+#define CPU_JIT_SSE41_1x1_CONVOLUTION_HPP
 
 #include "c_types_map.hpp"
 #include "mkldnn_thread.hpp"
 #include "utils.hpp"
 
 #include "cpu_convolution_pd.hpp"
 #include "cpu_primitive.hpp"
-#include "jit_sse42_1x1_conv_kernel_f32.hpp"
+#include "jit_sse41_1x1_conv_kernel_f32.hpp"
 
 namespace mkldnn {
 namespace impl {
 namespace cpu {
 
-struct jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t {
+struct jit_sse41_1x1_convolution_fwd_t: public cpu_primitive_t {
     struct pd_t: public cpu_convolution_fwd_pd_t {
         pd_t(engine_t *engine,
                 const convolution_desc_t *adesc,
@@ -39,8 +39,8 @@ struct jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t {
             , jcp_() {}
 
         DECLARE_COMMON_PD_T(
-                JIT_IMPL_NAME_HELPER("jit_1x1:", sse42, ""),
-                jit_sse42_1x1_convolution_fwd_t);
+                JIT_IMPL_NAME_HELPER("jit_1x1:", sse41, ""),
+                jit_sse41_1x1_convolution_fwd_t);
 
         status_t init() {
             bool ok = true
@@ -52,7 +52,7 @@ struct jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t {
                 && set_default_formats();
             if (!ok) return status::unimplemented;
 
-            return jit_sse42_1x1_conv_kernel_f32::init_conf(jcp_, *desc(),
+            return jit_sse41_1x1_conv_kernel_f32::init_conf(jcp_, *desc(),
                     *src_md(), *weights_md(), *dst_md(), *attr());
         }
 
@@ -71,10 +71,10 @@ struct jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t {
         }
     };
 
-    jit_sse42_1x1_convolution_fwd_t(const pd_t *apd): cpu_primitive_t(apd) {
-        kernel_ = new jit_sse42_1x1_conv_kernel_f32(pd()->jcp_, *pd()->attr());
+    jit_sse41_1x1_convolution_fwd_t(const pd_t *apd): cpu_primitive_t(apd) {
+        kernel_ = new jit_sse41_1x1_conv_kernel_f32(pd()->jcp_, *pd()->attr());
     }
-    ~jit_sse42_1x1_convolution_fwd_t() { delete kernel_; };
+    ~jit_sse41_1x1_convolution_fwd_t() { delete kernel_; };
 
     typedef typename prec_traits<data_type::f32>::type data_t;
 
@@ -86,7 +86,7 @@ struct jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t {
 private:
     void execute_forward(const exec_ctx_t &ctx) const;
     const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
-    jit_sse42_1x1_conv_kernel_f32 *kernel_;
+    jit_sse41_1x1_conv_kernel_f32 *kernel_;
 };
 
 }