Skip to content

Commit

Permalink
cpu: second part of sse42 -> sse41: actually change the logic
Browse files Browse the repository at this point in the history
  • Loading branch information
Roman Dubtsov committed Apr 15, 2019
1 parent 4975951 commit 6b83d02
Show file tree
Hide file tree
Showing 38 changed files with 231 additions and 231 deletions.
4 changes: 2 additions & 2 deletions README.md
Expand Up @@ -93,8 +93,8 @@ request will be merged to the repository.
## System Requirements
Intel MKL-DNN supports Intel 64 architecture and compatible architectures.
The library is optimized for the systems based on
* Intel Atom(R) processor with Intel SSE4.1 support
* 4th, 5th, 6th, 7th, and 8th generation Intel(R) Core(TM) processor
* Intel(R) Core(TM) and Intel Atom(R) processors with Intel(R) SSE4.1 or newer instruction set support
* Intel Atom(R) processor with Intel SSE4.2 support
* Intel(R) Xeon(R) processor E5 v3 family (formerly Haswell)
* Intel Xeon processor E5 v4 family (formerly Broadwell)
* Intel Xeon Platinum processor family (formerly Skylake)
Expand Down
2 changes: 1 addition & 1 deletion doc/design/understanding_memory_formats.md
Expand Up @@ -235,7 +235,7 @@ turn out that those layouts are sub-optimal from performance perspective.
In order to achieve better vectorization and cache reuse, Intel MKL-DNN
introduces a blocked layout that splits one or several dimensions into
blocks of fixed size. The most popular Intel MKL-DNN data format is
**nChw16c** on AVX512+ systems and **nChw8c** on SSE4.2+ systems. As one
**nChw16c** on AVX512+ systems and **nChw8c** on SSE4.1+ systems. As one
might guess from the name, the only dimension that is blocked is channels, and
the block size is either 16 in the former case or 8 in the latter case.
Expand Down
4 changes: 2 additions & 2 deletions src/common/verbose.cpp
Expand Up @@ -41,7 +41,7 @@

/* MKL-DNN CPU ISA info */
#define ISA_ANY "No instruction set specific optimizations"
#define SSE42 "Intel(R) Streaming SIMD Extensions 4.2 (Intel(R) SSE4.2)"
#define SSE41 "Intel(R) Streaming SIMD Extensions 4.1 (Intel(R) SSE4.1)"
#define AVX "Intel(R) Advanced Vector Extensions (Intel(R) AVX)"
#define AVX2 "Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2)"
#define AVX512_COMMON "Intel(R) Advanced Vector Extensions 512 (Intel(R) " \
Expand Down Expand Up @@ -109,7 +109,7 @@ const char *get_isa_info() {
if (mayiuse(avx512_common)) return AVX512_COMMON;
if (mayiuse(avx2)) return AVX2;
if (mayiuse(avx)) return AVX;
if (mayiuse(sse42)) return SSE42;
if (mayiuse(sse41)) return SSE41;
return ISA_ANY;
}

Expand Down
28 changes: 14 additions & 14 deletions src/cpu/cpu_engine.cpp
Expand Up @@ -33,9 +33,9 @@
#include "cpu/jit_avx512_core_x8s8s32x_convolution.hpp"
#include "cpu/jit_avx512_common_convolution.hpp"
#include "cpu/jit_avx2_1x1_convolution.hpp"
#include "cpu/jit_sse42_1x1_convolution.hpp"
#include "cpu/jit_sse41_1x1_convolution.hpp"
#include "cpu/jit_avx2_convolution.hpp"
#include "cpu/jit_sse42_convolution.hpp"
#include "cpu/jit_sse41_convolution.hpp"
#include "cpu/gemm_convolution.hpp"
#include "cpu/gemm_x8s8s32x_convolution.hpp"
#include "cpu/ref_convolution.hpp"
Expand Down Expand Up @@ -120,14 +120,14 @@ static const pd_create_f cpu_impl_list[] = {
INSTANCE(jit_avx2_1x1_convolution_fwd_t),
INSTANCE(jit_avx2_1x1_convolution_bwd_data_t),
INSTANCE(jit_avx2_1x1_convolution_bwd_weights_t),
INSTANCE(jit_sse42_dw_convolution_fwd_t),
INSTANCE(jit_sse42_dw_convolution_bwd_data_t),
INSTANCE(jit_sse42_dw_convolution_bwd_weights_t),
INSTANCE(jit_sse42_1x1_convolution_fwd_t),
INSTANCE(jit_sse41_dw_convolution_fwd_t),
INSTANCE(jit_sse41_dw_convolution_bwd_data_t),
INSTANCE(jit_sse41_dw_convolution_bwd_weights_t),
INSTANCE(jit_sse41_1x1_convolution_fwd_t),
INSTANCE(jit_avx2_convolution_fwd_t),
INSTANCE(jit_avx2_convolution_bwd_data_t),
INSTANCE(jit_avx2_convolution_bwd_weights_t),
INSTANCE(jit_sse42_convolution_fwd_t),
INSTANCE(jit_sse41_convolution_fwd_t),
INSTANCE(gemm_convolution_fwd_t),
INSTANCE(gemm_convolution_bwd_data_t),
INSTANCE(gemm_convolution_bwd_weights_t),
Expand Down Expand Up @@ -203,8 +203,8 @@ static const pd_create_f cpu_impl_list[] = {
INSTANCE(jit_uni_eltwise_bwd_t<avx512_common>),
INSTANCE(jit_uni_eltwise_fwd_t<avx2>),
INSTANCE(jit_uni_eltwise_bwd_t<avx2>),
INSTANCE(jit_uni_eltwise_fwd_t<sse42>),
INSTANCE(jit_uni_eltwise_bwd_t<sse42>),
INSTANCE(jit_uni_eltwise_fwd_t<sse41>),
INSTANCE(jit_uni_eltwise_bwd_t<sse41>),
INSTANCE(ref_eltwise_fwd_t<f32>),
INSTANCE(ref_eltwise_bwd_t<f32>),
/* eltwise (int) */
Expand All @@ -220,8 +220,8 @@ static const pd_create_f cpu_impl_list[] = {
INSTANCE(jit_uni_pooling_bwd_t<avx512_common>),
INSTANCE(jit_uni_pooling_fwd_t<avx>),
INSTANCE(jit_uni_pooling_bwd_t<avx>),
INSTANCE(jit_uni_pooling_fwd_t<sse42>),
INSTANCE(jit_uni_pooling_bwd_t<sse42>),
INSTANCE(jit_uni_pooling_fwd_t<sse41>),
INSTANCE(jit_uni_pooling_bwd_t<sse41>),
INSTANCE(nchw_pooling_fwd_t<f32>),
INSTANCE(nchw_pooling_bwd_t<f32>),
INSTANCE(nhwc_pooling_fwd_t<f32>),
Expand All @@ -240,16 +240,16 @@ static const pd_create_f cpu_impl_list[] = {
INSTANCE(jit_avx512_common_lrn_bwd_t),
INSTANCE(jit_uni_lrn_fwd_t<avx2>),
INSTANCE(jit_uni_lrn_bwd_t<avx2>),
INSTANCE(jit_uni_lrn_fwd_t<sse42>),
INSTANCE(jit_uni_lrn_fwd_t<sse41>),
INSTANCE(ref_lrn_fwd_t<f32>),
INSTANCE(ref_lrn_bwd_t<f32>),
/* batch normalization */
INSTANCE(jit_uni_batch_normalization_fwd_t<avx512_common>),
INSTANCE(jit_uni_batch_normalization_bwd_t<avx512_common>),
INSTANCE(jit_uni_batch_normalization_fwd_t<avx2>),
INSTANCE(jit_uni_batch_normalization_bwd_t<avx2>),
INSTANCE(jit_uni_batch_normalization_fwd_t<sse42>),
INSTANCE(jit_uni_batch_normalization_bwd_t<sse42>),
INSTANCE(jit_uni_batch_normalization_fwd_t<sse41>),
INSTANCE(jit_uni_batch_normalization_bwd_t<sse41>),
INSTANCE(ncsp_batch_normalization_fwd_t),
INSTANCE(ncsp_batch_normalization_bwd_t),
INSTANCE(nspc_batch_normalization_fwd_t),
Expand Down
10 changes: 5 additions & 5 deletions src/cpu/cpu_isa_traits.hpp
Expand Up @@ -40,7 +40,7 @@ namespace cpu {

typedef enum {
isa_any,
sse42,
sse41,
avx,
avx2,
avx512_common,
Expand All @@ -52,7 +52,7 @@ typedef enum {

template <cpu_isa_t> struct cpu_isa_traits {}; /* ::vlen -> 32 (for avx2) */

template <> struct cpu_isa_traits<sse42> {
template <> struct cpu_isa_traits<sse41> {
typedef Xbyak::Xmm Vmm;
static constexpr int vlen_shift = 4;
static constexpr int vlen = 16;
Expand Down Expand Up @@ -89,8 +89,8 @@ static inline bool mayiuse(const cpu_isa_t cpu_isa) {
using namespace Xbyak::util;

switch (cpu_isa) {
case sse42:
return cpu.has(Cpu::tSSE42);
case sse41:
return cpu.has(Cpu::tSSE41);
case avx:
return cpu.has(Cpu::tAVX);
case avx2:
Expand Down Expand Up @@ -131,7 +131,7 @@ static inline bool mayiuse(const cpu_isa_t cpu_isa) {
/* whatever is required to generate string literals... */
#include "z_magic.hpp"
#define JIT_IMPL_NAME_HELPER(prefix, isa, suffix_if_any) \
(isa == sse42 ? prefix STRINGIFY(sse42) : \
(isa == sse41 ? prefix STRINGIFY(sse41) : \
(isa == avx ? prefix STRINGIFY(avx) : \
(isa == avx2 ? prefix STRINGIFY(avx2) : \
(isa == avx512_common ? prefix STRINGIFY(avx512_common) : \
Expand Down
2 changes: 1 addition & 1 deletion src/cpu/gemm_convolution_utils.cpp
Expand Up @@ -534,7 +534,7 @@ status_t init_conf(jit_gemm_conv_conf_t &jcp,
? cpu_isa_traits<avx512_common>::vlen
: mayiuse(avx)
? cpu_isa_traits<avx>::vlen
: mayiuse(sse42) ? cpu_isa_traits<sse42>::vlen : 4;
: mayiuse(sse41) ? cpu_isa_traits<sse41>::vlen : 4;
const int simd_w = vlen / (is_int8_conv ? 1 : 4);

const bool is_bwd_d = jcp.prop_kind == backward_data;
Expand Down
16 changes: 8 additions & 8 deletions src/cpu/jit_sse41_1x1_conv_kernel_f32.cpp
Expand Up @@ -20,7 +20,7 @@
#include "utils.hpp"
#include "memory.hpp"

#include "jit_sse42_1x1_conv_kernel_f32.hpp"
#include "jit_sse41_1x1_conv_kernel_f32.hpp"

#define GET_OFF(field) offsetof(jit_1x1_conv_call_s, field)

Expand All @@ -34,7 +34,7 @@ using namespace mkldnn::impl::utils;

using namespace Xbyak;

void jit_sse42_1x1_conv_kernel_f32::generate_bcast_loop(int load_loop_blk)
void jit_sse41_1x1_conv_kernel_f32::generate_bcast_loop(int load_loop_blk)
{
mov(aux1_reg_bcast_data, reg_bcast_data);
mov(aux_reg_output_data, reg_output_data);
Expand Down Expand Up @@ -77,7 +77,7 @@ void jit_sse42_1x1_conv_kernel_f32::generate_bcast_loop(int load_loop_blk)
}
}

void jit_sse42_1x1_conv_kernel_f32::generate_reduce_loop(
void jit_sse41_1x1_conv_kernel_f32::generate_reduce_loop(
int load_loop_blk, int ur)
{
auto reg_load = [=](int i, int n) {
Expand Down Expand Up @@ -270,7 +270,7 @@ void jit_sse42_1x1_conv_kernel_f32::generate_reduce_loop(
store();
} // reduce_loop()

void jit_sse42_1x1_conv_kernel_f32::generate_diff_bias_loop(int load_loop_blk)
void jit_sse41_1x1_conv_kernel_f32::generate_diff_bias_loop(int load_loop_blk)
{
if (!jcp.with_bias || jcp.prop_kind != backward_weights)
return;
Expand Down Expand Up @@ -336,7 +336,7 @@ void jit_sse42_1x1_conv_kernel_f32::generate_diff_bias_loop(int load_loop_blk)
L(diff_bias_loop_out);
}

void jit_sse42_1x1_conv_kernel_f32::generate()
void jit_sse41_1x1_conv_kernel_f32::generate()
{
preamble();

Expand Down Expand Up @@ -434,7 +434,7 @@ void jit_sse42_1x1_conv_kernel_f32::generate()
eltwise_injector_->prepare_table();
}

bool jit_sse42_1x1_conv_kernel_f32::post_ops_ok(
bool jit_sse41_1x1_conv_kernel_f32::post_ops_ok(
jit_1x1_conv_conf_t &jcp, const primitive_attr_t &attr) {
const auto &p = attr.post_ops_;

Expand All @@ -451,12 +451,12 @@ bool jit_sse42_1x1_conv_kernel_f32::post_ops_ok(
return false;
}

status_t jit_sse42_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp,
status_t jit_sse41_1x1_conv_kernel_f32::init_conf(jit_1x1_conv_conf_t &jcp,
const convolution_desc_t &cd, const memory_desc_wrapper &src_d,
const memory_desc_wrapper &weights_d, const memory_desc_wrapper &dst_d,
const primitive_attr_t &attr)
{
if (!mayiuse(sse42))
if (!mayiuse(sse41))
return status::unimplemented;

// TODO (Roma): this code is duplicated from the generic kernel; maybe the
Expand Down
16 changes: 8 additions & 8 deletions src/cpu/jit_sse41_1x1_conv_kernel_f32.hpp
Expand Up @@ -14,8 +14,8 @@
* limitations under the License.
*******************************************************************************/

#ifndef JIT_SSE42_1x1_CONV_KERNEL_F32_HPP
#define JIT_SSE42_1x1_CONV_KERNEL_F32_HPP
#ifndef JIT_SSE41_1x1_CONV_KERNEL_F32_HPP
#define JIT_SSE41_1x1_CONV_KERNEL_F32_HPP

#include "c_types_map.hpp"
#include "memory.hpp"
Expand All @@ -27,20 +27,20 @@ namespace mkldnn {
namespace impl {
namespace cpu {

struct jit_sse42_1x1_conv_kernel_f32: public jit_generator {
jit_sse42_1x1_conv_kernel_f32(jit_1x1_conv_conf_t ajcp,
struct jit_sse41_1x1_conv_kernel_f32: public jit_generator {
jit_sse41_1x1_conv_kernel_f32(jit_1x1_conv_conf_t ajcp,
const primitive_attr_t &attr)
: jcp(ajcp), attr_(attr), eltwise_injector_(nullptr)
{
if (jcp.with_eltwise)
eltwise_injector_ = new jit_uni_eltwise_injector_f32<sse42>(this,
eltwise_injector_ = new jit_uni_eltwise_injector_f32<sse41>(this,
jcp.eltwise);

this->generate();
jit_ker = (void (*)(jit_1x1_conv_call_s *))this->getCode();
}

~jit_sse42_1x1_conv_kernel_f32() {
~jit_sse41_1x1_conv_kernel_f32() {
delete eltwise_injector_;
}

Expand All @@ -54,7 +54,7 @@ struct jit_sse42_1x1_conv_kernel_f32: public jit_generator {
const memory_desc_wrapper &dst_d,
const primitive_attr_t &attr);

DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse42_1x1_conv_kernel_f32)
DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_sse41_1x1_conv_kernel_f32)

jit_1x1_conv_conf_t jcp;
const primitive_attr_t &attr_;
Expand Down Expand Up @@ -88,7 +88,7 @@ struct jit_sse42_1x1_conv_kernel_f32: public jit_generator {

xmm_t reg_bcast = xmm_t(15);

jit_uni_eltwise_injector_f32<sse42> *eltwise_injector_;
jit_uni_eltwise_injector_f32<sse41> *eltwise_injector_;

void generate_bcast_loop(int load_loop_blk);
void generate_reduce_loop(int load_loop_blk, int ur);
Expand Down
4 changes: 2 additions & 2 deletions src/cpu/jit_sse41_1x1_convolution.cpp
Expand Up @@ -17,7 +17,7 @@
#include "mkldnn_types.h"

#include "c_types_map.hpp"
#include "jit_sse42_1x1_convolution.hpp"
#include "jit_sse41_1x1_convolution.hpp"
#include "utils.hpp"
#include "mkldnn_thread.hpp"
#include "type_helpers.hpp"
Expand All @@ -34,7 +34,7 @@ namespace cpu {
using namespace mkldnn::impl::status;
using namespace mkldnn::impl::utils;

void jit_sse42_1x1_convolution_fwd_t::execute_forward(
void jit_sse41_1x1_convolution_fwd_t::execute_forward(
const exec_ctx_t &ctx) const {
auto src = CTX_IN_MEM(const data_t *, MKLDNN_ARG_SRC);
auto weights = CTX_IN_MEM(const data_t *, MKLDNN_ARG_WEIGHTS);
Expand Down
22 changes: 11 additions & 11 deletions src/cpu/jit_sse41_1x1_convolution.hpp
Expand Up @@ -14,22 +14,22 @@
* limitations under the License.
*******************************************************************************/

#ifndef CPU_JIT_SSE42_1x1_CONVOLUTION_HPP
#define CPU_JIT_SSE42_1x1_CONVOLUTION_HPP
#ifndef CPU_JIT_SSE41_1x1_CONVOLUTION_HPP
#define CPU_JIT_SSE41_1x1_CONVOLUTION_HPP

#include "c_types_map.hpp"
#include "mkldnn_thread.hpp"
#include "utils.hpp"

#include "cpu_convolution_pd.hpp"
#include "cpu_primitive.hpp"
#include "jit_sse42_1x1_conv_kernel_f32.hpp"
#include "jit_sse41_1x1_conv_kernel_f32.hpp"

namespace mkldnn {
namespace impl {
namespace cpu {

struct jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t {
struct jit_sse41_1x1_convolution_fwd_t: public cpu_primitive_t {
struct pd_t: public cpu_convolution_fwd_pd_t {
pd_t(engine_t *engine,
const convolution_desc_t *adesc,
Expand All @@ -39,8 +39,8 @@ struct jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t {
, jcp_() {}

DECLARE_COMMON_PD_T(
JIT_IMPL_NAME_HELPER("jit_1x1:", sse42, ""),
jit_sse42_1x1_convolution_fwd_t);
JIT_IMPL_NAME_HELPER("jit_1x1:", sse41, ""),
jit_sse41_1x1_convolution_fwd_t);

status_t init() {
bool ok = true
Expand All @@ -52,7 +52,7 @@ struct jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t {
&& set_default_formats();
if (!ok) return status::unimplemented;

return jit_sse42_1x1_conv_kernel_f32::init_conf(jcp_, *desc(),
return jit_sse41_1x1_conv_kernel_f32::init_conf(jcp_, *desc(),
*src_md(), *weights_md(), *dst_md(), *attr());
}

Expand All @@ -71,10 +71,10 @@ struct jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t {
}
};

jit_sse42_1x1_convolution_fwd_t(const pd_t *apd): cpu_primitive_t(apd) {
kernel_ = new jit_sse42_1x1_conv_kernel_f32(pd()->jcp_, *pd()->attr());
jit_sse41_1x1_convolution_fwd_t(const pd_t *apd): cpu_primitive_t(apd) {
kernel_ = new jit_sse41_1x1_conv_kernel_f32(pd()->jcp_, *pd()->attr());
}
~jit_sse42_1x1_convolution_fwd_t() { delete kernel_; };
~jit_sse41_1x1_convolution_fwd_t() { delete kernel_; };

typedef typename prec_traits<data_type::f32>::type data_t;

Expand All @@ -86,7 +86,7 @@ struct jit_sse42_1x1_convolution_fwd_t: public cpu_primitive_t {
private:
void execute_forward(const exec_ctx_t &ctx) const;
const pd_t *pd() const { return (const pd_t *)primitive_t::pd(); }
jit_sse42_1x1_conv_kernel_f32 *kernel_;
jit_sse41_1x1_conv_kernel_f32 *kernel_;
};

}
Expand Down

0 comments on commit 6b83d02

Please sign in to comment.