mxnet/wide_deep_criteo/patch/patch.diff

diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn
index 9910b48..08bd90c 160000
--- a/3rdparty/mkldnn
+++ b/3rdparty/mkldnn
@@ -1 +1 @@
-Subproject commit 9910b480296a0d1496db466531e56729b3922bbf
+Subproject commit 08bd90cca77683dd5d1c98068cea8b92ed05784d
diff --git a/3rdparty/sparse-matrix/Makefile b/3rdparty/sparse-matrix/Makefile
new file mode 100644
index 0000000..214312f
--- /dev/null
+++ b/3rdparty/sparse-matrix/Makefile
@@ -0,0 +1,21 @@
+CC = g++
+C = gcc
+MKLROOT = /opt/intel/mkl
+# USE_INTEL_PATH, when set, names the Intel install root that contains mkl/.
+ifneq ($(USE_INTEL_PATH),)
+	MKLROOT = $(USE_INTEL_PATH)/mkl
+endif
+# Headers come from the same MKLROOT that is linked below, so a USE_INTEL_PATH override applies to both.
+CFLAGS  = -fpic -O2 -I$(MKLROOT)/include -c -Wall -Werror  -DMKL_ILP64 -m64  -std=c++11
+LDFLAGS =  -Wl,--start-group -L${MKLROOT}/../compiler/lib/intel64 ${MKLROOT}/lib/intel64/libmkl_intel_ilp64.a ${MKLROOT}/lib/intel64/libmkl_intel_thread.a ${MKLROOT}/lib/intel64/libmkl_core.a -Wl,--end-group -liomp5 -lpthread -lm -ldl
+.PHONY: default clean
+default: libsparse_matrix.so
+
+libsparse_matrix.so:  sparse_matrix.o
+	$(CC) -shared -o libsparse_matrix.so sparse_matrix.o $(LDFLAGS)
+
+sparse_matrix.o:  sparse_matrix.cc sparse_matrix.h
+	$(CC) $(CFLAGS) sparse_matrix.cc
+
+clean:
+	$(RM) libsparse_matrix.so *.o *~
diff --git a/3rdparty/sparse-matrix/sparse_matrix.cc b/3rdparty/sparse-matrix/sparse_matrix.cc
new file mode 100644
index 0000000..f402294
--- /dev/null
+++ b/3rdparty/sparse-matrix/sparse_matrix.cc
@@ -0,0 +1,45 @@
+#include <iostream>
+#include <string>
+#include <fstream>
+#include <mkl_spblas.h>
+#include "sparse_matrix.h"
+
+
+
+// Computes y = A * X through MKL's inspector-executor sparse BLAS.
+//   rows_start, col_indx, values: zero-based CSR arrays of A (rows x cols); row i is taken to
+//       span [rows_start[i], rows_start[i + 1]), so rows_start is assumed to hold rows + 1 entries.
+//   X: dense row-major input with X_columns columns; y: dense row-major output, overwritten.
+// Returns true on success; on an MKL error the status is printed to stdout and false is returned.
+bool mkl_DotCsrDnsDns(SP_INT64* rows_start, SP_INT64* col_indx,
+	float* values, float* X, float* y,
+	int rows, int cols, int X_columns)
+{
+	sparse_matrix_t A = nullptr;
+	// The row-end array is the row-start array shifted by one element.
+	MKL_INT* rows_end = rows_start + 1;
+	sparse_status_t status = mkl_sparse_s_create_csr(&A, SPARSE_INDEX_BASE_ZERO, rows, cols,
+		rows_start, rows_end, col_indx, values);
+	if (status != SPARSE_STATUS_SUCCESS)
+	{
+		std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl;
+		return false;
+	}
+
+	// General matrix: no symmetry or triangular structure is assumed.
+	struct matrix_descr descrA;
+	descrA.type = SPARSE_MATRIX_TYPE_GENERAL;
+	// alpha = 1 and beta = 0, so y is overwritten with A * X.
+	status = mkl_sparse_s_mm(SPARSE_OPERATION_NON_TRANSPOSE, 1.0f, A, descrA,
+		SPARSE_LAYOUT_ROW_MAJOR, X, X_columns, X_columns, 0.0f, y, X_columns);
+
+	// Release the handle on every path; it used to leak when the multiply failed.
+	mkl_sparse_destroy(A);
+	if (status != SPARSE_STATUS_SUCCESS)
+	{
+		std::cout << "mkl_sparse_s_mm status :" << status << std::endl;
+		return false;
+	}
+
+	return true;
+}
diff --git a/3rdparty/sparse-matrix/sparse_matrix.h b/3rdparty/sparse-matrix/sparse_matrix.h
new file mode 100644
index 0000000..93054a8
--- /dev/null
+++ b/3rdparty/sparse-matrix/sparse_matrix.h
@@ -0,0 +1,48 @@
+#ifndef MXNET_OPERATOR_SPARSE_MATRIX_INL_H_
+#define MXNET_OPERATOR_SPARSE_MATRIX_INL_H_
+
+
+#if (!defined(__INTEL_COMPILER)) & defined(_MSC_VER)
+#define SP_INT64 __int64
+#define SP_UINT64 unsigned __int64
+#else
+#define SP_INT64 long long int
+#define SP_UINT64 unsigned long long int
+#endif
+
+
+#if defined _WIN32 || defined __CYGWIN__
+  #ifdef BUILDING_DLL
+    #ifdef __GNUC__
+      #define SPM_API_PUBLIC __attribute__ ((dllexport))
+    #else
+      #define SPM_API_PUBLIC __declspec(dllexport) // Note: gcc also seems to support this syntax.
+    #endif
+  #else
+    #ifdef __GNUC__
+      #define SPM_API_PUBLIC __attribute__ ((dllimport))
+    #else
+      #define SPM_API_PUBLIC __declspec(dllimport) // Note: gcc also seems to support this syntax.
+    #endif
+  #endif
+  #define SPM_API_LOCAL
+#else
+  #if __GNUC__ >= 4
+    #define SPM_API_PUBLIC __attribute__ ((visibility ("default")))
+    #define SPM_API_LOCAL  __attribute__ ((visibility ("hidden")))
+  #else
+    #define SPM_API_PUBLIC
+    #define SPM_API_LOCAL
+  #endif
+#endif
+
+// C-linkage entry point implemented in sparse_matrix.cc. SPM_API_PUBLIC (defined above) expands to
+// dllexport/dllimport on Windows/Cygwin and to default symbol visibility on GCC >= 4.
+extern "C"
+{
+	extern SPM_API_PUBLIC bool mkl_DotCsrDnsDns(SP_INT64* rows_start, SP_INT64* col_indx,
+		float* values, float* X, float* y, int rows, int cols, int X_columns);
+
+}
+
+#endif //MXNET_OPERATOR_SPARSE_MATRIX_INL_H_
\ No newline at end of file
diff --git a/Makefile b/Makefile
index 16ea59f..c8644bd 100644
--- a/Makefile
+++ b/Makefile
@@ -135,6 +135,12 @@ ifeq ($(USE_MKLDNN), 1)
 	LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}'
 endif
 
+ifeq ($(USE_BLAS), mkl)
+SPARSE_MATRIX_DIR =  $(ROOTDIR)/3rdparty/sparse-matrix
+CFLAGS += -I$(SPARSE_MATRIX_DIR)
+LDFLAGS += -L$(SPARSE_MATRIX_DIR) -lsparse_matrix
+endif
+
 # setup opencv
 ifeq ($(USE_OPENCV), 1)
 	CFLAGS += -DMXNET_USE_OPENCV=1 $(shell pkg-config --cflags opencv)
diff --git a/ci/docker/install/ubuntu_mklml.sh b/ci/docker/install/ubuntu_mklml.sh
index 862e284..ba54bd4 100755
--- a/ci/docker/install/ubuntu_mklml.sh
+++ b/ci/docker/install/ubuntu_mklml.sh
@@ -21,5 +21,5 @@
 # the whole docker cache for the image
 
 set -ex
-wget -q --no-check-certificate -O /tmp/mklml.tgz https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_lnx_2019.0.1.20180928.tgz
+wget -q --no-check-certificate -O /tmp/mklml.tgz https://github.com/intel/mkl-dnn/releases/download/v0.18-rc/mklml_lnx_2019.0.3.20190125.tgz
 tar -zxf /tmp/mklml.tgz && cp -rf mklml_*/* /usr/local/ && rm -rf mklml_*
diff --git a/cmake/DownloadMKLML.cmake b/cmake/DownloadMKLML.cmake
index eabf861..5a2875b 100644
--- a/cmake/DownloadMKLML.cmake
+++ b/cmake/DownloadMKLML.cmake
@@ -19,15 +19,19 @@
 
 message(STATUS "Downloading MKLML...")
 
-set(MKLDNN_RELEASE v0.17-rc)
-set(MKLML_RELEASE_FILE_SUFFIX 2019.0.1.20180928)
+set(MKLDNN_RELEASE v0.18-rc)
+set(MKLML_RELEASE_FILE_SUFFIX 2019.0.3.20190125)
+
+set(MKLDNN_WIN_MD5 88164189ff4f9ce8bcfd6065d4f2673d)
+set(MKLDNN_LNX_MD5 4e1a05d38491deb36001d62eae302920)
+set(MKLDNN_MAC_MD5 11a946c9623ef999145d6df92d803b2c)
 
 if(MSVC)
   set(MKL_NAME "mklml_win_${MKLML_RELEASE_FILE_SUFFIX}")
 
   file(DOWNLOAD "https://github.com/intel/mkl-dnn/releases/download/${MKLDNN_RELEASE}/${MKL_NAME}.zip"
        "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.zip"
-       EXPECTED_MD5 "443e661bdfd32dbbc99b460b43afceee" SHOW_PROGRESS)
+       EXPECTED_MD5 "${MKLDNN_WIN_MD5}" SHOW_PROGRESS)
   file(DOWNLOAD "https://github.com/apache/incubator-mxnet/releases/download/utils/7z.exe"
        "${CMAKE_CURRENT_BINARY_DIR}/mklml/7z2.exe"
        EXPECTED_MD5 "E1CF766CF358F368EC97662D06EA5A4C" SHOW_PROGRESS)
@@ -47,7 +51,7 @@ elseif(APPLE)
 
   file(DOWNLOAD "https://github.com/intel/mkl-dnn/releases/download/${MKLDNN_RELEASE}/${MKL_NAME}.tgz"
        "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.tgz"
-       EXPECTED_MD5 "95f887af332205b1d15b392260003952" SHOW_PROGRESS)
+       EXPECTED_MD5 "${MKLDNN_MAC_MD5}" SHOW_PROGRESS)
   execute_process(COMMAND "tar" "-xzf" "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.tgz"
                   "-C" "${CMAKE_CURRENT_BINARY_DIR}/mklml/")
 
@@ -61,7 +65,7 @@ elseif(UNIX)
 
   file(DOWNLOAD "https://github.com/intel/mkl-dnn/releases/download/${MKLDNN_RELEASE}/${MKL_NAME}.tgz"
        "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.tgz"
-       EXPECTED_MD5 "a63abf155361322b9c03f8fc50f4f317" SHOW_PROGRESS)
+       EXPECTED_MD5 "${MKLDNN_LNX_MD5}" SHOW_PROGRESS)
   execute_process(COMMAND "tar" "-xzf" "${CMAKE_CURRENT_BINARY_DIR}/mklml/${MKL_NAME}.tgz"
                   "-C" "${CMAKE_CURRENT_BINARY_DIR}/mklml/")
 
diff --git a/cpp-package/scripts/OpWrapperGenerator.py b/cpp-package/scripts/OpWrapperGenerator.py
index 1b5f8b5..c26c370 100644
--- a/cpp-package/scripts/OpWrapperGenerator.py
+++ b/cpp-package/scripts/OpWrapperGenerator.py
@@ -138,6 +138,8 @@ class Arg:
                 self.defaultString = 'Shape(' + self.defaultString[1:-1] + ")"
             elif self.type == 'dmlc::optional<int>':
                 self.defaultString = self.type + '(' + self.defaultString + ')'
+            elif self.type == 'dmlc::optional<bool>':
+                self.defaultString = self.type + '(' + self.defaultString + ')'
             elif typeString.startswith('caffe-layer-parameter'):
                 self.defaultString = 'textToCaffeLayerParameter(' + self.MakeCString(self.defaultString) + ')'
                 hasCaffe = True
diff --git a/docs/faq/env_var.md b/docs/faq/env_var.md
index c7d3b28..8d08e32 100644
--- a/docs/faq/env_var.md
+++ b/docs/faq/env_var.md
@@ -206,6 +206,12 @@ When USE_PROFILER is enabled in Makefile or CMake, the following environments ca
   If no such algorithm exists given other constraints, MXNet will error out. This variable affects the choice
   of CUDNN convolution algorithms. Please see [CUDNN developer guide](https://docs.nvidia.com/deeplearning/sdk/cudnn-developer-guide/index.html) for more details.
 
+* MXNET_CPU_PARALLEL_COPY_SIZE
+  - Values: Int ```(default=200000)```
+  - The minimum size to call parallel copy by OpenMP in CPU2CPU mode.
+  - When the array size is greater than or equal to this threshold, NDArray::Copy(from, to) is implemented with OpenMP using the recommended OMP thread count.
+  - When the array size is less than this threshold, NDArray::Copy(from, to) is implemented with a single-threaded memcpy.
+
 Settings for Minimum Memory Usage
 ---------------------------------
 - Make sure ```min(MXNET_EXEC_NUM_TEMP, MXNET_GPU_WORKER_NTHREADS) = 1```
diff --git a/example/sparse/wide_deep_census_quantization/data.py b/example/sparse/wide_deep_census_quantization/data.py
new file mode 100644
index 0000000..d90dff5
--- /dev/null
+++ b/example/sparse/wide_deep_census_quantization/data.py
@@ -0,0 +1,143 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# pylint: skip-file
+from csv import DictReader
+import os
+import mxnet as mx
+import numpy as np
+
+
+def get_uci_adult(data_dir, data_name, url):
+    """Download data_name from url into data_dir when it is absent, then return
+    preprocess_uci_adult's (csr, dns, label) for it. The working directory is not changed."""
+    if not os.path.isdir(data_dir):
+        os.mkdir(data_dir)
+    data_path = os.path.join(data_dir, data_name)
+    if not os.path.exists(data_path):
+        print("Dataset " + data_name + " not present. Downloading now ...")
+        os.system("wget -P %r %r" % (data_dir, url + data_name))  # -P: save under data_dir
+        if "test" in data_name:
+            os.system("sed -i '1d' %r" % data_path)  # drop the first line of test files
+        print("Dataset " + data_name + " is now present.")
+    return preprocess_uci_adult(data_path)
+
+max_dict = {'age': 90, 'education_num': 16, 'capital_gain': 99999, 'capital_loss': 4356, 'hours_per_week': 99}
+min_dict = {'age': 17, 'education_num': 1, 'capital_gain': 0, 'capital_loss': 0, 'hours_per_week': 1}
+
+def preprocess_uci_adult(data_name):
+    """Parse the UCI adult CSV at data_name into (csr, dns, label): hashed crossed features as a
+    CSR matrix, a dense feature array and a 0/1 label array. Some tricks of feature engineering
+    are adapted from tensorflow's wide and deep tutorial."""
+    csv_columns = [
+        "age", "workclass", "fnlwgt", "education", "education_num",
+        "marital_status", "occupation", "relationship", "race", "gender",
+        "capital_gain", "capital_loss", "hours_per_week", "native_country",
+        "income_bracket"
+    ]
+    # Known category values per column; the position in each list is the one-hot index used below.
+    vocabulary_dict = {
+        "gender": [
+            "Female", "Male"
+        ],
+        "education": [
+            "Bachelors", "HS-grad", "11th", "Masters", "9th",
+            "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
+            "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
+            "Preschool", "12th"
+        ],
+        "marital_status": [
+            "Married-civ-spouse", "Divorced", "Married-spouse-absent",
+            "Never-married", "Separated", "Married-AF-spouse", "Widowed"
+        ],
+        "relationship": [
+            "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
+            "Other-relative"
+        ],
+        "workclass": [
+            "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
+            "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
+        ]
+    }
+    # wide columns: the values of each group are joined into one string and hashed into a bucket
+    crossed_columns = [
+        ["education", "occupation"],
+        ["native_country", "occupation"],
+        ["age_buckets", "education", "occupation"],
+    ]
+    age_boundaries = [18, 25, 30, 35, 40, 45, 50, 55, 60, 65]  # bin edges given to np.digitize
+    # deep columns
+    indicator_columns = ['workclass', 'education', 'gender', 'relationship']
+
+    embedding_columns = ['native_country', 'occupation']
+
+    continuous_columns = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
+    # income_bracket column is the label
+    labels = ["<", ">"]
+
+    hash_bucket_size = 1000
+    hash_bucket_int8 = 255  # NOTE(review): not referenced again in this function
+    csr_ncols = len(crossed_columns) * hash_bucket_size  # NOTE(review): not referenced again in this function
+    dns_ncols = len(continuous_columns) + len(embedding_columns)
+    for col in indicator_columns:
+        dns_ncols += len(vocabulary_dict[col])
+
+    label_list = []
+    csr_list = []
+    dns_list = []
+
+    with open(data_name) as f:
+        for row in DictReader(f, fieldnames=csv_columns):
+            label_list.append(labels.index(row['income_bracket'].strip()[0]))  # '<' -> 0, '>' -> 1
+            # One (column index, 1.0) entry per crossed column. NOTE(review): str hash() is salted per process on Python 3 unless PYTHONHASHSEED is set, so bucket ids may not be reproducible across runs - confirm.
+            for i, cols in enumerate(crossed_columns):
+                if cols[0] == "age_buckets":
+                    age_bucket = np.digitize(float(row["age"]), age_boundaries)
+                    s = '_'.join([row[col].strip() for col in cols[1:]])
+                    s += '_' + str(age_bucket)
+                    csr_list.append((i * hash_bucket_size + hash(s) % hash_bucket_size, 1.0))
+                else:
+                    s = '_'.join([row[col].strip() for col in cols])
+                    csr_list.append((i * hash_bucket_size + hash(s) % hash_bucket_size, 1.0))
+            # Dense row layout: embedding hash ids, then one-hot indicators, then scaled continuous values.
+            dns_row = [0] * dns_ncols
+            dns_dim = 0
+            for col in embedding_columns:
+                dns_row[dns_dim] = hash(row[col].strip()) % hash_bucket_size
+                dns_dim += 1
+
+            for col in indicator_columns:
+                dns_row[dns_dim + vocabulary_dict[col].index(row[col].strip())] = 1.0
+                dns_dim += len(vocabulary_dict[col])
+            scale = 1.0 #this is adjustable to hit the good accuracy
+            for col in continuous_columns:
+                orig_range = float(max_dict[col] - min_dict[col])  # min-max scaling with the module-level tables
+                dns_row[dns_dim] = (float(row[col].strip()) - min_dict[col]) * scale / orig_range
+                #dns_row[dns_dim] = float(row[col].strip())
+                dns_dim += 1
+
+            dns_list.append(dns_row)
+    # Every row appended exactly len(crossed_columns) entries, so indptr advances by that step.
+    data_list = [item[1] for item in csr_list]
+    indices_list = [item[0] for item in csr_list]
+    indptr_list = range(0, len(indices_list) + 1, len(crossed_columns))
+    # convert to ndarrays
+    csr = mx.nd.sparse.csr_matrix((data_list, indices_list, indptr_list),
+                                  shape=(len(label_list), hash_bucket_size * len(crossed_columns)))
+    dns = np.array(dns_list)
+    label = np.array(label_list)
+    return csr, dns, label
diff --git a/example/sparse/wide_deep_census_quantization/inference.py b/example/sparse/wide_deep_census_quantization/inference.py
new file mode 100644
index 0000000..92b0e06
--- /dev/null
+++ b/example/sparse/wide_deep_census_quantization/inference.py
@@ -0,0 +1,200 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import logging
+import os
+import time
+import mxnet as mx
+from mxnet import nd
+from mxnet.contrib.quantization import *
+from data import *
+from mxnet.base import check_call, _LIB
+# Logs the transfer when a logger is given, then calls mx.test_utils.download. NOTE(review): dataset_dir is passed as download's second positional argument - confirm whether that is a file name or a directory.
+def download_dataset(dataset_url, dataset_dir, logger=None):
+    if logger is not None:
+        logger.info('Downloading dataset for inference from %s to %s' % (dataset_url, dataset_dir))
+    mx.test_utils.download(dataset_url, dataset_dir)
+
+
+def load_model(symbol_file, param_file, logger=None):
+    """Load a symbol and its parameters; both paths are joined onto this script's directory.
+
+    Returns (symbol, arg_params, aux_params). Saved keys are split on the first ':' and
+    routed by their 'arg' / 'aux' prefix; entries with any other prefix are ignored.
+    """
+    script_dir = os.path.dirname(os.path.realpath(__file__))
+    sym_path = os.path.join(script_dir, symbol_file)
+    params_path = os.path.join(script_dir, param_file)
+    if logger is not None:
+        logger.info('Loading symbol from file %s' % sym_path)
+    symbol = mx.sym.load(sym_path)
+    if logger is not None:
+        logger.info('Loading params from file %s' % params_path)
+    params_by_kind = {'arg': {}, 'aux': {}}
+    for key, value in nd.load(params_path).items():
+        kind, name = key.split(':', 1)
+        if kind in params_by_kind:
+            params_by_kind[kind][name] = value
+    return symbol, params_by_kind['arg'], params_by_kind['aux']
+
+
+def advance_data_iter(data_iter, n):
+    """Consume n batches from data_iter via its next() method and return the iterator.
+
+    Returns None instead when the iterator raises StopIteration before n batches are read.
+    """
+    assert n >= 0
+    if n == 0:
+        return data_iter
+    try:
+        for _ in range(n):
+            data_iter.next()
+    except StopIteration:
+        return None
+    return data_iter
+
+
+
+# Related to feature engineering, please see preprocess in data.py
+ADULT = {
+    'train': 'adult.data',
+    'test': 'adult.test',
+    'url': 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/',
+    'num_linear_features': 3000,
+    'num_embed_features': 2,
+    'num_cont_features': 38,
+    'embed_input_dims': [1000, 1000],
+    'hidden_units': [32, 1024, 512, 256],
+}
+symbol_file = 'checkpoint-symbol.json'
+param_file = 'checkpoint-0009.params'
+#symbol_file = 'WD-quantized-162batches-naive-symbol.json'
+#param_file = 'WD-quantized-0000.params'
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Score a model on a dataset')
+    # --accuracy takes an explicit value (e.g. --accuracy=True). argparse's type=bool would turn any non-empty string, "False" included, into True, so the text is parsed explicitly.
+    parser.add_argument('--symbol-file', type=str, default=symbol_file, help='symbol file path')
+    parser.add_argument('--param-file', type=str, default=param_file, help='param file path')
+    parser.add_argument('--batch-size', type=int, default=1024)
+    parser.add_argument('--label-name', type=str, default='softmax_label')
+    parser.add_argument('--accuracy', type=lambda v: v.lower() in ('true', '1', 'yes'), default=False)
+    parser.add_argument('--shuffle-dataset', action='store_true', default=True,
+                        help='shuffle the calibration dataset')
+    parser.add_argument('--num-omp-threads', type=int, default=28)
+
+    args = parser.parse_args()
+    # ctypes was only reachable through the star imports at the top of this file; import it explicitly.
+    import ctypes
+    ctx = mx.cpu()
+
+
+    logging.basicConfig()
+    logger = logging.getLogger('logger')
+    logger.setLevel(logging.INFO)
+    if args.accuracy:
+        logger.info('Accuracy Mode')
+    else:
+        logger.info('Performance Mode')
+
+    symbol_file = args.symbol_file
+    param_file = args.param_file
+
+
+    batch_size = args.batch_size
+    logger.info('batch size = %d for inference' % batch_size)
+    label_name = args.label_name
+    logger.info('label_name = %s' % label_name)
+    data_dir = os.path.join(os.getcwd(), 'data')
+    val_data = os.path.join(data_dir, ADULT['test'])
+    # Performance mode scores the saved train_* arrays; accuracy mode scores the saved val_* arrays.
+    if not args.accuracy:
+        val_csr_np = np.load('train_csr.npy')
+        val_csr = mx.nd.sparse.csr_matrix(val_csr_np)
+        val_dns = np.load('train_dns.npy')
+        val_label = np.load('train_label.npy')
+    else:
+        val_csr_np = np.load('val_csr.npy')
+        val_csr = mx.nd.sparse.csr_matrix(val_csr_np)
+        val_dns = np.load('val_dns.npy')
+        val_label = np.load('val_label.npy')
+
+    # creating data iterator
+    data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns},
+                                  {'softmax_label': val_label}, batch_size,
+                                  shuffle=False, last_batch_handle='discard')
+
+    # loading model
+    sym, arg_params, aux_params = load_model(symbol_file, param_file, logger)
+
+    # make sure that fp32 inference works on the same images as calibrated quantized model
+
+    logger.info('Running model %s for inference' % symbol_file)
+
+    acc_m = mx.metric.create('acc')
+    mod = mx.mod.Module(symbol=sym, context=ctx, data_names=['csr_data', 'dns_data'], label_names=[label_name, ])
+    mod.bind(for_training=False,
+             data_shapes=data.provide_data,
+             label_shapes=data.provide_label)
+    mod.set_params(arg_params, aux_params)
+
+    check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads)))
+    batch_data = []
+    nbatch = 0
+    for batch in data:
+        batch_data.append(batch)
+    #data warm up
+    wi = 50
+    i = 0
+    for batch in batch_data:
+        if i < wi:
+            mod.forward(batch, is_train=False)
+            i += 1
+        else:
+            break
+    data.hard_reset()
+    mx.nd.waitall()
+
+    collector = None
+
+    #real run
+    if "DO_WIDE_DEEP_PROFILING" in os.environ:
+        print("wide_deep profiling start !!!!!!!!!!!!!")
+        mx.profiler.set_config(profile_symbolic=True, profile_imperative=True, profile_memory=False, profile_api=False)
+        mx.profiler.set_state('run')
+
+    nbatch = 0
+    tic = time.time()
+    for batch in batch_data:
+        nbatch += 1
+        mod.forward(batch, is_train=False)
+        if args.accuracy:
+            for output in mod.get_outputs():
+                output.wait_to_read()
+            mod.update_metric(acc_m, batch.label)
+        else:
+            mx.nd.waitall()
+    speed = nbatch * batch_size / (time.time() - tic)
+    logger.info("Run [%d] Batchs \tSpeed: %.2f samples/sec", nbatch, speed)
+
+    if args.accuracy:
+        logger.info(acc_m.get())
+    if "DO_WIDE_DEEP_PROFILING" in os.environ :
+        print("wide_deep profiling end !")
+        mx.profiler.set_state('stop')
+        profiler_info = mx.profiler.dumps()
+        print(profiler_info)
diff --git a/example/sparse/wide_deep_census_quantization/model.py b/example/sparse/wide_deep_census_quantization/model.py
new file mode 100644
index 0000000..7e6e216
--- /dev/null
+++ b/example/sparse/wide_deep_census_quantization/model.py
@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+
+# Builds the wide & deep symbol: a sparse linear ("wide") branch over csr_data plus an MLP ("deep") branch over dns_data, summed and fed to SoftmaxOutput.
+def wide_deep_model(num_linear_features, num_embed_features, num_cont_features,
+                    input_dims, hidden_units):
+    # wide model
+    csr_data = mx.symbol.Variable("csr_data", stype='csr')
+    label = mx.symbol.Variable("softmax_label")
+
+    norm_init = mx.initializer.Normal(sigma=0.01)
+    # weight with row_sparse storage type to enable sparse gradient updates
+    weight = mx.symbol.Variable("linear_weight", shape=(num_linear_features, hidden_units[3]),
+                                init=norm_init, stype='row_sparse')
+    bias = mx.symbol.Variable("linear_bias", shape=(hidden_units[3],))
+    dot = mx.symbol.sparse.dot(csr_data, weight)
+    linear_out = mx.symbol.broadcast_add(dot, bias)
+    # deep model
+    dns_data = mx.symbol.Variable("dns_data")
+    # embedding features
+    x = mx.symbol.slice(data=dns_data, begin=(0, 0),
+                        end=(None, num_embed_features))
+    embeds = mx.symbol.split(data=x, num_outputs=num_embed_features, squeeze_axis=1)
+    # continuous features
+    x = mx.symbol.slice(data=dns_data, begin=(0, num_embed_features),
+                        end=(None, num_embed_features + num_cont_features))
+    features = [x]
+    # one Embedding per split embedding column, appended after the sliced dense features
+    for i, embed in enumerate(embeds):
+        embed_weight = mx.symbol.Variable('embed_%d_weight' % i, stype='row_sparse')
+        features.append(mx.symbol.sparse.Embedding(data=embed, weight=embed_weight,
+                        input_dim=input_dims[i], output_dim=hidden_units[0], sparse_grad=True))
+    # NOTE(review): the FullyConnected layers below are unnamed; the quantization settings in this directory refer to 'fullyconnected0..2', presumably MXNet's auto-generated names - keep the creation order stable.
+    hidden = mx.symbol.concat(*features, dim=1)
+    hidden = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[1])
+    hidden = mx.symbol.Activation(data=hidden, act_type='relu')
+    hidden = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[2])
+    hidden = mx.symbol.Activation(data=hidden, act_type='relu')
+    deep_out = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[3])
+    # both branches are sized by hidden_units[3] and are added before the softmax
+    out = mx.symbol.SoftmaxOutput(linear_out + deep_out, label, name='model')
+    return out
diff --git a/example/sparse/wide_deep_census_quantization/quant_accuracy.sh b/example/sparse/wide_deep_census_quantization/quant_accuracy.sh
new file mode 100755
index 0000000..dede7b8
--- /dev/null
+++ b/example/sparse/wide_deep_census_quantization/quant_accuracy.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+# Runs inference.py in accuracy mode on the quantized symbol and params files named below.
+python inference.py --accuracy=True --symbol-file=WD-quantized-162batches-naive-symbol.json --param-file=WD-quantized-0000.params
diff --git a/example/sparse/wide_deep_census_quantization/qunatization_settings/all.py b/example/sparse/wide_deep_census_quantization/qunatization_settings/all.py
new file mode 100644
index 0000000..6cd6719
--- /dev/null
+++ b/example/sparse/wide_deep_census_quantization/qunatization_settings/all.py
@@ -0,0 +1,10 @@
+# Settings with nothing excluded. NOTE(review): the 'excluse' key is kept as spelled because its consumer is not visible here.
+def get_qsettings():
+    settings = {
+        'excluse':  None,
+        'quantized_alg_setting': {
+            'fullyconnected0': ['int8', 'naive'],  # was a duplicate 'fullyconnected1' key that Python silently discarded
+            'fullyconnected1': ['uint8', 'naive'],
+            'fullyconnected2': ['uint8', 'naive'], },
+    }
+    return settings
\ No newline at end of file
diff --git a/example/sparse/wide_deep_census_quantization/qunatization_settings/fc1_fc2.py b/example/sparse/wide_deep_census_quantization/qunatization_settings/fc1_fc2.py
new file mode 100644
index 0000000..2af04cf
--- /dev/null
+++ b/example/sparse/wide_deep_census_quantization/qunatization_settings/fc1_fc2.py
@@ -0,0 +1,9 @@
+
+def get_qsettings():
+    """Return the settings that exclude fullyconnected0 and give
+    fullyconnected1 and fullyconnected2 the ['uint8', 'naive'] setting."""
+    quantized_layers = ('fullyconnected1', 'fullyconnected2')
+    return {
+        'excluse': ['fullyconnected0'],
+        'quantized_alg_setting': {layer: ['uint8', 'naive'] for layer in quantized_layers},
+    }
\ No newline at end of file
diff --git a/example/sparse/wide_deep_census_quantization/qunatization_settings/fullyconnected2.py b/example/sparse/wide_deep_census_quantization/qunatization_settings/fullyconnected2.py
new file mode 100644
index 0000000..ada3ab6
--- /dev/null
+++ b/example/sparse/wide_deep_census_quantization/qunatization_settings/fullyconnected2.py
@@ -0,0 +1,8 @@
+
+def get_qsettings():
+    """Return the settings that exclude fullyconnected0 and fullyconnected1
+    and give fullyconnected2 the ['uint8', 'naive'] setting."""
+    return {
+        'excluse': ['fullyconnected0', 'fullyconnected1'],
+        'quantized_alg_setting': {'fullyconnected2': ['uint8', 'naive']},
+    }
\ No newline at end of file
diff --git a/example/sparse/wide_deep_census_quantization/train.py b/example/sparse/wide_deep_census_quantization/train.py
new file mode 100644
index 0000000..204b2c6
--- /dev/null
+++ b/example/sparse/wide_deep_census_quantization/train.py
@@ -0,0 +1,132 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+from mxnet.test_utils import *
+from data import *
+from model import *
+import argparse
+import os
+
+
+parser = argparse.ArgumentParser(description="Run sparse wide and deep classification ",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--num-epoch', type=int, default=10,
+                    help='number of epochs to train')
+parser.add_argument('--batch-size', type=int, default=100,
+                    help='number of examples per batch')
+parser.add_argument('--lr', type=float, default=0.001,
+                    help='learning rate')
+parser.add_argument('--cuda', action='store_true', default=False,
+                    help='Train on GPU with CUDA')
+parser.add_argument('--optimizer', type=str, default='adam',
+                    help='what optimizer to use',
+                    choices=["ftrl", "sgd", "adam"])
+parser.add_argument('--log-interval', type=int, default=100,
+                    help='number of batches to wait before logging training status')
+
+
+# Related to feature engineering, please see preprocess in data.py
+ADULT = {
+    'train': 'adult.data',
+    'test': 'adult.test',
+    'url': 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/',
+    'num_linear_features': 3000,
+    'num_embed_features': 2,
+    'num_cont_features': 38,
+    'embed_input_dims': [1000, 1000],
+    'hidden_units': [32, 1024, 512, 256],
+}
+
+
+if __name__ == '__main__':
+    import logging
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.INFO, format=head)
+
+    # arg parser
+    args = parser.parse_args()
+    logging.info(args)
+    num_epoch = args.num_epoch
+    batch_size = args.batch_size
+    optimizer = args.optimizer
+    log_interval = args.log_interval
+    lr = args.lr
+    ctx = mx.gpu(0) if args.cuda else mx.cpu()
+
+    # dataset (NOTE(review): train_data is rebound to an iterator below and val_data is not used again)
+    data_dir = os.path.join(os.getcwd(), 'data')
+    train_data = os.path.join(data_dir, ADULT['train'])
+    val_data = os.path.join(data_dir, ADULT['test'])
+    train_csr, train_dns, train_label = get_uci_adult(data_dir, ADULT['train'], ADULT['url'])
+    val_csr, val_dns, val_label = get_uci_adult(data_dir, ADULT['test'], ADULT['url'])
+    np.save('train_csr', train_csr.asnumpy())  # np.save appends '.npy'; inference.py loads these files
+    np.save('train_dns', train_dns)
+    np.save('train_label', train_label)
+    np.save('val_csr', val_csr.asnumpy())
+    np.save('val_dns', val_dns)
+    np.save('val_label', val_label)
+
+    model = wide_deep_model(ADULT['num_linear_features'], ADULT['num_embed_features'],
+                            ADULT['num_cont_features'], ADULT['embed_input_dims'],
+                            ADULT['hidden_units'])
+
+    # data iterator
+    train_data = mx.io.NDArrayIter({'csr_data': train_csr, 'dns_data': train_dns},
+                                   {'softmax_label': train_label}, batch_size,
+                                   shuffle=True, last_batch_handle='discard')
+    eval_data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns},
+                                  {'softmax_label': val_label}, batch_size,
+                                  shuffle=True, last_batch_handle='discard')
+
+    # module
+    mod = mx.mod.Module(symbol=model, context=ctx ,data_names=['csr_data', 'dns_data'],
+                        label_names=['softmax_label'])
+    mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
+    mod.init_params()
+    optim = mx.optimizer.create(optimizer, learning_rate=lr, rescale_grad=1.0/batch_size)
+    mod.init_optimizer(optimizer=optim)
+    # use accuracy as the metric
+    metric = mx.metric.create(['acc'])
+    # progress callback, invoked once per batch with batch_size and log_interval
+    speedometer = mx.callback.Speedometer(batch_size, log_interval)
+
+    logging.info('Training started ...')
+    # the same iterator object is reused for every epoch and reset at the end of each pass
+    data_iter = iter(train_data)
+    for epoch in range(num_epoch):
+        nbatch = 0
+        metric.reset()
+        for batch in data_iter:
+            nbatch += 1
+            mod.forward_backward(batch)
+            # update all parameters (including the weight parameter)
+            mod.update()
+            # update training metric
+            mod.update_metric(metric, batch.label)
+            speedometer_param = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch,
+                                                       eval_metric=metric, locals=locals())
+            speedometer(speedometer_param)
+        # evaluate metric on validation dataset
+        score = mod.score(eval_data, ['acc'])
+        logging.info('epoch %d, accuracy = %s' % (epoch, score[0][1]))
+        # one checkpoint per epoch under the "checkpoint" prefix, optimizer state included
+        mod.save_checkpoint("checkpoint", epoch, save_optimizer_states=True)
+        # reset the iterator for next pass of data
+        data_iter.reset()
+
+    logging.info('Training completed.')
diff --git a/example/sparse/wide_deep_census_quantization/wd_gen_qsym_mkldnn.py b/example/sparse/wide_deep_census_quantization/wd_gen_qsym_mkldnn.py
new file mode 100644
index 0000000..a609d92
--- /dev/null
+++ b/example/sparse/wide_deep_census_quantization/wd_gen_qsym_mkldnn.py
@@ -0,0 +1,169 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import os
+import logging
+import mxnet as mx
+from mxnet import nd
+from mxnet.contrib.quantization import *
+from mxnet.base import SymbolHandle, check_call, _LIB, mx_uint, c_str_array
+import ctypes
+
+
+
+def load_model(symbol_file, param_file, logger=None):
+    """Load a symbol and its parameters; both paths are resolved against this script's directory.
+    Returns (symbol, arg_params, aux_params); saved keys look like 'arg:NAME' or 'aux:NAME'."""
+    here = os.path.dirname(os.path.realpath(__file__))
+    sym_path = os.path.join(here, symbol_file)
+    if logger is not None:
+        logger.info('Loading symbol from file %s' % sym_path)
+    symbol = mx.sym.load(sym_path)
+    params_path = os.path.join(here, param_file)
+    if logger is not None:
+        logger.info('Loading params from file %s' % params_path)
+    arg_params, aux_params = {}, {}
+    # Entries carrying any other prefix are skipped.
+    for key, array in nd.load(params_path).items():
+        kind, name = key.split(':', 1)
+        if kind == 'arg':
+            arg_params[name] = array
+        elif kind == 'aux':
+            aux_params[name] = array
+    return symbol, arg_params, aux_params
+
+
+def save_symbol(fname, sym, logger=None):  # write `sym` to the file `fname` through sym.save()
+    if logger is not None:  # logging is optional
+        logger.info('Saving symbol into file at %s' % fname)
+    sym.save(fname)
+
+
+def save_params(fname, arg_params, aux_params, logger=None):
+    """Save both dicts into `fname`: keys get an 'arg:' / 'aux:' prefix, values go through as_in_context(cpu())."""
+    if logger is not None:
+        logger.info('Saving params into file at %s' % fname)
+    tagged = [('arg:%s' % k, v) for k, v in arg_params.items()] + [('aux:%s' % k, v) for k, v in aux_params.items()]
+    mx.nd.save(fname, {key: value.as_in_context(cpu()) for key, value in tagged})
+
+
+if __name__ == '__main__':  # script entry point: quantize the trained FP32 checkpoint
+    parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model with Intel MKL-DNN support')
+    parser.add_argument('--batch-size', type=int, default=32)
+    parser.add_argument('--label-name', type=str, default='softmax_label')
+    parser.add_argument('--num-calib-batches', type=int, default=162,
+                        help='number of batches for calibration')
+    parser.add_argument('--calib-mode', type=str, default='naive',
+                        help='calibration mode used for generating calibration table for the quantized symbol; supports'
+                             ' 1. none: no calibration will be used. The thresholds for quantization will be calculated'
+                             ' on the fly. This will result in inference speed slowdown and loss of accuracy'
+                             ' in general.'
+                             ' 2. naive: simply take min and max values of layer outputs as thresholds for'
+                             ' quantization. In general, the inference accuracy worsens with more examples used in'
+                             ' calibration. It is recommended to use `entropy` mode as it produces more accurate'
+                             ' inference results.'
+                             ' 3. entropy: calculate KL divergence of the fp32 output and quantized output for optimal'
+                             ' thresholds. This mode is expected to produce the best inference accuracy of all three'
+                             ' kinds of quantized models if the calibration dataset is representative enough of the'
+                             ' inference dataset.')
+    parser.add_argument('--quantized-dtype', type=str, default='uint8',
+                        choices=['int8', 'uint8'],
+                        help='quantization destination data type for input data')
+    parser.add_argument('--enable-calib-quantize', type=bool, default=True,
+                        help='If enabled, the quantize op will '
+                             'be calibrated offline if calibration mode is '
+                             'enabled')
+    args = parser.parse_args()
+    ctx = mx.cpu(0)
+    logging.basicConfig()
+    logger = logging.getLogger('logger')
+    logger.setLevel(logging.INFO)
+    # NOTE: --enable-calib-quantize is parsed but never read below; calib_quantize_op is passed as True.
+    calib_mode = args.calib_mode
+    logger.info('calibration mode set to %s' % calib_mode)
+    batch_size = args.batch_size
+
+    train_csr_np = np.load('train_csr.npy')
+    train_csr = mx.nd.sparse.csr_matrix(train_csr_np)
+    train_dns = np.load('train_dns.npy')
+    train_label = np.load('train_label.npy')
+
+    val_csr_np = np.load('val_csr.npy')
+    val_csr = mx.nd.sparse.csr_matrix(val_csr_np)
+    val_dns = np.load('val_dns.npy')
+    val_label = np.load('val_label.npy')
+    # Only the validation arrays feed the calibration iterator; the train_* arrays loaded
+    # above are not referenced again in this script.
+    # creating data iterator
+    data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns},
+                                  {'softmax_label': val_label}, batch_size,
+                                  shuffle=True, last_batch_handle='discard')
+    symbol_file = 'checkpoint-symbol.json'  # FP32 symbol to load and quantize
+    sym, arg_params, aux_params = load_model(symbol_file, 'checkpoint-0009.params', logger)
+    sym = sym.get_backend_symbol('MKLDNN_PARALLEL_EMBEDDING')
+
+    # get batch size
+    batch_size = args.batch_size
+    logger.info('batch size = %d for calibration' % batch_size)
+
+    # get number of batches for calibration
+    num_calib_batches = args.num_calib_batches
+    if calib_mode == 'none':
+        logger.info('skip calibration step as calib_mode is none')
+    else:
+        logger.info('number of batches = %d for calibration' % num_calib_batches)
+
+
+
+    label_name = args.label_name
+    logger.info('label_name = %s' % label_name)
+
+    excluded_sym_names = None
+    prefix = 'WD'
+    epoch=0
+    calib_layer = lambda name: (name.find('fullyconnected') != -1 or name.find('FullyConnected') != -1)
+    if calib_mode == 'none':
+        logger.info('Quantizing FP32 model %s' % symbol_file)
+        qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
+                                                       ctx=ctx, excluded_sym_names=excluded_sym_names,
+                                                       data_names=['csr_data', 'dns_data'],
+                                                       calib_mode=calib_mode, quantized_dtype=args.quantized_dtype,
+                                                       logger=logger)
+        sym_name = '%s-symbol.json' % (prefix + '-quantized')
+    else:
+        qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
+                                                        ctx=ctx, excluded_sym_names=excluded_sym_names,
+                                                        data_names=['csr_data', 'dns_data'],
+                                                        calib_mode=calib_mode, calib_data=data,
+                                                        num_calib_examples=num_calib_batches * batch_size,
+                                                        calib_layer=calib_layer, quantized_dtype=args.quantized_dtype,
+                                                        label_names=(label_name,), calib_quantize_op = True,
+                                                        logger=logger)
+        if calib_mode == 'entropy':
+            suffix = '-quantized-%dbatches-entropy' % num_calib_batches
+        elif calib_mode == 'naive':
+            suffix = '-quantized-%dbatches-naive' % num_calib_batches
+        else:
+            raise ValueError('unknow calibration mode %s received, only supports `none`, `naive`, and `entropy`'
+                             % calib_mode)
+        sym_name = '%s-symbol.json' % (prefix + suffix)
+    qsym = qsym.get_backend_symbol('MKLDNN_POST_FC_QUANTIZE')
+    qsym = qsym.get_backend_symbol('MKLDNN_QFC_POST_RELU_FUSED')
+    save_symbol(sym_name, qsym, logger)
+    param_name = '%s-%04d.params' % (prefix + '-quantized', epoch)
+    save_params(param_name, qarg_params, aux_params, logger)
diff --git a/example/sparse/wide_deep_criteo_quantization/data.py b/example/sparse/wide_deep_criteo_quantization/data.py
new file mode 100644
index 0000000..2fc0f66
--- /dev/null
+++ b/example/sparse/wide_deep_criteo_quantization/data.py
@@ -0,0 +1,74 @@
+from csv import DictReader
+import os
+import mxnet as mx
+import numpy as np
+
+def get_uci_criteo(data_dir, data_name):
+    """Preprocess the Criteo CSV `data_name` located under `data_dir`; returns (csr, dns, label)."""
+    data_file = os.path.join(data_dir, data_name)
+    if not os.path.exists(data_file):
+        print("Dataset " + data_file + " not present")
+    return preprocess_uci_criteo(data_file)  # open the same path that was just checked
+
+
+
+#    Label - Target variable that indicates if an ad was clicked (1) or not (0).
+#    I1-I13 - A total of 13 columns of integer features (mostly count features).
+#    C1-C26 - A total of 26 columns of categorical features. The values of these features have been hashed onto 32 bits for anonymization purposes.
+
+CONTINUOUS_COLUMNS =  ["I"+str(i) for i in range(1,14)] # 1-13 inclusive
+CATEGORICAL_COLUMNS = ["C"+str(i) for i in range(1,27)] # 1-26 inclusive
+LABEL_COLUMN = ["clicked"]
+# Field order used when reading each CSV row: label, then I1-I13, then C1-C26.
+TRAIN_DATA_COLUMNS = LABEL_COLUMN + CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS
+FEATURE_COLUMNS = CONTINUOUS_COLUMNS + CATEGORICAL_COLUMNS
+max_dict = {'I11': 79, 'I7': 1251, 'I13': 369, 'I8': 682, 'I5': 1618112, 'I2': 18522, 'I10': 6, 'I6': 12167, 'I3': 65535, 'I1': 214, 'I12': 72, 'I4': 280, 'I9': 7335}  # per-column maximum used for min-max scaling in preprocess_uci_criteo
+min_dict = {'I11': 0, 'I7': 0, 'I13': 0, 'I8': 0, 'I5': 0, 'I2': -2, 'I10': 0, 'I6': 0, 'I3': 0, 'I1': 0, 'I12': 0, 'I4': 0, 'I9': 0}  # per-column minimum used for the same scaling
+def preprocess_uci_criteo(data_name):
+    """Convert a header-less Criteo CSV into (csr, dns, label) for the wide & deep model.
+
+    data_name -- path of the CSV file; fields are read in TRAIN_DATA_COLUMNS order.
+    csr   -- mx.nd.sparse CSR matrix holding a 1.0 per categorical column of every row.
+    dns   -- numpy array per row: hashed categorical ids first, scaled I-columns after.
+    label -- numpy array of the 'clicked' field exactly as read from the file.
+    """
+    hash_bucket_size = 1000  # columns reserved per categorical feature in the csr shape
+    hash_bucket_q = 256  # modulus applied to hash() of each categorical value
+
+    label_list = []
+    csr_list = []
+    dns_list = []
+
+    dns_ncols = len(CONTINUOUS_COLUMNS) + len(CATEGORICAL_COLUMNS)
+    with open(data_name) as f:
+        for row in DictReader(f, fieldnames=TRAIN_DATA_COLUMNS):
+            label_list.append(row['clicked'])
+            # Sparse base columns.
+            for name in CATEGORICAL_COLUMNS:
+                csr_list.append((hash(row[name]) % hash_bucket_q, 1.0))
+
+            # NOTE(review): the value is hashed un-stripped above but stripped for dns below -- confirm.
+            dns_row = [0] * dns_ncols
+            dns_dim = 0
+            # Embed wide columns into deep columns
+            for col in CATEGORICAL_COLUMNS:
+                dns_row[dns_dim] = hash(row[col].strip()) % hash_bucket_q
+                dns_dim += 1
+            # Continuous base columns.
+            scale = 1.0 #this is adjustable variable for normalizer to achieve good accuracy
+            for col in CONTINUOUS_COLUMNS:
+                #dns_row[dns_dim] = float(row[col].strip())
+                orig_range = float(max_dict[col] - min_dict[col])
+                dns_row[dns_dim] = (float(row[col].strip()) - min_dict[col]) * scale / orig_range
+                dns_dim += 1
+            # No transformations.
+
+            dns_list.append(dns_row)
+    data_list = [item[1] for item in csr_list]
+    indices_list = [item[0] for item in csr_list]
+    indptr_list = range(0, len(indices_list) + 1, len(CATEGORICAL_COLUMNS))
+    csr = mx.nd.sparse.csr_matrix((data_list, indices_list, indptr_list),
+            shape=(len(label_list), hash_bucket_size * len(CATEGORICAL_COLUMNS)))
+    dns = np.array(dns_list)
+    label = np.array(label_list)
+    return csr, dns, label
diff --git a/example/sparse/wide_deep_criteo_quantization/inference.py b/example/sparse/wide_deep_criteo_quantization/inference.py
new file mode 100644
index 0000000..88ad2e1
--- /dev/null
+++ b/example/sparse/wide_deep_criteo_quantization/inference.py
@@ -0,0 +1,203 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#python inference.py  --symbol-file=WD-quantized-162batches-naive-symbol.json --param-file=WD-quantized-0000.params
+import argparse
+import logging
+import os
+import time
+import mxnet as mx
+from mxnet import nd
+from mxnet.contrib.quantization import *
+from data import *
+import pickle
+from mxnet.base import check_call, _LIB
+def download_dataset(dataset_url, dataset_dir, logger=None):  # thin wrapper over mx.test_utils.download
+    if logger is not None:  # logging is optional
+        logger.info('Downloading dataset for inference from %s to %s' % (dataset_url, dataset_dir))
+    mx.test_utils.download(dataset_url, dataset_dir)
+
+
+def load_model(symbol_file, param_file, logger=None):
+    """Load a symbol and its parameters; both paths are resolved against this script's directory.
+    Returns (symbol, arg_params, aux_params); saved keys look like 'arg:NAME' or 'aux:NAME'."""
+    here = os.path.dirname(os.path.realpath(__file__))
+    sym_path = os.path.join(here, symbol_file)
+    if logger is not None:
+        logger.info('Loading symbol from file %s' % sym_path)
+    symbol = mx.sym.load(sym_path)
+    params_path = os.path.join(here, param_file)
+    if logger is not None:
+        logger.info('Loading params from file %s' % params_path)
+    arg_params, aux_params = {}, {}
+    # Entries carrying any other prefix are skipped.
+    for key, array in nd.load(params_path).items():
+        kind, name = key.split(':', 1)
+        if kind == 'arg':
+            arg_params[name] = array
+        elif kind == 'aux':
+            aux_params[name] = array
+    return symbol, arg_params, aux_params
+
+
+def advance_data_iter(data_iter, n):
+    """Advance `data_iter` by `n` batches; returns it, or None if it is exhausted first."""
+    assert n >= 0
+    if n == 0:
+        return data_iter
+    try:
+        while True:
+            data_iter.next()
+            n -= 1
+            if n == 0:
+                return data_iter
+    except StopIteration:
+        return None
+
+
+
+# Related to feature engineering, please see preprocess in data.py
+#CRITEO = {
+#    'train': 'train.csv',
+#    'test': 'eval.csv',
+#    'num_linear_features': 26000,
+#    'num_embed_features': 26,
+#    'num_cont_features': 13,
+#    'embed_input_dims': 1000,
+#    'hidden_units': [8, 50, 100],
+#}
+
+
+CRITEO = {  # same values as the CRITEO dict in train.py
+    'train': 'train.csv',
+    'test': 'eval.csv',
+    'num_linear_features': 26000,
+    'num_embed_features': 26,
+    'num_cont_features': 13,
+    'embed_input_dims': 1000,
+    'hidden_units': [32, 1024, 512, 256],
+}
+def load_object(filename):
+    with open(filename, 'rb') as source:  # NOTE: unpickling can execute code -- only load trusted files
+        return pickle.load(source)
+if __name__ == '__main__':  # script entry point: time (and optionally score) a saved model
+    parser = argparse.ArgumentParser(description='Score a model on a dataset')
+    # NOTE: --accuracy is parsed from text ('true'/'1'/'yes' enable it); type=bool would turn even 'False' into True.
+    parser.add_argument('--symbol-file', type=str, default='checkpoint-symbol.json', help='symbol file path')
+    parser.add_argument('--param-file', type=str, default='checkpoint-0000.params', help='param file path')
+    parser.add_argument('--batch-size', type=int, default=1024)
+    parser.add_argument('--label-name', type=str, default='softmax_label')
+    parser.add_argument('--accuracy', type=lambda s: s.lower() in ('true', '1', 'yes'), default=False)
+    parser.add_argument('--shuffle-dataset', action='store_true', default=True,
+                        help='shuffle the calibration dataset')
+    parser.add_argument('--num-omp-threads', type=int, default=28)
+    parser.add_argument('--num-batches', type=int, default=8000000)
+    args = parser.parse_args()
+
+    ctx = mx.cpu()
+
+    logging.basicConfig()
+    logger = logging.getLogger('logger')
+    logger.setLevel(logging.INFO)
+
+    if args.accuracy == True:
+        logger.info('Accuracy Mode')
+    else:
+        logger.info('Performance Mode')
+
+    symbol_file = args.symbol_file
+    param_file = args.param_file
+
+
+    batch_size = args.batch_size
+    logger.info('batch size = %d for inference' % batch_size)
+    label_name = args.label_name
+    logger.info('label_name = %s' % label_name)
+    # Performance mode reads the train_*.pkl files, accuracy mode the val_*.pkl files (written by train.py).
+    if args.accuracy == False:
+        val_csr = load_object('train_csr.pkl')
+        val_dns = load_object('train_dns.pkl')
+        val_label = load_object('train_label.pkl')
+    else:
+        val_csr = load_object('val_csr.pkl')
+        val_dns = load_object('val_dns.pkl')
+        val_label = load_object('val_label.pkl')
+
+    # creating data iterator
+    data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns},
+                                  {'softmax_label': val_label}, batch_size,
+                                  shuffle=False, last_batch_handle='discard')
+
+    # loading model
+    sym, arg_params, aux_params = load_model(symbol_file, param_file, logger)
+
+
+    # make sure that fp32 inference works on the same images as calibrated quantized model
+
+    logger.info('Running model %s for inference' % symbol_file)
+
+    acc_m = mx.metric.create('acc')
+    mod = mx.mod.Module(symbol=sym, context=ctx, data_names=['csr_data', 'dns_data'], label_names=[label_name, ])
+    mod.bind(for_training=False,
+             data_shapes=data.provide_data,
+             label_shapes=data.provide_label)
+    mod.set_params(arg_params, aux_params)
+
+    check_call(_LIB.MXSetNumOMPThreads(ctypes.c_int(args.num_omp_threads)))
+    batch_data = []
+    nbatch = 0
+    for batch in data:
+        if nbatch < args.num_batches:
+            batch_data.append(batch)
+            nbatch += 1
+        else:
+            break
+    #for data warmup
+    wi = 50
+    i = 0
+    for batch in batch_data:
+        if i < wi:
+            mod.forward(batch, is_train=False)
+            i += 1
+        else:
+            break
+    data.hard_reset()
+    mx.nd.waitall()
+    #real run
+    if "DO_WIDE_DEEP_PROFILING" in os.environ:
+        print("wide_deep profiling start !!!!!!!!!!!!!")
+        mx.profiler.set_config(profile_symbolic=True, profile_imperative=True, profile_memory=False, profile_api=False)
+        mx.profiler.set_state('run')
+    nbatch = 0
+    tic = time.time()
+    for batch in batch_data:
+        nbatch += 1
+        mod.forward(batch, is_train=False)
+        if args.accuracy == True:
+            for output in mod.get_outputs():
+                output.wait_to_read()
+            mod.update_metric(acc_m, batch.label)
+        else:
+            mx.nd.waitall()
+    speed = nbatch * batch_size / (time.time() - tic)
+    logger.info("Run [%d] Batchs \tSpeed: %.2f samples/sec", nbatch, speed)
+    if args.accuracy == True:
+        logger.info(acc_m.get())
+    if "DO_WIDE_DEEP_PROFILING" in os.environ :
+        print("wide_deep profiling end !")
+        mx.profiler.set_state('stop')
+        profiler_info = mx.profiler.dumps()
+        print(profiler_info)
diff --git a/example/sparse/wide_deep_criteo_quantization/model.py b/example/sparse/wide_deep_criteo_quantization/model.py
new file mode 100644
index 0000000..72589a3
--- /dev/null
+++ b/example/sparse/wide_deep_criteo_quantization/model.py
@@ -0,0 +1,58 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+
+
+def wide_deep_model(num_linear_features, num_embed_features, num_cont_features,
+                    input_dims, hidden_units):
+    """Build the wide & deep symbol: a sparse linear branch plus an embedding/MLP branch, summed into SoftmaxOutput."""
+    csr_data = mx.symbol.Variable("csr_data", stype='csr')  # wide model input
+    label = mx.symbol.Variable("softmax_label")
+    # hidden_units is indexed at [0]..[3] below: embedding width, two FC widths, output width.
+    norm_init = mx.initializer.Normal(sigma=0.01)
+    # weight with row_sparse storage type to enable sparse gradient updates
+    weight = mx.symbol.Variable("linear_weight", shape=(num_linear_features, hidden_units[3]),
+                                init=norm_init, stype='row_sparse')
+    bias = mx.symbol.Variable("linear_bias", shape=(hidden_units[3],))
+    dot = mx.symbol.sparse.dot(csr_data, weight)
+    linear_out = mx.symbol.broadcast_add(dot, bias)
+    # deep model
+    dns_data = mx.symbol.Variable("dns_data")
+    # embedding features
+    x = mx.symbol.slice(data=dns_data, begin=(0, 0),
+                        end=(None, num_embed_features))
+    embeds = mx.symbol.split(data=x, num_outputs=num_embed_features, squeeze_axis=1)
+    # continuous features
+    x = mx.symbol.slice(data=dns_data, begin=(0, num_embed_features),
+                        end=(None, num_embed_features + num_cont_features))
+    features = [x]
+    # one Embedding per split column, each with its own 'embed_<i>_weight' variable
+    for i, embed in enumerate(embeds):
+        embed_weight = mx.symbol.Variable('embed_%d_weight' % i, stype='row_sparse')
+        features.append(mx.symbol.sparse.Embedding(data=embed, weight=embed_weight,
+                        input_dim=input_dims, output_dim=hidden_units[0]))
+    # MLP over the continuous slice and all embeddings, concatenated along dim 1
+    hidden = mx.symbol.concat(*features, dim=1)
+    hidden = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[1])
+    hidden = mx.symbol.Activation(data=hidden, act_type='relu')
+    hidden = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[2])
+    hidden = mx.symbol.Activation(data=hidden, act_type='relu')
+    deep_out = mx.symbol.FullyConnected(data=hidden, num_hidden=hidden_units[3])
+    # NOTE(review): both branches emit hidden_units[3] values per row as the softmax inputs -- confirm intended width.
+    out = mx.symbol.SoftmaxOutput(linear_out + deep_out, label, name='model')
+    return out
diff --git a/example/sparse/wide_deep_criteo_quantization/train.py b/example/sparse/wide_deep_criteo_quantization/train.py
new file mode 100644
index 0000000..3413505
--- /dev/null
+++ b/example/sparse/wide_deep_criteo_quantization/train.py
@@ -0,0 +1,134 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import mxnet as mx
+from mxnet.test_utils import *
+from data import *
+from model import *
+import argparse
+import os
+import pickle
+parser = argparse.ArgumentParser(description="Run sparse wide and deep classification ",
+                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+parser.add_argument('--num-epoch', type=int, default=1,
+                    help='number of epochs to train')
+parser.add_argument('--batch-size', type=int, default=1000,
+                    help='number of examples per batch')
+parser.add_argument('--lr', type=float, default=0.001,
+                    help='learning rate')
+parser.add_argument('--cuda', action='store_true', default=False,
+                    help='Train on GPU with CUDA')
+parser.add_argument('--optimizer', type=str, default='adam',
+                    help='what optimizer to use',
+                    choices=["ftrl", "sgd", "adam"])
+parser.add_argument('--log-interval', type=int, default=100,
+                    help='number of batches to wait before logging training status')
+parser.add_argument('--data-dir', type=str, default='large_version',
+                    help='folder for data')
+# The options above are only declared here; parse_args() is called inside the __main__ block.
+# Related to feature engineering, please see preprocess in data.py
+CRITEO = {
+    'train': 'train.csv',
+    'test': 'eval.csv',
+    'num_linear_features': 26000,
+    'num_embed_features': 26,
+    'num_cont_features': 13,
+    'embed_input_dims': 1000,
+    'hidden_units': [32, 1024, 512, 256],
+}
+def save_object(filename, obj):
+    with open(filename, 'wb') as sink:  # 'wb' truncates, so an existing file is overwritten
+        pickle.dump(obj, sink, pickle.HIGHEST_PROTOCOL)
+if __name__ == '__main__':  # script entry point: train, evaluate and checkpoint the model
+    import logging
+
+    head = '%(asctime)-15s %(message)s'
+    logging.basicConfig(level=logging.INFO, format=head)
+
+    # arg parser
+    args = parser.parse_args()
+    logging.info(args)
+    num_epoch = args.num_epoch
+    batch_size = args.batch_size
+    optimizer = args.optimizer
+    log_interval = args.log_interval
+    lr = args.lr
+    ctx = mx.gpu(0) if args.cuda else mx.cpu()
+
+    # dataset
+    data_dir = os.path.join(os.getcwd(), args.data_dir)
+    train_data = os.path.join(data_dir, CRITEO['train'])
+    val_data = os.path.join(data_dir, CRITEO['test'])
+    train_csr, train_dns, train_label = get_uci_criteo(data_dir, train_data)
+    val_csr, val_dns, val_label = get_uci_criteo(data_dir, val_data)
+    # Cache the preprocessed arrays as pickles; inference.py loads these same file names.
+    save_object('val_csr.pkl', val_csr)
+    save_object('val_dns.pkl', val_dns)
+    save_object('val_label.pkl', val_label)
+    save_object('train_csr.pkl', train_csr)
+    save_object('train_dns.pkl', train_dns)
+    save_object('train_label.pkl', train_label)
+
+    model = wide_deep_model(CRITEO['num_linear_features'], CRITEO['num_embed_features'],
+                            CRITEO['num_cont_features'], CRITEO['embed_input_dims'],
+                            CRITEO['hidden_units'])
+
+    # data iterator
+    train_data = mx.io.NDArrayIter({'csr_data': train_csr, 'dns_data': train_dns},
+                                   {'softmax_label': train_label}, batch_size,
+                                   shuffle=True, last_batch_handle='discard')
+    eval_data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns},
+                                  {'softmax_label': val_label}, batch_size,
+                                  shuffle=True, last_batch_handle='discard')
+
+    # module
+    mod = mx.mod.Module(symbol=model, context=ctx, data_names=['csr_data', 'dns_data'],
+                        label_names=['softmax_label'])
+    mod.bind(data_shapes=train_data.provide_data, label_shapes=train_data.provide_label)
+    mod.init_params()
+    optim = mx.optimizer.create(optimizer, learning_rate=lr, rescale_grad=1.0 / batch_size)
+    mod.init_optimizer(optimizer=optim)
+    # use accuracy as the metric
+    metric = mx.metric.create(['acc'])
+    # get the sparse weight parameter
+    speedometer = mx.callback.Speedometer(batch_size, log_interval)
+
+    logging.info('Training started ...')
+    # The object returned by iter() is reused for every epoch and rewound with reset() below.
+    data_iter = iter(train_data)
+    for epoch in range(num_epoch):
+        nbatch = 0
+        metric.reset()
+        for batch in data_iter:
+            nbatch += 1
+            mod.forward_backward(batch)
+            # update all parameters (including the weight parameter)
+            mod.update()
+            # update training metric
+            mod.update_metric(metric, batch.label)
+            speedometer_param = mx.model.BatchEndParam(epoch=epoch, nbatch=nbatch,
+                                                       eval_metric=metric, locals=locals())
+            speedometer(speedometer_param)
+        # evaluate metric on validation dataset
+        score = mod.score(eval_data, ['acc'])
+        logging.info('epoch %d, accuracy = %s' % (epoch, score[0][1]))
+
+        mod.save_checkpoint("checkpoint", epoch, save_optimizer_states=False)
+        # reset the iterator for next pass of data
+        data_iter.reset()
+
+    logging.info('Training completed.')
diff --git a/example/sparse/wide_deep_criteo_quantization/wd_gen_qsym_mkldnn.py b/example/sparse/wide_deep_criteo_quantization/wd_gen_qsym_mkldnn.py
new file mode 100644
index 0000000..d21e947
--- /dev/null
+++ b/example/sparse/wide_deep_criteo_quantization/wd_gen_qsym_mkldnn.py
@@ -0,0 +1,166 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import argparse
+import os
+import logging
+import mxnet as mx
+from mxnet import nd
+from mxnet.contrib.quantization import *
+from mxnet.base import SymbolHandle, check_call, _LIB, mx_uint, c_str_array
+import ctypes
+import pickle
+
+
+def load_model(symbol_file, param_file, logger=None):
+    cur_path = os.path.dirname(os.path.realpath(__file__))
+    symbol_file_path = os.path.join(cur_path, symbol_file)
+    if logger is not None:
+        logger.info('Loading symbol from file %s' % symbol_file_path)
+    symbol = mx.sym.load(symbol_file_path)
+
+    param_file_path = os.path.join(cur_path, param_file)
+    if logger is not None:
+        logger.info('Loading params from file %s' % param_file_path)
+    save_dict = nd.load(param_file_path)
+    arg_params = {}
+    aux_params = {}
+    for k, v in save_dict.items():
+        tp, name = k.split(':', 1)
+        if tp == 'arg':
+            arg_params[name] = v
+        if tp == 'aux':
+            aux_params[name] = v
+    return symbol, arg_params, aux_params
+
+
+def save_symbol(fname, sym, logger=None):
+    if logger is not None:
+        logger.info('Saving symbol into file at %s' % fname)
+    sym.save(fname)
+
+
+def save_params(fname, arg_params, aux_params, logger=None):
+    if logger is not None:
+        logger.info('Saving params into file at %s' % fname)
+    save_dict = {('arg:%s' % k): v.as_in_context(cpu()) for k, v in arg_params.items()}
+    save_dict.update({('aux:%s' % k): v.as_in_context(cpu()) for k, v in aux_params.items()})
+    mx.nd.save(fname, save_dict)
+
+def load_object(filename):
+    with open(filename, 'rb') as input:
+        return pickle.load(input)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model with Intel MKL-DNN support')
+    parser.add_argument('--batch-size', type=int, default=32)
+    parser.add_argument('--label-name', type=str, default='softmax_label')
+    parser.add_argument('--num-calib-batches', type=int, default=162,
+                        help='number of batches for calibration')
+    parser.add_argument('--calib-mode', type=str, default='naive',
+                        help='calibration mode used for generating calibration table for the quantized symbol; supports'
+                             ' 1. none: no calibration will be used. The thresholds for quantization will be calculated'
+                             ' on the fly. This will result in inference speed slowdown and loss of accuracy'
+                             ' in general.'
+                             ' 2. naive: simply take min and max values of layer outputs as thresholds for'
+                             ' quantization. In general, the inference accuracy worsens with more examples used in'
+                             ' calibration. It is recommended to use `entropy` mode as it produces more accurate'
+                             ' inference results.'
+                             ' 3. entropy: calculate KL divergence of the fp32 output and quantized output for optimal'
+                             ' thresholds. This mode is expected to produce the best inference accuracy of all three'
+                             ' kinds of quantized models if the calibration dataset is representative enough of the'
+                             ' inference dataset.')
+    parser.add_argument('--quantized-dtype', type=str, default='uint8',
+                        choices=['int8', 'uint8'],
+                        help='quantization destination data type for input data')
+    # argparse's type=bool treats any non-empty string (even 'False') as True, so parse explicitly
+    parser.add_argument('--enable-calib-quantize', default=True,
+                        type=lambda s: str(s).lower() in ('1', 'true', 'yes'),
+                        help='If enabled, the quantize op will be calibrated offline if calibration mode is enabled')
+    args = parser.parse_args()
+    ctx = mx.cpu(0)
+    logging.basicConfig()
+    logger = logging.getLogger('logger')
+    logger.setLevel(logging.INFO)
+
+    calib_mode = args.calib_mode
+    logger.info('calibration mode set to %s' % calib_mode)
+    batch_size = args.batch_size
+
+    val_csr = load_object('val_csr.pkl')
+    val_dns = load_object('val_dns.pkl')
+    val_label = load_object('val_label.pkl')
+    # creating data iterator
+
+    # creating data iterator
+    data = mx.io.NDArrayIter({'csr_data': val_csr, 'dns_data': val_dns},
+                                  {'softmax_label': val_label}, batch_size,
+                                  shuffle=True, last_batch_handle='discard')
+    # loading model
+    sym, arg_params, aux_params = load_model('checkpoint-symbol.json', 'checkpoint-0000.params', logger)
+    sym = sym.get_backend_symbol('MKLDNN_PARALLEL_EMBEDDING')
+
+    # get batch size
+    batch_size = args.batch_size
+    logger.info('batch size = %d for calibration' % batch_size)
+
+    # get number of batches for calibration
+    num_calib_batches = args.num_calib_batches
+    if calib_mode == 'none':
+        logger.info('skip calibration step as calib_mode is none')
+    else:
+        logger.info('number of batches = %d for calibration' % num_calib_batches)
+
+
+
+    label_name = args.label_name
+    logger.info('label_name = %s' % label_name)
+
+    excluded_sym_names = None
+    prefix = 'WD'
+    epoch=0
+    calib_layer = lambda name: (name.find('fullyconnected') != -1 or name.find('FullyConnected') != -1)
+    if calib_mode == 'none':
+        logger.info('Quantizing FP32 model %s' % prefix)  # no --model option exists; log the output prefix
+        qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
+                                                       ctx=ctx, excluded_sym_names=excluded_sym_names,
+                                                       data_names=['csr_data', 'dns_data'],
+                                                       calib_mode=calib_mode, quantized_dtype=args.quantized_dtype,
+                                                       logger=logger)
+        sym_name = '%s-symbol.json' % (prefix + '-quantized')
+    else:
+        qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
+                                                        ctx=ctx, excluded_sym_names=excluded_sym_names,
+                                                        data_names=['csr_data', 'dns_data'],
+                                                        calib_mode=calib_mode, calib_data=data,
+                                                        num_calib_examples=num_calib_batches * batch_size,
+                                                        calib_layer=calib_layer, quantized_dtype=args.quantized_dtype,
+                                                        label_names=(label_name,), calib_quantize_op=args.enable_calib_quantize,
+                                                        logger=logger)
+        if calib_mode == 'entropy':
+            suffix = '-quantized-%dbatches-entropy' % num_calib_batches
+        elif calib_mode == 'naive':
+            suffix = '-quantized-%dbatches-naive' % num_calib_batches
+        else:
+            raise ValueError('unknown calibration mode %s received, only supports `none`, `naive`, and `entropy`'
+                             % calib_mode)
+        sym_name = '%s-symbol.json' % (prefix + suffix)
+    qsym = qsym.get_backend_symbol('MKLDNN_POST_FC_QUANTIZE')
+    qsym = qsym.get_backend_symbol('MKLDNN_QFC_POST_RELU_FUSED')
+    save_symbol(sym_name, qsym, logger)
+    param_name = '%s-%04d.params' % (prefix + '-quantized', epoch)
+    save_params(param_name, qarg_params, aux_params, logger)
diff --git a/include/mxnet/op_attr_types.h b/include/mxnet/op_attr_types.h
index dd81845..62e67cb 100644
--- a/include/mxnet/op_attr_types.h
+++ b/include/mxnet/op_attr_types.h
@@ -299,7 +299,13 @@ using FQuantizedOp = std::function<nnvm::NodePtr (const NodeAttrs& attrs)>;
  * \note Register under "FNeedRequantize" for non-quantized operators
  */
 using FNeedRequantize = std::function<bool (const NodeAttrs& attrs)>;
-
+/*!
+ * \brief Register a function to determine if the output of a quantized operator
+ * needs to be dequantized. This is usually used for the operators
+ * taking int8 data types while accumulating in float, e.g. quantized_fully_connected.
+ * \note Register under "FNeedDequantize" for non-quantized operators
+ */
+using FNeedDequantize = std::function<bool(const NodeAttrs& attrs)>;
 /*!
  * \brief Register a function to determine if the input of a quantized operator
  * needs to be quantized. This is usually used for the quantized operators
@@ -307,7 +313,11 @@ using FNeedRequantize = std::function<bool (const NodeAttrs& attrs)>;
  */
 using FAvoidQuantizeInput = std::function<bool (const NodeAttrs& attrs,
                                                 size_t index)>;
-
+/*!
+ * \brief Register a function to determine if the input of a quantized operator
+ * needs min/max node for real time quantization.
+ */
+using FUseOfflineParamThreshold = std::function<bool(const NodeAttrs& attrs)>;
 }  // namespace mxnet
 
 #endif  // MXNET_OP_ATTR_TYPES_H_
diff --git a/mkldnn.mk b/mkldnn.mk
index d79bbe7..1714ebe 100644
--- a/mkldnn.mk
+++ b/mkldnn.mk
@@ -23,16 +23,18 @@ ifeq ($(UNAME_S), Darwin)
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.dylib
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml.dylib
 	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.0.dylib
+	MKLDNN_LIB64FILE = $(MKLDNNROOT)/lib64/libmkldnn.0.dylib
 else
 	OMP_LIBFILE = $(MKLDNNROOT)/lib/libiomp5.so
 	MKLML_LIBFILE = $(MKLDNNROOT)/lib/libmklml_intel.so
 	MKLDNN_LIBFILE = $(MKLDNNROOT)/lib/libmkldnn.so.0
+	MKLDNN_LIB64FILE = $(MKLDNNROOT)/lib64/libmkldnn.so.0
 endif
 endif
 
 .PHONY: mkldnn mkldnn_clean
 
-mkldnn_build: $(MKLDNN_LIBFILE)
+mkldnn_build: $(MKLDNN_LIBFILE) 
 
 $(MKLDNN_LIBFILE):
 	mkdir -p $(MKLDNNROOT)
@@ -40,14 +42,28 @@ $(MKLDNN_LIBFILE):
 	cmake $(MKLDNN_SUBMODDIR) -DCMAKE_INSTALL_PREFIX=$(MKLDNNROOT) -B$(MKLDNN_BUILDDIR) -DARCH_OPT_FLAGS="-mtune=generic" -DWITH_TEST=OFF -DWITH_EXAMPLE=OFF
 	$(MAKE) -C $(MKLDNN_BUILDDIR) VERBOSE=1
 	$(MAKE) -C $(MKLDNN_BUILDDIR) install
+	if [ -f "$(MKLDNN_LIB64FILE)" ]; then \
+		mv $(MKLDNNROOT)/lib64/libmkldnn* $(MKLDNNROOT)/lib/; \
+	fi
 	mkdir -p $(MXNET_LIBDIR)
 	cp $(OMP_LIBFILE) $(MXNET_LIBDIR)
 	cp $(MKLML_LIBFILE) $(MXNET_LIBDIR)
 	cp $(MKLDNN_LIBFILE) $(MXNET_LIBDIR)
+ifeq ($(USE_BLAS), mkl)
+ifeq ($(USE_INTEL_PATH), NONE)
+	$(MAKE) -C $(SPARSE_MATRIX_DIR)
+else
+	$(MAKE) -C $(SPARSE_MATRIX_DIR) USE_INTEL_PATH=$(USE_INTEL_PATH)
+endif
+	cp $(SPARSE_MATRIX_DIR)/libsparse_matrix.so $(MXNET_LIBDIR)
+endif
 
 mkldnn_clean:
 	$(RM) -r 3rdparty/mkldnn/build
 	$(RM) -r $(MKLDNNROOT)
+ifeq ($(USE_BLAS), mkl)
+	$(MAKE) -C $(SPARSE_MATRIX_DIR) clean
+endif
 
 ifeq ($(USE_MKLDNN), 1)
 mkldnn: mkldnn_build
diff --git a/src/common/utils.h b/src/common/utils.h
index 92b7c20..b902b38 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -717,6 +717,23 @@ inline void EmplaceBackZeros(const NDArrayStorageType stype, const TShape &shape
   }
 }
 
+
+/*!
+ * \brief Copy size elements; OpenMP loop if size >= MXNET_CPU_PARALLEL_COPY_SIZE, else memcpy.
+ */
+template<typename DType>
+inline void ParallelCopy(DType* dst, const DType* src, index_t size) {
+  static index_t copy_block_size = dmlc::GetEnv("MXNET_CPU_PARALLEL_COPY_SIZE", 200000);
+  if (size >= copy_block_size) {
+    #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+    for (index_t i = 0; i < size; ++i) {
+      dst[i] = src[i];
+    }
+  } else {
+    std::memcpy(dst, src, sizeof(DType) * size);
+  }
+}
+
 }  // namespace common
 }  // namespace mxnet
 #endif  // MXNET_COMMON_UTILS_H_
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index d866ad1..732c1f7 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -1312,6 +1312,26 @@ void GraphExecutor::ExecuteMonCallback(size_t nid) {
     std::string name = inode.source->attrs.name + "_" + output_names[i];
     this->monitor_callback_(name.c_str(), reinterpret_cast<void*>(cpy));
   }
+
+  std::vector<std::string> input_names;
+  static const auto& flist_inputs =
+      nnvm::Op::GetAttr<nnvm::FListOutputNames>("FListInputNames");
+  if (flist_inputs.count(node->op())) {
+      input_names = flist_inputs[node->op()](node->attrs);
+  }
+  else {
+      for (size_t i = 0; i < node->num_inputs(); ++i) {
+          input_names.emplace_back(std::to_string(i));
+      }
+  }
+  CHECK_EQ(opnode.exec->in_array.size(), input_names.size());
+
+  // TODO: this patch only works for operators that do not modify their inputs
+  for (index_t i = 0; i < opnode.exec->in_array.size(); ++i) {
+      NDArray *cpy = new NDArray(opnode.exec->in_array[i]);
+      std::string name = inode.source->attrs.name + "_" + input_names[i];
+      this->monitor_callback_(name.c_str(), reinterpret_cast<void*>(cpy));
+  }
 }
 
 void GraphExecutor::RunOps(bool is_train, size_t topo_start, size_t topo_end) {
diff --git a/src/ndarray/ndarray_function.cc b/src/ndarray/ndarray_function.cc
index 43295d6..a613d5a 100644
--- a/src/ndarray/ndarray_function.cc
+++ b/src/ndarray/ndarray_function.cc
@@ -38,13 +38,15 @@ void Copy<cpu, cpu>(const TBlob &from, TBlob *to,
                     RunContext ctx) {
   MSHADOW_TYPE_SWITCH(to->type_flag_, DType, {
     if (to->type_flag_ == from.type_flag_) {
-        mshadow::Copy(to->FlatTo1D<cpu, DType>(),
-                      from.FlatTo1D<cpu, DType>());
+      const index_t size = from.Size();
+      CHECK_EQ(size, to->Size()) << "copying size mismatch, from: " << size * sizeof(DType)
+               << " bytes, to: " << to->Size() * sizeof(DType) << " bytes.";
+      common::ParallelCopy(to->dptr<DType>(), from.dptr<DType>(), size);
     } else {
-        MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, {
-            to->FlatTo1D<cpu, DType>() =
-                mshadow::expr::tcast<DType>(from.FlatTo1D<cpu, SrcDType>());
-        })
+      MSHADOW_TYPE_SWITCH(from.type_flag_, SrcDType, {
+          to->FlatTo1D<cpu, DType>() =
+              mshadow::expr::tcast<DType>(from.FlatTo1D<cpu, SrcDType>());
+      })
     }
   })
 }
diff --git a/src/operator/channel_op_common.h b/src/operator/channel_op_common.h
index 1afc13a..df833c1 100644
--- a/src/operator/channel_op_common.h
+++ b/src/operator/channel_op_common.h
@@ -101,6 +101,44 @@ void Split(const mshadow::Tensor<xpu, dim, DType> &input,
     split_helper<xpu, dim, dim-1>(input, output, dimension, req);
   }
 }
+
+template<typename xpu, int dim, typename DType>
+void Split_2D(const mshadow::Tensor<xpu, dim, DType> &input,
+           std::vector<mshadow::Tensor<xpu, dim, DType> > *output,
+           const int dimension, const std::vector<OpReqType> &req) {
+  if (dimension != 1) {
+    LOG(FATAL) << "dimension (" << dimension << ") must == 1";
+  }
+  if (dim != 3) {
+    LOG(FATAL) << "dimension (" << dim << ") must == 3";
+  } else {
+    std::vector<mshadow::Tensor<xpu, dim, DType> > out = *output;
+    size_t size = out.size();
+    std::vector<int>slice_len;
+    std::vector<int>begin_pos;
+    begin_pos.push_back(0);
+
+    for (index_t i = 0; i < size; ++i) {
+      slice_len.push_back(out[i].size(dimension));
+      begin_pos.push_back(begin_pos[i] + out[i].size(dimension));
+    }
+#if !defined(MXNET_ENABLE_CUDA_RTC)
+    #pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+#endif
+    for (int i = 0; i < input.shape_[0]; i++) {
+      int iRow = i*input.shape_[1];
+      for (int j = 0; j < size; j++) {
+        int jRow = i*slice_len[j];
+        int iPos = iRow + begin_pos[j];
+        for (int k = 0; k < slice_len[j]; k++) {
+          out[j].dptr_[jRow + k] = input.dptr_[iPos + k];
+        }
+      }
+    }
+  }
+}
+
+
 }  // namespace op
 }  // namespace mxnet
 #endif  // MXNET_OPERATOR_CHANNEL_OP_COMMON_H_
diff --git a/src/operator/mxnet_op.h b/src/operator/mxnet_op.h
index 5b106afd..c5443ea 100644
--- a/src/operator/mxnet_op.h
+++ b/src/operator/mxnet_op.h
@@ -506,7 +506,7 @@ struct Kernel<OP, cpu> {
   inline static bool Launch(mshadow::Stream<cpu> *, const int N, Args... args) {
 #ifdef _OPENMP
     const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
-    if (omp_threads < 2) {
+    if (omp_threads < 2 || N < 2) {
       for (int i = 0; i < N; ++i) {
         OP::Map(i, args...);
       }
diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc
index 711fe9c..5bfd6f0 100644
--- a/src/operator/nn/concat.cc
+++ b/src/operator/nn/concat.cc
@@ -31,47 +31,51 @@
 
 namespace mxnet {
 namespace op {
-
+bool ConcatSetShape(std::vector<TShape> *in_shape,
+	std::vector<TShape> *out_shape, int num_args, int dim)
+{
+	CHECK_EQ(in_shape->size(), static_cast<size_t>(num_args));
+	TShape dshape;
+	index_t size = 0;
+	bool has_zero = false;
+	int axis = -1;
+	for (int i = 0; i < num_args; ++i) {
+		TShape tmp = (*in_shape)[i];
+		if (tmp.ndim()) {
+			axis = CheckAxis(dim, tmp.ndim());
+			has_zero = tmp[axis] == 0 || has_zero;
+			size += tmp[axis];
+			tmp[axis] = 0;
+			shape_assign(&dshape, tmp);
+		}
+	}
+
+	TShape tmp = (*out_shape)[0];
+	if (tmp.ndim()) {
+		axis = CheckAxis(dim, tmp.ndim());
+		tmp[axis] = 0;
+		shape_assign(&dshape, tmp);
+	}
+
+	if (dshape.ndim() == 0) return false;
+
+	for (int i = 0; i < num_args; ++i) {
+		CHECK(shape_assign(&(*in_shape)[i], dshape))
+			<< "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i];
+	}
+
+	if (!has_zero) dshape[axis] = size;
+	CHECK(shape_assign(&(*out_shape)[0], dshape))
+		<< "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0];
+
+	return dshape.Size() != 0;
+}
 static bool ConcatShape(const nnvm::NodeAttrs& attrs,
                         std::vector<TShape> *in_shape,
                         std::vector<TShape> *out_shape) {
   using namespace mshadow;
   const ConcatParam& param_ = nnvm::get<ConcatParam>(attrs.parsed);
-  CHECK_EQ(in_shape->size(), static_cast<size_t>(param_.num_args));
-  TShape dshape;
-  index_t size = 0;
-  bool has_zero = false;
-  int axis = -1;
-  for (int i = 0; i < param_.num_args; ++i) {
-    TShape tmp = (*in_shape)[i];
-    if (tmp.ndim()) {
-      axis = CheckAxis(param_.dim, tmp.ndim());
-      has_zero = tmp[axis] == 0 || has_zero;
-      size += tmp[axis];
-      tmp[axis] = 0;
-      shape_assign(&dshape, tmp);
-    }
-  }
-
-  TShape tmp = (*out_shape)[0];
-  if (tmp.ndim()) {
-    axis = CheckAxis(param_.dim, tmp.ndim());
-    tmp[axis] = 0;
-    shape_assign(&dshape, tmp);
-  }
-
-  if (dshape.ndim() == 0) return false;
-
-  for (int i = 0; i < param_.num_args; ++i) {
-    CHECK(shape_assign(&(*in_shape)[i], dshape))
-        << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i];
-  }
-
-  if (!has_zero) dshape[axis] = size;
-  CHECK(shape_assign(&(*out_shape)[0], dshape))
-      << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0];
-
-  return dshape.Size() != 0;
+  return ConcatSetShape(in_shape, out_shape, param_.num_args, param_.dim);
 }
 
 // Concat for RNN param deals with the reverse shape inference from output
diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h
index 2b75419..591e472 100644
--- a/src/operator/nn/fully_connected-inl.h
+++ b/src/operator/nn/fully_connected-inl.h
@@ -52,6 +52,13 @@ struct FullyConnectedParam : public dmlc::Parameter<FullyConnectedParam> {
   int num_hidden;
   bool no_bias;
   bool flatten;
+#if MXNET_USE_MKLDNN == 1
+  bool relu_fused;
+  int output_type;
+  bool out_enable_calib_range;
+  float out_min_calib_range;
+  float out_max_calib_range;
+#endif
   DMLC_DECLARE_PARAMETER(FullyConnectedParam) {
     // TODO(bing) add support for boolean
     DMLC_DECLARE_FIELD(num_hidden).set_lower_bound(1)
@@ -60,12 +67,29 @@ struct FullyConnectedParam : public dmlc::Parameter<FullyConnectedParam> {
     .describe("Whether to disable bias parameter.");
     DMLC_DECLARE_FIELD(flatten).set_default(true)
     .describe("Whether to collapse all but the first axis of the input data tensor.");
+#if MXNET_USE_MKLDNN == 1
+	DMLC_DECLARE_FIELD(relu_fused).set_default(false)
+		.describe("Enable Relu Fuse");
+	DMLC_DECLARE_FIELD(output_type)
+		.add_enum("fp32", mshadow::kFloat32)
+		.add_enum("uint8", mshadow::kUint8)
+		.add_enum("int32", mshadow::kInt32)
+		.set_default(mshadow::kInt32)
+		.describe("Quantization Fully-connection output type");
+    DMLC_DECLARE_FIELD(out_enable_calib_range).set_default(false)
+        .describe("Enable output threshold.");
+	DMLC_DECLARE_FIELD(out_min_calib_range).set_default(0.0)
+		.describe("min value of output threashold.");
+	DMLC_DECLARE_FIELD(out_max_calib_range).set_default(0.0)
+		.describe("max value of output threashold.");
+#endif
   }
   bool operator==(const FullyConnectedParam& other) const {
     return this->num_hidden == other.num_hidden &&
            this->no_bias == other.no_bias &&
            this->flatten == other.flatten;
   }
+
 };
 
 template<typename xpu, typename DType>
@@ -241,6 +265,10 @@ struct hash<mxnet::op::FullyConnectedParam> {
     ret = dmlc::HashCombine(ret, val.num_hidden);
     ret = dmlc::HashCombine(ret, val.no_bias);
     ret = dmlc::HashCombine(ret, val.flatten);
+#if MXNET_USE_MKLDNN == 1
+    ret = dmlc::HashCombine(ret, val.relu_fused);
+    ret = dmlc::HashCombine(ret, val.output_type);
+#endif
     return ret;
   }
 };
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 17e7409..68c2e86 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -174,10 +174,12 @@ struct ActivationParam;
 struct ConvolutionParam;
 struct DeconvolutionParam;
 struct SoftmaxParam;
+struct SoftmaxOutputParam;
 bool SupportMKLDNNAct(const ActivationParam& param);
 bool SupportMKLDNNConv(const ConvolutionParam& params, const NDArray &input);
 bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray &input);
 bool SupportMKLDNNSoftmax(const SoftmaxParam& param);
+bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam &param);
 }
 
 static int GetTypeSize(int dtype) {
diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
index 5093770..6640fd4 100644
--- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
@@ -75,6 +75,10 @@ void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &
 void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
                           const NDArray &in_data, const OpReqType &req,
                           const NDArray &out_data);
+/* For softmax_output */
+void MKLDNNSoftmaxOutputForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                          const std::vector<NDArray> &in_data, const std::vector<OpReqType> &req,
+                          const std::vector<NDArray> &out_data);
 
 /* For sum */
 void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
diff --git a/src/operator/nn/mkldnn/mkldnn_slice-inl.h b/src/operator/nn/mkldnn/mkldnn_slice-inl.h
new file mode 100644
index 0000000..3d7c413
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_slice-inl.h
@@ -0,0 +1,146 @@
+#include "../../tensor/matrix_op-inl.h"
+#include "./mkldnn_base-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include <sys/time.h>
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+// Holds an MKLDNN reorder primitive that copies a view (sub-region) of the input into the
+// output. The data handles are placeholders until SetNewMem() binds the real buffers.
+class MKLDNNSliceFwd {
+  std::shared_ptr<mkldnn::memory> data;
+  std::shared_ptr<mkldnn::memory> out;
+  std::shared_ptr<mkldnn::reorder> fwd;
+
+ public:
+  MKLDNNSliceFwd(const SliceParam &param,
+                 const NDArray &in,
+                 const NDArray &out) {
+    const TShape ishape = in.shape();
+    const TShape oshape = out.shape();
+    uint32_t N = ishape.ndim();
+    mkldnn::memory::dims dims(N);
+    mkldnn::memory::dims offsets(N);
+    for (uint32_t i = 0; i < N; ++i) {
+      int s = 0;
+      // `begin` may list fewer axes than the input has; axes it omits start at offset 0.
+      if (i < param.begin.ndim() && param.begin[i]) {
+        s = *param.begin[i];
+        if (s < 0) s += ishape[i];
+      }
+      dims[i] = oshape[i];
+      offsets[i] = s;
+    }
+    auto in_mem_pd = in.GetMKLDNNData()->get_primitive_desc();
+    auto out_mem_pd = out.GetMKLDNNData()->get_primitive_desc();
+    auto view_pd = mkldnn::view::primitive_desc(in_mem_pd, dims, offsets);
+    auto reorder_pd = reorder::primitive_desc(view_pd.dst_primitive_desc(), out_mem_pd);
+    this->data = std::make_shared<mkldnn::memory>(view_pd.dst_primitive_desc(), nullptr);
+    this->out = std::make_shared<mkldnn::memory>(view_pd.dst_primitive_desc(), nullptr);
+    fwd.reset(new mkldnn::reorder(reorder_pd, *this->data, *this->out));
+  }
+
+  void SetNewMem(const mkldnn::memory &input, const mkldnn::memory &output) {
+      this->data->set_data_handle(input.get_data_handle());
+      this->out->set_data_handle(output.get_data_handle());
+  }
+
+  const mkldnn::reorder &GetPd() const {
+    return *fwd;
+  }
+};
+
+typedef ParamOpSign<SliceParam> MKLDNNSliceSignature;
+static MKLDNNSliceFwd &GetSliceForward(const SliceParam &param,
+    const NDArray &in_data, const NDArray &out_data) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<MKLDNNSliceSignature, MKLDNNSliceFwd, OpHash> fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<MKLDNNSliceSignature, MKLDNNSliceFwd, OpHash> fwds;
+#endif
+  MKLDNNSliceSignature key(param);
+  key.AddSign(in_data);
+
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    MKLDNNSliceFwd fwd(param, in_data, out_data);
+    auto ins_ret = fwds.insert(std::pair<MKLDNNSliceSignature, MKLDNNSliceFwd>(key, fwd));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+  }
+  return it->second;
+}
+
+void MKLDNNSlice(const SliceParam &param, const OpContext& ctx,
+                 const NDArray &in, OpReqType req, const NDArray &out) {
+  MKLDNNSliceFwd &fwd = GetSliceForward(param, in, out);
+  auto in_mem = in.GetMKLDNNData();
+  auto out_mem_pd = out.GetMKLDNNData()->get_primitive_desc();
+  auto out_mem = CreateMKLDNNMem(out, out_mem_pd, req);
+  fwd.SetNewMem(*in_mem, *out_mem.second);
+  MKLDNNStream::Get()->RegisterPrim(fwd.GetPd());
+  CommitOutput(out, out_mem);
+  MKLDNNStream::Get()->Submit();
+}
+
+void MKLDNNSliceAxis(const SliceAxisParam &param, const OpContext &ctx,
+                  const NDArray &in, OpReqType req, const NDArray &out) {
+  const TShape ishape = in.shape();
+  const TShape oshape = out.shape();
+  int axis, begin, end;
+  GetSliceAxisParams(param, in.shape(), &axis, &begin, &end);
+
+  uint32_t N = ishape.ndim();
+  mkldnn::memory::dims dims(N);
+  mkldnn::memory::dims offsets(N);
+  for (size_t i = 0; i < N; i++) {
+    dims[i] = oshape[i];
+    if (i == uint32_t(axis))  offsets[i] = begin;
+    else  offsets[i] = 0;
+  }
+  auto in_mem = in.GetMKLDNNData();
+  auto in_mem_pd = in_mem->get_primitive_desc();
+
+  std::shared_ptr<mkldnn::view::primitive_desc> view_pd;
+  view_pd.reset(new mkldnn::view::primitive_desc(in_mem_pd, dims, offsets));
+  auto out_mem_pd = out.GetMKLDNNData()->get_primitive_desc();
+  auto out_mem = CreateMKLDNNMem(out, out_mem_pd, req);
+  auto reorder_pd = reorder::primitive_desc(view_pd.get()->dst_primitive_desc(), out_mem_pd);
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  stream->RegisterPrim(mkldnn::reorder(reorder_pd, *in_mem, *out_mem.second));
+  CommitOutput(out, out_mem);
+  stream->Submit();
+}
+
+void MKLDNNSliceLike(const SliceLikeParam &param, const OpContext &ctx,
+                  const NDArray &in, OpReqType req, const NDArray &out) {
+  const TShape ishape = in.shape();
+  const TShape oshape = out.shape();
+
+  mkldnn::memory::dims dims(oshape.ndim());
+  mkldnn::memory::dims offsets(oshape.ndim());
+  for (size_t i = 0; i < dims.size(); i++) {
+    dims[i] = oshape[i];
+    offsets[i] = 0;
+  }
+
+  auto in_mem = in.GetMKLDNNData();
+  auto in_mem_pd = in_mem->get_primitive_desc();
+
+  std::shared_ptr<mkldnn::view::primitive_desc> view_pd;
+  view_pd.reset(new mkldnn::view::primitive_desc(in_mem_pd, dims, offsets));
+
+  auto out_mem_pd = out.GetMKLDNNData()->get_primitive_desc();
+  auto out_mem = CreateMKLDNNMem(out, out_mem_pd, req);
+  mkldnn::reorder::primitive_desc reorder_pd(view_pd.get()->dst_primitive_desc(), out_mem_pd);
+
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  stream->RegisterPrim(mkldnn::reorder(reorder_pd, *in_mem, *out_mem.second));
+  CommitOutput(out, out_mem);
+  stream->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif
diff --git a/src/operator/nn/mkldnn/mkldnn_softmax_output.cc b/src/operator/nn/mkldnn/mkldnn_softmax_output.cc
new file mode 100644
index 0000000..1152638
--- /dev/null
+++ b/src/operator/nn/mkldnn/mkldnn_softmax_output.cc
@@ -0,0 +1,154 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file mkldnn_softmax_output.cc
+ * \brief
+ * \author Zhang Rong A
+*/
+
+#include "../../softmax_output-inl.h"
+#include "./mkldnn_ops-inl.h"
+#include "./mkldnn_base-inl.h"
+
+#if MXNET_USE_MKLDNN == 1
+namespace mxnet {
+namespace op {
+
+/*! \brief Describe an MKLDNN softmax over the last axis of `data`, using the
+ *         descriptor and engine of `input_mem`. `param` is not read. */
+static mkldnn::softmax_forward::primitive_desc GetSoftmaxOutputFwdDescImpl(
+    const SoftmaxOutputParam& param, bool is_train,
+    const NDArray &data, const mkldnn::memory &input_mem) {
+  auto input_pd = input_mem.get_primitive_desc();
+  const int last_axis = data.shape().ndim() - 1;
+  // Training and inference differ only in the propagation kind.
+  const auto kind = is_train ? mkldnn::prop_kind::forward_training
+                             : mkldnn::prop_kind::forward_scoring;
+  mkldnn::softmax_forward::desc softmax_desc(kind, input_pd.desc(), last_axis);
+  auto cpu_engine = input_pd.get_engine();
+  return mkldnn::softmax_forward::primitive_desc(softmax_desc, cpu_engine);
+}
+
+typedef ParamOpSign<SoftmaxOutputParam> MKLDNNSoftmaxOuputSignature;
+
+/*! \brief Cached MKLDNN softmax primitive together with the memories it is bound
+ *         to; built on first SetNewMem(), afterwards only data handles change. */
+class MKLDNNSoftmaxOutputFwd {
+  std::shared_ptr<mkldnn::softmax_forward> fwd;
+  std::shared_ptr<mkldnn::memory> data;
+  std::shared_ptr<mkldnn::memory> out;
+
+ public:
+  const mkldnn::softmax_forward::primitive_desc fwd_pd;
+
+  MKLDNNSoftmaxOutputFwd(const SoftmaxOutputParam& param, bool is_train,
+                         const NDArray &data, const mkldnn::memory &mem)
+      : fwd_pd(GetSoftmaxOutputFwdDescImpl(param, is_train, data, mem)) {}
+  /*! \brief Point the owned input/output memories at the buffers of `data` and
+   *         `output`, creating the memories and the primitive on first use. */
+  void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &output) {
+    if (this->data == nullptr) {
+      this->data = std::make_shared<mkldnn::memory>(data.get_primitive_desc(),
+                                                    data.get_data_handle());
+    } else {
+      this->data->set_data_handle(data.get_data_handle());
+    }
+    if (this->out == nullptr) {
+      this->out = std::make_shared<mkldnn::memory>(output.get_primitive_desc(),
+                                                   output.get_data_handle());
+    } else {
+      this->out->set_data_handle(output.get_data_handle());
+    }
+    // Bind the primitive to the owned `out`, not to the caller's `output`, so
+    // that later set_data_handle() calls redirect where the primitive writes.
+    if (this->fwd == nullptr) {
+      this->fwd = std::make_shared<mkldnn::softmax_forward>(
+          fwd_pd, mkldnn::primitive::at(*this->data), *this->out);
+    }
+  }
+  /*! \brief The cached primitive; SetNewMem() must have been called first. */
+  const mkldnn::softmax_forward &GetFwd() const { return *fwd; }
+};
+
+/*! \brief Fetch (or build and memoize) the per-thread softmax-output primitive
+ *         keyed on `param`, ctx.is_train, param.preserve_shape and `in_data`. */
+static MKLDNNSoftmaxOutputFwd &GetSoftmaxOutputForward(const SoftmaxOutputParam& param,
+                     const OpContext &ctx, const NDArray &in_data,
+                     const mkldnn::memory &in_mem) {
+  typedef std::unordered_map<MKLDNNSoftmaxOuputSignature, MKLDNNSoftmaxOutputFwd, OpHash>
+      FwdCache;
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local FwdCache cache;
+#else
+  static MX_THREAD_LOCAL FwdCache cache;
+#endif
+  MKLDNNSoftmaxOuputSignature signature(param);
+  signature.AddSign(ctx.is_train);
+  signature.AddSign(param.preserve_shape);
+  signature.AddSign(in_data);
+  auto found = cache.find(signature);
+  if (found != cache.end()) return found->second;
+  const auto inserted = cache.insert(std::make_pair(
+      signature, MKLDNNSoftmaxOutputFwd(param, ctx.is_train, in_data, in_mem)));
+  CHECK(inserted.second);
+  return inserted.first->second;
+}
+
+// The MKLDNN path is reported as supported only when multi_output is off.
+bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam &param) {
+  return !param.multi_output;
+}
+
+/*! \brief MKLDNN forward of SoftmaxOutput: softmax of the kData input, written to
+ *         the kOut output according to req[kOut]. */
+void MKLDNNSoftmaxOutputForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
+                          const std::vector<NDArray> &in_data, const std::vector<OpReqType> &req,
+                          const std::vector<NDArray> &out_data) {
+  const SoftmaxOutputParam &param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
+  NDArray idata = in_data[softmaxout_enum::kData];
+  const NDArray &odata = out_data[softmaxout_enum::kOut];
+  if (idata.IsView() && idata.IsMKLDNNData()) {
+    idata = in_data[softmaxout_enum::kData].Reorder2Default();
+  }
+
+  auto input_mem = idata.GetMKLDNNData();
+  // Go through CreateMKLDNNMem/CommitOutput so that `req` is honoured instead of
+  // always overwriting the output array. The destination reuses the input
+  // primitive descriptor, the one the softmax primitive is described with.
+  auto output_mem = CreateMKLDNNMem(odata, input_mem->get_primitive_desc(),
+                                    req[softmaxout_enum::kOut]);
+
+  MKLDNNSoftmaxOutputFwd &fwd = GetSoftmaxOutputForward(param, ctx, idata, *input_mem);
+  fwd.SetNewMem(*input_mem, *output_mem.second);
+  MKLDNNStream *stream = MKLDNNStream::Get();
+  stream->RegisterPrim(fwd.GetFwd());
+  CommitOutput(odata, output_mem);
+  stream->Submit();
+}
+
+}   // namespace op
+}   // namespace mxnet
+
+
+#endif
+
+
+
+
diff --git a/src/operator/quantization/dequantize.cc b/src/operator/quantization/dequantize.cc
index e20bc17..8a90cef 100644
--- a/src/operator/quantization/dequantize.cc
+++ b/src/operator/quantization/dequantize.cc
@@ -43,8 +43,6 @@ bool DequantizeStorageType(const nnvm::NodeAttrs& attrs,
   }
 #endif
   (*out_attrs)[0] = kDefaultStorage;
-  (*out_attrs)[1] = kDefaultStorage;
-  (*out_attrs)[2] = kDefaultStorage;
   return true;
 }
 
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantize-inl.h b/src/operator/quantization/mkldnn/mkldnn_quantize-inl.h
index 7a00f62..1099ea1 100644
--- a/src/operator/quantization/mkldnn/mkldnn_quantize-inl.h
+++ b/src/operator/quantization/mkldnn/mkldnn_quantize-inl.h
@@ -26,40 +26,259 @@
 #ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_INL_H_
 #define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZE_INL_H_
 #if MXNET_USE_MKLDNN == 1
+#include <stdio.h>
 #include <string>
 #include <algorithm>
 #include <vector>
+#include <algorithm>
+#include <iterator>
+#include <sstream>
 #include "../quantize-inl.h"
+#include "../../tensor/matrix_op-inl.h"
 #include "../../nn/mkldnn/mkldnn_base-inl.h"
+#include "mkldnn_quantized_util.h"
 
+#include "mkl_cblas.h"
+#include "mkl_vml_functions.h"
+typedef MKL_INT cblas_int;
+namespace std {
+/*! \brief Hash over out_type and the threshold fields of QuantizeParam. */
+template<>
+struct hash<mxnet::op::QuantizeParam> {
+  // const-qualified so the functor can be invoked on const hasher objects.
+  size_t operator()(const mxnet::op::QuantizeParam& val) const {
+    size_t ret = 0;
+    ret = dmlc::HashCombine(ret, val.out_type);
+    ret = dmlc::HashCombine(ret, val.threshold_enabled);
+    ret = dmlc::HashCombine(ret, val.th_min);
+    ret = dmlc::HashCombine(ret, val.th_max);
+    return ret;
+  }
+};
+}  // namespace std
 namespace mxnet {
 namespace op {
+/*
+* amin < 0, out type=int8 => (e.g. weight, bias, balance)
+                                : min: -MaxAbs(amin, amax)
+                                  max: MaxAbs(amin, amax)
+                                  scale: quantized_range/MaxAbs(amin, amax)
+                                  amin_shift=0
+* amin < 0, out type=uint8 => (e.g. input with negative values)
+                                : min: amin
+                                  max: amax
+                                  scale: quantized_range/(amax-amin)
+                                  amin_shift=amin
+* amin >= 0, out type=uint8 or int8 => (e.g. input without negative values)
+                                : min: amin
+                                  max: amax
+                                  scale: quantized_range/MaxAbs(amin, amax)
+                                  amin_shift=0
+
+*/
+
+/*!
+ * \brief Reusable MKLDNN reorder that quantizes an array: the source descriptor
+ *        comes from `data`, the destination has the same dims with the
+ *        requested data type, and `attr` is attached to the reorder.
+ */
+class MKLDNNQuantize {
+ private:
+  std::shared_ptr<mkldnn::memory> _data;
+  std::shared_ptr<mkldnn::memory> _out;
+  mkldnn::memory::primitive_desc _i_mpd;
+  mkldnn::memory::primitive_desc _o_mpd;
+  primitive_attr _attr;
+  std::shared_ptr<mkldnn::reorder> _reorder;
+  std::shared_ptr<mkldnn::reorder::primitive_desc> _reorder_pd;
+
+ public:
+  /*! \brief Build the source/destination descriptors and the reorder
+   *         descriptor; `param` is not read here. */
+  MKLDNNQuantize(const QuantizeParam& param, const NDArray &data,
+      mkldnn::memory::data_type adata_type, primitive_attr& attr)
+      : _attr(attr), _reorder(nullptr) {
+    auto engine = CpuEngine::Get()->get_engine();
+    _i_mpd = data.GetMKLDNNData()->get_primitive_desc();
+
+    // Channel-first layouts (nchw, nChw8c, nChw16c) are emitted as nhwc.
+    auto out_fmt = static_cast<mkldnn::memory::format>(_i_mpd.desc().data.format);
+    const bool channel_first = out_fmt == mkldnn::memory::format::nchw ||
+                               out_fmt == mkldnn::memory::format::nChw8c ||
+                               out_fmt == mkldnn_nChw16c;
+    if (channel_first) out_fmt = mkldnn::memory::format::nhwc;
+    const TShape &shape = data.shape();
+    mkldnn::memory::dims out_dims(shape.ndim());
+    for (size_t d = 0; d < out_dims.size(); ++d) {
+      out_dims[d] = static_cast<int>(shape[d]);
+    }
+    _o_mpd = memory::primitive_desc(
+        mkldnn::memory::desc(out_dims, adata_type, out_fmt), engine);
+    _reorder_pd = std::make_shared<mkldnn::reorder::primitive_desc>(_i_mpd, _o_mpd, _attr);
+  }
+  /*! \brief Rebind the cached memories to the buffers of `i_mem` / `o_mem`;
+   *         memories and the reorder primitive are created on first use. */
+  void SetNewMem(const mkldnn::memory &i_mem, const mkldnn::memory &o_mem) {
+    if (_data) {
+      _data->set_data_handle(i_mem.get_data_handle());
+    } else {
+      _data = std::make_shared<mkldnn::memory>(_i_mpd, i_mem.get_data_handle());
+    }
+    if (_out) {
+      _out->set_data_handle(o_mem.get_data_handle());
+    } else {
+      _out = std::make_shared<mkldnn::memory>(_o_mpd, o_mem.get_data_handle());
+    }
+    if (!_reorder) {
+      _reorder = std::make_shared<mkldnn::reorder>(*_reorder_pd, *_data, *_out);
+    }
+  }
+  const mkldnn::reorder &GetReorder() const { return *_reorder; }
+  mkldnn::memory::primitive_desc GetOutMPD() { return _o_mpd; }
+};
+
+typedef ParamOpSign<QuantizeParam> MKLDNNQuantizeSignature;
+
+static inline MKLDNNQuantize &GetQuantize(const QuantizeParam& param, const NDArray &data, 
+    mkldnn::memory::data_type adata_type, primitive_attr& attr) {
+    // Returns the per-thread cached MKLDNNQuantize for (param, data), building it on a miss.
+#if DMLC_CXX11_THREAD_LOCAL
+    static thread_local std::unordered_map<MKLDNNQuantizeSignature,
+        MKLDNNQuantize, OpHash> fcFwds;
+#else
+    static MX_THREAD_LOCAL std::unordered_map<MKLDNNQuantizeSignature,
+        MKLDNNQuantize, OpHash> fcFwds;
+#endif
+
+    MKLDNNQuantizeSignature key(param);
+    key.AddSign(data);
+    // NOTE(review): `adata_type` and `attr` are not part of the key, so a cache hit
+    // reuses the attr captured when the entry was built -- confirm that is intended.
+    auto it = fcFwds.find(key);
+    if (it == fcFwds.end()) {
+        MKLDNNQuantize quantize(param, data, adata_type, attr);
+        auto ins_ret = fcFwds.insert(
+            std::pair<MKLDNNQuantizeSignature, MKLDNNQuantize>(key, quantize));
+        CHECK(ins_ret.second);
+        it = ins_ret.first;
+    }
+    else {
+        // Cache hit: reuse the existing entry.
+    }
+    return it->second;
+}
+
+/*!
+ * \brief Scratch holder for a row of `-amin` values, one per column of a 2-D
+ *        input, plus accessors for the two input dimensions. The storage is a
+ *        std::vector, so no manual delete[] is needed.
+ */
+class quantize_ws_cache_buf_t {
+ private:
+  std::vector<float> _amin_cache;  // empty until the first get_amin_buffer() call
+  bool _filled;                    // true once _amin_cache has been populated
+  TShape _ishape;
+
+ public:
+  quantize_ws_cache_buf_t(const TShape& ishape)
+      : _filled(false), _ishape(ishape) {}
+  // Copying stays disabled; `= delete` replaces the private empty-bodied pair,
+  // which left the pointer member of a copy uninitialized.
+  quantize_ws_cache_buf_t(const quantize_ws_cache_buf_t &obj) = delete;
+  quantize_ws_cache_buf_t& operator=(const quantize_ws_cache_buf_t &s) = delete;
+
+  /*!
+   * \brief Buffer of _ishape[1] floats, each equal to -amin as passed on the
+   *        first call; later calls return the same contents unchanged.
+   * \return nullptr when the input is not 2-D; may also be null for a
+   *         zero-length row.
+   */
+  float* get_amin_buffer(float amin) {
+    if (_ishape.ndim() != 2) return nullptr;
+    if (!_filled) {
+      _amin_cache.assign(static_cast<size_t>(_ishape[1]), -amin);
+      _filled = true;
+    }
+    return _amin_cache.data();
+  }
+
+  /*! \brief First input dimension. */
+  int get_ws_mb_size() { return _ishape[0]; }
+
+  /*! \brief Second input dimension, i.e. the length of the amin buffer. */
+  int get_ws_cache_size() { return _ishape[1]; }
+};
+
 
 template<typename SrcType, typename DstType>
 static void MKLDNNQuantizeComputeKer(const std::vector<NDArray>& inputs,
                                      const std::vector<NDArray>& outputs,
                                      const QuantizeParam& param,
                                      const std::vector<OpReqType> &req) {
-  using namespace mshadow;
-  using namespace mxnet_op;
-  using red::limits::MaxValue;
-  using red::limits::MinValue;
   float real_range = 0.0;
   float quantized_range = 0.0;
-  if (param.out_type == mshadow::kUint8) {
-    real_range = MaxAbs(*inputs[1].data().dptr<float>(), *inputs[2].data().dptr<float>());
-    quantized_range = MaxAbs(MaxValue<DstType>(), MinValue<DstType>());
-    *outputs[1].data().dptr<float>() = *inputs[1].data().dptr<float>();
-    *outputs[2].data().dptr<float>() = *inputs[2].data().dptr<float>();
-  } else if (param.out_type == mshadow::kInt8) {
-    real_range = MaxAbs(*inputs[1].data().dptr<float>(), *inputs[2].data().dptr<float>());
-    quantized_range = MinAbs(MaxValue<DstType>(), MinValue<DstType>());
-    *outputs[1].data().dptr<float>() = -real_range;
-    *outputs[2].data().dptr<float>() = real_range;
+
+  int data_t = inputs[0].dtype();
+  float amin = 0, amax=0;
+  quantize_ws_cache_buf_t ws_amin_cache(inputs[0].shape());
+  float * amin_cache_buf = nullptr;
+  if (param.threshold_enabled) {
+      real_range = MaxAbs(param.th_min, param.th_max);
+      amin = param.th_min;
+      amax = param.th_max;
+	  amin_cache_buf = ws_amin_cache.get_amin_buffer(amin);
   } else {
-    LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type";
+    amin = *inputs[1].data().dptr<float>();
+    amax = *inputs[2].data().dptr<float>();
+  }
+  if (param.out_type == mshadow::kUint8)
+      quantized_range = get_unsigned_quantized_range<uint8_t>();
+  else if (param.out_type == mshadow::kInt8)
+      quantized_range = get_signed_quantized_range<int8_t>();
+
+  if (amin < 0 && param.out_type == mshadow::kUint8) {
+      real_range = amax - amin;
+      *outputs[1].data().dptr<float>() = amin; //Amin < 0
+      *outputs[2].data().dptr<float>() = amax;
+      SrcType* input_ptr = inputs[0].data().dptr<SrcType>();
+      const TShape& ishape = inputs[0].shape();      
+	  if (typeid(SrcType) == typeid(float) && amin_cache_buf)
+	  {
+		  int MB = ws_amin_cache.get_ws_mb_size();
+		  int OC = ws_amin_cache.get_ws_cache_size();
+#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+		  for (cblas_int mb = 0; mb < MB; mb++) {
+			  cblas_saxpy(OC, 1.0, amin_cache_buf, 1.0, input_ptr + mb * OC, 1);
+		  }
+	  }
+	  else
+	  {
+		  int input_size = ishape.Size();
+#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount()) if (input_size > 2000)
+		  for (int i = 0; i < input_size; i++)
+		  {
+			  input_ptr[i] -= amin;
+		  }
+	  }
+  }
+  else if (amin < 0 && param.out_type == mshadow::kInt8) {
+      real_range = MaxAbs(amin, amax);
+      *outputs[1].data().dptr<float>() = -real_range;
+      *outputs[2].data().dptr<float>() = real_range;
+  }
+  else if (amin >= 0 && (param.out_type == mshadow::kUint8 
+      || param.out_type == mshadow::kInt8)) {
+      real_range = MaxAbs(amin, amax);
+      *outputs[1].data().dptr<float>() = amin; //Amin > 0
+      *outputs[2].data().dptr<float>() = amax;
+  }
+  else {
+      LOG(FATAL) << "mkldnn quantize op only supports int8 and uint8 as output type";
   }
+
+  
   float scale = quantized_range / real_range;
+  //std::cout << "MKLDNNQuantizeComputeKer scale:" << scale << std::endl;
   primitive_attr attr;
   const int mask = 0;
   std::vector<float> scales = {scale};
@@ -72,27 +291,14 @@ static void MKLDNNQuantizeComputeKer(const std::vector<NDArray>& inputs,
     in_buffer = inputs[0].Reorder2Default();
 
   auto i_mem = in_buffer.GetMKLDNNData();
-  auto i_mpd = i_mem->get_primitive_desc();
-  auto i_desc = i_mpd.desc();
-  mkldnn::memory::format i_fmt = static_cast<mkldnn::memory::format>(i_desc.data.format);
-  if (i_fmt == mkldnn::memory::format::nchw ||
-      i_fmt == mkldnn::memory::format::nChw8c ||
-      i_fmt == mkldnn_nChw16c) {
-    i_fmt = mkldnn::memory::format::nhwc;
-  }
-  size_t i_ndim = in_buffer.shape().ndim();
-  mkldnn::memory::dims i_dims = mkldnn::memory::dims(i_ndim);
-  for (size_t i = 0; i < i_ndim; i++) {
-    i_dims[i] = static_cast<int>(in_buffer.shape()[i]);
-  }
-  auto o_desc = mkldnn::memory::desc(i_dims,
-                                    (mkldnn::memory::data_type)data_type_enum<DstType>::type,
-                                    i_fmt);
-  auto o_mpd = memory::primitive_desc(o_desc, cpu_engine);
-  auto reorder_pd  = reorder::primitive_desc(i_mpd, o_mpd, attr);
-  auto o_mem = CreateMKLDNNMem(outputs[0], o_mpd, req[0]);
-  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(reorder_pd, *i_mem, *o_mem.second));
-  CommitOutput(outputs[0], o_mem);
+
+
+  MKLDNNQuantize &mkldnn_Quantize = GetQuantize(param, in_buffer,
+      (mkldnn::memory::data_type)data_type_enum<DstType>::type, attr);
+  auto o_output = CreateMKLDNNMem(outputs[0], mkldnn_Quantize.GetOutMPD(), req[0]);
+  mkldnn_Quantize.SetNewMem(*i_mem, *o_output.second);
+  MKLDNNStream::Get()->RegisterPrim(mkldnn_Quantize.GetReorder());
+  CommitOutput(outputs[0], o_output);
   MKLDNNStream::Get()->Submit();
 }
 
@@ -101,6 +307,9 @@ static void MKLDNNQuantizeCompute(const nnvm::NodeAttrs& attrs, const OpContext
                                   const std::vector<OpReqType> &req,
                                   const std::vector<NDArray> &outputs) {
   const QuantizeParam& param = nnvm::get<QuantizeParam>(attrs.parsed);
+
+
+
   if (param.out_type == mshadow::kUint8) {
     MKLDNNQuantizeComputeKer<float, uint8_t>(inputs, outputs, param, req);
   } else if (param.out_type == mshadow::kInt8) {
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
new file mode 100644
index 0000000..0fd9b2e
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_fully_connected.cc
@@ -0,0 +1,544 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+*/
+
+/*!
+* \file mkldnn_quantized_fully_connected
+* \brief
+* \author Lingyan Guo
+*/
+
+
+
+#if MXNET_USE_MKLDNN == 1
+#include "../../nn/mkldnn/mkldnn_base-inl.h"
+#include "../quantization_utils.h"
+#include "../../tensor/matrix_op-inl.h"
+#include "../../elemwise_op_common.h"
+#include "../../mxnet_op.h"
+#include "../../nn/fully_connected-inl.h"
+#include "mkldnn_quantized_util.h"
+#include "mkl_cblas.h"
+namespace mxnet {
+namespace op {
+
+/*! \brief TBlob entry point of the quantized FC op; not implemented, it only
+ *         raises LOG(FATAL). All arguments are ignored. */
+void QuantizedFullyConnectedForwardCPU(const nnvm::NodeAttrs& attrs,
+                                       const OpContext &ctx, const std::vector<TBlob> &inputs,
+                                       const std::vector<OpReqType> &req,
+                                       const std::vector<TBlob> &outputs) {
+  LOG(FATAL) << "TODO: QuantizedFullyConnectedForwardCPU";
+}
+// Lazily allocated scratch buffers for the quantized FC path; all are released in the destructor.
+class shift_buf_t
+{
+private:
+    int32_t * _obuf;  // int32 buffer with _oshape.Size() elements, see get_obuffer()
+    uint8_t * _shift_buf;  // input shifted by INT8_SHIFT_CONST, _ishape.Size() elements
+    TShape _oshape;
+    TShape _ishape;
+    NDArray* _bias_buf;  // int32 NDArray created by get_bias_buf()
+    int32_t * _weight_shift;  // wshape[0] int32 values, see get_weight_shift_buf()
+    shift_buf_t(const shift_buf_t &obj){}  // private: copying is not available to callers
+    shift_buf_t& operator=(const shift_buf_t &s){ return *this; }  // private: likewise
+public:
+    const static int INT8_SHIFT_CONST = 128;  // offset added to each value in get_shift_buffer()
+    shift_buf_t(TShape& oshape, TShape& ishape) 
+        :_obuf(NULL), _shift_buf(NULL), _bias_buf(NULL), _weight_shift(NULL)
+    {
+        _oshape = oshape;
+        _ishape = ishape;
+    }
+    int32_t* get_obuffer()  // allocates on first call; contents are left uninitialized
+    {
+        if (!_obuf)
+        {
+            int size = _oshape.Size();
+            _obuf = new int32_t[size];
+        }
+        return _obuf;
+    }
+    uint8_t* get_shift_buffer(const int8_t *data)  // `data` must hold _ishape.Size() values
+    {
+        if (!_shift_buf)  // filled once; later calls return the buffer without reading `data`
+        {
+            size_t size = _ishape.Size();
+            _shift_buf = new uint8_t[size];
+            for (size_t i = 0; i < size; i++) {
+                _shift_buf[i] = data[i] + INT8_SHIFT_CONST;
+            }
+        }
+        return _shift_buf;
+    }
+    NDArray* get_bias_buf(TShape &bshape, Context ctx) {  // created once; later args are ignored
+        if (!_bias_buf) {
+            _bias_buf = new NDArray(bshape, ctx, false, mshadow::kInt32);
+        }
+        return _bias_buf;
+    }
+    int32_t* get_weight_shift_buf(TShape &wshape) {  // allocated once with wshape[0] entries
+        if (!_weight_shift) {
+            int hidden_num = wshape[0];
+            _weight_shift = new int32_t[hidden_num];
+        }
+        return _weight_shift;
+    }
+    ~shift_buf_t()  // frees whichever of the four buffers were actually allocated
+    {
+        if (_obuf) {
+            delete[] _obuf;
+            _obuf = NULL;
+        }
+        if (_shift_buf) {
+            delete[] _shift_buf;
+            _shift_buf = NULL;
+        }
+
+        if (_bias_buf) {
+            delete _bias_buf;
+            _bias_buf = NULL;
+        }
+
+        if (_weight_shift) {
+            delete[] _weight_shift;
+            _weight_shift = NULL;
+        }
+    }
+};
+
+/*! \brief Build the inner-product forward primitive descriptor for the given
+ *         data/weight (and optional bias) with `qfc_attr` attached. */
+inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd(
+    const NDArray &data, const NDArray &weight, const NDArray *bias,
+    const mkldnn::memory::desc &out_md, const bool is_train, primitive_attr& qfc_attr) {
+  typedef mkldnn::inner_product_forward ip_fwd;
+  auto src_md = GetMemDesc(data);
+  auto wgt_md = GetMemDesc(weight);
+  auto cpu_engine = CpuEngine::Get()->get_engine();
+  auto kind = mkldnn::prop_kind::forward_scoring;
+  if (is_train) kind = mkldnn::prop_kind::forward_training;
+
+  // Without a bias array the descriptor is built from data, weight and output only.
+  if (bias == nullptr) {
+    ip_fwd::desc no_bias_desc(kind, src_md, wgt_md, out_md);
+    return ip_fwd::primitive_desc(no_bias_desc, qfc_attr, cpu_engine);
+  }
+  ip_fwd::desc with_bias_desc(kind, src_md, wgt_md, GetMemDesc(*bias), out_md);
+  return ip_fwd::primitive_desc(with_bias_desc, qfc_attr, cpu_engine);
+}
+// TODO: offline calibration
+/*! \brief weight_shift[r] = sum of row r of the row-major wshape[0] x wshape[1]
+ *         int8 `weight`. `weight_shift_ref` and `weight_scale` are unused. */
+void QuantizedGetWeightShift(int32_t *weight_shift, int8_t*weight, TShape& wshape,
+    float* weight_shift_ref = nullptr, float weight_scale = 1.0) {
+  const int num_rows = wshape[0];
+  const int row_len = wshape[1];
+  for (int r = 0; r < num_rows; ++r) {
+    int acc = 0;
+    for (int c = 0; c < row_len; ++c) acc += weight[r * row_len + c];
+    weight_shift[r] = acc;
+  }
+}
+// Cached inner-product forward primitive plus the MKLDNN memories it is bound to.
+class MKLDNNQuantFullyConnectForward {
+private:
+    std::shared_ptr<mkldnn::memory> data;
+    std::shared_ptr<mkldnn::memory> weight;
+    std::shared_ptr<mkldnn::memory> out;
+    std::shared_ptr<mkldnn::memory> bias;  // owns a private copy of the bias bytes
+    std::shared_ptr<mkldnn::inner_product_forward> ipFwd;  // created once in SetNewMem()
+    bool _weight_cached;  // set once SetNewMem() has been given a weight memory
+    bool _bias_cached;  // set once the bias has been copied into `bias`
+public:
+    mkldnn::inner_product_forward::primitive_desc ipFwd_pd;
+    // The constructor only builds ipFwd_pd; memories and the primitive come from SetNewMem().
+    MKLDNNQuantFullyConnectForward(const FullyConnectedParam &param, bool is_train,
+        const NDArray &data, const NDArray &weight,
+        const NDArray *bias,
+        const mkldnn::memory::desc &output, primitive_attr& qfc_attr)
+        : ipFwd_pd(GetIPFwd(data, weight, bias, output, is_train, qfc_attr)),
+         _weight_cached(false), _bias_cached(false) {}
+    bool IsWeightCached() { return _weight_cached; }
+    bool IsBiasCached() { return _bias_cached; }
+    // Rebinds data/out (and weight when given) to the caller's buffers; `weight` and `bias` may be null.
+    void SetNewMem(const mkldnn::memory &data, const mkldnn::memory *weight,
+        const mkldnn::memory *bias, const mkldnn::memory &output) {
+        if (!this->data)
+            this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+                ipFwd_pd.src_primitive_desc(), data.get_data_handle()));
+        else
+            this->data->set_data_handle(data.get_data_handle());
+        if (weight) {
+            if (!this->weight)
+                this->weight = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+                    ipFwd_pd.weights_primitive_desc(), weight->get_data_handle()));
+            else
+                this->weight->set_data_handle(weight->get_data_handle());
+            _weight_cached = true;
+        }
+        else {
+            if (!this->weight) {  // a null weight is only valid once one has been bound
+                LOG(FATAL) << "MKLDNNQuantFullyConnectForward Weight not initialized";
+            }
+        }
+        if (!this->out)
+            this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(
+                ipFwd_pd.dst_primitive_desc(), output.get_data_handle()));
+        else
+            this->out->set_data_handle(output.get_data_handle());
+        // Bias path: taken when a bias is supplied for the first time or one is already cached.
+        if ((bias && !_bias_cached) || (this->bias && _bias_cached)) {
+            if (!_bias_cached) {
+                if (!this->bias) {
+                    auto bias_desc = ipFwd_pd.bias_primitive_desc();
+                    this->bias = std::shared_ptr<mkldnn::memory>(new mkldnn::memory(bias_desc));
+                    int bias_buf_size = bias_desc.get_size();
+                    void * dst_ptr = this->bias->get_data_handle();
+                    void * src_ptr = bias->get_data_handle();
+                    memcpy(dst_ptr, src_ptr, bias_buf_size);  // snapshot; later caller changes are not seen
+                }
+                _bias_cached = true;
+            }
+            if (!this->ipFwd)
+                this->ipFwd = std::shared_ptr<mkldnn::inner_product_forward>(
+                    new mkldnn::inner_product_forward(
+                        ipFwd_pd, mkldnn::primitive::at(*this->data),
+                        mkldnn::primitive::at(*this->weight),
+                        mkldnn::primitive::at(*this->bias), *this->out));
+        }
+        else if (!this->ipFwd) {  // no bias available: build the primitive without one
+            this->ipFwd = std::shared_ptr<mkldnn::inner_product_forward>(
+                new mkldnn::inner_product_forward(
+                    ipFwd_pd, mkldnn::primitive::at(*this->data),
+                    mkldnn::primitive::at(*this->weight), *this->out));
+        }
+    }
+    const mkldnn::inner_product_forward &GetIpFwd() const {  // valid only after SetNewMem()
+        return *ipFwd;
+    }
+};
+typedef ParamOpSign<FullyConnectedParam> MKLDNNFullyconSignature;
+// Returns the per-thread cached FC forward object for (param, data, weight, is_train); built on a miss.
+static inline MKLDNNQuantFullyConnectForward &GetFCFwd(
+    const nnvm::NodeAttrs &attrs, const NDArray &data, const NDArray &weight,
+    const NDArray *bias, const mkldnn::memory::desc &output,
+    const bool is_train, primitive_attr& qfc_attr) {
+#if DMLC_CXX11_THREAD_LOCAL
+    static thread_local std::unordered_map<MKLDNNFullyconSignature,
+        MKLDNNQuantFullyConnectForward, OpHash> fcFwds;
+#else
+    static MX_THREAD_LOCAL std::unordered_map<MKLDNNFullyconSignature,
+        MKLDNNQuantFullyConnectForward, OpHash> fcFwds;
+#endif
+    const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+    MKLDNNFullyconSignature key(param);
+    key.AddSign(data);
+    key.AddSign(weight);
+    key.AddSign(is_train);
+    // NOTE(review): neither `bias` nor `qfc_attr` is part of the key, so a cache hit
+    // reuses whatever bias presence and attr the entry was first built with -- confirm.
+
+    auto it = fcFwds.find(key);
+    if (it == fcFwds.end()) {
+        MKLDNNQuantFullyConnectForward fcFwd(param, is_train, data, weight, bias,
+            output, qfc_attr);
+        auto ins_ret = fcFwds.insert(
+            std::pair<MKLDNNFullyconSignature, MKLDNNQuantFullyConnectForward>(key, fcFwd));
+        CHECK(ins_ret.second);
+        it = ins_ret.first;
+    }
+    else {
+        // Cache hit: reuse the existing entry.
+    }
+    return it->second;
+}
+void QuantizedFullyConnectedForwardExCPU(const nnvm::NodeAttrs& attrs,
+    const OpContext &ctx,
+    const std::vector<NDArray> &in_data,
+    const std::vector<OpReqType> &req,
+    const std::vector<NDArray> &out_data) {
+    std::string node_name = attrs.name;
+    const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+    using namespace mshadow;
+    using namespace mxnet_op;
+    size_t num_inputs = param.no_bias ? 2 : 3;
+
+
+	int data_t = in_data[0].dtype();
+	int weight_t = in_data[1].dtype();
+	int out_t = out_data[0].dtype();
+	float uint8_quantized_range = get_unsigned_quantized_range<uint8_t>();
+	float int8_quantized_range = get_signed_quantized_range<int8_t>();
+
+    TShape ishape = in_data[0].shape();
+    TShape wshape = in_data[1].shape();
+    TShape oshape = out_data[0].shape();
+    float input_min = in_data[num_inputs].data().dptr<float>()[0];
+    float input_max = in_data[num_inputs + 1].data().dptr<float>()[0];
+
+    float weight_min = in_data[num_inputs + 2].data().dptr<float>()[0];
+    float weight_max = in_data[num_inputs + 3].data().dptr<float>()[0];
+
+    NDArray weight = in_data[1];
+    NDArray data = in_data[0];
+
+    TShape bshape = Shape1(param.num_hidden);
+    shift_buf_t shift_buf(oshape, ishape);
+
+	if ((data_t == mshadow::kUint8)
+		&& (weight_t == mshadow::kInt8)
+		&& (out_t == mshadow::kFloat32 || out_t == mshadow::kUint8))
+	{
+        NDArray* tmp_buf = nullptr;
+
+        float input_scale = 1.0;
+        float input_shift = 0.0;
+        if (input_min < 0 && data_t == mshadow::kUint8) {
+            input_scale = uint8_quantized_range / (input_max - input_min);
+            input_shift = input_min;
+        }
+        else if (data_t == mshadow::kInt8) {
+            input_scale = int8_quantized_range / input_max;
+        }
+        else if (input_min >= 0 && data_t == mshadow::kUint8) {
+            input_scale = uint8_quantized_range / input_max;
+        } 
+
+        float weight_scale = int8_quantized_range / weight_max;
+
+        primitive_attr qfc_attr;
+        std::vector<float> qfc_scales(1);
+        qfc_scales[0] = 1 / input_scale / weight_scale;
+        float out_requant_scale = 1.0;
+        if (param.output_type == mshadow::kUint8) {
+            float out_calib_range = 1.0;
+            if (param.out_enable_calib_range) {
+                out_calib_range = MaxAbs(param.out_min_calib_range, param.out_max_calib_range);
+                out_requant_scale = uint8_quantized_range / out_calib_range;
+                *out_data[1].data().dptr<float>() = 0;
+                *out_data[2].data().dptr<float>() = out_calib_range;
+                qfc_scales[0] *= out_requant_scale;
+            }
+            else {
+                LOG(FATAL) << "mkldnn quantize fc only supports out_enable_calib_range enabled";
+            }
+        }
+        qfc_attr.set_output_scales(0, qfc_scales);
+
+        if (param.relu_fused) {
+            post_ops ops;
+            const float ops_scale = 1.f;
+            const float ops_alpha = 0.f; // relu negative slope
+            const float ops_beta = 0.f;
+            ops.append_eltwise(ops_scale, algorithm::eltwise_relu, ops_alpha, ops_beta);
+            qfc_attr.set_post_ops(ops);
+        }
+        auto out_md = GetMemDesc(out_data[0]);
+
+        if (!param.no_bias || input_shift != 0) {
+            tmp_buf = shift_buf.get_bias_buf(bshape, in_data[0].ctx());
+        }
+        MKLDNNQuantFullyConnectForward &FCFwd =
+            GetFCFwd(attrs, data, weight, tmp_buf, out_md, ctx.is_train, qfc_attr);
+        TShape bshape = Shape1(param.num_hidden);
+        int bias_size = bshape.Size();
+        //Bias cache only applied to threshold enabled
+        if (!FCFwd.IsBiasCached()) {
+            if (!param.no_bias || input_shift != 0) {
+                input_shift *= (input_scale);
+                if (!param.no_bias) {
+                    int8_t* bias_ptr = in_data[2].data().dptr<int8_t>();
+                    int32_t* tmp_bias_ptr = tmp_buf->data().dptr<int32_t>();
+
+                    float bias_min = in_data[num_inputs + 4].data().dptr<float>()[0];
+                    float bias_max = in_data[num_inputs + 5].data().dptr<float>()[0];
+                    float bias_range = MaxAbs(bias_min, bias_max);
+                    float bias_requant_scale = input_scale * weight_scale * bias_range / int8_quantized_range;
+
+                    if (input_shift != 0) {
+                        //TO check, weight_mem?
+                        int32_t *weight_shift = shift_buf.get_weight_shift_buf(wshape);
+                        QuantizedGetWeightShift(weight_shift, weight.data().dptr<int8_t>(), wshape);
+#       pragma omp parallel for schedule(static)
+                        for (int i = 0; i < bias_size; i++) {
+                            tmp_bias_ptr[i] = bias_ptr[i] * bias_requant_scale + input_shift * weight_shift[i];
+                        }
+                    }
+                    else {
+#       pragma omp parallel for schedule(static)
+                        for (int i = 0; i < bias_size; i++) {
+                            tmp_bias_ptr[i] = bias_ptr[i] * bias_requant_scale;
+                        }
+                    }
+                }
+                else if (input_shift != 0) {
+                    int32_t* tmp_bias_ptr = tmp_buf->data().dptr<int32_t>();
+                    int32_t *weight_shift = shift_buf.get_weight_shift_buf(wshape);
+                    QuantizedGetWeightShift(weight_shift, weight.data().dptr<int8_t>(), wshape);
+#       pragma omp parallel for schedule(static)
+                    for (int i = 0; i < bias_size; i++) {
+                        tmp_bias_ptr[i] = input_shift * weight_shift[i];
+                    }
+                }
+            }
+        }
+
+
+        auto data_mem = data.GetMKLDNNDataReorder(FCFwd.ipFwd_pd.src_primitive_desc());
+        const mkldnn::memory * weight_mem = nullptr;
+        if (!FCFwd.IsWeightCached()) {
+            const mkldnn::memory *w_mem = weight.GetMKLDNNData();
+            const mkldnn::memory::primitive_desc &new_pd = FCFwd.ipFwd_pd.weights_primitive_desc(); 
+            weight_mem = new mkldnn::memory(new_pd);
+            MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*w_mem, *weight_mem));
+        }
+        auto out_mem = CreateMKLDNNMem(out_data[0], FCFwd.ipFwd_pd.dst_primitive_desc(), req[0]);
+        if (tmp_buf) {
+            if (!FCFwd.IsBiasCached()) {
+                auto bias_mem = tmp_buf->GetMKLDNNDataReorder(FCFwd.ipFwd_pd.bias_primitive_desc());
+                FCFwd.SetNewMem(*data_mem, weight_mem, bias_mem, *out_mem.second);
+            }
+            else {
+                FCFwd.SetNewMem(*data_mem, weight_mem, nullptr, *out_mem.second);
+            }
+        }
+        else {
+            FCFwd.SetNewMem(*data_mem, weight_mem, nullptr, *out_mem.second);
+        }
+        MKLDNNStream::Get()->RegisterPrim(FCFwd.GetIpFwd());
+        CommitOutput(out_data[fullc::kOut], out_mem);
+        MKLDNNStream::Get()->Submit();
+
+	} else if ((data_t == mshadow::kInt8)
+		&& (weight_t == mshadow::kInt8)
+		&& (out_t == mshadow::kFloat32 || out_t == mshadow::kUint8))
+	{ 
+        //TODO: MKLDNN not support int8 input
+        int8_t * data_ptr = in_data[0].data().dptr<int8_t>();
+        int8_t * weight_ptr = in_data[1].data().dptr<int8_t>();
+
+        float input_scale = int8_quantized_range / MaxAbs(input_max, input_min); //Qa
+        float weight_scale = int8_quantized_range / weight_max; //Qw
+        float qfc_scales = 1 / input_scale/ weight_scale;  //1/Qa*Qw
+        MKL_INT32       co = 0;
+        const int m = ishape[0], n = ishape.ProdShape(1, ishape.ndim()), k = oshape.ProdShape(1, oshape.ndim());
+        
+        uint8_t * gemm_a = shift_buf.get_shift_buffer(data_ptr);
+        int32_t * tmp_out = shift_buf.get_obuffer();
+        cblas_gemm_s8u8s32(CblasColMajor, CblasTrans, CblasNoTrans, CblasFixOffset, //offsetc
+             k,
+             m,
+             n,
+             1.0f,
+             weight_ptr, n, 0,
+             gemm_a, n, 0,
+             0.0f,
+             tmp_out, k, &co);
+        TShape bshape = Shape1(param.num_hidden);
+        const TBlob& out_blob = out_data[0].data();
+
+        int out_size = oshape.Size();
+
+        float weight_shift_scale = shift_buf_t::INT8_SHIFT_CONST / input_scale / weight_scale;
+        int32_t *weight_shift = shift_buf.get_weight_shift_buf(wshape);
+        if (out_t == mshadow::kFloat32) {
+            float * out_ptr = out_blob.dptr<float>();
+            QuantizedGetWeightShift(weight_shift, weight.data().dptr<int8_t>(), wshape);
+            if (!param.no_bias) {
+                int8_t *bias_ptr = in_data[2].data().dptr<int8_t>();
+                float bias_min = in_data[num_inputs + 4].data().dptr<float>()[0];
+                float bias_max = in_data[num_inputs + 5].data().dptr<float>()[0];
+                float bias_range = MaxAbs(bias_min, bias_max);
+                float bias_requant_scale = bias_range / int8_quantized_range;
+#       pragma omp parallel for schedule(static)
+                for (int i = 0; i < out_size; i++) {
+                    out_ptr[i] = (tmp_out[i]) * qfc_scales - weight_shift_scale * weight_shift[i%k]
+                        + bias_ptr[i%k] * bias_requant_scale;
+                }
+            }
+            else {
+#       pragma omp parallel for schedule(static)
+                for (int i = 0; i < out_size; i++) {
+                    out_ptr[i] = (tmp_out[i]) * qfc_scales - weight_shift_scale * weight_shift[i%k];
+                }
+            }
+        }
+        else if (out_t == mshadow::kUint8) {
+            float out_calib_range = 1.0;      
+            float out_requant_scale = 1.0;
+            if (param.out_enable_calib_range) {
+                out_calib_range = MaxAbs(param.out_min_calib_range, param.out_max_calib_range);
+                out_requant_scale = uint8_quantized_range / out_calib_range;
+                *out_data[1].data().dptr<float>() = 0;
+                *out_data[2].data().dptr<float>() = out_calib_range;
+            }
+            else {
+                LOG(FATAL) << "mkldnn quantize fc only supports out_enable_calib_range enabled";
+            }
+            uint8_t * out_ptr = out_blob.dptr<uint8_t>();
+            QuantizedGetWeightShift(weight_shift, weight.data().dptr<int8_t>(), wshape);
+            if (!param.no_bias) {
+                int8_t *bias_ptr = in_data[2].data().dptr<int8_t>();
+                float bias_min = in_data[num_inputs + 4].data().dptr<float>()[0];
+                float bias_max = in_data[num_inputs + 5].data().dptr<float>()[0];
+                float bias_range = MaxAbs(bias_min, bias_max);
+                float bias_requant_scale = bias_range / int8_quantized_range;
+//#       pragma omp parallel for schedule(static)
+                for (int i = 0; i < out_size; i++) {
+
+                    float out_v = out_requant_scale*((tmp_out[i]) * qfc_scales - weight_shift_scale * weight_shift[i%k]
+                        + bias_ptr[i%k] * bias_requant_scale);
+                    out_ptr[i] = out_v <= 0 ? 0 : (uint8_t)out_v;
+                }
+            }
+            else {
+#       pragma omp parallel for schedule(static)
+                for (int i = 0; i < out_size; i++) {
+                    float out_v = out_requant_scale*((tmp_out[i]) * qfc_scales - weight_shift_scale * weight_shift[i%k]);
+                    out_ptr[i] = out_v <= 0 ? 0 : (uint8_t)out_v;
+                }
+            }
+        }
+    }
+    else {
+        LOG(FATAL) << "mkldnn quantize fc only supports Output Type " << out_t;
+    }
+}
+
+// Storage inference for the FComputeEx path: asks storage_type_assign to give the
+// outputs kDefaultStorage with FComputeEx dispatch and returns its verdict.
+inline static bool QuantizedFullyStorageType(const nnvm::NodeAttrs& attrs,
+                                             const int dev_mask,
+                                             DispatchMode* dispatch_mode,
+                                             std::vector<int> *in_attrs,
+                                             std::vector<int> *out_attrs) {
+    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
+                               dispatch_mode, DispatchMode::kFComputeEx);
+}
+
+// CPU registrations for the quantized fully-connected op, chained on one handle:
+// the FCompute kernel, the storage-type inference, and the FComputeEx kernel.
+NNVM_REGISTER_OP(_contrib_quantized_fully_connected)
+.set_attr<FCompute>("FCompute<cpu>", QuantizedFullyConnectedForwardCPU)
+.set_attr<FInferStorageType>("FInferStorageType", QuantizedFullyStorageType)
+.set_attr<FComputeEx>("FComputeEx<cpu>", QuantizedFullyConnectedForwardExCPU);
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // MXNET_USE_MKLDNN == 1
diff --git a/src/operator/quantization/mkldnn/mkldnn_quantized_util.h b/src/operator/quantization/mkldnn/mkldnn_quantized_util.h
new file mode 100644
index 0000000..a16d1d2
--- /dev/null
+++ b/src/operator/quantization/mkldnn/mkldnn_quantized_util.h
@@ -0,0 +1,28 @@
+#ifndef MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZED_UTIL_INL_H_
+#define MXNET_OPERATOR_QUANTIZATION_MKLDNN_MKLDNN_QUANTIZED_UTIL_INL_H_
+
+namespace mxnet {
+namespace op {
+// Signed quantized range of `dtype`: MinAbs applied to the type's MaxValue/MinValue.
+template<typename dtype>
+float get_signed_quantized_range() {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  const dtype type_max = red::limits::MaxValue<dtype>();
+  const dtype type_min = red::limits::MinValue<dtype>();
+  return MinAbs(type_max, type_min);
+}
+// Unsigned quantized range of `dtype`: MaxAbs applied to the type's MaxValue/MinValue.
+template<typename dtype>
+float get_unsigned_quantized_range() {
+  using namespace mshadow;
+  using namespace mxnet_op;
+  const dtype type_max = red::limits::MaxValue<dtype>();
+  const dtype type_min = red::limits::MinValue<dtype>();
+  return MaxAbs(type_max, type_min);
+}
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif
\ No newline at end of file
diff --git a/src/operator/quantization/quantize-inl.h b/src/operator/quantization/quantize-inl.h
index 8b7a11c..537f39c 100644
--- a/src/operator/quantization/quantize-inl.h
+++ b/src/operator/quantization/quantize-inl.h
@@ -24,7 +24,8 @@
  */
 #ifndef MXNET_OPERATOR_QUANTIZATION_QUANTIZE_INL_H_
 #define MXNET_OPERATOR_QUANTIZATION_QUANTIZE_INL_H_
-
+#include <dmlc/logging.h>
+#include <dmlc/parameter.h>
 #include <mxnet/operator_util.h>
 #include <vector>
 #include <limits>
@@ -38,12 +39,35 @@ namespace op {
 
 struct QuantizeParam : public dmlc::Parameter<QuantizeParam> {
   int   out_type;
+  // Threshold parameters; present in all builds, not only MXNET_USE_MKLDNN == 1.
+  bool threshold_enabled;
+  float th_min;
+  float th_max;
+  // (QuantizeShape and QuantizeType read threshold_enabled with no MKLDNN guard.)
   DMLC_DECLARE_PARAMETER(QuantizeParam) {
     DMLC_DECLARE_FIELD(out_type)
     .add_enum("int8", mshadow::kInt8)
     .add_enum("uint8", mshadow::kUint8)
     .set_default(mshadow::kUint8)
     .describe("Output data type.");
+    // Registered unconditionally so the three threshold members always exist.
+    DMLC_DECLARE_FIELD(threshold_enabled).set_default(false)
+        .describe("Whether to enable threshold.");
+    DMLC_DECLARE_FIELD(th_min).set_default(0.0)
+        .describe("min value of threashold.");
+    DMLC_DECLARE_FIELD(th_max).set_default(0.0)
+        .describe("max value of threashold.");
+    // Defaults above leave thresholding off unless a caller opts in.
+  }
+
+  bool operator==(const QuantizeParam& other) const {
+      // Member-wise comparison; th_min/th_max are compared exactly.
+      bool ret = this->out_type == other.out_type;
+      ret = ret && this->threshold_enabled == other.threshold_enabled
+          && this->th_min == other.th_min
+          && this->th_max == other.th_max;
+      // All four parameters must match for equality.
+      return ret;
   }
 };
 
@@ -112,11 +136,18 @@ void QuantizeCompute(const nnvm::NodeAttrs& attrs,
 inline bool QuantizeShape(const nnvm::NodeAttrs& attrs,
                           std::vector<TShape> *in_attrs,
                           std::vector<TShape> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 3U);
+  const QuantizeParam& param = nnvm::get<QuantizeParam>(attrs.parsed);
+  if (!param.threshold_enabled) {
+    CHECK_EQ(in_attrs->size(), 3U);
+  } else {
+    CHECK_EQ(in_attrs->size(), 1U);
+  }
   CHECK_EQ(out_attrs->size(), 3U);
-
-  for (size_t i = 1; i < 3; ++i) {
-    SHAPE_ASSIGN_CHECK(*in_attrs, i, TShape({1}));
+  
+  if (!param.threshold_enabled) {
+    for (size_t i = 1; i < 3; ++i) {
+      SHAPE_ASSIGN_CHECK(*in_attrs, i, TShape({1}));
+    }
   }
 
   SHAPE_ASSIGN_CHECK(*out_attrs, 0, in_attrs->at(0));
@@ -128,12 +159,19 @@ inline bool QuantizeShape(const nnvm::NodeAttrs& attrs,
 inline bool QuantizeType(const nnvm::NodeAttrs& attrs,
                          std::vector<int> *in_attrs,
                          std::vector<int> *out_attrs) {
-  CHECK_EQ(in_attrs->size(), 3U);
-  CHECK_EQ(out_attrs->size(), 3U);
   const QuantizeParam& param = nnvm::get<QuantizeParam>(attrs.parsed);
+  if (!param.threshold_enabled) {
+    CHECK_EQ(in_attrs->size(), 3U);
+  } else {
+    CHECK_EQ(in_attrs->size(), 1U);
+  }
+  CHECK_EQ(out_attrs->size(), 3U);
+
   TYPE_ASSIGN_CHECK(*in_attrs, 0, mshadow::kFloat32);
-  TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
-  TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32);
+  if (!param.threshold_enabled) {
+    TYPE_ASSIGN_CHECK(*in_attrs, 1, mshadow::kFloat32);
+    TYPE_ASSIGN_CHECK(*in_attrs, 2, mshadow::kFloat32);
+  }
   if (param.out_type == mshadow::kUint8) {
     TYPE_ASSIGN_CHECK(*out_attrs, 0, mshadow::kUint8);
   } else if (param.out_type == mshadow::kInt8) {
@@ -148,4 +186,6 @@ inline bool QuantizeType(const nnvm::NodeAttrs& attrs,
 
 }  // namespace op
 }  // namespace mxnet
+
+
 #endif  // MXNET_OPERATOR_QUANTIZATION_QUANTIZE_INL_H_
diff --git a/src/operator/quantization/quantize.cc b/src/operator/quantization/quantize.cc
index 5227751..c2e03bc 100644
--- a/src/operator/quantization/quantize.cc
+++ b/src/operator/quantization/quantize.cc
@@ -73,11 +73,18 @@ where
 .. Note::
     This operator only supports forward propogation. DO NOT use it in training.)code" ADD_FILELINE)
 .set_attr_parser(ParamParser<QuantizeParam>)
-.set_num_inputs(3)
+.set_num_inputs([](const NodeAttrs& attrs) {
+  const QuantizeParam& param = nnvm::get<QuantizeParam>(attrs.parsed);
+  return param.threshold_enabled ? 1 : 3;
+})
 .set_num_outputs(3)
 .set_attr<nnvm::FListInputNames>("FListInputNames",
   [](const NodeAttrs& attrs) {
-    return std::vector<std::string>{"data", "min_range", "max_range"};
+    const QuantizeParam& param = nnvm::get<QuantizeParam>(attrs.parsed);
+    if(param.threshold_enabled)
+      return std::vector<std::string>{"data"};
+    else
+      return std::vector<std::string>{"data", "min_range", "max_range"};
   })
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizeShape)
 .set_attr<nnvm::FInferType>("FInferType", QuantizeType)
diff --git a/src/operator/quantization/quantize_graph_pass.cc b/src/operator/quantization/quantize_graph_pass.cc
index 2fa790d..7a6db65 100644
--- a/src/operator/quantization/quantize_graph_pass.cc
+++ b/src/operator/quantization/quantize_graph_pass.cc
@@ -26,6 +26,8 @@
 #include <nnvm/pass.h>
 #include <mxnet/op_attr_types.h>
 #include <unordered_set>
+#include "quantize-inl.h"
+
 
 namespace mxnet {
 namespace op {
@@ -119,13 +121,14 @@ inline bool NeedQuantize(NodePtr node, const std::unordered_set<std::string>& ex
 Graph QuantizeGraph(Graph &&src) {
   static auto& quantized_op_map = Op::GetAttr<mxnet::FQuantizedOp>("FQuantizedOp");
   static auto& need_requantize_map = Op::GetAttr<mxnet::FNeedRequantize>("FNeedRequantize");
+  static auto& need_dequantize_map = Op::GetAttr<mxnet::FNeedDequantize>("FNeedDequantize");
   static auto& avoid_quantize_input_map =
       Op::GetAttr<mxnet::FAvoidQuantizeInput>("FAvoidQuantizeInput");
   auto offline_params = src.GetAttr<std::unordered_set<std::string>>("offline_params");
   auto excluded_nodes = src.GetAttr<std::unordered_set<std::string>>("excluded_nodes");
   auto quantized_dtype = src.GetAttr<std::string>("quantized_dtype");
   auto calib_quantize = src.GetAttr<bool>("calib_quantize");
-
+  static auto& use_offline_paramth_map = Op::GetAttr<mxnet::FUseOfflineParamThreshold>("FUseOfflineParamThreshold");
   // mirror_map stores the mapping from the currently visited graph to the newly created quantized
   // graph. Key is the currently visited graph's node pointer, and value is a copied node of the key
   // node. The existing key's value may be updated with the newly created quantize/dequantize op.
@@ -161,19 +164,26 @@ Graph QuantizeGraph(Graph &&src) {
             e.node->attrs.name + "_quantize", new_node, mirror_entry);
           quantize_node->attrs.dict["out_type"] = quantized_dtype;
           quantize_node->op()->attr_parser(&(quantize_node->attrs));
-          if (calib_quantize) {
-            NodePtr min_var = CreateNode("nullptr", e.node->attrs.name + "_min");
-            quantize_node->inputs.emplace_back(NodeEntry{min_var, 0, 0});
-            NodePtr max_var = CreateNode("nullptr", e.node->attrs.name + "_max");
-            quantize_node->inputs.emplace_back(NodeEntry{max_var, 0, 0});
-          } else {
-            NodePtr min_node = InsertNode("min",
-                e.node->attrs.name + "_min", quantize_node, mirror_entry);
-            min_node->op()->attr_parser(&(min_node->attrs));
+          bool use_offline_param_th = false;
+          if (use_offline_paramth_map.count(new_node->op()) > 0 &&
+              use_offline_paramth_map[new_node->op()](new_node->attrs))
+              use_offline_param_th = true;
+          if (!use_offline_param_th) {
+              if (calib_quantize) {
+                  NodePtr min_var = CreateNode("nullptr", e.node->attrs.name + "_min");
+                  quantize_node->inputs.emplace_back(NodeEntry{ min_var, 0, 0 });
+                  NodePtr max_var = CreateNode("nullptr", e.node->attrs.name + "_max");
+                  quantize_node->inputs.emplace_back(NodeEntry{ max_var, 0, 0 });
+              }
+              else {
+                  NodePtr min_node = InsertNode("min",
+                      e.node->attrs.name + "_min", quantize_node, mirror_entry);
+                  min_node->op()->attr_parser(&(min_node->attrs));
 
-            NodePtr max_node = InsertNode("max",
-                e.node->attrs.name + "_max", quantize_node, mirror_entry);
-            max_node->op()->attr_parser(&(max_node->attrs));
+                  NodePtr max_node = InsertNode("max",
+                      e.node->attrs.name + "_max", quantize_node, mirror_entry);
+                  max_node->op()->attr_parser(&(max_node->attrs));
+              }
           }
           mirror_map[e.node.get()] = std::move(quantize_node);
         } else if (mirror_node->op() != nullptr
@@ -262,7 +272,9 @@ Graph QuantizeGraph(Graph &&src) {
           // if input node is quantized operator, add dequantize node
           if (NeedQuantize(e.node, excluded_nodes) &&
               (mirror_node->op() == nullptr ||
-              mirror_node->op()->name != "_contrib_dequantize")) {
+              mirror_node->op()->name != "_contrib_dequantize") &&
+              (need_dequantize_map.count(mirror_node->op()) > 0 &&
+              need_dequantize_map[mirror_node->op()](mirror_node->attrs))) {
             // here we calculate the output number (exclude min/max, in order to
             // calculate min/max index from mirror node) based on assumption that
             // there is only 1min and 1max output from mirror node (which is
@@ -329,6 +341,7 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) {
     nnvm::Op::GetAttr<mxnet::FNeedRequantize>("FNeedRequantize");
   const auto& calib_table =
     g.GetAttr<std::unordered_map<std::string, std::pair<float, float>>>("calib_table");
+  const NodePtr* current_quantize_node = NULL;
   DFSVisit(g.outputs, [&](const NodePtr& node) {
     // If the current op is requantize
     // find the thresholds from the calibration table with the key equal
@@ -362,6 +375,27 @@ Graph SetCalibTableToQuantizedGraph(Graph&& g) {
         node->op()->attr_parser(&(node->attrs));
       }
     }
+    else if (node->op() != nullptr && node->op()->name == "_contrib_quantize") {
+      current_quantize_node = &node;
+    }
+    else if (current_quantize_node != NULL && node->inputs.size() != 0 )
+    {
+        NodePtr quantized_op_node = node->inputs[0].node;
+        if (quantized_op_node->attrs.name == (*current_quantize_node)->attrs.name)
+        {
+          std::string node_name = node->attrs.name;
+          const QuantizeParam& qparam = nnvm::get<QuantizeParam>((*current_quantize_node)->attrs.parsed);
+          auto pos = node_name.find("quantized_");
+          std::string node_data_name = node_name + "_data"; // tmp code
+          const auto input_calib_table_iter = calib_table.find(node_data_name);
+          if (input_calib_table_iter != calib_table.end()) {
+            (*current_quantize_node)->attrs.dict["threshold_enabled"] = "True";
+            (*current_quantize_node)->attrs.dict["th_min"] = std::to_string(input_calib_table_iter->second.first);
+            (*current_quantize_node)->attrs.dict["th_max"] = std::to_string(input_calib_table_iter->second.second);
+          }
+        }
+        current_quantize_node = NULL;
+    }
   });
   return g;
 }
diff --git a/src/operator/quantization/quantized_fully_connected.cc b/src/operator/quantization/quantized_fully_connected.cc
index e334fe7..d31d20f 100644
--- a/src/operator/quantization/quantized_fully_connected.cc
+++ b/src/operator/quantization/quantized_fully_connected.cc
@@ -36,15 +36,16 @@ bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs,
   using namespace mshadow;
   uint32_t num_inputs = param.no_bias ? 2 : 3;
   CHECK_EQ(in_shape->size(), num_inputs * 3);
-  CHECK_EQ(out_shape->size(), 3U);
+
 
   CHECK(!shape_is_none(in_shape->at(0)))
     << "QuantizedFullyConnectedOp input data shape must be given";
   const TShape& dshape = in_shape->at(0);
   TShape wshape = Shape2(param.num_hidden, dshape.ProdShape(1, dshape.ndim()));
+  TShape bshape = Shape1(param.num_hidden);
+
   SHAPE_ASSIGN_CHECK(*in_shape, 1, wshape);
   if (!param.no_bias) {
-    TShape bshape = Shape1(param.num_hidden);
     SHAPE_ASSIGN_CHECK(*in_shape, 2, bshape);
   }
 
@@ -52,9 +53,16 @@ bool QuantizedFullyConnectedShape(const nnvm::NodeAttrs& attrs,
     SHAPE_ASSIGN_CHECK(*in_shape, i, TShape{1});
   }
 
-  SHAPE_ASSIGN_CHECK(*out_shape, 0, TShape({dshape[0], wshape[0]}));
-  SHAPE_ASSIGN_CHECK(*out_shape, 1, TShape({1}));
-  SHAPE_ASSIGN_CHECK(*out_shape, 2, TShape({1}));
+  SHAPE_ASSIGN_CHECK(*out_shape, 0, TShape({ dshape[0], wshape[0] }));
+  if (param.output_type == mshadow::kFloat32) {
+    CHECK_EQ(out_shape->size(), 1U);
+  }
+  else {
+    CHECK_EQ(out_shape->size(), 3U);
+    SHAPE_ASSIGN_CHECK(*out_shape, 1, TShape({1}));
+    SHAPE_ASSIGN_CHECK(*out_shape, 2, TShape({1}));
+  }
+
   return true;
 }
 
@@ -64,21 +72,37 @@ bool QuantizedFullyConnectedType(const nnvm::NodeAttrs& attrs,
   const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
   uint32_t num_inputs = param.no_bias ? 2 : 3;
   CHECK_EQ(in_type->size(), num_inputs * 3);
-  CHECK_EQ(out_type->size(), 3U);
+  //int data could be uint8 or int8
+  //TYPE_ASSIGN_CHECK(*in_type, 0, mshadow::kUint8);
+  TYPE_ASSIGN_CHECK(*in_type, 1, mshadow::kInt8); //Weight
+
+  if (!param.no_bias)
+      TYPE_ASSIGN_CHECK(*in_type, 2, mshadow::kInt8);
 
-  for (size_t i = 0; i < num_inputs; ++i) {
-    TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kInt8);
-  }
   for (size_t i = num_inputs; i < 3 * num_inputs; ++i) {
     TYPE_ASSIGN_CHECK(*in_type, i, mshadow::kFloat32);
   }
+  if (param.output_type == mshadow::kFloat32) {
+      CHECK_EQ(out_type->size(), 1U);
+      TYPE_ASSIGN_CHECK(*out_type, 0, mshadow::kFloat32);
+  } else if (param.output_type == mshadow::kUint8) {
+      CHECK_EQ(out_type->size(), 3U);
+      TYPE_ASSIGN_CHECK(*out_type, 0, mshadow::kUint8);
+      TYPE_ASSIGN_CHECK(*out_type, 1, mshadow::kFloat32);
+      TYPE_ASSIGN_CHECK(*out_type, 2, mshadow::kFloat32);
+  } else if (param.output_type == mshadow::kInt32) {
+      CHECK_EQ(out_type->size(), 3U);
+      TYPE_ASSIGN_CHECK(*out_type, 0, mshadow::kInt32);
+      TYPE_ASSIGN_CHECK(*out_type, 1, mshadow::kFloat32);
+      TYPE_ASSIGN_CHECK(*out_type, 2, mshadow::kFloat32);
+  }
+  else
+      return false;  
 
-  TYPE_ASSIGN_CHECK(*out_type, 0, mshadow::kInt32);
-  TYPE_ASSIGN_CHECK(*out_type, 1, mshadow::kFloat32);
-  TYPE_ASSIGN_CHECK(*out_type, 2, mshadow::kFloat32);
   return true;
 }
 
+
 NNVM_REGISTER_OP(_contrib_quantized_fully_connected)
 .describe(R"code(Fully Connected operator for input, weight and bias data type of int8,
 and accumulates in type int32 for the output. For each argument, two more arguments of type
@@ -93,7 +117,11 @@ and max thresholds representing the threholds for quantizing the float32 output
     const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
     return param.no_bias? 6 : 9;
   })
-.set_num_outputs(3)
+.set_num_outputs(
+    [](const NodeAttrs& attrs) {
+      const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+      return param.output_type == mshadow::kFloat32 ? 1 : 3;
+  })
 .set_attr_parser(ParamParser<FullyConnectedParam>)
 .set_attr<nnvm::FListInputNames>("FListInputNames",
   [](const NodeAttrs& attrs) {
@@ -108,11 +136,21 @@ and max thresholds representing the threholds for quantizing the float32 output
   })
 .set_attr<nnvm::FListOutputNames>("FListOutputNames",
   [](const NodeAttrs& attrs) {
-    return std::vector<std::string>{"output", "min_output", "max_output"};
-  })
+	  const FullyConnectedParam& param = nnvm::get<FullyConnectedParam>(attrs.parsed);
+	  if (param.output_type == mshadow::kFloat32)
+		 return std::vector<std::string>{"output"};
+	  else
+		  return std::vector<std::string>{"output", "min_output", "max_output"};
+      })
 .set_attr<nnvm::FInferShape>("FInferShape", QuantizedFullyConnectedShape)
 .set_attr<nnvm::FInferType>("FInferType", QuantizedFullyConnectedType)
 .set_attr<FNeedRequantize>("FNeedRequantize", [](const NodeAttrs& attrs) { return true; })
+.set_attr<FNeedDequantize>("FNeedDequantize", [](const NodeAttrs& attrs) { return false; })
+.set_attr<FUseOfflineParamThreshold>("FUseOfflineParamThreshold", [](const NodeAttrs& attrs) {
+#if MXNET_USE_MKLDNN == 1
+          return true;
+#endif
+          return false; })
 .add_argument("data", "NDArray-or-Symbol", "Input data.")
 .add_argument("weight", "NDArray-or-Symbol", "weight.")
 .add_argument("bias", "NDArray-or-Symbol", "bias.")
diff --git a/src/operator/slice_channel-inl.h b/src/operator/slice_channel-inl.h
index 3b14a26..27af0fa 100644
--- a/src/operator/slice_channel-inl.h
+++ b/src/operator/slice_channel-inl.h
@@ -99,7 +99,14 @@ class SliceChannelOp : public Operator {
     for (int i = 0; i < size_; ++i) {
       outputs[i] = out_data[i].get_with_shape<xpu, 3, DType>(slice_shape, s);
     }
-    Split(data, &outputs, 1, req);
+
+    // 3D dshape and trailing==1, split_2d can be used to speedup
+    if (trailing == 1 && std::is_same<xpu, cpu>::value) {
+      Split_2D(data, &outputs, 1, req);
+    } else {
+      Split(data, &outputs, 1, req);
+    }
+
   }
 
   virtual void Backward(const OpContext &ctx,
@@ -147,6 +154,74 @@ class SliceChannelOp : public Operator {
 template<typename xpu>
 Operator *CreateOp(SliceChannelParam param, int dtype);
 
+// Shape inference for SliceChannel: splits one axis of the single input into
+// num_outputs equal parts (removing that axis when squeeze_axis applies) and
+// back-calculates the input shape from the outputs. aux_shape is not touched.
+inline bool SliceChannelInferShape(std::vector<TShape> *in_shape,
+                                   std::vector<TShape> *out_shape,
+                                   std::vector<TShape> *aux_shape,
+                                   int num_outputs, int axis, bool squeeze_axis) {
+  using namespace mshadow;
+  CHECK_EQ(in_shape->size(), 1U);
+  TShape dshape = in_shape->at(slice_enum::kData);
+  TShape ishape = in_shape->at(slice_enum::kData);
+  if (dshape.ndim() == 0) return false;
+  if (axis >= 0) {
+    CHECK_LT(static_cast<size_t>(axis), dshape.ndim());
+  } else {
+    CHECK_LT(axis + dshape.ndim(), dshape.ndim());
+  }
+  int real_axis = axis;
+  if (real_axis < 0) {
+    real_axis += dshape.ndim();
+  }
+  CHECK_EQ(dshape[real_axis] % num_outputs, 0U)
+    << "You are trying to split the " << real_axis
+    << "-th axis of input tensor with shape " << dshape
+    << " into num_outputs=" << num_outputs
+    << " evenly sized chunks, but this is not possible because "
+    << num_outputs << " does not evenly divide "
+    << dshape[real_axis];
+  if (squeeze_axis && ishape[real_axis] != 0) {
+    CHECK_EQ(ishape[real_axis], static_cast<size_t>(num_outputs))
+      << "If squeeze axis is True, the size of the sliced axis must be the same as num_outputs."
+      << " Input shape=" << ishape << ", axis=" << real_axis
+      << ", num_outputs=" << num_outputs << ".";
+  }
+  dshape[real_axis] /= num_outputs;
+  if (squeeze_axis && (dshape[real_axis] == 1 || ishape[real_axis] == 0)) {
+    for (int d = real_axis; d < static_cast<int>(dshape.ndim()) - 1; ++d) {
+      dshape[d] = dshape[d + 1];
+    }
+    dshape = TShape(&dshape[0], &dshape[dshape.ndim() - 1]);
+  }
+  CHECK_EQ(static_cast<int>((*out_shape).size()), num_outputs)
+    << "Size of output shape mismatch!";
+  for (int i = 0; i < num_outputs; ++i) {
+    SHAPE_ASSIGN_CHECK(*out_shape, i, dshape);
+    // Perform incomplete shape inference.
+    // We can back-calculate the inshape based on the out_shape.
+    TShape back_calculate_dshape = ishape;
+    if (squeeze_axis && (dshape.ndim() == ishape.ndim() - 1)) {
+      for (int d = 0; d < real_axis; ++d) {
+        back_calculate_dshape[d] = (*out_shape)[i][d];
+      }
+      back_calculate_dshape[real_axis] = num_outputs;
+      for (int d = real_axis + 1; d < static_cast<int>(ishape.ndim()); ++d) {
+        back_calculate_dshape[d] = (*out_shape)[i][d - 1];
+      }
+    } else {
+      for (int d = 0; d < static_cast<int>(ishape.ndim()); ++d) {
+        back_calculate_dshape[d] = (*out_shape)[i][d];
+        if (d == real_axis) {
+          back_calculate_dshape[d] *= num_outputs;
+        }
+      }
+    }
+    SHAPE_ASSIGN_CHECK(*in_shape, slice_enum::kData, back_calculate_dshape);
+  }
+  return true;
+}
 
 #if DMLC_USE_CXX11
 class SliceChannelProp : public OperatorProperty {
@@ -191,66 +266,8 @@ class SliceChannelProp : public OperatorProperty {
   bool InferShape(std::vector<TShape> *in_shape,
                   std::vector<TShape> *out_shape,
                   std::vector<TShape> *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 1U);
-    TShape dshape = in_shape->at(slice_enum::kData);
-    TShape ishape = in_shape->at(slice_enum::kData);
-    if (dshape.ndim() == 0) return false;
-    if (param_.axis >= 0) {
-      CHECK_LT(static_cast<size_t>(param_.axis), dshape.ndim());
-    } else {
-      CHECK_LT(param_.axis + dshape.ndim(), dshape.ndim());
-    }
-    int real_axis = param_.axis;
-    if (real_axis < 0) {
-      real_axis += dshape.ndim();
-    }
-    CHECK_EQ(dshape[real_axis] % param_.num_outputs, 0U)
-      << "You are trying to split the " << real_axis
-      << "-th axis of input tensor with shape " << dshape
-      << " into num_outputs=" << param_.num_outputs
-      << " evenly sized chunks, but this is not possible because "
-      << param_.num_outputs << " does not evenly divide "
-      << dshape[real_axis];
-    if (param_.squeeze_axis && ishape[real_axis] != 0) {
-      CHECK_EQ(ishape[real_axis], static_cast<size_t>(param_.num_outputs))
-        << "If squeeze axis is True, the size of the sliced axis must be the same as num_outputs."
-        << " Input shape=" << ishape << ", axis=" << real_axis
-        << ", num_outputs=" << param_.num_outputs << ".";
-    }
-    dshape[real_axis] /= param_.num_outputs;
-    if (param_.squeeze_axis && (dshape[real_axis] == 1 || ishape[real_axis] == 0)) {
-      for (int d = real_axis; d < static_cast<int>(dshape.ndim()) - 1; ++d) {
-        dshape[d] = dshape[d+1];
-      }
-      dshape = TShape(&dshape[0], &dshape[dshape.ndim()-1]);
-    }
-    CHECK_EQ(static_cast<int>((*out_shape).size()), param_.num_outputs)
-      << "Size of output shape mismatch!";
-    for (int i = 0; i < param_.num_outputs; ++i) {
-      SHAPE_ASSIGN_CHECK(*out_shape, i, dshape);
-      // Perform incomplete shape inference.
-      // We can back-calculate the inshape based on the out_shape.
-      TShape back_calculate_dshape = ishape;
-      if (param_.squeeze_axis && (dshape.ndim() == ishape.ndim() - 1)) {
-        for (int d = 0; d < real_axis; ++d) {
-          back_calculate_dshape[d] = (*out_shape)[i][d];
-        }
-        back_calculate_dshape[real_axis] = param_.num_outputs;
-        for (int d = real_axis + 1; d < static_cast<int>(ishape.ndim()); ++d) {
-          back_calculate_dshape[d] = (*out_shape)[i][d - 1];
-        }
-      } else {
-        for (int d = 0; d < static_cast<int>(ishape.ndim()); ++d) {
-          back_calculate_dshape[d] = (*out_shape)[i][d];
-          if (d == real_axis) {
-            back_calculate_dshape[d] *= param_.num_outputs;
-          }
-        }
-      }
-      SHAPE_ASSIGN_CHECK(*in_shape, slice_enum::kData, back_calculate_dshape);
-    }
-    return true;
+    return SliceChannelInferShape(in_shape, out_shape, aux_shape,
+                                  param_.num_outputs, param_.axis, param_.squeeze_axis);
   }
 
   OperatorProperty* Copy() const override {
diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h
index fec321b..d578106 100644
--- a/src/operator/softmax_output-inl.h
+++ b/src/operator/softmax_output-inl.h
@@ -21,7 +21,7 @@
  * Copyright (c) 2015 by Contributors
  * \file softmax_output-inl.h
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Zhang Rong A
 */
 #ifndef MXNET_OPERATOR_SOFTMAX_OUTPUT_INL_H_
 #define MXNET_OPERATOR_SOFTMAX_OUTPUT_INL_H_
@@ -88,189 +88,252 @@ struct SoftmaxOutputParam : public dmlc::Parameter<SoftmaxOutputParam> {
               "one-hot encoding of the gold label and distributed uniformly to"
               "all other labels.");
   };
+  /*! \brief equal when every field compared below matches */
+  bool operator==(const SoftmaxOutputParam& other) const {
+    return grad_scale == other.grad_scale && ignore_label == other.ignore_label &&
+           multi_output == other.multi_output &&
+           use_ignore == other.use_ignore &&
+           preserve_shape == other.preserve_shape &&
+           normalization == other.normalization &&
+           out_grad == other.out_grad &&
+           smooth_alpha == other.smooth_alpha;
+  }
 };
 
 template<typename xpu, typename DType>
-class SoftmaxOutputOp : public Operator {
- public:
-  explicit SoftmaxOutputOp(SoftmaxOutputParam param) : param_(param) {}
+void SoftmaxOutputForward(const OpContext &ctx,
+                          const SoftmaxOutputParam &param,
+                          const std::vector<TBlob> &in_data,
+                          const std::vector<OpReqType> &req,
+                          const std::vector<TBlob> &out_data) {
+  // Writes Softmax(in_data[kData]) into out_data[kOut]; the label is not read.
+  using namespace mshadow;
+  using namespace mshadow::expr;
+  CHECK_EQ(in_data.size(), 2U) << "SoftmaxOutput Input: [data, label]";
+  CHECK_EQ(out_data.size(), 1U) << "SoftmaxOutput Output: [output]";
+  Stream<xpu> *s = ctx.get_stream<xpu>();
+  const TBlob &src = in_data[softmaxout_enum::kData];
+  const TBlob &dst = out_data[softmaxout_enum::kOut];
+  if (param.multi_output) {
+    // View both blobs as (n, k, Size()/n/k).
+    int n = src.size(0);
+    int k = src.size(1);
+    Shape<3> s3 = Shape3(n, k, static_cast<int>(src.Size()/n/k));
+    Tensor<xpu, 3, DType> data = src.get_with_shape<xpu, 3, DType>(s3, s);
+    Tensor<xpu, 3, DType> out = dst.get_with_shape<xpu, 3, DType>(s3, s);
+    Softmax(out, data);
+  } else if (param.preserve_shape) {
+    // Use the blobs' own 2-D flattening.
+    Tensor<xpu, 2, DType> data = src.FlatTo2D<xpu, DType>(s);
+    Tensor<xpu, 2, DType> out = dst.FlatTo2D<xpu, DType>(s);
+    Softmax(out, data);
+  } else {
+    // View both blobs as (n, Size()/n).
+    int n = src.size(0);
+    int k = src.Size()/n;
+    Shape<2> s2 = Shape2(n, k);
+    Tensor<xpu, 2, DType> data = src.get_with_shape<xpu, 2, DType>(s2, s);
+    Tensor<xpu, 2, DType> out = dst.get_with_shape<xpu, 2, DType>(s2, s);
+    Softmax(out, data);
+  }
+}
 
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 2U) << "SoftmaxOutput Input: [data, label]";
-    CHECK_EQ(out_data.size(), 1U) << "SoftmaxOutput Output: [output]";
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    if (param_.multi_output) {
-      int n = in_data[softmaxout_enum::kData].size(0);
-      int k = in_data[softmaxout_enum::kData].size(1);
-      Shape<3> s3 = Shape3(n, k, static_cast<int>(in_data[softmaxout_enum::kData].Size()/n/k));
-      Tensor<xpu, 3, DType> data =
-          in_data[softmaxout_enum::kData].get_with_shape<xpu, 3, DType>(s3, s);
-      Tensor<xpu, 3, DType> out =
-          out_data[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
-      Softmax(out, data);
-    } else {
-      if (param_.preserve_shape) {
-        Tensor<xpu, 2, DType> data = in_data[softmaxout_enum::kData].FlatTo2D<xpu, DType>(s);
-        Tensor<xpu, 2, DType> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
-        Softmax(out, data);
-      } else {
-        int n = in_data[softmaxout_enum::kData].size(0);
-        int k = in_data[softmaxout_enum::kData].Size()/n;
-        Shape<2> s2 = Shape2(n, k);
-        Tensor<xpu, 2, DType> data =
-            in_data[softmaxout_enum::kData].get_with_shape<xpu, 2, DType>(s2, s);
-        Tensor<xpu, 2, DType> out =
-            out_data[softmaxout_enum::kOut].get_with_shape<xpu, 2, DType>(s2, s);
-        Softmax(out, data);
-      }
-    }
+
+template<typename xpu>
+void SoftmaxOutputCompute(const nnvm::NodeAttrs& attrs,
+                      const OpContext& ctx, const std::vector<TBlob>& inputs,
+                      const std::vector<OpReqType>& req,
+                      const std::vector<TBlob>& outputs) {
+  const SoftmaxOutputParam &param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
+
+  // FCompute entry point: inputs are [data, label]. The typed forward checks
+  // for both inputs itself, so the full vector is passed through unchanged and
+  // the kernel is selected from the dtype of the first input.
+  CHECK_EQ(inputs.size(), 2U);
+  int dtype = inputs[0].type_flag_;
+  switch (dtype) {
+  case mshadow::kFloat32:
+    SoftmaxOutputForward<xpu, float>(ctx, param, inputs, req, outputs);
+    break;
+  case mshadow::kFloat64:
+    SoftmaxOutputForward<xpu, double>(ctx, param, inputs, req, outputs);
+    break;
+  case mshadow::kFloat16:
+    LOG(FATAL) << "float16 is currently not supported by "
+                  "SoftmaxOutputCompute.";
+    break;
+  default:
+    LOG(FATAL) << "Unsupported type " << dtype;
   }
+}
 
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 2U);
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_GE(in_grad.size(), 1U);
-    CHECK_GE(req.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-
-    if (out_data[softmaxout_enum::kOut].shape_ ==
-        in_data[softmaxout_enum::kLabel].shape_) {
-      // use probability as label
-      Tensor<xpu, 2, DType> label = in_data[softmaxout_enum::kLabel].FlatTo2D<xpu, DType>(s);
-      Tensor<xpu, 2, DType> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
-      Tensor<xpu, 2, DType> grad = in_grad[softmaxout_enum::kData].FlatTo2D<xpu, DType>(s);
-      if (param_.out_grad) {
-        Tensor<xpu, 2, DType> ograd = out_grad[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
-        grad = scalar<DType>(param_.grad_scale) * (out - label) * ograd;
-      } else {
-        grad = (out - label) * scalar<DType>(param_.grad_scale);
-      }
-    } else if (param_.multi_output) {
-      int n = out_data[softmaxout_enum::kOut].size(0);
-      int k = out_data[softmaxout_enum::kOut].size(1);
-      Shape<3> s3 = Shape3(n, k, static_cast<int>(out_data[softmaxout_enum::kOut].Size()/n/k));
-      Shape<2> s2 = Shape2(s3[0], s3[2]);
-      Tensor<xpu, 2, DType> label =
-          in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 2, DType>(s2, s);
-      Tensor<xpu, 3, DType> out =
-          out_data[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
-      Tensor<xpu, 3, DType> grad =
-          in_grad[softmaxout_enum::kData].get_with_shape<xpu, 3, DType>(s3, s);
-
-      index_t valid_cnt = label.shape_.Size();
-      if (param_.use_ignore) {
-          SoftmaxGrad(grad, out, label, static_cast<DType>(param_.ignore_label));
-      } else {
-          SoftmaxGrad(grad, out, label);
-      }
-      if (param_.normalization == softmaxout_enum::kBatch) {
-        valid_cnt = label.size(0);
-      } else if (param_.normalization == softmaxout_enum::kValid) {
-        int i_label = static_cast<int>(param_.ignore_label);
-        Tensor<cpu, 2, DType> workspace =
-          ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<2, DType>(
-          label.shape_);
-        Copy(workspace, label, label.stream_);
-        for (index_t i = 0; i < workspace.size(0); ++i) {
-          for (index_t j = 0; j < workspace.size(1); ++j) {
-            if (static_cast<int>(workspace[i][j]) == i_label) {
-              valid_cnt--;
-            }
-          }
+
+template<typename xpu, typename DType>
+void SoftmaxOutputBackward(const OpContext &ctx,
+          const SoftmaxOutputParam &param,
+                    const std::vector<TBlob> &out_grad,
+                    const std::vector<TBlob> &in_data,
+                    const std::vector<TBlob> &out_data,
+                    const std::vector<OpReqType> &req,
+                    const std::vector<TBlob> &in_grad) {
+using namespace mshadow;
+using namespace mshadow::expr;
+CHECK_EQ(in_data.size(), 2U);
+CHECK_EQ(out_grad.size(), 1U);
+CHECK_GE(in_grad.size(), 1U);
+CHECK_GE(req.size(), 1U);
+Stream<xpu> *s = ctx.get_stream<xpu>();
+
+if (out_data[softmaxout_enum::kOut].shape_ ==
+    in_data[softmaxout_enum::kLabel].shape_) {
+  // use probability as label
+  Tensor<xpu, 2, DType> label = in_data[softmaxout_enum::kLabel].FlatTo2D<xpu, DType>(s);
+  Tensor<xpu, 2, DType> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
+  Tensor<xpu, 2, DType> grad = in_grad[softmaxout_enum::kData].FlatTo2D<xpu, DType>(s);
+  if (param.out_grad) {
+    Tensor<xpu, 2, DType> ograd = out_grad[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
+    grad = scalar<DType>(param.grad_scale) * (out - label) * ograd;
+  } else {
+    grad = (out - label) * scalar<DType>(param.grad_scale);
+  }
+} else if (param.multi_output) {
+  int n = out_data[softmaxout_enum::kOut].size(0);
+  int k = out_data[softmaxout_enum::kOut].size(1);
+  Shape<3> s3 = Shape3(n, k, static_cast<int>(out_data[softmaxout_enum::kOut].Size()/n/k));
+  Shape<2> s2 = Shape2(s3[0], s3[2]);
+  Tensor<xpu, 2, DType> label =
+      in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 2, DType>(s2, s);
+  Tensor<xpu, 3, DType> out =
+      out_data[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
+  Tensor<xpu, 3, DType> grad =
+      in_grad[softmaxout_enum::kData].get_with_shape<xpu, 3, DType>(s3, s);
+
+  index_t valid_cnt = label.shape_.Size();
+  if (param.use_ignore) {
+      SoftmaxGrad(grad, out, label, static_cast<DType>(param.ignore_label));
+  } else {
+      SoftmaxGrad(grad, out, label);
+  }
+  if (param.normalization == softmaxout_enum::kBatch) {
+    valid_cnt = label.size(0);
+  } else if (param.normalization == softmaxout_enum::kValid) {
+    int i_label = static_cast<int>(param.ignore_label);
+    Tensor<cpu, 2, DType> workspace =
+      ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<2, DType>(
+      label.shape_);
+    Copy(workspace, label, label.stream_);
+    for (index_t i = 0; i < workspace.size(0); ++i) {
+      for (index_t j = 0; j < workspace.size(1); ++j) {
+        if (static_cast<int>(workspace[i][j]) == i_label) {
+          valid_cnt--;
         }
-        valid_cnt = valid_cnt == 0 ? 1 : valid_cnt;
-      } else {
-        valid_cnt = 1;
       }
-      grad *= DType(param_.grad_scale /
-                    (param_.normalization == softmaxout_enum::kValid ? 1 : s3[2]) /
-                    valid_cnt);
-      if (param_.out_grad) {
-        Tensor<xpu, 3, DType> ograd =
-          out_grad[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
-        grad *= ograd;
-      }
-    } else {
-      Shape<1> label_shape = Shape1(in_data[softmaxout_enum::kLabel].Size());
-      Shape<2> data_shape;
-      if (param_.preserve_shape) {
-        data_shape = out_data[softmaxout_enum::kOut].shape_.FlatTo2D();
+    }
+    valid_cnt = valid_cnt == 0 ? 1 : valid_cnt;
+  } else {
+    valid_cnt = 1;
+  }
+  grad *= DType(param.grad_scale /
+                (param.normalization == softmaxout_enum::kValid ? 1 : s3[2]) /
+                valid_cnt);
+  if (param.out_grad) {
+    Tensor<xpu, 3, DType> ograd =
+      out_grad[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
+    grad *= ograd;
+  }
+} else {
+  Shape<1> label_shape = Shape1(in_data[softmaxout_enum::kLabel].Size());
+  Shape<2> data_shape;
+  if (param.preserve_shape) {
+    data_shape = out_data[softmaxout_enum::kOut].shape_.FlatTo2D();
 //        Tensor<xpu, 1, DType> label = in_data[softmaxout_enum::kLabel].FlatTo1D<xpu, DType>(s);
 //        Tensor<xpu, 2, DType> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
 //        Tensor<xpu, 2, DType> grad = in_grad[softmaxout_enum::kData].FlatTo2D<xpu, DType>(s);
-      } else {
-        int n = out_data[softmaxout_enum::kOut].size(0);
-        data_shape = Shape2(n, out_data[softmaxout_enum::kOut].Size()/n);
-      }
-      Tensor<xpu, 1, DType> label = in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 1, DType>(
-          label_shape, s);
-      Tensor<xpu, 2, DType> out =
-          out_data[softmaxout_enum::kOut].get_with_shape<xpu, 2, DType>(data_shape, s);
-      Tensor<xpu, 2, DType> grad =
-          in_grad[softmaxout_enum::kData].get_with_shape<xpu, 2, DType>(data_shape, s);
-      index_t valid_cnt = label.shape_.Size();
-      if (param_.use_ignore) {
-        if (param_.smooth_alpha == 0.0f) {
-          SoftmaxGrad(grad, out, label, static_cast<DType>(param_.ignore_label));
-        } else {
-          SmoothSoftmaxGrad(grad, out, label, static_cast<DType>(param_.ignore_label),
-                            param_.smooth_alpha);
-        }
-      } else {
-        if (param_.smooth_alpha == 0.0f) {
-          SoftmaxGrad(grad, out, label);
-        } else {
-          SmoothSoftmaxGrad(grad, out, label, param_.smooth_alpha);
-        }
-      }
-      if (param_.normalization == softmaxout_enum::kBatch) {
-        valid_cnt = label.size(0);
-      } else if (param_.normalization == softmaxout_enum::kValid) {
-        int i_label = static_cast<int>(param_.ignore_label);
-        Tensor<cpu, 1, DType> workspace =
-          ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<1, DType>(
-          label.shape_);
-        Copy(workspace, label, label.stream_);
-        for (index_t i = 0; i < label.size(0); ++i) {
-          if (static_cast<int>(workspace[i]) == i_label) {
-            valid_cnt--;
-          }
-        }
-        valid_cnt = valid_cnt == 0 ? 1 : valid_cnt;
-      } else {
-        valid_cnt = 1;
-      }
-      grad *= DType(param_.grad_scale / valid_cnt);
-      if (param_.out_grad) {
-        Tensor<xpu, 2, DType> ograd =
-          out_grad[softmaxout_enum::kOut].get_with_shape<xpu, 2, DType>(data_shape, s);
-        grad *= ograd;
+  } else {
+    int n = out_data[softmaxout_enum::kOut].size(0);
+    data_shape = Shape2(n, out_data[softmaxout_enum::kOut].Size()/n);
+  }
+  Tensor<xpu, 1, DType> label = in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 1, DType>(
+      label_shape, s);
+  Tensor<xpu, 2, DType> out =
+      out_data[softmaxout_enum::kOut].get_with_shape<xpu, 2, DType>(data_shape, s);
+  Tensor<xpu, 2, DType> grad =
+      in_grad[softmaxout_enum::kData].get_with_shape<xpu, 2, DType>(data_shape, s);
+  index_t valid_cnt = label.shape_.Size();
+  if (param.use_ignore) {
+    if (param.smooth_alpha == 0.0f) {
+      SoftmaxGrad(grad, out, label, static_cast<DType>(param.ignore_label));
+    } else {
+      SmoothSoftmaxGrad(grad, out, label, static_cast<DType>(param.ignore_label),
+                        param.smooth_alpha);
+    }
+  } else {
+    if (param.smooth_alpha == 0.0f) {
+      SoftmaxGrad(grad, out, label);
+    } else {
+      SmoothSoftmaxGrad(grad, out, label, param.smooth_alpha);
+    }
+  }
+  if (param.normalization == softmaxout_enum::kBatch) {
+    valid_cnt = label.size(0);
+  } else if (param.normalization == softmaxout_enum::kValid) {
+    int i_label = static_cast<int>(param.ignore_label);
+    Tensor<cpu, 1, DType> workspace =
+      ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<1, DType>(
+      label.shape_);
+    Copy(workspace, label, label.stream_);
+    for (index_t i = 0; i < label.size(0); ++i) {
+      if (static_cast<int>(workspace[i]) == i_label) {
+        valid_cnt--;
       }
     }
+    valid_cnt = valid_cnt == 0 ? 1 : valid_cnt;
+  } else {
+    valid_cnt = 1;
+  }
+  grad *= DType(param.grad_scale / valid_cnt);
+  if (param.out_grad) {
+    Tensor<xpu, 2, DType> ograd =
+      out_grad[softmaxout_enum::kOut].get_with_shape<xpu, 2, DType>(data_shape, s);
+    grad *= ograd;
   }
+}
+}
 
- private:
-  SoftmaxOutputParam param_;
-};  // class SoftmaxOutputOp
 
-// Decalre Factory function, used for dispatch specialization
 template<typename xpu>
-Operator* CreateOp(SoftmaxOutputParam param, int dtype);
+void SoftmaxOutputGradCompute(const nnvm::NodeAttrs& attrs,
+                               const OpContext& ctx,
+                               const std::vector<TBlob>& inputs,
+                               const std::vector<OpReqType>& req,
+                               const std::vector<TBlob>& outputs) {
+  const SoftmaxOutputParam& param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), 2U);
+  // inputs[0] serves as both out_data and out_grad; the label is read from inputs.
+  // NOTE(review): no head gradient is passed in - confirm for out_grad=true.
+  std::vector<TBlob> out_grad{inputs[0]};
+  std::vector<TBlob> out_data{inputs[0]};
+  std::vector<TBlob> in_data(inputs.begin(), inputs.end());
+  int dtype = inputs[0].type_flag_;
 
+  switch (dtype) {
+  case mshadow::kFloat32:
+    SoftmaxOutputBackward<xpu, float>(ctx, param, out_grad, in_data, out_data, req, outputs);
+    break;
+  case mshadow::kFloat64:
+    SoftmaxOutputBackward<xpu, double>(ctx, param, out_grad, in_data, out_data, req, outputs);
+    break;
+  case mshadow::kFloat16:
+    LOG(FATAL) << "float16 is currently not supported by "
+                  "SoftmaxOutputGradCompute.";
+    break;
+  default:
+    LOG(FATAL) << "Unsupported type " << dtype;
+  }
+}
+
+
+#if 0
 #if DMLC_USE_CXX11
 class SoftmaxOutputProp : public OperatorProperty {
  public:
@@ -411,7 +474,27 @@ class DeprecatedSoftmaxProp : public SoftmaxOutputProp {
   }
 };
 #endif  // DMLC_USE_CXX11
+#endif // #if 0
+
 
 }  // namespace op
 }  // namespace mxnet
+namespace std {
+template<>
+struct hash<mxnet::op::SoftmaxOutputParam> {
+  // const-qualified so the hasher can also be invoked on a const instance
+  size_t operator()(const mxnet::op::SoftmaxOutputParam& val) const {
+    size_t ret = dmlc::HashCombine(0, val.grad_scale);
+    ret = dmlc::HashCombine(ret, val.ignore_label);
+    ret = dmlc::HashCombine(ret, val.multi_output);
+    ret = dmlc::HashCombine(ret, val.use_ignore);
+    ret = dmlc::HashCombine(ret, val.preserve_shape);
+    ret = dmlc::HashCombine(ret, val.normalization);
+    ret = dmlc::HashCombine(ret, val.out_grad);
+    ret = dmlc::HashCombine(ret, val.smooth_alpha);
+    return ret;
+  }
+};
+}  // namespace std
 #endif  // MXNET_OPERATOR_SOFTMAX_OUTPUT_INL_H_
+
diff --git a/src/operator/softmax_output.cc b/src/operator/softmax_output.cc
index 5ba421f..34dd512 100644
--- a/src/operator/softmax_output.cc
+++ b/src/operator/softmax_output.cc
@@ -21,30 +21,172 @@
  * Copyright (c) 2015 by Contributors
  * \file softmax_output.cc
  * \brief
- * \author Bing Xu
+ * \author Bing Xu, Zhang Rong A
 */
 #include "./softmax_output-inl.h"
+#include "./nn/mkldnn/mkldnn_ops-inl.h"
 
 namespace mxnet {
 namespace op {
-template<>
-Operator *CreateOp<cpu>(SoftmaxOutputParam param, int dtype) {
-  Operator *op = nullptr;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new SoftmaxOutputOp<cpu, DType>(param);
-  })
-  return op;
+  
+DMLC_REGISTER_PARAMETER(SoftmaxOutputParam);
+/*!
+ * \brief FGradient functor for SoftmaxOutput.
+ *
+ * Creates a single backward node fed with [forward output, label] and
+ * returns one gradient entry per forward input (data, label).
+ *
+ * NOTE(review): the head gradients (ograds) are not wired into the backward
+ * node, so the backward kernel never receives them; confirm this is intended
+ * when the `out_grad` parameter is enabled.
+ */
+struct SoftmaxOutputGrad {
+  /*! \brief name of the backward operator to instantiate */
+  const char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> out_data(n->num_outputs());
+    for (uint32_t i = 0; i < out_data.size(); ++i) {
+      out_data[i] = nnvm::NodeEntry{n, i, 0};
+    }
+
+    // Backward inputs: the softmax output followed by the label.
+    std::vector<nnvm::NodeEntry> heads;
+    heads.push_back(out_data[softmaxout_enum::kOut]);
+    heads.push_back(n->inputs[softmaxout_enum::kLabel]);
+
+    // The backward node copies the forward attributes and registers the
+    // forward node as a control dependency.
+    nnvm::NodePtr gnode = nnvm::Node::Create();
+    gnode->inputs = std::move(heads);
+    gnode->control_deps.emplace_back(n);
+    gnode->attrs = n->attrs;
+    gnode->attrs.op = nnvm::Op::Get(op_name);
+    gnode->attrs.name = n->attrs.name + "_backward";
+
+    // Gradient entries for data (0) and label (1).
+    std::vector<nnvm::NodeEntry> in_grad(2);
+    for (uint32_t i = 0; i < 2; ++i) {
+      in_grad[i] = nnvm::NodeEntry{gnode, i, 0};
+    }
+    return in_grad;
+  }
+};
+#if 0
+struct SoftmaxOutputGrad {
+  onst char *op_name;
+  std::vector<nnvm::NodeEntry> operator()(const nnvm::NodePtr& n,
+                                          const std::vector<nnvm::NodeEntry>& ograds) const {
+    std::vector<nnvm::NodeEntry> heads;
+    index_t n_out = n->num_outputs();
+    for (index_t i = 0; i < n_out; ++i) {
+    heads.emplace_back(nnvm::NodeEntry{n, i, 0});
+    }
+    return MakeNonlossGradNode(op_name, n, ograds, heads, n->attrs.dict);
+  }
+};
+
+#endif
+static inline std::vector<std::string> ListArguments() {
+    return {"data", "label"};
+}
+
+
+static bool SoftmaxOutputType(const nnvm::NodeAttrs& attrs,
+                               std::vector<int> *in_type, std::vector<int> *out_type) {
+  // Every input and the single output take the dtype of the first input.
+  CHECK_EQ(in_type->size(), 2U) << "Input:[data, label]";
+  int dtype = (*in_type)[0];
+  CHECK_NE(dtype, -1) << "First input must have specified type";
+  for (size_t i = 0; i < in_type->size(); ++i) {
+    if ((*in_type)[i] == -1) {
+      (*in_type)[i] = dtype;
+    } else {
+      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
+    }
+  }
+  out_type->assign(1, dtype);
+  return true;
 }
 
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *SoftmaxOutputProp::CreateOperatorEx(Context ctx, std::vector<TShape> *in_shape,
-                                     std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
+static bool SoftmaxOutputShape(const nnvm::NodeAttrs& attrs,
+                                std::vector<TShape> *in_shape,
+                                std::vector<TShape> *out_shape) {
+  using namespace mshadow;
+  const SoftmaxOutputParam& param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
+  CHECK_EQ(in_shape->size(), 2U) << "Input:[data, label]";
+  const TShape &dshape = in_shape->at(0);
+  if (dshape.ndim() == 0) return false;
+
+  // label.shape == data.shape: use probability as label
+  if (dshape != (*in_shape)[softmaxout_enum::kLabel]) {
+    if (param.multi_output) {
+      // Accepted labels: (d0, Size()/d0/d1), (d0, d2, ...) or data with axis 1 set to 1.
+      TShape lshape1 = Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1]);
+      TShape lshape2(dshape.ndim() - 1);
+      lshape2[0] = dshape[0];
+      for (index_t i = 2; i < dshape.ndim(); ++i)
+        lshape2[i-1] = dshape[i];
+      TShape lshape3 = dshape;
+      lshape3[1] = 1;
+      TShape &lshape = in_shape->at(softmaxout_enum::kLabel);
+      if (lshape.ndim() == 0) {
+        lshape = lshape1;
+      } else if (lshape != lshape1 && lshape != lshape2 && lshape != lshape3) {
+        std::ostringstream os;
+        os << "Expecting " << lshape1 << " or " << lshape2 << " or " << lshape3
+           << ". But got " << lshape;
+        throw InferShapeError(os.str(), softmaxout_enum::kLabel);
+      }
+    } else {
+      // Otherwise the label is the data shape without its last axis.
+      TShape label_shape(dshape.ndim() - 1);
+      for (index_t i = 0; i + 1 < dshape.ndim(); ++i)
+        label_shape[i] = dshape[i];
+      SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, label_shape);
+    }
+  }
+
+  out_shape->clear();
+  out_shape->push_back(dshape);
+  return true;
 }
 
-DMLC_REGISTER_PARAMETER(SoftmaxOutputParam);
+#if MXNET_USE_MKLDNN == 1
+inline static bool SoftmaxOutputStorageType(const nnvm::NodeAttrs& attrs,
+                                   const int dev_mask,
+                                   DispatchMode* dispatch_mode,
+                                   std::vector<int>* in_attrs,
+                                   std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 2U);
+  CHECK_EQ(out_attrs->size(), 1U);
+  // Storage inference itself is delegated to MKLDNNStorageType.
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
+                           out_attrs);
+}
 
-MXNET_REGISTER_OP_PROPERTY(SoftmaxOutput, SoftmaxOutputProp)
+void SoftmaxOutputComputeExCPU(const nnvm::NodeAttrs &attrs,
+                           const OpContext &ctx,
+                           const std::vector<NDArray> &inputs,
+                           const std::vector<OpReqType> &req,
+                           const std::vector<NDArray> &outputs) {
+  CHECK_EQ(inputs.size(), 2U);
+  const SoftmaxOutputParam &param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
+  // MKLDNN softmaxOutput only works well on the special MKLDNN layout.
+  if (SupportMKLDNN(inputs[0]) && !ctx.is_train && SupportMKLDNNSoftmaxOutput(param)) {
+    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    MKLDNNSoftmaxOutputForward(attrs, ctx, inputs, req, outputs);
+    auto fn = SoftmaxOutputCompute<cpu>;
+    MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs);
+    return;
+  }
+  FallBackCompute(SoftmaxOutputCompute<cpu>, attrs, ctx, inputs, req, outputs);
+}
+
+#endif
+
+NNVM_REGISTER_OP(SoftmaxOutput)
+MXNET_ADD_SPARSE_OP_ALIAS(SoftmaxOutput)  // to be check
 .describe(R"code(Computes the gradient of cross entropy loss with respect to softmax output.
 
 - This operator computes the gradient in two steps.
@@ -57,15 +199,15 @@ MXNET_REGISTER_OP_PROPERTY(SoftmaxOutput, SoftmaxOutputProp)
 
   - Softmax Function:
 
-    .. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}
+  .. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}
 
   - Cross Entropy Function:
 
-    .. math:: \text{CE(label, output)} = - \sum_i \text{label}_i \log(\text{output}_i)
+  .. math:: \text{CE(label, output)} = - \sum_i \text{label}_i \log(\text{output}_i)
 
   - The gradient of cross entropy loss w.r.t softmax output:
 
-    .. math:: \text{gradient} = \text{output} - \text{label}
+  .. math:: \text{gradient} = \text{output} - \text{label}
 
 - During forward propagation, the softmax function is computed for each instance in the input array.
 
@@ -74,70 +216,102 @@ MXNET_REGISTER_OP_PROPERTY(SoftmaxOutput, SoftmaxOutputProp)
   and `multi_output` to specify the way to compute softmax:
 
   - By default, `preserve_shape` is ``false``. This operator will reshape the input array
-    into a 2-D array with shape :math:`(d_1, \frac{s}{d_1})` and then compute the softmax function for
-    each row in the reshaped array, and afterwards reshape it back to the original shape
-    :math:`(d_1, d_2, ..., d_n)`.
+  into a 2-D array with shape :math:`(d_1, \frac{s}{d_1})` and then compute the softmax function for
+  each row in the reshaped array, and afterwards reshape it back to the original shape
+  :math:`(d_1, d_2, ..., d_n)`.
   - If `preserve_shape` is ``true``, the softmax function will be computed along
-    the last axis (`axis` = ``-1``).
+  the last axis (`axis` = ``-1``).
   - If `multi_output` is ``true``, the softmax function will be computed along
-    the second axis (`axis` = ``1``).
+  the second axis (`axis` = ``1``).
 
 - During backward propagation, the gradient of cross-entropy loss w.r.t softmax output array is computed.
   The provided label can be a one-hot label array or a probability label array.
 
   - If the parameter `use_ignore` is ``true``, `ignore_label` can specify input instances
-    with a particular label to be ignored during backward propagation. **This has no effect when
-    softmax `output` has same shape as `label`**.
-
-    Example::
-
-      data = [[1,2,3,4],[2,2,2,2],[3,3,3,3],[4,4,4,4]]
-      label = [1,0,2,3]
-      ignore_label = 1
-      SoftmaxOutput(data=data, label = label,\
-                    multi_output=true, use_ignore=true,\
-                    ignore_label=ignore_label)
-      ## forward softmax output
-      [[ 0.0320586   0.08714432  0.23688284  0.64391428]
-       [ 0.25        0.25        0.25        0.25      ]
-       [ 0.25        0.25        0.25        0.25      ]
-       [ 0.25        0.25        0.25        0.25      ]]
-      ## backward gradient output
-      [[ 0.    0.    0.    0.  ]
-       [-0.75  0.25  0.25  0.25]
-       [ 0.25  0.25 -0.75  0.25]
-       [ 0.25  0.25  0.25 -0.75]]
-      ## notice that the first row is all 0 because label[0] is 1, which is equal to ignore_label.
+  with a particular label to be ignored during backward propagation. **This has no effect when
+  softmax `output` has same shape as `label`**.
+
+  Example::
+
+    data = [[1,2,3,4],[2,2,2,2],[3,3,3,3],[4,4,4,4]]
+    label = [1,0,2,3]
+    ignore_label = 1
+    SoftmaxOutput(data=data, label = label,\
+          multi_output=true, use_ignore=true,\
+          ignore_label=ignore_label)
+  ## forward softmax output
+    [[ 0.0320586   0.08714432  0.23688284  0.64391428]
+     [ 0.25    0.25    0.25    0.25    ]
+     [ 0.25    0.25    0.25    0.25    ]
+     [ 0.25    0.25    0.25    0.25    ]]
+  ## backward gradient output
+    [[ 0.    0.  0.    0.  ]
+     [-0.75  0.25  0.25  0.25]
+     [ 0.25  0.25 -0.75  0.25]
+     [ 0.25  0.25  0.25 -0.75]]
+  ## notice that the first row is all 0 because label[0] is 1, which is equal to ignore_label.
 
   - The parameter `grad_scale` can be used to rescale the gradient, which is often used to
-    give each loss function different weights.
+  give each loss function different weights.
 
   - This operator also supports various ways to normalize the gradient by `normalization`,
-    The `normalization` is applied if softmax output has different shape than the labels.
-    The `normalization` mode can be set to the followings:
+  The `normalization` is applied if softmax output has different shape than the labels.
+  The `normalization` mode can be set to the followings:
 
-    - ``'null'``: do nothing.
-    - ``'batch'``: divide the gradient by the batch size.
-    - ``'valid'``: divide the gradient by the number of instances which are not ignored.
+  - ``'null'``: do nothing.
+  - ``'batch'``: divide the gradient by the batch size.
+  - ``'valid'``: divide the gradient by the number of instances which are not ignored.
 
 )code" ADD_FILELINE)
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<SoftmaxOutputParam>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FInferStorageType>("FInferStorageType", SoftmaxOutputStorageType)
+#endif
+.set_attr<nnvm::FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"data", "label"};
+})
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+    [](const NodeAttrs& attrs) {
+    return std::vector<std::string>{"output"};
+})
+#if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#endif
+.set_attr<nnvm::FInferShape>("FInferShape", SoftmaxOutputShape)
+.set_attr<nnvm::FInferType>("FInferType", SoftmaxOutputType)
+.set_attr<FCompute>("FCompute<cpu>", SoftmaxOutputCompute<cpu>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FComputeEx>("FComputeEx<cpu>", SoftmaxOutputComputeExCPU)
+#endif
+.set_attr<nnvm::FGradient>("FGradient", SoftmaxOutputGrad{"_backward_SoftmaxOutput"})
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
+  return std::vector<std::pair<int, int> >{{0, 0}};
+})
 .add_argument("data", "NDArray-or-Symbol", "Input array.")
 .add_argument("label", "NDArray-or-Symbol", "Ground truth label.")
 .add_arguments(SoftmaxOutputParam::__FIELDS__());
 
 
-MXNET_REGISTER_OP_PROPERTY(Softmax, DeprecatedSoftmaxProp)
-.describe(R"code(Please use `SoftmaxOutput`.
 
-.. note::
+NNVM_REGISTER_OP(_backward_SoftmaxOutput)
+.set_num_outputs(2)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
+  return std::vector<std::pair<int, int> >{{0, 0}};
+})
+.set_attr_parser(ParamParser<SoftmaxOutputParam>)
+.set_attr<FCompute>("FCompute<cpu>", SoftmaxOutputGradCompute<cpu>);
 
-  This operator has been renamed to `SoftmaxOutput`, which
-  computes the gradient of cross-entropy loss w.r.t softmax output.
-  To just compute softmax output, use the `softmax` operator.
-
-)code" ADD_FILELINE)
-.add_argument("data", "NDArray-or-Symbol", "Input array.")
-.add_arguments(SoftmaxOutputParam::__FIELDS__());
 
 }  // namespace op
 }  // namespace mxnet
+
+
diff --git a/src/operator/subgraph/mkldnn/mkldnn_fc_post_quantize_property.cc b/src/operator/subgraph/mkldnn/mkldnn_fc_post_quantize_property.cc
new file mode 100644
index 0000000..d7554e7
--- /dev/null
+++ b/src/operator/subgraph/mkldnn/mkldnn_fc_post_quantize_property.cc
@@ -0,0 +1,162 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#if MXNET_USE_MKLDNN == 1
+
+#include "../common.h"
+#include "../subgraph_property.h"
+#include "../../nn/fully_connected-inl.h"
+#include "../../quantization/requantize-inl.h"
+
+namespace mxnet {
+namespace op {
+#define QFC_OP_NODE_NAME "_contrib_quantized_fully_connected"
+// Matches a quantized FC node directly followed by a calibrated _contrib_requantize node.
+class SgMKLDNNFCPostQuantizeSelector : public SubgraphSelector {
+ public:
+  /*! \brief pattern match status */
+  enum SelectStatus {
+    kFail = 0,
+    kStart,
+    kSuccess,
+  };
+
+ private:
+  bool disable_all;
+  SelectStatus status;
+  std::vector<const nnvm::Node *> matched_list;
+
+ public:
+  explicit SgMKLDNNFCPostQuantizeSelector(int dis_all)
+      : disable_all(dis_all), status(kFail) {}  // kFail until Select() accepts a node
+
+  // Seeds a match at a quantized FC node; always rejects when the pass is disabled.
+  bool Select(const nnvm::Node &n) override {
+    if ((!disable_all) && n.op() && n.op()->name == QFC_OP_NODE_NAME) {
+      auto const &param = nnvm::get<FullyConnectedParam>(n.attrs.parsed);
+      status = kStart;
+      matched_list.clear();
+      matched_list.push_back(&n);
+      return true;
+    }
+    return false;
+  }
+
+  // Producers of the FC node are never pulled into the subgraph.
+  bool SelectInput(const nnvm::Node &n, const nnvm::Node &new_node) override {
+    return false;
+  }
+
+  // Accepts a requantize consumer of the last matched node if it has both calib ranges.
+  bool SelectOutput(const nnvm::Node &n, const nnvm::Node &new_node) override {
+    if (status == kFail || status == kSuccess || new_node.is_variable()) return false;
+    // If n isn't the last matched node, then we encountered an internal
+    // branch, so we stop the fusion here.
+    if (matched_list.back() != &n) {
+      status = kFail;
+      return false;
+    }
+    if (new_node.op()->name == "_contrib_requantize") {
+      auto const &param = nnvm::get<RequantizeParam>(new_node.attrs.parsed);
+      if (param.min_calib_range.has_value() && param.max_calib_range.has_value()) {
+        matched_list.push_back(&new_node);
+        status = kSuccess;
+        return true;
+      } else {
+        status = kFail;
+      }
+    }
+    return false;
+  }
+
+  // Keeps the candidates only when the full FC + requantize pattern was matched.
+  std::vector<nnvm::Node *> Filter(
+      const std::vector<nnvm::Node *> &candidates) override {
+    if (status != kSuccess) return std::vector<nnvm::Node *>(0);
+    return candidates;
+  }
+};
+
+class SgMKLDNNFCPostQuantizeProperty : public SubgraphProperty {  // folds requantize into QFC
+ public:
+  SgMKLDNNFCPostQuantizeProperty() {
+    disable_all = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_OPT", 0);  // env switch, defaults to 0
+    if (disable_all) {
+      LOG(INFO) << "MKLDNN Fully Connection post-quantization optimization pass is disabled.";
+    } else {
+      LOG(INFO) << "Start to execute MKLDNN Fully Connection post-quantization optimization pass.";
+    }       
+  }
+  static SubgraphPropertyPtr Create() {
+    return std::make_shared<SgMKLDNNFCPostQuantizeProperty>();
+  }
+  nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol &sym,
+                                   const int subgraph_id = 0) const override {
+    nnvm::NodePtr fc_node = nullptr;
+    nnvm::NodePtr requantize_node = nullptr;
+    DFSVisit(sym.outputs, [&](const nnvm::NodePtr &node) {  // locate the two matched nodes
+      if (node->is_variable()) return;
+      auto &op_name = node->op()->name;
+      if (op_name == QFC_OP_NODE_NAME) {
+        fc_node = node;
+      } else if (op_name == "_contrib_requantize") {
+        requantize_node = node;
+      }
+    });
+    CHECK_NOTNULL(fc_node);
+    CHECK_NOTNULL(requantize_node);
+    auto const &requantize_param =
+        nnvm::get<RequantizeParam>(requantize_node->attrs.parsed);
+    CHECK(requantize_param.min_calib_range.has_value());
+    CHECK(requantize_param.max_calib_range.has_value());
+    fc_node->attrs.dict["out_min_calib_range"] =  // copy the calibrated range onto the FC node
+        std::to_string(requantize_param.min_calib_range.value());
+    fc_node->attrs.dict["out_max_calib_range"] =
+        std::to_string(requantize_param.max_calib_range.value());
+    fc_node->attrs.dict["out_enable_calib_range"] = std::string("True");
+    fc_node->attrs.dict["output_type"] = std::string("fp32"); // only FP32 supports 1 output
+    fc_node->op()->attr_parser(&(fc_node->attrs));  // re-parse so the new dict entries apply
+    return fc_node;  // the FC node alone stands in for the FC + requantize pair
+  }
+
+  SubgraphSelectorPtr CreateSubgraphSelector() const override {
+    auto selector =
+        std::make_shared<SgMKLDNNFCPostQuantizeSelector>(disable_all);
+    return selector;
+  }
+
+  void ConnectSubgraphOutputs(
+      const nnvm::NodePtr n,
+      std::vector<nnvm::NodeEntry *> *output_entries) const override {
+    for (size_t i = 0; i < output_entries->size(); ++i) {
+      auto entry_ptr = output_entries->at(i);
+      *entry_ptr = nnvm::NodeEntry{n, entry_ptr->index, 0};  // keep index, point at node n
+    }
+  }
+
+ private:
+  int disable_all;  // non-zero when MXNET_DISABLE_MKLDNN_OPT is set
+};
+
+MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_POST_FC_QUANTIZE, SgMKLDNNFCPostQuantizeProperty);
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // if MXNET_USE_MKLDNN == 1
diff --git a/src/operator/subgraph/mkldnn/mkldnn_parallel_embedding_property.cc b/src/operator/subgraph/mkldnn/mkldnn_parallel_embedding_property.cc
new file mode 100644
index 0000000..9f55112
--- /dev/null
+++ b/src/operator/subgraph/mkldnn/mkldnn_parallel_embedding_property.cc
@@ -0,0 +1,262 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#if MXNET_USE_MKLDNN == 1
+#include <sstream>
+#include "../common.h"
+#include "../subgraph_property.h"
+#include "../../nn/fully_connected-inl.h"
+#include "../../nn/activation-inl.h"
+#include "../../tensor/mkldnn/mkldnn_parallel_embedding.h"
+
+namespace mxnet {
+namespace op {
+
+#define EMBEDDING_NODE_NAME "Embedding"
+
+// Matches a Concat node together with the Embedding nodes feeding it (Wide & Deep layout).
+class SgMKLDNNParallelEmbeddingSelector : public SubgraphSelector {
+ public:
+  /*! \brief pattern match status */
+  enum SelectStatus {
+    kFail = 0,
+    kStart,
+    kEmbedding,
+    kSuccess,
+  };
+
+ private:
+  bool disable_all;
+  SelectStatus status;
+  std::vector<const nnvm::Node *> matched_list;
+
+ public:
+  explicit SgMKLDNNParallelEmbeddingSelector(int dis_all)
+      : disable_all(dis_all), status(kFail) {}  // kFail until Select() accepts a node
+
+  // Seeds a match at a Concat node; always rejects when the pass is disabled.
+  bool Select(const nnvm::Node &n) override {
+    if ((!disable_all) && n.op() && n.op()->name == "Concat") {
+      status = kStart;
+      matched_list.clear();
+      return true;
+    }
+    return false;
+  }
+
+  // Accepts Embedding producers. The first non-Embedding operator offered after at least
+  // one Embedding was accepted moves the match to kSuccess, which ends input selection.
+  bool SelectInput(const nnvm::Node &n, const nnvm::Node &new_node) override {
+      if (disable_all) return false;
+      if (status == kFail || status == kSuccess || new_node.is_variable())
+          return false;
+      bool ret = false;
+      switch (status) {
+      case kStart:
+          // Assumption (W&D only): all embeddings come first and feed one Concat node.
+          if (new_node.op()->name == EMBEDDING_NODE_NAME) {
+              matched_list.push_back(&new_node);
+              status = kEmbedding;  // first of possibly several embeddings
+              ret = true;
+          } else {
+              return false;
+          }
+          break;
+      case kEmbedding:
+          if (new_node.op()->name == EMBEDDING_NODE_NAME) {
+              matched_list.push_back(&new_node);
+              ret = true;
+          } else {
+              status = kSuccess;
+              return false;
+          }
+          break;
+      default:
+          status = kSuccess;
+          break;
+      }
+      if (!ret) {
+          // Select() never stores the Concat node, so stop unwinding on an empty list.
+          while (!matched_list.empty() && matched_list.back() != &n) {
+              matched_list.pop_back();
+          }
+          status = kSuccess;
+      }
+      return ret;
+  }
+
+  // Consumers of the Concat node are never pulled into the subgraph.
+  bool SelectOutput(const nnvm::Node &n, const nnvm::Node &new_node) override {
+      return false;
+  }
+
+  // Keeps the candidates only when the match reached kSuccess.
+  std::vector<nnvm::Node *> Filter(
+      const std::vector<nnvm::Node *> &candidates) override {
+    if (status != kSuccess) {
+      return std::vector<nnvm::Node *>(0);
+    } else {
+      return candidates;
+    }
+  }
+};
+template <typename T>
+static std::string int_vector_to_attr(T v) {
+    // Formats v as "[a,b,c]"; an empty container yields "[]" instead of reading v[0].
+    std::stringstream ss;
+    ss << "[";
+    for (size_t i = 0; i < v.size(); i++) {
+        if (i != 0) ss << ",";
+        ss << v[i];
+    }
+    ss << "]";
+    return ss.str();
+}
+// Replaces the Embedding nodes feeding a Concat with one multi-output ParallelEmbedding node.
+class SgMKLDNNParallelEmbeddingProperty : public SubgraphProperty {
+ public:
+  SgMKLDNNParallelEmbeddingProperty() {
+    disable_all = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_OPT", 0);
+    if (disable_all) {
+      LOG(INFO) << "MKLDNN Parallel Embedding is disabled.";
+    } else {
+      LOG(INFO) << "Start to execute MKLDNN Parallel Embedding optimization pass.";
+    }
+  }
+  static SubgraphPropertyPtr Create() {
+    return std::make_shared<SgMKLDNNParallelEmbeddingProperty>();
+  }
+
+  // Builds the ParallelEmbedding node, rewires the Concat to it and returns the Concat node.
+  nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol &sym,
+                                   const int subgraph_id = 0) const override {
+    nnvm::NodePtr pe = nnvm::Node::Create();
+    std::vector<nnvm::NodePtr> emb_nodes;
+    nnvm::NodePtr concat_node = nullptr;
+
+    DFSVisit(sym.outputs, [&](const nnvm::NodePtr &node) {
+      if (node->is_variable()) return;
+      auto &op_name = node->op()->name;
+      // Assumption (W&D only): all embeddings come first and feed one Concat node.
+      if (op_name == EMBEDDING_NODE_NAME) {
+          emb_nodes.push_back(node);
+      } else if (emb_nodes.size() != 0 && concat_node == nullptr) {
+          if (op_name != "Concat") {
+              std::cout << "!!Parallel Embedding Node following: " << op_name << std::endl;
+          }
+          concat_node = node;
+      }
+    });
+    CHECK(!emb_nodes.empty()) << "ParallelEmbedding subgraph contains no Embedding node";
+    CHECK(concat_node != nullptr) << "no operator node found after the Embedding nodes";
+    pe->attrs.name = "ParallelEmbedding_0";
+    pe->attrs.op = Op::Get("ParallelEmbedding");
+    CHECK(pe->attrs.op);
+
+    // Assumption: DFS visiting order matches the Concat input order (e_idx relies on it).
+    std::vector<int> v_in_dims;
+    std::vector<int> v_out_dims;
+    std::vector<int> v_types;
+    std::vector<bool> v_sparse_grads;
+    for (const nnvm::NodePtr &em_node : emb_nodes) {
+        const EmbeddingParam &param = nnvm::get<EmbeddingParam>(em_node->attrs.parsed);
+        v_in_dims.push_back(param.input_dim);
+        v_out_dims.push_back(param.output_dim);
+        v_types.push_back(param.dtype);
+        v_sparse_grads.push_back(param.sparse_grad);
+    }
+
+    pe->attrs.dict["input_dims"] = int_vector_to_attr<std::vector<int> >(v_in_dims);
+    pe->attrs.dict["output_dims"] = int_vector_to_attr<std::vector<int> >(v_out_dims);
+    pe->attrs.dict["dtypes"] = int_vector_to_attr<std::vector<int> >(v_types);
+    pe->attrs.dict["num_args"] = std::to_string(emb_nodes.size());
+    pe->attrs.dict["sparse_grads"] = int_vector_to_attr<std::vector<bool> >(v_sparse_grads);
+    pe->op()->attr_parser(&(pe->attrs));
+    // Output k of the new node replaces the k-th Embedding input of the Concat node.
+    uint32_t e_idx = 0;
+    for (size_t i = 0; i < concat_node->inputs.size(); i++) {
+        nnvm::NodeEntry& entry = concat_node->inputs[i];
+        if (entry.node->op() && entry.node->op()->name == EMBEDDING_NODE_NAME) {
+            concat_node->inputs[i] = nnvm::NodeEntry{ pe, e_idx, 0};
+            ++e_idx;
+        }
+    }
+    return concat_node;
+  }
+
+  SubgraphSelectorPtr CreateSubgraphSelector() const override {
+    auto selector =
+        std::make_shared<SgMKLDNNParallelEmbeddingSelector>(disable_all);
+    return selector;
+  }
+
+  // Redirects every external consumer entry to node n, keeping its output index.
+  void ConnectSubgraphOutputs(
+      const nnvm::NodePtr n,
+      std::vector<nnvm::NodeEntry *> *output_entries) const override {
+    for (size_t i = 0; i < output_entries->size(); ++i) {
+      auto entry_ptr = output_entries->at(i);
+      *entry_ptr = nnvm::NodeEntry{n, entry_ptr->index, 0};
+    }
+  }
+
+  // Routes the original subgraph inputs: everything except "slice" outputs becomes an input
+  // of the ParallelEmbedding node; "slice" outputs fill n's non-ParallelEmbedding inputs.
+  void ConnectSubgraphInputs(
+      const nnvm::NodePtr n, std::vector<nnvm::NodeEntry *> *input_entries,
+      std::vector<nnvm::NodeEntry> *orig_input_entries) const override {
+      nnvm::NodePtr para_embdding = nullptr;
+      std::vector<size_t> concat_non_embedding_idxs;
+      for (size_t i = 0; i < n->inputs.size(); i++) {
+          const nnvm::NodePtr& n_input = n->inputs[i].node;
+          // Every ParallelEmbedding output stays wired; only other inputs are replaceable.
+          if (n_input->op() && n_input->op()->name == "ParallelEmbedding") {
+              if (!para_embdding) para_embdding = n_input;
+          } else {
+              concat_non_embedding_idxs.push_back(i);
+          }
+      }
+      CHECK_NOTNULL(para_embdding);
+      size_t non_embedding_idx = 0;
+      uint32_t slice_channel_idx = 0;
+      for (size_t i = 0; i < orig_input_entries->size(); i++) {
+          nnvm::NodeEntry &entry = (*orig_input_entries)[i];
+          std::string entry_name = "";
+          if (entry.node->op()) entry_name = entry.node->op()->name;
+          if (entry_name != "slice" && entry_name != "SliceChannel") {
+              para_embdding->inputs.push_back(nnvm::NodeEntry{ entry.node, 0, 0 });
+          } else if (entry_name == "SliceChannel") {
+              // NOTE(review): outputs are numbered by encounter order, not entry.index.
+              para_embdding->inputs.push_back(nnvm::NodeEntry{ entry.node, slice_channel_idx++, 0 });
+          } else {  // slice
+              CHECK_LT(non_embedding_idx, concat_non_embedding_idxs.size());
+              n->inputs[concat_non_embedding_idxs[non_embedding_idx++]] = nnvm::NodeEntry{ entry.node, 0, 0 };
+          }
+      }
+  }
+ private:
+  int disable_all;
+};
+
+MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_PARALLEL_EMBEDDING, SgMKLDNNParallelEmbeddingProperty);
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // if MXNET_USE_MKLDNN == 1
diff --git a/src/operator/subgraph/mkldnn/mkldnn_qfc_post_relu_fused_property.cc b/src/operator/subgraph/mkldnn/mkldnn_qfc_post_relu_fused_property.cc
new file mode 100644
index 0000000..1a1aa82
--- /dev/null
+++ b/src/operator/subgraph/mkldnn/mkldnn_qfc_post_relu_fused_property.cc
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#if MXNET_USE_MKLDNN == 1
+
+#include "../common.h"
+#include "../subgraph_property.h"
+#include "../../nn/fully_connected-inl.h"
+#include "../../nn/activation-inl.h"
+
+namespace mxnet {
+namespace op {
+
+#define QFC_OP_NODE_NAME "_contrib_quantized_fully_connected"
+// Returns true when `s` ends with `suffix` (an empty suffix always matches).
+static bool isSuffixsPresent(std::string s, std::string suffix)
+{
+    const int text_len = s.length();
+    const int tail_len = suffix.length();
+    const bool long_enough = text_len >= tail_len;
+    // Only inspect the tail once we know s is at least as long as suffix.
+    return long_enough && s.compare(text_len - tail_len, tail_len, suffix) == 0;
+}
+// Matches a quantized FC -> ReLU Activation -> _contrib_quantize chain for fusion.
+class SgMKLDNNQFCPostReluFusedSelector : public SubgraphSelector {
+ public:
+  /*! \brief pattern match status */
+  enum SelectStatus {
+    kFail = 0,
+    kStart,
+    kRelu,
+    kSuccess,
+  };
+
+ private:
+  bool disable_all;
+  SelectStatus status;
+  std::vector<const nnvm::Node *> matched_list;
+
+ public:
+  explicit SgMKLDNNQFCPostReluFusedSelector(int dis_all)
+      : disable_all(dis_all), status(kFail) {}  // kFail until Select() accepts a node
+
+  // Seeds a match at a quantized FC node; always rejects when the pass is disabled.
+  bool Select(const nnvm::Node &n) override {
+    if ((!disable_all) && n.op() && n.op()->name == QFC_OP_NODE_NAME) {
+      auto const &param = nnvm::get<FullyConnectedParam>(n.attrs.parsed);
+      status = kStart;
+      matched_list.clear();
+      matched_list.push_back(&n);
+      return true;
+    }
+    return false;
+  }
+
+  // Producers of the FC node are never pulled into the subgraph.
+  bool SelectInput(const nnvm::Node &n, const nnvm::Node &new_node) override {
+    return false;
+  }
+
+  // Walks FC -> ReLU -> quantize; any other consumer ends the match with kSuccess.
+  bool SelectOutput(const nnvm::Node &n, const nnvm::Node &new_node) override {
+      if (disable_all) return false;
+      if (status == kFail || status == kSuccess || new_node.is_variable()) return false;
+      bool ret = false;
+      switch (status) {
+      case kStart:
+          if (new_node.op()->name == "Activation") {
+              const ActivationParam &param = nnvm::get<ActivationParam>(new_node.attrs.parsed);
+              if (param.act_type == activation::kReLU) {
+                  matched_list.push_back(&new_node);
+                  status = kRelu;
+                  ret = true;
+              }
+          }
+          break;
+      case kRelu:
+          if (new_node.op()->name == "_contrib_quantize") {
+              matched_list.push_back(&new_node);
+              status = kSuccess;
+              ret = true;
+          }
+          break;
+      default:
+          status = kSuccess;
+          break;
+      }
+      if (!ret) {
+          // Unwind to n; the emptiness guard keeps back() defined if n is not in the list.
+          while (!matched_list.empty() && matched_list.back() != &n) {
+              matched_list.pop_back();
+          }
+          status = kSuccess;
+      }
+      return ret;
+  }
+
+  // Keeps the candidates only when the match ended in kSuccess.
+  std::vector<nnvm::Node *> Filter(
+      const std::vector<nnvm::Node *> &candidates) override {
+    if (status != kSuccess) return std::vector<nnvm::Node *>(0);
+    return candidates;
+  }
+};
+
+class SgMKLDNNQFCPostReluFusedProperty : public SubgraphProperty {  // rewrites QFC attrs
+ public:
+  SgMKLDNNQFCPostReluFusedProperty() {
+    disable_all = dmlc::GetEnv("MXNET_DISABLE_MKLDNN_OPT", 0);  // env switch, defaults to 0
+    if (disable_all) {
+      LOG(INFO) << "MKLDNN Quantization Fully Connection post Relu Fused optimization pass is disabled.";
+    } else {
+      LOG(INFO) << "Start to execute MKLDNN Quantization Fully Connection post Relu Fused optimization pass.";
+    }       
+  }
+  static SubgraphPropertyPtr Create() {
+    return std::make_shared<SgMKLDNNQFCPostReluFusedProperty>();
+  }
+  nnvm::NodePtr CreateSubgraphNode(const nnvm::Symbol &sym,
+                                   const int subgraph_id = 0) const override {
+    nnvm::NodePtr fc_node = nullptr;
+    bool fc_with_relu = false;
+    bool fc_with_quantized = false;
+    DFSVisit(sym.outputs, [&](const nnvm::NodePtr &node) {  // record which ops were matched
+      if (node->is_variable()) return;
+      auto &op_name = node->op()->name;
+      if (op_name == QFC_OP_NODE_NAME) {
+        fc_node = node;
+      } 
+      if (op_name == "Activation") {
+          const ActivationParam &param = nnvm::get<ActivationParam>(node->attrs.parsed);
+          if (param.act_type == activation::kReLU) {
+              fc_with_relu = true;
+          }
+      }
+      if (op_name == "_contrib_quantize")
+      {
+          fc_with_quantized = true;
+      }
+    });
+    CHECK_NOTNULL(fc_node);
+    if (fc_with_relu && fc_with_quantized)
+    {
+        fc_node->attrs.dict["output_type"] = std::string("uint8"); // ReLU + quantize matched
+        fc_node->attrs.dict["out_min_calib_range"] =  std::to_string(0.0);  // lower bound is 0
+    } else
+    {
+        fc_node->attrs.dict["out_enable_calib_range"] = std::string("False");  // partial match
+    }
+    fc_node->op()->attr_parser(&(fc_node->attrs));  // re-parse so the new dict entries apply
+    return fc_node;  // NOTE(review): a matched ReLU without quantize is not kept - confirm
+  }
+
+  SubgraphSelectorPtr CreateSubgraphSelector() const override {
+    auto selector =
+        std::make_shared<SgMKLDNNQFCPostReluFusedSelector>(disable_all);
+    return selector;
+  }
+
+  void ConnectSubgraphOutputs(
+      const nnvm::NodePtr n,
+      std::vector<nnvm::NodeEntry *> *output_entries) const override {
+    for (size_t i = 0; i < output_entries->size(); ++i) {
+      auto entry_ptr = output_entries->at(i);
+      *entry_ptr = nnvm::NodeEntry{n, entry_ptr->index, 0};  // keep index, point at node n
+    }
+  }
+
+ private:
+  int disable_all;  // non-zero when MXNET_DISABLE_MKLDNN_OPT is set
+};
+
+MXNET_REGISTER_SUBGRAPH_PROPERTY(MKLDNN_QFC_POST_RELU_FUSED, SgMKLDNNQFCPostReluFusedProperty);
+
+}  // namespace op
+}  // namespace mxnet
+
+#endif  // if MXNET_USE_MKLDNN == 1
diff --git a/src/operator/tensor/dot-inl.h b/src/operator/tensor/dot-inl.h
index 69c35f8..4bc82d8 100644
--- a/src/operator/tensor/dot-inl.h
+++ b/src/operator/tensor/dot-inl.h
@@ -38,6 +38,10 @@
 #ifdef __CUDACC__
 #include "./dot-inl.cuh"
 #endif  // __CUDACC__
+#if (MSHADOW_USE_MKL == 1)
+#include "sparse_matrix.h"
+#endif
+
 
 namespace mxnet {
 namespace op {
@@ -774,13 +778,29 @@ inline void DotCsrDnsDnsImpl(const OpContext& ctx,
   }
 
   using nnvm::dim_t;
-
+  TShape lhs_shape = lhs.shape();
+  TShape rhs_shape = rhs.shape_;
   const TBlob data_l = lhs.data();
   const TBlob indptr_l = lhs.aux_data(csr::kIndPtr);
   const TBlob col_idx_l = lhs.aux_data(csr::kIdx);
   const TBlob& data_r = rhs;
   const TBlob data_out = *ret;
 
+#if (MSHADOW_USE_MKL == 1)
+  if (data_l.type_flag_ == mshadow::kFloat32
+	  && indptr_l.type_flag_ == mshadow::kInt64
+	  && col_idx_l.type_flag_ == mshadow::kInt64
+    && !trans_lhs)
+  {	
+	  bool ret = mkl_DotCsrDnsDns((SP_INT64*)indptr_l.dptr_,
+		  (SP_INT64*)col_idx_l.dptr_, data_l.dptr<float>(), data_r.dptr<float>(),
+		  data_out.dptr<float>(), lhs_shape[0], lhs_shape[1], rhs_shape[1]);
+	  if (ret) {		
+		  return;
+	  }
+  }  
+#endif
+
   MSHADOW_SGL_DBL_TYPE_SWITCH(data_l.type_flag_, DType, {  // data type
     MSHADOW_IDX_TYPE_SWITCH(indptr_l.type_flag_, IType, {  // indptr type
       MSHADOW_IDX_TYPE_SWITCH(col_idx_l.type_flag_, CType, {  // col idx type
diff --git a/src/operator/tensor/elemwise_binary_broadcast_op.h b/src/operator/tensor/elemwise_binary_broadcast_op.h
index 391c351..491deb4 100644
--- a/src/operator/tensor/elemwise_binary_broadcast_op.h
+++ b/src/operator/tensor/elemwise_binary_broadcast_op.h
@@ -36,7 +36,11 @@
 #include "./elemwise_binary_op.h"
 #include "../operator_common.h"
 #include "broadcast_reduce-inl.h"
-
+#if (MSHADOW_USE_MKL == 1)
+#include "mkl_cblas.h"
+#include "mkl_vml_functions.h"
+typedef MKL_INT cblas_int;
+#endif
 namespace mxnet {
 namespace op {
 inline bool BinaryBroadcastShape(const nnvm::NodeAttrs& attrs,
@@ -286,7 +290,18 @@ struct csr_dns_map_kernel {
 };
 
 }  // namespace mxnet_op
-
+#if (MSHADOW_USE_MKL == 1)
+// In-place c[i,:] += b for each of the m rows (b has n floats); `a` is unused.
+inline float *broadcast_add(float *a, float *b, float *c, int m, int n)
+{
+#pragma omp parallel for num_threads(engine::OpenMP::Get()->GetRecommendedOMPThreadCount())
+	for (cblas_int row = 0; row < m; row++) {
+		// saxpy: y += alpha * x with unit strides; widen the offset to avoid int overflow.
+		cblas_saxpy(n, 1.0f, b, 1, c + static_cast<size_t>(row) * n, 1);
+	}
+	return nullptr;
+}
+#endif
 template<typename xpu, typename OP>
 void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs,
                             const OpContext& ctx,
@@ -294,8 +309,27 @@ void BinaryBroadcastCompute(const nnvm::NodeAttrs& attrs,
                             const std::vector<OpReqType>& req,
                             const std::vector<TBlob>& outputs) {
   TShape new_lshape, new_rshape, new_oshape;
-  int ndim = BinaryBroadcastShapeCompact(inputs[0].shape_, inputs[1].shape_, outputs[0].shape_,
+  TShape lhs_shape = inputs[0].shape_;
+  TShape rhs_shape = inputs[1].shape_;
+  int ndim = BinaryBroadcastShapeCompact(lhs_shape, rhs_shape, outputs[0].shape_,
                                          &new_lshape, &new_rshape, &new_oshape);
+
+#if (MSHADOW_USE_MKL == 1)
+  if (typeid(xpu) == typeid(cpu)
+	  && typeid(OP) == typeid(op::mshadow_op::plus)
+	  && ndim == 2 && inputs[0].type_flag_ == mshadow::kFloat32
+	  && inputs[1].type_flag_ == mshadow::kFloat32
+	  && lhs_shape[1] == rhs_shape[0])
+  {
+	  float * lhs_ptr = inputs[0].dptr<float>();
+	  float * out_ptr = outputs[0].dptr<float>();
+	  if (lhs_ptr == out_ptr)
+	  {
+		  broadcast_add(lhs_ptr, inputs[1].dptr<float>(), out_ptr, lhs_shape[0], rhs_shape[0]);
+		  return;
+	  }
+  }
+#endif
   if (!ndim) {
     ElemwiseBinaryOp::Compute<xpu, OP>(attrs, ctx, inputs, req, outputs);
   } else {
diff --git a/src/operator/tensor/indexing_op.cc b/src/operator/tensor/indexing_op.cc
index 77236e0..2de6ef5 100644
--- a/src/operator/tensor/indexing_op.cc
+++ b/src/operator/tensor/indexing_op.cc
@@ -61,10 +61,14 @@ template<typename DType>
 bool CheckIndexOutOfBound(const DType* data_ptr, size_t data_size,
                           const DType min, const DType max) {
   bool is_valid = true;
-  for (size_t i = 0; i < data_size; i++) {
+  // to avoid Jenkins omp check error
+  int64_t size = data_size;
+  int64_t check_block_size = dmlc::GetEnv("MXNET_CPU_PARALLEL_CHECK_SIZE", 14000);
+  int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+  #pragma omp parallel for num_threads(omp_threads) if (size > check_block_size)
+  for (int64_t i = 0; i < size; i++) {
     if (data_ptr[i] > max || data_ptr[i] < min) {
       is_valid = false;
-      break;
     }
   }
   return is_valid;
diff --git a/src/operator/tensor/indexing_op.h b/src/operator/tensor/indexing_op.h
index 92b6e21..dd3bff0 100644
--- a/src/operator/tensor/indexing_op.h
+++ b/src/operator/tensor/indexing_op.h
@@ -481,8 +481,15 @@ void SparseEmbeddingOpForwardEx(const nnvm::NodeAttrs& attrs,
   const auto out_stype = out.storage_type();
   if (data_stype == kDefaultStorage && weight_stype == kRowSparseStorage &&
       out_stype == kDefaultStorage) {
-    // dns, rsp -> dns
-    SparseEmbeddingOpForwardRspImpl<xpu>(ctx, data.data(), weight, req[0], out.data());
+    using namespace rowsparse;
+    if(weight.storage_initialized() && weight.aux_shape(kIdx)[0] == weight.shape()[0]) {
+      // dns, dns -> dns
+      EmbeddingOpForwardDnsImpl<xpu>(ctx.get_stream<xpu>(), data.data(), weight.data(),
+                                   req[0], out.data());
+    } else {
+      // dns, rsp -> dns
+      SparseEmbeddingOpForwardRspImpl<xpu>(ctx, data.data(), weight, req[0], out.data());
+    }
   } else if (data_stype == kDefaultStorage && weight_stype == kDefaultStorage &&
              out_stype == kDefaultStorage) {
     // dns, dns -> dns
diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 9c81d87..57ac455 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -410,6 +410,13 @@ struct SliceParam : public dmlc::Parameter<SliceParam> {
     .set_default(nnvm::Tuple<dmlc::optional<int>>())
     .describe("step for the slice operation, supports negative values.");
   }
+
+bool operator==(const SliceParam& other) const {
+  // Equal only when all three slice specifications match.
+  const bool same_range = this->begin == other.begin && this->end == other.end;
+  return same_range && this->step == other.step;
+}
+
 };
 
 inline bool SliceForwardInferStorageType(const nnvm::NodeAttrs& attrs,
@@ -2520,4 +2527,17 @@ void SpaceToDepthOpForward(const nnvm::NodeAttrs& attrs,
 }  // namespace op
 }  // namespace mxnet
 
+namespace std {
+template<>
+struct hash<mxnet::op::SliceParam> {  // hashes begin/end/step, the fields operator== compares
+  size_t operator()(const mxnet::op::SliceParam& val) const {  // const: usable via const hasher
+    size_t ret = 0;
+    ret = dmlc::HashCombine(ret, val.begin);
+    ret = dmlc::HashCombine(ret, val.end);
+    ret = dmlc::HashCombine(ret, val.step);
+    return ret;
+  }
+};
+}  // namespace std
+
 #endif  // MXNET_OPERATOR_TENSOR_MATRIX_OP_INL_H_
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index 0faa668..8e9d64b 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -103,6 +103,57 @@ DMLC_REGISTER_PARAMETER(StackParam);
 DMLC_REGISTER_PARAMETER(SqueezeParam);
 DMLC_REGISTER_PARAMETER(DepthToSpaceParam);
 
+#if MXNET_USE_MKLDNN == 1
+void MKLDNNReshape(const NDArray &in_data, const NDArray &out_data) {  // reorder in -> out
+  MSHADOW_TYPE_SWITCH(in_data.dtype(), DType, {
+    auto this_mem = in_data.GetMKLDNNData();
+    auto out_dptr = out_data.data().dptr<DType>();
+    mkldnn::memory::primitive_desc this_pd = this_mem->get_primitive_desc();
+    mkldnn::memory::desc this_desc = this_pd.desc();
+    mkldnn::memory::dims dims(this_desc.data.dims,  // same dims as the input memory
+                              this_desc.data.dims + this_desc.data.ndims);
+    auto this_dtype = static_cast<mkldnn::memory::data_type>(this_desc.data.data_type);
+    auto this_format = static_cast<mkldnn::memory::format>(GetDefaultFormat(this_desc));
+    mkldnn::memory::desc data_md(dims, this_dtype, this_format);  // default-format descriptor
+    mkldnn::memory::primitive_desc pd(data_md, this_pd.get_engine());
+    auto temp_mem = mkldnn::memory(pd, out_dptr);  // memory object over out_data's buffer
+    MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*this_mem, temp_mem));
+    MKLDNNStream::Get()->Submit();  // submit the registered reorder
+
+    const_cast<NDArray &>(out_data).InvalidateMKLDNNData();  // presumably drops stale MKLDNN state
+  });
+}
+
+static void ReshapeComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                const OpContext& ctx,
+                                const std::vector<NDArray>& inputs,
+                                const std::vector<OpReqType>& req,
+                                const std::vector<NDArray>& outputs) {
+  CHECK_EQ(inputs.size(), 1U);
+  CHECK_EQ(outputs.size(), 1U);
+  // If MKLDNN supports the input's dtype and shape (and the request is not
+  // kAddTo), reorder the input straight into the output buffer; otherwise
+  // fall back to the plain identity compute through FallBackCompute.
+  if (SupportMKLDNNArray(inputs[0].dtype(), inputs[0].shape()) && req[0] != kAddTo) {
+    MKLDNNReshape(inputs[0], outputs[0]);
+    return;
+  }
+  FallBackCompute(UnaryOp::IdentityCompute<cpu>, attrs, ctx, inputs, req,
+                    outputs);
+}
+
+inline static bool ReshapeStorageType(const nnvm::NodeAttrs& attrs,
+                                      const int dev_mask,
+                                      DispatchMode* dispatch_mode,
+                                      std::vector<int>* in_attrs,
+                                      std::vector<int>* out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1U);  // Reshape takes exactly one input
+  CHECK_EQ(out_attrs->size(), 1U);  // and produces exactly one output
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,  // delegates fully
+                           out_attrs);
+}
+#endif
+
 NNVM_REGISTER_OP(Reshape)
 .add_alias("reshape")
 .describe(R"code(Reshapes the input array.
@@ -171,9 +222,19 @@ If the argument `reverse` is set to 1, then the special values are inferred from
 .set_num_outputs(1)
 .set_attr_parser(ParamParser<ReshapeParam>)
 .set_attr<nnvm::FInferShape>("FInferShape", ReshapeShape)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<FInferStorageType>("FInferStorageType", ReshapeStorageType)
+#endif
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_backward_copy"})
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
+#if MXNET_USE_MKLDNN == 1
+.set_attr<bool>("TIsMKLDNN", true)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ReshapeComputeExCPU)
+.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n) {
+  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+#else
 .set_attr<nnvm::FInplaceOption>("FInplaceOption",
   [](const NodeAttrs& attrs) {
     return std::vector<std::pair<int, int> >{{0, 0}};
@@ -182,6 +243,7 @@ If the argument `reverse` is set to 1, then the special values are inferred from
   [](const NodeAttrs& attrs){
     return std::vector<bool>{true};
   })
+#endif
 .add_argument("data", "NDArray-or-Symbol", "Input data to reshape.")
 .add_arguments(ReshapeParam::__FIELDS__());
 
@@ -210,6 +272,7 @@ static void FlattenEx(const nnvm::NodeAttrs& attrs,
 #endif
 }
 
+#if MXNET_USE_MKLDNN == 1
 static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs,
                                    const int dev_mask,
                                    DispatchMode* dispatch_mode,
@@ -217,17 +280,10 @@ static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs,
                                    std::vector<int> *out_attrs) {
   CHECK_EQ(in_attrs->size(), 1);
   CHECK_EQ(out_attrs->size(), 1);
-  bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, dispatch_mode,
-                                                            in_attrs, out_attrs);
-#if MXNET_USE_MKLDNN == 1
-  if (dev_mask == mshadow::cpu::kDevMask
-      && in_attrs->at(0) == kDefaultStorage
-      && out_attrs->at(0) == kDefaultStorage) {
-    *dispatch_mode = DispatchMode::kFComputeEx;
-  }
-#endif
-  return ret;
+  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
+                           out_attrs);
 }
+#endif
 
 NNVM_REGISTER_OP(Flatten)
 .add_alias("flatten")
@@ -261,7 +317,9 @@ Example::
 .set_num_outputs(1)
 .set_attr<nnvm::FInferShape>("FInferShape", FlattenShape)
 .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+#if MXNET_USE_MKLDNN == 1
 .set_attr<FInferStorageType>("FInferStorageType", FlattenStorageType)
+#endif
 .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{ "_backward_copy" })
 .set_attr<FCompute>("FCompute<cpu>", UnaryOp::IdentityCompute<cpu>)
 .set_attr<FComputeEx>("FComputeEx<cpu>", FlattenEx)
@@ -914,7 +972,7 @@ NNVM_REGISTER_OP(depth_to_space)
 .describe(R"code(Rearranges(permutes) data from depth into blocks of spatial data.
 Similar to ONNX DepthToSpace operator:
 https://github.com/onnx/onnx/blob/master/docs/Operators.md#DepthToSpace.
-The output is a new tensor where the values from depth dimension are moved in spatial blocks 
+The output is a new tensor where the values from depth dimension are moved in spatial blocks
 to height and width dimension. The reverse of this operation is ``space_to_depth``.
 
 .. math::
@@ -925,7 +983,7 @@ to height and width dimension. The reverse of this operation is ``space_to_depth
     y = reshape(x \prime \prime, [N, C / (block\_size ^ 2), H * block\_size, W * block\_size])
     \end{gather*}
 
-where :math:`x` is an input tensor with default layout as :math:`[N, C, H, W]`: [batch, channels, height, width] 
+where :math:`x` is an input tensor with default layout as :math:`[N, C, H, W]`: [batch, channels, height, width]
 and :math:`y` is the output tensor of layout :math:`[N, C / (block\_size ^ 2), H * block\_size, W * block\_size]`
 
 Example::
@@ -965,9 +1023,9 @@ Example::
 NNVM_REGISTER_OP(space_to_depth)
 .describe(R"code(Rearranges(permutes) blocks of spatial data into depth.
 Similar to ONNX SpaceToDepth operator:
-https://github.com/onnx/onnx/blob/master/docs/Operators.md#SpaceToDepth 
+https://github.com/onnx/onnx/blob/master/docs/Operators.md#SpaceToDepth
 
-The output is a new tensor where the values from height and width dimension are 
+The output is a new tensor where the values from height and width dimension are
 moved to the depth dimension. The reverse of this operation is ``depth_to_space``.
 
 .. math::
@@ -978,7 +1036,7 @@ moved to the depth dimension. The reverse of this operation is ``depth_to_space`
     y = reshape(x \prime \prime, [N, C * (block\_size ^ 2), H / block\_size, W / block\_size])
     \end{gather*}
 
-where :math:`x` is an input tensor with default layout as :math:`[N, C, H, W]`: [batch, channels, height, width] 
+where :math:`x` is an input tensor with default layout as :math:`[N, C, H, W]`: [batch, channels, height, width]
 and :math:`y` is the output tensor of layout :math:`[N, C * (block\_size ^ 2), H / block\_size, W / block\_size]`
 
 Example::
@@ -987,8 +1045,8 @@ Example::
          [12, 18, 13, 19, 14, 20],
          [3, 9, 4, 10, 5, 11],
          [15, 21, 16, 22, 17, 23]]]]
-  
-  
+
+
   space_to_depth(x, 2) = [[[[0, 1, 2],
                             [3, 4, 5]],
                            [[6, 7, 8],
diff --git a/src/operator/tensor/mkldnn/mkldnn_parallel_embedding.cc b/src/operator/tensor/mkldnn/mkldnn_parallel_embedding.cc
new file mode 100644
index 0000000..85258e2
--- /dev/null
+++ b/src/operator/tensor/mkldnn/mkldnn_parallel_embedding.cc
@@ -0,0 +1,255 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2017 by Contributors
+ * \file mkldnn_parallel_embedding.cc
+ * \brief CPU implementation of parallel embedding
+ * \author Lingyan Guo
+*/
+
+#include "mkldnn_parallel_embedding.h"
+namespace mxnet {
+namespace op {
+
+// Builds the EmbeddingParam of the i-th embedding from the per-embedding tuples.
+static EmbeddingParam GetEmbeddedParam(const ParallelEmbeddingParam& param_, int i) {
+    EmbeddingParam single;
+    single.input_dim = param_.input_dims[i];
+    single.output_dim = param_.output_dims[i];
+    single.dtype = param_.dtypes[i];
+    single.sparse_grad = param_.sparse_grads[i];
+    return single;
+}
+// Storage-type inference: runs Embedding's own inference on each (data, weight, out) triple.
+inline bool ParallelEmbeddingOpForwardStorageType(const nnvm::NodeAttrs& attrs,
+    const int dev_mask,
+    DispatchMode* dispatch_mode,
+    std::vector<int>* in_attrs,
+    std::vector<int>* out_attrs) {
+    const ParallelEmbeddingParam& fused = nnvm::get<ParallelEmbeddingParam>(attrs.parsed);
+    bool all_ok = true;
+    for (int k = 0; k < fused.num_args; ++k) {
+        // Inputs are interleaved: slot 2k holds the indices, slot 2k+1 the weight.
+        const int data_slot = 2 * k;
+        const int weight_slot = 2 * k + 1;
+        nnvm::NodeAttrs single_attrs;
+        single_attrs.parsed = GetEmbeddedParam(fused, k);
+        std::vector<int> single_in{(*in_attrs)[data_slot], (*in_attrs)[weight_slot]};
+        std::vector<int> single_out{(*out_attrs)[k]};
+        const bool ok = EmbeddingOpForwardStorageType(single_attrs, dev_mask, dispatch_mode,
+                                                      &single_in, &single_out);
+        all_ok &= ok;
+        // Only the output and weight storage types are written back; data is left untouched.
+        (*out_attrs)[k] = single_out[0];
+        (*in_attrs)[weight_slot] = single_in[1];
+    }
+    return all_ok;
+}
+
+
+// Shape inference: applies Embedding's shape rule to every (data, weight, out) triple.
+static bool ParallelEmbeddingOpShape(const nnvm::NodeAttrs& attrs,
+                        std::vector<TShape> *in_shape,
+                        std::vector<TShape> *out_shape) {
+  const ParallelEmbeddingParam& fused = nnvm::get<ParallelEmbeddingParam>(attrs.parsed);
+  bool all_ok = true;
+  for (int k = 0; k < fused.num_args; ++k) {
+      // Inputs are interleaved: slot 2k holds the indices, slot 2k+1 the weight.
+      const int data_slot = 2 * k;
+      const int weight_slot = 2 * k + 1;
+      nnvm::NodeAttrs single_attrs;
+      single_attrs.parsed = GetEmbeddedParam(fused, k);
+      std::vector<TShape> single_in{(*in_shape)[data_slot], (*in_shape)[weight_slot]};
+      std::vector<TShape> single_out{(*out_shape)[k]};
+      const bool ok = EmbeddingOpShape<EmbeddingParam>(single_attrs, &single_in, &single_out);
+      all_ok &= ok;
+      // Write back what the single-embedding rule inferred for output and weight.
+      (*out_shape)[k] = single_out[0];
+      (*in_shape)[weight_slot] = single_in[1];
+  }
+  return all_ok;
+}
+
+
+// Type inference: applies Embedding's dtype rule to every (data, weight, out) triple.
+// The result is the AND of the per-embedding results.
+inline bool ParallelEmbeddingOpType(const nnvm::NodeAttrs& attrs,
+    std::vector<int> *in_type,
+    std::vector<int> *out_type) {
+    const ParallelEmbeddingParam& fused = nnvm::get<ParallelEmbeddingParam>(attrs.parsed);
+    bool all_ok = true;
+    for (int k = 0; k < fused.num_args; ++k) {
+        // Inputs are interleaved: slot 2k holds the indices, slot 2k+1 the weight.
+        const int data_slot = 2 * k;
+        const int weight_slot = 2 * k + 1;
+        nnvm::NodeAttrs single_attrs;
+        single_attrs.parsed = GetEmbeddedParam(fused, k);
+        std::vector<int> single_in{(*in_type)[data_slot], (*in_type)[weight_slot]};
+        std::vector<int> single_out{(*out_type)[k]};
+        const bool ok = EmbeddingOpType<EmbeddingParam>(single_attrs, &single_in, &single_out);
+        all_ok &= ok;
+        // Write back what the single-embedding rule inferred for output and weight.
+        (*out_type)[k] = single_out[0];
+        (*in_type)[weight_slot] = single_in[1];
+    }
+    return all_ok;
+}
+// Dense forward: runs the stock Embedding kernel once per (data, weight) pair.
+template<typename xpu>
+void ParallelEmbeddingOpForward(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx,
+    const std::vector<TBlob>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<TBlob>& outputs)
+{
+    const ParallelEmbeddingParam& param_ = nnvm::get<ParallelEmbeddingParam>(attrs.parsed);
+    CHECK_EQ(req.size(), outputs.size());
+#pragma omp parallel for num_threads(param_.num_args)
+    for (int i = 0; i < param_.num_args; i++) {
+        nnvm::NodeAttrs em_attrs;
+        em_attrs.parsed = GetEmbeddedParam(param_, i);
+        // Inputs are interleaved: slot 2i holds the indices, slot 2i+1 the weight.
+        const std::vector<TBlob> e_in{inputs[i * 2], inputs[i * 2 + 1]};
+        const std::vector<TBlob> e_out{outputs[i]};
+        // Each embedding must see its own write request. Forwarding the whole `req`
+        // vector made the single-output kernel consult req[0] for every output.
+        const std::vector<OpReqType> e_req{req[i]};
+        EmbeddingOpForward<cpu>(em_attrs, ctx, e_in, e_req, e_out);
+    }
+}
+template <typename IType, typename DType>
+struct TakeCPUInfo  // per-embedding arguments of the fused row-copy loop
+{
+    DType* out_data;  // output buffer of this embedding
+    DType* in_data;   // weight table of this embedding
+    IType* idx;       // lookup indices (the embedding's data input)
+    int N;            // number of lookups: output element count / M
+    size_t M;         // weight columns, i.e. elements copied per lookup
+    int64_t K;        // weight rows, used to clamp or wrap indices
+};
+
+// Fused CPU forward: for every (embedding, index) pair, copies the selected weight row
+// into that embedding's output. Indices and weights are both read as float32.
+template<typename xpu>
+void ParallelSparseEmbeddingOpForwardEx(const nnvm::NodeAttrs& attrs,
+    const OpContext& ctx,
+    const std::vector<NDArray>& inputs,
+    const std::vector<OpReqType>& req,
+    const std::vector<NDArray>& outputs)
+{
+    const ParallelEmbeddingParam& param_ = nnvm::get<ParallelEmbeddingParam>(attrs.parsed);
+    using namespace mxnet_op;
+    using namespace rowsparse;
+
+    typedef float IType;
+    typedef float  DType;
+    const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+    // Owned by a vector so the table is released even if a check below throws.
+    std::vector<TakeCPUInfo<IType, DType> > takecpu_info(param_.num_args);
+    for (int em = 0; em < param_.num_args; em++) {
+        const NDArray &d = (inputs)[em * 2];
+        const NDArray &w = (inputs)[em * 2 + 1];
+        const NDArray &o = (outputs)[em];
+        const TShape& oshape = o.shape();
+        const TShape& wshape = w.shape();
+        takecpu_info[em].N = oshape.Size() / wshape[1];
+        takecpu_info[em].out_data = o.data().dptr<DType>();
+        takecpu_info[em].in_data = w.data().dptr<DType>();
+        takecpu_info[em].idx = d.data().dptr<IType>();
+        takecpu_info[em].M = wshape[1];
+        takecpu_info[em].K = wshape[0];
+        // The collapsed loop below uses embedding 0's index count for all embeddings.
+        CHECK_EQ(takecpu_info[em].N, takecpu_info[0].N) << "index counts must match";
+    }
+
+    bool clip = true;
+    int em = 0;
+    int i = 0;
+    int N = takecpu_info[0].N; //TODO: limitation, need to use collapse
+#pragma omp parallel for num_threads(omp_threads) collapse(2)
+    for (em = 0; em < param_.num_args; em++)
+        for (i = 0; i < N; ++i) {
+            int64_t j = static_cast<int64_t>(takecpu_info[em].idx[i]);
+            if (clip) {
+                // Clamp out-of-range indices to the first / last weight row.
+                if (j <= 0) j = 0;
+                else if (j >= takecpu_info[em].K) j = takecpu_info[em].K - 1;
+            }
+            else {
+                j = j % takecpu_info[em].K;
+                j += (j < 0) ? takecpu_info[em].K : 0;
+            }
+            std::memcpy(takecpu_info[em].out_data + i * takecpu_info[em].M,
+                takecpu_info[em].in_data + j * takecpu_info[em].M, takecpu_info[em].M * sizeof(DType));
+        }
+}
+
+
+DMLC_REGISTER_PARAMETER(ParallelEmbeddingParam);
+
+NNVM_REGISTER_OP(ParallelEmbedding)
+.describe(R"code( Parallel exec embedding in Multi-core CPU
+
+)code" ADD_FILELINE)
+.set_num_inputs([](const NodeAttrs& attrs) {
+    const ParallelEmbeddingParam& params = nnvm::get<ParallelEmbeddingParam>(attrs.parsed);
+    return params.num_args*2;
+})
+.set_num_outputs([](const NodeAttrs& attrs) {
+    const ParallelEmbeddingParam& params = nnvm::get<ParallelEmbeddingParam>(attrs.parsed);
+    return params.num_args;
+})
+.set_attr_parser(ParamParser<ParallelEmbeddingParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+    [](const NodeAttrs& attrs) {
+    const ParallelEmbeddingParam& params = nnvm::get<ParallelEmbeddingParam>(attrs.parsed);
+    std::vector<std::string> ret;
+    for (int i = 0; i < params.num_args; ++i) {
+        ret.push_back(std::string("arg_") + std::to_string(i));
+        ret.push_back(std::string("embed_") + std::to_string(i) + std::string("_weight"));
+    }
+    return ret;
+})
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+    [](const NodeAttrs& attrs) {
+    const ParallelEmbeddingParam& params = nnvm::get<ParallelEmbeddingParam>(attrs.parsed);
+    std::vector<std::string> ret;
+    for (int i = 0; i < params.num_args; ++i) {
+        ret.push_back(std::string("out_") + std::to_string(i));
+    }
+    return ret;
+})
+.set_attr<std::string>("key_var_num_args", "num_args")
+.set_attr<nnvm::FInferShape>("FInferShape", ParallelEmbeddingOpShape)
+.set_attr<nnvm::FInferType>("FInferType", ParallelEmbeddingOpType)
+.set_attr<FInferStorageType>("FInferStorageType", ParallelEmbeddingOpForwardStorageType)
+.set_attr<FResourceRequest>("FResourceRequest",
+    [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr<FCompute>("FCompute<cpu>", ParallelEmbeddingOpForward<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", ParallelSparseEmbeddingOpForwardEx<cpu>)
+// Inputs are interleaved as arg_0, embed_0_weight, arg_1, embed_1_weight, ...
+// (the order produced by FListInputNames), so one list argument is declared for them.
+.add_argument("data_weight", "NDArray-or-Symbol[]", "List of arrays (data/weight) to embedding weight.")
+.add_arguments(ParallelEmbeddingParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/mkldnn/mkldnn_parallel_embedding.h b/src/operator/tensor/mkldnn/mkldnn_parallel_embedding.h
new file mode 100644
index 0000000..2db8236
--- /dev/null
+++ b/src/operator/tensor/mkldnn/mkldnn_parallel_embedding.h
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef MXNET_OPERATOR_TENSOR_MKLDNN_PARALLEL_EMBEDDING_INL_H_
+#define MXNET_OPERATOR_TENSOR_MKLDNN_PARALLEL_EMBEDDING_INL_H_
+#include "../indexing_op.h"
+namespace mxnet {
+namespace op {
+
+// Parameters of ParallelEmbedding: entry i of every tuple configures the i-th embedding.
+struct ParallelEmbeddingParam: public dmlc::Parameter<ParallelEmbeddingParam> {
+  nnvm::Tuple<int> input_dims;
+  nnvm::Tuple<int> output_dims;
+  nnvm::Tuple<int> dtypes;
+  nnvm::Tuple<bool> sparse_grads;
+  int num_args;
+  DMLC_DECLARE_PARAMETER(ParallelEmbeddingParam) {
+    DMLC_DECLARE_FIELD(input_dims).describe("Vocabulary size of the input indices.");
+    DMLC_DECLARE_FIELD(output_dims)
+    .describe("Dimension of the embedding vectors.");
+    DMLC_DECLARE_FIELD(num_args).set_lower_bound(1).set_default(1)
+        .describe("Number of embeddings (data/weight pairs) to run in parallel.");
+    DMLC_DECLARE_FIELD(dtypes).describe("Data type of weight.");
+    DMLC_DECLARE_FIELD(sparse_grads)
+        .describe("Compute row sparse gradient in the backward calculation. If set to True, "
+            "the grad's storage type is row_sparse.");
+  }
+};
+
+}  // namespace op
+}  // namespace mxnet
+#endif  // MXNET_OPERATOR_TENSOR_MKLDNN_PARALLEL_EMBEDDING_INL_H_
\ No newline at end of file
diff --git a/src/operator/tensor/mkldnn/mkldnn_slice_split_embedding.cc b/src/operator/tensor/mkldnn/mkldnn_slice_split_embedding.cc
new file mode 100644
index 0000000..d7a215f
--- /dev/null
+++ b/src/operator/tensor/mkldnn/mkldnn_slice_split_embedding.cc
@@ -0,0 +1,302 @@
+#include "mkldnn_slice_split_embedding.h"
+#include "../indexing_op.h"
+#include "../matrix_op-inl.h"
+#include "../../slice_channel-inl.h"
+#include "../indexing_op.h"
+
+namespace mxnet {
+namespace op {
+bool ConcatSetShape(std::vector<TShape> *in_shape,
+	std::vector<TShape> *out_shape, int num_args, int dim);
+// Writes into `oshape` the sliced size of each of the first _pbegin.ndim() axes of `dshape`;
+// the remaining axes of `oshape` keep whatever the caller initialised them to.
+static void get_slice_output_shape(const nnvm::Tuple<dmlc::optional<int>> &_pbegin,
+	const nnvm::Tuple<dmlc::optional<int>> &_pend,
+	const nnvm::Tuple<dmlc::optional<int>>& _pstep,
+	TShape& oshape, TShape& dshape) {
+  MXNET_NDIM_SWITCH(dshape.ndim(), ndim, {
+    common::StaticArray<int, ndim> lo, hi, stride;
+    GetIndexRange(dshape, _pbegin, _pend, _pstep, &lo, &hi, &stride);
+    const index_t sliced_axes = _pbegin.ndim();
+    for (index_t axis = 0; axis < sliced_axes; ++axis) {
+      SetSliceOpOutputDimSize(axis, lo[axis], hi[axis], stride[axis], &oshape);
+    }
+  });
+}
+
+
+// Builds the EmbeddingParam of the i-th embedding; dtype is fixed to float32, dense gradient.
+static EmbeddingParam GetEmbeddedParam(const SliceSplitEmbeddingConcatFuseParam& param_, int i) {
+	EmbeddingParam single;
+	single.input_dim = param_.input_dims[i];
+	single.output_dim = param_.output_dims[i];
+	single.dtype = mshadow::kFloat32;
+	single.sparse_grad = false;
+	return single;
+}
+// Shape inference for the fused slice -> split -> embedding -> concat operator.
+static bool SliceSplitEmbeddingConcatOpShape(const nnvm::NodeAttrs& attrs,
+                        std::vector<TShape> *in_shape,
+                        std::vector<TShape> *out_shape) {
+  const SliceSplitEmbeddingConcatFuseParam& param_ = nnvm::get<SliceSplitEmbeddingConcatFuseParam>(attrs.parsed);
+  TShape& dshape = (*in_shape)[0];
+  // Nothing can be derived until the data shape is known; report "not inferred yet".
+  if (dshape.ndim() == 0) return false;
+  bool ret = true;
+  nnvm::Tuple<dmlc::optional<int>> param_step;
+  TShape cont_slice_oshape = dshape;
+  get_slice_output_shape(param_.cont_begin, param_.cont_end, param_step, cont_slice_oshape, dshape);
+  TShape split_slice_oshape = dshape;
+  get_slice_output_shape(param_.embed_begin, param_.embed_end, param_step, split_slice_oshape, dshape);
+  std::vector<TShape> split_in_shapes;
+  split_in_shapes.push_back(split_slice_oshape);
+  std::vector<TShape> split_out_shapes;
+  split_out_shapes.resize(param_.num_outputs);
+  std::vector<TShape> split_aux_shapes;
+  SliceChannelInferShape(&split_in_shapes, &split_out_shapes, &split_aux_shapes, param_.num_outputs, 1, param_.squeeze_axis);
+  std::vector<TShape> embed_out_shapes;
+  for (int i = 0; i < param_.num_outputs; i++) {
+    nnvm::NodeAttrs em_attrs;
+    em_attrs.parsed = GetEmbeddedParam(param_, i);
+    std::vector<TShape> e_in;
+    std::vector<TShape> e_out;
+    e_in.push_back(split_out_shapes[i]);
+    e_in.push_back((*in_shape)[1+i]);
+    e_out.resize(1);
+    ret &= EmbeddingOpShape<EmbeddingParam>(em_attrs, &e_in, &e_out);
+    SHAPE_ASSIGN_CHECK(*in_shape, i+1, e_in[1]);
+    embed_out_shapes.push_back(e_out[0]);
+  }
+  embed_out_shapes.push_back(cont_slice_oshape);
+  ret &= ConcatSetShape(&embed_out_shapes, out_shape, param_.num_outputs+1, param_.concat_dim);
+  return ret;
+}
+inline bool SliceSplitEmbeddingConcatOpType(const nnvm::NodeAttrs& attrs,
+	std::vector<int> *in_type,
+	std::vector<int> *out_type) {
+	// All inputs and the output share one dtype: taken from data, else from the output.
+	const int dtype = ((*in_type)[0] != -1) ? (*in_type)[0] : (*out_type)[0];
+	(*out_type)[0] = dtype;
+	for (size_t i = 0; i < in_type->size(); ++i)
+		(*in_type)[i] = dtype;
+	return dtype != -1;  // -1 means the dtype is still unknown, so inference is not done
+}
+// Storage-type inference: requests default storage for the single output and FComputeEx
+// dispatch; the input storage types are not inspected.
+inline bool SliceSplitEmbeddingConcatOpStorageType(const nnvm::NodeAttrs& attrs,
+	const int dev_mask,
+	DispatchMode* dispatch_mode,
+	std::vector<int>* in_attrs,
+	std::vector<int>* out_attrs) {
+	int& out_storage = out_attrs->at(0);
+	// The helper's result is returned unchanged as the inference outcome.
+	const bool assigned = storage_type_assign(&out_storage, kDefaultStorage,
+		dispatch_mode, DispatchMode::kFComputeEx);
+	return assigned;
+}
+template<int ndim, int req, typename xpu> struct slice_forward_window;
+template<int ndim, int req>
+struct slice_forward_window<ndim, req, cpu> {
+  // i is the i-th row after flattening out into 2D tensor
+  template<typename DType>
+  MSHADOW_XINLINE static void Map(int i, DType* out, const DType* data,
+                                  const mshadow::Shape<ndim> dshape,
+                                  const mshadow::Shape<ndim> oshape,
+                                  const common::StaticArray<int, ndim> begin,
+                                  const common::StaticArray<int, ndim> step,
+                                  int out_count_per_row) {
+    const int data_last_dim_size = dshape[ndim-1];
+    const int out_last_dim_size = oshape[ndim-1];
+    const int step_last_dim = step[ndim-1];
+    const int begin_last_dim = begin[ndim-1];
+    // The source row depends only on i, so it is resolved once rather than per element.
+    int irow = 0;  // row id of flattend 2D data
+    int stride = 1;
+    int idx = i;
+    #pragma unroll
+    for (int k = ndim - 2; k >= 0; --k) {
+      irow += stride * ((idx % oshape[k]) * step[k] + begin[k]);
+      idx /= oshape[k];
+      stride *= dshape[k];
+    }
+    const DType* src = data + irow * data_last_dim_size + begin_last_dim;
+    DType* dst = out + i * out_last_dim_size;
+    for (int j = 0; j < out_count_per_row; ++j) {  // only out_count_per_row elements are written
+      KERNEL_ASSIGN(dst[j], req, src[j * step_last_dim]);
+    }
+  }
+};
+template <typename IType, typename DType>
+struct TakeCPUInfoWindow
+{
+	// Per-embedding arguments of the fused lookup loop.
+	DType* in_data;	// weight table of this embedding
+	int idx_offset;	// column of this embedding's index within one input row
+	int out_offset;	// column where this embedding's vector starts within one output row
+	// IType is not used by any member.
+	size_t M;	// weight columns, i.e. elements copied per lookup
+	int64_t K;	// weight rows, used to clamp or wrap indices
+};
+// Fused CPU forward: writes the continuous-feature slice of inputs[0] at the start of every
+// output row, then places one looked-up embedding row per categorical column behind it.
+template<typename xpu>
+void SliceSplitEmbeddingConcatOpForward(const nnvm::NodeAttrs& attrs,
+	const OpContext& ctx,
+	const std::vector<TBlob>& inputs,
+	const std::vector<OpReqType>& req,
+	const std::vector<TBlob>& outputs)
+{
+	const SliceSplitEmbeddingConcatFuseParam& param_ = nnvm::get<SliceSplitEmbeddingConcatFuseParam>(attrs.parsed);
+	//by default Cont_features is in the first
+	TShape dshape = inputs[0].shape_;
+	TShape oshape = outputs[0].shape_;
+	//For Cont feature
+	using namespace mshadow;
+	Stream<xpu>* s = ctx.get_stream<xpu>();
+	const TBlob& data = inputs[0];
+	const TBlob& out = outputs[0];
+	nnvm::Tuple<dmlc::optional<int>> param_step;
+	TShape cont_slice_oshape = dshape;
+	get_slice_output_shape(param_.cont_begin, param_.cont_end, param_step, cont_slice_oshape, dshape);
+	MXNET_NDIM_SWITCH(data.ndim(), ndim, {
+		common::StaticArray<int, ndim> begin, end, step;
+		GetIndexRange(data.shape_, param_.cont_begin, param_.cont_end, param_step, &begin, &end, &step);
+		MSHADOW_TYPE_SWITCH(out.type_flag_, DType, {
+			MXNET_ASSIGN_REQ_SWITCH(req[0], Req, {
+				int num_threads = out.shape_.FlatTo2D()[0];
+				if (std::is_same<xpu, gpu>::value) {
+					num_threads *= out.shape_.get<ndim>()[ndim - 1];
+				}
+				mxnet_op::Kernel<slice_forward_window<ndim, Req, xpu>, xpu>::Launch(s, num_threads,
+					out.dptr<DType>(), data.dptr<DType>(),
+					data.shape_.get<ndim>(), out.shape_.get<ndim>(), begin, step, cont_slice_oshape[ndim - 1]);
+			})
+		})
+	})
+	//Here make assumption steps is 1
+	using namespace mxnet_op;
+	using namespace rowsparse;
+
+	typedef float IType;
+	typedef float  DType;
+
+	int ndim = data.ndim();
+	int cont_slice_last_dim_size = cont_slice_oshape[ndim - 1];
+	int data_last_dim_size = dshape[ndim - 1];
+	int out_last_dim_size = oshape[ndim - 1];
+	int emb_in_last_dim_size = (data_last_dim_size - cont_slice_last_dim_size) / param_.num_outputs;
+	int batch_size = dshape.Size() / data_last_dim_size; //Flatten to 2D
+	const int omp_threads = engine::OpenMP::Get()->GetRecommendedOMPThreadCount();
+
+	// Owned by a vector so the table is released even if a check below throws.
+	std::vector<TakeCPUInfoWindow<IType, DType> > takecpu_info(param_.num_outputs);
+	DType* out_data = out.dptr<DType>();
+	IType* idx = data.dptr<IType>();
+	// Output columns are laid out back to back, so each offset is the sum of the preceding widths.
+	int next_out_offset = cont_slice_last_dim_size;
+	for (int em = 0; em < param_.num_outputs; em++) {
+		const TBlob &w = (inputs)[em + 1];
+		const TShape& wshape = w.shape_;
+		takecpu_info[em].idx_offset = em * emb_in_last_dim_size;
+		takecpu_info[em].out_offset = next_out_offset;
+		takecpu_info[em].in_data = w.dptr<DType>();
+		takecpu_info[em].M = wshape[1];
+		takecpu_info[em].K = wshape[0];
+		next_out_offset += static_cast<int>(wshape[1]);
+	}
+	bool clip = true;
+	int em = 0;
+	int i = 0;
+	int N = batch_size; //TODO: limitation, need to use collapse
+
+#pragma omp parallel for num_threads(omp_threads) collapse(2)
+	for (em = 0; em < param_.num_outputs; em++)
+		for (i = 0; i < N; ++i) {
+			int64_t j = static_cast<int64_t>(*(idx + takecpu_info[em].idx_offset + i* data_last_dim_size));
+			if (clip) {
+				if (j <= 0) j = 0;
+				else if (j >= takecpu_info[em].K) j = takecpu_info[em].K - 1;
+			}
+			else {
+				j = j % takecpu_info[em].K;
+				j += (j < 0) ? takecpu_info[em].K : 0;
+			}
+			std::memcpy(out_data + takecpu_info[em].out_offset + i * out_last_dim_size,
+				takecpu_info[em].in_data + j * takecpu_info[em].M, takecpu_info[em].M * sizeof(DType));
+		}
+
+	return;
+}
+// Adapts an FCompute kernel to NDArray arguments by handing it each array's TBlob.
+// No storage or layout conversion is performed here.
+static void MxnetFallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs,
+	const OpContext &ctx,
+	const std::vector<NDArray> &inputs,
+	const std::vector<OpReqType> &req,
+	const std::vector<NDArray> &outputs) {
+	std::vector<TBlob> in_blobs;
+	in_blobs.reserve(inputs.size());
+	for (const NDArray &in : inputs) {
+		in_blobs.push_back(in.data());
+	}
+	std::vector<TBlob> out_blobs;
+	out_blobs.reserve(outputs.size());
+	for (const NDArray &out : outputs) {
+		out_blobs.push_back(out.data());
+	}
+	fn(attrs, ctx, in_blobs, req, out_blobs);
+}
+
+// FComputeEx entry point: unwraps the NDArrays and runs the dense CPU kernel.
+template<typename xpu>
+void SliceSplitEmbeddingConcatOpForwardEx(const nnvm::NodeAttrs& attrs,
+	const OpContext& ctx,
+	const std::vector<NDArray>& inputs,
+	const std::vector<OpReqType>& req,
+	const std::vector<NDArray>& outputs) {
+	MxnetFallBackCompute(SliceSplitEmbeddingConcatOpForward<cpu>, attrs, ctx, inputs, req, outputs);
+}
+DMLC_REGISTER_PARAMETER(SliceSplitEmbeddingConcatFuseParam);
+
+// Operator registration: one dense data input followed by num_outputs embedding weights.
+NNVM_REGISTER_OP(SliceSplitEmbeddingConcatFuse)
+.describe(R"code( Fuse Slice Split Embedding Concat for Wide & Deep Model
+)code" ADD_FILELINE)
+.set_num_inputs([](const NodeAttrs& attrs) {
+	const SliceSplitEmbeddingConcatFuseParam& params = nnvm::get<SliceSplitEmbeddingConcatFuseParam>(attrs.parsed);
+	return 1 + params.num_outputs; //data + weights
+})
+.set_num_outputs([](const NodeAttrs& attrs) {
+	return 1;
+})
+.set_attr_parser(ParamParser<SliceSplitEmbeddingConcatFuseParam>)
+.set_attr<nnvm::FListInputNames>("FListInputNames",
+	[](const NodeAttrs& attrs) {
+	const SliceSplitEmbeddingConcatFuseParam& params = nnvm::get<SliceSplitEmbeddingConcatFuseParam>(attrs.parsed);
+	std::vector<std::string> ret;
+	ret.push_back(std::string("dns_data"));
+	for (int i = 0; i < params.num_outputs; ++i) {
+		ret.push_back(std::string("embed_") + std::to_string(i) + std::string("_weight"));
+	}
+	return ret;
+})
+.set_attr<nnvm::FListOutputNames>("FListOutputNames",
+	[](const NodeAttrs& attrs) {
+	std::vector<std::string> ret = { "out_data" };
+	return ret;
+})
+.set_attr<std::string>("key_var_num_args", "num_outputs")
+.set_attr<nnvm::FInferShape>("FInferShape", SliceSplitEmbeddingConcatOpShape)
+.set_attr<nnvm::FInferType>("FInferType", SliceSplitEmbeddingConcatOpType)
+.set_attr<FInferStorageType>("FInferStorageType", SliceSplitEmbeddingConcatOpStorageType)
+.set_attr<FResourceRequest>("FResourceRequest",
+	[](const NodeAttrs& attrs) {
+	return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
+})
+.set_attr<FCompute>("FCompute<cpu>", SliceSplitEmbeddingConcatOpForward<cpu>)
+.set_attr<FComputeEx>("FComputeEx<cpu>", SliceSplitEmbeddingConcatOpForwardEx<cpu>)
+.add_argument("data_weight", "NDArray-or-Symbol[]", "List of arrays (data/weight) to embedding weight.")
+.add_arguments(SliceSplitEmbeddingConcatFuseParam::__FIELDS__());
+
+}  // namespace op
+}  // namespace mxnet
diff --git a/src/operator/tensor/mkldnn/mkldnn_slice_split_embedding.h b/src/operator/tensor/mkldnn/mkldnn_slice_split_embedding.h
new file mode 100644
index 0000000..ad2f25d
--- /dev/null
+++ b/src/operator/tensor/mkldnn/mkldnn_slice_split_embedding.h
@@ -0,0 +1,50 @@
+#ifndef MXNET_OPERATOR_TENSOR_MKLDNN_SLICE_SPLIT_EMBEDDING_OP_H_
+#define MXNET_OPERATOR_TENSOR_MKLDNN_SLICE_SPLIT_EMBEDDING_OP_H_
+
+#include <mxnet/operator_util.h>
+#include <vector>
+#include <utility>
+#include <algorithm>
+namespace mxnet {
+namespace op {
+
+//SliceOpForward operator\tensor\matrix_op-inl.h
+//SliceChannelOp slice_channel-inl.h =>split
+
+struct SliceSplitEmbeddingConcatFuseParam : public dmlc::Parameter<SliceSplitEmbeddingConcatFuseParam> {
+  // Slice ranges, modelled on SliceParam; a step is not supported.
+  // Per the original author, only the kWriteTo request type is supported.
+  nnvm::Tuple<dmlc::optional<int>> cont_begin, cont_end;
+  nnvm::Tuple<dmlc::optional<int>> embed_begin, embed_end;
+  // Split settings, modelled on SliceChannelParam; the split axis is not configurable.
+  int num_outputs;
+  bool squeeze_axis;
+  // Per-embedding sizes, modelled on EmbeddingParam; no sparse_grad, dtype is float.
+  nnvm::Tuple<int> input_dims;
+  nnvm::Tuple<int> output_dims;
+  // Axis along which the slice and the embedding outputs are concatenated.
+  int concat_dim;
+  DMLC_DECLARE_PARAMETER(SliceSplitEmbeddingConcatFuseParam) {
+    DMLC_DECLARE_FIELD(cont_begin)
+    .describe("starting indices for the slice operation, just copy to final buffer");
+    DMLC_DECLARE_FIELD(cont_end)
+    .describe("ending indices for the slice operation, just copy to final buffer");  
+	DMLC_DECLARE_FIELD(embed_begin)
+		.describe("starting indices for the slice operation, input to split");
+	DMLC_DECLARE_FIELD(embed_end)
+		.describe("ending indices for the slice operation, input to split");
+	DMLC_DECLARE_FIELD(num_outputs).set_lower_bound(1)
+		.describe("Number of splits. Note that this should evenly divide the length of the `axis`.");	
+	DMLC_DECLARE_FIELD(squeeze_axis).set_default(0);
+	DMLC_DECLARE_FIELD(input_dims)
+		.describe("Vocabulary size of the input indices.");
+	DMLC_DECLARE_FIELD(output_dims)
+		.describe("Dimension of the embedding vectors.");
+	DMLC_DECLARE_FIELD(concat_dim).set_default(1)
+		.describe("the dimension to be concated.");
+  }
+};
+}  // namespace op
+}  // namespace mxnet
+
+#endif //MXNET_OPERATOR_TENSOR_MKLDNN_SLICE_SPLIT_EMBEDDING_OP_H_
\ No newline at end of file
diff --git a/tests/python/unittest/test_module.py b/tests/python/unittest/test_module.py
index 7347723..ce6d037 100644
--- a/tests/python/unittest/test_module.py
+++ b/tests/python/unittest/test_module.py
@@ -433,13 +433,13 @@ def test_monitor():
     res = mon.toc()
     keys = ['act_0', 'act_1', 'data', 'fc_0', 'fc_1', 'softmax']
     mon_result_counts = [0, 0, 0, 0, 0, 0]
-    assert(len(res) == 21)
+    assert(len(res) == 45)
     for n, k, v in res:
         for idx, key in enumerate(keys):
             if k.startswith(key):
                 mon_result_counts[idx] += 1
                 break
-    assert(mon_result_counts == [2, 2, 1, 6, 6, 4])
+    assert(mon_result_counts == [6, 6, 1, 12, 12, 8])
 
 @with_seed()
 def test_executor_group():