diff --git a/.gitignore b/.gitignore
index 796f2a7c355..3ef13efe3ba 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,5 @@ project/plugins/project/
 
 # other
 *.txt
+# vim swap file
+*.swp
diff --git a/dl/src/main/scala/com/intel/analytics/sparkdl/example/ImageNetLocal.scala b/dl/src/main/scala/com/intel/analytics/sparkdl/example/ImageNetLocal.scala
index c58c9e9b563..dbfd76fed72 100644
--- a/dl/src/main/scala/com/intel/analytics/sparkdl/example/ImageNetLocal.scala
+++ b/dl/src/main/scala/com/intel/analytics/sparkdl/example/ImageNetLocal.scala
@@ -21,7 +21,7 @@ import java.awt.color.ColorSpace
 import java.util
 
 import com.intel.analytics.sparkdl.nn.ClassNLLCriterion
-import com.intel.analytics.sparkdl.optim.SGD
+import com.intel.analytics.sparkdl.optim.{EvaluateMethods, SGD}
 import com.intel.analytics.sparkdl.tensor.Tensor
 import com.intel.analytics.sparkdl.utils.{File, T}
 
@@ -49,160 +49,9 @@ object ImageNetLocal {
     println(s"[${(System.nanoTime() - startTime) / 1e9}s] $msg")
   }
 
-  def runDouble(donkey: Donkey, dataSet: DataSets, netType: String, classNum: Int,
+  def run(donkey: Donkey, dataSet: DataSets, netType: String, classNum: Int,
     labelsMap: Map[String, Double], testInterval: Int, donkeyVal: Donkey,
-    dataSetVal: DataSets, batchSize: Int): Unit = {
-    // Compute Mean on amount of samples
-    val samples = 10000
-    log(s"Start to calculate Mean on $samples samples")
-    var (meanR, meanG, meanB) = Array.tabulate(samples)(n => {
-      print(".")
-      val data = donkey.pull
-      dataSet.post(data._2)
-      ImageNetUtils.computeMean(data._1, data._2.dataOffset)
-    }).reduce((a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3))
-    meanR /= samples
-    meanG /= samples
-    meanB /= samples
-    println()
-
-    // Compute std on amount of samples
-    log(s"Start to calculate std on $samples samples")
-    var (varR, varG, varB) = Array.tabulate(samples)(n => {
-      print(".")
-      val data = donkey.pull
-      dataSet.post(data._2)
-      ImageNetUtils.computeVar(data._1, meanR, meanG, meanB, data._2.dataOffset)
-    }).reduce((a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3))
-    varR /= samples
-    varG /= samples
-    varB /= samples
-
-    val model = netType match {
-      case "alexnet" => AlexNet.getModel[Double](classNum)
-      case "googlenet" => GoogleNet.getModel[Double](classNum)
-      case "googlenet-bn" => GoogleNet.getModel[Double](classNum, "googlenet-bn")
-      case "googlenet-cf" => GoogleNet.getModelCaffe[Double](classNum)
-      case _ => throw new IllegalArgumentException
-    }
-    val (weights, grad) = model.getParameters()
-    println(s"modelsize ${weights.nElement()}")
-    println(model)
-    val criterion = new ClassNLLCriterion[Double]()
-    val epochNum = 90
-    val featureShape = Array(3, 224, 224)
-    val targetShape = Array(1)
-    val sgd = new SGD[Double]
-    val state = T("momentum" -> 0.9, "dampening" -> 0.0)
-    val stageImgs = new util.ArrayDeque[Image](batchSize)
-    val input = Tensor[Double](batchSize, 3, 224, 224)
-    val target = Tensor[Double](batchSize)
-    val iter = ImageNetUtils.toTensorDouble(
-      donkey.map(d => {
-        stageImgs.push(d._2)
-        (labelsMap(d._2.label), d._1)
-      }),
-      featureShape,
-      targetShape,
-      batchSize,
-      (meanR, meanG, meanB),
-      (varR, varG, varB),
-      input,
-      target
-    )
-
-    val stageImgsVal = new util.ArrayDeque[Image](batchSize)
-    val iterVal = ImageNetUtils.toTensorDouble(
-      donkeyVal.map(d => {
-        stageImgsVal.push(d._2)
-        (labelsMap(d._2.label), d._1)
-      }),
-      featureShape,
-      targetShape,
-      batchSize,
-      (meanR, meanG, meanB),
-      (varR, varG, varB),
-      input,
-      target
-    )
-
-    log(s"meanR is $meanR meanG is $meanG meanB is $meanB")
-    log(s"varR is $varR varG is $varG varB is $varB")
-    log("Start to train...")
-
-    var wallClockTime = 0L
-    for (i <- 1 to epochNum) {
-      println(s"Epoch[$i] Train")
-
-      for (regime <- regimes(netType)) {
-        if (i >= regime._1 && i <= regime._2) {
-          state("learningRate") = regime._3
-          state("weightDecay") = regime._4
-        }
-      }
-
-      var j = 0
-      var c = 0
-      model.training()
-      while (j < dataSet.getTotal) {
-        val start = System.nanoTime()
-        val (input, target) = iter.next()
-        val readImgTime = System.nanoTime()
-        model.zeroGradParameters()
-        val output = model.forward(input)
-        val loss = criterion.forward(output, target)
-        val gradOutput = criterion.backward(output, target)
-        model.backward(input, gradOutput)
-        sgd.optimize(_ => (loss, grad), weights, state, state)
-        val end = System.nanoTime()
-        wallClockTime += end - start
-        log(s"Epoch[$i][Iteration $c $j/${dataSet.getTotal}][Wall Clock ${wallClockTime / 1e9}s]" +
-          s" loss is $loss time ${(end - start) / 1e9}s read " +
-          s"time ${(readImgTime - start) / 1e9}s train time ${(end - readImgTime) / 1e9}s." +
-          s" Throughput is ${input.size(1).toDouble / (end - start) * 1e9} img / second")
-        while (!stageImgs.isEmpty) {
-          dataSet.post(stageImgs.poll())
-        }
-        j += input.size(1)
-        c += 1
-      }
-
-      if (i % testInterval == 0) {
-        model.evaluate()
-        var correct = 0
-        var k = 0
-        while (k < dataSetVal.getTotal) {
-          val (input, target) = iterVal.next()
-          val output = model.forward(input)
-          output.max(2)._2.squeeze().map(target, (a, b) => {
-            if (a == b) {
-              correct += 1
-            }
-            a
-          })
-          while (!stageImgsVal.isEmpty) {
-            dataSetVal.post(stageImgsVal.poll())
-          }
-          k += input.size(1)
-        }
-
-        val accuracy = correct.toDouble / dataSetVal.getTotal
-        println(s"[Wall Clock ${wallClockTime / 1e9}s] Accuracy is $accuracy")
-
-        // Save model to a file each epoch
-        File.save(model, s"${netType}${accuracy}.model${i}", true)
-        File.save(state, s"${netType}${accuracy}.state${i}", true)
-      }
-
-      log("shuffle")
-      dataSet.shuffle
-      log("shuffle end")
-    }
-  }
-
-  def runFloat(donkey: Donkey, dataSet: DataSets, netType: String, classNum: Int,
-    labelsMap: Map[String, Double], testInterval: Int, donkeyVal: Donkey,
-    dataSetVal: DataSets, batchSize: Int): Unit = {
+    dataSetVal: DataSets, batchSize: Int, modelPath : String): Unit = {
     // Compute Mean on amount of samples
     val samples = 10000
     log(s"Start to calculate Mean on $samples samples")
@@ -327,25 +176,27 @@ object ImageNetLocal {
 
       if (i % testInterval == 0) {
         model.evaluate()
-        var correct = 0
+        var top1Correct = 0
+        var top5Correct = 0
         var k = 0
         while (k < dataSetVal.getTotal) {
           val (input, target) = iterVal.next()
           val output = model.forward(input)
-          output.max(2)._2.squeeze().map(target, (a, b) => {
-            if (a == b) {
-              correct += 1
-            }
-            a
-          })
+          top1Correct += EvaluateMethods.calcAccuracy(output, target)._1
+          top5Correct += EvaluateMethods.calcTop5Accuracy(output, target)._1
           while (!stageImgsVal.isEmpty) {
             dataSetVal.post(stageImgsVal.poll())
           }
           k += input.size(1)
         }
 
-        val accuracy = correct.toDouble / dataSetVal.getTotal
-        println(s"[Wall Clock ${wallClockTime / 1e9}s] Accuracy is $accuracy")
+        val top1Accuracy = top1Correct.toDouble / dataSetVal.getTotal
+        val top5Accuracy = top5Correct.toDouble / dataSetVal.getTotal
+        println(s"[Wall Clock ${wallClockTime / 1e9}s] Top-1 Accuracy is $top1Accuracy")
+        println(s"[Wall Clock ${wallClockTime / 1e9}s] Top-5 Accuracy is $top5Accuracy")
+        println(s"Save model and state to $modelPath-$i")
+        File.save(model, modelPath + s"-$i.model")
+        File.save(state, modelPath + s"-$i.state")
       }
 
       log("shuffle")
@@ -371,8 +222,8 @@ object ImageNetLocal {
     val testInterval = args(4).toInt
     val netType = args(5)
     val classNum = args(6).toInt
-    val dataType = args(7)
-    val batchSize = args(8).toInt
+    val batchSize = args(7).toInt
+    val modelPath = args(8)
 
     val dataSet = new DataSets(path, classNum, labelsMap)
     val donkey = new Donkey(parallelism, dataSet)
@@ -383,12 +234,7 @@ object ImageNetLocal {
     dataSet.shuffle
     log("shuffle end")
 
-    dataType match {
-      case "double" => runDouble(donkey, dataSet, netType, classNum, labelsMap, testInterval,
-        donkeyVal, dataSetVal, batchSize)
-      case "float" => runFloat(donkey, dataSet, netType, classNum, labelsMap, testInterval,
-        donkeyVal, dataSetVal, batchSize)
-      case _ => throw new IllegalArgumentException
-    }
+    run(donkey, dataSet, netType, classNum, labelsMap, testInterval,
+      donkeyVal, dataSetVal, batchSize, modelPath)
   }
 }
diff --git a/dl/src/main/scala/com/intel/analytics/sparkdl/models/GoogleNet.scala b/dl/src/main/scala/com/intel/analytics/sparkdl/models/GoogleNet.scala
index 12c1a41f100..cec63aefce5 100644
--- a/dl/src/main/scala/com/intel/analytics/sparkdl/models/GoogleNet.scala
+++ b/dl/src/main/scala/com/intel/analytics/sparkdl/models/GoogleNet.scala
@@ -232,7 +232,7 @@ object GoogleNet_v2 {
 
     val conv3 = new Sequential[D]
     conv3.add(new SpatialConvolution[D](inputSize, config[Table](2)(1), 1, 1, 1, 1)
-      .setName(namePrefix + "3x3_s2"))
+      .setName(namePrefix + "3x3_reduce"))
     conv3.add(new SpatialBatchNormalization(config[Table](2)(1), 1e-3)
       .setName(namePrefix + "3x3_reduce/bn"))
     conv3.add(new ReLU[D](true). setName(namePrefix + "3x3_reduce/bn/sc/relu"))
diff --git a/dl/src/main/scala/com/intel/analytics/sparkdl/models/Perf.scala b/dl/src/main/scala/com/intel/analytics/sparkdl/models/Perf.scala
index d6be3bdb702..6191e890b2a 100644
--- a/dl/src/main/scala/com/intel/analytics/sparkdl/models/Perf.scala
+++ b/dl/src/main/scala/com/intel/analytics/sparkdl/models/Perf.scala
@@ -79,7 +79,7 @@ object Perf {
 
   def performance[T: ClassTag](param: Params)(implicit tn: TensorNumeric[T]): Unit = {
     val (model, input) = param.module match {
-      case "alexnet" => (AlexNet(1000), Tensor[T](param.batchSize, 3, 224, 224))
+      case "alexnet" => (AlexNet(1000), Tensor[T](param.batchSize, 3, 227, 227))
       case "alexnetowt" => (AlexNet_OWT(1000), Tensor[T](param.batchSize, 3, 224, 224))
       case "googlenet_v1" => (GoogleNet_v1(1000), Tensor[T](param.batchSize, 3, 224, 224))
       case "googlenet_v2" => (GoogleNet_v2(1000), Tensor[T](param.batchSize, 3, 224, 224))
@@ -139,8 +139,6 @@ object Perf {
   }
 }
 
-case class TestCase[T](input: Tensor[T], target: Tensor[T], model: Module[T])
-
 case class Params(
   batchSize: Int = 128,
   iteration: Int = 10,
diff --git a/dl/src/main/scala/com/intel/analytics/sparkdl/nn/Container.scala b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/Container.scala
index 40b73ac80be..a90cf9b0187 100644
--- a/dl/src/main/scala/com/intel/analytics/sparkdl/nn/Container.scala
+++ b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/Container.scala
@@ -19,6 +19,7 @@ package com.intel.analytics.sparkdl.nn
 
 import com.intel.analytics.sparkdl.tensor.Tensor
 import com.intel.analytics.sparkdl.tensor.TensorNumericMath.TensorNumeric
+import com.intel.analytics.sparkdl.mkl.MKL
 
 import scala.collection.mutable.ArrayBuffer
 import scala.reflect.ClassTag
@@ -93,4 +94,25 @@ private[nn] abstract class Container[@specialized(Float, Double) T: ClassTag](
     })
     (result, offset, newIndexes)
   }
+
+  // Links every MKL child to the MKL child directly before it and lets each non-MKL
+  // child run its own initMkl.
+  override def initMkl(): Unit = {
+    // MKL-backed layers are recognised by the "mkl." prefix of their toString.
+    def containMkl(module: Module[T]): Boolean = module.toString.startsWith("mkl.")
+
+    for (i <- modules.indices) {
+      if (!containMkl(modules(i))) {
+        modules(i).initMkl()
+      } else if (i >= 1 && containMkl(modules(i - 1))) {
+        ev.getType() match {
+          case "Float" =>
+            MKL.SetPrevFloat(modules(i - 1).getClassPtr(), modules(i).getClassPtr())
+          case "Double" =>
+            MKL.SetPrevDouble(modules(i - 1).getClassPtr(), modules(i).getClassPtr())
+          case _ => throw new UnsupportedOperationException(s"Only Float/Double supported")
+        }
+      }
+    }
+  }
 }
diff --git a/dl/src/main/scala/com/intel/analytics/sparkdl/nn/Module.scala b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/Module.scala
index 026cc3e3b69..ebe61457f38 100644
--- a/dl/src/main/scala/com/intel/analytics/sparkdl/nn/Module.scala
+++ b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/Module.scala
@@ -43,6 +43,17 @@ abstract class Module[T: ClassTag](implicit ev: TensorNumeric[T]) extends Serial
     if (this.name == null) this.toString else this.name
   }
 
+  // Flag toggled through setNeedComputeBack; starts out true.
+  private var needComputeBack = true
+
+  // Updates the flag and returns this module so calls can be chained.
+  def setNeedComputeBack(need: Boolean): this.type = {
+    needComputeBack = need
+    this
+  }
+  // Reads the flag back.
+  def isNeedComputeBack(): Boolean = needComputeBack
+
   // list of sub modules
   val modules: ArrayBuffer[Module[T]] = ArrayBuffer[Module[T]]()
 
@@ -199,6 +210,10 @@ abstract class Module[T: ClassTag](implicit ev: TensorNumeric[T]) extends Serial
   def cloneModule(): Module[T] = {
     SerializationUtils.clone(this)
   }
+
+  // Support for mkl init: no-op defaults that MKL-aware modules override.
+  def getClassPtr(): Long = 0L
+  def initMkl(): Unit = {}
 }
 
 object Module {
diff --git a/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/BatchNormalization.scala b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/BatchNormalization.scala
new file mode 100644
index 00000000000..6eebabdc02c
--- /dev/null
+++ b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/BatchNormalization.scala
@@ -0,0 +1,220 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.analytics.sparkdl.nn.mkl
+
+import com.intel.analytics.sparkdl.tensor.TensorNumericMath.TensorNumeric
+import com.intel.analytics.sparkdl.tensor.Tensor
+import com.intel.analytics.sparkdl.utils.RandomGenerator._
+import com.intel.analytics.sparkdl.nn.Module
+import com.intel.analytics.sparkdl.mkl.MKL
+
+import scala.language.implicitConversions
+
+import scala.reflect.ClassTag
+
+/**
+ * Spatial batch normalization backed by the native MKL library.
+ *
+ * The native layer is created on the first forward pass only and is addressed
+ * afterwards through the handle returned by `getClassPtr()`.
+ */
+class SpatialBatchNormalization[@specialized(Float, Double) T: ClassTag](
+    val nOutput: Int,
+    val eps: Double = 1e-5,
+    val momentum: Double = 0.1,
+    val affine: Boolean = true)(implicit ev: TensorNumeric[T])
+    extends Module[T] {
+
+  require(nOutput > 0,
+          "To set affine=false call SpatialBatchNormalization(nFeature,  eps, momentum, false)")
+
+  val nDim = 2
+  val runningMean = Tensor[T](nOutput)
+  val runningVar = Tensor[T](nOutput).fill(ev.fromType[Int](1))
+  val saveMean = Tensor[T](nOutput)
+  val saveStd = Tensor[T](nOutput).fill(ev.fromType[Int](1))
+
+  private var classPtr = 0L
+  private var firstPass = true
+
+  override def getClassPtr(): Long = classPtr
+
+  val weight: Tensor[T] = if (affine) Tensor[T](nOutput) else null
+  val bias: Tensor[T] = if (affine) Tensor[T](nOutput) else null
+  gradWeight = if (affine) Tensor[T](nOutput) else null
+  gradBias = if (affine) Tensor[T](nOutput) else null
+
+  val useWeight: Boolean = weight != null
+  val useBias: Boolean = bias != null
+
+  if (affine) {
+    reset()
+  }
+
+  // Redraws weight from RNG.uniform(0, 1), zeroes bias and resets the running statistics.
+  override def reset(): Unit = {
+    if (null != weight) {
+      weight.apply1(_ => ev.fromType[Double](RNG.uniform(0, 1)))
+    }
+
+    if (null != bias) {
+      bias.fill(ev.fromType[Int](0))
+    }
+
+    runningMean.zero()
+    runningVar.fill(ev.fromType[Int](1))
+  }
+
+  // Requires an nDim-D input whose 2nd dimension equals nOutput; not called inside this class.
+  def checkInputDim(input: Tensor[T]): Unit = {
+    require(input.dim() == nDim,
+            s"only mini-batch supported (${nDim}D tensor), got ${input.dim()}D tensor instead")
+    require(input.size(2) == runningMean.nElement(),
+            s"got ${input.size(2)}-feature tensor, expected ${runningMean.nElement()}")
+  }
+
+  // Forward pass; NOTE(review): weight and bias are dereferenced, so affine = false will NPE.
+  override def updateOutput(input: Tensor[T]): Tensor[T] = {
+    output.resizeAs(input)
+
+    val inputOffset = input.storageOffset() - 1
+    val outputOffset = output.storageOffset() - 1
+
+    val inputWidth = input.size(input.dim())
+    val inputHeight = input.size(input.dim() - 1)
+    val inputChannel = if (input.dim() <= 2) 1 else input.size(input.dim() - 2)
+    val inputNumber = if (input.dim() <= 3) 1 else input.size(input.dim() - 3)
+    // TODO we may set input.size(input.dim() - 3) == 1 if input.dim() == 3
+
+    val kernelOffset = weight.storageOffset() - 1
+    val biasOffset = bias.storageOffset() - 1
+
+    implicit def bool2int(b: Boolean) = if (b) 1 else 0
+    if (firstPass) {
+      ev.getType() match {
+        case "Float" =>
+          classPtr = MKL.BatchNormInitFloat(inputNumber,
+                                            inputChannel,
+                                            inputHeight,
+                                            inputWidth,
+                                            eps,
+                                            useWeight,
+                                            useBias,
+                                            4)
+        case "Double" =>
+          classPtr = MKL.BatchNormInitDouble(inputNumber,
+                                             inputChannel,
+                                             inputHeight,
+                                             inputWidth,
+                                             eps,
+                                             useWeight,
+                                             useBias,
+                                             4)
+        case _ =>
+          throw new UnsupportedOperationException(s"Only Float/Double supported")
+      }
+      firstPass = false
+    }
+
+    ev.getType() match {
+      case "Float" =>
+        MKL.BatchNormForwardFloat(input.storage().array().asInstanceOf[Array[Float]],
+                                  inputOffset,
+                                  output.storage().array().asInstanceOf[Array[Float]],
+                                  outputOffset,
+                                  weight.storage().array().asInstanceOf[Array[Float]],
+                                  kernelOffset,
+                                  bias.storage().array().asInstanceOf[Array[Float]],
+                                  biasOffset,
+                                  classPtr)
+      case "Double" =>
+        MKL.BatchNormForwardDouble(input.storage().array().asInstanceOf[Array[Double]],
+                                   inputOffset,
+                                   output.storage().array().asInstanceOf[Array[Double]],
+                                   outputOffset,
+                                   weight.storage().array().asInstanceOf[Array[Double]],
+                                   kernelOffset,
+                                   bias.storage().array().asInstanceOf[Array[Double]],
+                                   biasOffset,
+                                   classPtr)
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+    output
+  }
+
+  // Backward pass; gradWeight/gradBias go to the native call, which presumably fills them.
+  override def updateGradInput(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = {
+    gradInput.resizeAs(input)
+
+    val inputOffset = input.storageOffset() - 1
+
+    val kernelDiffOffset = gradWeight.storageOffset() - 1
+    val biasDiffOffset = gradBias.storageOffset() - 1
+
+    val gradOutputOffset = gradOutput.storageOffset() - 1
+    val gradInputOffset = gradInput.storageOffset() - 1
+
+    ev.getType() match {
+      case "Float" =>
+        MKL.BatchNormBackwardFloat(input.storage().array().asInstanceOf[Array[Float]],
+                                   inputOffset,
+                                   gradOutput.storage().array().asInstanceOf[Array[Float]],
+                                   gradOutputOffset,
+                                   gradInput.storage().array().asInstanceOf[Array[Float]],
+                                   gradInputOffset,
+                                   gradWeight.storage().array().asInstanceOf[Array[Float]],
+                                   kernelDiffOffset,
+                                   gradBias.storage().array().asInstanceOf[Array[Float]],
+                                   biasDiffOffset,
+                                   classPtr)
+      case "Double" =>
+        MKL.BatchNormBackwardDouble(input.storage().array().asInstanceOf[Array[Double]],
+                                    inputOffset,
+                                    gradOutput.storage().array().asInstanceOf[Array[Double]],
+                                    gradOutputOffset,
+                                    gradInput.storage().array().asInstanceOf[Array[Double]],
+                                    gradInputOffset,
+                                    gradWeight.storage().array().asInstanceOf[Array[Double]],
+                                    kernelDiffOffset,
+                                    gradBias.storage().array().asInstanceOf[Array[Double]],
+                                    biasDiffOffset,
+                                    classPtr)
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+
+    gradInput
+  }
+
+  override def accGradParameters(input: Tensor[T], gradOutput: Tensor[T], scale: Double): Unit = {}
+
+  // Clears both gradient buffers; NOTE(review): they are null when affine = false.
+  override def zeroGradParameters(): Unit = {
+    gradWeight.zero()
+    gradBias.zero()
+  }
+
+  override def parameters(): (Array[Tensor[T]], Array[Tensor[T]]) = {
+    (Array(this.weight, this.bias), Array(this.gradWeight, this.gradBias))
+  }
+
+  override def toString(): String = {
+    s"mkl.BatchNormalization[${ev.getType()}]($nOutput, $eps, $momentum, $affine)"
+  }
+}
diff --git a/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/Concat.scala b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/Concat.scala
new file mode 100644
index 00000000000..5ec16d1026f
--- /dev/null
+++ b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/Concat.scala
@@ -0,0 +1,305 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * ATTENTION: MKL version. The start and end layer must be MKL version too.
+ *            Currently, it supports BatchNormalization, Linear, LRN, Pooling(Avg, Max),
+ *            ReLU and SpatialConvolution.
+ */
+
+package com.intel.analytics.sparkdl.nn.mkl
+
+import com.intel.analytics.sparkdl.nn.{Container, Module}
+import com.intel.analytics.sparkdl.tensor.Tensor
+import com.intel.analytics.sparkdl.tensor.TensorNumericMath.TensorNumeric
+import com.intel.analytics.sparkdl.mkl.MKL
+
+import scala.reflect.ClassTag
+
+/**
+ * Container that feeds the same input to every child module and joins their outputs
+ * along the 1-based `dimension` through the native MKL concat layer.
+ */
+class Concat[T: ClassTag](val dimension: Int)(implicit ev: TensorNumeric[T]) extends Container[T] {
+
+  private var size: Array[Int] = null
+  private var gradouts: Array[Tensor[T]] = null
+
+  var concatPtr : Long = 0L
+  var concat1Pass: Boolean = true
+
+  var sumPtr : Long = 0L
+  var sum1Pass : Boolean = true
+
+  // Handle of the native concat layer; 0 until the first forward pass has run.
+  override def getClassPtr(): Long = concatPtr
+
+  // Size of the most recent concatenated output, or null before the first forward pass.
+  def getSize(): Array[Int] = size
+
+  // Runs every child on `input` and concatenates the results natively. The native layer
+  // is built from the child output sizes on the first call only.
+  override def updateOutput(input: Tensor[T]): Tensor[T] = {
+    // TODO should check the size of every tensor. It must be same as the first tensor
+    val outs = new Array[Tensor[T]](this.modules.length)
+    var i = 0
+    while (i < this.modules.length) {
+      val currentOutput = this.modules(i).updateOutput(input)
+      outs(i) = currentOutput
+      if (i == 0) {
+        this.size = currentOutput.size()
+      } else {
+        this.size(this.dimension - 1) += currentOutput.size(this.dimension)
+      }
+      i += 1
+    }
+
+    this.output.resize(this.size)
+    // TODO call mkl native code to update output
+    // TODO dimension here is different with "dimension" in MKL 2017
+    // TODO check all dimensions of input tensors are same
+    if (concat1Pass) {
+      val nDimension = outs(0).nDimension()
+      val inputSize: Array[Int] = new Array[Int](this.modules.length * nDimension)
+
+      for (i <- 0 until this.modules.length) {
+        for (j <- 0 until nDimension) {
+          inputSize(i * nDimension + j) = outs(i).size(nDimension - j)
+        }
+      }
+
+      ev.getType() match {
+        case "Double" =>
+          concatPtr = MKL.ConcatInitDouble(this.modules.length, nDimension, inputSize)
+        case "Float" =>
+          concatPtr = MKL.ConcatInitFloat(this.modules.length, nDimension, inputSize)
+        case _ =>
+          throw new UnsupportedOperationException(s"Only Float/Double supported")
+      }
+      concat1Pass = false
+    }
+
+    // get all of the tensors in outs to float/double array
+    val inputs: Array[Array[T]] = new Array[Array[T]](this.modules.length)
+    val inputsOffset: Array[Int] = new Array[Int](this.modules.length)
+    for (i <- 0 until this.modules.length) {
+      inputs(i) = outs(i).storage().array()
+      inputsOffset(i) = outs(i).storageOffset() - 1
+    }
+
+    ev.getType() match {
+      case "Double" =>
+        MKL.ConcatForwardDouble(inputs.asInstanceOf[Array[Array[Double]]],
+                                inputsOffset,
+                                output.storage().array().asInstanceOf[Array[Double]],
+                                output.storageOffset() - 1,
+                                concatPtr)
+      case "Float" =>
+        MKL.ConcatForwardFloat(inputs.asInstanceOf[Array[Array[Float]]],
+                               inputsOffset,
+                               output.storage().array().asInstanceOf[Array[Float]],
+                               output.storageOffset() - 1,
+                               concatPtr)
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+
+    this.output
+  }
+
+  // TODO should we implement this function, what's the difference from @backward
+  // NOTE: currently a stub that returns gradInput untouched; the reference logic is kept below.
+  override def updateGradInput(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = {
+//    this.gradInput.resizeAs(input)
+//
+//    var offset = 1
+//    var i = 0
+//    while (i < this.modules.length) {
+//      val currentOutput = this.modules(i).output
+//      val currentGradInput = this.modules(i).updateGradInput(input,
+//        gradOutput.narrow(dimension, offset, currentOutput.size(dimension)))
+//
+//      if (currentGradInput != null) {
+//        if (i == 0) {
+//          this.gradInput.copy(currentGradInput)
+//        } else {
+//          this.gradInput.add(currentGradInput)
+//        }
+//      }
+//      i += 1
+//      offset += currentOutput.size(dimension)
+//    }
+
+    this.gradInput
+  }
+
+  // Splits `gradOutput` into one gradient per child natively, back-propagates each child,
+  // and combines the resulting input gradients into `gradInput` with the native sum layer.
+  override def backward(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = {
+    // TODO call mkl native code to update gradient input
+    this.gradInput.resizeAs(input)
+    if (gradouts == null || gradouts.length != this.modules.length) {
+      gradouts = new Array[Tensor[T]](this.modules.length)
+    }
+    val gradOutputs: Array[Array[T]] = new Array[Array[T]](this.modules.length)
+    val gradOutputsOffset: Array[Int] = new Array[Int](this.modules.length)
+    for (i <- 0 until this.modules.length) {
+      if (gradouts(i) == null) gradouts(i) = Tensor()
+      gradouts(i).resizeAs(this.modules(i).output)
+      gradOutputs(i) = gradouts(i).storage().array()
+      gradOutputsOffset(i) = gradouts(i).storageOffset() - 1
+    }
+
+    ev.getType() match {
+      case "Double" =>
+        MKL.ConcatBackwardDouble(gradOutputs.asInstanceOf[Array[Array[Double]]],
+                                 gradOutputsOffset,
+                                 gradOutput.storage().array().asInstanceOf[Array[Double]],
+                                 gradOutput.storageOffset() - 1,
+                                 concatPtr)
+      case "Float" =>
+        MKL.ConcatBackwardFloat(gradOutputs.asInstanceOf[Array[Array[Float]]],
+                                gradOutputsOffset,
+                                gradOutput.storage().array().asInstanceOf[Array[Float]],
+                                gradOutput.storageOffset() - 1,
+                                concatPtr)
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float / Double is supported")
+    }
+
+    val tmpGradInputs : Array[Tensor[T]] = new Array[Tensor[T]](this.modules.length)
+
+    for (i <- 0 until this.modules.length) {
+      tmpGradInputs(i) = this.modules(i).backward(input, gradouts(i))
+    }
+
+    // Every child gradient has the size of the shared input, so they are combined with
+    // the native sum layer below rather than with a concat.
+    // TODO convert to eltwise
+
+    val subGradInputs: Array[Array[T]] = new Array[Array[T]](this.modules.length)
+    val subGradInputsOffset: Array[Int] = new Array[Int](this.modules.length)
+    for (i <- 0 until this.modules.length) {
+      subGradInputs(i) = tmpGradInputs(i).storage().array()
+      subGradInputsOffset(i) = tmpGradInputs(i).storageOffset() - 1
+    }
+
+    if (sum1Pass) {
+      val nDimension = tmpGradInputs(0).nDimension()
+      val subGradInputSize: Array[Int] = new Array[Int](this.modules.length * nDimension)
+
+      for (i <- 0 until this.modules.length) {
+        for (j <- 0 until nDimension) {
+          subGradInputSize(i * nDimension + j) = tmpGradInputs(i).size(nDimension - j)
+        }
+      }
+
+      ev.getType() match {
+        case "Double" =>
+          sumPtr = MKL.SumInitDouble(this.modules.length, nDimension, subGradInputSize)
+        case "Float" =>
+          sumPtr = MKL.SumInitFloat(this.modules.length, nDimension, subGradInputSize)
+        case _ =>
+          throw new UnsupportedOperationException(s"Only Float/Double supported")
+      }
+      sum1Pass = false
+    }
+
+    ev.getType() match {
+      case "Double" =>
+        MKL.SumForwardDouble(subGradInputs.asInstanceOf[Array[Array[Double]]],
+                             subGradInputsOffset,
+                             gradInput.storage().array().asInstanceOf[Array[Double]],
+                             gradInput.storageOffset() - 1,
+                             sumPtr)
+      case "Float" =>
+        MKL.SumForwardFloat(subGradInputs.asInstanceOf[Array[Array[Float]]],
+                            subGradInputsOffset,
+                            gradInput.storage().array().asInstanceOf[Array[Float]],
+                            gradInput.storageOffset() - 1,
+                            sumPtr)
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+
+    this.gradInput
+  }
+
+  // Equal when the parent agrees, the dimension matches and all children are pairwise equal.
+  override def equals(obj: Any): Boolean = {
+    if (!super.equals(obj)) {
+      return false
+    }
+
+    if (!obj.isInstanceOf[Concat[T]]) {
+      return false
+    }
+    val other = obj.asInstanceOf[Concat[T]]
+    if (this.eq(other)) {
+      return true
+    }
+    if (dimension != other.dimension) {
+      return false
+    }
+
+    if (this.modules.length != other.modules.length) {
+      return false
+    }
+
+    val moduleLength = modules.length
+    var i = 0
+    while (i < moduleLength) {
+      if (modules(i) != other.modules(i)) {
+        return false
+      }
+      i += 1
+    }
+
+    true
+  }
+
+  // Folds every child's hash code into the parent's hash, using 37 as the multiplier.
+  override def hashCode(): Int = {
+    val seed = 37
+    var hash = super.hashCode()
+    var i = 0
+    val moduleLength = modules.length
+    while (i < moduleLength) {
+      hash = hash * seed + modules(i).hashCode()
+      i += 1
+    }
+
+    hash
+  }
+
+  // Renders the container as a tree with one numbered entry per child branch.
+  override def toString(): String = {
+    val tab = "  "
+    val next = "  |`-> "
+    val last = "   ... -> "
+    val ext = "  |    "
+    val extlast = "       "
+    s"mkl.Concat {$line${tab}input$line${modules.zipWithIndex.map {
+      case (model: Module[T], index: Int) =>
+        s"$tab$next(${index + 1}): ${if (index == modules.length - 1) {
+          model.setLine(line + tab + extlast)
+        } else {
+          model.setLine(line + tab + ext)
+        }}"
+    }.mkString(line)}$line$tab${last}output$line$tab}"
+  }
+}
diff --git a/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/Linear.scala b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/Linear.scala
new file mode 100644
index 00000000000..f049b31cff7
--- /dev/null
+++ b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/Linear.scala
@@ -0,0 +1,317 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.analytics.sparkdl.nn.mkl
+
+import com.intel.analytics.sparkdl.mkl.MKL
+import com.intel.analytics.sparkdl.nn.{Default, InitializationMethod, Module, Xavier}
+import com.intel.analytics.sparkdl.utils.RandomGenerator._
+import com.intel.analytics.sparkdl.tensor.TensorNumericMath.TensorNumeric
+import com.intel.analytics.sparkdl.tensor.Tensor
+
+import scala.reflect.ClassTag
+
+class Linear[@specialized(Float, Double) T: ClassTag](
+    inputSize: Int,
+    outputSize: Int,
+    val needCompute: Boolean = true,
+    private var initMethod: InitializationMethod = Default
+)(implicit ev: TensorNumeric[T])
+    extends Module[T] {
+  val weight: Tensor[T] = Tensor[T](outputSize, inputSize)
+  val bias: Tensor[T] = Tensor[T](outputSize)
+  val addBuffer: Tensor[T] = Tensor[T]()
+  this.gradWeight = Tensor[T](outputSize, inputSize)
+  this.gradBias = Tensor[T](outputSize)
+
+  private var classPtr = 0L
+  private var firstPass = true
+
+  override def getClassPtr(): Long = classPtr
+
+  reset()
+
+  /** Stores `initMethod` (read later by `reset()`) and returns this layer for chaining. */
+  def setInitMethod(initMethod: InitializationMethod): this.type = {
+    this.initMethod = initMethod; this
+  }
+
+  /** Re-initializes `weight` and `bias` per `initMethod` (Default or Xavier); others throw. */
+  override def reset(): Unit = {
+    initMethod match {
+      case Default =>
+        val stdv = 1.0 / math.sqrt(weight.size(2)) // todo, better to support uniform
+        weight.apply1(_ => ev.fromType[Double](RNG.uniform(0, 1) * 2 * stdv - stdv))
+        bias.apply1(_ => ev.fromType[Double](RNG.uniform(0, 1) * 2 * stdv - stdv))
+      case Xavier =>
+        // 3.0, not 3: Int division made the quotient 0 whenever size(2) + size(1) exceeded 3.
+        val stdv = math.sqrt(3.0 / (weight.size(2) + weight.size(1)))
+        weight.apply1(_ => ev.fromType[Double](RNG.uniform(0, 1) * 2 * stdv - stdv))
+        bias.fill(ev.fromType(0))
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Default / Xavier supported")
+    }
+  }
+
+  /**
+   * Forward pass through the native MKL linear primitive; only 2-D (batch) input is accepted.
+   * The native handle `classPtr` is created on the first call from that input's sizes and
+   * reused afterwards. `output` is resized to (input.size(1), bias.size(1)).
+   *
+   * @throws UnsupportedOperationException if `ev.getType()` is neither "Float" nor "Double"
+   */
+  override def updateOutput(input: Tensor[T]): Tensor[T] = {
+    require(input.dim() == 2, "only batch mode supported")
+
+    val inputWidth = input.size(input.dim())
+    val inputHeight = input.size(input.dim() - 1)
+    val nFrame = input.size(1)
+    val nElement = output.nElement
+    output.resize(Array(nFrame, bias.size(1)))
+    if (output.nElement() != nElement) { output.zero() }
+
+    val inputOffset = input.storageOffset() - 1
+    val outputOffset = output.storageOffset() - 1
+    val biasOffset = bias.storageOffset() - 1
+    val kernelOffset = weight.storageOffset() - 1
+
+    // Init arguments: inputHeight, inputWidth, outputChannels, kernelHeight, kernelWidth.
+    if (firstPass) {
+      classPtr = ev.getType() match {
+        case "Double" =>
+          MKL.LinearInitDouble(inputHeight, inputWidth, outputSize, outputSize, inputSize)
+        case "Float" =>
+          MKL.LinearInitFloat(inputHeight, inputWidth, outputSize, outputSize, inputSize)
+        case _ =>
+          throw new UnsupportedOperationException(s"Only Float/Double supported")
+      }
+      firstPass = false
+    }
+
+    ev.getType() match {
+      case "Double" =>
+        MKL.LinearForwardDouble(input.storage().array().asInstanceOf[Array[Double]],
+                                inputOffset,
+                                output.storage().array().asInstanceOf[Array[Double]],
+                                outputOffset,
+                                weight.storage().array().asInstanceOf[Array[Double]],
+                                kernelOffset,
+                                bias.storage().array().asInstanceOf[Array[Double]],
+                                biasOffset,
+                                classPtr)
+      case "Float" =>
+        MKL.LinearForwardFloat(input.storage().array().asInstanceOf[Array[Float]],
+                               inputOffset,
+                               output.storage().array().asInstanceOf[Array[Float]],
+                               outputOffset,
+                               weight.storage().array().asInstanceOf[Array[Float]],
+                               kernelOffset,
+                               bias.storage().array().asInstanceOf[Array[Float]],
+                               biasOffset,
+                               classPtr)
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+    output
+  }
+
+  /**
+   * Backward pass through the native MKL linear primitive; only 2-D (batch) input is accepted.
+   * `gradInput` is computed only when `needCompute` is true; the native kernel and bias
+   * backward routines are always called with `gradWeight` and `gradBias`.
+   *
+   * @throws UnsupportedOperationException if `ev.getType()` is neither "Float" nor "Double"
+   */
+  override def updateGradInput(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = {
+    require(input.dim() == 2, "only batch mode supported")
+    val nElement = gradInput.nElement()
+    gradInput.resizeAs(input)
+    if (nElement != gradInput.nElement()) {
+      gradInput.zero()
+    }
+
+    val inputOffset = input.storageOffset() - 1
+    val kernelOffset = weight.storageOffset() - 1
+    val biasOffset = bias.storageOffset() - 1
+    val gradOutputOffset = gradOutput.storageOffset() - 1
+    val gradInputOffset = gradInput.storageOffset() - 1
+    val gradWeightOffset = gradWeight.storageOffset() - 1
+    val gradBiasOffset = gradBias.storageOffset() - 1
+
+    if (needCompute) {
+      ev.getType() match {
+        case "Double" =>
+          MKL.LinearBackwardDataDouble(input.storage().array().asInstanceOf[Array[Double]],
+                                       inputOffset,
+                                       gradOutput.storage().array().asInstanceOf[Array[Double]],
+                                       gradOutputOffset,
+                                       gradInput.storage().array().asInstanceOf[Array[Double]],
+                                       gradInputOffset,
+                                       weight.storage().array().asInstanceOf[Array[Double]],
+                                       kernelOffset,
+                                       bias.storage().array().asInstanceOf[Array[Double]],
+                                       biasOffset,
+                                       classPtr)
+        case "Float" =>
+          MKL.LinearBackwardDataFloat(input.storage().array().asInstanceOf[Array[Float]],
+                                      inputOffset,
+                                      gradOutput.storage().array().asInstanceOf[Array[Float]],
+                                      gradOutputOffset,
+                                      gradInput.storage().array().asInstanceOf[Array[Float]],
+                                      gradInputOffset,
+                                      weight.storage().array().asInstanceOf[Array[Float]],
+                                      kernelOffset,
+                                      bias.storage().array().asInstanceOf[Array[Float]],
+                                      biasOffset,
+                                      classPtr)
+        case _ =>
+          throw new UnsupportedOperationException(s"Only Float/Double supported")
+      }
+    }
+
+    ev.getType() match {
+      case "Double" =>
+        MKL.LinearBackwardKernelDouble(input.storage().array().asInstanceOf[Array[Double]],
+                                       inputOffset,
+                                       gradOutput.storage().array().asInstanceOf[Array[Double]],
+                                       gradOutputOffset,
+                                       gradWeight.storage().array().asInstanceOf[Array[Double]],
+                                       gradWeightOffset,
+                                       weight.storage().array().asInstanceOf[Array[Double]],
+                                       kernelOffset,
+                                       bias.storage().array().asInstanceOf[Array[Double]],
+                                       biasOffset,
+                                       classPtr)
+
+      case "Float" =>
+        MKL.LinearBackwardKernelFloat(input.storage().array().asInstanceOf[Array[Float]],
+                                      inputOffset,
+                                      gradOutput.storage().array().asInstanceOf[Array[Float]],
+                                      gradOutputOffset,
+                                      gradWeight.storage().array().asInstanceOf[Array[Float]],
+                                      gradWeightOffset,
+                                      weight.storage().array().asInstanceOf[Array[Float]],
+                                      kernelOffset,
+                                      bias.storage().array().asInstanceOf[Array[Float]],
+                                      biasOffset,
+                                      classPtr)
+
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+
+    ev.getType() match {
+      case "Double" =>
+        MKL.LinearBackwardBiasDouble(input.storage().array().asInstanceOf[Array[Double]],
+                                     inputOffset,
+                                     gradOutput.storage().array().asInstanceOf[Array[Double]],
+                                     gradOutputOffset,
+                                     gradBias.storage().array().asInstanceOf[Array[Double]],
+                                     gradBiasOffset,
+                                     weight.storage().array().asInstanceOf[Array[Double]],
+                                     kernelOffset,
+                                     bias.storage().array().asInstanceOf[Array[Double]],
+                                     biasOffset,
+                                     classPtr)
+
+      case "Float" =>
+        MKL.LinearBackwardBiasFloat(input.storage().array().asInstanceOf[Array[Float]],
+                                    inputOffset,
+                                    gradOutput.storage().array().asInstanceOf[Array[Float]],
+                                    gradOutputOffset,
+                                    gradBias.storage().array().asInstanceOf[Array[Float]],
+                                    gradBiasOffset,
+                                    weight.storage().array().asInstanceOf[Array[Float]],
+                                    kernelOffset,
+                                    bias.storage().array().asInstanceOf[Array[Float]],
+                                    biasOffset,
+                                    classPtr)
+
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+
+    gradInput
+  }
+
+//  override def accGradParameters(input: Tensor[T],
+//                                 gradOutput: Tensor[T],
+//                                 scale: Double = 1.0): Unit = {
+//    require(input.dim() == 2, "only batch mode supported")
+//    require(input.dim() == 1 || input.dim() == 2, "input must be vector or matrix")
+//    val value = ev.fromType[Double](scale)
+//    if (input.dim() == 1) {
+//      gradWeight.addr(value, gradOutput, input)
+//      gradBias.add(value, gradOutput)
+//    } else if (input.dim() == 2) {
+//      gradWeight.addmm(value, gradOutput.t, input)
+//      gradBias.addmv(value, gradOutput.t, addBuffer)
+//    }
+//  }
+
+  /** Applies one plain gradient step: adds `-learningRate * grad` to `weight` and to `bias`. */
+  override def updateParameters(learningRate: T): Unit = {
+    val negRate = ev.negative(learningRate)
+    weight.add(negRate, gradWeight)
+    bias.add(negRate, gradBias)
+  }
+
+  /** Calls `zero()` on both gradient tensors, `gradWeight` and `gradBias`. */
+  override def zeroGradParameters(): Unit = {
+    Seq(gradWeight, gradBias).foreach(_.zero())
+  }
+
+  /** Returns the pair (Array(weight, bias), Array(gradWeight, gradBias)). */
+  override def parameters(): (Array[Tensor[T]], Array[Tensor[T]]) =
+    (Array(weight, bias), Array(gradWeight, gradBias))
+
+  /**
+   * Equal when the parent `equals` accepts `obj` and `obj` is a `Linear` that is this very
+   * instance or has equal `gradWeight`, `gradBias`, `weight` and `bias`.
+   *
+   * @param obj object to compare with
+   * @return whether `obj` equals this layer
+   */
+  override def equals(obj: Any): Boolean = {
+    super.equals(obj) && (obj match {
+      case other: Linear[T] =>
+        this.eq(other) || (gradWeight == other.gradWeight && gradBias == other.gradBias &&
+          weight == other.weight && bias == other.bias)
+      case _ => false
+    })
+  }
+
+  /**
+   * Hash built from the parent hash folded with `gradWeight`, `gradBias`, `weight` and
+   * `bias` (in that order) using multiplier 37.
+   */
+  override def hashCode(): Int = {
+    val multiplier = 37
+    Seq(gradWeight, gradBias, weight, bias).foldLeft(super.hashCode()) { (acc, tensor) =>
+      acc * multiplier + tensor.hashCode()
+    }
+  }
+
+  /** Short description of the layer, showing its input and output sizes. */
+  override def toString(): String =
+    s"nn.mkl.Linear($inputSize -> $outputSize)"
+
+  /** Returns this layer, `paramOffset` minus its weight + bias element count, and `indexes`. */
+  override def findModel(paramOffset: Int, indexes: Array[Int]): (Module[T], Int, Array[Int]) =
+    (this, paramOffset - (outputSize * inputSize + outputSize), indexes)
+
+}
diff --git a/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/LocalNormalizationAcrossChannels.scala b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/LocalNormalizationAcrossChannels.scala
new file mode 100644
index 00000000000..30e185c258f
--- /dev/null
+++ b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/LocalNormalizationAcrossChannels.scala
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.analytics.sparkdl.nn.mkl
+
+import com.intel.analytics.sparkdl.mkl.MKL
+import com.intel.analytics.sparkdl.nn.Module
+import com.intel.analytics.sparkdl.tensor.TensorNumericMath.TensorNumeric
+import com.intel.analytics.sparkdl.tensor._
+import com.intel.analytics.sparkdl.utils.RandomGenerator._
+import scala.reflect.ClassTag
+import scala.language.implicitConversions
+
+class LocalNormalizationAcrossChannels[@specialized(Float, Double) T: ClassTag](
+    val size: Int = 5,
+    val alpha: Double = 1.0,
+    val beta: Double = 0.75,
+    val k: Double = 1.0)(implicit ev: TensorNumeric[T])
+    extends Module[T] {
+
+  private val scale = Tensor[T]()
+  private val paddedSquare = Tensor[T]()
+  private val paddedRatio = Tensor[T]()
+  private val accumRatio = Tensor[T]()
+  private val accumRatioTimeInput = Tensor[T]()
+
+  require(size % 2 == 1, "LRN only supports odd values for size")
+  val prePad = (size - 1) / 2
+
+  var classPtr = 0L
+  private var firstPass = true
+
+  override def getClassPtr(): Long = classPtr
+
+  /**
+   * Equal when the parent `equals` accepts `obj` and `obj` is the same kind of layer with
+   * identical `size`, `alpha`, `beta` and `k` (or is this very instance).
+   */
+  override def equals(obj: Any): Boolean = {
+    super.equals(obj) && (obj match {
+      case other: LocalNormalizationAcrossChannels[T] =>
+        this.eq(other) ||
+          (size == other.size && alpha == other.alpha && beta == other.beta && k == other.k)
+      case _ => false
+    })
+  }
+
+  /**
+   * Folds the parent hash with `size`, `alpha`, `beta` and `k` (in order), multiplier 37.
+   */
+  override def hashCode(): Int = {
+    val multiplier = 37
+    val parts = Seq(size.hashCode(), alpha.hashCode(), beta.hashCode(), k.hashCode())
+    parts.foldLeft(super.hashCode()) { (acc, part) =>
+      acc * multiplier + part
+    }
+  }
+
+  /** Short description listing `size`, `alpha`, `beta` and `k`. */
+  override def toString(): String =
+    s"mkl.LocalResponseNormalizationAcrossChannels($size, $alpha, $beta, $k)"
+
+  /**
+   * Forward pass through the native MKL LRN primitive. The native handle `classPtr` is
+   * created on the first call from that input's sizes and reused afterwards; `output` is
+   * resized like `input`.
+   *
+   * @param input contiguous 4-D tensor, (batch, channels, height, width)
+   * @return this module's `output` tensor
+   * @throws UnsupportedOperationException if `ev.getType()` is neither "Float" nor "Double"
+   */
+  override def updateOutput(input: Tensor[T]): Tensor[T] = {
+    require(input.nDimension() == 4,
+            "Input must have 4 dimensions, corresponding to (batch, channels, height, width)")
+    require(input.isContiguous(), "Input is not contiguous")
+
+    output.resizeAs(input)
+
+    // NOTE(review): storageOffset() looks 1-based, hence the -1 for the raw arrays - confirm.
+    val inputOffset = input.storageOffset() - 1
+    val outputOffset = output.storageOffset() - 1
+
+    val inputWidth = input.size(input.dim())
+    val inputHeight = input.size(input.dim() - 1)
+    val inputChannel = if (input.dim() <= 3) 1 else input.size(input.dim() - 2)
+    val inputNumber = if (input.dim() <= 3) 1 else input.size(input.dim() - 3)
+    // TODO we may set input.size(input.dim() - 3) == 1 if input.dim() == 3
+
+    // The native primitive is created once, from the first input's sizes and this layer's
+    // size/alpha/beta/k. NOTE(review): the trailing 4 is presumably the tensor rank the
+    // native side expects - confirm against the MKL binding.
+    if (firstPass) {
+      classPtr = ev.getType() match {
+        case "Float" =>
+          MKL.LRNInitFloat(inputNumber, inputChannel, inputHeight, inputWidth,
+                           size, alpha.toFloat, beta.toFloat, k.toFloat,
+                           4)
+        case "Double" =>
+          MKL.LRNInitDouble(inputNumber, inputChannel, inputHeight, inputWidth,
+                            size, alpha.toDouble, beta.toDouble, k.toDouble,
+                            4)
+        case _ =>
+          throw new UnsupportedOperationException(s"Only Float/Double supported")
+      }
+      firstPass = false
+    }
+
+    ev.getType() match {
+      case "Float" =>
+        MKL.LRNForwardFloat(
+          input.storage().array().asInstanceOf[Array[Float]],
+          inputOffset,
+          output.storage().array().asInstanceOf[Array[Float]],
+          outputOffset,
+          classPtr
+        )
+      case "Double" =>
+        MKL.LRNForwardDouble(
+          input.storage().array().asInstanceOf[Array[Double]],
+          inputOffset,
+          output.storage().array().asInstanceOf[Array[Double]],
+          outputOffset,
+          classPtr
+        )
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+
+    output
+  }
+
+  /**
+   * Backward pass through the native MKL LRN primitive, using the handle `classPtr` that
+   * `updateOutput` creates. `gradInput` is resized like `input` and passed to the native call.
+   *
+   * @param input 4-D tensor, (batch, channels, height, width)
+   * @param gradOutput contiguous gradient tensor
+   * @return this module's `gradInput` tensor
+   * @throws UnsupportedOperationException if `ev.getType()` is neither "Float" nor "Double"
+   */
+  override def updateGradInput(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = {
+    require(input.nDimension() == 4,
+            "Input must have 4 dimensions, corresponding to (batch, channels, height, width)")
+    require(gradOutput.isContiguous(), "gradOutput is not contiguous")
+
+    gradInput.resizeAs(input)
+    val inputOffset = input.storageOffset() - 1
+    val gradOutputOffset = gradOutput.storageOffset() - 1
+    val gradInputOffset = gradInput.storageOffset() - 1
+
+    ev.getType() match {
+      case "Float" =>
+        MKL.LRNBackwardFloat(input.storage().array().asInstanceOf[Array[Float]],
+                             inputOffset,
+                             gradOutput.storage().array().asInstanceOf[Array[Float]],
+                             gradOutputOffset,
+                             gradInput.storage().array().asInstanceOf[Array[Float]],
+                             gradInputOffset,
+                             classPtr)
+      case "Double" =>
+        MKL.LRNBackwardDouble(input.storage().array().asInstanceOf[Array[Double]],
+                              inputOffset,
+                              gradOutput.storage().array().asInstanceOf[Array[Double]],
+                              gradOutputOffset,
+                              gradInput.storage().array().asInstanceOf[Array[Double]],
+                              gradInputOffset,
+                              classPtr)
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+
+    gradInput
+  }
+}
diff --git a/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/Pooling.scala b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/Pooling.scala
new file mode 100644
index 00000000000..796652b7104
--- /dev/null
+++ b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/Pooling.scala
@@ -0,0 +1,248 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.analytics.sparkdl.nn.mkl
+
+import com.intel.analytics.sparkdl.mkl.MKL
+import com.intel.analytics.sparkdl.nn.Module
+import com.intel.analytics.sparkdl.tensor.TensorNumericMath.TensorNumeric
+import com.intel.analytics.sparkdl.utils.RandomGenerator
+import com.intel.analytics.sparkdl.tensor.Tensor
+
+import scala.language.implicitConversions
+
+import scala.reflect.ClassTag
+
+class SpatialPooling[@specialized(Float, Double) T: ClassTag](
+    val kernelWidth: Int,
+    val kernelHeight: Int,
+    val strideWidth: Int,
+    val strideHeight: Int,
+    val padWidth: Int = 0,
+    val padHeight: Int = 0)(implicit ev: TensorNumeric[T])
+    extends Module[T] {
+
+  implicit def bool2int(b: Boolean) : Int = if (b) 1 else 0
+
+  var classPtr: Long = 0L
+  private var firstPass = true
+
+  override def getClassPtr(): Long = classPtr
+
+  // algorithm = 0 -> max
+  // algorithm = 1 -> avg
+  val algorithm = 0;
+
+  // TODO: only here to adapt to the test case
+  var ceil_mode = false
+  /** Sets `ceil_mode` to true (ceiling rounding in `computeOut`) and returns this layer. */
+  def ceil(): SpatialPooling[T] = {
+    ceil_mode = true; this
+  }
+
+  /** Sets `ceil_mode` to false (floor rounding in `computeOut`) and returns this layer. */
+  def floor(): SpatialPooling[T] = {
+    ceil_mode = false; this
+  }
+
+  /** Convenience constructor: the strides default to the kernel size and padding to 0. */
+  def this(kernelWidth: Int, kernelHeight: Int)(implicit ev: TensorNumeric[T]) =
+    this(kernelWidth, kernelHeight, kernelWidth, kernelHeight)
+
+  /**
+   * Output extent along one axis: round((input + 2 * pad - kernel) / stride) + 1, where the
+   * rounding is `ceil` when `ceil_mode` is set and `floor` otherwise.
+   */
+  def computeOut(input: Int, pad: Int, kernel: Int, stride: Int): Int = {
+    val steps = 1.0 * (input + 2 * pad - kernel) / stride
+    (if (ceil_mode) math.ceil(steps) else math.floor(steps)).toInt + 1
+  }
+
+  /**
+   * Backward pass through the native MKL pooling primitive.
+   *
+   * Uses the handle `classPtr` that `updateOutput` creates. `gradInput` is resized like
+   * `input` and passed to the native backward call together with `input` and `gradOutput`.
+   *
+   * @param input input tensor; `gradInput` takes its size
+   * @param gradOutput gradient tensor handed to the native call
+   * @return this module's `gradInput` tensor
+   * @throws UnsupportedOperationException if `ev.getType()` is neither "Float" nor "Double"
+   */
+  override def updateGradInput(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = {
+    gradInput.resizeAs(input)
+
+    // NOTE(review): storageOffset() looks 1-based, hence the -1 for the raw arrays - confirm.
+    val inputOffset = input.storageOffset() - 1
+    val gradInputOffset = gradInput.storageOffset() - 1
+    val gradOutputOffset = gradOutput.storageOffset() - 1
+
+    ev.getType() match {
+      case "Float" =>
+        MKL.PoolingBackwardFloat(input.storage().array().asInstanceOf[Array[Float]],
+                                 inputOffset,
+                                 gradOutput.storage().array().asInstanceOf[Array[Float]],
+                                 gradOutputOffset,
+                                 gradInput.storage().array().asInstanceOf[Array[Float]],
+                                 gradInputOffset,
+                                 classPtr)
+      case "Double" =>
+        // gradInput is addressed with its own offset; this branch used to pass
+        // gradOutputOffset for it, unlike the Float branch.
+        MKL.PoolingBackwardDouble(input.storage().array().asInstanceOf[Array[Double]],
+                                  inputOffset,
+                                  gradOutput.storage().array().asInstanceOf[Array[Double]],
+                                  gradOutputOffset,
+                                  gradInput.storage().array().asInstanceOf[Array[Double]],
+                                  gradInputOffset,
+                                  classPtr)
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+
+    gradInput
+  }
+
+  /**
+   * Forward pass through the native MKL pooling primitive.
+   *
+   * `output` is resized to the extents given by `computeOut` (channel and batch sizes are
+   * copied from `input`). The native handle `classPtr` is created on the first call from
+   * that input's sizes, the kernel/stride/padding settings, `ceil_mode` and `algorithm`,
+   * and is reused afterwards.
+   *
+   * @param input tensor whose last three dimensions are read as (channel, height, width);
+   *              a 3-D input is treated as a single sample
+   * @return this module's `output` tensor
+   * @throws UnsupportedOperationException if `ev.getType()` is neither "Float" nor "Double"
+   */
+  override def updateOutput(input: Tensor[T]): Tensor[T] = {
+    val inputWidth = input.size(input.dim())
+    val inputHeight = input.size(input.dim() - 1)
+    val inputChannel = input.size(input.dim() - 2)
+    val inputNumber = if (input.dim() == 3) 1 else input.size(input.dim() - 3)
+    // TODO we may set input.size(input.dim() - 3) == 1 if input.dim() == 3
+
+    val outputHeight =
+      computeOut(inputHeight, padHeight, kernelHeight, strideHeight)
+    val outputWidth =
+      computeOut(inputWidth, padWidth, kernelWidth, strideWidth)
+    val outputChannel = inputChannel
+    val outputNumber = inputNumber
+
+    val inputOffset = input.storageOffset() - 1
+    val outputOffset = output.storageOffset() - 1
+
+    if (input.dim() == 3) {
+      output.resize(Array(outputChannel, outputHeight, outputWidth))
+    } else {
+      output.resize(Array(outputNumber, outputChannel, outputHeight, outputWidth))
+    }
+
+    // `algorithm` is the class member, so a subclass override selects the native algorithm.
+    // A local `val algorithm = 0` used to shadow it here and forced every subclass to 0.
+    // `ceil_mode` is a Boolean; if the binding takes an Int it goes through `bool2int`.
+    if (firstPass) {
+      classPtr = ev.getType() match {
+        case "Float" =>
+          MKL.PoolingInitFloat(inputNumber, inputChannel,
+                               inputHeight, inputWidth,
+                               kernelHeight, kernelWidth,
+                               strideHeight, strideWidth,
+                               padHeight, padWidth,
+                               4, ceil_mode, algorithm)
+        case "Double" =>
+          MKL.PoolingInitDouble(inputNumber, inputChannel,
+                                inputHeight, inputWidth,
+                                kernelHeight, kernelWidth,
+                                strideHeight, strideWidth,
+                                padHeight, padWidth,
+                                4, ceil_mode, algorithm)
+        case _ =>
+          throw new UnsupportedOperationException(s"Only Float/Double supported")
+      }
+
+      firstPass = false
+    }
+
+    ev.getType() match {
+      case "Float" =>
+        MKL.PoolingForwardFloat(input.storage().array.asInstanceOf[Array[Float]],
+                                inputOffset,
+                                output.storage().array.asInstanceOf[Array[Float]],
+                                outputOffset,
+                                classPtr)
+      case "Double" =>
+        MKL.PoolingForwardDouble(input.storage().array.asInstanceOf[Array[Double]],
+                                 inputOffset,
+                                 output.storage().array.asInstanceOf[Array[Double]],
+                                 outputOffset,
+                                 classPtr)
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+
+    output
+  }
+
+  /** Fixed short name of this layer. */
+  override def toString(): String =
+    s"mkl.Pooling"
+
+}
+
+/**
+ * Max-pooling variant of [[SpatialPooling]]: it overrides `algorithm` with 0.
+ */
+class SpatialMaxPooling[T: ClassTag](kernelWidth: Int,
+                                     kernelHeight: Int,
+                                     strideWidth: Int,
+                                     strideHeight: Int,
+                                     padWidth: Int = 0,
+                                     padHeight: Int = 0)(implicit ev: TensorNumeric[T])
+    extends SpatialPooling[T](kernelWidth, kernelHeight, strideWidth, strideHeight,
+                              padWidth, padHeight) {
+  override val algorithm: Int = 0
+
+  /** Convenience constructor: the strides default to the kernel size and padding to 0. */
+  def this(kernelWidth: Int, kernelHeight: Int)(implicit ev: TensorNumeric[T]) =
+    this(kernelWidth, kernelHeight, kernelWidth, kernelHeight)
+
+  /** Fixed short name of this layer. */
+  override def toString(): String = s"mkl.SpatialMaxPooling"
+}
+
+/**
+ * Average-pooling variant of [[SpatialPooling]]: it overrides `algorithm` with 1.
+ */
+class SpatialAveragePooling[T: ClassTag](kernelWidth: Int,
+                                         kernelHeight: Int,
+                                         strideWidth: Int,
+                                         strideHeight: Int,
+                                         padWidth: Int = 0,
+                                         padHeight: Int = 0)(implicit ev: TensorNumeric[T])
+    extends SpatialPooling[T](kernelWidth, kernelHeight, strideWidth, strideHeight,
+                              padWidth, padHeight) {
+  override val algorithm: Int = 1
+
+  /** Convenience constructor: the strides default to the kernel size and padding to 0. */
+  def this(kernelWidth: Int, kernelHeight: Int)(implicit ev: TensorNumeric[T]) =
+    this(kernelWidth, kernelHeight, kernelWidth, kernelHeight)
+
+  /** Fixed short name of this layer. */
+  override def toString(): String = s"mkl.SpatialAvgPooling"
+}
diff --git a/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/ReLU.scala b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/ReLU.scala
new file mode 100644
index 00000000000..77fb16e903d
--- /dev/null
+++ b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/ReLU.scala
@@ -0,0 +1,134 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.analytics.sparkdl.nn.mkl
+
+import com.intel.analytics.sparkdl.mkl.MKL
+import com.intel.analytics.sparkdl.nn.Module
+import com.intel.analytics.sparkdl.tensor.Tensor
+import com.intel.analytics.sparkdl.tensor.TensorNumericMath.TensorNumeric
+
+import scala.language.implicitConversions
+
+import scala.reflect.ClassTag
+
+/**
+ * MKL-backed ReLU module; `ip` is accepted by the constructor but never read in this class.
+ */
+class ReLU[@specialized(Float, Double) T: ClassTag](ip: Boolean = false)(
+    implicit ev: TensorNumeric[T])
+    extends Module[T] {
+
+  override def toString(): String = "mkl.ReLU"
+
+  // firstPass stays true until updateOutput has stored the MKL.ReLUInit* handle in classPtr.
+  private var firstPass = true
+  var classPtr = 0L
+
+  override def getClassPtr(): Long = classPtr
+
+  /**
+   * Backward pass: resizes gradInput like gradOutput and hands both to MKL.ReLUBackward*.
+   * gradOutput is not copied into gradInput beforehand; that copy was dropped for its cost.
+   *
+   * @throws IllegalArgumentException if classPtr is still 0 (no native primitive yet)
+   */
+  override def updateGradInput(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = {
+    require(classPtr != 0L, "updateOutput must be called before updateGradInput")
+    gradInput.resizeAs(gradOutput)
+
+    // storageOffset() is shifted by one for JNI (presumably 1-based -> 0-based; confirm).
+    val inputOffset = input.storageOffset() - 1
+    val gradInputOffset = gradInput.storageOffset() - 1
+    val gradOutputOffset = gradOutput.storageOffset() - 1
+
+    ev.getType() match {
+      case "Float" =>
+        MKL.ReLUBackwardFloat(input.storage().array().asInstanceOf[Array[Float]],
+                              inputOffset,
+                              gradOutput.storage().array().asInstanceOf[Array[Float]],
+                              gradOutputOffset,
+                              gradInput.storage().array().asInstanceOf[Array[Float]],
+                              gradInputOffset,
+                              classPtr)
+
+      case "Double" =>
+        MKL.ReLUBackwardDouble(input.storage().array().asInstanceOf[Array[Double]],
+                               inputOffset,
+                               gradOutput.storage().array().asInstanceOf[Array[Double]],
+                               gradOutputOffset,
+                               gradInput.storage().array().asInstanceOf[Array[Double]],
+                               gradInputOffset,
+                               classPtr)
+
+      case _ =>
+        throw new UnsupportedOperationException("Only Float/Double supported")
+    }
+
+    gradInput
+  }
+
+  /**
+   * Forward pass through MKL.ReLUForward*. The native primitive is created on the first
+   * call from that call's input sizes and reused afterwards, whatever later inputs look like.
+   */
+  override def updateOutput(input: Tensor[T]): Tensor[T] = {
+    output.resizeAs(input)
+
+    val inputOffset = input.storageOffset() - 1
+    val outputOffset = output.storageOffset() - 1
+
+    // The last dims are read as (number, channel, height, width); absent leading ones are 1.
+    val inputWidth = input.size(input.dim())
+    val inputHeight = input.size(input.dim() - 1)
+    val inputChannel = if (input.dim() <= 2) 1 else input.size(input.dim() - 2)
+    val inputNumber = if (input.dim() <= 3) 1 else input.size(input.dim() - 3)
+
+    if (firstPass) {
+      ev.getType() match {
+        case "Float" =>
+          classPtr = MKL.ReLUInitFloat(inputNumber, inputChannel, inputHeight, inputWidth, 4)
+        case "Double" =>
+          classPtr = MKL.ReLUInitDouble(inputNumber, inputChannel, inputHeight, inputWidth, 4)
+        case _ =>
+          throw new UnsupportedOperationException("Only Float/Double supported")
+      }
+      firstPass = false
+    }
+
+    ev.getType() match {
+      case "Float" =>
+        MKL.ReLUForwardFloat(input.storage().array().asInstanceOf[Array[Float]],
+                             inputOffset,
+                             output.storage().array().asInstanceOf[Array[Float]],
+                             outputOffset,
+                             classPtr)
+
+      case "Double" =>
+        MKL.ReLUForwardDouble(input.storage().array().asInstanceOf[Array[Double]],
+                              inputOffset,
+                              output.storage().array().asInstanceOf[Array[Double]],
+                              outputOffset,
+                              classPtr)
+
+      case _ =>
+        throw new UnsupportedOperationException("Only Float/Double supported")
+    }
+
+    output
+  }
+}
diff --git a/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/SpatialConvolution.scala b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/SpatialConvolution.scala
new file mode 100644
index 00000000000..5e024697109
--- /dev/null
+++ b/dl/src/main/scala/com/intel/analytics/sparkdl/nn/mkl/SpatialConvolution.scala
@@ -0,0 +1,425 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.analytics.sparkdl.nn.mkl
+
+import com.intel.analytics.sparkdl.mkl.MKL
+import com.intel.analytics.sparkdl.nn.Module
+import com.intel.analytics.sparkdl.tensor.TensorNumericMath.TensorNumeric
+import com.intel.analytics.sparkdl.tensor._
+import com.intel.analytics.sparkdl.utils.RandomGenerator._
+
+import scala.language.implicitConversions
+
+import com.intel.analytics.sparkdl.nn.InitializationMethod
+import com.intel.analytics.sparkdl.nn.Default
+import com.intel.analytics.sparkdl.nn.Xavier
+
+import scala.reflect.ClassTag
+
+/**
+ * 2D convolution module whose forward and backward passes run in the native MKL library.
+ *
+ * The native primitive is created by the first updateOutput call from that call's input
+ * sizes and reused afterwards; classPtr holds its handle. updateGradInput also writes
+ * gradWeight and gradBias, which is why accGradParameters is a no-op.
+ *
+ * Input must be a contiguous 3D tensor or 4D tensor (batch mode); its last three
+ * dimensions are read as channel, height and width.
+ */
+class SpatialConvolution[@specialized(Float, Double) T: ClassTag](
+    val nInputPlane: Int,
+    val nOutputPlane: Int,
+    val kernelWidth: Int,
+    val kernelHeight: Int,
+    val strideWidth: Int = 1,
+    val strideHeight: Int = 1,
+    val padWidth: Int = 0,
+    val padHeight: Int = 0,
+    val groups: Int = 1,
+    private var initMethod: InitializationMethod = Default
+)(implicit ev: TensorNumeric[T])
+    extends Module[T] {
+  val weight: Tensor[T] =
+    Tensor[T](nOutputPlane, nInputPlane, kernelHeight, kernelWidth)
+  val bias: Tensor[T] = Tensor[T](nOutputPlane)
+  this.gradInput = Tensor[T](nOutputPlane, nInputPlane, kernelHeight, kernelWidth)
+  this.gradBias = Tensor[T](nOutputPlane)
+  this.gradWeight = Tensor[T](nOutputPlane, nInputPlane, kernelHeight, kernelWidth)
+  val fInput = Tensor[T]()
+  val fGradInput = Tensor[T]()
+  reset()
+
+  private var im2colTime = 0L
+  private var col2imTime = 0L
+
+  var classPtr = 0L
+  private var firstPass = true
+
+  override def getClassPtr(): Long = classPtr
+
+  def getIm2ColTime() : Long = im2colTime
+  def getCol2ImgTime() : Long = col2imTime
+
+  def setInitMethod(initMethod: InitializationMethod): this.type = {
+    this.initMethod = initMethod
+    this
+  }
+
+  // Re-draws weight and bias as RNG.uniform(0, 1) * 2 * stdv - stdv; initMethod is not read.
+  override def reset(): Unit = {
+    val stdv = 1.0 / math.sqrt(kernelWidth * kernelHeight * nInputPlane)
+    // todo, better to support uniform
+    weight.apply1(_ => ev.fromType[Double](RNG.uniform(0, 1) * 2 * stdv - stdv))
+    bias.apply1(_ => ev.fromType[Double](RNG.uniform(0, 1) * 2 * stdv - stdv))
+  }
+
+  /**
+   * Forward pass: resizes output and fills it through MKL.ConvolutionForward{Float,Double}.
+   *
+   * @param input contiguous 3D or 4D (batch mode) tensor
+   * @return output
+   * @throws IllegalArgumentException if input is not 3D/4D, is not contiguous, or the
+   *                                  computed output width or height is below 1
+   */
+  override def updateOutput(input: Tensor[T]): Tensor[T] = {
+    require(input.dim() == 3 || input.dim() == 4, "Only support 3D or 4D(batch mode) input")
+    // TODO the requirement of contiguous input may be not necessary for MKL 2017.
+    //      because it supports the api of groups convolution.
+    require(input.isContiguous(), "input is not contiguous")
+
+    // compute the output height and width
+    def computeOut(input: Int, pad: Int, kernel: Int, stride: Int): Int = {
+      (input + 2 * pad - kernel) / stride + 1
+    }
+
+    // +---------+-------+-------+
+    // |         | 3-dim | 4-dim |
+    // +=========+=======+=======+
+    // | Number  | ?     | 1     |
+    // +---------+-------+-------+
+    // | Channel | 1     | 2     |
+    // +---------+-------+-------+
+    // | Height  | 2     | 3     |
+    // +---------+-------+-------+
+    // | Width   | 3     | 4     |
+    // +---------+-------+-------+
+    // Table: Index of 3-dim/4-dim input
+
+    val inputWidth = input.size(input.dim())
+    val inputHeight = input.size(input.dim() - 1)
+    val inputChannel = input.size(input.dim() - 2)
+    // TODO we may set input.size(input.dim() - 3) == 1 if input.dim() == 3
+    val inputNumber = if (input.dim() == 3) 1 else input.size(input.dim() - 3)
+
+    // the output number is the same as the input number
+    val outputNumber = inputNumber
+    val outputChannel = nOutputPlane
+    val outputWidth =
+      computeOut(inputWidth, padWidth, kernelWidth, strideWidth)
+    val outputHeight =
+      computeOut(inputHeight, padHeight, kernelHeight, strideHeight)
+
+    require(outputWidth >= 1 && outputHeight >= 1, "output size is too small")
+    if (input.dim() == 3) {
+      output.resize(Array(outputChannel, outputHeight, outputWidth))
+    } else {
+      output.resize(Array(outputNumber, outputChannel, outputHeight, outputWidth))
+    }
+
+    // the kernel number is the same as nOutputPlane
+    val kernelNumber = nOutputPlane
+    // TODO kernel channel equals to input channel now
+    val kernelChannel = inputChannel
+
+    val inputOffset = input.storageOffset() - 1
+    val outputOffset = output.storageOffset() - 1
+    val biasOffset = bias.storageOffset() - 1
+    val kernelOffset = weight.storageOffset() - 1
+
+    if (firstPass) {
+      ev.getType() match {
+        case "Double" =>
+          classPtr = MKL.ConvolutionInitDouble(inputNumber,
+                                               inputChannel,
+                                               inputHeight,
+                                               inputWidth,
+                                               kernelNumber,
+                                               kernelChannel,
+                                               kernelHeight,
+                                               kernelWidth,
+                                               strideHeight,
+                                               strideWidth,
+                                               padHeight,
+                                               padWidth,
+                                               4,
+                                               groups)
+        case "Float" =>
+          classPtr = MKL.ConvolutionInitFloat(inputNumber,
+                                              inputChannel,
+                                              inputHeight,
+                                              inputWidth,
+                                              kernelNumber,
+                                              kernelChannel,
+                                              kernelHeight,
+                                              kernelWidth,
+                                              strideHeight,
+                                              strideWidth,
+                                              padHeight,
+                                              padWidth,
+                                              4,
+                                              groups)
+        case _ =>
+          throw new UnsupportedOperationException("Only Float/Double supported")
+      }
+      firstPass = false
+    }
+
+    ev.getType() match {
+      case "Double" =>
+        MKL.ConvolutionForwardDouble(input.storage().array().asInstanceOf[Array[Double]],
+                                     inputOffset,
+                                     output.storage().array().asInstanceOf[Array[Double]],
+                                     outputOffset,
+                                     weight.storage().array().asInstanceOf[Array[Double]],
+                                     kernelOffset,
+                                     bias.storage().array().asInstanceOf[Array[Double]],
+                                     biasOffset,
+                                     classPtr)
+      case "Float" =>
+        MKL.ConvolutionForwardFloat(input.storage().array().asInstanceOf[Array[Float]],
+                                    inputOffset,
+                                    output.storage().array().asInstanceOf[Array[Float]],
+                                    outputOffset,
+                                    weight.storage().array().asInstanceOf[Array[Float]],
+                                    kernelOffset,
+                                    bias.storage().array().asInstanceOf[Array[Float]],
+                                    biasOffset,
+                                    classPtr)
+
+      case _ =>
+        throw new UnsupportedOperationException("Only Float/Double supported")
+    }
+    output
+  }
+
+  /**
+   * Backward pass. When isNeedComputeBack() holds, gradInput is handed to
+   * MKL.ConvolutionBackwardData*; gradWeight and gradBias are always handed to
+   * MKL.ConvolutionBackwardKernel* and MKL.ConvolutionBackwardBias*.
+   *
+   * @throws IllegalArgumentException if a shape/contiguity check fails or classPtr is 0
+   */
+  override def updateGradInput(input: Tensor[T], gradOutput: Tensor[T]): Tensor[T] = {
+    require(input.nDimension() == 3 || input.nDimension() == 4, "Only support 3D or 4D input")
+    require(nOutputPlane == (if (input.nDimension() == 3) gradOutput.size(1)
+                             else gradOutput.size(2)),
+            "Number of output features is not equal to nOutputPlane")
+    require(input.isContiguous(), "input is not contiguous")
+    require(gradInput.isContiguous(), "gradInput is not contiguous")
+    require(classPtr != 0L, "updateOutput must be called before updateGradInput")
+    gradInput.resizeAs(input)
+
+    val inputOffset = input.storageOffset() - 1
+    val kernelOffset = weight.storageOffset() - 1
+    val biasOffset = bias.storageOffset() - 1
+    val gradInputOffset = gradInput.storageOffset() - 1
+    val gradKernelOffset = gradWeight.storageOffset() - 1
+    val gradOutputOffset = gradOutput.storageOffset() - 1
+    val gradBiasOffset = gradBias.storageOffset() - 1
+
+    // native call that receives gradInput; run only when isNeedComputeBack() says so
+    if (isNeedComputeBack()) {
+      ev.getType() match {
+        case "Double" =>
+          MKL.ConvolutionBackwardDataDouble(
+            input.storage().array().asInstanceOf[Array[Double]],
+            inputOffset,
+            gradOutput.storage().array().asInstanceOf[Array[Double]],
+            gradOutputOffset,
+            gradInput.storage().array().asInstanceOf[Array[Double]],
+            gradInputOffset,
+            weight.storage().array().asInstanceOf[Array[Double]],
+            kernelOffset,
+            bias.storage().array().asInstanceOf[Array[Double]],
+            biasOffset,
+            classPtr
+          )
+        case "Float" =>
+          MKL.ConvolutionBackwardDataFloat(
+            input.storage().array().asInstanceOf[Array[Float]],
+            inputOffset,
+            gradOutput.storage().array().asInstanceOf[Array[Float]],
+            gradOutputOffset,
+            gradInput.storage().array().asInstanceOf[Array[Float]],
+            gradInputOffset,
+            weight.storage().array().asInstanceOf[Array[Float]],
+            kernelOffset,
+            bias.storage().array().asInstanceOf[Array[Float]],
+            biasOffset,
+            classPtr
+          )
+
+        case _ =>
+          throw new UnsupportedOperationException(s"Only Float/Double supported")
+      }
+    }
+    // native call that receives gradWeight
+    ev.getType() match {
+      case "Double" =>
+        MKL.ConvolutionBackwardKernelDouble(
+          input.storage().array().asInstanceOf[Array[Double]],
+          inputOffset,
+          gradOutput.storage().array().asInstanceOf[Array[Double]],
+          gradOutputOffset,
+          gradWeight.storage().array().asInstanceOf[Array[Double]],
+          gradKernelOffset,
+          weight.storage().array().asInstanceOf[Array[Double]],
+          kernelOffset,
+          bias.storage().array().asInstanceOf[Array[Double]],
+          biasOffset,
+          classPtr
+        )
+      case "Float" =>
+        MKL.ConvolutionBackwardKernelFloat(
+          input.storage().array().asInstanceOf[Array[Float]],
+          inputOffset,
+          gradOutput.storage().array().asInstanceOf[Array[Float]],
+          gradOutputOffset,
+          gradWeight.storage().array().asInstanceOf[Array[Float]],
+          gradKernelOffset,
+          weight.storage().array().asInstanceOf[Array[Float]],
+          kernelOffset,
+          bias.storage().array().asInstanceOf[Array[Float]],
+          biasOffset,
+          classPtr
+        )
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+    // native call that receives gradBias
+    ev.getType() match {
+      case "Double" =>
+        MKL.ConvolutionBackwardBiasDouble(
+          input.storage().array().asInstanceOf[Array[Double]],
+          inputOffset,
+          gradOutput.storage().array().asInstanceOf[Array[Double]],
+          gradOutputOffset,
+          gradBias.storage().array().asInstanceOf[Array[Double]],
+          gradBiasOffset,
+          weight.storage().array().asInstanceOf[Array[Double]],
+          kernelOffset,
+          bias.storage().array().asInstanceOf[Array[Double]],
+          biasOffset,
+          classPtr
+        )
+
+      case "Float" =>
+        MKL.ConvolutionBackwardBiasFloat(
+          input.storage().array().asInstanceOf[Array[Float]],
+          inputOffset,
+          gradOutput.storage().array().asInstanceOf[Array[Float]],
+          gradOutputOffset,
+          gradBias.storage().array().asInstanceOf[Array[Float]],
+          gradBiasOffset,
+          weight.storage().array().asInstanceOf[Array[Float]],
+          kernelOffset,
+          bias.storage().array().asInstanceOf[Array[Float]],
+          biasOffset,
+          classPtr
+        )
+
+      case _ =>
+        throw new UnsupportedOperationException(s"Only Float/Double supported")
+    }
+    gradInput
+  }
+
+  override def updateParameters(learningRate: T): Unit = {
+    weight.map(gradWeight, (a, b) => ev.minus(a, ev.times(learningRate, b)))
+    bias.map(gradBias, (a, b) => ev.minus(a, ev.times(learningRate, b)))
+  }
+
+  override def zeroGradParameters(): Unit = {
+    gradWeight.zero()
+    gradBias.zero()
+  }
+
+  override def parameters(): (Array[Tensor[T]], Array[Tensor[T]]) = {
+    (Array(this.weight, this.bias), Array(this.gradWeight, this.gradBias))
+  }
+
+  override def equals(obj: Any): Boolean = {
+    if (!super.equals(obj)) { return false }
+
+    if (!obj.isInstanceOf[SpatialConvolution[T]]) { return false }
+    val other = obj.asInstanceOf[SpatialConvolution[T]]
+    if (this.eq(other)) { return true }
+
+    nInputPlane == other.nInputPlane &&
+    nOutputPlane == other.nOutputPlane &&
+    kernelWidth == other.kernelWidth &&
+    kernelHeight == other.kernelHeight &&
+    strideWidth == other.strideWidth &&
+    strideHeight == other.strideHeight &&
+    padWidth == other.padWidth &&
+    padHeight == other.padHeight &&
+    groups == other.groups &&
+    weight == other.weight &&
+    bias == other.bias &&
+    gradWeight == other.gradWeight &&
+    gradBias == other.gradBias
+  }
+
+  override def hashCode() : Int = {
+    val seed = 37
+    var hash = super.hashCode()
+    hash = hash * seed + nInputPlane.hashCode()
+    hash = hash * seed + nOutputPlane.hashCode()
+    hash = hash * seed + kernelWidth.hashCode()
+    hash = hash * seed + kernelHeight.hashCode()
+    hash = hash * seed + strideWidth.hashCode()
+    hash = hash * seed + strideHeight.hashCode()
+    hash = hash * seed + padWidth.hashCode()
+    hash = hash * seed + padHeight.hashCode()
+    hash = hash * seed + groups.hashCode()
+    hash = hash * seed + weight.hashCode()
+    hash = hash * seed + bias.hashCode()
+    hash = hash * seed + gradWeight.hashCode()
+    hash = hash * seed + gradBias.hashCode()
+
+    hash
+  }
+
+  override def toString(): String = {
+    s"""mkl.SpatialConvolution($nInputPlane -> $nOutputPlane, $kernelWidth x $kernelHeight, $strideWidth, $strideHeight, $padWidth, $padHeight)"""
+  }
+
+  override def findModel(paramOffset: Int, indexes: Array[Int]): (Module[T], Int, Array[Int]) = {
+    (this,
+     paramOffset - nOutputPlane * nInputPlane * kernelHeight * kernelWidth - nOutputPlane,
+     indexes)
+  }
+
+  // mkl-dnn's convolution_backward has done updateGradInput and accGradParameters,
+  // so accGradParameters does nothing: gradWeight and gradBias are handed to the native
+  // backward calls inside updateGradInput. As a consequence the `scale` argument is
+  // ignored and nothing is accumulated by this method.
+  override def accGradParameters(input: Tensor[T],
+                                 gradOutput: Tensor[T],
+                                 scale: Double = 1.0): Unit = {}
+}
diff --git a/dl/src/test/scala/com/intel/analytics/sparkdl/nn/mkl/GoogLeNetSpec.scala b/dl/src/test/scala/com/intel/analytics/sparkdl/nn/mkl/GoogLeNetSpec.scala
new file mode 100644
index 00000000000..cc127c24ff3
--- /dev/null
+++ b/dl/src/test/scala/com/intel/analytics/sparkdl/nn/mkl/GoogLeNetSpec.scala
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.analytics.sparkdl.nn.mkl
+
+import com.intel.analytics.sparkdl.models._
+import org.scalatest.FlatSpec
+
+class GoogLeNetSpec extends FlatSpec { // NOTE(review): runs module "alexnet" - intended?
+  "GoogLeNet V1 with mkl dnn" should "end without a segmentation fault" in {
+    Perf.performance[Float](new Params(batchSize = 32, module = "alexnet"))
+  }
+}
diff --git a/dl/src/test/scala/com/intel/analytics/sparkdl/optim/EpochOptimizerSpec.scala b/dl/src/test/scala/com/intel/analytics/sparkdl/optim/EpochOptimizerSpec.scala
index 0284d54dff3..599fb1a0021 100644
--- a/dl/src/test/scala/com/intel/analytics/sparkdl/optim/EpochOptimizerSpec.scala
+++ b/dl/src/test/scala/com/intel/analytics/sparkdl/optim/EpochOptimizerSpec.scala
@@ -20,7 +20,7 @@ package com.intel.analytics.sparkdl.optim
 import com.intel.analytics.sparkdl.nn._
 import com.intel.analytics.sparkdl.ps.{AllReduceParameterManager, OneReduceParameterManager}
 import com.intel.analytics.sparkdl.tensor.{Storage, Tensor}
-import com.intel.analytics.sparkdl.utils.{Engine, T}
+import com.intel.analytics.sparkdl.utils.{RandomGenerator, Engine, T}
 import org.apache.log4j.{Level, Logger}
 import org.apache.spark.SparkContext
 import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers}
@@ -38,6 +38,7 @@ class EpochOptimizerSpec extends FlatSpec with Matchers with BeforeAndAfter {
   "An Artificial Neural Network with MSE and LBFGS" should "be trained with good result" in {
     Logger.getLogger("org").setLevel(Level.WARN)
     Logger.getLogger("akka").setLevel(Level.WARN)
+    RandomGenerator.RNG.setSeed(1000)
 
     sc = new SparkContext("local[1]", "SerialOptimizerSpec")
 
@@ -98,6 +99,7 @@ class EpochOptimizerSpec extends FlatSpec with Matchers with BeforeAndAfter {
     Logger.getLogger("org").setLevel(Level.WARN)
     Logger.getLogger("akka").setLevel(Level.WARN)
 
+    RandomGenerator.RNG.setSeed(1000)
     sc = new SparkContext("local[1]", "SerialOptimizerSpec")
 
     // Prepare two kinds of input and their corresponding label
diff --git a/mkl/jni/src/main/java/com/intel/analytics/sparkdl/mkl/MKL.java b/mkl/jni/src/main/java/com/intel/analytics/sparkdl/mkl/MKL.java
index 42e19c689b0..4e2796a95e1 100644
--- a/mkl/jni/src/main/java/com/intel/analytics/sparkdl/mkl/MKL.java
+++ b/mkl/jni/src/main/java/com/intel/analytics/sparkdl/mkl/MKL.java
@@ -83,4 +83,196 @@ private static File file(String path) throws IOException {
         String name = new File(path).getName();
         return createTempFile("jniloader", name);
     }
+
+    /* Convolution API: native (JNI) declarations in parallel Float and Double variants. */
+    public native static long ConvolutionInitFloat( // NOTE(review): returned long is presumably the classPtr the calls below take
+            int inputNumber, int inputChannel, int inputHeight, int inputWidth,
+            int kernelNumber, int kernelChannel, int kernelHeight, int kernelWidth,
+            int strideHeight, int strideWidth, int padHeight, int padWidth,
+            int dimension, int groups);
+    public native static void ConvolutionForwardFloat( // each array argument is paired with an int offset
+            float[] input, int inputOffset, float[] output, int outputOffset,
+            float[] kernel, int kernelOffset, float[] bias, int biasOffset, long classPtr);
+    public native static void ConvolutionBackwardDataFloat(
+            float[] input, int inputOffset, float[] gradOutput, int gradOutputOffset,
+            float[] gradInput, int gradInputOffset,
+            float[] kernel, int kernelOffset, float[] bias, int biasOffset, long classPtr);
+    public native static void ConvolutionBackwardKernelFloat(
+            float[] input, int inputOffset, float[] gradOutput, int gradOutputOffset,
+            float[] gradKernel, int gradKernelOffset,
+            float[] kernel, int kernelOffset, float[] bias, int biasOffset, long classPtr);
+    public native static void ConvolutionBackwardBiasFloat(
+            float[] input, int inputOffset, float[] gradOutput, int gradOutputOffset,
+            float[] gradBias, int gradBiasOffset,
+            float[] kernel, int kernelOffset, float[] bias, int biasOffset, long classPtr);
+
+    public native static long ConvolutionInitDouble( // same parameter list as ConvolutionInitFloat
+            int inputNumber, int inputChannel, int inputHeight, int inputWidth,
+            int kernelNumber, int kernelChannel, int kernelHeight, int kernelWidth,
+            int strideHeight, int strideWidth, int padHeight, int padWidth,
+            int dimension, int groups);
+    public native static void ConvolutionForwardDouble(
+            double[] input, int inputOffset, double[] output, int outputOffset,
+            double[] kernel, int kernelOffset, double[] bias, int biasOffset, long classPtr);
+    public native static void ConvolutionBackwardDataDouble(
+            double[] input, int inputOffset, double[] gradOutput, int gradOutputOffset,
+            double[] gradInput, int gradInputOffset,
+            double[] kernel, int kernelOffset, double[] bias, int biasOffset, long classPtr);
+    public native static void ConvolutionBackwardKernelDouble(
+            double[] input, int inputOffset, double[] gradOutput, int gradOutputOffset,
+            double[] gradKernel, int gradKernelOffset,
+            double[] kernel, int kernelOffset, double[] bias, int biasOffset, long classPtr);
+    public native static void ConvolutionBackwardBiasDouble(
+            double[] input, int inputOffset, double[] gradOutput, int gradOutputOffset,
+            double[] gradBias, int gradBiasOffset,
+            double[] kernel, int kernelOffset, double[] bias, int biasOffset, long classPtr);
+
+    /* ReLU API: native (JNI) declarations in parallel Float and Double variants. */
+    public native static long ReLUInitFloat( // NOTE(review): returned long is presumably the classPtr the calls below take
+            int inputNumber, int inputChannel, int inputHeight, int inputWidth, int dimension);
+    public native static void ReLUForwardFloat(
+            float[] input, int inputOffset, float[] output, int outputOffset, long classPtr);
+    public native static void ReLUBackwardFloat(
+            float[] input, int inputOffset, float[] gradOutput, int gradOutputOffset,
+            float[] gradInput, int gradInputOffset, long classPtr);
+
+    public native static long ReLUInitDouble( // same parameter list as ReLUInitFloat
+            int inputNumber, int inputChannel, int inputHeight, int inputWidth, int dimension);
+    public native static void ReLUForwardDouble(
+            double[] input, int inputOffset, double[] output, int outputOffset, long classPtr);
+    public native static void ReLUBackwardDouble(
+            double[] input, int inputOffset, double[] gradOutput, int gradOutputOffset,
+            double[] gradInput, int gradInputOffset, long classPtr);
+
+    /* Pooling API: native (JNI) declarations in parallel Float and Double variants. */
+    public native static long PoolingInitFloat( // NOTE(review): returned long is presumably the classPtr the calls below take
+        int inputNumber, int inputChannel, int inputHeight, int inputWidth,
+        int kernelHeight, int kernelWidth, int strideHeight, int strideWidth,
+        int padHeight, int padWidth, int dimension, int ceilMode,
+        int algorithm); // NOTE(review): valid ceilMode/algorithm codes are defined by the native side - confirm there
+    public native static void PoolingForwardFloat(
+        float[] input, int inputOffset, float[] output, int outputOffset,
+        long classPtr);
+    public native static void PoolingBackwardFloat(
+        float[] input, int inputOffset, float[] outputDiff,
+        int outputDiffOffset, float[] inputDiff, int inputDiffOffset,
+        long classPtr);
+
+    public native static long PoolingInitDouble( // same parameter list as PoolingInitFloat
+        int inputNumber, int inputChannel, int inputHeight, int inputWidth,
+        int kernelHeight, int kernelWidth, int strideHeight, int strideWidth,
+        int padHeight, int padWidth, int dimension, int ceilMode,
+        int algorithm);
+    public native static void PoolingForwardDouble(
+        double[] input, int inputOffset, double[] output, int outputOffset,
+        long classPtr);
+    public native static void PoolingBackwardDouble(
+        double[] input, int inputOffset, double[] outputDiff,
+        int outputDiffOffset, double[] inputDiff, int inputDiffOffset,
+        long classPtr);
+
+    /* Batch Normalization API: native (JNI) declarations in parallel Float and Double variants. */
+    public native static long BatchNormInitFloat( // NOTE(review): returned long is presumably the classPtr the calls below take
+            int inputNumber, int inputChannel, int inputHeight, int inputWidth,
+            double eps, int useKernel, int useBias, // eps is declared double in the Float variant as well
+            int dimension);
+    public native static void BatchNormForwardFloat(
+            float[] input, int inputOffset, float[] output, int outputOffset,
+            float[] kernel, int kernelOffset, float[] bias, int biasOffset, long classPtr);
+    public native static void BatchNormBackwardFloat(
+            float[] input, int inputOffset, float[] gradOutput, int gradOutputOffset,
+            float[] gradInput, int gradInputOffset,
+            float[] kernelDiff, int kernelDiffOffset, float[] biasDiff, int biasDiffOffset, long classPtr);
+
+    public native static long BatchNormInitDouble( // same parameter list as BatchNormInitFloat
+            int inputNumber, int inputChannel, int inputHeight, int inputWidth,
+            double eps, int useKernel, int useBias,
+            int dimension);
+    public native static void BatchNormForwardDouble(
+            double[] input, int inputOffset, double[] output, int outputOffset,
+            double[] kernel, int kernelOffset, double[] bias, int biasOffset, long classPtr);
+    public native static void BatchNormBackwardDouble(
+            double[] input, int inputOffset, double[] gradOutput, int gradOutputOffset,
+            double[] gradInput, int gradInputOffset,
+            double[] kernelDiff, int kernelDiffOffset, double[] biasDiff, int biasDiffOffset, long classPtr);
+
+    /* LRN API: native (JNI) declarations in parallel Float and Double variants. */
+    public native static long LRNInitFloat(int inputNumber, int inputChannel, int inputHeight, int inputWidth,
+                                           int size, float alpha, float beta, float k, int dimension);
+    public native static void LRNForwardFloat(float[] input, int inputOffset, float[] output, int outputOffset, long classPtr);
+    public native static void LRNBackwardFloat(float[] input, int inputOffset,
+                                               float[] outputDiff, int outputDiffOffset, // named like inputDiffOffset / the Pooling API
+                                               float[] inputDiff, int inputDiffOffset,
+                                               long classPtr);
+    public native static long LRNInitDouble(int inputNumber, int inputChannel, int inputHeight, int inputWidth,
+                                           int size, double alpha, double beta, double k, int dimension);
+    public native static void LRNForwardDouble(double[] input, int inputOffset, double[] output, int outputOffset, long classPtr);
+    public native static void LRNBackwardDouble(double[] input, int inputOffset,
+                                               double[] outputDiff, int outputDiffOffset, // named like inputDiffOffset / the Pooling API
+                                               double[] inputDiff, int inputDiffOffset,
+                                               long classPtr);
+
+
+    /* Init MKL Model. NOTE(review): presumably links two native layer handles (prev -> current); confirm in the native code. */
+    public native static void SetPrevFloat(long prev, long current);
+    public native static void SetPrevDouble(long prev, long current);
+    
+    /* Delete all memory allocated. NOTE(review): presumably the native memory owned by the given classPtr handle. */
+    public native static void ReleaseAllMemFloat(long classPtr);
+    public native static void ReleaseAllMemDouble(long classPtr);
+
+
+    // TODO(review): the original TODO does not say what is outstanding for the Linear API - clarify or remove.
+    /* Linear API: native (JNI) declarations in parallel Float and Double variants. */
+    public native static long LinearInitFloat( // NOTE(review): returned long is presumably the classPtr the calls below take
+            int inputHeight, int inputWidth, int outputChannel,
+            int kernelHeight, int kernelWidth);
+    public native static void LinearForwardFloat(
+            float[] input, int inputOffset, float[] output, int outputOffset,
+            float[] kernel, int kernelOffset, float[] bias, int biasOffset, long classPtr);
+    public native static void LinearBackwardDataFloat(
+            float[] input, int inputOffset, float[] gradOutput, int gradOutputOffset,
+            float[] gradInput, int gradInputOffset,
+            float[] kernel, int kernelOffset, float[] bias, int biasOffset, long classPtr);
+    public native static void LinearBackwardKernelFloat(
+            float[] input, int inputOffset, float[] gradOutput, int gradOutputOffset,
+            float[] gradKernel, int gradKernelOffset,
+            float[] kernel, int kernelOffset, float[] bias, int biasOffset, long classPtr);
+    public native static void LinearBackwardBiasFloat(
+            float[] input, int inputOffset, float[] gradOutput, int gradOutputOffset,
+            float[] gradBias, int gradBiasOffset,
+            float[] kernel, int kernelOffset, float[] bias, int biasOffset, long classPtr);
+
+    public native static long LinearInitDouble( // same parameter list as LinearInitFloat
+            int inputHeight, int inputWidth, int outputChannel,
+            int kernelHeight, int kernelWidth);
+    public native static void LinearForwardDouble(
+            double[] input, int inputOffset, double[] output, int outputOffset,
+            double[] kernel, int kernelOffset, double[] bias, int biasOffset, long classPtr);
+    public native static void LinearBackwardDataDouble(
+            double[] input, int inputOffset, double[] gradOutput, int gradOutputOffset,
+            double[] gradInput, int gradInputOffset,
+            double[] kernel, int kernelOffset, double[] bias, int biasOffset, long classPtr);
+    public native static void LinearBackwardKernelDouble(
+            double[] input, int inputOffset, double[] gradOutput, int gradOutputOffset,
+            double[] gradKernel, int gradKernelOffset,
+            double[] kernel, int kernelOffset, double[] bias, int biasOffset, long classPtr);
+    public native static void LinearBackwardBiasDouble(
+            double[] input, int inputOffset, double[] gradOutput, int gradOutputOffset,
+            double[] gradBias, int gradBiasOffset,
+            double[] kernel, int kernelOffset, double[] bias, int biasOffset, long classPtr);
+
+    /* Concat API: native (JNI) declarations; inputs are passed as an array of arrays with one offset per entry. */
+    public native static long ConcatInitFloat(int numChannels, int dimension, int[] size); // NOTE(review): result is presumably the classPtr used below
+    public native static void ConcatForwardFloat(float[][] input, int[] inputOffset, float[] output, int outputOffset, long classPtr);
+    public native static void ConcatBackwardFloat(float[][] gradInput, int[] gradInputOffset, float[] output, int outputOffset, long classPtr);
+    public native static long ConcatInitDouble(int numChannels, int dimension, int[] size);
+    public native static void ConcatForwardDouble(double[][] input, int[] inputOffset, double[] output, int outputOffset, long classPtr);
+    public native static void ConcatBackwardDouble(double[][] gradInput, int[] gradInputOffset, double[] output, int outputOffset, long classPtr);
+
+    /* Sum API: native (JNI) declarations; only Init and Forward are declared here (no backward entry point). */
+    public native static long SumInitFloat(int numChannels, int dimension, int[] size); // NOTE(review): result is presumably the classPtr used below
+    public native static void SumForwardFloat(float[][] input, int[] inputOffset, float[] output, int outputOffset, long classPtr);
+    public native static long SumInitDouble(int numChannels, int dimension, int[] size);
+    public native static void SumForwardDouble(double[][] input, int[] inputOffset, double[] output, int outputOffset, long classPtr);
 }
diff --git a/mkl/native/pom.xml b/mkl/native/pom.xml
index 3f695449888..bfe1c0bb6e5 100644
--- a/mkl/native/pom.xml
+++ b/mkl/native/pom.xml
@@ -46,7 +46,18 @@
                                 <source>
                                     <directory>${basedir}/src/main/c/jni</directory>
                                     <fileNames>
-                                        <fileName>mkl.c</fileName>
+                                        <fileName>omp_threads.cpp</fileName>
+                                        <fileName>layer.cpp</fileName>
+                                        <fileName>convolution.cpp</fileName>
+                                        <fileName>pooling.cpp</fileName>
+                                        <fileName>lrn.cpp</fileName>
+                                        <fileName>linear.cpp</fileName>
+                                        <fileName>relu.cpp</fileName>
+                                        <fileName>batch_norm.cpp</fileName>
+                                        <fileName>concat.cpp</fileName>
+                                        <fileName>sum.cpp</fileName>
+                                        <fileName>utils.cpp</fileName>
+                                        <fileName>debug.cpp</fileName>
                                     </fileNames>
                                 </source>
                             </sources>
@@ -63,7 +74,11 @@
                                 <compilerEndOption>-fPIC</compilerEndOption>
                                 <compilerEndOption>-fopenmp</compilerEndOption>
                                 <compilerEndOption>-Wall</compilerEndOption>
-                                <compilerEndOption>-std=c99</compilerEndOption>
+                                <compilerEndOption>-std=c++11</compilerEndOption>
+                                <!--
+                                <compilerEndOption>-DDEBUG</compilerEndOption>
+                                <compilerEndOption>-DPERF</compilerEndOption>
+                                -->
                             </compilerEndOptions>
                             <linkerStartOptions>
                                 <linkerStartOption>-I ${JAVA_HOME}/include/</linkerStartOption>
@@ -73,6 +88,8 @@
                                 <linkerMiddleOption>-lpthread</linkerMiddleOption>
                                 <linkerMiddleOption>-lm</linkerMiddleOption>
                                 <linkerMiddleOption>-lrt</linkerMiddleOption>
+                                <!-- mkl_rt: MKL runtime library needed by the native sources that call the MKL dnn* API -->
+                                <linkerMiddleOption>-lmkl_rt</linkerMiddleOption>
                             </linkerMiddleOptions>
                             <linkerEndOptions>
                                 <linkerEndOption>-shared</linkerEndOption>
diff --git a/mkl/native/src/main/c/jni/.clang-format b/mkl/native/src/main/c/jni/.clang-format
new file mode 100644
index 00000000000..4c24541ff91
--- /dev/null
+++ b/mkl/native/src/main/c/jni/.clang-format
@@ -0,0 +1,90 @@
+---
+Language:        Cpp
+BasedOnStyle:  llvm
+AccessModifierOffset: -1
+AlignAfterOpenBracket: Align
+# NOTE: this key used to appear twice (false here, true on the last lines of
+# the file); merged into one entry using the later value, presumably the intent.
+AlignConsecutiveAssignments: true
+AlignConsecutiveDeclarations: false
+AlignEscapedNewlinesLeft: true
+AlignOperands:   true
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: true
+AllowShortLoopsOnASingleLine: true
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: true
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterClass:      false
+  AfterControlStatement: false
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  BeforeCatch:     false
+  BeforeElse:      false
+  IndentBraces:    false
+BreakBeforeBinaryOperators: None
+BreakBeforeBraces: Linux
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: true
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: true
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+IncludeCategories:
+  - Regex:           '^<.*\.h>'
+    Priority:        1
+  - Regex:           '^<.*'
+    Priority:        2
+  - Regex:           '.*'
+    Priority:        3
+IndentCaseLabels: true
+IndentWidth:     2
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: false
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 2
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: false
+PenaltyBreakBeforeFirstCallParameter: 1
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 200
+PointerAlignment: Left
+ReflowComments:  true
+SortIncludes:    true
+SpaceAfterCStyleCast: false
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 2
+SpacesInAngles:  false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        Auto
+TabWidth:        8
+UseTab:          Never
diff --git a/mkl/native/src/main/c/jni/MKLWrapper.h b/mkl/native/src/main/c/jni/MKLWrapper.h
new file mode 100644
index 00000000000..5d75ddd5385
--- /dev/null
+++ b/mkl/native/src/main/c/jni/MKLWrapper.h
@@ -0,0 +1,527 @@
+#ifndef _MKLWARPPER_H
+#define _MKLWARPPER_H
+#include <mkl_dnn.h>
+#include <mkl_dnn_types.h>
+#include <mkl_service.h>
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnGroupsConvolutionCreateForwardBias(
+    dnnPrimitive_t *pConvolution, dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm, size_t groups, size_t dimension,
+    const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[],
+    const dnnBorder_t borderType)
+{
+  return dnnGroupsConvolutionCreateForwardBias_F32(
+      pConvolution, attributes, algorithm, groups, dimension, srcSize, dstSize,
+      filterSize, convolutionStrides, inputOffset, borderType);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnGroupsConvolutionCreateForwardBias<double>(
+    dnnPrimitive_t *pConvolution, dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm, size_t groups, size_t dimension,
+    const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[],
+    const dnnBorder_t borderType)
+{
+  return dnnGroupsConvolutionCreateForwardBias_F64(
+      pConvolution, attributes, algorithm, groups, dimension, srcSize, dstSize,
+      filterSize, convolutionStrides, inputOffset, borderType);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnGroupsConvolutionCreateBackwardData(
+    dnnPrimitive_t *pConvolution, dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm, size_t groups, size_t dimension,
+    const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[],
+    const dnnBorder_t borderType)
+{
+  return dnnGroupsConvolutionCreateBackwardData_F32(
+      pConvolution, attributes, algorithm, groups, dimension, srcSize, dstSize,
+      filterSize, convolutionStrides, inputOffset, borderType);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnGroupsConvolutionCreateBackwardData<double>(
+    dnnPrimitive_t *pConvolution, dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm, size_t groups, size_t dimension,
+    const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[],
+    const dnnBorder_t borderType)
+{
+  return dnnGroupsConvolutionCreateBackwardData_F64(
+      pConvolution, attributes, algorithm, groups, dimension, srcSize, dstSize,
+      filterSize, convolutionStrides, inputOffset, borderType);
+}
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnGroupsConvolutionCreateBackwardFilter(
+    dnnPrimitive_t *pConvolution, dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm, size_t groups, size_t dimension,
+    const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[],
+    const dnnBorder_t borderType)
+{
+  return dnnGroupsConvolutionCreateBackwardFilter_F32(
+      pConvolution, attributes, algorithm, groups, dimension, srcSize, dstSize,
+      filterSize, convolutionStrides, inputOffset, borderType);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnGroupsConvolutionCreateBackwardFilter<double>(
+    dnnPrimitive_t *pConvolution, dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm, size_t groups, size_t dimension,
+    const size_t srcSize[], const size_t dstSize[], const size_t filterSize[],
+    const size_t convolutionStrides[], const int inputOffset[],
+    const dnnBorder_t borderType)
+{
+  return dnnGroupsConvolutionCreateBackwardFilter_F64(
+      pConvolution, attributes, algorithm, groups, dimension, srcSize, dstSize,
+      filterSize, convolutionStrides, inputOffset, borderType);
+}
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnGroupsConvolutionCreateBackwardBias(
+    dnnPrimitive_t *pConvolution, dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm, size_t groups, size_t dimension,
+    const size_t dstSize[])
+{
+  return dnnGroupsConvolutionCreateBackwardBias_F32(
+      pConvolution, attributes, algorithm, groups, dimension, dstSize);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnGroupsConvolutionCreateBackwardBias<double>(
+    dnnPrimitive_t *pConvolution, dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t algorithm, size_t groups, size_t dimension,
+    const size_t dstSize[])
+{
+  return dnnGroupsConvolutionCreateBackwardBias_F64(
+      pConvolution, attributes, algorithm, groups, dimension, dstSize);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnExecute(dnnPrimitive_t primitive, void *resources[])
+{
+  return dnnExecute_F32(primitive, resources);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnExecute<double>(dnnPrimitive_t primitive, void *resources[])
+{
+  return dnnExecute_F64(primitive, resources);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnReLUCreateForward(dnnPrimitive_t *pRelu,
+                                dnnPrimitiveAttributes_t attributes,
+                                const dnnLayout_t dataLayout,
+                                Type negativeSlope)
+{
+  return dnnReLUCreateForward_F32(pRelu, attributes, dataLayout, negativeSlope);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnReLUCreateForward<double>(dnnPrimitive_t *pRelu,
+                                        dnnPrimitiveAttributes_t attributes,
+                                        const dnnLayout_t dataLayout,
+                                        double negativeSlope)
+{
+  return dnnReLUCreateForward_F64(pRelu, attributes, dataLayout, negativeSlope);
+}
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnReLUCreateBackward(dnnPrimitive_t *pRelu,
+                                 dnnPrimitiveAttributes_t attributes,
+                                 const dnnLayout_t diffLayout,
+                                 const dnnLayout_t dataLayout,
+                                 Type negativeSlope)
+{
+  return dnnReLUCreateBackward_F32(pRelu, attributes, diffLayout, dataLayout,
+                                   negativeSlope);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnReLUCreateBackward<double>(dnnPrimitive_t *pRelu,
+                                         dnnPrimitiveAttributes_t attributes,
+                                         const dnnLayout_t diffLayout,
+                                         const dnnLayout_t dataLayout,
+                                         double negativeSlope)
+{
+  return dnnReLUCreateBackward_F64(pRelu, attributes, diffLayout, dataLayout,
+                                   negativeSlope);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnLayoutCreate(dnnLayout_t *pLayout, size_t dimension,
+                           const size_t size[], const size_t strides[])
+{
+  return dnnLayoutCreate_F32(pLayout, dimension, size, strides);
+}
+
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnLayoutCreate<double>(dnnLayout_t *pLayout, size_t dimension,
+                                   const size_t size[], const size_t strides[])
+{
+  return dnnLayoutCreate_F64(pLayout, dimension, size, strides);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnPoolingCreateForward(
+    dnnPrimitive_t *pPooling, dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t op, const dnnLayout_t srcLayout, const size_t kernelSize[],
+    const size_t kernelStride[], const int inputOffset[],
+    const dnnBorder_t borderType)
+{
+  return dnnPoolingCreateForward_F32(pPooling, attributes, op, srcLayout,
+                                     kernelSize, kernelStride, inputOffset,
+                                     borderType);
+}
+
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnPoolingCreateForward<double>(
+    dnnPrimitive_t *pPooling, dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t op, const dnnLayout_t srcLayout, const size_t kernelSize[],
+    const size_t kernelStride[], const int inputOffset[],
+    const dnnBorder_t borderType)
+{
+  return dnnPoolingCreateForward_F64(pPooling, attributes, op, srcLayout,
+                                     kernelSize, kernelStride, inputOffset,
+                                     borderType);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnPoolingCreateBackward(
+    dnnPrimitive_t *pPooling, dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t op, const dnnLayout_t srcLayout, const size_t kernelSize[],
+    const size_t kernelStride[], const int inputOffset[],
+    const dnnBorder_t borderType)
+{
+  return dnnPoolingCreateBackward_F32(pPooling, attributes, op, srcLayout,
+                                      kernelSize, kernelStride, inputOffset,
+                                      borderType);
+}
+
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnPoolingCreateBackward<double>(
+    dnnPrimitive_t *pPooling, dnnPrimitiveAttributes_t attributes,
+    dnnAlgorithm_t op, const dnnLayout_t srcLayout, const size_t kernelSize[],
+    const size_t kernelStride[], const int inputOffset[],
+    const dnnBorder_t borderType)
+{
+  return dnnPoolingCreateBackward_F64(pPooling, attributes, op, srcLayout,
+                                      kernelSize, kernelStride, inputOffset,
+                                      borderType);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnLayoutCreateFromPrimitive(dnnLayout_t *pLayout,
+                                        const dnnPrimitive_t primitive,
+                                        dnnResourceType_t type)
+{
+  return dnnLayoutCreateFromPrimitive_F32(pLayout, primitive, type);
+}
+
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnLayoutCreateFromPrimitive<double>(dnnLayout_t *pLayout,
+                                                const dnnPrimitive_t primitive,
+                                                dnnResourceType_t type)
+{
+  return dnnLayoutCreateFromPrimitive_F64(pLayout, primitive, type);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnDelete(dnnPrimitive_t primitive)
+{
+  return dnnDelete_F32(primitive);
+}
+
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnDelete<double>(dnnPrimitive_t primitive)
+{
+  return dnnDelete_F64(primitive);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnLayoutDelete(dnnLayout_t layout)
+{
+  return dnnLayoutDelete_F32(layout);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnLayoutDelete<double>(dnnLayout_t layout)
+{
+  return dnnLayoutDelete_F64(layout);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+int dnnLayoutCompare(const dnnLayout_t L1, const dnnLayout_t L2)
+{
+  return dnnLayoutCompare_F32(L1, L2);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+int dnnLayoutCompare<double>(const dnnLayout_t L1, const dnnLayout_t L2)
+{
+  return dnnLayoutCompare_F64(L1, L2);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+size_t dnnLayoutGetMemorySize(const dnnLayout_t Layout)
+{
+  return dnnLayoutGetMemorySize_F32(Layout);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+size_t dnnLayoutGetMemorySize<double>(const dnnLayout_t Layout)
+{
+  return dnnLayoutGetMemorySize_F64(Layout);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnAllocateBuffer(void **pPtr, dnnLayout_t layout)
+{
+  return dnnAllocateBuffer_F32(pPtr, layout);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnAllocateBuffer<double>(void **pPtr, dnnLayout_t layout)
+{
+  return dnnAllocateBuffer_F64(pPtr, layout);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnConversionCreate(dnnPrimitive_t *pConversion,
+                               const dnnLayout_t from, const dnnLayout_t to)
+{
+  return dnnConversionCreate_F32(pConversion, from, to);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnConversionCreate<double>(dnnPrimitive_t *pConversion,
+                                       const dnnLayout_t from,
+                                       const dnnLayout_t to)
+{
+  return dnnConversionCreate_F64(pConversion, from, to);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnReleaseBuffer(void *pPtr)
+{
+  return dnnReleaseBuffer_F32(pPtr);
+}
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnReleaseBuffer<double>(void *pPtr)
+{
+  return dnnReleaseBuffer_F64(pPtr);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnBatchNormalizationCreateForward(
+    dnnPrimitive_t *pBatchNormalization, dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, Type eps)  // eps follows Type, like ReLU slope
+{
+  return dnnBatchNormalizationCreateForward_F32(pBatchNormalization, attributes,
+                                                dataLayout, eps);
+}
+
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnBatchNormalizationCreateForward<double>(
+    dnnPrimitive_t *pBatchNormalization, dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, double eps)  // no float truncation of eps
+{
+  return dnnBatchNormalizationCreateForward_F64(pBatchNormalization, attributes,
+                                                dataLayout, eps);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnBatchNormalizationCreateBackwardScaleShift(
+    dnnPrimitive_t *pBatchNormalization, dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, Type eps)  // eps follows Type, like ReLU slope
+{
+  return dnnBatchNormalizationCreateBackwardScaleShift_F32(
+      pBatchNormalization, attributes, dataLayout, eps);
+}
+
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnBatchNormalizationCreateBackwardScaleShift<double>(
+    dnnPrimitive_t *pBatchNormalization, dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, double eps)  // no float truncation of eps
+{
+  return dnnBatchNormalizationCreateBackwardScaleShift_F64(
+      pBatchNormalization, attributes, dataLayout, eps);
+}
+
+template <typename Type>  // primary template: forwards to the _F32 primitive
+dnnError_t dnnBatchNormalizationCreateBackwardData(
+    dnnPrimitive_t *pBatchNormalization, dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, Type eps)  // eps follows Type, like ReLU slope
+{
+  return dnnBatchNormalizationCreateBackwardData_F32(
+      pBatchNormalization, attributes, dataLayout, eps);
+}
+
+template <> inline  // _F64 path; inline: header may be included by several TUs
+dnnError_t dnnBatchNormalizationCreateBackwardData<double>(
+    dnnPrimitive_t *pBatchNormalization, dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, double eps)  // no float truncation of eps
+{
+  return dnnBatchNormalizationCreateBackwardData_F64(
+      pBatchNormalization, attributes, dataLayout, eps);
+}
+
+// Precision dispatcher for dnnLRNCreateForward: the primary template forwards
+// to the _F32 entry point, the <double> specialization to the _F64 one.
+// Arguments and status pass through unchanged.
+template <typename Type>
+dnnError_t dnnLRNCreateForward(dnnPrimitive_t *pLrn,
+                               dnnPrimitiveAttributes_t attributes,
+                               const dnnLayout_t dataLayout, size_t kernelSize,
+                               float alpha, float beta, float k)
+{
+  return dnnLRNCreateForward_F32(pLrn, attributes, dataLayout, kernelSize,
+                                 alpha, beta, k);
+}
+
+// inline: an explicit specialization is an ordinary function, so without it
+// every translation unit that includes this header emits its own definition.
+template <>
+inline dnnError_t dnnLRNCreateForward<double>(
+    dnnPrimitive_t *pLrn, dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t dataLayout, size_t kernelSize, float alpha, float beta,
+    float k)
+{
+  return dnnLRNCreateForward_F64(pLrn, attributes, dataLayout, kernelSize,
+                                 alpha, beta, k);
+}
+
+// Precision dispatcher for dnnLRNCreateBackward: the primary template forwards
+// to the _F32 entry point, the <double> specialization to the _F64 one.
+// Arguments and status pass through unchanged.
+template <typename Type>
+dnnError_t dnnLRNCreateBackward(dnnPrimitive_t *pLrn,
+                                dnnPrimitiveAttributes_t attributes,
+                                const dnnLayout_t diffLayout,
+                                const dnnLayout_t dataLayout, size_t kernelSize,
+                                float alpha, float beta, float k)
+{
+  return dnnLRNCreateBackward_F32(pLrn, attributes, diffLayout, dataLayout,
+                                  kernelSize, alpha, beta, k);
+}
+
+// inline: an explicit specialization is an ordinary function, so without it
+// every translation unit that includes this header emits its own definition.
+template <>
+inline dnnError_t dnnLRNCreateBackward<double>(
+    dnnPrimitive_t *pLrn, dnnPrimitiveAttributes_t attributes,
+    const dnnLayout_t diffLayout, const dnnLayout_t dataLayout,
+    size_t kernelSize, float alpha, float beta, float k)
+{
+  return dnnLRNCreateBackward_F64(pLrn, attributes, diffLayout, dataLayout,
+                                  kernelSize, alpha, beta, k);
+}
+
+// Precision dispatcher for dnnInnerProductCreateForwardBias: the primary
+// template forwards to the _F32 entry point, the <double> specialization to
+// the _F64 one. Arguments and status pass through unchanged.
+template <typename Type>
+dnnError_t dnnInnerProductCreateForwardBias(dnnPrimitive_t *pInnerProduct,
+                                            dnnPrimitiveAttributes_t attributes,
+                                            size_t dimensions,
+                                            const size_t srcSize[],
+                                            size_t outputChannels)
+{
+  return dnnInnerProductCreateForwardBias_F32(
+      pInnerProduct, attributes, dimensions, srcSize, outputChannels);
+}
+
+// inline: an explicit specialization is an ordinary function, so without it
+// every translation unit that includes this header emits its own definition.
+template <>
+inline dnnError_t dnnInnerProductCreateForwardBias<double>(
+    dnnPrimitive_t *pInnerProduct, dnnPrimitiveAttributes_t attributes,
+    size_t dimensions, const size_t srcSize[], size_t outputChannels)
+{
+  return dnnInnerProductCreateForwardBias_F64(
+      pInnerProduct, attributes, dimensions, srcSize, outputChannels);
+}
+
+// Precision dispatcher for dnnInnerProductCreateBackwardData: the primary
+// template forwards to the _F32 entry point, the <double> specialization to
+// the _F64 one. Arguments and status pass through unchanged.
+template <typename Type>
+dnnError_t dnnInnerProductCreateBackwardData(
+    dnnPrimitive_t *pInnerProduct, dnnPrimitiveAttributes_t attributes,
+    size_t dimensions, const size_t srcSize[], size_t outputChannels)
+{
+  return dnnInnerProductCreateBackwardData_F32(
+      pInnerProduct, attributes, dimensions, srcSize, outputChannels);
+}
+
+// inline: an explicit specialization is an ordinary function, so without it
+// every translation unit that includes this header emits its own definition.
+template <>
+inline dnnError_t dnnInnerProductCreateBackwardData<double>(
+    dnnPrimitive_t *pInnerProduct, dnnPrimitiveAttributes_t attributes,
+    size_t dimensions, const size_t srcSize[], size_t outputChannels)
+{
+  return dnnInnerProductCreateBackwardData_F64(
+      pInnerProduct, attributes, dimensions, srcSize, outputChannels);
+}
+// Precision dispatcher for dnnInnerProductCreateBackwardFilter: the primary
+// template forwards to the _F32 entry point, the <double> specialization to
+// the _F64 one. Arguments and status pass through unchanged.
+template <typename Type>
+dnnError_t dnnInnerProductCreateBackwardFilter(
+    dnnPrimitive_t *pInnerProduct, dnnPrimitiveAttributes_t attributes,
+    size_t dimensions, const size_t srcSize[], size_t outputChannels)
+{
+  return dnnInnerProductCreateBackwardFilter_F32(
+      pInnerProduct, attributes, dimensions, srcSize, outputChannels);
+}
+
+// inline: an explicit specialization is an ordinary function, so without it
+// every translation unit that includes this header emits its own definition.
+template <>
+inline dnnError_t dnnInnerProductCreateBackwardFilter<double>(
+    dnnPrimitive_t *pInnerProduct, dnnPrimitiveAttributes_t attributes,
+    size_t dimensions, const size_t srcSize[], size_t outputChannels)
+{
+  return dnnInnerProductCreateBackwardFilter_F64(
+      pInnerProduct, attributes, dimensions, srcSize, outputChannels);
+}
+// Precision dispatcher for dnnInnerProductCreateBackwardBias: the primary
+// template forwards to the _F32 entry point, the <double> specialization to
+// the _F64 one. Arguments and status pass through unchanged.
+template <typename Type>
+dnnError_t dnnInnerProductCreateBackwardBias(
+    dnnPrimitive_t *pInnerProduct, dnnPrimitiveAttributes_t attributes,
+    size_t dimensions, const size_t dstSize[])
+{
+  return dnnInnerProductCreateBackwardBias_F32(pInnerProduct, attributes,
+                                               dimensions, dstSize);
+}
+
+// inline: an explicit specialization is an ordinary function, so without it
+// every translation unit that includes this header emits its own definition.
+template <>
+inline dnnError_t dnnInnerProductCreateBackwardBias<double>(
+    dnnPrimitive_t *pInnerProduct, dnnPrimitiveAttributes_t attributes,
+    size_t dimensions, const size_t dstSize[])
+{
+  return dnnInnerProductCreateBackwardBias_F64(pInnerProduct, attributes,
+                                               dimensions, dstSize);
+}
+
+// Precision dispatcher for dnnConcatCreate: the primary template forwards to
+// the _F32 entry point, the <double> specialization to the _F64 one.
+// Arguments and status pass through unchanged.
+template <typename Type>
+dnnError_t dnnConcatCreate(dnnPrimitive_t *pConcat,
+                           dnnPrimitiveAttributes_t attributes,
+                           size_t nSrcTensors, dnnLayout_t *src)
+{
+  return dnnConcatCreate_F32(pConcat, attributes, nSrcTensors, src);
+}
+
+// inline: an explicit specialization is an ordinary function, so without it
+// every translation unit that includes this header emits its own definition.
+template <>
+inline dnnError_t dnnConcatCreate<double>(dnnPrimitive_t *pConcat,
+                                          dnnPrimitiveAttributes_t attributes,
+                                          size_t nSrcTensors, dnnLayout_t *src)
+{
+  return dnnConcatCreate_F64(pConcat, attributes, nSrcTensors, src);
+}
+
+// Precision dispatcher for dnnSplitCreate: the primary template forwards to
+// the _F32 entry point, the <double> specialization to the _F64 one.
+// Arguments and status pass through unchanged.
+template <typename Type>
+dnnError_t dnnSplitCreate(dnnPrimitive_t *pSplit,
+                          dnnPrimitiveAttributes_t attributes,
+                          const size_t nDstTensors, dnnLayout_t layout,
+                          size_t dstChannelSize[])
+{
+  return dnnSplitCreate_F32(pSplit, attributes, nDstTensors, layout,
+                            dstChannelSize);
+}
+
+// inline: an explicit specialization is an ordinary function, so without it
+// every translation unit that includes this header emits its own definition.
+template <>
+inline dnnError_t dnnSplitCreate<double>(dnnPrimitive_t *pSplit,
+                                         dnnPrimitiveAttributes_t attributes,
+                                         const size_t nDstTensors,
+                                         dnnLayout_t layout,
+                                         size_t dstChannelSize[])
+{
+  return dnnSplitCreate_F64(pSplit, attributes, nDstTensors, layout,
+                            dstChannelSize);
+}
+
+// Precision dispatcher for dnnSumCreate: the primary template forwards to the
+// _F32 entry point, the <double> specialization to the _F64 one. The
+// coefficient array has the element type selected by the template argument.
+template <typename Type>
+dnnError_t dnnSumCreate(
+  dnnPrimitive_t *pSum,
+  dnnPrimitiveAttributes_t attributes, const size_t nSummands,
+  dnnLayout_t layout, Type *coefficients)
+{
+  return dnnSumCreate_F32(pSum, attributes, nSummands, layout, coefficients);
+}
+
+// inline: an explicit specialization is an ordinary function, so without it
+// every translation unit that includes this header emits its own definition.
+template <>
+inline dnnError_t dnnSumCreate<double>(
+  dnnPrimitive_t *pSum,
+  dnnPrimitiveAttributes_t attributes, const size_t nSummands,
+  dnnLayout_t layout, double *coefficients)
+{
+  return dnnSumCreate_F64(pSum, attributes, nSummands, layout, coefficients);
+}
+#endif
diff --git a/mkl/native/src/main/c/jni/batch_norm.cpp b/mkl/native/src/main/c/jni/batch_norm.cpp
new file mode 100644
index 00000000000..c648e5c5ef1
--- /dev/null
+++ b/mkl/native/src/main/c/jni/batch_norm.cpp
@@ -0,0 +1,428 @@
+#include <jni.h>
+
+#include "debug.h"
+#include "layer.h"
+#include "memory.h"
+#include "utils.h"
+
+// Batch-normalization layer built on the MKL DNN primitives, parameterised on
+// the element type (instantiated for jfloat and jdouble at the bottom of this
+// file). Inherits the input/output/gradInput/gradOutput holders and the
+// forward/backward primitives from MKLLayer<DType>.
+template <typename DType>
+class MKLBatchNorm : public MKLLayer<DType>
+{
+ public:
+  MKLBatchNorm();
+  ~MKLBatchNorm();
+
+  // Records the tensor geometry and options and creates the user layouts.
+  // useKernel / useBias are treated as true when > 0.
+  void init(size_t inputNumber, size_t inputChannel, size_t inputHeight,
+            size_t inputWidth, double eps, int useKernel, int useBias,
+            int dimension);
+
+  // Forward pass; runs firstPass() on the first call.
+  void updateOutput(DType *input, DType *output);
+  // Backward pass; also writes gradKernel / gradBias when useKernel is set.
+  void updateGradInput(DType *input, DType *gradOutput, DType *gradInput);
+
+  // Store raw pointers only; the pointed-to memory is not copied or owned.
+  void setKernel(DType *ptr);
+  void setBias(DType *ptr);
+  void setGradKernel(DType *ptr);
+  void setGradBias(DType *ptr);
+
+ private:
+  // this method is not the same as createMklLayout in MKLMemory
+  void firstPass();
+  void preExecute(DType *input);
+
+  // MKL-side buffers: packed scale+shift values and the primitive's workspace.
+  std::shared_ptr<MKLData<DType>> scaleShift;
+  std::shared_ptr<MKLData<DType>> workspace;
+
+  // Stored innermost first: [0]=width, [1]=height, [2]=channel, [3]=number.
+  size_t inputSize[4];
+  size_t inputStrides[4];
+
+  size_t outputSize[4];
+  size_t outputStrides[4];
+
+  double eps;
+  bool useKernel;
+  bool useBias;
+
+  // Caller-owned arrays installed through the setters above.
+  DType *kernel;
+  DType *bias;
+  DType *gradKernel;
+  DType *gradBias;
+
+  // Backward scale/shift primitive; only created when useKernel is true.
+  dnnPrimitive_t scaleShiftPrim;
+};
+
+// Builds a layer that still needs init(): allocates the scale/shift and
+// workspace holders, sets eps to 1e-5 and clears every raw pointer and the
+// scale-shift primitive. useKernel / useBias are left for init() to set.
+template <typename DType>
+MKLBatchNorm<DType>::MKLBatchNorm()
+    : scaleShift(new MKLData<DType>),
+      workspace(new MKLData<DType>),
+      eps(0.00001),
+      kernel(NULL),
+      bias(NULL),
+      gradKernel(NULL),
+      gradBias(NULL),
+      scaleShiftPrim(NULL)
+{
+}
+
+// Releases the backward scale/shift primitive. firstPass() only creates it
+// when useKernel is true, so the handle may still be NULL here; in that case
+// there is nothing to delete and MKL is not called.
+template <typename DType>
+MKLBatchNorm<DType>::~MKLBatchNorm()
+{
+  if (scaleShiftPrim != NULL) {
+    dnnDelete<DType>(scaleShiftPrim);
+    scaleShiftPrim = NULL;
+  }
+}
+
+// Remembers where the caller's scale (kernel) values live; nothing is copied.
+template <typename DType>
+void MKLBatchNorm<DType>::setKernel(DType *ptr)
+{
+  this->kernel = ptr;
+}
+// Remembers where the caller's shift (bias) values live; nothing is copied.
+template <typename DType>
+void MKLBatchNorm<DType>::setBias(DType *ptr)
+{
+  this->bias = ptr;
+}
+// Remembers the caller's buffer that updateGradInput() fills with the kernel
+// gradient; nothing is copied.
+template <typename DType>
+void MKLBatchNorm<DType>::setGradKernel(DType *ptr)
+{
+  this->gradKernel = ptr;
+}
+// Remembers the caller's buffer that updateGradInput() fills with the bias
+// gradient; nothing is copied.
+template <typename DType>
+void MKLBatchNorm<DType>::setGradBias(DType *ptr)
+{
+  this->gradBias = ptr;
+}
+
+// Records the layer configuration and creates the user-side layouts.
+//
+// Sizes are stored innermost first (width, height, channel, number) and the
+// strides are the running product of the preceding sizes. The output has the
+// same geometry as the input. useKernel / useBias count as enabled when > 0.
+template <typename DType>
+void MKLBatchNorm<DType>::init(size_t inputNumber, size_t inputChannel,
+                               size_t inputHeight, size_t inputWidth,
+                               double eps, int useKernel, int useBias,
+                               int dimension)
+{
+  this->dimension = dimension;
+  this->eps       = eps;
+  this->useKernel = (useKernel > 0);
+  this->useBias   = (useBias > 0);
+
+  const size_t extents[4] = {inputWidth, inputHeight, inputChannel,
+                             inputNumber};
+
+  // input and output share sizes and strides, so fill both in one sweep
+  size_t runningStride = 1;
+  for (int d = 0; d < 4; ++d) {
+    inputSize[d]     = extents[d];
+    outputSize[d]    = extents[d];
+    inputStrides[d]  = runningStride;
+    outputStrides[d] = runningStride;
+    runningStride *= extents[d];
+  }
+
+  // user layouts for the forward tensors ...
+  this->input->createUsrLayout(dimension, inputSize, inputStrides);
+  this->output->createUsrLayout(dimension, outputSize, outputStrides);
+
+  // ... and for their gradients
+  this->gradInput->createUsrLayout(dimension, inputSize, inputStrides);
+  this->gradOutput->createUsrLayout(dimension, outputSize, outputStrides);
+}
+
+// One-time setup, run from updateOutput() while isFirstPass is set. Creates
+// the forward and backward-data primitives (plus the backward scale/shift one
+// when useKernel is set) from a temporary layout built out of
+// inputSize/inputStrides, then derives the MKL-side layout of every buffer
+// from those primitives. Each MKL status is checked with CHECK_EQ.
+template <typename DType>
+void MKLBatchNorm<DType>::firstPass()
+{
+  dnnError_t status = E_UNIMPLEMENTED;
+  dnnLayout_t layout;
+
+  status =
+      dnnLayoutCreate<DType>(&layout, this->dimension, inputSize, inputStrides);
+  CHECK_EQ(status, E_SUCCESS);
+
+  // forward
+  status = dnnBatchNormalizationCreateForward<DType>(&(this->forwardPrim), NULL,
+                                                     layout, eps);
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->input->createMklLayout(this->forwardPrim, dnnResourceSrc);
+  this->output->createMklLayout(this->forwardPrim, dnnResourceDst);
+
+  // backward data
+  status = dnnBatchNormalizationCreateBackwardData<DType>(&(this->backwardPrim),
+                                                          NULL, layout, eps);
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->gradOutput->createMklLayout(this->backwardPrim, dnnResourceDiffDst);
+  this->gradInput->createMklLayout(this->backwardPrim, dnnResourceDiffSrc);
+
+  // scaleshift: the buffer layout always comes from the forward primitive;
+  // the dedicated backward primitive is only needed when useKernel is set
+  this->scaleShift->createMklLayout(this->forwardPrim, dnnResourceScaleShift);
+  this->scaleShift->createConversion(true);
+  if (useKernel) {
+    status = dnnBatchNormalizationCreateBackwardScaleShift<DType>(
+        &scaleShiftPrim, NULL, layout, eps);
+    CHECK_EQ(status, E_SUCCESS);
+  }
+
+  // workspace
+  this->workspace->createMklLayout(this->forwardPrim, dnnResourceWorkspace);
+  this->workspace->createConversion(true);
+
+  // the layouts are created only once, on the first pass
+  this->isFirstPass = false;
+
+  // delete the temporary layout; the primitives no longer need it here
+  dnnLayoutDelete<DType>(layout);
+}
+
+// Shared prologue of the forward and backward passes: (re)creates the
+// conversion for the input holder. The pointer argument is not consulted.
+template <typename DType>
+void MKLBatchNorm<DType>::preExecute(DType *input)
+{
+  (void)input;  // unused
+  this->input->createConversion();
+}
+
+// Forward pass.
+//
+// Packs the scale/shift buffer as [scale for every channel | shift for every
+// channel], runs the forward primitive and, unless the output is consumed
+// directly by the next layer, converts the result back to the user layout.
+// With useKernel unset the scale is 1 and the shift 0; with useKernel set but
+// useBias unset the shift is 0.
+template <typename DType>
+void MKLBatchNorm<DType>::updateOutput(DType *input, DType *output)
+{
+  if (this->isFirstPass) {
+    firstPass();
+  }
+
+  // The user buffers can move between calls, so the conversions are rebuilt
+  // on every forward/backward.
+  // TODO Should we set the kernel and bias address every time?
+  preExecute(input);
+  this->output->createConversion();
+
+  const size_t channels = inputSize[2];
+  DType *scale          = reinterpret_cast<DType *>(scaleShift->getData());
+  DType *shift          = scale + channels;
+
+  for (size_t c = 0; c < channels; ++c) {
+    scale[c] = useKernel ? kernel[c] : static_cast<DType>(1.0);
+    shift[c] = (useKernel && useBias) ? bias[c] : static_cast<DType>(0);
+  }
+
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->input->getUsrData()),
+                   this->inputSize[3], this->inputSize[2], this->inputSize[1],
+                   this->inputSize[0], "Forward input");
+#endif
+
+  void *resources[dnnResourceNumber];
+  resources[dnnResourceSrc]        = this->input->getConvertedData();
+  resources[dnnResourceDst]        = this->output->getData();
+  resources[dnnResourceScaleShift] = scaleShift->getData();
+  resources[dnnResourceWorkspace]  = workspace->getData();
+
+  dnnError_t status;
+  PERFSTART();
+  status = dnnExecute<DType>(this->forwardPrim, resources);
+  PERFEND("main computing");
+  CHECK_EQ(status, E_SUCCESS);
+
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->output->getData()),
+                   outputSize[3], outputSize[2], outputSize[1], outputSize[0],
+                   "Forward output");
+#endif
+
+  // only convert back when the next layer does not take the MKL buffer
+  const bool consumedByNext = this->output->isUseNext();
+  if (!consumedByNext) {
+    this->output->backToUsr();
+  }
+}
+
+// Backward pass.
+//
+// Runs the backward-data primitive and, when useKernel is set, the backward
+// scale/shift primitive, whose result lands in the scaleShift buffer and is
+// copied out to gradKernel (first half) and, if useBias, gradBias (second
+// half). The input gradient is converted back to the user layout unless the
+// previous layer consumes it directly.
+template <typename DType>
+void MKLBatchNorm<DType>::updateGradInput(DType *input, DType *gradOutput,
+                                          DType *gradInput)
+{
+  preExecute(input);
+
+  this->gradOutput->createConversion();
+  this->gradInput->createConversion();
+
+  dnnError_t status;
+  void *resources[dnnResourceNumber];
+  resources[dnnResourceDiffDst]    = this->gradOutput->getConvertedData();
+  resources[dnnResourceDiffSrc]    = this->gradInput->getData();
+  resources[dnnResourceSrc]        = this->input->getConvertedData();
+  resources[dnnResourceScaleShift] = scaleShift->getData();
+  resources[dnnResourceWorkspace]  = workspace->getData();
+
+  // gradient with respect to the input
+  PERFSTART();
+  status = dnnExecute<DType>(this->backwardPrim, resources);
+  CHECK_EQ(status, E_SUCCESS);
+  PERFEND("main computing");
+
+  if (useKernel) {
+    void *diffRes[dnnResourceNumber];
+    diffRes[dnnResourceDiffDst]        = this->gradOutput->getConvertedData();
+    diffRes[dnnResourceSrc]            = this->input->getConvertedData();
+    diffRes[dnnResourceDiffScaleShift] = scaleShift->getData();
+    diffRes[dnnResourceWorkspace]      = workspace->getData();
+
+    // gradient with respect to scale and shift
+    PERFSTART();
+    status = dnnExecute<DType>(scaleShiftPrim, diffRes);
+    CHECK_EQ(status, E_SUCCESS);
+    PERFEND("weight and bias diff main computing");
+
+    const size_t channels = inputSize[2];
+    const DType *diff = reinterpret_cast<DType *>(scaleShift->getData());
+    for (size_t c = 0; c < channels; ++c) {
+      gradKernel[c] = diff[c];
+      if (useBias) {
+        gradBias[c] = diff[channels + c];
+      }
+    }
+  }
+
+  const bool consumedByPrev = this->gradInput->isUsePrev();
+  if (!consumedByPrev) {
+    this->gradInput->backToUsr();
+  }
+
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->gradInput->getUsrData()),
+                   inputSize[3], inputSize[2], inputSize[1], inputSize[0],
+                   "backward gradient input");
+#endif
+}
+
+// Allocates and initialises an MKLBatchNorm<DType> and returns its address as
+// an opaque handle for the Java side.
+//
+// The pointer is cast to jlong, the function's actual return type, rather
+// than to C long, which is narrower than a pointer on LLP64 platforms.
+// Nothing in this function frees the object; release must happen elsewhere.
+template <typename ArrayType, typename DType>
+jlong JNIBatchNormInit(JNIEnv *env, jclass thisClass, jint inputNumber,
+                       jint inputChannel, jint inputHeight, jint inputWidth,
+                       double eps, jint useKernel, jint useBias, jint dimension)
+{
+  MKLBatchNorm<DType> *ptr = new MKLBatchNorm<DType>();
+  ptr->init(inputNumber, inputChannel, inputHeight, inputWidth, eps, useKernel,
+            useBias, dimension);
+
+  return reinterpret_cast<jlong>(ptr);
+}
+
+// JNI-side forward entry point: wraps each Java array in a ZipArray helper
+// (defined elsewhere; presumably it pins the array for the wrapper's lifetime
+// - confirm), hands the raw kernel/bias pointers to the layer and runs
+// updateOutput().
+//
+// classPtr is the handle produced by JNIBatchNormInit. It is typed jlong, the
+// JNI type of a Java long, instead of the platform-dependent C long.
+template <typename ArrayType, typename DType>
+void JNIBatchNormUpdateOutput(JNIEnv *env, jclass thisClass, ArrayType input,
+                              jint inputOffset, ArrayType output,
+                              jint outputOffset, ArrayType kernel,
+                              jint kernelOffset, ArrayType bias,
+                              jint biasOffset, jlong classPtr)
+{
+  MKLBatchNorm<DType> *ptr = reinterpret_cast<MKLBatchNorm<DType> *>(classPtr);
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInput(
+      new ZipArray<ArrayType, DType>(env, input, inputOffset, ptr->input));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jOutput(
+      new ZipArray<ArrayType, DType>(env, output, outputOffset, ptr->output));
+
+  // kernel and bias have no MKLData holder, hence the NULL last argument
+  std::shared_ptr<ZipArray<ArrayType, DType>> jKernel(
+      new ZipArray<ArrayType, DType>(env, kernel, kernelOffset, NULL));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jBias(
+      new ZipArray<ArrayType, DType>(env, bias, biasOffset, NULL));
+
+  ptr->setKernel(jKernel->getPtr());
+  ptr->setBias(jBias->getPtr());
+
+  ptr->updateOutput(jInput->getPtr(), jOutput->getPtr());
+}
+
+// JNI-side backward entry point: wraps each Java array in a ZipArray helper
+// (defined elsewhere), installs the kernel/bias gradient destinations on the
+// layer and runs updateGradInput().
+//
+// classPtr is the handle produced by JNIBatchNormInit. It is typed jlong, the
+// JNI type of a Java long, instead of the platform-dependent C long.
+template <typename ArrayType, typename DType>
+void JNIBatchNormUpdateGradInput(JNIEnv *env, jclass thisClass, ArrayType input,
+                                 jint inputOffset, ArrayType outputDiff,
+                                 jint outputDiffOffset, ArrayType inputDiff,
+                                 jint inputDiffOffset, ArrayType kernelDiff,
+                                 jint kernelDiffOffset, ArrayType biasDiff,
+                                 jint biasDiffOffset, jlong classPtr)
+{
+  MKLBatchNorm<DType> *ptr = reinterpret_cast<MKLBatchNorm<DType> *>(classPtr);
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInput(
+      new ZipArray<ArrayType, DType>(env, input, inputOffset, ptr->input));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jOutputDiff(
+      new ZipArray<ArrayType, DType>(env, outputDiff, outputDiffOffset,
+                                     ptr->gradOutput));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInputDiff(
+      new ZipArray<ArrayType, DType>(env, inputDiff, inputDiffOffset,
+                                     ptr->gradInput));
+
+  // the parameter gradients have no MKLData holder, hence the NULL argument
+  std::shared_ptr<ZipArray<ArrayType, DType>> jKernelDiff(
+      new ZipArray<ArrayType, DType>(env, kernelDiff, kernelDiffOffset, NULL));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jBiasDiff(
+      new ZipArray<ArrayType, DType>(env, biasDiff, biasDiffOffset, NULL));
+
+  ptr->setGradKernel(jKernelDiff->getPtr());
+  ptr->setGradBias(jBiasDiff->getPtr());
+
+  ptr->updateGradInput(jInput->getPtr(), jOutputDiff->getPtr(),
+                       jInputDiff->getPtr());
+}
+
+// Macro
+// Generates the exported JNI function
+// Java_com_intel_analytics_sparkdl_mkl_MKL_BatchNormInit<DType>, which forwards
+// its arguments to JNIBatchNormInit<JArrayType, JType> and returns the handle.
+// DType is pasted into the symbol name; JType / JArrayType select the template
+// instantiation.
+#define BatchNormInit(DType, JType, JArrayType)                                \
+  JNIEXPORT                                                                    \
+  jlong JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_BatchNormInit##DType( \
+      JNIEnv *env, jclass thisClass, jint inputNumber, jint inputChannel,      \
+      jint inputHeight, jint inputWidth, jdouble eps, jint useKernel,          \
+      jint useBias, jint dimension)                                            \
+  {                                                                            \
+    return JNIBatchNormInit<JArrayType, JType>(                                \
+        env, thisClass, inputNumber, inputChannel, inputHeight, inputWidth,    \
+        eps, useKernel, useBias, dimension);                                   \
+  }
+
+// Generates the exported JNI function
+// Java_com_intel_analytics_sparkdl_mkl_MKL_BatchNormForward<DType>, which
+// forwards its arguments to JNIBatchNormUpdateOutput<JArrayType, JType>.
+// The layer handle is declared jlong, the JNI type of a Java long, so the
+// native signature does not depend on the width of C long.
+#define BatchNormForward(DType, JType, JArrayType)                            \
+  JNIEXPORT                                                                   \
+  void JNICALL                                                                \
+      Java_com_intel_analytics_sparkdl_mkl_MKL_BatchNormForward##DType(       \
+          JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,  \
+          JArrayType output, jint outputOffset, JArrayType kernel,            \
+          jint kernelOffset, JArrayType bias, jint biasOffset,                \
+          jlong classPtr)                                                     \
+  {                                                                           \
+    JNIBatchNormUpdateOutput<JArrayType, JType>(                              \
+        env, thisClass, input, inputOffset, output, outputOffset, kernel,     \
+        kernelOffset, bias, biasOffset, classPtr);                            \
+  }
+
+// Generates the exported JNI function
+// Java_com_intel_analytics_sparkdl_mkl_MKL_BatchNormBackward<DType>, which
+// forwards its arguments to JNIBatchNormUpdateGradInput<JArrayType, JType>.
+// The layer handle is declared jlong, the JNI type of a Java long, so the
+// native signature does not depend on the width of C long.
+#define BatchNormBackward(DType, JType, JArrayType)                           \
+  JNIEXPORT                                                                   \
+  void JNICALL                                                                \
+      Java_com_intel_analytics_sparkdl_mkl_MKL_BatchNormBackward##DType(      \
+          JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,  \
+          JArrayType outputDiff, jint outputDiffOffset, JArrayType inputDiff, \
+          jint inputDiffOffset, JArrayType kernelDiff, jint kernelDiffOffset, \
+          JArrayType biasDiff, jint biasDiffOffset, jlong classPtr)           \
+  {                                                                           \
+    JNIBatchNormUpdateGradInput<JArrayType, JType>(                           \
+        env, thisClass, input, inputOffset, outputDiff, outputDiffOffset,     \
+        inputDiff, inputDiffOffset, kernelDiff, kernelDiffOffset, biasDiff,   \
+        biasDiffOffset, classPtr);                                            \
+  }
+
+// Exported JNI entry points. extern "C" keeps the symbol names unmangled so
+// they match the Java_..._MKL_BatchNorm* names produced by the macros above.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// double: BatchNormInitDouble / BatchNormForwardDouble / BatchNormBackwardDouble
+BatchNormInit(Double, jdouble, jdoubleArray);
+BatchNormForward(Double, jdouble, jdoubleArray);
+BatchNormBackward(Double, jdouble, jdoubleArray);
+
+// float: BatchNormInitFloat / BatchNormForwardFloat / BatchNormBackwardFloat
+BatchNormInit(Float, jfloat, jfloatArray);
+BatchNormForward(Float, jfloat, jfloatArray);
+BatchNormBackward(Float, jfloat, jfloatArray);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/mkl/native/src/main/c/jni/concat.cpp b/mkl/native/src/main/c/jni/concat.cpp
new file mode 100644
index 00000000000..f3b8fb557f6
--- /dev/null
+++ b/mkl/native/src/main/c/jni/concat.cpp
@@ -0,0 +1,331 @@
+#include <stdio.h>
+#include <vector>
+
+#include "debug.h"
+#include "layer.h"
+#include "memory.h"
+#include "utils.h"
+
+using namespace std;
+
+// Concatenation layer built on the MKL DNN concat (forward) and split
+// (backward) primitives. Takes numConcats inputs and produces one output.
+template <typename DType>
+class MKLConcat : public MKLLayer<DType>
+{
+ public:
+  MKLConcat();
+  ~MKLConcat();
+
+  // size holds numConcats consecutive groups of `dimension` extents, one
+  // group per input.
+  void init(int numConcats, int dimension, int *size);
+
+  // input / gradInput are arrays of numConcats raw pointers.
+  void updateOutput(DType **input, DType *output);
+  void updateGradInput(DType **gradInput, DType *gradOutput);
+
+  // Attention: these two vectors hide the members of the same name declared
+  // in MKLLayer, because this layer needs one holder per input instead of a
+  // single one. output / gradOutput are still the inherited members.
+  vector<shared_ptr<MKLData<DType>>> input;
+  vector<shared_ptr<MKLData<DType>>> gradInput;
+
+ private:
+  // this method is not the same as createMklLayout in MKLMemory
+  void firstPass();
+  // NOTE(review): declared but no definition is visible in this file.
+  void preExecute(DType *input);
+
+  int numConcats;  // number of concats
+  // extent of each input along the concatenated axis; allocated in init() and
+  // released in the destructor
+  size_t *numSplits;
+};
+
+// Starts with no inputs and no split table; init() fills both in.
+// Initializers are listed in member declaration order.
+template <typename DType>
+MKLConcat<DType>::MKLConcat() : numConcats(0), numSplits(NULL)
+{
+}
+
+// Frees the split table allocated by init(); deleting NULL is a no-op when
+// init() was never called.
+template <typename DType>
+MKLConcat<DType>::~MKLConcat()
+{
+  delete[] this->numSplits;
+}
+
+// Records the configuration, creates one input/gradInput holder per
+// concatenated tensor and creates the user layouts.
+//
+// size holds numConcats consecutive groups of `dimension` extents, one group
+// per input. For each input the extent along the concatenated axis is stored
+// in numSplits and added to the total channel count. The output layout copies
+// the last input's extents, with index 2 replaced by that total.
+//
+// The scratch arrays are std::vector instead of variable-length arrays, which
+// are not standard C++.
+template <typename DType>
+void MKLConcat<DType>::init(int numConcats, int dimension, int *size)
+{
+  this->numConcats = numConcats;
+  this->dimension  = dimension;
+  this->numSplits  = new size_t[numConcats];
+
+  vector<size_t> inputSize(dimension);
+  vector<size_t> inputStrides(dimension);
+  vector<size_t> outputSize(dimension);
+  vector<size_t> outputStrides(dimension);
+
+  int offset      = 0;
+  size_t channels = 0;
+
+  for (int i = 0; i < numConcats; i++) {
+    input.push_back(shared_ptr<MKLData<DType>>(new MKLData<DType>));
+    gradInput.push_back(shared_ptr<MKLData<DType>>(new MKLData<DType>));
+
+    // Read this input's extents from `size`; every input must have the same
+    // number of dimensions.
+    inputStrides[0] = 1;
+    inputSize[0]    = size[offset];
+    for (int j = 1; j < dimension; j++) {
+      inputSize[j]    = size[offset + j];
+      inputStrides[j] = inputStrides[j - 1] * inputSize[j - 1];
+    }
+    offset += dimension;
+
+    // we must be sure that inputSize[2] is channels, or it will be 1
+    // if dimension == 2, which means there are only height and width. -> height
+    // if dimension >  2, which means there is channel in the tensor, -> channel
+    numSplits[i] = dimension <= 2 ? inputSize[1] : inputSize[2];
+    channels += numSplits[i];
+
+    this->input[i]->createUsrLayout(dimension, inputSize.data(),
+                                    inputStrides.data());
+    this->gradInput[i]->createUsrLayout(dimension, inputSize.data(),
+                                        inputStrides.data());
+  }
+
+  // The output matches the (last) input everywhere except the channel axis
+  // (index 2), which is the sum of all input channels. There is one output.
+  // NOTE(review): for dimension <= 2 numSplits is taken from index 1, but no
+  // output extent is replaced by the sum - confirm the intended 2-D behaviour.
+  outputStrides[0] = 1;
+  outputSize[0]    = inputSize[0];
+  for (int i = 1; i < dimension; i++) {
+    if (i == 2) {
+      outputSize[i] = channels;
+    } else {
+      outputSize[i] = inputSize[i];
+    }
+    outputStrides[i] = outputStrides[i - 1] * outputSize[i - 1];
+  }
+
+  this->output->createUsrLayout(dimension, outputSize.data(),
+                                outputStrides.data());
+  this->gradOutput->createUsrLayout(dimension, outputSize.data(),
+                                    outputStrides.data());
+}
+
+// One-time setup, run from updateOutput() while isFirstPass is set. Creates
+// the concat primitive from the inputs' user layouts and the split primitive
+// (used for the backward pass) from the gradOutput MKL layout, then derives
+// the MKL layouts of every buffer from those primitives.
+//
+// The temporary layout table is a std::vector, so it is released on every
+// exit path without a manual delete[].
+template <typename DType>
+void MKLConcat<DType>::firstPass()
+{
+  vector<dnnLayout_t> layouts(numConcats);
+
+  for (int i = 0; i < numConcats; i++) {
+    layouts[i] = this->input[i]->getUsrLayout();
+  }
+
+  dnnError_t status = E_UNIMPLEMENTED;
+  status = dnnConcatCreate<DType>(&(this->forwardPrim), NULL, numConcats,
+                                  layouts.data());
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->output->createMklLayout(this->forwardPrim, dnnResourceDst);
+  this->gradOutput->createMklLayout(this->forwardPrim, dnnResourceDst);
+
+  // backward
+  status = dnnSplitCreate<DType>(&(this->backwardPrim), NULL, numConcats,
+                                 this->gradOutput->getMklLayout(), numSplits);
+  CHECK_EQ(status, E_SUCCESS);
+
+  for (int i = 0; i < numConcats; i++) {
+    this->input[i]->createMklLayout(
+        this->forwardPrim, (dnnResourceType_t)(dnnResourceMultipleSrc + i));
+
+    // TODO comes from caffe, it's different with others (DiffSrc/DiffDst)
+    this->gradInput[i]->createMklLayout(
+        this->backwardPrim, (dnnResourceType_t)(dnnResourceMultipleDst + i));
+  }
+
+  this->isFirstPass = false;
+}
+
+// Forward pass: binds the numConcats user input pointers and the output
+// pointer to their holders, runs the concat primitive and, unless the output
+// is consumed directly by the next layer, converts it back to the user layout.
+//
+// The MKL status is checked with CHECK_EQ, as in firstPass(); it used to be
+// stored and then ignored.
+template <typename DType>
+void MKLConcat<DType>::updateOutput(DType **input, DType *output)
+{
+  if (this->isFirstPass) firstPass();
+
+  for (int i = 0; i < numConcats; i++) {
+    this->input[i]->setUsrData(input[i]);
+    this->input[i]->createConversion();
+  }
+  this->output->setUsrData(output);
+  this->output->createConversion();
+
+  dnnError_t status;
+  void *resources[dnnResourceNumber];
+
+  // input i goes into the i-th "multiple source" slot
+  for (int i = 0; i < numConcats; i++) {
+    resources[dnnResourceMultipleSrc + i] = this->input[i]->getConvertedData();
+  }
+  resources[dnnResourceDst] = this->output->getData();
+
+  PERFSTART();
+  status = dnnExecute<DType>(this->forwardPrim, resources);
+  PERFEND("main computing");
+  CHECK_EQ(status, E_SUCCESS);
+
+  if (!this->output->isUseNext()) this->output->backToUsr();
+}
+
+// Backward pass: binds the numConcats user gradInput pointers and the
+// gradOutput pointer to their holders, runs the split primitive and converts
+// each input gradient back to the user layout unless the previous layer
+// consumes it directly.
+//
+// The MKL status is checked with CHECK_EQ, as in firstPass(); it used to be
+// stored and then ignored.
+template <typename DType>
+void MKLConcat<DType>::updateGradInput(DType **gradInput, DType *gradOutput)
+{
+  for (int i = 0; i < numConcats; i++) {
+    this->gradInput[i]->setUsrData(gradInput[i]);
+    this->gradInput[i]->createConversion();
+  }
+  this->gradOutput->setUsrData(gradOutput);
+  this->gradOutput->createConversion();
+
+  dnnError_t status;
+  void *resources[dnnResourceNumber];
+
+  // gradient i comes out of the i-th "multiple destination" slot
+  for (int i = 0; i < numConcats; i++) {
+    resources[dnnResourceMultipleDst + i] = this->gradInput[i]->getData();
+  }
+  resources[dnnResourceSrc] = this->gradOutput->getConvertedData();
+
+  PERFSTART();
+  status = dnnExecute<DType>(this->backwardPrim, resources);
+  PERFEND("main computing");
+  CHECK_EQ(status, E_SUCCESS);
+
+  for (int i = 0; i < numConcats; i++) {
+    if (!this->gradInput[i]->isUsePrev()) this->gradInput[i]->backToUsr();
+  }
+}
+
+// Allocates and initialises an MKLConcat<DType> from the Java size array and
+// returns its address as an opaque handle.
+//
+// Returns 0 when the size array cannot be accessed. The array is only read,
+// so it is released with JNI_ABORT (no copy-back). The pointer is cast to
+// jlong, the function's actual return type, rather than to C long.
+template <typename ArrayType, typename DType>
+jlong JNIConcatInit(JNIEnv *env, jclass thisClass, int numConcats,
+                    int dimension, jintArray size)
+{
+  jint *jSize =
+      reinterpret_cast<jint *>(env->GetPrimitiveArrayCritical(size, 0));
+  if (jSize == NULL) {
+    // GetPrimitiveArrayCritical failed; there is nothing to initialise from
+    return 0;
+  }
+
+  MKLConcat<DType> *ptr = new MKLConcat<DType>();
+  ptr->init(numConcats, dimension, jSize);
+  env->ReleasePrimitiveArrayCritical(size, jSize, JNI_ABORT);
+
+  return reinterpret_cast<jlong>(ptr);
+}
+
+// JNI-side forward entry point for the concat layer.
+//
+// input is a Java array of primitive arrays and inputOffset gives the start
+// offset inside each of them. Every ordinary JNI call (GetArrayLength,
+// GetObjectArrayElement) is made BEFORE the first GetPrimitiveArrayCritical,
+// because JNI forbids calling other JNI functions inside a critical region.
+// std::vector replaces the variable-length arrays, which are not standard
+// C++. classPtr is typed jlong, the JNI type of a Java long.
+template <typename ArrayType, typename DType>
+void JNIConcatUpdateOutput(JNIEnv *env, jclass thisClass, jobjectArray input,
+                           jintArray inputOffset, ArrayType output,
+                           jint outputOffset, jlong classPtr)
+{
+  MKLConcat<DType> *ptr = reinterpret_cast<MKLConcat<DType> *>(classPtr);
+
+  // 1. collect the array references while no critical region is open
+  const int len = env->GetArrayLength(input);
+  std::vector<ArrayType> jInputArr(len);
+  for (int i = 0; i < len; i++) {
+    jInputArr[i] = (ArrayType)(env->GetObjectArrayElement(input, i));
+  }
+
+  std::vector<DType *> inputArrStart(len);
+  std::vector<DType *> inputArr(len);
+
+  // 2. enter the critical regions: offsets first, then every input array
+  jint *jInputOffset =
+      reinterpret_cast<jint *>(env->GetPrimitiveArrayCritical(inputOffset, 0));
+
+  for (int i = 0; i < len; i++) {
+    inputArrStart[i] = reinterpret_cast<DType *>(
+        env->GetPrimitiveArrayCritical(jInputArr[i], 0));
+    inputArr[i] = inputArrStart[i] + jInputOffset[i];
+  }
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jOutput(
+      new ZipArray<ArrayType, DType>(env, output, outputOffset, ptr->output));
+
+  ptr->updateOutput(inputArr.data(), jOutput->getPtr());
+
+  // 3. leave the critical regions
+  for (int i = 0; i < len; i++) {
+    env->ReleasePrimitiveArrayCritical(jInputArr[i], inputArrStart[i], 0);
+  }
+
+  env->ReleasePrimitiveArrayCritical(inputOffset, jInputOffset, 0);
+}
+
+// JNI-side backward entry point for the concat layer.
+//
+// inputDiff is a Java array of primitive arrays and inputDiffOffset gives the
+// start offset inside each of them. Every ordinary JNI call (GetArrayLength,
+// GetObjectArrayElement) is made BEFORE the first GetPrimitiveArrayCritical,
+// because JNI forbids calling other JNI functions inside a critical region.
+// std::vector replaces the variable-length arrays, which are not standard
+// C++. classPtr is typed jlong, the JNI type of a Java long.
+template <typename ArrayType, typename DType>
+void JNIConcatUpdateGradInput(JNIEnv *env, jclass thisClass,
+                              jobjectArray inputDiff, jintArray inputDiffOffset,
+                              ArrayType outputDiff, jint outputDiffOffset,
+                              jlong classPtr)
+{
+  MKLConcat<DType> *ptr = reinterpret_cast<MKLConcat<DType> *>(classPtr);
+
+  // 1. collect the array references while no critical region is open
+  const int len = env->GetArrayLength(inputDiff);
+  std::vector<ArrayType> jInputDiffArr(len);
+  for (int i = 0; i < len; i++) {
+    jInputDiffArr[i] = (ArrayType)(env->GetObjectArrayElement(inputDiff, i));
+  }
+
+  std::vector<DType *> inputDiffArrStart(len);
+  std::vector<DType *> inputDiffArr(len);
+
+  // 2. enter the critical regions: offsets first, then every gradient array
+  jint *jInputDiffOffset = reinterpret_cast<jint *>(
+      env->GetPrimitiveArrayCritical(inputDiffOffset, 0));
+
+  for (int i = 0; i < len; i++) {
+    inputDiffArrStart[i] = reinterpret_cast<DType *>(
+        env->GetPrimitiveArrayCritical(jInputDiffArr[i], 0));
+    inputDiffArr[i] = inputDiffArrStart[i] + jInputDiffOffset[i];
+  }
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jOutputDiff(
+      new ZipArray<ArrayType, DType>(env, outputDiff, outputDiffOffset,
+                                     ptr->gradOutput));
+
+  ptr->updateGradInput(inputDiffArr.data(), jOutputDiff->getPtr());
+
+  // 3. leave the critical regions; mode 0 copies the results back if needed
+  for (int i = 0; i < len; i++) {
+    env->ReleasePrimitiveArrayCritical(jInputDiffArr[i], inputDiffArrStart[i],
+                                       0);
+  }
+
+  env->ReleasePrimitiveArrayCritical(inputDiffOffset, jInputDiffOffset, 0);
+}
+
+// Macro
+// Generates the exported JNI function
+// Java_com_intel_analytics_sparkdl_mkl_MKL_ConcatInit<DType>, which forwards
+// its arguments to JNIConcatInit<JArrayType, JType> and returns the handle.
+// DType is pasted into the symbol name; JType / JArrayType select the template
+// instantiation.
+#define ConcatInit(DType, JType, JArrayType)                                \
+  JNIEXPORT                                                                 \
+  jlong JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_ConcatInit##DType( \
+      JNIEnv *env, jclass thisClass, jint numConcats, jint dimension,       \
+      jintArray size)                                                       \
+  {                                                                         \
+    return JNIConcatInit<JArrayType, JType>(env, thisClass, numConcats,     \
+                                            dimension, size);               \
+  }
+
+// Generates the exported JNI function
+// Java_com_intel_analytics_sparkdl_mkl_MKL_ConcatForward<DType>, which
+// forwards its arguments to JNIConcatUpdateOutput<JArrayType, JType>.
+// The layer handle is declared jlong, the JNI type of a Java long, so the
+// native signature does not depend on the width of C long.
+#define ConcatForward(DType, JType, JArrayType)                               \
+  JNIEXPORT                                                                   \
+  void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_ConcatForward##DType( \
+      JNIEnv *env, jclass thisClass, jobjectArray input,                      \
+      jintArray inputOffset, JArrayType output, jint outputOffset,            \
+      jlong classPtr)                                                         \
+  {                                                                           \
+    JNIConcatUpdateOutput<JArrayType, JType>(                                 \
+        env, thisClass, input, inputOffset, output, outputOffset, classPtr);  \
+  }
+
+// Generates the exported JNI function
+// Java_com_intel_analytics_sparkdl_mkl_MKL_ConcatBackward<DType>, which
+// forwards its arguments to JNIConcatUpdateGradInput<JArrayType, JType>.
+// The layer handle is declared jlong, the JNI type of a Java long, so the
+// native signature does not depend on the width of C long.
+#define ConcatBackward(DType, JType, JArrayType)                               \
+  JNIEXPORT                                                                    \
+  void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_ConcatBackward##DType( \
+      JNIEnv *env, jclass thisClass, jobjectArray inputDiff,                   \
+      jintArray inputDiffOffset, JArrayType outputDiff, jint outputDiffOffset, \
+      jlong classPtr)                                                          \
+  {                                                                            \
+    JNIConcatUpdateGradInput<JArrayType, JType>(env, thisClass, inputDiff,     \
+                                                inputDiffOffset, outputDiff,   \
+                                                outputDiffOffset, classPtr);   \
+  }
+
+// Exported JNI entry points. extern "C" keeps the symbol names unmangled so
+// they match the Java_..._MKL_Concat* names produced by the macros above.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Double: ConcatInitDouble / ConcatForwardDouble / ConcatBackwardDouble
+ConcatInit(Double, jdouble, jdoubleArray);
+ConcatForward(Double, jdouble, jdoubleArray);
+ConcatBackward(Double, jdouble, jdoubleArray);
+
+// Float: ConcatInitFloat / ConcatForwardFloat / ConcatBackwardFloat
+ConcatInit(Float, jfloat, jfloatArray);
+ConcatForward(Float, jfloat, jfloatArray);
+ConcatBackward(Float, jfloat, jfloatArray);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/mkl/native/src/main/c/jni/convolution.cpp b/mkl/native/src/main/c/jni/convolution.cpp
new file mode 100644
index 00000000000..36c821ba7aa
--- /dev/null
+++ b/mkl/native/src/main/c/jni/convolution.cpp
@@ -0,0 +1,580 @@
+#include <jni.h>
+
+#include "debug.h"
+#include "layer.h"
+#include "memory.h"
+#include "utils.h"
+
+static int getMKLBuildDate()
+{
+  // Cached across calls; 0 means the MKL version was not queried yet.
+  static int cachedBuild = 0;
+  if (cachedBuild != 0) return cachedBuild;
+  MKLVersion version;
+  mkl_get_version(&version);
+  cachedBuild = atoi(version.Build);
+  return cachedBuild;
+}
+
+template <typename DType>
+class MKLConvolution : public MKLLayer<DType>  // convolution on top of MKLLayer
+{
+ public:
+  MKLConvolution();
+  ~MKLConvolution();
+
+  void init(size_t inputNumber, size_t inputChannel, size_t inputHeight,
+            size_t inputWidth, size_t kernelNumber, size_t kernelChannel,
+            size_t kernelHeight, size_t kernelWidth, size_t strideHeight,
+            size_t strideWidth, int padHeight, int padWidth, int dimension,
+            int groups);  // records the geometry and creates the user layouts
+
+  void updateOutput(DType *input, DType *output);  // forward pass
+  void updateGradInput(DType *input, DType *gradOutput, DType *gradInput);
+  void updateGradKernel(DType *input, DType *gradOutput, DType *gradKernel);
+  void updateGradBias(DType *input, DType *gradOutput, DType *gradBias);
+
+  std::shared_ptr<MKLData<DType>> kernel;  // filter resource of the primitives
+  std::shared_ptr<MKLData<DType>> bias;    // bias resource of the forward pass
+
+  std::shared_ptr<MKLData<DType>> gradKernel;  // result of updateGradKernel
+  std::shared_ptr<MKLData<DType>> gradBias;    // result of updateGradBias
+
+ private:
+  // creates primitives and MKL layouts; not the createMklLayout of MKLMemory
+  void firstPass();
+  void preExecute(DType *input);  // re-creates input/kernel/bias conversions
+
+  DType *kernelAdr;  // set to NULL by the constructor; unused in this file
+  DType *biasAdr;    // set to NULL by the constructor; unused in this file
+
+  dnnPrimitive_t kernelPrim, biasPrim;  // backward-filter / backward-bias
+
+  size_t groups;  // group count passed to init()
+
+  size_t inputSize[4];     // NOTE(review): hides MKLLayer::inputSize
+  size_t inputStrides[4];  // NOTE(review): hides MKLLayer::inputStrides
+
+  size_t outputSize[4];  // width, height, kernelNumber, inputNumber
+  size_t outputStrides[4];
+
+  size_t kernelDimension;  // dimension, plus one when groups != 1 (see init)
+  size_t kernelSize[5];    // the fifth entry holds the MKL group count
+  size_t kernelStrides[5];
+
+  size_t biasSize[1];  // kernelNumber
+  size_t biasStrides[1];
+
+  size_t stride[2];  // width, height
+  int pad[2];  // negated padding: -padWidth, -padHeight
+};
+
+template <typename DType>
+MKLConvolution<DType>::MKLConvolution()
+    : kernel(new MKLData<DType>),  // every parameter gets its own MKLData
+      bias(new MKLData<DType>),
+      gradKernel(new MKLData<DType>),
+      gradBias(new MKLData<DType>),
+      kernelAdr(NULL),
+      biasAdr(NULL),
+      kernelPrim(NULL),  // the primitives are created later, in firstPass()
+      biasPrim(NULL)
+{
+}
+// Releases the backward-filter and backward-bias primitives, if created.
+template <typename DType>
+MKLConvolution<DType>::~MKLConvolution()
+{
+  if (kernelPrim) dnnDelete<DType>(kernelPrim);  // NULL until firstPass() ran
+  if (biasPrim) dnnDelete<DType>(biasPrim);
+}
+// Records the layer geometry and creates the user layouts of all buffers.
+template <typename DType>
+void MKLConvolution<DType>::init(size_t inputNumber, size_t inputChannel,
+                                 size_t inputHeight, size_t inputWidth,
+                                 size_t kernelNumber, size_t kernelChannel,
+                                 size_t kernelHeight, size_t kernelWidth,
+                                 size_t strideHeight, size_t strideWidth,
+                                 int padHeight, int padWidth, int dimension,
+                                 int groups)
+{
+  this->dimension = dimension;
+  this->groups    = groups;  // NOTE(review): 0 would divide by zero below
+
+  inputSize[0] = inputWidth;  // sizes are listed innermost dimension first
+  inputSize[1] = inputHeight;
+  inputSize[2] = inputChannel;
+  inputSize[3] = inputNumber;
+
+  inputStrides[0] = 1;  // each stride is the product of the inner sizes
+  for (int i        = 1; i < 4; i++)
+    inputStrides[i] = inputStrides[i - 1] * inputSize[i - 1];
+
+  size_t outputWidth =
+      computeOut(inputWidth, padWidth, kernelWidth, strideWidth, false);
+  size_t outputHeight =
+      computeOut(inputHeight, padHeight, kernelHeight, strideHeight, false);
+
+  // The number of output channels equals the number of kernels,
+  // and the output batch size equals the input batch size.
+  outputSize[0] = outputWidth;
+  outputSize[1] = outputHeight;
+  outputSize[2] = kernelNumber;
+  outputSize[3] = inputNumber;
+
+  outputStrides[0] = 1;
+  for (int i         = 1; i < 4; i++)
+    outputStrides[i] = outputStrides[i - 1] * outputSize[i - 1];
+
+  // Taken from IntelCaffe; builds before 20160701 get no group dimension.
+  size_t groupsMKL = groups;
+  kernelDimension  = this->dimension + (groups != 1);
+  if (getMKLBuildDate() < 20160701) {
+    kernelDimension = this->dimension;
+    groupsMKL       = 1;
+  }
+
+  kernelSize[0] = kernelWidth;
+  kernelSize[1] = kernelHeight;
+  kernelSize[2] = kernelChannel / groups;
+  kernelSize[3] = kernelNumber / groupsMKL;
+  kernelSize[4] = groupsMKL;
+
+  kernelStrides[0] = 1;
+  for (int i         = 1; i < 5; i++)
+    kernelStrides[i] = kernelStrides[i - 1] * kernelSize[i - 1];
+
+  biasSize[0]    = kernelNumber;
+  biasStrides[0] = 1;
+
+  stride[0] = strideWidth;
+  stride[1] = strideHeight;
+
+  pad[0] = -padWidth;  // the padding is stored negated, width first
+  pad[1] = -padHeight;
+
+  // create the user layouts of the forward buffers and the parameters
+  this->input->createUsrLayout(dimension, inputSize, inputStrides);
+  this->output->createUsrLayout(dimension, outputSize, outputStrides);
+  this->kernel->createUsrLayout(kernelDimension, kernelSize, kernelStrides);
+  this->bias->createUsrLayout(1, biasSize, biasStrides);
+
+  this->gradInput->createUsrLayout(dimension, inputSize, inputStrides);
+  this->gradOutput->createUsrLayout(dimension, outputSize, outputStrides);
+  this->gradKernel->createUsrLayout(kernelDimension, kernelSize, kernelStrides);
+  // bias dimension is 1
+  this->gradBias->createUsrLayout(1, biasSize, biasStrides);
+}
+// Creates the four MKL primitives and the MKL layouts derived from them.
+template <typename DType>
+void MKLConvolution<DType>::firstPass()
+{
+  dnnError_t status = E_UNIMPLEMENTED;
+  // forward (convolution with bias)
+  status = dnnGroupsConvolutionCreateForwardBias<DType>(
+      &(this->forwardPrim), NULL, dnnAlgorithmConvolutionDirect, groups,
+      this->dimension, inputSize, outputSize, kernelSize, stride, pad,
+      dnnBorderZeros);
+  CHECK_EQ(status, E_SUCCESS);  // a failure is only logged as a warning
+
+  this->input->createMklLayout(this->forwardPrim, dnnResourceSrc);
+  this->output->createMklLayout(this->forwardPrim, dnnResourceDst);
+  this->kernel->createMklLayout(this->forwardPrim, dnnResourceFilter);
+  this->bias->createMklLayout(this->forwardPrim, dnnResourceBias);
+
+  // backward data (gradient with respect to the input)
+  status = dnnGroupsConvolutionCreateBackwardData<DType>(
+      &(this->backwardPrim), NULL, dnnAlgorithmConvolutionDirect, groups,
+      this->dimension, inputSize, outputSize, kernelSize, stride, pad,
+      dnnBorderZeros);
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->gradOutput->createMklLayout(this->backwardPrim, dnnResourceDiffDst);
+  this->gradInput->createMklLayout(this->backwardPrim, dnnResourceDiffSrc);
+
+  // backward kernel (gradient with respect to the filter)
+  status = dnnGroupsConvolutionCreateBackwardFilter<DType>(
+      &kernelPrim, NULL, dnnAlgorithmConvolutionDirect, groups, this->dimension,
+      inputSize, outputSize, kernelSize, stride, pad, dnnBorderZeros);
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->gradKernel->createMklLayout(this->kernelPrim, dnnResourceDiffFilter);
+
+  // backward bias (gradient with respect to the bias)
+  status = dnnGroupsConvolutionCreateBackwardBias<DType>(
+      &biasPrim, NULL, dnnAlgorithmConvolutionDirect, groups, this->dimension,
+      outputSize);
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->gradBias->createMklLayout(this->biasPrim, dnnResourceDiffBias);
+
+  // the primitives and layouts are created only once
+  this->isFirstPass = false;
+}
+// Calls createConversion() on input, kernel and bias; `input` is not read.
+template <typename DType>
+void MKLConvolution<DType>::preExecute(DType *input)
+{
+  this->input->createConversion();
+  this->kernel->createConversion();
+  this->bias->createConversion();
+}
+// Forward pass; the data is taken from the layer's MKLData members.
+template <typename DType>
+void MKLConvolution<DType>::updateOutput(DType *input, DType *output)
+{
+  if (this->isFirstPass) firstPass();  // primitives/layouts are built once
+
+  // The array addresses can change on every call, so the conversions are
+  // created again on every forward/backward pass.
+  // TODO Should we set the kernel and bias address every time?
+  preExecute(input);
+  this->output->createConversion();
+
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->input->getUsrData()),
+                   this->inputSize[3], this->inputSize[2], this->inputSize[1],
+                   this->inputSize[0], "Forward input");
+#endif
+
+  dnnError_t status;
+  void *resources[dnnResourceNumber];  // only the slots assigned below are set
+
+  resources[dnnResourceFilter] = this->kernel->getConvertedData();
+  resources[dnnResourceBias]   = this->bias->getConvertedData();
+  resources[dnnResourceSrc]    = this->input->getConvertedData();
+  resources[dnnResourceDst]    = this->output->getData();
+
+  PERFSTART();
+  status = dnnExecute<DType>(this->forwardPrim, resources);
+  PERFEND("main computing");
+  CHECK_EQ(status, E_SUCCESS);  // a failure is only logged as a warning
+
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->output->getData()),
+                   outputSize[3], outputSize[2], outputSize[1], outputSize[0],
+                   "Forward output");
+#endif
+
+  if (!this->output->isUseNext()) {  // skipped when the useNext flag is set
+    this->output->backToUsr();
+  }
+}
+// Backward-data pass. NOTE(review): relies on firstPass() having run before.
+template <typename DType>
+void MKLConvolution<DType>::updateGradInput(DType *input, DType *gradOutput,
+                                            DType *gradInput)
+{
+  dnnError_t status;
+  void *resources[dnnResourceNumber];  // only the slots assigned below are set
+
+  preExecute(input);
+
+  this->gradOutput->createConversion();
+  this->gradInput->createConversion();
+
+  resources[dnnResourceDiffDst] = this->gradOutput->getConvertedData();
+  resources[dnnResourceFilter]  = this->kernel->getConvertedData();
+  resources[dnnResourceDiffSrc] = this->gradInput->getData();
+
+  // main computation: execute the backward-data primitive
+  PERFSTART();
+  status = dnnExecute<DType>(this->backwardPrim, resources);
+  CHECK_EQ(status, E_SUCCESS);  // a failure is only logged as a warning
+  PERFEND("main computing");
+
+  if (!this->gradInput->isUsePrev()) {  // skipped when the usePrev flag is set
+    this->gradInput->backToUsr();
+  }
+
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->gradInput->getUsrData()),
+                   inputSize[3], inputSize[2], inputSize[1], inputSize[0],
+                   "backward gradient input");
+#endif
+}
+template <typename DType>
+void MKLConvolution<DType>::updateGradKernel(DType *input, DType *gradOutput,
+                                             DType *gradKernel)
+{
+  dnnError_t status;
+  void *resources[dnnResourceNumber];  // only the slots assigned below are set
+
+  preExecute(input);  // NOTE(review): firstPass() must have run before this
+
+  this->gradOutput->createConversion();
+  this->gradKernel->createConversion();
+
+  resources[dnnResourceDiffDst]    = this->gradOutput->getConvertedData();
+  resources[dnnResourceSrc]        = this->input->getConvertedData();
+  resources[dnnResourceDiffFilter] = this->gradKernel->getData();
+
+  // main computation: execute the backward-filter primitive
+  PERFSTART();
+  status = dnnExecute<DType>(this->kernelPrim, resources);
+  CHECK_EQ(status, E_SUCCESS);  // a failure is only logged as a warning
+  PERFEND("main computing");
+
+  // the kernel gradient is never shared with a previous layer, always convert
+  this->gradKernel->backToUsr();
+}
+// Backward-bias pass. NOTE(review): relies on firstPass() having run before.
+template <typename DType>
+void MKLConvolution<DType>::updateGradBias(DType *input, DType *gradOutput,
+                                           DType *gradBias)
+{
+  dnnError_t status;
+  void *resources[dnnResourceNumber];  // only the slots assigned below are set
+
+  preExecute(input);
+
+  this->gradOutput->createConversion();
+  this->gradBias->createConversion();
+
+  resources[dnnResourceDiffDst]  = this->gradOutput->getConvertedData();
+  resources[dnnResourceDiffBias] = this->gradBias->getData();
+
+  // main computation: execute the backward-bias primitive
+  PERFSTART();
+  status = dnnExecute<DType>(this->biasPrim, resources);
+  CHECK_EQ(status, E_SUCCESS);  // a failure is only logged as a warning
+  PERFEND("main computing");
+
+  this->gradBias->backToUsr();  // always converted back, like the kernel
+}
+// Creates and initialises a layer; its address goes back to Java as a jlong.
+template <typename ArrayType, typename DType>
+jlong JNIConvolutionInit(JNIEnv *env, jclass thisClass, jint inputNumber,
+                         jint inputChannel, jint inputHeight, jint inputWidth,
+                         jint kernelNumber, jint kernelChannel,
+                         jint kernelHeight, jint kernelWidth, jint strideHeight,
+                         jint strideWidth, jint padHeight, jint padWidth,
+                         jint dimension, jint groups)
+{
+  MKLConvolution<DType> *conv = new MKLConvolution<DType>();
+  conv->init(inputNumber, inputChannel, inputHeight, inputWidth, kernelNumber,
+             kernelChannel, kernelHeight, kernelWidth, strideHeight,
+             strideWidth, padHeight, padWidth, dimension, groups);
+  // jlong is 64 bits on every platform; plain long may be narrower.
+  return reinterpret_cast<jlong>(conv);
+}
+
+template <typename ArrayType, typename DType>
+void JNIConvolutionUpdateOutput(JNIEnv *env, jclass thisClass, ArrayType input,
+                                jint inputOffset, ArrayType output,
+                                jint outputOffset, ArrayType kernel,
+                                jint kernelOffset, ArrayType bias,
+                                jint biasOffset, long classPtr)
+{
+  typedef ZipArray<ArrayType, DType> Zip;
+  typedef std::shared_ptr<Zip> ZipPtr;
+
+  // classPtr carries the address of the native layer object.
+  MKLConvolution<DType> *layer =
+      reinterpret_cast<MKLConvolution<DType> *>(classPtr);
+
+  // Wrap each Java array together with the layer buffer it is paired with.
+  // The kernel and bias wrappers are only constructed; updateOutput() gets
+  // just the input and output pointers.
+  ZipPtr inputZip(new Zip(env, input, inputOffset, layer->input));
+  ZipPtr outputZip(new Zip(env, output, outputOffset, layer->output));
+  ZipPtr kernelZip(new Zip(env, kernel, kernelOffset, layer->kernel));
+  ZipPtr biasZip(new Zip(env, bias, biasOffset, layer->bias));
+
+  layer->updateOutput(inputZip->getPtr(), outputZip->getPtr());
+}
+
+template <typename ArrayType, typename DType>
+void JNIConvolutionUpdateGradInput(JNIEnv *env, jclass thisClass,
+                                   ArrayType input, jint inputOffset,
+                                   ArrayType outputDiff, jint outputDiffOffset,
+                                   ArrayType inputDiff, jint inputDiffOffset,
+                                   ArrayType kernel, jint kernelOffset,
+                                   ArrayType bias, jint biasOffset,
+                                   long classPtr)
+{
+  typedef ZipArray<ArrayType, DType> Zip;
+  typedef std::shared_ptr<Zip> ZipPtr;
+
+  // classPtr carries the address of the native layer object.
+  MKLConvolution<DType> *layer =
+      reinterpret_cast<MKLConvolution<DType> *>(classPtr);
+
+  // Wrap each Java array together with the layer buffer it is paired with:
+  // outputDiff goes with gradOutput and inputDiff with gradInput. The
+  // kernel and bias wrappers are only constructed; updateGradInput() gets
+  // the other three pointers.
+  ZipPtr inputZip(new Zip(env, input, inputOffset, layer->input));
+  ZipPtr outputDiffZip(
+      new Zip(env, outputDiff, outputDiffOffset, layer->gradOutput));
+  ZipPtr inputDiffZip(
+      new Zip(env, inputDiff, inputDiffOffset, layer->gradInput));
+  ZipPtr kernelZip(new Zip(env, kernel, kernelOffset, layer->kernel));
+  ZipPtr biasZip(new Zip(env, bias, biasOffset, layer->bias));
+
+  layer->updateGradInput(inputZip->getPtr(), outputDiffZip->getPtr(),
+                         inputDiffZip->getPtr());
+}
+
+template <typename ArrayType, typename DType>
+void JNIConvolutionUpdateGradKernel(JNIEnv *env, jclass thisClass,
+                                    ArrayType input, jint inputOffset,
+                                    ArrayType outputDiff, jint outputDiffOffset,
+                                    ArrayType kernelDiff, jint kernelDiffOffset,
+                                    ArrayType kernel, jint kernelOffset,
+                                    ArrayType bias, jint biasOffset,
+                                    long classPtr)
+{
+  typedef ZipArray<ArrayType, DType> Zip;
+  typedef std::shared_ptr<Zip> ZipPtr;
+
+  // classPtr carries the address of the native layer object.
+  MKLConvolution<DType> *layer =
+      reinterpret_cast<MKLConvolution<DType> *>(classPtr);
+
+  // Wrap each Java array together with the layer buffer it is paired with:
+  // outputDiff goes with gradOutput and kernelDiff with gradKernel.
+  // The kernel and bias wrappers are only constructed; updateGradKernel()
+  // gets the other three pointers. The wrappers are declared in the same
+  // order as before, so they are also destroyed in the same order.
+  ZipPtr inputZip(new Zip(env, input, inputOffset, layer->input));
+  ZipPtr outputDiffZip(
+      new Zip(env, outputDiff, outputDiffOffset, layer->gradOutput));
+  ZipPtr kernelDiffZip(
+      new Zip(env, kernelDiff, kernelDiffOffset, layer->gradKernel));
+  ZipPtr kernelZip(new Zip(env, kernel, kernelOffset, layer->kernel));
+  ZipPtr biasZip(new Zip(env, bias, biasOffset, layer->bias));
+
+  layer->updateGradKernel(inputZip->getPtr(), outputDiffZip->getPtr(),
+                          kernelDiffZip->getPtr());
+}
+
+template <typename ArrayType, typename DType>
+void JNIConvolutionUpdateGradBias(JNIEnv *env, jclass thisClass,
+                                  ArrayType input, jint inputOffset,
+                                  ArrayType outputDiff, jint outputDiffOffset,
+                                  ArrayType biasDiff, jint biasDiffOffset,
+                                  ArrayType kernel, jint kernelOffset,
+                                  ArrayType bias, jint biasOffset,
+                                  long classPtr)
+{
+  typedef ZipArray<ArrayType, DType> Zip;
+  typedef std::shared_ptr<Zip> ZipPtr;
+
+  // classPtr carries the address of the native layer object.
+  MKLConvolution<DType> *layer =
+      reinterpret_cast<MKLConvolution<DType> *>(classPtr);
+
+  // Wrap each Java array together with the layer buffer it is paired with:
+  // outputDiff goes with gradOutput and biasDiff with gradBias.
+  // The kernel and bias wrappers are only constructed; updateGradBias()
+  // gets the other three pointers. The wrappers are declared in the same
+  // order as before, so they are also destroyed in the same order.
+  ZipPtr inputZip(new Zip(env, input, inputOffset, layer->input));
+  ZipPtr outputDiffZip(
+      new Zip(env, outputDiff, outputDiffOffset, layer->gradOutput));
+  ZipPtr biasDiffZip(
+      new Zip(env, biasDiff, biasDiffOffset, layer->gradBias));
+  ZipPtr kernelZip(new Zip(env, kernel, kernelOffset, layer->kernel));
+  ZipPtr biasZip(new Zip(env, bias, biasOffset, layer->bias));
+
+  layer->updateGradBias(inputZip->getPtr(), outputDiffZip->getPtr(),
+                        biasDiffZip->getPtr());
+}
+
+// Macros that expand to the exported JNI entry points for one element type.
+#define ConvolutionInit(DType, JType, JArrayType)                             \
+  JNIEXPORT                                                                   \
+  jlong JNICALL                                                               \
+      Java_com_intel_analytics_sparkdl_mkl_MKL_ConvolutionInit##DType(        \
+          JNIEnv *env, jclass thisClass, jint inputNumber, jint inputChannel, \
+          jint inputHeight, jint inputWidth, jint kernelNumber,               \
+          jint kernelChannel, jint kernelHeight, jint kernelWidth,            \
+          jint strideHeight, jint strideWidth, jint padHeight, jint padWidth, \
+          jint dimension, jint groups)                                        \
+  {                                                                           \
+    return JNIConvolutionInit<JArrayType, JType>(                             \
+        env, thisClass, inputNumber, inputChannel, inputHeight, inputWidth,   \
+        kernelNumber, kernelChannel, kernelHeight, kernelWidth, strideHeight, \
+        strideWidth, padHeight, padWidth, dimension, groups);                 \
+  }
+// JNI entry point of the forward pass; classPtr arrives as a Java long.
+#define ConvolutionForward(DType, JType, JArrayType)                           \
+  JNIEXPORT                                                                    \
+  void JNICALL                                                                 \
+      Java_com_intel_analytics_sparkdl_mkl_MKL_ConvolutionForward##DType(      \
+          JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,   \
+          JArrayType output, jint outputOffset, JArrayType kernel,             \
+          jint kernelOffset, JArrayType bias, jint biasOffset, jlong classPtr) \
+  {                                                                            \
+    JNIConvolutionUpdateOutput<JArrayType, JType>(                             \
+        env, thisClass, input, inputOffset, output, outputOffset, kernel,      \
+        kernelOffset, bias, biasOffset, classPtr);                             \
+  }
+// JNI entry point of the backward-data pass; classPtr is a Java long.
+#define ConvolutionBackwardData(DType, JType, JArrayType)                      \
+  JNIEXPORT                                                                    \
+  void JNICALL                                                                 \
+      Java_com_intel_analytics_sparkdl_mkl_MKL_ConvolutionBackwardData##DType( \
+          JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,   \
+          JArrayType outputDiff, jint outputDiffOffset, JArrayType inputDiff,  \
+          jint inputDiffOffset, JArrayType kernel, jint kernelOffset,          \
+          JArrayType bias, jint biasOffset, jlong classPtr)                    \
+  {                                                                            \
+    JNIConvolutionUpdateGradInput<JArrayType, JType>(                          \
+        env, thisClass, input, inputOffset, outputDiff, outputDiffOffset,      \
+        inputDiff, inputDiffOffset, kernel, kernelOffset, bias, biasOffset,    \
+        classPtr);                                                             \
+  }
+// JNI entry point of the backward-kernel pass; classPtr is a Java long.
+#define ConvolutionBackwardKernel(DType, JType, JArrayType)                      \
+  JNIEXPORT                                                                      \
+  void JNICALL                                                                   \
+      Java_com_intel_analytics_sparkdl_mkl_MKL_ConvolutionBackwardKernel##DType( \
+          JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,     \
+          JArrayType outputDiff, jint outputDiffOffset, JArrayType kernelDiff,   \
+          jint kernelDiffOffset, JArrayType kernel, jint kernelOffset,           \
+          JArrayType bias, jint biasOffset, jlong classPtr)                      \
+  {                                                                              \
+    JNIConvolutionUpdateGradKernel<JArrayType, JType>(                           \
+        env, thisClass, input, inputOffset, outputDiff, outputDiffOffset,        \
+        kernelDiff, kernelDiffOffset, kernel, kernelOffset, bias, biasOffset,    \
+        classPtr);                                                               \
+  }
+// JNI entry point of the backward-bias pass; classPtr is a Java long.
+#define ConvolutionBackwardBias(DType, JType, JArrayType)                      \
+  JNIEXPORT                                                                    \
+  void JNICALL                                                                 \
+      Java_com_intel_analytics_sparkdl_mkl_MKL_ConvolutionBackwardBias##DType( \
+          JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,   \
+          JArrayType outputDiff, jint outputDiffOffset, JArrayType biasDiff,   \
+          jint biasDiffOffset, JArrayType kernel, jint kernelOffset,           \
+          JArrayType bias, jint biasOffset, jlong classPtr)                    \
+  {                                                                            \
+    JNIConvolutionUpdateGradBias<JArrayType, JType>(                           \
+        env, thisClass, input, inputOffset, outputDiff, outputDiffOffset,      \
+        biasDiff, biasDiffOffset, kernel, kernelOffset, bias, biasOffset,      \
+        classPtr);                                                             \
+  }
+
+#ifdef __cplusplus
+extern "C" {  // C linkage for the exported JNI functions
+#endif
+
+// double: expands the five entry points with jdouble / jdoubleArray
+ConvolutionInit(Double, jdouble, jdoubleArray);
+ConvolutionForward(Double, jdouble, jdoubleArray);
+ConvolutionBackwardData(Double, jdouble, jdoubleArray);
+ConvolutionBackwardKernel(Double, jdouble, jdoubleArray);
+ConvolutionBackwardBias(Double, jdouble, jdoubleArray);
+
+// float: expands the five entry points with jfloat / jfloatArray
+ConvolutionInit(Float, jfloat, jfloatArray);
+ConvolutionForward(Float, jfloat, jfloatArray);
+ConvolutionBackwardData(Float, jfloat, jfloatArray);
+ConvolutionBackwardKernel(Float, jfloat, jfloatArray);
+ConvolutionBackwardBias(Float, jfloat, jfloatArray);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/mkl/native/src/main/c/jni/debug.cpp b/mkl/native/src/main/c/jni/debug.cpp
new file mode 100644
index 00000000000..a542a04c9af
--- /dev/null
+++ b/mkl/native/src/main/c/jni/debug.cpp
@@ -0,0 +1,37 @@
+#include <cstdio>
+#include <cstring>
+#include <stdlib.h>
+#include "debug.h"
+// Starts a log line with the prefix "<level letter> MKL <file>:<line>] ".
+LogMessage::LogMessage(const char *file, int line, LogType type)
+{
+  int len = strlen(file) + 20;  // file name plus room for the fixed parts
+  char *buf = new char[len];
+  type_ = type;  // must be set before stream() is used below
+  // Only the part of the path after the last '/' is printed.
+  const char *lastSlash = strrchr(file, '/');
+  const char *fileName = (lastSlash == NULL) ? file : lastSlash + 1;
+  // The level letter is looked up by the numeric value of `type`.
+  snprintf(buf, len, "%c %s %s:%d] ", "DIWEFI"[type], "MKL", fileName, line);
+  stream() << buf;
+  // buf was allocated with new[], so it has to be released with delete[].
+  delete[] buf;
+}
+
+LogMessage::~LogMessage()
+{
+  // End the current line first; only FATAL messages go on to abort.
+  stream() << std::endl;
+  if (type_ != FATAL) return;
+  stream() << "Aborting..." << std::endl;
+  abort();
+}
+
+std::ostream& LogMessage::stream()
+{
+  // Levels below WARNNING go to stdout, WARNNING and above to stderr.
+  if (type_ < WARNNING) {
+    return std::cout;
+  }
+  return std::cerr;
+}
diff --git a/mkl/native/src/main/c/jni/debug.h b/mkl/native/src/main/c/jni/debug.h
new file mode 100644
index 00000000000..1545bf22481
--- /dev/null
+++ b/mkl/native/src/main/c/jni/debug.h
@@ -0,0 +1,93 @@
+#ifndef _DEBUG_H_
+#define _DEBUG_H_
+
+#include <iostream>
+// Log levels; LogMessage picks the prefix letter of a line by this number.
+const int DBG = 0, INFO = 1, WARNNING = 2, ERROR = 3, FATAL = 4, DEFALT = 5;
+typedef int LogType;  // holds one of the level constants above
+// One log line: the constructor writes the prefix, the destructor ends it.
+class LogMessage
+{
+ public:
+  LogMessage(const char *file, int line, LogType type);
+  ~LogMessage();  // ends the line; calls abort() when the level is FATAL
+  std::ostream &stream();  // std::cerr for WARNNING and above, else std::cout
+
+ private:
+  LogType type_;  // level given to the constructor
+};
+
+#define CHECK(x) \
+  if (!(x))      \
+    LogMessage(__FILE__, __LINE__, WARNNING).stream() << "Check failed " #x;
+
+//#define CHECK_EQ(x, y) CHECK((x) == (y))
+#define CHECK_EQ(x, y)                              \
+  if (!((x) == (y)))                                \
+  LogMessage(__FILE__, __LINE__, WARNNING).stream() \
+      << "Check failed. " #x << " = " << x << ",which should be " #y
+#define CHECK_NE(x, y) CHECK((x) != (y))
+
+#define LOG(x) LogMessage(__FILE__, __LINE__, x).stream()  // x: a log level
+
+#ifdef PERF  // defining PERF turns on the timing in PERFSTART/PERFEND
+const int INPERF = 1;
+#else
+const int INPERF = 0;
+#endif
+// PERFSTART opens a do-block that PERFEND closes; always use them as a pair.
+#define PERFSTART()                           \
+  do {                                        \
+    struct timespec start, end;               \
+    if (INPERF) {                             \
+      clock_gettime(CLOCK_MONOTONIC, &start); \
+    }
+// PERFEND logs the elapsed milliseconds at INFO level when INPERF is set.
+#define PERFEND(msg)                                                  \
+  if (INPERF) {                                                       \
+    clock_gettime(CLOCK_MONOTONIC, &end);                             \
+    LOG(INFO) << __func__ << " " << msg << " costs: "                 \
+              << (end.tv_sec - start.tv_sec) * 1000 +                 \
+                     (double)(end.tv_nsec - start.tv_nsec) / 1000000; \
+  }                                                                   \
+  }                                                                   \
+  while (0)                                                           \
+    ;
+
+/**
+ * @brief print 4 dimensions data
+ *
+ * Because the input/output is organized as a flat vector, it is more human
+ * readable to print it image by image and channel by channel when debugging.
+ *
+ * @param input input/output data which is organized as a vector/array.
+ * @param num how many images
+ * @param channel how many channels, like 3
+ * @param height image height
+ * @param width image width
+ * @param msg message defined by the user, printed before the data
+ */
+template <typename Type>
+void printData(Type *input, size_t num, size_t channel, size_t height,
+               size_t width, const char *msg)
+{
+  std::cout << std::string(msg) << " CHECK IN CPP..." << std::endl;
+  // size_t counters match the size_t bounds and keep the index from wrapping.
+  for (size_t i = 0; i < num; i++) {
+    std::cout << "The " << i << " num." << std::endl;
+    for (size_t j = 0; j < channel; j++) {
+      std::cout << "The " << j << " channel." << std::endl;
+      for (size_t k = 0; k < height; k++) {
+        for (size_t t = 0; t < width; t++) {
+          size_t index = ((i * channel + j) * height + k) * width + t;
+          std::cout << input[index] << '\t';
+        }
+        std::cout << std::endl;
+      }
+      std::cout << std::endl;
+    }
+    std::cout << std::endl;
+  }
+}
+
+#endif
diff --git a/mkl/native/src/main/c/jni/layer.cpp b/mkl/native/src/main/c/jni/layer.cpp
new file mode 100644
index 00000000000..59867fe0bcb
--- /dev/null
+++ b/mkl/native/src/main/c/jni/layer.cpp
@@ -0,0 +1,23 @@
+#include "layer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// JNI entry point: forwards two float-layer handles to MKLLayer::setPrev.
+JNIEXPORT
+void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_SetPrevFloat(
+    JNIEnv *env, jclass thisClass, jlong prev, jlong curr)
+{
+  MKLLayer<float>::setPrev(prev, curr);
+}
+// JNI entry point: forwards two double-layer handles to MKLLayer::setPrev.
+JNIEXPORT
+void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_SetPrevDouble(
+    JNIEnv *env, jclass thisClass, jlong prev, jlong curr)
+{
+  MKLLayer<double>::setPrev(prev, curr);
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/mkl/native/src/main/c/jni/layer.h b/mkl/native/src/main/c/jni/layer.h
new file mode 100644
index 00000000000..88189178842
--- /dev/null
+++ b/mkl/native/src/main/c/jni/layer.h
@@ -0,0 +1,112 @@
+#ifndef _MKL_LAYER_H
+#define _MKL_LAYER_H
+#include <memory>
+
+#include "MKLWrapper.h"
+#include "memory.h"
+
+template <typename DType>
+class MKLLayer  // state shared by the MKL layers: data buffers and primitives
+{
+ public:
+  MKLLayer();
+  ~MKLLayer();  // NOTE(review): non-virtual although other layers derive
+
+  static void setPrev(long prev, long curr);  // both are MKLLayer* as long
+
+  void init(size_t inputNumber, size_t inputChannel, size_t inputHeight,
+            size_t inputWidth, size_t dimension);
+
+  std::shared_ptr<MKLData<DType>> input, output, gradInput, gradOutput;
+
+  int dimension;
+
+  // input sizes and strides, filled in by init()
+  size_t inputSize[4];
+  size_t inputStrides[4];
+
+  // If it's the first pass, we should create some conversions.
+  // After that, we need not do that again.
+  // Default is true.
+  //
+  // Note:
+  //   1. By default, we assume that the address of input will not change.
+  //   2. The address of input is the real address of the Array in JVM.
+  //   3. TODO It will be set to false after an iteration (forward and backward).
+  bool isFirstPass;
+
+  dnnPrimitive_t forwardPrim, backwardPrim;  // deleted by the destructor
+};
+
+template <typename DType>
+void MKLLayer<DType>::init(size_t inputNumber, size_t inputChannel,
+                           size_t inputHeight, size_t inputWidth,
+                           size_t dimension)
+{
+  this->dimension = dimension;
+  // Sizes are stored innermost dimension first.
+  const size_t dims[4] = {inputWidth, inputHeight, inputChannel, inputNumber};
+
+  // Each stride is the product of the sizes of all inner dimensions.
+  size_t step = 1;
+  for (int i = 0; i < 4; i++) {
+    inputSize[i]    = dims[i];
+    inputStrides[i] = step;
+    step *= dims[i];
+  }
+
+  input->createUsrLayout(dimension, inputSize, inputStrides);
+  gradInput->createUsrLayout(dimension, inputSize, inputStrides);
+}
+
+template <typename DType>
+MKLLayer<DType>::MKLLayer()
+    : input(new MKLData<DType>()),  // every buffer starts as its own MKLData
+      output(new MKLData<DType>()),
+      gradInput(new MKLData<DType>()),
+      gradOutput(new MKLData<DType>()),
+      isFirstPass(true),  // documented default, see the member declaration
+      forwardPrim(NULL),  // no primitive exists yet
+      backwardPrim(NULL)
+{
+}
+
+template <typename DType>
+MKLLayer<DType>::~MKLLayer()
+{
+  // Release whichever primitives exist and reset their handles to NULL.
+  dnnPrimitive_t *owned[2] = {&forwardPrim, &backwardPrim};
+  for (int i = 0; i < 2; i++) {
+    if (*owned[i] == NULL) {
+      continue;
+    }
+    dnnDelete<DType>(*owned[i]);
+    *owned[i] = NULL;
+  }
+}
+// Makes two adjacent layers share MKLData objects when their layouts match.
+template <typename DType>
+void MKLLayer<DType>::setPrev(long prev, long curr)
+{
+  MKLLayer<DType> *prevLayer = reinterpret_cast<MKLLayer<DType> *>(prev);
+  MKLLayer<DType> *currLayer = reinterpret_cast<MKLLayer<DType> *>(curr);
+
+  dnnLayout_t prevLayout = prevLayer->gradOutput->getMklLayout();
+  dnnLayout_t currLayout = currLayer->gradInput->getMklLayout();
+
+  if (dnnLayoutCompare<DType>(prevLayout, currLayout)) {  // gradients
+    prevLayer->gradOutput->setUseNext(true);  // NOTE(review): see next line
+    prevLayer->gradOutput = currLayer->gradInput;  // flagged object replaced
+    currLayer->gradInput->setUsePrev(true);
+  }
+
+  prevLayout = prevLayer->output->getMklLayout();
+  currLayout = currLayer->input->getMklLayout();
+
+  if (dnnLayoutCompare<DType>(prevLayout, currLayout)) {  // forward data
+    prevLayer->output->setUseNext(true);
+    currLayer->input = prevLayer->output;  // shared object carries both flags
+    currLayer->input->setUsePrev(true);
+  }
+}
+#endif
diff --git a/mkl/native/src/main/c/jni/linear.cpp b/mkl/native/src/main/c/jni/linear.cpp
new file mode 100644
index 00000000000..ca6e14bef4e
--- /dev/null
+++ b/mkl/native/src/main/c/jni/linear.cpp
@@ -0,0 +1,501 @@
+#include <jni.h>
+
+#include "debug.h"
+#include "layer.h"
+#include "memory.h"
+#include "utils.h"
+
+template <typename DType>
+class MKLLinear : public MKLLayer<DType>
+{  // Fully connected layer built on the dnnInnerProduct* primitives.
+ public:
+  MKLLinear();
+  ~MKLLinear();
+  // Stores the 2-D geometry and creates the user layouts of every buffer.
+  void init(size_t inputHeight, size_t inputWidth, size_t outputChannel,
+            size_t kernelHeight, size_t kernelWidth);
+  // One primitive execution each: forward, then the three gradients.
+  void updateOutput(DType *input, DType *output);
+  void updateGradInput(DType *input, DType *gradOutput, DType *gradInput);
+  void updateGradKernel(DType *input, DType *gradOutput, DType *gradKernel);
+  void updateGradBias(DType *input, DType *gradOutput, DType *gradBias);
+  // Parameter holders; public because the JNI wrappers bind arrays to them.
+  std::shared_ptr<MKLData<DType>> kernel;
+  std::shared_ptr<MKLData<DType>> bias;
+  // Holders for the parameter gradients.
+  std::shared_ptr<MKLData<DType>> gradKernel;
+  std::shared_ptr<MKLData<DType>> gradBias;
+
+ private:
+  // this method is not the same as createMklLayout in MKLMemory
+  void firstPass();
+  void preExecute(DType *input);
+  // In every size/stride pair index 0 is the unit-stride axis (see init()).
+  size_t inputSize[2];
+  size_t inputStrides[2];
+
+  size_t outputSize[2];
+  size_t outputStrides[2];
+
+  size_t kernelSize[2];
+  size_t kernelStrides[2];
+  // The bias has a single axis of outputChannel entries.
+  size_t biasSize[1];
+  size_t biasStrides[1];
+
+  size_t outputChannel;
+  // Primitives for the kernel and bias gradients, created in firstPass().
+  dnnPrimitive_t gradKernelPrim, gradBiasPrim;
+};
+
+template <typename DType>
+MKLLinear<DType>::MKLLinear()  // parameter holders only; primitives come later
+    : kernel(new MKLData<DType>),
+      bias(new MKLData<DType>),
+      gradKernel(new MKLData<DType>),
+      gradBias(new MKLData<DType>),
+      outputChannel(0),      // real value is stored by init()
+      gradKernelPrim(NULL),  // both primitives are created in firstPass()
+      gradBiasPrim(NULL)
+{
+}
+
+template <typename DType>
+MKLLinear<DType>::~MKLLinear()
+{  // Both handles stay NULL unless firstPass() ran, so guard like ~MKLLayer.
+  if (gradKernelPrim) dnnDelete<DType>(gradKernelPrim);
+  if (gradBiasPrim) dnnDelete<DType>(gradBiasPrim);
+}
+
+template <typename DType>
+void MKLLinear<DType>::init(size_t inputHeight, size_t inputWidth,
+                            size_t outputChannel, size_t kernelHeight,
+                            size_t kernelWidth)
+{
+  // Records the geometry of the 2-D layer and creates the dense user layout
+  // of every buffer. Index 0 is the unit-stride axis, so with two axes each
+  // stride[1] is simply stride[0] * size[0].
+  this->dimension     = 2;
+  this->outputChannel = outputChannel;
+  // input: inputHeight rows of inputWidth values
+  inputSize[0]    = inputWidth;
+  inputSize[1]    = inputHeight;
+  inputStrides[0] = 1;
+  inputStrides[1] = inputStrides[0] * inputSize[0];
+
+  // output: inputHeight rows of outputChannel values
+  outputSize[0]    = outputChannel;
+  outputSize[1]    = inputHeight;
+  outputStrides[0] = 1;
+  outputStrides[1] = outputStrides[0] * outputSize[0];
+
+  // kernel: kernelHeight rows of kernelWidth values
+  kernelSize[0]    = kernelWidth;
+  kernelSize[1]    = kernelHeight;
+  kernelStrides[0] = 1;
+  kernelStrides[1] = kernelStrides[0] * kernelSize[0];
+
+  // bias: a single axis of outputChannel values
+  biasSize[0]    = outputChannel;
+  biasStrides[0] = 1;
+
+  // user layouts; bias and gradBias are 1-D, everything else is 2-D
+  this->input->createUsrLayout(this->dimension, inputSize, inputStrides);
+  this->output->createUsrLayout(this->dimension, outputSize, outputStrides);
+  this->kernel->createUsrLayout(this->dimension, kernelSize, kernelStrides);
+  this->bias->createUsrLayout(1, biasSize, biasStrides);
+  this->gradInput->createUsrLayout(this->dimension, inputSize, inputStrides);
+  this->gradOutput->createUsrLayout(this->dimension, outputSize, outputStrides);
+  this->gradKernel->createUsrLayout(this->dimension, kernelSize, kernelStrides);
+  this->gradBias->createUsrLayout(1, biasSize, biasStrides);
+}
+
+template <typename DType>
+void MKLLinear<DType>::firstPass()
+{  // Creates the four inner-product primitives and every buffer's MKL layout.
+  dnnError_t status = E_UNIMPLEMENTED;
+  // forward primitive (inner product with bias)
+  status = dnnInnerProductCreateForwardBias<DType>(
+      &(this->forwardPrim), NULL, this->dimension, inputSize, outputChannel);
+  CHECK_EQ(status, E_SUCCESS);
+  // each buffer takes its MKL layout from the primitive that uses it
+  this->input->createMklLayout(this->forwardPrim, dnnResourceSrc);
+  this->output->createMklLayout(this->forwardPrim, dnnResourceDst);
+  this->kernel->createMklLayout(this->forwardPrim, dnnResourceFilter);
+  this->bias->createMklLayout(this->forwardPrim, dnnResourceBias);
+
+  // backward data
+  status = dnnInnerProductCreateBackwardData<DType>(
+      &(this->backwardPrim), NULL, this->dimension, inputSize, outputChannel);
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->gradOutput->createMklLayout(this->backwardPrim, dnnResourceDiffDst);
+  this->gradInput->createMklLayout(this->backwardPrim, dnnResourceDiffSrc);
+
+  // backward kernel
+  status = dnnInnerProductCreateBackwardFilter<DType>(
+      &gradKernelPrim, NULL, this->dimension, inputSize, outputChannel);
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->gradKernel->createMklLayout(this->gradKernelPrim,
+                                    dnnResourceDiffFilter);
+
+  // backward bias: created from outputSize, unlike the three primitives above
+  status = dnnInnerProductCreateBackwardBias<DType>(
+      &gradBiasPrim, NULL, this->dimension, outputSize);
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->gradBias->createMklLayout(this->gradBiasPrim, dnnResourceDiffBias);
+
+  // we create the layout only at the first time
+  this->isFirstPass = false;
+}
+
+template <typename DType>
+void MKLLinear<DType>::preExecute(DType *input)
+{  // Re-creates the conversions of the three forward operands.
+  this->input->createConversion();
+  this->kernel->createConversion();
+  this->bias->createConversion();  // note: the input argument is never read
+}
+
+template <typename DType>
+void MKLLinear<DType>::updateOutput(DType *input, DType *output)
+{
+  // Forward pass: builds the primitives on first use, executes forwardPrim
+  // and calls backToUsr() unless the output is flagged as used by the next.
+  if (this->isFirstPass) firstPass();
+
+  // The user buffers can sit at a different address on every call, so the
+  // conversions are re-created for each forward/backward.
+  // TODO Should we set the kernel and bias address every time?
+  preExecute(input);
+  this->output->createConversion();
+#ifdef DEBUG
+  // inputSize holds only {width, height}; print it as a 1 x 1 x H x W block.
+  printData<DType>(reinterpret_cast<DType *>(this->input->getUsrData()),
+                   size_t(1), size_t(1), this->inputSize[1],
+                   this->inputSize[0], "Forward input");
+#endif
+  dnnError_t status;
+  void *resources[dnnResourceNumber];
+
+  resources[dnnResourceFilter] = this->kernel->getConvertedData();
+  resources[dnnResourceBias]   = this->bias->getConvertedData();
+  resources[dnnResourceSrc]    = this->input->getConvertedData();
+  resources[dnnResourceDst]    = this->output->getData();
+
+  PERFSTART();
+  status = dnnExecute<DType>(this->forwardPrim, resources);
+  PERFEND("main computing");
+  CHECK_EQ(status, E_SUCCESS);
+#ifdef DEBUG
+  // outputSize is 2-D as well: {outputChannel, height}.
+  printData<DType>(reinterpret_cast<DType *>(this->output->getData()),
+                   size_t(1), size_t(1), outputSize[1], outputSize[0],
+                   "Forward output");
+#endif
+  if (!this->output->isUseNext()) {
+    this->output->backToUsr();
+  }
+}
+
+template <typename DType>
+void MKLLinear<DType>::updateGradInput(DType *input, DType *gradOutput,
+                                       DType *gradInput)
+{
+  // Backward-data pass: runs backwardPrim on gradOutput and the kernel.
+  dnnError_t status;
+  void *resources[dnnResourceNumber];
+  preExecute(input);
+
+  this->gradOutput->createConversion();
+  this->gradInput->createConversion();
+
+  resources[dnnResourceDiffDst] = this->gradOutput->getConvertedData();
+  resources[dnnResourceFilter]  = this->kernel->getConvertedData();
+  resources[dnnResourceDiffSrc] = this->gradInput->getData();
+
+  // main computing part
+  PERFSTART();
+  status = dnnExecute<DType>(this->backwardPrim, resources);
+  CHECK_EQ(status, E_SUCCESS);
+  PERFEND("main computing");
+
+  if (!this->gradInput->isUsePrev()) {
+    this->gradInput->backToUsr();
+  }
+  // inputSize holds only {width, height}; print it as a 1 x 1 x H x W block.
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->gradInput->getUsrData()),
+                   size_t(1), size_t(1), inputSize[1], inputSize[0],
+                   "backward gradient input");
+#endif
+}
+
+template <typename DType>
+void MKLLinear<DType>::updateGradKernel(DType *input, DType *gradOutput,
+                                        DType *gradKernel)
+{
+  // Weight-gradient pass: runs gradKernelPrim on the converted input and
+  // gradOutput, then hands the result back through backToUsr().
+  preExecute(input);
+
+  this->gradOutput->createConversion();
+  this->gradKernel->createConversion();
+
+  void *args[dnnResourceNumber];
+  args[dnnResourceDiffDst]    = this->gradOutput->getConvertedData();
+  args[dnnResourceSrc]        = this->input->getConvertedData();
+  args[dnnResourceDiffFilter] = this->gradKernel->getData();
+
+  // timed execution of the backward-filter primitive
+  PERFSTART();
+  dnnError_t status = dnnExecute<DType>(this->gradKernelPrim, args);
+  CHECK_EQ(status, E_SUCCESS);
+  PERFEND("main computing");
+
+  // no neighbouring layer re-uses this buffer, so always convert back
+  this->gradKernel->backToUsr();
+}
+
+template <typename DType>
+void MKLLinear<DType>::updateGradBias(DType *input, DType *gradOutput,
+                                      DType *gradBias)
+{
+  // Bias-gradient pass: runs gradBiasPrim on the converted gradOutput and
+  // hands the result back through backToUsr().
+  preExecute(input);
+  this->gradOutput->createConversion();
+  this->gradBias->createConversion();
+
+  void *args[dnnResourceNumber];
+  args[dnnResourceDiffDst]  = this->gradOutput->getConvertedData();
+  args[dnnResourceDiffBias] = this->gradBias->getData();
+
+  // timed execution of the backward-bias primitive
+  PERFSTART();
+  dnnError_t status = dnnExecute<DType>(this->gradBiasPrim, args);
+  CHECK_EQ(status, E_SUCCESS);
+  PERFEND("main computing");
+
+  // unconditional, exactly as for the kernel gradient
+  this->gradBias->backToUsr();
+}
+
+template <typename ArrayType, typename DType>
+jlong JNILinearInit(JNIEnv *env, jclass thisClass, jint inputHeight,
+                    jint inputWidth, jint outputChannel, jint kernelHeight,
+                    jint kernelWidth)
+{
+  // Returns the new layer as a handle; jlong (not long) holds a full pointer.
+  MKLLinear<DType> *ptr = new MKLLinear<DType>();
+  ptr->init(inputHeight, inputWidth, outputChannel, kernelHeight, kernelWidth);
+  return reinterpret_cast<jlong>(ptr);
+}
+
+template <typename ArrayType, typename DType>
+void JNILinearUpdateOutput(JNIEnv *env, jclass thisClass, ArrayType input,
+                           jint inputOffset, ArrayType output,
+                           jint outputOffset, ArrayType kernel,
+                           jint kernelOffset, ArrayType bias, jint biasOffset,
+                           jlong classPtr)
+{
+  // classPtr is the handle from JNILinearInit; jlong keeps all 64 bits of it.
+  MKLLinear<DType> *ptr = reinterpret_cast<MKLLinear<DType> *>(classPtr);
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInput(
+      new ZipArray<ArrayType, DType>(env, input, inputOffset, ptr->input));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jOutput(
+      new ZipArray<ArrayType, DType>(env, output, outputOffset, ptr->output));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jKernel(
+      new ZipArray<ArrayType, DType>(env, kernel, kernelOffset, ptr->kernel));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jBias(
+      new ZipArray<ArrayType, DType>(env, bias, biasOffset, ptr->bias));
+  // NOTE(review): ZipArray presumably ties each Java array to its MKLData.
+  ptr->updateOutput(jInput->getPtr(), jOutput->getPtr());
+}
+
+template <typename ArrayType, typename DType>
+void JNILinearUpdateGradInput(JNIEnv *env, jclass thisClass, ArrayType input,
+                              jint inputOffset, ArrayType outputDiff,
+                              jint outputDiffOffset, ArrayType inputDiff,
+                              jint inputDiffOffset, ArrayType kernel,
+                              jint kernelOffset, ArrayType bias,
+                              jint biasOffset, jlong classPtr)
+{
+  // classPtr is the handle from JNILinearInit; jlong keeps all 64 bits of it.
+  MKLLinear<DType> *ptr = reinterpret_cast<MKLLinear<DType> *>(classPtr);
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInput(
+      new ZipArray<ArrayType, DType>(env, input, inputOffset, ptr->input));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jOutputDiff(
+      new ZipArray<ArrayType, DType>(env, outputDiff, outputDiffOffset,
+                                     ptr->gradOutput));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInputDiff(
+      new ZipArray<ArrayType, DType>(env, inputDiff, inputDiffOffset,
+                                     ptr->gradInput));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jKernel(
+      new ZipArray<ArrayType, DType>(env, kernel, kernelOffset, ptr->kernel));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jBias(
+      new ZipArray<ArrayType, DType>(env, bias, biasOffset, ptr->bias));
+  ptr->updateGradInput(jInput->getPtr(), jOutputDiff->getPtr(),
+                       jInputDiff->getPtr());
+}
+
+template <typename ArrayType, typename DType>
+void JNILinearUpdateGradKernel(JNIEnv *env, jclass thisClass, ArrayType input,
+                               jint inputOffset, ArrayType outputDiff,
+                               jint outputDiffOffset, ArrayType kernelDiff,
+                               jint kernelDiffOffset, ArrayType kernel,
+                               jint kernelOffset, ArrayType bias,
+                               jint biasOffset, jlong classPtr)
+{
+  // classPtr is the handle from JNILinearInit; jlong keeps all 64 bits of it.
+  MKLLinear<DType> *ptr = reinterpret_cast<MKLLinear<DType> *>(classPtr);
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInput(
+      new ZipArray<ArrayType, DType>(env, input, inputOffset, ptr->input));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jOutputDiff(
+      new ZipArray<ArrayType, DType>(env, outputDiff, outputDiffOffset,
+                                     ptr->gradOutput));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jKernelDiff(
+      new ZipArray<ArrayType, DType>(env, kernelDiff, kernelDiffOffset,
+                                     ptr->gradKernel));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jKernel(
+      new ZipArray<ArrayType, DType>(env, kernel, kernelOffset, ptr->kernel));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jBias(
+      new ZipArray<ArrayType, DType>(env, bias, biasOffset, ptr->bias));
+
+  ptr->updateGradKernel(jInput->getPtr(), jOutputDiff->getPtr(),
+                        jKernelDiff->getPtr());
+}
+
+template <typename ArrayType, typename DType>
+void JNILinearUpdateGradBias(JNIEnv *env, jclass thisClass, ArrayType input,
+                             jint inputOffset, ArrayType outputDiff,
+                             jint outputDiffOffset, ArrayType biasDiff,
+                             jint biasDiffOffset, ArrayType kernel,
+                             jint kernelOffset, ArrayType bias, jint biasOffset,
+                             jlong classPtr)
+{
+  // classPtr is the handle from JNILinearInit; jlong keeps all 64 bits of it.
+  MKLLinear<DType> *ptr = reinterpret_cast<MKLLinear<DType> *>(classPtr);
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInput(
+      new ZipArray<ArrayType, DType>(env, input, inputOffset, ptr->input));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jOutputDiff(
+      new ZipArray<ArrayType, DType>(env, outputDiff, outputDiffOffset,
+                                     ptr->gradOutput));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jBiasDiff(
+      new ZipArray<ArrayType, DType>(env, biasDiff, biasDiffOffset,
+                                     ptr->gradBias));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jKernel(
+      new ZipArray<ArrayType, DType>(env, kernel, kernelOffset, ptr->kernel));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jBias(
+      new ZipArray<ArrayType, DType>(env, bias, biasOffset, ptr->bias));
+
+  ptr->updateGradBias(jInput->getPtr(), jOutputDiff->getPtr(),
+                      jBiasDiff->getPtr());
+}
+// Emits the JNI export LinearInit<DType>; it forwards to JNILinearInit.
+#define LinearInit(DType, JType, JArrayType)                                \
+  JNIEXPORT                                                                 \
+  jlong JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_LinearInit##DType( \
+      JNIEnv *env, jclass thisClass, jint inputHeight, jint inputWidth,     \
+      jint outputChannel, jint kernelHeight, jint kernelWidth)              \
+  {                                                                         \
+    return JNILinearInit<JArrayType, JType>(env, thisClass, inputHeight,    \
+                                            inputWidth, outputChannel,      \
+                                            kernelHeight, kernelWidth);     \
+  }
+
+// JNI export LinearForward<DType>; classPtr is jlong, the JNI type of long.
+#define LinearForward(DType, JType, JArrayType)                               \
+  JNIEXPORT void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_LinearForward##DType( \
+      JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,      \
+      JArrayType output, jint outputOffset, JArrayType kernel,                \
+      jint kernelOffset, JArrayType bias, jint biasOffset, jlong classPtr)    \
+  {                                                                           \
+    JNILinearUpdateOutput<JArrayType, JType>(                                 \
+        env, thisClass, input, inputOffset, output, outputOffset, kernel,     \
+        kernelOffset, bias, biasOffset, classPtr);                            \
+  }
+
+// JNI export LinearBackwardData<DType>; classPtr is jlong, as JNI requires.
+#define LinearBackwardData(DType, JType, JArrayType)                          \
+  JNIEXPORT void JNICALL                                                      \
+      Java_com_intel_analytics_sparkdl_mkl_MKL_LinearBackwardData##DType(     \
+          JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,  \
+          JArrayType outputDiff, jint outputDiffOffset, JArrayType inputDiff, \
+          jint inputDiffOffset, JArrayType kernel, jint kernelOffset,         \
+          JArrayType bias, jint biasOffset, jlong classPtr)                   \
+  {                                                                           \
+    JNILinearUpdateGradInput<JArrayType, JType>(                              \
+        env, thisClass, input, inputOffset, outputDiff, outputDiffOffset,     \
+        inputDiff, inputDiffOffset, kernel, kernelOffset, bias, biasOffset,   \
+        classPtr);                                                            \
+  }
+
+// JNI export LinearBackwardKernel<DType>; classPtr is jlong, as JNI requires.
+#define LinearBackwardKernel(DType, JType, JArrayType)                         \
+  JNIEXPORT void JNICALL                                                       \
+      Java_com_intel_analytics_sparkdl_mkl_MKL_LinearBackwardKernel##DType(    \
+          JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,   \
+          JArrayType outputDiff, jint outputDiffOffset, JArrayType kernelDiff, \
+          jint kernelDiffOffset, JArrayType kernel, jint kernelOffset,         \
+          JArrayType bias, jint biasOffset, jlong classPtr)                    \
+  {                                                                            \
+    JNILinearUpdateGradKernel<JArrayType, JType>(                              \
+        env, thisClass, input, inputOffset, outputDiff, outputDiffOffset,      \
+        kernelDiff, kernelDiffOffset, kernel, kernelOffset, bias, biasOffset,  \
+        classPtr);                                                             \
+  }
+
+// JNI export LinearBackwardBias<DType>; classPtr is jlong, as JNI requires.
+#define LinearBackwardBias(DType, JType, JArrayType)                         \
+  JNIEXPORT void JNICALL                                                     \
+      Java_com_intel_analytics_sparkdl_mkl_MKL_LinearBackwardBias##DType(    \
+          JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset, \
+          JArrayType outputDiff, jint outputDiffOffset, JArrayType biasDiff, \
+          jint biasDiffOffset, JArrayType kernel, jint kernelOffset,         \
+          JArrayType bias, jint biasOffset, jlong classPtr)                  \
+  {                                                                          \
+    JNILinearUpdateGradBias<JArrayType, JType>(                              \
+        env, thisClass, input, inputOffset, outputDiff, outputDiffOffset,    \
+        biasDiff, biasDiffOffset, kernel, kernelOffset, bias, biasOffset,    \
+        classPtr);                                                           \
+  }
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// C linkage keeps the Java_... symbol names unmangled.
+// double: the five Linear entry points with the Double suffix
+LinearInit(Double, jdouble, jdoubleArray);
+LinearForward(Double, jdouble, jdoubleArray);
+LinearBackwardData(Double, jdouble, jdoubleArray);
+LinearBackwardKernel(Double, jdouble, jdoubleArray);
+LinearBackwardBias(Double, jdouble, jdoubleArray);
+
+// float: the same five entry points with the Float suffix
+LinearInit(Float, jfloat, jfloatArray);
+LinearForward(Float, jfloat, jfloatArray);
+LinearBackwardData(Float, jfloat, jfloatArray);
+LinearBackwardKernel(Float, jfloat, jfloatArray);
+LinearBackwardBias(Float, jfloat, jfloatArray);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/mkl/native/src/main/c/jni/lrn.cpp b/mkl/native/src/main/c/jni/lrn.cpp
new file mode 100644
index 00000000000..bead038a6f8
--- /dev/null
+++ b/mkl/native/src/main/c/jni/lrn.cpp
@@ -0,0 +1,306 @@
+#include <jni.h>
+
+#include "debug.h"
+#include "layer.h"
+#include "memory.h"
+#include "utils.h"
+
+template <typename DType>
+class MKLLRN : public MKLLayer<DType>
+{  // LRN layer built on the dnnLRNCreateForward/Backward primitives.
+ public:
+  MKLLRN();
+  ~MKLLRN();
+  // Stores the hyper-parameters and 4-D geometry; creates the user layouts.
+  void init(size_t inputNumber, size_t inputChannel, size_t inputHeight,
+            size_t inputWidth, int size, DType alpha, DType beta, DType k,
+            int dimension);
+  // Execute the forward and the backward primitive respectively.
+  void updateOutput(DType *input, DType *output);
+  void updateGradInput(DType *input, DType *gradOutput, DType *gradInput);
+
+ private:
+  // this method is not the same as createMklLayout in MKLMemory
+  void firstPass();
+  void preExecute(DType *input);
+  // Workspace resource passed to both the forward and the backward primitive.
+  std::shared_ptr<MKLData<DType>> workspace;
+  // Values handed unchanged to dnnLRNCreateForward/Backward.
+  int size;
+  DType alpha;
+  DType beta;
+  DType k;
+  // Index order set by init(): width, height, channel, number.
+  size_t inputSize[4];
+  size_t inputStrides[4];
+
+  size_t outputSize[4];
+  size_t outputStrides[4];
+};
+
+template <typename DType>
+MKLLRN<DType>::MKLLRN()  // zero the scalars so none is indeterminate pre-init()
+    : workspace(new MKLData<DType>), size(0), alpha(0), beta(0), k(0)
+{}
+
+template <typename DType>
+MKLLRN<DType>::~MKLLRN()
+{  // Empty: this class declares no primitive handles of its own.
+}
+
+template <typename DType>
+void MKLLRN<DType>::init(size_t inputNumber, size_t inputChannel,
+                         size_t inputHeight, size_t inputWidth, int size,
+                         DType alpha, DType beta, DType k, int dimension)
+{
+  // Stores the LRN hyper-parameters and the 4-D geometry, then creates the
+  // dense user layouts of the four data buffers.
+  //
+  //   inputNumber/Channel/Height/Width  extents of the input tensor
+  //   size, alpha, beta, k              kept for dnnLRNCreateForward/Backward
+  //   dimension                         axis count given to createUsrLayout
+  this->dimension = dimension;
+  this->size      = size;
+  this->alpha     = alpha;
+  this->beta      = beta;
+  this->k         = k;
+
+  // Axis 0 is the unit-stride one: width, height, channel, number.
+  const size_t extents[4] = {inputWidth, inputHeight, inputChannel,
+                             inputNumber};
+
+  // The output is given the same extents as the input; both stride tables
+  // are the running product of those extents.
+  size_t runningStride = 1;
+  for (int axis = 0; axis < 4; ++axis) {
+    inputSize[axis]     = extents[axis];
+    outputSize[axis]    = extents[axis];
+    inputStrides[axis]  = runningStride;
+    outputStrides[axis] = runningStride;
+    runningStride *= extents[axis];
+  }
+
+  // create usr layout
+  this->input->createUsrLayout(dimension, inputSize, inputStrides);
+  this->output->createUsrLayout(dimension, outputSize, outputStrides);
+
+  this->gradInput->createUsrLayout(dimension, inputSize, inputStrides);
+  this->gradOutput->createUsrLayout(dimension, outputSize, outputStrides);
+}
+
+template <typename DType>
+void MKLLRN<DType>::firstPass()
+{  // Creates both LRN primitives, the MKL layouts and the workspace.
+  dnnError_t status = E_UNIMPLEMENTED;
+  dnnLayout_t layout;
+  // temporary layout describing the input; deleted again further down
+  status =
+      dnnLayoutCreate<DType>(&layout, this->dimension, inputSize, inputStrides);
+  CHECK_EQ(status, E_SUCCESS);
+  // forward primitive and the layouts derived from it
+  status = dnnLRNCreateForward<DType>(&(this->forwardPrim), NULL, layout, size,
+                                      alpha, beta, k);
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->input->createMklLayout(this->forwardPrim, dnnResourceSrc);
+  this->output->createMklLayout(this->forwardPrim, dnnResourceDst);
+  // backward primitive: the same layout is passed for both layout arguments
+  status = dnnLRNCreateBackward<DType>(&(this->backwardPrim), NULL, layout,
+                                       layout, size, alpha, beta, k);
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->gradOutput->createMklLayout(this->backwardPrim, dnnResourceDiffDst);
+  this->gradInput->createMklLayout(this->backwardPrim, dnnResourceDiffSrc);
+
+  // create workspace (true is the doNotCreateConversion argument)
+  this->workspace->createMklLayout(this->forwardPrim, dnnResourceWorkspace);
+  this->workspace->createConversion(true);
+
+  dnnLayoutDelete<DType>(layout);
+
+  // we create the layout only at the first time
+  this->isFirstPass = false;
+}
+
+template <typename DType>
+void MKLLRN<DType>::preExecute(DType *input)
+{  // Re-creates the input conversion; the input argument is never read.
+  this->input->createConversion();
+}
+
+template <typename DType>
+void MKLLRN<DType>::updateOutput(DType *input, DType *output)
+{  // Forward pass: executes forwardPrim with src, dst and workspace.
+  if (this->isFirstPass) firstPass();
+
+  // The user buffers may sit at a different address on every call, so the
+  // conversions are re-created for each forward/backward.
+  // NOTE(review): the former kernel/bias TODO did not apply; LRN has neither.
+  preExecute(input);
+  this->output->createConversion();
+
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->input->getUsrData()),
+                   this->inputSize[3], this->inputSize[2], this->inputSize[1],
+                   this->inputSize[0], "Forward input");
+#endif
+
+  dnnError_t status;
+  void *resources[dnnResourceNumber];
+
+  resources[dnnResourceSrc]       = this->input->getConvertedData();
+  resources[dnnResourceDst]       = this->output->getData();
+  resources[dnnResourceWorkspace] = this->workspace->getData();
+
+  PERFSTART();
+  status = dnnExecute<DType>(this->forwardPrim, resources);
+  PERFEND("main computing");
+  CHECK_EQ(status, E_SUCCESS);
+
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->output->getData()),
+                   outputSize[3], outputSize[2], outputSize[1], outputSize[0],
+                   "Forward output");
+#endif
+  // skip the conversion back when the output is flagged as used by the next
+  if (!this->output->isUseNext()) {
+    this->output->backToUsr();
+  }
+}
+
+template <typename DType>
+void MKLLRN<DType>::updateGradInput(DType *input, DType *gradOutput,
+                                    DType *gradInput)
+{
+  // Backward pass: runs backwardPrim with gradOutput, the forward input and
+  // the workspace, writing the result into gradInput.
+  preExecute(input);
+  this->gradOutput->createConversion();
+  this->gradInput->createConversion();
+
+  void *args[dnnResourceNumber];
+  args[dnnResourceDiffDst]   = this->gradOutput->getConvertedData();
+  args[dnnResourceDiffSrc]   = this->gradInput->getData();
+  args[dnnResourceSrc]       = this->input->getConvertedData();
+  args[dnnResourceWorkspace] = this->workspace->getData();
+
+  // timed execution of the backward primitive
+  PERFSTART();
+  dnnError_t status = dnnExecute<DType>(this->backwardPrim, args);
+  CHECK_EQ(status, E_SUCCESS);
+  PERFEND("main computing");
+
+  // skip the conversion back when gradInput is flagged as used by the prev
+  if (!this->gradInput->isUsePrev()) {
+    this->gradInput->backToUsr();
+  }
+
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->gradInput->getUsrData()),
+                   inputSize[3], inputSize[2], inputSize[1], inputSize[0],
+                   "backward gradient input");
+#endif
+}
+
+template <typename ArrayType, typename DType>
+jlong JNILRNInit(JNIEnv *env, jclass thisClass, jint inputNumber,
+                 jint inputChannel, jint inputHeight, jint inputWidth,
+                 jint size, DType alpha, DType beta, DType k, jint dimension)
+{
+  MKLLRN<DType> *lrn = new MKLLRN<DType>();
+  lrn->init(inputNumber, inputChannel, inputHeight, inputWidth, size, alpha,
+            beta, k, dimension);
+  // Returned as a handle; jlong (not long) holds a full pointer on LLP64 too.
+  return reinterpret_cast<jlong>(lrn);
+}
+
+template <typename ArrayType, typename DType>
+void JNILRNUpdateOutput(JNIEnv *env, jclass thisClass, ArrayType input,
+                        jint inputOffset, ArrayType output, jint outputOffset,
+                        jlong classPtr)
+{
+  // classPtr is the handle from JNILRNInit; jlong keeps all 64 bits of it.
+  MKLLRN<DType> *ptr = reinterpret_cast<MKLLRN<DType> *>(classPtr);
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInput(
+      new ZipArray<ArrayType, DType>(env, input, inputOffset, ptr->input));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jOutput(
+      new ZipArray<ArrayType, DType>(env, output, outputOffset, ptr->output));
+
+  ptr->updateOutput(jInput->getPtr(), jOutput->getPtr());
+}
+
+template <typename ArrayType, typename DType>
+void JNILRNUpdateGradInput(JNIEnv *env, jclass thisClass, ArrayType input,
+                           jint inputOffset, ArrayType outputDiff,
+                           jint outputDiffOffset, ArrayType inputDiff,
+                           jint inputDiffOffset, jlong classPtr)
+{
+  MKLLRN<DType> *ptr = reinterpret_cast<MKLLRN<DType> *>(classPtr);
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInput(
+      new ZipArray<ArrayType, DType>(env, input, inputOffset, ptr->input));
+  // classPtr is declared jlong so the handle keeps all 64 bits on LLP64 too.
+  std::shared_ptr<ZipArray<ArrayType, DType>> jOutputDiff(
+      new ZipArray<ArrayType, DType>(env, outputDiff, outputDiffOffset,
+                                     ptr->gradOutput));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInputDiff(
+      new ZipArray<ArrayType, DType>(env, inputDiff, inputDiffOffset,
+                                     ptr->gradInput));
+
+  ptr->updateGradInput(jInput->getPtr(), jOutputDiff->getPtr(),
+                       jInputDiff->getPtr());
+}
+
+// Emits the JNI export LRNInit<DType>; it forwards to JNILRNInit.
+#define LRNInit(DType, JType, JArrayType)                                    \
+  JNIEXPORT                                                                  \
+  jlong JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_LRNInit##DType(     \
+      JNIEnv *env, jclass thisClass, jint inputNumber, jint inputChannel,    \
+      jint inputHeight, jint inputWidth, jint size, JType alpha, JType beta, \
+      JType k, jint dimension)                                               \
+  {                                                                          \
+    return JNILRNInit<JArrayType, JType>(                                    \
+        env, thisClass, inputNumber, inputChannel, inputHeight, inputWidth,  \
+        size, alpha, beta, k, dimension);                                    \
+  }
+
+// JNI export LRNForward<DType>; classPtr is jlong, the JNI type of long.
+#define LRNForward(DType, JType, JArrayType)                                  \
+  JNIEXPORT void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_LRNForward##DType( \
+      JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,      \
+      JArrayType output, jint outputOffset, jlong classPtr)                   \
+  {                                                                           \
+    JNILRNUpdateOutput<JArrayType, JType>(env, thisClass, input, inputOffset, \
+                                          output, outputOffset, classPtr);    \
+  }
+
+// JNI export LRNBackward<DType>; classPtr is jlong, the JNI type of long.
+#define LRNBackward(DType, JType, JArrayType)                               \
+  JNIEXPORT void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_LRNBackward##DType( \
+      JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,    \
+      JArrayType outputDiff, jint outputDiffOffset, JArrayType inputDiff,   \
+      jint inputDiffOffset, jlong classPtr)                                 \
+  {                                                                         \
+    JNILRNUpdateGradInput<JArrayType, JType>(                               \
+        env, thisClass, input, inputOffset, outputDiff, outputDiffOffset,   \
+        inputDiff, inputDiffOffset, classPtr);                              \
+  }
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// C linkage keeps the Java_... symbol names unmangled.
+// double: the three LRN entry points with the Double suffix
+LRNInit(Double, jdouble, jdoubleArray);
+LRNForward(Double, jdouble, jdoubleArray);
+LRNBackward(Double, jdouble, jdoubleArray);
+
+// float: the same three entry points with the Float suffix
+LRNInit(Float, jfloat, jfloatArray);
+LRNForward(Float, jfloat, jfloatArray);
+LRNBackward(Float, jfloat, jfloatArray);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/mkl/native/src/main/c/jni/memory.h b/mkl/native/src/main/c/jni/memory.h
new file mode 100644
index 00000000000..9d2b8b9ec98
--- /dev/null
+++ b/mkl/native/src/main/c/jni/memory.h
@@ -0,0 +1,434 @@
+#ifndef _MKL_MEMORY_H
+#define _MKL_MEMORY_H
+
+#include <jni.h>
+#include <cstring>
+#include <memory>
+#include "MKLWrapper.h"
+#include "debug.h"
+
+// Couples a caller-supplied ("usr") buffer with an MKL buffer and layouts.
+template <typename DType>
+class MKLData
+{
+ public:
+  MKLData();
+  ~MKLData();
+
+  template <typename JArrayType, typename JType>
+  friend class ZipArray;
+
+  // set
+  void createUsrLayout(int dimensions, size_t *size, size_t *stride);
+  void createMklLayout(dnnPrimitive_t primitive, dnnResourceType_t type);
+  /**
+   * @brief create the mkl <-> usr conversions (and the mkl buffer if needed)
+   *
+   * @param doNotCreateConversion This argument is only for pooling. Because it
+   *                              can't be converted when the mode is floor.
+   */
+  void createConversion(bool doNotCreateConversion = false);
+  void backToUsr();
+  // TODO If the input is always the same, we should not have a set method.
+  void setUsrData(void *ptr);
+  // this is only for re-using previous layer memory.
+  void setMklData(void *ptr);  // NOTE(review): not defined in this header
+
+  // get
+  dnnLayout_t getUsrLayout();
+  dnnLayout_t getMklLayout();
+
+  // TODO should we combine these two versions of getData -> one version?
+  void *getData();
+  void *getConvertedData();
+
+  // for debug
+  void *getUsrData();
+  void *getMklData();
+
+  // for re-using output generated by mkl.
+  bool isUseNext();
+  bool isUsePrev();
+
+  void setUseNext(bool val);
+  void setUsePrev(bool val);
+
+  // Currently, these two methods substitute for backToUsr in pooling layer.
+  /**
+   * @brief cut the last row and column of every matrix in 4-D data.
+   *
+   * Note: MUST be used in mkl -> usr data.
+   *
+   * @param fromStrides mkl data strides.
+   * @param toSize usr data size.
+   * @param toStrides usr data strides.
+   */
+  void cutLastRowColumn(size_t *fromStrides, size_t *toSize,
+                        size_t *toStrides);
+  /**
+   * @brief pad the last row and column of every matrix in 4-D data.
+   *
+   * Note: MUST be used in usr -> mkl data.
+   *
+   * @param fromSize usr data size
+   * @param fromStrides usr data strides
+   * @param toSize mkl data size
+   * @param toStrides mkl data strides
+   */
+  void padLastRowColumn(size_t *fromSize, size_t *fromStrides, size_t *toSize,
+                        size_t *toStrides);
+
+  size_t getMklLayoutSize();
+
+ private:
+  // call dnnAllocateBuffer to allocate a new, zero-filled block of mem
+  void allocate();
+  void convert(dnnPrimitive_t primitive, void *from, void *to);
+
+  dnnLayout_t layoutUsr;  // created by createUsrLayout
+  dnnLayout_t layoutMkl;  // created by createMklLayout
+
+  void *dataUsr;  // set through setUsrData; never released by this class
+  void *dataMkl;  // allocated by allocate(); released in the destructor
+
+  dnnPrimitive_t mklToUsr;  // conversion layoutMkl -> layoutUsr
+  dnnPrimitive_t usrToMkl;  // conversion layoutUsr -> layoutMkl
+
+  bool useNext;
+  bool usePrev;
+};
+
+// Starts with no layouts, no buffers and no conversion primitives, and with
+// both reuse flags cleared. The initializers are listed in the order the
+// members are declared in the class.
+template <typename DType>
+MKLData<DType>::MKLData()
+    : layoutUsr(NULL),
+      layoutMkl(NULL),
+      dataUsr(NULL),
+      dataMkl(NULL),
+      mklToUsr(NULL),
+      usrToMkl(NULL),
+      useNext(false),
+      usePrev(false)
+{
+}
+
+// Releases the layouts, MKL buffer and primitives; dataUsr is not touched.
+template <typename DType>
+MKLData<DType>::~MKLData()
+{
+  if (layoutUsr) {
+    dnnLayoutDelete<DType>(layoutUsr);
+    layoutUsr = NULL;
+  }
+  if (layoutMkl) {
+    dnnLayoutDelete<DType>(layoutMkl);
+    layoutMkl = NULL;
+  }
+  if (dataMkl) {
+    dnnReleaseBuffer<DType>(dataMkl);
+    dataMkl = NULL;
+  }
+  // The conversion primitives may never have been created (createConversion
+  // can return early), so guard the deletes the same way it does.
+  if (mklToUsr) dnnDelete<DType>(mklToUsr);
+  if (usrToMkl) dnnDelete<DType>(usrToMkl);
+}
+
+template <typename DType>
+void MKLData<DType>::createUsrLayout(int dimension, size_t *size,
+                                     size_t *stride)
+{
+  dnnError_t status =  // usr layout built from explicit sizes and strides
+      dnnLayoutCreate<DType>(&layoutUsr, dimension, size, stride);
+  CHECK_EQ(status, E_SUCCESS);
+}
+
+template <typename DType>
+void MKLData<DType>::createMklLayout(dnnPrimitive_t primitive,
+                                     dnnResourceType_t type)
+{
+  dnnError_t status =  // layout the primitive reports for resource `type`
+      dnnLayoutCreateFromPrimitive<DType>(&layoutMkl, primitive, type);
+  CHECK_EQ(status, E_SUCCESS);
+}
+
+template <typename DType>
+void MKLData<DType>::createConversion(bool doNotCreateConversion)
+{
+  // Nothing to do when neither layout exists, or when one of the reuse flags
+  // (usePrev / useNext) is set.
+  if (!layoutUsr && !layoutMkl) return;
+  if (isUsePrev() || isUseNext()) return;
+
+  // Identical layouts need neither an MKL buffer nor conversions. Note that
+  // equal dnnLayoutGetMemorySize values would not prove the layouts equal.
+  if (dnnLayoutCompare<DType>(layoutUsr, layoutMkl)) return;
+
+  // The layouts differ, so an MKL-side buffer is required.
+  if (!dataMkl) allocate();
+
+  // Callers passing doNotCreateConversion only want the buffer (pooling).
+  if (doNotCreateConversion) return;
+
+  // Drop primitives left over from a previous call before rebuilding them.
+  if (mklToUsr) {
+    dnnDelete<DType>(mklToUsr);
+    mklToUsr = NULL;
+  }
+  if (usrToMkl) {
+    dnnDelete<DType>(usrToMkl);
+    usrToMkl = NULL;
+  }
+
+  dnnError_t status;
+  status = dnnConversionCreate<DType>(&mklToUsr, layoutMkl, layoutUsr);
+  CHECK_EQ(status, E_SUCCESS);
+
+  status = dnnConversionCreate<DType>(&usrToMkl, layoutUsr, layoutMkl);
+  CHECK_EQ(status, E_SUCCESS);
+}
+
+template <typename DType>
+void MKLData<DType>::backToUsr()
+{
+  // Converts dataMkl into dataUsr; does nothing unless both buffers are set.
+  // TODO we should put the if statement of isUseNex here.
+  if (!dataUsr || !dataMkl) return;
+  convert(mklToUsr, dataMkl, dataUsr);
+}
+
+// Allocates dataMkl for layoutMkl through dnnAllocateBuffer and zero-fills it.
+template <typename DType>
+void MKLData<DType>::allocate()
+{
+  dnnError_t status = dnnAllocateBuffer<DType>(&dataMkl, layoutMkl);
+  CHECK_EQ(status, E_SUCCESS);
+
+  const size_t size = dnnLayoutGetMemorySize<DType>(layoutMkl);
+  memset(dataMkl, 0, size);
+
+  LOG(INFO) << "Allocating layout memory -> " << size << " bytes...";
+}
+
+// Executes a conversion primitive with `from` bound to dnnResourceFrom and
+// `to` bound to dnnResourceTo; a failing dnnExecute trips CHECK_EQ.
+template <typename DType>
+void MKLData<DType>::convert(dnnPrimitive_t primitive, void *from, void *to)
+{
+  void *resources[dnnResourceNumber];
+  resources[dnnResourceFrom] = from;
+  resources[dnnResourceTo]   = to;
+
+  PERFSTART();
+  dnnError_t status = dnnExecute<DType>(primitive, resources);
+  CHECK_EQ(status, E_SUCCESS);
+  PERFEND("main computing");
+}
+
+template <typename DType>
+void *MKLData<DType>::getConvertedData()
+{
+  // TODO something wrong
+  // 1. The data of previous layer we use should be allocated by mkl
+  // 2. Default it always convert the data.
+  if (usrToMkl) {
+    // A usr -> mkl conversion exists: refresh dataMkl from dataUsr unless
+    // usePrev or useNext is set, then hand out the MKL buffer.
+    const bool reused = isUsePrev() || isUseNext();
+    if (!reused) {
+      convert(usrToMkl, dataUsr, dataMkl);
+    }
+    return dataMkl;
+  }
+
+  // sometimes, we need create memory for mkl, like workspace in pooling.
+  // Without any MKL buffer the usr buffer is returned as it is.
+  return dataMkl ? dataMkl : dataUsr;
+}
+
+template <typename DType>
+void *MKLData<DType>::getData()
+{
+  // Never runs a conversion: returns the MKL buffer when one exists and the
+  // usr buffer otherwise.
+  //
+  // sometimes, we need create memory for mkl, like workspace in pooling.
+  if (dataMkl) {
+    return dataMkl;
+  }
+  return dataUsr;
+}
+
+template <typename DType>
+void MKLData<DType>::setUsrData(void *ptr)
+{
+  this->dataUsr = ptr;  // stored as-is; this class never frees dataUsr
+}
+
+template <typename DType>
+void *MKLData<DType>::getUsrData()
+{
+  return this->dataUsr;  // last pointer given to setUsrData, or NULL
+}
+
+template <typename DType>
+void *MKLData<DType>::getMklData()
+{
+  return this->dataMkl;  // NULL until an MKL buffer has been allocated
+}
+
+template <typename DType>
+bool MKLData<DType>::isUseNext()
+{
+  return this->useNext;  // flag only; false until setUseNext(true)
+}
+
+template <typename DType>
+bool MKLData<DType>::isUsePrev()
+{
+  return this->usePrev;  // flag only; false until setUsePrev(true)
+}
+
+template <typename DType>
+void MKLData<DType>::setUseNext(bool val)
+{
+  this->useNext = val;  // when true, createConversion becomes a no-op
+}
+
+template <typename DType>
+void MKLData<DType>::setUsePrev(bool val)
+{
+  this->usePrev = val;  // when true, createConversion becomes a no-op
+}
+
+template <typename DType>
+void MKLData<DType>::cutLastRowColumn(size_t *fromStrides, size_t *toSize,
+                                      size_t *toStrides)
+{
+  // Copies the toSize region of the mkl buffer (dataMkl, addressed with
+  // fromStrides) into the usr buffer (dataUsr, addressed with toStrides),
+  // which drops the extra last row / column of every matrix (height * width)
+  // in the output generated by MKL2017.
+  // TODO this should be optimized: a memcpy per row may be much better.
+  // TODO the default dimension is 4
+  // Counters and indices are size_t, like the bounds and strides, so that
+  // large tensors cannot overflow an int index.
+  DType *from = reinterpret_cast<DType *>(dataMkl);
+  DType *to   = reinterpret_cast<DType *>(dataUsr);
+  PERFSTART();
+  for (size_t n = 0; n < toSize[3]; n++)
+    for (size_t c = 0; c < toSize[2]; c++)
+      for (size_t h = 0; h < toSize[1]; h++)      // height
+        for (size_t w = 0; w < toSize[0]; w++) {  // width
+          size_t toIndex =
+              n * toStrides[3] + c * toStrides[2] + h * toStrides[1] + w;
+          size_t fromIndex =
+              n * fromStrides[3] + c * fromStrides[2] + h * fromStrides[1] + w;
+          to[toIndex] = from[fromIndex];
+        }
+  PERFEND("convert : cut last row and column of a matrix");
+}
+
+template <typename DType>
+void MKLData<DType>::padLastRowColumn(size_t *fromSize, size_t *fromStrides,
+                                      size_t *toSize, size_t *toStrides)
+{
+  // Copies the usr tensor (dataUsr, fromSize/fromStrides) into the larger mkl
+  // buffer (dataMkl, toSize/toStrides) and zero-fills the extra last column
+  // and row. Each buffer is addressed with its own strides.
+  DType *from = reinterpret_cast<DType *>(dataUsr);
+  DType *to   = reinterpret_cast<DType *>(dataMkl);
+  PERFSTART();
+  for (size_t n = 0; n < fromSize[3]; n++) {
+    for (size_t c = 0; c < fromSize[2]; c++) {
+      size_t toBase   = n * toStrides[3] + c * toStrides[2];
+      size_t fromBase = n * fromStrides[3] + c * fromStrides[2];
+      for (size_t h = 0; h < fromSize[1]; h++) {  // height
+        memcpy(to + toBase + h * toStrides[1],
+               from + fromBase + h * fromStrides[1],
+               fromSize[0] * sizeof(DType));
+        // pad the last column of the row with 0. we only need to set
+        // one element to 0, because 0 <= ceil - floor <= 1
+        if (toSize[0] != fromSize[0]) {
+          to[toBase + h * toStrides[1] + fromSize[0]] = 0;
+        }
+      }
+
+      // pad the last row (row index fromSize[1]) with 0 * width
+      if (toSize[1] != fromSize[1]) {
+        memset(to + toBase + fromSize[1] * toStrides[1], 0,
+               toSize[0] * sizeof(DType));
+      }
+    }
+  }
+  PERFEND("convert : pad last row and column of a matrix with 0");
+}
+
+template <typename DType>
+size_t MKLData<DType>::getMklLayoutSize()
+{
+  // Size reported by dnnLayoutGetMemorySize for the MKL layout, or 0 when
+  // no MKL layout has been created yet.
+  if (!layoutMkl) return 0;
+  return dnnLayoutGetMemorySize<DType>(layoutMkl);
+}
+
+template <typename DType>
+dnnLayout_t MKLData<DType>::getUsrLayout()
+{
+  return this->layoutUsr;  // NULL until createUsrLayout has been called
+}
+
+template <typename DType>
+dnnLayout_t MKLData<DType>::getMklLayout()
+{
+  // Prefers the MKL layout; without one the usr layout is returned instead
+  // (which may itself still be NULL).
+  if (!layoutMkl) return layoutUsr;
+  return layoutMkl;
+}
+
+template <typename JArrayType, typename JType>
+class ZipArray  // scoped holder for a Java primitive array's critical region
+{
+ public:
+  ZipArray(JNIEnv *env, JArrayType array, jint offset,
+           std::shared_ptr<MKLData<JType>> mklData);  // acquires the array
+  ~ZipArray();  // releases it; NOTE(review): copying would release twice
+
+  JType *getPtr();  // start of the array; the offset is NOT applied here
+
+ private:
+  void *ptr;         // pointer returned by GetPrimitiveArrayCritical
+  JArrayType array;  // the Java array handle passed to the constructor
+  JNIEnv *env;       // JNI environment used for acquire and release
+};
+
+template <typename JArrayType, typename JType>
+ZipArray<JArrayType, JType>::ZipArray(JNIEnv *env, JArrayType array,
+                                      jint offset,
+                                      std::shared_ptr<MKLData<JType>> mklData)
+{
+  // Acquire the array first; env and array are kept for the destructor.
+  this->ptr   = env->GetPrimitiveArrayCritical(array, 0);
+  this->env   = env;
+  this->array = array;
+
+  // Register the element at `offset` as usr data, when an MKLData was given.
+  if (mklData) mklData->setUsrData(reinterpret_cast<JType *>(ptr) + offset);
+}
+
+template <typename JArrayType, typename JType>
+ZipArray<JArrayType, JType>::~ZipArray()
+{  // hands the pointer obtained in the constructor back to the JVM (mode 0)
+  this->env->ReleasePrimitiveArrayCritical(this->array, this->ptr, 0);
+}
+
+template <typename JArrayType, typename JType>
+JType *ZipArray<JArrayType, JType>::getPtr()
+{
+  return reinterpret_cast<JType *>(this->ptr);  // array start, offset ignored
+}
+
+#endif
diff --git a/mkl/native/src/main/c/jni/mkl.c b/mkl/native/src/main/c/jni/omp_threads.cpp
similarity index 71%
rename from mkl/native/src/main/c/jni/mkl.c
rename to mkl/native/src/main/c/jni/omp_threads.cpp
index fcb600f70b0..db7e50f2789 100644
--- a/mkl/native/src/main/c/jni/mkl.c
+++ b/mkl/native/src/main/c/jni/omp_threads.cpp
@@ -9,19 +9,20 @@ extern "C" {
  * Method:    setNumThreads
  * Signature: (I)V
  */
-JNIEXPORT void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_setNumThreads
-  (JNIEnv * env, jclass cls, jint num_threads) {
+JNIEXPORT void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_setNumThreads(
+    JNIEnv* env, jclass cls, jint num_threads)
+{
   omp_set_num_threads(num_threads);
 }
 
-
 /*
  * Class:     com_intel_webscaleml_mkl_MKL
  * Method:    getNumThreads
  * Signature: ()I
  */
-JNIEXPORT jint JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_getNumThreads
-  (JNIEnv * env, jclass cls) {
+JNIEXPORT jint JNICALL
+Java_com_intel_analytics_sparkdl_mkl_MKL_getNumThreads(JNIEnv* env, jclass cls)
+{
   return omp_get_max_threads();
 }
 
diff --git a/mkl/native/src/main/c/jni/pooling.cpp b/mkl/native/src/main/c/jni/pooling.cpp
new file mode 100644
index 00000000000..be3b077b9b3
--- /dev/null
+++ b/mkl/native/src/main/c/jni/pooling.cpp
@@ -0,0 +1,378 @@
+#include <jni.h>
+
+#include "debug.h"
+#include "layer.h"
+#include "memory.h"
+#include "utils.h"
+
+enum Algorithm { MAX, AVG, MIN };  // 0, 1, 2: JNIPoolingInit casts jint pAl
+
+template <typename DType>
+class MKLPooling : public MKLLayer<DType>  // pooling layer on MKL DNN
+{
+ public:
+  MKLPooling();
+  ~MKLPooling();
+
+  void init(size_t inputNumber, size_t inputChannel, size_t inputHeight,
+            size_t inputWidth, size_t kernelHeight, size_t kernelWidth,
+            size_t strideHeight, size_t strideWidth, int padHeight,
+            int padWidth, int dimension, bool ceilMode, Algorithm pAl);
+
+  void updateOutput(DType *input, DType *output);  // forward pass
+  void updateGradInput(DType *input, DType *gradOutput, DType *gradInput);
+
+ private:
+  std::shared_ptr<MKLData<DType>> workspace;  // bound to dnnResourceWorkspace
+
+  size_t inputSize[4];     // [0] width, [1] height, [2] channel, [3] number
+  size_t inputStrides[4];  // dense strides derived from inputSize
+
+  size_t kernelSize[2];  // [0] width, [1] height
+
+  size_t outputSizeCeil[4];     // output size computed with ceil rounding
+  size_t outputStridesCeil[4];  // dense strides of the ceil-sized output
+
+  size_t outputSizeFloor[4];     // output size computed with floor rounding
+  size_t outputStridesFloor[4];  // dense strides of the floor-sized output
+
+  size_t stride[2];  // [0] width, [1] height
+  int pad[2];        // [0] -padWidth, [1] -padHeight (stored negated)
+
+  // Algorithm for pooling : max, average, min. Unknown values fall back to max
+  dnnAlgorithm_t algorithm;
+  // True when the usr-side output uses the ceil size. init() also forces it
+  // to true when the ceil and floor output sizes coincide.
+  bool ceilMode;
+};
+
+template <typename DType>
+MKLPooling<DType>::MKLPooling() : workspace(std::make_shared<MKLData<DType>>())
+{  // the workspace buffer itself is allocated on the first forward pass
+}
+
+template <typename DType>
+MKLPooling<DType>::~MKLPooling()
+{  // empty body: workspace is a shared_ptr member and is released implicitly
+}
+
+template <typename DType>
+void MKLPooling<DType>::init(size_t inputNumber, size_t inputChannel,
+                             size_t inputHeight, size_t inputWidth,
+                             size_t kernelHeight, size_t kernelWidth,
+                             size_t strideHeight, size_t strideWidth,
+                             int padHeight, int padWidth, int dimension,
+                             bool ceilMode, Algorithm pAl)
+{  // records the geometry and creates the usr layouts; no primitive yet
+  MKLLayer<DType>::init(inputNumber, inputChannel, inputHeight, inputWidth,
+                        dimension);
+
+  switch (pAl) {  // map the local enum onto the MKL algorithm id
+    case MAX:
+      algorithm = dnnAlgorithmPoolingMax;
+      break;
+    case AVG:
+      algorithm = dnnAlgorithmPoolingAvg;
+      break;
+    case MIN:
+      algorithm = dnnAlgorithmPoolingMin;
+      break;
+    default:  // unknown value: fall back to max pooling
+      algorithm = dnnAlgorithmPoolingMax;
+  }
+
+  stride[0] = strideWidth;
+  stride[1] = strideHeight;
+
+  kernelSize[0] = kernelWidth;
+  kernelSize[1] = kernelHeight;
+
+  pad[0] = -padWidth;  // stored negated; passed to dnnPoolingCreate* later
+  pad[1] = -padHeight;
+
+  this->ceilMode = ceilMode;
+
+  inputSize[0] = inputWidth;
+  inputSize[1] = inputHeight;
+  inputSize[2] = inputChannel;
+  inputSize[3] = inputNumber;
+
+  inputStrides[0] = 1;
+  for (int i        = 1; i < 4; i++)
+    inputStrides[i] = inputStrides[i - 1] * inputSize[i - 1];
+
+  // compute output sizes; presumably the last argument selects ceil rounding
+  outputSizeCeil[0] =
+      computeOut(inputWidth, padWidth, kernelWidth, strideWidth, true);
+  outputSizeCeil[1] =
+      computeOut(inputHeight, padHeight, kernelHeight, strideHeight, true);
+  outputSizeCeil[2] = this->inputSize[2];
+  outputSizeCeil[3] = this->inputSize[3];
+
+  outputSizeFloor[0] =
+      computeOut(inputWidth, padWidth, kernelWidth, strideWidth, false);
+  outputSizeFloor[1] =
+      computeOut(inputHeight, padHeight, kernelHeight, strideHeight, false);
+  outputSizeFloor[2] = this->inputSize[2];
+  outputSizeFloor[3] = this->inputSize[3];
+
+  // dense strides of the floor-sized and the ceil-sized output
+  outputStridesFloor[0] = 1;
+  outputStridesCeil[0]  = 1;
+  for (int i = 1; i < 4; i++) {
+    outputStridesFloor[i] = outputStridesFloor[i - 1] * outputSizeFloor[i - 1];
+    outputStridesCeil[i]  = outputStridesCeil[i - 1] * outputSizeCeil[i - 1];
+  }
+
+  if (outputSizeCeil[0] == outputSizeFloor[0] &&
+      outputSizeCeil[1] == outputSizeFloor[1])
+    this->ceilMode = true;  // both roundings agree: treat as ceil mode
+
+  // create usr layouts; the output side uses the size selected by ceilMode.
+  this->input->createUsrLayout(dimension, inputSize, inputStrides);
+  this->gradInput->createUsrLayout(dimension, inputSize, inputStrides);
+  if (this->ceilMode) {
+    this->output->createUsrLayout(dimension, outputSizeCeil, outputStridesCeil);
+    this->gradOutput->createUsrLayout(dimension, outputSizeCeil,
+                                      outputStridesCeil);
+  } else {
+    this->output->createUsrLayout(dimension, outputSizeFloor,
+                                  outputStridesFloor);
+    this->gradOutput->createUsrLayout(dimension, outputSizeFloor,
+                                      outputStridesFloor);
+  }
+}
+
+// Forward pass. Builds layouts/primitives on the first call, then executes.
+template <typename DType>
+void MKLPooling<DType>::updateOutput(DType *input, DType *output)
+{
+  dnnError_t status  = E_UNIMPLEMENTED;
+  dnnLayout_t layout = NULL;
+
+// It's very strange, the address of input changes every time.
+#ifdef DEBUG
+  if (this->input->getUsrData() && this->input->getUsrData() != input)
+    LOG(DBG) << "the address of input is not the same with preserved.";
+#endif
+
+  if (this->isFirstPass) {
+    status = dnnLayoutCreate<DType>(&layout, this->dimension, this->inputSize,
+                                    this->inputStrides);
+    CHECK_EQ(status, E_SUCCESS);
+
+    // forward
+    status = dnnPoolingCreateForward<DType>(&(this->forwardPrim), NULL,
+                                            algorithm, layout, kernelSize,
+                                            stride, pad, dnnBorderZeros);
+    CHECK_EQ(status, E_SUCCESS);
+    this->input->createMklLayout(this->forwardPrim, dnnResourceSrc);
+    this->output->createMklLayout(this->forwardPrim, dnnResourceDst);
+    this->workspace->createMklLayout(this->forwardPrim, dnnResourceWorkspace);
+    this->workspace->createConversion(true);
+
+    // backward
+    status = dnnPoolingCreateBackward<DType>(&(this->backwardPrim), NULL,
+                                             algorithm, layout, kernelSize,
+                                             stride, pad, dnnBorderZeros);
+    CHECK_EQ(status, E_SUCCESS);
+
+    this->gradInput->createMklLayout(this->backwardPrim, dnnResourceDiffSrc);
+    this->gradOutput->createMklLayout(this->backwardPrim, dnnResourceDiffDst);
+    dnnLayoutDelete<DType>(layout);
+
+    // layouts and primitives are created once and never change afterwards.
+    this->isFirstPass = false;
+  }
+
+  // Because the address will change every time, so we need create conversion
+  // every forward/backward.
+  this->input->setUsrData(input);
+  this->input->createConversion();
+
+  this->output->setUsrData(output);
+  this->output->createConversion(!(ceilMode));
+
+  void *resources[dnnResourceNumber];
+  resources[dnnResourceSrc]       = this->input->getConvertedData();
+  resources[dnnResourceDst]       = this->output->getData();
+  resources[dnnResourceWorkspace] = this->workspace->getData();
+
+  PERFSTART();
+  status = dnnExecute<DType>(this->forwardPrim, resources);
+  CHECK_EQ(status, E_SUCCESS);
+  PERFEND("main computing");
+  // Dump the buffer MKL just wrote (getData); it is addressed as ceil-sized.
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->output->getData()),
+                   outputSizeCeil[3], outputSizeCeil[2], outputSizeCeil[1],
+                   outputSizeCeil[0],
+                   "Pooling forward output data generated by MKL2017");
+#endif
+
+  if (!this->output->isUseNext()) {
+    if (ceilMode) {
+      this->output->backToUsr();
+    } else {
+      this->output->cutLastRowColumn(outputStridesCeil, outputSizeFloor,
+                                     outputStridesFloor);
+    }
+  }
+
+#ifdef DEBUG
+  size_t *sz = ceilMode ? outputSizeCeil : outputSizeFloor;  // usr-side size
+  printData<DType>(reinterpret_cast<DType *>(this->output->getUsrData()),
+                   sz[3], sz[2], sz[1], sz[0],
+                   "Pooling forward output data generated by MKL2017");
+#endif
+}
+
+template <typename DType>
+void MKLPooling<DType>::updateGradInput(DType *input, DType *gradOutput,
+                                        DType *gradInput)
+{
+#ifdef DEBUG
+  LOG(DBG) << "gradOutput = " << gradOutput
+           << " dataUsr = " << this->gradOutput->getUsrData();
+#endif
+  // Backward pass. The usr pointers are registered again on every call and
+  // the conversions rebuilt, because the addresses change between calls.
+  this->gradInput->setUsrData(gradInput);
+  this->gradInput->createConversion();
+
+  const bool floorMode = !ceilMode;
+  this->gradOutput->setUsrData(gradOutput);
+  this->gradOutput->createConversion(floorMode);
+
+  // Floor mode: copy the usr gradient into the ceil-sized MKL buffer by hand.
+  if (floorMode) {
+    this->gradOutput->padLastRowColumn(outputSizeFloor, outputStridesFloor,
+                                       outputSizeCeil, outputStridesCeil);
+  }
+
+  void *resources[dnnResourceNumber];
+  resources[dnnResourceDiffDst]   = this->gradOutput->getConvertedData();
+  resources[dnnResourceDiffSrc]   = this->gradInput->getData();
+  resources[dnnResourceWorkspace] = this->workspace->getData();
+
+  PERFSTART();
+  dnnError_t status = dnnExecute<DType>(this->backwardPrim, resources);
+  CHECK_EQ(status, E_SUCCESS);
+  PERFEND("main computing");
+  if (!this->gradInput->isUsePrev()) this->gradInput->backToUsr();
+}
+
+template <typename ArrayType, typename DType>
+jlong JNIPoolingInit(jint inputNumber, jint inputChannel, jint inputHeight,
+                     jint inputWidth, jint kernelHeight, jint kernelWidth,
+                     jint strideHeight, jint strideWidth, jint padHeight,
+                     jint padWidth, jint dimension, jint ceilMode, jint pAl)
+{
+  // The handle handed back to Java is simply the address of the new layer.
+  MKLPooling<DType> *layer = new MKLPooling<DType>();
+  layer->init(inputNumber, inputChannel, inputHeight, inputWidth, kernelHeight,
+              kernelWidth, strideHeight, strideWidth, padHeight, padWidth,
+              dimension, ceilMode, static_cast<Algorithm>(pAl));
+  return reinterpret_cast<jlong>(layer);
+}
+
+template <typename ArrayType, typename DType>
+void JNIPoolingUpdateOutput(JNIEnv *env, jclass thisClass, ArrayType input,
+                            jint inputOffset, ArrayType output,
+                            jint outputOffset, long classPtr)
+{
+  // Acquire both arrays; the base pointers are kept for the release calls.
+  void *inputBase  = env->GetPrimitiveArrayCritical(input, 0);
+  void *outputBase = env->GetPrimitiveArrayCritical(output, 0);
+
+  // classPtr is cast back to the layer type; JNIPoolingInit returns such an
+  // address. The layer sees the arrays shifted by their element offsets.
+  MKLPooling<DType> *layer = reinterpret_cast<MKLPooling<DType> *>(classPtr);
+  layer->updateOutput(static_cast<DType *>(inputBase) + inputOffset,
+                      static_cast<DType *>(outputBase) + outputOffset);
+
+  // Release in the same order as before: input first, then output.
+  env->ReleasePrimitiveArrayCritical(input, inputBase, 0);
+  env->ReleasePrimitiveArrayCritical(output, outputBase, 0);
+}
+
+template <typename ArrayType, typename DType>
+void JNIPoolingUpdateGradInput(JNIEnv *env, jclass thisClass, ArrayType input,
+                               jint inputOffset, ArrayType outputDiff,
+                               jint outputDiffOffset, ArrayType inputDiff,
+                               jint inputDiffOffset, long classPtr)
+{
+  // Acquire the three arrays; the base pointers are kept for the release.
+  void *inputBase      = env->GetPrimitiveArrayCritical(input, 0);
+  void *outputDiffBase = env->GetPrimitiveArrayCritical(outputDiff, 0);
+  void *inputDiffBase  = env->GetPrimitiveArrayCritical(inputDiff, 0);
+
+  DType *jInput      = static_cast<DType *>(inputBase) + inputOffset;
+  DType *jOutputDiff = static_cast<DType *>(outputDiffBase) + outputDiffOffset;
+  DType *jInputDiff  = static_cast<DType *>(inputDiffBase) + inputDiffOffset;
+
+  // classPtr is cast back to the layer type, as in the forward entry point.
+  MKLPooling<DType> *layer = reinterpret_cast<MKLPooling<DType> *>(classPtr);
+  layer->updateGradInput(jInput, jOutputDiff, jInputDiff);
+
+  // Release in acquisition order: input, outputDiff, inputDiff.
+  env->ReleasePrimitiveArrayCritical(input, inputBase, 0);
+  env->ReleasePrimitiveArrayCritical(outputDiff, outputDiffBase, 0);
+  env->ReleasePrimitiveArrayCritical(inputDiff, inputDiffBase, 0);
+}
+
+// Macros that generate the exported JNI functions; DType is the name suffix.
+#define PoolingInit(DType, JType, JArrayType)                                 \
+  JNIEXPORT                                                                   \
+  jlong JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_PoolingInit##DType(  \
+      JNIEnv *env, jclass thisClass, jint inputNumber, jint inputChannel,     \
+      jint inputHeight, jint inputWidth, jint kernelHeight, jint kernelWidth, \
+      jint strideHeight, jint strideWidth, jint padHeight, jint padWidth,     \
+      jint dimension, jint ceilMode, jint pAl)                                \
+  {                                                                           \
+    return JNIPoolingInit<JArrayType, JType>(                                 \
+        inputNumber, inputChannel, inputHeight, inputWidth, kernelHeight,     \
+        kernelWidth, strideHeight, strideWidth, padHeight, padWidth,          \
+        dimension, ceilMode, pAl);                                            \
+  }
+
+#define PoolingForward(DType, JType, JArrayType)                               \
+  JNIEXPORT /* classPtr is a Java long, hence jlong rather than long */        \
+  void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_PoolingForward##DType( \
+      JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,       \
+      JArrayType output, jint outputOffset, jlong classPtr)                    \
+  {                                                                            \
+    JNIPoolingUpdateOutput<JArrayType, JType>(                                 \
+        env, thisClass, input, inputOffset, output, outputOffset, classPtr);   \
+  }
+
+#define PoolingBackward(DType, JType, JArrayType)                             \
+  JNIEXPORT /* classPtr is a Java long, hence jlong rather than long */       \
+  void JNICALL                                                                \
+      Java_com_intel_analytics_sparkdl_mkl_MKL_PoolingBackward##DType(        \
+          JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,  \
+          JArrayType outputDiff, jint outputDiffOffset, JArrayType inputDiff, \
+          jint inputDiffOffset, jlong classPtr)                               \
+  {                                                                           \
+    JNIPoolingUpdateGradInput<JArrayType, JType>(                             \
+        env, thisClass, input, inputOffset, outputDiff, outputDiffOffset,     \
+        inputDiff, inputDiffOffset, classPtr);                                \
+  }
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  // Double: defines …PoolingInitDouble / ForwardDouble / BackwardDouble
+  PoolingInit(Double, jdouble, jdoubleArray);
+  PoolingForward(Double, jdouble, jdoubleArray);
+  PoolingBackward(Double, jdouble, jdoubleArray);
+
+  // Float: defines …PoolingInitFloat / ForwardFloat / BackwardFloat
+  PoolingInit(Float, jfloat, jfloatArray);
+  PoolingForward(Float, jfloat, jfloatArray);
+  PoolingBackward(Float, jfloat, jfloatArray);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/mkl/native/src/main/c/jni/relu.cpp b/mkl/native/src/main/c/jni/relu.cpp
new file mode 100644
index 00000000000..ad51a695b32
--- /dev/null
+++ b/mkl/native/src/main/c/jni/relu.cpp
@@ -0,0 +1,288 @@
+#include <jni.h>
+
+#include "debug.h"
+#include "layer.h"
+#include "memory.h"
+#include "utils.h"
+
+template <typename DType>
+class MKLReLU : public MKLLayer<DType>  // ReLU layer on MKL DNN primitives
+{
+ public:
+  MKLReLU();
+  ~MKLReLU();
+
+  void init(size_t inputNumber, size_t inputChannel, size_t inputHeight,
+            size_t inputWidth, int dimension);
+
+  void updateOutput(DType *input, DType *output);  // forward pass
+  void updateGradInput(DType *input, DType *gradOutput, DType *gradInput);
+
+ private:
+  // Creates the primitives and mkl layouts; not the same as createMklLayout.
+  void firstPass();
+  void preExecute(DType *input);  // refreshes the input conversion
+
+  size_t inputSize[4];     // [0] width, [1] height, [2] channel, [3] number
+  size_t inputStrides[4];  // dense strides derived from inputSize
+
+  size_t outputSize[4];     // set equal to inputSize by init()
+  size_t outputStrides[4];  // dense strides derived from outputSize
+
+  DType nagtiveSlope;  // slope passed to dnnReLUCreate*; set to 0 in the ctor
+};
+
+template <typename DType>
+MKLReLU<DType>::MKLReLU() : nagtiveSlope(static_cast<DType>(0.0))
+{
+  // The slope starts at 0; firstPass() hands it to dnnReLUCreateForward.
+}
+
+template <typename DType>
+MKLReLU<DType>::~MKLReLU()
+{  // empty body: this class adds only arrays and a scalar to its base class
+}
+
+template <typename DType>
+void MKLReLU<DType>::init(size_t inputNumber, size_t inputChannel,
+                          size_t inputHeight, size_t inputWidth, int dimension)
+{
+  this->dimension = dimension;
+
+  // The output is given exactly the input's shape.
+  // Index 0 is width, then height, channel and number.
+  const size_t shape[4] = {inputWidth, inputHeight, inputChannel, inputNumber};
+  for (int axis = 0; axis < 4; axis++) {
+    inputSize[axis]  = shape[axis];
+    outputSize[axis] = shape[axis];
+  }
+
+  // Dense strides: stride[0] is 1 and every further stride is the previous
+  // stride multiplied by the previous size.
+  inputStrides[0]  = 1;
+  outputStrides[0] = 1;
+  for (int axis = 1; axis < 4; axis++) {
+    inputStrides[axis]  = inputStrides[axis - 1] * inputSize[axis - 1];
+    outputStrides[axis] = outputStrides[axis - 1] * outputSize[axis - 1];
+  }
+
+  // create usr layout: the forward buffers first, then the gradients.
+  // The mkl layouts are created later, on the first forward pass.
+  //
+  this->input->createUsrLayout(dimension, inputSize, inputStrides);
+  this->output->createUsrLayout(dimension, outputSize, outputStrides);
+
+  this->gradInput->createUsrLayout(dimension, inputSize, inputStrides);
+  this->gradOutput->createUsrLayout(dimension, outputSize, outputStrides);
+}
+
+template <typename DType>
+void MKLReLU<DType>::firstPass()
+{
+  // One-time setup: builds both primitives and the four mkl layouts.
+  dnnError_t status  = E_UNIMPLEMENTED;
+  dnnLayout_t layout = NULL;
+
+  status =
+      dnnLayoutCreate<DType>(&layout, this->dimension, inputSize, inputStrides);
+  CHECK_EQ(status, E_SUCCESS);
+  // forward
+  status = dnnReLUCreateForward<DType>(&(this->forwardPrim), NULL, layout,
+                                       nagtiveSlope);
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->input->createMklLayout(this->forwardPrim, dnnResourceSrc);
+  this->output->createMklLayout(this->forwardPrim, dnnResourceDst);
+
+  // backward data: the input layout is the same as the input diff layout
+  status = dnnReLUCreateBackward<DType>(&(this->backwardPrim), NULL, layout,
+                                        layout, nagtiveSlope);
+  CHECK_EQ(status, E_SUCCESS);
+
+  this->gradOutput->createMklLayout(this->backwardPrim, dnnResourceDiffDst);
+  this->gradInput->createMklLayout(this->backwardPrim, dnnResourceDiffSrc);
+
+  // Both primitives exist now: free the temporary layout (it used to leak).
+  dnnLayoutDelete<DType>(layout);
+  this->isFirstPass = false;  // we create the layouts only the first time
+}
+
+template <typename DType>
+void MKLReLU<DType>::preExecute(DType *input)
+{
+  this->input->createConversion();  // NOTE: the `input` argument is unused
+}
+
+template <typename DType>
+void MKLReLU<DType>::updateOutput(DType *input, DType *output)
+{
+  // Forward pass; primitives and mkl layouts are built on the first call.
+  if (this->isFirstPass) {
+    firstPass();
+  }
+
+  // Conversions are created again on every forward/backward call.
+  // TODO Should we set the kernel and bias address every time?
+  preExecute(input);
+  this->output->createConversion();
+
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->input->getUsrData()),
+                   this->inputSize[3], this->inputSize[2], this->inputSize[1],
+                   this->inputSize[0], "Forward input");
+#endif
+
+  void *resources[dnnResourceNumber];
+  resources[dnnResourceSrc] = this->input->getConvertedData();
+  resources[dnnResourceDst] = this->output->getData();
+
+  PERFSTART();
+  dnnError_t status = dnnExecute<DType>(this->forwardPrim, resources);
+  PERFEND("main computing");
+  CHECK_EQ(status, E_SUCCESS);
+
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->output->getData()),
+                   outputSize[3], outputSize[2], outputSize[1], outputSize[0],
+                   "Forward output");
+#endif
+
+  // Convert the result back into the usr buffer unless useNext is set
+  // (presumably the next layer then reads the mkl buffer directly).
+  if (!this->output->isUseNext()) this->output->backToUsr();
+}
+
+template <typename DType>
+void MKLReLU<DType>::updateGradInput(DType *input, DType *gradOutput,
+                                     DType *gradInput)
+{
+  // Backward pass. It relies on the primitives built by the first forward
+  // call, and binds the forward input next to the two gradient buffers.
+  preExecute(input);
+  this->gradOutput->createConversion();
+  this->gradInput->createConversion();
+
+  void *resources[dnnResourceNumber];
+  resources[dnnResourceDiffDst] = this->gradOutput->getConvertedData();
+  resources[dnnResourceDiffSrc] = this->gradInput->getData();
+  resources[dnnResourceSrc]     = this->input->getConvertedData();
+
+  // main computing part
+  PERFSTART();
+  dnnError_t status = dnnExecute<DType>(this->backwardPrim, resources);
+  CHECK_EQ(status, E_SUCCESS);
+  PERFEND("main computing");
+
+  // Convert the gradient back into the usr buffer unless usePrev is set
+  // (presumably the previous layer then reads the mkl buffer directly).
+  if (!this->gradInput->isUsePrev()) this->gradInput->backToUsr();
+
+#ifdef DEBUG
+  printData<DType>(reinterpret_cast<DType *>(this->gradInput->getUsrData()),
+                   inputSize[3], inputSize[2], inputSize[1], inputSize[0],
+                   "backward gradient input");
+#endif
+}
+
+// Creates a MKLReLU<DType> layer, initialises it with the given input shape
+// and returns its address as an opaque handle for the Java side.
+//
+// The handle is produced as jlong (the JNI type of a Java long, always 64
+// bits) instead of long, which is narrower than a pointer on LLP64 platforms.
+// Nothing in this function frees the layer; the handle owner is responsible
+// for it. ArrayType is not used here.
+template <typename ArrayType, typename DType>
+jlong JNIReLUInit(JNIEnv *env, jclass thisClass, jint inputNumber,
+                  jint inputChannel, jint inputHeight, jint inputWidth,
+                  jint dimension)
+{
+  MKLReLU<DType> *layer = new MKLReLU<DType>();
+  layer->init(inputNumber, inputChannel, inputHeight, inputWidth, dimension);
+
+  return reinterpret_cast<jlong>(layer);
+}
+
+// JNI-side forward pass: wraps the Java input / output arrays in ZipArray
+// objects tied to the layer's input / output buffers and runs
+// MKLReLU::updateOutput on the pointers they expose.
+//
+// classPtr is the handle returned by JNIReLUInit. It is taken as jlong (the
+// JNI type of a Java long) rather than long, so the full 64-bit handle
+// survives on platforms where long is only 32 bits wide.
+template <typename ArrayType, typename DType>
+void JNIReLUUpdateOutput(JNIEnv *env, jclass thisClass, ArrayType input,
+                         jint inputOffset, ArrayType output, jint outputOffset,
+                         jlong classPtr)
+{
+  MKLReLU<DType> *ptr = reinterpret_cast<MKLReLU<DType> *>(classPtr);
+
+  // The wrappers live until the end of this function and are destroyed in
+  // reverse order of construction.
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInput(
+      new ZipArray<ArrayType, DType>(env, input, inputOffset, ptr->input));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jOutput(
+      new ZipArray<ArrayType, DType>(env, output, outputOffset, ptr->output));
+
+  ptr->updateOutput(jInput->getPtr(), jOutput->getPtr());
+}
+
+// JNI-side backward pass: wraps the Java input, output-gradient and
+// input-gradient arrays in ZipArray objects tied to the layer's input /
+// gradOutput / gradInput buffers and runs MKLReLU::updateGradInput on the
+// pointers they expose.
+//
+// classPtr is the handle returned by JNIReLUInit. It is taken as jlong (the
+// JNI type of a Java long) rather than long, so the full 64-bit handle
+// survives on platforms where long is only 32 bits wide.
+template <typename ArrayType, typename DType>
+void JNIReLUUpdateGradInput(JNIEnv *env, jclass thisClass, ArrayType input,
+                            jint inputOffset, ArrayType outputDiff,
+                            jint outputDiffOffset, ArrayType inputDiff,
+                            jint inputDiffOffset, jlong classPtr)
+{
+  MKLReLU<DType> *ptr = reinterpret_cast<MKLReLU<DType> *>(classPtr);
+
+  // The wrappers live until the end of this function and are destroyed in
+  // reverse order of construction.
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInput(
+      new ZipArray<ArrayType, DType>(env, input, inputOffset, ptr->input));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jOutputDiff(
+      new ZipArray<ArrayType, DType>(env, outputDiff, outputDiffOffset,
+                                     ptr->gradOutput));
+
+  std::shared_ptr<ZipArray<ArrayType, DType>> jInputDiff(
+      new ZipArray<ArrayType, DType>(env, inputDiff, inputDiffOffset,
+                                     ptr->gradInput));
+
+  ptr->updateGradInput(jInput->getPtr(), jOutputDiff->getPtr(),
+                       jInputDiff->getPtr());
+}
+
+// Macros that stamp out the exported JNI entry points of the ReLU layer.
+//
+// ReLUInit(Suffix, ElemT, ArrayT) defines
+// Java_com_intel_analytics_sparkdl_mkl_MKL_ReLUInit<Suffix>, which forwards
+// all of its arguments to JNIReLUInit<ArrayT, ElemT> and returns its result.
+#define ReLUInit(Suffix, ElemT, ArrayT)                                    \
+  JNIEXPORT                                                                \
+  jlong JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_ReLUInit##Suffix( \
+      JNIEnv *env, jclass thisClass, jint inputNumber, jint inputChannel,  \
+      jint inputHeight, jint inputWidth, jint dimension)                   \
+  {                                                                        \
+    return JNIReLUInit<ArrayT, ElemT>(env, thisClass, inputNumber,         \
+                                      inputChannel, inputHeight,           \
+                                      inputWidth, dimension);              \
+  }
+
+// ReLUForward(DType, JType, JArrayType) defines
+// Java_com_intel_analytics_sparkdl_mkl_MKL_ReLUForward<DType>, which forwards
+// all of its arguments to JNIReLUUpdateOutput<JArrayType, JType>.
+// classPtr is declared jlong because that is the JNI type a Java long is
+// passed as; plain long is only 32 bits wide on some platforms.
+#define ReLUForward(DType, JType, JArrayType)                                  \
+  JNIEXPORT                                                                    \
+  void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_ReLUForward##DType(    \
+      JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,       \
+      JArrayType output, jint outputOffset, jlong classPtr)                    \
+  {                                                                            \
+    JNIReLUUpdateOutput<JArrayType, JType>(env, thisClass, input, inputOffset, \
+                                           output, outputOffset, classPtr);    \
+  }
+
+// ReLUBackward(DType, JType, JArrayType) defines
+// Java_com_intel_analytics_sparkdl_mkl_MKL_ReLUBackward<DType>, which
+// forwards all of its arguments to JNIReLUUpdateGradInput<JArrayType, JType>.
+// classPtr is declared jlong because that is the JNI type a Java long is
+// passed as; plain long is only 32 bits wide on some platforms.
+#define ReLUBackward(DType, JType, JArrayType)                               \
+  JNIEXPORT                                                                  \
+  void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_ReLUBackward##DType( \
+      JNIEnv *env, jclass thisClass, JArrayType input, jint inputOffset,     \
+      JArrayType outputDiff, jint outputDiffOffset, JArrayType inputDiff,    \
+      jint inputDiffOffset, jlong classPtr)                                  \
+  {                                                                          \
+    JNIReLUUpdateGradInput<JArrayType, JType>(                               \
+        env, thisClass, input, inputOffset, outputDiff, outputDiffOffset,    \
+        inputDiff, inputDiffOffset, classPtr);                               \
+  }
+
+// Emit the exported JNI entry points for both element types. extern "C" gives
+// them C linkage, so the Java_... names produced by the macros are not
+// mangled.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// double precision: jdouble elements, jdoubleArray arrays
+ReLUInit(Double, jdouble, jdoubleArray);
+ReLUForward(Double, jdouble, jdoubleArray);
+ReLUBackward(Double, jdouble, jdoubleArray);
+
+// single precision: jfloat elements, jfloatArray arrays
+ReLUInit(Float, jfloat, jfloatArray);
+ReLUForward(Float, jfloat, jfloatArray);
+ReLUBackward(Float, jfloat, jfloatArray);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/mkl/native/src/main/c/jni/sum.cpp b/mkl/native/src/main/c/jni/sum.cpp
new file mode 100644
index 00000000000..037e6fcd606
--- /dev/null
+++ b/mkl/native/src/main/c/jni/sum.cpp
@@ -0,0 +1,221 @@
+#include <stdio.h>
+#include <vector>
+
+#include "debug.h"
+#include "layer.h"
+#include "memory.h"
+#include "utils.h"
+
+using namespace std;
+
+// Layer wrapping the MKL DNN sum primitive: firstPass() builds it with
+// dnnSumCreate over `numSums` inputs and updateOutput() executes it.
+template <typename DType>
+class MKLSum : public MKLLayer<DType>
+{
+ public:
+  MKLSum();
+  ~MKLSum();
+
+  // Creates the user layouts. `size` is read as numSums consecutive groups of
+  // `dimension` extents, one group per input.
+  void init(int numSums, int dimension, int *size);
+
+  // Forward pass; `input` holds one user buffer pointer per summed input.
+  void updateOutput(DType **input, DType *output);
+  // NOTE(review): declared, but no definition appears in this file.
+  void updateGradInput(DType **gradInput, DType *gradOutput);
+
+  // Attention: one MKLData per summed input. Presumably this hides the
+  // single `input` member inherited from MKLLayer - confirm against layer.h.
+  vector<shared_ptr<MKLData<DType>>> input;
+
+ private:
+  // Lazily creates the sum primitive and the MKL layouts.
+  void firstPass();
+  // NOTE(review): declared, but no definition appears in this file.
+  void preExecute(DType *input);
+
+  int numSums;  // number of inputs that are summed
+  DType *coefficients;  // one weight per input, allocated and set to 1 in init()
+};
+
+// Creates an empty layer; init() has to be called before it is used.
+// coefficients starts out as NULL so that the destructor is well defined even
+// when init() was never called.
+template <typename DType>
+MKLSum<DType>::MKLSum() : numSums(0), coefficients(NULL)
+{
+}
+
+// Releases the coefficient array allocated by init().
+// NOTE(review): the class owns a raw array but still has the implicit copy
+// operations; copying an MKLSum would lead to a double delete.
+template <typename DType>
+MKLSum<DType>::~MKLSum()
+{
+  delete[] coefficients;  // delete[] on NULL is a no-op
+  coefficients = NULL;
+}
+
+// Records the layer configuration and creates the user layouts.
+//
+// numSums:   number of inputs that are summed.
+// dimension: number of extents per input.
+// size:      numSums * dimension extents; input i uses the entries
+//            size[i * dimension] .. size[(i + 1) * dimension - 1].
+//
+// For every input a new MKLData is appended to `input`, its user layout is
+// created with strides derived from the extents (stride[0] = 1,
+// stride[j] = stride[j - 1] * size[j - 1]) and its coefficient is set to 1.
+// The output layout is created from the extents of the last input.
+//
+// NOTE(review): init() appends to `input` and re-allocates `coefficients`
+// without clearing the previous state, so it is meant to be called once.
+template <typename DType>
+void MKLSum<DType>::init(int numSums, int dimension, int *size)
+{
+  this->numSums      = numSums;
+  this->dimension    = dimension;
+  this->coefficients = new DType[numSums];
+
+  // std::vector instead of variable-length arrays: VLAs are only a compiler
+  // extension in C++, and the vectors are zero-initialised, so the output
+  // layout below never reads indeterminate values (e.g. when numSums is 0).
+  const size_t dims = dimension > 0 ? static_cast<size_t>(dimension) : 0;
+  std::vector<size_t> inputSize(dims);
+  std::vector<size_t> inputStrides(dims);
+
+  for (int i = 0; i < numSums; i++) {
+    input.push_back(shared_ptr<MKLData<DType>>(new MKLData<DType>));
+
+    // set the size.
+    // the size of every input has to be given in `size`.
+    // the dimension of every input should be the same.
+    const int *extent = size + static_cast<size_t>(i) * dims;
+    for (size_t j = 0; j < dims; j++) {
+      inputSize[j]    = extent[j];
+      inputStrides[j] = (j == 0) ? 1 : inputStrides[j - 1] * inputSize[j - 1];
+    }
+
+    this->input[i]->createUsrLayout(dimension, inputSize.data(),
+                                    inputStrides.data());
+    this->coefficients[i] = 1;
+  }
+
+  // TODO check size of all input, they should be the same
+
+  // The output has the extents (and therefore the strides) of the last
+  // input; separate buffers are kept for the output layout.
+  std::vector<size_t> outputSize(inputSize);
+  std::vector<size_t> outputStrides(inputStrides);
+
+  this->output->createUsrLayout(dimension, outputSize.data(),
+                                outputStrides.data());
+}
+
+// One-time setup, triggered by the first updateOutput() call: creates the sum
+// primitive from the MKL layout of the first input and the coefficient array,
+// then derives the MKL layouts of the output and of every input from that
+// primitive.
+template <typename DType>
+void MKLSum<DType>::firstPass()
+{
+  // Only the first input's layout is passed to dnnSumCreate.
+  dnnLayout_t srcLayout = this->input[0]->getMklLayout();
+
+  const dnnError_t created = dnnSumCreate<DType>(
+      &(this->forwardPrim), NULL, numSums, srcLayout, this->coefficients);
+  CHECK_EQ(created, E_SUCCESS);
+
+  this->output->createMklLayout(this->forwardPrim, dnnResourceDst);
+
+  // Input idx is bound to the resource slot dnnResourceMultipleSrc + idx.
+  for (int idx = 0; idx != numSums; ++idx) {
+    const dnnResourceType_t slot =
+        static_cast<dnnResourceType_t>(dnnResourceMultipleSrc + idx);
+    this->input[idx]->createMklLayout(this->forwardPrim, slot);
+  }
+
+  // the layouts are only created on the first pass
+  this->isFirstPass = false;
+}
+
+// Forward pass: sums the numSums input buffers into the output buffer.
+//
+// input:  array with (at least) numSums user buffer pointers, one per input.
+// output: user buffer that receives the result.
+//
+// NOTE(review): `resources` has dnnResourceNumber slots and the inputs are
+// stored from index dnnResourceMultipleSrc upwards, so numSums has to stay
+// below dnnResourceNumber - dnnResourceMultipleSrc; nothing enforces that.
+template <typename DType>
+void MKLSum<DType>::updateOutput(DType **input, DType *output)
+{
+  // The primitive and the MKL layouts are built once, on the first call.
+  if (this->isFirstPass) firstPass();
+
+  // Bind the user buffers of this call and (re)create the conversions.
+  for (int i = 0; i < numSums; i++) {
+    this->input[i]->setUsrData(input[i]);
+    this->input[i]->createConversion();
+  }
+  this->output->setUsrData(output);
+  this->output->createConversion();
+
+  dnnError_t status;
+  void *resources[dnnResourceNumber];
+
+  for (int i = 0; i < numSums; i++) {
+    resources[dnnResourceMultipleSrc + i] = this->input[i]->getConvertedData();
+  }
+  resources[dnnResourceDst] = this->output->getData();
+
+  PERFSTART();
+  status = dnnExecute<DType>(this->forwardPrim, resources);
+  PERFEND("main computing");
+  // Do not silently continue with an invalid result when execution failed.
+  CHECK_EQ(status, E_SUCCESS);
+
+  if (!this->output->isUseNext()) this->output->backToUsr();
+}
+
+// Creates a MKLSum<DType> layer, initialises it from the Java `size` array
+// and returns its address as an opaque handle for the Java side.
+//
+// size has to hold numSums * dimension extents (see MKLSum::init). If it is
+// null or too short, or numSums / dimension is not positive, an
+// IllegalArgumentException is raised and 0 is returned instead of reading
+// past the end of the array. 0 is also returned when the JVM cannot provide
+// the array contents.
+//
+// The handle is produced as jlong (the JNI type of a Java long, always 64
+// bits) instead of long, which is narrower than a pointer on LLP64 platforms.
+// ArrayType is not used here.
+template <typename ArrayType, typename DType>
+jlong JNISumInit(JNIEnv *env, jclass thisClass, int numSums, int dimension,
+                 jintArray size)
+{
+  // Validation uses ordinary JNI calls, so it is done before the critical
+  // region is entered.
+  const jlong needed = static_cast<jlong>(numSums) * dimension;
+  if (size == NULL || numSums <= 0 || dimension <= 0 ||
+      env->GetArrayLength(size) < needed) {
+    jclass badArgument = env->FindClass("java/lang/IllegalArgumentException");
+    if (badArgument != NULL) {
+      env->ThrowNew(badArgument,
+                    "SumInit: size must hold numSums * dimension entries");
+    }
+    return 0;
+  }
+
+  jint *jSize =
+      reinterpret_cast<jint *>(env->GetPrimitiveArrayCritical(size, 0));
+  if (jSize == NULL) return 0;
+
+  MKLSum<DType> *ptr = new MKLSum<DType>();
+  ptr->init(numSums, dimension, jSize);
+  // init() only reads the extents, so nothing has to be copied back.
+  env->ReleasePrimitiveArrayCritical(size, jSize, JNI_ABORT);
+
+  return reinterpret_cast<jlong>(ptr);
+}
+
+// JNI-side forward pass of the sum layer.
+//
+// input:       Java array of primitive arrays, one per summed input.
+// inputOffset: element offset into each of those arrays.
+// output:      Java array receiving the result, starting at outputOffset.
+// classPtr:    handle returned by JNISumInit; taken as jlong (the JNI type of
+//              a Java long) so the full 64-bit handle survives on platforms
+//              where long is only 32 bits wide.
+//
+// If the JVM cannot provide one of the arrays the function returns without
+// computing anything, after releasing what it had already acquired.
+// NOTE(review): MKLSum::updateOutput reads numSums input pointers; `input`
+// and `inputOffset` are expected to hold at least that many entries.
+template <typename ArrayType, typename DType>
+void JNISumUpdateOutput(JNIEnv *env, jclass thisClass, jobjectArray input,
+                        jintArray inputOffset, ArrayType output,
+                        jint outputOffset, jlong classPtr)
+{
+  MKLSum<DType> *ptr = reinterpret_cast<MKLSum<DType> *>(classPtr);
+
+  // Everything that needs an ordinary JNI call is done first: the JNI
+  // specification forbids other JNI calls between GetPrimitiveArrayCritical
+  // and the matching release. std::vector replaces the former
+  // variable-length arrays, which are not standard C++.
+  const int len = env->GetArrayLength(input);
+  std::vector<ArrayType> jInputArr(len);
+  for (int i = 0; i < len; i++) {
+    jInputArr[i] = (ArrayType)(env->GetObjectArrayElement(input, i));
+  }
+  std::vector<DType *> inputArrStart(len);
+  std::vector<DType *> inputArr(len);
+
+  jint *jInputOffset =
+      reinterpret_cast<jint *>(env->GetPrimitiveArrayCritical(inputOffset, 0));
+
+  // Acquire the input arrays one by one and stop at the first failure, so
+  // that exactly `acquired` arrays have to be released afterwards.
+  int acquired = 0;
+  bool ok      = (jInputOffset != NULL);
+  while (ok && acquired < len) {
+    DType *start = reinterpret_cast<DType *>(
+        env->GetPrimitiveArrayCritical(jInputArr[acquired], 0));
+    if (start == NULL) {
+      ok = false;
+    } else {
+      inputArrStart[acquired] = start;
+      inputArr[acquired]      = start + jInputOffset[acquired];
+      acquired++;
+    }
+  }
+
+  if (ok) {
+    std::shared_ptr<ZipArray<ArrayType, DType>> jOutput(
+        new ZipArray<ArrayType, DType>(env, output, outputOffset, ptr->output));
+
+    ptr->updateOutput(inputArr.data(), jOutput->getPtr());
+  }
+
+  for (int i = 0; i < acquired; i++) {
+    env->ReleasePrimitiveArrayCritical(jInputArr[i], inputArrStart[i], 0);
+  }
+
+  if (jInputOffset != NULL) {
+    // The offsets are only read, so nothing has to be copied back.
+    env->ReleasePrimitiveArrayCritical(inputOffset, jInputOffset, JNI_ABORT);
+  }
+}
+
+// Macros that stamp out the exported JNI entry points of the sum layer.
+//
+// SumInit(Suffix, ElemT, ArrayT) defines
+// Java_com_intel_analytics_sparkdl_mkl_MKL_SumInit<Suffix>, which forwards
+// all of its arguments to JNISumInit<ArrayT, ElemT> and returns its result.
+#define SumInit(Suffix, ElemT, ArrayT)                                    \
+  JNIEXPORT                                                               \
+  jlong JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_SumInit##Suffix( \
+      JNIEnv *env, jclass thisClass, jint numSums, jint dimension,        \
+      jintArray size)                                                     \
+  {                                                                       \
+    return JNISumInit<ArrayT, ElemT>(env, thisClass, numSums, dimension,  \
+                                     size);                               \
+  }
+
+// SumForward(DType, JType, JArrayType) defines
+// Java_com_intel_analytics_sparkdl_mkl_MKL_SumForward<DType>, which forwards
+// all of its arguments to JNISumUpdateOutput<JArrayType, JType>.
+// classPtr is declared jlong because that is the JNI type a Java long is
+// passed as; plain long is only 32 bits wide on some platforms.
+#define SumForward(DType, JType, JArrayType)                                  \
+  JNIEXPORT                                                                   \
+  void JNICALL Java_com_intel_analytics_sparkdl_mkl_MKL_SumForward##DType(    \
+      JNIEnv *env, jclass thisClass, jobjectArray input,                      \
+      jintArray inputOffset, JArrayType output, jint outputOffset,            \
+      jlong classPtr)                                                         \
+  {                                                                           \
+    JNISumUpdateOutput<JArrayType, JType>(env, thisClass, input, inputOffset, \
+                                          output, outputOffset, classPtr);    \
+  }
+
+// Emit the exported JNI entry points for both element types. extern "C" gives
+// them C linkage, so the Java_... names produced by the macros are not
+// mangled.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// double precision: jdouble elements, jdoubleArray arrays
+SumInit(Double, jdouble, jdoubleArray);
+SumForward(Double, jdouble, jdoubleArray);
+
+// single precision: jfloat elements, jfloatArray arrays
+SumInit(Float, jfloat, jfloatArray);
+SumForward(Float, jfloat, jfloatArray);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/mkl/native/src/main/c/jni/utils.cpp b/mkl/native/src/main/c/jni/utils.cpp
new file mode 100644
index 00000000000..3e1a8381c2d
--- /dev/null
+++ b/mkl/native/src/main/c/jni/utils.cpp
@@ -0,0 +1,45 @@
+#include "utils.h"
+#include <cstdio>
+#include <cmath>
+#include <iostream>
+
+// Disabled earlier version of computeOut without the ceilMode parameter. The
+// #if 0 guard removes it from compilation; the active definition follows
+// below. It uses plain integer division for the quotient.
+#if 0
+int computeOut(int input, int pad, int kernel, int stride)
+{
+  // if (((input + 2 * pad - kernel) % stride) != 0)
+  //   printf("%d %d %d %d\n", input, pad, kernel, stride);
+  // TODO Should we substitute with ceil or floor when compute the output?
+  //std::cout << static_cast<int>(ceil(static_cast<float>((input + 2 * pad - kernel) / stride) + 1)) << std::endl;
+  //std::cout << ((input + 2 * pad - kernel) / stride) + 1 << std::endl;
+  //return static_cast<int>(floor(static_cast<float>((input + 2 * pad - kernel) / stride) + 1));
+  // return static_cast<int>(
+  //    static_cast<float>((input + 2 * pad - kernel) / stride) + 1);
+  //return ((input + 2 * pad - kernel) / stride) + 1;
+  int tmp = ((input + 2 * pad - kernel) / stride) + 1;
+  //if (((input + 2 * pad - kernel) % stride) != 0)
+  //  tmp += 1;
+  return tmp;
+}
+#endif
+
+// Returns round((input + 2 * pad - kernel) / stride) + 1, where the quotient
+// is evaluated in single-precision floating point and `round` is ceil when
+// ceilMode is true and floor otherwise.
+int computeOut(int input, int pad, int kernel, int stride, bool ceilMode)
+{
+  const float steps = static_cast<float>(input + 2 * pad - kernel) / stride;
+
+  // Only the rounding direction differs between the two modes.
+  return static_cast<int>(ceilMode ? ceil(steps) : floor(steps)) + 1;
+}
+
+// Small manual check: prints computeOut for two sample configurations, first
+// with ceilMode on and then off, without any separator between the numbers.
+// NOTE(review): this defines main() inside a source file of the JNI library;
+// confirm that it is meant to be part of the build.
+int main()
+{
+  // each row: input, pad, kernel, stride
+  const int samples[2][4] = {{4, 0, 3, 2}, {3, 1, 2, 1}};
+
+  for (int row = 0; row < 2; ++row) {
+    const int *s = samples[row];
+    std::cout << computeOut(s[0], s[1], s[2], s[3], true);
+    std::cout << computeOut(s[0], s[1], s[2], s[3], false);
+  }
+
+  return 0;
+}
diff --git a/mkl/native/src/main/c/jni/utils.h b/mkl/native/src/main/c/jni/utils.h
new file mode 100644
index 00000000000..117bfef15f2
--- /dev/null
+++ b/mkl/native/src/main/c/jni/utils.h
@@ -0,0 +1,7 @@
+#ifndef _UTILS_H_
+#define _UTILS_H_
+
+// Returns round((input + 2 * pad - kernel) / stride) + 1, where `round` is
+// ceil when ceilMode is true and floor otherwise (the default).
+// Presumably the output extent of a sliding-window operation along one
+// dimension - confirm against the callers.
+int computeOut(int input, int pad, int kernel, int stride,
+               bool ceilMode = false);
+
+#endif