Feat: MKL-DNN Supports (#2482)

This feature enables mkl-dnn support, which can speed up deep learning models. We wrap the native C API in Java; the wrappers live in the BigDL-core project. In BigDL, we integrated convolution, batchnorm, maxpooling, avgpooling, relu, lrn, softmax, caddtable and concattable. Currently, it supports creating a model which contains only dnn layers or containers, because the data layout is optimized in mkl-dnn. The mkl-dnn model will use `DnnTensor`, which contains the native buffer, as its default tensor. So there are some notes: 1. The user should copy the data from the JVM heap at the first layer and copy it back to the JVM heap at the last layer. 2. The user should compile the model, specifying the phase (training/inference) and the input tensor size. It will infer and allocate the other information. * fix: linear performance issue and serialization of java object in MklDnnTensor * memory leak refactor * memory leak and bn performance issues 1. Memory Leak The internal buffer of MklDnnTensor should not be re-assigned without being released, so we should check it first. At the first iteration or after the input size changes, we create a new MklDnnTensor as a buffer. 2. Bn perf The JIT BatchNormalization only supports avx2 or avx512, which has much better performance than the ref version. The input and gradOutput formats should be the same to get the best performance. * test: add some test cases for BatchNorm. The computation of float values is not the same in C/C++/native code as in the JVM. And batch norm will make the difference much greater, such as 10^-8 -> 10^-4 -> 10^-1 * fix: rebase with upstream master: 1. Concat and ConcatTable should inherit from DynamicContainer. 2. updateParameters has been deprecated. 3. zeroGradParameters should be final. But from now on, the Linear should use it. 4. Some other syntax or semantic errors. 
* perf: single node and single model performance * perf: single model * feat: add fusion for mkl-dnn * test: add test utils to compare dnn output * test: add some tests compared with caffe * add unit tests for dnn tensor * add unit test for reorder memory * test: fix the test regression errors * checkin reorder manager * add backward for sequential * fix some bugs * update core ref * add unit tests * refactor: move the static class DataType, AlgKind and so on to standalone class (#4) * refactor: delete MklDnn.MemoryFormat * refactor: move the static class DataType, AlgKind and so on to standalone class * fix: core refactor errors * refactor: spec errors (#5) * Mkl dnn dev (#6) * checkin reorder manager * add container and refine reorder manager * fix merge issue * add join table forward * refine interface (#7) * add LRN and ReLU * add pooling * refactor: conv + linear + bn * add JoinTable backward * refactor: conv + linear + bn * add cAddTable concattable * fix: reorder failed on some of convs * refactor: softmax * refactor: fusion support * refactor: resnet_50 * refactor: move tests to this branch * refactor: delete useless files and enable the special old tests. refactor: delete unused methods in MklDnnOps fix: scalastyle check * fix: rebase with upstream * fix: ignore the prototxt tests * fix: do not change the core commit ref * fix: move set num of threads for mkldnn to ResNet50Perf * fix: serialization disabled for mkldnn module
intel-analytics · Jun 28, 2018 · 5f18519 · 5f18519
1 parent fe84fd2
commit 5f18519
Show file tree

Hide file tree

Showing 54 changed files with 8,671 additions and 106 deletions.
diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/Utils.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/Utils.scala
@@ -392,6 +392,54 @@ object Utils {
     Array(padH, padH, padW, padW, oheight, owidth)
   }
 
+  /**
+   * Computes the (possibly asymmetric) paddings and the output height/width of a 2D
+   * convolution/pooling window for the DNN layers.
+   *
+   * The output size is always rounded up (`math.ceil`). `ceilMode`, `inputdepth`, `dt`, `kt`
+   * and `dilationDepth` are accepted but not consulted by this method; `padt` only takes part
+   * in the decision whether the last output row/column is trimmed.
+   *
+   * @param inputHeight input height
+   * @param inputWidth input width
+   * @param dH stride along the height
+   * @param dW stride along the width
+   * @param kH kernel height
+   * @param kW kernel width
+   * @param padH requested padding along the height (used as the top padding)
+   * @param padW requested padding along the width (used as the left padding)
+   * @param ceilMode not consulted, see above
+   * @param dilationHeight dilation along the height, default 1
+   * @param dilationWidth dilation along the width, default 1
+   * @return Array(padTop, padBottom, padLeft, padRight, outputHeight, outputWidth)
+   */
+  private[nn] def getOutSizeAndPaddingForDNN(
+    inputHeight: Int,
+    inputWidth: Int,
+    dH: Int,
+    dW: Int,
+    kH: Int,
+    kW: Int,
+    padH: Int,
+    padW: Int,
+    ceilMode: Boolean,
+    dilationHeight: Int = 1,
+    dilationWidth: Int = 1,
+    inputdepth: Int = -1,
+    dt: Int = -1,
+    kt: Int = -1,
+    padt: Int = 0,
+    dilationDepth: Int = 1): Array[Int] = {
+    // effective extent of the kernel once dilation is taken into account
+    val dilationKernelHeight = dilationHeight * (kH - 1) + 1
+    val dilationKernelWidth = dilationWidth * (kW - 1) + 1
+
+    val heightSpan = inputHeight - dilationKernelHeight + 2 * padH
+    val widthSpan = inputWidth - dilationKernelWidth + 2 * padW
+    var oheight = math.ceil(1.0 * heightSpan / dH).toInt + 1
+    var owidth = math.ceil(1.0 * widthSpan / dW).toInt + 1
+
+    // drop the last window if it would start entirely inside the bottom/right padding
+    if (padH != 0 || padW != 0 || padt != 0 || kH == 1 || kW == 1) {
+      if ((oheight - 1) * dH >= inputHeight + padH) oheight -= 1
+      if ((owidth - 1) * dW >= inputWidth + padW) owidth -= 1
+    }
+
+    // Grow the bottom/right padding until the last window fits. The window extent must be
+    // the dilated kernel size, the same one used for the output size above; using the raw
+    // kernel size under-pads whenever dilation > 1.
+    val padBottom =
+      math.max(padH, dH * (oheight - 1) + dilationKernelHeight - (inputHeight + padH))
+    val padRight =
+      math.max(padW, dW * (owidth - 1) + dilationKernelWidth - (inputWidth + padW))
+
+    Array(padH, padBottom, padW, padRight, oheight, owidth)
+  }
+
   private[nn] def getOutputShape(outputHeight: Int, outputWidth: Int, nOutputPlane: Int,
     batchSize: Int = -1, format: DataFormat): Array[Int] = {
     format match {
@@ -472,6 +520,41 @@ object Utils {
     out
   }
 
+  /**
+   * Derives the four paddings and the output height/width of a 2D pooling/convolution window.
+   * The output size is rounded up; the bottom/right paddings are enlarged, if necessary, so
+   * that the last window lies completely inside the padded input.
+   *
+   * @param inputHeight input height
+   * @param inputWidth input width
+   * @param dH stride along the height
+   * @param dW stride along the width
+   * @param kH kernel height
+   * @param kW kernel width
+   * @param padH requested padding along the height (used as the top padding)
+   * @param padW requested padding along the width (used as the left padding)
+   * @return (padTop, padBottom, padLeft, padRight, outputHeight, outputWidth)
+   */
+  private[nn] def getPaddingAndOutputSize(
+    inputHeight: Int,
+    inputWidth: Int,
+    dH: Int,
+    dW: Int,
+    kH: Int,
+    kW: Int,
+    padH: Int,
+    padW: Int
+  ): (Int, Int, Int, Int, Int, Int) = {
+    // output extent before the "last window starts in the padding" correction
+    val ceilHeight = math.ceil(1.0 * (inputHeight - kH + 2 * padH) / dH).toInt + 1
+    val ceilWidth = math.ceil(1.0 * (inputWidth - kW + 2 * padW) / dW).toInt + 1
+
+    val trimLast = padH != 0 || padW != 0 || kH == 1 || kW == 1
+    val outHeight =
+      if (trimLast && (ceilHeight - 1) * dH >= inputHeight + padH) ceilHeight - 1
+      else ceilHeight
+    val outWidth =
+      if (trimLast && (ceilWidth - 1) * dW >= inputWidth + padW) ceilWidth - 1
+      else ceilWidth
+
+    // smallest bottom/right padding, not below the requested one, that fits the last window
+    val padBottom = math.max(padH, dH * (outHeight - 1) + kH - (inputHeight + padH))
+    val padRight = math.max(padW, dW * (outWidth - 1) + kW - (inputWidth + padW))
+
+    (padH, padBottom, padW, padRight, outHeight, outWidth)
+  }
   /**
    * Calculate forward time and backward time.
    * @param times

diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/abstractnn/AbstractModule.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/abstractnn/AbstractModule.scala
@@ -320,7 +320,7 @@ abstract class AbstractModule[A <: Activity: ClassTag, B <: Activity: ClassTag,
    * If the module has parameters, this will zero the accumulation of the gradients with respect
    * to these parameters. Otherwise, it does nothing.
    */
-  final def zeroGradParameters(): Unit = {
+  def zeroGradParameters(): Unit = {
     if (parameters() != null) {
       parameters()._1.zip(parameters()._2)foreach{ case (weight, grad) =>
         grad.resizeAs(weight).zero()

diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/mkldnn/AvgPooling.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/mkldnn/AvgPooling.scala
@@ -0,0 +1,93 @@
+/*
+ * Copyright 2016 The BigDL Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.intel.analytics.bigdl.nn.mkldnn
+
+import com.intel.analytics.bigdl.mkl._
+import com.intel.analytics.bigdl.nn.Utils
+import com.intel.analytics.bigdl.nn.abstractnn.Activity
+import com.intel.analytics.bigdl.tensor.Tensor
+
+/**
+ * Average pooling layer backed by the MKL-DNN pooling primitive
+ * (`AlgKind.PoolingAvgExcludePadding`).
+ *
+ * @param kW kernel width
+ * @param kH kernel height
+ * @param dW stride along the width, default 1
+ * @param dH stride along the height, default 1
+ * @param padW padding along the width, default 0
+ * @param padH padding along the height, default 0
+ */
+class AvgPooling(
+  kW: Int,
+  kH: Int,
+  dW: Int = 1,
+  dH: Int = 1,
+  padW: Int = 0,
+  padH: Int = 0
+) extends MklDnnLayer {
+  // (top, left) and (bottom, right) paddings; filled in by initFwdPrimitives and reused by
+  // initBwdPrimitives.
+  @transient
+  private var paddingTL: Array[Int] = _
+  @transient
+  private var paddingBR: Array[Int] = _
+  // Forward primitive descriptor; reused when the backward primitive descriptor is created.
+  @transient
+  private var fwdPD: Long = _
+
+  /**
+   * Creates the forward pooling primitive and allocates the output tensor.
+   * The input shape is read as (n, c, h, w). `phase` is not consulted; the descriptor is
+   * always created with `PropKind.Forward`.
+   *
+   * @return the (input formats, output formats) pair of this layer
+   */
+  override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = {
+    _inputFormats = singleNativeData(inputs)
+    // Spatial arrays are all in (height, width) order, matching kernel and paddings below and
+    // the output size computed by Utils, which pairs dH with the height.
+    val strides = Array(dH, dW)
+    val kernel = Array(kH, kW)
+    val n = _inputFormats(0).shape(0)
+    val c = _inputFormats(0).shape(1)
+    val h = _inputFormats(0).shape(2)
+    val w = _inputFormats(0).shape(3)
+    val (pt, pb, pl, pr, oh, ow) =
+      Utils.getPaddingAndOutputSize(h, w, dH, dW, kH, kW, padH, padW)
+    paddingTL = Array(pt, pl)
+    paddingBR = Array(pb, pr)
+    val outputMD = MklDnn.MemoryDescInit(4, Array(n, c, oh, ow), DataType.F32, Memory.Format.any)
+    val description = MklDnn.PoolingForwardDescInit(
+      PropKind.Forward, AlgKind.PoolingAvgExcludePadding,
+      _inputFormats(0).getMemoryDescription(), outputMD, strides, kernel, paddingTL, paddingBR,
+      MklDnn.PaddingKind.mkldnnPaddingZero)
+    fwdPD = MklDnn.PrimitiveDescCreate(description, runtime.engine, 0L)
+    _outputFormats = Array(MemoryData.primitiveOutput(fwdPD))
+    output = initTensor(_outputFormats(0))
+    // NOTE(review): the output count passed here is 2 although _outputFormats holds a single
+    // entry -- confirm against the PrimitiveCreate2 native contract.
+    updateOutputPrimitives = Array(MklDnn.PrimitiveCreate2(fwdPD,
+      _inputFormats.map(_.getPrimitive(runtime)), Array(0), 1,
+      _outputFormats.map(_.getPrimitive(runtime)), 2))
+    (_inputFormats, _outputFormats)
+  }
+
+  /**
+   * Creates the backward pooling primitive and allocates the gradInput tensor.
+   * Relies on `paddingTL`, `paddingBR` and `fwdPD` set by initFwdPrimitives.
+   *
+   * @return the (gradOutput formats, gradInput formats) pair of this layer
+   */
+  override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = {
+    _gradOutputFormats = singleNativeData(grad)
+    _gradOutputFormatsForWeight = _gradOutputFormats
+    // same (height, width) ordering as in the forward descriptor
+    val strides = Array(dH, dW)
+    val kernel = Array(kH, kW)
+    val description = MklDnn.PoolingBackwardDescInit(AlgKind.PoolingAvgExcludePadding,
+      _inputFormats(0).getMemoryDescription(),
+      _gradOutputFormats(0).getMemoryDescription(),
+      strides, kernel, paddingTL, paddingBR, MklDnn.PaddingKind.mkldnnPaddingZero)
+
+    val pd = MklDnn.PrimitiveDescCreate(description, runtime.engine, fwdPD)
+    _gradInputFormats = Array(MemoryData.primitiveGradInput(pd))
+    // NOTE(review): an input count of 2 with indexes Array(0, 0) is passed although
+    // _gradOutputFormats holds a single entry -- confirm against the native contract.
+    updateGradInputPrimitives = Array(MklDnn.PrimitiveCreate2(pd,
+      _gradOutputFormats.map(_.getPrimitive(runtime)),
+      Array(0, 0), 2, _gradInputFormats.map(_.getPrimitive(runtime)), 1))
+    gradInput = initTensor(_gradInputFormats(0))
+    (_gradOutputFormats, _gradInputFormats)
+  }
+}
+
+/** Factory for [[AvgPooling]]. */
+object AvgPooling {
+  /**
+   * Builds an [[AvgPooling]] layer; all arguments are forwarded unchanged to the constructor.
+   */
+  def apply(
+    kW: Int,
+    kH: Int,
+    dW: Int = 1,
+    dH: Int = 1,
+    padW: Int = 0,
+    padH: Int = 0
+  ): AvgPooling = {
+    new AvgPooling(kW = kW, kH = kH, dW = dW, dH = dH, padW = padW, padH = padH)
+  }
+}
diff --git a/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/mkldnn/CAddTable.scala b/spark/dl/src/main/scala/com/intel/analytics/bigdl/nn/mkldnn/CAddTable.scala
@@ -0,0 +1,67 @@
+/*
+ * Copyright 2016 The BigDL Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.intel.analytics.bigdl.nn.mkldnn
+
+import com.intel.analytics.bigdl.mkl.{DataType, Memory, MklDnn}
+import com.intel.analytics.bigdl.nn.abstractnn.Activity
+import com.intel.analytics.bigdl.utils.T
+
+/**
+ * Sums several inputs of identical shape with the MKL-DNN sum primitive (unit scale for
+ * every input). In the backward pass each input slot receives the incoming gradOutput as is.
+ */
+class CAddTable extends MklDnnLayer {
+  /**
+   * Checks that all inputs share the shape of the first one, then creates the sum primitive
+   * and the output tensor. `phase` is not consulted.
+   */
+  override private[mkldnn] def initFwdPrimitives(inputs: Array[MemoryData], phase: Phase) = {
+    _inputFormats = nativeData(inputs)
+    val expected = inputs(0).shape.clone()
+    // every further input must agree with the first one in rank and in each dimension
+    inputs.drop(1).foreach { other =>
+      require(expected.length == other.shape.length, "dimension not match")
+      expected.indices.foreach { d =>
+        require(expected(d) == other.shape(d), "size not match")
+      }
+    }
+
+    val inputCount = inputs.length
+    val sumPD = MklDnn.SumPrimitiveDescCreate(
+      MklDnn.MemoryDescInit(expected.length, expected, DataType.F32, Memory.Format.any),
+      inputCount,
+      Array.fill(inputCount)(1f),
+      inputs.map(_.getPrimitiveDescription(runtime)))
+    _outputFormats = Array(MemoryData.primitiveOutput(sumPD))
+
+    val srcPrimitives = _inputFormats.map(_.getPrimitive(runtime))
+    val srcIndexes = Array.fill(inputCount)(0)
+    val srcCount = _inputFormats.length
+    val dstPrimitives = _outputFormats.map(_.getPrimitive(runtime))
+    updateOutputPrimitives = Array(
+      MklDnn.PrimitiveCreate2(sumPD, srcPrimitives, srcIndexes, srcCount, dstPrimitives, 1))
+    output = initTensor(_outputFormats(0))
+    (_inputFormats, _outputFormats)
+  }
+
+  /**
+   * No primitive is needed for backward: every gradInput slot uses the format of the first
+   * gradOutput, and gradInput starts out as an empty table.
+   */
+  override private[mkldnn] def initBwdPrimitives(grad: Array[MemoryData], phase: Phase) = {
+    _gradOutputFormats = grad
+    _gradOutputFormatsForWeight = grad
+    _gradInputFormats = Array.fill(_inputFormats.length)(grad(0))
+    gradInput = T()
+    (_gradOutputFormats, _gradInputFormats)
+  }
+
+  /**
+   * Stores the same gradOutput under keys 1.._inputFormats.length of the gradInput table.
+   */
+  override def updateGradInput(input: Activity, gradOutput: Activity): Activity = {
+    require(gradOutput.isTensor, "gradOutput should be a tensor")
+    val slots = gradInput.toTable
+    (1 to _inputFormats.length).foreach { key =>
+      slots(key) = gradOutput
+    }
+    gradInput
+  }
+}
+
+/** Factory for [[CAddTable]]. */
+object CAddTable {
+  /** Creates a fresh [[CAddTable]] layer. */
+  def apply(): CAddTable = {
+    new CAddTable
+  }
+}