
Commit

Merge pull request #139 from hutingh/fix_resize_gpu
Fix resize gpu
zhangjiajin2 committed Jul 4, 2024
2 parents cf4ca8f + d6154fb commit 6fcfa33
Showing 5 changed files with 78 additions and 16 deletions.
17 changes: 12 additions & 5 deletions compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h
@@ -12,11 +12,14 @@ inline EE set_resize_opt_mali(ResizeParamSpec p,
char *kernelName,
KernelOpt *kernelOpt)
{
#ifdef _USE_FP16
kernelOpt->kernelDataType = DT_F16;
#else
kernelOpt->kernelDataType = DT_F32;
#endif
DataType dt = idt;
if (bytesOf(odt) > bytesOf(idt)) {
dt = odt;
}
if (bytesOf(dt) < 2) {
dt = DT_F32;
}
kernelOpt->kernelDataType = dt;
char *opt = kernelOpt->option;
std::string source;
if (p.mode == RESIZE_NEAREST) {
@@ -55,9 +58,13 @@ inline EE set_resize_opt_mali(ResizeParamSpec p,
} else {
CHECK_STATUS(add_macro(opt, "USE_NCHW"));
}
if (odt == DT_U8) {
CHECK_STATUS(add_macro(opt, "OUTPUT_UCHAR"));
}
std::string idtName = gcl_get_type(idt);
std::string odtName = gcl_get_type(odt);
CHECK_STATUS(add_macro(opt, "IT", idtName));
CHECK_STATUS(add_macro(opt, "IT4", idtName + "4"));
CHECK_STATUS(add_macro(opt, "OT", odtName));
CHECK_STATUS(add_macro_type(opt, kernelOpt->kernelDataType));
CHECK_STATUS(add_macro_io(opt, inputMemType, outputMemType));
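
Note on the change above: the kernel's compute type is no longer hard-wired by the _USE_FP16 build flag; it is derived from the input and output tensor types, widening to the larger of the two and never dropping below 16 bits. The new IT/IT4/OT macros additionally pass the actual element types into the .cl source, and OUTPUT_UCHAR is set only when the output really is u8. A minimal standalone sketch of the type-selection rule, using simplified stand-ins for bolt's DataType and bytesOf (not the framework's real definitions):

#include <cstdio>

// Simplified stand-ins for bolt's DataType and bytesOf(), for illustration only.
enum DataType { DT_U8, DT_F16, DT_F32 };

static int bytesOf(DataType dt)
{
    switch (dt) {
        case DT_U8:  return 1;
        case DT_F16: return 2;
        default:     return 4;  // DT_F32
    }
}

// Mirrors the selection above: compute in the wider of the two tensor types,
// but never in a type narrower than 16 bits (u8 in/out still computes in fp32).
static DataType pickKernelDataType(DataType idt, DataType odt)
{
    DataType dt = idt;
    if (bytesOf(odt) > bytesOf(idt)) {
        dt = odt;
    }
    if (bytesOf(dt) < 2) {
        dt = DT_F32;
    }
    return dt;
}

int main()
{
    printf("%d\n", pickKernelDataType(DT_U8, DT_U8));   // DT_F32: u8 -> u8 still computes in fp32
    printf("%d\n", pickKernelDataType(DT_F16, DT_F32)); // DT_F32: widened to the output type
    printf("%d\n", pickKernelDataType(DT_F16, DT_U8));  // DT_F16: the input type is already wide enough
    return 0;
}
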
8 changes: 4 additions & 4 deletions compute/image/src/gpu/mali/cl/resize_bilinear.cl
@@ -35,8 +35,8 @@
}
#endif

#if OT == uchar
#define func convert_uchar_sat
#if defined(OUTPUT_UCHAR)
#define func convert_uchar_sat_rte
#else
#define func
#endif
@@ -80,8 +80,8 @@ __kernel void KERNEL_NAME(const int iw_str,
tblr.y = min(tblr.x + 1, iw - 1); // R
tblr.z = max(0, (int)floor(iy)); // T
tblr.w = min(tblr.z + 1, ih - 1); // B
T dif1 = ix - tblr.x; // C-L
T dif2 = iy - tblr.z; // C-T
T dif1 = ix - (float)tblr.x; // C-L
T dif2 = iy - (float)tblr.z; // C-T

#if defined(USE_NCHW) || defined(USE_NHWC)
int x = (idz * ih_str + tblr.z) * iw_str + tblr.x + i_off; // TL_off
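
The kernel now gates its u8 output path on the explicit OUTPUT_UCHAR macro (set from odt == DT_U8 in the option builder above) instead of the old #if OT == uchar test, which the preprocessor cannot evaluate as a type comparison (unexpanded identifiers become 0, so the old test was effectively always true). It also converts with convert_uchar_sat_rte, i.e. round to nearest with ties to even, rather than OpenCL's default round-toward-zero for float-to-integer conversions; the added (float) casts on tblr.x and tblr.z presumably keep the weight subtraction in single precision when T is half. A host-side sketch of the rounding difference, assuming the default round-to-nearest floating-point environment:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Host-side emulation of the two OpenCL conversions, for illustration only:
// convert_uchar_sat uses the default float->integer rounding (toward zero),
// convert_uchar_sat_rte rounds to nearest with ties to even; both saturate to [0, 255].
static uint8_t sat_rtz(float v)
{
    v = std::trunc(v);
    return (uint8_t)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
}

static uint8_t sat_rte(float v)
{
    v = std::nearbyint(v);  // nearest, ties to even in the default FP environment
    return (uint8_t)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
}

int main()
{
    // A bilinear result of 254.7 comes out one level darker with truncation.
    printf("rtz(254.7) = %d, rte(254.7) = %d\n", sat_rtz(254.7f), sat_rte(254.7f));
    // Ties go to the even neighbour with _rte.
    printf("rte(2.5) = %d, rte(3.5) = %d\n", sat_rte(2.5f), sat_rte(3.5f));
    return 0;
}
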
17 changes: 12 additions & 5 deletions compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp
@@ -30,15 +30,22 @@ inline EE resize_core_mali_fp16(GCLHandle_t handle,
GCLMem_t output)
{
DataType idt, odt;
DataFormat idf;
U32 iw, ih, ic, in;
U32 ow, oh, oc, on;
tensorSelectGet(inputDesc, &idt, NULL, &in, &ic, &ih, &iw);
tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw);
tensorSelectGet(outputDesc, &odt, NULL, &on, &oc, &oh, &ow);

U32 iw_str, ih_str, iw_off, ih_off, i_off;
U32 ow_str, oh_str, ow_off, oh_off, o_off;
get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off);
get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off);
if (iw_str < iw) {
ih_str = ih;
iw_str = iw;
oh_str = oh;
ow_str = ow;
}
cl_mem inbuf = input->mem;
cl_mem outbuf = output->mem;
GCLMemType inputMemType = input->desc.memType;
@@ -53,9 +60,9 @@ inline EE resize_core_mali_fp16(GCLHandle_t handle,
U32 dim = 3;
U32 gs[3] = {ow, oh, 0};
U32 ls[3] = {0, 0, 0};
if (input->desc.df == DF_NCHWC4) {
if (idf == DF_NCHWC4) {
gs[2] = (oc + 3) / 4 * on;
} else if (input->desc.df == DF_NHWC) {
} else if (idf == DF_NHWC) {
gs[2] = on;
} else {
gs[2] = oc * on;
@@ -64,8 +71,8 @@ inline EE resize_core_mali_fp16(GCLHandle_t handle,
Kernel kernel;
KernelOpt kernelOpt;
char kernelName[128];
CHECK_STATUS(set_resize_opt_mali(
p, input->desc.df, idt, odt, inputMemType, outputMemType, kernelName, &kernelOpt));
CHECK_STATUS(
set_resize_opt_mali(p, idf, idt, odt, inputMemType, outputMemType, kernelName, &kernelOpt));
CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt));
CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, i_off, iw, ih, ow_str, oh_str, o_off,
ow, oh, r0_w, r0_h, r1_w, r1_h, inbuf, outbuf));
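
Two things change in the host code above: the global work size is now derived from the tensor's logical data format (idf, read via tensorSelectGet) rather than from the GCLMem descriptor, and the recorded strides fall back to the tight tensor extents when iw_str is smaller than iw. A standalone sketch of the work-size rule, with a simplified stand-in for bolt's DataFormat:

#include <cstdio>

// Simplified stand-in for bolt's DataFormat, for illustration only.
enum DataFormat { DF_NCHW, DF_NHWC, DF_NCHWC4 };

typedef unsigned int U32;

// Mirrors the gs[] setup in resize_core_mali_fp16: one work-item per output
// pixel in x/y, with a z extent that depends on how channels are laid out.
static void resizeWorkSize(DataFormat idf, U32 on, U32 oc, U32 oh, U32 ow, U32 gs[3])
{
    gs[0] = ow;
    gs[1] = oh;
    if (idf == DF_NCHWC4) {
        gs[2] = (oc + 3) / 4 * on;  // channels packed four at a time
    } else if (idf == DF_NHWC) {
        gs[2] = on;                 // one work-item handles all channels of a pixel
    } else {
        gs[2] = oc * on;            // plain NCHW: one channel plane per z index
    }
}

int main()
{
    U32 gs[3];
    resizeWorkSize(DF_NCHWC4, /*on=*/1, /*oc=*/6, /*oh=*/112, /*ow=*/112, gs);
    printf("gs = {%u, %u, %u}\n", gs[0], gs[1], gs[2]);  // {112, 112, 2}
    return 0;
}
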
51 changes: 50 additions & 1 deletion compute/image/src/resize.cpp
@@ -28,6 +28,14 @@
#include "cpu/x86/image_x86.h"
#endif

static bool is_implicit_nhwc(TensorDesc desc) {
bool ret = false;
if (desc.df == DF_NCHW && desc.dims[0] == 3 && desc.dims[1] > 3 && desc.dims[2] > 3) {
ret = true;
}
return ret;
}

// params is a pointer to either the target size or the resize ratios
// When paramDT specifies DT_U32, params should point to target sizes (height and width)
// When paramDT specifies DT_F32, params should point to resize ratios
@@ -37,10 +37,18 @@ EE resize_infer_output_size_cpu(TensorDesc inputDesc, ResizeParamSpec p, TensorD
DataFormat idf, odf;
U32 in, ic, ih, iw = 1;
U32 oh, ow = 1;
bool nhwc = false;
if (tensorIs3d(inputDesc)) {
CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih));
} else if (tensorIs4d(inputDesc)) {
CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
nhwc = is_implicit_nhwc(inputDesc);
if (nhwc) {
int t = iw;
iw = ih;
ih = ic;
ic = t;
}
} else {
UNI_ERROR_LOG("can not support resizing %d-dim tensor.\n", inputDesc.nDims);
}
@@ -77,7 +93,11 @@ EE resize_infer_output_size_cpu(TensorDesc inputDesc, ResizeParamSpec p, TensorD
if (tensorIs3d(inputDesc)) {
*outputDesc = tensor3df(idt, odf, in, ic, oh);
} else if (tensorIs4d(inputDesc)) {
*outputDesc = tensor4df(idt, odf, in, ic, oh, ow);
if (nhwc) {
*outputDesc = tensor4df(idt, odf, in, oh, ow, ic);
} else {
*outputDesc = tensor4df(idt, odf, in, ic, oh, ow);
}
}
return SUCCESS;
}
@@ -201,6 +221,34 @@ EE resize_nearest(TensorDesc inputDesc,
return ret;
}

static bool update(TensorDesc &inputDesc, TensorDesc &outputDesc) {
bool ret = false;
if (is_implicit_nhwc(inputDesc) && inputDesc.dims[0] == outputDesc.dims[0]) {
TensorDesc desc0 = inputDesc;
U32 v = inputDesc.dims[0];
for (U32 i = 1; i < inputDesc.nDims; i++) {
inputDesc.dims[i - 1] = inputDesc.dims[i];
}
inputDesc.dims[inputDesc.nDims - 2] = v;
inputDesc.df = DF_NCHW;

TensorDesc desc1 = outputDesc;
v = outputDesc.dims[0];
for (U32 i = 1; i < outputDesc.nDims; i++) {
outputDesc.dims[i - 1] = outputDesc.dims[i];
}
outputDesc.dims[outputDesc.nDims - 2] = v;
outputDesc.df = DF_NHWC;

UNI_DEBUG_LOG("change input from %s -> %s.\n", tensorDesc2Str(desc0).c_str(),
tensorDesc2Str(inputDesc).c_str());
UNI_DEBUG_LOG("change output from %s -> %s.\n", tensorDesc2Str(desc1).c_str(),
tensorDesc2Str(outputDesc).c_str());
ret = true;
}
return ret;
}

EE resize(
Tensor inputTensor, ResizeParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo)
{
@@ -210,6 +258,7 @@ EE resize(
TensorDesc outputDesc = outputTensor.get_desc();
void *output = get_ptr_from_tensor(outputTensor, arch);
void *tmp = get_ptr_from_tensor(tmpTensor, arch);
update(inputDesc, outputDesc);

if (inputDesc.nDims == 3) {
for (int i = inputDesc.nDims; i > 0; i--) {
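
The CPU path above adds handling for image tensors that are physically channels-last but described as NCHW: is_implicit_nhwc treats a descriptor whose innermost extent is exactly 3 (with larger H and W) as such a tensor, resize_infer_output_size_cpu swaps the extents accordingly, and update() rewrites both descriptors before dispatch. A minimal sketch of the detection and the extent swap, using a cut-down descriptor (bolt's real TensorDesc also carries dt and df, and the real check additionally requires df == DF_NCHW):

#include <cstdio>

typedef unsigned int U32;

// Cut-down descriptor: dims[] is stored innermost-first, so a tensor declared
// NCHW with shape (N, C, H, W) has dims = {W, H, C, N}. Illustration only.
struct Desc {
    U32 nDims;
    U32 dims[6];
};

// Same shape test as is_implicit_nhwc in resize.cpp (minus the df check):
// a "NCHW" descriptor whose innermost extent is exactly 3 while the next two
// are larger is taken to be image data that is really stored channels-last.
static bool isImplicitNhwc(const Desc &d)
{
    return d.dims[0] == 3 && d.dims[1] > 3 && d.dims[2] > 3;
}

int main()
{
    // Declared as NCHW (N=1, C=240, H=135, W=3), i.e. dims = {3, 135, 240, 1}:
    // really an RGB image of height 240 and width 135, stored channels-last.
    Desc d = {4, {3, 135, 240, 1}};
    if (isImplicitNhwc(d)) {
        // tensor4dGet on the declared desc reports iw = 3, ih = 135, ic = 240;
        // the patch rotates them so that ic = 3 and h/w become the real extents.
        U32 ic = d.dims[0];  // the declared W (= 3) is the real channel count
        U32 iw = d.dims[1];  // the declared H is the real width
        U32 ih = d.dims[2];  // the declared C is the real height
        printf("treated as NHWC: c=%u h=%u w=%u\n", ic, ih, iw);
    }
    return 0;
}
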
1 change: 0 additions & 1 deletion inference/engine/src/bolt_c_simplify.cpp
@@ -141,7 +141,6 @@ int FreeTensor(
c, h, w, dt, df, data);
int ret = 0;
if (num > 0) {
FreeTensorDesc(num, name, n, c, h, w, dt, df);
for (int i = 0; i < num; i++) {
UNI_FREE(data[i]);
}
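
The one-line removal above drops the FreeTensorDesc call that ran before the element buffers in data[] were released. The surrounding code is not shown here, but if FreeTensorDesc releases arrays that FreeTensor still reads afterwards, or is also invoked elsewhere, calling it first would mean a use-after-free or a double free. A generic sketch of the safer ordering, with hypothetical helpers that only illustrate the pattern, not bolt's actual C API:

#include <cstdlib>

// Hypothetical ownership pattern: per-tensor buffers are released while the
// arrays that hold them are still valid; the arrays themselves go last, once.
static void freeTensors(int num, void **data, char **name)
{
    if (num <= 0) {
        return;
    }
    for (int i = 0; i < num; i++) {
        free(data[i]);  // per-tensor element buffer
        free(name[i]);  // per-tensor name string
    }
    free(data);  // release the containers last, exactly once
    free(name);
}
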
