From 23e0bee20bb29371162965a953af7a977add596e Mon Sep 17 00:00:00 2001 From: Shadow <871744494@qq.com> Date: Thu, 4 Jul 2024 15:35:59 +0800 Subject: [PATCH 1/3] fix resize gpu --- .../gpu/mali/cl/kernel_option/resize_opt.h | 17 +++++-- .../image/src/gpu/mali/cl/resize_bilinear.cl | 8 +-- .../src/gpu/mali/fp16/resize_mali_fp16.cpp | 17 +++++-- compute/image/src/resize.cpp | 51 ++++++++++++++++++- 4 files changed, 78 insertions(+), 15 deletions(-) diff --git a/compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h b/compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h index 09933f67..aaa969b5 100644 --- a/compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h +++ b/compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h @@ -12,11 +12,14 @@ inline EE set_resize_opt_mali(ResizeParamSpec p, char *kernelName, KernelOpt *kernelOpt) { -#ifdef _USE_FP16 - kernelOpt->kernelDataType = DT_F16; -#else - kernelOpt->kernelDataType = DT_F32; -#endif + DataType dt = idt; + if (bytesOf(odt) > bytesOf(idt)) { + dt = odt; + } + if (bytesOf(dt) < 2) { + dt = DT_F32; + } + kernelOpt->kernelDataType = dt; char *opt = kernelOpt->option; std::string source; if (p.mode == RESIZE_NEAREST) { @@ -55,9 +58,13 @@ inline EE set_resize_opt_mali(ResizeParamSpec p, } else { CHECK_STATUS(add_macro(opt, "USE_NCHW")); } + if (odt == DT_U8) { + CHECK_STATUS(add_macro(opt, "OUTPUT_UCHAR")); + } std::string idtName = gcl_get_type(idt); std::string odtName = gcl_get_type(odt); CHECK_STATUS(add_macro(opt, "IT", idtName)); + CHECK_STATUS(add_macro(opt, "IT4", idtName + "4")); CHECK_STATUS(add_macro(opt, "OT", odtName)); CHECK_STATUS(add_macro_type(opt, kernelOpt->kernelDataType)); CHECK_STATUS(add_macro_io(opt, inputMemType, outputMemType)); diff --git a/compute/image/src/gpu/mali/cl/resize_bilinear.cl b/compute/image/src/gpu/mali/cl/resize_bilinear.cl index 43a7c88b..7a76a4ee 100644 --- a/compute/image/src/gpu/mali/cl/resize_bilinear.cl +++ b/compute/image/src/gpu/mali/cl/resize_bilinear.cl @@ -35,8 +35,8 @@ } #endif -#if OT == uchar -#define func convert_uchar_sat +#if defined(OUTPUT_UCHAR) +#define func convert_uchar_sat_rte #else #define func #endif @@ -80,8 +80,8 @@ __kernel void KERNEL_NAME(const int iw_str, tblr.y = min(tblr.x + 1, iw - 1); // R tblr.z = max(0, (int)floor(iy)); // T tblr.w = min(tblr.z + 1, ih - 1); // B - T dif1 = ix - tblr.x; // C-L - T dif2 = iy - tblr.z; // C-T + T dif1 = ix - (float)tblr.x; // C-L + T dif2 = iy - (float)tblr.z; // C-T #if defined(USE_NCHW) || defined(USE_NHWC) int x = (idz * ih_str + tblr.z) * iw_str + tblr.x + i_off; // TL_off diff --git a/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp b/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp index 391055a9..22f83430 100644 --- a/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp +++ b/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp @@ -30,15 +30,22 @@ inline EE resize_core_mali_fp16(GCLHandle_t handle, GCLMem_t output) { DataType idt, odt; + DataFormat idf; U32 iw, ih, ic, in; U32 ow, oh, oc, on; - tensorSelectGet(inputDesc, &idt, NULL, &in, &ic, &ih, &iw); + tensorSelectGet(inputDesc, &idt, idf, &in, &ic, &ih, &iw); tensorSelectGet(outputDesc, &odt, NULL, &on, &oc, &oh, &ow); U32 iw_str, ih_str, iw_off, ih_off, i_off; U32 ow_str, oh_str, ow_off, oh_off, o_off; get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off); get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off); + if (iw_str < iw) { + ih_str = ih; + iw_str = iw; + oh_str = oh; + ow_str = ow; + } cl_mem inbuf = input->mem; cl_mem outbuf = output->mem; GCLMemType inputMemType = input->desc.memType; @@ -53,9 +60,9 @@ inline EE resize_core_mali_fp16(GCLHandle_t handle, U32 dim = 3; U32 gs[3] = {ow, oh, 0}; U32 ls[3] = {0, 0, 0}; - if (input->desc.df == DF_NCHWC4) { + if (idf == DF_NCHWC4) { gs[2] = (oc + 3) / 4 * on; - } else if (input->desc.df == DF_NHWC) { + } else if (idf == DF_NHWC) { gs[2] = on; } else { gs[2] = oc * on; @@ -64,8 +71,8 @@ inline EE resize_core_mali_fp16(GCLHandle_t handle, Kernel kernel; KernelOpt kernelOpt; char kernelName[128]; - CHECK_STATUS(set_resize_opt_mali( - p, input->desc.df, idt, odt, inputMemType, outputMemType, kernelName, &kernelOpt)); + CHECK_STATUS( + set_resize_opt_mali(p, idf, idt, odt, inputMemType, outputMemType, kernelName, &kernelOpt)); CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt)); CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, i_off, iw, ih, ow_str, oh_str, o_off, ow, oh, r0_w, r0_h, r1_w, r1_h, inbuf, outbuf)); diff --git a/compute/image/src/resize.cpp b/compute/image/src/resize.cpp index 46e8d1c3..a77482ba 100644 --- a/compute/image/src/resize.cpp +++ b/compute/image/src/resize.cpp @@ -28,6 +28,14 @@ #include "cpu/x86/image_x86.h" #endif +static bool is_implicit_nhwc(TensorDesc desc) { + bool ret = false; + if (desc.df == DF_NCHW && desc.dims[0] == 3 && desc.dims[1] > 3 && desc.dims[2] > 3) { + ret = true; + } + return ret; +} + // params is a pointer to either the target size or the resize ratios // When paramDT specifies DT_U32, params should point to target sizes (height and width) // When paramDT specifies DT_F32, params should point to resize ratios @@ -37,10 +45,18 @@ EE resize_infer_output_size_cpu(TensorDesc inputDesc, ResizeParamSpec p, TensorD DataFormat idf, odf; U32 in, ic, ih, iw = 1; U32 oh, ow = 1; + bool nhwc = false; if (tensorIs3d(inputDesc)) { CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih)); } else if (tensorIs4d(inputDesc)) { CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw)); + nhwc = is_implicit_nhwc(inputDesc); + if (nhwc) { + int t = iw; + iw = ih; + ih = ic; + ic = t; + } } else { UNI_ERROR_LOG("can support to resize %d-dim tensor.\n", inputDesc.nDims); } @@ -77,7 +93,11 @@ EE resize_infer_output_size_cpu(TensorDesc inputDesc, ResizeParamSpec p, TensorD if (tensorIs3d(inputDesc)) { *outputDesc = tensor3df(idt, odf, in, ic, oh); } else if (tensorIs4d(inputDesc)) { - *outputDesc = tensor4df(idt, odf, in, ic, oh, ow); + if (nhwc) { + *outputDesc = tensor4df(idt, odf, in, oh, ow, ic); + } else { + *outputDesc = tensor4df(idt, odf, in, ic, oh, ow); + } } return SUCCESS; } @@ -201,6 +221,34 @@ EE resize_nearest(TensorDesc inputDesc, return ret; } +static bool update(TensorDesc &inputDesc, TensorDesc &outputDesc) { + bool ret = false; + if (is_implicit_nhwc(inputDesc) && inputDesc.dims[0] == outputDesc.dims[0]) { + TensorDesc desc0 = inputDesc; + U32 v = inputDesc.dims[0]; + for (U32 i = 0; i < inputDesc.nDims; i++) { + inputDesc.dims[i - 1] = inputDesc.dims[i]; + } + inputDesc.dims[inputDesc.nDims - 2] = v; + inputDesc.df = DF_NCHW; + + TensorDesc desc1 = outputDesc; + v = outputDesc.dims[0]; + for (U32 i = 1; i < outputDesc.nDims; i++) { + outputDesc.dims[i - 1] = outputDesc.dims[i]; + } + outputDesc.dims[outputDesc.nDims - 2] = v; + outputDesc.df = DF_NHWC; + + UNI_DEBUG_LOG("change input from %s -> %s.\n", tensorDesc2Str(desc0).c_str(), + tensorDesc2Str(inputDesc).c_str()); + UNI_DEBUG_LOG("change output from %s -> %s.\n", tensorDesc2Str(desc1).c_str(), + tensorDesc2Str(outputDesc).c_str()); + ret = true; + } + return ret; +} + EE resize( Tensor inputTensor, ResizeParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo) { @@ -210,6 +258,7 @@ EE resize( TensorDesc outputDesc = outputTensor.get_desc(); void *output = get_ptr_from_tensor(outputTensor, arch); void *tmp = get_ptr_from_tensor(tmpTensor, arch); + update(inputDesc, outputDesc); if (inputDesc.nDims == 3) { for (int i = inputDesc.nDims; i > 0; i--) { From 123dd6ece5c9fc2de2c4cd31033875b0c94db666 Mon Sep 17 00:00:00 2001 From: hutingh Date: Thu, 4 Jul 2024 15:56:53 +0800 Subject: [PATCH 2/3] Update bolt_c_simplify.cpp --- inference/engine/src/bolt_c_simplify.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/inference/engine/src/bolt_c_simplify.cpp b/inference/engine/src/bolt_c_simplify.cpp index 7513e8ec..139f7cef 100644 --- a/inference/engine/src/bolt_c_simplify.cpp +++ b/inference/engine/src/bolt_c_simplify.cpp @@ -141,7 +141,6 @@ int FreeTensor( c, h, w, dt, df, data); int ret = 0; if (num > 0) { - FreeTensorDesc(num, name, n, c, h, w, dt, df); for (int i = 0; i < num; i++) { UNI_FREE(data[i]); } From d6154fb0cf485c0341a128f9e350333f3cb5adfb Mon Sep 17 00:00:00 2001 From: hutingh Date: Thu, 4 Jul 2024 16:57:56 +0800 Subject: [PATCH 3/3] Update resize_mali_fp16.cpp --- compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp b/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp index 22f83430..bbb69ab6 100644 --- a/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp +++ b/compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp @@ -33,7 +33,7 @@ inline EE resize_core_mali_fp16(GCLHandle_t handle, DataFormat idf; U32 iw, ih, ic, in; U32 ow, oh, oc, on; - tensorSelectGet(inputDesc, &idt, idf, &in, &ic, &ih, &iw); + tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw); tensorSelectGet(outputDesc, &odt, NULL, &on, &oc, &oh, &ow); U32 iw_str, ih_str, iw_off, ih_off, i_off;