
Commit

Merge pull request #139 from hutingh/fix_resize_gpu
Fix resize gpu
zhangjiajin2 committed Jul 4, 2024
2 parents cf4ca8f + d6154fb commit 6fcfa33
Showing 5 changed files with 78 additions and 16 deletions.
17 changes: 12 additions & 5 deletions compute/image/src/gpu/mali/cl/kernel_option/resize_opt.h
@@ -12,11 +12,14 @@ inline EE set_resize_opt_mali(ResizeParamSpec p,
char *kernelName,
KernelOpt *kernelOpt)
{
#ifdef _USE_FP16
kernelOpt->kernelDataType = DT_F16;
#else
kernelOpt->kernelDataType = DT_F32;
#endif
DataType dt = idt;
if (bytesOf(odt) > bytesOf(idt)) {
dt = odt;
}
if (bytesOf(dt) < 2) {
dt = DT_F32;
}
kernelOpt->kernelDataType = dt;
char *opt = kernelOpt->option;
std::string source;
if (p.mode == RESIZE_NEAREST) {
@@ -55,9 +58,13 @@ inline EE set_resize_opt_mali(ResizeParamSpec p,
} else {
CHECK_STATUS(add_macro(opt, "USE_NCHW"));
}
if (odt == DT_U8) {
CHECK_STATUS(add_macro(opt, "OUTPUT_UCHAR"));
}
std::string idtName = gcl_get_type(idt);
std::string odtName = gcl_get_type(odt);
CHECK_STATUS(add_macro(opt, "IT", idtName));
CHECK_STATUS(add_macro(opt, "IT4", idtName + "4"));
CHECK_STATUS(add_macro(opt, "OT", odtName));
CHECK_STATUS(add_macro_type(opt, kernelOpt->kernelDataType));
CHECK_STATUS(add_macro_io(opt, inputMemType, outputMemType));
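
Note on the change above: the kernel's compute type is no longer hard-wired by the _USE_FP16 build flag; it is derived from the input and output tensor types, widening to the larger of the two and never dropping below 16 bits. The new IT/IT4/OT macros additionally pass the actual element types into the .cl source, and OUTPUT_UCHAR is set only when the output really is u8. A minimal standalone sketch of the type-selection rule, using simplified stand-ins for bolt's DataType and bytesOf (not the framework's real definitions):

#include <cstdio>

// Simplified stand-ins for bolt's DataType and bytesOf(), for illustration only.
enum DataType { DT_U8, DT_F16, DT_F32 };

static int bytesOf(DataType dt)
{
    switch (dt) {
        case DT_U8:  return 1;
        case DT_F16: return 2;
        default:     return 4;  // DT_F32
    }
}

// Mirrors the selection above: compute in the wider of the two tensor types,
// but never in a type narrower than 16 bits (u8 in/out still computes in fp32).
static DataType pickKernelDataType(DataType idt, DataType odt)
{
    DataType dt = idt;
    if (bytesOf(odt) > bytesOf(idt)) {
        dt = odt;
    }
    if (bytesOf(dt) < 2) {
        dt = DT_F32;
    }
    return dt;
}

int main()
{
    printf("%d\n", pickKernelDataType(DT_U8, DT_U8));   // DT_F32: u8 -> u8 still computes in fp32
    printf("%d\n", pickKernelDataType(DT_F16, DT_F32)); // DT_F32: widened to the output type
    printf("%d\n", pickKernelDataType(DT_F16, DT_U8));  // DT_F16: the input type is already wide enough
    return 0;
}
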
8 changes: 4 additions & 4 deletions compute/image/src/gpu/mali/cl/resize_bilinear.cl
@@ -35,8 +35,8 @@
}
#endif

#if OT == uchar
#define func convert_uchar_sat
#if defined(OUTPUT_UCHAR)
#define func convert_uchar_sat_rte
#else
#define func
#endif
@@ -80,8 +80,8 @@ __kernel void KERNEL_NAME(const int iw_str,
tblr.y = min(tblr.x + 1, iw - 1); // R
tblr.z = max(0, (int)floor(iy)); // T
tblr.w = min(tblr.z + 1, ih - 1); // B
T dif1 = ix - tblr.x; // C-L
T dif2 = iy - tblr.z; // C-T
T dif1 = ix - (float)tblr.x; // C-L
T dif2 = iy - (float)tblr.z; // C-T

#if defined(USE_NCHW) || defined(USE_NHWC)
int x = (idz * ih_str + tblr.z) * iw_str + tblr.x + i_off; // TL_off
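
The kernel now gates its u8 output path on the explicit OUTPUT_UCHAR macro (set from odt == DT_U8 in the option builder above) instead of the old #if OT == uchar test, which the preprocessor cannot evaluate as a type comparison (unexpanded identifiers become 0, so the old test was effectively always true). It also converts with convert_uchar_sat_rte, i.e. round to nearest with ties to even, rather than OpenCL's default round-toward-zero for float-to-integer conversions; the added (float) casts on tblr.x and tblr.z presumably keep the weight subtraction in single precision when T is half. A host-side sketch of the rounding difference, assuming the default round-to-nearest floating-point environment:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Host-side emulation of the two OpenCL conversions, for illustration only:
// convert_uchar_sat uses the default float->integer rounding (toward zero),
// convert_uchar_sat_rte rounds to nearest with ties to even; both saturate to [0, 255].
static uint8_t sat_rtz(float v)
{
    v = std::trunc(v);
    return (uint8_t)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
}

static uint8_t sat_rte(float v)
{
    v = std::nearbyint(v);  // nearest, ties to even in the default FP environment
    return (uint8_t)(v < 0.f ? 0.f : (v > 255.f ? 255.f : v));
}

int main()
{
    // A bilinear result of 254.7 comes out one level darker with truncation.
    printf("rtz(254.7) = %d, rte(254.7) = %d\n", sat_rtz(254.7f), sat_rte(254.7f));
    // Ties go to the even neighbour with _rte.
    printf("rte(2.5) = %d, rte(3.5) = %d\n", sat_rte(2.5f), sat_rte(3.5f));
    return 0;
}
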
17 changes: 12 additions & 5 deletions compute/image/src/gpu/mali/fp16/resize_mali_fp16.cpp
@@ -30,15 +30,22 @@ inline EE resize_core_mali_fp16(GCLHandle_t handle,
GCLMem_t output)
{
DataType idt, odt;
DataFormat idf;
U32 iw, ih, ic, in;
U32 ow, oh, oc, on;
tensorSelectGet(inputDesc, &idt, NULL, &in, &ic, &ih, &iw);
tensorSelectGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw);
tensorSelectGet(outputDesc, &odt, NULL, &on, &oc, &oh, &ow);

U32 iw_str, ih_str, iw_off, ih_off, i_off;
U32 ow_str, oh_str, ow_off, oh_off, o_off;
get_gclmem_dim(input->desc, &iw_str, &ih_str, NULL, &iw_off, &ih_off);
get_gclmem_dim(output->desc, &ow_str, &oh_str, NULL, &ow_off, &oh_off);
if (iw_str < iw) {
ih_str = ih;
iw_str = iw;
oh_str = oh;
ow_str = ow;
}
cl_mem inbuf = input->mem;
cl_mem outbuf = output->mem;
GCLMemType inputMemType = input->desc.memType;
@@ -53,9 +60,9 @@ inline EE resize_core_mali_fp16(GCLHandle_t handle,
U32 dim = 3;
U32 gs[3] = {ow, oh, 0};
U32 ls[3] = {0, 0, 0};
if (input->desc.df == DF_NCHWC4) {
if (idf == DF_NCHWC4) {
gs[2] = (oc + 3) / 4 * on;
} else if (input->desc.df == DF_NHWC) {
} else if (idf == DF_NHWC) {
gs[2] = on;
} else {
gs[2] = oc * on;
@@ -64,8 +71,8 @@ inline EE resize_core_mali_fp16(GCLHandle_t handle,
Kernel kernel;
KernelOpt kernelOpt;
char kernelName[128];
CHECK_STATUS(set_resize_opt_mali(
p, input->desc.df, idt, odt, inputMemType, outputMemType, kernelName, &kernelOpt));
CHECK_STATUS(
set_resize_opt_mali(p, idf, idt, odt, inputMemType, outputMemType, kernelName, &kernelOpt));
CHECK_STATUS(gcl_create_kernel(handle, kernelName, &kernel, &kernelOpt));
CHECK_STATUS(gcl_set_kernelArgs(kernel, iw_str, ih_str, i_off, iw, ih, ow_str, oh_str, o_off,
ow, oh, r0_w, r0_h, r1_w, r1_h, inbuf, outbuf));
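
Two things change in the host code above: the global work size is now derived from the tensor's logical data format (idf, read via tensorSelectGet) rather than from the GCLMem descriptor, and the recorded strides fall back to the tight tensor extents when iw_str is smaller than iw. A standalone sketch of the work-size rule, with a simplified stand-in for bolt's DataFormat:

#include <cstdio>

// Simplified stand-in for bolt's DataFormat, for illustration only.
enum DataFormat { DF_NCHW, DF_NHWC, DF_NCHWC4 };

typedef unsigned int U32;

// Mirrors the gs[] setup in resize_core_mali_fp16: one work-item per output
// pixel in x/y, with a z extent that depends on how channels are laid out.
static void resizeWorkSize(DataFormat idf, U32 on, U32 oc, U32 oh, U32 ow, U32 gs[3])
{
    gs[0] = ow;
    gs[1] = oh;
    if (idf == DF_NCHWC4) {
        gs[2] = (oc + 3) / 4 * on;  // channels packed four at a time
    } else if (idf == DF_NHWC) {
        gs[2] = on;                 // one work-item handles all channels of a pixel
    } else {
        gs[2] = oc * on;            // plain NCHW: one channel plane per z index
    }
}

int main()
{
    U32 gs[3];
    resizeWorkSize(DF_NCHWC4, /*on=*/1, /*oc=*/6, /*oh=*/112, /*ow=*/112, gs);
    printf("gs = {%u, %u, %u}\n", gs[0], gs[1], gs[2]);  // {112, 112, 2}
    return 0;
}
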
51 changes: 50 additions & 1 deletion compute/image/src/resize.cpp
@@ -28,6 +28,14 @@
#include "cpu/x86/image_x86.h"
#endif

static bool is_implicit_nhwc(TensorDesc desc) {
bool ret = false;
if (desc.df == DF_NCHW && desc.dims[0] == 3 && desc.dims[1] > 3 && desc.dims[2] > 3) {
ret = true;
}
return ret;
}

// params is a pointer to either the target size or the resize ratios
// When paramDT specifies DT_U32, params should point to target sizes (height and width)
// When paramDT specifies DT_F32, params should point to resize ratios
@@ -37,10 +37,18 @@ EE resize_infer_output_size_cpu(TensorDesc inputDesc, ResizeParamSpec p, TensorD
DataFormat idf, odf;
U32 in, ic, ih, iw = 1;
U32 oh, ow = 1;
bool nhwc = false;
if (tensorIs3d(inputDesc)) {
CHECK_STATUS(tensor3dGet(inputDesc, &idt, &idf, &in, &ic, &ih));
} else if (tensorIs4d(inputDesc)) {
CHECK_STATUS(tensor4dGet(inputDesc, &idt, &idf, &in, &ic, &ih, &iw));
nhwc = is_implicit_nhwc(inputDesc);
if (nhwc) {
int t = iw;
iw = ih;
ih = ic;
ic = t;
}
} else {
UNI_ERROR_LOG("can not support resizing %d-dim tensor.\n", inputDesc.nDims);
}
@@ -77,7 +93,11 @@ EE resize_infer_output_size_cpu(TensorDesc inputDesc, ResizeParamSpec p, TensorD
if (tensorIs3d(inputDesc)) {
*outputDesc = tensor3df(idt, odf, in, ic, oh);
} else if (tensorIs4d(inputDesc)) {
*outputDesc = tensor4df(idt, odf, in, ic, oh, ow);
if (nhwc) {
*outputDesc = tensor4df(idt, odf, in, oh, ow, ic);
} else {
*outputDesc = tensor4df(idt, odf, in, ic, oh, ow);
}
}
return SUCCESS;
}
@@ -201,6 +221,34 @@ EE resize_nearest(TensorDesc inputDesc,
return ret;
}

static bool update(TensorDesc &inputDesc, TensorDesc &outputDesc) {
bool ret = false;
if (is_implicit_nhwc(inputDesc) && inputDesc.dims[0] == outputDesc.dims[0]) {
TensorDesc desc0 = inputDesc;
U32 v = inputDesc.dims[0];
for (U32 i = 1; i < inputDesc.nDims; i++) {
inputDesc.dims[i - 1] = inputDesc.dims[i];
}
inputDesc.dims[inputDesc.nDims - 2] = v;
inputDesc.df = DF_NCHW;

TensorDesc desc1 = outputDesc;
v = outputDesc.dims[0];
for (U32 i = 1; i < outputDesc.nDims; i++) {
outputDesc.dims[i - 1] = outputDesc.dims[i];
}
outputDesc.dims[outputDesc.nDims - 2] = v;
outputDesc.df = DF_NHWC;

UNI_DEBUG_LOG("change input from %s -> %s.\n", tensorDesc2Str(desc0).c_str(),
tensorDesc2Str(inputDesc).c_str());
UNI_DEBUG_LOG("change output from %s -> %s.\n", tensorDesc2Str(desc1).c_str(),
tensorDesc2Str(outputDesc).c_str());
ret = true;
}
return ret;
}

EE resize(
Tensor inputTensor, ResizeParamSpec p, Tensor tmpTensor, Tensor outputTensor, ArchInfo_t archInfo)
{
@@ -210,6 +258,7 @@ EE resize(
TensorDesc outputDesc = outputTensor.get_desc();
void *output = get_ptr_from_tensor(outputTensor, arch);
void *tmp = get_ptr_from_tensor(tmpTensor, arch);
update(inputDesc, outputDesc);

if (inputDesc.nDims == 3) {
for (int i = inputDesc.nDims; i > 0; i--) {
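
The CPU path above adds handling for image tensors that are physically channels-last but described as NCHW: is_implicit_nhwc treats a descriptor whose innermost extent is exactly 3 (with larger H and W) as such a tensor, resize_infer_output_size_cpu swaps the extents accordingly, and update() rewrites both descriptors before dispatch. A minimal sketch of the detection and the extent swap, using a cut-down descriptor (bolt's real TensorDesc also carries dt and df, and the real check additionally requires df == DF_NCHW):

#include <cstdio>

typedef unsigned int U32;

// Cut-down descriptor: dims[] is stored innermost-first, so a tensor declared
// NCHW with shape (N, C, H, W) has dims = {W, H, C, N}. Illustration only.
struct Desc {
    U32 nDims;
    U32 dims[6];
};

// Same shape test as is_implicit_nhwc in resize.cpp (minus the df check):
// a "NCHW" descriptor whose innermost extent is exactly 3 while the next two
// are larger is taken to be image data that is really stored channels-last.
static bool isImplicitNhwc(const Desc &d)
{
    return d.dims[0] == 3 && d.dims[1] > 3 && d.dims[2] > 3;
}

int main()
{
    // Declared as NCHW (N=1, C=240, H=135, W=3), i.e. dims = {3, 135, 240, 1}:
    // really an RGB image of height 240 and width 135, stored channels-last.
    Desc d = {4, {3, 135, 240, 1}};
    if (isImplicitNhwc(d)) {
        // tensor4dGet on the declared desc reports iw = 3, ih = 135, ic = 240;
        // the patch rotates them so that ic = 3 and h/w become the real extents.
        U32 ic = d.dims[0];  // the declared W (= 3) is the real channel count
        U32 iw = d.dims[1];  // the declared H is the real width
        U32 ih = d.dims[2];  // the declared C is the real height
        printf("treated as NHWC: c=%u h=%u w=%u\n", ic, ih, iw);
    }
    return 0;
}
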
1 change: 0 additions & 1 deletion inference/engine/src/bolt_c_simplify.cpp
@@ -141,7 +141,6 @@ int FreeTensor(
c, h, w, dt, df, data);
int ret = 0;
if (num > 0) {
FreeTensorDesc(num, name, n, c, h, w, dt, df);
for (int i = 0; i < num; i++) {
UNI_FREE(data[i]);
}
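
The one-line removal above drops the FreeTensorDesc call that ran before the element buffers in data[] were released. The surrounding code is not shown here, but if FreeTensorDesc releases arrays that FreeTensor still reads afterwards, or is also invoked elsewhere, calling it first would mean a use-after-free or a double free. A generic sketch of the safer ordering, with hypothetical helpers that only illustrate the pattern, not bolt's actual C API:

#include <cstdlib>

// Hypothetical ownership pattern: per-tensor buffers are released while the
// arrays that hold them are still valid; the arrays themselves go last, once.
static void freeTensors(int num, void **data, char **name)
{
    if (num <= 0) {
        return;
    }
    for (int i = 0; i < num; i++) {
        free(data[i]);  // per-tensor element buffer
        free(name[i]);  // per-tensor name string
    }
    free(data);  // release the containers last, exactly once
    free(name);
}
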
