Merge pull request PaddlePaddle#27 from mthreads/fix_mcc

[MTAI-484] fix(build): fix CudaAtomicAdd bug
hanhaowen-mt · Aug 10, 2023 · 8c1ef73 · 8c1ef73
2 parents 53539d2 + 25e7ee4
commit 8c1ef73
Show file tree

Hide file tree

Showing 10 changed files with 153 additions and 157 deletions.
diff --git a/cmake/musa.cmake b/cmake/musa.cmake
@@ -47,6 +47,9 @@ endif()
 
 list(APPEND MUSA_MCC_FLAGS --cuda-gpu-arch=mp_21)
 list(APPEND MUSA_MCC_FLAGS -U__CUDA__)
+# MUSA has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer
+list(APPEND MUSA_MCC_FLAGS -D__MUSA_NO_HALF_CONVERSIONS__)
+
 #set(MUSA_VERBOSE_BUILD ON)
 if(CMAKE_BUILD_TYPE MATCHES Debug)
   list(APPEND MUSA_MCC_FLAGS -g2)

diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h
@@ -266,54 +266,54 @@ CUDA_ATOMIC_WRAPPER(Add, phi::dtype::bfloat16) {
                                     PDBF16ToCUDABF16(val)));
 }
 #else
-//CUDA_ATOMIC_WRAPPER(Add, phi::dtype::bfloat16) {
-//  // concrete packed bfloat16 value may exsits in lower or higher 16bits
-//  // of the 32bits address.
-//  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
-//      reinterpret_cast<char *>(address) -
-//      (reinterpret_cast<uintptr_t>(address) & 0x02));
-//  float val_f = static_cast<float>(val);
-//  uint32_t old = *address_as_ui;
-//  uint32_t sum;
-//  uint32_t newval;
-//  uint32_t assumed;
-//  if (((uintptr_t)address & 0x02) == 0) {
-//    // the bfloat16 value stay at lower 16 bits of the address.
-//    do {
-//      assumed = old;
-//      old = atomicCAS(
-//          address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f));
-//    } while (old != assumed);
-//    phi::dtype::bfloat16 ret;
-//    ret.x = old & 0xFFFFu;
-//    return ret;
-//  } else {
-//    // the bfloat16 value stay at higher 16 bits of the address.
-//    do {
-//      assumed = old;
-//      old = atomicCAS(
-//          address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f));
-//    } while (old != assumed);
-//    phi::dtype::bfloat16 ret;
-//    ret.x = old >> 16;
-//    return ret;
-//  }
-//}
+CUDA_ATOMIC_WRAPPER(Add, phi::dtype::bfloat16) {
+  // concrete packed bfloat16 value may exsits in lower or higher 16bits
+  // of the 32bits address.
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
+  float val_f = static_cast<float>(val);
+  uint32_t old = *address_as_ui;
+  uint32_t sum;
+  uint32_t newval;
+  uint32_t assumed;
+  if (((uintptr_t)address & 0x02) == 0) {
+    // the bfloat16 value stay at lower 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old & 0xFFFFu;
+    return ret;
+  } else {
+    // the bfloat16 value stay at higher 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old >> 16;
+    return ret;
+  }
+}
 #endif
 
-//CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
-//  float *real = reinterpret_cast<float *>(address);
-//  float *imag = real + 1;
-//  return complex<float>(CudaAtomicAdd(real, val.real),
-//                        CudaAtomicAdd(imag, val.imag));
-//}
-//
-//CUDA_ATOMIC_WRAPPER(Add, complex<double>) {
-//  double *real = reinterpret_cast<double *>(address);
-//  double *imag = real + 1;
-//  return complex<double>(CudaAtomicAdd(real, val.real),
-//                         CudaAtomicAdd(imag, val.imag));
-//}
+CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
+  float *real = reinterpret_cast<float *>(address);
+  float *imag = real + 1;
+  return complex<float>(CudaAtomicAdd(real, val.real),
+                        CudaAtomicAdd(imag, val.imag));
+}
+
+CUDA_ATOMIC_WRAPPER(Add, complex<double>) {
+  double *real = reinterpret_cast<double *>(address);
+  double *imag = real + 1;
+  return complex<double>(CudaAtomicAdd(real, val.real),
+                         CudaAtomicAdd(imag, val.imag));
+}
 
 // For atomicMax
 USE_CUDA_ATOMIC(Max, int);
@@ -470,38 +470,38 @@ inline static __device__ uint32_t bf16_max_to_high_half(uint32_t val, float x) {
   //return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }
 
-//CUDA_ATOMIC_WRAPPER(Max, phi::dtype::bfloat16) {
-//  if (*address >= val) {
-//    return *address;
-//  }
-//  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
-//      reinterpret_cast<char *>(address) -
-//      (reinterpret_cast<uintptr_t>(address) & 0x02));
-//  float val_f = static_cast<float>(val);
-//  uint32_t old = *address_as_ui;
-//  uint32_t assumed;
-//  if (((uintptr_t)address & 0x02) == 0) {
-//    // The bfloat16 value stay at lower 16 bits of the address.
-//    do {
-//      assumed = old;
-//      old = atomicCAS(
-//          address_as_ui, assumed, bf16_max_to_low_half(assumed, val_f));
-//    } while (old != assumed);
-//    phi::dtype::bfloat16 ret;
-//    ret.x = old & 0xFFFFu;
-//    return ret;
-//  } else {
-//    // The bfloat16 value stay at higher 16 bits of the address.
-//    do {
-//      assumed = old;
-//      old = atomicCAS(
-//          address_as_ui, assumed, bf16_max_to_high_half(assumed, val_f));
-//    } while (old != assumed);
-//    phi::dtype::bfloat16 ret;
-//    ret.x = old >> 16;
-//    return ret;
-//  }
-//}
+CUDA_ATOMIC_WRAPPER(Max, phi::dtype::bfloat16) {
+  if (*address >= val) {
+    return *address;
+  }
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
+  float val_f = static_cast<float>(val);
+  uint32_t old = *address_as_ui;
+  uint32_t assumed;
+  if (((uintptr_t)address & 0x02) == 0) {
+    // The bfloat16 value stay at lower 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_max_to_low_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old & 0xFFFFu;
+    return ret;
+  } else {
+    // The bfloat16 value stay at higher 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_max_to_high_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old >> 16;
+    return ret;
+  }
+}
 
 // For atomicMin
 USE_CUDA_ATOMIC(Min, int);
@@ -658,38 +658,38 @@ inline static __device__ uint32_t bf16_min_to_high_half(uint32_t val, float x) {
   //return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
 }
 
-//CUDA_ATOMIC_WRAPPER(Min, phi::dtype::bfloat16) {
-//  if (*address <= val) {
-//    return *address;
-//  }
-//  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
-//      reinterpret_cast<char *>(address) -
-//      (reinterpret_cast<uintptr_t>(address) & 0x02));
-//  float val_f = static_cast<float>(val);
-//  uint32_t old = *address_as_ui;
-//  uint32_t assumed;
-//  if (((uintptr_t)address & 0x02) == 0) {
-//    // The bfloat16 value stay at lower 16 bits of the address.
-//    do {
-//      assumed = old;
-//      old = atomicCAS(
-//          address_as_ui, assumed, bf16_min_to_low_half(assumed, val_f));
-//    } while (old != assumed);
-//    phi::dtype::bfloat16 ret;
-//    ret.x = old & 0xFFFFu;
-//    return ret;
-//  } else {
-//    // The bfloat16 value stay at higher 16 bits of the address.
-//    do {
-//      assumed = old;
-//      old = atomicCAS(
-//          address_as_ui, assumed, bf16_min_to_high_half(assumed, val_f));
-//    } while (old != assumed);
-//    phi::dtype::bfloat16 ret;
-//    ret.x = old >> 16;
-//    return ret;
-//  }
-//}
+CUDA_ATOMIC_WRAPPER(Min, phi::dtype::bfloat16) {
+  if (*address <= val) {
+    return *address;
+  }
+  uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
+      reinterpret_cast<char *>(address) -
+      (reinterpret_cast<uintptr_t>(address) & 0x02));
+  float val_f = static_cast<float>(val);
+  uint32_t old = *address_as_ui;
+  uint32_t assumed;
+  if (((uintptr_t)address & 0x02) == 0) {
+    // The bfloat16 value stay at lower 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_min_to_low_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old & 0xFFFFu;
+    return ret;
+  } else {
+    // The bfloat16 value stay at higher 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(
+          address_as_ui, assumed, bf16_min_to_high_half(assumed, val_f));
+    } while (old != assumed);
+    phi::dtype::bfloat16 ret;
+    ret.x = old >> 16;
+    return ret;
+  }
+}
 
 #ifdef PADDLE_WITH_CUDA
 /*

diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h
@@ -1019,13 +1019,14 @@ struct is_pod<phi::dtype::float16> {
                             is_standard_layout<phi::dtype::float16>::value;
 };
 
-//template <>
-//struct is_floating_point<phi::dtype::float16>
-//    : std::integral_constant<
-//          bool,
-//          std::is_same<
-//              phi::dtype::float16,
-//              typename std::remove_cv<phi::dtype::float16>::type>::value> {};
+template <>
+struct is_floating_point<phi::dtype::float16>
+    : std::integral_constant<
+          bool,
+          std::is_same<
+              phi::dtype::float16,
+              typename std::remove_cv<phi::dtype::float16>::type>::value> {};
+
 template <>
 struct is_signed<phi::dtype::float16> {
   static const bool value = true;

diff --git a/paddle/phi/common/scalar.h b/paddle/phi/common/scalar.h
@@ -140,10 +140,10 @@ class ScalarBase {
         return static_cast<RT>(data_.f32);
       case DataType::FLOAT64:
         return static_cast<RT>(data_.f64);
-      //case DataType::FLOAT16:
-      //  return static_cast<RT>(data_.f16);
-      //case DataType::BFLOAT16:
-      //  return static_cast<RT>(data_.bf16);
+      case DataType::FLOAT16:
+        return static_cast<RT>(data_.f16);
+      case DataType::BFLOAT16:
+        return static_cast<RT>(data_.bf16);
       case DataType::INT32:
         return static_cast<RT>(data_.i32);
       case DataType::INT64:
@@ -162,10 +162,10 @@ class ScalarBase {
         return static_cast<RT>(data_.ui8);
       case DataType::BOOL:
         return static_cast<RT>(data_.b);
-      //case DataType::COMPLEX64:
-      //  return static_cast<RT>(data_.c64);
-      //case DataType::COMPLEX128:
-      //  return static_cast<RT>(data_.c128);
+      case DataType::COMPLEX64:
+        return static_cast<RT>(data_.c64);
+      case DataType::COMPLEX128:
+        return static_cast<RT>(data_.c128);
       default:
         PD_THROW("Invalid enum scalar data type `", dtype_, "`.");
     }

diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h
@@ -281,9 +281,17 @@ namespace phi {
       PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT16, int16_t, __VA_ARGS__) \
       PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT32, int32_t, __VA_ARGS__) \
       PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT64, int64_t, __VA_ARGS__) \
+      PD_PRIVATE_CASE_TYPE(                                                    \
+          NAME, ::phi::DataType::BFLOAT16, phi::bfloat16, __VA_ARGS__)         \
+      PD_PRIVATE_CASE_TYPE(                                                    \
+          NAME, ::phi::DataType::FLOAT16, phi::float16, __VA_ARGS__)           \
       PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::FLOAT32, float, __VA_ARGS__) \
       PD_PRIVATE_CASE_TYPE(                                                    \
           NAME, ::phi::DataType::FLOAT64, double, __VA_ARGS__)                 \
+      PD_PRIVATE_CASE_TYPE(                                                    \
+          NAME, ::phi::DataType::COMPLEX64, phi::complex64, __VA_ARGS__)       \
+      PD_PRIVATE_CASE_TYPE(                                                    \
+          NAME, ::phi::DataType::COMPLEX128, phi::complex128, __VA_ARGS__)     \
       default:                                                                 \
         PADDLE_THROW(phi::errors::InvalidArgument(                             \
             "Invalid enum data type `%d`.", static_cast<int>(__dtype__)));     \

diff --git a/paddle/phi/kernels/funcs/gather_scatter_functor.cu b/paddle/phi/kernels/funcs/gather_scatter_functor.cu
@@ -34,8 +34,7 @@ class ReduceAdd {
       typename tensor_t,
       std::enable_if_t<!std::is_same<tensor_t, uint8_t>::value>* = nullptr>
   __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const {
-    // TODO(@caizhi): enable cudaAtomicAdd
-    //phi::CudaAtomicAdd(self_data, *src_data);
+    phi::CudaAtomicAdd(self_data, *src_data);
   }
   template <typename tensor_t,
             std::enable_if_t<std::is_same<tensor_t, uint8_t>::value>* = nullptr>

diff --git a/paddle/phi/kernels/funcs/im2col.cu b/paddle/phi/kernels/funcs/im2col.cu
@@ -472,8 +472,7 @@ __global__ void col2imOCF(const T* col_data,
 
         if (height_offset >= 0 && height_offset < im_height &&
             width_offset >= 0 && width_offset < im_width) {
-	  // TODO(@caizhi): compile CudaAtomicAdd
-          //phi::CudaAtomicAdd(im_data + im_offset, col_data[col_offset]);
+          phi::CudaAtomicAdd(im_data + im_offset, col_data[col_offset]);
         }
       }
     }

diff --git a/paddle/phi/kernels/funcs/scatter.cu.h b/paddle/phi/kernels/funcs/scatter.cu.h
@@ -76,8 +76,7 @@ __global__ void ScatterCUDAKernel(const T* params,
     if (overwrite) {
       *(output + out_i) = *(params + i);
     } else {
-      // TODO(@caizhi): enable compiling cudaAtomicAdd
-      //phi::CudaAtomicAdd(output + out_i, *(params + i));
+      phi::CudaAtomicAdd(output + out_i, *(params + i));
     }
   }
 }
@@ -111,8 +110,7 @@ __global__ void ScatterNdCUDAKernel(const T* update,
       temp *= output_dims[j];
     }
     int64_t output_i = gather_i + slice_i;
-    // TODO(@caizhi): enable compiling cudaAtomicAdd
-    //phi::CudaAtomicAdd(output + output_i, *(update + i));
+    phi::CudaAtomicAdd(output + output_i, *(update + i));
   }
 }