Skip to content

Commit

Permalink
Merge pull request PaddlePaddle#27 from mthreads/fix_mcc
Browse files Browse the repository at this point in the history
[MTAI-484] fix(build): fix CudaAtomicAdd bug
  • Loading branch information
caizhi-mt authored and mt-robot committed Aug 10, 2023
2 parents 53539d2 + 25e7ee4 commit 8c1ef73
Show file tree
Hide file tree
Showing 10 changed files with 153 additions and 157 deletions.
3 changes: 3 additions & 0 deletions cmake/musa.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ endif()

list(APPEND MUSA_MCC_FLAGS --cuda-gpu-arch=mp_21)
list(APPEND MUSA_MCC_FLAGS -U__CUDA__)
# MUSA has compile conflicts of float16.h as platform::float16 overload std::is_floating_point and std::is_integer
list(APPEND MUSA_MCC_FLAGS -D__MUSA_NO_HALF_CONVERSIONS__)

#set(MUSA_VERBOSE_BUILD ON)
if(CMAKE_BUILD_TYPE MATCHES Debug)
list(APPEND MUSA_MCC_FLAGS -g2)
Expand Down
220 changes: 110 additions & 110 deletions paddle/phi/backends/gpu/gpu_primitives.h
Original file line number Diff line number Diff line change
Expand Up @@ -266,54 +266,54 @@ CUDA_ATOMIC_WRAPPER(Add, phi::dtype::bfloat16) {
PDBF16ToCUDABF16(val)));
}
#else
//CUDA_ATOMIC_WRAPPER(Add, phi::dtype::bfloat16) {
// // concrete packed bfloat16 value may exsits in lower or higher 16bits
// // of the 32bits address.
// uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
// reinterpret_cast<char *>(address) -
// (reinterpret_cast<uintptr_t>(address) & 0x02));
// float val_f = static_cast<float>(val);
// uint32_t old = *address_as_ui;
// uint32_t sum;
// uint32_t newval;
// uint32_t assumed;
// if (((uintptr_t)address & 0x02) == 0) {
// // the bfloat16 value stay at lower 16 bits of the address.
// do {
// assumed = old;
// old = atomicCAS(
// address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f));
// } while (old != assumed);
// phi::dtype::bfloat16 ret;
// ret.x = old & 0xFFFFu;
// return ret;
// } else {
// // the bfloat16 value stay at higher 16 bits of the address.
// do {
// assumed = old;
// old = atomicCAS(
// address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f));
// } while (old != assumed);
// phi::dtype::bfloat16 ret;
// ret.x = old >> 16;
// return ret;
// }
//}
CUDA_ATOMIC_WRAPPER(Add, phi::dtype::bfloat16) {
// concrete packed bfloat16 value may exsits in lower or higher 16bits
// of the 32bits address.
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t sum;
uint32_t newval;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// the bfloat16 value stay at lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(
address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f));
} while (old != assumed);
phi::dtype::bfloat16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// the bfloat16 value stay at higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(
address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f));
} while (old != assumed);
phi::dtype::bfloat16 ret;
ret.x = old >> 16;
return ret;
}
}
#endif

//CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
// float *real = reinterpret_cast<float *>(address);
// float *imag = real + 1;
// return complex<float>(CudaAtomicAdd(real, val.real),
// CudaAtomicAdd(imag, val.imag));
//}
//
//CUDA_ATOMIC_WRAPPER(Add, complex<double>) {
// double *real = reinterpret_cast<double *>(address);
// double *imag = real + 1;
// return complex<double>(CudaAtomicAdd(real, val.real),
// CudaAtomicAdd(imag, val.imag));
//}
CUDA_ATOMIC_WRAPPER(Add, complex<float>) {
float *real = reinterpret_cast<float *>(address);
float *imag = real + 1;
return complex<float>(CudaAtomicAdd(real, val.real),
CudaAtomicAdd(imag, val.imag));
}

CUDA_ATOMIC_WRAPPER(Add, complex<double>) {
double *real = reinterpret_cast<double *>(address);
double *imag = real + 1;
return complex<double>(CudaAtomicAdd(real, val.real),
CudaAtomicAdd(imag, val.imag));
}

// For atomicMax
USE_CUDA_ATOMIC(Max, int);
Expand Down Expand Up @@ -470,38 +470,38 @@ inline static __device__ uint32_t bf16_max_to_high_half(uint32_t val, float x) {
//return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}

//CUDA_ATOMIC_WRAPPER(Max, phi::dtype::bfloat16) {
// if (*address >= val) {
// return *address;
// }
// uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
// reinterpret_cast<char *>(address) -
// (reinterpret_cast<uintptr_t>(address) & 0x02));
// float val_f = static_cast<float>(val);
// uint32_t old = *address_as_ui;
// uint32_t assumed;
// if (((uintptr_t)address & 0x02) == 0) {
// // The bfloat16 value stay at lower 16 bits of the address.
// do {
// assumed = old;
// old = atomicCAS(
// address_as_ui, assumed, bf16_max_to_low_half(assumed, val_f));
// } while (old != assumed);
// phi::dtype::bfloat16 ret;
// ret.x = old & 0xFFFFu;
// return ret;
// } else {
// // The bfloat16 value stay at higher 16 bits of the address.
// do {
// assumed = old;
// old = atomicCAS(
// address_as_ui, assumed, bf16_max_to_high_half(assumed, val_f));
// } while (old != assumed);
// phi::dtype::bfloat16 ret;
// ret.x = old >> 16;
// return ret;
// }
//}
CUDA_ATOMIC_WRAPPER(Max, phi::dtype::bfloat16) {
if (*address >= val) {
return *address;
}
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// The bfloat16 value stay at lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(
address_as_ui, assumed, bf16_max_to_low_half(assumed, val_f));
} while (old != assumed);
phi::dtype::bfloat16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// The bfloat16 value stay at higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(
address_as_ui, assumed, bf16_max_to_high_half(assumed, val_f));
} while (old != assumed);
phi::dtype::bfloat16 ret;
ret.x = old >> 16;
return ret;
}
}

// For atomicMin
USE_CUDA_ATOMIC(Min, int);
Expand Down Expand Up @@ -658,38 +658,38 @@ inline static __device__ uint32_t bf16_min_to_high_half(uint32_t val, float x) {
//return (val & 0xFFFFu) | (static_cast<uint32_t>(high_half.x) << 16);
}

//CUDA_ATOMIC_WRAPPER(Min, phi::dtype::bfloat16) {
// if (*address <= val) {
// return *address;
// }
// uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
// reinterpret_cast<char *>(address) -
// (reinterpret_cast<uintptr_t>(address) & 0x02));
// float val_f = static_cast<float>(val);
// uint32_t old = *address_as_ui;
// uint32_t assumed;
// if (((uintptr_t)address & 0x02) == 0) {
// // The bfloat16 value stay at lower 16 bits of the address.
// do {
// assumed = old;
// old = atomicCAS(
// address_as_ui, assumed, bf16_min_to_low_half(assumed, val_f));
// } while (old != assumed);
// phi::dtype::bfloat16 ret;
// ret.x = old & 0xFFFFu;
// return ret;
// } else {
// // The bfloat16 value stay at higher 16 bits of the address.
// do {
// assumed = old;
// old = atomicCAS(
// address_as_ui, assumed, bf16_min_to_high_half(assumed, val_f));
// } while (old != assumed);
// phi::dtype::bfloat16 ret;
// ret.x = old >> 16;
// return ret;
// }
//}
CUDA_ATOMIC_WRAPPER(Min, phi::dtype::bfloat16) {
if (*address <= val) {
return *address;
}
uint32_t *address_as_ui = reinterpret_cast<uint32_t *>(
reinterpret_cast<char *>(address) -
(reinterpret_cast<uintptr_t>(address) & 0x02));
float val_f = static_cast<float>(val);
uint32_t old = *address_as_ui;
uint32_t assumed;
if (((uintptr_t)address & 0x02) == 0) {
// The bfloat16 value stay at lower 16 bits of the address.
do {
assumed = old;
old = atomicCAS(
address_as_ui, assumed, bf16_min_to_low_half(assumed, val_f));
} while (old != assumed);
phi::dtype::bfloat16 ret;
ret.x = old & 0xFFFFu;
return ret;
} else {
// The bfloat16 value stay at higher 16 bits of the address.
do {
assumed = old;
old = atomicCAS(
address_as_ui, assumed, bf16_min_to_high_half(assumed, val_f));
} while (old != assumed);
phi::dtype::bfloat16 ret;
ret.x = old >> 16;
return ret;
}
}

#ifdef PADDLE_WITH_CUDA
/*
Expand Down
15 changes: 8 additions & 7 deletions paddle/phi/common/float16.h
Original file line number Diff line number Diff line change
Expand Up @@ -1019,13 +1019,14 @@ struct is_pod<phi::dtype::float16> {
is_standard_layout<phi::dtype::float16>::value;
};

//template <>
//struct is_floating_point<phi::dtype::float16>
// : std::integral_constant<
// bool,
// std::is_same<
// phi::dtype::float16,
// typename std::remove_cv<phi::dtype::float16>::type>::value> {};
template <>
struct is_floating_point<phi::dtype::float16>
: std::integral_constant<
bool,
std::is_same<
phi::dtype::float16,
typename std::remove_cv<phi::dtype::float16>::type>::value> {};

template <>
struct is_signed<phi::dtype::float16> {
static const bool value = true;
Expand Down
16 changes: 8 additions & 8 deletions paddle/phi/common/scalar.h
Original file line number Diff line number Diff line change
Expand Up @@ -140,10 +140,10 @@ class ScalarBase {
return static_cast<RT>(data_.f32);
case DataType::FLOAT64:
return static_cast<RT>(data_.f64);
//case DataType::FLOAT16:
// return static_cast<RT>(data_.f16);
//case DataType::BFLOAT16:
// return static_cast<RT>(data_.bf16);
case DataType::FLOAT16:
return static_cast<RT>(data_.f16);
case DataType::BFLOAT16:
return static_cast<RT>(data_.bf16);
case DataType::INT32:
return static_cast<RT>(data_.i32);
case DataType::INT64:
Expand All @@ -162,10 +162,10 @@ class ScalarBase {
return static_cast<RT>(data_.ui8);
case DataType::BOOL:
return static_cast<RT>(data_.b);
//case DataType::COMPLEX64:
// return static_cast<RT>(data_.c64);
//case DataType::COMPLEX128:
// return static_cast<RT>(data_.c128);
case DataType::COMPLEX64:
return static_cast<RT>(data_.c64);
case DataType::COMPLEX128:
return static_cast<RT>(data_.c128);
default:
PD_THROW("Invalid enum scalar data type `", dtype_, "`.");
}
Expand Down
8 changes: 8 additions & 0 deletions paddle/phi/core/visit_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -281,9 +281,17 @@ namespace phi {
PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT16, int16_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT32, int32_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::INT64, int64_t, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::phi::DataType::BFLOAT16, phi::bfloat16, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::phi::DataType::FLOAT16, phi::float16, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE(NAME, ::phi::DataType::FLOAT32, float, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::phi::DataType::FLOAT64, double, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::phi::DataType::COMPLEX64, phi::complex64, __VA_ARGS__) \
PD_PRIVATE_CASE_TYPE( \
NAME, ::phi::DataType::COMPLEX128, phi::complex128, __VA_ARGS__) \
default: \
PADDLE_THROW(phi::errors::InvalidArgument( \
"Invalid enum data type `%d`.", static_cast<int>(__dtype__))); \
Expand Down
3 changes: 1 addition & 2 deletions paddle/phi/kernels/funcs/gather_scatter_functor.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,7 @@ class ReduceAdd {
typename tensor_t,
std::enable_if_t<!std::is_same<tensor_t, uint8_t>::value>* = nullptr>
__device__ void operator()(tensor_t* self_data, tensor_t* src_data) const {
// TODO(@caizhi): enable cudaAtomicAdd
//phi::CudaAtomicAdd(self_data, *src_data);
phi::CudaAtomicAdd(self_data, *src_data);
}
template <typename tensor_t,
std::enable_if_t<std::is_same<tensor_t, uint8_t>::value>* = nullptr>
Expand Down
3 changes: 1 addition & 2 deletions paddle/phi/kernels/funcs/im2col.cu
Original file line number Diff line number Diff line change
Expand Up @@ -472,8 +472,7 @@ __global__ void col2imOCF(const T* col_data,

if (height_offset >= 0 && height_offset < im_height &&
width_offset >= 0 && width_offset < im_width) {
// TODO(@caizhi): compile CudaAtomicAdd
//phi::CudaAtomicAdd(im_data + im_offset, col_data[col_offset]);
phi::CudaAtomicAdd(im_data + im_offset, col_data[col_offset]);
}
}
}
Expand Down
6 changes: 2 additions & 4 deletions paddle/phi/kernels/funcs/scatter.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,7 @@ __global__ void ScatterCUDAKernel(const T* params,
if (overwrite) {
*(output + out_i) = *(params + i);
} else {
// TODO(@caizhi): enable compiling cudaAtomicAdd
//phi::CudaAtomicAdd(output + out_i, *(params + i));
phi::CudaAtomicAdd(output + out_i, *(params + i));
}
}
}
Expand Down Expand Up @@ -111,8 +110,7 @@ __global__ void ScatterNdCUDAKernel(const T* update,
temp *= output_dims[j];
}
int64_t output_i = gather_i + slice_i;
// TODO(@caizhi): enable compiling cudaAtomicAdd
//phi::CudaAtomicAdd(output + output_i, *(update + i));
phi::CudaAtomicAdd(output + output_i, *(update + i));
}
}

Expand Down
Loading

0 comments on commit 8c1ef73

Please sign in to comment.