Enable CUDA Fuser for ROCm (#45965)
Summary:
This enables the CUDA fuser on ROCm and enables the corresponding tests.

Part of this patch is based on the work of Rohith Nallamaddi, thank you.
Any errors are my own, of course.

Pull Request resolved: pytorch/pytorch#45965

Reviewed By: seemethere

Differential Revision: D24170457

Pulled By: walterddr

fbshipit-source-id: 3dd25b3501a41d2f00acba3ce8642ce51c49c9a6
t-vi authored and facebook-github-bot committed Oct 8, 2020
1 parent e29e50e commit 8ace1ef
Showing 4 changed files with 61 additions and 37 deletions.
codegen.cpp: 4 additions & 0 deletions
@@ -113,7 +113,11 @@ class CudaKernelGenerator : private OptInConstDispatch {
// Shared memory
if (has_dynamic_smem || has_reductions) {
indent() << "alignas("
#ifndef __HIP_PLATFORM_HCC__
<< dataTypeSize(kernel_summary.largest_smem_data_type)
#else
<< 8 // for HIP, we want 8-aligned even for smaller datatypes
#endif
<< ") extern __shared__ char array[];\n";

if (has_dynamic_smem) {
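Illustrative sketch (not from the patch; smemPreamble and its arguments are invented for this example): how the emitted shared-memory declaration differs per platform, assuming the largest shared-memory data type is a 4-byte float.

#include <cstddef>
#include <iostream>
#include <string>

// On CUDA the dynamic shared-memory array is aligned to the largest
// shared-memory data type in the kernel; on ROCm the alignment is pinned to 8
// even for smaller data types.
std::string smemPreamble(size_t largest_smem_type_size, bool is_hip) {
  size_t align = is_hip ? 8 : largest_smem_type_size;
  return "alignas(" + std::to_string(align) +
         ") extern __shared__ char array[];\n";
}

int main() {
  std::cout << smemPreamble(4, /*is_hip=*/false);  // alignas(4) ... (CUDA)
  std::cout << smemPreamble(4, /*is_hip=*/true);   // alignas(8) ... (ROCm)
  return 0;
}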
executor.cpp: 7 additions & 3 deletions
@@ -25,9 +25,13 @@ int FusionExecutor::fusion_id_counter_ = 0;

std::string FusionExecutor::getStructuredCode(const std::string& kernel) {
// generating cuda code;
- std::string code = std::string("namespace ") +
- FusionExecutor::kernelNamespace() + " {\n" +
- executor_utils::kernelPreamble() + kernel + "}\n";
+ std::string code = "";
+ #ifdef __HIP_PLATFORM_HCC__
+ code += std::string("#include <hip/hip_runtime.h>\n") +
+ std::string("#include <hip/hip_fp16.h>\n");
+ #endif
+ code += std::string("namespace ") + FusionExecutor::kernelNamespace() +
+ " {\n" + executor_utils::kernelPreamble() + kernel + "}\n";

const char* debug_env = getenv("PYTORCH_CUDA_FUSER_DEBUG");
if (debug_env && atoi(debug_env)) {
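Illustrative sketch (not from the patch; structuredCode and its parameters are placeholders for FusionExecutor::getStructuredCode, kernelNamespace, and kernelPreamble, and "CudaCodeGen" is only an example namespace string): on the ROCm path the HIP runtime and FP16 headers are prepended before the wrapped kernel source.

#include <iostream>
#include <string>

std::string structuredCode(const std::string& ns,        // kernelNamespace() stand-in
                           const std::string& preamble,  // kernelPreamble() stand-in
                           const std::string& kernel,
                           bool is_hip) {
  std::string code;
  if (is_hip) {
    // HIP needs its runtime and fp16 headers included explicitly in the
    // generated source before it is handed to the runtime compiler.
    code += "#include <hip/hip_runtime.h>\n#include <hip/hip_fp16.h>\n";
  }
  code += "namespace " + ns + " {\n" + preamble + kernel + "}\n";
  return code;
}

int main() {
  std::cout << structuredCode("CudaCodeGen", "/* preamble */\n",
                              "__global__ void kernel1() {}\n", /*is_hip=*/true);
  return 0;
}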
executor_utils.cpp: 11 additions & 0 deletions
@@ -272,10 +272,14 @@ NvrtcFunction nvrtcCompile(
at::globalContext().getNVRTC().nvrtcDestroyProgram(&program));
});

#ifdef __HIP_PLATFORM_HCC__
std::vector<const char*> args = {"--std=c++14"};
#else
const std::string compute = "--gpu-architecture=compute_" +
std::to_string(major) + std::to_string(minor);
std::vector<const char*> args = {
"--std=c++14", compute.c_str(), "-default-device"};
#endif

const char* disable_fma = getenv("PYTORCH_CUDA_FUSER_DISABLE_FMA");
// int disable_fma_flag = disable_fma ? atoi(disable_fma) : 0;
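Illustrative sketch of the option-list split above (not from the patch; rtcArgs is an invented helper): the ROCm/hiprtc path requests only the C++ standard, while the CUDA/NVRTC path also passes the target architecture and -default-device.

#include <string>
#include <vector>

std::vector<std::string> rtcArgs(int major, int minor, bool is_hip) {
  if (is_hip) {
    return {"--std=c++14"};  // ROCm: no --gpu-architecture flag
  }
  return {"--std=c++14",
          "--gpu-architecture=compute_" + std::to_string(major) +
              std::to_string(minor),
          "-default-device"};
}

int main() {
  // e.g. a compute capability 7.0 device on the CUDA path
  auto args = rtcArgs(7, 0, /*is_hip=*/false);
  return args.size() == 3 ? 0 : 1;
}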
@@ -346,6 +350,7 @@ NvrtcFunction nvrtcCompile(
// TODO: We do go through different code path, should investigate whether this
// has an impact on generated binary.
const char* prefix_env = getenv("PYTORCH_CUDA_FUSER_CUBIN");
#ifndef __HIP_PLATFORM_HCC__
if (prefix_env) {
FUSER_PERF_SCOPE("load CUBIN");

@@ -403,6 +408,12 @@ NvrtcFunction nvrtcCompile(
options.data(),
option_vals.data()));
}
#else
// load ptx directly
AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuModuleLoadData(
&(compiled_kernel_.module), ptx.data()));

#endif
AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuModuleGetFunction(
&(compiled_kernel_.function),
compiled_kernel_.module,
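For context, a minimal driver-API sketch of what this branch boils down to (an assumption-laden illustration, not PyTorch's wrapper code: error handling is reduced to asserts, and on ROCm the analogous hip* entry points would be used): the compiled image held in ptx is loaded straight from memory, skipping the PYTORCH_CUDA_FUSER_CUBIN dump/reload path.

#include <cassert>
#include <vector>
#include <cuda.h>

// Load a compiled image (e.g. PTX) from memory and look up a kernel by name.
CUfunction loadKernelFromImage(const std::vector<char>& image, const char* name) {
  CUdevice device;
  CUcontext context;
  CUmodule module;
  CUfunction function;
  assert(cuInit(0) == CUDA_SUCCESS);
  assert(cuDeviceGet(&device, 0) == CUDA_SUCCESS);
  assert(cuCtxCreate(&context, 0, device) == CUDA_SUCCESS);
  assert(cuModuleLoadData(&module, image.data()) == CUDA_SUCCESS);  // in-memory load
  assert(cuModuleGetFunction(&function, module, name) == CUDA_SUCCESS);
  return function;
}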
kernel_resource_strings.h: 39 additions & 34 deletions
@@ -12,7 +12,7 @@ typedef long long int int64_t;
template<typename T, int N>
struct Tensor {
- T& operator[](int64_t ind) {
+ __device__ T& operator[](int64_t ind) {
return data[ind];
};
@@ -25,7 +25,7 @@ struct Tensor {
// They will be an error as well since zero-length arrays are not allowed.
template<typename T>
struct Tensor<T, 0> {
- T& operator[](int64_t) {
+ __device__ T& operator[](int64_t) {
return *data;
};
@@ -34,6 +34,9 @@ struct Tensor<T, 0> {
)";

// Code support for FP16 __half type and intrinsics
#ifdef __HIP_PLATFORM_HCC__
static auto code_fp16_support = R"()";
#else
static auto code_fp16_support = R"(
#define __HALF_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
#define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
@@ -55,7 +58,7 @@ __device__ float __half2float(const __half h) {
return val;
}
)";

#endif
// struct and code for functions that need random number generation
static auto code_random_number_gen = R"(
class Philox {
@@ -184,6 +187,9 @@ __device__ float randLike(Philox rnd) {
};
)";

// Note: We aggressively template functions taking dim3 in the functions below
// because ROCm uses different types for the various dim3 and maps them
// directly to intrinsics, but they're dim3 when used after modification.
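To make the note above concrete, here is an illustrative host-side sketch (not from the patch; MockDim3 and MockHipCoords are invented stand-ins for dim3 and ROCm's builtin coordinate objects): a helper templated on the coordinate type binds to either, which is what the _dim3* template parameters below achieve.

#include <cstddef>
#include <cstdio>

// Invented stand-ins: same .x/.y/.z members, but distinct types, mirroring how
// ROCm's builtin blockDim/threadIdx are not plain dim3.
struct MockDim3 { unsigned x, y, z; };
struct MockHipCoords { unsigned x, y, z; };

// Templating on the coordinate type lets one helper accept both without
// forcing a conversion to dim3.
template <typename Dim3Like>
size_t size3(const Dim3Like& d) {
  return (size_t)d.x * (size_t)d.y * (size_t)d.z;
}

int main() {
  MockDim3 modified_dim{32, 4, 1};      // a plain dim3-like value after modification
  MockHipCoords builtin_dim{32, 4, 1};  // a builtin coordinate object
  std::printf("%zu %zu\n", size3(modified_dim), size3(builtin_dim));
  return 0;
}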
/*
* EXAMPLE USAGE:
* blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS>
@@ -196,14 +202,14 @@ static auto code_template_block_reduction = R"(
// participate, otherwise it is the number of threads. We could start with warp
// reductions, then reduce the warps, this could save some shared memory, but
// may actually be slower.
- template<bool X_REDUCE, bool Y_REDUCE, bool Z_REDUCE, typename T, typename Func>
+ template<bool X_REDUCE, bool Y_REDUCE, bool Z_REDUCE, typename T, typename Func, typename _dim3ti, typename _dim3bd>
__inline__ __device__
void blockReduce(
T& out,
const T inp_val,
Func reduction_op,
- const dim3& thread_idx,
- const dim3& block_dim,
+ const _dim3ti& thread_idx,
+ const _dim3bd& block_dim,
T* shared_mem,
bool read_write_pred,
T init_val) {
@@ -324,49 +330,47 @@ static auto code_template_grid_reduction = R"(
namespace reduction {
// Utility functions
- __host__ __device__ __forceinline__ size_t size(const dim3& d) {
+ template<typename _dim3>
+ __host__ __device__ __forceinline__ size_t size(const _dim3& d) {
return (size_t)d.x * (size_t)d.y * (size_t)d.z;
}
- __host__ __device__ __forceinline__ int isize(const dim3& d) {
- return d.x * d.y * d.z;
- }
+ #define isize(d) d.x * d.y * d.z
- __host__ __device__ __forceinline__ size_t offset(const dim3& pos, const dim3& dim) {
+ template<typename _dim3pos, typename _dim3dim>
+ __host__ __device__ __forceinline__ size_t offset(const _dim3pos& pos, const _dim3dim& dim) {
return (size_t)pos.x + (size_t)pos.y * (size_t)dim.x +
(size_t)pos.z * (size_t)dim.x * (size_t)dim.y;
}
- __host__ __device__ __forceinline__ size_t ioffset(const dim3& pos, const dim3& dim) {
- return pos.x + pos.y * dim.x + pos.z * dim.x * dim.y;
- }
+ #define ioffset(pos, dim) pos.x + pos.y * dim.x + pos.z * dim.x * dim.y
// Returns dim3 of each reduction segment.
- template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
- __host__ __device__ dim3 dimension_of_reduction_segment(const dim3& grid_dim) {
+ template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
+ __host__ __device__ dim3 dimension_of_reduction_segment(const _dim3& grid_dim) {
return dim3{X_BLOCK ? grid_dim.x : 1,
Y_BLOCK ? grid_dim.y : 1,
Z_BLOCK ? grid_dim.z : 1};
}
// Returns the number of blocks in each reduction segment.
- template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
- __host__ __device__ size_t size_of_reduction_segment(const dim3& grid_dim) {
+ template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
+ __host__ __device__ size_t size_of_reduction_segment(const _dim3& grid_dim) {
return size(dimension_of_reduction_segment<X_BLOCK, Y_BLOCK, Z_BLOCK>(grid_dim));
}
// Returns the total number of reduction segments.
- template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
- __host__ __device__ size_t number_of_reduction_segments(const dim3& grid_dim) {
+ template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
+ __host__ __device__ size_t number_of_reduction_segments(const _dim3& grid_dim) {
return (X_BLOCK ? 1: grid_dim.x) *
(Y_BLOCK ? 1 : grid_dim.y) *
(Z_BLOCK ? 1 : grid_dim.z);
}
// Returns the 1-D index of the segment of thread block of block_idx.
- template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
- __host__ __device__ size_t index_of_reduction_segment(const dim3& block_idx,
- const dim3& grid_dim) {
+ template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3bi, typename _dim3gd>
+ __host__ __device__ size_t index_of_reduction_segment(const _dim3bi& block_idx,
+ const _dim3gd& grid_dim) {
size_t seg_idx = 0;
if (!Z_BLOCK)
seg_idx += block_idx.z;
@@ -378,9 +382,9 @@ __host__ __device__ size_t index_of_reduction_segment(const dim3& block_idx,
}
// Returns the offset of thread block in its reduction segment.
- template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
- __host__ __device__ size_t offset_in_reduction_segment(const dim3& block_idx,
- const dim3& grid_dim) {
+ template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3bi, typename _dim3gd>
+ __host__ __device__ size_t offset_in_reduction_segment(const _dim3bi& block_idx,
+ const _dim3gd& grid_dim) {
size_t offset = 0;
if (Z_BLOCK)
offset = offset * grid_dim.z + block_idx.z;
@@ -392,23 +396,24 @@ __host__ __device__ size_t offset_in_reduction_segment(const dim3& block_idx,
}
// Returns dim3 of each reduction block.
- template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD>
- __host__ __device__ dim3 dimension_of_reduction_block(const dim3& block_dim) {
+ template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename _dim3>
+ __host__ __device__ dim3 dimension_of_reduction_block(const _dim3& block_dim) {
return dim3{X_THREAD ? block_dim.x : 1,
Y_THREAD ? block_dim.y : 1,
Z_THREAD ? block_dim.z : 1};
}
// Returns the number of threads of each reduction block.
- template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD>
- __host__ __device__ int size_of_reduction_block(const dim3& block_dim) {
- return isize(dimension_of_reduction_block<X_THREAD, Y_THREAD, Z_THREAD>(block_dim));
+ template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename _dim3>
+ __host__ __device__ int size_of_reduction_block(const _dim3& block_dim) {
+ auto tmp_dim = dimension_of_reduction_block<X_THREAD, Y_THREAD, Z_THREAD>(block_dim);
+ return isize(tmp_dim);
}
// Returns the linear offset of a thread in a reduction block.
- template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD>
- __host__ __device__ int offset_in_reduction_block(const dim3& thread_idx,
- const dim3& block_dim) {
+ template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename _dim3ti, typename _dim3bd>
+ __host__ __device__ int offset_in_reduction_block(const _dim3ti& thread_idx,
+ const _dim3bd& block_dim) {
int offset = 0;
if (Z_THREAD)
offset += thread_idx.z;
