diff --git a/codegen.cpp b/codegen.cpp
index f6e791f..f976af9 100644
--- a/codegen.cpp
+++ b/codegen.cpp
@@ -113,7 +113,11 @@ class CudaKernelGenerator : private OptInConstDispatch {
     // Shared memory
     if (has_dynamic_smem || has_reductions) {
       indent() << "alignas("
+#ifndef __HIP_PLATFORM_HCC__
                << dataTypeSize(kernel_summary.largest_smem_data_type)
+#else
+               << 8 // for HIP, we want 8-aligned even for smaller datatypes
+#endif
                << ") extern __shared__ char array[];\n";
 
       if (has_dynamic_smem) {
diff --git a/executor.cpp b/executor.cpp
index a0df3c7..2500381 100644
--- a/executor.cpp
+++ b/executor.cpp
@@ -25,9 +25,13 @@ int FusionExecutor::fusion_id_counter_ = 0;
 
 std::string FusionExecutor::getStructuredCode(const std::string& kernel) {
   // generating cuda code;
-  std::string code = std::string("namespace ") +
-      FusionExecutor::kernelNamespace() + " {\n" +
-      executor_utils::kernelPreamble() + kernel + "}\n";
+  std::string code = "";
+#ifdef __HIP_PLATFORM_HCC__
+  code += std::string("#include <hip/hip_runtime.h>\n") +
+      std::string("#include <hip/hip_fp16.h>\n");
+#endif
+  code += std::string("namespace ") + FusionExecutor::kernelNamespace() +
+      " {\n" + executor_utils::kernelPreamble() + kernel + "}\n";
 
   const char* debug_env = getenv("PYTORCH_CUDA_FUSER_DEBUG");
   if (debug_env && atoi(debug_env)) {
diff --git a/executor_utils.cpp b/executor_utils.cpp
index af4e127..19f873c 100644
--- a/executor_utils.cpp
+++ b/executor_utils.cpp
@@ -272,10 +272,14 @@ NvrtcFunction nvrtcCompile(
         at::globalContext().getNVRTC().nvrtcDestroyProgram(&program));
   });
 
+#ifdef __HIP_PLATFORM_HCC__
+  std::vector<const char*> args = {"--std=c++14"};
+#else
   const std::string compute = "--gpu-architecture=compute_" +
       std::to_string(major) + std::to_string(minor);
   std::vector<const char*> args = {
       "--std=c++14", compute.c_str(), "-default-device"};
+#endif
 
   const char* disable_fma = getenv("PYTORCH_CUDA_FUSER_DISABLE_FMA");
   // int disable_fma_flag = disable_fma ? atoi(disable_fma) : 0;
@@ -346,6 +350,7 @@ NvrtcFunction nvrtcCompile(
   // TODO: We do go through different code path, should investigate whether this
   // has an impact on generated binary.
   const char* prefix_env = getenv("PYTORCH_CUDA_FUSER_CUBIN");
+#ifndef __HIP_PLATFORM_HCC__
   if (prefix_env) {
     FUSER_PERF_SCOPE("load CUBIN");
 
@@ -403,6 +408,12 @@ NvrtcFunction nvrtcCompile(
         options.data(),
         option_vals.data()));
   }
+#else
+  // load ptx directly
+  AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuModuleLoadData(
+      &(compiled_kernel_.module), ptx.data()));
+
+#endif
   AT_CUDA_DRIVER_CHECK(at::globalContext().getNVRTC().cuModuleGetFunction(
       &(compiled_kernel_.function),
       compiled_kernel_.module,
diff --git a/kernel_resource_strings.h b/kernel_resource_strings.h
index d30eb3f..a601a95 100644
--- a/kernel_resource_strings.h
+++ b/kernel_resource_strings.h
@@ -12,7 +12,7 @@ typedef long long int int64_t;
 
 template <typename T, int N>
 struct Tensor {
-  T& operator[](int64_t ind) {
+  __device__ T& operator[](int64_t ind) {
     return data[ind];
   };
 
@@ -25,7 +25,7 @@ struct Tensor {
 // They will be an error as well since zero-length arrays are not allowed.
 template <typename T>
 struct Tensor<T, 0> {
-  T& operator[](int64_t) {
+  __device__ T& operator[](int64_t) {
     return *data;
   };
 
@@ -34,6 +34,9 @@ struct Tensor {
 )";
 
 // Code support for FP16 __half type and intrinsics
+#ifdef __HIP_PLATFORM_HCC__
+static auto code_fp16_support = R"()";
+#else
 static auto code_fp16_support = R"(
 #define __HALF_TO_US(var) *(reinterpret_cast<unsigned short*>(&(var)))
 #define __HALF_TO_CUS(var) *(reinterpret_cast<const unsigned short*>(&(var)))
@@ -55,7 +58,7 @@ __device__ float __half2float(const __half h) {
   return val;
 }
 )";
-
+#endif
 // struct and code for functions that need random number generation
 static auto code_random_number_gen = R"(
 class Philox {
@@ -184,6 +187,9 @@ __device__ float randLike(Philox rnd) {
 };
 )";
 
+// Note: We aggressively template functions taking dim3 in the functions below
+// because ROCM uses different types for the various dim3 and maps them
+// directly to intrinsics, but they're dim3 when used after modification.
 /*
  * EXAMPLE USAGE:
  * blockReduceSum<X_THREADS, Y_THREADS, Z_THREADS>
@@ -196,14 +202,14 @@ static auto code_template_block_reduction = R"(
 // participate, otherwise it is the number of threads. We could start with warp
 // reductions, then reduce the warps, this could save some shared memory, but
 // may actually be slower.
-template <bool X_REDUCE, bool Y_REDUCE, bool Z_REDUCE, typename T, typename Func>
+template <bool X_REDUCE, bool Y_REDUCE, bool Z_REDUCE, typename T, typename Func, typename _dim3ti, typename _dim3bd>
 __inline__ __device__ void blockReduce(
     T& out,
     const T inp_val,
     Func reduction_op,
-    const dim3& thread_idx,
-    const dim3& block_dim,
+    const _dim3ti& thread_idx,
+    const _dim3bd& block_dim,
     T* shared_mem,
     bool read_write_pred,
     T init_val) {
@@ -324,49 +330,47 @@ static auto code_template_grid_reduction = R"(
 namespace reduction {
 
 // Utility functions
-__host__ __device__ __forceinline__ size_t size(const dim3& d) {
+template <typename _dim3>
+__host__ __device__ __forceinline__ size_t size(const _dim3& d) {
   return (size_t)d.x * (size_t)d.y * (size_t)d.z;
 }
 
-__host__ __device__ __forceinline__ int isize(const dim3& d) {
-  return d.x * d.y * d.z;
-}
+#define isize(d) d.x * d.y * d.z
 
-__host__ __device__ __forceinline__ size_t offset(const dim3& pos, const dim3& dim) {
+template <typename _dim3pos, typename _dim3dim>
+__host__ __device__ __forceinline__ size_t offset(const _dim3pos& pos, const _dim3dim& dim) {
   return (size_t)pos.x + (size_t)pos.y * (size_t)dim.x + (size_t)pos.z * (size_t)dim.x * (size_t)dim.y;
 }
 
-__host__ __device__ __forceinline__ size_t ioffset(const dim3& pos, const dim3& dim) {
-  return pos.x + pos.y * dim.x + pos.z * dim.x * dim.y;
-}
+#define ioffset(pos, dim) pos.x + pos.y * dim.x + pos.z * dim.x * dim.y
 
 // Returns dim3 of each reduction segment.
-template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
-__host__ __device__ dim3 dimension_of_reduction_segment(const dim3& grid_dim) {
+template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
+__host__ __device__ dim3 dimension_of_reduction_segment(const _dim3& grid_dim) {
   return dim3{X_BLOCK ? grid_dim.x : 1,
               Y_BLOCK ? grid_dim.y : 1,
               Z_BLOCK ? grid_dim.z : 1};
 }
 
 // Returns the number of blocks in each reduction segment.
-template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
-__host__ __device__ size_t size_of_reduction_segment(const dim3& grid_dim) {
+template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
+__host__ __device__ size_t size_of_reduction_segment(const _dim3& grid_dim) {
   return size(dimension_of_reduction_segment<X_BLOCK, Y_BLOCK, Z_BLOCK>(grid_dim));
 }
 
 // Returns the total number of reduction segments.
-template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
-__host__ __device__ size_t number_of_reduction_segments(const dim3& grid_dim) {
+template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3>
+__host__ __device__ size_t number_of_reduction_segments(const _dim3& grid_dim) {
   return (X_BLOCK ? 1 : grid_dim.x) * (Y_BLOCK ? 1 : grid_dim.y) *
       (Z_BLOCK ? 1 : grid_dim.z);
 }
 
 // Returns the 1-D index of the segment of thread block of block_idx.
-template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
-__host__ __device__ size_t index_of_reduction_segment(const dim3& block_idx,
-                                                      const dim3& grid_dim) {
+template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3bi, typename _dim3gd>
+__host__ __device__ size_t index_of_reduction_segment(const _dim3bi& block_idx,
+                                                      const _dim3gd& grid_dim) {
   size_t seg_idx = 0;
   if (!Z_BLOCK)
     seg_idx += block_idx.z;
@@ -378,9 +382,9 @@ __host__ __device__ size_t index_of_reduction_segment(const dim3& block_idx,
 }
 
 // Returns the offset of thread block in its reduction segment.
-template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK>
-__host__ __device__ size_t offset_in_reduction_segment(const dim3& block_idx,
-                                                       const dim3& grid_dim) {
+template <bool X_BLOCK, bool Y_BLOCK, bool Z_BLOCK, typename _dim3bi, typename _dim3gd>
+__host__ __device__ size_t offset_in_reduction_segment(const _dim3bi& block_idx,
+                                                       const _dim3gd& grid_dim) {
   size_t offset = 0;
   if (Z_BLOCK)
     offset = offset * grid_dim.z + block_idx.z;
@@ -392,23 +396,24 @@ __host__ __device__ size_t offset_in_reduction_segment(const dim3& block_idx,
 }
 
 // Returns dim3 of each reduction block.
-template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD>
-__host__ __device__ dim3 dimension_of_reduction_block(const dim3& block_dim) {
+template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename _dim3>
+__host__ __device__ dim3 dimension_of_reduction_block(const _dim3& block_dim) {
   return dim3{X_THREAD ? block_dim.x : 1,
               Y_THREAD ? block_dim.y : 1,
               Z_THREAD ? block_dim.z : 1};
 }
 
 // Returns the number of threads of each reduction block.
-template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD>
-__host__ __device__ int size_of_reduction_block(const dim3& block_dim) {
-  return isize(dimension_of_reduction_block<X_THREAD, Y_THREAD, Z_THREAD>(block_dim));
+template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename _dim3>
+__host__ __device__ int size_of_reduction_block(const _dim3& block_dim) {
+  auto tmp_dim = dimension_of_reduction_block<X_THREAD, Y_THREAD, Z_THREAD>(block_dim);
+  return isize(tmp_dim);
 }
 
 // Returns the linear offset of a thread in a reduction block.
-template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD>
-__host__ __device__ int offset_in_reduction_block(const dim3& thread_idx,
-                                                  const dim3& block_dim) {
+template <bool X_THREAD, bool Y_THREAD, bool Z_THREAD, typename _dim3ti, typename _dim3bd>
+__host__ __device__ int offset_in_reduction_block(const _dim3ti& thread_idx,
+                                                  const _dim3bd& block_dim) {
   int offset = 0;
   if (Z_THREAD)
     offset += thread_idx.z;
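
A note on the dim3 templating introduced above: per the comment added by the patch, ROCm uses
different builtin types for threadIdx/blockIdx/blockDim/gridDim and maps them directly to
intrinsics, so a helper declared with a const dim3& parameter cannot take those builtins directly
under HIP. Templating the helpers on the dim3-like type lets the generated kernel pass through
whatever type the platform provides. Below is a minimal sketch of the technique, not code from
the patch; the standalone size helper mirrors reduction::size, while example_kernel is an
illustrative name.

#include <cstddef>

// Accepts any dim3-like type, i.e. anything with .x/.y/.z members, instead of
// requiring a dim3 lvalue (same shape as the templated reduction::size above).
template <typename _dim3>
__host__ __device__ __forceinline__ size_t size(const _dim3& d) {
  return (size_t)d.x * (size_t)d.y * (size_t)d.z;
}

// Hypothetical kernel showing the call site: under nvcc, blockDim is a plain
// dim3; under hipcc, it is a ROCm-specific builtin coordinate type. Template
// argument deduction picks up either one without any conversion.
__global__ void example_kernel(size_t* out) {
  if (threadIdx.x == 0 && blockIdx.x == 0) {
    *out = size(blockDim);
  }
}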