
[SYCL] Generalize local accessor to shared mem pass #5149

Merged: 8 commits, Jan 20, 2022
5 changes: 5 additions & 0 deletions clang/lib/Driver/ToolChains/Clang.cpp
@@ -5787,6 +5787,11 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
CmdArgs.push_back("-treat-scalable-fixed-error-as-warning");
}

// Enable local accessor to shared memory pass for SYCL.
if (isa<BackendJobAction>(JA) && IsSYCL) {
Contributor

@mdtoguchi, shouldn't the condition check IsSYCLOffloadDevice instead of IsSYCL?
abi/user_mangling.cpp and regression/fsycl-save-temps.cpp from the check-sycl suite fail on my system.

Contributor

Ah, yes. If this is only to be set for device compilation, then it should use IsSYCLOffloadDevice; IsSYCL is true for all compilations (host and device) when SYCL device offloading is enabled.
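A minimal sketch of the distinction being discussed. The helper name and boolean parameters are illustrative, not the actual Clang driver API; the point is simply that the flag should be gated on the device-only predicate, not the one that is true for both halves of a -fsycl compilation:

```cpp
#include <cassert>

// Illustrative model: in a -fsycl build, "IsSYCL" is true for both the host
// and the device compilation, while "IsSYCLOffloadDevice" is true only for
// the device side. The pass flag must be emitted only for device backend jobs.
static bool shouldEmitLocalAccessorFlag(bool IsBackendJob,
                                        bool IsSYCLOffloadDevice) {
  // Buggy version was: IsBackendJob && IsSYCL  (also fires for host jobs).
  return IsBackendJob && IsSYCLOffloadDevice;
}
```

With this gating, a host-side backend job in a -fsycl build never sees the flag, which is what fixes the check-sycl failures mentioned above.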

Contributor

I get the following error:

clang (LLVM option parsing): Unknown command line argument '-sycl-enable-local-accessor'. Try: 'clang (LLVM option parsing) --help'
clang (LLVM option parsing): Did you mean '--enable-local-reassign'?

Honestly, I don't really understand why the option is not visible to the front end in host mode, but using IsSYCLOffloadDevice fixed the problem, and I don't think the pass enabled by -sycl-enable-local-accessor needs to run in host mode.
Another mystery is why this issue is not exposed by the CI system. I suppose it is somehow related to a difference in CMake configuration: I don't build the NVPTX and AMDGPU targets, which I suppose are what link in the library that registers the "unknown" option.

We definitely need to do more investigation on this issue.

@jchlanda, FYI.

Contributor

Additional problem: -sycl-enable-local-accessor is only set when we perform some kind of code-generation step. As the device compilation does not do this, -sycl-enable-local-accessor is never set for device; it is only emitted for host compilations, since those go through the assembling step.

Contributor

At a high level, -sycl-enable-local-accessor is only emitted for the code-generation step with the nvptx64 target. The steps are not combined for nvptx64, which allows the option to be emitted only for device there. It is a roundabout way to restrict the option, and it can leak out to host if -S is used.

Contributor

IIRC, @jchlanda is away this week.
@AerialMantis, could someone else take a look into this?

Contributor

That's right, I'll have someone else take a look.

Contributor

There is a PR up to fix this: #5408.
I believe the CI did not catch this because it builds for both CUDA and HIP and then runs for each backend (correct me if I am wrong). If you build for CUDA or HIP, -sycl-enable-local-accessor is usable.
I think -sycl-enable-local-accessor is not available when not building for CUDA/HIP because the pass is initialised within the LLVM NVPTX and AMDGPU backends.
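The mechanism behind this can be sketched with a small standard-library model. This is not LLVM's actual cl::opt implementation; it only illustrates why an option registered by a global constructor exists solely when the object file containing that constructor (here, the NVPTX/AMDGPU backend library) is linked into the binary:

```cpp
#include <map>
#include <string>

// Illustrative model of LLVM-style static option registration. The global
// RegisterOption object below plays the role of the cl::opt defined inside
// the pass: its constructor runs at program startup, but only if this
// translation unit is linked in at all.
static std::map<std::string, bool> &optionRegistry() {
  static std::map<std::string, bool> Registry;
  return Registry;
}

struct RegisterOption {
  explicit RegisterOption(const std::string &Name) {
    optionRegistry()[Name] = true;
  }
};

// If the "backend library" containing this global is not linked, the option
// is never registered, and the driver reports it as unknown.
static RegisterOption LocalAccessorOpt("sycl-enable-local-accessor");

static bool isKnownOption(const std::string &Name) {
  return optionRegistry().count(Name) != 0;
}
```

In a build without the NVPTX/AMDGPU targets, the analogue of `LocalAccessorOpt` simply never runs, which matches the "Unknown command line argument" error reported above.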

Contributor Author

I'm almost glad that this bug surfaced, @bader, @mdtoguchi. TBH, -sycl-enable-local-accessor is a workaround that I never liked. The problem I had was that there was no way to tell that a kernel was compiled from SYCL. Simply relying on the calling convention (CallingConv::AMDGPU_KERNEL) is not enough, as there are multiple paths that use it (OpenCL, OpenMP, SYCL). I was wondering if it would be better to follow NVIDIA here and use metadata nodes to denote kernels (https://github.com/intel/llvm/blob/HEAD/clang/lib/CodeGen/TargetInfo.cpp#L7242). This would work for all the passes that we only want to run on SYCL kernels (for instance, https://github.com/intel/llvm/blob/sycl/llvm/lib/Target/NVPTX/SYCL/GlobalOffset.cpp, which we would like to generalize, would benefit from it).
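The identification problem described here can be illustrated with a toy model (plain C++, not LLVM IR; the struct and field names are invented for the sketch). The calling convention is shared by OpenCL, OpenMP, and SYCL kernels, so it cannot mark SYCL origin by itself; an explicit annotation, like NVPTX's `!nvvm.annotations` "kernel" nodes, can:

```cpp
#include <string>
#include <vector>

// Toy stand-in for an IR function: the AMDGPU kernel calling convention is
// ambiguous (OpenCL/OpenMP/SYCL all use it), so identifying SYCL kernels
// requires an extra annotation, analogous to NVPTX metadata.
struct FunctionModel {
  std::string Name;
  bool HasAmdgpuKernelCC;
  std::vector<std::string> Annotations; // e.g. {"kernel"} for annotated entry points
};

static bool isSyclKernel(const FunctionModel &F) {
  // Calling convention alone is not enough; require the annotation too.
  if (!F.HasAmdgpuKernelCC)
    return false;
  for (const auto &A : F.Annotations)
    if (A == "kernel")
      return true;
  return false;
}
```

Under this scheme, a pass like GlobalOffset could run only on annotated kernels instead of being gated behind a driver-level flag such as -sycl-enable-local-accessor.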

CmdArgs.push_back("-mllvm");
CmdArgs.push_back("-sycl-enable-local-accessor");
}
mdtoguchi marked this conversation as resolved.
// These two are potentially updated by AddClangCLArgs.
codegenoptions::DebugInfoKind DebugInfoKind = codegenoptions::NoDebugInfo;
bool EmitCodeView = false;
8 changes: 6 additions & 2 deletions clang/lib/Driver/ToolChains/HIPAMD.cpp
@@ -78,8 +78,12 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA,
const llvm::opt::ArgList &Args) const {
// Construct lld command.
// The output from ld.lld is an HSA code object file.
ArgStringList LldArgs{"-flavor", "gnu", "--no-undefined", "-shared",
"-plugin-opt=-amdgpu-internalize-symbols"};
ArgStringList LldArgs{"-flavor",
"gnu",
"--no-undefined",
"-shared",
"-plugin-opt=-amdgpu-internalize-symbols",
"-plugin-opt=-sycl-enable-local-accessor"};
mdtoguchi marked this conversation as resolved.

auto &TC = getToolChain();
auto &D = TC.getDriver();
11 changes: 11 additions & 0 deletions clang/test/Driver/sycl-local-accessor-opt.cpp
@@ -0,0 +1,11 @@
/// Check the correct handling of the sycl-enable-local-accessor option.

// REQUIRES: clang-driver

// RUN: %clang -fsycl -### %s 2>&1 \
// RUN: | FileCheck -check-prefix=CHECK-NO-OPT %s
// CHECK-NO-OPT-NOT: "-sycl-enable-local-accessor"

// RUN: %clang -fsycl -fsycl-targets=nvptx64-nvidia-cuda -### %s 2>&1 \
// RUN: | FileCheck %s
// CHECK: "-sycl-enable-local-accessor"
@@ -8,9 +8,9 @@
//
// This pass operates on SYCL kernels being compiled to CUDA. It modifies
// kernel entry points which take pointers to shared memory and modifies them
// to take offsets into shared memory (represented by a symbol in the shared address
// space). The SYCL runtime is expected to provide offsets rather than pointers
// to these functions.
// to take offsets into shared memory (represented by a symbol in the shared
// address space). The SYCL runtime is expected to provide offsets rather than
// pointers to these functions.
//
//===----------------------------------------------------------------------===//

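The rewrite described in the header comment above can be sketched with a small host-side model. The names and the fixed-size buffer are illustrative only; the real pass works on LLVM IR, with a zero-sized `addrspace(3)` global and a `getelementptr` + `bitcast` inserted at the kernel entry:

```cpp
#include <cstdint>

// Before the pass:  kernel(int *shared_ptr)         // pointer into shared memory
// After the pass:   kernel(uint32_t shared_offset)  // byte offset into one symbol
// and the pointer is rebuilt inside the kernel as base + offset.
static uint8_t SharedMemBase[1024]; // stands in for the shared-memory global

static int writeThenReadThroughOffset(uint32_t OffsetBytes, int Value) {
  // Equivalent of the inserted getelementptr + bitcast sequence: recreate
  // the typed pointer from the module-level base and the passed-in offset.
  int *Ptr = reinterpret_cast<int *>(SharedMemBase + OffsetBytes);
  *Ptr = Value;
  return *Ptr;
}
```

This is why the runtime only needs to hand each rewritten kernel an i32 offset: all local-accessor arguments resolve into one dynamically sized shared allocation.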
2 changes: 2 additions & 0 deletions llvm/lib/SYCLLowerIR/CMakeLists.txt
@@ -56,6 +56,8 @@ add_llvm_component_library(LLVMSYCLLowerIR
ESIMDVerifier.cpp
MutatePrintfAddrspace.cpp

LocalAccessorToSharedMemory.cpp

ADDITIONAL_HEADER_DIRS
${LLVM_MAIN_INCLUDE_DIR}/llvm/SYCLLowerIR
${LLVM_MAIN_SRC_DIR}/projects/vc-intrinsics/GenXIntrinsics/include
@@ -14,92 +14,115 @@
//
//===----------------------------------------------------------------------===//

#include "LocalAccessorToSharedMemory.h"
#include "../MCTargetDesc/NVPTXBaseInfo.h"
#include "llvm/SYCLLowerIR/LocalAccessorToSharedMemory.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/IPO.h"

using namespace llvm;

#define DEBUG_TYPE "localaccessortosharedmemory"

static bool EnableLocalAccessor;

static cl::opt<bool, true> EnableLocalAccessorFlag(
"sycl-enable-local-accessor", cl::Hidden,
cl::desc("Enable local accessor to shared memory optimisation."),
cl::location(EnableLocalAccessor), cl::init(false));

namespace llvm {
void initializeLocalAccessorToSharedMemoryPass(PassRegistry &);
}
} // namespace llvm

namespace {

class LocalAccessorToSharedMemory : public ModulePass {
mlychkov marked this conversation as resolved.
private:
enum class ArchType { Cuda, AMDHSA, Unsupported };

struct KernelPayload {
KernelPayload(Function *Kernel, MDNode *MD = nullptr)
: Kernel(Kernel), MD(MD){};
Function *Kernel;
MDNode *MD;
};

unsigned SharedASValue = 0;

public:
static char ID;
LocalAccessorToSharedMemory() : ModulePass(ID) {}

bool runOnModule(Module &M) override {
if (!EnableLocalAccessor)
return false;

auto AT = StringSwitch<ArchType>(M.getTargetTriple().c_str())
.Case("nvptx64-nvidia-cuda", ArchType::Cuda)
.Case("nvptx-nvidia-cuda", ArchType::Cuda)
.Case("amdgcn-amd-amdhsa", ArchType::AMDHSA)
.Default(ArchType::Unsupported);

// Invariant: This pass is only intended to operate on SYCL kernels being
// compiled to the `nvptx{,64}-nvidia-cuda` triple.
// TODO: make sure that non-SYCL kernels are not impacted.
// compiled to either `nvptx{,64}-nvidia-cuda`, or `amdgcn-amd-amdhsa`
// triples.
if (ArchType::Unsupported == AT)
return false;

if (skipModule(M))
return false;

// Keep track of whether the module was changed.
auto Changed = false;
switch (AT) {
case ArchType::Cuda:
// ADDRESS_SPACE_SHARED = 3,
SharedASValue = 3;
break;
case ArchType::AMDHSA:
// LOCAL_ADDRESS = 3,
SharedASValue = 3;
break;
default:
SharedASValue = 0;
break;
}

// Access `nvvm.annotations` to determine which functions are kernel entry
// points.
auto NvvmMetadata = M.getNamedMetadata("nvvm.annotations");
if (!NvvmMetadata)
SmallVector<KernelPayload> Kernels;
SmallVector<std::pair<Function *, KernelPayload>> NewToOldKernels;
populateKernels(M, Kernels, AT);
if (Kernels.empty())
return false;

for (auto MetadataNode : NvvmMetadata->operands()) {
if (MetadataNode->getNumOperands() != 3)
continue;
// Process the function and if changed, update the metadata.
for (auto K : Kernels) {
auto *NewKernel = processKernel(M, K.Kernel);
if (NewKernel)
NewToOldKernels.push_back(std::make_pair(NewKernel, K));
}

// NVPTX identifies kernel entry points using metadata nodes of the form:
// !X = !{<function>, !"kernel", i32 1}
const MDOperand &TypeOperand = MetadataNode->getOperand(1);
auto Type = dyn_cast<MDString>(TypeOperand);
if (!Type)
continue;
// Only process kernel entry points.
if (Type->getString() != "kernel")
continue;
if (NewToOldKernels.empty())
return false;

// Get a pointer to the entry point function from the metadata.
const MDOperand &FuncOperand = MetadataNode->getOperand(0);
if (!FuncOperand)
continue;
auto FuncConstant = dyn_cast<ConstantAsMetadata>(FuncOperand);
if (!FuncConstant)
continue;
auto Func = dyn_cast<Function>(FuncConstant->getValue());
if (!Func)
continue;
postProcessKernels(NewToOldKernels, AT);

// Process the function and if changed, update the metadata.
auto NewFunc = this->ProcessFunction(M, Func);
if (NewFunc) {
Changed = true;
MetadataNode->replaceOperandWith(
0, llvm::ConstantAsMetadata::get(NewFunc));
}
}
return true;
}

return Changed;
virtual llvm::StringRef getPassName() const override {
return "SYCL Local Accessor to Shared Memory";
}

Function *ProcessFunction(Module &M, Function *F) {
private:
Function *processKernel(Module &M, Function *F) {
// Check if this function is eligible by having an argument that uses shared
// memory.
auto UsesLocalMemory = false;
for (Function::arg_iterator FA = F->arg_begin(), FE = F->arg_end();
FA != FE; ++FA) {
if (FA->getType()->isPointerTy()) {
UsesLocalMemory =
FA->getType()->getPointerAddressSpace() == ADDRESS_SPACE_SHARED;
}
if (UsesLocalMemory) {
if (FA->getType()->isPointerTy() &&
FA->getType()->getPointerAddressSpace() == SharedASValue) {
UsesLocalMemory = true;
break;
}
}
@@ -111,9 +134,9 @@ class LocalAccessorToSharedMemory : public ModulePass {
// Create a global symbol to CUDA shared memory.
auto SharedMemGlobalName = F->getName().str();
SharedMemGlobalName.append("_shared_mem");
auto SharedMemGlobalType =
auto *SharedMemGlobalType =
ArrayType::get(Type::getInt8Ty(M.getContext()), 0);
auto SharedMemGlobal = new GlobalVariable(
auto *SharedMemGlobal = new GlobalVariable(
/* Module= */ M,
/* Type= */ &*SharedMemGlobalType,
/* IsConstant= */ false,
@@ -122,7 +145,7 @@
/* Name= */ Twine{SharedMemGlobalName},
/* InsertBefore= */ nullptr,
/* ThreadLocalMode= */ GlobalValue::NotThreadLocal,
/* AddressSpace= */ ADDRESS_SPACE_SHARED,
/* AddressSpace= */ SharedASValue,
/* IsExternallyInitialized= */ false);
SharedMemGlobal->setAlignment(Align(4));

@@ -139,7 +162,7 @@
for (Function::arg_iterator FA = F->arg_begin(), FE = F->arg_end();
FA != FE; ++FA, ++i) {
if (FA->getType()->isPointerTy() &&
FA->getType()->getPointerAddressSpace() == ADDRESS_SPACE_SHARED) {
FA->getType()->getPointerAddressSpace() == SharedASValue) {
// Replace pointers to shared memory with i32 offsets.
Arguments.push_back(Type::getInt32Ty(M.getContext()));
ArgumentAttributes.push_back(
@@ -178,8 +201,8 @@ class LocalAccessorToSharedMemory : public ModulePass {
if (ArgumentReplaced[i]) {
// If this argument was replaced, then create a `getelementptr`
// instruction that uses it to recreate the pointer that was replaced.
auto InsertBefore = &NF->getEntryBlock().front();
auto PtrInst = GetElementPtrInst::CreateInBounds(
auto *InsertBefore = &NF->getEntryBlock().front();
auto *PtrInst = GetElementPtrInst::CreateInBounds(
/* PointeeType= */ SharedMemGlobalType,
/* Ptr= */ SharedMemGlobal,
/* IdxList= */
@@ -191,7 +214,7 @@
// Then create a bitcast to make sure the new pointer is the same type
// as the old one. This will only ever be a `i8 addrspace(3)*` to `i32
// addrspace(3)*` type of cast.
auto CastInst = new BitCastInst(PtrInst, FA->getType());
auto *CastInst = new BitCastInst(PtrInst, FA->getType());
CastInst->insertAfter(PtrInst);
NewValueForUse = CastInst;
}
@@ -217,11 +240,85 @@
return NF;
}

virtual llvm::StringRef getPassName() const {
return "localaccessortosharedmemory";
void populateCudaKernels(Module &M, SmallVector<KernelPayload> &Kernels) {
// Access `nvvm.annotations` to determine which functions are kernel entry
// points.
auto *NvvmMetadata = M.getNamedMetadata("nvvm.annotations");
if (!NvvmMetadata)
return;

for (auto *MetadataNode : NvvmMetadata->operands()) {
if (MetadataNode->getNumOperands() != 3)
continue;

// NVPTX identifies kernel entry points using metadata nodes of the form:
// !X = !{<function>, !"kernel", i32 1}
const MDOperand &TypeOperand = MetadataNode->getOperand(1);
auto *Type = dyn_cast<MDString>(TypeOperand);
if (!Type)
continue;
// Only process kernel entry points.
if (Type->getString() != "kernel")
continue;

// Get a pointer to the entry point function from the metadata.
const MDOperand &FuncOperand = MetadataNode->getOperand(0);
if (!FuncOperand)
continue;
auto *FuncConstant = dyn_cast<ConstantAsMetadata>(FuncOperand);
if (!FuncConstant)
continue;
auto *Func = dyn_cast<Function>(FuncConstant->getValue());
if (!Func)
continue;

Kernels.push_back(KernelPayload(Func, MetadataNode));
}
}

void populateAMDKernels(Module &M, SmallVector<KernelPayload> &Kernels) {
for (auto &F : M) {
if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
Kernels.push_back(KernelPayload(&F));
}
}
};

void populateKernels(Module &M, SmallVector<KernelPayload> &Kernels,
ArchType AT) {
switch (AT) {
case ArchType::Cuda:
return populateCudaKernels(M, Kernels);
case ArchType::AMDHSA:
return populateAMDKernels(M, Kernels);
default:
llvm_unreachable("Unsupported arch type.");
}
}

void postProcessCudaKernels(
SmallVector<std::pair<Function *, KernelPayload>> &NewToOldKernels) {
for (auto &Pair : NewToOldKernels) {
std::get<1>(Pair).MD->replaceOperandWith(
0, llvm::ConstantAsMetadata::get(std::get<0>(Pair)));
}
}

void postProcessAMDKernels(
SmallVector<std::pair<Function *, KernelPayload>> &NewToOldKernels) {}

void postProcessKernels(
SmallVector<std::pair<Function *, KernelPayload>> &NewToOldKernels,
ArchType AT) {
switch (AT) {
case ArchType::Cuda:
return postProcessCudaKernels(NewToOldKernels);
case ArchType::AMDHSA:
return postProcessAMDKernels(NewToOldKernels);
default:
llvm_unreachable("Unsupported arch type.");
}
}
};
} // end anonymous namespace

char LocalAccessorToSharedMemory::ID = 0;
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -25,6 +25,8 @@ FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone);
FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone);
void initializeAMDGPURegBankCombinerPass(PassRegistry &);

void initializeLocalAccessorToSharedMemoryPass(PassRegistry &);

// SI Passes
FunctionPass *createGCNDPPCombinePass();
FunctionPass *createSIAnnotateControlFlowPass();