From 3fdb587d8bb5142e0da79f80d921d57eef842331 Mon Sep 17 00:00:00 2001 From: kvladimi Date: Wed, 20 May 2020 04:31:08 -0700 Subject: [PATCH] Open-source whole Vector Compute backend Change-Id: Id22b13722d4c79f70e4b0d1629510526d8dcf2e2 --- IGC/AdaptorOCL/cmc.cpp | 144 + IGC/AdaptorOCL/cmc.h | 8 + IGC/AdaptorOCL/dllInterfaceCompute.cpp | 150 + IGC/CMakeLists.txt | 24 + IGC/VectorCompiler/.gitignore | 60 + IGC/VectorCompiler/CMakeLists.txt | 158 + IGC/VectorCompiler/cmake/spirv.cmake | 180 + IGC/VectorCompiler/include/CMakeLists.txt | 18 + IGC/VectorCompiler/include/vc/CMakeLists.txt | 1 + .../include/vc/GenXCodeGen/GenXTarget.h | 42 + .../include/vc/GenXCodeGen/GenXWrapper.h | 142 + .../include/vc/GenXOpts/GenXAnalysis.h | 79 + .../include/vc/GenXOpts/GenXOpts.h | 74 + .../include/vc/GenXOpts/Utils/CMRegion.h | 237 + .../include/vc/GenXOpts/Utils/GenXSTLExtras.h | 80 + .../include/vc/GenXOpts/Utils/KernelInfo.h | 356 + .../include/vc/GenXOpts/Utils/RegCategory.h | 55 + .../include/vc/Support/CMakeLists.txt | 3 + .../include/vc/Support/Options.h | 58 + .../include/vc/Support/Options.td | 117 + .../include/vc/Support/Status.h | 158 + .../include/vc/Support/StatusCode.h | 75 + .../include/vc/Support/StatusTraits.h | 85 + .../lib/BackendPlugin/BackendPlugin.cpp | 36 + .../lib/BackendPlugin/CMakeLists.txt | 39 + IGC/VectorCompiler/lib/CMakeLists.txt | 9 + .../lib/GenXCodeGen/CMakeLists.txt | 84 + .../lib/GenXCodeGen/FunctionGroup.cpp | 671 ++ .../lib/GenXCodeGen/FunctionGroup.h | 280 + IGC/VectorCompiler/lib/GenXCodeGen/GenX.h | 157 + IGC/VectorCompiler/lib/GenXCodeGen/GenX.td | 87 + .../lib/GenXCodeGen/GenXAddressCommoning.cpp | 1047 +++ .../GenXAggregatePseudoLowering.cpp | 366 ++ .../lib/GenXCodeGen/GenXAlignmentInfo.cpp | 401 ++ .../lib/GenXCodeGen/GenXAlignmentInfo.h | 154 + .../lib/GenXCodeGen/GenXAnalysisDumper.cpp | 144 + .../lib/GenXCodeGen/GenXArgIndirection.cpp | 1822 ++++++ .../lib/GenXCodeGen/GenXBaling.cpp | 2365 +++++++ .../lib/GenXCodeGen/GenXBaling.h | 550 ++ .../lib/GenXCodeGen/GenXCFSimplification.cpp | 354 + .../lib/GenXCodeGen/GenXCategory.cpp | 1060 +++ .../lib/GenXCodeGen/GenXCisaBuilder.cpp | 5779 +++++++++++++++++ .../lib/GenXCodeGen/GenXCoalescing.cpp | 1759 +++++ .../lib/GenXCodeGen/GenXConstants.cpp | 1524 +++++ .../lib/GenXCodeGen/GenXConstants.h | 135 + .../lib/GenXCodeGen/GenXDeadVectorRemoval.cpp | 746 +++ .../lib/GenXCodeGen/GenXDepressurizer.cpp | 1662 +++++ .../lib/GenXCodeGen/GenXEmulate.cpp | 174 + .../lib/GenXCodeGen/GenXExtractVectorizer.cpp | 295 + .../lib/GenXCodeGen/GenXFuncPtrsLowering.cpp | 364 ++ .../lib/GenXCodeGen/GenXGEPLowering.cpp | 324 + .../lib/GenXCodeGen/GenXGotoJoin.cpp | 332 + .../lib/GenXCodeGen/GenXGotoJoin.h | 83 + .../GenXCodeGen/GenXIMadPostLegalization.cpp | 390 ++ .../lib/GenXCodeGen/GenXInlineAsmLowering.cpp | 345 + .../GenXCodeGen/GenXInstCombineCleanup.cpp | 141 + .../lib/GenXCodeGen/GenXIntrinsics.cpp | 201 + .../lib/GenXCodeGen/GenXIntrinsics.h | 324 + .../lib/GenXCodeGen/GenXLayoutBlocks.cpp | 126 + .../lib/GenXCodeGen/GenXLegalization.cpp | 2613 ++++++++ .../lib/GenXCodeGen/GenXLiveRanges.cpp | 215 + .../lib/GenXCodeGen/GenXLiveness.cpp | 1872 ++++++ .../lib/GenXCodeGen/GenXLiveness.h | 666 ++ .../lib/GenXCodeGen/GenXLowerAggrCopies.cpp | 200 + .../lib/GenXCodeGen/GenXLowerAggrCopies.h | 41 + .../lib/GenXCodeGen/GenXLowering.cpp | 3071 +++++++++ .../lib/GenXCodeGen/GenXModule.cpp | 140 + .../lib/GenXCodeGen/GenXModule.h | 185 + .../lib/GenXCodeGen/GenXNumbering.cpp | 392 ++ .../lib/GenXCodeGen/GenXNumbering.h | 166 + 
.../lib/GenXCodeGen/GenXOCLInfoExtractor.cpp | 77 + .../lib/GenXCodeGen/GenXOCLRuntimeInfo.cpp | 292 + .../lib/GenXCodeGen/GenXOCLRuntimeInfo.h | 256 + .../lib/GenXCodeGen/GenXPatternMatch.cpp | 2640 ++++++++ .../lib/GenXCodeGen/GenXPostLegalization.cpp | 171 + .../lib/GenXCodeGen/GenXPressureTracker.cpp | 211 + .../lib/GenXCodeGen/GenXPressureTracker.h | 91 + .../lib/GenXCodeGen/GenXPrinter.cpp | 243 + .../lib/GenXCodeGen/GenXPromoteArray.cpp | 1081 +++ .../lib/GenXCodeGen/GenXPromotePredicate.cpp | 204 + .../lib/GenXCodeGen/GenXRawSendRipper.cpp | 96 + .../lib/GenXCodeGen/GenXReduceIntSize.cpp | 1038 +++ .../lib/GenXCodeGen/GenXRegion.cpp | 954 +++ .../lib/GenXCodeGen/GenXRegion.h | 197 + .../lib/GenXCodeGen/GenXRegionCollapsing.cpp | 1460 +++++ .../lib/GenXCodeGen/GenXRematerialization.cpp | 146 + .../lib/GenXCodeGen/GenXSimdCFConformance.cpp | 3698 +++++++++++ .../lib/GenXCodeGen/GenXSubtarget.cpp | 145 + .../lib/GenXCodeGen/GenXSubtarget.h | 293 + .../lib/GenXCodeGen/GenXTargetMachine.cpp | 546 ++ .../lib/GenXCodeGen/GenXTargetMachine.h | 183 + .../GenXCodeGen/GenXThreadPrivateMemory.cpp | 1023 +++ .../lib/GenXCodeGen/GenXTidyControlFlow.cpp | 302 + .../lib/GenXCodeGen/GenXUnbaling.cpp | 1204 ++++ .../lib/GenXCodeGen/GenXUtil.cpp | 1446 +++++ IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.h | 429 ++ .../lib/GenXCodeGen/GenXVectorDecomposer.cpp | 1177 ++++ .../lib/GenXCodeGen/GenXVectorDecomposer.h | 175 + IGC/VectorCompiler/lib/GenXCodeGen/GenXVisa.h | 140 + .../lib/GenXCodeGen/GenXVisaRegAlloc.cpp | 698 ++ .../lib/GenXCodeGen/GenXVisaRegAlloc.h | 253 + .../lib/GenXCodeGen/GenXWATable.cpp | 34 + .../lib/GenXCodeGen/GenXWATable.h | 57 + .../lib/GenXCodeGen/GenXWrapper.cpp | 717 ++ .../lib/GenXCodeGen/IgnoreRAUWValueMap.h | 42 + .../lib/GenXCodeGen/IsaDescription.h | 254 + .../lib/GenXCodeGen/KillAnalysis.cpp | 188 + .../lib/GenXCodeGen/KillAnalysis.h | 51 + .../lib/GenXCodeGen/TargetInfo/CMakeLists.txt | 5 + .../GenXCodeGen/TargetInfo/GenXTargetInfo.cpp | 50 + .../GenXCodeGen/TargetInfo/GenXTargetInfo.h | 39 + .../lib/GenXCodeGen/Utils/CMakeLists.txt | 23 + .../Utils/cisa_gen_intrinsics.json | 3674 +++++++++++ .../GenXCodeGen/Utils/cisa_gen_intrinsics.py | 230 + .../CMAnalysis/ConstantFoldingGenX.cpp | 285 + .../CMAnalysis/InstructionSimplifyGenX.cpp | 269 + .../GenXOpts/CMPacketize/GenXPacketize.cpp | 1757 +++++ .../GenXOpts/CMPacketize/PacketBuilder.cpp | 209 + .../lib/GenXOpts/CMPacketize/PacketBuilder.h | 340 + .../CMPacketize/PacketBuilder_math.cpp | 163 + .../CMPacketize/PacketBuilder_mem.cpp | 172 + .../CMPacketize/PacketBuilder_misc.cpp | 503 ++ .../lib/GenXOpts/CMPacketize/README.md | 1 + .../lib/GenXOpts/CMPacketize/WIAnalysis.cpp | 900 +++ .../lib/GenXOpts/CMPacketize/WIAnalysis.hpp | 265 + .../lib/GenXOpts/CMPacketize/gen_builder.hpp | 1035 +++ .../CMPacketize/gen_builder_intrin.hpp | 172 + .../GenXOpts/CMPacketize/gen_builder_meta.hpp | 244 + .../lib/GenXOpts/CMTrans/CMABI.cpp | 1942 ++++++ .../lib/GenXOpts/CMTrans/CMImpParam.cpp | 701 ++ .../GenXOpts/CMTrans/CMKernelArgOffset.cpp | 621 ++ .../lib/GenXOpts/CMTrans/CMRegion.cpp | 925 +++ .../lib/GenXOpts/CMakeLists.txt | 20 + IGC/VectorCompiler/lib/Support/CMakeLists.txt | 11 + IGC/VectorCompiler/lib/Support/Options.cpp | 62 + IGC/VectorCompiler/lib/Support/Status.cpp | 150 + ...ddress-spaces-for-VectorCompute-glob.patch | 40 + ...ncParamKindINTEL-and-DecorationFuncP.patch | 198 + .../0003-Add-SPIRVDLL-and-VCExport.patch | 216 + ...Lib-from-targets-Rename-tool-llvm-sp.patch | 107 + IGC/VectorCompiler/tests/vctest_config.yml | 
2 + IGC/VectorCompiler/unittests/CMakeLists.txt | 9 + .../unittests/Regions/CMakeLists.txt | 17 + .../unittests/Regions/OverlapTest.cpp | 81 + .../unittests/SPIRVConversions/CMakeLists.txt | 16 + .../SPIRVConversions/SPIRVConversionsTest.cpp | 255 + IGC/common/igc_flags.def | 2 + 147 files changed, 74118 insertions(+) create mode 100644 IGC/VectorCompiler/.gitignore create mode 100644 IGC/VectorCompiler/CMakeLists.txt create mode 100644 IGC/VectorCompiler/cmake/spirv.cmake create mode 100644 IGC/VectorCompiler/include/CMakeLists.txt create mode 100644 IGC/VectorCompiler/include/vc/CMakeLists.txt create mode 100644 IGC/VectorCompiler/include/vc/GenXCodeGen/GenXTarget.h create mode 100644 IGC/VectorCompiler/include/vc/GenXCodeGen/GenXWrapper.h create mode 100644 IGC/VectorCompiler/include/vc/GenXOpts/GenXAnalysis.h create mode 100644 IGC/VectorCompiler/include/vc/GenXOpts/GenXOpts.h create mode 100644 IGC/VectorCompiler/include/vc/GenXOpts/Utils/CMRegion.h create mode 100644 IGC/VectorCompiler/include/vc/GenXOpts/Utils/GenXSTLExtras.h create mode 100644 IGC/VectorCompiler/include/vc/GenXOpts/Utils/KernelInfo.h create mode 100644 IGC/VectorCompiler/include/vc/GenXOpts/Utils/RegCategory.h create mode 100644 IGC/VectorCompiler/include/vc/Support/CMakeLists.txt create mode 100644 IGC/VectorCompiler/include/vc/Support/Options.h create mode 100644 IGC/VectorCompiler/include/vc/Support/Options.td create mode 100644 IGC/VectorCompiler/include/vc/Support/Status.h create mode 100644 IGC/VectorCompiler/include/vc/Support/StatusCode.h create mode 100644 IGC/VectorCompiler/include/vc/Support/StatusTraits.h create mode 100644 IGC/VectorCompiler/lib/BackendPlugin/BackendPlugin.cpp create mode 100644 IGC/VectorCompiler/lib/BackendPlugin/CMakeLists.txt create mode 100644 IGC/VectorCompiler/lib/CMakeLists.txt create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/CMakeLists.txt create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenX.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenX.td create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXAddressCommoning.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXAggregatePseudoLowering.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXAnalysisDumper.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXArgIndirection.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXCFSimplification.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXCategory.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXCisaBuilder.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXCoalescing.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXDeadVectorRemoval.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXDepressurizer.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXEmulate.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXExtractVectorizer.cpp create mode 100644 
IGC/VectorCompiler/lib/GenXCodeGen/GenXFuncPtrsLowering.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXGEPLowering.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXIMadPostLegalization.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXInlineAsmLowering.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXInstCombineCleanup.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLayoutBlocks.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveRanges.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXLowering.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLInfoExtractor.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPatternMatch.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPostLegalization.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPrinter.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPromoteArray.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXPromotePredicate.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXRawSendRipper.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXReduceIntSize.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXRegionCollapsing.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXRematerialization.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXSimdCFConformance.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXThreadPrivateMemory.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXTidyControlFlow.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXUnbaling.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.cpp 
create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXVisa.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/GenXWrapper.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/IgnoreRAUWValueMap.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/IsaDescription.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/CMakeLists.txt create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.cpp create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.h create mode 100644 IGC/VectorCompiler/lib/GenXCodeGen/Utils/CMakeLists.txt create mode 100755 IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.json create mode 100755 IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.py create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/ConstantFoldingGenX.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/InstructionSimplifyGenX.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/GenXPacketize.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.h create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_math.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_mem.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_misc.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/README.md create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.hpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder.hpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_intrin.hpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_meta.hpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMABI.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMImpParam.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMKernelArgOffset.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMRegion.cpp create mode 100644 IGC/VectorCompiler/lib/GenXOpts/CMakeLists.txt create mode 100644 IGC/VectorCompiler/lib/Support/CMakeLists.txt create mode 100644 IGC/VectorCompiler/lib/Support/Options.cpp create mode 100644 IGC/VectorCompiler/lib/Support/Status.cpp create mode 100644 IGC/VectorCompiler/spirv-patches-new/0001-Add-common-OCL-address-spaces-for-VectorCompute-glob.patch create mode 100644 IGC/VectorCompiler/spirv-patches-new/0002-Add-DecorationFuncParamKindINTEL-and-DecorationFuncP.patch create mode 100644 IGC/VectorCompiler/spirv-patches-new/0003-Add-SPIRVDLL-and-VCExport.patch create mode 100644 IGC/VectorCompiler/spirv-patches-new/0004-Remove-LLVMSPIRVLib-from-targets-Rename-tool-llvm-sp.patch create mode 100644 IGC/VectorCompiler/tests/vctest_config.yml create mode 100644 IGC/VectorCompiler/unittests/CMakeLists.txt 
create mode 100644 IGC/VectorCompiler/unittests/Regions/CMakeLists.txt create mode 100644 IGC/VectorCompiler/unittests/Regions/OverlapTest.cpp create mode 100644 IGC/VectorCompiler/unittests/SPIRVConversions/CMakeLists.txt create mode 100644 IGC/VectorCompiler/unittests/SPIRVConversions/SPIRVConversionsTest.cpp diff --git a/IGC/AdaptorOCL/cmc.cpp b/IGC/AdaptorOCL/cmc.cpp index 225357fdacec..1df4d3f7e34b 100644 --- a/IGC/AdaptorOCL/cmc.cpp +++ b/IGC/AdaptorOCL/cmc.cpp @@ -611,3 +611,147 @@ int cmc::vISACompile_v2(cmc_compile_info_v2* output, iOpenCL::CGen8CMProgram& CM CMProgram.CreateKernelBinaries(); return status; } + +static void getCmcArg(cmc_arg_info& CmcArg, const vc::ocl::ArgInfo& Arg) +{ + switch (Arg.Kind) + { + case vc::ocl::ArgKind::General: + CmcArg.kind = cmc_arg_kind::General; + break; + case vc::ocl::ArgKind::LocalSize: + CmcArg.kind = cmc_arg_kind::LocalSize; + break; + case vc::ocl::ArgKind::GroupCount: + CmcArg.kind = cmc_arg_kind::GroupCount; + break; + case vc::ocl::ArgKind::Buffer: + CmcArg.kind = cmc_arg_kind::Buffer; + break; + case vc::ocl::ArgKind::SVM: + CmcArg.kind = cmc_arg_kind::SVM; + break; + case vc::ocl::ArgKind::Sampler: + CmcArg.kind = cmc_arg_kind::Sampler; + break; + case vc::ocl::ArgKind::Image1d: + CmcArg.kind = cmc_arg_kind::Image1d; + break; + case vc::ocl::ArgKind::Image2d: + CmcArg.kind = cmc_arg_kind::Image2d; + break; + case vc::ocl::ArgKind::Image3d: + CmcArg.kind = cmc_arg_kind::Image3d; + break; + case vc::ocl::ArgKind::PrintBuffer: + CmcArg.kind = cmc_arg_kind::PrintBuffer; + break; + case vc::ocl::ArgKind::PrivateBase: + CmcArg.kind = cmc_arg_kind::PrivateBase; + break; + } + + switch (Arg.AccessKind) + { + case vc::ocl::ArgAccessKind::None: + CmcArg.access = cmc_access_kind::undef; + break; + case vc::ocl::ArgAccessKind::ReadOnly: + CmcArg.access = cmc_access_kind::read_only; + break; + case vc::ocl::ArgAccessKind::WriteOnly: + CmcArg.access = cmc_access_kind::write_only; + break; + case vc::ocl::ArgAccessKind::ReadWrite: + CmcArg.access = cmc_access_kind::read_write; + break; + } + + CmcArg.index = Arg.Index; + CmcArg.offset = Arg.Offset; + CmcArg.sizeInBytes = Arg.SizeInBytes; + CmcArg.BTI = Arg.BTI; +} + +// Returns vector of cmc_arg_info with all fields initialized. +static std::vector getCmcArgInfos(const std::vector& Args) +{ + std::vector CmcArgs{Args.size()}; + for (unsigned i = 0, e = Args.size(); i != e; ++i) + getCmcArg(CmcArgs[i], Args[i]); + return CmcArgs; +} + +static std::vector getCmcPrintStrings( + const std::vector& Original) +{ + std::vector Converted; + std::transform(Original.begin(), Original.end(), std::back_inserter(Converted), + [](const std::string &str) { + IGC_ASSERT_MESSAGE(str.size() < cmc_ocl_print_string::max_width, "illegal string length"); + cmc_ocl_print_string Tmp; + strcpy_s(Tmp.s, cmc_ocl_print_string::max_width, str.c_str()); + return Tmp; + }); + return Converted; +} + +struct CmcContext +{ + std::vector Args; + std::vector PrintStrings; +}; + +// Fills non-owning cmc_kernel_info with all fields initialized. 
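The function below only wires up pointers: cmc_kernel_info_v2 keeps name, arg_descs and print_string_descs pointing into data owned by the vc::ocl::KernelInfo and the CmcContext, so the CmcContext must outlive every use of the filled-in cmc_kernel_info_v2. A minimal sketch of that non-owning pattern, using simplified stand-in types rather than the real IGC structures (ArgDesc, KernelDesc, Context and fillKernelDesc are illustrative names, not part of the patch):

#include <string>
#include <vector>

// Simplified stand-ins for cmc_arg_info / cmc_kernel_info_v2 (illustrative only).
struct ArgDesc { int kind; unsigned offset; };

struct KernelDesc {            // non-owning view, like cmc_kernel_info_v2
    const char *name;          // points into a std::string owned elsewhere
    unsigned num_args;
    const ArgDesc *arg_descs;  // points into a std::vector owned elsewhere
};

struct Context {               // owns the converted data, like CmcContext
    std::string Name;
    std::vector<ArgDesc> Args;
};

// Fill the non-owning view from the owning context; Ctx must outlive Out.
void fillKernelDesc(KernelDesc &Out, const Context &Ctx) {
    Out.name = Ctx.Name.c_str();
    Out.num_args = static_cast<unsigned>(Ctx.Args.size());
    Out.arg_descs = Ctx.Args.data();
}

In the patch itself, getCmcKernelInfo plays the role of fillKernelDesc, with CmcContext as the owning side.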
+static void getCmcKernelInfo(
+    cmc_kernel_info_v2& CmcInfo,
+    const vc::ocl::KernelInfo& Info,
+    const FINALIZER_INFO& JitInfo,
+    CmcContext& CmcCtx)
+{
+    IGC_ASSERT_MESSAGE(CmcCtx.PrintStrings.size() == Info.PrintStrings.size(), "inconsistent arguments");
+    CmcInfo.name = Info.Name.c_str();
+    CmcInfo.num_args = CmcCtx.Args.size();
+    CmcInfo.arg_descs = CmcCtx.Args.data();
+    CmcInfo.HasLocalIDx = true;
+    CmcInfo.HasLocalIDy = true;
+    CmcInfo.HasLocalIDz = true;
+    CmcInfo.HasGroupID = Info.HasGroupID;
+    CmcInfo.CompiledSIMDSize = 1;
+    CmcInfo.SLMSize = Info.SLMSize;
+    CmcInfo.NumGRFRequired = JitInfo.numGRFTotal;
+    CmcInfo.GRFByteSize = Info.GRFSizeInBytes;
+    CmcInfo.HasBarriers = Info.HasBarriers;
+    CmcInfo.StatelessPrivateMemSize = Info.StatelessPrivateMemSize;
+    CmcInfo.HasReadWriteImages = Info.HasReadWriteImages;
+    CmcInfo.num_print_strings = CmcCtx.PrintStrings.size();
+    CmcInfo.print_string_descs = CmcCtx.PrintStrings.data();
+    // std::copy requires either reinterprets or implementation of operator= in
+    // TableInfos from independent headers so memcpy seems to be the best option
+    // for now
+    memcpy_s(&CmcInfo.RelocationTable, sizeof(Info.RelocationTable), &Info.RelocationTable,
+             sizeof(Info.RelocationTable));
+    memcpy_s(&CmcInfo.SymbolTable, sizeof(Info.SymbolTable), &Info.SymbolTable,
+             sizeof(Info.SymbolTable));
+}
+
+void vc::createBinary(
+    iOpenCL::CGen8CMProgram& CMProgram,
+    const std::vector<vc::ocl::CompileInfo>& CompileInfos)
+{
+    cmc_kernel_info_v2 CmcInfo;
+    CmcContext CmcCtx;
+    for (const vc::ocl::CompileInfo& Info : CompileInfos)
+    {
+        CmcCtx.Args = getCmcArgInfos(Info.KernelInfo.Args);
+        CmcCtx.PrintStrings = getCmcPrintStrings(Info.KernelInfo.PrintStrings);
+        getCmcKernelInfo(CmcInfo, Info.KernelInfo, Info.JitInfo, CmcCtx);
+        CMKernel* K = new CMKernel(CMProgram.getPlatform());
+        CMProgram.m_kernels.push_back(K);
+        llvm::ArrayRef GenBin{
+            reinterpret_cast(Info.GenBinary.data()),
+            Info.GenBinary.size()};
+        populateKernelInfo_v2(&CmcInfo, Info.JitInfo, GenBin, *K);
+    }
+    CMProgram.CreateKernelBinaries();
+}
diff --git a/IGC/AdaptorOCL/cmc.h b/IGC/AdaptorOCL/cmc.h
index f369e381dc46..1b23741131f0 100644
--- a/IGC/AdaptorOCL/cmc.h
+++ b/IGC/AdaptorOCL/cmc.h
@@ -35,6 +35,9 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#include "igcmc.h"
#include "Compiler/CodeGenPublic.h"
+#include "common/LLVMWarningsPush.hpp"
+#include "VectorCompiler/include/vc/GenXCodeGen/GenXWrapper.h"
+#include "common/LLVMWarningsPop.hpp"
namespace iOpenCL {
class CGen8CMProgram;
@@ -111,3 +114,8 @@ extern int vISACompile_v2(cmc_compile_info_v2 *output,
extern const char* getPlatformStr(PLATFORM platform);
} // namespace cmc
+
+namespace vc {
+void createBinary(iOpenCL::CGen8CMProgram &CMProgram,
+                  const std::vector<vc::ocl::CompileInfo> &CompileInfos);
+} // namespace vc
diff --git a/IGC/AdaptorOCL/dllInterfaceCompute.cpp b/IGC/AdaptorOCL/dllInterfaceCompute.cpp
index b42eb24f034d..a874c2aa0031 100644
--- a/IGC/AdaptorOCL/dllInterfaceCompute.cpp
+++ b/IGC/AdaptorOCL/dllInterfaceCompute.cpp
@@ -55,6 +55,11 @@
#include "AdaptorOCL/OCL/sp/gtpin_igc_ocl.h" #include "AdaptorOCL/igcmc.h" #include "AdaptorOCL/cmc.h" +#include "common/LLVMWarningsPush.hpp" +#include +#include "VectorCompiler/include/vc/Support/StatusCode.h" +#include "VectorCompiler/include/vc/GenXCodeGen/GenXWrapper.h" +#include "common/LLVMWarningsPop.hpp" #include @@ -820,6 +825,14 @@ static bool TranslateBuildCM(const STB_TranslateInputArgs* pInputArgs, const IGC::CPlatform& IGCPlatform, float profilingTimerResolution); +#if !defined(WDDM_LINUX) +static std::error_code TranslateBuildVC( + const STB_TranslateInputArgs* pInputArgs, + STB_TranslateOutputArgs* pOutputArgs, TB_DATA_FORMAT inputDataFormatTemp, + const IGC::CPlatform& IGCPlatform, float profilingTimerResolution); +#endif // !defined(WDDM_LINUX) + + bool TranslateBuild( const STB_TranslateInputArgs* pInputArgs, STB_TranslateOutputArgs* pOutputArgs, @@ -828,6 +841,16 @@ bool TranslateBuild( float profilingTimerResolution) { if (pInputArgs->pOptions) { +#if !defined(WDDM_LINUX) + std::error_code Status = + TranslateBuildVC(pInputArgs, pOutputArgs, inputDataFormatTemp, + IGCPlatform, profilingTimerResolution); + if (!Status) + return true; + // If vc codegen option was not specified, then vc was not called. + if (static_cast(Status.value()) != vc::errc::not_vc_codegen) + return false; +#endif // !defined(WDDM_LINUX) static const char* CMC = "-cmc"; if (strstr(pInputArgs->pOptions, CMC) != nullptr) return TranslateBuildCM(pInputArgs, @@ -1438,4 +1461,131 @@ static bool TranslateBuildCM(const STB_TranslateInputArgs* pInputArgs, return false; } +#if !defined(WDDM_LINUX) + +static void adjustPlatformVC(const IGC::CPlatform& IGCPlatform, + vc::CompileOptions& Opts) +{ + Opts.CPUStr = cmc::getPlatformStr(IGCPlatform.getPlatformInfo()); + Opts.WATable = std::make_unique(IGCPlatform.getWATable()); +} + +static void adjustFileTypeVC(TB_DATA_FORMAT DataFormat, + vc::CompileOptions& Opts) +{ + switch (DataFormat) + { + case TB_DATA_FORMAT::TB_DATA_FORMAT_SPIR_V: + Opts.FType = vc::FileType::SPIRV; + return; + default: + llvm_unreachable("Data format is not supported yet"); + } +} + +static void adjustOptLevelVC(vc::CompileOptions& Opts) +{ + if (IGC_IS_FLAG_ENABLED(VCOptimizeNone)) + Opts.OptLevel = vc::OptimizerLevel::None; +} + +static void adjustOptionsVC(const IGC::CPlatform& IGCPlatform, + TB_DATA_FORMAT DataFormat, vc::CompileOptions& Opts) +{ + adjustPlatformVC(IGCPlatform, Opts); + adjustFileTypeVC(DataFormat, Opts); + adjustOptLevelVC(Opts); +} + +static std::error_code getErrorVC(llvm::Error Err, + STB_TranslateOutputArgs* pOutputArgs) +{ + std::error_code Status; + llvm::handleAllErrors( + std::move(Err), [&Status, pOutputArgs](const llvm::ErrorInfoBase& EI) { + Status = EI.convertToErrorCode(); + // Some tests check for build log when everything is ok. + // So let's not even try to touch things if we were not called. 
+ if (static_cast(Status.value()) == vc::errc::not_vc_codegen) + return; + SetErrorMessage(EI.message(), *pOutputArgs); + }); + return Status; +} + +static void outputBinaryVC(llvm::StringRef Binary, + STB_TranslateOutputArgs* pOutputArgs) +{ + size_t BinarySize = static_cast(Binary.size()); + char* pBinaryOutput = new char[BinarySize]; + memcpy_s(pBinaryOutput, BinarySize, Binary.data(), BinarySize); + pOutputArgs->OutputSize = static_cast(BinarySize); + pOutputArgs->pOutput = pBinaryOutput; +} + +static std::error_code TranslateBuildVC( + const STB_TranslateInputArgs* pInputArgs, + STB_TranslateOutputArgs* pOutputArgs, TB_DATA_FORMAT inputDataFormatTemp, + const IGC::CPlatform& IGCPlatform, float profilingTimerResolution) +{ +#if IGC_VC_DISABLED + SetErrorMessage("IGC VC explicitly disabled in build", *pOutputArgs); + return false; +#else + + llvm::StringRef ApiOptions{pInputArgs->pOptions, pInputArgs->OptionsSize}; + llvm::StringRef InternalOptions{pInputArgs->pInternalOptions, + pInputArgs->InternalOptionsSize}; + auto pInput = pInputArgs->pInput; + size_t InputSize = pInputArgs->InputSize; + + + auto ExpOptions = vc::ParseOptions(ApiOptions, InternalOptions); + if (!ExpOptions) + return getErrorVC(ExpOptions.takeError(), pOutputArgs); + + // Reset options when everything is done here. + // This is needed to not interfere with subsequent translations. + const auto ClOptGuard = + llvm::make_scope_exit([]() { llvm::cl::ResetAllOptionOccurrences(); }); + + vc::CompileOptions& Opts = ExpOptions.get(); + adjustOptionsVC(IGCPlatform, inputDataFormatTemp, Opts); + + llvm::ArrayRef Input{pInput, InputSize}; + auto ExpOutput = vc::Compile(Input, Opts); + if (!ExpOutput) + return getErrorVC(ExpOutput.takeError(), pOutputArgs); + vc::CompileOutput& Res = ExpOutput.get(); + + auto Visitor = [&IGCPlatform, pOutputArgs](auto&& CompileResult) { + using Ty = std::decay_t; + if constexpr (std::is_same_v) + { + outputBinaryVC(CompileResult.IsaBinary, pOutputArgs); + } + else if constexpr (std::is_same_v) + { + iOpenCL::CGen8CMProgram CMProgram{IGCPlatform.getPlatformInfo()}; + vc::createBinary(CMProgram, CompileResult.Kernels); + Util::BinaryStream ProgramBinary; + CMProgram.GetProgramBinary(ProgramBinary, + CompileResult.PointerSizeInBytes); + llvm::StringRef BinaryRef(ProgramBinary.GetLinearPointer(), + ProgramBinary.Size()); + outputBinaryVC(BinaryRef, pOutputArgs); + } + else + { + static_assert(!sizeof(Ty), "One of compile output is not visited"); + } + }; + + std::visit(Visitor, Res); + + return {}; +#endif +} +#endif // !defined(WDDM_LINUX) + } // namespace TC diff --git a/IGC/CMakeLists.txt b/IGC/CMakeLists.txt index 80c793788fa5..4331acc17cb8 100644 --- a/IGC/CMakeLists.txt +++ b/IGC/CMakeLists.txt @@ -2196,6 +2196,14 @@ set(IGC_BUILD__PROJ_NAME_PREFIX "") set(IGC_BUILD__SPIRV_ENABLED ON) +# Enable vector compiler for Linux and Windows +# If user already defined this, honor decision +if(NOT DEFINED IGC_BUILD__VC_ENABLED) + if(LLVM_ON_UNIX OR LLVM_ON_WIN32) + set(IGC_BUILD__VC_ENABLED ON) + endif() +endif() + # ======================================== Path helper variables ======================================= @@ -3078,6 +3086,12 @@ if(IGC_BUILD__SPIRV_ENABLED) ) endif() +#VC OPT switch on/off +if(NOT IGC_BUILD__VC_ENABLED) + set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS + IGC_VC_DISABLED + ) +endif() set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS _SCL_SECURE_NO_WARNINGS _CRT_SECURE_NO_WARNINGS @@ -3446,6 +3460,10 @@ if(LLVM_ON_WIN32 endif() +if(IGC_BUILD__VC_ENABLED 
AND NOT CMAKE_WDDM_LINUX) + add_subdirectory(VectorCompiler) +endif() + add_subdirectory(Compiler) add_subdirectory(DriverInterface) igc_sg_define(IGC__DriverInterface) @@ -3737,6 +3755,12 @@ list(APPEND _targetLinkLineCommon zebinlib) ) endif() + if(IGC_BUILD__VC_ENABLED) + list(APPEND _targetLinkLineCommon + ${IGC_BUILD__PROJ_VC_LIBS_TO_LINK} + ) + endif() + list(APPEND _targetLinkLineCommon "${IGC_BUILD__START_GROUP}" ${IGC_BUILD__LLVM_LIBS_TO_LINK} diff --git a/IGC/VectorCompiler/.gitignore b/IGC/VectorCompiler/.gitignore new file mode 100644 index 000000000000..e4ac6dd803fc --- /dev/null +++ b/IGC/VectorCompiler/.gitignore @@ -0,0 +1,60 @@ +#==============================================================================# +# This file specifies intentionally untracked files that git should ignore. +# See: http://www.kernel.org/pub/software/scm/git/docs/gitignore.html +# +# This file is intentionally different from the output of `git svn show-ignore`, +# as most of those are useless. +#==============================================================================# + +#==============================================================================# +# File extensions to be ignored anywhere in the tree. +#==============================================================================# +# Temp files created by most text editors. +*~ +# Merge files created by git. +*.orig +# Byte compiled python modules. +*.pyc +# vim swap files +.*.sw? +.sw? +#OS X specific files. +.DS_store + +# Nested build directory +/build + +#==============================================================================# +# Explicit files to ignore (only matches one). +#==============================================================================# +# Various tag programs +/tags +/TAGS +/GPATH +/GRTAGS +/GSYMS +/GTAGS +.gitusers +autom4te.cache +cscope.files +cscope.out +autoconf/aclocal.m4 +autoconf/autom4te.cache +/compile_commands.json +tags +# Visual Studio built-in CMake configuration +/CMakeSettings.json +# CLion project configuration +/.idea + +#==============================================================================# +# Directories to ignore (do not add trailing '/'s, they skip symlinks). +#==============================================================================# +# Sphinx build tree, if building in-source dir. +docs/_build +docs/autogenerated +# VS2017 and VSCode config files. +.vscode +.vs +# clangd index +.clangd diff --git a/IGC/VectorCompiler/CMakeLists.txt b/IGC/VectorCompiler/CMakeLists.txt new file mode 100644 index 000000000000..44ae6a41fb4a --- /dev/null +++ b/IGC/VectorCompiler/CMakeLists.txt @@ -0,0 +1,158 @@ +#===================== begin_copyright_notice ================================== + +#Copyright (c) 2017 Intel Corporation + +#Permission is hereby granted, free of charge, to any person obtaining a +#copy of this software and associated documentation files (the +#"Software"), to deal in the Software without restriction, including +#without limitation the rights to use, copy, modify, merge, publish, +#distribute, sublicense, and/or sell copies of the Software, and to +#permit persons to whom the Software is furnished to do so, subject to +#the following conditions: + +#The above copyright notice and this permission notice shall be included +#in all copies or substantial portions of the Software. 
+ +#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +#OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +#MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +#IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +#CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +#TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +#SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +#======================= end_copyright_notice ================================== + + + +# CMake Settings: +# - SPIRV_PREBUILD_DIR +# - SPIRV_SRC +# - INSTALL_SPIRVDLL +# - VC_INTRINSICS_SRC +set(IGC_BUILD__PROJ__VectorCompiler "${IGC_BUILD__PROJ_NAME_PREFIX}VectorCompiler") +set(IGC_BUILD__PROJ__VectorCompiler "${IGC_BUILD__PROJ__VectorCompiler}" PARENT_SCOPE) +set(IGC_BUILD__PROJ_VC_LIBS_TO_LINK VCCodeGen PARENT_SCOPE) + +set(IGC_BUILD__PROJ_LABEL__VectorCompiler "${IGC_BUILD__PROJ__VectorCompiler}") + +message(STATUS "+++ Source/IGC/VectorCompiler +++") +message(STATUS "[VC] Build proj: ${IGC_BUILD__PROJ__VectorCompiler}") + + +igc_arch_get_cpu(_cpuSuffix) + +set(BUILD_EXTERNAL YES) + +# --- LLVM --- +if(IGC_OPTION__FORCE_SYSTEM_LLVM OR (WIN32 AND LLVM_USE_PREBUILT)) + message(STATUS "[VC] Using system llvm") + + # Need to search for llvm-tblgen + find_program(LLVM_TABLEGEN_EXE "llvm-tblgen" + ${LLVM_TOOLS_BINARY_DIR} + NO_DEFAULT_PATH + ) + if(LLVM_TABLEGEN_EXE-NOTFOUND) + message(FATAL_ERROR "[VC] llvm-tblgen is not found") + endif() + message(STATUS "[VC] Found tblgen: ${LLVM_TABLEGEN_EXE}") + + # find_package was called by igc cmake, no need to do it again. +else() + # Prebuilt llvm does not have tblgen... + if(LLVM_USE_PREBUILT) + message(FATAL_ERROR "[VC] vector compiler with prebuilt llvm is not supported") + endif() + + # In last scenario we are building with llvm so every target is defined + # and LLVMConfig will only set needed variables. + message(STATUS "[VC] Using llvm source build") + set(LLVM_BUILD_DIR "${LLVM_SOURCE_DIR}/../build/src") + set(LLVM_CMAKE_DIR "${LLVM_BUILD_DIR}/lib/cmake/llvm") + message(STATUS "[VC] LLVMConfig.cmake location: ${LLVM_CMAKE_DIR}") + find_package(LLVM REQUIRED + HINTS ${LLVM_CMAKE_DIR} + NO_DEFAULT_PATH + ) + + # We have executable target, use it. + set(LLVM_TABLEGEN_EXE "llvm-tblgen") + message(STATUS "[VC] Using executable target llvm-tlbgen for tablegenning") + # IGC has its own special cmake for external llvm. + # It sets LLVM_INCLUDE_DIRS instead of LLVM_INCLUDE_DIR. + set(LLVM_INCLUDE_DIR ${LLVM_INCLUDE_DIRS}) +endif() + +# Now find_package was called in all cases and we have all needed variables. +set(CMAKE_MODULE_PATH + ${LLVM_CMAKE_DIR} + ${CMAKE_MODULE_PATH} + ) + +cmake_policy(SET CMP0057 NEW) +# cm offline compiler requires -rdynamic flag to be absent +cmake_policy(SET CMP0065 NEW) + +include(AddLLVM) + +set(LLVM_MAIN_INCLUDE_DIR ${LLVM_INCLUDE_DIR}) +include(TableGen) +# Set LLVM_TABLEGEN_FLAGS manually based on include dirs. +list(TRANSFORM LLVM_INCLUDE_DIR PREPEND "-I=" OUTPUT_VARIABLE LLVM_TABLEGEN_FLAGS) + +message(STATUS "[VC] Including llvm headers: ${LLVM_INCLUDE_DIR}") +include_directories(${LLVM_INCLUDE_DIR}) + +# --- VISA --- + +# HACK. We should use only visa/include without visa internal headers. 
+set(VISA_INCLUDE_DIRS ${IGC_BUILD__VISA_DIR}) + +# --- VC Intrinsics --- + +add_compile_definitions(LLVM_VERSION_MAJOR=${LLVM_VERSION_MAJOR}) + +if(LLVM_ON_WIN32) + add_compile_options(/experimental:external) + foreach(INCDIR ${LLVM_INCLUDE_DIRS}) + add_compile_options("SHELL:/external:I ${INCDIR}") + endforeach() + add_compile_options(/external:W0) + + # disable 32/64 warnings + add_compile_options(/wd4244) + + # disable unary minus to unsigned type warning + add_compile_options(/wd4146) + + # disable implicitly deleted dtor warning + add_compile_options(/wd4624) +endif() + +if(DEFINED VC_INTRINSICS_SRC) + set(INTRSRC "${VC_INTRINSICS_SRC}/GenXIntrinsics") +endif() + +if(NOT DEFINED INTRSRC) + set(INTRSRC "${CMAKE_CURRENT_SOURCE_DIR}/../../../vc-intrinsics/GenXIntrinsics") +endif() + +message(STATUS "[VC] Using vc-intrinsics source from: ${INTRSRC}") + +# We are using prebuilt SPIRV and building intrinsics. +set(INTRBUILD "${CMAKE_CURRENT_BINARY_DIR}/intrbuild") + + +# Do not copy anything from prebuilts. libSPIRVDLL.so will be dynamically loaded at runtime. +add_subdirectory(${INTRSRC} ${INTRBUILD}) +include_directories(${INTRSRC}/include ${INTRBUILD}/include) + +include(cmake/spirv.cmake) + +# --- VC Opt --- + +add_subdirectory(include) +add_subdirectory(lib) + diff --git a/IGC/VectorCompiler/cmake/spirv.cmake b/IGC/VectorCompiler/cmake/spirv.cmake new file mode 100644 index 000000000000..8ebe5c8a1c11 --- /dev/null +++ b/IGC/VectorCompiler/cmake/spirv.cmake @@ -0,0 +1,180 @@ +# +# Creates `target_branch` starting at the `base_revision` in the `repo_dir`. +# Then all patches from the `patches_dir` are committed to the `target_branch`. +# Does nothing if the `target_branch` is already checked out in the `repo_dir`. +# +function(apply_patches repo_dir patches_dir base_revision target_branch) + file(GLOB patches ${patches_dir}/*.patch) + if(NOT patches) + message(STATUS "No patches in ${patches_dir}") + return() + endif() + + if(NOT DEFINED GIT_EXECUTABLE) + find_program(GIT_EXECUTABLE git) + endif() + + message(STATUS "[VC] ${repo_dir}:") + # Check if the target branch already exists + execute_process( + COMMAND ${GIT_EXECUTABLE} rev-parse --verify --no-revs -q ${target_branch} + WORKING_DIRECTORY ${repo_dir} + RESULT_VARIABLE patches_needed + ) + # Set up fake username just in case if we don't have one globally + execute_process( + COMMAND ${GIT_EXECUTABLE} config --local user.name "patcher" + WORKING_DIRECTORY ${repo_dir} + ) + execute_process( + COMMAND ${GIT_EXECUTABLE} config --local user.email "patcher@intel.com" + WORKING_DIRECTORY ${repo_dir} + ) + if(patches_needed) # The target branch doesn't exist + list(SORT patches) + execute_process( # Create the target branch + COMMAND ${GIT_EXECUTABLE} checkout -b ${target_branch} ${base_revision} + WORKING_DIRECTORY ${repo_dir} + ) + execute_process( # Apply the pathces + COMMAND ${GIT_EXECUTABLE} am --3way --ignore-whitespace ${patches} + WORKING_DIRECTORY ${repo_dir} + ) + else() # The target branch already exists + execute_process( # Check it out + COMMAND ${GIT_EXECUTABLE} checkout ${target_branch} + WORKING_DIRECTORY ${repo_dir} + ) + endif() +endfunction() + +# User may switch spirv dll installation off +if(NOT DEFINED INSTALL_SPIRVDLL) + set(INSTALL_SPIRVDLL 1) +endif() + +# Handle installation of SPIRVDLL. +# Currently, release build of spirvdll is used to read spirv. +# For debugging, one has to build debug version locally and replace release library. 
+if(INSTALL_SPIRVDLL) +if(NOT DEFINED SPIRV_PREBUILD_DIR AND NOT WIN32) +include(ExternalProject) +set(SPIRV_COPY "${CMAKE_CURRENT_BINARY_DIR}/llvm-spirv-vc") +if(DEFINED SPIRV_SRC) + if(NOT EXISTS ${SPIRV_SRC}) + message(FATAL_ERROR "[VC] Cannot find SPIRVDLL sources in ${SPIRV_SRC}") + endif() + set(SPIRV_SOURCES ${SPIRV_SRC}) +else() + set(SPIRV_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../../llvm-project/llvm/projects/llvm-spirv") + if(NOT EXISTS ${SPIRV_SOURCES}) + message(STATUS "[VC] Cannot find SPIRVDLL sources in ${SPIRV_SOURCES}") + set(SPIRV_SOURCES "${CMAKE_CURRENT_SOURCE_DIR}/../../../llvm-spirv") + endif() + if(NOT EXISTS ${SPIRV_SOURCES}) + message(FATAL_ERROR "[VC] Cannot find SPIRVDLL sources in ${SPIRV_SOURCES}") + endif() +endif() + +set(SPIRV_REV_PATCH e87b59a77abb30d3b5fb0b3e0555a39acbe5ebb4) +set(SPRIV_PATCHES ${CMAKE_CURRENT_SOURCE_DIR}/spirv-patches-new/) +set(SPRIV_BRANCH_PATCH spirvdll_100) +find_program(MAKE_EXEC NAMES make gmake) + +if(NOT EXISTS ${SPIRV_COPY}) + message(STATUS "[VC] : Copying stock SPIRV-Translator sources to ${SPIRV_COPY}") + execute_process(COMMAND ${CMAKE_COMMAND} -E copy_directory ${SPIRV_SOURCES} ${SPIRV_COPY}) +endif() + +apply_patches(${SPIRV_COPY} +${SPRIV_PATCHES} +${SPIRV_REV_PATCH} +${SPRIV_BRANCH_PATCH} +) + +if(IGC_OPTION__FORCE_SYSTEM_LLVM) + + ExternalProject_Add(SPIRVDLL_EX + PREFIX ${CMAKE_CURRENT_BINARY_DIR}/SPIRVDLL + SOURCE_DIR ${SPIRV_COPY} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_BINARY_DIR}/spirv-install + BUILD_COMMAND ${MAKE_EXEC} SPIRVDLL + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/spirv-install + ) + +else() + + ExternalProject_Add(SPIRVDLL_EX + PREFIX ${CMAKE_CURRENT_BINARY_DIR}/SPIRVDLL + SOURCE_DIR ${SPIRV_COPY} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${CMAKE_CURRENT_BINARY_DIR}/spirv-install -DLLVM_DIR=${LLVM_DIR} + BUILD_COMMAND ${MAKE_EXEC} SPIRVDLL + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR}/spirv-install + ) + + add_dependencies(SPIRVDLL_EX VCCodeGen) + +endif(IGC_OPTION__FORCE_SYSTEM_LLVM) + +install(FILES + ${CMAKE_CURRENT_BINARY_DIR}/spirv-install/lib/libSPIRVDLL.so + DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR} + COMPONENT igc-core +) + +elseif(NOT TARGET SPIRVDLL) + if(DEFINED WIN32) + set(SPIRVDLL_NAME "SPIRVDLL.dll") + else() + set(SPIRVDLL_NAME "libSPIRVDLL.so") + endif() + if(DEFINED SPIRV_PREBUILD_DIR) + set(PREBUILT_SPIRVDLL_PATH "${SPIRV_PREBUILD_DIR}/lib" ) + endif() + find_file(SPIRVDLL_LIB + ${SPIRVDLL_NAME} + PATHS ${PREBUILT_SPIRVDLL_PATH} + NO_DEFAULT_PATH + ) + if(NOT SPIRVDLL_LIB) + message(FATAL_ERROR "[VC] Cannot find SPIRVDLL in prebuilds") + endif() + message(STATUS "[VC] Found SPIRVDLL: ${SPIRVDLL_LIB}") + if(WIN32) + if ("${vc_uses_custom_spirv}" STREQUAL "True") + set(INSTALL_SPRIRVDLL_NAME "SPIRVDLL.dll") + if("${_cpuSuffix}" STREQUAL "32") + set(INSTALL_SPRIRVDLL_NAME "SPIRVDLL32.dll") + endif() + install(FILES ${SPIRVDLL_LIB} + CONFIGURATIONS Debug Release + DESTINATION $/lh64 + RENAME ${INSTALL_SPRIRVDLL_NAME} + ) + install(FILES ${SPIRVDLL_LIB} + CONFIGURATIONS ReleaseInternal + DESTINATION Release-Internal/lh64 + RENAME ${INSTALL_SPRIRVDLL_NAME} + ) + endif() + else() + install(FILES + ${SPIRVDLL_LIB} + DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR} + COMPONENT igc-core + ) + endif() +else() + get_target_property(SPIRVDLL_IMPORTED SPIRVDLL IMPORTED) + if(SPIRVDLL_IMPORTED) + message(STATUS "[VC] SPIRVDLL is already imported") + else() + message(STATUS "[VC] SPIRVDLL will be built in-tree") + install(FILES + $ + DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR} + COMPONENT igc-core + ) + endif() 
+endif() +endif(INSTALL_SPIRVDLL) diff --git a/IGC/VectorCompiler/include/CMakeLists.txt b/IGC/VectorCompiler/include/CMakeLists.txt new file mode 100644 index 000000000000..d3d27fce5e6a --- /dev/null +++ b/IGC/VectorCompiler/include/CMakeLists.txt @@ -0,0 +1,18 @@ +# Special common target for headers that propagates +# needed include directories and dependencies. +add_library(VCHeaders INTERFACE) + +target_include_directories(VCHeaders + INTERFACE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_BINARY_DIR} + ) + +add_dependencies(VCHeaders + intrinsics_gen + GenXIntrinsicsGen + VectorCompilerOptions + ) + +# Additional things like header generators. +add_subdirectory(vc) diff --git a/IGC/VectorCompiler/include/vc/CMakeLists.txt b/IGC/VectorCompiler/include/vc/CMakeLists.txt new file mode 100644 index 000000000000..fc23e64eeb7a --- /dev/null +++ b/IGC/VectorCompiler/include/vc/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(Support) diff --git a/IGC/VectorCompiler/include/vc/GenXCodeGen/GenXTarget.h b/IGC/VectorCompiler/include/vc/GenXCodeGen/GenXTarget.h new file mode 100644 index 000000000000..9d0ada5bffa6 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXCodeGen/GenXTarget.h @@ -0,0 +1,42 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#ifndef LLVM_GENX_TARGET_INITIALIZERS_H +#define LLVM_GENX_TARGET_INITIALIZERS_H + +extern "C" void LLVMInitializeGenXTargetInfo(); +extern "C" void LLVMInitializeGenXTarget(); +extern "C" void LLVMInitializeGenXTargetMC(); + +namespace llvm { +void initializeGenX() { + LLVMInitializeGenXTargetInfo(); + LLVMInitializeGenXTarget(); + LLVMInitializeGenXTargetMC(); +} +} // namespace llvm + +#endif diff --git a/IGC/VectorCompiler/include/vc/GenXCodeGen/GenXWrapper.h b/IGC/VectorCompiler/include/vc/GenXCodeGen/GenXWrapper.h new file mode 100644 index 000000000000..85c6b7109efa --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXCodeGen/GenXWrapper.h @@ -0,0 +1,142 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#pragma once + +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace vc { + +namespace ocl { + +enum class ArgKind { + General, + LocalSize, // IMPLICIT_LOCAL_SIZE + GroupCount, // IMPLICIT_NUM_GROUPS + Buffer, // 1D buffer + SVM, // stateless global pointer + Sampler, + Image1d, + Image2d, + Image3d, + PrintBuffer, + PrivateBase +}; + +enum class ArgAccessKind { None, ReadOnly, WriteOnly, ReadWrite }; + +struct ArgInfo { + ArgKind Kind; + ArgAccessKind AccessKind; + unsigned Index; + unsigned Offset; + unsigned SizeInBytes; + unsigned BTI; +}; + +struct TableInfo { + void *Buf = nullptr; + uint32_t Size = 0; + uint32_t NumEntries = 0; +}; + +// Mirror of cmc_kernel_info that owns its data. 
+struct KernelInfo { + std::string Name; + std::vector Args; + std::vector PrintStrings; + bool HasGroupID; + bool HasBarriers; + bool HasReadWriteImages; + unsigned SLMSize; + unsigned ThreadPrivateMemSize; + unsigned StatelessPrivateMemSize; + unsigned GRFSizeInBytes; + + TableInfo RelocationTable; + TableInfo SymbolTable; +}; + + +struct CompileInfo { + KernelInfo KernelInfo; + FINALIZER_INFO JitInfo; + std::string GenBinary; +}; + +struct CompileOutput { + std::vector Kernels; + unsigned PointerSizeInBytes; +}; + +} // namespace ocl + +namespace cm { +struct CompileOutput { + std::string IsaBinary; +}; +} // namespace cm + +using CompileOutput = std::variant; + +enum class FileType { + SPIRV, SOURCE +}; + +enum class OptimizerLevel { None, Full }; + +enum class RuntimeKind { CM, OpenCL }; + +struct CompileOptions { + FileType FType = FileType::SPIRV; + std::string CPUStr; + std::unique_ptr WATable = nullptr; + + // Api accessible options. + bool NoVecDecomp = false; + OptimizerLevel OptLevel = OptimizerLevel::Full; + + // Internal options. + RuntimeKind Runtime = RuntimeKind::OpenCL; + bool DumpIsa = false; + bool DumpIR = false; +}; + +llvm::Expected Compile(llvm::ArrayRef Input, + const CompileOptions &Opts); + +llvm::Expected ParseOptions(llvm::StringRef ApiOptions, + llvm::StringRef InternalOptions); +} // namespace vc diff --git a/IGC/VectorCompiler/include/vc/GenXOpts/GenXAnalysis.h b/IGC/VectorCompiler/include/vc/GenXOpts/GenXAnalysis.h new file mode 100644 index 000000000000..ab271556deda --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXOpts/GenXAnalysis.h @@ -0,0 +1,79 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file declares some hooks that are injected into llvm analysis library +// files to make them work with genx related stuff. +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_GENX_ANALYSIS_H +#define LLVM_GENX_ANALYSIS_H + +namespace llvm { + +template class ArrayRef; +class CallInst; +class Constant; +class DataLayout; +class Instruction; +class ImmutableCallSite; +class Type; +class Use; +class Value; + +/// canConstantFoldGenXIntrinsic - Return true if it is even possible to fold +/// a call to the specified GenX intrinsic. 
+bool canConstantFoldGenXIntrinsic(unsigned IID); + +/// ConstantFoldGenXIntrinsic - Attempt to constant fold a call to the +/// specified GenX intrinsic with the specified arguments, returning null if +/// unsuccessful. +Constant *ConstantFoldGenXIntrinsic(unsigned IID, Type *RetTy, + ArrayRef Operands, + ImmutableCallSite CS, const DataLayout *DL); + +/// ConstantFoldGenX - Attempt to constant fold genx-related instruction (intrinsic). +/// This function tries to fold operands and then tries to fold instruction +/// itself. Returns nullptr if folding was unsuccessful. +Constant *ConstantFoldGenX(Instruction *I, const DataLayout &DL); + +/// Given a GenX intrinsic and a set of arguments, see if we can fold the +/// result. +/// +/// If this call could not be simplified returns null. +Value *SimplifyGenXIntrinsic(unsigned IID, Type *RetTy, Use *ArgBegin, + Use *ArgEnd); + +/// Given a GenX related intruction, see if we can fold the +/// result. This function tries simplification and then constant folding. +/// +/// If this instruction could not be simplified returns null. +Value *SimplifyGenX(CallInst *I); + +} // end namespace llvm + +#endif diff --git a/IGC/VectorCompiler/include/vc/GenXOpts/GenXOpts.h b/IGC/VectorCompiler/include/vc/GenXOpts/GenXOpts.h new file mode 100644 index 000000000000..b67804c2f775 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXOpts/GenXOpts.h @@ -0,0 +1,74 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This header file defines prototypes for accessor functions that expose passes +// in the GenX transformations library. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_GENX_OPTS_H +#define LLVM_GENX_OPTS_H + +namespace llvm { + +class FunctionPass; +class ModulePass; +class Pass; + +//===----------------------------------------------------------------------===// +// +// CMImpParam - Transforms to enable implicit parameters +// +Pass *createCMImpParamPass(bool); + +//===----------------------------------------------------------------------===// +// +// CMKernelArgOffset - Determine offset of each CM kernel argument +// +Pass *createCMKernelArgOffsetPass(unsigned GrfByteSize, bool OCLCodeGen); + +//===----------------------------------------------------------------------===// +// +// CMABI - Fix ABI issues for the genx backend. +// +Pass *createCMABIPass(); + +//===----------------------------------------------------------------------===// +// +// CMLowerVLoadVStore - Lower CM reference loads and stores. +// +Pass *createCMLowerVLoadVStorePass(); + +FunctionPass *createGenXReduceIntSizePass(); +FunctionPass *createGenXRegionCollapsingPass(); +FunctionPass *createGenXSimplifyPass(); +FunctionPass *createGenXLayoutBlocksPass(); +FunctionPass *createGenXLowerAggrCopiesPass(); + +ModulePass *createGenXPacketizePass(); +} // End llvm namespace + +#endif diff --git a/IGC/VectorCompiler/include/vc/GenXOpts/Utils/CMRegion.h b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/CMRegion.h new file mode 100644 index 000000000000..421cbbd631d4 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/CMRegion.h @@ -0,0 +1,237 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// CMRegion : region information +/// ------------------------------- +/// +/// An object of class CMRegion describes the region parameters of a Gen region. +/// It is a transient object, in that a pass constructs it as needed and then +/// forgets it. It does not persist between passes, as the region parameters are +/// fully described by the arguments to the rdregion and wrregion intrinsics. 
+/// +/// The region parameters in a CMRegion are: +/// +/// * ElementBytes : number of bytes per element +/// * ElementTy : Type of element +/// * NumElements : total number of elements in the region (number of rows is +/// thus NumElements/Width) +/// * VStride : vertical stride in elements +/// * Width : row width in elements +/// * Stride : horizontal stride in elements +/// * Offset : constant part of offset +/// * Indirect : variable index (nullptr for direct region, scalar value for +/// single indirect, vector value for multi indirect) +/// * IndirectIdx : start index in vector indirect. This is always 0 when +/// constructing a CMRegion, but can be set to a non-zero value before +/// calling a method to create a new rdregion/wrregion intrinsic +/// * IndirectAddrOffset : offset from the address value where region +/// origin starts +/// * Mask : mask (predicate) for wrregion, nullptr if none +/// * ParentWidth : the parent width value (a statement that no row crosses a +/// boundary of a multiple of this number of elements) +/// +/// There are the following constructors: +/// +/// * Construct from a Type or Value, setting the GenXRegion to a region that +/// covers the whole value. +/// * Construct from a rdregion/wrregion intrinsic, setting the GenXRegion to the +/// region described by the intrinsic. +/// * Construct from a bitmap of which elements need to be in the region. This +/// is used from GenXConstants when constructing a splat region when loading +/// a constant in multiple stages. +/// +/// CMRegion is not used to represent the region parameters in predicate regions, +/// since they are much simpler. But GenXRegion does contain static methods to create +/// rdpredregion etc intrinsics given the predicate region parameters. +/// +//===----------------------------------------------------------------------===// + +#ifndef CMREGION_H +#define CMREGION_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" + +namespace llvm { + +class Constant; +class DataLayout; +class Value; +class Function; +class Module; +class Type; +class Instruction; +class raw_ostream; +class Twine; +class DebugLoc; +class TargetLibraryInfo; + +// CMRegion : description of an operand's region +class CMRegion { +public: + unsigned ElementBytes; + Type *ElementTy; + unsigned NumElements; + int VStride; + unsigned Width; + int Stride; + int Offset; + Value *Indirect; + unsigned IndirectIdx; // start index in vector Indirect + unsigned IndirectAddrOffset; + Value *Mask; // 0 else mask for wrregion + unsigned ParentWidth; // 0 else parent width + // Default constructor: assume single element + CMRegion() + : ElementBytes(0), ElementTy(0), NumElements(1), VStride(1), Width(1), + Stride(1), Offset(0), Indirect(0), IndirectIdx(0), + IndirectAddrOffset(0), Mask(0), ParentWidth(0) {} + // Construct from a type. + CMRegion(Type *Ty, const DataLayout *DL = nullptr); + // Construct from a value. + CMRegion(Value *V, const DataLayout *DL = nullptr); + // Construct from a rd/wr region/element + CMRegion(Instruction *Inst, bool WantParentWidth = false); + // Construct from a bitmap of which elements to set (legal 1D region) + CMRegion(unsigned Bits, unsigned ElementBytes); + // Create rdregion intrinsic from this Region + // Returns a scalar if the Region has one element and AllowScalar is true. + // Otherwise returns a vector. 
+ Instruction *createRdRegion(Value *Input, const Twine &Name, + Instruction *InsertBefore, const DebugLoc &DL, + bool AllowScalar = false); + // Modify Region object for a subregion + void getSubregion(unsigned StartIdx, unsigned Size); + // Create wrregion intrinsic from this Region + Value *createWrRegion(Value *OldVal, Value *Input, const Twine &Name, + Instruction *InsertBefore, const DebugLoc &DL); + // Create wrconstregion intrinsic from this Region + Value *createWrConstRegion(Value *OldVal, Value *Input, const Twine &Name, + Instruction *InsertBefore, const DebugLoc &DL); + // Create rdpredregion from given start index and size + static Instruction *createRdPredRegion(Value *Input, unsigned Index, + unsigned Size, const Twine &Name, + Instruction *InsertBefore, + const DebugLoc &DL); + static Value *createRdPredRegionOrConst(Value *Input, unsigned Index, + unsigned Size, const Twine &Name, + Instruction *InsertBefore, + const DebugLoc &DL); + // Create wrpredregion from given start index + static Instruction *createWrPredRegion(Value *OldVal, Value *Input, + unsigned Index, const Twine &Name, + Instruction *InsertBefore, + const DebugLoc &DL); + // Create wrpredpredregion from given start index + static Instruction *createWrPredPredRegion(Value *OldVal, Value *Input, + unsigned Index, Value *Pred, + const Twine &Name, + Instruction *InsertBefore, + const DebugLoc &DL); + // Set the called function in an intrinsic call + static void setRegionCalledFunc(Instruction *Inst); + // Compare two regions to see if they have the same region parameters other + // than start offset (not allowing element type to be different). + bool isStrictlySimilar(const CMRegion &R2) const { + return VStride == R2.VStride && Width == R2.Width && Stride == R2.Stride && + Mask == R2.Mask; + } + // Compare two regions to see if they have the same region parameters other + // than start offset (also allowing element type to be different). + bool isSimilar(const CMRegion &R2) const; + // Compare two regions to see if they have the same region parameters (also + // allowing element type to be different). + bool operator==(const CMRegion &R2) const { + return isSimilar(R2) && Offset == R2.Offset && Indirect == R2.Indirect + && IndirectIdx == R2.IndirectIdx; + } + bool operator!=(const CMRegion &R2) const { return !(*this == R2); } + // Compare two regions to see if they overlaps each other. + bool overlap(const CMRegion &R2) const; + // Test whether a region is scalar + bool isScalar() const { + return !Stride && (Width == NumElements || !VStride); + } + // Test whether a region is 2D + bool is2D() const { return !isScalar() && Width != NumElements; } + // Test whether a region is contiguous. + bool isContiguous() const; + // Test whether a region covers exactly the whole of the given type, allowing + // for the element type being different. + bool isWhole(Type *Ty) const; + // Test whether the region has a whole number of rows. (append() can result + // in a region with an incomplete final row, which is normally not allowed.) + bool isWholeNumRows() const { return !(NumElements % Width); } + // Evaluate rdregion with constant input. 
+ Constant *evaluateConstantRdRegion(Constant *Input, bool AllowScalar); + // evaluateConstantWrRegion : evaluate wrregion with constant inputs + Constant *evaluateConstantWrRegion(Constant *OldVal, Constant *NewVal); + // append : append region AR to this region + bool append(CMRegion AR); + // changeElementType : change the element type of the region + bool changeElementType(Type *NewElementType); + // Debug dump/print + void dump() const; + void print(raw_ostream &OS) const; + // Check whether the region is multi indirect. Returns true if Indirect has + // VectorType (a sign of multi indirection) + bool isMultiIndirect() const { + return Indirect && isa(Indirect->getType()); + } + // Get bit mask in which ones values represent bytes which + // were accessed by this region + BitVector getAccessBitMap(int MinTrackingOffset = 0) const; + // Length of single row in bytes + unsigned getRowLength() const { + return Stride ? (Width * Stride * ElementBytes) : ElementBytes; + } + // Length of whole region in bytes + unsigned getLength() const { + return VStride * ((NumElements / Width) - 1) * ElementBytes + + getRowLength(); + } + +protected: + // Create wrregion or wrconstregion intrinsic from this Region + Value *createWrCommonRegion(GenXIntrinsic::ID, Value *OldVal, Value *Input, + const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL); + // Get the function declaration for a region intrinsic + static Function *getGenXRegionDeclaration(Module *M, GenXIntrinsic::ID IID, Type *RetTy, + ArrayRef Args); + // Get (or create instruction for) the start index of a region. + Value *getStartIdx(const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL); +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const CMRegion &R) { + R.print(OS); + return OS; +} + +} // end namespace llvm + +#endif /* CMREGION_H */ diff --git a/IGC/VectorCompiler/include/vc/GenXOpts/Utils/GenXSTLExtras.h b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/GenXSTLExtras.h new file mode 100644 index 000000000000..b7a404dabb89 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/GenXSTLExtras.h @@ -0,0 +1,80 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#ifndef LLVM_GENXOPTS_UTILS_GENXSTLEXTRAS_H +#define LLVM_GENXOPTS_UTILS_GENXSTLEXTRAS_H + +#include + +namespace llvm { +namespace genx { + +namespace ranges { + +template +using iterator_t = decltype(std::begin(std::declval())); + +template +using range_pointer_t = + typename std::iterator_traits>::pointer; + +template +using range_reference_t = + typename std::iterator_traits>::reference; + +} // namespace ranges + +/* Returns the first iterator (let's name it RetIt) such that + * std::accumulate(First, RetIt, 0) > Bound (not full truth, read below). + * + * Arguments: + * \p First, \p Last - considered range + * \p Bound - considered Bound + * \p Op - functor that returns T, takes T and decltype(*First) + * respectively as arguments. It is meant to increment current partial sum. + * First argument is previous partial sum, second argument is upcoming value + * from the range, new partial sum is returned. + * + * Arguments of \p PlusEqualOp may not be equal, so the range may possibly point + * not to T type. In this case partial sum is calculated for transformed range + * (transformation is hidden in \p Op). + */ +template +ForwardIt upper_partial_sum_bound(ForwardIt First, ForwardIt Last, T Bound, + PlusEqualOp Op) { + T CurSum = 0; + for (; First != Last; ++First) { + CurSum = Op(CurSum, *First); + if (CurSum > Bound) + return First; + } + return Last; +} + +} // namespace genx +} // namespace llvm + +#endif // LLVM_GENXOPTS_UTILS_GENXSTLEXTRAS_H diff --git a/IGC/VectorCompiler/include/vc/GenXOpts/Utils/KernelInfo.h b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/KernelInfo.h new file mode 100644 index 000000000000..b6b5876c97ac --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/KernelInfo.h @@ -0,0 +1,356 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef GENX_KERNEL_INFO_H +#define GENX_KERNEL_INFO_H + +#include "vc/GenXOpts/Utils/RegCategory.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" + +namespace llvm { +namespace genx { + +enum { VISA_MAJOR_VERSION = 3, VISA_MINOR_VERSION = 6 }; + +// Utility function to tell whether a Function is a vISA kernel. 
+inline bool isKernel(const Function *F) { + // We use DLLExport to represent a kernel in LLVM IR. + return (F->hasDLLExportStorageClass() || + F->hasFnAttribute(genx::FunctionMD::CMGenXMain)); +} + +// Turn a MDNode into llvm::value or its subclass. +// Return nullptr if the underlying value has type mismatch. +template Ty *getValueAsMetadata(Metadata *M) { + if (auto VM = dyn_cast(M)) + if (auto V = dyn_cast(VM->getValue())) + return V; + return nullptr; +} + +/// KernelMetadata : class to parse kernel metadata +class KernelMetadata { + Function *F = nullptr; + bool IsKernel = false; + StringRef Name; + unsigned SLMSize = 0; + SmallVector ArgKinds; + SmallVector ArgOffsets; + SmallVector ArgIOKinds; + SmallVector ArgTypeDescs; + // Assign a BTI value to a surface or sampler, OCL path only. + // Given buffer x, --> UAV + // read_only image --> SRV + // write_only or read_write image --> UAV + // + // First assign SRV then UAV resources. + SmallVector BTIs; + +public: + // default constructor + KernelMetadata() {} + + /* + * KernelMetadata constructor + * + * Enter: F = Function that purports to be a CM kernel + * + */ + KernelMetadata(Function *F) { + if (!genx::isKernel(F)) + return; + NamedMDNode *Named = + F->getParent()->getNamedMetadata(genx::FunctionMD::GenXKernels); + if (!Named) + return; + + MDNode *Node = nullptr; + for (unsigned i = 0, e = Named->getNumOperands(); i != e; ++i) { + if (i == e) + return; + Node = Named->getOperand(i); + if (Node->getNumOperands() > KernelMDOp::ArgTypeDescs && + getValueAsMetadata(Node->getOperand(KernelMDOp::FunctionRef)) == F) + break; + } + if (!Node) + return; + + // Node is the metadata node for F, and it has the required number of + // operands. + this->F = F; + IsKernel = true; + if (MDString *MDS = dyn_cast(Node->getOperand(KernelMDOp::Name))) + Name = MDS->getString(); + if (ConstantInt *Sz = getValueAsMetadata(Node->getOperand(KernelMDOp::SLMSize))) + SLMSize = Sz->getZExtValue(); + // Build the argument kinds and offsets arrays that should correspond to the + // function arguments (both explicit and implicit) + MDNode *KindsNode = dyn_cast(Node->getOperand(KernelMDOp::ArgKinds)); + MDNode *OffsetsNode = dyn_cast(Node->getOperand(KernelMDOp::ArgOffsets)); + MDNode *InputOutputKinds = dyn_cast(Node->getOperand(KernelMDOp::ArgIOKinds)); + MDNode *ArgDescNode = dyn_cast(Node->getOperand(KernelMDOp::ArgTypeDescs)); + + assert(KindsNode); + + for (unsigned i = 0, e = KindsNode->getNumOperands(); i != e; ++i) { + ArgKinds.push_back( + getValueAsMetadata(KindsNode->getOperand(i)) + ->getZExtValue()); + if (OffsetsNode == nullptr) + ArgOffsets.push_back(0); + else { + assert(OffsetsNode->getNumOperands() == e && "out of sync"); + ArgOffsets.push_back( + getValueAsMetadata(OffsetsNode->getOperand(i)) + ->getZExtValue()); + } + } + assert(InputOutputKinds && + KindsNode->getNumOperands() >= InputOutputKinds->getNumOperands()); + for (unsigned i = 0, e = InputOutputKinds->getNumOperands(); i != e; ++i) + ArgIOKinds.push_back( + getValueAsMetadata(InputOutputKinds->getOperand(i)) + ->getZExtValue()); + assert(ArgDescNode); + for (unsigned i = 0, e = ArgDescNode->getNumOperands(); i < e; ++i) { + MDString *MDS = dyn_cast(ArgDescNode->getOperand(i)); + assert(MDS); + ArgTypeDescs.push_back(MDS->getString()); + } + } + // Accessors + bool isKernel() const { return IsKernel; } + StringRef getName() const { return Name; } + unsigned getSLMSize() const { return SLMSize; } + ArrayRef getArgKinds() const { return ArgKinds; } + unsigned getNumArgs() const { 
return ArgKinds.size(); } + unsigned getArgKind(unsigned Idx) const { return ArgKinds[Idx]; } + StringRef getArgTypeDesc(unsigned Idx) const { + if (Idx >= ArgTypeDescs.size()) + return ""; + return ArgTypeDescs[Idx]; + } + + enum { AK_NORMAL, AK_SAMPLER, AK_SURFACE, AK_VME }; + unsigned getArgCategory(unsigned Idx) const { + switch (getArgKind(Idx) & 7) { + case AK_SAMPLER: + return RegCategory::SAMPLER; + case AK_SURFACE: + return RegCategory::SURFACE; + case AK_VME: + return RegCategory::VME; + default: + return RegCategory::GENERAL; + } + } + + // check if an argument is annotated with attribute "buffer_t". + bool isBufferType(unsigned Idx) const { + return (getArgTypeDesc(Idx).find_lower("buffer_t") != StringRef::npos && + getArgTypeDesc(Idx).find_lower("image1d_buffer_t") == StringRef::npos); + } + + // check if an argument is annotated with attribute "image{1,2,3}d_t". + bool isImageType(unsigned Idx) const { + return getArgTypeDesc(Idx).find_lower("image1d_t") != StringRef::npos || + getArgTypeDesc(Idx).find_lower("image2d_t") != StringRef::npos || + getArgTypeDesc(Idx).find_lower("image3d_t") != StringRef::npos || + getArgTypeDesc(Idx).find_lower("image1d_buffer_t") != StringRef::npos; + } + + int32_t getBTI(unsigned Index) { + if (BTIs.empty()) + computeBTIs(); + assert(Index < BTIs.size()); + return BTIs[Index]; + } + + enum { + // Reserved surface indices start from 253, see GenXCodeGen/GenXVisa.h + // TODO: consider adding a dependency from GenXCodeGen and extract + // "252" from there + K_MaxAvailableBtiIndex = 252 + }; + // Assign BTIs lazily. + void computeBTIs() { + unsigned SurfaceID = 0; + unsigned SamplerID = 0; + auto Desc = ArgTypeDescs.begin(); + // Assign SRV and samplers. + for (auto Kind = ArgKinds.begin(); Kind != ArgKinds.end(); ++Kind) { + BTIs.push_back(-1); + if (*Kind == AK_SAMPLER) + BTIs.back() = SamplerID++; + else if (*Kind == AK_SURFACE) { + StringRef DescStr = *Desc; + // By default, an unannotated surface is read_write. + if (DescStr.find_lower("read_only") != StringRef::npos) { + BTIs.back() = SurfaceID++; + if (SurfaceID > K_MaxAvailableBtiIndex) { + llvm::report_fatal_error("not enough BTI indeces", false); + } + } + } + ++Desc; + } + // Scan again and assign BTI to UAV resources. + Desc = ArgTypeDescs.begin(); + int Idx = 0; + for (auto Kind = ArgKinds.begin(); Kind != ArgKinds.end(); ++Kind) { + if (*Kind == AK_SURFACE && BTIs[Idx] == -1) + BTIs[Idx] = SurfaceID++; + // SVM arguments are also assigned an BTI, which is not necessary, but OCL + // runtime requires it. + if (*Kind == AK_NORMAL) { + StringRef DescStr = *Desc; + if (DescStr.find_lower("svmptr_t") != StringRef::npos) { + BTIs[Idx] = SurfaceID++; + if (SurfaceID > K_MaxAvailableBtiIndex) { + llvm::report_fatal_error("not enough BTI indeces", false); + } + } + } + // print buffer is also assigned with BTI, which is not necessary, but OCL + // runtime requires it. + if (*Kind & KernelMetadata::IMP_OCL_PRINTF_BUFFER) { + BTIs[Idx] = SurfaceID++; + } + + if (*Kind & KernelMetadata::IMP_OCL_PRIVATE_BASE) + BTIs[Idx] = SurfaceID++; + ++Desc, ++Idx; + } + } + + // All the Kinds defined + // These correspond to the values used in vISA + // Bits 0-2 represent category (see enum) + // Bits 7..3 represent the value needed for the runtime to determine what + // the implicit argument should be + // + // IMP_OCL_LOCAL_ID{X, Y, Z} and IMP_OCL_GLOBAL_OR_LOCAL_SIZE apply to OCL + // runtime only. 
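+  // For example, an implicit local-ID-X argument of the normal register
+  // category is encoded as Kind = IMP_OCL_LOCAL_ID_X | AK_NORMAL
+  // = (0x7 << 3) | 0x0 = 0x38; masking with 0x7 recovers the category and
+  // masking with 0xFFF8 recovers the implicit-argument value, so
+  // KernelArgInfo(0x38).isLocalIDX() and isNormalCategory() both return true.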
+ // + enum ImpValue : uint32_t { + IMP_NONE = 0x0, + IMP_LOCAL_SIZE = 0x1 << 3, + IMP_GROUP_COUNT = 0x2 << 3, + IMP_LOCAL_ID = 0x3 << 3, + IMP_SB_DELTAS = 0x4 << 3, + IMP_SB_BTI = 0x5 << 3, + IMP_SB_DEPCNT = 0x6 << 3, + IMP_OCL_LOCAL_ID_X = 0x7 << 3, + IMP_OCL_LOCAL_ID_Y = 0x8 << 3, + IMP_OCL_LOCAL_ID_Z = 0x9 << 3, + IMP_OCL_GROUP_OR_LOCAL_SIZE = 0xA << 3, + IMP_OCL_PRINTF_BUFFER = 0xB << 3, + IMP_OCL_PRIVATE_BASE = 0xC << 3, + IMP_PSEUDO_INPUT = 0x10 << 3 + }; + + enum { SKIP_OFFSET_VAL = -1 }; + // Check if this argument should be omitted as a kernel input. + bool shouldSkipArg(unsigned Idx) const { + return static_cast(ArgOffsets[Idx]) == SKIP_OFFSET_VAL; + } + unsigned getNumNonSKippingInputs() const { + unsigned K = 0; + for (unsigned Val : ArgOffsets) + K += (static_cast(Val) != SKIP_OFFSET_VAL); + return K; + } + unsigned getArgOffset(unsigned Idx) const { return ArgOffsets[Idx]; } + + enum ArgIOKind { + IO_Normal = 0, + IO_INPUT = 1, + IO_OUTPUT = 2, + IO_INPUT_OUTPUT = 3 + }; + ArgIOKind getArgInputOutputKind(unsigned Idx) const { + if (Idx < ArgIOKinds.size()) + return static_cast(ArgIOKinds[Idx] & 0x3); + return IO_Normal; + } + bool isOutputArg(unsigned Idx) const { + auto Kind = getArgInputOutputKind(Idx); + return Kind == ArgIOKind::IO_OUTPUT || Kind == ArgIOKind::IO_INPUT_OUTPUT; + } +}; + +struct KernelArgInfo { + uint32_t Kind; + explicit KernelArgInfo(uint32_t Kind) : Kind(Kind) {} + bool isNormalCategory() const { + return (Kind & 0x7) == genx::KernelMetadata::AK_NORMAL; + } + bool isLocalIDX() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_OCL_LOCAL_ID_X; + } + bool isLocalIDY() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_OCL_LOCAL_ID_Y; + } + bool isLocalIDZ() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_OCL_LOCAL_ID_Z; + } + bool isGroupOrLocalSize() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_OCL_GROUP_OR_LOCAL_SIZE; + } + bool isLocalIDs() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_LOCAL_ID; + } + bool isLocalSize() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_LOCAL_SIZE; + } + bool isGroupCount() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_GROUP_COUNT; + } + bool isPrintBuffer() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_OCL_PRINTF_BUFFER; + } + bool isPrivateBase() const { + uint32_t Val = Kind & 0xFFF8; + return Val == genx::KernelMetadata::IMP_OCL_PRIVATE_BASE; + } +}; + +} // namespace genx +} // namespace llvm + +#endif diff --git a/IGC/VectorCompiler/include/vc/GenXOpts/Utils/RegCategory.h b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/RegCategory.h new file mode 100644 index 000000000000..5d7d9ad33405 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/GenXOpts/Utils/RegCategory.h @@ -0,0 +1,55 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above 
copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef GENX_REG_CATEGORIES_H +#define GENX_REG_CATEGORIES_H + +namespace llvm { +namespace genx { + +// The encoding for register category, used in GenXCategory, +// GenXLiveness and GenXVisaRegAlloc. It is an anonymous enum inside a class +// rather than a named enum so you don't need to cast to/from int. +struct RegCategory { + enum { + NONE, + GENERAL, + ADDRESS, + PREDICATE, + SAMPLER, + SURFACE, + VME, + NUMREALCATEGORIES, + EM, + RM, + NUMCATEGORIES + }; +}; + +} // namespace genx +} // namespace llvm + +#endif diff --git a/IGC/VectorCompiler/include/vc/Support/CMakeLists.txt b/IGC/VectorCompiler/include/vc/Support/CMakeLists.txt new file mode 100644 index 000000000000..4079289ba9f9 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/Support/CMakeLists.txt @@ -0,0 +1,3 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(VectorCompilerOptions) diff --git a/IGC/VectorCompiler/include/vc/Support/Options.h b/IGC/VectorCompiler/include/vc/Support/Options.h new file mode 100644 index 000000000000..9ff203af166a --- /dev/null +++ b/IGC/VectorCompiler/include/vc/Support/Options.h @@ -0,0 +1,58 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef VC_SUPPORT_OPTIONS_H +#define VC_SUPPORT_OPTIONS_H + +#include + +namespace vc { +namespace options { +// Flags should not overlap with llvm::opt::DriverFlag. 
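+// Sketch of the intended use (assuming the usual llvm::opt::OptTable API,
+// not code from this patch): a consumer can test which class an option
+// belongs to via its flags, e.g.
+//   const llvm::opt::OptTable &T = vc::getOptTable();
+//   bool IsApi = T.getOption(vc::options::OPT_vc_codegen)
+//                    .hasFlag(vc::options::ApiOption);
+// The low flag bits are reserved for llvm::opt::DriverFlag values, hence: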
+constexpr unsigned FirstNonBuiltinFlagNum = 4; + +enum Flags { + ApiOption = (1 << FirstNonBuiltinFlagNum), + InternalOption = (ApiOption << 1), + IgcmcApiOption = (InternalOption << 1), +}; + +enum ID { + OPT_INVALID = 0, +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + OPT_##ID, +#include "vc/Support/Options.inc" + LastOption +#undef OPTION +}; + +} // namespace options + +const llvm::opt::OptTable &getOptTable(); +} // namespace vc + +#endif diff --git a/IGC/VectorCompiler/include/vc/Support/Options.td b/IGC/VectorCompiler/include/vc/Support/Options.td new file mode 100644 index 000000000000..c79718c955fb --- /dev/null +++ b/IGC/VectorCompiler/include/vc/Support/Options.td @@ -0,0 +1,117 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file defines the options accepted by vector compiler. +// +// There are two kinds of options: api options and internal options. +// +// Api options are exposed to user via, e.g., openCL clBuildProgram. +// +// Internal options are for passing of additional info of various purposes. +// Among these can be: debug, things that are not exposed to user directly. +// +//===----------------------------------------------------------------------===// + +include "llvm/Option/OptParser.td" + +// Option kinds {{ +// Options accessible using API. +def ApiOption : OptionFlag; + +// Api options compatible with igcmc. +// These are used only when -cmc is present in api options. +def IgcmcApiOption : OptionFlag; + +// Internal options. +def InternalOption : OptionFlag; +// }} Option kinds + +// Api options {{ +let Flags = [ApiOption] in { + +// Main dispatch option. +def vc_codegen : Flag<["-"], "vc-codegen">, + HelpText<"Enable vector codegenerator">; + +def optimize : Separate<["-"], "optimize">, + HelpText<"Set optimization level to either 'none' or 'full'">, + MetaVarName<"">; +def optimize_eq : Joined<["-"], "optimize=">, + Alias; +def no_optimize : Flag<["-"], "no-optimize">, + Alias, AliasArgs<["none"]>; + +def no_vector_decomposition : Flag<["-"], "no-vector-decomposition">, + HelpText<"Disable vector decomposition pass">; +// Old igcmc alias. 
+def no_vector_decomposition_old : Flag<["-"], "no_vector_decomposition">, + Alias { + let Flags = [ApiOption, IgcmcApiOption]; +} + +} +// }} Api options + +// Igcmc compatibility {{ +let Flags = [IgcmcApiOption] in { + +def igcmc : Flag<["-"], "cmc">, + HelpText<"Enable igcmc compatible mode; incompatible with -vc-codegen; implies -optimize=none.">; + +def igcmc_visaopts : Joined<["-"], "visaopts=">, + HelpText<"Options for finalizer in form \"opt1 opt2 opt3...\"">; + +def igcmc_stack_size : Joined<["-"], "stack-mem-size=">, + HelpText<"Control stack memory size (in bytes)">; + +} +// }} Igcmc compatibility + +// Internal options {{ +let Flags = [InternalOption] in { + +def dump_llvm_ir : Flag<["-"], "dump-llvm-ir">, + HelpText<"Dump llvm IR after SPIRV reading, optimizations and codegen">; +def dump_isa_binary : Flag<["-"], "dump-isa-binary">, + HelpText<"Dump isa binary after finalization pass">; + +def help : Flag<["-"], "help">, + HelpText<"Display available API options">; +def help_internal : Flag<["-"], "help-internal">, + HelpText<"Display available internal options">; + +def llvm_options : Separate<["-"], "llvm-options">, + HelpText<"Additional options forwarded to llvm CommandLine global option parser">; +def llvm_options_eq : Joined<["-"], "llvm-options=">, + Alias, HelpText<"Alias for -llvm-options">; + +def runtime : Separate<["-"], "runtime">, + HelpText<"Set runtime for which binary should be generated; values: 'ocl' or 'cm'">; +def runtime_eq : Joined<["-"], "runtime=">, + Alias, HelpText<"Alias for -runtime ">; + +} +// }} Internal options diff --git a/IGC/VectorCompiler/include/vc/Support/Status.h b/IGC/VectorCompiler/include/vc/Support/Status.h new file mode 100644 index 000000000000..7e261a2f1fe8 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/Support/Status.h @@ -0,0 +1,158 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#ifndef VC_SUPPORT_STATUS_H +#define VC_SUPPORT_STATUS_H + +#include "vc/Support/StatusCode.h" + +#include + +#include + +namespace vc { + +class DynLoadError final : public llvm::ErrorInfo { +public: + static char ID; + +private: + std::string Message; + +public: + DynLoadError(llvm::StringRef Msg) : Message(Msg.str()) {} + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::dynamic_load_fail); + } +}; + +class SymbolLookupError final : public llvm::ErrorInfo { +public: + static char ID; + +private: + std::string Library; + std::string Symbol; + +public: + SymbolLookupError(llvm::StringRef Lib, llvm::StringRef Sym) + : Library(Lib.str()), Symbol(Sym.str()) {} + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::symbol_not_found); + } +}; + +class BadSpirvError final : public llvm::ErrorInfo { +public: + static char ID; + +private: + std::string Message; + +public: + BadSpirvError(llvm::StringRef Msg) : Message(Msg.str()) {} + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::bad_spirv); + } +}; + +class BadBitcodeError final : public llvm::ErrorInfo { +public: + static char ID; + +private: + std::string Message; + +public: + BadBitcodeError(llvm::StringRef Msg) : Message(Msg.str()) {} + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::bad_bitcode); + } +}; + +class InvalidModuleError final : public llvm::ErrorInfo { +public: + static char ID; + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::invalid_module); + } +}; + +class TargetMachineError final : public llvm::ErrorInfo { +public: + static char ID; + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::target_machine_not_created); + } +}; + +class NotVCError final : public llvm::ErrorInfo { +public: + static char ID; + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + return make_error_code(errc::not_vc_codegen); + } +}; + +class OptionError final : public llvm::ErrorInfo { +public: + static char ID; + +private: + std::string BadOption; + bool IsInternal; + +public: + OptionError(llvm::StringRef BadOpt, bool IsInternal_) + : BadOption(BadOpt.str()), IsInternal(IsInternal_) {} + + bool isInternal() const { return IsInternal; } + + void log(llvm::raw_ostream &OS) const override; + std::error_code convertToErrorCode() const override { + const errc c = + IsInternal ? 
errc::invalid_internal_option : errc::invalid_api_option; + return make_error_code(c); + } +}; + +} // namespace vc + +#endif diff --git a/IGC/VectorCompiler/include/vc/Support/StatusCode.h b/IGC/VectorCompiler/include/vc/Support/StatusCode.h new file mode 100644 index 000000000000..471eafc97a01 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/Support/StatusCode.h @@ -0,0 +1,75 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef VC_SUPPORT_STATUSCODE_H +#define VC_SUPPORT_STATUSCODE_H + +#include + +namespace vc { + +enum class errc { + // DynamicLibrary::getPermanentLibrary failure. + dynamic_load_fail = 1, + + // DynamicLibrary::getAddressOfSymbol failure. + symbol_not_found, + + // Spirv read failure. + bad_spirv, + + // Parse bitcode failure. + bad_bitcode, + + // Module verification failure. + invalid_module, + + // Target machine allocation failure. + target_machine_not_created, + + // VC codegen not specified in options. + not_vc_codegen, + + // Bad option in api options. + invalid_api_option, + + // Bad option in internal options. 
+ invalid_internal_option, +}; + +const std::error_category &err_category() noexcept; + +inline std::error_code make_error_code(vc::errc e) noexcept { + return std::error_code(static_cast(e), vc::err_category()); +} + +} // namespace vc + +namespace std { +template <> struct is_error_code_enum : std::true_type {}; +} // namespace std + +#endif diff --git a/IGC/VectorCompiler/include/vc/Support/StatusTraits.h b/IGC/VectorCompiler/include/vc/Support/StatusTraits.h new file mode 100644 index 000000000000..f75ab00be381 --- /dev/null +++ b/IGC/VectorCompiler/include/vc/Support/StatusTraits.h @@ -0,0 +1,85 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef VC_SUPPORT_STATUSTRAITS_H +#define VC_SUPPORT_STATUSTRAITS_H + +#include "vc/Support/StatusCode.h" + +#include "llvm/ADT/StringRef.h" + +namespace vc { + +// There should be specialization for every error code listed in errc. 
+// Specialization should define: +// * llvm::StringRef getMessage() // return description for error +template struct ErrorTraits; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { + return "failed to load dynamic library"; + } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { return "symbol lookup error"; } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { return "bad spirv bitcode"; } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { return "bad llvm bitcode"; } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { return "module verification failed"; } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { + return "target machine creation failed"; + } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { + return "vc codegen path option was not specified"; + } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { return "invalid api option"; } +}; + +template <> struct ErrorTraits { + static llvm::StringRef getMessage() { return "invalid internal option"; } +}; + +} // namespace vc + +#endif diff --git a/IGC/VectorCompiler/lib/BackendPlugin/BackendPlugin.cpp b/IGC/VectorCompiler/lib/BackendPlugin/BackendPlugin.cpp new file mode 100644 index 000000000000..6b14e665c189 --- /dev/null +++ b/IGC/VectorCompiler/lib/BackendPlugin/BackendPlugin.cpp @@ -0,0 +1,36 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "vc/GenXCodeGen/GenXTarget.h" + +static int initializeAll() { + llvm::initializeGenX(); + return 0; +} + +// This will be initialized on plugin load. +// Can cause problems if library is linked at compilation time. +static const int Init = initializeAll(); diff --git a/IGC/VectorCompiler/lib/BackendPlugin/CMakeLists.txt b/IGC/VectorCompiler/lib/BackendPlugin/CMakeLists.txt new file mode 100644 index 000000000000..06542850ca69 --- /dev/null +++ b/IGC/VectorCompiler/lib/BackendPlugin/CMakeLists.txt @@ -0,0 +1,39 @@ +set(BACKEND_PLUGIN_SOURCES + BackendPlugin.cpp + ) + +add_library(VCBackendPlugin + MODULE + ${BACKEND_PLUGIN_SOURCES} + ) + +# Hack to avoid transitive LLVM dependencies that will break +# plugin because of duplicate global variables. 
+# 'CODEGEN_LIBS' will consist of 'VCCodeGen' and all its direct dependencies. +# This should be enough for now. In case of indirect dependencies on VectorCompiler +# libraries, searching algorithm needs to be improved. +# 'CODEGEN_LIBS_FILES' will be expanded to plain library names so cmake +# will not add any transitive dependencies when target is linked against them. +get_target_property(CODEGEN_LIBS VCCodeGen LINK_LIBRARIES) +set(CODEGEN_LIBS VCCodeGen ${CODEGEN_LIBS}) +foreach(target ${CODEGEN_LIBS}) + # Filter out interface libraries -- these will not produce any files. + get_target_property(TARGET_TYPE ${target} TYPE) + if(NOT ("${TARGET_TYPE}" STREQUAL "INTERFACE_LIBRARY")) + set(CODEGEN_LIBS_FILES ${CODEGEN_LIBS_FILES} "$") + endif() +endforeach() + +# Cmake also does not add any dependencies for libraries +# that are linked this way. +add_dependencies(VCBackendPlugin + ${CODEGEN_LIBS} + ) + +target_link_libraries(VCBackendPlugin + PRIVATE + VCHeaders + ${CODEGEN_LIBS_FILES} + # GenX_IR actually should be linked to LLVMGenXCodeGen. + GenX_IR + ) diff --git a/IGC/VectorCompiler/lib/CMakeLists.txt b/IGC/VectorCompiler/lib/CMakeLists.txt new file mode 100644 index 000000000000..c67d9efb5f3f --- /dev/null +++ b/IGC/VectorCompiler/lib/CMakeLists.txt @@ -0,0 +1,9 @@ +add_subdirectory(GenXOpts) +add_subdirectory(GenXCodeGen) +add_subdirectory(Support) + +# Plugin support. +# Only for linux. +if(LLVM_ON_UNIX) + add_subdirectory(BackendPlugin) +endif() diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/CMakeLists.txt b/IGC/VectorCompiler/lib/GenXCodeGen/CMakeLists.txt new file mode 100644 index 000000000000..78e44fb1226d --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/CMakeLists.txt @@ -0,0 +1,84 @@ +add_subdirectory(TargetInfo) +add_subdirectory(Utils) + +set(LLVM_TARGET_DEFINITIONS GenX.td) +tablegen(LLVM GenXGenSubtargetInfo.inc -gen-subtarget) +add_public_tablegen_target(GenXCommonTableGen) + +set(CODEGEN_SOURCES + FunctionGroup.cpp + KillAnalysis.cpp + GenXAddressCommoning.cpp + GenXAggregatePseudoLowering.cpp + GenXAlignmentInfo.cpp + GenXAnalysisDumper.cpp + GenXArgIndirection.cpp + GenXBaling.cpp + GenXCategory.cpp + GenXCFSimplification.cpp + GenXCisaBuilder.cpp + GenXConstants.cpp + GenXCoalescing.cpp + GenXDeadVectorRemoval.cpp + GenXDepressurizer.cpp + GenXExtractVectorizer.cpp + GenXFuncPtrsLowering.cpp + GenXGotoJoin.cpp + GenXGEPLowering.cpp + GenXIMadPostLegalization.cpp + GenXInlineAsmLowering.cpp + GenXIntrinsics.cpp + GenXLayoutBlocks.cpp + GenXLegalization.cpp + GenXLiveRanges.cpp + GenXLiveness.cpp + GenXLowering.cpp + GenXLowerAggrCopies.cpp + GenXEmulate.cpp + GenXModule.cpp + GenXNumbering.cpp + GenXOCLInfoExtractor.cpp + GenXOCLRuntimeInfo.cpp + GenXPatternMatch.cpp + GenXPostLegalization.cpp + GenXPrinter.cpp + GenXPressureTracker.cpp + GenXPromoteArray.cpp + GenXThreadPrivateMemory.cpp + GenXPromotePredicate.cpp + GenXRawSendRipper.cpp + GenXReduceIntSize.cpp + GenXInstCombineCleanup.cpp + GenXRegion.cpp + GenXRegionCollapsing.cpp + GenXRematerialization.cpp + GenXSimdCFConformance.cpp + GenXSubtarget.cpp + GenXTargetMachine.cpp + GenXTidyControlFlow.cpp + GenXUnbaling.cpp + GenXUtil.cpp + GenXVectorDecomposer.cpp + GenXVisaRegAlloc.cpp + GenXWATable.cpp + GenXWrapper.cpp +) + +add_library(VCCodeGen ${CODEGEN_SOURCES}) +add_dependencies(VCCodeGen + GenXUtilBuild + GenXCommonTableGen + ) +target_include_directories(VCCodeGen + PRIVATE + ${VISA_INCLUDE_DIRS} + ${CMAKE_CURRENT_BINARY_DIR} + ) +target_link_libraries(VCCodeGen + LLVMGenXIntrinsics + + VCHeaders + 
VCTransforms + VCTargetInfo + VCSupport + ) diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.cpp new file mode 100644 index 000000000000..380e71ede96d --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.cpp @@ -0,0 +1,671 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file implements FunctionGroup, FunctionGroupAnalysis and +// FunctionGroupPass. See FunctionGroup.h for more details. +// +// The FunctionGroupPass part was adapted from CallGraphSCCPass.cpp. +// +// This file is currently in lib/Target/GenX, as that is the only place it +// is used. It could be moved somewhere more general. 
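+//
+// For orientation, a minimal FunctionGroupPass has roughly this shape (a
+// sketch only; the authoritative interface is declared in FunctionGroup.h):
+//
+//   struct MyGroupPass : public FunctionGroupPass {
+//     static char ID;
+//     MyGroupPass() : FunctionGroupPass(ID) {}
+//     bool runOnFunctionGroup(FunctionGroup &FG) override {
+//       for (Function *F : FG) {
+//         // inspect or transform each function of this kernel's group
+//         (void)F;
+//       }
+//       return false; // return true if the group was modified
+//     }
+//   };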
+// +//===----------------------------------------------------------------------===// + +#include "FunctionGroup.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Timer.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" +using namespace llvm; + +#include "llvmWrapper/IR/LegacyPassManagers.h" +#include "llvmWrapper/IR/PassTimingInfo.h" + + +#define DEBUG_TYPE "functiongroup-passmgr" + +/*********************************************************************** + * FunctionGroupAnalysis implementation + */ +char FunctionGroupAnalysis::ID = 0; +INITIALIZE_PASS(FunctionGroupAnalysis, "FunctionGroupAnalysis", + "FunctionGroupAnalysis", false, true /*analysis*/) + +ModulePass *llvm::createFunctionGroupAnalysisPass() { + initializeFunctionGroupAnalysisPass(*PassRegistry::getPassRegistry()); + return new FunctionGroupAnalysis(); +} + +// clear : clear out the analysis +void FunctionGroupAnalysis::clear() { + for (auto T : TypesToProcess) + GroupMap[T].clear(); + + for (auto i = begin(), e = end(); i != e; ++i) + delete *i; + for (auto i = NonMainGroups.begin(), e = NonMainGroups.end(); i != e; ++i) + delete *i; + + Groups.clear(); + NonMainGroups.clear(); + M = nullptr; +} + +FunctionGroup *FunctionGroupAnalysis::getGroup(Function *F, FGType Type) { + auto i = GroupMap[Type].find(F); + if (i == GroupMap[Type].end()) + return nullptr; + return i->second; +} + +// getGroup : get the FunctionGroup containing Function F, else 0 +FunctionGroup *FunctionGroupAnalysis::getGroup(Function *F) { + return getGroup(F, FGType::GROUP); +} + +FunctionGroup *FunctionGroupAnalysis::getSubGroup(Function *F) { + return getGroup(F, FGType::SUBGROUP); +} + +// getGroupForHead : get the FunctionGroup for which Function F is the +// head, else 0 +FunctionGroup *FunctionGroupAnalysis::getGroupForHead(Function *F) { + auto FG = getGroup(F); + assert(FG->size()); + if (*FG->begin() == F) + return FG; + return nullptr; +} + +// replaceFunction : replace a Function in a FunctionGroup +// An in-use iterator in the modified FunctionGroup remains valid. +void FunctionGroupAnalysis::replaceFunction(Function *OldF, Function *NewF) { + for (auto T : TypesToProcess) { + auto OldFIt = GroupMap[T].find(OldF); + assert(OldFIt != GroupMap[T].end()); + FunctionGroup *FG = OldFIt->second; + GroupMap[T].erase(OldFIt); + GroupMap[T][NewF] = FG; + for (auto i = FG->begin();; ++i) { + assert(i != FG->end()); + if (*i == OldF) { + *i = NewF; + break; + } + } + } +} + +// addToFunctionGroup : add Function F to FunctionGroup FG +// Using this (rather than calling push_back directly on the FunctionGroup) +// means that the mapping from F to FG will be created, and getGroup() will +// work for this Function. 
+void FunctionGroupAnalysis::addToFunctionGroup(FunctionGroup *FG, Function *F, + FGType Type) { + assert(FG); + assert(FG->getParent()->getModule() == M && + "attaching to FunctionGroup from wrong Module"); + assert(!GroupMap[Type][F] && "Function already attached to FunctionGroup"); + GroupMap[Type][F] = FG; + FG->push_back(F); +} + +// createFunctionGroup : create new FunctionGroup for which F is the head +FunctionGroup *FunctionGroupAnalysis::createFunctionGroup(Function *F, + FGType Type) { + auto FG = new FunctionGroup(this); + if (Type == FGType::GROUP) + Groups.push_back(FG); + else + NonMainGroups.push_back(FG); + addToFunctionGroup(FG, F, Type); + return FG; +} + +// Returns true if pass is simple module pass, +// e.g. it is neither FG pass nor function pass manager. +static bool isModulePass(Pass *P) { + if (P->getPassKind() != PT_Module) + return false; + return !P->getAsPMDataManager(); +} + +static StringRef TypeToAttr(FunctionGroupAnalysis::FGType Type) { + switch (Type) { + case FunctionGroupAnalysis::FGType::GROUP: + return genx::FunctionMD::CMGenXMain; + case FunctionGroupAnalysis::FGType::SUBGROUP: + return genx::FunctionMD::CMStackCall; + default: + llvm_unreachable("Can't gen attribute for nox-existent FG type"); + break; + } + return ""; +} + +bool FunctionGroupAnalysis::buildGroup(CallGraph &Callees, Function *F, + FunctionGroup *curGr, FGType Type) { + bool result = false; + LLVM_DEBUG(dbgs() << "process function " << F->getName() << " from " << curGr + << ", type = " << Type << "\n"); + if (Visited.count(F) > 0) { + if (GroupMap[Type].count(F) > 0 && GroupMap[Type][F] != curGr && + !F->hasFnAttribute(TypeToAttr(Type))) { + ValueToValueMapTy VMap; + Function *ClonedFunc = CloneFunction(F, VMap); + LLVM_DEBUG(dbgs() << "Cloning: " << ClonedFunc->getName() << "\n"); + + result = true; + + for (auto it = F->use_begin(); it != F->use_end();) { + Use *u = &*it++; + auto *CI = dyn_cast(u->getUser()); + assert(CI); + if (GroupMap[Type][CI->getFunction()] == curGr) + *u = ClonedFunc; + } + for (auto T : TypesToProcess) { + if (T >= Type) + break; + addToFunctionGroup(getGroup(F, T), ClonedFunc, T); + } + addToFunctionGroup(curGr, ClonedFunc, Type); + + for (auto &Callee : Callees[F]) { + if (Callee == F) + continue; + LLVM_DEBUG(dbgs() << "Next callee: " << Callee->getName() << "\n"); + result |= buildGroup(Callees, Callee, curGr, Type); + } + } + } else if (!Visited.count(F)) { + Visited[F] = true; + // group is created either on a function with a corresponding attribute + // or on a root of a whole function tree that is kernel (genx_main) + if (F->hasFnAttribute(TypeToAttr(Type)) || + F->hasFnAttribute(genx::FunctionMD::CMGenXMain)) { + LLVM_DEBUG(dbgs() << "Create new group of type " << Type << "\n"); + curGr = createFunctionGroup(F, Type); + } else if (curGr) { + LLVM_DEBUG(dbgs() << "Add to group " << curGr->getHead()->getName() + << " of type " << Type << "\n"); + addToFunctionGroup(curGr, F, Type); + } + for (auto &Callee : Callees[F]) { + LLVM_DEBUG(dbgs() << "Next callee: " << Callee->getName() << "\n"); + result |= buildGroup(Callees, Callee, curGr, Type); + } + } + LLVM_DEBUG(dbgs() << "finish processing function " << F->getName() + << " on level " << Type << "\n"); + return result; +} + +//===----------------------------------------------------------------------===// +// FGPassManager +// +/// FGPassManager manages FPPassManagers and FunctionGroupPasses. 
+/// It actually now imitates MPPassManager because there is no way +/// to extend pass manager structure without modification of +/// LLVM pass managers code. +/// This pass is injected into pass manager stack instead of top-level +/// MPPassManager when there is first time FunctionGroupPass is created. +/// After this manager replaces MPPassManager, it handles all Module and +/// FunctionGroup passes. This manager itself is module pass so it is +/// actually contained in list of module passes of module pass manager +/// as last pass that should be run. However, top-level pass manager do +/// not know anything about this FGPassManager except that it is indirect +/// pass manager, so it will not run it directly. + +namespace { + +class FGPassManager : public ModulePass, public IGCLLVM::PMDataManager { +public: + static char ID; + explicit FGPassManager() : ModulePass(ID), IGCLLVM::PMDataManager() {} + + /// run - Execute all of the passes scheduled for execution. Keep track of + /// whether any of the passes modifies the module, and if so, return true. + bool runOnModule(Module &M) override; + + bool doInitialization(Module &M) override; + bool doFinalization(Module &M) override; + + /// Pass Manager itself does not invalidate any analysis info. + void getAnalysisUsage(AnalysisUsage &Info) const override { + // FGPassManager needs FunctionGroupAnalysis. + Info.addRequired(); + Info.setPreservesAll(); + } + + StringRef getPassName() const override { + return "FunctionGroup Pass Manager"; + } + + PMDataManager *getAsPMDataManager() override { return this; } + Pass *getAsPass() override { return this; } + + // Print passes managed by this manager + void dumpPassStructure(unsigned Offset) override { + errs().indent(Offset * 2) << "FunctionGroup Pass Manager\n"; + for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { + Pass *P = getContainedPass(Index); + unsigned DumpOffset = Offset + 1; + // Pretend that there is no FGPassManager when we need to dump + // module pass indentation. + if (isModulePass(P)) + DumpOffset -= 1; + P->dumpPassStructure(DumpOffset); + dumpLastUses(P, DumpOffset); + } + } + + Pass *getContainedPass(unsigned N) { + assert(N < PassVector.size() && "Pass number out of range!"); + return static_cast(PassVector[N]); + } + + PassManagerType getPassManagerType() const override { + return PMT_ModulePassManager; + } + +private: + bool runPassesOnFunctionGroup(unsigned Begin, unsigned End, FunctionGroup &FG); + bool runPassOnFunctionGroup(Pass *P, FunctionGroup &FG); + bool doFGInitialization(unsigned Begin, unsigned End, FunctionGroupAnalysis &FGA); + bool doFGFinalization(unsigned Begin, unsigned End, FunctionGroupAnalysis &FGA); + bool runFGPassSequence(unsigned &Pass); + bool runModulePassSequence(unsigned &Pass, Module &M); +}; + +} // end anonymous namespace. + +char FGPassManager::ID = 0; + +bool FGPassManager::runPassOnFunctionGroup(Pass *P, FunctionGroup &FG) { + bool Changed = false; + llvm::PMDataManager *PM = P->getAsPMDataManager(); + + if (!PM) { + FunctionGroupPass *CGSP = (FunctionGroupPass *)P; + { + TimeRegion PassTimer(getPassTimer(CGSP)); + Changed = CGSP->runOnFunctionGroup(FG); + } + return Changed; + } + + // TODO: there may be also SCC pass manager. + assert(PM->getPassManagerType() == PMT_FunctionPassManager && + "Invalid FGPassManager member"); + FPPassManager *FPP = (FPPassManager *)P; + + // Run pass P on all functions in the current FunctionGroup. 
+ for (auto &F : FG) { + dumpPassInfo(P, EXECUTION_MSG, ON_FUNCTION_MSG, F->getName()); + { + TimeRegion PassTimer(getPassTimer(FPP)); + Changed |= FPP->runOnFunction(*F); + } + F->getContext().yield(); + } + return Changed; +} + + +/// RunPassesOnFunctionGroup - Execute sequential passes of pass manager +/// on the specified FunctionGroup +bool FGPassManager::runPassesOnFunctionGroup(unsigned Begin, unsigned End, + FunctionGroup &FG) { + bool Changed = false; + + // Run selected passes on current FunctionGroup. + for (unsigned PassNo = Begin; PassNo != End; ++PassNo) { + Pass *P = getContainedPass(PassNo); + dumpRequiredSet(P); + + initializeAnalysisImpl(P); + + // Actually run this pass on the current FunctionGroup. + Changed |= runPassOnFunctionGroup(P, FG); + if (Changed) + dumpPassInfo(P, MODIFICATION_MSG, ON_MODULE_MSG, ""); + dumpPreservedSet(P); + + verifyPreservedAnalysis(P); + removeNotPreservedAnalysis(P); + recordAvailableAnalysis(P); + removeDeadPasses(P, "", ON_MODULE_MSG); + } + + return Changed; +} + +/// Initialize sequential FG passes +bool FGPassManager::doFGInitialization(unsigned Begin, unsigned End, + FunctionGroupAnalysis &FGA) { + bool Changed = false; + + for (unsigned i = Begin; i != End; ++i) { + if (llvm::PMDataManager *PM = getContainedPass(i)->getAsPMDataManager()) { + // TODO: SCC PassManager? + assert(PM->getPassManagerType() == PMT_FunctionPassManager && + "Invalid FGPassManager member"); + Changed |= ((FPPassManager*)PM)->doInitialization(*FGA.getModule()); + } else { + Changed |= + ((FunctionGroupPass *)getContainedPass(i))->doInitialization(FGA); + } + } + + return Changed; +} + +/// Finalize sequential FG passes +bool FGPassManager::doFGFinalization(unsigned Begin, unsigned End, + FunctionGroupAnalysis &FGA) { + bool Changed = false; + + for (int i = End - 1; i >= static_cast(Begin); --i) { + if (llvm::PMDataManager *PM = getContainedPass(i)->getAsPMDataManager()) { + // TODO: SCC PassManager? + assert(PM->getPassManagerType() == PMT_FunctionPassManager && + "Invalid FGPassManager member"); + Changed |= ((FPPassManager*)PM)->doFinalization(*FGA.getModule()); + } else { + Changed |= + ((FunctionGroupPass *)getContainedPass(i))->doFinalization(FGA); + } + } + + return Changed; +} + +bool FGPassManager::runFGPassSequence(unsigned &Pass) { + const unsigned BeginPass = Pass; + const unsigned NumPasses = getNumContainedPasses(); + while (Pass < NumPasses && !isModulePass(getContainedPass(Pass))) + ++Pass; + + // Function group analysis may be invalidated by previous + // module passes so we will need to query it every time we + // execute sequence of passes. + FunctionGroupAnalysis &FGA = getAnalysis(); + bool Changed = false; + + Changed |= doFGInitialization(BeginPass, Pass, FGA); + for (auto *FG : FGA) + Changed |= runPassesOnFunctionGroup(BeginPass, Pass, *FG); + Changed |= doFGFinalization(BeginPass, Pass, FGA); + + return Changed; +} + +bool FGPassManager::runModulePassSequence(unsigned &Pass, Module &M) { + const unsigned BeginPass = Pass; + const unsigned NumPasses = getNumContainedPasses(); + while (Pass < NumPasses && isModulePass(getContainedPass(Pass))) + ++Pass; + + bool Changed = false; + + // Copied from MPPassManager in LegacyPassManager.cpp. + unsigned InstrCount, ModuleCount = 0; + StringMap> FunctionToInstrCount; + bool EmitICRemark = M.shouldEmitInstrCountChangedRemark(); + // Collect the initial size of the module. 
+ if (EmitICRemark) { + InstrCount = initSizeRemarkInfo(M, FunctionToInstrCount); + ModuleCount = InstrCount; + } + + for (unsigned Index = BeginPass; Index < Pass; ++Index) { + auto *MP = static_cast(getContainedPass(Index)); + bool LocalChanged = false; + + dumpPassInfo(MP, EXECUTION_MSG, ON_MODULE_MSG, M.getModuleIdentifier()); + dumpRequiredSet(MP); + + initializeAnalysisImpl(MP); + + { + PassManagerPrettyStackEntry X(MP, M); + TimeRegion PassTimer(getPassTimer(MP)); + + LocalChanged |= MP->runOnModule(M); + if (EmitICRemark) { + // Update the size of the module. + ModuleCount = M.getInstructionCount(); + if (ModuleCount != InstrCount) { + int64_t Delta = static_cast(ModuleCount) - + static_cast(InstrCount); + emitInstrCountChangedRemark(MP, M, Delta, InstrCount, + FunctionToInstrCount); + InstrCount = ModuleCount; + } + } + } + + Changed |= LocalChanged; + if (LocalChanged) + dumpPassInfo(MP, MODIFICATION_MSG, ON_MODULE_MSG, + M.getModuleIdentifier()); + dumpPreservedSet(MP); + dumpUsedSet(MP); + + verifyPreservedAnalysis(MP); + removeNotPreservedAnalysis(MP); + recordAvailableAnalysis(MP); + removeDeadPasses(MP, M.getModuleIdentifier(), ON_MODULE_MSG); + } + + return Changed; +} + +/// run - Execute all of the passes scheduled for execution. Keep track of +/// whether any of the passes modifies the module, and if so, return true. +bool FGPassManager::runOnModule(Module &M) { + bool Changed = false; + + unsigned CurPass = 0; + unsigned NumPasses = getNumContainedPasses(); + while (CurPass != NumPasses) { + // We will always have chain of fg passes followed by + // module passes repeating until there are no passes. + Changed |= runFGPassSequence(CurPass); + Changed |= runModulePassSequence(CurPass, M); + } + + return Changed; +} + +bool FGPassManager::doInitialization(Module &M) { + bool Changed = false; + + // Initialize module passes + for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) { + auto *P = getContainedPass(Index); + if (isModulePass(P)) + Changed |= P->doInitialization(M); + } + + return Changed; +} + +bool FGPassManager::doFinalization(Module &M) { + bool Changed = false; + + // Finalize module passes + for (int Index = getNumContainedPasses() - 1; Index >= 0; --Index) { + auto *P = getContainedPass(Index); + if (isModulePass(P)) + Changed |= P->doFinalization(M); + } + + return Changed; +} + +//===----------------------------------------------------------------------===// +// FunctionGroupPass Implementation +//===----------------------------------------------------------------------===// + +/// Assign pass manager to manage this pass. +void FunctionGroupPass::assignPassManager(PMStack &PMS, + PassManagerType PreferredType) { + // Find module pass manager. + while (!PMS.empty() && + PMS.top()->getPassManagerType() > PMT_ModulePassManager) + PMS.pop(); + + assert(!PMS.empty() && "Unable to handle FunctionGroup Pass"); + FGPassManager *GFP; + + // Check whether this ModulePassManager is our injected function + // group pass manager. If not, replace old module pass manager + // with one for function groups. + auto *PM = PMS.top(); + assert(PM->getPassManagerType() == PMT_ModulePassManager && + "Bad pass manager type for function group pass manager"); + if (PM->getAsPass()->getPassID() == &FGPassManager::ID) + GFP = static_cast(PM); + else { + // Create new FunctionGroup Pass Manager if it does not exist. 
+ + // [1] Create new FunctionGroup Pass Manager + GFP = new FGPassManager(); + + // [2] Set up new manager's top level manager + PMTopLevelManager *TPM = PM->getTopLevelManager(); + TPM->addIndirectPassManager(GFP); + GFP->setTopLevelManager(TPM); + + // [3] Assign manager to manage this new manager. This should not create + // and push new managers into PMS + TPM->schedulePass(GFP); + assert(PMS.top() == PM && "Pass manager unexpectedly changed"); + + // [4] Steal analysis info from module pass manager. + *GFP->getAvailableAnalysis() = std::move(*PM->getAvailableAnalysis()); + + // [5] Replace module pass manager with function group pass manager. + PMS.pop(); + PMS.push(GFP); + } + + GFP->add(this); +} + +/// getAnalysisUsage - For this class, we declare that we require and preserve +/// FunctionGroupAnalysis. If the derived class implements this method, it +/// should always explicitly call the implementation here. +void FunctionGroupPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addPreserved(); +} + +//===----------------------------------------------------------------------===// +// PrintFunctionGroupPass Implementation +//===----------------------------------------------------------------------===// + +namespace { +/// PrintFunctionGroupPass - Print a FunctionGroup +/// +class PrintFunctionGroupPass : public FunctionGroupPass { + std::string Banner; + raw_ostream &Out; // raw_ostream to print on. +public: + static char ID; + PrintFunctionGroupPass(const std::string &B, raw_ostream &o) + : FunctionGroupPass(ID), Banner(B), Out(o) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + } + + bool runOnFunctionGroup(FunctionGroup &FG) override { + Out << Banner; + for (auto I = FG.begin(), E = FG.end(); I != E; ++I) { + Function *F = *I; + Out << Banner << static_cast(*F); + } + return false; + } +}; +} // end anonymous namespace. + +char PrintFunctionGroupPass::ID = 0; + +Pass *FunctionGroupPass::createPrinterPass(raw_ostream &O, + const std::string &Banner) const { + return new PrintFunctionGroupPass(Banner, O); +} + +//===----------------------------------------------------------------------===// +// DominatorTreeGroupWrapperPass Implementation +//===----------------------------------------------------------------------===// +// +// The implementation details of the wrapper pass that holds a DominatorTree +// per Function in a FunctionGroup. 
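+// The per-Function trees are recomputed from scratch each time
+// runOnFunctionGroup is invoked and are deleted again in releaseMemory.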
+// +//===----------------------------------------------------------------------===// +char DominatorTreeGroupWrapperPass::ID = 0; +INITIALIZE_PASS_BEGIN(DominatorTreeGroupWrapperPass, "groupdomtree", + "Group Dominator Tree Construction", true, true) +INITIALIZE_PASS_END(DominatorTreeGroupWrapperPass, "groupdomtree", + "Group Dominator Tree Construction", true, true) + +void DominatorTreeGroupWrapperPass::releaseMemory() { + for (auto i = DTs.begin(), e = DTs.end(); i != e; ++i) + delete i->second; + DTs.clear(); +} + +bool DominatorTreeGroupWrapperPass::runOnFunctionGroup(FunctionGroup &FG) { + for (auto fgi = FG.begin(), fge = FG.end(); fgi != fge; ++fgi) { + Function *F = *fgi; + auto DT = new DominatorTree; + DT->recalculate(*F); + DTs[F] = DT; + } + return false; +} + +void DominatorTreeGroupWrapperPass::verifyAnalysis() const { + for (auto i = DTs.begin(), e = DTs.end(); i != e; ++i) + i->second->verify(); +} + +void DominatorTreeGroupWrapperPass::print(raw_ostream &OS, + const Module *) const { + for (auto i = DTs.begin(), e = DTs.end(); i != e; ++i) + i->second->print(OS); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.h b/IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.h new file mode 100644 index 000000000000..e3639616ada9 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/FunctionGroup.h @@ -0,0 +1,280 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// FunctionGroup +/// ------------- +/// +/// FunctionGroup is a generic mechanism for maintaining a group of Functions. +/// +/// FunctionGroupAnalysis is a Module analysis that maintains all the +/// FunctionGroups in the Module. It is up to some other pass to use +/// FunctionGroupAnalysis to create and populate the FunctionGroups, and thus +/// attach some semantics to what a FunctionGroup represents. +/// +/// FunctionGroupPass is a type of pass (with associated pass manager) that +/// runs a pass instance per FunctionGroup. +/// +/// This file is currently in lib/Target/GenX, as that is the only place it +/// is used. It could be moved somewhere more general. 
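+///
+/// A minimal usage sketch (illustrative only: it assumes a pass that can call
+/// getAnalysis, plus a kernel Function ``KernelF`` and a subroutine ``SubF``
+/// that are not part of this patch):
+///
+///   FunctionGroupAnalysis &FGA = getAnalysis<FunctionGroupAnalysis>();
+///   auto *FG = FGA.createFunctionGroup(KernelF,
+///                                      FunctionGroupAnalysis::FGType::GROUP);
+///   FGA.addToFunctionGroup(FG, SubF, FunctionGroupAnalysis::FGType::GROUP);
+///   for (auto *G : FGA)    // iterates the GROUP-type FunctionGroups
+///     for (auto &F : *G)   // head Function comes first
+///       F->print(llvm::errs());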
+/// +//===----------------------------------------------------------------------===// +#ifndef FUNCTIONGROUP_H +#define FUNCTIONGROUP_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Pass.h" + +#include + +namespace llvm { + +class FunctionGroupAnalysis; +class LLVMContext; +class PMStack; + +//---------------------------------------------------------------------- +// FunctionGroup : a group of Functions +// +class FunctionGroup { + FunctionGroupAnalysis *FGA; + // Vector of Functions in the FunctionGroup. Element 0 is the head. + // Elements are asserting value handles, so we spot when a Function + // in the group gets destroyed too early. + SmallVector, 8> Functions; + +public: + FunctionGroup(FunctionGroupAnalysis *FGA) : FGA(FGA) {} + FunctionGroupAnalysis *getParent() { return FGA; } + // push_back : push a Function into the group. The first time this is done, + // the Function is the head Function. + void push_back(Function *F) { Functions.push_back(AssertingVH(F)); } + // iterator and forwarders. The iterator iterates over the Functions in the + // group, starting with the head Function. + AssertingVH &at(unsigned i) { return Functions[i]; } + typedef SmallVectorImpl>::iterator iterator; + iterator begin() { return Functions.begin(); } + iterator end() { return Functions.end(); } + typedef SmallVectorImpl>::reverse_iterator + reverse_iterator; + reverse_iterator rbegin() { return Functions.rbegin(); } + reverse_iterator rend() { return Functions.rend(); } + size_t size() { return Functions.size(); } + // accessors + Function *getHead() { + assert(size()); + return *begin(); + } + StringRef getName() { return getHead()->getName(); } + LLVMContext &getContext() { return getHead()->getContext(); } + Module *getModule() { return getHead()->getParent(); } +}; + +//---------------------------------------------------------------------- +// FunctionGroupAnalysis - a Module analysis that maintains all the +// FunctionGroups in the Module. It is up to some other pass to use +// FunctionGroupAnalysis to create the FunctionGroups and then populate them. +// +class FunctionGroupAnalysis : public ModulePass { +public: + // FunctionGroup types: + // * GROUP - GENX_MAIN kernel and its underlying callgraph + // * SUBGROUP - GENX_STACKCALL function and its underlying callgraph including + // subroutines only + // Groups are necessary to perform cloning of subroutines + // called from different kernels and/or stack functions + enum class FGType { GROUP, SUBGROUP, MAX }; + const FGType TypesToProcess[static_cast(FGType::MAX)] = { + FGType::GROUP, FGType::SUBGROUP}; + +private: + Module *M; + SmallVector Groups; + + // storage for FunctionGroups that aren't of type GROUP, + // i.e. 
not necessarily GENX_MAIN headed + // TODO: mb increase 8 as there can be many stack funcs hence may subgroups + SmallVector NonMainGroups; + + class FGMap { + using ElementType = std::map; + ElementType data[static_cast(FGType::MAX)]; + + public: + ElementType &operator[](FGType type) { + auto index = static_cast(type); + return data[index]; + } + }; + + FGMap GroupMap; + std::map Visited; + using CallGraph = std::map>; + +public: + static char ID; + explicit FunctionGroupAnalysis() : ModulePass(ID) {} + ~FunctionGroupAnalysis() { clear(); } + virtual StringRef getPassName() const { return "function group analysis"; } + // runOnModule : does almost nothing + bool runOnModule(Module &ArgM) { + clear(); + M = &ArgM; + return false; + } + // getModule : get the Module that this FunctionGroupAnalysis is for + Module *getModule() { return M; } + // clear : clear out the FunctionGroupAnalysis + void clear(); + // getGroup : get the FunctionGroup containing Function F, else 0 + FunctionGroup *getGroup(Function *F, FGType Type); + FunctionGroup *getGroup(Function *F); + FunctionGroup *getSubGroup(Function *F); + // getGroupForHead : get the FunctionGroup for which Function F is the + // head, else 0 + FunctionGroup *getGroupForHead(Function *F); + // replaceFunction : replace a Function in a FunctionGroup + void replaceFunction(Function *OldF, Function *NewF); + // iterator for FunctionGroups in the analysis + typedef SmallVectorImpl::iterator iterator; + iterator begin() { return iterator(Groups.begin()); } + iterator end() { return iterator(Groups.end()); } + size_t size() { return Groups.size(); } + // addToFunctionGroup : add Function F to FunctionGroup FG + // Using this (rather than calling push_back directly on the FunctionGroup) + // means that the mapping from F to FG will be created, and getGroup() will + // work for this Function. + void addToFunctionGroup(FunctionGroup *FG, Function *F, FGType Type); + // createFunctionGroup : create new FunctionGroup for which F is the head + FunctionGroup *createFunctionGroup(Function *F, FGType Type); + bool buildGroup(CallGraph &callees, Function *F, + FunctionGroup *curGr = nullptr, FGType Type = FGType::GROUP); + + void clearVisited() { Visited.clear(); } +}; + +ModulePass *createFunctionGroupAnalysisPass(); +void initializeFunctionGroupAnalysisPass(PassRegistry &); + +inline raw_ostream &operator<<(raw_ostream &OS, + const FunctionGroupAnalysis::FGType &T) { + switch (T) { + case FunctionGroupAnalysis::FGType::GROUP: + OS << "Group"; + break; + case FunctionGroupAnalysis::FGType::SUBGROUP: + OS << "Subgroup"; + break; + default: + llvm_unreachable("Invalid FG type"); + break; + } + return OS; +} + +//---------------------------------------------------------------------- +// FunctionGroupPass - a type of pass (with associated pass manager) that +// runs a pass instance per FunctionGroup. +// +class FunctionGroupPass : public Pass { +public: + static constexpr unsigned PassType = PT_PassManager + 1; + + explicit FunctionGroupPass(char &pid) : Pass(static_cast(PassType), pid) {} + + // createPrinterPass - Get a pass that prints the Module + // corresponding to a FunctionGroupAnalysis. + Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const override; + + using llvm::Pass::doFinalization; + using llvm::Pass::doInitialization; + + // doInitialization - This method is called before the FunctionGroups of the + // program have been processed, allowing the pass to do initialization as + // necessary. 
+ virtual bool doInitialization(FunctionGroupAnalysis &FGA) { return false; } + + // runOnFunctionGroup - This method should be implemented by the subclass to + // perform whatever action is necessary for the specified FunctionGroup. + // + virtual bool runOnFunctionGroup(FunctionGroup &FG) = 0; + + // doFinalization - This method is called after the FunctionGroups of the + // program have been processed, allowing the pass to do final cleanup as + // necessary. + virtual bool doFinalization(FunctionGroupAnalysis &FGA) { return false; } + + // Assign pass manager to manager this pass + void assignPassManager(PMStack &PMS, PassManagerType PMT) override; + + // Return what kind of Pass Manager can manage this pass. + PassManagerType getPotentialPassManagerType() const override { + return PMT_ModulePassManager; + } + + // getAnalysisUsage - For this class, we declare that we require and + // preserve the FunctionGroupAnalysis. + // If the derived class implements this method, it should + // always explicitly call the implementation here. + void getAnalysisUsage(AnalysisUsage &Info) const override; +}; + +//---------------------------------------------------------------------- +// DominatorTreeGroupWrapperPass : Analysis pass which computes a DominatorTree +// per Function in the FunctionGroup. +class DominatorTree; + +class DominatorTreeGroupWrapperPass : public FunctionGroupPass { + std::map DTs; + +public: + static char ID; + + DominatorTreeGroupWrapperPass() : FunctionGroupPass(ID) {} + ~DominatorTreeGroupWrapperPass() { releaseMemory(); } + + DominatorTree *getDomTree(Function *F) { return DTs[F]; } + const DominatorTree &getDomTree(); + + bool runOnFunctionGroup(FunctionGroup &FG) override; + + void verifyAnalysis() const override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + FunctionGroupPass::getAnalysisUsage(AU); + AU.setPreservesAll(); + } + + void releaseMemory() override; + + void print(raw_ostream &OS, const Module *M = nullptr) const override; +}; +void initializeDominatorTreeGroupWrapperPassPass(PassRegistry &); + +} // end namespace llvm +#endif // ndef FUNCTIONGROUP_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenX.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenX.h new file mode 100644 index 000000000000..3c8b42037f67 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenX.h @@ -0,0 +1,157 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#ifndef TARGET_GENX_H +#define TARGET_GENX_H +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Analysis/LoopInfo.h" +#include + +namespace llvm { + +class BasicBlock; +class CallInst; +class Constant; +class DebugLoc; +class DominatorTree; +class formatted_raw_ostream; +class Function; +class FunctionGroup; +class FunctionGroupPass; +class FunctionPass; +class GenXSubtarget; +class Instruction; +class MDNode; +class ModulePass; +class ShuffleVectorInst; +class TargetOptions; +class Twine; +class Value; +class raw_ostream; +class raw_pwrite_stream; + +enum BalingKind { + BK_Legalization, // build baling info for legalization + BK_CodeGen, // build baling info for the final vISA emission + BK_Analysis, // build baling info for analysis (register pressure) +}; + +FunctionPass *createGenXPrinterPass(raw_ostream &O, const std::string &Banner); +FunctionGroupPass *createGenXGroupPrinterPass(raw_ostream &O, const std::string &Banner); +FunctionPass *createGenXAnalysisDumperPass(FunctionPass *Analysis, const char *Suffix); +FunctionGroupPass *createGenXGroupAnalysisDumperPass(FunctionGroupPass *Analysis, const char *Suffix); + +FunctionPass *createGenXCFSimplificationPass(); +ModulePass *createGenXEarlySimdCFConformancePass(); +FunctionPass *createGenXReduceIntSizePass(); +FunctionPass *createGenXInstCombineCleanup(); +FunctionPass *createGenXInlineAsmLoweringPass(); +FunctionPass *createGenXLoweringPass(); +FunctionPass *createGenXLowerAggrCopiesPass(); +FunctionPass *createGenXGEPLoweringPass(); +FunctionPass *createGenXRegionCollapsingPass(); +FunctionPass *createGenXExtractVectorizerPass(); +FunctionPass *createGenXRawSendRipperPass(); +FunctionPass *createGenXFuncBalingPass(BalingKind Kind, GenXSubtarget *ST); +FunctionPass *createGenXLegalizationPass(); +ModulePass *createGenXEmulatePass(); +FunctionPass *createGenXDeadVectorRemovalPass(); +FunctionPass *createGenXPatternMatchPass(const TargetOptions *Options); +FunctionPass *createGenXPostLegalizationPass(); +FunctionPass *createTransformPrivMemPass(); +ModulePass *createGenXThreadPrivateMemoryPass(); +FunctionPass *createGenXPromotePredicatePass(); +FunctionPass *createGenXIMadPostLegalizationPass(); +FunctionPass *createGenXAggregatePseudoLoweringPass(); +ModulePass *createGenXModulePass(); +FunctionGroupPass *createGenXLateSimdCFConformancePass(); +FunctionGroupPass *createGenXLivenessPass(); +ModulePass *createGenXFunctionPointersLoweringPass(); +FunctionGroupPass *createGenXCategoryPass(); +FunctionGroupPass *createGenXGroupBalingPass(BalingKind Kind, GenXSubtarget *ST); +FunctionGroupPass *createGenXUnbalingPass(); +FunctionGroupPass *createGenXDepressurizerPass(); +FunctionGroupPass *createGenXLateLegalizationPass(); +FunctionGroupPass *createGenXNumberingPass(); +FunctionGroupPass *createGenXLiveRangesPass(); +FunctionGroupPass *createGenXRematerializationPass(); +FunctionGroupPass *createGenXCoalescingPass(); +FunctionGroupPass *createGenXAddressCommoningPass(); +FunctionGroupPass *createGenXArgIndirectionPass(); +FunctionPass *createGenXTidyControlFlowPass(); +FunctionGroupPass *createGenXVisaRegAllocPass(); 
+FunctionGroupPass *createGenXVisaFuncWriterPass(); +FunctionGroupPass *createGenXCisaBuilderPass(); +ModulePass *createGenXFinalizerPass(raw_pwrite_stream &o); +ModulePass *createGenXVisaWriterPass(raw_pwrite_stream &o); + +namespace genx { + +// A local encoding (not part of vISA or GenX) of whether an operand should be signed. +enum Signedness { + DONTCARESIGNED = 3, SIGNED = 1, UNSIGNED = 2 +}; + +const constexpr int BoolBits = 1; +const constexpr int ByteBits = 8; +const constexpr int WordBits = 16; +const constexpr int DWordBits = 32; +const constexpr int QWordBits = 64; +const constexpr int GRFBits = 256; + +const constexpr int ByteBytes = ByteBits / ByteBits; +const constexpr int WordBytes = WordBits / ByteBits; +const constexpr int DWordBytes = DWordBits / ByteBits; +const constexpr int QWordBytes = QWordBits / ByteBits; + +// vISA allows [-512,511] for operation to be baled as offset +// for rdregion, copied from visa +const constexpr int G4_MAX_ADDR_IMM = 511; +const constexpr int G4_MIN_ADDR_IMM = -512; + +// describe integer vector immediate (V, UV) +enum ImmIntVec { + Width = 8, // num elem in vector + ElemSize = 4, // in bits + MaxUInt = (1 << ElemSize) - 1, + MinUInt = 0, + MaxSInt = (1 << (ElemSize - 1)) - 1, + MinSInt = -(1 << (ElemSize - 1)) +}; + +} // End genx namespace +} // End llvm namespace + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenX.td b/IGC/VectorCompiler/lib/GenXCodeGen/GenX.td new file mode 100644 index 000000000000..93775e9e14bb --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenX.td @@ -0,0 +1,87 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This is a target description file for the Intel Gen architecture, referred +// to here as the "GenX" architecture. +// +//===----------------------------------------------------------------------===// + +// Get the target-independent interfaces which we are implementing... 
+// +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// GenX Subtarget features - these are typically passed in as features +//===----------------------------------------------------------------------===// + +def DumpRegAlloc: SubtargetFeature<"dump_regalloc", "DumpRegAlloc", + "true", "dump regalloc information">; + +//===----------------------------------------------------------------------===// +// GenX Subtarget state - these are typically inferred from the Proc +//===----------------------------------------------------------------------===// + +def FeatureLongLong : SubtargetFeature<"longlong","HasLongLong", "true", + "supports long long">; + +def FeatureNoJmpi : SubtargetFeature<"disable_jmpi", "DisableJmpi", + "true", "disable jmpi">; + +def FeatureVectorDecomp : SubtargetFeature<"disable_vec_decomp", + "DisableVectorDecomposition", + "true", + "disable vector decomposition pass">; + +def WarnCallable : SubtargetFeature<"warn_callable", "WarnCallable", + "true", "warn instead of error on callable violation">; + +def OCLRuntime : SubtargetFeature<"ocl_runtime", "OCLRuntime", "true", + "Prepare structures for OCL runtime">; + + +//===----------------------------------------------------------------------===// +// GenX processors supported. +//===----------------------------------------------------------------------===// + +class Proc Features> + : Processor; + +def : Proc<"generic", []>; +def : Proc<"HSW", []>; +def : Proc<"BDW", [FeatureLongLong]>; +def : Proc<"CHV", [FeatureLongLong]>; +def : Proc<"SKL", [FeatureLongLong]>; +def : Proc<"BXT", [FeatureLongLong]>; +def : Proc<"KBL", [FeatureLongLong]>; +def : Proc<"GLK", [FeatureLongLong]>; +def : Proc<"CNL", [FeatureLongLong]>; +def : Proc<"ICL", [FeatureLongLong]>; +def : Proc<"ICLLP", []>; +def : Proc<"TGLLP", []>; + +def GenX : Target { + // Nothing here (yet?) +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXAddressCommoning.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAddressCommoning.cpp new file mode 100644 index 000000000000..7bafdc398ab3 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAddressCommoning.cpp @@ -0,0 +1,1047 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +/// GenXAddressCommoning +/// -------------------- +/// +/// This pass spots when multiple address conversions use the same value and +/// are used in regions with the same base register (the same coalesced live +/// range), and commons up the address conversions. +/// +/// It also handles cases where an llvm.genx.add.addr has an out of range offset +/// that is not encodable as the constant offset in an indirect operand. When +/// commoning up address conversions, it groups ones with nearby offsets such +/// that all uses of a commoned address conversion have in range offsets in +/// their llvm.genx.add.addr ops. +/// +/// Before this pass, GenXCategoryConversion has ensured that each use of a +/// variable index in an element or region access (llvm.genx.rdregion etc +/// intrinsics) has its own separate address conversion (llvm.genx.convert.addr +/// intrinsic). Any constant add/sub between the address conversion +/// and the use of the variable index has been turned into an llvm.genx.add.addr +/// intrinsic. +/// +/// This GenXAddressCommoning pass spots when multiple address conversions +/// use the same index value as input and are used in element/region accesses +/// with the same base register. These can then be commoned up. +/// +/// In fact, rather than looking at an address conversion in isolation, it needs +/// to look at the whole bale containing the address conversion, which might have +/// a baled in rdregion and modifiers. It needs to do this because +/// GenXBaling cloned the rdregion and modifiers, so they need commoning up +/// again with the address conversion. +/// This situation is common because GenXLowering lowers a trunc (as often +/// found in an index calculation to convert the index to i16) into a bitcast +/// and a rdregion. +/// +/// A second transformation in this pass is the "histogram optimization": If +/// there are multiple scalar address conversions for the same base reg where +/// each index is an extract (a scalar rdregion) from the same index vector, we +/// attempt to common them up into a vector address conversion, with an extract +/// from the result of the vector address conversion for each user of an +/// original scalar address conversion. The extract is baled in to the indirect +/// region, appearing as the "addr_offset" field (the index into the 8 wide +/// address register) in the generated vISA. +/// +/// This histogram optimization uses the hasIndirectGRFCrossing feature from +/// GenXSubtarget to tell how big the combined vector address conversion can be, +/// in the case that it itself is an indirect region. +/// +/// Both of the transformations in this pass are fiddly because the pass runs so +/// late. It has to run this late because we cannot tell whether address +/// conversions can be commoned up until GenXCoalescing has decided which vectors +/// are in the same register, but that then means that this pass has to update +/// live ranges and baling info for the code that it modifies. +/// +/// **IR restriction**: After this pass, the restrictions on +/// ``llvm.genx.convert.addr`` and ``llvm.genx.add.addr`` having just a single +/// use are relaxed. 
Now, multiple uses of ``llvm.genx.convert.addr``, possibly +/// each via a single ``llvm.genx.add.addr``, must be in rdregions/wrregions +/// where the base register is provably the same because all the values that +/// appear as the "old value" input are coalesced together into the same +/// LiveRange. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_ADDRESSCOMMONING" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXGotoJoin.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXNumbering.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/Utils/RegCategory.h" +#include "llvm-c/Core.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +static cl::opt ConvertAfterWholeRegion("convert-after-whole", cl::init(true), cl::Hidden, + cl::desc("Convert addrs after whole region conversion attempt")); + +namespace { + +// Bucket : a bucket for collecting address conversions with the same base reg +// and the same address calculation value, discarding duplicates. +struct Bucket { + SmallVector Addrs; + SmallSet AddrSet; + void add(Instruction *Addr) { + if (AddrSet.insert(Addr).second) + Addrs.push_back(Addr); + } +}; + +// ExtractBucket : a bucket for collecting address conversions with the same +// base reg that all use an extract (scalar rdregion) from the same vector, +// discarding duplicates. +struct ExtractBucket { + SmallVector Addrs; + SmallSet AddrSet; + void add(Instruction *Addr) { + if (AddrSet.insert(Addr).second) + Addrs.push_back(Addr); + } +}; + +// Extract: address and offset for region conversion +struct Extract { + Instruction *Addr; // the address conversion instruction + int Offset; // the offset from the rdregion; + Extract(Instruction *Addr, int Offset) : Addr(Addr), Offset(Offset) {} + bool operator<(const Extract &Other) const { return Offset < Other.Offset; } +}; + +// GenX address conversion pass +class GenXAddressCommoning : public FunctionGroupPass { + GenXBaling *Baling; + GenXLiveness *Liveness; + GenXNumbering *Numbering; + const GenXSubtarget *ST; + Function *F; + SmallSet AlreadyProcessed; + // Types and data structures used for gathering convert_addr ops that + // could be commoned up: + // InnerVec is a vector of convert_addr ops that have the same base register + // and bale hash. OuterVec is a vector of InnerVec. OuterMap provides a + // way of finding the element of OuterVec for a particular base register + // and bale hash. Using a vector and map together like this ensures that + // we process everything in the same order even as pointer values and hashes + // change from one compiler run to another. 
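+  // Viewed as a lookup structure, this is a two-level grouping:
+  //   OuterMap[{BaseReg, BaleHash}] -> the matching InnerVec within OuterVec
+  //   InnerVec                      -> the convert.addr instructions whose
+  //                                    bales share that base register and hash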
+ typedef SmallVector InnerVec_t; + typedef SmallVector OuterVec_t; + OuterVec_t OuterVec; + struct BaseRegAndBaleHash { + LiveRange *BaseReg; + hash_code BaleHash; + BaseRegAndBaleHash(LiveRange *BaseReg, hash_code BaleHash) + : BaseReg(BaseReg), BaleHash(BaleHash) {} + static bool less(BaseRegAndBaleHash BRH1, BaseRegAndBaleHash BRH2) + { + if (BRH1.BaseReg != BRH2.BaseReg) + return BRH1.BaseReg < BRH2.BaseReg; + return BRH1.BaleHash < BRH2.BaleHash; + } + }; + typedef std::map OuterMap_t; + OuterMap_t OuterMap; +public: + static char ID; + explicit GenXAddressCommoning() : FunctionGroupPass(ID), + OuterMap(OuterMap_t(BaseRegAndBaleHash::less)) { } + virtual StringRef getPassName() const { return "GenX address commoning"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); + } + bool runOnFunctionGroup(FunctionGroup &FG); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXGroupPrinterPass(O, Banner); } +private: + bool processFunction(Function *F); + bool processBaseReg(LiveRange *LR); + bool processCommonAddrs(ArrayRef Addrs); + void processCommonAddrsWithValidOffsets(ArrayRef Addrs); + bool vectorizeAddrs(LiveRange *LR); + void addAddrConvIfExtract(std::map, ExtractBucket> *ExtractBuckets, Value *Index); + bool tryConvertWholeRegion(SmallVector &Extracts, + Instruction *VecDef); + bool vectorizeAddrsFromOneVector(ArrayRef Addrs); + DominatorTree *getDominatorTree(); + bool isValueInCurrentFunc(Value *V); +}; + +} // end anonymous namespace + +char GenXAddressCommoning::ID = 0; +namespace llvm { void initializeGenXAddressCommoningPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXAddressCommoning, "GenXAddressCommoning", "GenXAddressCommoning", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXNumbering) +INITIALIZE_PASS_END(GenXAddressCommoning, "GenXAddressCommoning", "GenXAddressCommoning", false, false) + +FunctionGroupPass *llvm::createGenXAddressCommoningPass() +{ + initializeGenXAddressCommoningPass(*PassRegistry::getPassRegistry()); + return new GenXAddressCommoning(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the address commoning pass for this + * FunctionGroup + */ +bool GenXAddressCommoning::runOnFunctionGroup(FunctionGroup &FG) +{ + Baling = &getAnalysis(); + Liveness = &getAnalysis(); + Numbering = &getAnalysis(); + ST = getAnalysis().getSubtarget(); + bool Modified = false; + for (auto fgi = FG.begin(), fge = FG.end(); fgi != fge; ++fgi) { + F = *fgi; + Modified |= processFunction(F); + } + return Modified; +} + +/*********************************************************************** + * processFunction : process one function in the address commoning pass + */ +bool GenXAddressCommoning::processFunction(Function *F) +{ + // Build a list of base registers used in an indirect rdregion or wrregion. + // This does a preordered depth first traversal of the CFG to + // ensure that we see a def before its uses (ignoring phi node uses). 
+ // Because an llvm.genx.convert.addr intrinsic can bale in a rdregion + // with a variable index that itself uses an llvm.genx.convert.addr, + // we want to process the code in forward order so that we can do as + // much commoning as possible. + SmallVector BaseRegs; + std::set BaseRegsSet; + for (df_iterator i = df_begin(&F->getEntryBlock()), + e = df_end(&F->getEntryBlock()); i != e; ++i) { + for (auto bi = i->begin(), be = i->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + LiveRange *LR = nullptr; + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + default: + continue; + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + if (isa(Inst->getOperand( + GenXIntrinsic::GenXRegion::RdIndexOperandNum))) + continue; + LR = Liveness->getLiveRange(Inst->getOperand(0)); + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + if (isa(Inst->getOperand( + GenXIntrinsic::GenXRegion::WrIndexOperandNum))) + continue; + // A write region may be baled into a g_store. + LR = Liveness->getLiveRangeOrNull(Inst); + if (!LR) { + assert(Inst->hasOneUse()); + auto SI = dyn_cast(Inst->user_back()); + if (!SI) + continue; + Value *GV = getUnderlyingGlobalVariable(SI->getPointerOperand()); + if (!GV) + continue; + LR = Liveness->getLiveRange(GV); + } + break; + } + // Inst is rdregion or wrregion with non-constant index. + // Save the base register. + if (BaseRegsSet.insert(LR).second) + BaseRegs.push_back(LR); // not seen before + } + } + BaseRegsSet.clear(); + // Process each base register. + bool Modified = false; + for (auto i = BaseRegs.begin(), e = BaseRegs.end(); i != e; ++i) { + Modified |= processBaseReg(*i); + Modified |= vectorizeAddrs(*i); + } + return Modified; +} + +/*********************************************************************** + * processBaseReg : process one base register + * + * Enter: LR = LiveRange with all the values for this base register + * + * We common up all address conversions with the same input that are used to + * address a region of this base register. + */ +bool GenXAddressCommoning::processBaseReg(LiveRange *LR) +{ + // Gather the address conversions used by regions of this base register into + // buckets, one for each distinct input. A bucket discards duplicate address + // conversions. + std::map Buckets; + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) { + Value *V = vi->getValue(); + // Ignore the value if it is in the wrong function. That can happen because + // liveness information is shared between functions in the same group. + if (!isValueInCurrentFunc(V)) + continue; + // First the def, if it is a wrregion. + if (GenXIntrinsic::isWrRegion(V)) { + Value *Index = cast(V)->getOperand( + GenXIntrinsic::GenXRegion::WrIndexOperandNum); + while (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_add_addr) + Index = cast(Index)->getOperand(0); + if (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_convert_addr) { + Bale B; + Baling->buildBale(cast(Index), &B); + B.hash(); + Buckets[B].add(cast(Index)); + } + } + // Then each use that is a rdregion. (A use that is a wrregion will be + // handled when we look at that value, which must be coalesced into the + // same live range.) 
+ for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + if (ui->getOperandNo() != GenXIntrinsic::GenXRegion::OldValueOperandNum) + continue; + auto user = cast(ui->getUser()); + + auto isBaledWrr = [=]() { + if (!isa(V) || !GenXIntrinsic::isWrRegion(user) || !user->hasOneUse()) + return false; + StoreInst *SI = dyn_cast(user->user_back()); + GlobalVariable *GV = + SI ? getUnderlyingGlobalVariable(SI->getPointerOperand()) : nullptr; + if (!GV) + return false; + // make sure the base is the right global variable. + return Liveness->getLiveRangeOrNull(GV) == LR; + }; + + // wrr may have been baled with a g_store. + if (isBaledWrr()) { + Value *Index = cast(user)->getOperand( + GenXIntrinsic::GenXRegion::WrIndexOperandNum); + while (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_add_addr) + Index = cast(Index)->getOperand(0); + if (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_convert_addr) { + Bale B; + Baling->buildBale(cast(Index), &B); + B.hash(); + Buckets[B].add(cast(Index)); + } + } + + if (!GenXIntrinsic::isRdRegion(user)) + continue; + Value *Index = user->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum); + while (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_add_addr) + Index = cast(Index)->getOperand(0); + if (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_convert_addr) { + Bale B; + Baling->buildBale(cast(Index), &B); + B.hash(); + Buckets[B].add(cast(Index)); + } + } + } + // Common up each bucket with more than one address conversion. + bool Modified = false; + for (auto i = Buckets.begin(), e = Buckets.end(); i != e; ++i) + Modified |= processCommonAddrs(i->second.Addrs); + return Modified; +} + +/*********************************************************************** + * processCommonAddrs : common up some address conversions + * + * Enter: Addrs = one or more address conversion instructions that all have + * the same input and address the same base register, with no + * duplicates. Offsets (in add.addr intrinsics) are not known to + * be in range; this function fixes that. + * + * Return: whether code modified + * + * This function relies on there being no duplicates in Addrs in the way that + * it erases the address conversions other than the one it uses as the common + * one. + * + * This processes a batch of address conversions with add.addr offsets close + * enough to each other that we can use constant offsets in the indirect + * operands. Then it recursively calls itself with what is left after removing + * that batch. + * + * This code relies on there only being one add.addr between a convert.addr and + * the use of the added address in a rdregion/wrregion. GenXCategory ensures + * that this is the case. + */ +bool GenXAddressCommoning::processCommonAddrs(ArrayRef Addrs) +{ +#ifndef NDEBUG + // Assert that we do not have any duplicates, and that they are all in the + // current function. + { + std::set AddrSet; + for (auto i = Addrs.begin(), e = Addrs.end(); i != e; ++i) { + assert(AddrSet.insert(*i).second); + assert((*i)->getParent()->getParent() == F); + } + } +#endif + bool Modified = false; + // Get the offsets. (Each address conversion has only one use; that is how + // GenXCategory set it up.) 
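+  // As an illustration of the batching below: if the add.addr offsets are
+  // {0, 40, 1500}, MinOffset is 0, the batch handled here is {0, 40} (both
+  // within [MinOffset, MinOffset + 1023]), and {1500} is deferred to the
+  // recursive call at the end of this function.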
+ SmallVector Offsets; + for (unsigned i = 0, e = Addrs.size(); i != e; ++i) { + int Offset = 0; + assert(Addrs[i]->hasOneUse()); + auto AddrUse = cast(Addrs[i]->use_begin()->getUser()); + if (GenXIntrinsic::getGenXIntrinsicID(AddrUse) == + GenXIntrinsic::genx_add_addr) { + // The offset is operand 1 of the add_addr, and it is either a constant + // int or a splat of a constant int. + auto C = cast(AddrUse->getOperand(1)); + if (isa(C->getType())) + C = C->getSplatValue(); + Offset = cast(C)->getSExtValue(); + } + Offsets.push_back(Offset); + } + // Get the min offset. + int MinOffset = INT_MAX; + for (unsigned i = 0, e = Offsets.size(); i != e; ++i) + MinOffset = std::min(MinOffset, Offsets[i]); + // Split the address conversions into ones used with an offset in + // [MinOffset,MinOffset+1023] and ones that are outside that range. + SmallVector InRangeAddrs; + SmallVector OutOfRangeAddrs; + int MaxOffset = INT_MIN; + for (unsigned i = 0, e = Offsets.size(); i != e; ++i) { + if (Offsets[i] < MinOffset + 1024) { + InRangeAddrs.push_back(Addrs[i]); + MaxOffset = std::max(MaxOffset, Offsets[i]); + } else + OutOfRangeAddrs.push_back(Addrs[i]); + } + // We handle the ones in range here. + // The address conversions are going to be commoned up. Decide what offset we + // are going to put on the commoned up one. We ensure that the offset is + // inside the range of offsets that we found in the uses of the address + // conversions, to try and avoid the situation where the address conversion + // generates an out-of-range value in the address register that is then + // brought back into range by the immediate offset in each use of the address + int CommonOffset = 0; + if (MinOffset < 0) { + if (MaxOffset < 0) { + // All offsets are negative. Use 0 if that is in range, else as close to + // the max end of the offset range as we can get, rounded down to a + // multiple of 32. + if (MinOffset < G4_MIN_ADDR_IMM) + CommonOffset = std::min(MinOffset + 512, MaxOffset) & -32; + } else { + // Some negative and some non-negative. Common offset can be 0. + CommonOffset = 0; + } + } else { + // All offsets are non-negative. Use 0 if that is in range, else as close + // to the min end of the offsets range as we can get, rounded up to a + // multiple of 32. + if (MaxOffset >= 512) + CommonOffset = (std::max(MaxOffset - 511, MinOffset) + 31) & -32; + } + if (CommonOffset) { + Modified = true; + // Modify the address conversions to use the common offset, and adjust the + // address adds accordingly. + auto CommonOffsetVal = ConstantInt::get(InRangeAddrs[0]->getType() + ->getScalarType(), CommonOffset); + for (unsigned i = 0, e = InRangeAddrs.size(); i != e; ++i) { + Instruction *Addr = InRangeAddrs[i]; + Addr->setOperand(1, CommonOffsetVal); + Use *U = &*Addr->use_begin(); + auto *AddAddr = cast(U->getUser()); + int AdjustedOffset = -CommonOffset; + if (GenXIntrinsic::getGenXIntrinsicID(AddAddr) == + GenXIntrinsic::genx_add_addr) { + auto ThisOffsetC = cast(AddAddr->getOperand(1)); + if (isa(ThisOffsetC->getType())) + ThisOffsetC = ThisOffsetC->getSplatValue(); + AdjustedOffset += cast(ThisOffsetC) ->getSExtValue(); + } else if (AdjustedOffset) { + // We don't have an add_addr. We need to insert one. 
+ Constant *C = ConstantInt::get(CommonOffsetVal->getType(), + AdjustedOffset); + if (auto VT = dyn_cast(Addr->getType())) + C = ConstantVector::getSplat(VT->getNumElements(), C); + auto CI = createAddAddr(Addr, C, + Addr->getName() + ".addaddr", AddAddr); + *U = CI; + AddAddr = CI; + } else + AddAddr = nullptr; + if (AddAddr) { + // Adjust the offset on the add_addr. The offset is operand 1 of the + // add_addr, and it is either a constant int or a splat of a constant + // int. + Constant *C = ConstantInt::get(CommonOffsetVal->getType(), + AdjustedOffset); + if (auto VT = dyn_cast(AddAddr->getOperand(1)->getType())) + C = ConstantVector::getSplat(VT->getNumElements(), C); + AddAddr->setOperand(1, C); + // Ensure the add_addr is baled in to the rdregion/wrregion that uses + // it. (It was not if we have just created it, or if its offset was out + // of range.) Also remove its live range. + assert(AddAddr->hasOneUse()); + auto User = cast(AddAddr->use_begin()->getUser()); + assert(GenXIntrinsic::isRdRegion(User) || GenXIntrinsic::isWrRegion(User)); + auto BI = Baling->getBaleInfo(User); + BI.setOperandBaled(AddAddr->use_begin()->getOperandNo()); + Baling->setBaleInfo(User, BI); + Liveness->eraseLiveRange(AddAddr); + } + } + } + // Now we can actually common up the in range addresses, if more than one of + // them. + if (InRangeAddrs.size() > 1) { + Modified = true; + processCommonAddrsWithValidOffsets(InRangeAddrs); + } + // Call recursively to process the remaining (out of range) ones. + if (!OutOfRangeAddrs.empty()) + Modified |= processCommonAddrs(OutOfRangeAddrs); + return Modified; +} + +/*********************************************************************** + * processCommonAddrsWithValidOffsets : common up some address conversions + * + * Enter: Addrs = two or more address conversion instructions that all have + * the same input and address the same base register, with no + * duplicates, and all have valid in range offsets (add.addr intrinsics) + * + * This function relies on there being no duplicates in Addrs in the way that + * it erases the address conversions other than the one it uses as the common + * one. + */ +void GenXAddressCommoning::processCommonAddrsWithValidOffsets( + ArrayRef Addrs) +{ + // Find the address conversion that dominates all the others. + Instruction *DominatingAddr = findClosestCommonDominator( + getDominatorTree(), Addrs); + if (DominatingAddr && DominatingAddr->isTerminator()) { + // Ensure we have a legal insertion point in the presence of SIMD CF. + auto InsertBefore = GotoJoin::getLegalInsertionPoint(DominatingAddr, + getDominatorTree()); + // We did not find one address conversion that dominates all of them. Move + // an arbitrarily chosen one to the end of the dominating basic block. + // This position dominates the other address conversions, and is dominated + // by the index input value. + // We need to move the entire bale, not just the address conversion + // instruction itself. The whole bale is given an instruction number the + // same as the terminator of the closest common dominator block that it is + // being inserted before. 
Doing this is a bit dodgy because the result of + // the address conversion does not appear to interfere with the operands + // of a cmp baled into a conditional branch, but in practice this is not + // a problem because the result of an address conversion is an address + // register and the + unsigned Num = Numbering->getNumber(InsertBefore); + Bale B; + Baling->buildBale(Addrs[0], &B); + for (auto i = B.begin(), e = B.end(); i != e; ++i) { + Instruction *Inst = i->Inst; + DominatingAddr = Inst; + Inst->removeFromParent(); + Inst->insertBefore(InsertBefore); + Numbering->setNumber(Inst, Num); + } + } + // Use the dominating one instead of all the others. + for (auto i = Addrs.begin(), e = Addrs.end(); i != e; ++i) { + Instruction *Addr = *i; + if (Addr == DominatingAddr) + continue; + Addr->replaceAllUsesWith(DominatingAddr); + do { + auto Next = dyn_cast(Addr->getOperand(0)); + Liveness->removeValue(Addr); + + // It happens that after commoning there are unused dangling instructions + // in some cases and vISA writer asserts. + bool EraseAddr = true; + if (GenXIntrinsic::getGenXIntrinsicID(Addr) == + GenXIntrinsic::genx_rdregioni || + GenXIntrinsic::getGenXIntrinsicID(Addr) == + GenXIntrinsic::genx_rdregionf) { + Value *Idx = Addr->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum); + auto II = dyn_cast(Idx); + if (II && II->hasOneUse()) { + Addr->eraseFromParent(); + EraseAddr = false; + + assert(II->use_empty()); + Liveness->removeValue(II); + II->eraseFromParent(); + } + } + if (EraseAddr) + Addr->eraseFromParent(); + + Addr = Next; + } while (Addr && Addr->use_empty()); + } + // Rebuild the live range for the common address calculation. + // Note that we do not rebuild the live ranges for the input(s) to the + // common address calculation bale; this is conservative. + Liveness->rebuildLiveRange(Liveness->getLiveRange(DominatingAddr)); +} + +/*********************************************************************** + * vectorizeAddrs : attempt to vectorize address conversions for one base reg + * + * Enter: LR = LiveRange with all the values for this base register + * + * If there are multiple scalar address conversions for this base reg where + * the index is an extract from the same vector, we attempt to common them up + * into a vector address conversion with extracts from the result. This is the + * histogram optimization. + */ +bool GenXAddressCommoning::vectorizeAddrs(LiveRange *LR) +{ + // Gather the address conversions from an extract from a vector used by + // regions of this base register into buckets, one for each distinct vector + // being extracted from and each distinct address conversion offset. + std::map, ExtractBucket> ExtractBuckets; + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) { + Value *V = vi->getValue(); + // Ignore the value if it is in the wrong function. That can happen because + // liveness information is shared between functions in the same group. + if (!isValueInCurrentFunc(V)) + continue; + // First the def, if it is a wrregion. + if (GenXIntrinsic::isWrRegion(V)) { + Value *Index = cast(V)->getOperand( + GenXIntrinsic::GenXRegion::WrIndexOperandNum); + addAddrConvIfExtract(&ExtractBuckets, Index); + } + // Then each use that is a rdregion. (A use that is a wrregion will be + // handled when we look at that value, which must be coalesced into the + // same live range.) 
+ for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + if (ui->getOperandNo() != GenXIntrinsic::GenXRegion::OldValueOperandNum) + continue; + auto user = cast(ui->getUser()); + if (!GenXIntrinsic::isRdRegion(user)) + continue; + Value *Index = user->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum); + addAddrConvIfExtract(&ExtractBuckets, Index); + } + } + // Process each bucket of address calculations that extract from the + // same vector. + bool Modified = false; + for (auto i = ExtractBuckets.begin(), e = ExtractBuckets.end(); i != e; ++i) + if (i->second.Addrs.size() >= 2) + Modified |= vectorizeAddrsFromOneVector(i->second.Addrs); + return Modified; +} + +/*********************************************************************** + * addAddrConvIfExtract : add an address conversion to the appropriate + * bucket if the address is an extract from a vector + * + * Enter: ExtractBuckets = map of buckets + * Index = index operand from rdregion/wrregion + * + * Possibly after traversing some add_addr ops, Index is a constant or a + * convert_addr. If it is a convert_addr whose input is an extract (a scalar + * rdregion) with a single use, add the convert_addr to the bucket for the + * vector that the extract is extracted from. + */ +void GenXAddressCommoning::addAddrConvIfExtract( + std::map, ExtractBucket> *ExtractBuckets, Value *Index) +{ + while (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_add_addr) + Index = cast(Index)->getOperand(0); + if (isa(Index)) + return; + assert(GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_convert_addr); + auto RdR = dyn_cast(cast(Index)->getOperand(0)); + if (!GenXIntrinsic::isRdRegion(RdR)) + return; + assert(RdR); + if (!isa(RdR->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum))) + return; + if (!RdR->hasOneUse()) + return; + auto AddrConv = cast(Index); + int AddrConvOffset = cast(AddrConv->getOperand(1))->getSExtValue(); + (*ExtractBuckets)[std::pair(RdR->getOperand(0), AddrConvOffset)] + .add(AddrConv); +} + +/*********************************************************************** + * tryConvertWholeRegion : attempt to convert whole region + * + * Enter: Extracts -- array of address conversions, extracted from + * inputs to vectorizeAddrsFromOneVector, + * combined with corresponding region offset + * VecDef -- instruction definition from first extract + * + * This is subroutine of vectorizeAddrsFromOneVector, see more comments + * in parent function. Idea of this subroutine is to convert whole + * region if possible + */ +bool GenXAddressCommoning::tryConvertWholeRegion(SmallVector &Extracts, Instruction *VecDef) { + Instruction *InsertBefore = Extracts[0].Addr; + unsigned int MinNum, MaxNum; + + // maximal difference between MinNum and MaxNum to accept region + // TODO: to be tuned? 
+ const int SIZE_THRESHOLD = 48; + + MinNum = MaxNum = Numbering->getNumber(InsertBefore); + // check every extract + for (unsigned Idx = 0, End = Extracts.size(); Idx < End; ++Idx) { + Instruction *RdR = cast(Extracts[Idx].Addr->getOperand(0)); + Region R(RdR, BaleInfo()); + if (R.NumElements > 1 && R.Stride > 1) + return false; + // all address-conv must be in the same basic block + if (Extracts[Idx].Addr != InsertBefore && + Extracts[Idx].Addr->getParent() != InsertBefore->getParent()) { + LLVM_DEBUG(errs() << "tryConvertWholeRegion: not all in the same block\n"); + return false; + } + // test to update the insertion-point + unsigned int ThisNum = Numbering->getNumber(Extracts[Idx].Addr); + if (ThisNum < MinNum) { + InsertBefore = Extracts[Idx].Addr; + MinNum = ThisNum; + } + if (ThisNum > MaxNum) + MaxNum = ThisNum; + } + if ((MaxNum - MinNum) > SIZE_THRESHOLD) + return false; + // Create a vectorized address conversion and bale the new rdregion (if + // any) into it. Give the new vectorized address conversion, and the new + // rdregion (if any), the number of one less than the insert point. + int AddrConvOffset = + cast(Extracts[0].Addr->getOperand(1))->getSExtValue(); + auto NewConv = createConvertAddr(VecDef, AddrConvOffset, + Extracts[0].Addr->getName() + ".monted", InsertBefore); + NewConv->setDebugLoc(VecDef->getDebugLoc()); + Numbering->setNumber(NewConv, Numbering->getNumber(VecDef) + 1); + // For each original address conversion, replace it with an + // extract from the vectorized convert, and bale the extract into + // its use. If it has more than one use, create an extract per use + // (because a baled in instruction must be single use). + for (unsigned Idx2 = 0, End2 = Extracts.size(); Idx2 < End2; ++Idx2) { + auto OldConv = Extracts[Idx2].Addr; + Instruction *OldExtract = cast(OldConv->getOperand(0)); + Region R2(OldExtract, BaleInfo()); + while (!OldConv->use_empty()) { + auto ui = OldConv->use_begin(); + auto user = cast(ui->getUser()); + auto NewExtract = R2.createRdRegion(NewConv, OldConv->getName(), user, + user->getDebugLoc(), /*ScalarAllowed=*/!OldConv->getType()->isVectorTy()); + Numbering->setNumber(NewExtract, Numbering->getNumber(user)); + // At this late stage, I believe nothing relies on the baling type for + // this instruction being set to RDREGION, but we set it anyway for + // completeness. + Baling->setBaleInfo(NewExtract, BaleInfo(BaleInfo::RDREGION)); + BaleInfo BI = Baling->getBaleInfo(user); + BI.setOperandBaled(ui->getOperandNo()); + Baling->setBaleInfo(user, BI); + *ui = NewExtract; + } + Liveness->removeValue(OldConv); + assert(!Liveness->getLiveRangeOrNull(OldExtract) && "expected extract to be baled in"); + OldConv->eraseFromParent(); + OldExtract->eraseFromParent(); + } + // Give the new vectorized address conversion a live range. + auto LR = Liveness->getOrCreateLiveRange(NewConv); + LR->setCategory(RegCategory::ADDRESS); + Liveness->rebuildLiveRange(LR); + return true; +} + +/*********************************************************************** + * vectorizeAddrsFromOneVector : attempt to vectorize address conversions + * + * Enter: Addrs = address conversions for the same base reg with the same + * offset that are all scalar rdregion (constant offset) from + * the same vector, at least two of them + * + * If there are multiple scalar address conversions for this base reg where the + * index is an extract from the same vector, we attempt to common them up into + * a vector address conversion with extracts from the result. 
This is the + * histogram optimization. + */ +bool GenXAddressCommoning::vectorizeAddrsFromOneVector( + ArrayRef Addrs) +{ + bool Modified = false; + SmallVector Extracts; + bool HasVector = false; + std::set OffsetSet; + LLVM_DEBUG(dbgs() << "Collecting addrs: " << Addrs.size() << "\n"); + + for (auto i = Addrs.begin(), e = Addrs.end(); i != e; ++i) { + Instruction *Addr = *i; + LLVM_DEBUG(Addr->dump()); + + Region R(cast(Addr->getOperand(0)), BaleInfo()); + LLVM_DEBUG(dbgs() << " [" << R.Offset << "]\n"); + + Extracts.push_back(Extract(Addr, R.Offset)); + OffsetSet.insert(R.Offset); + if (isa(Addr->getType())) + HasVector = true; + } + bool ConvertWholeRegion = false; + Instruction *FirstRdR = cast(Extracts[0].Addr->getOperand(0)); + assert(FirstRdR); + Instruction *VecDef = cast(FirstRdR->getOperand(0)); + assert(VecDef); + + unsigned InputNumElements = VecDef->getType()->getVectorNumElements(); + + if (HasVector) { + if (InputNumElements == 2 || InputNumElements == 4 || + InputNumElements == 8 || InputNumElements == 16) + ConvertWholeRegion = true; + else + return Modified; + } + else if (OffsetSet.size()*3 >= InputNumElements*2 && + (InputNumElements == 2 || InputNumElements == 4 || + InputNumElements == 8 || InputNumElements == 16)) + ConvertWholeRegion = true; + + // Sort into offset order. + std::sort(Extracts.begin(), Extracts.end()); + + if (ConvertWholeRegion) { + bool Success = tryConvertWholeRegion(Extracts, VecDef); + if (Success) { + LLVM_DEBUG(dbgs() << "Succesfully converted whole region\n"); + return true; + } + + LLVM_DEBUG(dbgs() << "Failed to convert whole region\n"); + if (!ConvertAfterWholeRegion) + return false; + } + + // if we tried to convert whole region and failed + // we shall check that we will try to optimize further + // correct extract set + assert(Extracts.size() > 0); + Type *FirstType = Extracts[0].Addr->getOperand(0)->getType(); + assert(FirstType); + + for (auto e : Extracts) { + Type *Tp = e.Addr->getOperand(0)->getType(); + if (ConvertWholeRegion && (Tp != FirstType)) + return false; + } + + // Scan through the address conversions... + for (unsigned Idx = 0, Num = 1, End = Extracts.size(); + Idx < End - 2; Idx += Num) { + // See how many extracts we can take in one go that have evenly spaced + // offsets, max 8. + int Diff = Extracts[Idx + 1].Offset - Extracts[Idx].Offset; + for (Num = 2; Num != 8 && Num != End - Idx; ++Num) + if (Extracts[Idx + Num].Offset - Extracts[Idx + Num - 1].Offset != Diff) + break; + if (Num == 1) + continue; + // We have a sequence of more than one extract. Construct the region + // parameters for it. + Instruction *FirstRdR = cast(Extracts[Idx].Addr->getOperand(0)); + LLVM_DEBUG(dbgs() << "Sequence of " << Num << " instructions found. First one is:\n"); + LLVM_DEBUG(FirstRdR->dump()); + LLVM_DEBUG(dbgs() << "\n"); + Region R(FirstRdR, BaleInfo()); + R.NumElements = R.Width = Num; + R.Stride = Diff / R.ElementBytes; + // See how big we can legally make the region. + unsigned InputNumElements = FirstRdR + ->getOperand(0)->getType()->getVectorNumElements(); + Num = R.getLegalSize(0, /*Allow2D=*/true, InputNumElements, ST); + if (Num == 1) + continue; + // Even after legalizing the region, we can still vectorize to more than + // one element. + R.getSubregion(0, Num); + // Determine where to insert the new rdregion (if any) and vectorized + // address conversion. 
+ SmallVector Addrs; + for (unsigned i = 0; i != Num; ++i) + Addrs.push_back(Extracts[Idx + i].Addr); + auto InsertBefore = findClosestCommonDominator(getDominatorTree(), Addrs); + // Ensure we have a legal insertion point in the presence of SIMD CF. + InsertBefore = GotoJoin::getLegalInsertionPoint(InsertBefore, + getDominatorTree()); + // Read the region containing all the scalar indices we are commoning + // up. (If R is the identity region, just use the whole original vector + // instead.) + Value *NewRdR = cast(Extracts[Idx].Addr->getOperand(0)) + ->getOperand(0); + Instruction *NewRdRInst = nullptr; + if (InputNumElements != R.NumElements) { + // Not identity region. + NewRdR = NewRdRInst = R.createRdRegion(NewRdR, + Extracts[Idx].Addr->getName(), InsertBefore, + Extracts[Idx].Addr->getDebugLoc(), false); + Baling->setBaleInfo(NewRdRInst, BaleInfo(BaleInfo::RDREGION)); + } + // Create a vectorized address conversion and bale the new rdregion (if + // any) into it. Give the new vectorized address conversion, and the new + // rdregion (if any), the number of one less than the insert point. + int AddrConvOffset = cast(Addrs[0]->getOperand(1))->getSExtValue(); + auto NewConv = createConvertAddr(NewRdR, AddrConvOffset, + Extracts[Idx].Addr->getName() + ".histogrammed", InsertBefore); + NewConv->setDebugLoc(Extracts[Idx].Addr->getDebugLoc()); + Numbering->setNumber(NewConv, Numbering->getNumber(InsertBefore) - 1); + if (NewRdRInst) { + Numbering->setNumber(NewRdRInst, Numbering->getNumber(InsertBefore) - 1); + BaleInfo BI(BaleInfo::MAININST); + BI.setOperandBaled(0); + Baling->setBaleInfo(NewConv, BI); + } + // For each original scalar address conversion, replace it with an + // extract from the vectorized convert, and bale the extract in to + // its use. If it has more than one use, create an extract per use + // (because a baled in instruction must be single use). + for (unsigned Idx2 = 0; Idx2 != Num; ++Idx2) { + auto OldConv = Extracts[Idx + Idx2].Addr; + Region R2(NewConv); + R2.getSubregion(Idx2, 1); + while (!OldConv->use_empty()) { + auto ui = OldConv->use_begin(); + auto user = cast(ui->getUser()); + auto NewExtract = R2.createRdRegion(NewConv, OldConv->getName(), user, + user->getDebugLoc(), /*ScalarAllowed=*/true); + Numbering->setNumber(NewExtract, Numbering->getNumber(user)); + // At this late stage, I believe nothing relies on the baling type for + // this instruction being set to RDREGION, but we set it anyway for + // completeness. + Baling->setBaleInfo(NewExtract, BaleInfo(BaleInfo::RDREGION)); + BaleInfo BI = Baling->getBaleInfo(user); + BI.setOperandBaled(ui->getOperandNo()); + Baling->setBaleInfo(user, BI); + *ui = NewExtract; + } + Liveness->removeValue(OldConv); + auto OldExtract = cast(OldConv->getOperand(0)); + assert(!Liveness->getLiveRangeOrNull(OldExtract) && "expected extract to be baled in"); + OldConv->eraseFromParent(); + OldExtract->eraseFromParent(); + } + // Give the new vectorized address conversion a live range. 
+ auto LR = Liveness->getOrCreateLiveRange(NewConv); + LR->setCategory(RegCategory::ADDRESS); + Liveness->rebuildLiveRange(LR); + Modified = true; + } + return Modified; +} + +/*********************************************************************** + * getDominatorTree : get dominator tree for current function + */ +DominatorTree *GenXAddressCommoning::getDominatorTree() +{ + return getAnalysis().getDomTree(F); +} + +/*********************************************************************** + * isValueInCurrentFunc : determine whether V is in the current function + * + * Enter: V = value from a LiveRange (therefore it is an Instruction + * or an Argument) + * + * Return: true if it is in the current function + */ +bool GenXAddressCommoning::isValueInCurrentFunc(Value *V) +{ + if (auto Inst = dyn_cast(V)) { + auto BB = Inst->getParent(); + if (!BB) + return false;; // unified return value + return BB->getParent() == F; + } + if (isa(V)) + return false; + return cast(V)->getParent() == F; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXAggregatePseudoLowering.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAggregatePseudoLowering.cpp new file mode 100644 index 000000000000..4c1ab6ba4d60 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAggregatePseudoLowering.cpp @@ -0,0 +1,366 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXAggregatePseudoLowering +/// --------------------------- +/// +/// The pass is meant to replace all instructions that work with aggregate +/// values with instructions that work with elementary types (scalar, vector), +/// so there's no aggregate values in IR at all. But this pass doesn't do full +/// job, that's why it has pseudo in its name. +/// This pass replaces every instruction (except call, extract/insertvalue, etc) +/// that either has aggregate as operand, or returns an aggregate with series +/// of extractvalue instructions (if there was an aggregate operand) which +/// return only elementary values, then sequence of splits of the original +/// instruction (but now each one is working only with an elementary value) and +/// finally the sequence of insertvalues that join all elementary results back +/// to the original aggregate result. 
+/// +/// Example: +/// Before pass: +/// %struct_t = type { <16 x float>, <16 x float>, <16 x float> } +/// %res = select i1 %c, %struct_t %arg.0, %struct_t %arg.1 +/// After pass: +/// %struct_t = type { <16 x float>, <16 x float>, <16 x float> } +/// %arg.0.0 = extractvalue %struct_t %arg.0, 0 +/// %arg.0.1 = extractvalue %struct_t %arg.0, 1 +/// %arg.0.2 = extractvalue %struct_t %arg.0, 2 +/// %arg.1.0 = extractvalue %struct_t %arg.1, 0 +/// %arg.1.1 = extractvalue %struct_t %arg.1, 1 +/// %arg.1.2 = extractvalue %struct_t %arg.1, 2 +/// %res.0 = select i1 %c, <16 x float> %arg.0.0, <16 x float> %arg.1.0 +/// %res.1 = select i1 %c, <16 x float> %arg.0.1, <16 x float> %arg.1.1 +/// %res.2 = select i1 %c, <16 x float> %arg.0.2, <16 x float> %arg.1.2 +/// %tmp.0 = insertvalue %struct_t undef, <16 x float> %res.0, 0 +/// %tmp.1 = insertvalue %struct_t %tmp.0, <16 x float> %res.1, 1 +/// %res = insertvalue %struct_t %tmp.1, <16 x float> %res.2, 2 +/// +/// As you can see the pass doesn't fully get rid of aggregate values, it only +/// locally replaces operations over aggregates with operations over elementary +/// fields of aggregates. But if there is the instruction combine pass after +/// this pass, it can easily merge extractvalue and insertvalue so the there's +/// no aggregate values in code anymore. +/// +/// Terminology: +/// Split instructions - the instructions into which original instruction +/// is split, e.g. %res.0, %res.1, %res.2 are split insts +/// (%res is corresponding original instruction) +/// Split operands - the instructions into which original operands are split, +/// they are always extractvalue instructions, e.g. +/// %arg.0.0, %arg.0.1, %arg.0.2 are split operands +/// (%arg.0 is corresponding original operand) +/// +/// Note: split instruction operands is operands of a split instruction, not +/// split operands, though split instruction operands contain at least one +/// split operand, e.g. %c, %arg.0.0, %arg.1.0 for %res.0 instruction. +/// +/// TODO: currently this pass can only handle only flat structures (without +/// nested aggregates). Supported instructions are phi and select. +// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXModule.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +#include + +using namespace llvm; +using namespace genx; + +namespace { + +// It is a map between original aggregate instruction operand +// and corresponding split operands. +// Split operands are always extractvalue instructions. 
+using SplitOpsMap = std::unordered_map>; + +class GenXAggregatePseudoLowering : public FunctionPass { + std::vector ToErase; + +public: + static char ID; + explicit GenXAggregatePseudoLowering() : FunctionPass(ID) {} + StringRef getPassName() const override { + return "GenX aggregate pseudo lowering"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; + +private: + void processInst(Instruction &Inst); +}; + +} // end namespace + +char GenXAggregatePseudoLowering::ID = 0; +namespace llvm { +void initializeGenXAggregatePseudoLoweringPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(GenXAggregatePseudoLowering, + "GenXAggregatePseudoLowering", + "GenXAggregatePseudoLowering", false, false) +INITIALIZE_PASS_END(GenXAggregatePseudoLowering, "GenXAggregatePseudoLowering", + "GenXAggregatePseudoLowering", false, false) + +FunctionPass *llvm::createGenXAggregatePseudoLoweringPass() { + initializeGenXAggregatePseudoLoweringPass(*PassRegistry::getPassRegistry()); + return new GenXAggregatePseudoLowering; +} + +void GenXAggregatePseudoLowering::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +// is at least one of instruction's operands an aggregate value +static bool hasAggregateOperand(const Instruction &Inst) { + return llvm::any_of(Inst.operand_values(), [](const Value *V) { + return V->getType()->isAggregateType(); + }); +} + +// does instruction have an aggregate as an operand or return value +static bool hasAggregate(const Instruction &Inst) { + return Inst.getType()->isAggregateType() || hasAggregateOperand(Inst); +} + +bool GenXAggregatePseudoLowering::runOnFunction(Function &F) { + std::vector WorkList; + auto WorkRange = make_filter_range(instructions(F), [](Instruction &Inst) { + return hasAggregate(Inst) && !isa(Inst) && + !isa(Inst) && !isa(Inst) && + !isa(Inst); + }); + llvm::transform(WorkRange, std::back_inserter(WorkList), + [](Instruction &Inst) { return &Inst; }); + if (WorkList.empty()) + return false; + + for (auto *Inst : WorkList) + processInst(*Inst); + + for (auto *Inst : ToErase) + Inst->eraseFromParent(); + ToErase.clear(); + return true; +} + +// Returns first instruction after provided instruciton \p Inst, +// before which new instruction can be inserted. +static Instruction *getFirstInsertionPtAfter(Instruction &Inst) { + if (isa(Inst)) + return Inst.getParent()->getFirstNonPHI(); + return Inst.getNextNode(); +} + +// Returns first instruction before which new instruction that represent new +// operand can be inserted, so the new instruction precedes provided +// instruction. \p Inst. Operand \Op is the operator to be updated. +static Instruction *getFirstInsertionPtBefore(Use &Op, Instruction &Inst) { + if (!isa(Inst)) + return &Inst; + return cast(Inst).getIncomingBlock(Op)->getTerminator(); +} + +// Arguments: +// \p Inst - an instruction +// \p Op - operand of the instruction \p Inst +// +// Returns an instruction before which new operand for instruction \p Inst, +// that correspond to the operand \p Op, can be inserted +static Instruction *getInsertionPtForSplitOp(Use &Op, Instruction &Inst) { + auto &OpVal = *Op.get(); + if (isa(OpVal)) + return getFirstInsertionPtAfter(cast(OpVal)); + assert(isa(OpVal) && "only instruction or constant are expected"); + return getFirstInsertionPtBefore(Op, Inst); +} + +// Arguments: +// \p Inst - an instruction +// \p Op - operand of the instruction \p Inst +// +// Splits operand \p Op of the instruction \p Inst into elementary values. 
+// Those values are extractvalue instructions. Inserts those instruction in +// proper places, so if we insert new instruction right after or right before +// \p Inst those instructions could be reached. +// +// Returns the vector of created instructions. +static std::vector createSplitOperand(Use &Op, + Instruction &Inst) { + auto &OpVal = *Op.get(); + assert(OpVal.getType()->isAggregateType() && "wrong argument"); + // TODO: support ArrayType + auto *InstTy = cast(OpVal.getType()); + auto *InsertionPt = getInsertionPtForSplitOp(Op, Inst); + std::vector SplitOperand; + for (unsigned i = 0; i < InstTy->getNumElements(); ++i) { + assert(!InstTy->getElementType(i)->isAggregateType() && + "folded structures is yet unsupported"); + SplitOperand.push_back( + ExtractValueInst::Create(&OpVal, i, "", InsertionPt)); + } + return SplitOperand; +} + +// Arguments: +// \p Inst - an instruction +// +// Splits all aggregate operands of provided \p Inst. +// Returns a map between original operands and created instructions. +static SplitOpsMap createSplitOperands(Instruction &Inst) { + assert(hasAggregateOperand(Inst) && + "wrong argument: inst must have aggregate operand"); + auto AggregateOps = make_filter_range(Inst.operands(), [](const Use &U) { + return U->getType()->isAggregateType(); + }); + + SplitOpsMap SplitOps; + llvm::transform(AggregateOps, std::inserter(SplitOps, SplitOps.end()), + [&Inst](Use &U) { + return std::make_pair(&U, createSplitOperand(U, Inst)); + }); + return SplitOps; +} + +// Arguments: +// \p elemIdx - element index of the aggregate for which we construct +// split instruction +// \p OrigOps - original instruction operands (contain aggregates) +// \p SplitOps - map between original aggregate operands and corresponding +// split operands +// +// Returns vector of operands (as Value*) for split instruction with index \p +// elemIdx. +template +std::vector createSplitInstOperands(int elemIdx, OpRange OrigOps, + const SplitOpsMap &SplitOps) { + std::vector NewOps; + llvm::transform(OrigOps, std::back_inserter(NewOps), + [elemIdx, &SplitOps](Use &OrigOp) -> Value * { + if (OrigOp.get()->getType()->isAggregateType()) + return SplitOps.at(&OrigOp)[elemIdx]; + return OrigOp.get(); + }); + return NewOps; +} + +// Arguments: +// \p Inst - original instruction +// \p NewOps - operands for split instruction +// +// Creates split instruction based on the kind of original instruction. +// New instruction is inserted right before \p Inst. +// Split instruction is returned. 
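+//
+// For illustration (names and types chosen for the example only): splitting
+// element 0 of
+//   %p = phi %struct_t [ %a, %bb0 ], [ %b, %bb1 ]
+// where %struct_t = type { <16 x float>, <16 x float> } produces
+//   %p.split.aggr = phi <16 x float> [ %a.0, %bb0 ], [ %b.0, %bb1 ]
+// with %a.0 and %b.0 being the extractvalue split operands created earlier.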
+static Instruction *createSplitInst(Instruction &Inst, + const std::vector &NewOps) { + if (isa(Inst)) { + assert(NewOps.size() == 3 && "select must have 3 operands"); + auto *NewSelect = + SelectInst::Create(NewOps[0], NewOps[1], NewOps[2], + Inst.getName() + ".split.aggr", &Inst, &Inst); + NewSelect->setDebugLoc(Inst.getDebugLoc()); + return NewSelect; + } + assert(isa(Inst) && "unsupported instruction"); + assert(Inst.getNumOperands() == NewOps.size() && ""); + auto *NewPHI = PHINode::Create(NewOps[0]->getType(), NewOps.size(), + Inst.getName() + ".split.aggr", &Inst); + + auto &OldPHI = cast(Inst); + for (auto &&Incoming : zip(NewOps, OldPHI.blocks())) { + Value *OpVal = std::get<0>(Incoming); + BasicBlock *OpBB = std::get<1>(Incoming); + assert(isa(OpVal) && + "phi operands must be previously in this pass created " + "extractvalue insts"); + auto *OpInst = cast(OpVal); + NewPHI->addIncoming(OpInst, OpBB); + } + NewPHI->setDebugLoc(Inst.getDebugLoc()); + return NewPHI; +} + +// Arguments: +// \p Inst - original instruction +// \p SplitOps - map between original aggregate operands and corresponding +// elementary operands +// +// Creates all split instructions for original \p Inst, inserts them before the +// original one. Returns vector of created split instructions. +static std::vector +createSplitInsts(Instruction &Inst, const SplitOpsMap &SplitOps) { + // TODO: support ArrayType + auto &InstTy = *cast(Inst.getType()); + int NumNewInsts = InstTy.getNumElements(); + std::vector NewInsts; + NewInsts.reserve(NumNewInsts); + for (int i = 0; i < NumNewInsts; ++i) { + auto NewOps = createSplitInstOperands(i, Inst.operands(), SplitOps); + NewInsts.push_back(createSplitInst(Inst, NewOps)); + } + return NewInsts; +} + +// Arguments: +// \p SplitInsts - split instructions +// \p JoinTy - aggregate type that all split instructions together should +// form \p InsertBefore - insertion point +// +// Combines split instructions back into aggregate value with a sequence of +// inservalue instructions. +// Last insertvalue instruction that form full aggregate value is returned. 
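+//
+// For illustration (names chosen for the example only): joining %s.0 and %s.1
+// back into { <16 x float>, <16 x float> } emits roughly
+//   %j.0 = insertvalue { <16 x float>, <16 x float> } undef, <16 x float> %s.0, 0
+//   %j.1 = insertvalue { <16 x float>, <16 x float> } %j.0, <16 x float> %s.1, 1
+// and %j.1 is returned.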
+static Instruction *joinSplitInsts(const std::vector &SplitInsts, + Type *JoinTy, Instruction *InsertBefore) { + assert(SplitInsts.size() == cast(JoinTy)->getNumElements() && + "number of splitted insts doesn't correspond with aggregate type"); + Value *JoinInst = UndefValue::get(JoinTy); + unsigned Idx = 0; + for (auto *SplitInst : SplitInsts) { + JoinInst = + InsertValueInst::Create(JoinInst, SplitInst, Idx++, "", InsertBefore); + } + return cast(JoinInst); +} + +void GenXAggregatePseudoLowering::processInst(Instruction &Inst) { + assert(hasAggregate(Inst) && + "wrong argument: instruction doesn't work with aggregates"); + SplitOpsMap NewOperands; + if (hasAggregateOperand(Inst)) + NewOperands = createSplitOperands(Inst); + auto NewInsts = createSplitInsts(Inst, NewOperands); + auto *JoinInst = + joinSplitInsts(NewInsts, Inst.getType(), getFirstInsertionPtAfter(Inst)); + Inst.replaceAllUsesWith(JoinInst); + ToErase.push_back(&Inst); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.cpp new file mode 100644 index 000000000000..2ca18722a8e7 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.cpp @@ -0,0 +1,401 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// AlignmentInfo is a cache of information on the alignment of instruction +// values in a function. Alignment is stored as LogAlign and ExtraBits +// (ExtraBits < 1 << LogAlign) where a value is known to be +// A << LogAlign | ExtraBits. +// +// For a vector value, the alignment information is for element 0. +// +// The alignment of a value is computed as it is required, rather than all +// values in a function being computed in a separate analysis pass. 
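+//
+// For example, a value known to be of the form 16*A + 5 for some unknown A is
+// represented with LogAlign = 4 and ExtraBits = 5, while a value about which
+// nothing is known has LogAlign = 0 and ExtraBits = 0.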
+// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_ALIGNMENT_INFO" + +#include +#include "GenX.h" +#include "GenXAlignmentInfo.h" +#include "GenXBaling.h" +#include "GenXRegion.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" +#include + +using namespace llvm; +using namespace genx; + +/*********************************************************************** + * AlignmentInfo::get : get the aligmment of a Value + * + * Return: the Alignment + */ +Alignment AlignmentInfo::get(Value *V) +{ + if (auto C = dyn_cast(V)) + return Alignment(C); + auto Inst = dyn_cast(V); + if (!Inst) { + // An Argument has unknown alignment. + // (FIXME: We may need to do better than this, tracing the value of the + // Argument at call sites, when arg indirection is introduced.) + return Alignment::getUnknown(); + } + auto MapEntry = &InstMap[Inst]; + if (!MapEntry->isUncomputed()) + return *MapEntry; // already in cache + // Need to compute for this instruction. + LLVM_DEBUG(dbgs() << "AlignmentInfo::get: computing alignment for " << Inst->getName() << "\n"); + // Get the web of instructions related to this one, including going through + // phi nodes, excluding ones that we already have alignment for. + std::set InstWebSet; + SmallVector InstWeb; + InstWebSet.insert(Inst); + InstWeb.push_back(Inst); + for (unsigned i = 0; i != InstWeb.size(); ++i) { + auto WorkInst = InstWeb[i]; + if (auto Phi = dyn_cast(WorkInst)) { + for (unsigned ii = 0, ie = Phi->getNumIncomingValues(); ii != ie; ++ii) + if (auto IncomingInst = dyn_cast(Phi->getIncomingValue(ii))) + if (InstMap.find(IncomingInst) == InstMap.end() + && InstWebSet.insert(IncomingInst).second) + InstWeb.push_back(IncomingInst); + } else if (isa(WorkInst) || isa(WorkInst)) { + for (unsigned oi = 0, oe = WorkInst->getNumOperands(); oi != oe; ++oi) + if (auto IncomingInst = dyn_cast(WorkInst->getOperand(oi))) + if (InstMap.find(IncomingInst) == InstMap.end() + && InstWebSet.insert(IncomingInst).second) + InstWeb.push_back(IncomingInst); + } else if (CastInst *CI = dyn_cast(WorkInst)) { + if (auto IncomingInst = dyn_cast(WorkInst->getOperand(0))) + if (InstMap.find(IncomingInst) == InstMap.end() + && InstWebSet.insert(IncomingInst).second) + InstWeb.push_back(IncomingInst); + } else + switch (GenXIntrinsic::getGenXIntrinsicID(WorkInst)) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + case GenXIntrinsic::genx_convert: + case GenXIntrinsic::genx_convert_addr: + if (auto IncomingInst = dyn_cast(WorkInst->getOperand(0))) + if (InstMap.find(IncomingInst) == InstMap.end() + && InstWebSet.insert(IncomingInst).second) + InstWeb.push_back(IncomingInst); + break; + case GenXIntrinsic::genx_ssmad: + case GenXIntrinsic::genx_uumad: + case GenXIntrinsic::genx_add_addr: + for (unsigned oi = 0, oe = WorkInst->getNumOperands(); oi != oe; ++oi) + if (auto IncomingInst = dyn_cast(WorkInst->getOperand(oi))) + if (InstMap.find(IncomingInst) == InstMap.end() + && InstWebSet.insert(IncomingInst).second) + InstWeb.push_back(IncomingInst); + break; + default: + break; + } + } + LLVM_DEBUG(dbgs() << "web:"; + for (unsigned i = 0, e = InstWeb.size(); i != e; ++i) + dbgs() << " " << InstWeb[i]->getName(); + dbgs() << "\n"); + // Use a worklist algorithm where each instruction in the web is initially on + // the worklist. 
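+  // Each iteration pops an instruction, recomputes its alignment from the
+  // current alignments of its operands, and, if the result changed, pushes the
+  // instruction's users within the web back onto the worklist, so the
+  // computation runs to a fixed point.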
+ std::set WorkSet; + for (auto i = InstWeb.begin(), e = InstWeb.end(); i != e; ++i) + WorkSet.insert(*i); + while (!InstWeb.empty()) { + Instruction *WorkInst = InstWeb.back(); + InstWeb.pop_back(); + WorkSet.erase(WorkInst); + LLVM_DEBUG(dbgs() << " processing " << WorkInst->getName() << "\n"); + + Alignment A(0, 0); // assume unknown + if (BinaryOperator *BO = dyn_cast(WorkInst)) { + A = Alignment(); // assume uncomputed + Alignment A0 = getFromInstMap(BO->getOperand(0)); + Alignment A1 = getFromInstMap(BO->getOperand(1)); + if (!A0.isUncomputed() && !A1.isUncomputed()) { + switch (BO->getOpcode()) { + case Instruction::Add: + A = A0.add(A1); + break; + case Instruction::Sub: + if (A1.isConstant()) + A = A0.add(-(A1.getConstBits())); + else + A = Alignment::getUnknown(); + break; + case Instruction::Mul: + A = A0.mul(A1); + break; + case Instruction::Shl: + if (A1.isConstant()) { + A1 = Alignment(A1.getConstBits(), 0); + A = A0.mul(A1); + } else + A = Alignment::getUnknown(); + break; + default: + A = Alignment::getUnknown(); + break; + } + } + } else if (CastInst *CI = dyn_cast(WorkInst)) { + // Handle a bitcast for the same reason as above. This also handles + // trunc, sext, zext. + A = getFromInstMap(CI->getOperand(0)); + if (!A.isUncomputed()) { + unsigned LogAlign = A.getLogAlign(), ExtraBits = A.getExtraBits(); + LogAlign = std::min( + LogAlign, + static_cast( + CI->getType()->getScalarType()->getPrimitiveSizeInBits())); + if (LogAlign < 32) + ExtraBits &= (1 << LogAlign) - 1; + A = Alignment(LogAlign, ExtraBits); + } else if (!CI->isIntegerCast()) { + // For no-only-integer cast instructions - FPToUI, FPToSI + A = Alignment::getUnknown(); + } + } else if (auto Phi = dyn_cast(WorkInst)) { + // For a phi node, ignore uncomputed incomings so we have an initial + // guess at alignment value to propagate round a loop and refine in + // a later visit to this same phi node. + A = Alignment(); // initialize to uncomputed + for (unsigned ii = 0, ie = Phi->getNumIncomingValues(); ii != ie; ++ii) { + LLVM_DEBUG(dbgs() << " incoming: " << *Phi->getIncomingValue(ii) << "\n"); + LLVM_DEBUG(dbgs() << " merging " << A << " and " << getFromInstMap(Phi->getIncomingValue(ii)) << "\n"); + A = A.merge(getFromInstMap(Phi->getIncomingValue(ii))); + LLVM_DEBUG(dbgs() << " giving " << A << "\n"); + } + } else { + switch (GenXIntrinsic::getGenXIntrinsicID(WorkInst)) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: { + // Handle the case of reading a scalar from element 0 of a vector, as + // a trunc from i32 to i16 is lowered to a bitcast to v2i16 then a + // rdregion. 
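+      // For example (sketch):
+      //   %v  = bitcast i32 %x to <2 x i16>
+      //   %lo = rdregion(%v)   ; reads element 0
+      // so %lo can take the alignment already computed for %x.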
+ Region R(WorkInst, BaleInfo()); + if (!R.Indirect && !R.Offset) + A = getFromInstMap(WorkInst->getOperand(0)); + else + A = Alignment(0, 0); + break; + } + case GenXIntrinsic::genx_constanti: + A = Alignment(cast(WorkInst->getOperand(0))); + break; + case GenXIntrinsic::genx_convert: + case GenXIntrinsic::genx_convert_addr: + A = getFromInstMap(WorkInst->getOperand(0)); + break; + case GenXIntrinsic::genx_add_addr: { + Alignment AA[2]; + for (unsigned oi = 0, oe = WorkInst->getNumOperands(); oi != oe && oi < 2; ++oi) + AA[oi] = getFromInstMap(WorkInst->getOperand(oi)); + if (!AA[0].isUncomputed() && !AA[1].isUncomputed()) + A = AA[0].add(AA[1]); + else + A = Alignment(0, 0); + break; + } + case GenXIntrinsic::genx_ssmad: + case GenXIntrinsic::genx_uumad: { + A = Alignment(); // assume uncomputed + // every source operand should be computed or constant + Alignment SA[3]; + for (unsigned oi = 0, oe = WorkInst->getNumOperands(); oi != oe && oi < 3; ++oi) + SA[oi] = getFromInstMap(WorkInst->getOperand(oi)); + if (!SA[0].isUncomputed() && !SA[1].isUncomputed() && !SA[2].isUncomputed()) + A = SA[0].mul(SA[1]).add(SA[2]); + else + A = Alignment(0, 0); + break; + } + default: + A = Alignment(0, 0); // no alignment info + break; + } + } + // See if the alignment has changed for WorkInst. + auto MapEntry = &InstMap[WorkInst]; + if (*MapEntry == A) + continue; // no change + *MapEntry = A; + LLVM_DEBUG(dbgs() << " " << WorkInst->getName() << " updated to " << A << "\n"); + // Add all users that are in the original web to the worklist, if + // not already in the worklist. + for (auto ui = WorkInst->use_begin(), ue = WorkInst->use_end(); + ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (InstWebSet.find(user) != InstWebSet.end() + && WorkSet.insert(user).second) + InstWeb.push_back(user); + } + } + MapEntry = &InstMap[Inst]; + assert(!MapEntry->isUncomputed()); + LLVM_DEBUG(dbgs() << "AlignmentInfo::get: returning " << *MapEntry << "\n"); + return *MapEntry; +} + +/*********************************************************************** + * Alignment constructor given literal value + */ +Alignment::Alignment(unsigned C) +{ + LogAlign = countTrailingZeros(C); + ExtraBits = 0; + ConstBits = (C < 0x7fffffff)? C : 0x7fffffff; +} + +/*********************************************************************** + * Alignment constructor given Constant + */ +Alignment::Alignment(Constant *C) +{ + setUncomputed(); + if (isa(C->getType())) + C = C->getAggregateElement(0U); + if (isa(C)) { + LogAlign = 31; + ExtraBits = 0; + ConstBits = 0x7fffffff; + } else if (auto CI = dyn_cast(C)) { + LogAlign = countTrailingZeros((unsigned)(CI->getSExtValue())); + ExtraBits = 0; + ConstBits = 0x7fffffff; + if (CI->getSExtValue() < 0x7fffffff && CI->getSExtValue() >= 0) + ConstBits = (unsigned)(CI->getSExtValue()); + } +} + +/*********************************************************************** + * merge : merge two alignments + */ +Alignment Alignment::merge(Alignment Other) const +{ + // If either is uncomputed, result is the other one. + if (isUncomputed()) + return Other; + if (Other.isUncomputed()) + return *this; + // Take the minimum of the two logaligns, then chop off some more for + // disagreeing extrabits. 
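+  // For example (values chosen for illustration): merging 8A+2 (LogAlign = 3,
+  // ExtraBits = 2) with 8B+6 (LogAlign = 3, ExtraBits = 6) disagrees in bit 2,
+  // so the result is LogAlign = 2, ExtraBits = 2, i.e. 4C+2.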
+ unsigned MinLogAlign = std::min(LogAlign, Other.LogAlign); + if (MinLogAlign) { + unsigned DisagreeExtraBits = (ExtraBits ^ Other.ExtraBits) + & ((1 << MinLogAlign) - 1); + MinLogAlign = std::min(MinLogAlign, + (unsigned)countTrailingZeros(DisagreeExtraBits, ZB_Width)); + } + return Alignment(MinLogAlign, ExtraBits & ((1 << MinLogAlign) - 1)); +} + +/*********************************************************************** + * merge : add two alignments + */ +Alignment Alignment::add(Alignment Other) const +{ + assert(!isUncomputed() && !Other.isUncomputed()); + // Take the minimum of the two logaligns, then chop off some more for + // disagreeing extrabits. + unsigned MinLogAlign = std::min(LogAlign, Other.LogAlign); + unsigned ExtraBits2 = 0; + if (MinLogAlign) { + ExtraBits2 = (ExtraBits + Other.ExtraBits) + & ((1 << MinLogAlign) - 1); + MinLogAlign = std::min(MinLogAlign, + (unsigned)countTrailingZeros(ExtraBits2, ZB_Width)); + } + return Alignment(MinLogAlign, ExtraBits2 & ((1 << MinLogAlign) - 1)); +} + +/*********************************************************************** +* merge : mul two alignments +*/ +Alignment Alignment::mul(Alignment Other) const +{ + assert(!isUncomputed() && !Other.isUncomputed()); + // Take the minimum of the two logaligns, then chop off some more for + // disagreeing extrabits. + unsigned MinLogAlign = std::min(LogAlign, Other.LogAlign); + if (ExtraBits == 0 && Other.ExtraBits == 0) + MinLogAlign = LogAlign + Other.LogAlign; + else if (ExtraBits == 0) + MinLogAlign = LogAlign; + else if (Other.ExtraBits == 0) + MinLogAlign = Other.LogAlign; + unsigned ExtraBits2 = 0; + if (MinLogAlign) { + ExtraBits2 = (ExtraBits * Other.ExtraBits) + & ((1 << MinLogAlign) - 1); + MinLogAlign = std::min(MinLogAlign, + (unsigned)countTrailingZeros(ExtraBits2, ZB_Width)); + } + return Alignment(MinLogAlign, ExtraBits2 & ((1 << MinLogAlign) - 1)); +} + +/*********************************************************************** + * getFromInstMap : get the alignment of a value, direct from InstMap if + * found else return Unknown, Alignment(0, 0) + */ +Alignment AlignmentInfo::getFromInstMap(Value *V) +{ + if (auto C = dyn_cast(V)) + return Alignment(C); + if (auto Inst = dyn_cast(V)) { + return InstMap[V]; + } + return Alignment::getUnknown(); +} + +/*********************************************************************** + * Alignment debug dump/print + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void Alignment::dump() const +{ + errs() << *this << "\n"; +} +#endif + +void Alignment::print(raw_ostream &OS) const +{ + if (isUncomputed()) + OS << "uncomputed"; + else if (isUnknown()) + OS << "unknown"; + else if (isConstant()) + OS << "const=" << ConstBits; + else + OS << "n<<" << LogAlign << "+" << ExtraBits; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.h new file mode 100644 index 000000000000..3b714126bdef --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAlignmentInfo.h @@ -0,0 +1,154 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the 
Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// genx::AlignmentInfo : alignment information +/// ------------------------------------------- +/// +/// AlignmentInfo is a cache of information on the alignment of instruction +/// values in a function. It does not persist between passes. +/// +/// A pass that needs alignment information constructs an AlignmentInfo at +/// the start of the pass, and then calls the ``get`` method each time it wants +/// alignment information for a particular instruction value. AlignmentInfo +/// calculates it if it is not already in its cache, which probably involves +/// also calculating the alignment of other instructions that the given one +/// depends on. +/// +/// This cacheing and lazy calculation is done instead of having a separate analysis +/// pass because alignment is needed for only a small subset of values in a function. +/// +/// The alignment is returned as an *Alignment* object with three fields: +/// *ConstBits*, if ConstBits is not 0x7fffffff, alignment is a known bit-pattern, +/// otherwise *LogAlign* and *ExtraBits* (where 0 <= ExtraBits < (1 << LogAlign)), +/// stating that the value is known to be A << LogAlign | ExtraBits for some A. +/// +/// For a vector value, the alignment information is for element 0. +/// +/// The calculation uses a worklist algorithm that can cope with phi nodes and +/// loops. So, for example, a variable (used as an indirect region index) that +/// starts at 10 then is incremented by 8 inside a loop is correctly calculated +/// to be 8A+2 for some A. +/// +//===----------------------------------------------------------------------===// + +#ifndef GENXALIGNMENTINFO_H +#define GENXALIGNMENTINFO_H + +#include "GenX.h" +#include "IgnoreRAUWValueMap.h" + +namespace llvm { + class raw_ostream; + +namespace genx { + +// Alignment : the alignment of a value +class Alignment { + unsigned LogAlign; + unsigned ExtraBits; + unsigned ConstBits; +public: + // No-arg constructor sets to uncomputed state. + Alignment() { setUncomputed(); } + // Constructor given LogAlign and ExtraBits fields. + Alignment(unsigned LogAlign, unsigned ExtraBits) + : LogAlign(LogAlign), ExtraBits(ExtraBits), ConstBits(0x7fffffff) {} + // Constructor given literal value. + Alignment(unsigned C); + // Constructor given Constant. 
+ Alignment(Constant *C); + // Copy-constructor + Alignment(const Alignment& Rhs) { + LogAlign = Rhs.LogAlign; + ExtraBits = Rhs.ExtraBits; + ConstBits = Rhs.ConstBits; + } + // Copy-operator + Alignment& operator=(const Alignment &Rhs) { + LogAlign = Rhs.LogAlign; + ExtraBits = Rhs.ExtraBits; + ConstBits = Rhs.ConstBits; + return *this; + } + + // Get an unknown alignment + static Alignment getUnknown() { return Alignment(0, 0); } + // Merge two Alignments + Alignment merge(Alignment Other) const; + // Add one Alignment with another Alignment + Alignment add(Alignment Other) const; + // Mul one Alignment with another Alignment + Alignment mul(Alignment Other) const; + + // accessors + bool isUncomputed() const { return LogAlign == 0xffffffff; } + bool isUnknown() const { return LogAlign == 0 && ConstBits == 0x7fffffff; } + bool isConstant() const { return !isUncomputed() && ConstBits != 0x7fffffff; } + unsigned getLogAlign() const { assert(!isUncomputed()); return LogAlign; } + unsigned getExtraBits() const { assert(!isUncomputed()); return ExtraBits; } + int64_t getConstBits() const { assert(isConstant()); return ConstBits; } + // comparison + bool operator==(const Alignment &Rhs) const { + return (LogAlign == Rhs.LogAlign && + ExtraBits == Rhs.ExtraBits && + ConstBits == Rhs.ConstBits); + } + // Debug dump/print + void dump() const; + void print(raw_ostream &OS) const; +private: + void setUncomputed() { + LogAlign = 0xffffffff; + ExtraBits = 0; + ConstBits = 0x7fffffff; + } +}; + +// AlignmentInfo : cache of alignment of instructions in a function +class AlignmentInfo { + ValueMap> InstMap; +public: + // AlignmentInfo constructor + AlignmentInfo() {} + // Clear the cache of value alignments + void clear() { InstMap.clear(); } + // get the alignment of a Value + Alignment get(Value *V); +public: + // return an Alignment for a value + Alignment getFromInstMap(Value *V); +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const Alignment &A) { + A.print(OS); + return OS; +} + +} // end namespace genx +} // end namespace llvm + +#endif /* GENXALIGNMENTINFO_H */ diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXAnalysisDumper.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAnalysisDumper.cpp new file mode 100644 index 000000000000..a221ea459256 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXAnalysisDumper.cpp @@ -0,0 +1,144 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +// GenXAnalysisDumper is a pass that calls the print() method on a function +// pass to dump its state out to a file. +// GenXGroupAnalysisDumper is the same, but for a function group pass. +// +//===----------------------------------------------------------------------===// + +#include "FunctionGroup.h" +#include "GenX.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace genx; + +namespace { + +// GenXAnalysisDumper : a pass to dump an analysis to a file +class GenXAnalysisDumper : public FunctionPass { + FunctionPass *P; + const char *Suffix; +public: + static char ID; + explicit GenXAnalysisDumper(FunctionPass *P, const char *Suffix) + : FunctionPass(ID), P(P), Suffix(Suffix) { } + virtual StringRef getPassName() const { return "GenX analysis dumper pass"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionPass::getAnalysisUsage(AU); + AU.setPreservesAll(); + } + bool runOnFunction(Function &F); +}; + +// GenXGroupAnalysisDumper : a pass to dump an analysis to a file +class GenXGroupAnalysisDumper : public FunctionGroupPass { + FunctionGroupPass *P; + const char *Suffix; +public: + static char ID; + explicit GenXGroupAnalysisDumper(FunctionGroupPass *P, const char *Suffix) + : FunctionGroupPass(ID), P(P), Suffix(Suffix) { } + virtual StringRef getPassName() const { return "GenX analysis dumper pass"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.setPreservesAll(); + } + bool runOnFunctionGroup(FunctionGroup &FG); +}; + +} // end anonymous namespace + +char GenXAnalysisDumper::ID = 0; + +FunctionPass *llvm::createGenXAnalysisDumperPass( + FunctionPass *P, const char *Suffix) +{ + return new GenXAnalysisDumper(P, Suffix); +} + +char GenXGroupAnalysisDumper::ID = 0; + +FunctionGroupPass *llvm::createGenXGroupAnalysisDumperPass( + FunctionGroupPass *P, const char *Suffix) +{ + return new GenXGroupAnalysisDumper(P, Suffix); +} + +/*********************************************************************** + * openFileForDump : open file for dumping analysis into + * + * The filename is the name of the kernel, or the name of the function if + * not a kernel, with the supplied suffix. + * + * On error, this function prints an error message and returns -1. + */ +static int openFileForDump(Function *F, StringRef Suffix) +{ + // Get name of kernel, or failing that, name of function. + KernelMetadata KM(F); + StringRef Name = KM.getName(); + if (Name.empty()) + Name = F->getName(); + int FD = -1; + std::string Filename = (Name + Suffix).str(); + // Sanitize templated kernel names. 
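+  // For example, a kernel named "foo<16>" would otherwise yield a file name
+  // containing '<' and '>'; those characters are replaced with '_', giving
+  // "foo_16_" followed by the suffix.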
+ std::replace_if(Filename.begin(), Filename.end(), + [](const char x) { return x == '<' || x == '>'; }, '_'); + auto EC = sys::fs::openFileForWrite(Filename, FD, sys::fs::CD_CreateAlways, sys::fs::F_None); + if (EC) { + errs() << "Error: " << EC.message() << "\n"; + return -1; + } + return FD; +} + +/*********************************************************************** + * GenXAnalysisDumper::runOnFunction : dump analysis to file + */ +bool GenXAnalysisDumper::runOnFunction(Function &F) +{ + int FD = openFileForDump(&F, Suffix); + raw_fd_ostream O(FD, /*shouldClose=*/ true); + P->print(O, F.getParent()); + return false; +} + +/*********************************************************************** + * GenXGroupAnalysisDumper::runOnFunctionGroup : dump analysis to file + */ +bool GenXGroupAnalysisDumper::runOnFunctionGroup(FunctionGroup &FG) +{ + int FD = openFileForDump(FG.getHead(), Suffix); + raw_fd_ostream O(FD, /*shouldClose=*/ true); + P->print(O, FG.getHead()->getParent()); + return false; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXArgIndirection.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXArgIndirection.cpp new file mode 100644 index 000000000000..0509ca976013 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXArgIndirection.cpp @@ -0,0 +1,1822 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXArgIndirection +/// ------------------ +/// +/// The GenXArgIndirection pass runs very late, after coalescing and address +/// commoning, to change arguments and return values that were originally by ref +/// to use address registers. This saves copies and register pressure. +/// +/// Recall that, very early on in CMABI, a by ref argument is transformed into +/// copy-in copy-out semantics. +/// +/// This pass is run very late on for two reasons: +/// +/// 1. There is no convenient way to represent passing an argument using an +/// address register in LLVM IR. We don't want to pretend that the address +/// register is a pointer, and the GRF is an area of memory, as that would +/// stop us using Values to represent registers normally, and so would stop +/// us using lots of LLVM optimizations. 
+/// +/// Running the pass this late means that the IR afterwards does not have to +/// strictly represent the semantics, as nothing else happens to it before +/// generating the output code. So uses and defs of the indirected argument +/// (and other Values coalesced with it) still use the same Values, but that +/// live range has no register allocated (it is category NONE), and all +/// accesses are indirected. We rely on the LLVM IR together with the +/// liveness information representing the code well enough for register +/// allocation and code generation to work. +/// +/// 2. We cannot tell whether we want to perform this transformation until we can +/// see how Values have coalesced. +/// +/// Action of GenXArgIndirection +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// An argument for a subroutine call can generate a bunch of mov instructions in +/// two circumstances: +/// +/// 1. Coalescing failed to coalesce this call argument, so the argument in the +/// caller and the argument in the subroutine are in different registers +/// (different coalesced live ranges). In this case, GenXCoalescing has to +/// generate a sequence of baled together rdregion-wrregion intrinsic pairs, +/// each generating a mov instruction, to copy the value. +/// +/// 2. The argument was originally a by ref CM select(), so is an rdregion, +/// legalized into a sequence of baled together rdregion-wrregion pairs. +/// +/// The argument indirection pass attempts to spot these cases. The regions at +/// each call site must be similar (same region parameters except start index) +/// and contiguous. +/// +/// The pass modifies each call to pass an address register into the subroutine +/// as an extra argument, using it to indirecting all accesses to the subroutine +/// argument and other Values coalesced with it. It then removes the rd-wr +/// sequence so it does not generate any code. +/// +/// Indirecting all accesses to the subroutine argument is only possible if each +/// one would be legal as an indirect region. The pass uses the +/// hasIndirectGRFCrossing feature from GenXSubtarget to tell whether it would +/// be legal. The optimization can fail for this reason, and that is more common +/// on pre-SKL where there is no indirect region GRF crossing. +/// +/// The pass deals with one subroutine argument in one subroutine at a time. It +/// looks at all call sites to see if there is anything that stops this +/// transformation happening at all, and whether there is any call site that +/// would benefit from the transformation. +/// +/// Coalesced return value +/// """""""""""""""""""""" +/// +/// If the subroutine argument is coalesced with a return value from the call, +/// then argument indirection can succeed only if the return value at each call +/// site is written (similarly using a rd-wr sequence) to exactly the same +/// region in a vector that is coalesced (so same register) with the input +/// vector to the rd-wr sequence for the argument. +/// +/// No coalesced return value +/// """"""""""""""""""""""""" +/// +/// If the subroutine argument is _not_ coalesced with a return value from the +/// call, so only the arg could be indirected, indirection can only occur if one +/// of these conditions is met: +/// +/// 1. the live range being indirected is not live over the call (so it does not +/// matter if the subroutine writes to the same register), or +/// +/// 2. the subroutine does not write to the same register (i.e. 
there are no defs +/// in the subroutine arg's live range other than args and coalesced +/// bitcasts). +/// +/// Constant argument and rd-wr sequence return value +/// """"""""""""""""""""""""""""""""""""""""""""""""" +/// +/// Where the original source initializes a big vector or matrix to constant and +/// then calls a subroutine passing the vector by ref, the IR that this pass sees +/// is that the argument passed to the call is constant, and the rd-wr sequence +/// for the return value has an "old value" input that is another constant +/// (including undef). +/// +/// GenXArgIndirection spots this case, and transforms the code to load the +/// combination of the two constants before the call and pass an address register +/// set to the appropriate point. +/// +/// Indirection of subroutine +/// """"""""""""""""""""""""" +/// +/// If an argument is being indirected, all references to that register +/// (coalesced live range) inside the subroutine and everything it calls must be +/// indirected. +/// +/// GenXArgIndirection does not include the facility to split up a bale if it +/// would become illegal when indirected. This is only a problem in BDW and +/// earlier, where an indirect region is not allowed to cross even one GRF +/// boundary. If it sees an access with a region that would become illegal if +/// indirected, it abandons indirection of that argument. +/// +/// Warning messages +/// ^^^^^^^^^^^^^^^^ +/// +/// Where GenXArgIndirection sees a suitably large uncoalesced call arg that +/// would benefit from arg indirection, but it fails to satisfy the criteria, +/// the pass outputs a warning message. The idea is that the CM programmer +/// might consider some changes to his/her kernel to optimize it. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_ARGINDIRECTION" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXAlignmentInfo.h" +#include "GenXBaling.h" +#include "GenXConstants.h" +#include "GenXIntrinsics.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXNumbering.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/Utils/RegCategory.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +static cl::opt LimitGenXArgIndirection("limit-genx-arg-indirection", cl::init(UINT_MAX), cl::Hidden, + cl::desc("Limit GenX argument indirection.")); + + +namespace { + +class GenXArgIndirection; +class SubroutineArg; + + +// Diagnostic information for error/warning relating arg indirection. +class DiagnosticInfoArgIndirection : public DiagnosticInfo { +private: + std::string Description; + StringRef Filename; + unsigned Line; + unsigned Col; + static int KindID; + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } +public: + // Initialize from an Instruction and an Argument. 
+ DiagnosticInfoArgIndirection(Instruction *Inst, Argument *Arg, + const Twine &Desc, DiagnosticSeverity Severity = DS_Error); + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; +int DiagnosticInfoArgIndirection::KindID = 0; + +// processArgLR relies on these being in this order. +// checkIndirectability relies on these being powers of 2 (except +// CALLER_INDIRECTING being 0) +enum Indirectability { + CALLER_INDIRECTING = 0, + NO_OPTIMIZATION = 1, + WANT_INDIRECTION = 2, + WANT_SOME_INDIRECTION = 4, + CANNOT_INDIRECT = 8 +}; + +// A call site and the action that we want to take when indirecting the arg. +// This is then subclassed by the *CallSite classes below. +class CallSite { +public: + CallInst *CI; +protected: + Indirectability State; + Value *Index; +public: + CallSite(CallInst *CI, Indirectability State, Value *Index) + : CI(CI), State(State), Index(Index) {} + virtual ~CallSite() {} + Indirectability getState() const { return State; } + Value *getIndex() const { return Index; } + virtual Value *process(GenXArgIndirection *Pass, SubroutineArg *SubrArg) = 0; + virtual void printImpl(raw_ostream &OS) const = 0; + void print(raw_ostream &OS) const { printImpl(OS); } +}; + +raw_ostream &operator<<(raw_ostream &OS, const CallSite &CS) { + CS.print(OS); return OS; +} + +// A call site in a subroutine that is itself indirecting the arg. +class CallerIndirectingCallSite : public CallSite { + SubroutineArg *CallerSubrArg; +public: + CallerIndirectingCallSite(CallInst *CI, SubroutineArg *CallerSubrArg) + : CallSite(CI, Indirectability::CALLER_INDIRECTING, nullptr), + CallerSubrArg(CallerSubrArg) {} + virtual Value *process(GenXArgIndirection *Pass, SubroutineArg *SubrArg); + virtual void printImpl(raw_ostream &OS) const { + OS << "CallerIndirectingCallSite " << *CI; + } +}; + +// A call site where indirecting the arg does not give any optimization because +// we did not find copies or rd/wr regions that we can get rid of. We can still +// indirect it though if other call sites do get an optimization. +class NoOptCallSite : public CallSite { +public: + NoOptCallSite(CallInst *CI) + : CallSite(CI, Indirectability::NO_OPTIMIZATION, nullptr) {} + virtual Value *process(GenXArgIndirection *Pass, SubroutineArg *SubrArg); + virtual void printImpl(raw_ostream &OS) const { + OS << "NoOptCallSite " << *CI; + } +}; + +// A call site where the arg is constant (including undef) and the arg is +// coalesced with a retval that is used only in a legalized wrregion +// whose "old value" input is constant. +class ConstArgRetCallSite : public CallSite { + Constant *LdConst; // the constant that needs to be loaded + AssertingVH RetEndWr; // the last wrregion in the sequence for the retval +public: + ConstArgRetCallSite(CallInst *CI, Constant *LdConst, Instruction *RetEndWr, + Value *Index) + : CallSite(CI, Indirectability::WANT_INDIRECTION, Index), + LdConst(LdConst), RetEndWr(RetEndWr) {} + virtual Value *process(GenXArgIndirection *Pass, SubroutineArg *SubrArg); + virtual void printImpl(raw_ostream &OS) const { + OS << "ConstArgRetCallSite " << *CI << "\n LdConst " << *LdConst + << " \n RetEndWr " << *RetEndWr << "\n Index " << *Index; + } +}; + +// A call site where the arg is a legalized rdregion or copy, and there is no +// retval coalesced with it. +class IndirectArgCallSite : public CallSite { +protected: + // Some use of input (arg or inst) in legalized rdregion or copy. 
This is + // kept as a Use * rather than the value it actually uses to allow for the + // case that the value is something that will be replaced and erased by + // another call site processing the same ArgLR. + Use *InputUse; +public: + IndirectArgCallSite(CallInst *CI, Use *InputUse, Value *Index) + : CallSite(CI, Indirectability::WANT_INDIRECTION, Index), + InputUse(InputUse) {} + virtual Value *process(GenXArgIndirection *Pass, SubroutineArg *SubrArg); + virtual void printImpl(raw_ostream &OS) const { + OS << "IndirectArgCallSite " << *CI << "\n Input " << (*InputUse)->getName() + << " Index " << *Index; + } +}; + +// A call site where the arg is a legalized rdregion or copy, and the arg is +// coalesced with a retval that is used only in a legalized wrregion or copy. +class IndirectArgRetCallSite : public IndirectArgCallSite { + AssertingVH RetEndWr; // the last wrregion in the sequence for the retval +public: + IndirectArgRetCallSite(CallInst *CI, Use *InputUse, Instruction *RetEndWr, + Value *Index) : IndirectArgCallSite(CI, InputUse, Index), RetEndWr(RetEndWr) + {} + virtual Value *process(GenXArgIndirection *Pass, SubroutineArg *SubrArg); + virtual void printImpl(raw_ostream &OS) const { + OS << "IndirectArgRetCallSite " << *CI << "\n Input " << (*InputUse)->getName() + << " RetEndWr " << RetEndWr->getName() << " Index " << *Index; + } +}; + + +class GenXArgIndirection; + +// A subroutine arg that we might want to indirect +class SubroutineArg { + GenXArgIndirection *Pass; +public: + LiveRange *ArgLR; + Argument *Arg; +private: + int CoalescedRetIdx; + bool CanCoalesceWithoutKill; + SmallVector CallSites; + Alignment Align; + Function *F; + Function *NewFunc; +public: + Argument *AddressArgument; + SubroutineArg(GenXArgIndirection *Pass, LiveRange *ArgLR, Argument *Arg) + : Pass(Pass), ArgLR(ArgLR), Arg(Arg), F(Arg->getParent()), NewFunc(nullptr) {} + ~SubroutineArg() { + for (auto i = CallSites.begin(), e = CallSites.end(); i != e; ++i) + delete *i; + } + Indirectability checkIndirectability(); + CallSite *createCallSite(CallInst *CI); + Alignment getIndirectAlignment() const; + void gatherBalesToModify(Alignment Align); + void addAddressArg(); + void fixCallSites(); + void coalesceAddressArgs(); + void replaceFunction(); +private: + static Value *getRetVal(CallInst *CI, unsigned RetNum); +}; + +// GenX arg indirection pass +class GenXArgIndirection : public FunctionGroupPass { + friend CallSite; + friend SubroutineArg; + friend NoOptCallSite; + friend ConstArgRetCallSite; + friend IndirectArgCallSite; + friend IndirectArgRetCallSite; +private: + FunctionGroup *FG; + FunctionGroupAnalysis *FGA; + GenXBaling *Baling; + GenXLiveness *Liveness; + GenXNumbering *Numbering; + AlignmentInfo *AI; + const GenXSubtarget *ST; + // List of arg live ranges to consider. + SmallVector ArgLRs; + // For the ArgLR being processed: + // List of subroutine args in the ArgLR. + SmallVector SubrArgs; + // Bales that need modifying for indirection. + SmallVector BalesToModify; + // Map from function back to the SubroutineArg for it. + std::map FuncMap; + // List of LRs that we need to recalculate. 
+ SmallVector LRsToCalculate; +public: + static char ID; + explicit GenXArgIndirection() : FunctionGroupPass(ID) { } + virtual StringRef getPassName() const { return "GenX arg indirection"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); + } + bool runOnFunctionGroup(FunctionGroup &FG); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXGroupPrinterPass(O, Banner); } +private: + void gatherArgLRs(); + bool processArgLR(LiveRange *ArgLR); + bool gatherBalesToModify(LiveRange *ArgLR, Alignment Align); + bool checkIndirectBale(Bale *B, LiveRange *ArgLR, Alignment Align); + void indirectBale(Bale *B, LiveRange *ArgLR, Argument *AddressArg); + void indirectRegion(Use *U, Value *AddressArg, Instruction *InsertBefore); + static Argument *getArgForFunction(LiveRange *LR, Function *F); + void replaceAndEraseSequence(Instruction *RetEndWr, Value *V); +}; + +} // end anonymous namespace + +char GenXArgIndirection::ID = 0; +namespace llvm { void initializeGenXArgIndirectionPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXArgIndirection, "GenXArgIndirection", "GenXArgIndirection", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_DEPENDENCY(GenXNumbering) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_END(GenXArgIndirection, "GenXArgIndirection", "GenXArgIndirection", false, false) + +FunctionGroupPass *llvm::createGenXArgIndirectionPass() +{ + initializeGenXArgIndirectionPass(*PassRegistry::getPassRegistry()); + return new GenXArgIndirection(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the coalescing pass for this FunctionGroup + */ +bool GenXArgIndirection::runOnFunctionGroup(FunctionGroup &ArgFG) +{ + FG = &ArgFG; + unsigned Modified = 0; + // Get analyses that we use and/or modify. + FGA = &getAnalysis(); + Baling = &getAnalysis(); + Numbering = &getAnalysis(); + Liveness = &getAnalysis(); + AI = new AlignmentInfo; + ST = getAnalysis().getSubtarget(); + // Gather list of LRs containing an arg that we want to consider. (Two + // args might be coalesced together, so we consider a whole arg-containing + // LR at a time.) + gatherArgLRs(); + // Process them. + for (auto i = ArgLRs.begin(), e = ArgLRs.end(); + i != e && Modified < LimitGenXArgIndirection; ++i) { + if (processArgLR(*i)) { + ++Modified; + if (LimitGenXArgIndirection != UINT_MAX) + dbgs() << "genx-arg-indirection " << Modified << "\n"; + } + } + ArgLRs.clear(); + SubrArgs.clear(); + BalesToModify.clear(); + FuncMap.clear(); + LRsToCalculate.clear(); + delete AI; + return Modified != 0; +} + +/*********************************************************************** + * gatherArgLRs : gather a list of LRs containing an arg that we want to + * consider + */ +void GenXArgIndirection::gatherArgLRs() +{ + std::set Seen; + // For a kernel arg, add it to Seen but not to the list, so it will not get + // added to the list. We cannot indirect a kernel arg. 
+ for (auto ai = FG->at(0)->arg_begin(), ae = FG->at(0)->arg_end(); + ai != ae; ++ai) + Seen.insert(Liveness->getLiveRange(&*ai)); + // For a subroutine arg, add its LR to the list if it is not already in Seen. + for (auto fgi = FG->begin() + 1, fge = FG->end(); fgi != fge; ++fgi) { + if ((*fgi)->hasFnAttribute("referenced-indirectly")) + continue; + for (auto ai = (*fgi)->arg_begin(), ae = (*fgi)->arg_end(); ai != ae; ++ai) { + Argument *Arg = &*ai; + // Only process an arg that is bigger than 2 GRFs. + if (Arg->getType()->getPrimitiveSizeInBits() <= ST->getGRFWidth() * 16) + continue; + LiveRange *LR = Liveness->getLiveRange(Arg); + if (Seen.insert(LR).second) + ArgLRs.push_back(LR); + } + } +} + +/*********************************************************************** + * processArgLR : process one live range containing at least one subroutine arg + * + * Return: true = some modifications made + */ +bool GenXArgIndirection::processArgLR(LiveRange *ArgLR) +{ + // Get a list of args in this live range. + SubrArgs.clear(); + FuncMap.clear(); + LLVM_DEBUG(dbgs() << "processArgLR: " << *ArgLR << "\n"); + for (auto vi = ArgLR->value_begin(), ve = ArgLR->value_end(); vi != ve; ++vi) + if (auto Arg = dyn_cast(vi->getValue())) { + SubrArgs.push_back(SubroutineArg(this, ArgLR, Arg)); + FuncMap[Arg->getParent()] = &SubrArgs.back(); + } + // For each arg, see if we can or want to indirect. + Indirectability Res = Indirectability::NO_OPTIMIZATION; + for (auto SubrArg = SubrArgs.begin(), e = SubrArgs.end(); + SubrArg != e; ++SubrArg) { + LLVM_DEBUG(dbgs() << " checkIndirectability on arg " << SubrArg->Arg->getArgNo() + << " (" << (SubrArg->Arg->getType()->getPrimitiveSizeInBits() / 8U) + << " bytes) in " << SubrArg->Arg->getParent()->getName() << "\n"); + Res = std::max(Res, SubrArg->checkIndirectability()); + } + if (Res == Indirectability::NO_OPTIMIZATION) { + LLVM_DEBUG(dbgs() << "NO_OPTIMIZATION\n"); + return false; // no indirection needed + } + if (Res == Indirectability::CANNOT_INDIRECT) { + LLVM_DEBUG(dbgs() << "CANNOT_INDIRECT\n"); + return false; // cannot indirect this ArgLR + } + // Get the worst case alignment of the indices from the call sites if we + // indirect this arg. + Alignment Align = Alignment(5, 0); + for (auto SubrArg = SubrArgs.begin(), e = SubrArgs.end(); + SubrArg != e; ++SubrArg) { + auto ThisAlign = SubrArg->getIndirectAlignment(); + Align = Align.merge(ThisAlign); + } + // Gather the bales that need indirecting, and check whether indirection is + // possible. + if (!gatherBalesToModify(ArgLR, Align)) + return false; + LLVM_DEBUG(dbgs() << "GenXArgIndirection is going to indirect " << *ArgLR << "\n"); + LRsToCalculate.clear(); + if (Res == Indirectability::WANT_SOME_INDIRECTION) { + // The arg that we're indirecting is coalesced at some call site where we + // are going to indirect it (represented by a NoOptCallSite). To avoid the + // coalesced LR also being live at other call sites where the arg is in + // fact in some other register, we need to uncoalesce. We take the values + // in ArgLR and separate into two piles: one defined outside subroutines + // where ArgLR has an arg, and one defined inside such subroutines. Then + // the two piles get a live range each, and the latter one is marked as not + // needing a register allocating. 
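+  // Illustrative sketch (hypothetical value names, not from the pass): if
+  // ArgLR currently holds { %caller.vec, %sub.arg, %sub.tmp }, where
+  // %caller.vec is defined in a caller that keeps the arg in a register (a
+  // NoOptCallSite) and the other two live inside an indirected subroutine,
+  // then %caller.vec goes to OutsidePile (a new live range that keeps its
+  // register), while %sub.arg and %sub.tmp stay in ArgLR, which is later
+  // given category NONE.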
+ SmallVector OutsidePile; + SmallVector InsidePile; + for (auto vi = ArgLR->value_begin(), ve = ArgLR->value_end(); + vi != ve; ++vi) { + auto SV = *vi; + Function *ContainingFunc = Liveness->isUnifiedRet(SV.getValue()); + if (!ContainingFunc) { + if (auto VArg = dyn_cast(SV.getValue())) + ContainingFunc = VArg->getParent(); + else + ContainingFunc = cast(SV.getValue()) + ->getParent()->getParent(); + } + if (!FuncMap[ContainingFunc]) + OutsidePile.push_back(SV); + else + InsidePile.push_back(SV); + } + assert(!InsidePile.empty()); + if (!OutsidePile.empty()) { + Liveness->removeValuesNoDelete(ArgLR); + LiveRange *OutsideLR = Liveness->getOrCreateLiveRange(OutsidePile[0]); + OutsideLR->setCategory(ArgLR->getCategory()); + for (auto vi = OutsidePile.begin() + 1, ve = OutsidePile.end(); + vi != ve; ++vi) + Liveness->setLiveRange(*vi, OutsideLR); + for (auto vi = InsidePile.begin(), ve = InsidePile.end(); + vi != ve; ++vi) + Liveness->setLiveRange(*vi, ArgLR); + LLVM_DEBUG(dbgs() << " Uncoalesced ArgLR into " << *OutsideLR + << "\n and " << *ArgLR << "\n"); + LRsToCalculate.push_back(OutsideLR); + } + } + // ArgLR now contains only these values: + // - args that we are indirecting + // - other values inside the subroutines that we are indirecting + // We do not want it to get a register allocated, since those values will be + // indirected. We achieve that by setting ArgLR's category to NONE. + ArgLR->setCategory(RegCategory::NONE); + LLVM_DEBUG(dbgs() << " Not allocating register for arg's LR\n"); + // For each subroutine, replace the func with a new one that has an extra + // address arg. + for (auto SubrArg = SubrArgs.begin(), e = SubrArgs.end(); + SubrArg != e; ++SubrArg) + SubrArg->addAddressArg(); + // For each subroutine, fix up its call sites. + for (auto SubrArg = SubrArgs.begin(), e = SubrArgs.end(); + SubrArg != e; ++SubrArg) + SubrArg->fixCallSites(); + // Replace old function with new function. + for (auto SubrArg = SubrArgs.begin(), e = SubrArgs.end(); + SubrArg != e; ++SubrArg) + SubrArg->replaceFunction(); + // Run gatherBalesToModify again, as the list it made last time is now invalid + // due to code being changed. + if (!gatherBalesToModify(ArgLR, Align)) + llvm_unreachable("not expecting indirection to have become invalid in second run"); + // Indirect the bales. + for (auto bi = BalesToModify.begin(), be = BalesToModify.end(); + bi != be; ++bi) { + Instruction *Inst = *bi; + Bale B; + Baling->buildBale(Inst, &B); + auto argIter = Inst->getParent()->getParent()->arg_begin(); + std::advance(argIter, Inst->getParent()->getParent()->arg_size() - 1); + Argument *AddressArg = &*argIter; + indirectBale(&B, ArgLR, AddressArg); + } + // Recalculate live ranges as required. Rebuild the call graph first, as it + // has been made invalid by us replacing some functions. + { + Liveness->rebuildCallGraph(); + std::set LRsSeen; + for (auto i = LRsToCalculate.begin(), e = LRsToCalculate.end(); i != e; ++i) { + LiveRange *LR = *i; + if (LRsSeen.insert(LR).second) { + Liveness->rebuildLiveRange(LR); + LLVM_DEBUG(dbgs() << " recalculated " << *LR << "\n"); + } + } + } + // Coalesce (or insert copy on coalesce failure) new address args. 
+ for (auto SubrArg = SubrArgs.begin(), e = SubrArgs.end(); + SubrArg != e; ++SubrArg) + SubrArg->coalesceAddressArgs(); + return true; +} + +/*********************************************************************** + * checkIndirectability : check whether we want to and can indirect a + * subroutine argument, populating the SubrArg struct so we have the + * information needed to indirect it + * + * Return: NO_OPTIMIZATION : can indirect, but no optimization in terms of + * saving instructions or register pressure + * WANT_INDIRECTION : can indirect and it is an optimization. The live + * range does not include anything outside of subroutines where + * it is an arg, thus we need to ensure that no register is + * allocated to it. + * WANT_SOME_INDIRECTION : can indirect and it is an optimization. The + * live range does include something outside of subroutines where + * it is an arg, so we need to ensure that a register is allocated + * to it. We get this if some call sites are WANT_INDIRECTION and + * some are NO_OPTIMIZATION. + * CANNOT_INDIRECT : cannot indirect this live range at all. + */ +Indirectability SubroutineArg::checkIndirectability() +{ + if (F->hasFnAttribute(genx::FunctionMD::CMStackCall)) + return CANNOT_INDIRECT; + // See if there is a return value that is coalesced with the arg. + CoalescedRetIdx = -1; + for (unsigned ri = 0, re = IndexFlattener::getNumElements(F->getReturnType()); + ri != re; ++ri) { + if (Pass->Liveness->getLiveRange( + SimpleValue(Pass->Liveness->getUnifiedRet(F), ri)) == ArgLR) { + if (CoalescedRetIdx >= 0) { + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) { + auto CI = cast(ui->getUser()); + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Argument coalesced with multiple return values", DS_Warning); + CI->getContext().diagnose(Warn); + } + return Indirectability::CANNOT_INDIRECT; + } + CoalescedRetIdx = ri; + break; + } + } + // If there is no return value, check whether it is OK to indirect a call arg + // even if the call arg is not killed at the call. This is the case if there + // is no write to the subroutine arg's live range inside the subroutine(s) + // other than args and coalesced bitcasts. + CanCoalesceWithoutKill = true; + if (CoalescedRetIdx < 0) { + for (auto vi = ArgLR->value_begin(), ve = ArgLR->value_end(); vi != ve; ++vi) { + auto Inst = dyn_cast(vi->getValue()); + if (!Inst) + continue; // it's an arg, not an instruction + Function *Func = Pass->Liveness->isUnifiedRet(Inst); + if (!Func) + Func = Inst->getParent()->getParent(); + else + continue; + if (Pass->FuncMap.find(Func) == Pass->FuncMap.end()) + continue; // value not in one of the subroutines where the arg is indirected + auto BC = dyn_cast(Inst); + if (!BC || !Pass->Liveness->isBitCastCoalesced(BC)) { + CanCoalesceWithoutKill = false; + break; + } + } + } + + // Create an object of some subclass of CallSite for each call site. + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) { + auto CI = cast(ui->getUser()); + assert(ui->getOperandNo() == CI->getNumArgOperands()); + auto CallSite = createCallSite(CI); + if (!CallSite) + return Indirectability::CANNOT_INDIRECT; + CallSites.push_back(CallSite); + LLVM_DEBUG(dbgs() << " " << *CallSite << "\n"); + } + // Check indirection state for each call site. 
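+  // For illustration: the states are distinct powers of 2 (CALLER_INDIRECTING
+  // is 0), so OR-ing them below accumulates everything seen across the call
+  // sites, e.g.
+  //   NO_OPTIMIZATION | WANT_INDIRECTION == 1 | 2 == 3
+  // which the switch maps to WANT_SOME_INDIRECTION, while a mix of only
+  // CALLER_INDIRECTING and WANT_INDIRECTION still yields WANT_INDIRECTION.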
+ unsigned States = 0; + for (auto csi = CallSites.begin(), cse = CallSites.end(); csi != cse; ++csi) { + auto CallSite = *csi; + States |= CallSite->getState(); + } + switch (States & (Indirectability::NO_OPTIMIZATION | Indirectability::WANT_INDIRECTION)) { + case Indirectability::NO_OPTIMIZATION | Indirectability::WANT_INDIRECTION: + return Indirectability::WANT_SOME_INDIRECTION; + case Indirectability::WANT_INDIRECTION: + return Indirectability::WANT_INDIRECTION; + } + return Indirectability::NO_OPTIMIZATION; +} + +/*********************************************************************** + * createCallSite : create a CallSite object for this call + * + * Enter: CI = CallInst + * this->Arg = the Argument to look at + * this->ArgLR = its LiveRange + * this->CoalescedRetIdx = -1 else struct index of coalesced return value + * + * Return: 0 if this call stops arg indirection happening for this arg + * otherwise object of some subclass of CallSite + */ +CallSite *SubroutineArg::createCallSite(CallInst *CI) +{ + // Check if this call site is in a function that is itself indirecting the + // arg. + if (auto SubrArg = Pass->FuncMap[CI->getParent()->getParent()]) + return new CallerIndirectingCallSite(CI, SubrArg); + // Look at the call arg. + Value *V = CI->getArgOperand(Arg->getArgNo()); + // Skip any coalesced bitcasts. + while (auto BC = dyn_cast(V)) { + if (Pass->Liveness->getLiveRangeOrNull(BC->getOperand(0)) != ArgLR) + break; + V = BC->getOperand(0); + } + // If the call arg (before coalesced bitcasts) is a wrregion where the arg + // is the only use, try and parse it as a rd-wr sequence that reads a + // contiguous region and writes the whole of Arg. + RdWrRegionSequence ArgRWS; + if (!V->hasOneUse() || !GenXIntrinsic::isWrRegion(V) + || !ArgRWS.buildFromWr(cast(V), Pass->Baling) + || !ArgRWS.RdR.isContiguous() || !ArgRWS.WrR.isWhole(Arg->getType())) { + // Failed to find such a rd-wr sequence. Set ArgRWS to null. + ArgRWS = RdWrRegionSequence(); + } + // Look at the retval. + RdWrRegionSequence RetRWS; + if (CoalescedRetIdx >= 0) { + Value *RetVal = getRetVal(CI, CoalescedRetIdx); + if (!RetVal) { + // getRetVal could not determine what happens to this return value. + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Coalesced return value has unknown uses", DS_Warning); + CI->getContext().diagnose(Warn); + return nullptr; + } + if (!isa(RetVal)) { + // See if the return value has a single use in (after skipping coalesced + // bitcasts) a single wrregion or a rd-wr sequence. + // First skip single use coalesced bitcasts. + while (!RetVal->use_empty()) { + auto User = cast(RetVal->use_begin()->getUser()); + if (RetVal->hasOneUse()) { + if (auto BC = dyn_cast(User)) { + if (Pass->Liveness->getLiveRange(BC) == ArgLR) { + // Skip coalesced bitcast. + RetVal = BC; + continue; + } + } + } + // Attempt to parse as a rd-wr sequence that reads the whole of RetVal + // and writes a contiguous region, so it is either a legalized copy, or + // a legalized contiguous wrregion, and it is the only use of the input. + if (!GenXIntrinsic::isRdRegion(User) + || RetVal->use_begin()->getOperandNo() + != GenXIntrinsic::GenXRegion::OldValueOperandNum + || !RetRWS.buildFromRd(User, Pass->Baling) + || !RetRWS.WrR.isContiguous() + || !RetRWS.RdR.isWhole(RetVal->getType()) + || !RetRWS.isOnlyUseOfInput()) { + // That failed, so make RetRWS null. + RetRWS = RdWrRegionSequence(); + } + break; + } + } + } + + // Now check the various cases. This results in the creation of an object of + // some subclass of CallSite. 
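+  // Hypothetical CM-level sketch of the situation behind Case 1 below (names
+  // invented for illustration): a constant-initialized matrix whose region is
+  // passed by ref,
+  //   matrix<float, 16, 16> M = 0.0f;
+  //   Sub(M.select<8, 1, 16, 1>(4, 0));
+  // The rdregion feeding the call folds to a constant, so the call arg is a
+  // constant and the retval's legalized wrregion writes into a constant
+  // "old value".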
+ + // Check that the regions are contiguous, and report if they are not. + if (ArgRWS.isNull() && !ArgRWS.RdR.isContiguous()) { + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Non-contiguous region", DS_Warning); + CI->getContext().diagnose(Warn); + return new NoOptCallSite(CI); + } + if (RetRWS.isNull() && !RetRWS.WrR.isContiguous()) { + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Non-contiguous region for coalesced return value", DS_Warning); + CI->getContext().diagnose(Warn); + return new NoOptCallSite(CI); + } + + // Case 1: The call arg is constant (inc undef, or a legalized constant + // load), and the retval is input to a wrregion sequence where the "old + // value" input is also a constant (a legalized constant load, also allowing + // for a bitcast). This typically happens when the arg and ret were a by ref + // region of a matrix, but the matrix was initialized to constant, or not + // initialized at all, before the call, so the rdregion got simplified away. + if (!RetRWS.isNull()) { + Value *RetOldVal = RetRWS.OldVal; + while (auto BC = dyn_cast(RetOldVal)) + RetOldVal = BC->getOperand(0); + auto *RetOldValC = dyn_cast(RetOldVal); + if (!RetOldValC && GenXIntrinsic::isWrRegion(RetOldVal)) { + RdWrRegionSequence ConstRWS; + if (ConstRWS.buildFromWr(cast(RetOldVal), Pass->Baling)) + RetOldValC = dyn_cast(ConstRWS.Input); + } + if (RetOldValC) { + Constant *Input; + if (!ArgRWS.isNull()) + Input = dyn_cast(ArgRWS.Input); + else + Input = dyn_cast(CI->getArgOperand(Arg->getArgNo())); + if (Input) { + // Get the Input constant to the same element type as RetOldValC. + if (RetOldValC->getType()->getScalarType() + != Input->getType()->getScalarType()) { + Type *ElTy = RetOldValC->getType()->getScalarType(); + assert(ElTy->getPrimitiveSizeInBits()); + Input = ConstantExpr::getBitCast(Input, + VectorType::get(ElTy, + Input->getType()->getPrimitiveSizeInBits() + / ElTy->getPrimitiveSizeInBits())); + } + // Construct the constant that needs to be loaded. + assert(RetOldValC->getType()->getScalarType() == Input->getType()->getScalarType()); + auto LdConst = RetRWS.WrR.evaluateConstantWrRegion(RetOldValC, Input); + // Create the ConstArgRetCallSite object. + return new ConstArgRetCallSite(CI, LdConst, RetRWS.EndWr, + RetRWS.getWrIndex()); + } + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Coalesced return value does not match constant argument", DS_Warning); + CI->getContext().diagnose(Warn); + return nullptr; + } + } + + // Case 2: The call arg is a legalized contiguous rdregion or copy of + // non-constant, and there is no retval coalesced with it. + if (RetRWS.isNull() && !ArgRWS.isNull() && CoalescedRetIdx < 0 + && !isa(ArgRWS.Input)) { + // It is valid to indirect this arg only if one of these is true: + // 1. the input to ArgRWS is not live over the call, or + // 2. the coalesced live range for the arg is not written to inside the + // subroutine or anything it calls. + if (CanCoalesceWithoutKill || !Pass->Liveness->getLiveRange(ArgRWS.Input) + ->contains(Pass->Numbering->getNumber(CI))) + return new IndirectArgCallSite(CI, ArgRWS.getInputUse(), + ArgRWS.getRdIndex()); + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Argument is region in value that is live over call", DS_Warning); + CI->getContext().diagnose(Warn); + return nullptr; + } + + // Case 3: The call arg is a legalized rdregion or copy of non-constant, and + // the coalesced retval is a legalized wrregion or copy with the same region + // and the same base register. 
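+  // Conceptual IR shape for this case (illustrative only, simplified from the
+  // real genx.rdregion/wrregion intrinsics):
+  //   %r   = rdregion(%big, ..., %idx)        ; legalized copy of the region
+  //   %ret = call @Sub(..., %r, ...)
+  //   %new = wrregion(%big, %ret, ..., %idx)  ; same region, %big/%new coalesced
+  // Both sequences disappear once the call passes an address derived from
+  // %idx instead.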
+ if (!RetRWS.isNull() && !ArgRWS.isNull() && CoalescedRetIdx >= 0 + && !isa(ArgRWS.Input)) { + // Check the regions are the same. + if (ArgRWS.RdR == RetRWS.WrR) { + // Check the base registers are the same. + if (Pass->Liveness->getLiveRange(ArgRWS.Input) + == Pass->Liveness->getLiveRange(RetRWS.EndWr)) + return new IndirectArgRetCallSite(CI, ArgRWS.getInputUse(), + RetRWS.EndWr, ArgRWS.getRdIndex()); + } + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Coalesced return value does not match argument", DS_Warning); + CI->getContext().diagnose(Warn); + return nullptr; + } + + // Case 4: No optimization for this call site, and cannot even indirect it + // unless either there is a coalesced retval, or the subroutine arg's LR is + // not written inside the subroutines, or the call arg is killed at the call. + if (!CanCoalesceWithoutKill && !ArgRWS.isNull() && !isa(ArgRWS.Input) + && Pass->Liveness->getLiveRange(ArgRWS.Input) + ->contains(Pass->Numbering->getNumber(CI))) { + DiagnosticInfoArgIndirection Warn(CI, Arg, + "Argument is value that is live over call", DS_Warning); + CI->getContext().diagnose(Warn); + return nullptr; + } + + // Case 5: No optimization for this call site (but it can still be indirected + // if some other call site would get optimized). + return new NoOptCallSite(CI); +} + +/*********************************************************************** + * getIndirectAlignment : get worst-case alignment of indices if we indirect + * this arg and retval + */ +Alignment SubroutineArg::getIndirectAlignment() const +{ + Alignment Align(5, 0); // best case is GRF aligned + for (auto csi = CallSites.begin(), cse = CallSites.end(); + csi != cse; ++csi) { + auto CallSite = *csi; + Value *Index = CallSite->getIndex(); + if (!Index) + continue; + Align = Align.merge(Pass->AI->get(Index)); + } + return Align; +} + +/*********************************************************************** + * gatherBalesToModify : check whether the arg can be indirected and + * gather the bales that need modifying + * + * Enter: Align = the worst case alignment of the indirection + * this->BalesToModify = vector to populate + * + * Return: true if can be indirected, with + * BalesToModify populated with bales that need indirecting + */ +bool GenXArgIndirection::gatherBalesToModify(LiveRange *ArgLR, Alignment Align) +{ + LLVM_DEBUG(dbgs() << "gatherBalesToModify: alignment " << Align << "\n"); + BalesToModify.clear(); + // We call SubroutineArg::gatherBalesToModify for each subroutine that has + // an arg in this live range. Just gathering bales for all instructions and + // args in the live range in one go would not work, because there might be a + // call site where the call arg is coalesced, and we would end up indirecting + // it and other things it is coalesced with. + for (auto si = SubrArgs.begin(), se = SubrArgs.end(); si != se; ++si) + si->gatherBalesToModify(Align); + // Check the bales to see if we can legally indirect accesses to any value in + // ArgLR (i.e. the arg, the retval, and anything coalesced with it) by doing + // a dry run of modifying them. + for (auto btmi = BalesToModify.begin(), btme = BalesToModify.end(); + btmi != btme; ++btmi) { + Bale B; + Baling->buildBale(*btmi, &B); + if (!checkIndirectBale(&B, ArgLR, Align)) { + // Failure. For error reporting, get the arg for the function in which the + // failure occurred. 
+ Argument *Arg = getArgForFunction(ArgLR, B.getHead()->Inst->getParent() + ->getParent()); + DiagnosticInfoArgIndirection Warn(B.getHead()->Inst, Arg, + "Use of argument cannot be indirected", DS_Warning); + B.getHead()->Inst->getContext().diagnose(Warn); + return false; + } + } + return true; +} + +/*********************************************************************** + * gatherBalesToModify : gather the bales that need modifying for this one + * subroutine arg + * + * Enter: Align = the worst case alignment of the indirection + * Pass->BalesToModify = vector to populate + * + * Return: BalesToModify populated with bales that need indirecting + */ +void SubroutineArg::gatherBalesToModify(Alignment Align) +{ + std::set BalesSeen; + for (auto vi = ArgLR->value_begin(), ve = ArgLR->value_end(); vi != ve; ++vi) { + Value *V = vi->getValue(); + if (Pass->Liveness->isUnifiedRet(V)) + continue; // ignore unified ret + if (auto Inst = dyn_cast(V)) { + if (Inst->getParent()->getParent() != F) + continue; // ignore instruction in wrong function + // Add the def to the list of bales that will need modifying, unless + // it is a phi node or coalesced bitcast or insert/extract in struct + // or a non-intrinsic call. + if (!isa(Inst) && (!isa(Inst) + || Pass->Liveness->getLiveRange(Inst->getOperand(0)) != ArgLR) + && !isa(Inst) && !isa(Inst) + && (!isa(Inst) + || GenXIntrinsic::isAnyNonTrivialIntrinsic(Inst))) + if (BalesSeen.insert(Inst).second) + Pass->BalesToModify.push_back(Inst); + } else if (V != Arg) + continue; // ignore arg in wrong function + for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (auto CI = dyn_cast(User)) { + Function *CF = CI->getCalledFunction(); + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(CF)) { + // Non-intrinsic call. Ignore. (A call site using an arg being + // indirected gets handled differently.) + continue; + } + } else { + if (isa(User->getType())) + continue; // Ignore call with multiple retvals, or insert used to do + // multiple retvals + if (isa(User)) + continue; // Ignore extract in struct used to do multiple retvals + if (isa(User)) + continue; // Ignore phi nodes + if (isa(User)) + continue; // Ignore return instruction + if (isa(User) && Pass->Liveness->getLiveRange(User) == ArgLR) + continue; // Ignore coalesced bitcast + } + // Add the head of the bale to the list of bales that will need modifying. + auto UserHead = Pass->Baling->getBaleHead(User); + if (BalesSeen.insert(UserHead).second) + Pass->BalesToModify.push_back(UserHead); + } + } +} + +/*********************************************************************** + * checkIndirectBale : check if a bale can be indirected + * + * Enter: B = bale to check + * ArgLR = live range of values that need to be indirected + * Align = alignment of index being introduced + * + * Return: true if can be indirected + */ +bool GenXArgIndirection::checkIndirectBale(Bale *B, LiveRange *ArgLR, + Alignment Align) +{ + auto MainInst = B->getMainInst(); + if (MainInst) { + // Check for things about the main instruction that stop us indexing + // operand(s) or result in this bale. + if (MainInst->Inst->getType()->getPrimitiveSizeInBits() > 256 + && !ST->hasIndirectGRFCrossing()) { + // An execution size bigger than 1 GRF disqualifies the main + // instruction on <= BDW. 
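+      // For example, on a target with 32-byte GRFs and no indirect GRF
+      // crossing, a main instruction producing <16 x i32> (512 bits, i.e. two
+      // GRFs) cannot have its operands or result indirected here.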
+ LLVM_DEBUG(dbgs() << "execution size bigger than GRF\n"); + return false; + } + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(MainInst->Inst); + if (GenXIntrinsic::isAnyNonTrivialIntrinsic(IID)) { + // Cannot indirect a raw operand. We approximate this conservatively by + // spotting an intrinsic with void return type or with raw result. + if (MainInst->Inst->getType()->isVoidTy()) { + LLVM_DEBUG(dbgs() << "intrinsic with void return type assumed to have raw operands\n"); + return false; + } + if (GenXIntrinsicInfo(IID).getRetInfo().isRaw()) { + LLVM_DEBUG(dbgs() << "intrinsic with raw return value\n"); + return false; + } + } + } + // Check the rdregion(s) and wrregion. + for (auto bi = B->begin(), be = B->end(); bi != be; ++bi) { + switch (bi->Info.Type) { + case BaleInfo::WRREGION: + // Check wrregion if its result is coalesced with arg. + if (Liveness->getLiveRange(bi->Inst) == ArgLR) { + Region R(bi->Inst, bi->Info); + if (R.Indirect) + break; // already indirect + // Fake up scalar indirect index for the benefit of getLegalSize. + // It doesn't matter what the value is, as long as it is scalar. + R.Indirect = bi->Inst->getOperand( + GenXIntrinsic::GenXRegion::WrIndexOperandNum); + if (R.NumElements != R.getLegalSize(0, /*Allow2D=*/false, + /*InputNumElements=*/UINT_MAX, ST, Align)) { + LLVM_DEBUG(dbgs() << "wrregion cannot be indirected: " << R << "\n"); + return false; + } + } + break; + case BaleInfo::RDREGION: + // Check rdregion if its input is coalesced with arg. + if (Liveness->getLiveRange(bi->Inst->getOperand(0)) == ArgLR) { + Region R(bi->Inst, bi->Info); + if (R.Indirect) + break; // already indirect + // Fake up scalar indirect index for the benefit of getLegalSize. + // It doesn't matter what the value is, as long as it is scalar. + R.Indirect = bi->Inst->getOperand( + GenXIntrinsic::GenXRegion::RdIndexOperandNum); + if (R.NumElements != R.getLegalSize(0, /*Allow2D=*/true, + /*InputNumElements=*/UINT_MAX, ST, Align)) { + LLVM_DEBUG(dbgs() << "rdregion cannot be indirected: " << R << "\n"; + dbgs() << R.getLegalSize(0, /*Allow2D=*/true, + /*InputNumElements=*/UINT_MAX, ST, Align) << "\n"); + return false; + } + } + break; + default: + break; + } + } + return true; +} + +/*********************************************************************** + * addAddressArg : for this subroutine, replace the Function with a new + * one with an extra address arg, and modify all call sites + * + * This sets this->NewFunc, and modifies this->Arg to the argument in the + * new function. + */ +void SubroutineArg::addAddressArg() +{ + // Create the new function type. + auto FTy = F->getFunctionType(); + SmallVector ArgTys; + for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) + ArgTys.push_back(FTy->getParamType(i)); + ArgTys.push_back(Type::getInt16Ty(F->getContext())); + FTy = FunctionType::get(FTy->getReturnType(), ArgTys, false); + // Create the new function. + NewFunc = Function::Create(FTy, F->getLinkage(), ""); + NewFunc->takeName(F); + NewFunc->copyAttributesFrom(F); + F->getParent()->getFunctionList().insert(F->getIterator(), NewFunc); + // Set the new function's number to the same as the old function. + Pass->Numbering->setNumber(NewFunc, Pass->Numbering->getNumber(F)); + // Move the original function's unified return value across to the new + // function. + Pass->Liveness->moveUnifiedRet(F, NewFunc); + // The Function itself has a live range to represent the ranges of the + // subroutine itself and everything it calls. Change the Function in that + // live range. 
+ Pass->Liveness->replaceValue(F, NewFunc); + // Populate arrays OldArgs (the original func's args) and NewArgs (the new + // func's args). + SmallVector OldArgs, NewArgs; + for (auto ai = F->arg_begin(), ae = F->arg_end(); ai != ae; ++ai) + OldArgs.push_back(&*ai); + for (auto ai = NewFunc->arg_begin(), ae = NewFunc->arg_end(); ai != ae; ++ai) + NewArgs.push_back(&*ai); + // For the original args, change uses to use the new args instead. Also + // change the old arg's live range to have the new arg instead. + for (unsigned ArgNum = 0; ArgNum != OldArgs.size(); ++ArgNum) { + NewArgs[ArgNum]->setName(OldArgs[ArgNum]->getName()); + OldArgs[ArgNum]->replaceAllUsesWith(NewArgs[ArgNum]); + Pass->Liveness->replaceValue(OldArgs[ArgNum], NewArgs[ArgNum]); + } + // Change the Arg in the current SubroutineArg, and save the address arg. + Arg = NewArgs[Arg->getArgNo()]; + AddressArgument = NewArgs.back(); + // Give the address arg a live range, and mark that it needs calculating. + auto LR = Pass->Liveness->getOrCreateLiveRange(AddressArgument); + LR->setCategory(RegCategory::ADDRESS); + Pass->LRsToCalculate.push_back(LR); + // Set the name of the new address arg. + NewArgs[OldArgs.size()]->setName(Arg->getName() + ".addr"); + // Move the function code across. + NewFunc->getBasicBlockList().splice(NewFunc->begin(), F->getBasicBlockList()); +} + +/*********************************************************************** + * fixCallSites : fix up a call to the subroutine, so it calls the new + * function instead and passes the extra address arg + * + * For each call site, this calls the process() method on the object of a + * subclass of CallSite set up by createCallSite(). That returns the extra + * address arg, which this function then uses to create a replacement call + * instruction. + */ +void SubroutineArg::fixCallSites() +{ + for (auto csi = CallSites.begin(), cse = CallSites.end(); csi != cse; ++csi) { + auto CallSite = *csi; + LLVM_DEBUG(dbgs() << " fixCallSites: [" << Pass->Numbering->getNumber(CallSite->CI) + << "] " << *CallSite << "\n"); + // Process the call site. + // Create the replacement call instruction, with an added address arg that + // for now we set to undef. We do this first so that process() called below + // can modify the arg being indirected such that the eraseUnusedTree erases + // the rd-wr sequence that sets up the arg in the old call. + SmallVector Args; + for (unsigned oi = 0, oe = CallSite->CI->getNumArgOperands(); + oi != oe; ++oi) + Args.push_back(CallSite->CI->getArgOperand(oi)); + Args.push_back(UndefValue::get(Type::getInt16Ty(CallSite->CI->getContext()))); + CallInst *OldCI = CallSite->CI; + CallSite->CI = CallInst::Create(NewFunc, Args, "", OldCI); + CallSite->CI->takeName(OldCI); + CallSite->CI->setDebugLoc(OldCI->getDebugLoc()); + Pass->Numbering->setNumber(CallSite->CI, Pass->Numbering->getNumber(OldCI)); + Pass->Numbering->setStartNumber(CallSite->CI, + Pass->Numbering->getStartNumber(OldCI)); + // Get the subclass of CallSite to do its processing, returning the extra + // address arg for the call. + Value *AddressArg = CallSite->process(Pass, this); + LLVM_DEBUG(dbgs() << " AddressArg is " << AddressArg->getName() << "\n"); + if (!isa(AddressArg)) { + // Create a live range for the address arg, and ensure it is recalculated. + LiveRange *AddressArgLR = Pass->Liveness->getOrCreateLiveRange(AddressArg); + AddressArgLR->setCategory(RegCategory::ADDRESS); + Pass->LRsToCalculate.push_back(AddressArgLR); + } + // Use the address arg in the new call. 
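+    // (Args.size() - 1 is the trailing operand that was pushed as undef
+    // above; it now receives the real address.)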
+ CallSite->CI->setOperand(Args.size() - 1, AddressArg); + // Replace the old call with the new one, and erase the old one. We use + // eraseUnusedTree so that any rd-wr sequence for the indirected arg is also + // erased. + OldCI->replaceAllUsesWith(CallSite->CI); + Pass->Liveness->replaceValue(OldCI, CallSite->CI); + Pass->Liveness->eraseUnusedTree(OldCI); + } +} + +/*********************************************************************** + * CallerIndirectingCallSite::process : arg indirection processing for a call + * site in a subroutine that is itself indirecting the arg + * + * Return: the address arg that needs to be passed to the call + */ +Value *CallerIndirectingCallSite::process(GenXArgIndirection *Pass, + SubroutineArg *SubrArg) +{ + return CallerSubrArg->AddressArgument; +} + +/*********************************************************************** + * NoOptCallSite::process : arg indirection processing for a call site where + * no optimization is possible, but we can still indirect + * + * Return: the address arg that needs to be passed to the call + */ +Value *NoOptCallSite::process(GenXArgIndirection *Pass, SubroutineArg *SubrArg) +{ + unsigned InsertNumber = Pass->Numbering->getArgIndirectionNumber( + CI, CI->getNumArgOperands() - 1, 0); + Instruction *InsertBefore = CI; + Type *I16Ty = Type::getInt16Ty(CI->getContext()); + // If the arg is undef, we can just use an undef address. + if (isa(CI->getArgOperand(SubrArg->Arg->getArgNo()))) + return UndefValue::get(I16Ty); + // Create a convert.addr of index 0, just before the call with the number of + // the arg pre-copy site for the new address argument that will be added. + auto Conv = createConvertAddr(ConstantInt::get(I16Ty, 0), 0, + SubrArg->Arg->getName() + ".indirect", InsertBefore); + Conv->setDebugLoc(CI->getDebugLoc()); + Pass->Numbering->setNumber(Conv, InsertNumber); + // Tell GenXLiveness the base register for this address register. The normal + // mechanism of tracing through to a user of the address does not work for an + // indirected arg. + Pass->Liveness->setArgAddressBase(Conv, + CI->getArgOperand(SubrArg->Arg->getArgNo())); + // If the live range of the input does not reach over the call, add a + // use of it (an unused bitcast) after the call and recalculate the + // live range. + unsigned CINumber = Pass->Numbering->getNumber(CI); + Value *Input = CI->getOperand(SubrArg->Arg->getArgNo()); + LiveRange *InputLR = Pass->Liveness->getLiveRange(Input); + if (!InputLR->contains(CINumber)) { + auto BC = CastInst::Create(Instruction::BitCast, Input, Input->getType(), + Input->getName() + ".dummy_use_for_indirection", CI->getNextNode()); + Pass->Liveness->setLiveRange(BC, InputLR); + Pass->Numbering->setNumber(BC, CINumber + 1); + Pass->LRsToCalculate.push_back(InputLR); + } + return Conv; +} + +/*********************************************************************** + * ConstArgRetCallSite::process : arg indirection processing for a call site + * where the arg is constant (including undef) and the arg is coalesced + * with a retval that is used only in a legalized wrregion whose "old + * value" input is constant. 
+ * + * Return: the address arg that needs to be passed to the call + */ +Value *ConstArgRetCallSite::process(GenXArgIndirection *Pass, + SubroutineArg *SubrArg) +{ + // checkCallSites detected the situation where the arg is a constant + // (probably a legalized constant load, detected by RdWrRegionSequence, + // but also including undef), and the ret is wrregioned (probably + // legalized) with a constant as the "old value" operand (including + // undef). + // + // To handle this, we create a new constant load of the two constants + // combined, before the call, to turn it back into the normal situation + // of a legalized rdregion before the call and a legalized wrregion + // after the call. (However we don't actually create the legalized + // rdregion and wrregion.) + // + // The combined constant was created in checkCallSites, and in this object + // it is LdConst. + // + // Any new instruction is inserted just before the call, and given the + // instruction number of the address arg's pre-copy slot. + Instruction *InsertBefore = CI; + unsigned InsertNumber = Pass->Numbering->getArgIndirectionNumber( + CI, CI->getNumArgOperands() - 1, 0); + // Insert a load the constant. Bitcast it to the right type to replace + // RetEndWr. + SmallVector AddedInsts; + ConstantLoader CL(LdConst, nullptr, &AddedInsts); + auto LoadedConst = CL.loadBig(InsertBefore); + assert(LoadedConst); + if (LoadedConst->getType() != RetEndWr->getType()) { + LoadedConst = CastInst::Create(Instruction::BitCast, LoadedConst, + RetEndWr->getType(), LoadedConst->getName() + ".bitcast", + InsertBefore); + AddedInsts.push_back(LoadedConst); + } + // An added instruction (from the constant load) is allocated a live range as + // follows: + // 1. An instruction with the right result size is assumed to be coalesceable + // with the final result, and so put in the same live range as the retval's + // wrregion. + // 2. A (smaller) wrregion is assumed to be coalesceable with its "old value" + // input, if that is an instruction. + // 3. Otherwise it gets its own new live range. + // A wrregion also needs to be marked as such in baling. + auto RetValWrLR = Pass->Liveness->getLiveRange(RetEndWr); + unsigned LoadedConstSize = LoadedConst->getType()->getPrimitiveSizeInBits(); + for (auto i = AddedInsts.begin(), e = AddedInsts.end(); i != e; ++i) { + auto Inst = *i; + Pass->Numbering->setNumber(Inst, InsertNumber); + LiveRange *LR = nullptr; + if (Inst->getType()->getPrimitiveSizeInBits() == LoadedConstSize) + Pass->Liveness->setLiveRange(Inst, LR = RetValWrLR); + if (GenXIntrinsic::isWrRegion(Inst)) { + BaleInfo BI(BaleInfo::WRREGION); + if (isa(Inst->getOperand( + GenXIntrinsic::GenXRegion::NewValueOperandNum))) + BI.setOperandBaled(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Pass->Baling->setBaleInfo(Inst, BI); + if (auto InInst = dyn_cast( + Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum))) + if (!LR) + Pass->Liveness->setLiveRange(Inst, + LR = Pass->Liveness->getLiveRange(InInst)); + } + if (!LR) { + LR = Pass->Liveness->getOrCreateLiveRange(Inst); + LR->setCategory(RegCategory::GENERAL); + } + Pass->LRsToCalculate.push_back(LR); + } + // Create the genx.convert.addr for the region of that constant load. We + // use the offset of the retval's legalized wrregion. 
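+  // (Index is the wrregion start index captured as RetRWS.getWrIndex() when
+  // this ConstArgRetCallSite was created in createCallSite.)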
+ auto AddressArg = createConvertAddr(Index, 0, + SubrArg->Arg->getName() + ".indirect", InsertBefore); + AddressArg->setDebugLoc(CI->getDebugLoc()); + Pass->Numbering->setNumber(AddressArg, InsertNumber); + // Tell GenXLiveness the base register for this address register. + // The normal mechanism of tracing through to a user of the address + // does not work for an indirected arg. + Pass->Liveness->setArgAddressBase(AddressArg, LoadedConst); + // Undef out the arg in the call, so the old code to load the constant (if + // any) gets erased when the call is erased. + unsigned CallArgNum = SubrArg->Arg->getArgNo(); + CI->setOperand(CallArgNum, + UndefValue::get(CI->getOperand(CallArgNum)->getType())); + // Replace uses of the (legalized) wrregion sequence with the newly inserted + // constant load, then erase the sequence. + Instruction *ToErase = RetEndWr; + RetEndWr = nullptr; // need to do this as RetEndWr is an AssertingVH + Pass->replaceAndEraseSequence(ToErase, LoadedConst); + return AddressArg; +} + +/*********************************************************************** + * IndirectArgCallSite::process : arg indirection processing for a call site + * where the arg is a legalized rdregion or copy, and there is no retval + * coalesced with it. + * + * Return: the address arg that needs to be passed to the call + */ +Value *IndirectArgCallSite::process(GenXArgIndirection *Pass, + SubroutineArg *SubrArg) +{ + // Any new instruction is inserted just before the call, and given the + // instruction number of the address arg's pre-copy slot. + Instruction *InsertBefore = CI; + unsigned InsertNumber = Pass->Numbering->getArgIndirectionNumber(CI, + CI->getNumArgOperands() - 1, 0); + Value *AddressArg = nullptr; + if (isa(Index)) { + // Constant index for the region. Add a convert.addr to load it into an + // address register. + auto Conv = createConvertAddr(Index, 0, + SubrArg->Arg->getName() + ".indirect", InsertBefore); + Conv->setDebugLoc(CI->getDebugLoc()); + Pass->Numbering->setNumber(Conv, InsertNumber); + AddressArg = Conv; + } else { + // Variable index for the region. The index is already converted to an + // address. It might be a genx.add.addr baled in to the rdregion; if so + // unbale it. + if (auto IndexInst = dyn_cast(Index)) + Pass->Baling->unbale(IndexInst); + AddressArg = Index; + } + // Tell GenXLiveness the base register for this address register. + // The normal mechanism of tracing through to a user of the address + // does not work for an indirected arg. + LiveRange *InputLR = Pass->Liveness->getLiveRange(*InputUse); + // Add a use of the input (an unused bitcast) in case: + // 1. the live range does not reach over the call (in which case we need to + // recalculate the live range after adding this use), or + // 2. later on, another arg indirection removes a use, meaning that the live + // range no longer reaches over the call (in which case we don't need to + // recalculate the live range yet). + auto BC = CastInst::Create(Instruction::BitCast, *InputUse, + (*InputUse)->getType(), + (*InputUse)->getName() + ".dummy_use_for_indirection", + CI->getNextNode()); + Pass->Liveness->setLiveRange(BC, InputLR); + Pass->Liveness->setArgAddressBase(AddressArg, BC); + unsigned CINumber = Pass->Numbering->getNumber(CI); + Pass->Numbering->setNumber(BC, CINumber + 1); + if (!InputLR->contains(CINumber)) + Pass->LRsToCalculate.push_back(InputLR); + // Undef out the arg in the call, so the old rd-wr sequence for the arg gets + // erased when the call is erased. 
+ unsigned CallArgNum = SubrArg->Arg->getArgNo(); + CI->setOperand(CallArgNum, + UndefValue::get(CI->getOperand(CallArgNum)->getType())); + return AddressArg; +} + +/*********************************************************************** + * IndirectArgRetCallSite::process : arg indirection processing for a call site + * where the arg is a legalized rdregion or copy, and the arg is coalesced + * with a retval that is used only in a legalized wrregion or copy. + * + * Return: the address arg that needs to be passed to the call + */ +Value *IndirectArgRetCallSite::process(GenXArgIndirection *Pass, + SubroutineArg *SubrArg) +{ + // Common code with IndirectArgCallSite above: + auto AddressArg = IndirectArgCallSite::process(Pass, SubrArg); + // Replace uses of the (legalized) wrregion sequence with the input to the + // legalized rdregion before the call. + Instruction *ToReplace = RetEndWr; + RetEndWr = nullptr; // Needed as RetEndWr is an AssertingVH + Pass->replaceAndEraseSequence(ToReplace, *InputUse); + return AddressArg; +} + +/*********************************************************************** + * GenXArgIndirection::replaceAndEraseSequence : replace uses of a wrregion + * sequence with a different value and erase the sequence, coping with + * different types due to bitcast + * + * Enter: RetEndWr = end of wrregion sequence + * V = value to replace its uses with (not constant, so it has a + * live range) + */ +void GenXArgIndirection::replaceAndEraseSequence(Instruction *RetEndWr, Value *V) +{ + // See if the types are different due to some bitcasting somewhere. First + // handle the case that V is the result of a bitcast whose input is the type + // we want. We can just use that input. + if (V->getType() != RetEndWr->getType()) + if (auto BC = dyn_cast(V)) + if (BC->getOperand(0)->getType() == RetEndWr->getType()) + V = BC->getOperand(0); + // Then handle other different type cases by inserting our own bitcast. + if (V->getType() != RetEndWr->getType()) { + auto BC = CastInst::Create(Instruction::BitCast, V, RetEndWr->getType(), + V->getName() + ".bitcast", RetEndWr); + Numbering->setNumber(BC, Numbering->getNumber(RetEndWr)); + Liveness->setLiveRange(BC, Liveness->getLiveRange(V)); + V = BC; + } + // Replace uses and erase resulting tree of unused instructions. + RetEndWr->replaceAllUsesWith(V); + Liveness->eraseUnusedTree(RetEndWr); +} + +/*********************************************************************** + * coalesceAddressArgs : for the new address arg, attempt to coalesce at + * each call site, inserting a copy on failure to coalesce + */ +void SubroutineArg::coalesceAddressArgs() +{ + LiveRange *AddressLR = Pass->Liveness->getLiveRange(AddressArgument); + unsigned ArgNum = AddressArgument->getArgNo(); + for (unsigned csi = 0, cse = CallSites.size(); csi != cse; ++csi) { + auto CallSite = CallSites[csi]; + Value *CallArg = CallSite->CI->getArgOperand(ArgNum); + if (isa(CallArg)) + continue; + LiveRange *CallArgLR = Pass->Liveness->getLiveRange(CallArg); + if (AddressLR == CallArgLR) + continue; + if (!Pass->Liveness->interfere(AddressLR, CallArgLR)) { + // No interference -- we can coalesce. + AddressLR = Pass->Liveness->coalesce(AddressLR, CallArgLR, + /*DisallowCASC=*/true); + continue; + } + // There is interference. This should not happen if the caller is another + // subroutine where we are indirecting the arg -- the new address args + // for each subroutine should coalesce together. 
+ LLVM_DEBUG(dbgs() << "Failed to coalesce:\n " << *AddressLR << "\n " << *CallArgLR << "\n"); + assert(!Pass->FuncMap[CallSite->CI->getParent()->getParent()] + && "new address args should coalesce together"); + // We need to insert a copy, in the address arg's pre-copy slot. An address + // copy is done with a genx.convert, even though it is not actually doing a + // conversion. + auto Copy = createConvert(CallArg, CallArg->getName() + ".coalescefail", + CallSite->CI); + Copy->setDebugLoc(CallSite->CI->getDebugLoc()); + Pass->Numbering->setNumber(Copy, Pass->Numbering->getArgPreCopyNumber( + CallSite->CI, ArgNum, 0)); + // Add the new value in to AddressLR. + Pass->Liveness->setLiveRange(Copy, AddressLR); + CallSite->CI->setOperand(ArgNum, Copy); + } +} + +/*********************************************************************** + * replaceFunction : replace the old function with the new function + * + * This replaces the function in the FunctionGroup, and then erases the old + * function. + */ +void SubroutineArg::replaceFunction() +{ + Pass->FGA->replaceFunction(F, NewFunc); + F->eraseFromParent(); + F = NewFunc; +} + +/*********************************************************************** + * indirectBale : modify a bale to be indirect + * + * Enter: B = bale to modify + * ArgLR = live range of values that need to be indirected + * AddressArg = new argument for address + * + * On return, the bale struct is no longer valid. + */ +void GenXArgIndirection::indirectBale(Bale *B, LiveRange *ArgLR, + Argument *AddressArg) +{ + // Indirect the head of the bale, if its result is in ArgLR. + auto Inst = B->getHead()->Inst; + if (Liveness->getLiveRange(Inst) == ArgLR) { + if (B->getHead()->Info.Type == BaleInfo::WRREGION) { + // wrregion: just modify the index to indirect it. + indirectRegion(&Inst->getOperandUse( + GenXIntrinsic::GenXRegion::WrIndexOperandNum), AddressArg, Inst); + } else { + // No wrregion: we need to add one, and ensure that the original + // instruction is baled into it. + Region R(Inst); + R.Indirect = AddressArg; + SmallVector Uses; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) + Uses.push_back(&*ui); + auto NewWr = cast(R.createWrRegion( + UndefValue::get(Inst->getType()), Inst, + Inst->getName() + ".indirected", Inst->getNextNode(), + Inst->getDebugLoc())); + Liveness->setLiveRange(NewWr, ArgLR); + Liveness->removeValue(Inst); + for (auto ui = Uses.begin(), ue = Uses.end(); ui != ue; ++ui) + **ui = NewWr; + BaleInfo BI(BaleInfo::WRREGION); + BI.setOperandBaled(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Baling->setBaleInfo(NewWr, BI); + } + } + // Process operands in each instruction of the bale. + for (auto bi = B->begin(), be = B->end(); bi != be; ++bi) { + Inst = bi->Inst; + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) { + if (bi->Info.isOperandBaled(oi)) + continue; // Ignore within-bale operands + if (!oi && bi->Info.Type == BaleInfo::WRREGION) + continue; // Ignore "old value" input to wrregion + Value *Opnd = Inst->getOperand(oi); + if (Liveness->getLiveRangeOrNull(Opnd) != ArgLR) + continue; // Not in ArgLR, does not need indirecting + if (bi->Info.Type == BaleInfo::RDREGION + && oi == GenXIntrinsic::GenXRegion::OldValueOperandNum) { + // input to rdregion: just modify the index to indirect it. 
+ indirectRegion(&bi->Inst->getOperandUse( + GenXIntrinsic::GenXRegion::RdIndexOperandNum), AddressArg, Inst); + } else { + // No rdregion: we need to add one, and ensure that it is baled in + // to the original instruction. + Region R(Opnd); + R.Indirect = AddressArg; + auto NewRd = R.createRdRegion(Opnd, Opnd->getName() + ".indirected", + Inst, Inst->getDebugLoc()); + Inst->setOperand(oi, NewRd); + BaleInfo BI = bi->Info; + BI.setOperandBaled(oi); + Baling->setBaleInfo(Inst, BI); + BaleInfo NewRdBI(BaleInfo::RDREGION); + Baling->setBaleInfo(NewRd, NewRdBI); + } + } + } +} + +/*********************************************************************** + * indirectRegion : convert a rdregion/wrregion index operand to indirect + * + * Enter: U = the rdregion/wrregion index operand use + * AddressArg = the index to use + * InsertBefore = where to insert new instructions + * + * If the rdregion/wrregion already has a variable index, then we create an + * instruction to remove its genx.convert.addr and add it to AddressArg with + * genx.add.addr. + */ +void GenXArgIndirection::indirectRegion(Use *U, Value *AddressArg, + Instruction *InsertBefore) +{ + Value *Addr = *U; + if (auto CI = dyn_cast(Addr)) { + // Currently the index is constant. + if (CI->isNullValue()) { + *U = AddressArg; + return; + } + // Create a genx.add.addr and give it an instruction number one less + // than InsertBefore. + auto NewAdd = createAddAddr(AddressArg, CI, "indirect.offset", InsertBefore); + Numbering->setNumber(NewAdd, Numbering->getNumber(InsertBefore) - 1); + *U = NewAdd; + // If the constant is within offset range, bale the new genx.add.addr into + // its user. + if (GenXBaling::isBalableIndexAdd(NewAdd)) { + auto User = cast(U->getUser()); + BaleInfo BI = Baling->getBaleInfo(User); + BI.setOperandBaled(U->getOperandNo()); + Baling->setBaleInfo(User, BI); + } else { + // Otherwise, give it a live range, and mark it as needing calculating. + auto LR = Liveness->getOrCreateLiveRange(NewAdd); + LR->setCategory(RegCategory::ADDRESS); + LRsToCalculate.push_back(LR); + } + return; + } + // The index is already variable. + // Trace back through add_addr instructions until we find one of: + // 1. The convert_addr instruction set up by GenXCategory, and possibly + // commoned up by GenXAddressCommoning. We replace that with an + // add_addr instruction that adds the convert_addr's input to AddressArg. + // or + // 2. An Argument, so another user of the same address must have already + // found and replaced (1). + for (;;) { + if (isa(Addr)) + return; + auto IntrinsicID = GenXIntrinsic::getGenXIntrinsicID(Addr); + switch (IntrinsicID) { + case GenXIntrinsic::genx_add_addr: + Addr = cast(Addr)->getOperand(0); + continue; + case GenXIntrinsic::genx_rdregioni: + Addr = cast(Addr)->getOperand( + GenXIntrinsic::GenXRegion::OldValueOperandNum); + continue; + case GenXIntrinsic::genx_convert_addr: + // we've found what we wanted + break; + default: + llvm_unreachable("unsupported instruction"); + } + break; + } + assert(GenXIntrinsic::getGenXIntrinsicID(Addr) == + GenXIntrinsic::genx_convert_addr); + auto AddrInst = cast(Addr); + auto AddrSrc = AddrInst->getOperand(0); + // Create an add_addr to replace the convert_addr. It needs a live range with + // ADDRESS category. 
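+  // Roughly: the original convert_addr, e.g. %addr = genx.convert.addr(%offset),
+  // is replaced by %addr.indirectedaddr = genx.add.addr(%AddressArg, %offset)
+  // (illustrative pseudo-IR; the exact operands follow createAddAddr below), so
+  // the region is now addressed relative to the indirected argument.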
+ auto NewAddAddr = createAddAddr(AddressArg, AddrSrc, + AddrInst->getName() + ".indirectedaddr", AddrInst); + NewAddAddr->setDebugLoc(AddrInst->getDebugLoc()); + Numbering->setNumber(NewAddAddr, Numbering->getNumber(AddrInst) - 1); + AddrInst->replaceAllUsesWith(NewAddAddr); + LiveRange *LR = Liveness->getOrCreateLiveRange(NewAddAddr); + LR->setCategory(RegCategory::ADDRESS); + LRsToCalculate.push_back(LR); + // AddrSrc (source of convert_addr) should get a live range as well + LiveRange *SrcLR = Liveness->getOrCreateLiveRange(AddrSrc); + SrcLR->setCategory(RegCategory::GENERAL); + LRsToCalculate.push_back(SrcLR); + // remove the old convert_addr + Liveness->eraseLiveRange(AddrInst); + AddrInst->eraseFromParent(); +} + +/*********************************************************************** + * getArgForFunction : find the arg in a live range that belongs to a func + */ +Argument *GenXArgIndirection::getArgForFunction(LiveRange *LR, Function *F) +{ + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) { + Value *V = vi->getValue(); + if (auto Arg = dyn_cast(V)) + if (Arg->getParent() == F) + return Arg; + } + return nullptr; +} + +/*********************************************************************** + * getRetVal : get return value for possibly multi return value call + * + * Enter: CI = call instruction + * RetNum = return value number + * + * Return: the return value (which is either a CallInst or an + * ExtractValueInst), or 0 if unknown use, or undef if it is shown + * that the requested return value is never extracted from the struct + */ +Value *SubroutineArg::getRetVal(CallInst *CI, unsigned RetNum) +{ + auto ST = dyn_cast(CI->getType()); + if (!ST) { + assert(!RetNum); + return CI; + } + Value *RetVal = UndefValue::get(ST->getElementType(RetNum)); + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) { + auto EVI = dyn_cast(ui->getUser()); + if (!EVI || EVI->getNumIndices() != 1) + return nullptr; // unknown use + if (EVI->getIndices()[0] == RetNum) { + if (isa(RetVal)) + RetVal = EVI; + else + return nullptr; // multiple extractelements of the same retval + } + } + return RetVal; +} + +/*********************************************************************** + * DiagnosticInfoArgIndirection initializer from Instruction + * + * If the Instruction has a DebugLoc, then that is used for the error + * location. + * Otherwise, the location is unknown. + */ +DiagnosticInfoArgIndirection::DiagnosticInfoArgIndirection(Instruction *Inst, + Argument *Arg, const Twine &Desc, DiagnosticSeverity Severity) + : DiagnosticInfo(getKindID(), Severity), Line(0), Col(0) +{ + auto DL = Inst->getDebugLoc(); + if (DL) { + Filename = DL->getFilename(); + Line = DL.getLine(); + Col = DL.getCol(); + } + Description = (Twine("GenXArgIndirection failed for argument ") + + Twine(Arg->getArgNo() + 1) + " in " + Arg->getParent()->getName() + + ": " + Desc).str(); +} + +/*********************************************************************** + * DiagnosticInfoArgIndirection::print : print the error/warning message + */ +void DiagnosticInfoArgIndirection::print(DiagnosticPrinter &DP) const +{ + std::string Loc( + (Twine(!Filename.empty() ? Filename : "") + + ":" + Twine(Line) + + (!Col ? 
                 Twine() : Twine(":") + Twine(Col)) +
+       ": ")
+          .str());
+  DP << Loc << Description;
+}
+
+
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.cpp
new file mode 100644
index 000000000000..9392dee402c9
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.cpp
@@ -0,0 +1,2365 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+// GenX instruction baling is analyzed by this pass. See GenXBaling.h for more
+// detailed comment.
+//
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "GENX_INSTRUCTION_BALING"
+
+#include "GenXBaling.h"
+#include "GenXConstants.h"
+#include "GenXIntrinsics.h"
+#include "GenXLiveness.h"
+#include "GenXRegion.h"
+#include "GenXUtil.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/GenXIntrinsics/GenXIntrinsics.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+// Part of the bodge to allow abs to bale in to sext/zext. This needs to be set
+// to some arbitrary value that does not clash with any
+// GenXIntrinsicInfo::MODIFIER_* value.
+enum { MODIFIER_ABSONLY = 9000 }; + +using namespace llvm; +using namespace genx; +using namespace GenXIntrinsic::GenXRegion; + +//---------------------------------------------------------------------- +// Administrivia for GenXFuncBaling pass +// +char GenXFuncBaling::ID = 0; +INITIALIZE_PASS(GenXFuncBaling, "GenXFuncBaling", "GenXFuncBaling", false, false) + +FunctionPass *llvm::createGenXFuncBalingPass(BalingKind Kind, GenXSubtarget *ST) +{ + initializeGenXFuncBalingPass(*PassRegistry::getPassRegistry()); + return new GenXFuncBaling(Kind, ST); +} + +void GenXFuncBaling::getAnalysisUsage(AnalysisUsage &AU) const +{ + FunctionPass::getAnalysisUsage(AU); + AU.setPreservesCFG(); +} + +//---------------------------------------------------------------------- +// Administrivia for GenXGroupBaling pass +// +char GenXGroupBaling::ID = 0; +INITIALIZE_PASS_BEGIN(GenXGroupBaling, "GenXGroupBaling", "GenXGroupBaling", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_END(GenXGroupBaling, "GenXGroupBaling", "GenXGroupBaling", false, false) + +FunctionGroupPass *llvm::createGenXGroupBalingPass(BalingKind Kind, GenXSubtarget *ST) +{ + initializeGenXGroupBalingPass(*PassRegistry::getPassRegistry()); + return new GenXGroupBaling(Kind, ST); +} + +void GenXGroupBaling::getAnalysisUsage(AnalysisUsage &AU) const +{ + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.setPreservesCFG(); + AU.addPreserved(); + AU.addPreserved(); +} + +/*********************************************************************** + * GenXGroupBaling::runOnFunctionGroup : run second baling pass on function + * group + */ +bool GenXGroupBaling::runOnFunctionGroup(FunctionGroup &FG) +{ + clear(); + Liveness = &getAnalysis(); + return processFunctionGroup(&FG); +} + +/*********************************************************************** + * processFunctionGroup : run instruction baling analysis on one + * function group + */ +bool GenXBaling::processFunctionGroup(FunctionGroup *FG) +{ + bool Modified = false; + for (auto i = FG->begin(), e = FG->end(); i != e; ++i) { + Modified |= processFunction(*i); + } + return Modified; +} + +/*********************************************************************** + * processFunction : run instruction baling analysis on one function + * + * This does a preordered depth first traversal of the CFG to + * ensure that we see a def before its uses (ignoring phi node uses). + * This is required when we see a constant add/sub used as a region or + * element variable index; if the add/sub has already been marked as + * baling in a modifier or rdregion then we cannot bale it in to the + * variable index region. + * + * This pass also clones any instruction that can be baled in but has + * multiple uses. A baled in instruction must have exactly one use. + */ +bool GenXBaling::processFunction(Function *F) +{ + bool Changed = prologue(F); + + for (df_iterator i = df_begin(&F->getEntryBlock()), + e = df_end(&F->getEntryBlock()); i != e; ++i) { + for (BasicBlock::iterator bi = i->begin(), be = i->end(); bi != be; ) { + Instruction *Inst = &*bi; + ++bi; // increment here as Inst may be erased + processInst(Inst); + } + } + // Process any two addr sends we found. + for (auto i = TwoAddrSends.begin(), e = TwoAddrSends.end(); i != e; ++i) + processTwoAddrSend(*i); + TwoAddrSends.clear(); + // Clone any instructions that we found in the pass that want to be baled in + // but have more than one use. 
+  if (NeedCloneStack.size()) {
+    doClones();
+    Changed = true;
+  }
+  return Changed;
+}
+
+/***********************************************************************
+ * processInst : calculate baling for an instruction
+ *
+ * Usually this is called from runOnFunction above. However another pass
+ * can call this to recalculate the baling for an instruction, particularly
+ * for a new instruction it has just added. GenXLegalization does this.
+ */
+void GenXBaling::processInst(Instruction *Inst)
+{
+  unsigned IntrinID = GenXIntrinsic::getAnyIntrinsicID(Inst);
+  if (GenXIntrinsic::isWrRegion(IntrinID))
+    processWrRegion(Inst);
+  else if (IntrinID == GenXIntrinsic::genx_wrpredregion)
+    processWrPredRegion(Inst);
+  else if (IntrinID == GenXIntrinsic::genx_wrpredpredregion)
+    processWrPredPredRegion(Inst);
+  else if (IntrinID == GenXIntrinsic::genx_sat || GenXIntrinsic::isIntegerSat(IntrinID))
+    processSat(Inst);
+  else if (GenXIntrinsic::isRdRegion(IntrinID))
+    processRdRegion(Inst);
+  else if (BranchInst *Branch = dyn_cast<BranchInst>(Inst))
+    processBranch(Branch);
+  else if (auto SI = dyn_cast<StoreInst>(Inst))
+    processStore(SI);
+  else if (isa<CallInst>(Inst) && cast<CallInst>(Inst)->isInlineAsm())
+    processInlineAsm(Inst);
+  else if (ExtractValueInst *EV = dyn_cast<ExtractValueInst>(Inst))
+    processExtractValue(EV);
+  else if (isa<PtrToIntInst>(Inst) && cast<PtrToIntInst>(Inst)
+                                          ->getPointerOperand()
+                                          ->getType()
+                                          ->getPointerElementType()
+                                          ->isFunctionTy())
+    processFuncPointer(cast<PtrToIntInst>(Inst));
+  else {
+    // Try to bale a select into cmp's dst. If failed, continue to process
+    // select as a main instruction.
+    bool BaledSelect = processSelect(Inst);
+    if (!BaledSelect)
+      processMainInst(Inst, IntrinID);
+  }
+}
+
+/***********************************************************************
+ * static isRegionOKForIntrinsic : check whether a region is OK for an intrinsic arg
+ *
+ * Enter:   ArgInfoBits = mask for the ArgInfo for the intrinsic arg (or return value)
+ *          R = region itself
+ *          ST = check for this subtarget
+ *          AlignInfo = alignment info if provided (can be nullptr)
+ *          BKind = check before this baling type
+ *
+ * This checks that the arg is general (rather than raw) and does not have
+ * any stride restrictions that are incompatible with the region.
+ *
+ * In the legalization pass of baling, we always return true when the main
+ * instruction can be split. Otherwise, a region that would be OK after
+ * being split by legalization might here appear not OK, and that would stop
+ * legalization considering splitting it. However, if the main instruction
+ * cannot be split, then we need to check the full restriction; otherwise,
+ * if the region is considered baled and legalization is skipped, we may end
+ * up with an illegal standalone read-region.
+ */
+bool GenXBaling::isRegionOKForIntrinsic(unsigned ArgInfoBits, const Region &R,
+                                        bool CanSplitBale,
+                                        const GenXSubtarget *ST,
+                                        genx::AlignmentInfo *AlignInfo,
+                                        BalingKind BKind) {
+  GenXIntrinsicInfo::ArgInfo AI(ArgInfoBits);
+  if (!AI.isGeneral())
+    return false;
+  if (BKind == BalingKind::BK_Legalization) {
+    if (CanSplitBale)
+      return true;
+  }
+  if (R.Indirect && (AI.Info & GenXIntrinsicInfo::DIRECTONLY))
+    return false;
+  unsigned Restriction = AI.getRestriction();
+  if (!Restriction)
+    return true;
+  unsigned GRFWidth = ST ?
ST->getGRFWidth() : 32; + unsigned ElementsPerGrf = GRFWidth / R.ElementBytes; + unsigned GRFLogAlign = Log2_32(GRFWidth); + if (AI.Info & GenXIntrinsicInfo::GRFALIGNED) { + if (R.Indirect) { + // Instructions that cannot be splitted also cannot allow indirect + if (!CanSplitBale) + return false; + if (!AlignInfo) + return false; + Alignment AL = AlignInfo->get(R.Indirect); + if (AL.getLogAlign() < GRFLogAlign || AL.getExtraBits() != 0) + return false; + } else if (R.Offset & (GRFWidth - 1)) + return false; + if (R.is2D() && (R.VStride & (ElementsPerGrf - 1))) + return false; + } + if (AI.Info & GenXIntrinsicInfo::OWALIGNED) { + // Instructions that cannot be splitted also cannot allow indirect + if (R.Indirect) { + if (!CanSplitBale) + return false; + if (!AlignInfo) + return false; + Alignment AL = AlignInfo->get(R.Indirect); + if (AL.getLogAlign() < 4 || AL.getExtraBits() != 0) + return false; + } + if (R.Offset & 15) + return false; + if (R.is2D() && (R.VStride & ((ElementsPerGrf >> 1) - 1))) + return false; + } + switch (Restriction) { + case GenXIntrinsicInfo::SCALARORCONTIGUOUS: + if (!R.Stride && R.Width == R.NumElements) + break; + // fall through... + case GenXIntrinsicInfo::FIXED4: + case GenXIntrinsicInfo::CONTIGUOUS: + if (R.Stride != 1 || R.Width != R.NumElements) + return false; + break; + case GenXIntrinsicInfo::STRIDE1: + // For the dot product instructions, the vISA spec just says that the + // horizontal stride must be 1. It doesn't say anything about the + // width or the vertical stride. I am assuming that the width must also + // be at least 4, since the operation works on groups of 4 channels. + if (R.Stride != 1 || R.Width < 4) + return false; + break; + default: + break; + } + return true; +} + +/*********************************************************************** + * checkModifier : check whether instruction is a source modifier + * + * Enter: Inst = instruction to check + * + * Return: ABSMOD, NEGMOD, NOTMOD, ZEXT, SEXT or MAININST (0) if not modifier + */ +static int checkModifier(Instruction *Inst) +{ + switch (Inst->getOpcode()) { + case Instruction::Sub: + case Instruction::FSub: + // Negate is represented in LLVM IR by subtract from 0. 
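+    // For example %n = fsub <8 x float> zeroinitializer, %x (or sub 0, %x for
+    // integers) can be folded into a "-" source modifier on the instruction
+    // that consumes %n.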
+ if (Constant *Lhs = dyn_cast(Inst->getOperand(0))) { + // Canonicalize splats as well + if (isa(Lhs->getType())) + if (auto splat = Lhs->getSplatValue()) + Lhs = splat; + + if (Lhs->isZeroValue()) + return BaleInfo::NEGMOD; + } + break; + case Instruction::Xor: + if (isIntNot(Inst)) + return BaleInfo::NOTMOD; + break; + case Instruction::ZExt: + if (!Inst->getOperand(0)->getType()->getScalarType()->isIntegerTy(1)) + return BaleInfo::ZEXT; + break; + case Instruction::SExt: + if (!Inst->getOperand(0)->getType()->getScalarType()->isIntegerTy(1)) + return BaleInfo::SEXT; + break; + default: + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_absi: + case GenXIntrinsic::genx_absf: + return BaleInfo::ABSMOD; + default: + break; + } + break; + } + return BaleInfo::MAININST; +} + +/*********************************************************************** + * operandIsBaled : check if a main inst is baled + * + * Enter: Inst = the main inst + * OperandNum = operand number to look at + * ModType = what type of modifier (arith/logic/extonly/none) this + * operand accepts + * AI = GenXIntrinsicInfo::ArgInfo, so we can see any stride + * restrictions, omitted if Inst is not an intrinsic + */ +bool +GenXBaling::operandIsBaled(Instruction *Inst, + unsigned OperandNum, int ModType, + unsigned ArgInfoBits = GenXIntrinsicInfo::GENERAL) { + GenXIntrinsicInfo::ArgInfo AI(ArgInfoBits); + Instruction *Opnd = dyn_cast(Inst->getOperand(OperandNum)); + if (!Opnd) + return false; + // Check for source operand modifier. + if (ModType != GenXIntrinsicInfo::MODIFIER_DEFAULT) { + int Mod = checkModifier(Opnd); + switch (Mod) { + case BaleInfo::MAININST: + break; + case BaleInfo::ZEXT: + case BaleInfo::SEXT: + if (ModType != GenXIntrinsicInfo::MODIFIER_DEFAULT) + return true; + break; + case BaleInfo::NOTMOD: + if (ModType == GenXIntrinsicInfo::MODIFIER_LOGIC) + return true; + break; + case BaleInfo::ABSMOD: + // Part of the bodge to allow abs to be baled in to zext/sext. + if (ModType == MODIFIER_ABSONLY) + return true; + // fall through... + default: + if (ModType == GenXIntrinsicInfo::MODIFIER_ARITH) + return true; + break; + } + } + if (GenXIntrinsic::isRdRegion(Opnd)) { + // The operand is a rdregion. Check any restrictions. + // (Note we call isRegionOKForIntrinsic even when Inst is not an + // intrinsic, since in that case AI is initialized to a state + // where there are no region restrictions.) + bool CanSplitBale = true; + Region RdR(Opnd, BaleInfo()); + if (!isRegionOKForIntrinsic(AI.Info, RdR, CanSplitBale, ST, &AlignInfo, + Kind)) + return false; + + // Do not bale in a region read with multiple uses if + // - any use is bitcast, or + // - it is indirect. + // as bitcast will not bale its operands and indirect multiple-use region + // reads often lead to narrow simd width after legalization. + if (Opnd->getNumUses() > 1 && (Kind == BalingKind::BK_Legalization || + Kind == BalingKind::BK_Analysis)) { + for (auto U : Opnd->users()) + if (isa(U)) + return false; + Region R(cast(Opnd), BaleInfo()); + if (R.Indirect) + return false; + } + return true; + } + return false; +} + +/*********************************************************************** + * processWrPredRegion : set up baling info for wrpredregion + * + * The input to wrpredregion may be the following: + * 1) icmp or fcmp, in which case it is always baled. + * 2) constant, which may resulted from region simplification. 
+ */ +void GenXBaling::processWrPredRegion(Instruction *Inst) +{ + Value *V = Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + assert(isa(V) || isa(V)); + BaleInfo BI(BaleInfo::WRPREDREGION); + if (isa(V)) { + setOperandBaled(Inst, GenXIntrinsic::GenXRegion::NewValueOperandNum, &BI); + } + setBaleInfo(Inst, BI); +} + +/*********************************************************************** + * processWrPredPredRegion : set up baling info for wrpredpredregion + * + * The "new value" input to wrpredregion must be icmp or fcmp, and it is always + * baled. + * + * The condition input is assumed to be EM. But it might be an rdpredregion + * out of EM, in which case the rdpredregion is baled. The rdpredregion must + * have offset 0. + */ +void GenXBaling::processWrPredPredRegion(Instruction *Inst) +{ + assert(isa(Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum))); + BaleInfo BI(BaleInfo::WRPREDPREDREGION); + setOperandBaled(Inst, GenXIntrinsic::GenXRegion::NewValueOperandNum, &BI); + Value *Cond = Inst->getOperand(3); + if (GenXIntrinsic::getGenXIntrinsicID(Cond) == GenXIntrinsic::genx_rdpredregion) { + assert(cast(cast(Cond)->getOperand(1))->isNullValue()); + setOperandBaled(Inst, 3, &BI); + } + setBaleInfo(Inst, BI); +} + +/*********************************************************************** + * processWrRegion : set up baling info for wrregion + */ +void GenXBaling::processWrRegion(Instruction *Inst) +{ + BaleInfo BI(BaleInfo::WRREGION); + // Get the instruction (if any) that creates the element/subregion to write. + unsigned OperandNum = GenXIntrinsic::GenXRegion::NewValueOperandNum; + Instruction *V = dyn_cast(Inst->getOperand(OperandNum)); + if (V && !V->hasOneUse()) { + // The instruction has multiple uses. + // We don't want to bale in the following cases, as they seem to make the + // code worse, unless this is load from a global variable. + if (V->getParent() != Inst->getParent()) { + auto isRegionFromGlobalLoad = [](Value *V) { + if (!GenXIntrinsic::isRdRegion(V)) + return false; + auto LI = dyn_cast(cast(V)->getArgOperand(0)); + return LI && getUnderlyingGlobalVariable(LI->getPointerOperand()); + }; + // 0. It is in a different basic block to the wrregion. + if (!isRegionFromGlobalLoad(V)) + V = nullptr; + } else { + // 1. The maininst is a select. + Bale B; + buildBale(V, &B); + if (auto MainInst = B.getMainInst()) { + if (isa(MainInst->Inst) || + isHighCostBaling(BaleInfo::WRREGION, MainInst->Inst)) + V = nullptr; + } + // 2. There is an indirect rdregion with a constant offset (probably due to + // the risk of the jitter doing unfolding; this check may be unnecessary + // after HSW). + for (auto i = B.begin(), e = B.end(); i != e; ++i) { + if (i->Info.Type != BaleInfo::RDREGION) + continue; + if (!isa(i->Inst->getOperand( + GenXIntrinsic::GenXRegion::RdIndexOperandNum))) { + V = nullptr; + break; + } + } + } + // FIXME: Baling on WRREGION is not the right way to reduce the overhead + // from `wrregion`. Instead, register coalescing should be applied to + // enable direct defining of the WRREGION and minimize the value + // duplication. 
+ } + if (V) { + Region WrR(Inst, BaleInfo()); + if (isBalableNewValueIntoWrr(V, WrR, ST, &AlignInfo, Kind)) { + setOperandBaled(Inst, OperandNum, &BI); + if (Liveness) { + // Ensure the wrregion's result has an + // alignment of 32 if intrinsic with + // raw result was baled into + unsigned ValIntrinID = GenXIntrinsic::getAnyIntrinsicID(V); + GenXIntrinsicInfo II(ValIntrinID); + if (GenXIntrinsic::isGenXIntrinsic(ValIntrinID) && + (ValIntrinID != GenXIntrinsic::genx_sat) && + !GenXIntrinsic::isRdRegion(V) && !GenXIntrinsic::isWrRegion(V) && + (II.getRetInfo().getCategory() == GenXIntrinsicInfo::RAW)) + Liveness->getOrCreateLiveRange(Inst)->LogAlignment = 5; + } + } + } + // Now see if there is a variable index with an add/sub with an in range + // offset that we can bale in, such that the add/sub does not already + // bale in other instructions. + OperandNum = 5; + if (isBalableIndexAdd(Inst->getOperand(OperandNum))) { + setOperandBaled(Inst, OperandNum, &BI); + // We always set up InstMap for an address add, even though it does not + // bale in any operands. + setBaleInfo(cast(Inst->getOperand(OperandNum)), BaleInfo(BaleInfo::ADDRADD, 0)); + } + // See if there is any baling in to the predicate (mask) operand. + if (processPredicate(Inst, GenXIntrinsic::GenXRegion::PredicateOperandNum)) + setOperandBaled(Inst, GenXIntrinsic::GenXRegion::PredicateOperandNum, &BI); + // We always set up InstMap for a wrregion, even if it does not bale in any + // operands. + setBaleInfo(Inst, BI); +} + +// Process a select instruction. Return true if it can be baled into a cmp +// instruction, false otherwise. +bool GenXBaling::processSelect(Instruction *Inst) { + auto SI = dyn_cast(Inst); + if (!SI || !SI->getType()->isVectorTy()) + return false; + + // Only bale into a cmp instruction. + Value *Cond = SI->getCondition(); + if (!isa(Cond) || !Cond->getType()->isVectorTy() || + !Cond->hasOneUse()) + return false; + + // Only bale "select cond, -1, 0" + Constant *Src0 = dyn_cast(SI->getTrueValue()); + Constant *Src1 = dyn_cast(SI->getFalseValue()); + if (Src0 && Src0->isAllOnesValue() && Src1 && Src1->isNullValue()) { + BaleInfo BI(BaleInfo::CMPDST); + unsigned OperandNum = 0; + setOperandBaled(Inst, OperandNum, &BI); + setBaleInfo(Inst, BI); + } + + // No baling. + return false; +} + +// Process a store instruction. +void GenXBaling::processStore(StoreInst *Inst) { + BaleInfo BI(BaleInfo::GSTORE); + unsigned OperandNum = 0; + Instruction *V = dyn_cast(Inst->getOperand(OperandNum)); + if (GenXIntrinsic::isWrRegion(V)) + setOperandBaled(Inst, OperandNum, &BI); + else if (isa(V) && cast(V)->isInlineAsm()) + setOperandBaled(Inst, OperandNum, &BI); + setBaleInfo(Inst, BI); +} + +// We can bale in shufflevector of predicate if it is replicated slice. 
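+// A replicated slice repeats one contiguous slice of the input, for example
+// (illustrative IR)
+//   shufflevector <8 x i1> %p, <8 x i1> undef,
+//                 <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+// which replicates the first four elements of %p twice.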
+bool GenXBaling::processShufflePred(Instruction *Inst) { + assert(Inst->getType()->getScalarSizeInBits() == 1 && + "Expected bool shuffle"); + auto *SI = dyn_cast(Inst); + if (!SI) + return false; + + assert(ShuffleVectorAnalyzer(SI).isReplicatedSlice() && + "Predicate shuffle is not replicated slice!"); + BaleInfo BI(BaleInfo::SHUFFLEPRED); + setBaleInfo(SI, BI); + return true; +} + +/*********************************************************************** + * processPredicate : process predicate operand (to wrregion or branch) + * + * Enter: Inst = instruction with predicate operand + * OperandNum = operand number in Inst + * + * Return: whether operand can be baled in + * + * If the function returns true, the caller needs to call + * setOperandBaled(Inst, OperandNum, &BI) to actually bale it in. + * + * Unlike most baling, which proceeds in code order building a tree of baled in + * instructions, this function recurses, scanning backward through the code, + * because we only want to bale predicate operations all/any/not/rdpredregion + * once we know that the resulting predicate is used in wrregion or branch (as + * opposed to say a bitcast to int). + * + * So this function decides whether OperandNum in Inst is an instruction that + * is to be baled in, and additionally performs any further baling in to that + * instruction. + */ +bool GenXBaling::processPredicate(Instruction *Inst, unsigned OperandNum) { + Instruction *Mask = dyn_cast(Inst->getOperand(OperandNum)); + if (!Mask) + return false; + + if (Kind == BalingKind::BK_CodeGen && !isa(Mask->getType())) { + if (auto Extract = dyn_cast(Mask)) { + auto *GotoJoin = cast(Extract->getAggregateOperand()); + auto IID = GenXIntrinsic::getAnyIntrinsicID(GotoJoin); + if (IID == GenXIntrinsic::genx_simdcf_goto + || IID == GenXIntrinsic::genx_simdcf_join) { + // Second pass: Mask is the extractvalue of the !any(EM) result out of + // the result of goto/join. We mark both the use of the extract in the + // branch and the use of the goto/join in the extract as baled. The + // former is done by the caller when we return true. + BaleInfo BI; + setOperandBaled(Mask, /*OperandNum=*/0, &BI); + setBaleInfo(Mask, BI); + return true; + } + } + } + switch (GenXIntrinsic::getGenXIntrinsicID(Mask)) { + case GenXIntrinsic::genx_rdpredregion: { + if (Kind == BalingKind::BK_CodeGen) { +#if _DEBUG + // Sanity check the offset and number of elements being accessed. + unsigned MinSize = Inst->getType()->getScalarType()->getPrimitiveSizeInBits() == 64 ? 4 : 8; + unsigned NElems = Mask->getType()->getVectorNumElements(); + unsigned Offset = dyn_cast(Mask->getOperand(1))->getZExtValue(); + assert(exactLog2(NElems) >= 0 && (Offset & (std::min(NElems, MinSize) - 1)) == 0 && + "illegal offset and/or width in rdpredregion"); +#endif + } + // We always set up InstMap for an rdpredregion, even though it does not + // bale in any operands. + setBaleInfo(Mask, BaleInfo(BaleInfo::RDPREDREGION, 0)); + return true; + } + case GenXIntrinsic::genx_all: + case GenXIntrinsic::genx_any: { + if (Kind != BalingKind::BK_CodeGen) + return false; // only bale all/any for CodeGen + // The mask is the result of an all/any. Bale that in. + // Also see if its operand can be baled in. + BaleInfo BI(BaleInfo::ALLANY); + if (processPredicate(Mask, /*OperandNum=*/0)) + setOperandBaled(Mask, /*OperandNum=*/0, &BI); + setBaleInfo(Mask, BI); + return true; + } + default: + break; + } + + if (isNot(Mask)) { + // The mask is the result of a notp. Bale that in. 
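+    // (isNot matches a "not" written as an xor with all-ones, e.g.
+    //  %m = xor <16 x i1> %p, <i1 true, i1 true, ...> -- illustrative IR.)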
+    // Also see if its operand can be baled in.
+    BaleInfo BI(BaleInfo::NOTP);
+    if (processPredicate(Mask, /*OperandNum=*/0))
+      setOperandBaled(Mask, /*OperandNum=*/0, &BI);
+    setBaleInfo(Mask, BI);
+    return true;
+  }
+
+  if (processShufflePred(Mask))
+    return true;
+
+  return false;
+}
+
+/***********************************************************************
+ * processSat : set up baling info for fp saturate
+ */
+void GenXBaling::processSat(Instruction *Inst)
+{
+  BaleInfo BI(BaleInfo::SATURATE);
+  // Get the instruction (if any) that creates the value to saturate.
+  unsigned OperandNum = 0;
+  Instruction *V = dyn_cast<Instruction>(Inst->getOperand(OperandNum));
+  if (V && V->hasOneUse()) {
+    // It is an instruction where we are the only use. We can bale it in, if
+    // it is a suitable instruction.
+    auto ValIntrinID = GenXIntrinsic::getAnyIntrinsicID(V);
+    if (GenXIntrinsic::isRdRegion(ValIntrinID))
+      setOperandBaled(Inst, OperandNum, &BI);
+    else if (ValIntrinID == GenXIntrinsic::not_any_intrinsic) {
+      if (isa<BinaryOperator>(V) || (isa<CastInst>(V) && !isa<BitCastInst>(V)))
+        setOperandBaled(Inst, OperandNum, &BI);
+    } else if (!GenXIntrinsic::isWrRegion(ValIntrinID)) {
+      // V is an intrinsic other than rdregion/wrregion. Check that its return
+      // value is suitable for baling.
+      GenXIntrinsicInfo II(ValIntrinID);
+      if (!II.getRetInfo().isRaw() && II.getRetInfo().getSaturation() ==
+          GenXIntrinsicInfo::SATURATION_DEFAULT)
+        setOperandBaled(Inst, OperandNum, &BI);
+    }
+  }
+  // We always set up InstMap for a saturate, even if it does not bale in any
+  // operands.
+  setBaleInfo(Inst, BI);
+}
+
+/***********************************************************************
+ * processRdRegion : set up baling info for rdregion
+ */
+void GenXBaling::processRdRegion(Instruction *Inst)
+{
+  // See if there is a variable index with an add/sub with an in range
+  // offset that we can bale in, such that the add/sub does not already
+  // bale in other instructions.
+  const unsigned OperandNum = 4; // operand number of index in rdregion
+  BaleInfo BI(BaleInfo::RDREGION);
+  if (isBalableIndexAdd(Inst->getOperand(OperandNum))) {
+    setOperandBaled(Inst, OperandNum, &BI);
+    // We always set up InstMap for an address add, even though it does not
+    // bale in any operands.
+    setBaleInfo(cast<Instruction>(Inst->getOperand(OperandNum)),
+                BaleInfo(BaleInfo::ADDRADD, 0));
+  } else if (isBalableIndexOr(Inst->getOperand(OperandNum))) {
+    setOperandBaled(Inst, OperandNum, &BI);
+    // We always set up InstMap for an address or, even though it does not
+    // bale in any operands.
+    setBaleInfo(cast<Instruction>(Inst->getOperand(OperandNum)),
+                BaleInfo(BaleInfo::ADDROR, 0));
+  }
+  // We always set up InstMap for a rdregion, even if it does not bale in any
+  // operands.
+  setBaleInfo(Inst, BI);
+}
+
+/***********************************************************************
+ * processInlineAsm : rdregion results are baled into the inline asm
+ * instruction. Inline assembly remains the main instruction
+ * of the bale.
+ */ +void GenXBaling::processInlineAsm(Instruction *Inst) { + auto CI = dyn_cast(Inst); + assert((CI && CI->isInlineAsm()) && "Inline Asm expected"); + + BaleInfo BI(BaleInfo::MAININST); + for (unsigned I = 0; I < CI->getNumArgOperands(); I++) + if (auto RdR = dyn_cast(CI->getArgOperand(I))) + if (GenXIntrinsic::isRdRegion(RdR)) { + switch (GenXIntrinsic::getGenXIntrinsicID(RdR->getOperand(0))) { + default: + setOperandBaled(Inst, I, &BI); + break; + case GenXIntrinsic::genx_constanti: + case GenXIntrinsic::genx_constantf: + continue; + } + } + + setBaleInfo(Inst, BI); +} + +void GenXBaling::processFuncPointer(PtrToIntInst *Inst) { + BaleInfo BI(BaleInfo::FADDR); + for (auto *U : Inst->users()) { + if (isa(U)) { + // need to clone wrregion sinking to select + // (can't do that on FuncPtrs lowering as it's actually + // a result of post-legalization) + // to achieve 3 bales: + // b1=FADDR b2=FADDR + // |ptrtoint| |ptrtoint| + // | | | | | | + // | | | | | | + // | wrr | | wrr | + // \ / + // \ / + // |select| + // b3=select + assert(Inst->hasOneUse()); + auto &DL = Inst->getModule()->getDataLayout(); + Region R(IntegerType::get(Inst->getContext(), 64), &DL); + auto NewWrr = R.createWrRegion( + UndefValue::get(IntegerType::get(Inst->getContext(), 64)), Inst, + Inst->getName(), Inst, Inst->getDebugLoc()); + U->replaceUsesOfWith(Inst, NewWrr); + } else if (isa(U)) { + // only bitcast -> rdregion are allowed + // this is typical for vector selects + assert(Inst->hasOneUse() && U->hasOneUse() && + isa(U->user_back()) && + GenXIntrinsic::isRdRegion(U->user_back())); + setBaleInfo(Inst, BI); + return; + } + } + + assert(Inst->hasOneUse() && isa(Inst->use_begin()->getUser()) && + GenXIntrinsic::isWrRegion(Inst->use_begin()->getUser())); + + setBaleInfo(Inst, BI); +} + +/*********************************************************************** + * processExtractValue : Extract instructions can get elements from structure + * which was a result of inline assembly call with multiple outputs. + */ +void GenXBaling::processExtractValue(ExtractValueInst *EV) { + assert(EV); + if (auto CI = dyn_cast(EV->getAggregateOperand())) + if (CI->isInlineAsm()) + setBaleInfo(EV, BaleInfo(BaleInfo::MAININST, 0)); +} + +/*********************************************************************** + * static getIndexAdd : test whether the specified value is + * a constant add/sub that could be baled in as a variable index offset, + * but without checking that the index is in range + * + * Enter: V = the value that might be a constant add/sub + * Offset = where to store the offset of the constant add/sub + * + * Return: true if a constant add/sub was detected + * + * For the second run of GenXBaling, which is after GenXCategoryConversion, + * we are looking for an llvm.genx.add.addr rather than a real add/sub. + */ +bool GenXBaling::getIndexAdd(Value *V, int *Offset) +{ + if (Instruction *Inst = dyn_cast(V)) { + int IsConstAdd = 0; + switch (Inst->getOpcode()) { + case Instruction::Add: + IsConstAdd = 1; + break; + case Instruction::Sub: + IsConstAdd = -1; + break; + default: + if (GenXIntrinsic::getGenXIntrinsicID(Inst) == + GenXIntrinsic::genx_add_addr) + IsConstAdd = 1; + break; + } + if (IsConstAdd) { + if (Constant *C = dyn_cast(Inst->getOperand(1))) { + if (isa(C->getType())) + C = C->getSplatValue(); + if (C) { + if (C->isNullValue()) { + *Offset = 0; + return true; + } + if (ConstantInt *CI = dyn_cast(C)) { + // It is a constant add/sub. 
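+          // For example "add i16 %idx, 96" gives *Offset = 96 and
+          // "sub i16 %idx, 32" gives *Offset = -32.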
+ *Offset = CI->getSExtValue() * IsConstAdd; + return true; + } + } + } + } + } + return false; +} + +/*********************************************************************** + * static getIndexOr : test whether the specified value is + * a constant Or that could be baled in as a variable index offset, + * but without checking that the index is in range + * + * Enter: V = the value that might be a constant or + * Offset = where to store the offset of the constant or + * + * Return: true if a constant or was detected + */ +bool GenXBaling::getIndexOr(Value *V, int &Offset) +{ + Instruction *Inst = dyn_cast(V); + if (!Inst) + return false; + + if (Inst->getOpcode() != Instruction::Or) + return false; + + // inst is Or from this point + Constant *C = dyn_cast(Inst->getOperand(1)); + if (!C) + return false; + + if (isa(C->getType())) + C = C->getSplatValue(); + + // getSplatValue could return nullptr + if (!C) + return false; + + if (C->isNullValue()) { + Offset = 0; + return true; + } + if (ConstantInt *CI = dyn_cast(C)) { + // check for or could be changed to add + if(!haveNoCommonBitsSet(Inst->getOperand(0), Inst->getOperand(1), + Inst->getModule()->getDataLayout())) + { + return false; + } + Offset = CI->getSExtValue(); + return true; + } + return false; +} + +/*********************************************************************** + * static isBalableIndexAdd : test whether the specified value is + * a constant add/sub that could be baled in as a variable index offset + * + * For the second run of GenXBaling, which is after GenXCategoryConversion, + * we are looking for an llvm.genx.add.addr rather than a real add/sub. + */ +bool GenXBaling::isBalableIndexAdd(Value *V) +{ + int Offset; + if (!getIndexAdd(V, &Offset)) + return false; + // It is a constant add/sub. Check the constant is in range. + return ( G4_MIN_ADDR_IMM <= Offset && Offset <= G4_MAX_ADDR_IMM); +} + +/*********************************************************************** + * static isBalableIndexOr : test whether the specified value is + * a constant Or that could be baled in as a variable index offset + */ +bool GenXBaling::isBalableIndexOr(Value *V) +{ + int Offset; + if (!getIndexOr(V, Offset)) + return false; + assert(Offset >=0 && "Offset in or appears to be less than zero"); + // It is a constant or. Check the constant is in range. + return (Offset <= G4_MAX_ADDR_IMM); +} + +/*********************************************************************** + * static isBalableNewValueIntoWrr : check whether the new val operand can + * be baled into wrr instruction + */ +bool GenXBaling::isBalableNewValueIntoWrr(Value *V, const Region &WrrR, + const GenXSubtarget *ST, + genx::AlignmentInfo *AlignInfo, + BalingKind BKind) { + Instruction *Inst = dyn_cast(V); + if (!Inst) + return false; + // It is an instruction. We can bale it in, if it is a suitable + // instruction. + unsigned ValIntrinID = GenXIntrinsic::getAnyIntrinsicID(Inst); + if (ValIntrinID == GenXIntrinsic::genx_sat || + GenXIntrinsic::isRdRegion(ValIntrinID)) + return true; + else if (ValIntrinID == GenXIntrinsic::not_any_intrinsic) { + if (isa(Inst) || + (isa(Inst) && !isa(Inst))) + return true; + else if (isMaskPacking(Inst)) + return true; + else if (isa(Inst) && cast(Inst)->isInlineAsm()) + return true; + else if (isa(Inst) && !WrrR.Mask) { + // Can bale in a select as long as the wrregion is unpredicated. 
+ return true; + } else if (isa(Inst)) { + // Each extract bales into its own WrRegionand remains + // the main instruction of the bale + auto Extract = cast(Inst); + if (auto CI = dyn_cast(Extract->getAggregateOperand())) + if (CI->isInlineAsm()) + return true; + } + } else if (!GenXIntrinsic::isWrRegion(ValIntrinID)) { + // V is an intrinsic other than rdregion/wrregion. If this is a + // predicated wrregion, only permit baling in if the intrinsic + // supports a predicate mask. + GenXIntrinsicInfo II(ValIntrinID); + + if (WrrR.Mask == 0 || II.getPredAllowed()) { + // Check that its return value is suitable for baling. + GenXIntrinsicInfo::ArgInfo AI = II.getRetInfo(); + switch (AI.getCategory()) { + case GenXIntrinsicInfo::GENERAL: { + bool CanSplitBale = true; + if (isRegionOKForIntrinsic(AI.Info, WrrR, CanSplitBale, ST, AlignInfo, + BKind)) + return true; + } break; + case GenXIntrinsicInfo::RAW: { + // Intrinsic with raw result can be baled in to wrregion as long as + // it is unstrided and starts on a GRF boundary, and there is no + // non-undef TWOADDR operand. + if (isRegionOKForRaw(WrrR, ST)) { + unsigned FinalCallArgIdx = Inst->getNumOperands() - 2; + if (isa(Inst->getOperand(FinalCallArgIdx))) + return true; + else { + GenXIntrinsicInfo::ArgInfo AI2 = II.getArgInfo(FinalCallArgIdx); + if (AI2.getCategory() != GenXIntrinsicInfo::TWOADDR) + return true; + } + } + } break; + } + } + } + return false; +} + +bool GenXBaling::isHighCostBaling(uint16_t Type, Instruction *Inst) { + switch (Type) { + case BaleInfo::WRREGION: + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_dword_atomic_add: + case GenXIntrinsic::genx_dword_atomic_sub: + case GenXIntrinsic::genx_dword_atomic_min: + case GenXIntrinsic::genx_dword_atomic_max: + case GenXIntrinsic::genx_dword_atomic_xchg: + case GenXIntrinsic::genx_dword_atomic_or: + case GenXIntrinsic::genx_dword_atomic_xor: + case GenXIntrinsic::genx_dword_atomic_imin: + case GenXIntrinsic::genx_dword_atomic_imax: + case GenXIntrinsic::genx_dword_atomic_fmin: + case GenXIntrinsic::genx_dword_atomic_fmax: + case GenXIntrinsic::genx_dword_atomic_inc: + case GenXIntrinsic::genx_dword_atomic_dec: + case GenXIntrinsic::genx_dword_atomic_cmpxchg: + case GenXIntrinsic::genx_dword_atomic_fcmpwr: + case GenXIntrinsic::genx_typed_atomic_add: + case GenXIntrinsic::genx_typed_atomic_sub: + case GenXIntrinsic::genx_typed_atomic_min: + case GenXIntrinsic::genx_typed_atomic_max: + case GenXIntrinsic::genx_typed_atomic_xchg: + case GenXIntrinsic::genx_typed_atomic_and: + case GenXIntrinsic::genx_typed_atomic_or: + case GenXIntrinsic::genx_typed_atomic_xor: + case GenXIntrinsic::genx_typed_atomic_imin: + case GenXIntrinsic::genx_typed_atomic_imax: + case GenXIntrinsic::genx_typed_atomic_fmin: + case GenXIntrinsic::genx_typed_atomic_fmax: + case GenXIntrinsic::genx_typed_atomic_inc: + case GenXIntrinsic::genx_typed_atomic_dec: + case GenXIntrinsic::genx_typed_atomic_cmpxchg: + case GenXIntrinsic::genx_typed_atomic_fcmpwr: + case GenXIntrinsic::genx_gather_scaled: + case GenXIntrinsic::genx_gather4_scaled: + case GenXIntrinsic::genx_gather4_typed: + case GenXIntrinsic::genx_media_ld: + case GenXIntrinsic::genx_oword_ld: + case GenXIntrinsic::genx_oword_ld_unaligned: + case GenXIntrinsic::genx_svm_block_ld: + case GenXIntrinsic::genx_svm_block_ld_unaligned: + case GenXIntrinsic::genx_svm_gather: + case GenXIntrinsic::genx_svm_gather4_scaled: + case GenXIntrinsic::genx_svm_atomic_add: + case GenXIntrinsic::genx_svm_atomic_sub: + case 
GenXIntrinsic::genx_svm_atomic_min: + case GenXIntrinsic::genx_svm_atomic_max: + case GenXIntrinsic::genx_svm_atomic_xchg: + case GenXIntrinsic::genx_svm_atomic_and: + case GenXIntrinsic::genx_svm_atomic_or: + case GenXIntrinsic::genx_svm_atomic_xor: + case GenXIntrinsic::genx_svm_atomic_imin: + case GenXIntrinsic::genx_svm_atomic_imax: + case GenXIntrinsic::genx_svm_atomic_inc: + case GenXIntrinsic::genx_svm_atomic_dec: + case GenXIntrinsic::genx_svm_atomic_cmpxchg: + case GenXIntrinsic::genx_load: + case GenXIntrinsic::genx_sample: + case GenXIntrinsic::genx_sample_unorm: + case GenXIntrinsic::genx_3d_sample: + case GenXIntrinsic::genx_3d_load: + case GenXIntrinsic::genx_avs: + case GenXIntrinsic::genx_raw_send: + case GenXIntrinsic::genx_raw_sends: + case GenXIntrinsic::genx_va_convolve2d: + case GenXIntrinsic::genx_va_hdc_convolve2d: + case GenXIntrinsic::genx_va_erode: + case GenXIntrinsic::genx_va_hdc_erode: + case GenXIntrinsic::genx_va_dilate: + case GenXIntrinsic::genx_va_hdc_dilate: + case GenXIntrinsic::genx_va_minmax: + case GenXIntrinsic::genx_va_minmax_filter: + case GenXIntrinsic::genx_va_hdc_minmax_filter: + case GenXIntrinsic::genx_va_bool_centroid: + case GenXIntrinsic::genx_va_centroid: + case GenXIntrinsic::genx_va_1d_convolve_horizontal: + case GenXIntrinsic::genx_va_hdc_1d_convolve_horizontal: + case GenXIntrinsic::genx_va_1d_convolve_vertical: + case GenXIntrinsic::genx_va_hdc_1d_convolve_vertical: + case GenXIntrinsic::genx_va_1pixel_convolve: + case GenXIntrinsic::genx_va_hdc_1pixel_convolve: + case GenXIntrinsic::genx_va_1pixel_convolve_1x1mode: + case GenXIntrinsic::genx_va_lbp_creation: + case GenXIntrinsic::genx_va_hdc_lbp_creation: + case GenXIntrinsic::genx_va_lbp_correlation: + case GenXIntrinsic::genx_va_hdc_lbp_correlation: + case GenXIntrinsic::genx_va_correlation_search: + case GenXIntrinsic::genx_va_flood_fill: + return true; + default: + break; + } + break; + } + return false; +} + +/*********************************************************************** + * processMainInst : set up baling info for potential main instruction + */ +void GenXBaling::processMainInst(Instruction *Inst, int IntrinID) +{ + BaleInfo BI(BaleInfo::MAININST); + if (IntrinID == Intrinsic::dbg_value) + return; + if (IntrinID == GenXIntrinsic::not_any_intrinsic) { + if (!isa(Inst) && !isa(Inst) + && !isa(Inst) && !isa(Inst)) + return; + if (isa(Inst)) + return; + BI.Type = checkModifier(Inst); + // Work out whether the instruction accepts arithmetic, logic or no + // modifier. + int ModType = GenXIntrinsicInfo::MODIFIER_ARITH; + switch (BI.Type) { + case BaleInfo::NOTMOD: + // a "not" can only merge with a logic modifier (another "not") + ModType = GenXIntrinsicInfo::MODIFIER_LOGIC; + break; + case BaleInfo::ZEXT: + case BaleInfo::SEXT: + // an extend cannot bale in any other modifier. + // But as a bodge we allow abs to be baled in to zext/sext. This is a + // workaround for not having worked out how to set the computation type + // in cm_abs. Currently cm_abs does a genx.absi in the source type, then + // converts it to destination type. This does not allow for the result + // of an abs needing one more bit than its input. + ModType = MODIFIER_ABSONLY; + break; + case BaleInfo::MAININST: + switch (Inst->getOpcode()) { + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + // These instructions take a logic modifier. 
+ ModType = GenXIntrinsicInfo::MODIFIER_LOGIC; + break; + case Instruction::LShr: + case Instruction::AShr: + case Instruction::Shl: + // Do not allow source modifier on integer shift operations, + // because of extra precision introduced. + ModType = GenXIntrinsicInfo::MODIFIER_DEFAULT; + break; + default: + // All other (non-intrinsic) instructions take an arith modifier. + break; + } + break; + default: + // Anything else is an arith modifier, so it can only merge with + // another arith modifier. + break; + } + unsigned i = 0; + if (isa(Inst)) { + // Deal specially with operand 0, the selector, of a select. + const unsigned OperandNum = 0; + if (processPredicate(Inst, OperandNum)) + setOperandBaled(Inst, OperandNum, &BI); + ++i; + } + // See which operands we can bale in. + for (unsigned e = Inst->getNumOperands(); i != e; ++i) + if (operandIsBaled(Inst, i, ModType)) + setOperandBaled(Inst, i, &BI); + } else if (IntrinID == GenXIntrinsic::genx_convert + || IntrinID == GenXIntrinsic::genx_convert_addr) { + // llvm.genx.convert can bale, and has exactly one arg + if (operandIsBaled(Inst, 0, GenXIntrinsicInfo::MODIFIER_ARITH)) + setOperandBaled(Inst, 0, &BI); + } else if (GenXIntrinsic::isAbs(IntrinID)) { + BI.Type = BaleInfo::ABSMOD; + if (operandIsBaled(Inst, 0, GenXIntrinsicInfo::MODIFIER_ARITH)) + setOperandBaled(Inst, 0, &BI); + } else { + // For an intrinsic, check the arg info of each arg to see if we can + // bale into it. + GenXIntrinsicInfo Info(IntrinID); + for (const auto *p = Info.getInstDesc(); *p; ++p) { + GenXIntrinsicInfo::ArgInfo AI(*p); + if (AI.isArgOrRet() && !AI.isRet()) { + unsigned ArgIdx = AI.getArgIdx(); + switch (AI.getCategory()) { + case GenXIntrinsicInfo::GENERAL: + // This source operand of the intrinsic is general. + if (operandIsBaled(Inst, ArgIdx, AI.getModifier(), AI.Info)) + setOperandBaled(Inst, ArgIdx, &BI); + break; + case GenXIntrinsicInfo::RAW: + // Rdregion can be baled in to a raw operand as long as it is + // unstrided and starts on a GRF boundary. Ensure that the input to + // the rdregion is 32 aligned. + if (isValueRegionOKForRaw(Inst->getOperand(ArgIdx), + /*IsWrite=*/false, ST)) { + setOperandBaled(Inst, ArgIdx, &BI); + if (Liveness) { + Value *Opnd = Inst->getOperand(ArgIdx); + Opnd = cast(Opnd)->getOperand(0); + Liveness->getOrCreateLiveRange(Opnd)->LogAlignment = 5; + } + } + break; + case GenXIntrinsicInfo::TWOADDR: + if (Kind == BalingKind::BK_CodeGen) { + // Record this as a two address send for processing later. + TwoAddrSends.push_back(cast(Inst)); + } + break; + case GenXIntrinsicInfo::PREDICATION: + // See if there is any baling in to the predicate (mask) operand. + if (processPredicate(Inst, ArgIdx)) + setOperandBaled(Inst, ArgIdx, &BI); + break; + } + } + } + } + + // If this instruction is a modifier, we attempt to simplify it here + // (i.e. fold constants), to avoid confusion later in GenXCisaBuilder + // if a modifier has a constant operand. Because this pass scans code + // forwards, a constant will propagate through a chain of modifiers. + if (BI.Type != BaleInfo::MAININST) { + Value *Simplified = nullptr; + if (BI.Type != BaleInfo::ABSMOD) { + const DataLayout &DL = Inst->getModule()->getDataLayout(); + Simplified = SimplifyInstruction(Inst, SimplifyQuery(DL)); + } else { + // SimplifyInstruction does not work on abs, so we roll our own for now. 
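+      // For example genx.absi applied to a constant i32 -5 folds here to
+      // i32 5: compare the constant against zero and negate it if negative.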
+ if (auto C = dyn_cast(Inst->getOperand(0))) { + if (C->getType()->isIntOrIntVectorTy()) { + if (!ConstantExpr::getICmp(CmpInst::ICMP_SLT, C, + Constant::getNullValue(C->getType()))->isNullValue()) + C = ConstantExpr::getNeg(C); + } else { + if (!ConstantExpr::getFCmp(CmpInst::FCMP_OLT, C, + Constant::getNullValue(C->getType()))->isNullValue()) + C = ConstantExpr::getFNeg(C); + } + Simplified = C; + } + } + if (Simplified) { + assert(isa(Simplified) && "expecting a constant when simplifying a modifier"); + Inst->replaceAllUsesWith(Simplified); + Inst->eraseFromParent(); + return; + } + } + + // Only give an instruction an entry in the map if (a) it is not a main + // instruction or (b) it bales something in. + if (BI.Type || BI.Bits) + setBaleInfo(Inst, BI); +} + +/*********************************************************************** + * processBranch : process a branch instruction + * + * If the branch is conditional, bale in all/any/not + */ +void GenXBaling::processBranch(BranchInst *Branch) +{ + if (Branch->isConditional()) { + BaleInfo BI(BaleInfo::MAININST); + if (processPredicate(Branch, 0/*OperandNum of predicate*/)) { + setOperandBaled(Branch, 0/*OperandNum*/, &BI); + setBaleInfo(Branch, BI); + } + } +} + +/*********************************************************************** + * processTwoAddrSend : process a two-address send + * + * A "two-address send" is a send (or an intrinsic that becomes a send in the + * finalizer) with a potentially partial write, so it has a TWOADDR operand to + * represent the value of the destination before the operation, and that + * TWOADDR operand is not undef. + * + * This only gets called in the second baling pass. + * + * We can bale a rdregion into the TWOADDR operand and bale the send into a + * wrregion, but only if the two have the same region and "old value" input. + * + * We used to allow such baling in first baling, such that legalization would + * then not split the rdregion and wrregion. In bug 4607, we ran into a problem + * where code changed due to vector decomposition, and the same baling did not + * happen in second baling, leaving an illegally wide rdregion or wrregion. + * + * So now we only do this special kind of baling in the second baling pass. + * That means that we have to detect where the rdregion and wrregion have been + * split by legalization. We use the RdWrRegionSequence class to do that. + */ +void GenXBaling::processTwoAddrSend(CallInst *CI) +{ + unsigned TwoAddrOperandNum = CI->getNumArgOperands() - 1; + assert(GenXIntrinsicInfo(GenXIntrinsic::getAnyIntrinsicID(CI)) + .getArgInfo(TwoAddrOperandNum) + .getCategory() == GenXIntrinsicInfo::TWOADDR); + assert(GenXIntrinsicInfo(GenXIntrinsic::getAnyIntrinsicID(CI)) + .getRetInfo() + .getCategory() == GenXIntrinsicInfo::RAW); + // First check the case where legalization did not need to split the rdregion + // and wrregion. 
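+  // The pattern looked for is roughly (illustrative pseudo-IR):
+  //   %rd   = rdregion(%old, <region R>)          ; the TWOADDR operand
+  //   %send = <send-like intrinsic>(..., %rd)
+  //   %wr   = wrregion(%old, %send, <region R>)
+  // i.e. the rdregion and the wrregion cover the same region of the same
+  // "old value", possibly through bitcasts.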
+ auto TwoAddrOperand = dyn_cast(CI->getArgOperand(TwoAddrOperandNum)); + if (!TwoAddrOperand) + return; + if (GenXIntrinsic::isRdRegion(TwoAddrOperand)) { + if (!CI->hasOneUse()) + return; + auto Rd = cast(TwoAddrOperand); + auto Wr = cast(CI->use_begin()->getUser()); + if (!GenXIntrinsic::isWrRegion(Wr)) + return; + if (CI->use_begin()->getOperandNo() + != GenXIntrinsic::GenXRegion::NewValueOperandNum) + return; + Region RdR(Rd, BaleInfo()); + Region WrR(Wr, BaleInfo()); + if (RdR != WrR || RdR.Indirect || WrR.Mask) + return; + if (!isValueRegionOKForRaw(Wr, /*IsWrite=*/true, ST)) + return; + // Everything else is in place for a rd-send-wr baling. We just need to check + // that the input to the read sequence is the same as the old value input to + // the write sequence. We need to allow for some bitcasts in the way. Having + // different bitcasts on the two inputs is ok, as long as the original value + // is the same, because bitcasts are always copy coalesced so will be in the + // same register. + Value *RdIn = Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + Value *WrIn = Wr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + while (auto BC = dyn_cast(RdIn)) + RdIn = BC->getOperand(0); + while (auto BC = dyn_cast(WrIn)) + WrIn = BC->getOperand(0); + if (RdIn != WrIn) + return; + // We can do the baling. + auto BI = getBaleInfo(CI); + setOperandBaled(CI, TwoAddrOperandNum, &BI); + setBaleInfo(CI, BI); + BI = getBaleInfo(Wr); + setOperandBaled(Wr, GenXIntrinsic::GenXRegion::NewValueOperandNum, &BI); + setBaleInfo(Wr, BI); + return; + } + // Second, check the case where legalization has split the rdregion and + // wrregion. + if (CI->use_empty()) + return; + if (!GenXIntrinsic::isWrRegion(TwoAddrOperand)) + return; + RdWrRegionSequence RdSeq; + if (!RdSeq.buildFromWr(TwoAddrOperand, this)) + return; + RdWrRegionSequence WrSeq; + auto Rd = cast(CI->use_begin()->getUser()); + if (!GenXIntrinsic::isRdRegion(Rd)) + return; + if (!WrSeq.buildFromRd(Rd, this)) + return; + if (!RdSeq.WrR.isWhole(CI->getType())) + return; + if (!WrSeq.RdR.isWhole(CI->getType())) + return; + if (RdSeq.RdR.Indirect || WrSeq.WrR.Indirect) + return; + if (RdSeq.RdR != WrSeq.WrR) + return; + // Everything else is in place for a rd-send-wr baling. We just need to check + // that the input to the read sequence is the same as the old value input to + // the write sequence. We need to allow for some bitcasts in the way. Having + // different bitcasts on the two inputs is ok, as long as the original value + // is the same, because bitcasts are always copy coalesced so will be in the + // same register. + Value *RdIn = RdSeq.Input; + Value *WrIn = WrSeq.OldVal; + while (auto BC = dyn_cast(RdIn)) + RdIn = BC->getOperand(0); + while (auto BC = dyn_cast(WrIn)) + WrIn = BC->getOperand(0); + if (RdIn != WrIn) + return; + // Check that there are no uses of CI other than in WrSeq. We can do that by + // counting the uses. + unsigned NumUses = 0, Size = WrSeq.size(); + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) + if (++NumUses > Size) + return; + // We can bale, but we need to unlegalize back to a single rdregion and + // single wrregion. 
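+  // Schematically (illustrative only): the legalized read sequence RdSeq
+  // (from RdSeq.Input to the TWOADDR operand) and write sequence WrSeq (from
+  // the send result to WrSeq.EndWr) are replaced by
+  //   %newrd = rdregion(RdSeq.Input, RdSeq.RdR)
+  //   %newwr = wrregion(WrSeq.OldVal, CI, WrSeq.WrR)
+  // so that a single rdregion and a single wrregion can be baled with the send.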
+ auto NewRd = RdSeq.RdR.createRdRegion(RdSeq.Input, RdSeq.StartWr->getName(), + RdSeq.StartWr, RdSeq.StartWr->getDebugLoc()); + CI->setOperand(TwoAddrOperandNum, NewRd); + auto NewWr = cast(WrSeq.WrR.createWrRegion(WrSeq.OldVal, CI, + WrSeq.StartWr->getName(), WrSeq.StartWr, WrSeq.StartWr->getDebugLoc())); + WrSeq.EndWr->replaceAllUsesWith(NewWr); + // Set baling info for new instructions. The BI for NewWr is just a copy of + // the first wrregion in the sequence being replaced. + setBaleInfo(NewWr, getBaleInfo(WrSeq.StartWr)); + auto BI = getBaleInfo(CI); + setOperandBaled(CI, TwoAddrOperandNum, &BI); + setBaleInfo(CI, BI); + // Remove original sequences if now unused. + for (Instruction *End = RdSeq.EndWr;;) { + for (Instruction *Wr = End; Wr && Wr->use_empty(); ) { + if (!Wr->use_empty()) + break; + if (Wr->getNumOperands() < 2) + break; + auto Rd = dyn_cast(Wr->getOperand(1)); + auto NextWr = dyn_cast(Wr->getOperand(0)); + Liveness->eraseLiveRange(Wr); + Wr->eraseFromParent(); + assert(Rd); + if (Rd->use_empty()) { + Liveness->eraseLiveRange(Rd); + Rd->eraseFromParent(); + } + Wr = NextWr; + } + if (End == WrSeq.EndWr) + break; + End = WrSeq.EndWr; + } +} + +/*********************************************************************** + * setBaleInfo : set BaleInfo for an instruction + */ +void GenXBaling::setBaleInfo(const Instruction *Inst, genx::BaleInfo BI) +{ + assert(BI.Bits < 1 << Inst->getNumOperands()); + InstMap[static_cast(Inst)] = BI; +} + +/*********************************************************************** + * setOperandBaled : set flag to say that an operand is baled in + * + * Enter: Inst = instruction to bale into + * OperandNum = operand number in that instruction + * BI = BaleInfo to set flag in + * + * If the operand value has multiple uses, this also flags that we will need + * to do some cloning afterwards to ensure that a baled in operand has a + * single use. + * + * Note that a main instruction baled into a saturate modifier or into + * a wrregion, or a saturate modifier baled into a wrregion, never has + * multiple uses. So the multiple use thing only covers source operands + * of the main inst, plus a possible addradd in the wrregion. + */ +void GenXBaling::setOperandBaled(Instruction *Inst, unsigned OperandNum, + BaleInfo *BI) +{ + // Set the bit. + BI->Bits |= 1 << OperandNum; + // Check whether the operand has more than one use. + Instruction *BaledInst = cast(Inst->getOperand(OperandNum)); + if (!BaledInst->hasOneUse()) { + // Multiple uses. Add to the NeedClone stack. But not if it is a goto/join; + // we allow a goto/join to be baled into the extract of its !any(EM) result + // even though it has uses in other extracts. + unsigned IID = GenXIntrinsic::getGenXIntrinsicID(BaledInst); + if (IID != GenXIntrinsic::genx_simdcf_goto && + IID != GenXIntrinsic::genx_simdcf_join) + NeedCloneStack.push_back(NeedClone(Inst, OperandNum)); + } +} + +/*********************************************************************** + * doClones : do any cloning required to make baled in instructions + * single use + * + * NeedCloneStack is a stack of operands (instruction and operand number + * pairs) that are baled in and have more than one use, so need cloning. + * They were pushed in forward order, so if A is baled into B is baled + * into C then the use of A in B was pushed before the use of B in C. + * + * We now pop off the stack in reverse order. We see the use of B in C, + * and clone B to single use B'. 
Then we see that B bales in A, so we + * add the use of A in B' onto the stack, causing A to be cloned later. + * In this way we handle nested baling correctly. + */ +void GenXBaling::doClones() +{ + while (NeedCloneStack.size()) { + // Pop a NeedClone off the stack. + NeedClone NC = NeedCloneStack.back(); + NeedCloneStack.pop_back(); + // See if it is still multiple use (earlier cloning may have caused this + // one to become single use). + Instruction *Opnd = cast(NC.Inst->getOperand(NC.OperandNum)); + if (Opnd->hasOneUse()) + continue; + // See if it is still baled. But continue with cloning even if not baled in + // these cases: + // 1. An extend (zext or sext), because it tends to result in better gen + // code, probably because a zext or sext can be baled in to its user by + // the finalizer in a case where we cannot because of the vISA + // restriction that both operands need the same extend. This case arises + // only if we were going to bale the extend in, but then decided not to + // because the two operands did not have the same extend. + // 2. An address generating instruction, because, at this point in the flow + // (between GenXCategory and GenXAddressCommoning), an address + // generating instruction must have a single use. + bool IsBaled = getBaleInfo(NC.Inst).isOperandBaled(NC.OperandNum); + if (!IsBaled && !isa(Opnd) && + getAddrOperandNum(GenXIntrinsic::getGenXIntrinsicID(NC.Inst)) != (int)NC.OperandNum) + continue; + // Clone it. + assert(!isa(Opnd)); + Instruction *Cloned = Opnd->clone(); + Cloned->setName(Opnd->getName()); + // Change the use. + NC.Inst->setOperand(NC.OperandNum, Cloned); + if (IsBaled) { + // Normally, insert the cloned instruction just after the original. + Cloned->insertAfter(Opnd); + } else { + // In the special case that we are cloning something even when not baled: + // Ensure the cloned instruction has the same category as the original + // one. + if (Liveness) + Liveness->getOrCreateLiveRange(Cloned)->setCategory( + Liveness->getOrCreateLiveRange(Opnd)->getCategory()); + // Insert the clone just before its single use. + Cloned->insertBefore(NC.Inst); + // If the instruction that we cloned is now single use, not in a phi + // node, move it to just before its use. + if (Opnd->hasOneUse()) { + auto User = Opnd->use_begin()->getUser(); + if (!isa(User)) { + Opnd->removeFromParent(); + Opnd->insertBefore(cast(User)); + } + } + } + // Copy the bale info. + BaleInfo BI = getBaleInfo(Opnd); + setBaleInfo(Cloned, BI); + // Stack any operands of the cloned instruction that are baled. (They + // must be multiple use because we have just cloned the instruction + // using them.) Also any address calculation, for the reason given in the + // comment above. 
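+    // For example: if multi-use B has just been cloned to B' and B baled in A,
+    // the use of A in B' is pushed here so that A gets cloned on a later
+    // iteration, handling nested baling as described above.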
+    int AON = getAddrOperandNum(GenXIntrinsic::getGenXIntrinsicID(Cloned));
+    for (unsigned i = 0, e = Cloned->getNumOperands(); i != e; ++i)
+      if (BI.isOperandBaled(i) ||
+          (Kind == BalingKind::BK_CodeGen && AON == (int)i &&
+           isa<Instruction>(Cloned->getOperand(i))))
+        NeedCloneStack.push_back(NeedClone(Cloned, i));
+  }
+}
+
+/***********************************************************************
+ * getOrUnbaleExtend : get or unbale the extend instruction (if any) in
+ *      this operand
+ *
+ * Enter:   Inst = instruction containing operand
+ *          BI = BaleInfo for Inst
+ *          OperandNum = operand number to look at
+ *          Unbale = true to unbale the extend
+ *
+ * Return:  0 if no extend found, else the extend (ZExt or SExt), and, if
+ *          Unbale is true, then *BI has been modified _and_ written back
+ *          into Inst's map entry in GenXBaling.
+ *
+ * BI is a pointer to handle two slightly different cases of unbaling the ext:
+ * 1. If this is the top level call to getOrUnbaleExtend from processMainInst,
+ *    then we want to modify the caller's BaleInfo pointed to by BI, which the
+ *    caller is in the middle of setting up and will write back into the map.
+ * 2. If this is a recursive call from getOrUnbaleExtend, then we want to
+ *    use setBaleInfo to write the BaleInfo back into the map.
+ * We don't check which case we have, and we just do both things, as the
+ * unneeded one is harmless.
+ */
+Instruction *GenXBaling::getOrUnbaleExtend(Instruction *Inst, BaleInfo *BI,
+    unsigned OperandNum, bool Unbale)
+{
+  if (!BI->isOperandBaled(OperandNum))
+    return nullptr;
+  auto Opnd = cast<Instruction>(Inst->getOperand(OperandNum));
+  if (isa<ZExtInst>(Opnd) || isa<SExtInst>(Opnd)) {
+    // Found an extend. Unbale it if requested. But do not remove it from the
+    // NeedClone stack; we still clone an extend that is not being baled in on
+    // the basis that the jitter will be able to bale it in because gen allows
+    // mismatched integer operand types.
+ if (Unbale) { + BI->clearOperandBaled(OperandNum); + setBaleInfo(Inst, *BI); + } + return Opnd; + } + BaleInfo ThisBI = getBaleInfo(Opnd); + if (ThisBI.isOperandBaled(0)) + return getOrUnbaleExtend(Opnd, &ThisBI, 0, Unbale); + if (ThisBI.isOperandBaled(1)) + return getOrUnbaleExtend(Opnd, &ThisBI, 1, Unbale); + return nullptr; +} + +/*********************************************************************** + * dump, print : dump the result of the GenXBaling analysis + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void GenXBaling::dump() +{ + print(errs()); +} +#endif + +void GenXBaling::print(raw_ostream &OS) +{ + for (InstMap_t::iterator i = InstMap.begin(), e = InstMap.end(); i != e; ++i) { + const Instruction *Inst = cast(i->first); + BaleInfo *BI = &i->second; + OS << Inst->getName() << ": "; + switch (BI->Type) { + case BaleInfo::WRREGION: OS << "WRREGION"; break; + case BaleInfo::SATURATE: OS << "SATURATE"; break; + case BaleInfo::MAININST: OS << "MAININST"; break; + case BaleInfo::ABSMOD: OS << "ABSMOD"; break; + case BaleInfo::NEGMOD: OS << "NEGMOD"; break; + case BaleInfo::NOTMOD: OS << "NOTMOD"; break; + case BaleInfo::RDREGION: OS << "RDREGION"; break; + default: OS << "??"; break; + } + for (unsigned OperandNum = 0, e = Inst->getNumOperands(); + OperandNum != e; ++OperandNum) + if (BI->isOperandBaled(OperandNum)) + OS << " " << OperandNum; + OS << "\n"; + } +} + +/*********************************************************************** + * getBaleParent : return the instruction baled into, 0 if none + */ +Instruction *GenXBaling::getBaleParent(Instruction *Inst) +{ + // We can rely on the fact that a baled in instruction always has exactly + // one use. The exception is llvm.genx.simdcf.goto/join, which is baled in + // to the extractvalue that extracts the !any(EM) value. Rather than check + // the intrinsic ID, we check whether the return type is struct. + auto use = Inst->use_begin(); + if (!Inst->hasOneUse()) { + if (!isa(Inst->getType())) + return nullptr; + // For an llvm.genx.simdcf.goto/join, the use we want is the extractvalue + // that extracts the !any(EM) value from the result struct. 
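+    // Schematic shape (illustrative; the exact struct layout is not relied on
+    // here, only the i1 element type):
+    //   %g   = call { <N x i1>, <N x i1>, i1 } @llvm.genx.simdcf.goto...(...)
+    //   %em  = extractvalue %g, 0
+    //   %any = extractvalue %g, 2      ; scalar i1 -- the use we select below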
+ for (auto ue = Inst->use_end();; ++use) { + if (use == ue) + return nullptr; + if (!isa(use->getUser())) + return nullptr; + if (use->getUser()->getType()->isIntegerTy(1)) + break; + } + } + Instruction *user = cast(use->getUser()); + BaleInfo BI = getBaleInfo(user); + if (!BI.isOperandBaled(use->getOperandNo())) + return nullptr; + return cast(use->getUser()); +} + +/*********************************************************************** + * unbale : unbale an instruction from its bale parent + */ +void GenXBaling::unbale(Instruction *Inst) +{ + if (!Inst->hasOneUse()) + return; + Value::use_iterator use = Inst->use_begin(); + Instruction *user = cast(use->getUser()); + BaleInfo BI = getBaleInfo(user); + unsigned OperandNum = use->getOperandNo(); + if (!BI.isOperandBaled(OperandNum)) + return; + BI.clearOperandBaled(OperandNum); + setBaleInfo(user, BI); +} + +/*********************************************************************** + * getBaleHead : return the head of the bale containing Inst + */ +Instruction *GenXBaling::getBaleHead(Instruction *Inst) +{ + for (;;) { + Instruction *Parent = getBaleParent(Inst); + if (!Parent) + break; + Inst = Parent; + } + return Inst; +} + +/*********************************************************************** + * buildBale : populate a Bale from the head instruction + * + * Enter: Inst = the head instruction + * B = Bale struct, assumed empty + * IncludeAddr = default false, true to include address calculations + * even when not baled in + * + * IncludeAddr is used by GenXUnbaling to include the address calculation of + * a rdregion in the bale, so it can be considered together when deciding + * whether to unbale and move. This works because an address calculation has + * exactly one use, until GenXAddressCommoning commons them up later. + */ +void GenXBaling::buildBale(Instruction *Inst, Bale *B, bool IncludeAddr) const +{ + assert(!B->size()); + buildBaleSub(Inst, B, IncludeAddr); +} + +void GenXBaling::buildBaleSub(Instruction *Inst, Bale *B, bool IncludeAddr) const +{ + BaleInfo BI = getBaleInfo(Inst); + B->push_front(BaleInst(Inst, BI)); + + if (isa(Inst) || + (isa(Inst) && !cast(Inst)->isInlineAsm() && + !GenXIntrinsic::isAnyNonTrivialIntrinsic(Inst))) + return; + if (IncludeAddr) { + int AddrOperandNum = getAddrOperandNum(GenXIntrinsic::getGenXIntrinsicID(Inst)); + if (AddrOperandNum >= 0) { + // IncludeAddr: pretend that the address calculation is baled in, as long + // as it is an instruction. + if (auto OpndInst = dyn_cast(Inst->getOperand(AddrOperandNum))) { + assert(OpndInst->hasOneUse()); (void)OpndInst; + BI.setOperandBaled(AddrOperandNum); + B->front().Info = BI; + } + } + } + + assert(BI.Bits < (1 << Inst->getNumOperands()) || Inst->getNumOperands() > 16); + + while (BI.Bits) { + unsigned Idx = genx::log2(BI.Bits); + BI.Bits &= ~(1 << Idx); + if (Instruction *Op = dyn_cast(Inst->getOperand(Idx))) + buildBaleSub(Op, B, IncludeAddr); + } +} + +/*********************************************************************** + * getAddrOperandNum : given an intrinsic ID, get the address operand number + * + * For rdregion/wrregion, it returns the operand number of the index operand. + * + * For genx_add_addr, it returns 0 (the only operand number) + * + * In any other case, it returns -1. + * + * This is used both in buildBale when IncludeAddr is true, and in doClones, + * to find the address operand of an instruction. 
+ */ +int GenXBaling::getAddrOperandNum(unsigned IID) const +{ + switch (IID) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + return GenXIntrinsic::GenXRegion::RdIndexOperandNum; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + return GenXIntrinsic::GenXRegion::WrIndexOperandNum; + case GenXIntrinsic::genx_add_addr: + return 0; + default: + return -1; + } +} + +/*********************************************************************** + * store : store updated BaleInfo for instruction + * + * Enter: BI = BaleInst struct + * + * This function stores BI.Info as the new BaleInfo for BI.Inst + * + * It is used by GenXLegalization to unbale. + */ +void GenXBaling::store(BaleInst BI) +{ + assert(BI.Info.Bits < 1<< BI.Inst->getNumOperands()); + InstMap[BI.Inst] = BI.Info; +} + +static bool skipTransform(Instruction *DefI, Instruction *UseI) { + SmallPtrSet DInsts; + BasicBlock *BB = UseI->getParent(); + + // Special case for extracting out of subroutine call. + if (isa(DefI)) + return true; + + // This is a local optimization only. + for (auto U : DefI->users()) { + auto UI = dyn_cast(U); + if (UI == nullptr || UI->getParent() != BB) + return true; + if (UI != UseI) + DInsts.insert(UI); + } + + // If a use is crossing the next region write, + // then two regions are live at the same time. + // Very likely this increases register pressure + // and/or results region copies. + // + // Scan forward starting from Region write, + // check if this hits a write to this region + // before some use. + // + SmallPtrSet UInsts; + bool IsLocal = !UseI->isUsedOutsideOfBlock(BB); + if (IsLocal) { + for (auto U : UseI->users()) { + auto UI = dyn_cast(U); + if (UI != nullptr) + UInsts.insert(UI); + } + } + + for (auto I = UseI; I; I = I->getNextNode()) { + if (I == &BB->back()) + break; + if (DInsts.empty()) + break; + + // UInst is local and it is dead now. + if (IsLocal && UInsts.empty()) + break; + + // There is a region write before some use. + if (GenXIntrinsic::isWrRegion(I) && + I->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum) == UseI) + return true; + + if (DInsts.count(I)) + DInsts.erase(I); + if (UInsts.count(I)) + UInsts.erase(I); + } + + // Not all users are checked which means UseI does not + // dominate them, or UseI is local and dead before some uses. + return !DInsts.empty(); +} + +// Normalize ill-formed gstores. +// Correct gstore should be in form of: +// x = gload G +// w = wrr x, StoreVal +// gstore w, G +static void normalizeGStore(StoreInst &SI) { + auto LI = + new LoadInst(SI.getPointerOperand(), ".gload", true /*volatile*/, &SI); + Value *StoreOp = SI.getValueOperand(); + Region R(StoreOp); + auto WrR = + R.createWrRegion(LI, StoreOp, ".wrr.gstore", &SI, SI.getDebugLoc()); + SI.setOperand(0 /*Value operand idx*/, WrR); +} + +// If operand of gstore is phi and all its incoming values +// form legal values for gstore, then return true. +// All incoming blocks should have single successor. +// Otherwise return false. +static bool canPropagatePhiGStore(StoreInst &SI) { + Value *Val = SI.getValueOperand(); + auto *Phi = dyn_cast(Val); + if (!Phi) + return false; + + if (!llvm::all_of(Phi->blocks(), + [](BasicBlock *BB) { return BB->getSingleSuccessor(); })) + return false; + + Value *StorePtr = SI.getPointerOperand(); + // This can be weakened, but then new gstores should be normalized too. 
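+  // Illustrative accepted shape (schematic):
+  //   bb1: %w1 = wrregion(load G, %v1, R1)   ; bb1 has a single successor
+  //   bb2: %w2 = wrregion(load G, %v2, R2)   ; bb2 has a single successor
+  //   bb3: %p  = phi [ %w1, %bb1 ], [ %w2, %bb2 ]
+  //        store %p, G                       ; later duplicated into bb1/bb2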
+ return llvm::all_of(Phi->incoming_values(), [StorePtr](Use &U) { + return isLegalValueForGlobalStore(U, StorePtr); + }); +} + +// Duplicate gstore in blocks with its legal value operands coming from phi. +// After that, there will be legal gstores that can be baled later. +// Old gstore with phi is deleted. +static void propagatePhiGStore(StoreInst &SI) { + auto *Phi = cast(SI.getValueOperand()); + for (Use &U : Phi->incoming_values()) { + auto *NewSI = cast(SI.clone()); + auto *WrrInst = cast(U); + NewSI->insertBefore(WrrInst->getParent()->getTerminator()); + NewSI->setOperand(0 /*Value operand idx*/, WrrInst); + } + SI.eraseFromParent(); + if (Phi->user_empty()) + Phi->eraseFromParent(); +} + +// Normalize gstores. +// There are two main cases: +// 1) gstore of phi, then there will be attempt to hoist gstore to +// its value, if that will give correct gstores. +// 2) Otherwise, just ill-formed gstore. Normalize it. +static void normalizeGStores(Function &F) { + SmallVector PhiWorklist; + SmallVector NormalizeWorklist; + // Collect phi and ill-formed gloads. + for (auto &I : instructions(F)) { + auto *SI = dyn_cast(&I); + if (!SI || !isGlobalStore(SI)) + continue; + if (canPropagatePhiGStore(*SI)) + PhiWorklist.push_back(SI); + else if (!isGlobalStoreLegal(SI)) + NormalizeWorklist.push_back(SI); + } + + // Handle everything. + for (auto *SI : PhiWorklist) + propagatePhiGStore(*SI); + for (auto *SI : NormalizeWorklist) + normalizeGStore(*SI); +} + +// Cleanup and optimization before do baling on a function. +bool GenXBaling::prologue(Function *F) { + bool Changed = false; + auto nextInst = [](BasicBlock &BB, Instruction *I) -> Instruction * { + // This looks like an llvm bug. We cannot call getPrevNode + // on the first instruction... + if (isa(I) || I == &BB.front()) + return nullptr; + return I->getPrevNode(); + }; + + for (auto &BB : F->getBasicBlockList()) { + // scan the block backwards. + for (auto Inst = &BB.back(); Inst; Inst = nextInst(BB, Inst)) { + // + // Rewrite + // A = B op C + // V = wrr(A, R) + // E = A op D + // into + // + // A = B op C + // V = wrr(A, R) + // A' = rrd(V, R) + // E = A' op D + // + if (GenXIntrinsic::isWrRegion(Inst)) { + Instruction *V = dyn_cast( + Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + + // Only process the case with multiple uses. + if (!V || V->hasOneUse()) + continue; + + // Skip if this region write is indirect as + // this would result an indirect read. + Region R(Inst, BaleInfo()); + if (R.Indirect) + continue; + + // Aggressively apply this transform may increase register pressure. + // We detect if there is other region write in between, so that two + // outer regions will not be live at the same time. + if (skipTransform(V, Inst)) + continue; + + // Do this transformation. + // - Insert a region read right after Inst + // - Replace all uses other than Inst with this region read + // + auto NewV = R.createRdRegion(Inst, "split", Inst, Inst->getDebugLoc(), + /*AllowScalar*/ !V->getType()->isVectorTy()); + + assert(NewV->getType() == V->getType()); + Inst->moveBefore(NewV); + for (auto UI = V->use_begin(); UI != V->use_end(); /*Empty*/) { + Use &U = *UI++; + if (U.getUser() != Inst) + U.set(NewV); + } + Changed = true; + } + } + } + + // fold bitcast into store/load if any. This allows to bale a g_store instruction + // crossing a bitcast. 
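+  // For illustration only, the loop below performs rewrites such as:
+  //   %b = bitcast <8 x i32> %a to <16 x i16>
+  //   %c = bitcast <16 x i16> %b to <32 x i8>   ==>  %c = bitcast %a to <32 x i8>
+  // and deletes a store to a global of a value just loaded from the same
+  // global, e.g. "%v = load @G ... store %v, @G".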
+  for (auto &BB : F->getBasicBlockList()) {
+    for (auto I = BB.begin(); I != BB.end(); /*empty*/) {
+      Instruction *Inst = &*I++;
+      using namespace llvm::PatternMatch;
+
+      // bitcast (bitcast X to Ty1) to Ty2 ==> bitcast X to Ty2
+      Value *X;
+      if (match(Inst, m_BitCast(m_BitCast(m_Value(X))))) {
+        BitCastInst *NewCI = new BitCastInst(X, Inst->getType(), "", Inst);
+        NewCI->setDebugLoc(Inst->getDebugLoc());
+        NewCI->takeName(Inst);
+        Inst->replaceAllUsesWith(NewCI);
+        if (Liveness)
+          Liveness->eraseLiveRange(Inst);
+        Inst->eraseFromParent();
+        Changed = true;
+        continue;
+      }
+
+      if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst)) {
+        Instruction *NewInst = foldBitCastInst(Inst);
+        if (NewInst) {
+          Changed = true;
+          Inst = NewInst;
+        }
+      }
+
+      // Delete trivially dead store instructions: a store to a global of a
+      // value just loaded from the same global.
+      if (auto ST = dyn_cast<StoreInst>(Inst)) {
+        Value *Val = ST->getValueOperand();
+        assert(Val);
+        if (auto LI = dyn_cast<LoadInst>(Val)) {
+          Value *Ptr = ST->getPointerOperand();
+          auto GV1 = getUnderlyingGlobalVariable(Ptr);
+          auto GV2 = getUnderlyingGlobalVariable(LI->getPointerOperand());
+          if (GV1 && GV1 == GV2) {
+            ST->eraseFromParent();
+            Changed = true;
+          }
+        }
+      }
+    }
+    for (auto I = BB.rbegin(); I != BB.rend(); /*empty*/) {
+      Instruction *Inst = &*I++;
+      if (isInstructionTriviallyDead(Inst)) {
+        if (Liveness)
+          Liveness->eraseLiveRange(Inst);
+        Inst->eraseFromParent();
+      }
+    }
+  }
+
+  // The result of an inline asm call with multiple outputs is a struct. Each
+  // element of that struct is passed to its user through an extractvalue
+  // instruction, which should be baled into its own wrregion. A
+  // genx_convert_addr intrinsic or a global load would otherwise be left
+  // unbaled between these (extractvalue + wrregion) bales. The idea is to move
+  // all such address conversions before the inline assembly instruction, and
+  // to move each extractvalue and the wrregion that uses its result close to
+  // the inline assembly call; otherwise baling would force the baled
+  // instructions to sit far away from the inline asm call, which increases
+  // live ranges.
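+  // Schematically (illustrative only), for a two-output inline asm:
+  //   %asm = call { <8 x i32>, <8 x i32> } asm "...", "..."(...)
+  //   ...                                    ; convert.addr, global loads
+  //   %r0  = extractvalue %asm, 0
+  //   %w0  = wrregion(%gload, %r0, %addr)
+  // the genx.convert.addr feeding %addr and the global load feeding the
+  // wrregion's old value are moved above the asm call, and %r0 / %w0 are
+  // moved right next to it.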
+ for (auto &BB : F->getBasicBlockList()) { + for (auto &Inst : BB.getInstList()) { + auto CI = dyn_cast(&Inst); + if (!CI || !CI->isInlineAsm()) + continue; + // Nothing to do if result is not a struct: no multiple outputs + if (!isa(CI->getType())) + continue; + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) { + auto EV = dyn_cast(ui->getUser()); + if (!EV) + continue; + EV->moveAfter(&Inst); + // extractelement must be baled into wrregion + for (auto User : EV->users()) { + Changed = true; + if (!GenXIntrinsic::isWrRegion(User)) + continue; + Instruction *WrR = cast(User); + Value *Index = + WrR->getOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum); + while (GenXIntrinsic::getGenXIntrinsicID(Index) == + GenXIntrinsic::genx_add_addr) + Index = cast(Index)->getOperand(0); + Instruction *IndexInst = dyn_cast(Index); + if (IndexInst && (GenXIntrinsic::getGenXIntrinsicID(IndexInst) == + GenXIntrinsic::genx_convert_addr)) + IndexInst->moveBefore(&Inst); + Value *OldVal = + WrR->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + LoadInst *Load = dyn_cast(OldVal); + if (Load && isGlobalLoad(Load)) + Load->moveBefore(&Inst); + WrR->moveAfter(EV); + } + } + } + } + + normalizeGStores(*F); + + // Remove Phi node with single incoming value + for (auto &BB : F->getBasicBlockList()) { + for (BasicBlock::iterator bi = BB.begin(), be = BB.end(); bi != be; ) { + Instruction *Inst = &*bi; + ++bi; + if (auto Phi = dyn_cast(Inst)) { + if (Phi->getNumIncomingValues() == 1) { + Phi->replaceAllUsesWith(Phi->getIncomingValue(0)); + Phi->eraseFromParent(); + Changed = true; + } + } + else { + break; + } + } + } + + return Changed; +} + + +/*********************************************************************** + * Bale::getMainInst : get the main instruction from the bale, 0 if none + */ +const BaleInst *Bale::getMainInst() const +{ + // From the last instruction (the bale head) backwards, find the first + // one that is not wrregion or saturate or addradd. If the head is + // wrregion, then skip anything before we reach its value operand. + // If the first one we find is rdregion, that does not count as a main + // instruction. + Value *PossibleMainInst = nullptr; + for (const_reverse_iterator i = rbegin(), e = rend(); i != e; ++i) { + if (PossibleMainInst && PossibleMainInst != i->Inst) + continue; + PossibleMainInst = nullptr; + switch (i->Info.Type) { + case BaleInfo::WRREGION: + PossibleMainInst = i->Inst->getOperand(1); + break; + case BaleInfo::GSTORE: + PossibleMainInst = i->Inst->getOperand(0); + break; + case BaleInfo::SATURATE: + case BaleInfo::ADDRADD: + break; + case BaleInfo::MAININST: + return &*i; + default: + return nullptr; + } + } + return nullptr; +} + +/*********************************************************************** + * eraseFromParent : do eraseFromParent on all instructions in the bale + */ +void Bale::eraseFromParent() +{ + // Iterate in reverse as each instruction becomes unused only when its + // user in the bale is erased. 
+ for (reverse_iterator ri = rbegin(), re = rend(); ri != re; ++ri) + ri->Inst->eraseFromParent(); +} + +/*********************************************************************** + * Bale::compare : compare this Bale with another one + * + * Return: 0 if equivalent + * < 0 if less + * > 0 if more + * + * Two Bales are equivalent if they compute the same value, that is, they + * have the same opcodes in the instructions, the instructions are + * baled together in the same way, and the operands coming in from outside + * the bale are the same. + * + * Both bales must have had hash() called on them since being built or + * modified in any other way. + */ +int Bale::compare(const Bale &Other) const +{ + assert(Hash && Other.Hash); + if (Hash != Other.Hash) + return Hash < Other.Hash ? -1 : 1; + if (size() != Other.size()) + return size() < Other.size() ? -1 : 1; + for (unsigned i = 0, e = size(); i != e; ++i) { + if (Insts[i].Info.Bits != Other.Insts[i].Info.Bits) + return Insts[i].Info.Bits < Other.Insts[i].Info.Bits ? -1 : 1; + Instruction *Inst = Insts[i].Inst, *OtherInst = Other.Insts[i].Inst; + if (Inst->getOpcode() != OtherInst->getOpcode()) + return Inst->getOpcode() < OtherInst->getOpcode() ? -1 : 1; + unsigned NumOperands = Inst->getNumOperands(); + if (NumOperands != OtherInst->getNumOperands()) + return NumOperands < OtherInst->getNumOperands() ? -1 : 1; + for (unsigned j = 0; j != NumOperands; ++j) { + Value *Opnd = Inst->getOperand(j); + if (!Insts[i].Info.isOperandBaled(j)) { + if (Opnd != OtherInst->getOperand(j)) + return Opnd < OtherInst->getOperand(j) ? -1 : 1; + } else { + // Baled operand. Find which baled instruction it is, and check that + // the other bale has its corresponding instruction used in its + // corresponding operand. + // (We could use a map to find the baled instruction + // in an algorithmically less complex way, but there is not likely + // to be more than 3 or 4 instructions in the bale so I didn't + // bother.) + unsigned BaledInst; + for (BaledInst = 0; Insts[BaledInst].Inst != Opnd; ++BaledInst) { + assert(BaledInst != size()); + } + if (Other.Insts[BaledInst].Inst != OtherInst->getOperand(j)) + return Other.Insts[BaledInst].Inst < OtherInst->getOperand(j) ? -1 : 1; + } + } + } + return 0; +} + +/*********************************************************************** + * hash_value : get a hash_code for a Bale + * + * If two Bales are equivalent, they have the same hash_value. + * + * If two Bales are not equivalent, it is unlikely but possible that + * they have the same hash_value. + */ +void Bale::hash() +{ + Hash = 0; + for (auto i = begin(), e = end(); i != e; ++i) { + BaleInst BI = *i; + Hash = hash_combine(Hash, BI.Info.Bits); + Hash = hash_combine(Hash, BI.Inst->getOpcode()); + for (unsigned j = 0, je = BI.Inst->getNumOperands(); j != je; ++j) { + Value *Opnd = BI.Inst->getOperand(j); + if (!BI.Info.isOperandBaled(j)) { + // Non-baled operand. Hash the operand itself. + Hash = hash_combine(Hash, Opnd); + } else { + // Baled operand. Find which baled instruction it is, and use that + // index in the hash. (We could use a map to find the baled instruction + // in an algorithmically less complex way, but there is not likely + // to be more than 3 or 4 instructions in the bale so I didn't + // bother.) 
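+        // Using the baled instruction's position within the bale, rather than
+        // its Instruction pointer, means that two equivalent bales built from
+        // different instructions still get the same hash, matching how
+        // compare() checks baled operands.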
+ Bale::iterator BaledInst; + for (BaledInst = begin(); BaledInst->Inst != Opnd; ++BaledInst) { + assert(BaledInst != i); + } + Hash = hash_combine(Hash, BaledInst - begin()); + } + } + } +} + +bool Bale::isGStoreBaleLegal() const { + assert(isGstoreBale()); + auto ST = cast(getHead()->Inst); + if (!isGlobalStore(ST)) + return false; + return isGlobalStoreLegal(ST); +} + +/*********************************************************************** + * Bale debug dump/print + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void Bale::dump() const +{ + print(errs()); +} +#endif + +void Bale::print(raw_ostream &OS) const +{ + OS << "bale {\n"; + for (const_iterator i = begin(), e = end(); i != e; ++i) { + i->Inst->print(OS); + OS << " // {" << i->Info.getTypeString() << "}\n"; + } + OS << "}\n"; +} + +const char *BaleInfo::getTypeString() const +{ + switch (Type) { + case BaleInfo::MAININST: return "maininst"; + case BaleInfo::WRREGION: return "wrregion"; + case BaleInfo::SATURATE: return "saturate"; + case BaleInfo::NOTMOD: return "notmod"; + case BaleInfo::NEGMOD: return "negmod"; + case BaleInfo::ABSMOD: return "absmod"; + case BaleInfo::RDREGION: return "rdregion"; + case BaleInfo::ADDRADD: return "addradd"; + case BaleInfo::RDPREDREGION: return "rdpredregion"; + case BaleInfo::ALLANY: return "allany"; + case BaleInfo::NOTP: return "notp"; + case BaleInfo::ZEXT: return "zext"; + case BaleInfo::SEXT: return "sext"; + case BaleInfo::WRPREDREGION: return "wrpreregion"; + case BaleInfo::CMPDST: return "cmpdst"; + case BaleInfo::GSTORE: return "g_store"; + case BaleInfo::SHUFFLEPRED: return "shufflepred"; + case BaleInfo::FADDR: return "faddr"; + default: return "???"; + } +} + +bool genx::operator==(const BaleInfo &lhs, const BaleInfo &rhs) { + return lhs.Type == rhs.Type && lhs.Bits == rhs.Bits; +} + +bool genx::operator==(const BaleInst &lhs, const BaleInst &rhs) { + return lhs.Inst == rhs.Inst && lhs.Info == rhs.Info; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.h new file mode 100644 index 000000000000..d94bf49653ea --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXBaling.h @@ -0,0 +1,550 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +/// GenXBaling +/// ---------- +/// +/// GenX instruction baling is the process of determining which LLVM instructions +/// can be combined into a single vISA instruction. Such a group of LLVM +/// instructions is known as a *bale*. A bale typically has a *main instruction* +/// and then optionally modifiers and region instructions on the sources and +/// the destination of the main instruction. However it is possible to have a +/// bale with no main instruction, for example just a rdregion, a modifier and +/// a wrregion. +/// +/// Bale example +/// ^^^^^^^^^^^^ +/// +/// .. image:: GenXDesign_bale.png +/// +/// This example shows a bale that is pretty much as complicated as you can get in +/// a single bale. Each small blue box is an LLVM IR instruction, with arrows showing +/// how each one is used. Other than the *bale head* instruction at the top, an +/// instruction in a bale has only one use, which is within the bale. +/// +/// The baling pass +/// ^^^^^^^^^^^^^^^ +/// +/// GenX instruction baling happens in two parts: +/// +/// 1. The GenXBaling pass sets up a map to give each Instruction +/// a *BaleInfo*, which contains a field giving the role the instruction +/// plays in its enclosing bale (main instruction, rdregion, etc), and a +/// bit vector where a bit is set if the corresponding operand of the +/// instruction is another instruction that is baled in (part of the same +/// bale). +/// +/// GenXBaling is in fact two slightly different passes run at two different +/// times: +/// +/// * The GenXFuncBaling pass (a FunctionPass) runs before GenXLegalization, +/// which uses it but invalidates it as it changes the code. This is known +/// as *first baling*. +/// +/// * The GenXGroupBaling pass (a FunctionGroupPass) runs after GenXLiveness. +/// From GenXLiveness, baling information remains valid through to +/// GenXCisaBuilder, since any code changes made (such as adding +/// copies where coalescing fails) either do not invalidate the analysis, +/// or the pass making the change also updates the baling analysis. +/// +/// The GenXBaling pass also detects where an instruction is baled in to +/// another, but the instruction has other uses too. In this case it clones the +/// instruction. Thus we end up with any baled in instruction having only +/// one use (with an exception for goto/join -- see below). +/// +/// Thus the GenXBaling pass is not a pure analysis, as it can modify the +/// code. +/// +/// 2. Using the map set up by the GenXBaling analysis, several functions are +/// provided for use by other passes: +/// +/// * getBaleInfo()/setBaleInfo() allow another pass to directly inspect and modify +/// the baling info for an instruction. The BaleInfo for an instruction gives: +/// +/// - Type, the role of the instruction in the bale (e.g. it is a rdregion); +/// - a bitmap of which operands are baled into it, together with methods +/// for getting and setting the bit for a particular operand. +/// +/// * getBaleParent() returns the instruction that the given instruction is +/// baled into, if any +/// +/// * isBaled() says whether the given instruction is baled into anything +/// +/// * getBaleHead() returns the instruction at the head of the bale that the +/// given instruction is baled into, which is the same as the given instruction +/// if it is not baled into anything. 
+///
+/// * buildBale() takes a head instruction (one for which isBaled is false) and
+///   fills out a Bale struct with a vector of BaleInst structs for all the
+///   instructions in the bale, where each BaleInst contains a pointer to the
+///   instruction and its BaleInfo struct (as in getBaleInfo()/setBaleInfo()).
+///
+/// Criteria for baling
+/// ^^^^^^^^^^^^^^^^^^^
+///
+/// GenXBaling implements the criteria for baling, i.e. when different LLVM IR
+/// instructions can be combined into the same vISA instruction:
+///
+/// * A rdregion with a variable index can bale in an add constant (where the
+///   constant is splatted if vector) that generates the index. In second baling,
+///   the constant add is in fact a ``llvm.genx.add.addr`` intrinsic, because that
+///   is what GenXCategory converted it to.
+///
+/// * GenXBaling is where an instruction gets recognized as a modifier, for example
+///   subtract from 0 is a negate modifier. The instruction is left as it is, and
+///   its modifier equivalent (e.g. ``BaleInfo::NEGMOD``) is set up in the
+///   instruction's BaleInfo.
+///
+/// * SExt/ZExt are also treated as modifiers, although not always balable. See
+///   below.
+///
+/// * A modifier can bale in an rdregion.
+///
+/// * A modifier can bale in another modifier in some circumstances.
+///
+/// * In particular, SExt/ZExt normally cannot bale in another modifier, but they
+///   are allowed to bale in an abs modifier as a bodge to fix a problem where
+///   the LLVM IR generated for ``cm_abs`` does not properly represent its
+///   semantics. See ``dc93b907 GenXBaling: bodge to work around cm_abs problems``.
+///
+/// * A main instruction can bale a modifier or rdregion into each operand in some
+///   circumstances:
+///
+///   - Some ALU intrinsics have region requirements, e.g. oword aligned,
+///     contiguous. GenXBaling enforces those requirements by only baling in an
+///     rdregion that satisfies them, but only in second baling. First baling does
+///     the baling anyway, as we want GenXLegalization to consider the instructions
+///     as one bale as it might legalize in a way that makes the region legal for
+///     the instruction.
+///
+///   - Baling an SExt/ZExt in is how we represent a vISA instruction such as
+///     ``add`` with a result type different to operand type. The two operands can
+///     have different types too in Gen, but vISA insists they are the same (if not
+///     constant). So:
+///
+///     1. In first baling, we allow SExt/ZExt from different types to be baled in
+///        to the two operands. This tends to make GenXLegalization legalize them
+///        to the same vector width as the main instruction.
+///
+///     2. In second baling, we do not allow SExt/ZExt from different types (or one
+///        SExt/ZExt where the other operand does not have one) to be baled in. This
+///        yields a legal vISA instruction, but having done (1) also allows the
+///        finalizer to fold the extend into the instruction.
+///
+///   - A raw operand (of a send or shared function intrinsic) has its own
+///     restrictions -- it can bale in a rdregion, but the region has to be
+///     contiguous and GRF aligned.
+///
+///   - There is special case code for where send or a shared function intrinsic
+///     has a ``TWOADDR`` raw operand, one that does not appear as a vISA operand
+///     in its own right but is implicitly the same register as the result.
The +/// twoaddr raw operand can bale in a rdregion (with region contiguous and GRF +/// aligned) as long as the result can be baled into a wrregion with the same +/// region parameters and the same "old value" input. This represents where a +/// send or shared function intrinsic does a predicated partial write, and the +/// place it does the partial write to is a region in a vISA register. +/// +/// * ``llvm.genx.sat`` represents floating point saturation, and is a modifier that +/// is different to the other modifiers because it is not a source modifier. A +/// saturate can bale in a main instruction or modifier or rdregion. +/// +/// * A wrregion can do the following baling: +/// +/// - It can bale a main instruction (subject to region restrictions in second +/// baling), a saturate, a modifier or a rdregion into its "new value" input. +/// +/// - Like rdregion, it can bale a constant add into its index operand. +/// +/// * Anything with a predicate (wrregion, select, send, all/any, some shared +/// function intrinsics) can bale in a predicate not, and any of those things, +/// including the not, can bale in an rdpredregion to represent using e.g. an M3 +/// flag to use only part of the predicate. However predicate baling is not +/// done in first baling, as GenXLegalization does not want to consider the +/// operations together. +/// +/// * Anything with a scalar i1 condition (select, br) can bale in an all/any. +/// +/// Baling of goto/join into br +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// The goto and join intrinsics have multiple return values, returned in a single +/// struct. One of the return values is the scalar i1 !any value that is then used +/// in a conditional branch. +/// +/// In second baling, we want the goto/join, the extractvalue of the !any +/// result, and the conditional branch to be baled together, so we can generate +/// a single goto/join instruction. +/// +/// However the struct result of the goto/join has other uses, the extractvalues of +/// the other results. Thus, in this special case, we have a bale where the +/// goto/join instruction inside the bale has uses other than the inside-bale use. +/// This needs special case code to handle in GenXBaling. +/// +/// In the future it may be worth considering a generalization of this idea of a +/// bale that is not a strict tree of instructions, so that we can use LLVM IR +/// to model Gen instructions with a general result and a flag result. Currently +/// we cannot do that, which means: +/// +/// 1. we cannot represent addc properly; +/// +/// 2. we cannot represent any combined arithmetic-and-set-flags instruction, +/// although that is not too much of a problem as the jitter derives such an +/// instruction by folding a cmp into an arithmetic instruction. +/// +/// Alignment requirements +/// ^^^^^^^^^^^^^^^^^^^^^^ +/// +/// An additional function of the second baling pass is that, when it bales a +/// raw result intrinsic into a wrregion, it marks the wrregion's LiveRange as +/// needing to be 32 aligned, and when it bales a rdregion into a raw operand in +/// an intrinsic, it marks the rdregion's input's LiveRange as needing to be 32 +/// aligned. GenXCategory sets most alignment requirements, but baling in +/// a rdregion or baling a main instruction into a wrregion imposes alignment +/// requirements on the vISA register that the region is read from or written to. 
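+///
+/// Illustrative bale (schematic)
+/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+///
+/// As a rough sketch (not verbatim IR from any test, with region arguments and
+/// exact intrinsic suffixes elided), a bale made of a rdregion, a sign-extend
+/// modifier, a main instruction and a wrregion head looks like::
+///
+///   %rd  = call <8 x i16> @llvm.genx.rdregioni...(<64 x i16> %src, ...)  ; RDREGION
+///   %ext = sext <8 x i16> %rd to <8 x i32>                               ; SEXT modifier
+///   %mul = mul <8 x i32> %ext, %other                                    ; MAININST
+///   %wr  = call <64 x i32> @llvm.genx.wrregioni...(<64 x i32> %old,
+///                                                  <8 x i32> %mul, ...)  ; WRREGION, bale head
+///
+/// Only the bale head %wr may have multiple uses; %rd, %ext and %mul each have
+/// their single use inside the bale.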
+/// +//===----------------------------------------------------------------------===// +#ifndef GENXBALING_H +#define GENXBALING_H + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXRegion.h" +#include "GenXAlignmentInfo.h" +#include "GenXSubtarget.h" +#include "IgnoreRAUWValueMap.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/Pass.h" +#include + +namespace llvm { + class BranchInst; + class CallInst; + class DebugLoc; + class GenXLiveness; + class Instruction; + class raw_ostream; + class Twine; + class Value; + +namespace genx { + +// BaleInfo : bale info for one instruction +struct BaleInfo { + // Type is how this instruction relates to its bale, whether it is a + // rdregion, wrregion, modifier, or main instruction. + enum { MAININST, WRREGION, SATURATE, NOTMOD, NEGMOD, ABSMOD, + RDREGION, ADDRADD, ADDROR, FADDR, RDPREDREGION, ALLANY, NOTP, ZEXT, SEXT, + SHUFFLEPRED, WRPREDREGION, WRPREDPREDREGION, CMPDST, GSTORE }; + uint16_t Type; + uint16_t Bits; // bitmap of which operands are baled in + BaleInfo(int Type = MAININST, unsigned Bits = 0) : Type(Type), Bits(Bits) {} + // isOperandBaled() : read Bits to see if operand is baled + bool isOperandBaled(unsigned OperandNum) const { return (Bits >> OperandNum) & 1; } + // clearOperandBaled() : clear bit that says that operand is baled + void clearOperandBaled(unsigned OperandNum) { Bits &= ~(1 << OperandNum); } + // setOperandBaled() : set bit that says that operand is baled + void setOperandBaled(unsigned OperandNum) { Bits |= 1 << OperandNum; } + // getTypeString : get string for BaleInfo type + const char *getTypeString() const; +}; + +bool operator==(const BaleInfo &lhs, const BaleInfo &rhs); + +// BaleInst : one instruction in a bale +struct BaleInst { + Instruction *Inst; + BaleInfo Info; + BaleInst(Instruction *Inst, BaleInfo Info) : Inst(Inst), Info(Info) {} +}; + +bool operator==(const BaleInst &lhs, const BaleInst &rhs); + +// Bale : all the instructions in a bale, filled out by buildBale() +class Bale { + typedef SmallVector Insts_t; + Insts_t Insts; + hash_code Hash; +public: + Bale() : Hash(0) {} + void clear() { Insts.clear(); Hash = 0; } + // push_front : push an instruction onto the "front", i.e. it is baled + // into an instruction already in the bale + void push_front(BaleInst BI) { Insts.push_back(BI); } + BaleInst &front() { return Insts.back(); } + // push_back : push an instruction onto the "back", i.e. it is the new + // head instruction, and the old head instruction is baled into it. + // This does an inefficient insert, but is only used in legalization + // when adding a wrregion to a bale that does not already have one. + void push_back(BaleInst BI) { Insts.insert(Insts.begin(), BI); } + BaleInst &back() { return Insts.front(); } + // Forward iterator: gives an instruction before any use of it, with the + // head instruction of the bale coming last. + typedef Insts_t::reverse_iterator iterator; + typedef Insts_t::const_reverse_iterator const_iterator; + iterator begin() { return Insts.rbegin(); } + iterator end() { return Insts.rend(); } + const_iterator begin() const { return Insts.rbegin(); } + const_iterator end() const { return Insts.rend(); } + unsigned size() const { return Insts.size(); } + bool empty() const { return Insts.empty(); } + // getIteratorPos : get 0..31 unsigned representing position of + // Bale::iterator. 
+ unsigned getIteratorPos(iterator i) { + assert((unsigned)(i - Insts.rbegin()) < 32); + return i - Insts.rbegin(); + } + // Reverse iterator: gives an instruction after any use of it, with the + // head instruction of the bale coming first. + typedef Insts_t::iterator reverse_iterator; + typedef Insts_t::const_iterator const_reverse_iterator; + reverse_iterator rbegin() { return Insts.begin(); } + const_reverse_iterator rbegin() const { return Insts.begin(); } + reverse_iterator rend() { return Insts.end(); } + const_reverse_iterator rend() const { return Insts.end(); } + // getHead : get head instruction of the bale + iterator getHeadIt() { return std::prev(end()); } + const_iterator getHeadIt() const { return std::prev(end()); } + BaleInst *getHead() { return &*getHeadIt(); } + const BaleInst *getHead() const { return &*getHeadIt(); } + // getPreHead : returns instruction prior to head instruction + // unsafe: if there's no such instruction, behavior is undefined + iterator getPreHeadIt() { return std::prev(getHeadIt()); } + BaleInst *getPreHead() { return &*getPreHeadIt(); } + // If a bale ends with a g_store bale, return the baled instruction prior to + // this g_store instruction. + iterator getHeadIgnoreGStoreIt() { + if (endsWithGStore()) + return getPreHeadIt(); + return getHeadIt(); + } + BaleInst *getHeadIgnoreGStore() { + return &*getHeadIgnoreGStoreIt(); + } + bool endsWithGStore() const { + return !empty() && rbegin()->Info.Type == BaleInfo::GSTORE; + } + // getMainInst : get 0 else the main inst from the bale + const BaleInst *getMainInst() const; + // hash : set hash code for bale. Must be called before using comparison + // operators. + void hash(); + // Comparison operators. Two bales are equivalent if they compute the same + // value, that is, they have the same opcodes in the instructions, the + // instructions are baled together in the same way, and the operands coming + // in from outside the bale are the same. 
+ bool operator==(const Bale &Other) const { return !compare(Other); } + bool operator!=(const Bale &Other) const { return compare(Other); } + bool operator<(const Bale &Other) const { return compare(Other) < 0; } + int compare(const Bale &Other) const; + // eraseFromParent : do eraseFromParent on all instructions in the bale + void eraseFromParent(); + // Debug dump/print + void dump() const; + void print(raw_ostream &OS) const; + bool isGstoreBale() const { return endsWithGStore(); } + bool isGStoreBaleLegal() const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const Bale &B) { + B.print(OS); + return OS; +} + +} // end namespace genx + +//---------------------------------------------------------------------- +// GenXBaling : the baling information for a Function or FunctionGroup (depending +// on whether GenXFuncBaling or GenXGroupBaling created it) +class GenXBaling { + BalingKind Kind; + typedef llvm::ValueMap> InstMap_t; + GenXSubtarget *ST; + InstMap_t InstMap; + struct NeedClone { + Instruction *Inst; + unsigned OperandNum; + NeedClone(Instruction *Inst = 0, unsigned OperandNum = 0) + : Inst(Inst), OperandNum(OperandNum) {} + bool operator==(const NeedClone &Other) const { + return Inst == Other.Inst && OperandNum == Other.OperandNum; + } + }; + typedef SmallVector NeedCloneStack_t; + NeedCloneStack_t NeedCloneStack; + SmallVector TwoAddrSends; +protected: + GenXLiveness *Liveness; // only in group baling +public: + genx::AlignmentInfo AlignInfo; +public: + explicit GenXBaling(BalingKind BKind, GenXSubtarget *Subtarget) + : Kind(BKind), ST(Subtarget), + Liveness(nullptr) {} + // clear : clear out the analysis + void clear() { InstMap.clear(); } + // processFunctionGroup : process all the Functions in a FunctionGroup + bool processFunctionGroup(FunctionGroup *FG); + // processFunction : process one Function + bool processFunction(Function *F); + // processInst : recalculate the baling info for an instruction + void processInst(Instruction *Inst); + // getBaleInfo : get BaleInfo for an instruction + genx::BaleInfo getBaleInfo(const Instruction *Inst) const { + InstMap_t::const_iterator i = InstMap.find((const llvm::Value *)Inst); + return i == InstMap.end() ? genx::BaleInfo() : i->second; + } + // setBaleInfo : set BaleInfo for an instruction + void setBaleInfo(const Instruction *Inst, genx::BaleInfo BI); + // isBaled : test whether all uses of an instruction would be baled in to + // users + bool isBaled(Instruction *Inst) { return getBaleParent(Inst); } + // getBaleParent : return the instruction baled into, 0 if none + Instruction *getBaleParent(Instruction *Inst); + // unbale : unbale an instruction from its bale parent + void unbale(Instruction *Inst); + // getBaleHead : return the head of the bale containing this instruction + Instruction *getBaleHead(Instruction *Inst); + // buildBale : build Bale from head instruction. 
B assumed empty on entry + void buildBale(Instruction *Inst, genx::Bale *B, bool IncludeAddr = false) const; + // store : store updated BaleInfo for Instruction (used to unbale by + // GenXLegalization) + void store(genx::BaleInst BI); + // getIndexAdd : test whether the specified value is a constant add/sub that + // could be baled in as a variable index offset, but without checking that + // the index is in range + static bool getIndexAdd(Value *V, int *Offset); + // getIndexOr : test whether the specified value is a constant or that + // could be baled in as a variable index offset, but without checking that + // the index is in range + static bool getIndexOr(Value *V, int &Offset); + // isBalableIndexAdd : test whether the specified value is a constant + // add/sub that could be baled in as a variable index offset + static bool isBalableIndexAdd(Value *V); + // isBalableIndexOr : test whether the specified value is a constant + // or that could be baled in as a variable index offset + static bool isBalableIndexOr(Value *V); + // isBalableNewValueIntoWrr: check whether the new val operand can + // be baled into wrr instruction + static bool + isBalableNewValueIntoWrr(Value *V, const genx::Region &WrrR, + const GenXSubtarget *ST, + genx::AlignmentInfo *AlignInfo = nullptr, + BalingKind BKind = BalingKind::BK_Legalization); + + static bool isHighCostBaling(uint16_t Type, Instruction *Inst); + // Debug dump/print + void dump(); + void print(raw_ostream &OS); +private: + // methods to build the info when running the analysis + void processWrPredRegion(Instruction *Inst); + void processWrPredPredRegion(Instruction *Inst); + void processWrRegion(Instruction *Inst); + bool processSelect(Instruction *Inst); + void processStore(StoreInst *Inst); + bool processShufflePred(Instruction *Inst); + bool processPredicate(Instruction *Inst, unsigned OperandNum); + void processSat(Instruction *Inst); + void processRdRegion(Instruction *Inst); + void processInlineAsm(Instruction *Inst); + void processExtractValue(ExtractValueInst *EV); + void processFuncPointer(PtrToIntInst *Inst); + void processMainInst(Instruction *Inst, int IntrinID); + // helper func for buildBale + void buildBaleSub(Instruction *Inst, genx::Bale *B, bool IncludeAddr) const; + void processBranch(BranchInst *Branch); + void processTwoAddrSend(CallInst *CI); + void setOperandBaled(Instruction *Inst, unsigned OperandNum, genx::BaleInfo *BI); + void doClones(); + Instruction *getOrUnbaleExtend(Instruction *Inst, genx::BaleInfo *BI, + unsigned OperandNum, bool Unbale); + int getAddrOperandNum(unsigned IID) const; + + bool operandIsBaled(Instruction *Inst, + unsigned OperandNum, int ModType, + unsigned ArgInfoBits); + + static bool + isRegionOKForIntrinsic(unsigned ArgInfoBits, const genx::Region &R, + bool CanSplitBale, const GenXSubtarget *ST, + genx::AlignmentInfo *AlignInfo = nullptr, + BalingKind BKind = BalingKind::BK_Legalization); + + // Cleanup and optimization before do baling on a function. 
+ bool prologue(llvm::Function *F); +}; + +//---------------------------------------------------------------------- +// The GenXFuncBaling analysis pass +// (used for the first baling just before GenXLegalization) +class GenXFuncBaling : public FunctionPass, public GenXBaling { +public: + static char ID; + explicit GenXFuncBaling(BalingKind Kind = BalingKind::BK_Legalization, GenXSubtarget *ST = nullptr) + : FunctionPass(ID), GenXBaling(Kind, ST) {} + virtual StringRef getPassName() const { + return "GenX instruction baling analysis for a function"; + } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F) { + clear(); + return processFunction(&F); + } + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const { + return createGenXPrinterPass(O, Banner); + } +}; +void initializeGenXFuncBalingPass(PassRegistry &); + +//---------------------------------------------------------------------- +// The GenXGroupBaling analysis pass +// (used for the second baling just before GenXLiveRanges) +class GenXGroupBaling : public FunctionGroupPass, public GenXBaling { +public: + static char ID; + explicit GenXGroupBaling(BalingKind Kind = BalingKind::BK_Legalization, GenXSubtarget *ST = nullptr) + : FunctionGroupPass(ID), GenXBaling(Kind, ST) {} + virtual StringRef getPassName() const { + return "GenX instruction baling analysis for a function group"; + } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunctionGroup(FunctionGroup &FG); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const { + return createGenXGroupPrinterPass(O, Banner); + } +}; +void initializeGenXGroupBalingPass(PassRegistry &); + +} // end namespace llvm + +#endif // GENXBALING_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXCFSimplification.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCFSimplification.cpp new file mode 100644 index 000000000000..e962db4dea9c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCFSimplification.cpp @@ -0,0 +1,354 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +/// GenXCFSimplification +/// -------------------- +/// +/// This is a function pass that simplifies CF as follows: +/// +/// * Where a conditional branch on "not any(pred)" branches over a single +/// basic block containing a small number of instructions, and all +/// instructions are either predicated by pred or are used only in the same +/// basic block, then change the branch to "branch never" so it gets +/// removed later. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_CFSIMPLIFICATION" + +#include "GenX.h" +#include "GenXIntrinsics.h" +#include "GenXModule.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm-c/Core.h" + +using namespace llvm; +using namespace genx; + +namespace { + +// GenXCFSimplification : simplify SIMD CF code +class GenXCFSimplification : public FunctionPass { + static const unsigned Threshold; + bool Modified = false; + SmallVector BranchedOver; +public: + static char ID; + explicit GenXCFSimplification() : FunctionPass(ID) { } + virtual StringRef getPassName() const { return "GenX SIMD CF simplification"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F); +private: + bool isBranchedOverBlock(BasicBlock *BB); + BasicBlock *processBranchedOverBlock(BasicBlock *BB); + bool isPredSubsetOf(Value *Pred1, Value *Pred2, bool Inverted); +}; + +// Threshold for removing a simd cf branch. The 9999 setting means it is +// pretty much always removed when it can be. +const unsigned GenXCFSimplification::Threshold = 9999; + +} // end anonymous namespace + +char GenXCFSimplification::ID = 0; +namespace llvm { void initializeGenXCFSimplificationPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXCFSimplification, "GenXCFSimplification", "GenXCFSimplification", false, false) +INITIALIZE_PASS_END(GenXCFSimplification, "GenXCFSimplification", "GenXCFSimplification", false, false) + +FunctionPass *llvm::createGenXCFSimplificationPass() +{ + initializeGenXCFSimplificationPass(*PassRegistry::getPassRegistry()); + return new GenXCFSimplification(); +} + +void GenXCFSimplification::getAnalysisUsage(AnalysisUsage &AU) const +{ +} + +/*********************************************************************** + * GenXCFSimplification::runOnFunction : process one function to + * simplify SIMD CF + */ +bool GenXCFSimplification::runOnFunction(Function &F) +{ + LLVM_DEBUG(dbgs() << "GenXCFSimplification::runOnFunction(" << F.getName() << ")\n"); + Modified = false; + // Build a list of simple branched over basic blocks. + for (auto fi = F.begin(), fe = F.end(); fi != fe; ++fi) { + auto BB = &*fi; + if (isBranchedOverBlock(BB)) { + LLVM_DEBUG(dbgs() << "is branched over: " << BB->getName() << "\n"); + BranchedOver.push_back(BB); + } + } + // Process each branched over block. + while (!BranchedOver.empty()) { + auto BB = BranchedOver.back(); + BranchedOver.pop_back(); + BasicBlock *SubsumedInto = processBranchedOverBlock(BB); + if (!SubsumedInto) + continue; + Modified = true; + // The joined together block may now be a simple branched over block. 
+ if (isBranchedOverBlock(SubsumedInto)) { + LLVM_DEBUG(dbgs() << "is branched over: " << SubsumedInto->getName() << "\n"); + BranchedOver.push_back(SubsumedInto); + } + } + return Modified; +} + + +/*********************************************************************** + * isBranchedOverBlock : detect whether a basic block is a simple branched + * over block. It must have a single predecessor and a single successor, + * and the predecessor must end in a conditional branch whose other + * successor is our successor. + */ +bool GenXCFSimplification::isBranchedOverBlock(BasicBlock *BB) +{ + if (BB->use_empty()) + return false; // no predecessors + if (!BB->hasOneUse()) + return false; // more than one predecessor + auto Term = BB->getTerminator(); + if (Term->getNumSuccessors() != 1) + return false; // not exactly one successor + Use *U = &*BB->use_begin(); + auto PredBr = dyn_cast(U->getUser()); + if (!PredBr || !PredBr->isConditional()) + return false; // predecessor is not conditional branch + auto Succ = Term->getSuccessor(0); + if (PredBr->getSuccessor(0) == BB) { + if (PredBr->getSuccessor(1) != Succ) + return false; // other cond branch successor is not our successor + } else { + if (PredBr->getSuccessor(0) != Succ) + return false; // other cond branch successor is not our successor + } + return true; +} + +/*********************************************************************** + * processBranchedOverBlock : process a branched over block + * + * Return: 0 if unchanged, else the basic block that BB has been subsumed into + */ +BasicBlock *GenXCFSimplification::processBranchedOverBlock(BasicBlock *BB) +{ + LLVM_DEBUG(dbgs() << "processBranchedOverBlock: " << BB->getName() << "\n"); + // Check that the condition to enter the branched over block is an any + // of a predicate. + auto PredBr = cast(BB->use_begin()->getUser()); + auto Cond = PredBr->getCondition(); + bool Inverted = false; + switch (GenXIntrinsic::getGenXIntrinsicID(Cond)) { + case GenXIntrinsic::genx_any: + if (PredBr->getSuccessor(0) != BB) + return nullptr; // branch is the wrong way round + break; + case GenXIntrinsic::genx_all: + if (PredBr->getSuccessor(1) != BB) + return nullptr; // branch is the wrong way round + Inverted = true; + break; + default: + return nullptr; // condition not "any" or "all" + } + Cond = cast(Cond)->getOperand(0); + LLVM_DEBUG(dbgs() << "branched over simd cf block: " << BB->getName() << " with Cond " << Cond->getName() + << (Inverted ? " (inverted)" : "") << "\n" + << "(source line of branch is " << PredBr->getDebugLoc().getLine() << "\n"); + // Check that each phi node in the successor has incomings related as + // follows: the incoming from BB must be a chain of selects or predicated + // wrregions where the ultimate original input is the other incoming, and + // each predicate must be Cond (inverted if necessary), or a subset of it. + // Also count the phi nodes that have different incomings for the two blocks, + // and if that goes over the threshold give up. 
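The check sketched in the comment above hinges on one predicate being a "subset" of another: every lane enabled by the inner predicate is also enabled by the branch condition. On plain bitmasks that notion looks as follows (a standalone model for illustration only; the real isPredSubsetOf later in this file works syntactically on the IR):

```cpp
#include <cassert>
#include <cstdint>

// Standalone model of "P1 is a subset of P2": every bit set in P1 is set in P2.
static bool isSubset(uint32_t P1, uint32_t P2) { return (P1 & ~P2) == 0; }

int main() {
  uint32_t Cond = 0b1011u;          // lanes enabled by the branch condition
  uint32_t Inner = Cond & 0b0110u;  // a predicate AND-ed with something else
  assert(isSubset(Inner, Cond));    // an AND with Cond is always a subset
  assert(isSubset(Cond ^ 0xFFFFFFFFu, ~Cond)); // XOR with all-ones == inverse
  return 0;
}
```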
+ unsigned Count = 0; + BasicBlock *Succ = BB->getTerminator()->getSuccessor(0); + BasicBlock *Pred = PredBr->getParent(); + for (auto Inst = &Succ->front(); ; Inst = Inst->getNextNode()) { + auto Phi = dyn_cast(Inst); + if (!Phi) + break; + LLVM_DEBUG(dbgs() << "Phi " << *Phi << "\n"); + Value *V = Phi->getIncomingValueForBlock(BB); + Value *Orig = Phi->getIncomingValueForBlock(Pred); + LLVM_DEBUG(dbgs() << "V: " << *V << "\n" + << "Orig: " << *Orig << "\n"); + if (V == Orig) + continue; + // Check for special case that Orig is constant 0 and V is the condition + // input to any, thus we know that V is 0 if the branch over is taken. + // Thus we can change Pred's incoming to the phi node to match BB's. Not + // doing this can result in the branch over not being removable if it is an + // inner if..else..endif. + if (auto C = dyn_cast(Orig)) { + if (C->isNullValue() && V == Cond) { + Phi->setIncomingValue(Phi->getBasicBlockIndex(Pred), V); + continue; + } + } + // Normal check on for phi node. + bool OK = false; + for (;;) { + LLVM_DEBUG(dbgs() << " checking " << *V << "\n"); + if (V == Orig) { + OK = true; + break; + } + auto Inst = dyn_cast(V); + if (!Inst) + break; + if (++Count > Threshold) { + LLVM_DEBUG(dbgs() << "Over threshold\n"); + break; + } + if (isa(Inst)) { + if (!isPredSubsetOf(Inst->getOperand(0), Cond, Inverted)) + break; + V = Inst->getOperand(2); + continue; + } + if (!GenXIntrinsic::isWrRegion(Inst)) + break; + if (!isPredSubsetOf(Inst->getOperand( + GenXIntrinsic::GenXRegion::PredicateOperandNum), Cond, Inverted)) + break; + V = Inst->getOperand(0); + } + if (!OK) { + LLVM_DEBUG(dbgs() << "failed\n"); + return nullptr; + } + LLVM_DEBUG(dbgs() << "OK\n"); + } + // Check that the block does not contain any calls or intrinsics with + // side effects. + for (auto bi = BB->begin(), be = BB->end(); bi != be; ++bi) + if (auto CI = dyn_cast(&*bi)) { + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(CI)) { + LLVM_DEBUG(dbgs() << "contains call\n"); + return nullptr; + } + if (!CI->getCalledFunction()->doesNotAccessMemory()) { + LLVM_DEBUG(dbgs() << "contains intrinsic with side effect\n"); + return nullptr; + } + } + // We can now do the transformation. + LLVM_DEBUG(dbgs() << "Transforming " << BB->getName() << "\n"); + // Move instructions from BB into the predecessor. + for (;;) { + auto Inst = &BB->front(); + if (Inst) { + if (Inst->isTerminator()) + break; + Inst->removeFromParent(); + Inst->insertBefore(PredBr); + } + } + // In each phi node in the successor, change the incoming for the predecessor + // to match the incoming for our BB, and remove the incoming for our BB. + // If that would leave only one incoming, then remove the phi node. + for (auto Inst = &Succ->front();; ) { + auto Phi = dyn_cast(Inst); + if (!Phi) + break; + auto Next = Inst->getNextNode(); + if (Phi->getNumIncomingValues() == 2) { + Value *V = Phi->getIncomingValueForBlock(BB); + Phi->replaceAllUsesWith(V); + Phi->eraseFromParent(); + // Having got rid of the phi, it is worth running instruction + // simplification on each use. Specifically, this turns the + // P3 = (P1 & P2) | (P1 & ~P2) at the endif of an if that + // has an else into the simpler P1. Without that, an enclosing if + // would never have its branch removed, because the use of the "or" + // as a predicate stops us detecting that all predicates are a + // subset of the branch condition. + // Run instruction simplification on each use, but restart if any + // simplification happens as then the use chain changes under our feet. 
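The simplification the comment above relies on is a plain Boolean identity: for any lane masks, (P1 & P2) | (P1 & ~P2) collapses back to P1. A quick standalone check of that identity (illustrative only):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Exhaustively verify (P1 & P2) | (P1 & ~P2) == P1 over 4-bit lane masks.
  for (uint32_t P1 = 0; P1 < 16; ++P1)
    for (uint32_t P2 = 0; P2 < 16; ++P2)
      assert(((P1 & P2) | (P1 & ~P2)) == P1);
  return 0;
}
```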
+ if (auto I = dyn_cast(V)) { + bool Restart = true; + while (Restart) { + Restart = false; + for (auto ui = I->use_begin(), ue = I->use_end(); ui != ue; ++ui) + if (recursivelySimplifyInstruction( + cast(ui->getUser()))) { + Restart = true; + break; + } + } + } + } else { + unsigned PredIdx = Phi->getBasicBlockIndex(Pred); + unsigned BBIdx = Phi->getBasicBlockIndex(BB); + Phi->setIncomingValue(PredIdx, Phi->getIncomingValue(BBIdx)); + Phi->removeIncomingValue(BBIdx); + } + Inst = Next; + } + // Change the predecessor to have an unconditional branch to the successor. + auto NewBr = BranchInst::Create(Succ, PredBr); + NewBr->takeName(PredBr); + auto CondInst = dyn_cast(PredBr->getCondition()); + PredBr->eraseFromParent(); + if (CondInst && CondInst->use_empty()) + CondInst->eraseFromParent(); + // Remove the now empty and unreferenced BB. + BB->eraseFromParent(); + // Merge Pred and Succ blocks. + MergeBlockIntoPredecessor(Succ); + return Pred; +} + +/*********************************************************************** + * isPredSubsetOf : detect whether Pred1 is a subset of Pred2 (or of ~Pred2 + * if Inverted is set) + */ +bool GenXCFSimplification::isPredSubsetOf(Value *Pred1, Value *Pred2, + bool Inverted) +{ + if (Pred1 == Pred2 && !Inverted) + return true; + auto BO = dyn_cast(Pred1); + if (!BO) + return false; + if (BO->getOpcode() == Instruction::And) + return isPredSubsetOf(BO->getOperand(0), Pred2, Inverted) + || isPredSubsetOf(BO->getOperand(1), Pred2, Inverted); + if (BO->getOpcode() == Instruction::Xor) + if (auto C = dyn_cast(BO->getOperand(1))) + return BO->getOperand(0) == Pred2 && C->isAllOnesValue(); + return false; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXCategory.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCategory.cpp new file mode 100644 index 000000000000..2318de0c04a8 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCategory.cpp @@ -0,0 +1,1060 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXCategory +/// ------------ +/// +/// This pass performs five functions: +/// +/// 1. It splits any struct phi into a phi for each element of the struct. This +/// is done in GenXLowering, but a subsequent pass can re-insert a struct phi so +/// this pass mops those up. +/// +/// 2. It resolves each overlapping circular phi value. 
+/// +/// LLVM IR does not attach +/// any importance to the order of phi nodes in any particular basic block. +/// At the head of a loop, a phi incoming can also be a phi definition in the +/// same block, and they could be in either order. +/// +/// However, once we start constructing live ranges in the GenX backend, we +/// attach importance to the order of the phi nodes, so we need to resolve +/// any such overlapping circular phi value. Currently we do this by +/// inserting a copy (actually a bitcast) just after the phi nodes in that +/// basic block. A future enhancement would be to try and re-order the phi +/// nodes, and only fall back to copy insertion if there is circularity and +/// it is impossible to find a correct order, for example when the loop body +/// swaps two variables over. +/// +/// 3. It inserts a load for any operand that is constant but not allowed to be. +/// It also catches any case where constant propagation in EarlyCSE has +/// caused a non-simple constant to be propagated into the instruction. +/// See the GenXConstants section above. +// (in GenXConstants.cpp) +/// +/// 4. It determines the register category and increased alignment requirement +/// (e.g. use as a raw operand) of each value, and stores it by creating a +/// LiveRange for the value and storing it there. At this stage the LiveRange +/// does not contain any other information; GenXLiveRanges populates it further +/// (or erases it if the value turns out to be baled in). +/// +/// 5. It inserts instructions as required to convert from one register +/// category to another, where a value has its def and uses not all requiring +/// the same category. +/// +/// All this pass inserts is a llvm.genx.convert intrinsic. It does not record +/// what the categories are. This information is recalculated in GenXLiveness. +/// +/// The reason for inserting the convert intrinsic calls here, before the final +/// run of GenXBaling before GenXLiveRanges, is that we want GenXBaling to spot +/// when a convert intrinsic can be baled with rdregion or wrregion. +/// +/// For one value (function argument or instruction), the pass looks at the +/// categories required for the defintion and each use. If there is no address +/// conversion involved, then it inserts a single conversion if possible (all +/// uses are the same category), otherwise it inserts a conversion for each use +/// that requires one. +/// +/// **IR restriction**: After this pass, a value must have its def and all uses +/// requiring the same register category. +/// +/// Address conversion +/// ^^^^^^^^^^^^^^^^^^ +/// +/// An address conversion is treated slightly differently. +/// +/// A rdregion/wrregion representing an indirect region has a variable index. +/// This index is actually an index, whereas the vISA we need to generate for +/// it uses an address register that has been set up with an ``add_addr`` +/// instruction from the index and the base register. +/// +/// This pass inserts an ``llvm.genx.convert.addr`` intrinsic, with zero offset, +/// to represent the conversion from index to address register. However, the +/// intrinsic has no way of representing the base register. Instead, the base +/// register is implicitly the "old value" input of the rdregion/wrregion where +/// the address is used. +/// +/// The same index may well be used in multiple rdregions and wrregions, +/// especially after LLVM's CSE. 
But at this stage we have no idea whether +/// these multiple rdregions/wrregions will have the same base register, so +/// we must assume not and insert a separate ``llvm.genx.convert.addr`` +/// for each rdregion/wrregion use of the index. +/// +/// These multiple address conversions of the same index are commoned up +/// where possible later on in GenXAddressCommoning. That pass runs after +/// GenXCoalescing, so it can tell whether two address conversions of the +/// same index also have the same base register because the "old value" +/// inputs of the regions have been coalesced together. +/// +/// Where an index used in an indirect region is a constant add, this pass +/// inserts the ``llvm.genx.convert.addr`` before that, and turns the constant +/// add into ``llvm.genx.add.addr``. The latter can be baled into rdregion +/// or wrregion, representing a constant offset in the indirect region. +/// Only one ``llvm.genx.add.addr`` is allowed between the +/// ``llvm.genx.convert.addr`` and the use in a rdregion/wrregion. +/// +/// However this pass does not check whether the offset is in range (although +/// GenXBaling does check that before deciding to bale it in). The +/// GenXAddressCommoning pass sorts that out. +/// +/// **IR restriction**: After this pass, a variable index in a rdregion/wrregion +/// must be the result of ``llvm.genx.convert.addr`` or ``llvm.genx.add.addr``. +/// Operand 0 of ``llvm.genx.add.addr`` must be the result of +/// ``llvm.genx.convert.addr``. +/// +/// **IR restriction**: After this pass, up to GenXAddressCommoning, the result +/// of ``llvm.genx.convert.addr`` must have a single use in either a +/// ``llvm.genx.add.addr`` or as the index in rdregion/wrregion. The result +/// of ``llvm.genx.add.addr`` must have a single use as the index in +/// rdregion/wrregion. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_CATEGORY" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXConstants.h" +#include "GenXIntrinsics.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "vc/GenXOpts/Utils/RegCategory.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Metadata.h" +#include "llvm/PassAnalysisSupport.h" +#include "llvm/Support/Debug.h" +#include "llvmWrapper/IR/InstrTypes.h" + +using namespace llvm; +using namespace genx; + +namespace { + + // CategoryAndAlignment : values returned from getCategoryAndAlignment* + // functions + struct CategoryAndAlignment { + unsigned Cat; + unsigned Align; + CategoryAndAlignment(unsigned Cat, unsigned Align = 0) : Cat(Cat), Align(Align) {} + }; + + class UsesCatInfo; + + // GenX category pass + class GenXCategory : public FunctionGroupPass { + Function *Func; + KernelMetadata KM; + GenXLiveness *Liveness; + DominatorTreeGroupWrapperPass *DTs; + SmallVector ToErase; + bool Modified; + // Vector of arguments and phi nodes that did not get a category. 
+ SmallVector NoCategory; + bool InFGHead; + public: + static char ID; + explicit GenXCategory() : FunctionGroupPass(ID) { } + virtual StringRef getPassName() const { return "GenX category conversion"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunctionGroup(FunctionGroup &FG); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXGroupPrinterPass(O, Banner); } + unsigned getCategoryForPhiIncomings(PHINode *Phi) const; + unsigned getCategoryForCallArg(Function *Callee, unsigned ArgNo) const; + unsigned getCategoryForInlasmConstraintedOp(CallInst *CI, unsigned ArgNo, + bool IsOutput) const; + CategoryAndAlignment getCategoryAndAlignmentForDef(Value *V) const; + CategoryAndAlignment getCategoryAndAlignmentForUse(Value::use_iterator U) const; + private: + const GenXSubtarget *Subtarget; + using ConvListT = std::array; + + bool processFunction(Function *F); + bool fixCircularPhis(Function *F); + bool processValue(Value *V); + Instruction *createConversion(Value *V, unsigned Cat); + ConvListT buildConversions(Value *Def, CategoryAndAlignment DefInfo, const UsesCatInfo &UsesInfo); + }; + + // AUse : an address use of a value in processValue() + struct AUse { + Instruction *user; + unsigned OperandNum; + unsigned Cat; + AUse(Value::use_iterator U, unsigned Cat) + : user(cast(U->getUser())), + OperandNum(U->getOperandNo()), Cat(Cat) {} + }; + + // almost real input iterator, minimum for range for was implemented + class Iterator final { + unsigned ShiftedMask_; + unsigned CurCat_; + + public: + Iterator(unsigned Mask, unsigned Cat) : ShiftedMask_(Mask), CurCat_(Cat) { + validate(); + } + + unsigned operator*() const { + validate(); + return CurCat_; + } + + Iterator &operator++() { + validate(); + ShiftedMask_ /= 2; + ++CurCat_; + if (ShiftedMask_ == 0) { + CurCat_ = RegCategory::NUMCATEGORIES; + validate(); + return *this; + } + for (; ShiftedMask_ % 2 == 0; ShiftedMask_ /= 2, ++CurCat_) + ; + validate(); + return *this; + } + + friend bool operator==(const Iterator &lhs, const Iterator &rhs) { + return (lhs.ShiftedMask_ == rhs.ShiftedMask_ && + lhs.CurCat_ == rhs.CurCat_); + } + + friend bool operator!=(const Iterator &lhs, const Iterator &rhs) { + return !(lhs == rhs); + } + + private: + void validate() const { + assert((ShiftedMask_ % 2 == 1 || CurCat_ == RegCategory::NUMCATEGORIES) && + "invalid state"); + } + }; + + // Implements only begin() and end() + // to iterate over categories of uses. + class Categories final { + unsigned Mask_; + + public: + explicit Categories(unsigned Mask) : Mask_(Mask) {} + + Iterator begin() const { + // we have no category + if (!Mask_) + return end(); + // we have NONE category + if (Mask_ % 2 == 1) + return Iterator(Mask_, 0); + // we adding NONE category + Iterator FalseBegin(Mask_ + 1, 0); + // and now we get the real first category + return ++FalseBegin; + } + + Iterator end() const { return Iterator(0, RegCategory::NUMCATEGORIES); } + }; + + // Encapsulates Category'n'Alignment analysis of value uses. 
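The Iterator/Categories pair above is just a set-bit walk over a category mask; a minimal usage sketch (illustrative only, assuming the definitions above are in scope):

```cpp
// Print every register category that appears in a use mask. Bit N of Mask set
// means "some use wants RegCategory N"; Categories visits exactly those N.
static void dumpUseCategories(unsigned Mask, llvm::raw_ostream &OS) {
  for (unsigned Cat : Categories(Mask))
    OS << "use requires category " << Cat << "\n";
}
```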
+ class UsesCatInfo final { + using UsesT = llvm::SmallVector; + UsesT Uses_; + unsigned Mask_; + unsigned MaxAlign_; + unsigned MostUsedCat_; + + public: + UsesCatInfo() : Uses_(), Mask_(0), MaxAlign_(0) {} + + UsesCatInfo(const GenXCategory &PassInfo, Value *V) : UsesCatInfo() { + std::array Stat = {0}; + for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + auto CatAlign = PassInfo.getCategoryAndAlignmentForUse(ui); + MaxAlign_ = std::max(MaxAlign_, CatAlign.Align); + Uses_.push_back(AUse(ui, CatAlign.Cat)); + Mask_ |= 1 << CatAlign.Cat; + if (CatAlign.Cat != RegCategory::NONE) + ++Stat[CatAlign.Cat]; + } + auto MaxInStatIt = std::max_element(Stat.begin(), Stat.end()); + MostUsedCat_ = MaxInStatIt - Stat.begin(); + } + + bool empty() const { return !Mask_; } + + bool allHaveCat(unsigned cat) const { return !(Mask_ & ~(1 << cat)); } + + const UsesT &getUses() const { return Uses_; } + + unsigned getMaxAlign() const { return MaxAlign_; } + + // When there's no real category uses (real is anything but NONE) + // behavior is undefined. + unsigned getMostUsedCat() const { + assert(!empty() && !allHaveCat(RegCategory::NONE) && + "works only for cases when there are uses with real categories"); + return MostUsedCat_; + } + + // meant to be used in range for + Categories getCategories() const { return Categories(Mask_); } + }; + + void placeConvAfterDef(Function *Func, Instruction *Conv, Value *Def) { + if (Instruction *Inst = dyn_cast(Def)) { + // Original value is an instruction. Insert just after it. + Conv->insertAfter(Inst); + Conv->setDebugLoc(Inst->getDebugLoc()); + } else { + assert(isa(Def) && "must be an argument if not an instruction"); + // Original value is a function argument. Insert at the start of the + // function. + Conv->insertBefore(&*Func->begin()->begin()); + } + } + + void placeConvBeforeUse(Instruction *Conv, Instruction *Use, + unsigned UseOperand) { + if (auto PhiUse = dyn_cast(Use)) { + // Use is in a phi node. Insert before terminator in corresponding + // incoming block. + Conv->insertBefore(PhiUse->getIncomingBlock(UseOperand)->getTerminator()); + } else { + // Insert just before use. + Conv->insertBefore(Use); + Conv->setDebugLoc(Use->getDebugLoc()); + } + } + + } // end anonymous namespace + +char GenXCategory::ID = 0; +namespace llvm { void initializeGenXCategoryPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXCategory, "GenXCategory", "GenXCategory", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_END(GenXCategory, "GenXCategory", "GenXCategory", false, false) + +FunctionGroupPass *llvm::createGenXCategoryPass() +{ + initializeGenXCategoryPass(*PassRegistry::getPassRegistry()); + return new GenXCategory(); +} + +void GenXCategory::getAnalysisUsage(AnalysisUsage &AU) const +{ + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the category conversion pass for + * this FunctionGroup + */ +bool GenXCategory::runOnFunctionGroup(FunctionGroup &FG) +{ + KM = KernelMetadata(FG.getHead()); + DTs = &getAnalysis(); + Liveness = &getAnalysis(); + auto P = getAnalysisIfAvailable(); + Subtarget = P ? P->getSubtarget() : nullptr; + bool Modified = false; + if (KM.isKernel()) { + // Get the offset of each kernel arg. 
+ for (auto ai = FG.getHead()->arg_begin(), ae = FG.getHead()->arg_end(); + ai != ae; ++ai) { + Argument *Arg = &*ai; + Liveness->getOrCreateLiveRange(Arg)->Offset = KM.getArgOffset(Arg->getArgNo()); + } + } + // Mop up any struct phis, splitting into elements. + for (auto i = FG.begin(), e = FG.end(); i != e; ++i) + Modified |= splitStructPhis(*i); + // Do category conversion on each function in the group. + InFGHead = true; + for (auto i = FG.begin(), e = FG.end(); i != e; ++i) { + Modified |= processFunction(*i); + InFGHead = false; + } + // Now iteratively process values that did not get a category. A valid + // category will eventually propagate through a web of phi nodes + // and/or subroutine args. + while (NoCategory.size()) { + SmallVector NoCategory2; + for (unsigned i = 0, e = NoCategory.size(); i != e; ++i) { + if (!processValue(NoCategory[i])) + NoCategory2.push_back(NoCategory[i]); + } + assert(NoCategory2.size() < NoCategory.size() && "not making any progess"); + NoCategory.clear(); + if (!NoCategory2.size()) + break; + for (unsigned i = 0, e = NoCategory2.size(); i != e; ++i) { + if (!processValue(NoCategory2[i])) + NoCategory.push_back(NoCategory2[i]); + } + Modified |= true; + } + return Modified; +} + +// Common up constpred calls within a block. +static bool commonUpPredicate(BasicBlock *BB) { + bool Changed = false; + // Map from flatten predicate value to its constpred calls. + SmallDenseMap> ValMap; + + for (auto &Inst : BB->getInstList()) { + if (GenXIntrinsic::getGenXIntrinsicID(&Inst) == GenXIntrinsic::genx_constantpred) { + Constant *V = cast(Inst.getOperand(0)); + if (auto VT = dyn_cast(V->getType())) { + unsigned NElts = VT->getVectorNumElements(); + if (NElts > 64) + continue; + uint64_t Bits = 0; + for (unsigned i = 0; i != NElts; ++i) + if (!V->getAggregateElement(i)->isNullValue()) + Bits |= ((uint64_t)1 << i); + auto Iter = ValMap.find(Bits); + if (Iter == ValMap.end()) + ValMap[Bits].push_back(&Inst); + else if (Inst.hasOneUse() && Inst.user_back()->getParent() == BB) + // Just in case constpred is not from constant predicate loading. This + // ensures the first instruction dominates others in the same vector. + (Iter->second).push_back(&Inst); + } + } + } + + // Common up when there are more than 2 uses, in which case it will not be + // worse than flag spills. + for (auto I = ValMap.begin(), E = ValMap.end(); I != E; ++I) { + auto &V = I->second; + int n = (int)V.size(); + if (n > 2) { + Instruction *DomInst = V.front(); + for (int i = 1; i < n; ++i) { + V[i]->replaceAllUsesWith(DomInst); + V[i]->eraseFromParent(); + } + Changed = true; + } + } + + return Changed; +} + +/*********************************************************************** + * processFunction : run the category conversion pass for this Function + * + * This does a postordered depth first traversal of the CFG, + * processing instructions within a basic block in reverse, to + * ensure that we see a def after its uses (ignoring phi node uses). + * This is specifically useful for an address conversion, where we want to + * see the constant add used in an indirect region (and convert it into a + * llvm.genx.add.addr) before we see the instruction it uses. + */ +bool GenXCategory::processFunction(Function *F) +{ + Func = F; + // Before doing the category conversion, fix circular phis. + Modified = fixCircularPhis(F); + // Load constants in phi nodes. + loadPhiConstants(F, DTs->getDomTree(F), false, Subtarget); + // Process all instructions. 
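commonUpPredicate above keys its map on the predicate's lane pattern flattened into a 64-bit integer, so two constantpred calls loading the same mask land in the same bucket. A standalone model of that flattening (illustrative only):

```cpp
#include <cassert>
#include <cstdint>

// Flatten up to 64 lane flags into one integer key, as the pass does with the
// elements of a constant predicate vector.
static uint64_t flattenLanes(const bool *Lanes, unsigned NElts) {
  uint64_t Bits = 0;
  for (unsigned i = 0; i != NElts; ++i)
    if (Lanes[i])
      Bits |= (uint64_t)1 << i;
  return Bits;
}

int main() {
  bool A[4] = {true, false, true, true};
  bool B[4] = {true, false, true, true};
  assert(flattenLanes(A, 4) == flattenLanes(B, 4)); // identical masks common up
  assert(flattenLanes(A, 4) == 0b1101u);            // lane i -> bit i
  return 0;
}
```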
+ for (po_iterator i = po_begin(&Func->getEntryBlock()), + e = po_end(&Func->getEntryBlock()); i != e; ++i) { + // This loop scans the basic block backwards. If any code is inserted + // before the current point, that code is scanned too. + BasicBlock *BB = *i; + for (Instruction *Inst = &BB->back(); Inst; + Inst = (Inst == &BB->front() ? nullptr : Inst->getPrevNode())) { + Modified |= loadNonSimpleConstants(Inst, nullptr, Subtarget); + Modified |= loadConstants(Inst, Subtarget); + if (!processValue(Inst)) + NoCategory.push_back(Inst); + } + + // This commons up constpred calls just loaded. + Modified |= commonUpPredicate(BB); + + // Erase instructions (and their live ranges) as requested by processValue. + for (unsigned i = 0, e = ToErase.size(); i != e; ++i) { + Liveness->eraseLiveRange(ToErase[i]); + ToErase[i]->eraseFromParent(); + } + ToErase.clear(); + } + // Process all args. + for (auto fi = Func->arg_begin(), fe = Func->arg_end(); fi != fe; ++fi) { + Value *V = &*fi; + if (!processValue(V)) + NoCategory.push_back(V); + } + return Modified; +} + +/*********************************************************************** + * fixCircularPhis : fix up overlapping circular phi nodes + * + * A phi node at the head of a loop can have a use in the phi nodes in the same + * basic block. If the use is after the def, it still refers to the value in + * the previous loop iteration, but the GenX backend cannot cope with the + * live range going round the loop and overlapping with its own start. + * + * This function spots any such phi node and works around it by inserting an + * extra copy (bitcast) just after the phi nodes in the basic block. + * + * A better solution for the future would be to re-order the phi nodes if + * possible, and only fall back to inserting a copy if there is circularity + * (e.g. a loop that swaps two variables in its body). + */ +bool GenXCategory::fixCircularPhis(Function *F) +{ + bool Modified = false; + for (auto fi = Func->begin(), fe = Func->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + // Process phi nodes in one basic block. + for (auto bi = BB->begin(); ; ++bi) { + auto Phi = dyn_cast(&*bi); + if (!Phi) + break; // end of phi nodes + if (!GenXLiveness::wrapsAround(Phi, Phi)) + continue; + // Overlapping circular phi node. Insert a copy. + // Note that the copy has to be split in the same way as a copy + // inserted in GenXCoalescing when coalescing fails, but we have + // our own code here because at this point we do not have any real + // and possibly coalesced live ranges like GenXCoalescing does. + Modified = true; + SmallVector Uses; + for (auto ui = Phi->use_begin(), ue = Phi->use_end(); ui != ue; ++ui) + Uses.push_back(&*ui); + // A phi node is never a struct -- GenXLowering removed struct phis. + assert(!isa(Phi->getType())); + // Insert a copy, split as required to be legal. + auto NewCopy = Liveness->insertCopy(Phi, nullptr, BB->getFirstNonPHI(), + Phi->getName() + ".unoverlapper", 0); + // Change the uses that existed before we added the copy to use the + // copy instead. + for (auto ui = Uses.begin(), ue = Uses.end(); ui != ue; ++ui) + **ui = NewCopy; + } + } + return Modified; +} + +/*********************************************************************** + * processValue : category conversion for one value + * + * Return: whether category successfully chosen + * + * This returns false only for a function argument or a phi node where all + * uses are in phi nodes which themselves do not have a category yet. 
+ */ +bool GenXCategory::processValue(Value *V) +{ + // Check for special cases. + // Ignore void. + if (V->getType()->isVoidTy()) + return true; + // Ignore i1 or vector of i1. Predicates do not use category + // conversion. + if (V->getType()->getScalarType()->isIntegerTy(1)) + return true; + // Elements of a struct always have default (general or predicate) category. + if (isa(V->getType())) + return true; + + auto DefInfo = getCategoryAndAlignmentForDef(V); + UsesCatInfo UsesInfo(*this, V); + + // more corner cases + if (UsesInfo.empty()) { + // Value not used: set its category and then ignore it. If the definition + // did not give us a category (probably an unused function arg), then + // arbitrarily make it general. + if (DefInfo.Cat == RegCategory::NONE) + Liveness->getOrCreateLiveRange(V, RegCategory::GENERAL, DefInfo.Align); + else + Liveness->getOrCreateLiveRange(V, DefInfo.Cat, DefInfo.Align); + return true; + } + else if (UsesInfo.allHaveCat(RegCategory::NONE)) + { + if (DefInfo.Cat == RegCategory::NONE) { + // The "no categories at all" case can only happen for a value that is + // defined by a function argument or a phi node and used only in phi + // nodes or subroutine call args. + assert((isa(V) || isa(V)) && "no register category"); + return false; + } + // Value defined with a category but only used in phi nodes. + Liveness->getOrCreateLiveRange(V, DefInfo.Cat, DefInfo.Align); + return true; + } + + // main case + if (DefInfo.Cat == RegCategory::NONE) { + // NONE means that we're free to choose the category + if (isa(V)) + // currently we'd like to propogate general through phi + DefInfo.Cat = RegCategory::GENERAL; + else + DefInfo.Cat = UsesInfo.getMostUsedCat(); + } + + Liveness->getOrCreateLiveRange(V, DefInfo.Cat, std::max(DefInfo.Align, UsesInfo.getMaxAlign())); + auto Convs = buildConversions(V, DefInfo, UsesInfo); + for (auto UseInfo : UsesInfo.getUses()) { + if (UseInfo.Cat != DefInfo.Cat && UseInfo.Cat != RegCategory::NONE) { + Instruction *Conv; + if (UseInfo.Cat == RegCategory::ADDRESS) { + // Case of address category requires a separate conversion for each use, at least until we + // get to GenXAddressCommoning where we decide whether we can common some of them up. + Conv = createConversion(V, UseInfo.Cat); + placeConvBeforeUse(Conv, UseInfo.user, UseInfo.OperandNum); + Liveness->getOrCreateLiveRange(Conv)->setCategory(UseInfo.Cat); + } + else + Conv = Convs[UseInfo.Cat]; + assert(Conv && "must have such conversion"); + UseInfo.user->setOperand(UseInfo.OperandNum, Conv); + } + } + // If V is now unused (which happens if it is a constant add and all its + // uses were addresses), then remove it. + if (V->use_empty()) + ToErase.push_back(cast(V)); + return true; +} + +/*********************************************************************** + * createConversion : create call to llvm.genx.convert intrinsic to represent + * register category conversion + * + * The new instruction is not inserted anywhere yet. + * + * In the case that we are asked to convert a use of an add or constant sub + * to an address, we instead create an llvm.genx.add.addr of the input + * to the add/sub. + */ +Instruction *GenXCategory::createConversion(Value *V, unsigned Cat) +{ + assert(V->getType()->getScalarType()->isIntegerTy() && "createConversion expects int type"); + if (Cat == RegCategory::ADDRESS) { + Value *Input = V; + int Offset = 0; + for (;;) { + // Check for use of add/sub that can be baled in to a region as a + // constant offset. 
This also handles a chain of two or more adds. + int ThisOffset; + if (!GenXBaling::getIndexAdd(Input, &ThisOffset) && + !GenXBaling::getIndexOr(Input, ThisOffset)) + break; + if (ThisOffset < G4_MIN_ADDR_IMM) + break; + Offset += ThisOffset; + Input = cast(Input)->getOperand(0); + } + if (Input != V) { + // Turn the add/sub into llvm.genx.add.addr. This could be out of range as + // a constant offset in an indirect operand at this stage; + // GenXAddressCommoning sorts that out by adjusting the constant offset in + // the llvm.genx.convert.addr. + return createAddAddr(Input, ConstantInt::get(V->getType(), Offset), + V->getName() + ".addradd", nullptr, Func->getParent()); + } + } + // Normal conversion. If the source is an integer creation intrinsic + // and this isn't an address conversion, use the operand for that + // intrinsic call directly rather than using the result of the intrinsic. + // This helps the jitter to generate better code when surface constants + // are used in send intructions. + if (Cat != RegCategory::ADDRESS) { + if (GenXIntrinsic::getGenXIntrinsicID(V) == GenXIntrinsic::genx_constanti) + V = cast(V)->getArgOperand(0); + return createConvert(V, V->getName() + ".categoryconv", nullptr, + Func->getParent()); + } + return createConvertAddr(V, 0, V->getName() + ".categoryconv", nullptr, + Func->getParent()); +} + +/*********************************************************************** + * Creates conversion instructions, places them in the function (next to the + * def) + * + * Returns an array of created conversion (cons[Category] holds + * instruction if we need conversion to such Category and nullptr otherwise). + * Doesn't produce address category conversion. + */ +GenXCategory::ConvListT +GenXCategory::buildConversions(Value *Def, CategoryAndAlignment DefInfo, + const UsesCatInfo &UsesInfo) { + ConvListT Convs = {nullptr}; + for (auto Cat : UsesInfo.getCategories()) { + // NONE doesn't require conversion, ADDRESS requirs conversion before + // every use (not after def, so we won't create it here) + if (Cat != DefInfo.Cat && Cat != RegCategory::NONE && + Cat != RegCategory::ADDRESS) { + auto Conv = createConversion(Def, Cat); + placeConvAfterDef(Func, Conv, Def); + Liveness->getOrCreateLiveRange(Conv)->setCategory(Cat); + Convs[Cat] = Conv; + } + } + return Convs; +} + +/*********************************************************************** + * intrinsicCategoryToRegCategory : convert intrinsic arg category to + * register category + * + * This converts a GenXIntrinsicInfo::* category, as returned by + * GenXIntrinsicInfo::ArgInfo::getCategory(), into a register category + * as stored in a live range. + */ +static unsigned intrinsicCategoryToRegCategory(unsigned ICat) +{ + switch (ICat) { + case GenXIntrinsicInfo::ADDRESS: + return RegCategory::ADDRESS; + case GenXIntrinsicInfo::PREDICATION: + case GenXIntrinsicInfo::PREDICATE: + return RegCategory::PREDICATE; + case GenXIntrinsicInfo::SAMPLER: + return RegCategory::SAMPLER; + case GenXIntrinsicInfo::SURFACE: + return RegCategory::SURFACE; + case GenXIntrinsicInfo::VME: + return RegCategory::VME; + default: + return RegCategory::GENERAL; + } +} + +/*********************************************************************** + * getCategoryAndAlignmentForDef : get register category and alignment for a def + * + * This returns RegCategory:: value, or RegCategory::NONE if no category + * is discernable. 
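The loop at the top of createConversion above folds a whole chain of constant adds on an index into a single offset, leaving one llvm.genx.add.addr over the remaining input. A standalone model of that accumulation (illustrative only; the real code uses GenXBaling::getIndexAdd/getIndexOr on the IR):

```cpp
#include <cassert>

// Model an index expression as "Base + Add", with Base == nullptr for a leaf.
struct IndexExpr { const IndexExpr *Base; int Add; };

// Peel constant adds off the index and accumulate them into one offset.
static const IndexExpr *peelConstantAdds(const IndexExpr *E, int &Offset) {
  while (E->Base) {
    Offset += E->Add;
    E = E->Base;
  }
  return E;
}

int main() {
  IndexExpr X{nullptr, 0}, A{&X, 8}, B{&A, 4}; // models (x + 8) + 4
  int Offset = 0;
  const IndexExpr *Leaf = peelConstantAdds(&B, Offset);
  assert(Leaf == &X && Offset == 12); // one add.addr of x with offset 12
  return 0;
}
```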
+ */ +CategoryAndAlignment GenXCategory::getCategoryAndAlignmentForDef(Value *V) const +{ + if (V->getType()->getScalarType()->getPrimitiveSizeInBits() == 1) + return RegCategory::PREDICATE; + if (Argument *Arg = dyn_cast(V)) { + // This is a function Argument. + if (!InFGHead) { + // It is an arg in a subroutine. Get the category from the corresponding + // arg at some call site. (We should not have disagreement among the + // call sites and the function arg, since whichever one gets a category + // first forces the category of all the others.) + return getCategoryForCallArg(Arg->getParent(), Arg->getArgNo()); + } + unsigned ArgNo = Arg->getArgNo(); + if (KM.getNumArgs() > ArgNo) { + // The function is a kernel, and has argument kind metadata for + // this argument. Determine the category from the kind. + return KM.getArgCategory(ArgNo); + } + // The function is not a kernel, or does not have the appropriate + // metadata. Set to no particular category, so the arg's uses will + // determine the category. This is the fallback for compatibility with + // hand coded LLVM IR from before this metadata was added. (If we only + // had to cope with non-kernel functions, we could just return GENERAL.) + return RegCategory::NONE; + } + // The def is a phi-instruction. + if (PHINode *Phi = dyn_cast(V)) { + // This is a phi node. Get the category from one of the incomings. (We + // should not have disagreement among the incomings, since whichever + // one gets a category first forces the category of all the others.) + return getCategoryForPhiIncomings(Phi); + } + // Multiple outputs of inline assembly instruction + // result in a structure and those elements are extracted + // with extractelement + if (ExtractValueInst *Extract = dyn_cast(V)) { + auto CI = dyn_cast(Extract->getAggregateOperand()); + if (CI && CI->isInlineAsm()) + return getCategoryForInlasmConstraintedOp(CI, Extract->getIndices()[0], + true /*IsOutput*/); + } + // The def is a call-inst + if (CallInst *CI = dyn_cast(V)) { + if (Function *Callee = CI->getCalledFunction()) { + unsigned IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(Callee); + // We should not see genx_convert, as it is inserted into a value after + // using this function to determine its category. + assert(IntrinsicID != GenXIntrinsic::genx_convert); + if (IntrinsicID == GenXIntrinsic::genx_convert_addr) + return RegCategory::ADDRESS; + if (GenXIntrinsic::isAnyNonTrivialIntrinsic(IntrinsicID) && !GenXIntrinsic::isRdRegion(IntrinsicID) + && !GenXIntrinsic::isWrRegion(IntrinsicID) && !GenXIntrinsic::isAbs(IntrinsicID)) { + // For any normal intrinsic, look up the argument class. + GenXIntrinsicInfo II(IntrinsicID); + auto AI = II.getRetInfo(); + return CategoryAndAlignment( + intrinsicCategoryToRegCategory(AI.getCategory()), + AI.getLogAlignment()); + } else if (GenXIntrinsic::isRdRegion(IntrinsicID)) { + // Add this to avoid conversion in case of read-region on SurfaceIndex + // or SamplerIndex type + auto RC = getCategoryAndAlignmentForDef( + CI->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + if (RC.Cat == RegCategory::SURFACE || + RC.Cat == RegCategory::SAMPLER) + return RC.Cat; + } + } else if (CI->isInlineAsm()) { + return getCategoryForInlasmConstraintedOp(CI, 0, true /*IsOutput*/); + } + } + return RegCategory::GENERAL; +} + +/*********************************************************************** + * getCategoryForInlasmConstraintedOp : get register category for a + * operand of inline assembly (both for + * output and for input). 
Category of + * operand depends on its constraint. + * + */ +unsigned GenXCategory::getCategoryForInlasmConstraintedOp(CallInst *CI, + unsigned ArgNo, + bool IsOutput) const { + assert(CI->isInlineAsm() && "Inline asm expected"); + InlineAsm *IA = dyn_cast(CI->getCalledValue()); + assert(!IA->getConstraintString().empty() && "Here should be constraints"); + + auto ConstraintsInfo = genx::getGenXInlineAsmInfo(CI); + + if (!IsOutput) + ArgNo += genx::getInlineAsmNumOutputs(CI); + auto Info = ConstraintsInfo[ArgNo]; + + switch (Info.getConstraintType()) { + default: + llvm_unreachable("unreachable while setting category in constraints"); + case ConstraintType::Constraint_a: + case ConstraintType::Constraint_rw: + case ConstraintType::Constraint_r: + return RegCategory::GENERAL; + case ConstraintType::Constraint_n: + case ConstraintType::Constraint_i: + case ConstraintType::Constraint_F: + return RegCategory::NONE; + case ConstraintType::Constraint_cr: + return RegCategory::PREDICATE; + } +} + +/*********************************************************************** + * getCategoryAndAlignmentForUse : get register category for a use + * + * This returns RegCategory:: value, or RegCategory::NONE if no category + * is discernable. + */ +CategoryAndAlignment GenXCategory::getCategoryAndAlignmentForUse( + Value::use_iterator U) const +{ + Value *V = U->get(); + if (V->getType()->getScalarType()->isIntegerTy(1)) + return RegCategory::PREDICATE; + auto user = cast(U->getUser()); + if (PHINode *Phi = dyn_cast(user)) { + // This is a phi node. Get the category (if any) from the result, or from + // one of the incomings. (We should not have disagreement among the + // incomings, since whichever one gets a category first forces the category + // of all the others.) + if (auto LR = Liveness->getLiveRangeOrNull(Phi)) { + auto Cat = LR->getCategory(); + if (Cat != RegCategory::NONE) + return Cat; + } + return getCategoryForPhiIncomings(Phi); + } + unsigned Category = RegCategory::GENERAL; + if (IGCLLVM::CallInst *CI = dyn_cast(user)) { + if (CI->isInlineAsm()) + Category = getCategoryForInlasmConstraintedOp(CI, U->getOperandNo(), + false /*IsOutput*/); + else if (CI->isIndirectCall()) + Category = RegCategory::GENERAL; + else { + Function *Callee = CI->getCalledFunction(); + unsigned IntrinID = GenXIntrinsic::not_any_intrinsic; + if (Callee) + IntrinID = GenXIntrinsic::getAnyIntrinsicID(Callee); + // We should not see genx_convert, as it is inserted into a value after + // using this function to determine its category. + assert(IntrinID != GenXIntrinsic::genx_convert); + // For a read or write region or element intrisic, where the use we have + // is the address, mark as needing an address register. + switch (IntrinID) { + case GenXIntrinsic::not_any_intrinsic: + // Arg in subroutine call. Get the category from the function arg, + // or the arg at another call site. (We should not have disagreement + // among the call sites and the function arg, since whichever one + // gets a category first forces the category of all the others.) 
+ Category = getCategoryForCallArg(Callee, U->getOperandNo()); + break; + case GenXIntrinsic::genx_convert_addr: + Category = RegCategory::GENERAL; + break; + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + if (U->getOperandNo() == 4) // is addr-operand + Category = RegCategory::ADDRESS; + else if (GenXIntrinsic::GenXRegion::OldValueOperandNum == U->getOperandNo()) + Category = RegCategory::NONE; // do not assign use-category + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + if (U->getOperandNo() == 5) // is addr-operand + Category = RegCategory::ADDRESS; + break; + case GenXIntrinsic::genx_absf: + case GenXIntrinsic::genx_absi: + case GenXIntrinsic::genx_output: + break; + default: { + // For any other intrinsic, look up the argument class. + GenXIntrinsicInfo II(IntrinID); + auto AI = II.getArgInfo(U->getOperandNo()); + return CategoryAndAlignment( + intrinsicCategoryToRegCategory(AI.getCategory()), + AI.getLogAlignment()); + } + break; + } + } + } + return Category; +} + +/*********************************************************************** + * getCategoryForPhiIncomings : get register category from phi incomings + * + * Return: register category from a non-const incoming with a known category + * else NONE if at least one incoming is non-constant + * else GENERAL + * + * We will not have disagreement among the incomings, since whichever one gets + * a category first forces the category of all the others. + */ +unsigned GenXCategory::getCategoryForPhiIncomings(PHINode *Phi) const +{ + bool AllConst = true; + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = Phi->getIncomingValue(i); + if (!isa(Incoming)) { + AllConst = false; + if (auto LR = Liveness->getLiveRangeOrNull(Incoming)) { + unsigned Cat = LR->getCategory(); + if (Cat != RegCategory::NONE) + return Cat; + } + } + } + if (AllConst) { + // All incomings are constant. Arbitrarily make the phi node value + // general category. + return RegCategory::GENERAL; + } + // No incoming has a category yet. + return RegCategory::NONE; +} + +/*********************************************************************** + * getCategoryForCallArg : get register category from subroutine arg or + * the corresponding arg at some call site + * + * Enter: Callee = function being called + * ArgNo = argument number + * + * Return: register category from subroutine arg or a call arg with a + * known category, else NONE if no category found + * + * We will not have disagreement among the subroutine arg and its corresponding + * call args, since whichever one gets a category first forces the category of + * all the others. + */ +unsigned GenXCategory::getCategoryForCallArg(Function *Callee, unsigned ArgNo) const +{ + assert(Callee); + // First try the subroutine arg. + auto ai = Callee->arg_begin(); + for (unsigned i = 0; i != ArgNo; ++i, ++ai) + ; + if (auto LR = Liveness->getLiveRangeOrNull(&*ai)) { + unsigned Cat = LR->getCategory(); + if (Cat != RegCategory::NONE) + return Cat; + } + // Then try the arg at each call site. 
+ bool UseUndef = true;
+ for (auto ui = Callee->use_begin(), ue = Callee->use_end(); ui != ue; ++ui) {
+ if (auto CI = dyn_cast<CallInst>(ui->getUser())) {
+ auto ArgV = CI->getArgOperand(ArgNo);
+ if (!isa<UndefValue>(ArgV)) {
+ UseUndef = false;
+ if (auto LR = Liveness->getLiveRangeOrNull(ArgV)) {
+ unsigned Cat = LR->getCategory();
+ if (Cat != RegCategory::NONE)
+ return Cat;
+ }
+ }
+ }
+ }
+ // special case handling to break deadlock when all uses are undef,
+ // force the argument to be GENERAL
+ return(UseUndef ? RegCategory::GENERAL : RegCategory::NONE);
+}
+
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXCisaBuilder.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCisaBuilder.cpp
new file mode 100644
index 000000000000..dc48d296615a
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCisaBuilder.cpp
@@ -0,0 +1,5779 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+/// GenXCisaBuilder
+/// ---------------
+///
+/// This file contains two passes: GenXCisaBuilder and GenXFinalizer.
+///
+/// 1. GenXCisaBuilder transforms LLVM IR to CISA IR via the Finalizer's public API.
+/// It is a FunctionGroupPass, so it runs once for each kernel and creates
+/// CISA IR for it and all its subroutines.
+/// Real building of kernels is performed by the GenXKernelBuilder class.
+/// This split is necessary because the GenXCisaBuilder object lives across
+/// all function groups, but the kernel-specific building data does not need
+/// to be kept alive that long.
+///
+/// 2. GenXFinalizer is a module pass, so it runs once; all it does is run the
+/// Finalizer on the kernels created by the GenXCisaBuilder pass.
+///
+//===----------------------------------------------------------------------===//
+
+#include "GenXGotoJoin.h"
+#include "GenXIntrinsics.h"
+#include "GenXOCLRuntimeInfo.h"
+#include "GenXPressureTracker.h"
+#include "GenXRegion.h"
+#include "GenXUtil.h"
+#include "GenXVisaRegAlloc.h"
+#include "common.h"
+#include "vc/GenXOpts/Utils/KernelInfo.h"
+#include "visaBuilder_interface.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/GenXIntrinsics/GenXIntrinsicInst.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/ScopedPrinter.h"
+
+#include "llvmWrapper/IR/InstrTypes.h"
+
+#include
+#include
+#include
+
+// If 1, print VISA instructions after the corresponding LLVM instruction.
+// Only for debug purposes, uses Finalizer internal API.
+#define DUMP_VISA_INTSTRUCTIONS 0
+
+#if DUMP_VISA_INTSTRUCTIONS
+#include "Common_ISA_framework.h"
+#include "IsaDisassembly.h"
+#include "Mem_Manager.h"
+#include "VISAKernel.h"
+#endif
+
+#ifndef COMMON_ISA_MAX_FILENAME_LENGTH
+#define COMMON_ISA_MAX_FILENAME_LENGTH 1023
+#endif
+
+using namespace llvm;
+using namespace genx;
+
+#define DEBUG_TYPE "GENX_CISA_BUILDER"
+
+static cl::opt<bool> EmitVisa("emit-visa", cl::init(false), cl::Hidden,
+                              cl::desc("Generate Visa instead of fat binary."));
+static cl::list<std::string>
+    FinalizerOpts("finalizer-opts", cl::Hidden, cl::ZeroOrMore,
+                  cl::desc("Additional options for finalizer."));
+
+static cl::opt<std::string> AsmNameOpt(
+    "asm-name", cl::init(""), cl::Hidden,
+    cl::desc("Output assembly code to this file during compilation."));
+
+static cl::opt<bool> ReverseKernels(
+    "reverse-kernels", cl::init(false), cl::Hidden,
+    cl::desc("Emit the kernel asm name in reversed order (if a user asm name is present)."));
+
+static cl::opt<bool>
+    PrintFinalizerOptions("cg-print-finalizer-args", cl::init(false), cl::Hidden,
+                          cl::desc("Prints options used to invoke finalizer"));
+
+enum {
+  BYTES_PER_OWORD = 16,
+  BYTES_PER_FADDR = 8,
+  // stackcall ABI related constants
+  ARG_SIZE_IN_GRFS = 32,
+  RET_SIZE_IN_GRFS = 12,
+  STACK_PER_THREAD = 256
+};
+
+/// For VISA_PREDICATE_CONTROL & VISA_PREDICATE_STATE
+template <typename T> T &operator^=(T &a, T b) {
+  using _T = typename std::underlying_type<T>::type;
+  static_assert(std::is_integral<_T>::value,
+                "Wrong operation for non-integral type");
+  a = static_cast<T>(static_cast<_T>(a) ^ static_cast<_T>(b));
+  return a;
+}
+
+template <typename T> T operator|=(T &a, T b) {
+  using _T = typename std::underlying_type<T>::type;
+  static_assert(std::is_integral<_T>::value,
+                "Wrong operation for non-integral type");
+  a = static_cast<T>(static_cast<_T>(a) | static_cast<_T>(b));
+  return a;
+}
+
+struct DstOpndDesc {
+  Instruction *WrRegion = nullptr;
+  Instruction *GStore = nullptr;
+  genx::BaleInfo WrRegionBI;
+};
+
+namespace {
+
+// Diagnostic information for errors/warnings in the GEN IR building passes.
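+// A diagnostic of this kind is raised through the LLVMContext, e.g.
+// (illustrative use, mirroring handleCisaCallError further below):
+//
+//   DiagnosticInfoCisaBuild Err("some failure description", DS_Error);
+//   Ctx.diagnose(Err);
+//
+// The context's installed diagnostic handler then decides how to report it.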
+class DiagnosticInfoCisaBuild : public DiagnosticInfo { +private: + std::string Description; + static int KindID; + + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticInfoCisaBuild(const Twine &Desc, DiagnosticSeverity Severity) + : DiagnosticInfo(getKindID(), Severity) { + Description = (Twine("GENX IR generation error: ") + Desc).str(); + } + + void print(DiagnosticPrinter &DP) const override { DP << Description; } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; +int DiagnosticInfoCisaBuild::KindID = 0; + + +static VISA_Exec_Size getExecSizeFromValue(unsigned int Size) { + int Res = genx::log2(Size); + assert(std::bitset(Size).count() <= 1); + assert(Res <= 5 && + "illegal common ISA execsize (should be 1, 2, 4, 8, 16, 32)."); + return Res == -1 ? EXEC_SIZE_ILLEGAL : (VISA_Exec_Size)Res; +} + +static VISA_Oword_Num getCisaOwordNumFromNumber(unsigned num) { + switch (num) { + case 1: + return OWORD_NUM_1; + case 2: + return OWORD_NUM_2; + case 4: + return OWORD_NUM_4; + case 8: + return OWORD_NUM_8; + case 16: + return OWORD_NUM_16; + default: + MUST_BE_TRUE(false, "illegal Oword number."); + return OWORD_NUM_ILLEGAL; + } +} + +VISAChannelMask convertChannelMaskToVisaType(unsigned Mask) { + switch (Mask & 0xf) { + case 1: + return CHANNEL_MASK_R; + case 2: + return CHANNEL_MASK_G; + case 3: + return CHANNEL_MASK_RG; + case 4: + return CHANNEL_MASK_B; + case 5: + return CHANNEL_MASK_RB; + case 6: + return CHANNEL_MASK_GB; + case 7: + return CHANNEL_MASK_RGB; + case 8: + return CHANNEL_MASK_A; + case 9: + return CHANNEL_MASK_RA; + case 10: + return CHANNEL_MASK_GA; + case 11: + return CHANNEL_MASK_RGA; + case 12: + return CHANNEL_MASK_BA; + case 13: + return CHANNEL_MASK_RBA; + case 14: + return CHANNEL_MASK_GBA; + case 15: + return CHANNEL_MASK_RGBA; + default: + llvm_unreachable("Wrong mask"); + } +} + +CHANNEL_OUTPUT_FORMAT getChannelOutputFormat(uint8_t ChannelOutput) { + return (CHANNEL_OUTPUT_FORMAT)((ChannelOutput >> 4) & 0x3); +} + +std::string cutString(std::string Str) { + // vISA is limited to 64 byte strings. But old fe-compiler seems to ignore + // that for source filenames. 
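+  // E.g. an 80-character kernel name is cut down to its first 64 characters
+  // here (illustrative; the erase below drops everything from index 64 on).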
+ if (Str.size() > 64) + Str.erase(64); + return Str; +} + +void handleCisaCallError(int CallResult, const Twine &Call, LLVMContext &Ctx) { + StringRef ErrorType; + switch (CallResult) { + case VISA_SPILL: + ErrorType = "register allocation for a kernel failed, even with spill code"; + break; + case VISA_FAILURE: + ErrorType = "general failure"; + break; + default: + ErrorType = "unknown error"; + break; + } +#ifndef NDEBUG + DiagnosticInfoCisaBuild Err( + "VISA builder API call failed (" + Call + "): " + ErrorType, DS_Error); +#else + DiagnosticInfoCisaBuild Err("VISA builder API call failed: " + ErrorType, + DS_Error); +#endif + Ctx.diagnose(Err); +} + +} // namespace + +#define CISA_CALL(c) \ + do { \ + auto result = c; \ + if (result != VISA_SUCCESS) { \ + handleCisaCallError(result, #c, getContext()); \ + } \ + } while (0); + +namespace llvm { + +static VISA_Type getVisaTypeFromBytesNumber(unsigned BytesNum, bool IsFloat, + genx::Signedness Sign) { + VISA_Type aliasType; + if (IsFloat) { + switch (BytesNum) { + case 2: + aliasType = ISA_TYPE_HF; + break; + case 4: + aliasType = ISA_TYPE_F; + break; + case 8: + aliasType = ISA_TYPE_DF; + break; + default: + report_fatal_error("unknown float type"); + break; + } + } else { + switch (BytesNum) { + case 1: + aliasType = (Sign == SIGNED) ? ISA_TYPE_B : ISA_TYPE_UB; + break; + case 2: + aliasType = (Sign == SIGNED) ? ISA_TYPE_W : ISA_TYPE_UW; + break; + case 4: + aliasType = (Sign == SIGNED) ? ISA_TYPE_D : ISA_TYPE_UD; + break; + case 8: + aliasType = (Sign == SIGNED) ? ISA_TYPE_Q : ISA_TYPE_UQ; + break; + default: + report_fatal_error("unknown integer type"); + break; + } + } + return aliasType; +} + +static VISA_Type llvmToVisaType(Type *Type, + genx::Signedness Sign = DONTCARESIGNED) { + auto T = Type; + assert(!T->isAggregateType()); + VISA_Type Result = ISA_TYPE_NUM; + if (T->isVectorTy() && T->getVectorElementType()->isIntegerTy(1)) { + switch (Type->getVectorNumElements()) { + case 8: + Result = (Sign == SIGNED) ? ISA_TYPE_B : ISA_TYPE_UB; + break; + case 16: + Result = (Sign == SIGNED) ? ISA_TYPE_W : ISA_TYPE_UW; + break; + case 32: + Result = (Sign == SIGNED) ? ISA_TYPE_D : ISA_TYPE_UD; + break; + default: + report_fatal_error("only 8xi1 and 32xi1 are currently supported"); + break; + } + } else { + if (T->isVectorTy()) + T = T->getVectorElementType(); + if (T->isPointerTy() && T->getPointerElementType()->isFunctionTy()) { + // we might have used DL to get the type size but that'd + // overcomplicate this function's type unnecessarily + Result = getVisaTypeFromBytesNumber(BYTES_PER_FADDR, false, DONTCARESIGNED); + } else { + assert(T->isFloatingPointTy() || T->isIntegerTy()); + Result = getVisaTypeFromBytesNumber(T->getScalarSizeInBits() / CHAR_BIT, + T->isFloatingPointTy(), Sign); + } + } + assert(Result != ISA_TYPE_NUM); + return Result; +} + +static VISA_Type llvmToVisaType(Value *V, + genx::Signedness Sign = DONTCARESIGNED) { + return llvmToVisaType(V->getType(), Sign); +} + +// Due to the lack of access to VISA_GenVar internal interfaces (concerning type, size, etc) +// some local DS are required to store such info: CisaVariable and GenericCisaVariable. + +//===----------------------------------------------------------------------===// +// CisaVariable +// ------------------ +// +// CisaVariable keeps VISA_GenVar of a specific VISA_Type and provides accessors +// to its byte size and number of elements thus emulating some internal vISA machinery. 
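+// For example (illustrative numbers): a 64-byte variable aliased as ISA_TYPE_F
+// (4-byte elements) reports getNumElements() == 16, while the same storage
+// aliased as ISA_TYPE_W (2-byte elements) would report 32.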
+// +//===----------------------------------------------------------------------===// +class CisaVariable { + VISA_Type Type; + unsigned ByteSize = 0; + VISA_GenVar *VisaVar = nullptr; + +public: + CisaVariable(VISA_Type T, unsigned BS, VISA_GenVar *V) + : Type(T), ByteSize(BS), VisaVar(V) {} + + VISA_Type getType() const { return Type; } + + VISA_GenVar *getGenVar() { return VisaVar; } + + unsigned getByteSize() const { return ByteSize; } + + unsigned getNumElements() const { + assert(!(ByteSize % CISATypeTable[Type].typeSize)); + return ByteSize / CISATypeTable[Type].typeSize; + } +}; + +//===----------------------------------------------------------------------===// +// GenericCisaVariable +// ------------------ +// +// GenericCisaVariable describes vISA value that isn't intended to have matching llvm::Value +// (e.g. stack regs %arg and %retv). It provides interface to get a VisaVar alias with a specific +// vISA type. +// +//===----------------------------------------------------------------------===// +class GenericCisaVariable { + const char *Name = ""; + VISA_GenVar *VisaVar = nullptr; + unsigned ByteSize = 0; + + IndexedMap AliasDecls; + std::list Storage; + + unsigned getNumElements(VISA_Type T) const { + assert(!(ByteSize % CISATypeTable[T].typeSize)); + return ByteSize / CISATypeTable[T].typeSize; + } + +public: + GenericCisaVariable(const char *Nm, VISA_GenVar *V, unsigned BS) + : Name(Nm), VisaVar(V), ByteSize(BS) { + AliasDecls.grow(ISA_TYPE_NUM); + } + + CisaVariable *getAlias(Value *V, VISAKernel *K) { + return getAlias(llvmToVisaType(V), K); + } + + CisaVariable *getAlias(VISA_Type T, VISAKernel *K) { + if (!AliasDecls[T]) { + VISA_GenVar *VV = nullptr; + K->CreateVISAGenVar(VV, Name, getNumElements(T), T, ALIGN_GRF, VisaVar); + Storage.push_back(CisaVariable(T, ByteSize, VV)); + AliasDecls[T] = &Storage.back(); + } + return AliasDecls[T]; + } + + unsigned getByteSize() const { return ByteSize; } +}; + +//===----------------------------------------------------------------------===// +/// GenXCisaBuilder +/// ------------------ +/// +/// This class encapsulates a creation of vISA kernels. +/// It is a FunctionGroupPass, thus it runs once for each kernel and +/// builds vISA kernel via class GenXKernelBuilder. +/// All created kernels are stored in CISA Builder object which is provided +/// by finalizer. +/// +//===----------------------------------------------------------------------===// +class GenXCisaBuilder : public FunctionGroupPass { + LLVMContext *Ctx = nullptr; + +public: + static char ID; + explicit GenXCisaBuilder() : FunctionGroupPass(ID) {} + + virtual StringRef getPassName() const { + return "GenX CISA construction pass"; + } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunctionGroup(FunctionGroup &FG); + + LLVMContext &getContext() { + assert(Ctx); + return *Ctx; + } +}; + +void initializeGenXCisaBuilderPass(PassRegistry &); + +//===----------------------------------------------------------------------===// +/// GenXKernelBuilder +/// ------------------ +/// +/// This class does all the work for creation of vISA kernels. 
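+///
+/// As an overview sketch (this mirrors what GenXCisaBuilder::runOnFunctionGroup
+/// below actually does; the member names are the ones declared in this class):
+///
+///   GenXKernelBuilder KB(&FG);   // constructor collects kernel info
+///   KB.RegAlloc = ...;           // the pass injects its analyses
+///   KB.Liveness = ...;
+///   std::string KernelName;
+///   KB.run(KernelName);          // declare the vISA kernel, then build
+///                                // variables, inputs and instructions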
+/// +//===----------------------------------------------------------------------===// +class GenXKernelBuilder { + using Register = GenXVisaRegAlloc::Reg; + + VISAKernel *MainKernel = nullptr; + VISAFunction *Kernel = nullptr; + genx::KernelMetadata TheKernelMetadata; + LLVMContext &Ctx; + const DataLayout &DL; + + std::map Func2Kern; + + std::map StringPool; + std::vector Labels; + std::map LabelMap; + + // loop info for each function + std::map *> Loops; + ValueMap IsInLoopCache; + + bool HasBarrier = false; + bool HasCallable = false; + bool HasStackcalls = false; + bool HasAlloca = false; + // GRF width in unit of byte + unsigned GrfByteSize = 32; + + int LastLabel = 0; + unsigned LastLine = 0; + unsigned PendingLine = 0; + StringRef LastFilename; + StringRef PendingFilename; + StringRef LastDirectory; + StringRef PendingDirectory; + + // function currently being written during constructor + Function *Func = nullptr; + // function corresponding to VISAKernel currently being written + Function *KernFunc = nullptr; + PreDefined_Surface StackSurf; + + std::map FPMap; + SmallVector RetvInserts; + + std::map> CisaVars; + + // The default float control from kernel attribute. Each subroutine may + // overrride this control mask, but it should revert back to the default float + // control mask before exiting from the subroutine. + uint32_t DefaultFloatControl = 0; + + static const uint32_t CR_Mask = 0x1 << 10 | 0x3 << 6 | 0x3 << 4 | 0x1; + + // normally false, set to true if there is any SIMD CF in the func or this is + // (indirectly) called inside any SIMD CF. + bool NoMask = false; + + genx::AlignmentInfo AI; + +public: + FunctionGroup *FG = nullptr; + GenXLiveness *Liveness = nullptr; + GenXNumbering *Numbering = nullptr; + GenXVisaRegAlloc *RegAlloc = nullptr; + FunctionGroupAnalysis *FGA = nullptr; + GenXModule *GM = nullptr; + DominatorTreeGroupWrapperPass *DTs = nullptr; + const GenXSubtarget *Subtarget = nullptr; + GenXBaling *Baling = nullptr; + VISABuilder *CisaBuilder = nullptr; + +private: + void collectKernelInfo(); + void buildVariables(); + void buildInstructions(); + + bool buildInstruction(Instruction *Inst); + bool buildMainInst(Instruction *Inst, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildControlRegUpdate(unsigned Mask, bool Clear); + void buildJoin(CallInst *Join, BranchInst *Branch); + bool buildBranch(BranchInst *Branch); + void buildIntrinsic(CallInst *CI, unsigned IntrinID, genx::BaleInfo BI, + unsigned Mod, const DstOpndDesc &DstDesc); + void buildInputs(Function *F, bool NeedRetIP); + + void buildFunctionAddr(Instruction *Inst, const DstOpndDesc &DstDesc); + void buildLoneWrRegion(const DstOpndDesc &Desc); + void buildLoneWrPredRegion(Instruction *Inst, genx::BaleInfo BI); + void buildLoneOperand(Instruction *Inst, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + + VISA_PredVar *getPredicateVar(Register *Idx); + VISA_PredVar *getPredicateVar(Value *V); + VISA_PredVar *getZeroedPredicateVar(Value *V); + VISA_EMask_Ctrl getExecMaskFromWrPredRegion(Instruction *WrPredRegion, + bool IsNoMask); + VISA_EMask_Ctrl getExecMaskFromWrRegion(const DstOpndDesc &DstDesc, + bool IsNoMask = false); + unsigned getOrCreateLabel(Value *V, int Kind); + int getLabel(Value *V); + void setLabel(Value *V, unsigned Num); + + void emitOptimizationHints(); + + LoopInfoBase *getLoops(Function *F); + Value *getPredicateOperand(Instruction *Inst, unsigned OperandNum, + genx::BaleInfo BI, VISA_PREDICATE_CONTROL &Control, + VISA_PREDICATE_STATE 
&PredField, + VISA_EMask_Ctrl *MaskCtrl); + bool isInLoop(BasicBlock *BB); + + void addLabelInst(Value *BB); + void buildPhiNode(PHINode *Phi); + void buildGoto(CallInst *Goto, BranchInst *Branch); + void buildCall(IGCLLVM::CallInst *CI, const DstOpndDesc &DstDesc); + void buildStackCall(IGCLLVM::CallInst *CI, const DstOpndDesc &DstDesc); + void buildInlineAsm(CallInst *CI); + void buildPrintIndex(CallInst *CI, unsigned IntrinID, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildSelectInst(SelectInst *SI, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildBinaryOperator(BinaryOperator *BO, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); +#if (LLVM_VERSION_MAJOR > 8) + void buildUnaryOperator(UnaryOperator *UO, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); +#endif + void buildBoolBinaryOperator(BinaryOperator *BO); + void buildSymbolInst(PtrToIntInst *ptr2Int, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildCastInst(CastInst *CI, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildConvertAddr(CallInst *CI, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildAlloca(CallInst *CI, unsigned IntrinID, unsigned Mod, + const DstOpndDesc &DstDesc); + void addWriteRegionLifetimeStartInst(Instruction *WrRegion); + void addLifetimeStartInst(Instruction *Inst); + void AddGenVar(Register &Reg); + void buildRet(ReturnInst *RI); + void buildBitCast(CastInst *CI, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + void buildCmp(CmpInst *Cmp, genx::BaleInfo BI, const DstOpndDesc &DstDesc); + void buildExtractRetv(ExtractValueInst *Inst); + void buildInsertRetv(InsertValueInst *Inst); + + VISA_VectorOpnd *createState(Register *Reg, unsigned Offset, bool IsDst); + VISA_Type getVISAImmTy(uint8_t ImmTy); + + VISA_PredOpnd *createPredOperand(VISA_PredVar *PredVar, + VISA_PREDICATE_STATE State, + VISA_PREDICATE_CONTROL Control); + + VISA_VectorOpnd *createCisaSrcOperand(VISA_GenVar *Decl, VISA_Modifier Mod, + unsigned VStride, unsigned Width, + unsigned HStride, unsigned ROffset, + unsigned COffset); + + VISA_VectorOpnd *createCisaDstOperand(VISA_GenVar *Decl, unsigned HStride, + unsigned ROffset, unsigned COffset); + + VISA_VectorOpnd *createDestination(Value *Dest, genx::Signedness Signed, + unsigned Mod, const DstOpndDesc &DstDesc, + genx::Signedness *SignedRes = nullptr, + unsigned *Offset = nullptr); + VISA_VectorOpnd *createDestination(CisaVariable *Dest, + genx::Signedness Signed, + unsigned *Offset = nullptr); + VISA_VectorOpnd *createDestination(Value *Dest, + genx::Signedness Signed, + unsigned *Offset = nullptr); + VISA_VectorOpnd *createSourceOperand(Instruction *Inst, + genx::Signedness Signed, + unsigned OperandNum, genx::BaleInfo BI, + unsigned Mod = 0, + genx::Signedness *SignedRes = nullptr, + unsigned MaxWidth = 16); + VISA_VectorOpnd *createSource(CisaVariable *V, genx::Signedness Signed, + unsigned MaxWidth = 16, + unsigned *Offset = nullptr); + VISA_VectorOpnd *createSource(Value *V, genx::Signedness Signed, bool Baled, + unsigned Mod = 0, + genx::Signedness *SignedRes = nullptr, + unsigned MaxWidth = 16, + unsigned *Offset = nullptr); + VISA_VectorOpnd *createSource(Value *V, genx::Signedness Signed, + unsigned MaxWidth = 16, + unsigned *Offset = nullptr); + + std::string createInlineAsmOperand(Register *Reg, genx::Region *R, bool IsDst, + genx::Signedness Signed, + genx::ConstraintType Ty, unsigned Mod); + + std::string createInlineAsmSourceOperand(Value 
*V, genx::Signedness Signed, + bool Baled, genx::ConstraintType Ty, + unsigned Mod = 0, + unsigned MaxWidth = 16); + + std::string createInlineAsmDestinationOperand(Value *Dest, + genx::Signedness Signed, + genx::ConstraintType Ty, + unsigned Mod, + const DstOpndDesc &DstDesc); + + VISA_VectorOpnd *createImmediateOperand(Constant *V, genx::Signedness Signed); + + VISA_PredVar *createPredicateDeclFromSelect(Instruction *SI, + genx::BaleInfo BI, + VISA_PREDICATE_CONTROL &Control, + VISA_PREDICATE_STATE &PredField, + VISA_EMask_Ctrl *MaskCtrl); + + VISA_RawOpnd *createRawSourceOperand(Instruction *Inst, unsigned OperandNum, + genx::BaleInfo BI, + genx::Signedness Signed); + VISA_RawOpnd *createRawDestination(Value *V, const DstOpndDesc &DstDesc, + genx::Signedness Signed); + + VISA_VectorOpnd *createAddressOperand(Value *V, bool IsDst); + + void addDebugInfo(); + + void deduceRegion(Region *R, bool IsDest, unsigned MaxWidth = 16); + + VISA_VectorOpnd *createGeneralOperand(genx::Region *R, VISA_GenVar *Decl, + genx::Signedness Signed, unsigned Mod, + bool IsDest, unsigned MaxWidth = 16); + VISA_VectorOpnd *createIndirectOperand(genx::Region *R, + genx::Signedness Signed, unsigned Mod, + bool IsDest, unsigned MaxWidth = 16); + VISA_VectorOpnd *createRegionOperand(genx::Region *R, VISA_GenVar *Decl, + genx::Signedness Signed, unsigned Mod, + bool IsDest, unsigned MaxWidth = 16); + VISA_PredOpnd *createPredFromWrRegion(const DstOpndDesc &DstDesc); + + VISA_PredOpnd *createPred(Instruction *Inst, genx::BaleInfo BI, + unsigned OperandNum); + + Instruction *getOriginalInstructionForSource(Instruction *CI, + genx::BaleInfo BI); + void buildConvert(CallInst *CI, genx::BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc); + std::string buildAsmName() const; + void beginFunction(Function *Func); + void endFunction(Function *Func, ReturnInst *RI); + + unsigned getFuncArgsSize(Function *F); + unsigned getValueSize(Type *T, unsigned Mod = 32) const; + unsigned getValueSize(CisaVariable *V) const { + return V->getByteSize(); + } + unsigned getValueSize(Value *V, unsigned Mod = 32) const { + return getValueSize(V->getType(), Mod); + } + GenericCisaVariable *createCisaVariable(VISAKernel *Kernel, const char *Name, + VISA_GenVar *AliasVar, unsigned ByteSize); + + template + void emitVectorCopy( + T1 *Dst, T2 *Src, unsigned &RowOff, unsigned &ColOff, unsigned &SrcRowOff, + unsigned &SrcColOff, int TotalSize, bool DoCopy = true); + + void pushStackArg(VISA_StateOpndHandle *Dst, Value *Src, int TotalSz, + unsigned &RowOff, unsigned &ColOff, unsigned &SrcRowOff, + unsigned &SrcColOff, bool DoCopy = true); + void popStackArg(Value *Dst, VISA_StateOpndHandle *Src, int TotalSz, + unsigned &RowOff, unsigned &ColOff, unsigned &SrcRowOff, + unsigned &SrcColOff, int &PrevStackOff); + +public: + GenXKernelBuilder(FunctionGroup *FG) + : TheKernelMetadata(FG->getHead()), Ctx(FG->getContext()), + DL(FG->getModule()->getDataLayout()), FG(FG) { + collectKernelInfo(); + } + ~GenXKernelBuilder() { clearLoops(); } + void clearLoops() { + for (auto i = Loops.begin(), e = Loops.end(); i != e; ++i) { + delete i->second; + i->second = nullptr; + } + Loops.clear(); + } + + bool run(std::string &KernelNameBuf); + + LLVMContext &getContext() { return Ctx; } + + unsigned addStringToPool(StringRef Str); + StringRef getStringByIndex(unsigned Val); +}; + +} // end namespace llvm + +char GenXCisaBuilder::ID = 0; +INITIALIZE_PASS_BEGIN(GenXCisaBuilder, "GenXCisaBuilderPass", + "GenXCisaBuilderPass", false, false) 
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXVisaRegAlloc) +INITIALIZE_PASS_DEPENDENCY(GenXModule) +INITIALIZE_PASS_END(GenXCisaBuilder, "GenXCisaBuilderPass", + "GenXCisaBuilderPass", false, false) + +FunctionGroupPass *llvm::createGenXCisaBuilderPass() { + initializeGenXCisaBuilderPass(*PassRegistry::getPassRegistry()); + return new GenXCisaBuilder(); +} + +void GenXCisaBuilder::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); +} + +bool GenXCisaBuilder::runOnFunctionGroup(FunctionGroup &FG) { + Ctx = &FG.getContext(); + std::unique_ptr KernelBuilder(new GenXKernelBuilder(&FG)); + KernelBuilder->FGA = getAnalysisIfAvailable(); + KernelBuilder->GM = getAnalysisIfAvailable(); + KernelBuilder->CisaBuilder = KernelBuilder->GM->GetCisaBuilder(); + KernelBuilder->RegAlloc = getAnalysisIfAvailable(); + KernelBuilder->Baling = &getAnalysis(); + KernelBuilder->DTs = &getAnalysis(); + KernelBuilder->Liveness = &getAnalysis(); + auto P = getAnalysisIfAvailable(); + KernelBuilder->Subtarget = P ? P->getSubtarget() : nullptr; + + std::string KernelName; + KernelBuilder->run(KernelName); + + GenXModule *GM = KernelBuilder->GM; + VISABuilder *VisaBuilder = GM->GetCisaBuilder(); + if (GM->HasInlineAsm()) { + CISA_CALL(KernelBuilder->CisaBuilder->WriteVISAHeader()); + auto VISAAsmTextReader = GM->GetVISAAsmReader(); + auto VISATextHeader = + KernelBuilder->CisaBuilder->GetAsmTextHeaderStream().str(); + auto VISAText = KernelBuilder->CisaBuilder->GetAsmTextStream().str(); + CISA_CALL(VISAAsmTextReader->ParseVISAText(VISATextHeader, VISAText, "")); + VisaBuilder = VISAAsmTextReader; + } + for (auto &F : FG) { + if (genx::isKernel(F)) { + VISAKernel *BuiltKernel = VisaBuilder->GetVISAKernel(KernelName.c_str()); + GM->saveVisaKernel(F, BuiltKernel); + } else if (F->hasFnAttribute(genx::FunctionMD::CMStackCall)) { + VISAKernel *BuiltKernel = VisaBuilder->GetVISAKernel(F->getName()); + GM->saveVisaKernel(F, BuiltKernel); + } + } + + return false; +} + +static bool isDerivedFromUndef(Constant *C) { + if (isa(C)) + return true; + if (!isa(C)) + return false; + ConstantExpr *CE = cast(C); + for (auto &Opnd : CE->operands()) + if (isDerivedFromUndef(cast(Opnd))) + return true; + return false; +} + +static unsigned get8bitPackedFloat(float f) { + union { + float f; + unsigned u; + } u; + + u.f = f; + unsigned char Sign = (u.u >> 31) << 7; + unsigned Exp = (u.u >> 23) & 0xFF; + unsigned Frac = u.u & 0x7FFFFF; + if (Exp == 0 && Frac == 0) + return Sign; + + assert(Exp >= 124 && Exp <= 131); + Exp -= 124; + assert((Frac & 0x780000) == Frac); + Frac >>= 19; + assert(!(Exp == 124 && Frac == 0)); + + Sign |= (Exp << 4); + Sign |= Frac; + + return Sign; +} + +static Signedness getISatSrcSign(unsigned IID) { + switch (IID) { + case GenXIntrinsic::genx_sstrunc_sat: + case GenXIntrinsic::genx_ustrunc_sat: + return SIGNED; + case GenXIntrinsic::genx_sutrunc_sat: + case GenXIntrinsic::genx_uutrunc_sat: + return UNSIGNED; + default: + return DONTCARESIGNED; + } +} + +static Signedness getISatDstSign(unsigned IID) { + switch (IID) { + case GenXIntrinsic::genx_sstrunc_sat: + case GenXIntrinsic::genx_sutrunc_sat: + return SIGNED; + case GenXIntrinsic::genx_ustrunc_sat: + case GenXIntrinsic::genx_uutrunc_sat: + return UNSIGNED; + default: + return DONTCARESIGNED; + 
} +} + +static Signedness getISatSrcSign(Value *V) { + return getISatSrcSign(GenXIntrinsic::getGenXIntrinsicID(V)); +} + +static Signedness getISatDstSign(Value *V) { + return getISatDstSign(GenXIntrinsic::getGenXIntrinsicID(V)); +} + +void addKernelAttrsFromMetadata(VISAKernel &Kernel, const KernelMetadata &KM, + const GenXSubtarget* Subtarget) { + unsigned Val = KM.getSLMSize(); + if (Val) { + // Compute the slm size in KB and roundup to power of 2. + Val = alignTo(Val, 1024) / 1024; + if (!isPowerOf2_64(Val)) + Val = NextPowerOf2(Val); + unsigned MaxSLMSize = 64; + if (Val > MaxSLMSize) + report_fatal_error("slm size must not exceed 64KB"); + else { + // For pre-SKL, valid values are {0, 4, 8, 16, 32, 64}. + // For SKL+, valid values are {0, 1, 2, 4, 8, 16, 32, 64}. + // FIXME: remove the following line for SKL+. + Val = (Val < 4) ? 4 : Val; + uint8_t SLMSize = static_cast(Val); + Kernel.AddKernelAttribute("SLMSize", 1, &SLMSize); + } + } + +} + +// Legalize name for using as filename or in visa asm +static std::string legalizeName(std::string Name) { + std::replace_if(Name.begin(), Name.end(), + [](unsigned char c) { return (!isalnum(c) && c != '_'); }, + '_'); + return Name; +} + +std::string GenXKernelBuilder::buildAsmName() const { + assert(TheKernelMetadata.isKernel()); + std::string AsmName; + auto UserAsmName = AsmNameOpt.getValue(); + if (UserAsmName.empty()) { + AsmName = legalizeName(TheKernelMetadata.getName()); + } else { + int idx = -1; + auto *KernelMDs = + FG->getModule()->getOrInsertNamedMetadata(genx::FunctionMD::GenXKernels); + unsigned E = KernelMDs->getNumOperands(); + for (unsigned I = 0; I < E; ++I) { + MDNode *KernelMD = KernelMDs->getOperand(I); + StringRef KernelName = + cast(KernelMD->getOperand(genx::KernelMDOp::Name).get()) + ->getString(); + if (KernelName == TheKernelMetadata.getName()) { + idx = I; + break; + } + } + assert(idx >= 0); + // Reverse kernel ASM names during codegen. + // This provides an option to match the old compiler's output. + if (ReverseKernels.getValue()) + idx = E - idx - 1; + AsmName = (UserAsmName + llvm::Twine('_') + llvm::Twine(idx)).str(); + } + return AsmName; +} + + +bool GenXKernelBuilder::run(std::string &KernelNameBuf) { + GrfByteSize = Subtarget ? Subtarget->getGRFWidth() : 32; + StackSurf = Subtarget ? Subtarget->stackSurface() : PREDEFINED_SURFACE_STACK; + StringRef Name = TheKernelMetadata.getName(); + if (!Name.size()) { + // If it is not a kernel, or no metadata was found, then set the + // name to the IR name. + Name = FG->getHead()->getName(); + } + + // Cut kernel name to fit vISA name size + auto Size = (Name.size() > COMMON_ISA_MAX_FILENAME_LENGTH) + ? (COMMON_ISA_MAX_FILENAME_LENGTH) + : Name.size(); + KernelNameBuf.insert(0, Name.begin(), Size); + KernelNameBuf[Size] = 0; + if (TheKernelMetadata.isKernel()) { + CisaBuilder->AddKernel(MainKernel, KernelNameBuf.c_str()); + Kernel = static_cast(MainKernel); + Func2Kern[FG->getHead()] = Kernel; + } else { + CisaBuilder->AddFunction(Kernel, KernelNameBuf.c_str()); + } + + assert(Kernel && "Kernel initialization failed!"); + LLVM_DEBUG(dbgs() << "=== PROCESS KERNEL(" << TheKernelMetadata.getName() + << ") ===\n"); + + assert(Subtarget); + addKernelAttrsFromMetadata(*Kernel, TheKernelMetadata, Subtarget); + + bool NeedRetIP = false; // Need special return IP variable for FC. + if (TheKernelMetadata.isKernel()) { + // For a kernel, add an attribute for asm filename for the jitter. 
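+    // For example (illustrative): with -asm-name=foo and three kernels in the
+    // module, buildAsmName() above yields foo_0, foo_1, foo_2 in metadata
+    // order, or foo_2, foo_1, foo_0 under -reverse-kernels; that string is
+    // what gets attached as OutputAsmPath just below.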
+ std::string AsmName = buildAsmName(); + StringRef AsmNameRef = AsmName; + CISA_CALL(Kernel->AddKernelAttribute("OutputAsmPath", AsmNameRef.size(), + AsmNameRef.begin())); + + // Populate variable attributes if any. + unsigned Idx = 0; + bool IsComposable = false; + for (auto &Arg : FG->getHead()->args()) { + const char *Kind = nullptr; + switch (TheKernelMetadata.getArgInputOutputKind(Idx++)) { + default: + break; + case KernelMetadata::IO_INPUT: + Kind = "Input"; + break; + case KernelMetadata::IO_OUTPUT: + Kind = "Output"; + break; + case KernelMetadata::IO_INPUT_OUTPUT: + Kind = "Input_Output"; + break; + } + if (Kind != nullptr) { + auto R = RegAlloc->getRegForValueUntyped(FG->getHead(), &Arg); + assert(R && R->Category == RegCategory::GENERAL); + R->addAttribute(addStringToPool(Kind), ""); + IsComposable = true; + } + } + if (IsComposable) + CISA_CALL(Kernel->AddKernelAttribute("Composable", 0, "")); + if (HasCallable) { + CISA_CALL(Kernel->AddKernelAttribute("Caller", 0, "")); + NeedRetIP = true; + } + if (FG->getHead()->hasFnAttribute("CMCallable")) { + CISA_CALL(Kernel->AddKernelAttribute("Callable", 0, "")); + NeedRetIP = true; + } + if (FG->getHead()->hasFnAttribute("CMEntry")) { + CISA_CALL(Kernel->AddKernelAttribute("Entry", 0, "")); + } + } + + if (NeedRetIP) { + // Ask RegAlloc to add a special variable RetIP. + RegAlloc->addRetIPArgument(); + auto R = RegAlloc->getRetIPArgument(); + R->NameStr = "RetIP"; + R->addAttribute(addStringToPool("Input_Output"), ""); + } + + // Emit optimization hints if any. + emitOptimizationHints(); + + Func = FG->getHead(); + // Build variables + buildVariables(); + + // Build input variables + buildInputs(FG->getHead(), NeedRetIP); + + for (auto &F : *FG) { + Func = F; + if (F->hasFnAttribute(genx::FunctionMD::CMStackCall) || + F->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly)) { + VISAFunction *stackFunc = nullptr; + CisaBuilder->AddFunction((VISAFunction *&)stackFunc, F->getName().data()); + assert(stackFunc); + Func2Kern[F] = stackFunc; + Kernel = stackFunc; + buildVariables(); + Kernel = static_cast(MainKernel); + } + } + + // Build instructions + buildInstructions(); + + // Reset Regalloc hook + RegAlloc->SetRegPushHook(nullptr, nullptr); + + if (TheKernelMetadata.isKernel()) { + // For a kernel with no barrier instruction, add a NoBarrier attribute. + if (!HasBarrier) + CISA_CALL(Kernel->AddKernelAttribute("NoBarrier", 0, nullptr)); + } + + return false; +} + +static bool PatchImpArgOffset(Function *F, const GenXSubtarget *ST, + const KernelMetadata &KM) { + return false; +} + +void GenXKernelBuilder::buildInputs(Function *F, bool NeedRetIP) { + + assert(F->arg_size() == TheKernelMetadata.getNumArgs() && + "Mismatch between metadata for kernel and number of args"); + + // Number of globals to be binded statically. + std::vector> Bindings; + Module *M = F->getParent(); + for (auto &GV : M->getGlobalList()) { + int32_t Offset = 0; + GV.getAttribute(genx::FunctionMD::GenXByteOffset) + .getValueAsString() + .getAsInteger(0, Offset); + if (Offset > 0) + Bindings.emplace_back(&GV, Offset); + } + // Each argument. 
+ unsigned Idx = 0; + bool PatchImpArgOff = PatchImpArgOffset(F, Subtarget, TheKernelMetadata); + for (auto i = F->arg_begin(), e = F->arg_end(); i != e; ++i, ++Idx) { + if (TheKernelMetadata.shouldSkipArg(Idx)) + continue; + Argument *Arg = &*i; + Register *Reg = RegAlloc->getRegForValueUntyped(F, Arg); + assert(Reg); + uint8_t Kind = TheKernelMetadata.getArgKind(Idx); + uint16_t Offset; + if (!PatchImpArgOff) { + Offset = TheKernelMetadata.getArgOffset(Idx); + } + // Argument size in bytes. + auto &DL = F->getParent()->getDataLayout(); + Type *Ty = Arg->getType(); + uint16_t NumBytes = Ty->isPointerTy() ? DL.getPointerTypeSize(Ty) + : (Ty->getPrimitiveSizeInBits() / 8U); + + switch (Kind & 0x7) { + case visa::VISA_INPUT_GENERAL: + case visa::VISA_INPUT_SAMPLER: + case visa::VISA_INPUT_SURFACE: + CISA_CALL(Kernel->CreateVISAImplicitInputVar( + Reg->GetVar(Kernel), Offset, NumBytes, Kind >> 3)); + break; + + default: + report_fatal_error("Unknown input category"); + break; + } + } + // Add the special RetIP argument. + if (NeedRetIP) { + Register *Reg = RegAlloc->getRetIPArgument(); + uint16_t Offset = (127 * GrfByteSize + 6 * 4); // r127.6 + uint16_t NumBytes = (64 / 8); + CISA_CALL(Kernel->CreateVISAImplicitInputVar(Reg->GetVar(Kernel), + Offset, NumBytes, 0)); + } + // Add pseudo-input for global variables with offset attribute. + for (auto &Item : Bindings) { + // TODO: sanity check. No overlap with other inputs. + GlobalVariable *GV = Item.first; + uint16_t Offset = Item.second; + assert(Offset > 0); + uint16_t NumBytes = (GV->getValueType()->getPrimitiveSizeInBits() / 8U); + uint8_t Kind = KernelMetadata::IMP_PSEUDO_INPUT; + Register *Reg = RegAlloc->getRegForValueUntyped(F, GV); + CISA_CALL(Kernel->CreateVISAImplicitInputVar(Reg->GetVar(Kernel), + Offset, NumBytes, Kind >> 3)); + } +} + +// FIXME: We should use NM by default once code quality issues are addressed +// in vISA compiler. +static bool setNoMaskByDefault(Function *F) { + for (auto &BB : F->getBasicBlockList()) + if (GotoJoin::isGotoBlock(&BB)) + return true; + + // Check if this is subroutine call. + for (auto U : F->users()) { + if (auto CI = dyn_cast(U)) { + Function *G = CI->getParent()->getParent(); + if (G == F) + return false; + if (setNoMaskByDefault(G)) + return true; + } + } + + return false; +} + +void GenXKernelBuilder::buildInstructions() { + for (auto It = FG->begin(), E = FG->end(); It != E; ++It) { + Func = *It; + LLVM_DEBUG(dbgs() << "Building IR for func " << Func->getName().data() + << "\n"); + NoMask = setNoMaskByDefault(Func); + + if (Func->hasFnAttribute(genx::FunctionMD::CMGenXMain) || + Func->hasFnAttribute(genx::FunctionMD::CMStackCall) || + Func->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly)) { + KernFunc = Func; + } else { + KernFunc = FGA->getSubGroup(Func) ? FGA->getSubGroup(Func)->getHead() + : FGA->getGroup(Func)->getHead(); + } + assert(KernFunc); + Kernel = Func2Kern.at(KernFunc); + + unsigned LabelID = getOrCreateLabel(Func, LABEL_SUBROUTINE); + CISA_CALL(Kernel->AppendVISACFLabelInst(Labels[LabelID])); + + beginFunction(Func); + + // If a float control is specified, emit code to make that happen. 
+ // Float control contains rounding mode, denorm behaviour and single + // precision float mode (ALT or IEEE) Relevant bits are already set as + // defined for VISA control reg in header definition on enums + if (Func->hasFnAttribute(genx::FunctionMD::CMFloatControl)) { + uint32_t FloatControl = 0; + Func->getFnAttribute(genx::FunctionMD::CMFloatControl) + .getValueAsString() + .getAsInteger(0, FloatControl); + + // Clear current float control bits to known zero state + buildControlRegUpdate(CR_Mask, true); + + // Set rounding mode to required state if that isn't zero + FloatControl &= CR_Mask; + if (FloatControl) { + if (FG->getHead() == Func) + DefaultFloatControl = FloatControl; + buildControlRegUpdate(FloatControl, false); + } + } + + // Only output a label for the initial basic block if it is used from + // somewhere else. + bool NeedsLabel = !Func->front().use_empty(); + for (Function::iterator fi = Func->begin(), fe = Func->end(); fi != fe; + ++fi) { + BasicBlock *BB = &*fi; + if (!NeedsLabel && BB != &Func->front()) { + NeedsLabel = !BB->getSinglePredecessor(); + if (!NeedsLabel) + NeedsLabel = GotoJoin::isJoinLabel(BB); + } + if (NeedsLabel) { + unsigned LabelID = getOrCreateLabel(BB, LABEL_BLOCK); + CISA_CALL(Kernel->AppendVISACFLabelInst(Labels[LabelID])); + } + NeedsLabel = true; + for (BasicBlock::iterator bi = BB->begin(), be = BB->end(); bi != be; + ++bi) { + Instruction *Inst = &*bi; + if (Inst->isTerminator()) { + // Before the terminator inst of a basic block, if there is a single + // successor and it is the header of a loop, for any vector of at + // least four GRFs with a phi node where our incoming value is + // undef, insert a lifetime.start here. + auto TI = cast(Inst); + if (TI->getNumSuccessors() == 1) { + auto Succ = TI->getSuccessor(0); + if (getLoops(Succ->getParent())->isLoopHeader(Succ)) { + for (auto si = Succ->begin();; ++si) { + auto Phi = dyn_cast(&*si); + if (!Phi) + break; + if (Phi->getType()->getPrimitiveSizeInBits() >= + (GrfByteSize * 8) * 4 && + isa( + Phi->getIncomingValue(Phi->getBasicBlockIndex(BB)))) + addLifetimeStartInst(Phi); + } + } + } + } + + // Build the instruction. + if (!Baling->isBaled(Inst)) { +#if DUMP_VISA_INTSTRUCTIONS + errs() << *Inst << '\n'; + auto CisaInstCount = Kernel->getvIsaInstCount(); +#endif + if (ReturnInst *RI = dyn_cast(Inst)) + endFunction(Func, RI); + if (buildInstruction(Inst)) + NeedsLabel = false; +#if DUMP_VISA_INTSTRUCTIONS + VISAKernelImpl *KernelImpl = (VISAKernelImpl *)Kernel; + if (CisaInstCount != Kernel->getvIsaInstCount()) { + VISAKernel_format_provider fmt(KernelImpl); + auto It = KernelImpl->getInstructionListBegin(), + ItEnd = KernelImpl->getInstructionListEnd(); + for (int Idx = 0; It != ItEnd; ++It, ++Idx) { + if (Idx >= CisaInstCount + 1) { + errs() << printInstruction(&fmt, (*It)->getCISAInst(), + KernelImpl->getOptions()) + << "\n\n"; + } + } + } +#endif + } + } + } + } +} + +bool GenXKernelBuilder::buildInstruction(Instruction *Inst) { + // Make the source location pending, so it is output as vISA FILE and LOC + // instructions next time an opcode is written. + const DebugLoc &DL = Inst->getDebugLoc(); + if (DL) { + StringRef Filename = DL->getFilename(); + if (Filename != "") { + PendingFilename = Filename; + PendingDirectory = DL->getDirectory(); + } + PendingLine = DL.getLine(); + } + // Process the bale that this is the head instruction of. + BaleInfo BI = Baling->getBaleInfo(Inst); + + DstOpndDesc DstDesc; + if (BI.Type == BaleInfo::GSTORE) { + // Inst is a global variable store. 
It should be baled into a wrr + // instruction. + Bale B; + Baling->buildBale(Inst, &B); + // This is an identity bale; no code will be emitted. + if (isIdentityBale(B)) + return false; + + assert(BI.isOperandBaled(0)); + DstDesc.GStore = Inst; + Inst = cast(Inst->getOperand(0)); + BI = Baling->getBaleInfo(Inst); + } + + if (BI.Type == BaleInfo::WRREGION || BI.Type == BaleInfo::WRPREDREGION || + BI.Type == BaleInfo::WRPREDPREDREGION) { + // Inst is a wrregion or wrpredregion or wrpredpredregion. + DstDesc.WrRegion = Inst; + DstDesc.WrRegionBI = BI; + if (isa(Inst->getOperand(0)) && !DstDesc.GStore) { + // This is a wrregion, probably a partial write, to an undef value. + // Write a lifetime start if appropriate to help the jitter's register + // allocator. + addWriteRegionLifetimeStartInst(DstDesc.WrRegion); + } + // See if it bales in the instruction + // that generates the subregion/element. That is always operand 1. + enum { OperandNum = 1 }; + if (!BI.isOperandBaled(OperandNum)) { + if (BI.Type == BaleInfo::WRPREDREGION) { + buildLoneWrPredRegion(DstDesc.WrRegion, DstDesc.WrRegionBI); + } else { + buildLoneWrRegion(DstDesc); + } + return false; + } + // Yes, source of wrregion is baled in. + Inst = cast(DstDesc.WrRegion->getOperand(OperandNum)); + BI = Baling->getBaleInfo(Inst); + } + if (BI.Type == BaleInfo::FADDR) { + buildFunctionAddr(Inst, DstDesc); + return false; + } + unsigned Mod = 0; + if (BI.Type == BaleInfo::SATURATE) { + // Inst is a fp saturate. See if it bales in the instruction that + // generates the value to saturate. That is always operand 0. If + // not, just treat the saturate as a normal intrinsic. + if (BI.isOperandBaled(0)) { + Mod = MODIFIER_SAT; + Inst = cast(Inst->getOperand(0)); + BI = Baling->getBaleInfo(Inst); + } else + BI.Type = BaleInfo::MAININST; + } + if (BI.Type == BaleInfo::CMPDST) { + // Dst of sel instruction is baled in. + Inst = cast(Inst->getOperand(0)); + assert(isa(Inst) && "Only bale sel into a cmp instruction"); + BI = Baling->getBaleInfo(Inst); + } + switch (BI.Type) { + case BaleInfo::RDREGION: + case BaleInfo::ABSMOD: + case BaleInfo::NEGMOD: + case BaleInfo::NOTMOD: + // This is a rdregion or modifier not baled in to a main instruction + // (but possibly baled in to a wrregion or sat modifier). + buildLoneOperand(Inst, BI, Mod, DstDesc); + return false; + } + assert(BI.Type == BaleInfo::MAININST || BI.Type == BaleInfo::NOTP || + BI.Type == BaleInfo::ZEXT || BI.Type == BaleInfo::SEXT); + return buildMainInst(Inst, BI, Mod, DstDesc); +} + +VISA_PredVar *GenXKernelBuilder::createPredicateDeclFromSelect( + Instruction *SI, BaleInfo BI, VISA_PREDICATE_CONTROL &Control, + VISA_PREDICATE_STATE &State, VISA_EMask_Ctrl *MaskCtrl) { + *MaskCtrl = vISA_EMASK_M1_NM; + // Get the predicate (mask) operand, scanning through baled in + // all/any/not/rdpredregion and setting State and MaskCtrl + // appropriately. + Value *Mask = getPredicateOperand(SI, 0 /*selector operand in select*/, BI, + Control, State, MaskCtrl); + assert(!isa(Mask)); + // Variable predicate. Derive the predication field from any baled in + // all/any/not and the predicate register number. 
+ Register *Reg = RegAlloc->getRegForValue(KernFunc, Mask); + assert(Reg && Reg->Category == RegCategory::PREDICATE); + if (NoMask) + *MaskCtrl |= vISA_EMASK_M1_NM; + return getPredicateVar(Reg); +} + +VISA_PredOpnd * +GenXKernelBuilder::createPredFromWrRegion(const DstOpndDesc &DstDesc) { + VISA_PredOpnd *result = nullptr; + Instruction *WrRegion = DstDesc.WrRegion; + if (WrRegion) { + // Get the predicate (mask) operand, scanning through baled in + // all/any/not/rdpredregion and setting PredField and MaskCtrl + // appropriately. + VISA_EMask_Ctrl MaskCtrl; + VISA_PREDICATE_CONTROL Control; + VISA_PREDICATE_STATE State; + Value *Mask = + getPredicateOperand(WrRegion, 7 /*mask operand in wrregion*/, + DstDesc.WrRegionBI, Control, State, &MaskCtrl); + if (auto C = dyn_cast(Mask)) { + (void)C; + assert(C->isAllOnesValue() && "wrregion mask or predication operand must " + "be constant 1 or not constant"); + } else { + // Variable predicate. Derive the predication field from any baled in + // all/any/not and the predicate register number. If the predicate has + // not has a register allocated, it must be EM. + Register *Reg = RegAlloc->getRegForValueOrNull(KernFunc, Mask); + if (Reg) { + assert(Reg->Category == RegCategory::PREDICATE); + result = createPredOperand(getPredicateVar(Reg), State, Control); + } + } + } + return result; +} + +/*********************************************************************** + * createPred : create predication field from an instruction operand + * + * Enter: Inst = the instruction (0 to write an "always true" pred field) + * BI = BaleInfo for the instruction, so we can see if there is a + * rdpredregion baled in to the mask + * OperandNum = operand number in the instruction + * + * If the operand is not constant 1, then it must be a predicate register. + */ +VISA_PredOpnd *GenXKernelBuilder::createPred(Instruction *Inst, BaleInfo BI, + unsigned OperandNum) { + VISA_PredOpnd *ResultOperand = nullptr; + VISA_PREDICATE_CONTROL PredControl; + VISA_PREDICATE_STATE Inverse; + VISA_EMask_Ctrl MaskCtrl; + Value *Mask = getPredicateOperand(Inst, OperandNum, BI, PredControl, Inverse, + &MaskCtrl); + if (auto C = dyn_cast(Mask)) { + (void)C; + assert(C->isAllOnesValue() && "wrregion mask or predication operand must " + "be constant 1 or not constant"); + } else { + // Variable predicate. Derive the predication field from any baled in + // all/any/not and the predicate register number. If the predicate has not + // has a register allocated, it must be EM. 
+ Register *Reg = RegAlloc->getRegForValueOrNull(KernFunc, Mask); + VISA_PredVar *PredVar = nullptr; + if (Reg) { + assert(Reg->Category == RegCategory::PREDICATE); + PredVar = getPredicateVar(Reg); + } else + return nullptr; + ResultOperand = createPredOperand(PredVar, Inverse, PredControl); + } + return ResultOperand; +} + +VISA_VectorOpnd *GenXKernelBuilder::createState(Register *Reg, unsigned Offset, + bool IsDst) { + uint8_t Size = 0; + VISA_VectorOpnd *Op = nullptr; + + switch (Reg->Category) { + case RegCategory::SURFACE: + CISA_CALL(Kernel->CreateVISAStateOperand(Op, Reg->GetVar(Kernel), + Size, Offset, IsDst)); + break; + case RegCategory::SAMPLER: + CISA_CALL(Kernel->CreateVISAStateOperand(Op, Reg->GetVar(Kernel), + Size, Offset, IsDst)); + break; + default: + llvm_unreachable("unknown state operand"); + } + + return Op; +} + +VISA_VectorOpnd *GenXKernelBuilder::createDestination(CisaVariable *Dest, + genx::Signedness Signed, + unsigned *Offset) { + Region R(VectorType::get( + IntegerType::get(Ctx, CISATypeTable[Dest->getType()].typeSize * CHAR_BIT), + Dest->getNumElements())); + if (Offset) + R.Offset = *Offset; + return createRegionOperand(&R, Dest->getGenVar(), Signed, 0, true); +} + +VISA_VectorOpnd *GenXKernelBuilder::createDestination(Value *Dest, + genx::Signedness Signed, + unsigned *Offset) { + return createDestination(Dest, Signed, 0, DstOpndDesc(), nullptr, Offset); +} + +VISA_VectorOpnd * +GenXKernelBuilder::createDestination(Value *Dest, genx::Signedness Signed, + unsigned Mod, const DstOpndDesc &DstDesc, + Signedness *SignedRes, unsigned *Offset) { + assert(!Dest->getType()->isAggregateType() && + "cannot create destination register of an aggregate type"); + if (SignedRes) + *SignedRes = Signed; + + Type *OverrideType = nullptr; + if (BitCastInst *BCI = dyn_cast(Dest)) { + if (!(isa(BCI->getOperand(0))) && + !(BCI->getType()->getScalarType()->isIntegerTy(1)) && + (BCI->getOperand(0)->getType()->getScalarType()->isIntegerTy(1))) { + if (VectorType *VT = dyn_cast(Dest->getType())) { + unsigned int NumBits = VT->getNumElements() * + VT->getElementType()->getPrimitiveSizeInBits(); + OverrideType = IntegerType::get(BCI->getContext(), NumBits); + } + } + } + + // Saturation can also change signedness. + if (!Dest->user_empty() && GenXIntrinsic::isIntegerSat(Dest->user_back())) { + Signed = getISatDstSign(Dest->user_back()); + } + + if (!DstDesc.WrRegion) { + if (Mod) { + // There is a sat modifier. Either it is an fp saturate, which is + // represented by its own intrinsic which this instruction is baled + // into, or it is an int saturate which always comes from this + // instruction's semantics. In the former case, use the value + // that is the result of the saturate. But only if this instruction + // itself is not the sat intrinsic. + if (Dest->getType()->getScalarType()->isFloatingPointTy() && + GenXIntrinsic::getGenXIntrinsicID(Dest) != GenXIntrinsic::genx_sat) + Dest = cast(Dest->use_begin()->getUser()); + } + if ((Mod & MODIFIER_SAT) != 0) { + // Similar for integer saturation. 
+ if (Dest->getType()->getScalarType()->isIntegerTy() && + !GenXIntrinsic::isIntegerSat(Dest) && GenXIntrinsic::isIntegerSat(Dest->user_back())) + Dest = cast(Dest->user_back()); + } + Register *Reg = RegAlloc->getRegForValue(KernFunc, Dest, Signed, OverrideType); + if (SignedRes) + *SignedRes = RegAlloc->getSigned(Reg); + // Write the vISA general operand: + if (Reg->Category == RegCategory::GENERAL) { + Region DestR(Dest); + if (Offset) + DestR.Offset = *Offset; + return createRegionOperand(&DestR, Reg->GetVar(Kernel), + DONTCARESIGNED, Mod, true /*isDest*/); + } else { + assert(Reg->Category == RegCategory::SURFACE || + Reg->Category == RegCategory::VME || + Reg->Category == RegCategory::SAMPLER); + + return createState(Reg, 0 /*Offset*/, true /*IsDst*/); + } + } + // We need to allow for the case that there is no register allocated if it + // is an indirected arg, and that is OK because the region is indirect so + // the vISA does not contain the base register. + Register *Reg; + + Value *V = nullptr; + if (DstDesc.GStore) { + auto GV = getUnderlyingGlobalVariable(DstDesc.GStore->getOperand(1)); + assert(GV && "out of sync"); + if (OverrideType == nullptr) + OverrideType = DstDesc.GStore->getOperand(0)->getType(); + Reg = RegAlloc->getRegForValue(KernFunc, GV, Signed, OverrideType); + V = GV; + } else { + V = DstDesc.WrRegion; + Reg = RegAlloc->getRegForValueOrNull(KernFunc, V, Signed, OverrideType); + } + + assert(!Reg || Reg->Category == RegCategory::GENERAL || + Reg->Category == RegCategory::SAMPLER || + Reg->Category == RegCategory::SURFACE || + Reg->Category == RegCategory::VME); + + // Write the vISA general operand with region: + Region R(DstDesc.WrRegion, DstDesc.WrRegionBI); + + if (SignedRes) + *SignedRes = RegAlloc->getSigned(Reg); + + if (Reg && (Reg->Category == RegCategory::SAMPLER || + Reg->Category == RegCategory::SURFACE || + Reg->Category == RegCategory::VME)) { + return createState(Reg, R.Offset / R.ElementBytes, true /*IsDest*/); + } else { + auto Decl = Reg ? 
Reg->GetVar(Kernel) : nullptr; + return createRegionOperand(&R, Decl, Signed, Mod, true /*IsDest*/); + } +} + +VISA_VectorOpnd *GenXKernelBuilder::createSourceOperand( + Instruction *Inst, Signedness Signed, unsigned OperandNum, + genx::BaleInfo BI, unsigned Mod, Signedness *SignedRes, unsigned MaxWidth) { + Value *V = Inst->getOperand(OperandNum); + return createSource(V, Signed, BI.isOperandBaled(OperandNum), Mod, SignedRes, + MaxWidth); +} + +VISA_PredOpnd * +GenXKernelBuilder::createPredOperand(VISA_PredVar *PredVar, + VISA_PREDICATE_STATE State, + VISA_PREDICATE_CONTROL Control) { + VISA_PredOpnd *PredOperand = nullptr; + CISA_CALL( + Kernel->CreateVISAPredicateOperand(PredOperand, PredVar, State, Control)); + + return PredOperand; +} + +VISA_VectorOpnd *GenXKernelBuilder::createCisaSrcOperand( + VISA_GenVar *Decl, VISA_Modifier Mod, unsigned VStride, unsigned Width, + unsigned HStride, unsigned ROffset, unsigned COffset) { + VISA_VectorOpnd *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISASrcOperand(ResultOperand, Decl, Mod, VStride, + Width, HStride, ROffset, COffset)); + return ResultOperand; +} + +VISA_VectorOpnd *GenXKernelBuilder::createCisaDstOperand(VISA_GenVar *Decl, + unsigned HStride, + unsigned ROffset, + unsigned COffset) { + VISA_VectorOpnd *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(ResultOperand, Decl, HStride, ROffset, + COffset)); + return ResultOperand; +} + +/*********************************************************************** + * createAddressOperand : create an address register operand + */ +VISA_VectorOpnd *GenXKernelBuilder::createAddressOperand(Value *V, bool IsDst) { + VISA_VectorOpnd *ResultOperand = nullptr; + Register *Reg = RegAlloc->getRegForValue(KernFunc, V, DONTCARESIGNED); + assert(Reg->Category == RegCategory::ADDRESS); + unsigned Width = 1; + if (VectorType *VT = dyn_cast(V->getType())) + Width = VT->getNumElements(); + if (IsDst) { + CISA_CALL(Kernel->CreateVISAAddressDstOperand( + ResultOperand, Reg->GetVar(Kernel), 0)); + } else { + CISA_CALL(Kernel->CreateVISAAddressSrcOperand( + ResultOperand, Reg->GetVar(Kernel), 0, Width)); + } + return ResultOperand; +} + +VISA_Type GenXKernelBuilder::getVISAImmTy(uint8_t ImmTy) { + return static_cast(ImmTy & 0xf); +} + +VISA_VectorOpnd *GenXKernelBuilder::createImmediateOperand(Constant *V, + Signedness Signed) { + if (isDerivedFromUndef(V)) + V = Constant::getNullValue(V->getType()); + + Type *T = V->getType(); + if (VectorType *VT = dyn_cast(T)) { + // Vector constant. + auto Splat = V->getSplatValue(); + if (!Splat) { + // Non-splatted vector constant. Must be a packed vector. + unsigned NumElements = VT->getNumElements(); + if (VT->getElementType()->isIntegerTy()) { + // Packed int vector. + assert(NumElements <= ImmIntVec::Width); + unsigned Packed = 0; + for (unsigned i = 0; i != NumElements; ++i) { + auto El = dyn_cast(V->getAggregateElement(i)); + if (!El) + continue; // undef element + int This = El->getSExtValue(); + if (This < ImmIntVec::MinUInt) { + assert(This >= ImmIntVec::MinSInt && + "too big imm, cannot encode as vector imm"); + Signed = SIGNED; + } else if (This > ImmIntVec::MaxSInt) { + assert(This <= ImmIntVec::MaxUInt && + "too big imm, cannot encode as vector imm"); + Signed = UNSIGNED; + } + Packed |= (This & ImmIntVec::MaxUInt) << (ImmIntVec::ElemSize * i); + } + // For a 2- or 4-wide operand, we need to repeat the vector elements + // as which ones are used depends on the position of the other + // operand in its oword. 
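+        // Worked example (assuming 4-bit packed elements, which is what the
+        // ImmIntVec constants and the multipliers below imply): a 2-wide
+        // vector {1, 2} packs to Packed = 0x21; multiplying by 0x01010101
+        // replicates it to 0x21212121, i.e. lanes 1,2,1,2,1,2,1,2, so either
+        // position within the oword reads the correct pair.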
+ switch (NumElements) { + case 2: + Packed = Packed * 0x01010101; + break; + case 4: + Packed = Packed * 0x00010001; + break; + } + auto ImmTy = + static_cast(Signed == UNSIGNED ? ISA_TYPE_UV : ISA_TYPE_V); + auto VISAImmTy = getVISAImmTy(ImmTy); + VISA_VectorOpnd *ImmOp = nullptr; + CISA_CALL(Kernel->CreateVISAImmediate(ImmOp, &Packed, VISAImmTy)); + return ImmOp; + } + // Packed float vector. + assert(VT->getElementType()->isFloatTy() && + (NumElements == 1 || NumElements == 2 || NumElements == 4)); + unsigned Packed = 0; + for (unsigned i = 0; i != 4; ++i) { + auto CFP = + dyn_cast(V->getAggregateElement(i % NumElements)); + if (!CFP) // Undef + continue; + const APFloat &FP = CFP->getValueAPF(); + Packed |= get8bitPackedFloat(FP.convertToFloat()) << (i * 8); + } + auto VISAImmTy = getVISAImmTy(ISA_TYPE_VF); + VISA_VectorOpnd *ImmOp = nullptr; + CISA_CALL(Kernel->CreateVISAImmediate(ImmOp, &Packed, VISAImmTy)); + return ImmOp; + } + // Splatted (or single element) vector. Use the scalar value. + T = VT->getElementType(); + V = Splat; + } + + if (isDerivedFromUndef(V)) + V = Constant::getNullValue(V->getType()); + else if (isa(V)) { + const DataLayout &DL = Func->getParent()->getDataLayout(); + T = DL.getIntPtrType(V->getType()); + V = Constant::getNullValue(T); + } + + // We have a scalar constant. + if (IntegerType *IT = dyn_cast(T)) { + ConstantInt *CI = cast(V); + // I think we need to use the appropriate one of getZExtValue or + // getSExtValue to avoid an assert on very large 64 bit values... + int64_t Val = Signed == UNSIGNED ? CI->getZExtValue() : CI->getSExtValue(); + visa::TypeDetails TD(Func->getParent()->getDataLayout(), IT, Signed); + VISA_VectorOpnd *ImmOp = nullptr; + CISA_CALL( + Kernel->CreateVISAImmediate(ImmOp, &Val, getVISAImmTy(TD.VisaType))); + return ImmOp; + } if (isa(V)) { + assert(0 && "Not baled function address"); + return nullptr; + } else { + VISA_VectorOpnd *ImmOp = nullptr; + ConstantFP *CF = cast(V); + if (T->isFloatTy()) { + union { + float f; + uint32_t i; + } Val; + Val.f = CF->getValueAPF().convertToFloat(); + auto VISAImmTy = getVISAImmTy(ISA_TYPE_F); + CISA_CALL(Kernel->CreateVISAImmediate(ImmOp, &Val.i, VISAImmTy)); + } else if (T->isHalfTy()) { + uint16_t Val( + (uint16_t)(CF->getValueAPF().bitcastToAPInt().getZExtValue())); + auto VISAImmTy = getVISAImmTy(ISA_TYPE_HF); + auto Val32 = static_cast(Val); + CISA_CALL(Kernel->CreateVISAImmediate(ImmOp, &Val32, VISAImmTy)); + } else { + assert(T->isDoubleTy()); + union { + double f; + uint64_t i; + } Val; + Val.f = CF->getValueAPF().convertToDouble(); + auto VISAImmTy = getVISAImmTy(ISA_TYPE_DF); + CISA_CALL(Kernel->CreateVISAImmediate(ImmOp, &Val.i, VISAImmTy)); + } + return ImmOp; + } +} + +/*********************************************************************** + * getOriginalInstructionForSource : trace a source operand back through + * its bale (if any), given a starting instruction. + * + * Enter: Inst = The instruction to start tracing from. 
+ * BI = BaleInfo for Inst + */ +Instruction * +GenXKernelBuilder::getOriginalInstructionForSource(Instruction *Inst, + BaleInfo BI) { + while (!isa(Inst->getOperand(0)) && BI.isOperandBaled(0)) { + Inst = cast(Inst->getOperand(0)); + BI = Baling->getBaleInfo(Inst); + } + + return Inst; +} + +void GenXKernelBuilder::buildConvert(CallInst *CI, BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc) { + Register *DstReg = RegAlloc->getRegForValue(KernFunc, CI, UNSIGNED); + if (!isa(CI->getOperand(0))) { + Instruction *OrigInst = getOriginalInstructionForSource(CI, BI); + Register *SrcReg = RegAlloc->getRegForValue(KernFunc, OrigInst->getOperand(0)); + (void)SrcReg; + assert((SrcReg->Category != RegCategory::GENERAL || + DstReg->Category != RegCategory::GENERAL) && + "expected a category conversion"); + } + + if (DstReg->Category != RegCategory::ADDRESS) { + // State copy. + int ExecSize = 1; + if (VectorType *VT = dyn_cast(CI->getType())) { + ExecSize = VT->getNumElements(); + } + + auto ISAExecSize = static_cast(genx::log2(ExecSize)); + auto Dst = createDestination(CI, UNSIGNED, 0, DstDesc); + auto Src = createSourceOperand(CI, UNSIGNED, 0, BI); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOVS, nullptr /*Pred*/, false /*Mod*/, + NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1, ISAExecSize, Dst, Src)); + return; + } + + // Destination is address register. + int ExecSize = 1; + if (VectorType *VT = dyn_cast(CI->getType())) { + report_fatal_error("vector of addresses not implemented"); + } + + auto ISAExecSize = static_cast(genx::log2(ExecSize)); + Register *SrcReg = RegAlloc->getRegForValue(KernFunc, CI->getOperand(0)); + assert(SrcReg->Category == RegCategory::ADDRESS); + + (void)SrcReg; + // This is an address->address copy, inserted due to coalescing failure of + // the address for an indirected arg in GenXArgIndirection. + // (A conversion to address is handled in buildConvertAddr below.) + // Write the addr_add instruction. 
+ Value *SrcOp0 = CI->getOperand(0); + unsigned Src0Width = 1; + if (VectorType *VT = dyn_cast(SrcOp0->getType())) + Src0Width = VT->getNumElements(); + + Register *RegDst = RegAlloc->getRegForValue(KernFunc, CI, DONTCARESIGNED); + Register *RegSrc0 = RegAlloc->getRegForValue(KernFunc, SrcOp0, DONTCARESIGNED); + + VISA_VectorOpnd *Dst = nullptr, *Src0 = nullptr, *Src1 = nullptr; + + CISA_CALL(Kernel->CreateVISAAddressDstOperand( + Dst, RegDst->GetVar(Kernel), 0)); + CISA_CALL(Kernel->CreateVISAAddressSrcOperand( + Src0, RegSrc0->GetVar(Kernel), 0, Src0Width)); + Src1 = + createImmediateOperand(Constant::getNullValue(CI->getType()), UNSIGNED); + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISAAddrAddInst(vISA_EMASK_M1_NM, ISAExecSize, Dst, + Src0, Src1)); +} + +VISA_VectorOpnd *GenXKernelBuilder::createSource(CisaVariable *V, + Signedness Signed, + unsigned MaxWidth, + unsigned *Offset) { + Region R(VectorType::get( + IntegerType::get(Ctx, CISATypeTable[V->getType()].typeSize * CHAR_BIT), + V->getNumElements())); + if (Offset) + R.Offset = *Offset; + return createRegionOperand(&R, V->getGenVar(), Signed, 0, false, MaxWidth); +} + +VISA_VectorOpnd *GenXKernelBuilder::createSource(Value *V, Signedness Signed, + unsigned MaxWidth, + unsigned *Offset) { + return createSource(V, Signed, false, 0, nullptr, MaxWidth, Offset); +} + +VISA_VectorOpnd *GenXKernelBuilder::createSource(Value *V, Signedness Signed, + bool Baled, unsigned Mod, + Signedness *SignedRes, + unsigned MaxWidth, + unsigned *Offset) { + if (SignedRes) + *SignedRes = Signed; + if (auto C = dyn_cast(V)) { + if (Mod) { + // Need to negate constant. + assert(Mod == MODIFIER_NEG && "unexpected modifier"); + if (C->getType()->isIntOrIntVectorTy()) + C = ConstantExpr::getNeg(C); + else + C = ConstantExpr::getFNeg(C); + } + return createImmediateOperand(C, Signed); + } + if (!Baled) { + Register *Reg = RegAlloc->getRegForValue(KernFunc, V, Signed); + assert(Reg->Category == RegCategory::GENERAL || + Reg->Category == RegCategory::SURFACE || + Reg->Category == RegCategory::SAMPLER || + Reg->Category == RegCategory::VME); + // Write the vISA general operand. + Region R(V); + if (Offset) + R.Offset = *Offset; + if (R.NumElements == 1) + R.VStride = R.Stride = 0; + if (SignedRes) + *SignedRes = RegAlloc->getSigned(Reg); + if (Reg->Category == RegCategory::GENERAL) { + return createRegionOperand(&R, Reg->GetVar(Kernel), Signed, Mod, + false /*IsDst*/, MaxWidth); + } else { + return createState(Reg, R.Offset >> 2, false /*IsDst*/); + }; + } + + Instruction *Inst = cast(V); + BaleInfo BI(Baling->getBaleInfo(Inst)); + unsigned Idx = 0; + switch (BI.Type) { + case BaleInfo::RDREGION: { + // The source operand has a rdregion baled in. We need to allow for the + // case that there is no register allocated if it is an indirected arg, + // and that is OK because the region is indirect so the vISA does not + // contain the base register. + Value *V = Inst->getOperand(0); + Register *Reg = RegAlloc->getRegForValueOrNull(KernFunc, V, Signed); + + // Ensure we pick a non-DONTCARESIGNED signedness here, as, for an + // indirect region and DONTCARESIGNED, writeRegion arbitrarily picks a + // signedness as it is attached to the operand, unlike a direct region + // where it is attached to the vISA register. + if (Reg) + Signed = RegAlloc->getSigned(Reg); + else if (Signed == DONTCARESIGNED) + Signed = SIGNED; + // Write the vISA general operand with region. 
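+    // The baled-in rdregion supplies the region parameters for this source;
+    // degenerate regions (one element, or width 1) get their strides zeroed
+    // below before the operand is created.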
+ Region R(Inst, Baling->getBaleInfo(Inst)); + if (Offset) + R.Offset = *Offset; + if (R.NumElements == 1) + R.VStride = 0; + if (R.Width == 1) + R.Stride = 0; + if (!Reg || Reg->Category == RegCategory::GENERAL || R.Indirect) { + if (SignedRes) + *SignedRes = Signed; + return createRegionOperand(&R, Reg ? Reg->GetVar(Kernel) : nullptr, + Signed, Mod, false, MaxWidth); + } else { + if (SignedRes) + *SignedRes = Signed; + return createState(Reg, R.Offset >> 2, false /*IsDst*/); + } + } + case BaleInfo::ABSMOD: + Signed = SIGNED; + Mod |= MODIFIER_ABS; + break; + case BaleInfo::NEGMOD: + if (!(Mod & MODIFIER_ABS)) + Mod ^= MODIFIER_NEG; + Idx = 1; // the input we want in "0-x" is x, not 0. + break; + case BaleInfo::NOTMOD: + Mod ^= MODIFIER_NOT; + break; + case BaleInfo::ZEXT: + Signed = UNSIGNED; + break; + case BaleInfo::SEXT: + Signed = SIGNED; + break; + default: + llvm_unreachable("unknown bale type"); + break; + } + return createSource(Inst->getOperand(Idx), Signed, BI.isOperandBaled(Idx), + Mod, SignedRes, MaxWidth); +} + +std::string GenXKernelBuilder::createInlineAsmOperand( + Register *Reg, genx::Region *R, bool IsDst, genx::Signedness Signed, + genx::ConstraintType Ty, unsigned Mod) { + deduceRegion(R, IsDst); + + VISA_VectorOpnd *ResultOperand = nullptr; + switch (Ty) { + default: + llvm_unreachable("constraint unhandled"); + case ConstraintType::Constraint_cr: { + assert(Reg && Reg->Category == RegCategory::PREDICATE); + VISA_PredVar *PredVar = getPredicateVar(Reg); + VISA_PredOpnd *PredOperand = + createPredOperand(PredVar, PredState_NO_INVERSE, PRED_CTRL_NON); + return Kernel->getPredicateOperandName(PredOperand); + } + case ConstraintType::Constraint_rw: + return Kernel->getVarName(Reg->GetVar(Kernel)); + case ConstraintType::Constraint_r: + ResultOperand = + createGeneralOperand(R, Reg->GetVar(Kernel), Signed, Mod, IsDst); + break; + case ConstraintType::Constraint_a: + if (!R->Indirect) + report_fatal_error("Inline asm operand can'be indirected here"); + ResultOperand = createIndirectOperand(R, Signed, Mod, IsDst); + break; + } + return Kernel->getVectorOperandName(ResultOperand, true); +} + +std::string GenXKernelBuilder::createInlineAsmDestinationOperand( + Value *Dest, genx::Signedness Signed, genx::ConstraintType Ty, unsigned Mod, + const DstOpndDesc &DstDesc) { + + Type *OverrideType = nullptr; + + // Saturation can also change signedness. + if (!Dest->user_empty() && GenXIntrinsic::isIntegerSat(Dest->user_back())) { + Signed = getISatDstSign(Dest->user_back()); + } + + if (!DstDesc.WrRegion) { + Register *Reg = RegAlloc->getRegForValue(KernFunc, Dest, Signed, OverrideType); + + Region DestR(Dest); + return createInlineAsmOperand(Reg, &DestR, true /*IsDst*/, DONTCARESIGNED, + Ty, Mod); + } + // We need to allow for the case that there is no register allocated if it is + // an indirected arg, and that is OK because the region is indirect so the + // vISA does not contain the base register. 
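+  // The destination is either the global variable behind a baled-in g_store
+  // or the wrregion result itself; in both cases the operand is built with
+  // the wrregion's region parameters.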
+ Register *Reg; + + Value *V = nullptr; + if (DstDesc.GStore) { + auto GV = getUnderlyingGlobalVariable(DstDesc.GStore->getOperand(1)); + assert(GV && "out of sync"); + if (OverrideType == nullptr) + OverrideType = DstDesc.GStore->getOperand(0)->getType(); + Reg = RegAlloc->getRegForValue(KernFunc, GV, Signed, OverrideType); + V = GV; + } else { + V = DstDesc.WrRegion; + Reg = RegAlloc->getRegForValueOrNull(KernFunc, V, Signed, OverrideType); + } + + assert(!Reg || Reg->Category == RegCategory::GENERAL); + + // Write the vISA general operand with region: + Region R(DstDesc.WrRegion, DstDesc.WrRegionBI); + + return createInlineAsmOperand(Reg, &R, true /*IsDst*/, Signed, Ty, Mod); +} + +std::string GenXKernelBuilder::createInlineAsmSourceOperand( + Value *V, genx::Signedness Signed, bool Baled, genx::ConstraintType Ty, + unsigned Mod, unsigned MaxWidth) { + + if (auto C = dyn_cast(V)) { + if (Ty != genx::ConstraintType::Constraint_n) { + if (Mod) { + // Need to negate constant. + assert(Mod == MODIFIER_NEG && "unexpected modifier"); + if (C->getType()->isIntOrIntVectorTy()) + C = ConstantExpr::getNeg(C); + else + C = ConstantExpr::getFNeg(C); + } + VISA_VectorOpnd *ImmOp = createImmediateOperand(C, Signed); + return Kernel->getVectorOperandName(ImmOp, false); + } else { + ConstantInt *CI = cast(C); + return llvm::to_string(CI->getSExtValue()); + } + } + + if (!Baled) { + Register *Reg = RegAlloc->getRegForValue(KernFunc, V, Signed); + Region R(V); + if (R.NumElements == 1) + R.VStride = R.Stride = 0; + + return createInlineAsmOperand(Reg, &R, false /*IsDst*/, Signed, Ty, Mod); + } + + Instruction *Inst = cast(V); + BaleInfo BI(Baling->getBaleInfo(Inst)); + assert(BI.Type == BaleInfo::RDREGION); + // The source operand has a rdregion baled in. We need to allow for the + // case that there is no register allocated if it is an indirected arg, + // and that is OK because the region is indirect so the vISA does not + // contain the base register. + V = Inst->getOperand(0); + Register *Reg = RegAlloc->getRegForValue(KernFunc, V, Signed); + + // Ensure we pick a non-DONTCARESIGNED signedness here, as, for an + // indirect region and DONTCARESIGNED, writeRegion arbitrarily picks a + // signedness as it is attached to the operand, unlike a direct region + // where it is attached to the vISA register. + if (Signed == DONTCARESIGNED) + Signed = SIGNED; + // Write the vISA general operand with region. 
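+  // As with a regular source, the baled rdregion supplies the region
+  // parameters, and degenerate regions get their strides zeroed before the
+  // inline asm operand name is produced.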
+ Region R(Inst, Baling->getBaleInfo(Inst)); + if (R.NumElements == 1) + R.VStride = 0; + if (R.Width == 1) + R.Stride = 0; + + assert(Reg->Category == RegCategory::GENERAL || R.Indirect); + + return createInlineAsmOperand(Reg, &R, false /*IsDst*/, Signed, Ty, Mod); +} + +/*********************************************************************** + * getPredicateVar : get predicate var from value + */ +VISA_PredVar *GenXKernelBuilder::getPredicateVar(Value *V) { + auto Reg = RegAlloc->getRegForValue(KernFunc, V, DONTCARESIGNED); + assert(Reg && Reg->Category == RegCategory::PREDICATE); + return getPredicateVar(Reg); +} + +/*********************************************************************** + * getZeroedPredicateVar : get predicate var from value with zeroing it + */ +VISA_PredVar *GenXKernelBuilder::getZeroedPredicateVar(Value *V) { + auto Reg = RegAlloc->getRegForValue(KernFunc, V, DONTCARESIGNED); + assert(Reg && Reg->Category == RegCategory::PREDICATE); + auto PredVar = getPredicateVar(Reg); + unsigned Size = V->getType()->getPrimitiveSizeInBits(); + auto C = Constant::getNullValue(V->getType()); + CISA_CALL(Kernel->AppendVISASetP( + vISA_EMASK_M1_NM, VISA_Exec_Size(genx::log2(Size)), + PredVar, createImmediateOperand(C, DONTCARESIGNED))); + + return PredVar; +} + +/*********************************************************************** + * getPredicateVar : get predicate var from register + */ +VISA_PredVar *GenXKernelBuilder::getPredicateVar(Register *R) { + assert(R); + return R->Num >= visa::VISA_NUM_RESERVED_PREDICATES + ? R->GetVar(Kernel) + : nullptr; +} + +void GenXKernelBuilder::buildSelectInst(SelectInst *SI, BaleInfo BI, + unsigned Mod, + const DstOpndDesc &DstDesc) { + unsigned ExecSize = 1; + if (VectorType *VT = dyn_cast(SI->getType())) + ExecSize = VT->getNumElements(); + // Get the predicate (mask) operand, scanning through baled in + // all/any/not/rdpredregion and setting PredField and MaskCtrl + // appropriately. + VISA_EMask_Ctrl MaskCtrl; + VISA_PREDICATE_CONTROL Control; + VISA_PREDICATE_STATE State; + + VISA_PredVar *PredDecl = + createPredicateDeclFromSelect(SI, BI, Control, State, &MaskCtrl); + VISA_PredOpnd* PredOp = createPredOperand(PredDecl, State, Control); + + VISA_VectorOpnd *Dst = createDestination(SI, DONTCARESIGNED, Mod, DstDesc); + VISA_VectorOpnd *Src0 = createSourceOperand(SI, DONTCARESIGNED, 1, BI); + VISA_VectorOpnd *Src1 = createSourceOperand(SI, DONTCARESIGNED, 2, BI); + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_SEL, PredOp, Mod & MODIFIER_SAT, MaskCtrl, + getExecSizeFromValue(ExecSize), Dst, Src0, Src1)); +} + +void GenXKernelBuilder::buildBitCast(CastInst *CI, genx::BaleInfo BI, + unsigned Mod, const DstOpndDesc &DstDesc) { + if (!isMaskPacking(CI)) + assert(!BI.Bits && !Mod && !DstDesc.WrRegion && + "non predicate bitcast should not be baled with anything"); + + if (CI->getType()->getScalarType()->isIntegerTy(1)) { + if (CI->getOperand(0)->getType()->getScalarType()->isIntegerTy(1)) { + if (auto C = dyn_cast(CI->getOperand(0))) { + auto Reg = RegAlloc->getRegForValueOrNull(KernFunc, CI, DONTCARESIGNED); + if (!Reg) + return; // write to EM/RM value, ignore + // We can move a constant predicate to a predicate register + // using setp, if we get the constant predicate as a single int. 
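+        // The vector-of-i1 constant is flattened into one integer (widened to
+        // at least 8 bits) and written to the predicate register with setp.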
+ unsigned IntVal = getPredicateConstantAsInt(C); + unsigned Size = C->getType()->getPrimitiveSizeInBits(); + C = ConstantInt::get( + Type::getIntNTy(CI->getContext(), std::max(Size, 8U)), IntVal); + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISASetP( + vISA_EMASK_M1_NM, VISA_Exec_Size(genx::log2(Size)), + getPredicateVar(Reg), createSourceOperand(CI, UNSIGNED, 0, BI))); + return; + } + // There does not appear to be a vISA instruction to move predicate + // to predicate. GenXCoalescing avoids this by moving in two steps + // via a general register. So the only pred->pred bitcast that arrives + // here should be one from GenXLowering, and it should have been copy + // coalesced in GenXCoalescing. + assert(RegAlloc->getRegForValue(KernFunc, CI, DONTCARESIGNED) == + RegAlloc->getRegForValue(KernFunc, CI->getOperand(0), DONTCARESIGNED) && + "uncoalesced phi move of predicate"); + return; + } + + VISA_PredVar *PredVar = getPredicateVar(CI); + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISASetP( + vISA_EMASK_M1_NM, + VISA_Exec_Size( + genx::log2(CI->getType()->getPrimitiveSizeInBits())), + PredVar, createSourceOperand(CI, UNSIGNED, 0, BI))); + return; + } + if (isa(CI->getOperand(0))) { + if (isa(CI->getOperand(0))) + return; // undef source, generate no code + // Source is constant. + int ExecSize = 1; + if (VectorType *VT = dyn_cast(CI->getType())) + ExecSize = VT->getNumElements(); + + VISA_EMask_Ctrl ctrlMask = getExecMaskFromWrRegion(DstDesc, true); + VISA_Exec_Size execSize = getExecSizeFromValue(ExecSize); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, createPredFromWrRegion(DstDesc), Mod & MODIFIER_SAT, ctrlMask, + execSize, createDestination(CI, DONTCARESIGNED, Mod, DstDesc), + createSourceOperand(CI, DONTCARESIGNED, 0, BI))); + return; + } + if (CI->getOperand(0)->getType()->getScalarType()->isIntegerTy(1)) { + // Bitcast from predicate to scalar int + Register *PredReg = + RegAlloc->getRegForValue(KernFunc, CI->getOperand(0), DONTCARESIGNED); + assert(PredReg->Category == RegCategory::PREDICATE); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISAPredicateMove( + createDestination(CI, UNSIGNED, 0, DstDesc), + PredReg->GetVar(Kernel))); + + return; + } + + // Real bitcast with possibly different types. Use whichever type has the + // largest element size, so we minimize the number of channels used in the + // move. + Type *Ty = CI->getOperand(0)->getType(); + if (Ty->getScalarType()->getPrimitiveSizeInBits() < + CI->getType()->getScalarType()->getPrimitiveSizeInBits()) + Ty = CI->getType(); + if (Liveness->isBitCastCoalesced(cast(CI))) + return; // bitcast was coalesced away + Register *DstReg = RegAlloc->getRegForValue(KernFunc, CI, DONTCARESIGNED, Ty); + // Give dest and source the same signedness for byte mov. + auto Signed = RegAlloc->getSigned(DstReg); + Register *SrcReg = RegAlloc->getRegForValue(KernFunc, CI->getOperand(0), Signed, Ty); + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + if (VectorType *VT = dyn_cast(Ty)) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + assert(ExecSize >= EXEC_SIZE_1 && ExecSize <= EXEC_SIZE_32 && + "illegal exec size in bitcast: should have been coalesced away"); + // destination + Region DestR(CI); + // source + Region SourceR(CI->getOperand(0)); + + VISA_EMask_Ctrl ctrlMask = NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1; + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, Mod, ctrlMask, ExecSize, + createRegionOperand(&DestR, DstReg->GetVar(Kernel), DONTCARESIGNED, + 0, true), + createRegionOperand(&SourceR, SrcReg->GetVar(Kernel), Signed, 0, + false))); +} + +void GenXKernelBuilder::buildFunctionAddr(Instruction *Inst, + const DstOpndDesc &DstDesc) { + + auto *Dst = createDestination(Inst, DONTCARESIGNED, MODIFIER_NONE, DstDesc); + assert(Dst); + auto *F = cast(cast(Inst)->getPointerOperand()); + CISA_CALL(Kernel->AppendVISACFSymbolInst(F->getName(), Dst)); +} + +/*********************************************************************** + * buildLoneWrRegion : build a lone wrregion + */ +void GenXKernelBuilder::buildLoneWrRegion(const DstOpndDesc &DstDesc) { + enum { OperandNum = 1 }; + Value *Input = DstDesc.WrRegion->getOperand(OperandNum); + if (isa(Input)) + return; // No code if input is undef + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + if (VectorType *VT = dyn_cast(Input->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + + VISA_EMask_Ctrl ExecMask = getExecMaskFromWrRegion(DstDesc, true); + + // TODO: fix signedness of the source + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, createPredFromWrRegion(DstDesc), false, ExecMask, ExecSize, + createDestination(Input, DONTCARESIGNED, 0, DstDesc), + createSource(Input, DONTCARESIGNED, false, 0))); +} + +/*********************************************************************** + * buildLoneWrPredRegion : build a lone wrpredregion + */ +void GenXKernelBuilder::buildLoneWrPredRegion(Instruction *Inst, BaleInfo BI) { + enum { OperandNum = 1 }; + Value *Input = Inst->getOperand(OperandNum); + assert(isa(Input)); + auto C = dyn_cast(Input); + assert(C); + unsigned Size = C->getType()->getPrimitiveSizeInBits(); + + VISA_EMask_Ctrl ctrlMask = getExecMaskFromWrPredRegion(Inst, true); + VISA_Exec_Size execSize = getExecSizeFromValue(Size); + + unsigned IntVal = getPredicateConstantAsInt(C); + C = ConstantInt::get(Type::getIntNTy(Inst->getContext(), std::max(Size, 8U)), + IntVal); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISASetP(ctrlMask, execSize, getPredicateVar(Inst), + createImmediateOperand(C, UNSIGNED))); +} + +/*********************************************************************** + * buildLoneOperand : build a rdregion or modifier that is not baled in to + * a main instruction + * + * Enter: Inst = the rdregion or modifier instruction + * BI = BaleInfo for Inst + * Mod = modifier for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion (possibly baling in + * variable index add) + */ +void GenXKernelBuilder::buildLoneOperand(Instruction *Inst, genx::BaleInfo BI, + unsigned Mod, + const DstOpndDesc &DstDesc) { + Instruction *WrRegion = DstDesc.WrRegion; + BaleInfo WrRegionBI = DstDesc.WrRegionBI; + + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + if (VectorType *VT = dyn_cast(Inst->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + ISA_Opcode Opcode = ISA_MOV; + bool Baled = true; + VISA_EMask_Ctrl ExecMask = getExecMaskFromWrRegion(DstDesc); + // Default source from Inst + Value *Src = Inst; + + // Give dest and source the same signedness for byte mov. 
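+  // createDestination appears to report back, through &Signed, the
+  // signedness it actually chose for the destination; that value is then
+  // reused when the source operand is built below.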
+ auto Signed = DONTCARESIGNED; + // destination + auto Dest = createDestination(Inst, Signed, Mod, DstDesc, &Signed); + + // source + if ((Mod & MODIFIER_SAT) != 0 && + Inst->getType()->getScalarType()->isIntegerTy() && + GenXIntrinsic::isIntegerSat(Inst->user_back())) + Signed = getISatSrcSign(Inst->user_back()); + + if (BI.Type == BaleInfo::NOTMOD) { + // A lone "not" is implemented as a not instruction, rather than a mov + // with a not modifier. A mov only allows an arithmetic modifier. + Opcode = ISA_NOT; + Baled = BI.isOperandBaled(0); + // In this case the src is actually operand 0 of the noti intrinsic + Src = Inst->getOperand(0); + } else if (BI.Type == BaleInfo::RDREGION && !Mod) { + Register *DstReg; + if (WrRegion) { + DstReg = RegAlloc->getRegForValueOrNull(KernFunc, WrRegion, DONTCARESIGNED); + } else { + DstReg = RegAlloc->getRegForValue(KernFunc, Inst, DONTCARESIGNED); + } + if (DstReg && (DstReg->Category == RegCategory::SURFACE || + DstReg->Category == RegCategory::SAMPLER || + DstReg->Category == RegCategory::VME)) { + Opcode = ISA_MOVS; + } + } + // TODO: mb need to get signed from dest for src and then modify that + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + Opcode, (Opcode != ISA_MOVS ? createPredFromWrRegion(DstDesc) : nullptr), + Mod & MODIFIER_SAT, ExecMask, ExecSize, Dest, + createSource(Src, Signed, Baled, 0))); +} + +static unsigned getResultedTypeSize(Type *Ty) { + unsigned TySz = 0; + if (Ty->isVectorTy()) + TySz = Ty->getVectorNumElements() * + getResultedTypeSize(Ty->getVectorElementType()); + else if (Ty->isArrayTy()) + TySz = Ty->getArrayNumElements() * + getResultedTypeSize(Ty->getArrayElementType()); + else if (Ty->isStructTy()) { + StructType *STy = dyn_cast(Ty); + assert(STy); + for (Type *Ty : STy->elements()) + TySz += getResultedTypeSize(Ty); + } else if (Ty->isPointerTy() && Ty->getPointerElementType()->isFunctionTy()) { + TySz = BYTES_PER_FADDR; + } else { + TySz = Ty->getPrimitiveSizeInBits() / CHAR_BIT; + assert(TySz && "Ty is not primitive?"); + } + + return TySz; +} + +// Check if we're trying to form return value of a structure type +// TODO: should check full insert/extract chain (for failed coalescing cases), +// e.g. 
after failed coalescing we may end up having a bunch of +// extractvalue, insertvalue and bitcasts inst where only the last one +// should be actually lowered +static bool checkInsertToRetv(InsertValueInst *Inst) { + if (auto IVI = dyn_cast(Inst->use_begin()->getUser())) + return checkInsertToRetv(IVI); + else if (auto RI = dyn_cast(Inst->use_begin()->getUser())) + return RI->getFunction()->hasFnAttribute(genx::FunctionMD::CMStackCall) || + RI->getFunction()->hasFnAttribute( + genx::FunctionMD::ReferencedIndirectly); + return false; +} + +/*********************************************************************** + * buildMainInst : build a main instruction + * + * Enter: Inst = the main instruction + * BI = BaleInfo for Inst + * Mod = modifier bits for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion (possibly baling in + * variable index add) + * + * Return: true if terminator inst that falls through to following block + */ +bool GenXKernelBuilder::buildMainInst(Instruction *Inst, BaleInfo BI, + unsigned Mod, + const DstOpndDesc &DstDesc) { + if (PHINode *Phi = dyn_cast(Inst)) + buildPhiNode(Phi); + else if (ReturnInst *RI = dyn_cast(Inst)) { + buildRet(RI); + } else if (BranchInst *BR = dyn_cast(Inst)) { + return buildBranch(BR); + } else if (CmpInst *Cmp = dyn_cast(Inst)) { + buildCmp(Cmp, BI, DstDesc); + } else if (BinaryOperator *BO = dyn_cast(Inst)) { + if (!BO->getType()->getScalarType()->isIntegerTy(1)) { + buildBinaryOperator(BO, BI, Mod, DstDesc); + } else { + assert(!Mod && !DstDesc.WrRegion && !BI.isOperandBaled(0) && + !BI.isOperandBaled(1)); + buildBoolBinaryOperator(BO); + } + } else if (auto EVI = dyn_cast(Inst)) { + if (auto *CI = dyn_cast(Inst->getOperand(0))) + // translate extraction of structured type from retv + if (!CI->isInlineAsm() && (CI->getCalledFunction()->hasFnAttribute( + genx::FunctionMD::CMStackCall) || + CI->isIndirectCall())) + buildExtractRetv(EVI); + // no code generated + } else if (auto IVI = dyn_cast(Inst)) { + if (checkInsertToRetv(IVI) + // TODO: safely remove this tmp workaround for failed coalescing cases + // and insert-extract-insert chains + && !isa(Inst->getOperand(1))) + RetvInserts.push_back(IVI); + // no code generated + } else if (BitCastInst *BCI = dyn_cast(Inst)) { + buildBitCast(BCI, BI, Mod, DstDesc); + } else if (CastInst *CI = dyn_cast(Inst)) { + auto ptr2Int = dyn_cast(CI); + if (ptr2Int && isa(CI->getOperand(0))) { + buildSymbolInst(ptr2Int, Mod, DstDesc); + } else { + buildCastInst(CI, BI, Mod, DstDesc); + } + } else if (auto SI = dyn_cast(Inst)) { + buildSelectInst(SI, BI, Mod, DstDesc); + } else if (auto LI = dyn_cast(Inst)) { + (void)LI; // no code generated + } else if (auto GEPI = dyn_cast(Inst)) { + // check if gepi def is used in intrinsic, otherwise report error + auto GepiChecker = [](Use &ui) { + auto ci = cast(ui.getUser()); + Function *Callee = ci->getCalledFunction(); + unsigned IntrinID = GenXIntrinsic::getAnyIntrinsicID(Callee); + return (IntrinID == GenXIntrinsic::genx_print_format_index); + }; + if (!std::all_of(GEPI->use_begin(), GEPI->use_end(), GepiChecker)) { + report_fatal_error("gep is supported only for printf"); + } +#if (LLVM_VERSION_MAJOR > 8) + } else if (UnaryOperator *UO = dyn_cast(Inst)) { + buildUnaryOperator(UO, BI, Mod, DstDesc); +#endif + } else if (auto *CI = dyn_cast(Inst)) { + if (CI->isInlineAsm()) + buildInlineAsm(CI); + else if (CI->isIndirectCall()) { + assert(!Mod && !DstDesc.WrRegion && + "cannot bale subroutine call into anything"); + 
buildCall(CI, DstDesc); + } else { + Function *Callee = CI->getCalledFunction(); + unsigned IntrinID = GenXIntrinsic::getAnyIntrinsicID(Callee); + switch (IntrinID) { + case Intrinsic::dbg_value: + case Intrinsic::dbg_declare: + case GenXIntrinsic::genx_predefined_surface: + case GenXIntrinsic::genx_output: + // ignore + break; + case GenXIntrinsic::genx_simdcf_goto: + // A goto that is not baled into a branch (via an extractvalue) + buildGoto(CI, nullptr); + break; + case GenXIntrinsic::genx_simdcf_join: + // A join that is not baled into a branch (via an extractvalue) + buildJoin(CI, nullptr); + break; + case GenXIntrinsic::genx_convert: + buildConvert(CI, BI, Mod, DstDesc); + break; + case GenXIntrinsic::genx_print_format_index: + buildPrintIndex(CI, IntrinID, Mod, DstDesc); + break; + case GenXIntrinsic::genx_convert_addr: + buildConvertAddr(CI, BI, Mod, DstDesc); + break; + case GenXIntrinsic::genx_alloca: + buildAlloca(CI, IntrinID, Mod, DstDesc); + break; + case GenXIntrinsic::genx_constanti: + case GenXIntrinsic::genx_constantf: + case GenXIntrinsic::genx_constantpred: + if (isa(CI->getOperand(0))) + return false; // Omit llvm.genx.constant with undef operand. + if (!DstDesc.WrRegion && !RegAlloc->getRegForValueOrNull(KernFunc, CI)) + return false; // Omit llvm.genx.constantpred that is EM or RM and so + // does not have a register allocated. + // fall through... + case GenXIntrinsic::genx_barrier: + HasBarrier = true; + default: + if (!(CI->user_empty() && + GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_any)) + buildIntrinsic(CI, IntrinID, BI, Mod, DstDesc); + break; + case GenXIntrinsic::not_any_intrinsic: + assert(!Mod && !DstDesc.WrRegion && + "cannot bale subroutine call into anything"); + buildCall(CI, DstDesc); + break; + } + } + } else if (isa(Inst)) + ; // no code generated + else + report_fatal_error("main inst not implemented"); + + return false; +} + +/*********************************************************************** + * buildPhiNode : build code for a phi node + * + * A phi node generates no code because coalescing has ensured that all + * incomings and the result are in the same register. This function just + * asserts that that is the case. + */ +void GenXKernelBuilder::buildPhiNode(PHINode *Phi) { +#ifndef NDEBUG + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = Phi->getIncomingValue(i); + // This assert has to cope with the case that the phi node has no live + // range because it is part of an indirected arg/retval in + // GenXArgIndirection, or it is an EM/RM category. 
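+    // Only incomings that are not undef, that have a live range, and whose
+    // category is a real register category are checked against the phi's
+    // own live range.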
+ if (!isa(Incoming)) + if (auto LR = Liveness->getLiveRangeOrNull(Incoming)) + if (LR->getCategory() < RegCategory::NUMREALCATEGORIES) + assert(LR == Liveness->getLiveRangeOrNull(Phi) && + "mismatched registers in phi node"); + } +#endif +} + +/*********************************************************************** + * buildGoto : translate a goto + * + * Enter: Goto = goto instruction that is baled into an extractvalue of + * field 2 (the !any(EM) value), that is baled into Branch + * Branch = branch instruction, 0 if this is a goto that is not + * baled into a branch, which happens when the goto is + * followed by a join point so the goto's JIP points there, + * and LLVM changes the resulting conditional branch with + * both successors the same into an unconditional branch + */ +void GenXKernelBuilder::buildGoto(CallInst *Goto, BranchInst *Branch) { + // GenXSimdCFConformance and GenXTidyControlFlow ensure that we have either + // 1. a forward goto, where the false successor is fallthrough; or + // 2. a backward goto, where the UIP (the join whose RM the goto updates) + // and the true successor are both fallthrough, and the false successor + // is the top of the loop. + // (1) generates a vISA forward goto, but the condition has the wrong sense + // so we need to invert it. + // (2) generates a vISA backward goto. + Value *BranchTarget = nullptr; + VISA_PREDICATE_STATE StateInvert = PredState_NO_INVERSE; + if (!Branch || + Branch->getSuccessor(1) == Branch->getParent()->getNextNode()) { + // Forward goto. Find the join. + auto Join = GotoJoin::findJoin(Goto); + assert(Join && "join not found"); + BranchTarget = Join; + StateInvert = PredState_INVERSE; + } else { + assert(Branch->getSuccessor(0) == Branch->getParent()->getNextNode() && + "bad goto structure"); + // Backward branch. + BranchTarget = Branch->getSuccessor(1); + } + // Get the condition. + VISA_EMask_Ctrl Mask = vISA_EMASK_M1; + VISA_PREDICATE_CONTROL Control = PRED_CTRL_NON; + VISA_PREDICATE_STATE State = PredState_NO_INVERSE; + + Value *Pred = getPredicateOperand( + Goto, 2 /*OperandNum*/, Baling->getBaleInfo(Goto), Control, State, &Mask); + assert(!Mask && "cannot have rdpredregion baled into goto"); + + Register *PredReg = nullptr; + if (auto C = dyn_cast(Pred)) { + (void)C; + if (StateInvert) + assert(C->isNullValue() && + "predication operand must be constant 0 or not constant"); + else + assert(C->isAllOnesValue() && + "predication operand must be constant 1 or not constant"); + } else { + State ^= StateInvert; + PredReg = RegAlloc->getRegForValueOrNull(KernFunc, Pred); + assert(PredReg && PredReg->Category == RegCategory::PREDICATE); + } + + uint8_t execSize = genx::log2(Pred->getType()->getVectorNumElements()); + + // Visa decoder part + VISA_EMask_Ctrl emask = + VISA_EMask_Ctrl((execSize >> 0x4) & 0xF); + VISA_Exec_Size esize = (VISA_Exec_Size)((execSize)&0xF); + + VISA_PredOpnd *pred = nullptr; + if (PredReg) { + VISA_PredVar *Decl = getPredicateVar(PredReg); + VISA_PredOpnd *opnd = createPredOperand(Decl, State, Control); + pred = opnd; + } + + unsigned LabelID = getOrCreateLabel(BranchTarget, LABEL_BLOCK); + + VISA_LabelOpnd *label = Labels[LabelID]; + addDebugInfo(); + CISA_CALL(Kernel->AppendVISACFGotoInst(pred, emask, esize, label)); +} + +// Convert predicate offset to EM offset according to +// vISA spec 3.3.1 Execution Mask. 
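+// The offset is expected in multiples of 4 channels, so the mapping below is
+// effectively M(PredOffset / 4 + 1); any other offset is rejected.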
+static VISA_EMask_Ctrl getVisaEMOffset(unsigned PredOffset) { + switch (PredOffset) { + case 0: + return vISA_EMASK_M1; + case 4: + return vISA_EMASK_M2; + case 8: + return vISA_EMASK_M3; + case 12: + return vISA_EMASK_M4; + case 16: + return vISA_EMASK_M5; + case 20: + return vISA_EMASK_M6; + case 24: + return vISA_EMASK_M7; + case 28: + return vISA_EMASK_M8; + } + llvm_unreachable("Unexpected EM offset"); +} + +/*********************************************************************** + * getPredicateOperand : get predicate operand, scanning through any baled + * in rdpredregion, all, any, not instructions to derive the mask control + * field and the predication field + * + * Enter: Inst = instruction to get predicate operand from + * OperandNum = operand number in Inst + * BI = bale info for Inst + * *Control = where to write control information about predicate + * *State = where to write state information about predicate + * *MaskCtrl = where to write mask control field (bits 7..4) + * + * Return: Value of mask after scanning through baled in instructions + * *PredField and *MaskCtrl set + */ +Value *GenXKernelBuilder::getPredicateOperand( + Instruction *Inst, unsigned OperandNum, BaleInfo BI, + VISA_PREDICATE_CONTROL &Control, VISA_PREDICATE_STATE &State, + VISA_EMask_Ctrl *MaskCtrl) { + State = PredState_NO_INVERSE; + *MaskCtrl = vISA_EMASK_M1; + Control = PRED_CTRL_NON; + Value *Mask = Inst->getOperand(OperandNum); + // Check for baled in all/any/notp/rdpredregion. + while (BI.isOperandBaled(OperandNum)) { + Instruction *Inst = dyn_cast(Mask); + if (isNot(Inst)) { + if (Control != PRED_CTRL_NON) { + // switch any<->all as well as invert bit + Control ^= (VISA_PREDICATE_CONTROL)(PRED_CTRL_ANY | PRED_CTRL_ALL); + State ^= PredState_INVERSE; + } else { + // all/any not set, just invert invert bit + State ^= PredState_INVERSE; + } + OperandNum = 0; + assert(Inst); + Mask = Inst->getOperand(OperandNum); + BI = Baling->getBaleInfo(Inst); + continue; + } + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_all: + Control |= PRED_CTRL_ALL; // predicate combine field = "all" + OperandNum = 0; + Mask = Inst->getOperand(OperandNum); + BI = Baling->getBaleInfo(Inst); + continue; + case GenXIntrinsic::genx_any: + Control |= PRED_CTRL_ANY; // predicate combine field = "any" + OperandNum = 0; + Mask = Inst->getOperand(OperandNum); + BI = Baling->getBaleInfo(Inst); + continue; + case GenXIntrinsic::genx_rdpredregion: { + // Baled in rdpredregion. Use its constant offset for the mask control + // field. + unsigned MaskOffset = + cast(Inst->getOperand(1))->getSExtValue(); + *MaskCtrl = getVisaEMOffset(MaskOffset); + Mask = Inst->getOperand(0); + break; + } + default: + break; + } + // Baled shufflepred. Mask offset is deduced from initial value of slice. + if (auto *SVI = dyn_cast(Inst)) { + unsigned MaskOffset = + ShuffleVectorAnalyzer::getReplicatedSliceDescriptor(SVI) + .InitialOffset; + *MaskCtrl = getVisaEMOffset(MaskOffset); + Mask = SVI->getOperand(0); + } + break; + } + return Mask; +} + +void GenXKernelBuilder::AddGenVar(Register &Reg) { + auto &DL = FG->getModule()->getDataLayout(); + + VISA_GenVar *parentDecl = nullptr; + VISA_GenVar *Decl = nullptr; + + if (!Reg.AliasTo) { + // This is not an aliased register. Go through all the aliases and + // determine the biggest alignment required. If the register is at least + // as big as a GRF, make the alignment GRF. + unsigned Alignment = 5; // GRF alignment + Type *Ty = Reg.Ty; + unsigned NBits = Ty->isPointerTy() ? 
DL.getPointerSizeInBits()
+                                       : Ty->getPrimitiveSizeInBits();
+    if (NBits < GrfByteSize * 8 /* bits in GRF */) {
+      Alignment = 0;
+      for (Register *AliasReg = &Reg; AliasReg;
+           AliasReg = AliasReg->NextAlias[KernFunc]) {
+        Type *AliasTy = AliasReg->Ty->getScalarType();
+        unsigned ThisElementBytes = AliasTy->isPointerTy()
+                                        ? DL.getPointerTypeSize(AliasTy)
+                                        : AliasTy->getPrimitiveSizeInBits() / 8;
+        unsigned LogThisElementBytes = genx::log2(ThisElementBytes);
+        if (LogThisElementBytes > Alignment)
+          Alignment = LogThisElementBytes;
+        if (AliasReg->Alignment > Alignment)
+          Alignment = AliasReg->Alignment;
+      }
+    }
+    for (Register *AliasReg = &Reg; AliasReg;
+         AliasReg = AliasReg->NextAlias[KernFunc]) {
+      if (AliasReg->Alignment < Alignment)
+        AliasReg->Alignment = Alignment;
+    }
+  } else {
+    if (Reg.AliasTo->Num < visa::VISA_NUM_RESERVED_REGS) {
+      CISA_CALL(Kernel->GetPredefinedVar(parentDecl,
+                                         (PreDefined_Vars)Reg.AliasTo->Num));
+      assert(parentDecl && "Predefined variable is null");
+    } else {
+      parentDecl = Reg.AliasTo->GetVar(Kernel);
+      assert(parentDecl && "Refers to undefined var");
+    }
+  }
+
+  visa::TypeDetails TD(DL, Reg.Ty, Reg.Signed);
+
+  CISA_CALL(Kernel->CreateVISAGenVar(
+      Decl, Reg.NameStr.c_str(), TD.NumElements,
+      static_cast<VISA_Type>(TD.VisaType),
+      // 0x7 is a hack because for some reason
+      // alignment can be a large number
+      static_cast<VISA_Align>(Reg.Alignment & 0x7), parentDecl, 0));
+
+  Reg.SetVar(Kernel, Decl);
+
+  for (auto &Attr : Reg.Attributes) {
+    CISA_CALL(Kernel->AddAttributeToVar(
+        Decl, getStringByIndex(Attr.first).begin(), Attr.second.size(),
+        (void *)(Attr.second.c_str())));
+  }
+}
+/**************************************************************************************************
+ * Scan the IR to collect information about whether the kernel has a callable
+ * function or a barrier.
+ */ +void GenXKernelBuilder::collectKernelInfo() { + for (auto It = FG->begin(), E = FG->end(); It != E; ++It) { + auto Func = *It; + HasStackcalls |= + Func->hasFnAttribute(genx::FunctionMD::CMStackCall) || + Func->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly); + for (auto &BB : *Func) { + for (auto &I : BB) { + if (CallInst *CI = dyn_cast(&I)) { + if (CI->isInlineAsm()) + continue; + if (GenXIntrinsicInst *II = dyn_cast(CI)) { + auto IID = II->getIntrinsicID(); + if (IID == GenXIntrinsic::genx_barrier) + HasBarrier = true; + else if (IID == GenXIntrinsic::genx_alloca) + HasAlloca = true; + } else { + Function *Callee = CI->getCalledFunction(); + if (Callee && Callee->hasFnAttribute("CMCallable")) + HasCallable = true; + } + } + } + } + } +} +/************************************************************************************************** + * Build variables + */ +void GenXKernelBuilder::buildVariables() { + RegAlloc->SetRegPushHook(this, [](void *Object, GenXVisaRegAlloc::Reg &Reg) { + static_cast(Object)->AddGenVar(Reg); + }); + + for (auto &It : RegAlloc->getRegStorage()) { + Register *Reg = &(It); + switch (Reg->Category) { + case RegCategory::GENERAL: + AddGenVar(*Reg); + break; + + case RegCategory::ADDRESS: { + VISA_AddrVar *Decl = nullptr; + unsigned NumElements = 1; + if (VectorType *VT = dyn_cast(Reg->Ty)) + NumElements = VT->getNumElements(); + CISA_CALL( + Kernel->CreateVISAAddrVar(Decl, Reg->NameStr.c_str(), NumElements)); + Reg->SetVar(Kernel, Decl); + for (auto &Attr : Reg->Attributes) { + CISA_CALL(Kernel->AddAttributeToVar( + Decl, getStringByIndex(Attr.first).begin(), Attr.second.size(), + (void *)(Attr.second.c_str()))); + } + } break; + + case RegCategory::PREDICATE: { + VISA_PredVar *Decl = nullptr; + unsigned NumElements = 1; + if (VectorType *VT = dyn_cast(Reg->Ty)) + NumElements = VT->getNumElements(); + CISA_CALL( + Kernel->CreateVISAPredVar(Decl, Reg->NameStr.c_str(), NumElements)); + Reg->SetVar(Kernel, Decl); + for (auto &Attr : Reg->Attributes) { + CISA_CALL(Kernel->AddAttributeToVar( + Decl, getStringByIndex(Attr.first).begin(), Attr.second.size(), + (void *)(Attr.second.c_str()))); + } + } break; + + case RegCategory::SAMPLER: { + unsigned NumElements = 1; + if (VectorType *VT = dyn_cast(Reg->Ty)) + NumElements = VT->getNumElements(); + VISA_SamplerVar *Decl = nullptr; + CISA_CALL(Kernel->CreateVISASamplerVar(Decl, Reg->NameStr.c_str(), + NumElements)); + Reg->SetVar(Kernel, Decl); + } break; + + case RegCategory::SURFACE: { + VISA_SurfaceVar *Decl = nullptr; + if (Reg->Num < visa::VISA_NUM_RESERVED_SURFACES) { + Kernel->GetPredefinedSurface(Decl, (PreDefined_Surface)Reg->Num); + } else { + unsigned NumElements = 1; + if (VectorType *VT = dyn_cast(Reg->Ty)) + NumElements = VT->getNumElements(); + + CISA_CALL(Kernel->CreateVISASurfaceVar(Decl, Reg->NameStr.c_str(), + NumElements)); + } + Reg->SetVar(Kernel, Decl); + } break; + + case RegCategory::VME: + report_fatal_error("VME variable is no longer supported"); + break; + + default: + report_fatal_error("Unknown category for register"); + break; + } + } + + VISA_GenVar *ArgDecl = nullptr, *RetDecl = nullptr; + Kernel->GetPredefinedVar(ArgDecl, PREDEFINED_ARG); + Kernel->GetPredefinedVar(RetDecl, PREDEFINED_RET); + createCisaVariable(Kernel, "argv", ArgDecl, ARG_SIZE_IN_GRFS * GrfByteSize); + createCisaVariable(Kernel, "retv", RetDecl, RET_SIZE_IN_GRFS * GrfByteSize); +} + +/*********************************************************************** + * getExecMaskFromWrPredRegion : write exec size 
field from wrpredregion
+ * or wrpredpredregion instruction
+ *
+ * Enter:  ExecSize = execution size
+ *         WrPredRegion = 0 else wrpredregion instruction
+ *
+ * The exec size byte includes the mask control field, which we need to set
+ * up from the wrpredregion/wrpredpredregion.
+ */
+VISA_EMask_Ctrl
+GenXKernelBuilder::getExecMaskFromWrPredRegion(Instruction *WrPredRegion,
+                                               bool IsNoMask) {
+  VISA_EMask_Ctrl MaskCtrl =
+      (IsNoMask | NoMask) ? vISA_EMASK_M1_NM : vISA_EMASK_M1;
+  if (WrPredRegion) {
+    // Get the mask control field from the offset in the wrpredregion.
+    unsigned MaskOffset =
+        cast<ConstantInt>(WrPredRegion->getOperand(2))->getSExtValue();
+    assert(MaskOffset < 32 && "unexpected mask offset");
+    MaskCtrl = static_cast<VISA_EMask_Ctrl>(MaskOffset >> 2);
+  }
+
+  // Set to NoMask if requested. Otherwise use the default NM mode
+  // when WrPredRegion is null.
+  if ((IsNoMask && MaskCtrl < vISA_EMASK_M1_NM) ||
+      (!WrPredRegion && NoMask && MaskCtrl < vISA_EMASK_M1_NM))
+    MaskCtrl = static_cast<VISA_EMask_Ctrl>(
+        static_cast<unsigned>(MaskCtrl) + vISA_EMASK_M1_NM);
+
+  return MaskCtrl;
+}
+
+/***********************************************************************
+ * getExecMaskFromWrRegion : get exec size field from wrregion instruction
+ *
+ * Enter:  ExecSize = execution size
+ *         WrRegion = 0 else wrregion instruction
+ *         WrRegionBI = BaleInfo for wrregion, so we can see if there is a
+ *                      rdpredregion baled in to the mask
+ *
+ * If WrRegion != 0, and it has a mask that is not constant 1, then the
+ * mask must be a predicate register.
+ *
+ * The exec size byte includes the mask control field, which we need to set
+ * up from any rdpredregion baled in to a predicated wrregion.
+ *
+ * If the predicate has no register allocated, it must be EM, and we set the
+ * instruction to be masked. Otherwise we set nomask.
+ */
+VISA_EMask_Ctrl
+GenXKernelBuilder::getExecMaskFromWrRegion(const DstOpndDesc &DstDesc,
+                                           bool IsNoMask) {
+  // Override mask control if requested.
+  auto MaskCtrl = (IsNoMask | NoMask) ? vISA_EMASK_M1_NM : vISA_EMASK_M1;
+
+  if (DstDesc.WrRegion) {
+    // Get the predicate (mask) operand, scanning through baled in
+    // all/any/not/rdpredregion and setting PredField and MaskCtrl
+    // appropriately.
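+    // If the mask is a constant or has a real predicate register allocated
+    // (i.e. it is not the execution mask itself), NoMask mode may be applied
+    // when the builder is in NoMask state.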
+ VISA_PREDICATE_CONTROL Control = PRED_CTRL_NON; + VISA_PREDICATE_STATE State = PredState_NO_INVERSE; + Value *Mask = + getPredicateOperand(DstDesc.WrRegion, 7 /*mask operand in wrregion*/, + DstDesc.WrRegionBI, Control, State, &MaskCtrl); + if ((isa(Mask) || RegAlloc->getRegForValueOrNull(KernFunc, Mask)) && NoMask) + MaskCtrl |= vISA_EMASK_M1_NM; + } + return MaskCtrl; +} + +/*********************************************************************** + * buildIntrinsic : build code for an intrinsic + * + * Enter: CI = the CallInst + * IntrinID = intrinsic ID + * BI = BaleInfo for the instruction + * Mod = modifier bits for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion + */ +void GenXKernelBuilder::buildIntrinsic(CallInst *CI, unsigned IntrinID, + BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc) { + using II = GenXIntrinsicInfo; + LLVM_DEBUG(dbgs() << "buildIntrinsic: " << *CI << "\n"); + + int MaxRawOperands = std::numeric_limits::max(); + + // TODO: replace lambdas by methods + + auto GetUnsignedValue = [&](II::ArgInfo AI) { + ConstantInt *Const = + dyn_cast(CI->getArgOperand(AI.getArgIdx())); + if (!Const) + report_fatal_error("Incorrect args to intrinsic call"); + return static_cast(Const->getSExtValue()); + }; + + auto CreateSurfaceOperand = [&](II::ArgInfo AI) { + llvm::Value *Arg = CI->getArgOperand(AI.getArgIdx()); + VISA_SurfaceVar *SurfDecl = nullptr; + int Index = visa::convertToSurfaceIndex(Arg); + if (visa::isReservedSurfaceIndex(Index)) { + Kernel->GetPredefinedSurface(SurfDecl, visa::getReservedSurface(Index)); + } else { + Register *Reg = RegAlloc->getRegForValue(KernFunc, Arg); + assert(Reg->Category == RegCategory::SURFACE && + "Expected surface register"); + SurfDecl = Reg->GetVar(Kernel); + } + VISA_StateOpndHandle *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISAStateOperandHandle(ResultOperand, SurfDecl)); + return ResultOperand; + }; + + auto CreateSamplerOperand = [&](II::ArgInfo AI) { + Register *Reg = RegAlloc->getRegForValue(KernFunc, CI->getArgOperand(AI.getArgIdx())); + assert(Reg->Category == RegCategory::SAMPLER && + "Expected sampler register"); + VISA_StateOpndHandle *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISAStateOperandHandle( + ResultOperand, Reg->GetVar(Kernel))); + return ResultOperand; + }; + + auto GetMediaHeght = [&](II::ArgInfo AI) { + // constant byte for media height that we need to infer from the + // media width and the return type or final arg + ConstantInt *Const = + dyn_cast(CI->getArgOperand(AI.getArgIdx())); + if (!Const) + report_fatal_error("Incorrect args to intrinsic call"); + unsigned Width = Const->getZExtValue(); + if (Width == 0 || Width > 64) + report_fatal_error("Invalid media width"); + unsigned RoundedWidth = 1 << genx::log2(Width); + if (RoundedWidth < Width) + RoundedWidth *= 2; + if (RoundedWidth < 4) + RoundedWidth = 4; + Type *DataType = CI->getType(); + if (DataType->isVoidTy()) + DataType = CI->getOperand(CI->getNumArgOperands() - 1)->getType(); + unsigned DataSize; + if (VectorType *VT = dyn_cast(DataType)) + DataSize = VT->getElementType()->getPrimitiveSizeInBits() / 8 * + VT->getNumElements(); + else + DataSize = DataType->getPrimitiveSizeInBits() / 8; + if (DataSize <= RoundedWidth && DataSize >= Width) + return static_cast(1); + if (DataSize % RoundedWidth) + report_fatal_error("Invalid media width"); + return static_cast(DataSize / RoundedWidth); + }; + + auto CreateOperand = [&](II::ArgInfo AI) { + VISA_VectorOpnd *ResultOperand = 
nullptr; + Signedness Signed = DONTCARESIGNED; + if (AI.needsSigned()) + Signed = SIGNED; + else if (AI.needsUnsigned()) + Signed = UNSIGNED; + if (AI.isRet()) { + if (AI.getSaturation() == II::SATURATION_SATURATE) + Mod |= MODIFIER_SAT; + ResultOperand = createDestination(CI, Signed, Mod, DstDesc); + } else { + unsigned MaxWidth = 16; + if (AI.getRestriction() == II::TWICEWIDTH) { + // For a TWICEWIDTH operand, do not allow width bigger than the + // execution size. + MaxWidth = CI->getType()->getVectorNumElements(); + } + ResultOperand = createSourceOperand(CI, Signed, AI.getArgIdx(), BI, 0, + nullptr, MaxWidth); + } + return ResultOperand; + }; + + auto CreateRawOperand = [&](II::ArgInfo AI) { + VISA_RawOpnd *ResultOperand = nullptr; + auto Signed = DONTCARESIGNED; + if (AI.needsSigned()) + Signed = SIGNED; + else if (AI.needsUnsigned()) + Signed = UNSIGNED; + if (AI.isRet()) { + assert(!Mod); + ResultOperand = createRawDestination(CI, DstDesc, Signed); + } else if (AI.getArgIdx() < MaxRawOperands) + ResultOperand = createRawSourceOperand(CI, AI.getArgIdx(), BI, Signed); + return ResultOperand; + }; + + auto CreateRawOperands = [&](II::ArgInfo AI, VISA_RawOpnd **Operands) { + assert(MaxRawOperands != std::numeric_limits::max() && + "MaxRawOperands must be defined"); + for (int i = 0; i < AI.getArgIdx() + MaxRawOperands; ++i) { + Operands[i] = CreateRawOperand(II::ArgInfo(II::RAW | (AI.Info + i))); + } + }; + + auto GetOwords = [&](II::ArgInfo AI) { + // constant byte for log2 number of owords + Value *Arg = CI; + if (!AI.isRet()) + Arg = CI->getOperand(AI.getArgIdx()); + VectorType *VT = dyn_cast(Arg->getType()); + if (!VT) + report_fatal_error("Invalid number of owords"); + int DataSize = VT->getNumElements() * + DL.getTypeSizeInBits(VT->getElementType()) / 8; + DataSize = genx::exactLog2(DataSize) - 4; + if (DataSize < 0 || DataSize > 4) + report_fatal_error("Invalid number of words"); + return static_cast(DataSize); + }; + + auto GetExecSize = [&](II::ArgInfo AI, VISA_EMask_Ctrl *Mask) { + int ExecSize = GenXIntrinsicInfo::getOverridedExecSize(CI, Subtarget); + if (ExecSize == 0) { + if (VectorType *VT = dyn_cast(CI->getType())) { + ExecSize = VT->getNumElements(); + } else { + ExecSize = 1; + } + } + bool IsNoMask = AI.getCategory() == II::EXECSIZE_NOMASK; + *Mask = getExecMaskFromWrRegion(DstDesc, IsNoMask); + return getExecSizeFromValue(ExecSize); + }; + + auto GetExecSizeFromArg = [&](II::ArgInfo AI, + VISA_EMask_Ctrl *ExecMask) { + // exec_size inferred from width of predicate arg, defaulting to 16 if + // it is scalar i1 (as can happen in raw send). Also get M3 etc flag + // if the predicate has a baled in rdpredregion, and mark as nomask if + // the predicate is not EM. + int ExecSize; + *ExecMask = NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1; + // Get the predicate (mask) operand, scanning through baled in + // all/any/not/rdpredregion and setting PredField and MaskCtrl + // appropriately. + VISA_PREDICATE_CONTROL Control; + VISA_PREDICATE_STATE State; + Value *Mask = + getPredicateOperand(CI, AI.getArgIdx(), BI, Control, State, ExecMask); + if (isa(Mask) || RegAlloc->getRegForValueOrNull(KernFunc, Mask)) + *ExecMask |= NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1; + if (auto VT = + dyn_cast(CI->getOperand(AI.getArgIdx())->getType())) + ExecSize = VT->getNumElements(); + else + ExecSize = GenXIntrinsicInfo::getOverridedExecSize(CI, Subtarget); + return getExecSizeFromValue(ExecSize); + }; + + auto GetExecSizeFromByte = [&](II::ArgInfo AI, VISA_EMask_Ctrl *Mask) { + ConstantInt *Const = + dyn_cast(CI->getArgOperand(AI.getArgIdx())); + if (!Const) + report_fatal_error("Incorrect args to intrinsic call"); + unsigned Byte = Const->getSExtValue() & 15; + *Mask = (VISA_EMask_Ctrl)(Byte >> 4); + unsigned Res = Byte & 0xF; + assert(Res <= 5 && + "illegal common ISA execsize (should be 1, 2, 4, 8, 16, 32)."); + return (VISA_Exec_Size)Res; + }; + + auto CreateImplicitPredication = [&](II::ArgInfo AI) { + return createPredFromWrRegion(DstDesc); + }; + + auto CreatePredication = [&](II::ArgInfo AI) { + return createPred(CI, BI, AI.getArgIdx()); + }; + + auto GetPredicateVar = [&](II::ArgInfo AI) { + if (AI.isRet()) + return getPredicateVar(CI); + else + return getPredicateVar(CI->getArgOperand(AI.getArgIdx())); + }; + + auto GetZeroedPredicateVar = [&](II::ArgInfo AI) { + if (AI.isRet()) + return getZeroedPredicateVar(CI); + else + return getZeroedPredicateVar(CI->getArgOperand(AI.getArgIdx())); + }; + + auto CreateNullRawOperand = [&](II::ArgInfo AI) { + VISA_RawOpnd *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISANullRawOperand(ResultOperand, false)); + return ResultOperand; + }; + + auto ProcessTwoAddr = [&](II::ArgInfo AI) { + if (AI.getCategory() != II::TWOADDR) + return; + auto Reg = RegAlloc->getRegForValueOrNull(KernFunc, CI, DONTCARESIGNED); + if (isa(CI->getArgOperand(AI.getArgIdx())) && Reg && + isInLoop(CI->getParent())) + addLifetimeStartInst(CI); + }; + + // Constant vector of i1 (or just scalar i1) as i32 (used in setp) + auto ConstVi1Asi32 = [&](II::ArgInfo AI) { + VISA_VectorOpnd *ResultOperand = nullptr; + auto C = cast(CI->getArgOperand(AI.getArgIdx())); + // Get the bit value of the vXi1 constant. 
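+    // Each i1 element presumably contributes one bit of the packed value,
+    // which is then emitted as an unsigned 32-bit (UD) immediate.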
+ unsigned IntVal = getPredicateConstantAsInt(C); + // unsigned i32 constant source operand + CISA_CALL(Kernel->CreateVISAImmediate(ResultOperand, &IntVal, ISA_TYPE_UD)); + return ResultOperand; + }; + + auto CreateAddressOperand = [&](II::ArgInfo AI) { + if (AI.isRet()) + return createAddressOperand(CI, true); + else + return createAddressOperand(CI->getArgOperand(AI.getArgIdx()), false); + }; + + auto GetArgCount = [&](II::ArgInfo AI) { + auto BaseArg = AI.getArgIdx(); + MaxRawOperands = BaseArg; + + for (unsigned Idx = BaseArg; Idx < CI->getNumArgOperands(); ++Idx) { + if (auto CA = dyn_cast(CI->getArgOperand(Idx))) { + if (CA->isNullValue()) + continue; + } + MaxRawOperands = Idx + 1; + } + + if (MaxRawOperands < BaseArg + AI.getArgCountMin()) + MaxRawOperands = BaseArg + AI.getArgCountMin(); + + return MaxRawOperands - AI.getArgIdx(); + }; + + auto GetNumGrfs = [&](II::ArgInfo AI) { + // constant byte for number of GRFs + Value *Arg = CI; + if (!AI.isRet()) + Arg = CI->getOperand(AI.getArgIdx()); + VectorType *VT = dyn_cast(Arg->getType()); + if (!VT) + report_fatal_error("Invalid number of GRFs"); + int DataSize = VT->getNumElements() * + VT->getElementType()->getPrimitiveSizeInBits() / 8; + return (uint8_t)((DataSize + (GrfByteSize - 1)) / GrfByteSize); + }; + + auto GetSampleChMask = [&](II::ArgInfo AI) { + ConstantInt *Const = + dyn_cast(CI->getArgOperand(AI.getArgIdx())); + if (!Const) + report_fatal_error("Incorrect args to intrinsic call"); + unsigned Byte = Const->getSExtValue() & 15; + // Find the U_offset arg. It is the first vector arg after this one. + VectorType *VT; + for (unsigned Idx = AI.getArgIdx() + 1; + !(VT = dyn_cast(CI->getOperand(Idx)->getType())); ++Idx) + ; + unsigned Width = VT->getNumElements(); + if (Width != 8 && Width != 16) + report_fatal_error("Invalid execution size for load/sample"); + Byte |= Width & 16; + return Byte; + }; + + auto GetSvmGatherBlockSize = [&](II::ArgInfo AI) { + // svm gather/scatter "block size" field, set to reflect the element + // type of the data + Value *V = CI; + if (!AI.isRet()) + V = CI->getArgOperand(AI.getArgIdx()); + unsigned ElBytes = + V->getType()->getScalarType()->getPrimitiveSizeInBits() / 8; + switch (ElBytes) { + // For N = 2 byte data type, use block size 1 and block count 2. + // Otherwise, use block size N and block count 1. 
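+    // The encoded field enumerates the supported block sizes: 1 byte (also
+    // used for 2-byte data, with block count 2) -> 0, 4 bytes -> 1,
+    // 8 bytes -> 2.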
+ case 2: + case 1: + ElBytes = 0; + break; + case 4: + ElBytes = 1; + break; + case 8: + ElBytes = 2; + break; + default: + report_fatal_error("Bad element type for SVM scatter/gather"); + } + return ElBytes; + }; + + auto CreateOpndPredefinedSrc = [&](PreDefined_Vars RegId, unsigned ROffset, + unsigned COffset, unsigned VStride, + unsigned Width, unsigned HStride) { + VISA_GenVar *Decl = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Decl, RegId)); + VISA_VectorOpnd *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISASrcOperand(ResultOperand, Decl, + (VISA_Modifier)Mod, VStride, Width, + HStride, ROffset, COffset)); + return ResultOperand; + }; + + auto CreateOpndPredefinedDst = [&](PreDefined_Vars RegId, unsigned ROffset, + unsigned COffset, unsigned HStride) { + VISA_GenVar *Decl = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Decl, RegId)); + VISA_VectorOpnd *ResultOperand = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(ResultOperand, Decl, HStride, + ROffset, COffset)); + return ResultOperand; + }; + + auto CreateImmOpndFromUInt = [&](VISA_Type ImmType, unsigned Val) { + VISA_VectorOpnd *src = nullptr; + CISA_CALL(Kernel->CreateVISAImmediate(src, &Val, ImmType)); + + return src; + }; + + + VISA_EMask_Ctrl exec_mask; + addDebugInfo(); +#include "GenXIntrinsicsBuildMap.inc" +} + +/************************************************************************************************** + * buildControlRegUpdate : generate an instruction to apply a mask to + * the control register (V14). + * + * Enter: Mask = the mask to apply + * Clear = false if bits set in Mask should be set in V14, + * true if bits set in Mask should be cleared in V14. + */ +void GenXKernelBuilder::buildControlRegUpdate(unsigned Mask, bool Clear) { + ISA_Opcode Opcode; + // write opcode + if (Clear) { + Opcode = ISA_AND; + Mask = ~Mask; + } else + Opcode = ISA_OR; + + Region Single = Region(1, 4); + + VISA_GenVar *Decl = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Decl, PREDEFINED_CR0)); + VISA_VectorOpnd *dst = + createRegionOperand(&Single, Decl, DONTCARESIGNED, 0, true); + VISA_VectorOpnd *src0 = + createRegionOperand(&Single, Decl, DONTCARESIGNED, 0, false); + + VISA_VectorOpnd *src1 = nullptr; + CISA_CALL(Kernel->CreateVISAImmediate(src1, &Mask, ISA_TYPE_UD)); + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISALogicOrShiftInst(Opcode, nullptr, false, + vISA_EMASK_M1, EXEC_SIZE_1, dst, + src0, src1, nullptr, nullptr)); +} + +/*********************************************************************** + * buildBranch : build a conditional or unconditional branch + * + * Return: true if fell through to successor + */ +bool GenXKernelBuilder::buildBranch(BranchInst *Branch) { + BasicBlock *Next = Branch->getParent()->getNextNode(); + if (Branch->isUnconditional()) { + // Unconditional branch + if (Branch->getOperand(0) == Next) + return true; // fall through to successor + auto labelId = getOrCreateLabel(Branch->getSuccessor(0), LABEL_BLOCK); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISACFJmpInst(nullptr, Labels[labelId])); + return false; + } + // Conditional branch. + // First check if it is a baled in goto/join, via an extractvalue. 
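+  // If the condition is an extractvalue of a simdcf goto/join that has been
+  // baled into this branch, the branch is emitted as part of that goto/join
+  // and the block is treated as falling through.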
+ auto BI = Baling->getBaleInfo(Branch); + if (BI.isOperandBaled(0 /*condition*/)) { + if (auto Extract = dyn_cast(Branch->getCondition())) { + auto GotoJoin = cast(Extract->getAggregateOperand()); + if (GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_goto) { + buildGoto(GotoJoin, Branch); + } else { + assert(GotoJoin::isValidJoin(GotoJoin) && + "extra unexpected code in join block"); + buildJoin(GotoJoin, Branch); + } + return true; + } + } + // Normal conditional branch. + VISA_EMask_Ctrl MaskCtrl; + VISA_PREDICATE_CONTROL Control = PRED_CTRL_NON; + VISA_PREDICATE_STATE State = PredState_NO_INVERSE; + Value *Pred = getPredicateOperand(Branch, 0, BI, Control, State, &MaskCtrl); + assert(!isa(Branch->getCondition()->getType()) && + "branch must have scalar condition"); + BasicBlock *True = Branch->getSuccessor(0); + BasicBlock *False = Branch->getSuccessor(1); + if (True == Next) { + State ^= PredState_INVERSE; // invert bit in predicate field + True = False; + False = Next; + } + // Write the conditional branch. + VISA_PredVar *PredVar = getPredicateVar(Pred); + VISA_PredOpnd* PredOperand = createPredOperand(PredVar, State, Control); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISACFJmpInst( + PredOperand, Labels[getOrCreateLabel(True, LABEL_BLOCK)])); + // If the other successor is not the next block, write an unconditional + // jmp to that. + if (False == Next) + return true; // fall through to successor + addDebugInfo(); + CISA_CALL(Kernel->AppendVISACFJmpInst( + nullptr, Labels[getOrCreateLabel(False, LABEL_BLOCK)])); + return false; +} + +/*********************************************************************** + * buildJoin : build a join + * + * Enter: Join = join instruction that is baled into an extractvalue of + * field 1 (the !any(EM) value), that is baled into Branch, + * if Branch is non-zero + * Branch = branch instruction, or 0 for a join that is not baled + * in to a branch because it always ends up with at least + * one channel enabled + */ +void GenXKernelBuilder::buildJoin(CallInst *Join, BranchInst *Branch) { + // A join needs a label. (If the join is at the start of its block, then + // this gets merged into the block label.) + addLabelInst(Join); + // There is no join instruction in vISA -- the finalizer derives it by + // looking for gotos targeting the basic block's label. 
+} + +#if (LLVM_VERSION_MAJOR > 8) +/*********************************************************************** + * buildUnaryOperator : build code for an unary operator + * + * Enter: UO = the UnaryOperator + * BI = BaleInfo for UO + * Mod = modifier bits for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion + */ +void GenXKernelBuilder::buildUnaryOperator(UnaryOperator *UO, BaleInfo BI, + unsigned Mod, + const DstOpndDesc &DstDesc) { + ISA_Opcode Opcode = ISA_RESERVED_0; + Signedness DstSigned = SIGNED; + Signedness SrcSigned = SIGNED; + unsigned Mod1 = 0; + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + VectorType *VT = dyn_cast(UO->getType()); + if (VT != nullptr) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + + switch (UO->getOpcode()) { + case Instruction::FNeg: + Opcode = ISA_MOV; + Mod1 ^= MODIFIER_NEG; + break; + default: + report_fatal_error("buildUnaryOperator: unimplemented unary operator"); + } + + VISA_VectorOpnd *Dst = createDestination(UO, DstSigned, Mod, DstDesc); + + VISA_VectorOpnd *Src0 = nullptr; + VISA_PredOpnd *Pred = createPredFromWrRegion(DstDesc); + + Src0 = createSourceOperand(UO, SrcSigned, 0, BI, Mod1); + + auto ExecMask = getExecMaskFromWrRegion(DstDesc); + + addDebugInfo(); + + if (Opcode == ISA_MOV) { + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, Pred, Mod1 & MODIFIER_SAT, ExecMask, ExecSize, Dst, Src0, NULL)); + return; + } + report_fatal_error("buildUnaryOperator: unimplemented opcode"); +} +#endif + +/*********************************************************************** + * buildBinaryOperator : build code for a binary operator + * + * Enter: BO = the BinaryOperator + * BI = BaleInfo for BO + * Mod = modifier bits for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion + */ +void GenXKernelBuilder::buildBinaryOperator(BinaryOperator *BO, BaleInfo BI, + unsigned Mod, + const DstOpndDesc &DstDesc) { + bool IsLogic = false; + ISA_Opcode Opcode = ISA_RESERVED_0; + Signedness DstSigned = SIGNED; + Signedness SrcSigned = SIGNED; + unsigned Mod1 = 0; + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + if (VectorType *VT = dyn_cast(BO->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + switch (BO->getOpcode()) { + case Instruction::Add: + case Instruction::FAdd: + Opcode = ISA_ADD; + break; + case Instruction::Sub: + case Instruction::FSub: + Opcode = ISA_ADD; + Mod1 ^= MODIFIER_NEG; + break; + case Instruction::Mul: + case Instruction::FMul: + Opcode = ISA_MUL; + break; + case Instruction::Shl: + Opcode = ISA_SHL; + IsLogic = true; + break; + case Instruction::AShr: + Opcode = ISA_ASR; + IsLogic = true; + break; + case Instruction::LShr: + Opcode = ISA_SHR; + DstSigned = SrcSigned = UNSIGNED; + IsLogic = true; + break; + case Instruction::UDiv: + Opcode = ISA_DIV; + DstSigned = SrcSigned = UNSIGNED; + break; + case Instruction::SDiv: + Opcode = ISA_DIV; + break; + case Instruction::FDiv: { + Opcode = ISA_DIV; + if (Constant *Op0 = dyn_cast(BO->getOperand(0))) { + if (Op0->getType()->isVectorTy()) + Op0 = Op0->getSplatValue(); + ConstantFP *CFP = dyn_cast_or_null(Op0); + if (CFP && CFP->isExactlyValue(1.0)) + Opcode = ISA_INV; + } + } break; + case Instruction::URem: + Opcode = ISA_MOD; + DstSigned = SrcSigned = UNSIGNED; + break; + case Instruction::SRem: + case Instruction::FRem: + Opcode = ISA_MOD; + break; + case Instruction::And: + Opcode = ISA_AND; + IsLogic = true; + break; + case Instruction::Or: + Opcode = ISA_OR; + IsLogic = true; + 
break; + case Instruction::Xor: + Opcode = ISA_XOR; + IsLogic = true; + break; + default: + report_fatal_error("buildBinaryOperator: unimplemented binary operator"); + break; + } + VISA_VectorOpnd *Dst = createDestination(BO, DstSigned, Mod, DstDesc); + + VISA_VectorOpnd *Src0 = nullptr; + VISA_VectorOpnd *Src1 = nullptr; + VISA_PredOpnd *Pred = createPredFromWrRegion(DstDesc); + + if (Opcode == ISA_INV) { + Src0 = createSourceOperand(BO, SrcSigned, 1, BI, Mod1); // source 0 + } else { + Src0 = createSourceOperand(BO, SrcSigned, 0, BI); // source 0 + Src1 = createSourceOperand(BO, SrcSigned, 1, BI, Mod1); // source 1 + } + + auto ExecMask = getExecMaskFromWrRegion(DstDesc); + + addDebugInfo(); + if (IsLogic) { + CISA_CALL(Kernel->AppendVISALogicOrShiftInst( + Opcode, Pred, Mod, ExecMask, ExecSize, Dst, Src0, Src1, NULL, NULL)); + } else { + if (Opcode == ISA_ADDC || Opcode == ISA_SUBB) { + CISA_CALL(Kernel->AppendVISAArithmeticInst( + Opcode, Pred, ExecMask, ExecSize, Dst, Src0, Src1, NULL)); + } else { + CISA_CALL(Kernel->AppendVISAArithmeticInst( + Opcode, Pred, Mod, ExecMask, ExecSize, Dst, Src0, Src1, NULL)); + } + } +} + +/*********************************************************************** + * buildBoolBinaryOperator : build code for a binary operator acting on + * i1 or vector of i1 + * + * Enter: BO = the BinaryOperator + */ +void GenXKernelBuilder::buildBoolBinaryOperator(BinaryOperator *BO) { + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + if (VectorType *VT = dyn_cast(BO->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + ISA_Opcode Opcode = ISA_RESERVED_0; + switch (BO->getOpcode()) { + case Instruction::And: + Opcode = ISA_AND; + break; + case Instruction::Or: + Opcode = ISA_OR; + break; + case Instruction::Xor: + Opcode = ISA_XOR; + if (isNot(BO)) + Opcode = ISA_NOT; + break; + default: + report_fatal_error( + "buildBoolBinaryOperator: unimplemented binary operator"); + break; + } + + VISA_PredVar *Dst = getPredicateVar(BO); + VISA_PredVar *Src0 = getPredicateVar(BO->getOperand(0)); + VISA_PredVar *Src1 = + Opcode != ISA_NOT ? getPredicateVar(BO->getOperand(1)) : nullptr; + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISALogicOrShiftInst( + Opcode, NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1, ExecSize, Dst, Src0, + Src1)); +} + +void GenXKernelBuilder::buildSymbolInst(PtrToIntInst *ptr2Int, unsigned Mod, + const DstOpndDesc &DstDesc) { + auto GV = cast(ptr2Int->getOperand(0)); + VISA_VectorOpnd *Dst = createDestination(ptr2Int, UNSIGNED, Mod, DstDesc); + CISA_CALL(Kernel->AppendVISACFSymbolInst(GV->getName().str(), Dst)); +} + +/*********************************************************************** + * buildCastInst : build code for a cast (other than a bitcast) + * + * Enter: CI = the CastInst + * BI = BaleInfo for CI + * Mod = modifier bits for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion + */ +void GenXKernelBuilder::buildCastInst(CastInst *CI, BaleInfo BI, unsigned Mod, + const DstOpndDesc &DstDesc) { + Signedness InSigned = DONTCARESIGNED; + Signedness OutSigned = DONTCARESIGNED; + switch (CI->getOpcode()) { + case Instruction::UIToFP: + InSigned = UNSIGNED; + break; + case Instruction::SIToFP: + InSigned = SIGNED; + break; + case Instruction::FPToUI: + OutSigned = UNSIGNED; + break; + case Instruction::FPToSI: + OutSigned = SIGNED; + break; + case Instruction::ZExt: + InSigned = UNSIGNED; + break; + case Instruction::SExt: + InSigned = SIGNED; + break; + case Instruction::FPTrunc: + case Instruction::FPExt: + break; + case Instruction::PtrToInt: + case Instruction::IntToPtr: + break; + case Instruction::AddrSpaceCast: + break; + default: + report_fatal_error("buildCastInst: unimplemented cast"); + break; + } + + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + if (VectorType *VT = dyn_cast(CI->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + + auto ExecMask = getExecMaskFromWrRegion(DstDesc); + + VISA_PredOpnd *Pred = createPredFromWrRegion(DstDesc); + // Give dest and source the same signedness for byte mov. 
+ VISA_VectorOpnd *Dst = createDestination(CI, OutSigned, Mod, DstDesc); + + if (InSigned == DONTCARESIGNED) + InSigned = OutSigned; + VISA_VectorOpnd *Src0 = createSourceOperand(CI, InSigned, 0, BI); + + addDebugInfo(); + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, Pred, Mod & MODIFIER_SAT, ExecMask, ExecSize, Dst, Src0, NULL)); +} + +/*********************************************************************** + * buildCmp : build code for a compare + * + * Enter: Cmp = the compare instruction + * BI = BaleInfo for Cmp + * WrRegion = 0 else wrpredregion, wrpredpredregion, or wrregion for + * destination + */ +void GenXKernelBuilder::buildCmp(CmpInst *Cmp, BaleInfo BI, + const DstOpndDesc &DstDesc) { + assert((!DstDesc.WrRegion || Cmp->getType()->getPrimitiveSizeInBits() != 4 || + Cmp->getOperand(0) + ->getType() + ->getScalarType() + ->getPrimitiveSizeInBits() == 64) && + "write predicate size 4 only allowed for double/longlong type"); + Signedness Signed = DONTCARESIGNED; + VISA_Cond_Mod opSpec; + switch (Cmp->getPredicate()) { + case CmpInst::FCMP_OEQ: + case CmpInst::FCMP_UEQ: + case CmpInst::ICMP_EQ: + opSpec = ISA_CMP_E; + break; + case CmpInst::FCMP_ONE: + case CmpInst::FCMP_UNE: + case CmpInst::ICMP_NE: + opSpec = ISA_CMP_NE; + break; + case CmpInst::FCMP_OGT: + case CmpInst::FCMP_UGT: + opSpec = ISA_CMP_G; + break; + case CmpInst::ICMP_UGT: + opSpec = ISA_CMP_G; + Signed = UNSIGNED; + break; + case CmpInst::ICMP_SGT: + opSpec = ISA_CMP_G; + Signed = SIGNED; + break; + case CmpInst::FCMP_OGE: + case CmpInst::FCMP_UGE: + opSpec = ISA_CMP_GE; + break; + case CmpInst::ICMP_UGE: + opSpec = ISA_CMP_GE; + Signed = UNSIGNED; + break; + case CmpInst::ICMP_SGE: + opSpec = ISA_CMP_GE; + Signed = SIGNED; + break; + case CmpInst::FCMP_OLT: + case CmpInst::FCMP_ULT: + opSpec = ISA_CMP_L; + break; + case CmpInst::ICMP_ULT: + opSpec = ISA_CMP_L; + Signed = UNSIGNED; + break; + case CmpInst::ICMP_SLT: + opSpec = ISA_CMP_L; + Signed = SIGNED; + break; + case CmpInst::FCMP_OLE: + case CmpInst::FCMP_ULE: + opSpec = ISA_CMP_LE; + break; + case CmpInst::ICMP_ULE: + opSpec = ISA_CMP_LE; + Signed = UNSIGNED; + break; + case CmpInst::ICMP_SLE: + opSpec = ISA_CMP_LE; + Signed = SIGNED; + break; + default: + report_fatal_error("unknown predicate"); + opSpec = ISA_CMP_E; + break; + } + + // Check if this is to write to a predicate desination or a GRF desination. + bool WriteToPred = true; + if (Cmp->hasOneUse()) { + Instruction *UI = Cmp->user_back(); + BaleInfo UserBI = Baling->getBaleInfo(UI); + if (UserBI.Type == BaleInfo::CMPDST) + WriteToPred = false; + } + + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + VISA_EMask_Ctrl ctrlMask = vISA_EMASK_M1; + if (VectorType *VT = dyn_cast(Cmp->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + + VISA_VectorOpnd *Dst = nullptr; + genx::Signedness SignedSrc0; + VISA_VectorOpnd *Src0 = + createSourceOperand(Cmp, Signed, 0, BI, 0, &SignedSrc0); + VISA_VectorOpnd *Src1 = createSourceOperand(Cmp, SignedSrc0, 1, BI); + + if (WriteToPred) { + ctrlMask = getExecMaskFromWrPredRegion(DstDesc.WrRegion, false); + VISA_PredVar *PredVar = + getPredicateVar(DstDesc.WrRegion ? DstDesc.WrRegion : Cmp); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISAComparisonInst(opSpec, ctrlMask, ExecSize, + PredVar, Src0, Src1)); + } else { + ctrlMask = getExecMaskFromWrRegion(DstDesc); + Value *Val = DstDesc.WrRegion ? 
DstDesc.WrRegion : Cmp->user_back(); + Dst = createDestination(Val, Signed, 0, DstDesc); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISAComparisonInst(opSpec, ctrlMask, ExecSize, Dst, + Src0, Src1)); + } +} + +/*********************************************************************** + * buildConvertAddr : build code for conversion to address + * + * Enter: CI = the CallInst + * BI = BaleInfo for CI + * Mod = modifier bits for destination + * WrRegion = 0 else wrregion for destination + * WrRegionBI = BaleInfo for WrRegion + */ +void GenXKernelBuilder::buildConvertAddr(CallInst *CI, genx::BaleInfo BI, + unsigned Mod, + const DstOpndDesc &DstDesc) { + assert(!DstDesc.WrRegion); + Value *Base = Liveness->getAddressBase(CI); + VISA_Exec_Size ExecSize = EXEC_SIZE_1; + VISA_EMask_Ctrl MaskCtrl = NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1; + + if (VectorType *VT = dyn_cast(CI->getType())) + ExecSize = getExecSizeFromValue(VT->getNumElements()); + // If the offset is less aligned than the base register element type, then + // we need a different type. + Type *OverrideTy = nullptr; + Type *BaseTy = Base->getType(); + if (BaseTy->isPointerTy()) + BaseTy = BaseTy->getPointerElementType(); + unsigned ElementBytes = + BaseTy->getScalarType()->getPrimitiveSizeInBits() >> 3; + int Offset = cast(CI->getArgOperand(1))->getSExtValue(); + if ((ElementBytes - 1) & Offset) { + OverrideTy = VectorType::get(Type::getInt8Ty(CI->getContext()), + BaseTy->getVectorNumElements() * ElementBytes); + ElementBytes = 1; + } + Register *BaseReg = + RegAlloc->getRegForValue(KernFunc, Base, DONTCARESIGNED, OverrideTy); + + VISA_VectorOpnd *Dst = createAddressOperand(CI, true); + VISA_VectorOpnd *Src1 = nullptr; + + if (BaseReg->Category == RegCategory::SURFACE || + BaseReg->Category == RegCategory::SAMPLER) { + uint8_t offset = Offset >> 2; + switch (BaseReg->Category) { + case RegCategory::SURFACE: { + VISA_SurfaceVar *Decl = BaseReg->GetVar(Kernel); + unsigned int offsetB = offset * 2; // 2 is bytes size of UW + CISA_CALL(Kernel->CreateVISAAddressOfOperand(Src1, Decl, offsetB)); + break; + } + case RegCategory::SAMPLER: { + VISA_SurfaceVar *Decl = BaseReg->GetVar(Kernel); + unsigned int offsetB = offset * 2; // 2 is bytes size of UW + CISA_CALL(Kernel->CreateVISAAddressOfOperand(Src1, Decl, offsetB)); + break; + } + default: + report_fatal_error("Invalid state operand class: only surface, vme, and " + "sampler are supported."); + break; + } + } else { + uint8_t rowOffset = Offset >> genx::log2(GrfByteSize); + uint8_t colOffset = (Offset & (GrfByteSize - 1)) >> Log2_32(ElementBytes); + VISA_GenVar *Decl = BaseReg->GetVar(Kernel); + auto TypeSize = BaseReg->Ty->getScalarType()->getPrimitiveSizeInBits() >> 3; + unsigned int offset = colOffset * TypeSize + rowOffset * GrfByteSize; + CISA_CALL(Kernel->CreateVISAAddressOfOperand(Src1, Decl, offset)); + } + VISA_VectorOpnd *Src2 = createSourceOperand(CI, UNSIGNED, 0, BI); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISAAddrAddInst(MaskCtrl, ExecSize, Dst, Src1, Src2)); +} + +/*********************************************************************** + * buildAlloca : build code for allocating in thread-private memory + * + * Enter: CI = the CallInst + * + */ +void GenXKernelBuilder::buildAlloca(CallInst *CI, unsigned IntrinID, + unsigned Mod, const DstOpndDesc &DstDesc) { + VISA_GenVar *Sp = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Sp, PreDefined_Vars::PREDEFINED_FE_SP)); + + VISA_VectorOpnd *SpSrc = nullptr; + CISA_CALL( + Kernel->CreateVISASrcOperand(SpSrc, Sp, MODIFIER_NONE, 
                                               0, 1, 0, 0, 0));
+
+  Value *AllocaOff = CI->getOperand(0);
+  Type *AllocaOffTy = AllocaOff->getType();
+  unsigned OffVal = getResultedTypeSize(AllocaOffTy);
+
+  VISA_VectorOpnd *Imm = nullptr;
+  CISA_CALL(Kernel->CreateVISAImmediate(Imm, &OffVal, ISA_TYPE_D));
+
+  if (IntrinID == llvm::GenXIntrinsic::genx_alloca) {
+    VISA_VectorOpnd *Src = nullptr;
+    CISA_CALL(Kernel->CreateVISASrcOperand(Src, static_cast<VISA_GenVar *>(Sp),
+                                           MODIFIER_NONE, 0, 1, 0, 0, 0));
+    VISA_VectorOpnd *Dst = createDestination(CI, DONTCARESIGNED, Mod, DstDesc);
+    CISA_CALL(Kernel->AppendVISADataMovementInst(
+        ISA_MOV, nullptr, false, vISA_EMASK_M1, EXEC_SIZE_1, Dst, Src));
+  }
+
+  VISA_VectorOpnd *DstSp = nullptr;
+  CISA_CALL(Kernel->CreateVISADstOperand(DstSp, static_cast<VISA_GenVar *>(Sp),
+                                         1, 0, 0));
+
+  CISA_CALL(Kernel->AppendVISAArithmeticInst(
+      ISA_ADD, nullptr, false, vISA_EMASK_M1, EXEC_SIZE_1, DstSp, SpSrc, Imm));
+}
+
+// extracts the underlying C string from the provided constant
+static StringRef extractCStr(const Constant &CStrConst) {
+  if (isa<ConstantDataArray>(CStrConst))
+    return cast<ConstantDataArray>(CStrConst).getAsCString();
+  assert(isa<ConstantAggregateZero>(CStrConst));
+  return "";
+}
+
+/***********************************************************************
+ * buildPrintIndex : build code for storing a constant format string as
+ *                   metadata and returning the index of that string
+ *
+ * Enter:   CI = the CallInst
+ *
+ */
+void GenXKernelBuilder::buildPrintIndex(CallInst *CI, unsigned IntrinID,
+                                        unsigned Mod,
+                                        const DstOpndDesc &DstDesc) {
+  // create move with constant
+  VISA_VectorOpnd *Imm = nullptr;
+  Module* M = CI->getModule();
+  NamedMDNode *NMD = M->getOrInsertNamedMetadata("cm_print_strings");
+  unsigned NumOp = NMD->getNumOperands();
+  CISA_CALL(Kernel->CreateVISAImmediate(Imm, &NumOp, ISA_TYPE_UD));
+  VISA_VectorOpnd *Dst = createDestination(CI, DONTCARESIGNED, Mod, DstDesc);
+  CISA_CALL(Kernel->AppendVISADataMovementInst(
+      ISA_MOV, nullptr, false, vISA_EMASK_M1_NM,
+      EXEC_SIZE_1, Dst, Imm));
+
+  // access string
+  LLVMContext& Context = CI->getContext();
+  ImmutableCallSite CallSite(CI);
+  const Value *Val = CallSite.getArgument(0);
+  const Instruction *Gep = cast<Instruction>(Val);
+  Val = Gep->getOperand(0);
+  StringRef UnderlyingCStr =
+      extractCStr(*cast<GlobalVariable>(Val)->getInitializer());
+
+  // store metadata
+  MDNode* N = MDNode::get(Context, MDString::get(Context, UnderlyingCStr));
+  NMD->addOperand(N);
+}
+
+void GenXKernelBuilder::deduceRegion(Region *R, bool IsDest,
+                                     unsigned MaxWidth) {
+  assert(Subtarget);
+  if (!IsDest && !R->is2D() && R->Indirect &&
+      Subtarget->hasIndirectGRFCrossing()) {
+    // For a source 1D indirect region that might possibly cross a GRF
+    // (because we are on SKL+ so a single GRF crossing is allowed), make it
+    // Nx1 instead of 1xN to avoid crossing a GRF within a row.
+    R->VStride = R->Stride;
+    R->Width = 1;
+    R->Stride = 0;
+  }
+  // another case of converting to region format
+  if (!IsDest &&
+      (R->VStride == (int)R->Width * R->Stride || R->Width == R->NumElements)) {
+    R->Width = 1;
+    R->VStride = R->Stride;
+    R->Stride = 0;
+  } else if (R->Width > MaxWidth) {
+    // A Width of more than 16 (or whatever MaxWidth is) is not allowed. If it
+    // is more than 16, then legalization has ensured that either there is one
+    // row or the rows are contiguous (VStride == Width * Stride) and we can
+    // increase the number of rows. (Note that Width and VStride are ignored
+    // in a destination operand; legalization ensures that there is only one
+    // row.)
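+    // Worked example (illustrative, assuming MaxWidth == 16): a contiguous
+    // 32-element row (Width 32, Stride 1) is re-expressed as two rows of 16,
+    // i.e. Width = 16 and VStride = 16 * Stride, which is exactly what the
+    // two assignments below compute.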
+ R->Width = MaxWidth; + R->VStride = R->Width * R->Stride; + } + + if (R->Width == R->NumElements) { + // Use VStride 0 on a 1D region. This is necessary for src0 in line or + // pln, so we may as well do it for everything. + R->VStride = 0; + } + + if (R->Indirect) { + R->IndirectAddrOffset = 0; + if (GenXIntrinsic::isRdRegion(R->Indirect)) { + auto AddrRdR = cast(R->Indirect); + Region AddrR(AddrRdR, BaleInfo()); + assert(!AddrR.Indirect && + "cannot have address rdregion that is indirect"); + R->IndirectAddrOffset = + AddrR.Offset / 2; // address element is always 2 byte + } + } +} + +VISA_VectorOpnd * +GenXKernelBuilder::createGeneralOperand(Region *R, VISA_GenVar *Decl, + Signedness Signed, unsigned Mod, + bool IsDest, unsigned MaxWidth) { + VISA_VectorOpnd *ResultOperand = nullptr; + // Write the vISA general operand, canonicalizing the + // region parameters where applicable. + assert(Decl && "no register allocated for this value"); + if (!IsDest) { + ResultOperand = createCisaSrcOperand( + Decl, static_cast(Mod), R->VStride, R->Width, R->Stride, + R->Offset >> genx::log2(GrfByteSize), + (R->Offset & (GrfByteSize - 1)) / R->ElementBytes); + } else { + ResultOperand = createCisaDstOperand( + Decl, R->Stride, R->Offset >> genx::log2(GrfByteSize), + (R->Offset & (GrfByteSize - 1)) / R->ElementBytes); + } + return ResultOperand; +} + +VISA_VectorOpnd *GenXKernelBuilder::createIndirectOperand(Region *R, + Signedness Signed, + unsigned Mod, + bool IsDest, + unsigned MaxWidth) { + VISA_VectorOpnd *ResultOperand = nullptr; + // Check if the indirect operand is a baled in rdregion. + Value *Indirect = R->Indirect; + if (GenXIntrinsic::isRdRegion(Indirect)) { + auto AddrRdR = cast(Indirect); + Indirect = AddrRdR->getOperand(0); + } + // Write the vISA indirect operand. + Register *IdxReg = RegAlloc->getRegForValue(KernFunc, Indirect, DONTCARESIGNED); + assert(IdxReg->Category == RegCategory::ADDRESS); + + bool NotCrossGrf = !(R->Offset & (GrfByteSize - 1)); + if (!NotCrossGrf) { + // Determine the NotCrossGrf bit setting (whether we can guarantee + // that adding an indirect region's constant offset does not cause + // a carry out of bit 4) + // by looking at the partial constant for the index + // before the constant is added on. + // This only works for a scalar index. + if (auto IndirInst = dyn_cast(R->Indirect)) { + auto A = AI.get(IndirInst); + unsigned Mask = (1U << std::min(5U, A.getLogAlign())) - 1; + if (Mask) { + if ((A.getExtraBits() & Mask) + (R->Offset & Mask) <= Mask && + (unsigned)(R->Offset & (GrfByteSize - 1)) <= Mask) { + // The alignment and extrabits are such that adding R->Offset + // cannot cause a carry from bit 4 to bit 5. 
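+          // Numeric example (illustrative, 32-byte GRF assumed): if the index
+          // is known 16-byte aligned (LogAlign 4, ExtraBits 0) and R->Offset
+          // is 8, then Mask == 15, 0 + 8 <= 15 and 8 <= 15, so the addition
+          // cannot carry out of bit 4 and NotCrossGrf can be set.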
+ NotCrossGrf = true; + } + } + } + } + visa::TypeDetails TD(Func->getParent()->getDataLayout(), R->ElementTy, + Signed); + unsigned VStride = R->VStride; + if (isa(R->Indirect->getType())) + // multi indirect (vector index), set vstride + VStride = 0x8000; // field to null + VISA_AddrVar *AddrDecl = IdxReg->GetVar(Kernel); + if (IsDest) { + CISA_CALL(Kernel->CreateVISAIndirectDstOperand( + ResultOperand, AddrDecl, R->IndirectAddrOffset, R->Offset, R->Stride, + (VISA_Type)TD.VisaType)); + } else { + CISA_CALL(Kernel->CreateVISAIndirectSrcOperand( + ResultOperand, AddrDecl, static_cast(Mod), + R->IndirectAddrOffset, R->Offset, VStride, R->Width, R->Stride, + (VISA_Type)TD.VisaType)); + } + return ResultOperand; +} + + +/*********************************************************************** + * createRegionOperand : create a vISA region operand + * + * Enter: R = Region + * RegNum = vISA register number (ignored if region is indirect) + * Signed = whether signed or unsigned required (only used for + * indirect operand) + * Mod = modifiers + * IsDest = true if destination operand + * MaxWidth = maximum width (used to stop TWICEWIDTH operand + * getting a width bigger than the execution size, but + * for other uses defaults to 16) + */ +VISA_VectorOpnd * +GenXKernelBuilder::createRegionOperand(Region *R, VISA_GenVar *Decl, + Signedness Signed, unsigned Mod, + bool IsDest, unsigned MaxWidth) { + deduceRegion(R, IsDest, MaxWidth); + + if (R->Indirect) + return createIndirectOperand(R, Signed, Mod, IsDest, MaxWidth); + else + return createGeneralOperand(R, Decl, Signed, Mod, IsDest, MaxWidth); +} + + +bool GenXKernelBuilder::isInLoop(BasicBlock *BB) { + if (getLoops(BB->getParent())->getLoopFor(BB)) + return true; // inside loop in this function + // Now we need to see if this function is called from inside a loop. + // First check the cache. + auto i = IsInLoopCache.find(BB->getParent()); + if (i != IsInLoopCache.end()) + return i->second; + // Now check all call sites. This recurses as deep as the depth of the call + // graph, which must be acyclic as GenX does not allow recursion. + bool InLoop = false; + for (auto ui = BB->getParent()->use_begin(), ue = BB->getParent()->use_end(); + ui != ue; ++ui) { + auto CI = dyn_cast(ui->getUser()); + if (!CI) + continue; + assert(ui->getOperandNo() == CI->getNumArgOperands()); + if (CI->getFunction() == BB->getParent()) + continue; + if (isInLoop(CI->getParent())) { + InLoop = true; + break; + } + } + IsInLoopCache[BB->getParent()] = InLoop; + return InLoop; +} + +void GenXKernelBuilder::addWriteRegionLifetimeStartInst(Instruction *WrRegion) { + if (!GenXIntrinsic::isWrRegion(WrRegion)) + return; // No lifetime start for wrpredregion. + // See if the wrregion is in a loop. + auto BB = WrRegion->getParent(); + if (!isInLoop(BB)) + return; // not in loop + // See if the wrregion is the first of a sequence in the same basic block + // that together write the whole register. We assume that each region is + // contiguous, and the regions are written in ascending offset order, as + // that is what legalization does if the original write was to the whole + // register. 
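+  // Sketch of the pattern being matched (element counts are illustrative):
+  //   %w0 = wrregion(undef, %v0, ...)  ; writes elements  0..15
+  //   %w1 = wrregion(%w0,   %v1, ...)  ; writes elements 16..31
+  // If such a chain covers all 32 elements of the register, no lifetime.start
+  // is needed; otherwise one is added at the end of this function.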
+ unsigned NumElementsSoFar = 0; + unsigned TotalNumElements = WrRegion->getType()->getVectorNumElements(); + Instruction *ThisWr = WrRegion; + for (;;) { + Region R(ThisWr, BaleInfo()); + if (R.Indirect) + break; + if ((unsigned)R.Offset != NumElementsSoFar * R.ElementBytes) + break; + if (R.Stride != 1 && R.Width != 1) + break; + if (R.Width != R.NumElements) + break; + NumElementsSoFar += R.NumElements; + if (NumElementsSoFar == TotalNumElements) + return; // whole register is written + // Go on to next wrregion in the same basic block if any. + if (!ThisWr->hasOneUse()) + break; + ThisWr = cast(ThisWr->use_begin()->getUser()); + if (!GenXIntrinsic::isWrRegion(ThisWr)) + break; + if (ThisWr->getParent() != BB) + break; + } + // The wrregion is in a loop and is not the first in a sequence in the same + // basic block that writes the whole register. Write a lifetime start. + addLifetimeStartInst(WrRegion); +} + +/************************************************************************************************** + * addLifetimeStartInst : add a lifetime.start instruction + * + * Enter: Inst = value to use in lifetime.start + */ +void GenXKernelBuilder::addLifetimeStartInst(Instruction *Inst) { + VISA_VectorOpnd *opnd = nullptr; + auto Reg = RegAlloc->getRegForValueOrNull(KernFunc, Inst); + if (!Reg) + return; // no register allocated such as being indirected. + + switch (Reg->Category) { + case RegCategory::GENERAL: + opnd = createCisaDstOperand(Reg->GetVar(Kernel), 1, 0, 0); + break; + case RegCategory::ADDRESS: + CISA_CALL(Kernel->CreateVISAAddressDstOperand( + opnd, Reg->GetVar(Kernel), 0)); + break; +#if 0 // Not currently used. + case RegCategory::PREDICATE: + break; +#endif // 0 + default: + report_fatal_error("createLifetimeStartInst: Invalid register category"); + break; + } + addDebugInfo(); + CISA_CALL(Kernel->AppendVISALifetime(LIFETIME_START, opnd)); +} + +/*********************************************************************** + * addDebugInfo : add debug infromation + */ +void GenXKernelBuilder::addDebugInfo() { + // Ensure that the last label does not get merged with the next one now we + // know that there is code in between. + LastLabel = -1; + // Check if we have a pending debug location. + if (PendingLine) { + // Do the source location debug info with vISA FILE and LOC instructions. + if (PendingFilename != "" && (PendingFilename != LastFilename || + PendingDirectory != LastDirectory)) { + SmallString<256> Filename; + // Bodge here to detect Windows absolute path even when built on cygwin. + if (sys::path::is_absolute(PendingFilename) || + (PendingFilename.size() > 2 && PendingFilename[1] == ':')) + Filename = PendingFilename; + else { + Filename = PendingDirectory; + sys::path::append(Filename, PendingFilename); + } + CISA_CALL(Kernel->AppendVISAMiscFileInst(Filename.c_str())); + LastDirectory = PendingDirectory; + LastFilename = PendingFilename; + } + if (PendingLine != LastLine) { + CISA_CALL(Kernel->AppendVISAMiscLOC(PendingLine)); + LastLine = PendingLine; + PendingLine = 0; + } + } +} + +void GenXKernelBuilder::emitOptimizationHints() { + if (skipOptWithLargeBlock(*FG)) + return; + + // Track rp considering byte variable widening. + PressureTracker RP(*FG, Liveness, /*ByteWidening*/ true); + const std::vector &WidenLRs = RP.getWidenVariables(); + + for (auto LR : WidenLRs) { + SimpleValue SV = *LR->value_begin(); + auto *R = RegAlloc->getRegForValueOrNull(FG->getHead(), SV); + // This variable is being used in or crossing a high register pressure + // region. 
Set an optimization hint not to widen it. + if (R && RP.intersectWithRedRegion(LR)) { + R->addAttribute(addStringToPool("NoWidening"), ""); + RP.decreasePressure(LR); + } + } +} + +/*********************************************************************** + * addLabelInst : add a label instruction for a basic block or join + */ +void GenXKernelBuilder::addLabelInst(Value *BB) { + // Skip this for now, because we don't know how to patch labels of branches. + if (0) { // LastLabel >= 0) { + // There has been no code since the last label, so use the same label + // for this basic block. + setLabel(BB, LastLabel); + } else { + // Need a new label. + LastLabel = getOrCreateLabel(BB, LABEL_BLOCK); + CISA_CALL(Kernel->AppendVISACFLabelInst(Labels[LastLabel])); + } +} + +/*********************************************************************** + * getOrCreateLabel : get/create label number for a Function or BasicBlock + */ +unsigned GenXKernelBuilder::getOrCreateLabel(Value *V, int Kind) { + int Num = getLabel(V); + if (Num >= 0) + return Num; + Num = Labels.size(); + setLabel(V, Num); + VISA_LabelOpnd *Decl = nullptr; + + // Replicate the functionality of the old compiler and make the first label + // for a function contain the name (makes sure the function label is unique) + // It's not clear this is strictly necessary any more (but doesn't do any + // harm and may even make reading the intermediate forms easier) + if (Kind == LABEL_SUBROUTINE) { + StringRef N = TheKernelMetadata.getName(); + std::string NameBuf; + if (V != FG->getHead()) { + // This is a subroutine, not the kernel/function at the head of the + // FunctionGroup. Use the name of the subroutine. + N = V->getName(); + } else { + // For a kernel/function name, fix illegal characters. The jitter uses + // the same name for the label in the .asm file, and aubload does not + // like the illegal characters. 
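+      // For example (hypothetical name): a kernel called "my.kernel" would be
+      // given a label such as "my_kernel_BB_0", assuming legalizeName() maps
+      // the '.' to '_'; the exact mapping is whatever legalizeName() does.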
+ NameBuf = legalizeName(N); + N = NameBuf; + } + CISA_CALL(Kernel->CreateVISALabelVar( + Decl, + cutString((Twine(N) + Twine("_BB_") + Twine(Labels.size())).str()) + .c_str(), + VISA_Label_Kind(Kind))); + Labels.push_back(Decl); + } else if (Kind == LABEL_BLOCK) { + CISA_CALL(Kernel->CreateVISALabelVar( + Decl, cutString((Twine("BB_") + Twine(Labels.size())).str()).c_str(), + VISA_Label_Kind(Kind))); + Labels.push_back(Decl); + } else if (Kind == LABEL_FC) { + assert(isa(V)); + auto F = cast(V); + StringRef N = F->getFnAttribute("CMCallable").getValueAsString(); + CISA_CALL(Kernel->CreateVISALabelVar( + Decl, cutString(Twine(N).str()).c_str(), VISA_Label_Kind(Kind))); + Labels.push_back(Decl); + } else { + StringRef N = V->getName(); + CISA_CALL(Kernel->CreateVISALabelVar( + Decl, + cutString( + (Twine("_") + Twine(N) + Twine("_") + Twine(Labels.size())).str()) + .c_str(), + VISA_Label_Kind(Kind))); + Labels.push_back(Decl); + } + return Num; +} + +void GenXKernelBuilder::buildInlineAsm(CallInst *CI) { + assert(CI->isInlineAsm() && "Inline asm expected"); + InlineAsm *IA = dyn_cast(CI->getCalledValue()); + std::string AsmStr(IA->getAsmString()); + std::stringstream &AsmTextStream = CisaBuilder->GetAsmTextStream(); + + // Nothing to substitute if no constraints provided + if (IA->getConstraintString().empty()) { + AsmTextStream << AsmStr << std::endl; + return; + } + + unsigned NumOutputs = genx::getInlineAsmNumOutputs(CI); + auto ConstraintsInfo = genx::getGenXInlineAsmInfo(CI); + + // Scan asm string in reverse direction to match larger numbers first + for (int ArgNo = ConstraintsInfo.size() - 1; ArgNo >= 0; ArgNo--) { + // Regexp to match number of operand + Regex R("\\$+" + llvm::to_string(ArgNo)); + if (!R.match(AsmStr)) + continue; + // Operand that must be substituded into inline assembly string + Value *InlasmOp = nullptr; + std::string InlasmOpAsString; + // For output collect destination descriptor with + // baling info and WrRegion instruction + DstOpndDesc DstDesc; + auto Info = ConstraintsInfo[ArgNo]; + if (Info.isOutput()) { + // If result is a struct than inline assembly + // instruction has multiple outputs + if (isa(CI->getType())) { + // Go through all users of a result and find extractelement with + // ArgNo indice: ArgNo is a number of a constraint in constraint + // list + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) { + auto EV = dyn_cast(ui->getUser()); + if (EV && (EV->getIndices()[0] == ArgNo)) { + InlasmOp = EV; + break; + } + } + } else + // Single output + InlasmOp = CI; + + if (InlasmOp) { + Instruction *Inst = cast(InlasmOp); + Instruction *Head = Baling->getBaleHead(Inst); + BaleInfo BI = Baling->getBaleInfo(Head); + // If head is g_store than change head to store's + // operand and check if it's baled wrr + if (BI.Type == BaleInfo::GSTORE) { + DstDesc.GStore = Head; + Head = cast(Head->getOperand(0)); + BI = Baling->getBaleInfo(Head); + } + if (BI.Type == BaleInfo::WRREGION) { + DstDesc.WrRegion = Head; + DstDesc.WrRegionBI = BI; + } + InlasmOpAsString = createInlineAsmDestinationOperand( + InlasmOp, DONTCARESIGNED, Info.getConstraintType(), 0, DstDesc); + } else { + // Can't deduce output operand because there are no users + // but we have register allocated. If region is needed we can use + // default one based one type. 
+ SimpleValue SV(CI, ArgNo); + Register *Reg = RegAlloc->getRegForValue(KernFunc, SV, DONTCARESIGNED); + Region R(SV.getType()); + InlasmOpAsString = + createInlineAsmOperand(Reg, &R, true /*IsDst*/, DONTCARESIGNED, + Info.getConstraintType(), 0); + } + } else { + // Input of inline assembly + InlasmOp = CI->getArgOperand(ArgNo - NumOutputs); + bool IsBaled = false; + if (GenXIntrinsic::isRdRegion(InlasmOp)) { + Instruction *RdR = cast(InlasmOp); + IsBaled = Baling->isBaled(RdR); + } + InlasmOpAsString = createInlineAsmSourceOperand( + InlasmOp, DONTCARESIGNED, IsBaled, Info.getConstraintType()); + } + // Substitute string name of the variable until + // there are no possible sustitutions. Do-while + // since first match was checked in the beginning + // of the loop. + do { + AsmStr = R.sub(InlasmOpAsString, AsmStr); + } while (R.match(AsmStr)); + } + + AsmTextStream << "\n// INLASM BEGIN\n" + << AsmStr << "\n// INLASM END\n" + << std::endl; +} + +void GenXKernelBuilder::buildCall(IGCLLVM::CallInst *CI, + const DstOpndDesc &DstDesc) { + LLVM_DEBUG(dbgs() << CI << "\n"); + Function *Callee = CI->getCalledFunction(); + + if (!Callee || Callee->hasFnAttribute(genx::FunctionMD::CMStackCall)) { + buildStackCall(CI, DstDesc); + return; + } + + unsigned LabelKind = LABEL_SUBROUTINE; + if (Callee->hasFnAttribute("CMCallable")) + LabelKind = LABEL_FC; + else + assert(FG == FG->getParent()->getGroup(Callee) && + "unexpected call to outside FunctionGroup"); + + // Check whether the called function has a predicate arg that is EM. + int EMOperandNum = -1; + for (auto ai = Callee->arg_begin(), ae = Callee->arg_end(); ai != ae; ++ai) { + auto Arg = &*ai; + if (!Arg->getType()->getScalarType()->isIntegerTy(1)) + continue; + if (Liveness->getLiveRange(Arg)->getCategory() == RegCategory::EM) { + EMOperandNum = Arg->getArgNo(); + break; + } + } + + if (EMOperandNum < 0) { + addDebugInfo(); + // Scalar calls must be marked with NoMask + CISA_CALL(Kernel->AppendVISACFCallInst( + nullptr, vISA_EMASK_M1_NM, EXEC_SIZE_1, + Labels[getOrCreateLabel(Callee, LabelKind)])); + } else { + auto PredicateOpnd = NoMask ? 
nullptr : createPred(CI, BaleInfo(), EMOperandNum); + addDebugInfo(); + CISA_CALL(Kernel->AppendVISACFCallInst( + PredicateOpnd, vISA_EMASK_M1, + getExecSizeFromValue( + CI->getArgOperand(EMOperandNum)->getType()->getVectorNumElements()), + Labels[getOrCreateLabel(Callee, LabelKind)])); + } +} + +void GenXKernelBuilder::buildRet(ReturnInst *RI) { + uint32_t FloatControl = 0; + auto F = RI->getFunction(); + F->getFnAttribute(genx::FunctionMD::CMFloatControl) + .getValueAsString() + .getAsInteger(0, FloatControl); + FloatControl &= CR_Mask; + if (FloatControl != DefaultFloatControl) { + buildControlRegUpdate(CR_Mask, true); + if (DefaultFloatControl) + buildControlRegUpdate(DefaultFloatControl, false); + } + addDebugInfo(); + if (!isKernel(F) && + (F->hasFnAttribute(genx::FunctionMD::CMStackCall) || + F->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly))) { + CISA_CALL(Kernel->AppendVISACFFunctionRetInst(nullptr, vISA_EMASK_M1, + EXEC_SIZE_16)); + } else { + CISA_CALL(Kernel->AppendVISACFRetInst(nullptr, vISA_EMASK_M1, EXEC_SIZE_1)); + } +} + +/*********************************************************************** + * createRawSourceOperand : create raw source operand of instruction + * + * Enter: Inst = instruction to get source operand from + * OperandNum = operand number + * BI = BaleInfo for Inst (so we can tell whether a rdregion + * or modifier is bundled in) + */ +VISA_RawOpnd *GenXKernelBuilder::createRawSourceOperand(Instruction *Inst, + unsigned OperandNum, + BaleInfo BI, + Signedness Signed) { + VISA_RawOpnd *ResultOperand = nullptr; + Value *V = Inst->getOperand(OperandNum); + if (isa(V)) { + CISA_CALL(Kernel->CreateVISANullRawOperand(ResultOperand, false)); + } else { + unsigned ByteOffset = 0; + if (Baling->getBaleInfo(Inst).isOperandBaled(OperandNum)) { + Instruction *RdRegion = cast(V); + Region R(RdRegion, BaleInfo()); + ByteOffset = R.Offset; + V = RdRegion->getOperand(0); + } + Register *Reg = RegAlloc->getRegForValue(KernFunc, V, Signed); + assert(Reg->Category == RegCategory::GENERAL); + CISA_CALL(Kernel->CreateVISARawOperand( + ResultOperand, Reg->GetVar(Kernel), ByteOffset)); + } + return ResultOperand; +} + +/*********************************************************************** + * createRawDestination : create raw destination operand + * + * Enter: Inst = destination value + * WrRegion = 0 else wrregion that destination is baled into + * + * A raw destination can be baled into a wrregion, but only if the region + * is direct and its start index is GRF aligned. + */ +VISA_RawOpnd * +GenXKernelBuilder::createRawDestination(Value *V, const DstOpndDesc &DstDesc, + Signedness Signed) { + VISA_RawOpnd *ResultOperand = nullptr; + unsigned ByteOffset = 0; + if (DstDesc.WrRegion) { + V = DstDesc.WrRegion; + Region R(DstDesc.WrRegion, BaleInfo()); + ByteOffset = R.Offset; + } + Type *OverrideType = nullptr; + if (DstDesc.GStore) { + V = getUnderlyingGlobalVariable(DstDesc.GStore->getOperand(1)); + assert(V && "out of sync"); + OverrideType = DstDesc.GStore->getOperand(0)->getType(); + } + Register *Reg = RegAlloc->getRegForValueOrNull(KernFunc, V, Signed, OverrideType); + if (!Reg) { + // No register assigned. This happens to an unused raw result where the + // result is marked as RAW_NULLALLOWED in GenXIntrinsics. 
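+    // E.g. (illustrative): an intrinsic call whose raw result has no uses
+    // never receives a register from the register allocator, so the
+    // destination is emitted as the vISA null raw operand below.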
+ CISA_CALL(Kernel->CreateVISANullRawOperand(ResultOperand, true)); + } else { + assert(Reg->Category == RegCategory::GENERAL); + CISA_CALL(Kernel->CreateVISARawOperand( + ResultOperand, Reg->GetVar(Kernel), ByteOffset)); + } + return ResultOperand; +} + +/*********************************************************************** + * getLabel : get label number for a Function or BasicBlock + * + * Return: label number, -1 if none found + */ +int GenXKernelBuilder::getLabel(Value *V) { + std::map::iterator i = LabelMap.find(V); + if (i != LabelMap.end()) + return i->second; + return -1; +} + +/*********************************************************************** + * setLabel : set the label number for a Function or BasicBlock + */ +void GenXKernelBuilder::setLabel(Value *V, unsigned Num) { LabelMap[V] = Num; } + +unsigned GenXKernelBuilder::addStringToPool(StringRef Str) { + auto val = std::pair(Str.begin(), StringPool.size()); + auto Res = StringPool.insert(val); + return Res.first->second; +} + +StringRef GenXKernelBuilder::getStringByIndex(unsigned Val) { + for (const auto &it : StringPool) { + if (it.second == Val) + return it.first; + } + llvm_unreachable("Can't find string by index."); +} + +/*********************************************************************** + * GenXKernelBuilder::getLoops : get loop info for given function, cacheing in + * Loops map + */ +LoopInfoBase *GenXKernelBuilder::getLoops(Function *F) { + auto LoopsEntry = &Loops[F]; + if (!*LoopsEntry) { + auto DT = DTs->getDomTree(F); + *LoopsEntry = new LoopInfoBase; + (*LoopsEntry)->analyze(*DT); + } + return *LoopsEntry; +} + +/*********************************************************************** + * Get size of the argument of type 'type' in bytes considering layout of + * subtypes of aggregate type in units of size 'mod' + * mod is typically 32 (GRF) or 16 (oword) + */ +unsigned GenXKernelBuilder::getValueSize(Type *T, unsigned Mod) const { + unsigned Result = 0; + if (T->isAggregateType()) { + for (unsigned i = 0; i < T->getStructNumElements(); i++) { + Result += getValueSize(T->getContainedType(i)) / Mod + + (getValueSize(T->getContainedType(i)) % Mod ? 
                  1 : 0);
+    }
+    Result *= Mod;
+  } else
+    Result = FG->getModule()->getDataLayout().getTypeSizeInBits(T) / 8;
+  return Result;
+}
+
+unsigned GenXKernelBuilder::getFuncArgsSize(llvm::Function *F) {
+  unsigned Result = 0;
+  for (auto &Arg : F->args())
+    Result += getValueSize(&Arg);
+  return Result;
+}
+
+GenericCisaVariable *
+GenXKernelBuilder::createCisaVariable(VISAKernel *Kernel, const char *Name,
+                                      VISA_GenVar *AliasVar,
+                                      unsigned ByteSize) {
+  auto it = CisaVars[Kernel].find(Name);
+  if (it != CisaVars[Kernel].end())
+    it->second = GenericCisaVariable(Name, AliasVar, ByteSize);
+  else
+    CisaVars[Kernel].insert(
+        std::make_pair(Name, GenericCisaVariable(Name, AliasVar, ByteSize)));
+  return &(CisaVars[Kernel].at(Name));
+}
+
+static unsigned deduceByteSize(Value *V, const DataLayout &DL) {
+  return DL.getTypeSizeInBits(V->getType()->getScalarType()) / 8;
+}
+
+static unsigned deduceByteSize(CisaVariable *V, const DataLayout &DL) {
+  assert(V->getType() < ISA_TYPE_NUM);
+  return CISATypeTable[V->getType()].typeSize;
+}
+
+/**************************************************************************************************
+ * emitVectorCopy : emit vISA that copies from Src to Dst
+ *
+ * Emit a sufficient number of MOVs from Src to Dst, picking the copy size in
+ * a greedy manner
+ *
+ * T1 and T2 should be llvm::Value and CisaVariable or vice-versa,
+ * CisaVariable=>CisaVariable or Value=>Value copying is not supported here
+ *
+ */
+template <typename T1, typename T2>
+void GenXKernelBuilder::emitVectorCopy(T1 *Dst, T2 *Src, unsigned &RowOff,
+                                       unsigned &ColOff, unsigned &SrcRowOff,
+                                       unsigned &SrcColOff, int TotalSize,
+                                       bool DoCopy) {
+  auto partCopy = [&](int Sz) {
+    int ByteSz = Sz * deduceByteSize(Dst, DL);
+    assert(ByteSz);
+
+    unsigned Start = SrcRowOff;
+    unsigned End =
+        (SrcRowOff * getGRFSize() + SrcColOff + ByteSz) / getGRFSize();
+
+    // a mov must not span more than 2 GRFs
+    if (End - Start >= 2) {
+      assert(Sz > 1);
+      return;
+    }
+
+    while (TotalSize >= ByteSz) {
+      VISA_VectorOpnd *ArgSrc = nullptr, *ArgDst = nullptr;
+      unsigned Offset = SrcRowOff * GrfByteSize + SrcColOff;
+      ArgSrc = createSource(Src, UNSIGNED, Sz, &Offset);
+      SrcRowOff += (SrcColOff + ByteSz) / GrfByteSize;
+      SrcColOff = (SrcColOff + ByteSz) % GrfByteSize;
+
+      Offset = RowOff * GrfByteSize + ColOff;
+      ArgDst = createDestination(Dst, UNSIGNED, &Offset);
+      RowOff += (ColOff + ByteSz) / GrfByteSize;
+      ColOff = (ColOff + ByteSz) % GrfByteSize;
+
+      if (DoCopy)
+        CISA_CALL(Kernel->AppendVISADataMovementInst(
+            ISA_MOV, nullptr, false,
+            (NoMask ?
vISA_EMASK_M1_NM : vISA_EMASK_M1), + getExecSizeFromValue(Sz), ArgDst, ArgSrc)); + TotalSize -= ByteSz; + } + }; + partCopy(16); + partCopy(8); + partCopy(4); + partCopy(2); + partCopy(1); +} + +void GenXKernelBuilder::pushStackArg(VISA_StateOpndHandle *Dst, Value *Src, + int TotalSz, unsigned &RowOff, + unsigned &ColOff, unsigned &SrcRowOff, + unsigned &SrcColOff, bool DoCopy) { + VISA_GenVar *StackOff = nullptr, *Sp = nullptr; + + auto StackTmp = createCisaVariable(Kernel, "stackTmp", nullptr, TotalSz); + + auto TmpType = llvmToVisaType(Src->getType()); + auto TmpVar = StackTmp->getAlias(TmpType, Kernel); + + CISA_CALL(Kernel->CreateVISAGenVar(StackOff, "stackOff", 1, ISA_TYPE_UQ, + ALIGN_OWORD)); + unsigned RawOff = 0; + auto partCopy = [&](int Sz) { + // TODO: mb we have some constant for oword size + int ByteSz = Sz * BYTES_PER_OWORD; + int CopySz = std::min(ByteSz, TotalSz); + + while (TotalSz - ByteSz >= 0 || (TotalSz > 0 && Sz == 1)) { + CISA_CALL(Kernel->GetPredefinedVar(Sp, PREDEFINED_FE_SP)); + VISA_VectorOpnd *SpOpSrc1 = nullptr; + VISA_VectorOpnd *SpOpSrc2 = nullptr; + VISA_VectorOpnd *SpOpDst = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(SpOpDst, Sp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(SpOpSrc1, Sp, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(SpOpSrc2, Sp, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + + VISA_VectorOpnd *TmpOffDst = nullptr, *TmpOffSrc = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(TmpOffDst, StackOff, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(TmpOffSrc, StackOff, MODIFIER_NONE, + 0, 1, 0, 0, 0)); + + emitVectorCopy(TmpVar, Src, RowOff, ColOff, SrcRowOff, SrcColOff, CopySz, + DoCopy); + VISA_VectorOpnd *Imm = nullptr; + unsigned OffVal = Sz; + if (Subtarget->useGlobalMem()) + OffVal *= BYTES_PER_OWORD; + CISA_CALL(Kernel->CreateVISAImmediate(Imm, &OffVal, ISA_TYPE_UD)); + VISA_RawOpnd *RawSrc = nullptr; + CISA_CALL( + Kernel->CreateVISARawOperand(RawSrc, TmpVar->getGenVar(), RawOff)); + RawOff += Sz * BYTES_PER_OWORD; + + if (DoCopy) { + CISA_CALL(Kernel->AppendVISADataMovementInst(ISA_MOV, nullptr, false, + vISA_EMASK_M1, EXEC_SIZE_1, + TmpOffDst, SpOpSrc1)); + if (Subtarget->useGlobalMem()) { + CISA_CALL(Kernel->AppendVISASvmBlockStoreInst( + getCisaOwordNumFromNumber(Sz), true, TmpOffSrc, RawSrc)); + } else { + CISA_CALL(Kernel->AppendVISASurfAccessOwordLoadStoreInst( + ISA_OWORD_ST, vISA_EMASK_M1, Dst, getCisaOwordNumFromNumber(Sz), + TmpOffSrc, RawSrc)); + } + } + CISA_CALL(Kernel->AppendVISAArithmeticInst(ISA_ADD, nullptr, false, + vISA_EMASK_M1, EXEC_SIZE_1, + SpOpDst, SpOpSrc2, Imm)); + TotalSz -= ByteSz; + } + }; + + partCopy(8); + partCopy(4); + partCopy(2); + partCopy(1); +} + +void GenXKernelBuilder::popStackArg(llvm::Value *Dst, VISA_StateOpndHandle *Src, + int TotalSz, unsigned &RowOff, + unsigned &ColOff, unsigned &SrcRowOff, + unsigned &SrcColOff, int &PrevStackOff) { + VISA_GenVar *StackOff = nullptr, *Sp = nullptr; + + auto StackTmp = createCisaVariable(Kernel, "stackTmp", nullptr, TotalSz); + + auto TmpType = llvmToVisaType(Dst->getType()); + auto TmpVar = StackTmp->getAlias(TmpType, Kernel); + + CISA_CALL(Kernel->CreateVISAGenVar(StackOff, "stackOff", 1, ISA_TYPE_UQ, + ALIGN_OWORD)); + auto partCopy = [&](int Sz) { + // TODO: mb we have some constant for oword size + int ByteSz = Sz * BYTES_PER_OWORD; + while (TotalSz - ByteSz >= 0 || (TotalSz > 0 && Sz == 1)) { + CISA_CALL(Kernel->GetPredefinedVar(Sp, PREDEFINED_FE_SP)); + VISA_VectorOpnd *SpOpSrc = nullptr; + 
CISA_CALL(Kernel->CreateVISASrcOperand(SpOpSrc, Sp, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + + VISA_VectorOpnd *TmpOffDst = nullptr; + VISA_VectorOpnd *TmpOffSrc = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(TmpOffDst, StackOff, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(TmpOffSrc, StackOff, MODIFIER_NONE, + 0, 1, 0, 0, 0)); + + VISA_VectorOpnd *Imm = nullptr; + int OffVal = PrevStackOff; + if (Subtarget->useGlobalMem()) + OffVal *= BYTES_PER_OWORD; + CISA_CALL(Kernel->CreateVISAImmediate(Imm, &OffVal, ISA_TYPE_UD)); + PrevStackOff += Sz; + VISA_RawOpnd *RawSrc = nullptr; + CISA_CALL(Kernel->CreateVISARawOperand(RawSrc, TmpVar->getGenVar(), 0)); + + CISA_CALL(Kernel->AppendVISAArithmeticInst(ISA_ADD, nullptr, false, + vISA_EMASK_M1, EXEC_SIZE_1, + TmpOffDst, SpOpSrc, Imm)); + if (Subtarget->useGlobalMem()) { + CISA_CALL(Kernel->AppendVISASvmBlockLoadInst( + getCisaOwordNumFromNumber(Sz), false, TmpOffSrc, RawSrc)); + } else { + CISA_CALL(Kernel->AppendVISASurfAccessOwordLoadStoreInst( + ISA_OWORD_LD, vISA_EMASK_M1, Src, getCisaOwordNumFromNumber(Sz), + TmpOffSrc, RawSrc)); + } + + int CopySz = std::min(ByteSz, TotalSz); + SrcRowOff = SrcColOff = 0; + emitVectorCopy(Dst, TmpVar, RowOff, ColOff, SrcRowOff, SrcColOff, CopySz); + TotalSz -= ByteSz; + } + SrcRowOff = SrcColOff = 0; + }; + + partCopy(8); + partCopy(4); + partCopy(2); + partCopy(1); +} + +/************************************************************************************************** + * beginFunction : emit function prologue and arguments passing code + * + * Emit stack-related function prologue if Func is a kernel and there're + * stackcalls or Func is a stack function. + * + * Prologue performs Sp and Fp initialization (both for kernel and stack + * function). For stack functions arguments passing code is generated as well, + * %arg and stackmem passing is supported. + */ +void GenXKernelBuilder::beginFunction(Function *Func) { + VISA_GenVar *Sp = nullptr, *Fp = nullptr, *Hwtid = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Sp, PREDEFINED_FE_SP)); + CISA_CALL(Kernel->GetPredefinedVar(Fp, PREDEFINED_FE_FP)); + CISA_CALL(Kernel->GetPredefinedVar(Hwtid, PREDEFINED_HW_TID)); + + VISA_VectorOpnd *SpOpSrc = nullptr; + VISA_VectorOpnd *SpOpSrc1 = nullptr; + VISA_VectorOpnd *SpOpDst = nullptr; + VISA_VectorOpnd *SpOpDst1 = nullptr; + VISA_VectorOpnd *FpOpDst = nullptr; + VISA_VectorOpnd *FpOpSrc = nullptr; + VISA_VectorOpnd *Imm = nullptr; + + CISA_CALL(Kernel->CreateVISADstOperand(SpOpDst, Sp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISADstOperand(SpOpDst1, Sp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISADstOperand(FpOpDst, Fp, 1, 0, 0)); + + CISA_CALL( + Kernel->CreateVISASrcOperand(SpOpSrc, Sp, MODIFIER_NONE, 0, 1, 0, 0, 0)); + CISA_CALL( + Kernel->CreateVISASrcOperand(SpOpSrc1, Sp, MODIFIER_NONE, 0, 1, 0, 0, 0)); + + CISA_CALL( + Kernel->CreateVISASrcOperand(FpOpSrc, Fp, MODIFIER_NONE, 0, 1, 0, 0, 0)); + + if (isKernel(Func) && (HasStackcalls || HasAlloca)) { + // init kernel stack + VISA_GenVar *Hwtid = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Hwtid, PREDEFINED_HW_TID)); + + VISA_VectorOpnd *HwtidOp = nullptr; + uint32_t Val = STACK_PER_THREAD; + + CISA_CALL(Kernel->CreateVISAImmediate(Imm, &Val, ISA_TYPE_UD)); + CISA_CALL(Kernel->CreateVISASrcOperand(HwtidOp, Hwtid, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + + if (StackSurf == PREDEFINED_SURFACE_STACK) { + CISA_CALL(Kernel->AppendVISAArithmeticInst( + ISA_MUL, nullptr, false, (NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, SpOpDst, HwtidOp, Imm)); + } else { + VISA_GenVar *Tmp = nullptr; + CISA_CALL( + Kernel->CreateVISAGenVar(Tmp, "SpOff", 1, ISA_TYPE_UQ, ALIGN_DWORD)); + + VISA_VectorOpnd *OffOpDst = nullptr; + VISA_VectorOpnd *OffOpSrc = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(OffOpDst, Tmp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(OffOpSrc, Tmp, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + CISA_CALL(Kernel->AppendVISAArithmeticInst( + ISA_MUL, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, OffOpDst, HwtidOp, Imm)); + + VISA_VectorOpnd *OpSrc = nullptr; + if (Subtarget->useGlobalMem()) { + assert(Func->arg_size() > 0); + Value &PrivBase = *(Func->arg_end() - 1); + genx::KernelArgInfo AI(TheKernelMetadata.getArgKind(Func->arg_size() - 1)); + assert(AI.isPrivateBase()); + OpSrc = createSource(&PrivBase, DONTCARESIGNED); + } else { + VISA_GenVar *R0 = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(R0, PREDEFINED_R0)); + + CISA_CALL(Kernel->CreateVISASrcOperand(OpSrc, R0, MODIFIER_NONE, 0, 1, + 0, 0, 5)); + } + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, SpOpDst, OpSrc)); + Kernel->AppendVISAArithmeticInst( + ISA_ADD, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, SpOpDst1, SpOpSrc1, OffOpSrc); + } + CISA_CALL(Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, FpOpDst, SpOpSrc)); + // use the max available for now + unsigned SMO = Subtarget ? Subtarget->stackSurfaceMaxSize() : 8192; + Kernel->AddKernelAttribute("SpillMemOffset", 4, &SMO); + } else if (Func->hasFnAttribute(genx::FunctionMD::CMStackCall) || + Func->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly)) { + if (Func->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly)) { + int ExtVal = 1; + Kernel->AddKernelAttribute("Extern", 4, &ExtVal); + } + // stack function prologue + VISA_GenVar *FpTmp = nullptr; + + auto *ArgVar = &CisaVars[Kernel].at("argv"); + auto *RetVar = &CisaVars[Kernel].at("retv"); + + if (FPMap.count(Func) == 0) { + CISA_CALL( + Kernel->CreateVISAGenVar(FpTmp, "tmp", 1, ISA_TYPE_UD, ALIGN_DWORD)); + FPMap.insert(std::pair(Func, FpTmp)); + } else + FpTmp = FPMap[Func]; + + // init func stack pointers + VISA_VectorOpnd *TmpOp = nullptr; + CISA_CALL(Kernel->CreateVISADstOperand(TmpOp, FpTmp, 1, 0, 0)); + + Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, TmpOp, FpOpSrc); + Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, false, (NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, FpOpDst, SpOpSrc); + + // unpack args + int Sz = 0, StackOff = 0; + unsigned RowOff = 0, ColOff = 0, SrcRowOff = 0, SrcColOff = 0; + bool StackStarted = false; + unsigned NoStackSize = 0; + // NOTE: using reverse iterators for args would be much better we don't have + // any though + for (auto &FArg : Func->args()) { + if (Liveness->getLiveRange(&FArg) && + Liveness->getLiveRange(&FArg)->getCategory() == RegCategory::EM) + continue; + + RowOff = 0, ColOff = 0; + unsigned ArgSize = getValueSize(FArg.getType()); + if (SrcColOff && + (FArg.getType()->isVectorTy() || ArgSize > (GrfByteSize - ColOff))) { + SrcRowOff++; + SrcColOff = 0; + NoStackSize++; + } + if (Liveness->getLiveRange(&FArg)->getCategory() == + RegCategory::PREDICATE) { + VISA_VectorOpnd *argSrc = nullptr; + Kernel->CreateVISASrcOperand( + argSrc, + ArgVar->getAlias(llvmToVisaType(FArg.getType()), Kernel) + ->getGenVar(), + MODIFIER_NONE, 0, 1, 0, SrcRowOff, SrcColOff); + auto *PReg = + RegAlloc->getRegForValueOrNull(KernFunc, SimpleValue(&FArg)); + assert(PReg); + Kernel->AppendVISASetP(vISA_EMASK_M1_NM, EXEC_SIZE_32, + PReg->GetVar(Kernel), argSrc); + } else { + if ((int)ArgVar->getByteSize() - SrcRowOff * GrfByteSize >= ArgSize && + !StackStarted) { + emitVectorCopy(&FArg, ArgVar->getAlias(&FArg, Kernel), RowOff, ColOff, + SrcRowOff, SrcColOff, getValueSize(&FArg)); + NoStackSize = RowOff; + } else { + StackStarted = true; + VISA_StateOpndHandle *stackSurf = nullptr; + VISA_SurfaceVar *stackSurfVar = nullptr; + CISA_CALL(Kernel->GetPredefinedSurface(stackSurfVar, StackSurf)); + CISA_CALL( + Kernel->CreateVISAStateOperandHandle(stackSurf, stackSurfVar)); + popStackArg(&FArg, stackSurf, ArgSize, RowOff, ColOff, SrcRowOff, + SrcColOff, StackOff); + } + } + Sz += ArgSize; + } + if (!StackStarted && ColOff) + NoStackSize++; + auto *StackCallee = Func2Kern[Func]; + auto *FuncTy = Func->getFunctionType(); + int RetSize = + (FuncTy->getReturnType()->isVoidTy() || + getValueSize(FuncTy->getReturnType()) > RetVar->getByteSize()) + ? 0 + : (getValueSize(FuncTy->getReturnType()) + GrfByteSize - 1) / + GrfByteSize; + + StackCallee->SetFunctionInputSize(NoStackSize); + StackCallee->SetFunctionReturnSize(RetSize); + StackCallee->AddKernelAttribute("ArgSize", 1, &NoStackSize); + StackCallee->AddKernelAttribute("RetValSize", 1, &RetSize); + } +} + +/************************************************************************************************** + * endFunction : emit function epilogue and return value passing code + * + * Emit stack-related function epilogue if Func is a stack function. + * + * Epilogue restores Sp and Fp. Return value may be passed either visa %retval + * arg or stackmem, both scalar/vector and aggregate types are supported (please + * also see build[Extract|Insert]Value). 
+ */ +void GenXKernelBuilder::endFunction(Function *Func, ReturnInst *RI) { + if (!isKernel(Func) && + (Func->hasFnAttribute(genx::FunctionMD::CMStackCall) || + Func->hasFnAttribute(genx::FunctionMD::ReferencedIndirectly))) { + VISA_GenVar *Sp = nullptr, *Fp = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Sp, PREDEFINED_FE_SP)); + CISA_CALL(Kernel->GetPredefinedVar(Fp, PREDEFINED_FE_FP)); + + VISA_VectorOpnd *SpOpSrc = nullptr; + VISA_VectorOpnd *SpOpDst = nullptr; + VISA_VectorOpnd *FpOpDst = nullptr; + VISA_VectorOpnd *FpOpSrc = nullptr; + + CISA_CALL(Kernel->CreateVISADstOperand(SpOpDst, Sp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISADstOperand(FpOpDst, Fp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(SpOpSrc, Sp, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(FpOpSrc, Fp, MODIFIER_NONE, 0, 1, + 0, 0, 0)); + + VISA_VectorOpnd *TmpOp = nullptr; + CISA_CALL(Kernel->CreateVISASrcOperand(TmpOp, FPMap[Func], MODIFIER_NONE, + 0, 1, 0, 0, 0)); + + Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, SpOpDst, FpOpSrc); + Kernel->AppendVISADataMovementInst( + ISA_MOV, nullptr, false, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, FpOpDst, TmpOp); + + VISA_GenVar *Ret = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Ret, PREDEFINED_RET)); + + if (!Func->getReturnType()->isVoidTy() && + !Func->getReturnType()->isAggregateType() && + Liveness->getLiveRange(RI->getReturnValue()) && + (Liveness->getLiveRange(RI->getReturnValue())->getCategory() != + RegCategory::EM && + Liveness->getLiveRange(RI->getReturnValue())->getCategory() != + RegCategory::PREDICATE)) { + GenericCisaVariable *RetVar = &CisaVars[Kernel].at("retv"); + assert(!Func->getReturnType()->isAggregateType()); + + // pack retval + unsigned RowOff = 0, ColOff = 0, SrcRowOff = 0, SrcColOff = 0; + if (getValueSize(Func->getReturnType()) <= + RetVar->getByteSize()) { + unsigned RowOff = 0, ColOff = 0, SrcRowOff = 0, SrcColOff = 0; + emitVectorCopy(RetVar->getAlias(RI->getReturnValue(), Kernel), RI->getReturnValue(), + RowOff, ColOff, SrcRowOff, + SrcColOff, getValueSize(RI->getReturnValue())); + } else { + VISA_StateOpndHandle *StackSurfOp = nullptr; + VISA_SurfaceVar *StackSurfVar = nullptr; + CISA_CALL(Kernel->GetPredefinedSurface(StackSurfVar, + StackSurf)); + CISA_CALL( + Kernel->CreateVISAStateOperandHandle(StackSurfOp, StackSurfVar)); + pushStackArg(StackSurfOp, RI->getReturnValue(), + getValueSize(Func->getReturnType()), RowOff, ColOff, + SrcRowOff, SrcColOff); + } + } + for (auto II : RetvInserts) + buildInsertRetv(II); + RetvInserts.clear(); + } +} + +void GenXKernelBuilder::buildExtractRetv(ExtractValueInst *Inst) { + auto T = Inst->getOperand(0)->getType(); + auto *RetVar = &CisaVars[Kernel].at("retv"); + + bool UseStack = getValueSize(T) > RetVar->getByteSize(); + + auto Index = Inst->getIndices().front(); + if (T->getContainedType(Index)->isVectorTy() && + T->getContainedType(Index)->getVectorElementType()->isIntegerTy(1)) + // elements of type should be ignored + return; + + unsigned RowOff = 0, ColOff = 0; + unsigned SrcRowOff = 0, SrcColOff = 0; + for (unsigned i = 0; i < Index; i++) { + int Mod = UseStack ? 
BYTES_PER_OWORD : GrfByteSize; + SrcRowOff += (getValueSize(T->getContainedType(i)) + Mod - 1) / Mod; + } + + if (UseStack) { + int Prev = SrcRowOff; + VISA_StateOpndHandle *StackSurfOp = nullptr; + VISA_SurfaceVar *StackSurfVar = nullptr; + CISA_CALL( + Kernel->GetPredefinedSurface(StackSurfVar, StackSurf)); + CISA_CALL(Kernel->CreateVISAStateOperandHandle(StackSurfOp, StackSurfVar)); + popStackArg(Inst, StackSurfOp, getValueSize(T->getContainedType(Index)), + RowOff, ColOff, SrcRowOff, SrcColOff, Prev); + } else + emitVectorCopy(Inst, RetVar->getAlias(Inst, Kernel), RowOff, ColOff, + SrcRowOff, SrcColOff, getValueSize(Inst)); +} + +void GenXKernelBuilder::buildInsertRetv(InsertValueInst *Inst) { + auto T = Inst->getOperand(0)->getType(); + auto *RetVar = &CisaVars[Kernel].at("retv"); + + bool UseStack = getValueSize(T) > RetVar->getByteSize(); + + auto Index = Inst->getIndices().front(); + if (T->getContainedType(Index)->isVectorTy() && + T->getContainedType(Index)->getVectorElementType()->isIntegerTy(1)) { + // elements of type should be ignored + return; + } + + unsigned RowOff = 0, ColOff = 0; + unsigned SrcRowOff = 0, SrcColOff = 0; + + if (!UseStack) + for (unsigned i = 0; i < Index; i++) + RowOff += (getValueSize(T->getContainedType(i)) + GrfByteSize - 1) / + GrfByteSize; + + if (UseStack) { + VISA_StateOpndHandle *StackSurfOp = nullptr; + VISA_SurfaceVar *StackSurfVar = nullptr; + CISA_CALL( + Kernel->GetPredefinedSurface(StackSurfVar, StackSurf)); + CISA_CALL(Kernel->CreateVISAStateOperandHandle(StackSurfOp, StackSurfVar)); + pushStackArg(StackSurfOp, Inst->getOperand(1), + getValueSize(T->getContainedType(Index)), RowOff, ColOff, + SrcRowOff, SrcColOff); + } else + emitVectorCopy(RetVar->getAlias(Inst->getOperand(1), Kernel), + Inst->getOperand(1), RowOff, ColOff, SrcRowOff, SrcColOff, + getValueSize(Inst->getOperand(1))); +} + +void GenXKernelBuilder::buildStackCall(IGCLLVM::CallInst *CI, + const DstOpndDesc &DstDesc) { + LLVM_DEBUG(dbgs() << "Build stack call\n"; CI->print(dbgs()); dbgs() << "\n"); + Function *Callee = CI->getCalledFunction(); + auto *FuncTy = CI->getFunctionType(); + auto *StackCallee = Func2Kern[Callee]; + assert(CI->isIndirectCall() || StackCallee); + + // Check whether the called function has a predicate arg that is EM. 
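+  // (If such an argument is found, it is skipped when packing %arg below and
+  // is instead used further down to build the call predicate and to derive
+  // the execution size of the call instruction.)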
+ int EMOperandNum = -1, EMIdx = -1; + for (auto &Arg : CI->arg_operands()) { + ++EMIdx; + if (!Arg->getType()->getScalarType()->isIntegerTy(1)) + continue; + if (Liveness->getLiveRange(Arg)->getCategory() == RegCategory::EM) { + EMOperandNum = EMIdx; + break; + } + } + + int TotalArgSize = 0; + for (auto &CallArg : CI->arg_operands()) + TotalArgSize += getValueSize(CallArg->getType()); + + VISA_GenVar *Sp = nullptr, *Arg = nullptr, *Ret = nullptr; + CISA_CALL(Kernel->GetPredefinedVar(Sp, PREDEFINED_FE_SP)); + CISA_CALL(Kernel->GetPredefinedVar(Arg, PREDEFINED_ARG)); + CISA_CALL(Kernel->GetPredefinedVar(Ret, PREDEFINED_RET)); + + unsigned ColOff = 0, RowOff = 0, SrcRowOff = 0, SrcColOff = 0; + + int Sz = 0, NoStackSize = 0, StackArgSz = 0; + uint64_t StackOff = 0; + bool StackStarted = false; + // pack arguments + for (auto &CallArg : CI->arg_operands()) { + auto *CallArgLR = Liveness->getLiveRangeOrNull(CallArg.get()); + if (CallArgLR && CallArgLR->getCategory() == RegCategory::EM) + continue; + + assert(!CallArg->getType()->isAggregateType()); + SrcRowOff = 0, SrcColOff = 0; + unsigned ArgSize = getValueSize(CallArg->getType()); + + if (ColOff && (CallArg->getType()->isVectorTy() || + ArgSize > (GrfByteSize - ColOff))) { + RowOff++; + ColOff = 0; + // adjust size if we use only a part the last used GRF + NoStackSize++; + } + + bool IsUndef = isa(CallArg); + auto *ArgVar = &CisaVars[Kernel].at("argv"); + if ((int)ArgVar->getByteSize() - RowOff * GrfByteSize >= ArgSize && + !StackStarted) { + assert(ArgSize <= Sz - ArgVar->getByteSize() && + "cannot pass arg via stack and %arg as well"); + + SrcRowOff = 0, SrcColOff = 0; + if (!IsUndef && CallArgLR->getCategory() == RegCategory::PREDICATE) { + VISA_VectorOpnd *PredDst = nullptr; + Kernel->CreateVISADstOperand( + PredDst, + ArgVar->getAlias(llvmToVisaType(CallArg->getType()), Kernel) + ->getGenVar(), + 1, RowOff, ColOff); + auto PReg = + RegAlloc->getRegForValueOrNull(KernFunc, SimpleValue(CallArg)); + assert(PReg); + Kernel->AppendVISAPredicateMove(PredDst, + PReg->GetVar(Kernel)); + ColOff += ArgSize; + } else + emitVectorCopy( + ArgVar->getAlias(CallArg, Kernel), CallArg, RowOff, ColOff, + SrcRowOff, SrcColOff, getValueSize(CallArg), !IsUndef); + Sz += ArgSize; + NoStackSize = RowOff; + } else { + StackStarted = true; + RowOff = ColOff = 0; + SrcRowOff = SrcColOff = 0; + VISA_StateOpndHandle *StackSurfOp = nullptr; + VISA_SurfaceVar *StackSurfVar = nullptr; + CISA_CALL( + Kernel->GetPredefinedSurface(StackSurfVar, StackSurf)); + CISA_CALL(Kernel->CreateVISAStateOperandHandle(StackSurfOp, StackSurfVar)); + pushStackArg(StackSurfOp, CallArg, ArgSize, RowOff, ColOff, SrcRowOff, + SrcColOff, !IsUndef); + + StackArgSz += (ArgSize / BYTES_PER_OWORD) + (ArgSize % BYTES_PER_OWORD ? 1 : 0); + StackOff = -StackArgSz; + } + } + if (!StackStarted && ColOff) + NoStackSize++; + + VISA_VectorOpnd *SpOpSrc = nullptr, *SpOpDst = nullptr, *Imm = nullptr; + if (StackOff) { + CISA_CALL(Kernel->CreateVISADstOperand(SpOpDst, Sp, 1, 0, 0)); + CISA_CALL(Kernel->CreateVISASrcOperand(SpOpSrc, Sp, MODIFIER_NONE, 0, 1, 0, + 0, 0)); + + if (Subtarget->useGlobalMem()) + StackOff *= BYTES_PER_OWORD; + CISA_CALL(Kernel->CreateVISAImmediate(Imm, &StackOff, ISA_TYPE_UQ)); + CISA_CALL(Kernel->AppendVISAArithmeticInst( + ISA_ADD, nullptr, false, (NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, SpOpDst, SpOpSrc, Imm)); + } + + VISA_PredOpnd *Pred = nullptr; + VISA_Exec_Size Esz = EXEC_SIZE_16; + if (EMOperandNum >= 0) { + Pred = createPred(CI, BaleInfo(), EMOperandNum); + Esz = getExecSizeFromValue( + CI->getArgOperand(EMOperandNum)->getType()->getVectorNumElements()); + } + addDebugInfo(); + + auto *RetVar = &CisaVars[Kernel].at("retv"); + bool ProcessRet = + !FuncTy->getReturnType()->isVoidTy() && + !FuncTy->getReturnType()->isAggregateType() && + !(FuncTy->getReturnType()->isVectorTy() && + FuncTy->getReturnType()->getVectorElementType()->isIntegerTy(1)); + + // cannot use processRet here since aggr/em args should be co + int RetSize = + (FuncTy->getReturnType()->isVoidTy() || + getValueSize(FuncTy->getReturnType()) > RetVar->getByteSize()) + ? 0 + : (getValueSize(FuncTy->getReturnType()) + GrfByteSize - 1) / + GrfByteSize; + if (Callee) { + CISA_CALL(Kernel->AppendVISACFFunctionCallInst( + Pred, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), EXEC_SIZE_16, + Callee->getName(), NoStackSize, RetSize)); + } else { + auto *FuncAddr = createSource(CI->getCalledValue(), DONTCARESIGNED); + assert(FuncAddr); + CISA_CALL(Kernel->AppendVISACFIndirectFuncCallInst( + Pred, (NoMask ? vISA_EMASK_M1_NM : vISA_EMASK_M1), EXEC_SIZE_16, + FuncAddr, NoStackSize, RetSize)); + } + + unsigned StackRetSz = 0; + if (!FuncTy->getReturnType()->isVoidTy() && + getValueSize(FuncTy->getReturnType()) > RetVar->getByteSize()) + StackRetSz = (getValueSize(FuncTy->getReturnType(), BYTES_PER_OWORD) / BYTES_PER_OWORD + + ((getValueSize(FuncTy->getReturnType(), BYTES_PER_OWORD) % BYTES_PER_OWORD) ? 1 : 0)); + // unpack retval + if (ProcessRet && Liveness->getLiveRange(CI) && + Liveness->getLiveRange(CI)->getCategory() != RegCategory::EM) { + unsigned RowOff = 0, ColOff = 0, SrcRowOff = 0, SrcColOff = 0; + if (getValueSize(FuncTy->getReturnType()) <= RetVar->getByteSize()) { + emitVectorCopy(CI, RetVar->getAlias(CI, Kernel), RowOff, ColOff, + SrcRowOff, SrcColOff, getValueSize(CI)); + } else { + int StackOffVal = -StackRetSz; + VISA_StateOpndHandle *StackSurfOp = nullptr; + VISA_SurfaceVar *StackSurfVar = nullptr; + CISA_CALL( + Kernel->GetPredefinedSurface(StackSurfVar, StackSurf)); + CISA_CALL(Kernel->CreateVISAStateOperandHandle(StackSurfOp, StackSurfVar)); + popStackArg(CI, StackSurfOp, getValueSize(Callee->getReturnType()), RowOff, + ColOff, SrcRowOff, SrcColOff, StackOffVal); + } + } + // restore Sp + CISA_CALL( + Kernel->CreateVISASrcOperand(SpOpSrc, Sp, MODIFIER_NONE, 0, 1, 0, 0, 0)); + CISA_CALL(Kernel->CreateVISADstOperand(SpOpDst, Sp, 1, 0, 0)); + uint64_t OffVal = -StackRetSz; + CISA_CALL(Kernel->CreateVISAImmediate(Imm, &OffVal, ISA_TYPE_UQ)); + CISA_CALL(Kernel->AppendVISAArithmeticInst( + ISA_ADD, nullptr, false, (NoMask ? 
vISA_EMASK_M1_NM : vISA_EMASK_M1), + EXEC_SIZE_1, SpOpDst, SpOpSrc, Imm)); +} + +namespace { + +class GenXFinalizer : public ModulePass { + raw_pwrite_stream &Out; + LLVMContext *Ctx = nullptr; + +public: + static char ID; + explicit GenXFinalizer(raw_pwrite_stream &o) : ModulePass(ID), Out(o) {} + + virtual StringRef getPassName() const { return "GenX Finalizer"; } + + LLVMContext &getContext() { + assert(Ctx); + return *Ctx; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); + } + + void fillOCLRuntimeInfo(GenXOCLRuntimeInfo &Info, GenXModule &GM, + FunctionGroupAnalysis &FGA, const GenXSubtarget &ST); + + bool runOnModule(Module &M) { + Ctx = &M.getContext(); + + GenXModule &GM = getAnalysis(); + FunctionGroupAnalysis &FGA = getAnalysis(); + GenXOCLRuntimeInfo *OCLInfo = getAnalysisIfAvailable(); + const GenXSubtarget &ST = *getAnalysis().getSubtarget(); + + std::stringstream ss; + auto *CisaBuilder = GM.GetCisaBuilder(); + if (GM.HasInlineAsm()) { + auto VISAAsmTextReader = GM.GetVISAAsmReader(); + CISA_CALL(VISAAsmTextReader->Compile("genxir", &ss, EmitVisa)); + } else + CISA_CALL(CisaBuilder->Compile("genxir", &ss, EmitVisa)); + if (OCLInfo) + fillOCLRuntimeInfo(*OCLInfo, GM, FGA, ST); + dbgs() << CisaBuilder->GetCriticalMsg(); + GM.DestroyCISABuilder(); + GM.DestroyVISAAsmReader(); + Out << ss.str(); + return false; + } +}; +} // end anonymous namespace. + +char GenXFinalizer::ID = 0; + +ModulePass *llvm::createGenXFinalizerPass(raw_pwrite_stream &o) { + return new GenXFinalizer(o); +} + +static void constructSymbolTable(FunctionGroup &FG, GenXModule &GM, + void *&Buffer, unsigned &ByteSize, + unsigned &NumEntries) { + NumEntries = std::count_if(FG.begin(), FG.end(), [](Function *F) { + return F->hasFnAttribute("referenced-indirectly"); + }); + ByteSize = NumEntries * sizeof(vISA::GenSymEntry); + // this will be eventually freed in AdaptorOCL + Buffer = new vISA::GenSymEntry[NumEntries]; + auto *Entry = static_cast(Buffer); + for (auto &F : FG) + if (F->hasFnAttribute("referenced-indirectly")) { + assert(F->getName().size() <= vISA::MAX_SYMBOL_NAME_LENGTH); + strcpy_s(Entry->s_name, vISA::MAX_SYMBOL_NAME_LENGTH, + F->getName().str().c_str()); + VISAFunction *Func = static_cast(GM.getVISAKernel(F)); + Entry->s_type = vISA::GenSymType::S_FUNC; + Entry->s_offset = Func->getGenOffset(); + Entry->s_size = Func->getGenSize(); + Entry++; + } +} + +void GenXFinalizer::fillOCLRuntimeInfo(GenXOCLRuntimeInfo &OCLInfo, + GenXModule &GM, + FunctionGroupAnalysis &FGA, + const GenXSubtarget &ST) { + using KernelInfo = GenXOCLRuntimeInfo::KernelInfo; + using CompiledKernel = GenXOCLRuntimeInfo::CompiledKernel; + using TableInfo = GenXOCLRuntimeInfo::TableInfo; + for (auto *FG : FGA) { + // Compiler info. + KernelInfo Info{*FG, ST}; + + // Finalizer info (jitter struct and gen binary). + VISAKernel *BuiltKernel = GM.getVISAKernel(FG->getHead()); + assert(BuiltKernel); + FINALIZER_INFO *JitInfo = nullptr; + BuiltKernel->GetJitInfo(JitInfo); + assert(JitInfo && "Jit info is not set by finalizer"); + void *GenBin = nullptr; + int GenBinSize = 0; // Finalizer uses signed int for size... 
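+    // GetGenxBinary hands back the kernel's Gen binary; below it is wrapped
+    // in an ArrayRef for the CompiledKernel and released with freeBlock()
+    // once the compiled kernel has been saved.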
+ BuiltKernel->GetGenxBinary(GenBin, GenBinSize); + assert(GenBin && GenBinSize && + "Unexpected null buffer or zero-sized kernel (compilation failed?)"); + TableInfo &RTable = Info.getRelocationTable(); + CISA_CALL(BuiltKernel->GetGenRelocEntryBuffer(RTable.Buffer, RTable.Size, RTable.Entries)); + TableInfo &STable = Info.getSymbolTable(); + constructSymbolTable(*FG, GM, STable.Buffer, STable.Size, STable.Entries); + + // Save it all here. + CompiledKernel FullInfo{std::move(Info), *JitInfo, + ArrayRef{static_cast(GenBin), + static_cast(GenBinSize)}}; + OCLInfo.saveCompiledKernel(std::move(FullInfo)); + + freeBlock(GenBin); + } +} + +void GenXModule::clearFinalizerArgs(std::vector& Owner) const { + std::for_each(Owner.begin(), Owner.end(), [](const char* a) { delete []a; }); + Owner.clear(); +} + +void GenXModule::collectFinalizerArgs(std::vector &Owner) const { + clearFinalizerArgs(Owner); + + auto grantArgument = [](const std::string& ArgString, + std::vector &Owner) { + const size_t BufferSize = ArgString.size() + 1; + char* ArgCopyBuff = new char [BufferSize]; + std::memcpy(ArgCopyBuff, ArgString.data(), BufferSize); + Owner.push_back(ArgCopyBuff); + }; + + grantArgument("-dumpvisa", Owner); + for (const auto& Fos: FinalizerOpts) { + // Add additional arguments if specified + std::istringstream f(Fos); + std::string s; + while (getline(f, s, ' ')) { + grantArgument(s, Owner); + } + } + Owner.push_back(nullptr); +} + +LLVMContext &GenXModule::getContext() { + assert(Ctx); + return *Ctx; +} + +void GenXModule::InitCISABuilder() { + assert(ST); + auto Platform = ST->getVisaPlatform(); + // Use SKL for unknown platforms + if (Platform == GENX_NONE) + Platform = GENX_SKL; + + // Prepare array of arguments for Builder API. + collectFinalizerArgs(CISA_Args); + + if (PrintFinalizerOptions.getValue()) { + outs() << "Finalizer Parameters:\n\t" << " -platform " << ST->getCPU(); + std::for_each(CISA_Args.begin(), CISA_Args.end(), + [](const char* Arg) { outs() << " " << Arg; }); + outs() << "\n"; + } + + CISA_CALL(CreateVISABuilder(CisaBuilder, + HasInlineAsm() ? vISA_ASM_WRITER : vISA_MEDIA, + EmitVisa ? VISA_BUILDER_VISA : VISA_BUILDER_BOTH, + Platform, CISA_Args.size() - 1, CISA_Args.data(), + WaTable)); + assert(CisaBuilder && "Failed to create VISABuilder!"); +} + +VISABuilder *GenXModule::GetCisaBuilder() { + if (!CisaBuilder) + InitCISABuilder(); + return CisaBuilder; +} + +void GenXModule::DestroyCISABuilder() { + if (CisaBuilder) { + CISA_CALL(DestroyVISABuilder(CisaBuilder)); + CisaBuilder = nullptr; + } +} + +void GenXModule::InitVISAAsmReader() { + assert(ST); + auto Platform = ST->getVisaPlatform(); + // Use SKL for unknown platforms + if (Platform == GENX_NONE) + Platform = GENX_SKL; + + // Prepare array of arguments for Builder API. + collectFinalizerArgs(VISA_Args); + + // Prepare array of arguments for Builder API. 
+ if (PrintFinalizerOptions.getValue()) { + outs() << "Finalizer Parameters:\n\t" << " -platform " << ST->getCPU(); + std::for_each(VISA_Args.begin(), VISA_Args.end(), + [](const char* Arg) { outs() << " " << Arg; }); + outs() << "\n"; + } + + CISA_CALL(CreateVISABuilder(VISAAsmTextReader, vISA_ASM_READER, + VISA_BUILDER_BOTH, Platform, + VISA_Args.size() - 1, VISA_Args.data(), + WaTable)); + assert(VISAAsmTextReader && "Failed to create VISAAsmTextReader!"); +} + +VISABuilder *GenXModule::GetVISAAsmReader() { + if (!VISAAsmTextReader) + InitVISAAsmReader(); + return VISAAsmTextReader; +} + +void GenXModule::DestroyVISAAsmReader() { + if (VISAAsmTextReader) { + CISA_CALL(DestroyVISABuilder(VISAAsmTextReader)); + VISAAsmTextReader = nullptr; + } +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXCoalescing.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCoalescing.cpp new file mode 100644 index 000000000000..623bccd4c6fa --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXCoalescing.cpp @@ -0,0 +1,1759 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXCoalescing +/// -------------- +/// +/// The LLVM target independent code generator, used by most backends, has a +/// coalescing pass that runs after de-SSA of the machine IR and two-address +/// handling, and attempts to remove the added copies by coalescing values. It +/// also attempts to coalesce a value with a hardreg that it is copied to/from. +/// +/// This GenX coalescing and copy insertion pass is a bit different, in that +/// it runs on LLVM IR, which must remain in SSA, and it attempts to coalesce +/// values to try and avoid adding the copy in the first place. In any phi node +/// or two address op where it fails to coalesce, it inserts a copy (and +/// coalesces the result of the copy into the result of the phi node or +/// two address op). +/// +/// There are three different kinds of coalescing. Copy coalescing is done first, +/// then the other two are done together. +/// +/// 1. Copy coalescing. 
+/// +/// Generally there are no copy instructions in SSA, but we +/// can treat a bitcast as a copy (the operand and result can live in the +/// same register aliased in different registers), and an extractvalue is +/// treated as a copy to be coalesced, and the "inserted value" operand +/// and the corresponding element(s) of the result in an insertvalue are +/// treated as a copy to be coalesced. +/// +/// Copy coalescing represents two values that are known to be identical +/// occupying the same register at the same time, thus it is possible even +/// if the two values interfere (are live at the same point). Because we +/// handle copy coalescing before any other kind of coalescing, it usually +/// succeeds. +/// +/// This only works because we do copy coalescing first, so we know that +/// neither value that we want to copy coalesce has already undergone normal +/// or phi coalescing. +/// +/// However there is a case when copy coalescing between two live ranges +/// LR1 and LR2 (each of which is possibly already copy coalesced) cannot be +/// allowed: when LR2 loops round and has a phi use in the same basic block +/// as a phi definition in LR1, where the phi use of LR2 is after the phi +/// definition of LR1. This can happen because LLVM IR does not attach any +/// meaning to the order of phi nodes, but the GenX backend does with its +/// instruction numbering. +/// +/// This constraint on copy coalescing is embodied in the concept of +/// "copy-interference". The two live ranges LR1 and LR2 copy-interfere, +/// meaning they cannot be copy coalesced, if LR1 has a phi definition, +/// one of whose numbers is within LR2's live range. +/// +/// 2. Normal coalescing +/// +/// This arises where we have a two-address operation, that is, it has an +/// operand that needs to be in the same register as the result, because the +/// instruction represents a partial write operation. The main example of +/// this is wrregion, but there are also some shared function intrinsics +/// that need this. +/// +/// Here, we gather all the possible coalesces (including the phi ones), +/// together with an estimate of the cost of failing to coalesce (due to +/// needing to insert a copy), and then sort them in cost order and process +/// them. +/// +/// This kind of coalescing is possible only if the two live ranges do not +/// interfere. If coalescing fails, we need to insert a copy just before +/// the instruction, creating a new value with a very short live range +/// that can trivially be coalesced with the result of the original +/// instruction. +/// +/// Some subkinds of normal coalescing are: +/// +/// 2a. call arg pre-copy +/// +/// A call arg needs to be coalesced with or copied to the corresponding +/// function arg. +/// +/// Unlike most other kinds of coalescing, if coalescing fails, the copy +/// insertion is delayed until later, so we can ensure that the copies +/// are in the same order as the args, as the live ranges were computed +/// on that basis. +/// +/// Normally, call arg pre-copy coalescing occurs, like other normal +/// coalescing, if the two live ranges do not interfere. If this fails, +/// we can still do *call arg special coalescing* (CASC) of call arg A +/// and function arg B as long as both of the following are true: +/// +/// i. B has not been normal coalesced into anything (which would be +/// in the subroutine or some other subroutine it calls), except +/// that B is allowed to be call arg pre-copy coalesced; +/// +/// ii. 
For any other call site where the corresponding call arg is not +/// A, A does not interfere with it. +/// +/// Call arg special coalescing allows call arg A and function arg B to +/// be in the same register, even if A is used after the call, as long +/// as that register is not already being used for a different value +/// in the subroutine, and as long as a different value for the call +/// arg is not used at a different call site where A is live. +/// +/// **Note**: Call arg special coalescing is disabled, because it broke +/// a test and I never got round to investigating why. I don't even know +/// if it would be beneficial any more, given more recent changes to +/// liveness and coalescing. +/// +/// 2b. ret value pre-copy +/// +/// At a ReturnInst, the return value operand needs to be coalesced with +/// or copied to the unified return value for the function. This is +/// handled mostly the same as a normal coalesce. +/// +/// 2c. ret value post-copy +/// +/// After a CallInst for a subroutine call, the unified return value +/// needs to be coalesced with or copied to the result of the call. On +/// failure, the copy insertion is delayed until later. +/// +/// 3. Phi coalescing +/// +/// This is how we "de-SSA" the code. A phi incoming wants to coalesce with +/// the result of the phi node. +/// +/// Again, this kind of coalescing is possible only if the two live ranges +/// do not interfere. (A phi incoming can never interfere with its phi +/// result, but earlier coalescing could make them now interfere.) If +/// coalescing fails, we need to insert a copy at the end of the incoming +/// predecessor basic block. In fact we defer the copy insertion from failed +/// phi coalescing to the end, because we need to make sure the inserted +/// copies are in the same order as the phi nodes, as that is the basis on +/// which the live ranges were constructed. +/// +/// After phi coalescing, the LLVM IR is still in SSA form, but the phi +/// coalescing, and the copies inserted where phi coalescing failed, mean +/// that it is trivial to transform into non-SSA vISA code: generate code for +/// the phi copies, and ignore the phi nodes themselves because they are +/// completely coalesced. +/// +/// Kernel argument copying +/// ^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// The kernel argument offsets (i.e. where kernel arguments appear in the GRF +/// on entry to the kernel) are set in a very early pass just after Clang +/// codegen. This sets offsets and packs holes in a way that is specific to the +/// language being compiled and its contract with its runtime. +/// +/// However, when we get here, we may find that a live range that contains a +/// kernel argument has an alignment requirement that the offset from +/// earlier does not comply with. +/// +/// So an extra function of this pass, after doing the coalescing, is to spot +/// this case, where a kernel argument has an offset that is not aligned enough, +/// and insert an extra copy at the start of the function. 
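///
/// As a simplified illustration of the copy insertion described above (taken
/// from processPhiCopy below): when a phi incoming fails to coalesce, the
/// incoming value is materialized into a fresh copy whose result joins the
/// phi's live range, and the phi is rewritten to use it:
///
///   Instruction *NewCopy =
///       insertCopy(SimpleValue(Incoming), DestLR, InsertPoint, "phicopy", Num);
///   Phi->setIncomingValue(Inc, NewCopy);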
+/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_COALESCING" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXGotoJoin.h" +#include "GenXIntrinsics.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXNumbering.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvmWrapper/IR/InstrTypes.h" +#include +#include + +using namespace llvm; +using namespace genx; + +static cl::opt GenXShowCoalesceFailThreshold("genx-show-coalesce-fail-threshold", cl::init(UINT_MAX), cl::Hidden, + cl::desc("GenX size threshold (bytes) for showing coalesce fails.")); + +// Diagnostic information for error/warning relating fast-composition. +class DiagnosticInfoFastComposition : public DiagnosticInfo { +private: + std::string Description; + StringRef Filename; + unsigned Line; + unsigned Col; + static int KindID; + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } +public: + // Initialize from an Instruction and an Argument. + DiagnosticInfoFastComposition(Instruction *Inst, + const Twine &Desc, DiagnosticSeverity Severity = DS_Error); + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; +int DiagnosticInfoFastComposition::KindID = 0; + +namespace { + + // Candidate : description of a coalescing candidate + struct Candidate { + genx::SimpleValue Dest; + Use *UseInDest; + unsigned SourceIndex; + unsigned Priority; + unsigned Serial; + Candidate(SimpleValue Dest, Use *UseInDest, unsigned SourceIndex, + unsigned Priority, unsigned Serial) + : Dest(Dest), UseInDest(UseInDest), SourceIndex(SourceIndex), + Priority(Priority), Serial(Serial) {} + bool operator<(const Candidate &C2) const { + if (Priority != C2.Priority) + return Priority > C2.Priority; + // Make the sort order preserving for equal priority, to get consistent + // results across different runs. 
+ return Serial < C2.Serial; + } + }; + + struct PhiCopy { + PHINode *Phi; + unsigned IncomingIdx; + PhiCopy(PHINode *Phi, unsigned IncomingIdx) + : Phi(Phi), IncomingIdx(IncomingIdx) {} + }; + + // GenX coalescing pass + class GenXCoalescing : public FunctionGroupPass { + private: + const GenXSubtarget *ST; + GenXBaling *Baling; + GenXLiveness *Liveness; + GenXNumbering *Numbering; + DominatorTreeGroupWrapperPass *DTWrapper; + std::vector CopyCandidates; + std::vector NormalCandidates; + std::vector Callables; + public: + static char ID; + explicit GenXCoalescing() : FunctionGroupPass(ID) {} + virtual StringRef getPassName() const { return "GenX coalescing and copy insertion"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); + } + bool runOnFunctionGroup(FunctionGroup &FG); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXGroupPrinterPass(O, Banner); } + private: + void recordCandidates(FunctionGroup *FG); + unsigned getPriority(Type *Ty, BasicBlock *BB); + // Various permutations of the function to record a coalescing candidate. + void recordCopyCandidate(Value *Dest, Use *UseInDest, unsigned Priority) { + recordCandidate(SimpleValue(Dest), UseInDest, 0, Priority, + &CopyCandidates); + } + void recordCopyCandidate(SimpleValue Dest, Use *UseInDest, + unsigned SourceIndex, unsigned Priority) { + recordCandidate(Dest, UseInDest, SourceIndex, Priority, &CopyCandidates); + } + void recordNormalCandidate(Instruction *Dest, Use *UseInDest, + unsigned Priority) { + recordCandidate(SimpleValue(Dest), UseInDest, 0, Priority, + &NormalCandidates); + } + void recordNormalCandidate(SimpleValue Dest, Use *UseInDest, + unsigned SourceIndex, unsigned Priority) { + recordCandidate(Dest, UseInDest, SourceIndex, Priority, + &NormalCandidates); + } + void recordPhiCandidate(Value *Dest, Use *UseInDest, unsigned Priority) { + recordCandidate(SimpleValue(Dest), UseInDest, 0, Priority, + &NormalCandidates); + } + void recordCandidate(SimpleValue Dest, Use *UseInDest, unsigned SourceIndex, + unsigned Priority, std::vector *Candidates); + void recordCallCandidates(FunctionGroup *FG); + void recordCallArgCandidates(Value *Dest, unsigned ArgNum, + ArrayRef Insts); + // Functions for processing coalecing candidates. 
+ void processCopyCandidate(Candidate *Cand) { + processCandidate(Cand, true /*IsCopy*/); + } + void processCandidate(Candidate *Cand, bool IsCopy = false); + void processPhiNodes(FunctionGroup *FG); + void analysePhiCopies(PHINode *Phi, std::vector &ToProcess); + void processPhiCopy(PHINode *Phi, unsigned Inc, + std::vector &Phis); + void processPhiBranchingJoinLabelCopy(PHINode *Phi, unsigned Inc, + std::vector &Phis); + PHINode *copyNonCoalescedPhi(PHINode *PhiPred, PHINode *PhiSucc); + void processCalls(FunctionGroup *FG); + void processKernelArgs(FunctionGroup *FG); + void coalesceOutputArgs(FunctionGroup *FG); + void coalesceCallables(); + void coalesceGlobalLoads(FunctionGroup *FG); + Instruction *insertCopy(SimpleValue Input, LiveRange *LR, + Instruction *InsertBefore, StringRef Name, + unsigned Number); + Instruction *insertIntoStruct(Type *Ty, unsigned FlattenedIndex, + Value *OldStruct, Instruction *NewVal, + Instruction *InsertBefore); + void showCoalesceFail(SimpleValue V, const DebugLoc &DL, const char *Intro, + LiveRange *DestLR, LiveRange *SourceLR); + // Helpers + DominatorTree *getDomTree(Function *F) { return DTWrapper->getDomTree(F); } + }; + +} // end anonymous namespace + +char GenXCoalescing::ID = 0; +namespace llvm { +void initializeGenXCoalescingPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXCoalescing, "GenXCoalescing", "GenXCoalescing", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXNumbering) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass); +INITIALIZE_PASS_END(GenXCoalescing, "GenXCoalescing", "GenXCoalescing", false, false) + +FunctionGroupPass *llvm::createGenXCoalescingPass() { + initializeGenXCoalescingPass(*PassRegistry::getPassRegistry()); + return new GenXCoalescing(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the coalescing pass for this FunctionGroup + */ +bool GenXCoalescing::runOnFunctionGroup(FunctionGroup &FG) +{ + // Get analyses that we use and/or modify. + auto P = getAnalysisIfAvailable(); + ST = P ? P->getSubtarget() : nullptr; + Baling = &getAnalysis(); + Liveness = &getAnalysis(); + Numbering = &getAnalysis(); + DTWrapper = &getAnalysis(); + + // Coalesce all global loads prior to normal coalescing. + coalesceGlobalLoads(&FG); + + // Record all the coalescing candidates except the call arg and return + // value pre-copy ones. + recordCandidates(&FG); + + // Process the copy coalescing candidates. + for (unsigned i = 0; i != CopyCandidates.size(); ++i) + processCopyCandidate(&CopyCandidates[i]); + + // Record the call arg and return value pre-copy candidates. + recordCallCandidates(&FG); + + // Sort the array of normal coalescing candidates (including phi ones) then + // process them. + std::sort(NormalCandidates.begin(), NormalCandidates.end()); + for (unsigned i = 0; i != NormalCandidates.size(); ++i) + processCandidate(&NormalCandidates[i]); + + // Now scan all phi nodes again, inserting copies where necessary. Doing + // them in one go here ensures that the copies appear in the predecessor + // blocks in the same order as the phi nodes, which is the basis on which + // we computed live ranges. + processPhiNodes(&FG); + + // Scan all the calls, inserting copies where necessary for call arg + // pre-copies and return value pre- and post-copies. 
Doing them in one go + // here ensures that the copies appear in the order that live range + // computation assumed they would appear. Also, for call arg and return + // value pre-copies, a single coalesce candidate is shared across multiple + // calls/returns using the same LR, so we need this separate scan to find + // the calls/returns. + processCalls(&FG); + + // Add a copy for each kernel arg that is not aligned enough. + processKernelArgs(&FG); + coalesceCallables(); + coalesceOutputArgs(&FG); + + CopyCandidates.clear(); + NormalCandidates.clear(); + Callables.clear(); + return true; +} + +/*********************************************************************** + * recordCandidates : record all the coalescing candidates from code + * + * This does not record call arg pre-copy or ret value pre-copy candidates. + * That is done in recordCallCandidates. + */ +void GenXCoalescing::recordCandidates(FunctionGroup *FG) +{ + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + for (Function::iterator fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (BasicBlock::iterator bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + if (PHINode *Phi = dyn_cast(Inst)) { + // Phi node. For each incoming, record a phi candidate, unless it is a + // registerless value (EM/RM). + // If the incoming block is a branching join label block, then we + // cannot insert any phi copies there, so give the coalescing + // candidate a high priority to ensure it gets coalesced first. + if (Liveness->getLiveRange(Phi)->getCategory() + < RegCategory::NUMREALCATEGORIES) { + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + auto IncomingBlock = Phi->getIncomingBlock(i); + unsigned Priority = getPriority(Phi->getType(), IncomingBlock); + if (GotoJoin::isBranchingJoinLabelBlock(IncomingBlock)) + Priority = UINT_MAX; + recordPhiCandidate(Phi, &Phi->getOperandUse(i), Priority); + } + } + } else if (IGCLLVM::CallInst *CI = dyn_cast(Inst)) { + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(CI)) { + if (CI->isInlineAsm()) { + InlineAsm *IA = cast(CI->getCalledValue()); + // Do not process if no constraints provided or it's baled + // (the coalescing actually needs to be done at the wrregion). + if (IA->getConstraintString().empty() || Baling->isBaled(CI)) + continue; + unsigned NumOutputs = genx::getInlineAsmNumOutputs(CI); + auto ConstraintsInfo = genx::getGenXInlineAsmInfo(CI); + // we need to coalesce if there is a '+' modifier + // because those operands are tied and have to be in the same + // registers + for (unsigned ArgNo = 0; ArgNo < ConstraintsInfo.size(); + ArgNo++) { + auto &Info = ConstraintsInfo[ArgNo]; + if (!Info.isOutput() || !Info.hasMatchingInput()) + continue; + unsigned ActualIdx = Info.getMatchingInput() - NumOutputs; + auto OpInst = dyn_cast(CI->getOperand(ActualIdx)); + if (!OpInst || Baling->isBaled(OpInst)) + continue; + Use *OpUse = &CI->getOperandUse(ActualIdx); + if (isa(CI->getType())) { + unsigned Priority = getPriority( + IndexFlattener::getElementType(CI->getType(), ArgNo), + CI->getParent()); + recordNormalCandidate(SimpleValue(CI, ArgNo), OpUse, 0, + Priority); + } else { + recordNormalCandidate( + CI, OpUse, 0, + getPriority(CI->getType(), CI->getParent())); + } + } + } else if (CI->isIndirectCall()) + continue; + // This is a non-intrinsic call. If it returns a value, mark + // (elements of) the return value for coalescing with the + // unified return value. 
+ else if (!CI->getType()->isVoidTy()) { + for (unsigned i = 0, + e = IndexFlattener::getNumElements(CI->getType()); + i != e; ++i) + recordNormalCandidate(SimpleValue(CI, i), 0, i, + getPriority(IndexFlattener::getElementType( + CI->getType(), i), CI->getParent())); + } else { + // handle callable kernel + Function *Callee = CI->getCalledFunction(); + if (Callee->hasFnAttribute("CMCallable")) { + if (F->hasFnAttribute("CMCallable")) { + DiagnosticInfoFastComposition Err(CI, + "Callable function must not call another callable" + " function", DS_Error); + F->getContext().diagnose(Err); + } + Callables.push_back(CI); + } + } + } else { + int OperandNum = getTwoAddressOperandNum(CI); + if (OperandNum >= 0) { + // This is an intrinsic with a two address operand (including + // the case of operand 0 in wrregion). That operand has to be in + // the same register as the result. + if (Baling->isBaled(CI)) { + // The intrinsic is baled into a wrregion. The two address + // operand must also have a rdregion baled in whose input is + // the "old value" input of the wrregion, and the coalescing + // actually needs to be done at the wrregion. That is handled + // when this pass reaches the wrregion, so we do not want to do + // anything here. + // + // it may also be baled into a g_store. + // assert(Baling->getBaleInfo(CI).isOperandBaled(OperandNum) && + // "expecting rdregion to be baled in to the two addr operand"); + continue; + } + // Normal unbaled twoaddr operand. + recordNormalCandidate(CI, &CI->getOperandUse(OperandNum), + getPriority(CI->getType(), CI->getParent())); + } + } + } else if (isa(Inst) || isa(Inst)) { + assert(!isa(Inst->getType()) && "not expecting bitcast to struct"); + assert(!isa(Inst->getOperand(0)->getType()) && "not expecting bitcast from struct"); + // The source and destination of a bitcast can copy coalesce, + // but only if it is not the case that the source is a phi and + // the destination has a use in a phi node in the same block and + // after the source's phi. If the above is the case, then we try + // and normal coalesce instead, which fails, leading to a copy + // being generated. + if (GenXLiveness::wrapsAround(Inst->getOperand(0), Inst)) { + recordNormalCandidate(Inst, &Inst->getOperandUse(0), + getPriority(Inst->getType(), Inst->getParent())); + } else if (Liveness->getLiveRangeOrNull(Inst)) { + recordCopyCandidate(Inst, &Inst->getOperandUse(0), + getPriority(Inst->getType(), Inst->getParent())); + } + } else if (auto EVI = dyn_cast(Inst)) { + // extractvalue: copy coalesce the element being extracted, as long as + // both source and destination have live ranges. The two cases where + // they don't are: + // 1. the source live range got removed in the code below that + // handles undef elements in an insertvalue chain; + // 2. this is the extract of the !any(EM) result of a goto/join, + // which does not have a live range because it is baled in to the + // branch. + if (Liveness->getLiveRangeOrNull(Inst)) { + unsigned Index = IndexFlattener::flatten( + cast(EVI->getAggregateOperand()->getType()), + EVI->getIndices()); + if (Liveness->getLiveRangeOrNull( + SimpleValue(Inst->getOperand(0), Index))) { + recordCopyCandidate(SimpleValue(EVI), &Inst->getOperandUse(0), Index, + getPriority(EVI->getType(), EVI->getParent())); + } + } + } else if (auto IVI = dyn_cast(Inst)) { + // insertvalue: + // First, if the struct value input is undef, scan the possible chain + // of insertvalues and remove the live range for any SimpleValue that + // is undef. 
We need to do this to stop a register being allocated + // later for a coalesced SimpleValue from a chain of insertvalues + // for a return where that element is never set. + auto ST = cast(IVI->getType()); + unsigned NumElements = IndexFlattener::getNumElements(ST); + if (isa(IVI->getOperand(0))) { + SmallVector IsDefined; + IsDefined.resize(NumElements, false); + // For each insertvalue in the chain: + for (auto ThisIVI = IVI; ThisIVI;) { + // For the element set by this one, set it as defined (unless the + // input is undef). + IsDefined[IndexFlattener::flatten(ST, ThisIVI->getIndices())] + = !isa(IVI->getOperand(1)); + // For any element that is still undef, remove its live range. + for (unsigned i = 0; i != NumElements; ++i) + if (!IsDefined[i]) + Liveness->removeValue(SimpleValue(ThisIVI, i)); + if (!ThisIVI->hasOneUse()) + break; + ThisIVI = dyn_cast(ThisIVI->use_begin()->getUser()); + } + } + // Copy coalesce the element being inserted and the other elements, + // as long as the appropriate live ranges did not get removed above. + unsigned Index = IndexFlattener::flatten(ST, IVI->getIndices()); + for (unsigned i = 0; i != NumElements; ++i) { + if (!Liveness->getLiveRangeOrNull(SimpleValue(IVI, i))) + continue; + if (i == Index) { + if (Liveness->getLiveRangeOrNull(Inst->getOperand(1))) + recordCopyCandidate(SimpleValue(IVI, i), &Inst->getOperandUse(1), 0, + getPriority(IVI->getOperand(1)->getType(), IVI->getParent())); + } else { + if (Liveness->getLiveRangeOrNull(SimpleValue(Inst->getOperand(0), i))) + recordCopyCandidate(SimpleValue(IVI, i), &Inst->getOperandUse(0), i, + getPriority(IVI->getOperand(1)->getType(), IVI->getParent())); + } + } + } + } + } + } +} + +/*********************************************************************** + * recordCallCandidates : record the call arg pre-copy and ret value + * pre-copy candidates + * + * This is done here, after copy coalescing has been done, so we can + * more accurately estimate the cost of not coalescing a candidate by + * summing the cost from each call site / return instruction that uses + * the same (copy coalesced) value. + */ +void GenXCoalescing::recordCallCandidates(FunctionGroup *FG) +{ + // For each subroutine... + for (auto fgi = FG->begin() + 1, fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + // Gather the call sites. + SmallVector CallSites; + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) + if (auto CI = dyn_cast(ui->getUser())) + CallSites.push_back(CI); + // For each arg... + unsigned ArgIdx = 0; + for (auto ai = F->arg_begin(), ae = F->arg_end(); + ai != ae; ++ai, ++ArgIdx) { + Argument *Arg = &*ai; + if (Arg->use_empty()) + continue; // Ignore unused arg. + // Record a coalesce candidate for each unique input LR for each + // struct element in the arg. + recordCallArgCandidates(Arg, ArgIdx, CallSites); + } + // Now scan for return value pre-copies. + if (F->getReturnType()->isVoidTy()) + continue; + // Gather the return insts by looking at the terminator of each BB. + SmallVector RetInsts; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + auto RetInst = dyn_cast(fi->getTerminator()); + if (RetInst) + RetInsts.push_back(RetInst); + } + // Record a coalesce candidate for each unique input LR for each + // struct element in the return value. 
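+    // (The unified return value plays the role of the destination here, and
+    // ArgNum 0 selects operand 0 of each ReturnInst, i.e. the value being
+    // returned.)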
+ recordCallArgCandidates(Liveness->getUnifiedRet(F), 0, RetInsts); + } +} + +/*********************************************************************** + * recordCallArgCandidates : common code for adding a candidate for each + * struct element of a call arg or a return value pre-copy + * + * Enter: Dest = destination Value; the Argument for a call arg, or the + * Function's unified return value for a ret pre-copy + * ArgNum = argument number for call arg, 0 for ret pre-copy + * Insts = array of call sites or return instructions + * + * For each struct element, this adds a coalesce candidate for each unique LR + * used as a call arg or return value. + */ +namespace { struct CallArg { + Use *U; + LiveRange *LR; + CallArg(Use *U, LiveRange *LR) : U(U), LR(LR) {} +}; } +void GenXCoalescing::recordCallArgCandidates(Value *Dest, unsigned ArgNum, + ArrayRef Insts) +{ + for (unsigned StructIdx = 0, + StructEnd = IndexFlattener::getNumElements(Dest->getType()); + StructIdx != StructEnd; ++StructIdx) { + // For each unique LR used as this arg at any call site, sum the + // cost and add a candidate. + SmallVector CallArgs; + for (unsigned i = 0, ie = Insts.size(); i != ie; ++i) { + Use *U = &Insts[i]->getOperandUse(ArgNum); + CallArgs.push_back(CallArg(U, + Liveness->getLiveRangeOrNull(SimpleValue(*U, StructIdx)))); + } + for (unsigned i = 0, ie = CallArgs.size(); i != ie; ++i) { + LiveRange *LR = CallArgs[i].LR; + if (!LR) + continue; // Already done this one (or it was an undef). + unsigned Priority = 0; + for (unsigned j = i, je = CallArgs.size(); j != je; ++j) { + if (LR != CallArgs[j].LR) + continue; + Priority += getPriority(nullptr, Insts[j]->getParent()); + CallArgs[j].LR = 0; // Blank out so we can see we have done this one. + } + Use *U = CallArgs[i].U; + Priority *= getPriority(IndexFlattener::getElementType( + (*U)->getType(), StructIdx), 0); + recordNormalCandidate(SimpleValue(Dest, StructIdx), + U, StructIdx, Priority); + } + } +} + +/*********************************************************************** + * getPriority : get priority of coalescing candidate + * + * Enter: Ty = type that would need to be copied if coalescing failed, + * so we can estimate the copy cost. Can be nullptr, in which + * case the copy cost is assumed to be 1 + * BB = basic block where copy would be inserted, so we can use + * loop depth to adjust the cost. Can be nullptr, in which + * the loop depth is assumed to be 0 + * + * Return: priority (estimate of cost of inserting a copy) + * + * getPriority(Ty, BB) is equivalent to getPriority(Ty, 0) * getPriority(0, BB). + */ +unsigned GenXCoalescing::getPriority(Type *Ty, BasicBlock *BB) +{ + // Set priority to the number of GRFs. + // FIXME this should also take into account a non power of two + // vector size, which would result in multiple copy instructions. + // See GenXCoalescing::insertCopy. + // FIXME scale by loop depth. 
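+  // Worked example (illustrative): a <16 x i32> value is 16 * 32 = 512 bits,
+  // so with the 256-bit GRF size assumed below its priority is
+  // (512 + 255) / 256 = 2; a scalar or a null type costs 1.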
+ unsigned Priority = 1; + if (Ty) { + if (VectorType *VT = dyn_cast(Ty)) { + Priority = VT->getNumElements() * VT->getElementType()->getPrimitiveSizeInBits(); + Priority = (Priority + 255) / 256; + } + } + return Priority; +} + +/*********************************************************************** + * recordCandidate : record a candidate for coalescing + * + * Enter: Dest = destination of copy + * UseInDest = pointer to the use of the source in Dest + * SourceIndex = flattened index of element in source struct + * Priority = priority of coalescing this candidate + * Candidates = vector of candidates to push to + * + * For call arg coalescing, Dest is the subroutine's Argument, and + * UseInDest/SourceIndex are the use in one of the possibly many call sites + * using the same source value. + * + * For ret value pre-copy coalescing (before the return inst), Dest is the the + * unified return value, and UseInDest/SourceIndex are the use in one of the + * possibly many return instructions using the same source value. + * + * For ret value post-copy coalescing (after the call inst), Dest is the + * CallInst, and UseInDest and SourceIndex are 0. + */ +void GenXCoalescing::recordCandidate(SimpleValue Dest, Use *UseInDest, + unsigned SourceIndex, unsigned Priority, std::vector *Candidates) +{ + if (UseInDest && isa(*UseInDest)) + return; + assert(!UseInDest || !isa(*UseInDest)); + Candidates->push_back(Candidate(Dest, UseInDest, SourceIndex, Priority, + Candidates->size())); +} + +/*********************************************************************** + * processCandidate : process a coalescing candidate + * + * This attempts to coalesce the candidate. On failure, it inserts a copy + * if necessary: + * + * - a copy candidate never fails to coalesce; + * - a two address candidate needs a copy and it is inserted here; + * - a phi candidate needs a copy, but it is not inserted here. Instead it + * is inserted later so we can ensure that multiple copies inserted at + * the end of an incoming block are in phi node order, which was the + * assumption made by the live range calculation. + * + * See the comment at the top of recordCandidate for the special values of + * fields in Candidate for a call arg coalesce and a ret value coalesce. + */ +void GenXCoalescing::processCandidate(Candidate *Cand, bool IsCopy) +{ + SimpleValue Dest = Cand->Dest; + SimpleValue Source; + if (!Cand->UseInDest) { + // This is a return value post-copy coalesce candidate. The actual source + // is the unified return value. + Source = SimpleValue(Liveness->getUnifiedRet(cast( + Dest.getValue())->getCalledFunction()), Cand->SourceIndex); + } else + Source = SimpleValue(*Cand->UseInDest, Cand->SourceIndex); + LLVM_DEBUG(dbgs() << "Trying coalesce from "; + Source.printName(dbgs()); + dbgs() << " to "; + Dest.printName(dbgs()); + dbgs() << " priority " << Cand->Priority; + if (isa(Dest.getValue())) + dbgs() << " (call arg)"; + else if (Liveness->isUnifiedRet(Dest.getValue())) + dbgs() << " (ret pre-copy)"; + else if (!Cand->UseInDest) + dbgs() << " (ret post-copy)"; + dbgs() << "\n"); + LiveRange *DestLR = Liveness->getLiveRange(Dest); + LiveRange *SourceLR = 0; + // Source should not be a constant (but could be undef) because + // GenXLowering ensured that all our two address operands and phi incomings + // are not constant. 
+ assert(!Cand->UseInDest || !isa(Source.getValue()) || isa(Source.getValue())); + SourceLR = Liveness->getLiveRange(Source); + assert(DestLR); + if (SourceLR == DestLR) + return; // already coalesced + if (SourceLR && SourceLR->Category == DestLR->Category) { + if (IsCopy) { + // For a copy candidate, we can coalesce if the source and destination do + // not copy-interfere, i.e. we do not have a situation where DestLR + // wraps round a loop into a phi use in the same basic block as the phi + // def of SourceLR but after it. + if (!Liveness->copyInterfere(SourceLR, DestLR)) { + Liveness->coalesce(DestLR, SourceLR, /*DisallowCASC=*/ false); + return; + } + } else { + // For a normal candidate, we can coalesce if the source and destination + // do not interfere, i.e. there is no point in the program where both + // LRs are live. + if (!Liveness->twoAddrInterfere(DestLR, SourceLR)) { + // In the coalesce, disallow future call arg special coalescing if this + // is not a call arg coalesce. + Liveness->coalesce(DestLR, SourceLR, + /*DisallowCASC=*/ !isa(Dest.getValue())); + return; + } + } + } +#if 0 + // Disable call arg special coalescing for now, as it seems to break the FRC_MC example. + + if (isa(Dest.getValue()) + && SourceLR->Category == DestLR->Category) { + // This is an attempt at call arg coalescing. The two LRs interfere, but + // we can still try for "call arg special coalescing" (CASC). See the + // comment at the top of the file. + if (!DestLR->DisallowCASC) { + // CASC not disallowed. (It would have been disallowed if DestLR had + // already participated in normal coalescing other than CASC.) + // For any call site where SourceLR is not the corresponding call arg, + // check that A is not live. + auto ThisCallSite = cast(Cand->UseInDest->getUser()); + auto Callee = ThisCallSite->getCalledFunction(); + bool FailedCASC = false; + for (auto ui = Callee->use_begin(), ue = Callee->use_end(); + ui != ue; ++ui) { + auto CallSite = cast(ui->getUser()); + if (CallSite == ThisCallSite) + continue; + auto OtherArg = SimpleValue(CallSite->getArgOperand(cast( + Dest.getValue())->getArgNo()), Dest.getIndex()); + auto OtherLR = Liveness->getLiveRange(OtherArg); + // Check whether OtherArg is the same as SourceLR. This check covers + // several cases: + // 1. OtherArg == SourceLR: the other arg is already coalesced with + // our arg, so it would be OK to do CASC. + // 2. OtherArg is DestLR, meaning that the other call arg has already + // been coalesced with the func arg. We cannot do CASC if SourceLR + // and OtherArg interfere, which they do because we already know + // that DestLR interferes with SourceLR. + // 3. OtherArg is something else, meaning that some other value will + // be copied to the func arg here. We cannot do CASC if SourceLR + // and OtherArg interfere. + if (OtherLR == SourceLR) + continue; + if (Liveness->interfere(OtherLR, SourceLR)) { + FailedCASC = true; + break; + } + } + if (!FailedCASC) { + // Can coalesce. Do not disallow future CASC. + Liveness->coalesce(DestLR, SourceLR, /*DisallowCASC=*/ false); + return; + } + } + } +#endif + + // Coalescing failed. + LLVM_DEBUG( + if (SourceLR) { + dbgs() << "Live ranges \""; + DestLR->print(dbgs()); + dbgs() << "\" and \""; + SourceLR->print(dbgs()); + dbgs() << "\"" << (IsCopy ? 
" copy" : "") << " interfere, not coalescing\n"; + } else { + dbgs() << "Need copy of constant \""; + Source.print(dbgs()); + dbgs() << "\" to \""; + Dest.printName(dbgs()); + dbgs() << "\"\n"; + } + ); + if (isa(Dest.getValue())) + return; // Candidate is phi; copy insertion done later. + if (isa(Dest.getValue())) + return; // Call arg pre-copy, defer copy insertion + if (Liveness->isUnifiedRet(Dest.getValue())) + return; // Return value pre-copy, defer copy insertion + if (!Cand->UseInDest) + return; // Return value post-copy, defer copy insertion + if (isa(Dest.getValue()) || isa(Dest.getValue())) { + // A bitcast is normally copy coalesced, which means it cannot fail to + // coalesce. However, if the source is a phi node and the destination + // wraps round the loop and is used in another phi node in the same + // block that is later than the first phi node, then we instead + // try to normal coalesce, which fails because they interfere. + // This happens with a bitcast inserted in GenXLiveRanges to resolve + // an overlapping circular phi, but can happen in other cases too. + if ((int)genx::exactLog2( + Dest.getValue()->getType()->getPrimitiveSizeInBits()) <= 8) { + // This is a bitcast with a legal size for a single copy. We do not + // insert a copy, because GenXVisaFuncWriter will generate one. + // (GenXLegalization does not legalize a bitcast, so it can be + // illegal size here. We do that on the basis that a bitcast is + // normally copy coalesced.) + return; + } + // Otherwise, it is a bitcast of size more than 1 GRF or non-power-of-two, + // so we insert a copy. + } + // Insert the copy now for a two address op. Give it the number of the + // pre-copy slot, which is one less than the number of the two address + // instruction. + Instruction *DestInst = cast(Dest.getValue()); + showCoalesceFail(Dest, DestInst->getDebugLoc(), "two address", + DestLR, SourceLR); + Instruction *NewCopy = insertCopy(Source, DestLR, DestInst, "twoaddr", + Numbering->getNumber(DestInst) - 1); + NewCopy = insertIntoStruct(Dest.getValue()->getType(), + Dest.getIndex(), *Cand->UseInDest, NewCopy, DestInst); + // Replace the use of the old source. + *Cand->UseInDest = NewCopy; + // No need to extend the live range, as the result of the two address op was + // already marked as defined at the pre-copy slot. +} + +/*********************************************************************** + * processPhiNodes : add copies for uncoalesced phi node incomings + */ +void GenXCoalescing::processPhiNodes(FunctionGroup *FG) +{ + std::vector PhiCopies; + + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + for (Function::iterator fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (BasicBlock::iterator bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + // Scan the phi nodes at the start of this BB, if any. + PHINode *Phi = dyn_cast(&*bi); + if (!Phi) + break; + + // Collect copies to process + analysePhiCopies(Phi, PhiCopies); + } + } + } + + // Perform copy of uncoalesced phi node incomings. + // New phis can be created during this, store them. + std::vector NewPhis; + for (auto Elem : PhiCopies) { + processPhiCopy(Elem.Phi, Elem.IncomingIdx, NewPhis); + } + // Phi copies are resolved. Clean the list. + PhiCopies.clear(); + + // Process newly created phis. This loop is executed + // when coalescing failed to resolve issues with phis + // in branching join label blocks. 
Such situation is + // very rare because coalescing tries to solve it + // with the highest priority. + while (!NewPhis.empty()) { + // Collect phi copy candidates + for (auto *Phi : NewPhis) { + analysePhiCopies(Phi, PhiCopies); + } + // Phi copies are collected, clean current Phis worklist + NewPhis.clear(); + + // Perform copy of uncoalesced phi node incomings. + for (auto Elem : PhiCopies) { + processPhiCopy(Elem.Phi, Elem.IncomingIdx, NewPhis); + } + // Phi copies are resolved. Clean the list. + PhiCopies.clear(); + } +} + +/*********************************************************************** + * analysePhiCopies : for one phi node, collect copies for uncoalesced incomings + */ +void GenXCoalescing::analysePhiCopies(PHINode *Phi, + std::vector &ToProcess) { + // Scan each incoming to see if it was successfully coalesced. + LiveRange *DestLR = Liveness->getLiveRange(Phi); + if (DestLR->getCategory() >= RegCategory::NUMREALCATEGORIES) + return; // Ignore phi node of EM/RM value. + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = Phi->getIncomingValue(i); + // Incoming should not be a constant (but could be undef) because + // GenXPostLegalization and GenXCategory called loadNonSimpleConstants + // to load the non-simple constant incomings, then GenXCategory also + // called GenXConstants::loadConstant for each remaining (simple) + // constant. + if (isa(Incoming)) + continue; // undef, no copy needed + assert(!isa(Incoming)); + if (Liveness->getLiveRange(Incoming) == DestLR) + continue; // coalesced, no copy needed + // A phi copy is needed + auto IncomingBlock = Phi->getIncomingBlock(i); + LLVM_DEBUG(dbgs() << "Need phi copy " << Incoming->getName() << " -> " + << Phi->getName() << " in " << IncomingBlock->getName() + << "\n"); + ToProcess.push_back(PhiCopy(Phi, i)); + } +} + +/*********************************************************************** + * processPhiCopy : for one phi node incoming, add copy + */ +void GenXCoalescing::processPhiCopy(PHINode *Phi, unsigned Inc, + std::vector &Phis) { + LiveRange *DestLR = Liveness->getLiveRange(Phi); + Value *Incoming = Phi->getIncomingValue(Inc); + auto *IncomingBlock = Phi->getIncomingBlock(Inc); + // Should be checked in analysePhiCopies + assert(DestLR->getCategory() < RegCategory::NUMREALCATEGORIES && + "Should be checked earlier!"); + assert(!isa(Incoming) && "Should be checked earlier!"); + assert(!isa(Incoming) && "Should be checked earlier!"); + // Check it again: something could change + if (Liveness->getLiveRange(Incoming) == DestLR) { + LLVM_DEBUG(dbgs() << "Already coalesced " << Incoming->getName() << " -> " + << Phi->getName() << " in " << IncomingBlock->getName() + << "\n"); + return; + } + + LLVM_DEBUG(dbgs() << "Copying " << Incoming->getName() << " -> " + << Phi->getName() << " in " << IncomingBlock->getName() + << "\n"); + + // Handle branching join label block separately + if (GotoJoin::isBranchingJoinLabelBlock(IncomingBlock)) { + processPhiBranchingJoinLabelCopy(Phi, Inc, Phis); + return; + } + + DominatorTree *DomTree = getDomTree(IncomingBlock->getParent()); + Instruction *InsertPoint = IncomingBlock->getTerminator(); + InsertPoint = GotoJoin::getLegalInsertionPoint(InsertPoint, DomTree); + + // Give the copy the number allocated to the phi incoming + unsigned Num = Numbering->getPhiNumber(Phi, IncomingBlock); + + if (auto *I = dyn_cast(Incoming)) { + // This should not happen for good BBs (not join blocks) + // if DFG is correct. 
+ assert(DomTree->dominates(I->getParent(), InsertPoint->getParent()) && + "Dominance corrupted!"); + } + + showCoalesceFail(SimpleValue(Incoming), InsertPoint->getDebugLoc(), "phi", + DestLR, Liveness->getLiveRange(Incoming)); + Instruction *NewCopy = + insertCopy(SimpleValue(Incoming), DestLR, InsertPoint, "phicopy", Num); + Phi->setIncomingValue(Inc, NewCopy); + // No need to extend the live range like we do in the two address op case + // in processCandidate(). The live range of a phi node already starts at + // each point where a copy might need to be inserted. +} + +/*********************************************************************** + * processPhiBranchingJoinLabelCopy : for one phi node incoming, add copy + * for branching join label incoming BB case + */ +void GenXCoalescing::processPhiBranchingJoinLabelCopy( + PHINode *Phi, unsigned Inc, std::vector &Phis) { + LiveRange *DestLR = Liveness->getLiveRange(Phi); + Value *Incoming = Phi->getIncomingValue(Inc); + auto *IncomingBlock = Phi->getIncomingBlock(Inc); + // Should be checked in analysePhiCopies + assert(DestLR->getCategory() < RegCategory::NUMREALCATEGORIES && + "Should be checked earlier!"); + assert(!isa(Incoming) && "Should be checked earlier!"); + assert(!isa(Incoming) && "Should be checked earlier!"); + // Should be checked in processPhiCopy + assert(Liveness->getLiveRange(Incoming) != DestLR && + "Should be checked earlier!"); + assert(GotoJoin::isBranchingJoinLabelBlock(IncomingBlock) && + "Should be checked earlier!"); + + LLVM_DEBUG(dbgs() << "Handling branching join label block case\n"); + + DominatorTree *DomTree = getDomTree(IncomingBlock->getParent()); + Instruction *InsertPoint = IncomingBlock->getTerminator(); + InsertPoint = GotoJoin::getLegalInsertionPoint(InsertPoint, DomTree); + + // Give the copy the number of term to make proper liverange + unsigned Num = Numbering->getNumber(InsertPoint); + + if (auto *PhiPred = dyn_cast(Incoming)) { + // In case when pred is Phi, it is possible to meet Phi in + // branching join blocks since such Phi does not brake + // SIMD CF Conformance. If such situation happens, we cannot + // perform copy of a phi value copy, we need to perform copy + // on all its incoming values. To do that, copy Phi and add + // it to Phis worklist. + // + // This situation is detected via corrupted dominance. + if (!DomTree->dominates(PhiPred->getParent(), InsertPoint->getParent())) { + auto *PhiCopy = copyNonCoalescedPhi(PhiPred, Phi); + assert(PhiCopy && "Invalid phi copy!"); + Phis.push_back(PhiCopy); + return; + } + } + + if (auto *I = dyn_cast(Incoming)) { + // This should not happen for good BBs (not join blocks) + // if DFG is correct. + // + // For join block, def must be somewhere before it + // because of SIMD CF Conformance. Case for Phi is + // described and handled above. 
+ assert(DomTree->dominates(I->getParent(), InsertPoint->getParent()) && + "Dominance corrupted!"); + } + + showCoalesceFail(SimpleValue(Incoming), InsertPoint->getDebugLoc(), "phi", + DestLR, Liveness->getLiveRange(Incoming)); + Instruction *NewCopy = + insertCopy(SimpleValue(Incoming), DestLR, InsertPoint, "phicopy", Num); + Phi->setIncomingValue(Inc, NewCopy); + + // Extend liverange: we skipped some basic blocks + Liveness->rebuildLiveRange(DestLR); +} + +/*********************************************************************** + * copyNonCoalescedPhi : copy PhiPred and coalesce copy's LR with + * PhiSucc's LR + */ +PHINode *GenXCoalescing::copyNonCoalescedPhi(PHINode *PhiPred, + PHINode *PhiSucc) { + // Perform copy + auto *PhiCopy = cast(PhiPred->clone()); + PhiCopy->insertBefore(PhiPred->getNextNode()); + PhiCopy->setName(PhiPred->getName() + ".copy"); + Numbering->setNumber(PhiCopy, Numbering->getNumber(PhiPred)); + + // Handle LRs + Liveness->buildLiveRange(PhiCopy); + LiveRange *DestLR = Liveness->getLiveRange(PhiSucc); + LiveRange *NewLR = Liveness->getLiveRange(PhiCopy); + Liveness->coalesce(DestLR, NewLR, false); + + // Update incoming values + for (unsigned i = 0, e = PhiSucc->getNumIncomingValues(); i != e; ++i) { + Value *IncValue = PhiSucc->getIncomingValue(i); + if (IncValue == PhiPred) + PhiSucc->setIncomingValue(i, PhiCopy); + } + + return PhiCopy; +} + +/*********************************************************************** + * processCalls : insert copies where necessary for call args and ret values + * + * This scans all the calls, inserting copies where necessary for call arg + * pre-copies and return value pre- and post-copies. + * + * We need to do them in one go here because + * 1. a call arg or return value pre-copy coalescing candidate covers + * possibly multiple sites where the same LR input is used, without giving + * any way of getting back to them all; + * 2. we want the inserted copies to be in the order that live range + * computation assumed they would appear. + */ +void GenXCoalescing::processCalls(FunctionGroup *FG) +{ + // For each subroutine... + for (auto fgi = FG->begin() + 1, fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + // For each call site... + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) { + if (auto *CI = dyn_cast(ui->getUser())) { + // For each func arg... + unsigned ArgIdx = 0; + for (auto ai = F->arg_begin(), ae = F->arg_end(); ai != ae; + ++ai, ++ArgIdx) { + Argument *Arg = &*ai; + if (Arg->use_empty()) { + // Arg is unused inside the subroutine. Do not try and process + // further, as its live range probably does not have a category. + continue; + } + Value *CallArg = CI->getOperand(ArgIdx); + if (isa(CallArg)) { + // Call arg undefined. No coalescing needed. + continue; + } + // For each SimpleValue in the func arg... + for (unsigned StructIdx = 0, + se = IndexFlattener::getNumElements(Arg->getType()); + StructIdx != se; ++StructIdx) { + assert(!StructIdx && + "coalesce failure on struct call arg not tested"); + auto FuncArgSV = SimpleValue(Arg, StructIdx); + auto CallArgSV = SimpleValue(CallArg, StructIdx); + // See if they are coalesced. + auto DestLR = Liveness->getLiveRange(FuncArgSV); + auto SourceLR = Liveness->getLiveRange(CallArgSV); + if (!DestLR || DestLR == SourceLR || F == CI->getFunction()) + continue; + if (DestLR->getCategory() >= RegCategory::NUMREALCATEGORIES) + continue; // Called function arg is EM. + // Need to insert a copy. Give it the number of the arg's pre-copy + // slot. 
+ showCoalesceFail(CallArgSV, CI->getDebugLoc(), "call arg", DestLR, + SourceLR); + unsigned Num = + Numbering->getArgPreCopyNumber(CI, ArgIdx, StructIdx); + Instruction *NewCopy = + insertCopy(CallArgSV, DestLR, CI, "callarg.precopy", Num); + NewCopy = insertIntoStruct(Arg->getType(), StructIdx, + CI->getOperand(ArgIdx), NewCopy, CI); + // Replace operand in call. + CI->setOperand(ArgIdx, NewCopy); + // No need to extend the live range like we do in the two address op + // case in processCandidate(). The live range of a func arg already + // starts at each point where a copy might need to be inserted. + } + } + // Now check the return value post-copy. + // + // The code to handle a coalesce failure in a return value post-copy + // is different to all other cases of coalesce failure, which are + // pre-copy. We need to ensure that the post-copied value is in the + // original live range for the original value (the return value), + // and all the original value's users are changed to use the post-copied + // value instead. The original value (the return value) gets moved out + // of its live range and put into that of the unified return value. + // + // If the return value is a struct, all the above happens for each + // struct element, with the extra complication of more new values to + // handle because of the extractvalue and insertvalue instructions we + // need to insert. + // + // First remember all uses of the return value, because we want to + // replace them after adding new ones below. Remember if they are + // all extractvalue with a non-struct result (which should usually be + // the case because GenXLowering removes most structs). + SmallVector CIUses; + bool AllUsesAreExtract = isa(CI->getType()); + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) { + auto EV = dyn_cast(ui->getUser()); + if (!EV || isa(EV->getType())) + AllUsesAreExtract = false; + CIUses.push_back(&*ui); + } + Instruction *InsertBefore = CI->getNextNode(); + Value *StructValue = CI; + SmallVector PreviousElements; + // For each SimpleValue in the return value... + for (unsigned StructIdx = 0, + se = IndexFlattener::getNumElements(CI->getType()); + StructIdx != se; ++StructIdx) { + auto UnifiedSV = SimpleValue(Liveness->getUnifiedRet(F), StructIdx); + auto SV = SimpleValue(CI, StructIdx); + // See if (the element in) the returned value is dead, or successfully + // coalesced with (the element in) the unified return value. + auto DestLR = Liveness->getLiveRangeOrNull(SV); + PreviousElements.push_back(DestLR); + if (!DestLR) + continue; // dead + auto SourceLR = Liveness->getLiveRange(UnifiedSV); + if (DestLR == SourceLR) + continue; // coalesced + assert(SourceLR); + if (SourceLR->getCategory() >= RegCategory::NUMREALCATEGORIES) + continue; // Unified return value is EM, ignore. + // Remove (the element of) CI, the actual return value, from its + // own live range, and add it instead to the unified return value. + // insertCopy() will add the new value to DestLR (what + // was the LR for the element of CI). + Liveness->removeValueNoDelete(SV); + Liveness->setLiveRange(SV, SourceLR); + // Need to insert a copy. Give it the number of the post-copy slot. 
+ showCoalesceFail(SimpleValue(CI, StructIdx), CI->getDebugLoc(), + "ret postcopy", DestLR, SourceLR); + unsigned Num = Numbering->getRetPostCopyNumber(CI, StructIdx); + Instruction *NewCopy = + insertCopy(SimpleValue(CI, StructIdx), DestLR, InsertBefore, + "retval.postcopy", Num); + assert(NewCopy); + if (AllUsesAreExtract) { + // For a struct ret value where all the uses are non-struct + // extractvalue, replace uses of the extractvalues with NewCopy. + // Doing this, rather than calling insertIntoStruct() and letting + // the existing extractvalue extract it again, does not improve the + // code generated by the compiler (insertvalue/extractvalue do not + // generate any code), but it does make the IR simpler and easier + // to understand in a dump. + for (unsigned i = 0, e = CIUses.size(); i != e; ++i) { + if (!CIUses[i]) + continue; + auto EV = cast(CIUses[i]->getUser()); + if (StructIdx == + IndexFlattener::flatten(cast(CI->getType()), + EV->getIndices())) { + NewCopy->takeName(EV); + EV->replaceAllUsesWith(NewCopy); + if (EV == InsertBefore) + InsertBefore = InsertBefore->getNextNode(); + Liveness->removeValue(SimpleValue(EV)); + EV->eraseFromParent(); + CIUses[i] = 0; + } + } + } else { + // If this is a struct return value, we also need to insertvalue, + // creating a new struct value. + StructValue = insertIntoStruct(CI->getType(), StructIdx, + StructValue, NewCopy, InsertBefore); + // Also, for this and previously seen elements that are not dead, + // add that element of StructValue (the new insertvalue) to the live + // range. + if (StructValue != NewCopy) { + for (unsigned k = 0, ke = PreviousElements.size(); k != ke; ++k) { + if (PreviousElements[k]) + Liveness->setLiveRange(SimpleValue(StructValue, k), + PreviousElements[k]); + } + } + } + } + if (!AllUsesAreExtract) { + // Replace uses of the whole return value that existed before we added + // more uses above. + for (unsigned i = 0, e = CIUses.size(); i != e; ++i) + *CIUses[i] = StructValue; + } + } + } + if (F->getReturnType()->isVoidTy()) + continue; // no return value from this func + // For each return inst in the func... + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + auto RI = dyn_cast(fi->getTerminator()); + if (!RI) + continue; + Value *Input = RI->getOperand(0); + if (isa(Input)) + continue; + Value *UnifiedRet = Liveness->getUnifiedRet(F); + // For each struct element in the return value... + for (unsigned StructIdx = 0, + StructEnd = IndexFlattener::getNumElements(UnifiedRet->getType()); + StructIdx != StructEnd; ++StructIdx) { + auto DestLR = Liveness->getLiveRange(SimpleValue(UnifiedRet, StructIdx)); + auto SourceLR = Liveness->getLiveRange(SimpleValue(Input, StructIdx)); + if (DestLR == SourceLR) + continue; // coalesced + // Need to insert a copy. Give it the number of the ret pre-copy slot. + showCoalesceFail(SimpleValue(Input, StructIdx), RI->getDebugLoc(), + "ret precopy", DestLR, SourceLR); + unsigned Num = Numbering->getNumber(RI) - StructEnd + StructIdx; + Instruction *NewCopy = insertCopy(SimpleValue(Input, StructIdx), + DestLR, RI, "retval.precopy", Num); + NewCopy = insertIntoStruct(UnifiedRet->getType(), StructIdx, + RI->getOperand(0), NewCopy, RI); + // Replace operand in call. + RI->setOperand(0, NewCopy); + // No need to extend the live range like we do in the two address op + // case in processCandidate(). The live range of the unified return + // value already starts at each point where a copy might need to be + // inserted. 
+ } + } + } +} + +/*********************************************************************** + * processKernelArgs : add a copy for each kernel arg that is not aligned enough + */ +void GenXCoalescing::processKernelArgs(FunctionGroup *FG) +{ + auto F = FG->getHead(); + if (!isKernel(F)) + return; + Instruction *InsertBefore = F->front().getFirstNonPHIOrDbg(); + KernelMetadata KM(F); + unsigned Idx = 0; + for (auto ai = F->arg_begin(), ae = F->arg_end(); ai != ae; ++ai) { + if (KM.shouldSkipArg(Idx++)) + continue; + auto Arg = &*ai; + auto LR = Liveness->getLiveRange(Arg); + if (!(LR->Offset & ((1U << LR->LogAlignment) - 1))) + continue; // aligned enough + // Insert a copy and give the original arg its own new live range. This + // leaves the original live range still live from the start of the + // function, and thus interfering with the new live range for the arg, + // but that doesn't matter. + SmallVector Uses; + for (auto ui = Arg->use_begin(), ue = Arg->use_end(); ui != ue; ++ui) + Uses.push_back(&*ui); + unsigned Num = Numbering->getKernelArgCopyNumber(Arg); + auto Copy = insertCopy(Arg, LR, InsertBefore, "argcopy", Num); + Liveness->removeValueNoDelete(Arg); + for (auto ui = Uses.begin(), ue = Uses.end(); ui != ue; ++ui) + **ui = Copy; + auto NewLR = Liveness->getOrCreateLiveRange(Arg); + NewLR->setCategory(LR->getCategory()); + NewLR->push_back(Segment(Numbering->getNumber(F), Num)); + NewLR->Offset = LR->Offset; + LR->Offset = 0; + } +} + +void GenXCoalescing::coalesceOutputArgs(FunctionGroup *FG) { + auto F = FG->getHead(); + if (!isKernel(F)) + return; + + std::string Name = GenXIntrinsic::getGenXName(GenXIntrinsic::genx_output); + Function *OutputFn = F->getParent()->getFunction(Name); + if (!OutputFn) + return; + + KernelMetadata KM(F); + for (auto U : OutputFn->users()) { + auto CI = dyn_cast(U); + if (!CI || CI->getParent()->getParent() != F) + continue; + + unsigned Idx = 0; // kernel argument index + unsigned i = 0; // call argument index + for (auto I = F->arg_begin(), E = F->arg_end(); I != E; ++I) { + if (!KM.isOutputArg(Idx++)) + continue; + + // This is the final value stored into the output argument. + // If this is coalesced into kernel argument, nothing to do. + // Otherwise, insert a copy. + Value *V = CI->getArgOperand(i); + Value *Arg = &*I; + LiveRange *LR1 = Liveness->getLiveRangeOrNull(V); + LiveRange *LR2 = Liveness->getLiveRange(Arg); + + auto coalesceInput = [=]() { + // When LR1 is null, the input value should be Undef. Otherwise, it + // should be loaded as a constant. + if (LR1 == nullptr || LR1 == LR2) + return false; + + if (!Liveness->interfere(LR1, LR2)) { + Liveness->coalesce(LR1, LR2, false); + return false; + } + + // A copy is needed. + return true; + }; + + if (coalesceInput()) { + // Insert copy and add a short live range for copy-out. + unsigned Num = Numbering->getNumber(CI); + auto Copy = insertCopy(V, LR2, CI, "copyout", Num); + CI->setArgOperand(i, Copy); + LR2->push_back(Num, Num + 1); + LR2->sortAndMerge(); + } + ++i; + } + } +} + +void GenXCoalescing::coalesceCallables() { + for (auto CI : Callables) { + auto NI = CI->getNextNode(); + // if the next instruction is a CM-output intrinsic, + // we don't really need that cm-output because CMCallable can serve as + // the anchor for preventing DCE + if (NI && isa(NI)) { + CallInst *OC = cast(NI); + if (GenXIntrinsic::getGenXIntrinsicID(OC) == GenXIntrinsic::genx_output) { + OC->eraseFromParent(); + } + } + + auto Nxt = CI->getNextNode(); + auto Ret = Nxt; + + // 1. 
Possible next node is branch to return + auto Br = dyn_cast(Nxt); + if (Br && Br->isUnconditional()) + Ret = &Br->getSuccessor(0)->front(); + + // 2. Possible next node is GenXIntrinsic::genx_output + if (GenXIntrinsic::getGenXIntrinsicID(Ret) == GenXIntrinsic::genx_output) + Ret = Ret->getNextNode(); + + // Check if next node is correct return insn + if (!Ret || !isa(Ret)) { + // getRetVal could not determine what happens to this return value. + DiagnosticInfoFastComposition Err(CI, + "Callable Call must be right before function return", + (ST->warnCallable() ? DS_Warning : DS_Error)); + CI->getContext().diagnose(Err); + } + Function *F = CI->getParent()->getParent(); + assert(isKernel(F)); + KernelMetadata KM(F); + unsigned Idx = 0; // kernel argument index + unsigned i = 0; // call argument index + for (auto I = F->arg_begin(), E = F->arg_end(); I != E; ++I) { + if (KM.getArgInputOutputKind(Idx++) == KernelMetadata::IO_Normal) + continue; + + // This is the final value stored into the output argument. + // If this is coalesced into kernel argument, nothing to do. + // Otherwise, insert a copy. + Value *V = CI->getArgOperand(i); + Value *Arg = &*I; + LiveRange *LR1 = Liveness->getLiveRangeOrNull(V); + LiveRange *LR2 = Liveness->getLiveRange(Arg); + + auto coalesceInput = [=]() { + // When LR1 is null, the input value should be Undef. Otherwise, it + // should be loaded as a constant. + if (LR1 == nullptr || LR1 == LR2) + return false; + + if (!Liveness->interfere(LR1, LR2)) { + Liveness->coalesce(LR1, LR2, false); + return false; + } + + // A copy is needed. + return true; + }; + + if (coalesceInput()) { + // Insert copy and add a short live range for copy-out. + unsigned Num = Numbering->getNumber(CI); + auto Copy = insertCopy(V, LR2, CI, "copyout", Num); + CI->setArgOperand(i, Copy); + LR2->push_back(Num, Num + 1); + LR2->sortAndMerge(); + } + ++i; + } + } +} + +void GenXCoalescing::coalesceGlobalLoads(FunctionGroup *FG) { + for (auto &GV : FG->getModule()->globals()) { + if (!GV.hasAttribute(genx::FunctionMD::GenXVolatile)) + continue; + LiveRange *LR1 = Liveness->getLiveRangeOrNull(&GV); + if (!LR1) + continue; + + // Collect all loads. + std::set LoadsInGroup; + for (auto UI : GV.users()) { + if (auto LI = dyn_cast(UI)) { + assert(LI->getPointerOperand() == &GV); + auto Fn = LI->getParent()->getParent(); + // Check this load is inside the group. + if (std::find(FG->begin(), FG->end(), Fn) != FG->end()) + LoadsInGroup.insert(LI); + } + // Global variable is used in a constexpr. + if (&GV != getUnderlyingGlobalVariable(UI)) + continue; + for (auto U : UI->users()) + if (auto LI = dyn_cast(U)) { + auto Fn = LI->getParent()->getParent(); + // Check this load is inside the group. + if (std::find(FG->begin(), FG->end(), Fn) != FG->end()) + LoadsInGroup.insert(LI); + } + } + + // Do coalescing. + for (auto LI : LoadsInGroup) { + LiveRange *LR2 = Liveness->getLiveRange(LI); + LR1 = Liveness->coalesce(LR1, LR2, false); + } + } +} + +/*********************************************************************** + * insertCopy : insert a copy of a non-struct value + * + * Enter: Input = value to copy + * LR = live range to add the new value to + * InsertBefore = insert copy before this inst + * Name = name to give the new value + * Number = number to give the new instruction(s) + * + * Return: The new copy instruction + * + * This inserts multiple copies if the input value is a vector that is + * bigger than two GRFs or a non power of two size. 
+ */
+Instruction *GenXCoalescing::insertCopy(SimpleValue Input, LiveRange *LR,
+    Instruction *InsertBefore, StringRef Name, unsigned Number)
+{
+  assert(!isa<Constant>(Input.getValue()));
+  if (auto ST = dyn_cast<StructType>(Input.getValue()->getType())) {
+    // Input is a struct element. First extract it. This
+    // extract is created coalesced by adding it to the live
+    // range of the struct element. An extractvalue is always
+    // coalesced and never generates code.
+    SmallVector<unsigned, 4> Indices;
+    IndexFlattener::unflatten(ST, Input.getIndex(), &Indices);
+    Instruction *Extract = ExtractValueInst::Create(Input.getValue(), Indices,
+        "twoaddr.extract", InsertBefore);
+    auto SourceLR = Liveness->getLiveRange(Input);
+    assert(SourceLR);
+    Liveness->setLiveRange(SimpleValue(Extract), SourceLR);
+    Input = SimpleValue(Extract);
+  }
+  return Liveness->insertCopy(Input.getValue(), LR, InsertBefore, Name, Number);
+}
+
+/***********************************************************************
+ * insertIntoStruct : create an insertvalue to insert a new value into a
+ *                    struct
+ *
+ * Enter:   Ty = type of putative struct
+ *          FlattenedIndex = flattened index within the struct
+ *          OldStruct = old value of struct
+ *          NewVal = new value to insert into it
+ *          InsertBefore = where to insert new instruction before
+ *
+ * Return:  the new InsertValueInst
+ *
+ * If Ty is not a struct type, this just returns NewVal.
+ */
+Instruction *GenXCoalescing::insertIntoStruct(Type *Ty,
+    unsigned FlattenedIndex, Value *OldStruct, Instruction *NewVal,
+    Instruction *InsertBefore)
+{
+  auto ST = dyn_cast<StructType>(Ty);
+  if (!ST)
+    return NewVal;
+  // We're copying into struct element. We need to add an insertvalue.
+  SmallVector<unsigned, 4> Indices;
+  IndexFlattener::unflatten(ST, FlattenedIndex, &Indices);
+  return InsertValueInst::Create(OldStruct, NewVal,
+      Indices, "coalescefail.insert", InsertBefore);
+}
+
+/***********************************************************************
+ * showCoalesceFail : output a message to say that coalescing has failed
+ */
+void GenXCoalescing::showCoalesceFail(SimpleValue V, const DebugLoc &DL,
+                                      const char *Intro, LiveRange *DestLR,
+                                      LiveRange *SourceLR) {
+  if (isa<UndefValue>(V.getValue()))
+    return;
+  if (V.getType()->getPrimitiveSizeInBits() >=
+      GenXShowCoalesceFailThreshold * 8U) {
+    dbgs() << "GenX " << Intro << " coalesce failed on ";
+    V.printName(dbgs());
+    dbgs() << " size " << V.getType()->getPrimitiveSizeInBits() / 8U
+           << " bytes at ";
+    DL.print(dbgs());
+    dbgs() << "\nDestLR: " << *DestLR << "\nSourceLR: " << *SourceLR << "\n";
+  }
+}
+
+/***********************************************************************
+* DiagnosticInfoFastComposition initializer from Instruction
+*
+* If the Instruction has a DebugLoc, then that is used for the error
+* location.
+* Otherwise, the location is unknown.
+*/
+DiagnosticInfoFastComposition::DiagnosticInfoFastComposition(Instruction *Inst,
+    const Twine &Desc, DiagnosticSeverity Severity)
+    : DiagnosticInfo(getKindID(), Severity), Line(0), Col(0)
+{
+  auto DL = Inst->getDebugLoc();
+  // Only use the location if the instruction actually has a DebugLoc.
+  if (DL) {
+    Filename = DL->getFilename();
+    Line = DL.getLine();
+    Col = DL.getCol();
+  }
+  Description = (Twine("Fast Composition restriction violation") +
+      ": " + Desc).str();
+}
+
+/***********************************************************************
+* DiagnosticInfoFastComposition::print : print the error/warning message
+*/
+void DiagnosticInfoFastComposition::print(DiagnosticPrinter &DP) const
+{
+  std::string Loc(
+      (Twine(!Filename.empty() ? Filename : "") +
+       ":" + Twine(Line) +
+       (!Col ?
Twine() : Twine(":") + Twine(Col)) + + ": ") + .str()); + DP << Loc << Description; +} + + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.cpp new file mode 100644 index 000000000000..389bdd75eb75 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.cpp @@ -0,0 +1,1524 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXConstants +/// ------------- +/// +/// GenXConstants is not in itself a pass. It contains utility functions and a +/// class used by other passes for constant loading. +/// +/// loadNonSimpleConstants +/// ^^^^^^^^^^^^^^^^^^^^^^ +/// +/// The GenXPostLegalization pass calls loadNonSimpleConstants to insert a load +/// for any operand that is a non-simple constant. (A non-simple constant is one +/// that is too big or an invalid value for a constant operand.) +/// +/// It is called in two places: +/// +/// 1. in the GenXPostLegalization pass, run after legalization but +/// before CSE, so CSE has an opportunity to common up loaded non-simple +/// constants; +/// 2. later on in GenXCategory, to mop up non-simple constant operands +/// created by CSE's constant propagation. +/// +/// This does not insert a load if the constant is "big simple" (that is, it is +/// illegally wide but each legalized part of it is simple) and it is used in +/// the "old value" operand of a wrregion, or as a call arg. Inserting a load +/// of such a constant here would allow the load to be CSEd, which would be +/// counter productive as some of the uses would not be kill uses and so +/// coalescing would fail there. +/// +/// Phi incoming constants are not loaded here; they are loaded in +/// loadPhiConstants called from GenXCategory. Phi constant loads do not need to +/// participate in CSE as loadPhiConstants has its own commoning up tailored for +/// phi nodes. +/// +/// loadConstants +/// ^^^^^^^^^^^^^ +/// +/// This is called from GenXCategory. It inserts a load for each constant +/// operand that is not allowed to be constant, but remains after +/// loadNonSimpleConstants. +/// +/// Phi incoming constants are not loaded here; they are loaded in +/// loadPhiConstants called from GenXCategory. 
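+///
+/// As a rough illustration (the wrapper function here is hypothetical; the
+/// real call sites are the GenXPostLegalization and GenXCategory passes),
+/// both helpers are applied per instruction, and both skip phi nodes
+/// themselves:
+///
+///   bool runConstantLoading(Function &F, const GenXSubtarget *ST) {
+///     bool Modified = false;
+///     for (BasicBlock &BB : F)
+///       for (Instruction &I : BB) {
+///         Modified |= genx::loadNonSimpleConstants(&I, nullptr, ST);
+///         Modified |= genx::loadConstants(&I, ST);
+///       }
+///     return Modified;
+///   }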
+/// +/// loadPhiConstants +/// ^^^^^^^^^^^^^^^^ +/// +/// This is called from GenXCategory, and it inserts loads for constant phi +/// incomings, commoning up when possible and sensible. +/// +/// Commoning up (inserting one load for multiple phi incomings with the same +/// constant, across one or more phi nodes) proceeds as follows: +/// +/// Firstly, we divide the phi nodes into _webs_, where each web is the maximal +/// set of phi nodes that are related through phi nodes and two address +/// instructions, so will be coalesced later on in the flow. +/// +/// Secondly, for a single web, we look for multiple uses of the same constant. +/// Such a constant has a load instruction inserted just once, at the end of the +/// nearest common dominator of all the corresponding incoming blocks. +/// +/// If that insert point is in an empty split critical edge block, we instead +/// insert in the block above that, in the hope that the split critical edge +/// block can be removed later. +/// +/// ConstantLoader +/// ^^^^^^^^^^^^^^ +/// +/// ConstantLoader is a class that represents a constant and information on how +/// to load it. This is where analysis happens of whether it is a legal packed +/// vector, or whether it needs multiple instructions to load it. It then has +/// methods to insert the code to load the constant. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_CONSTANTS" + +#include "GenXConstants.h" +#include "GenXGotoJoin.h" +#include "GenXIntrinsics.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +/*********************************************************************** + * loadConstantStruct : insert instructions to load a constant struct + */ +static Value *loadConstantStruct(Constant *C, Instruction *InsertBefore, + const GenXSubtarget *Subtarget) { + auto ST = cast(C->getType()); + Value *Agg = UndefValue::get(ST); + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { + Constant *El = C->getAggregateElement(i); + if (isa(El)) + continue; + Value *LoadedEl = nullptr; + if (isa(El->getType())) + LoadedEl = loadConstantStruct(El, InsertBefore, Subtarget); + else + LoadedEl = ConstantLoader(El, Subtarget).load(InsertBefore); + Agg = InsertValueInst::Create(Agg, LoadedEl, i, "loadstruct", InsertBefore); + } + return Agg; +} + +/*********************************************************************** + * loadNonSimpleConstants : for any non-simple or illegal size constant in + * an instruction, load it. + * + * Enter: Inst = instruction to find constant operands in + * AddedInstructions = 0 else vector to push added instructions onto + * + * Return: whether code was modified + * + * This does not load constants in a phi nodes. That is done in + * loadPhiConstants. + */ +bool genx::loadNonSimpleConstants(Instruction *Inst, + SmallVectorImpl *AddedInstructions, + const GenXSubtarget* Subtarget) +{ + bool Modified = false; + if (isa(Inst)) + return Modified; + // Omit call target operand of a call. 
+ unsigned NumArgs = Inst->getNumOperands(); + auto CI = dyn_cast(Inst); + if (CI) + NumArgs = CI->getNumArgOperands(); + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(Inst); + for (unsigned i = 0; i != NumArgs; ++i) { + if (isa(Inst->getOperand(i))) { + Use *U = &Inst->getOperandUse(i); + Constant *C = dyn_cast(*U); + if (!C) + continue; + if (isa(C)) + continue; + if (isa(C)) + continue; + if (opMustBeConstant(Inst, i)) + continue; + ConstantLoader CL(C, Inst, AddedInstructions, Subtarget); + if (CL.needFixingSimple()) { + CL.fixSimple(i); + continue; + } + if (CL.isSimple()) + continue; + // Do not load a "big simple" constant for the "old value of vector" + // input of a wrregion, so it does not get CSEd. CSEing it is + // counter-productive because, if it has multiple uses, it will + // need to be two-address copied by GenXCoalescing anyway. + if (GenXIntrinsic::isWrRegion(IID) + && i == GenXIntrinsic::GenXRegion::OldValueOperandNum + && CL.isBigSimple()) + continue; + // Similarly, do not load a "big simple" constant for a call arg. + if (CI && IID == GenXIntrinsic::not_any_intrinsic && CL.isBigSimple()) + continue; + *U = CL.loadBig(Inst); + Modified = true; + } + } + return Modified; +} + +bool genx::loadConstantsForInlineAsm( + CallInst *CI, SmallVectorImpl *AddedInstructions, + const GenXSubtarget *Subtarget) { + assert(CI->isInlineAsm() && "Inline asm expected"); + bool Modified = false; + auto ConstraintsInfo = genx::getGenXInlineAsmInfo(CI); + Use *U; + for (unsigned i = 0, e = ConstraintsInfo.size(), ArgNo = 0; i != e; ++i) { + auto &Info = ConstraintsInfo[i]; + if (Info.isOutput()) + continue; + U = &CI->getOperandUse(ArgNo); + ArgNo++; + if (auto C = dyn_cast(*U)) { + if (!isa(C)) { + switch (Info.getConstraintType()) { + default: + *U = ConstantLoader(C, nullptr, AddedInstructions, Subtarget).load(CI); + Modified = true; + break; + case ConstraintType::Constraint_n: + case ConstraintType::Constraint_i: + case ConstraintType::Constraint_F: + break; + } + } + } + } + return Modified; +} + + + +/*********************************************************************** + * loadConstants : load constants as required for an instruction + * + * This handles operands that are not allowed to be constant. A constant + * operand that needs loading because it is a non-simple constant is + * handled in loadNonSimpleConstants. + * + * This does not load constants in a phi nodes. That is done in + * loadPhiConstants. + */ +bool genx::loadConstants(Instruction *Inst, + const GenXSubtarget* Subtarget) +{ + bool Modified = false; + Use *U; + if (isa(Inst)) + return Modified; + if (isa(Inst) && + Inst->getType()->getScalarType()->isIntegerTy(1)) { + // Predicate binary operator: disallow constant operands, except + // that xor with -1 is allowed. 
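+    // For illustration: a predicate "not" is written as "xor %p, <all ones>",
+    // so that constant operand is left in place; any other constant operand
+    // of an i1 and/or/xor is first materialized with ConstantLoader::load
+    // below.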
+ for (unsigned oi = 0; oi != 2; ++oi) + if (auto C = dyn_cast(Inst->getOperand(oi))) { + auto IsNot = [=]() { + if (oi != 1) + return false; + if (Inst->getOpcode() != Instruction::Xor) + return false; + if (!C->getType()->isVectorTy()) + return C->isAllOnesValue(); + Constant *C1 = C->getSplatValue(); + return C1 && C1->isAllOnesValue(); + }; + if (!IsNot()) { + Inst->setOperand(oi, ConstantLoader(C, Subtarget).load(Inst)); + Modified = true; + } + } + } + if (isa(Inst)) { + // select: disallow constant selector + U = &Inst->getOperandUse(0); + if (auto C = dyn_cast(*U)) { + *U = ConstantLoader(C, Subtarget).load(Inst); + Modified = true; + } + return Modified; + } + if (isa(Inst)) { + // insertvalue (inserting a value into a struct): disallow constant + // on element operand. + U = &Inst->getOperandUse(1); + if (auto C = dyn_cast(*U)) { + *U = ConstantLoader(C, Subtarget).load(Inst); + Modified = true; + } + // Also disallow constant (other than undef) on old struct value operand. + // We need to load each non-undef element separately. + U = &Inst->getOperandUse(0); + if (auto C = dyn_cast(*U)) + if (!isa(C)) + *U = loadConstantStruct(C, Inst, Subtarget); + return Modified; + } + if (auto Br = dyn_cast(Inst)) { + // Conditional branch: disallow constant condition. + if (Br->isConditional()) { + if (auto C = dyn_cast(Br->getCondition())) { + Br->setCondition(ConstantLoader(C, Subtarget).load(Br)); + Modified = true; + } + } + return Modified; + } + if (auto Ret = dyn_cast(Inst)) { + // Return: disallow constant return value in a subroutine (internal + // linkage). + if (Ret->getNumOperands() && Ret->getParent()->getParent()->getLinkage() + == GlobalValue::InternalLinkage) { + if (auto C = dyn_cast(Ret->getOperand(0))) { + if (!C->getType()->isVoidTy()) + Ret->setOperand(0, ConstantLoader(C, Subtarget).load(Ret)); + } + } + return Modified; + } + auto CI = dyn_cast(Inst); + if (!CI) + return Modified; + if (CI->isInlineAsm()) + return loadConstantsForInlineAsm(CI, nullptr, Subtarget); + int IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(CI); + switch (IntrinsicID) { + case GenXIntrinsic::not_any_intrinsic: + case Intrinsic::fma: + case GenXIntrinsic::genx_ssmad: + case GenXIntrinsic::genx_sumad: + case GenXIntrinsic::genx_usmad: + case GenXIntrinsic::genx_uumad: + case GenXIntrinsic::genx_output: + // load all args for subroutine and some intrinsic calls. + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + U = &CI->getOperandUse(i); + if (auto C = dyn_cast(*U)) { + if (!isa(C)) { + *U = ConstantLoader(C, Subtarget).loadBig(CI); + Modified = true; + } + } + } + break; + case GenXIntrinsic::genx_constanti: + case GenXIntrinsic::genx_constantf: + break; + case GenXIntrinsic::genx_absi: + case GenXIntrinsic::genx_absf: + // abs modifier: disallow constant input. 
+ U = &CI->getOperandUse(0); + if (auto C = dyn_cast(*U)) { + *U = ConstantLoader(C, Subtarget).load(CI); + Modified = true; + } + break; + case GenXIntrinsic::genx_rdpredregion: + case GenXIntrinsic::genx_any: + case GenXIntrinsic::genx_all: + // rdpredregion, any, all: disallow constant input + U = &CI->getOperandUse(0); + if (auto C = dyn_cast(*U)) { + *U = ConstantLoader(C, Subtarget).load(CI); + Modified = true; + } + break; + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + // rdregion: disallow constant input + U = &CI->getOperandUse(0); + if (auto C = dyn_cast(*U)) { + *U = ConstantLoader(C, Subtarget).loadBig(CI); + Modified = true; + } + // Also disallow constant vector index (constant scalar OK). + U = &CI->getOperandUse(GenXIntrinsic::GenXRegion::RdIndexOperandNum); + if (auto C = dyn_cast(*U)) { + if (isa(C->getType())) { + *U = ConstantLoader(C, Subtarget).load(CI); + Modified = true; + } + } + break; + case GenXIntrinsic::genx_wrpredpredregion: + // wrpredpred: disallow constant "old vector" input unless undef + U = &CI->getOperandUse(0); + if (auto C = dyn_cast(*U)) { + if (!isa(C)) { + *U = ConstantLoader(C, Subtarget).loadBig(CI); + Modified = true; + } + } + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + // wrregion: disallow constant "old vector" input unless undef + U = &CI->getOperandUse(0); + if (auto C = dyn_cast(*U)) { + if (!isa(C)) { + *U = ConstantLoader(C, Subtarget).loadBig(CI); + Modified = true; + } + } + // Also disallow constant vector index (constant scalar OK). + U = &CI->getOperandUse(GenXIntrinsic::GenXRegion::WrIndexOperandNum); + if (auto C = dyn_cast(*U)) { + if (isa(C->getType())) { + *U = ConstantLoader(C, Subtarget).load(CI); + Modified = true; + } + } + // Also disallow constant predicate unless all ones. + U = &CI->getOperandUse(GenXIntrinsic::GenXRegion::PredicateOperandNum); + if (auto C = dyn_cast(*U)) { + if (!C->isAllOnesValue()) { + *U = ConstantLoader(C, Subtarget).load(CI); + Modified = true; + } + } + break; + case GenXIntrinsic::genx_simdcf_goto: + // goto: disallow constant predicate input, unless it is all 0. We want to + // allow constant all 0, as it is the encoding used for an "else", and + // loading the constant into a predicate register stops the finalizer's + // structurizer working. + U = &CI->getOperandUse(2); + if (auto C = dyn_cast(*U)) { + if (!C->isNullValue()) { + *U = ConstantLoader(C, Subtarget).load(CI); + Modified = true; + } + } + break; + default: + // Intrinsic: check intrinsic descriptor to see where constant args + // are allowed. + // Iterate through each field in the intrinsic info. + GenXIntrinsicInfo II(IntrinsicID); + // Intrinsic not found. + if (II.isNull()) + return Modified; + unsigned MaxRawOperands = II.getTrailingNullZoneStart(CI); + for (GenXIntrinsicInfo::iterator i = II.begin(), e = II.end(); i != e; ++i) { + GenXIntrinsicInfo::ArgInfo AI = *i; + if (!AI.isArgOrRet() || AI.isRet()) + continue; + // This field relates to an operand. + U = &CI->getOperandUse(AI.getArgIdx()); + auto C = dyn_cast(*U); + if (!C) + continue; + // Operand is constant. + // Allow constant if it is i1 or vector of i1 set to all ones; this + // represents an "all true" predication field. + if (C->getType()->getScalarType()->isIntegerTy(1) && C->isAllOnesValue()) + continue; + // Allow constant if intrinsic descriptor allows it for this arg. 
+ if (!AI.isImmediateDisallowed()) + continue; + // If it is a RAW operand, allow the constant if it's in the trailing + // null region (it must be a null constant if so), or if the value + // is undefined and RAW_NULLALLOWED is enabled. + if (AI.isRaw()) { + if ((unsigned)AI.getArgIdx() >= MaxRawOperands) { + assert(C->isNullValue()); + continue; + } + if (isa(C) && AI.rawNullAllowed()) + continue; + } + // Also allow constant if it is undef in a TWOADDR + if (isa(C) && AI.getCategory() == GenXIntrinsicInfo::TWOADDR) + continue; + // Also allow constant if it is a reserved surface index. + if (AI.getCategory() == GenXIntrinsicInfo::SURFACE && + visa::isReservedSurfaceIndex(visa::convertToSurfaceIndex(C))) { + continue; + } + // Operand is not allowed to be constant. Insert code to load it. + *U = ConstantLoader(C, Subtarget).loadBig(CI); + Modified = true; + } + break; + } + return Modified; +} + +/*********************************************************************** + * loadPhiConstants : load constant incomings in phi nodes, commoning up + * if appropriate + */ +bool genx::loadPhiConstants(Function *F, DominatorTree *DT, + bool ExcludePredicate, const GenXSubtarget* Subtarget) { + bool Modified = false; + std::set Done; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (auto bi = BB->begin(); ; ++bi) { + auto Phi = dyn_cast(&*bi); + if (!Phi) + break; + if (!Done.insert(Phi).second) + continue; // phi node already processed in some web + // Gather the web of phi nodes and two address instructions related to + // this one. This is an approximation to the web of instructions that + // will or could be coalesced. + // (Use Web as a worklist of phi nodes and two address instructions to + // use to find other phi nodes and two address instructions.) + // + // We process a web of related phi nodes at a time, rather than all phi + // nodes that use the constant, to avoid this situation: + // we try and common up two phi nodes in the same basic block (e.g. two + // variables both initialized to 0 before a loop), but end up having to + // insert a copy for one of them anyway in coalescing. + SmallVector Web; + Web.push_back(Phi); + for (unsigned wi = 0; wi != Web.size(); ++wi) { + auto Inst = Web[wi]; + unsigned oi = 0, oe = 0; + if ((Phi = dyn_cast(Inst))) { + // Phi node: process each incoming. + oe = Phi->getNumIncomingValues(); + } else { + // Two address instruction: process just the two address operand. + oi = getTwoAddressOperandNum(cast(Inst)); + oe = oi + 1; + } + + auto IsPhiOrTwoAddress = [=](Value *V) { + if (isa(V)) + return true; + if (auto CI = dyn_cast(V)) + return getTwoAddressOperandNum(CI) >= 0; + return false; + }; + + // For each incoming: + for (; oi != oe; ++oi ) { + auto Incoming = Inst->getOperand(oi); + // If it is a phi node or two address instruction, push it into the + // web for processing later. + if (IsPhiOrTwoAddress(Incoming)) { + auto IncomingInst = cast(Incoming); + if (Done.insert(IncomingInst).second) + Web.push_back(IncomingInst); + } else if (!isa(Incoming)) { + // For any other inst or arg, see if it has any other use in a phi + // node or two address inst, and push that into the web. + for (auto ui = Incoming->use_begin(), ue = Incoming->use_end(); + ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (IsPhiOrTwoAddress(User)) + if (Done.insert(User).second) + Web.push_back(User); + } + } + } + // Now process each use of the result of the phi node or two address + // instruction. 
If the use is in a phi node or is a two address operand, + // push the user into the web. + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (IsPhiOrTwoAddress(User)) + if (Done.insert(User).second) + Web.push_back(User); + } + } + LLVM_DEBUG( + dbgs() << "loadPhiConstants: Web of phi nodes and two address insts:\n"; + for (auto wi = Web.begin(), we = Web.end(); wi != we; ++wi) + dbgs() << **wi << "\n" + ); + // Now process the web, ignoring anything other than phi nodes. + // Gather the distinct constants, and every use for each one in a phi + // node. + std::map> ConstantUses; + SmallVector DistinctConstants; + for (unsigned wi = 0, we = Web.size(); wi != we; ++wi) { + auto Phi = dyn_cast(Web[wi]); + if (!Phi) + continue; + for (unsigned oi = 0, oe = Phi->getNumIncomingValues(); oi != oe; ++oi) { + Use *U = &Phi->getOperandUse(oi); + auto *C = dyn_cast(*U); + if (!C || isa(C)) + continue; + // when doing this transform in pattern matching phase + if (ExcludePredicate) { + if (C->getType()->getScalarType()->isIntegerTy(1)) + continue; + if (C->getType()->getPrimitiveSizeInBits() <= 256) + continue; + auto IncomingBlock = Phi->getIncomingBlock(oi); + if (GotoJoin::isBranchingJoinLabelBlock(IncomingBlock)) + continue; + } + + auto Entry = &ConstantUses[C]; + if (!Entry->size()) + DistinctConstants.push_back(C); + Entry->push_back(U); + } + } + // Handle each distinct constant. + for (unsigned dci = 0, dce = DistinctConstants.size(); dci != dce; ++dci) { + Constant *C = DistinctConstants[dci]; + auto Entry = &ConstantUses[C]; + if (Entry->size() != 1) { + LLVM_DEBUG( + dbgs() << "multiple use of " << *C << "\n"; + for (unsigned ei = 0, ee = Entry->size(); ei != ee; ++ei) + dbgs() << *(*Entry)[ei]->getUser() << "\n" + ); + } + // Find the closest common dominator of the incoming blocks of all phi + // uses of the constant. That is where we want to insert the constant + // load. + Use *U = (*Entry)[0]; + auto InsertBB = cast(U->getUser()) + ->getIncomingBlock(U->getOperandNo()); + for (unsigned ei = 1, ee = Entry->size(); ei != ee; ++ei) { + U = (*Entry)[ei]; + auto Phi = cast(U->getUser()); + auto IncomingBB = Phi->getIncomingBlock(U->getOperandNo()); + InsertBB = DT->findNearestCommonDominator(InsertBB, IncomingBB); + } + // If that location is an empty split critical edge block, go up to its + // predecessor (which is also its immediate dominator) if this block is + // "true" successor of branching simd cf block. In this case we cannot + // insert anything in current block and have to create partial + // redundancy. + assert(InsertBB); + auto *InsertTerm = InsertBB->getTerminator(); + auto *SinglePred = InsertBB->getSinglePredecessor(); + if (InsertTerm->getNumSuccessors() == 1 && + InsertTerm == &InsertBB->front() && SinglePred && + GotoJoin::isBranchingGotoJoinBlock(SinglePred)) + InsertBB = SinglePred; + + // Insert the constant load. + ConstantLoader CL(C, Subtarget); + Value *Load = nullptr; + Instruction *InsertBefore = InsertBB->getTerminator(); + if (!CL.isSimple()) + Load = CL.loadNonSimple(InsertBefore); + else + Load = CL.load(InsertBefore); + Modified = true; + // Modify the uses. 
+ for (unsigned ei = 0, ee = Entry->size(); ei != ee; ++ei) + *(*Entry)[ei] = Load; + // replace other non-phi uses that are also dominated by the InsertBB + for (unsigned wi = 0, we = Web.size(); wi != we; ++wi) { + if (isa(Web[wi])) + continue; + auto CI = dyn_cast(Web[wi]); + if (CI && getTwoAddressOperandNum(CI) >= 0) { + auto oi = getTwoAddressOperandNum(CI); + Use *U = &CI->getOperandUse(oi); + auto *UC = dyn_cast(*U); + if (UC && UC == C) { + if (CI->getParent() != InsertBB && DT->dominates(InsertBB, CI->getParent())) + *U = Load; + } + } + } + } + } + } + return Modified; +} + +void ConstantLoader::fixSimple(int OperandIdx) { + assert(NewC && + "no need to fix simple case"); + assert(User->getOperand(OperandIdx) == C && + "wrong arguments: wrong operand index was provided"); + User->setOperand(OperandIdx, NewC); + C = NewC; + // indicate that we no longer need fix + NewC = nullptr; +} + +/*********************************************************************** + * ConstantLoader::loadNonSimple : load a non-simple constant + * + * Enter: C = constant to lower if necessary + * Inst = instruction it is used in (also used to insert new + * code before) + * + * Return: new instruction + */ +Instruction *ConstantLoader::loadNonSimple(Instruction *Inst) +{ + assert(!isSimple()); + if (!isLegalSize()) + return loadBig(Inst); + if (PackedFloat) { + unsigned NumElts = C->getType()->getVectorNumElements(); + SmallVector Quads; + for (unsigned i = 0, e = NumElts; i != e; i += 4) { + SmallVector Quad; + for (unsigned j = 0; j != 4 && (i + j) < NumElts; ++j) + Quad.push_back(C->getAggregateElement(i + j)); + ConstantLoader Packed(ConstantVector::get(Quad)); + Quads.push_back(Packed.load(Inst)); + } + Value *V = UndefValue::get(C->getType()); + unsigned Offset = 0; + auto DL = Inst->getDebugLoc(); + for (auto &Q : Quads) { + VectorType *VTy = cast(Q->getType()); + Region R(V); + R.getSubregion(Offset, VTy->getNumElements()); + V = R.createWrRegion(V, Q, "constant.quad" + Twine(Offset), Inst, DL); + Offset += VTy->getNumElements(); + } + return cast(V); + } + if (PackedIntScale) { + auto PackTy = C->getType()->getScalarType(); + // limit the constant-type to 32-bit because we do not want 64-bit operation + if (PackTy->getPrimitiveSizeInBits() > 32) + PackTy = Type::getInt32Ty(Inst->getContext()); + // Load as a packed int vector with scale and/or adjust. 
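+    // Worked example (illustrative numbers only): a constant <10, 20, 30, 40>
+    // with PackedIntAdjust = 10 and PackedIntScale = 10 is loaded as the
+    // packed vector <0, 1, 2, 3>; the multiply and add emitted further down
+    // then rebuild the original values as 0..3 * 10 + 10. Every packed
+    // element must land in the [-8, 15] range checked by the assert below.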
+ SmallVector PackedVals; + for (unsigned i = 0, e = C->getType()->getVectorNumElements(); + i != e; ++i) { + int64_t Val = 0; + if (auto CI = dyn_cast(C->getAggregateElement(i))) { + Val = CI->getSExtValue(); + Val -= PackedIntAdjust; + Val /= PackedIntScale; + } + PackedVals.push_back(ConstantInt::get(PackTy, Val, /*isSigned=*/true)); + assert(cast(PackedVals.back())->getSExtValue() >= -8 + && cast(PackedVals.back())->getSExtValue() <= 15); + } + ConstantLoader Packed(ConstantVector::get(PackedVals)); + auto LoadPacked = Packed.load(Inst); + if (PackedIntScale != 1) + LoadPacked = BinaryOperator::Create(Instruction::Mul, LoadPacked, + ConstantVector::getSplat(C->getType()->getVectorNumElements(), + ConstantInt::get(PackTy, PackedIntScale, + /*isSigned=*/true)), "constantscale", Inst); + if (PackedIntAdjust) + LoadPacked = BinaryOperator::Create(Instruction::Add, LoadPacked, + ConstantVector::getSplat(C->getType()->getVectorNumElements(), + ConstantInt::get(PackTy, PackedIntAdjust, + /*isSigned=*/true)), "constantadjust", Inst); + if (PackTy->getPrimitiveSizeInBits() < + C->getType()->getScalarType()->getPrimitiveSizeInBits()) { + LoadPacked = CastInst::CreateSExtOrBitCast( + LoadPacked, C->getType(), "constantzext", Inst); + } + return LoadPacked; + } + if (auto CC = getConsolidatedConstant(C)) { + // We're loading a vector of byte or short (but not i1). Use int so the + // instruction does not use so many channels. This may also save it being + // split by legalization. + ConstantLoader CCL(CC, Subtarget); + Instruction *NewInst = nullptr; + if (CCL.isSimple()) + NewInst = CCL.load(Inst); + else + NewInst = CCL.loadNonSimple(Inst); + NewInst = CastInst::Create(Instruction::BitCast, NewInst, C->getType(), + "constant", Inst); + if (AddedInstructions) + AddedInstructions->push_back(NewInst); + return NewInst; + } + VectorType *VT = dyn_cast(C->getType()); + unsigned NumElements = VT->getNumElements(); + SmallVector Elements; + unsigned UndefBits = 0; + if (ConstantDataVector *CDV = dyn_cast(C)) { + // Gather the elements. + for (unsigned i = 0; i != NumElements; ++i) { + Constant *El = CDV->getElementAsConstant(i); + assert(!isa(El) && "CDV element can't be undef"); + Elements.push_back(El); + } + } else { + ConstantVector *CV = cast(C); + // Gather the elements. + for (unsigned i = 0; i != NumElements; ++i) { + Constant *El = CV->getOperand(i); + if (isa(El)) + UndefBits |= 1 << i; + Elements.push_back(El); + } + } + unsigned RemainingBits = ~UndefBits + & ((NumElements == 32 ? 0 : 1 << NumElements) - 1); + if (!RemainingBits) { + // All elements are undef. This should have been simplified away earlier, + // but we need to cope with it in case it was not. Just load the first + // element. + RemainingBits = 1; + } + Instruction *Result = 0; + // If it is wider than 8 elements, see if we can load any group of 8 as a + // packed vector. + if (NumElements > 8) { + for (unsigned Idx = 0; Idx < NumElements - 4; Idx += 8) { + unsigned Size = std::min(8U, NumElements - Idx); + Constant *SubC = getConstantSubvector(C, Idx, Size); + if (isa(SubC)) + continue; + ConstantLoader SubLoader(SubC, Subtarget); + if (SubLoader.PackedIntScale == 0 && !SubLoader.isPackedFloatVector()) + continue; + Region R(C); + R.getSubregion(Idx, Size); + if (SubLoader.isSimple()) { + Value *SubV = SubC; + Result = cast(R.createWrConstRegion( + Result ? 
(Value *)Result : (Value *)UndefValue::get(C->getType()), + SubV, "constant.split" + Twine(Idx), + Inst, Inst->getDebugLoc())); + } else { + Value* SubV = SubLoader.loadNonSimple(Inst); + Result = cast(R.createWrRegion( + Result ? (Value *)Result : (Value *)UndefValue::get(C->getType()), + SubV, "constant.split" + Twine(Idx), + Inst, Inst->getDebugLoc())); + } + if (AddedInstructions) + AddedInstructions->push_back(Result); + RemainingBits &= ~(255 << Idx); + } + if (!RemainingBits) + return Result; + } + + // Build the splat sets, that is, the sets of elements of identical value. + SmallVector SplatSets; + { + ValueMap SplatSetFinder; + for (unsigned i = 0; i != NumElements; ++i) { + Constant *El = Elements[i]; + if (!isa(El)) { + std::pair::iterator, bool> Created + = SplatSetFinder.insert(std::pair(El, + SplatSets.size())); + if (Created.second) { + // First time this Constant has been seen. + SplatSets.push_back(1 << i); + } else { + // Add on to existing splat set. + SplatSets[Created.first->second] |= 1 << i; + } + } + } + } + // Remove any splat set with only a single element. + unsigned NewSize = 0; + for (unsigned i = 0, e = SplatSets.size(); i != e; ++i) { + if (countPopulation(SplatSets[i]) >= 2) + SplatSets[NewSize++] = SplatSets[i]; + } + SplatSets.resize(NewSize); + // Determine which elements are suitable for inclusion in a packed vector. + // FIXME Not implemented yet. For an int vector constant, we need to + // determine whether the instruction expects the operand to be signed + // or unsigned. + + // Loop constructing the constant until it is complete. + do { + // Find the splat set that will contribute the most elements + // to the vector, taking into account what elements we can access + // in a 1D region write. (Initialize BestSplatSetBits so, if no best + // splat is found, we just do a single element out of RemainingBits.) + // + // Note that we are looking for the splat set that sets the most elements, + // not the one that _usefully_ sets the most elements. For example, + // Examples/sepia has a constant vector of the form + // < A, B, C, 0, 0, A, B, C > + // We have four splat sets {0,5} {1,6} {2,7} {3,4}, each of which + // has two elements. What we want to do is set one of the A, B or C + // sets first, rather than the 0s, because region restrictions mean that + // we can only set such a pair if we do it first. If the loop below were + // to find the splat set that _usefully_ sets the most elements, all four + // sets would say "2" and we would arbitrarily pick one of them. But, if + // we ask each splat set how many elements it sets, even uselessly, then + // the A, B and C sets say "8" and the 0 set says "2", and we ensure that + // we do one of the A, B or C sets first. + // So we end up setting the constant in this order (arbitrarily picking + // A first): + // < A, A, A, A, A, A, A, A > + // < 0, 0 > + // < B > + // < B > + // < C > + // < C > + // giving five wrregion instructions rather than six. 
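+    // Spelled out for the example above, the splat-set bitmaps are
+    // A = 0b00100001, B = 0b01000010, C = 0b10000100 and the zero set is
+    // 0b00011000. A, B and C each span elements 0..7, so getRegionBits may
+    // consider a full 8-wide region for them, while the zero set only spans
+    // elements 3..4 and is limited to a 2-wide region; hence the "8" versus
+    // "2" above.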
+ unsigned BestSplatSetBits = 1 << genx::log2(RemainingBits); + unsigned BestSplatSetUsefulBits = BestSplatSetBits; + unsigned BestSplatSetCount = 1; + Constant *BestSplatSetConst = Elements[genx::log2(RemainingBits)]; + for (unsigned i = 0, e = SplatSets.size(); i != e; ++i) { + unsigned Bits = getRegionBits(SplatSets[i] & RemainingBits, + SplatSets[i] | RemainingBits | UndefBits, NumElements); + unsigned Count = countPopulation(Bits); + // For this splat set, Bits is a bitmap of the vector elements that + // we can set in this splat set in a legal 1D region (possibly including + // elements already set and undef elements), and Count is how many + // elements that still need setting the region will set. + if (Count > BestSplatSetCount) { + BestSplatSetBits = Bits; + BestSplatSetUsefulBits = Bits & SplatSets[i]; + BestSplatSetCount = Count; + BestSplatSetConst = Elements[genx::log2(SplatSets[i])]; + } + } + // Now BestSplatSetBits is a bitmap of the vector elements to include in + // the best splat. Set up the splatted constant. + if (!Result) { + // For the first time round the loop, just splat the whole vector, + // whatever BestSplatBits says. + Result = loadConstant(ConstantVector::getSplat( + NumElements, BestSplatSetConst), Inst, AddedInstructions); + Result->setDebugLoc(Inst->getDebugLoc()); + } else { + // Not the first time round the loop. Set up the splatted subvector, + // and write it as a region. + Region R(BestSplatSetBits, + VT->getElementType()->getPrimitiveSizeInBits() / 8); + Constant *NewConst = ConstantVector::getSplat(R.NumElements, + BestSplatSetConst); + Result = cast(R.createWrConstRegion(Result, NewConst, "constant", + Inst, Inst->getDebugLoc())); + if (AddedInstructions) + AddedInstructions->push_back(Result); + } + RemainingBits &= ~BestSplatSetUsefulBits; + } while (RemainingBits); + return Result; +} + +/*********************************************************************** + * getRegionBits : determine which vector elements we can set with a + * 1D region + * + * Enter: NeededBits = bits for vector elements we need to set + * OptionalBits = bits for vector elements we could set + * VecWidth = number of elements in vector + * + * Return: bits for vector elements to set as a legal 1D region, + * maximizing how many of NeededBits are set + */ +unsigned ConstantLoader::getRegionBits(unsigned NeededBits, + unsigned OptionalBits, unsigned VecWidth) +{ + if (!NeededBits) + return 0; + // Get the first and last element numbers in NeededBits. + unsigned FirstNeeded = countTrailingZeros(NeededBits, ZB_Undefined); + unsigned LastNeeded = 31 - countLeadingZeros((uint32_t)NeededBits, ZB_Undefined); + // Set the max width to the min size including both those elements + // rounded up to the next power of two. + unsigned MaxWidth = LastNeeded - FirstNeeded + 1; + unsigned LogMaxWidth = genx::log2(MaxWidth); + if (MaxWidth != 1U << LogMaxWidth) { + ++LogMaxWidth; + MaxWidth = 1U << LogMaxWidth; + } + // Special case NeededBits only having one element. + if (LogMaxWidth == 0) + return NeededBits; + // Now find the best region. + unsigned BestBits = 0; + unsigned BestCount = 0; + // Try each stride. + static const unsigned StrideBitsTable[] = { 0xffffffffU, 0x55555555U, 0x11111111U }; + for (unsigned LogStride = 0, Stride = 1; + LogStride <= 2U && LogStride < LogMaxWidth; + ++LogStride, Stride <<= 1U) { + // Try each width (not including 1). 
+ for (unsigned Width = 1U << (LogMaxWidth - LogStride); Width > 1; Width >>= 1) { + if (Width <= BestCount) + break; + // Try each start index. + for (unsigned Idx = 0; Idx + (Width - 1) * Stride < VecWidth; ++Idx) { + if (Idx + Width > VecWidth) + break; + // Calculate which indexes the region will set. + unsigned Bits = StrideBitsTable[LogStride]; + if (Width != 32) + Bits &= (1 << Width) - 1; + Bits <<= Idx; + // See if it sets any elements that we are not allowed to set. + if (Bits & ~(NeededBits | OptionalBits)) + continue; + // See if it sets all of NeededBits. + if ((Bits & NeededBits) == NeededBits) + return Bits; + // See if it is the best one we have seen so far. + unsigned Count = countPopulation(Bits & NeededBits); + if (Count > BestCount) { + BestCount = Count; + BestBits = Bits; + if (BestCount == Width) + break; + } + } + } + } + if (!BestCount) { + // We could not find any region that includes more than one of NeededBits. + // Just do a single element. + return 1 << genx::log2(NeededBits); + } + return BestBits; +} + +Instruction *ConstantLoader::loadSplatConstant(Instruction *InsertPos) { + // Skip scalar types, vector type with just one element, or boolean vector. + VectorType *VTy = dyn_cast(C->getType()); + if (!VTy || + VTy->getNumElements() == 1 || + VTy->getScalarType()->isIntegerTy(1)) + return nullptr; + // Skip non-splat vector. + Constant *C1 = C->getSplatValue(); + if (!C1) + return nullptr; + // Create <1 x T> constant and broadcast it through rdregion. + Constant *CV = ConstantVector::get(C1); + // Load that scalar constant first. + ConstantLoader L(CV, Subtarget); + Value *V = L.load(InsertPos); + // Broadcast through rdregion. + Region R(V); + R.Width = R.NumElements = VTy->getNumElements(); + R.Stride = 0; + R.VStride = 0; + R.Offset = 0; + Instruction *NewInst = R.createRdRegion(V, ".constsplat", InsertPos, DebugLoc()); + if (AddedInstructions) + AddedInstructions->push_back(NewInst); + return NewInst; +} + +/*********************************************************************** + * ConstantLoader::load : insert instruction to load a constant + * + * We use llvm.genx.constant, rather than bitcast, because CSE has a habit + * of propagating a constant bitcast back into our operand that is not + * allowed to be constant. + * + * Enter: C = constant to load + * InsertBefore = insert new instruction before here + * + * Return: new instruction + */ +Instruction *ConstantLoader::load(Instruction *InsertBefore) +{ + assert(isSimple()); + // Do not splat load on byte data as HW does not support byte imm source. + if (!C->getType()->getScalarType()->isIntegerTy(8)) + if (auto NewInst = loadSplatConstant(InsertBefore)) + return NewInst; + + if (!PackedFloat && !PackedIntScale && !isa(C)) { // not packed int constant or undef + if (auto CC = getConsolidatedConstant(C)) { + // We're loading a vector of byte or short (but not i1). Use int so the + // instruction does not use so many channels. This may also save it being + // split by legalization. + Instruction *NewInst = loadConstant(CC, InsertBefore, AddedInstructions); + NewInst = CastInst::Create(Instruction::BitCast, NewInst, C->getType(), + "constant", InsertBefore); + if (AddedInstructions) + AddedInstructions->push_back(NewInst); + return NewInst; + } + } + + // Load the constant as normal. 
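+  // For example, assuming the usual GenX intrinsic type-suffix mangling, an
+  // <8 x i32> constant is materialized as something like
+  //   %constant = call <8 x i32> @llvm.genx.constanti.v8i32(<8 x i32> <...>)
+  // with genx_constantf and genx_constantpred chosen below for floating point
+  // and predicate constants respectively.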
+ Value *Args[] = { C }; // Args to new llvm.genx.constant + Type *OverloadedTypes[] = { C->getType() }; + GenXIntrinsic::ID IntrinsicID = GenXIntrinsic::genx_constanti; + if (C->getType()->isFPOrFPVectorTy()) + IntrinsicID = GenXIntrinsic::genx_constantf; + else if (C->getType()->getScalarType()->isIntegerTy(1)) + IntrinsicID = GenXIntrinsic::genx_constantpred; + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = GenXIntrinsic::getGenXDeclaration(M, IntrinsicID, OverloadedTypes); + Instruction *NewInst = CallInst::Create(Decl, Args, "constant", InsertBefore); + if (AddedInstructions) + AddedInstructions->push_back(NewInst); + return NewInst; +} + +/*********************************************************************** + * ConstantLoader::loadBig : insert instruction to load a constant that might + * be illegally sized + */ +Instruction *ConstantLoader::loadBig(Instruction *InsertBefore) +{ + if (isLegalSize() || isa(C)) { + // Does not need legalizing. + if (!isSimple()) + return loadNonSimple(InsertBefore); + return load(InsertBefore); + } + assert(!C->getType()->getScalarType()->isIntegerTy(1) && "not expecting predicate in here"); + if (Constant *Consolidated = getConsolidatedConstant(C)) { + // Load as a consolidated constant, then bitcast to the correct type. + auto Load = ConstantLoader(Consolidated, nullptr, AddedInstructions, Subtarget) + .loadBig(InsertBefore); + assert(Load); + Load = CastInst::Create(Instruction::BitCast, Load, C->getType(), + Load->getName() + ".cast", InsertBefore); + if (AddedInstructions) + AddedInstructions->push_back(Load); + return Load; + } + auto VT = cast(C->getType()); + unsigned NumElements = VT->getNumElements(); + unsigned LogElementBits = genx::log2( + VT->getElementType()->getPrimitiveSizeInBits()); + unsigned MaxSize = 1 << (9/*log 2xGRFsize*/ - LogElementBits); + MaxSize = std::min(MaxSize, 32U); + Instruction *Result = nullptr; + for (unsigned Idx = 0; Idx != NumElements; ) { + unsigned Size = std::min(1U << genx::log2(NumElements - Idx), MaxSize); + // Load this subvector constant if necessary, and insert into the overall + // value with wrregion. + Constant *SubC = getConstantSubvector(C, Idx, Size); + Value *SubV = SubC; + ConstantLoader SubLoader(SubC, Subtarget); + if (!SubLoader.isSimple()) + SubV = SubLoader.loadNonSimple(InsertBefore); + Region R(C); + R.getSubregion(Idx, Size); + Result = cast(R.createWrRegion( + Result ? 
(Value *)Result : (Value *)UndefValue::get(C->getType()), + SubV, "constant.split" + Twine(Idx), + InsertBefore, DebugLoc())); + if (AddedInstructions) + AddedInstructions->push_back(Result); + Idx += Size; + } + return Result; +} + +/*********************************************************************** + * ConstantLoader::isLegalSize : detect if a constant is a legal size + */ +bool ConstantLoader::isLegalSize() +{ + auto VT = dyn_cast(C->getType()); + if (!VT) + return true; + int NumBits = C->getType()->getPrimitiveSizeInBits(); + if (!llvm::isPowerOf2_32(NumBits)) + return false; + int GRFSize = 32; + if (Subtarget) + GRFSize = Subtarget->getGRFWidth(); + if (NumBits > GRFSize * 8 /*bytes*/ * 2) + return false; // bigger than 2 GRFs + if (VT->getNumElements() > 32) + return false; // 64 bytes not allowed + return true; +} + +/*********************************************************************** + * ConstantLoader::isBigSimple : detect if a constant is either simple, + * or would be simple after being split into legal sizes + * + * This does not do a thorough check so it misses some cases of a constant + * that would split into simple constants. + */ +bool ConstantLoader::isBigSimple() +{ + assert(!needFixingSimple() && + "simple case shall be fixed first before this call"); + if (isa(C)) + return true; // undef is simple + auto VT = dyn_cast(C->getType()); + if (!VT) + return true; // scalar always simple + if (C->getSplatValue()) + return true; // splat constant always simple + if (VT->getElementType()->getPrimitiveSizeInBits() == 1) + return true; // predicate constant always simple + return false; +} + +/*********************************************************************** + * ConstantLoader::isSimple : detect if a constant is "simple" + * + * A simple constant is one we know can be a constant operand in an instruction. + */ +bool ConstantLoader::isSimple() +{ + assert(!needFixingSimple() && + "simple case shall be fixed first before this call"); + if (isa(C)) + return true; // undef is simple (and generates no vISA code) + if (C->getType()->getScalarType()->isIntegerTy(1) && C->isAllOnesValue()) + return true; // all 1s predicate is simple + if(User && User->isBinaryOp()) + if (isa(C->getType())) + if (auto splat = C->getSplatValue()) + if (splat->isZeroValue()) + return true; + if (!isLegalSize()) + return false; // Simple constant must be legally sized + if (isBigSimple()) + return true; // a big simple constant that is legally sized is simple + if (isPackedIntVector()) + return true; + if (isPackedFloatVector()) + return true; + return false; +} + +/*********************************************************************** + * ConstantLoader::isPackedIntVector : check for a packed int vector + * (having already done the analysis in the ConstantLoader constructor) + */ +bool ConstantLoader::isPackedIntVector() +{ + // Check for a packed int vector. Either the element type must be i16, or + // the user (instruction using the constant) must be genx.constanti or + // wrregion or wrconstregion. Not allowed if the user is a logic op. 
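+  // For example, assuming the packed immediate holds eight 4-bit fields
+  // (ImmIntVec::MinUInt/MaxUInt of 0/15 and MinSInt/MaxSInt of -8/7),
+  // <8 x i16> <0, 3, 15, 7, 2, 9, 1, 4> qualifies as an unsigned packed
+  // vector (scale 1, adjust 0) and <8 x i16> <-8, -1, 7, 0, 3, -5, 2, 6>
+  // as a signed one (scale 1, adjust -8), as analyzeForPackedInt records.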
+ if (PackedIntScale == 1 && (PackedIntAdjust == 0 || PackedIntAdjust == -8)) { + if (!User) + return true; // user not specified -- assume it is a mov, so wrong element + // size is allowed + if (!C->getType()->getScalarType()->isIntegerTy(16) + && GenXIntrinsic::getGenXIntrinsicID(User) != GenXIntrinsic::genx_constanti + && !GenXIntrinsic::isWrRegion(User)) + return false; // wrong element size when it is not a mov + switch (User->getOpcode()) { + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return false; // disallow packed vector in logic op + default: + break; + } + return true; + } + return false; +} + +/*********************************************************************** + * ConstantLoader::isPackedFloatVector : check for a packed float vector + * (having already done the analysis in the ConstantLoader constructor) + */ +bool ConstantLoader::isPackedFloatVector() { + VectorType *VT = dyn_cast(C->getType()); + if (!VT) + return false; + if (VT->getNumElements() > 4) + return false; + return PackedFloat; +} + +/*********************************************************************** + * ConstantLoader::getConsolidatedConstant : get the consolidated constant + * for the given constant + * + * A "consolidated constant" is one where a vector of byte or short is + * turned into the equivalent (as if by bitcast) vector of int. + */ +Constant *ConstantLoader::getConsolidatedConstant(Constant *C) +{ + if (isa(C)) + return nullptr; + VectorType *VT = dyn_cast(C->getType()); + if (!VT) + return nullptr; + unsigned BytesPerElement = VT->getElementType()->getPrimitiveSizeInBits() / 8; + unsigned NumElements = VT->getNumElements(); + if (!BytesPerElement) + return nullptr; // vector of i1 + if (BytesPerElement >= 4) + return nullptr; // already vector of i32/i64/float/double + if (NumElements * BytesPerElement & 3) + return nullptr; // not a multiple of 4 bytes long + // We're loading a vector of byte or short (but not i1). Use int so the + // instruction does not use so many channels. This may also save it being + // split by legalization. + unsigned Compaction = BytesPerElement == 1 ? 4 : 2; + unsigned Mask = BytesPerElement == 1 ? 0xff : 0xffff; + SmallVector Elements; + Type *I32Ty = Type::getInt32Ty(C->getContext()); + for (unsigned i = 0; i != NumElements; i += Compaction) { + unsigned Val = 0; + bool IsUndef = true; + for (unsigned j = 0; j != Compaction; ++j) { + unsigned Bits = 0; + Constant *El = C->getAggregateElement(i + j); + // We assume that anything that is not ConstantInt is undefined. That + // can include a constant expression with an undefined value in the + // middle. + if (auto CI = dyn_cast(El)) { + Bits = CI->getSExtValue(); + IsUndef = false; + } + else if (auto CI = dyn_cast(El)) { + APFloat V = CI->getValueAPF(); + Bits = V.bitcastToAPInt().getZExtValue(); + IsUndef = false; + } + Val |= (Bits & Mask) << (j * BytesPerElement * 8); + } + if (IsUndef) + Elements.push_back(UndefValue::get(I32Ty)); + else + Elements.push_back(ConstantInt::get(I32Ty, Val)); + } + // Construct the constant with i32 element type. + return ConstantVector::get(Elements); +} + +/*********************************************************************** + * ConstantLoader::analyze : analyze a constant value + * + * This analyzes whether a constant of no more than the right vector width + * (integer 8 or fp 4) can be loaded as a packed vector, possibly scaled + * and adjusted. 
+ */ +void ConstantLoader::analyze() +{ + auto VT = dyn_cast(C->getType()); + if (!VT) + return; + if (C->getSplatValue()) + return; // don't analyze if already a splat + unsigned NumElements = VT->getNumElements(); + if (NumElements <= 8 && VT->getElementType()->isIntegerTy()) + analyzeForPackedInt(NumElements); + else if (NumElements <= 8 && VT->getElementType()->isFloatingPointTy()) + analyzeForPackedFloat(NumElements); +} + +void ConstantLoader::analyzeForPackedInt(unsigned NumElements) +{ + // Get element values. + int64_t Min = INT64_MAX; + int64_t Max = INT64_MIN; + SmallVector Elements; + Constant *SomeDefinedElement = nullptr; + for (unsigned i = 0; i != NumElements; ++i) { + auto El = C->getAggregateElement(i); + if (isa(El)) + continue; + SomeDefinedElement = El; + int64_t Element = cast(El)->getSExtValue(); + Elements.push_back(Element); + Min = std::min(Min, Element); + Max = std::max(Max, Element); + } + if (Elements.empty()) { + // Constant is undef. + assert(C == UndefValue::get(C->getType()) && + "constant consists only of undef elements only if it's undef itself"); + return; + } + if (Elements.size() == 1) { + // All but one element undef. Turn into a splat constant. + NewC = ConstantVector::getSplat(NumElements, SomeDefinedElement); + return; + } + if (Max - Min <= ImmIntVec::MaxUInt) { + if (Min >= ImmIntVec::MinUInt && Max <= ImmIntVec::MaxUInt) { + // Values all in the range [MinUInt..MaxUInt]. We can do this with a packed + // unsigned int with no extra scaling or adjustment. + PackedIntScale = 1; + PackedIntAdjust = 0; + PackedIntMax = Max; + return; + } + if (Min >= ImmIntVec::MinSInt && Max <= ImmIntVec::MaxSInt) { + // Values all in the range [MinSInt..MaxSInt]. We can do this with a packed + // unsigned int with no extra scaling or adjustment. + PackedIntScale = 1; + PackedIntAdjust = -8; + PackedIntMax = Max + 8; + return; + } + // Values all in the range [Min..Min+MaxUInt]. We can do this + // with a packed int with an adjustment. + PackedIntScale = 1; + PackedIntAdjust = Min; + PackedIntMax = Max - Min; + return; + } + // Get unique absolute differences, so we can detect if we have a valid + // packed int vector that is then scaled and has a splatted constant + // added/subtracted. + SmallVector Diffs; + SmallSet DiffsSet; + for (unsigned i = 0, e = Elements.size() - 1; i != e; ++i) { + Min = std::min(Min, Elements[i + 1]); + Max = std::max(Max, Elements[i + 1]); + int64_t Diff = Elements[i + 1] - Elements[i]; + if (!Diff) + continue; + if (Diff < 0) + Diff = -Diff; + if (Diff > UINT_MAX) + return; + if (DiffsSet.insert((unsigned)Diff).second) + Diffs.push_back((unsigned)Diff); + } + assert(!Diffs.empty() && "not expecting splatted constant"); + // Calculate the GCD (greatest common divisor) of the diffs using the binary + // GCD algorithm http://en.wikipedia.org/wiki/Binary_GCD_algorithm + unsigned GCD = Diffs[0]; + if (Diffs.size() > 1) { + // Remove factors of 2. + unsigned MaxPowerOfTwo = 31; + for (unsigned i = 0, e = Diffs.size(); i != e; ++i) + MaxPowerOfTwo = std::min(MaxPowerOfTwo, + (unsigned)countTrailingZeros(Diffs[i], ZB_Undefined)); + if (MaxPowerOfTwo) + for (unsigned i = 0, e = Diffs.size(); i != e; ++i) + Diffs[i] >>= MaxPowerOfTwo; + // Apply the rest of the binary GCD algorithm to Diffs[0] and Diffs[1] + // first, then to the (not yet scaled by the power of two) GCD so far + // and each other element of Diffs in turn. 
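+    // Worked example (illustrative): for <3 x i32> <1, 13, 43> the diffs are
+    // {12, 30}; removing the common factor of two gives MaxPowerOfTwo = 1 and
+    // Diffs = {6, 15}; the loop below reduces these to 3, which is scaled
+    // back up to GCD = 6, so the constant can be loaded as 6 * <0, 2, 7> + 1,
+    // with PackedIntScale = 6 and PackedIntAdjust = Min = 1.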
+ unsigned V = Diffs[0]; + for (unsigned i = 1, e = Diffs.size(); i != e; ++i) { + unsigned U = Diffs[i]; + for (;;) { + while (!(U & 1)) + U >>= 1; + while (!(V & 1)) + V >>= 1; + if (U == V) + break; + if (U < V) + std::swap(U, V); // make U >= V + U = (U - V) / 2; + } + } + // Scale the resulting GCD by the common power of two. + GCD = V << MaxPowerOfTwo; + } + if ((Max - Min) > GCD * ImmIntVec::MaxUInt) + return; // range of values too big. + PackedIntScale = GCD; + PackedIntMax = ImmIntVec::MaxUInt; + // Special case adjust of 0 or -8 as then we can save doing an adjust at all + // by using unsigned or signed packed vector respectively. + if (!(Min % GCD)) { + if (Min >= ImmIntVec::MinUInt && Max <= GCD * ImmIntVec::MaxUInt) { + PackedIntAdjust = ImmIntVec::MinUInt; + return; + } + if (Min >= ImmIntVec::MinSInt * GCD && Max <= ImmIntVec::MaxSInt * GCD) { + PackedIntAdjust = Min; + PackedIntMax = ImmIntVec::MaxSInt; + return; + } + // Special case all pre-scaled values being in [-15,0] as we can do that + // by negating the scale and not needing to adjust. + if (Min >= -ImmIntVec::MaxUInt * GCD && Max <= -ImmIntVec::MinUInt) { + PackedIntAdjust = ImmIntVec::MinUInt; + PackedIntScale = -PackedIntScale; + return; + } + } + PackedIntAdjust = Min; +} + +static bool is8bitPackedFloat(float f) { + union { + float f; + unsigned u; + } u; + + u.f = f; + unsigned Exp = (u.u >> 23) & 0xFF; + unsigned Frac = u.u & 0x7FFFFF; + if (Exp == 0 && Frac == 0) + return true; + if (Exp < 124 || Exp > 131) + return false; + if ((Frac & 0x780000) != Frac) + return false; + Frac >>= 19; + if (Exp == 124 && Frac == 0) + return false; + return true; +} + +void ConstantLoader::analyzeForPackedFloat(unsigned NumElements) { + for (unsigned i = 0; i != NumElements; ++i) { + auto Elt = C->getAggregateElement(i); + if (isa(Elt)) + continue; + ConstantFP *CFP = dyn_cast(Elt); + // Bail out if any element cannot be analyzed. + if (!CFP) + return; + const APFloat &FP = CFP->getValueAPF(); + // Bail out if it's not supported. + // TODO: Only support single precision so far. + if (&FP.getSemantics() != &APFloat::IEEEsingle()) + return; + // Bail out if it's not finite. + if (!FP.isFinite()) + return; + // Check if it could be represented in 8-bit packed float. + if (!is8bitPackedFloat(FP.convertToFloat())) + return; + } + PackedFloat = true; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.h new file mode 100644 index 000000000000..83fd5879a323 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXConstants.h @@ -0,0 +1,135 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +#ifndef GENX_CONSTANTS_H +#define GENX_CONSTANTS_H + +#include "GenXSubtarget.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" + +namespace llvm { +namespace genx { + +// ConstantLoader : class to insert instruction(s) to load a constant +class ConstantLoader { + Constant *C; + Instruction *User; + // NewC != nullptr signals that we should replace C with NewC in User + // nothing to do otherwise + Constant *NewC = nullptr; + // AddedInstructions: a vector that the caller has requested any added + // instructions to be pushed in to. + SmallVectorImpl *AddedInstructions; + // Info from analyzing for possible packed vector constant. + int PackedIntScale = 0; // amount to scale packed int vector by + int64_t PackedIntAdjust; // amount to adjust by, special casing 0 or -8 + // when PackedIntScale is 1 + unsigned PackedIntMax; // max value in packed vector, used when scale is + // 1 and adjust is 0 to tell whether it would fit + // in 0..7 + bool PackedFloat = false; + +public: + // Constructor + // User = the instruction that uses the constant. If this is genx.constanti, + // then a packed vector constant can be an isSimple() constant even + // when the element type is not i16. Also used to disallow a packed + // vector constant in a logic op. If User==0 then it is assumed that + // a packed vector constant with an element type other than i16 is OK. + // AddedInstructions = vector to add new instructions to when loading a + // non simple constant, so the caller can see all the newly added + // instructions. + ConstantLoader(Constant *C, Instruction *User = nullptr, + SmallVectorImpl *AddedInstructions = nullptr, + const GenXSubtarget *Subtarget = nullptr) + : C(C), User(User), AddedInstructions(AddedInstructions), + Subtarget(Subtarget) { + analyze(); + } + ConstantLoader(Constant *C, const GenXSubtarget *Subtarget) + : ConstantLoader(C, nullptr, nullptr, Subtarget) {} + Instruction *load(Instruction *InsertBefore); + Instruction *loadBig(Instruction *InsertBefore); + Instruction *loadNonSimple(Instruction *InsertBefore); + bool needFixingSimple() const { return NewC; } + void fixSimple(int OperandIdx); + bool isBigSimple(); + bool isSimple(); + bool isLegalSize(); + +private: + const GenXSubtarget *Subtarget; + bool isPackedIntVector(); + bool isPackedFloatVector(); + void analyze(); + Constant *getConsolidatedConstant(Constant *C); + unsigned getRegionBits(unsigned NeededBits, unsigned OptionalBits, + unsigned VecWidth); + void analyzeForPackedInt(unsigned NumElements); + void analyzeForPackedFloat(unsigned NumElements); + Instruction *loadSplatConstant(Instruction *InsertPos); +}; + +// Some instructions force their operands to be constants. +// Check here if operand of instruction must be constant. +inline bool opMustBeConstant(Instruction *I, unsigned OpNum) { + // Mask of shufflevector should always be constant. + if (isa(I)) + return OpNum == 2; + return false; +} + +// Load a constant using the llvm.genx.constant intrinsic. 
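+// Typical usage of ConstantLoader, as an illustrative sketch only (Inst, C,
+// OpIdx and ST stand for whatever instruction, constant operand, operand
+// index and subtarget the caller already has; the needFixingSimple() case is
+// ignored for brevity):
+//   SmallVector<Instruction *, 8> Added;
+//   ConstantLoader CL(C, Inst, &Added, ST);
+//   if (!CL.isSimple())
+//     Inst->setOperand(OpIdx, CL.loadNonSimple(Inst));
+// The helpers below wrap this pattern for the common cases.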
+inline Instruction * +loadConstant(Constant *C, Instruction *InsertBefore, + SmallVectorImpl *AddedInstructions = nullptr) { + return ConstantLoader(C, nullptr, AddedInstructions).load(InsertBefore); +} + +// Load non-simple constants used in an instruction. +bool loadNonSimpleConstants( + Instruction *Inst, + SmallVectorImpl *AddedInstructions = nullptr, + const GenXSubtarget *Subtarget = nullptr); + +bool loadConstantsForInlineAsm( + CallInst *Inst, SmallVectorImpl *AddedInstructions = nullptr, + const GenXSubtarget *Subtarget = nullptr); + +// Load constants used in an instruction. +bool loadConstants(Instruction *Inst, const GenXSubtarget *Subtarget = nullptr); + +// Load constants used in phi nodes in a function. +bool loadPhiConstants(Function *F, DominatorTree *DT, + bool ExcludePredicate = false, + const GenXSubtarget *Subtarget = nullptr); + +} // namespace genx +} // namespace llvm + +#endif // GENX_CONSTANTS_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXDeadVectorRemoval.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXDeadVectorRemoval.cpp new file mode 100644 index 000000000000..a0309007c373 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXDeadVectorRemoval.cpp @@ -0,0 +1,746 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXDeadVectorRemoval +/// --------------------- +/// +/// GenXDeadVectorRemoval is an aggressive dead code removal pass that analyzes +/// individual elements of a vector rather than whole values. +/// +/// As a result of this analysis, the pass can then make the two following +/// modifications to the code: +/// +/// 1. If all vector elements of an instruction result turn out to be unused, the +/// instruction is removed. In fact, this pass just sets all its uses to +/// undef, relying on the subsequent dead code removal pass to actually +/// remove it. +/// +/// 2. If all vector elements of the "old value" input (even a constant) of a +/// wrregion turn out to be unused, then that input is set to undef. This +/// covers further cases over (1) above: +/// +/// a. the "old value" input is constant, and we want to turn it into undef +/// to save a useless constant load; +/// +/// b. 
the "old value" input is an instruction that does have elements used +/// elsewhere, and we want to turn it into undef to detach the two webs +/// of defs and uses from each other to reduce register pressure in +/// between. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_DEAD_VECTOR_REMOVAL" + +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXRegion.h" +#include "GenXUtil.h" + +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +#include +#include + +using namespace llvm; +using namespace genx; +using namespace GenXIntrinsic::GenXRegion; + +static cl::opt LimitGenXDeadVectorRemoval("limit-genx-dead-vector-removal", cl::init(UINT_MAX), cl::Hidden, + cl::desc("Limit GenX dead element removal.")); + +namespace { + +// LiveBitsStorage : encapsulate how live bits for a vector value are stored +// For 31/63 elements or fewer, the bitmap is inside the LiveBitsStorage +// object. For 32/64 elements or more, the bitmap is separately allocated. +class LiveBitsStorage { + uintptr_t V; +public: + LiveBitsStorage() : V(0) {} + ~LiveBitsStorage() { + if (auto P = getExternal()) + delete[] P; + V = 0; + } +private: + // getExternal : get the external pointer, 0 if none + // Whether we have an external pointer is encoded in the top bit. + // The pointer itself is shifted down one and stored in the other bits. + uintptr_t *getExternal() { + if ((intptr_t)V >= 0) + return nullptr; // top bit not set, not external + return (uintptr_t *)(V * 2); + } + // setExternal : set the external pointer + void setExternal(uintptr_t *P) { + assert(!getExternal()); + V = (uintptr_t)P >> 1 | (uintptr_t)1U << (sizeof(uintptr_t) * 8 - 1); + } +public: + // setNumElements : set the number of elements to be stored in this + // LiveBitsStorage. Allocate external storage if necessary. 
+ void setNumElements(unsigned NumElements) { + if (NumElements >= sizeof(uintptr_t) * 8 - 1) { + unsigned Size = NumElements + sizeof(uintptr_t) * 8 - 1 + / (sizeof(uintptr_t) * 8); + setExternal(new uintptr_t[Size]); + memset(getExternal(), 0, Size * sizeof(uintptr_t)); + } + } + // get : get the pointer to the bitmap + uintptr_t *get() { + if (auto P = getExternal()) + return P; + return &V; + } +}; + +// LiveBits : encapsulate a pointer to a bitmap of element liveness and its size +class LiveBits { + uintptr_t *P; + unsigned NumElements; +public: + static const unsigned BitsPerWord = sizeof(uintptr_t) * 8; + LiveBits() : P(nullptr), NumElements(0) {} + LiveBits(LiveBitsStorage *LBS, unsigned NumElements) + : P(LBS->get()), NumElements(NumElements) {} + // getNumElements : get the number of elements in this bitmap + unsigned getNumElements() const { return NumElements; } + // get : get a bit value + bool get(unsigned Idx) const { + assert(Idx < NumElements); + return P[Idx / BitsPerWord] >> (Idx % BitsPerWord) & 1; + } + // isAllZero : return true if all bits zero + bool isAllZero() const; + // set : set a bit value + // Returns true if value changed + bool set(unsigned Idx, bool Val = true); + // copy : copy all bits from another LiveBits + // Returns true if value changed + bool copy(LiveBits Src); + // orBits : or all bits from another LiveBits into this one + // Returns true if value changed + bool orBits(LiveBits Src); + // setRange : set range of bits, returning true if any changed + bool setRange(unsigned Start, unsigned Len); + // debug print + void print(raw_ostream &OS) const; +}; + +#ifndef NDEBUG +static raw_ostream &operator<<(raw_ostream &OS, const LiveBits &LB) { + LB.print(OS); + return OS; +} +#endif + +// GenXDeadVectorRemoval : dead vector element removal pass +class GenXDeadVectorRemoval : public FunctionPass { + std::map InstMap; + std::set WorkListSet; + std::queue WorkList; + std::set WrRegionsWithUsedOldInput; + bool WorkListPhase; +public: + static char ID; + explicit GenXDeadVectorRemoval() : FunctionPass(ID) { } + virtual StringRef getPassName() const { return "GenX dead vector element removal pass"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F); +private: + void clear() { + InstMap.clear(); + WorkListSet.clear(); + assert(WorkList.empty()); + WrRegionsWithUsedOldInput.clear(); + } + bool nullOutInstructions(Function *F); + void processInst(Instruction *Inst); + void processRdRegion(Instruction *Inst, LiveBits LB); + void processWrRegion(Instruction *Inst, LiveBits LB); + void processBitCast(Instruction *Inst, LiveBits LB); + void processElementwise(Instruction *Inst, LiveBits LB); + void markWhollyLive(Value *V); + void addToWorkList(Instruction *Inst); + LiveBits getLiveBits(Instruction *Inst, bool Create = false); +}; + +} // end anonymous namespace + + +char GenXDeadVectorRemoval::ID = 0; +namespace llvm { void initializeGenXDeadVectorRemovalPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXDeadVectorRemoval, "GenXDeadVectorRemoval", "GenXDeadVectorRemoval", false, false) +INITIALIZE_PASS_END(GenXDeadVectorRemoval, "GenXDeadVectorRemoval", "GenXDeadVectorRemoval", false, false) + +FunctionPass *llvm::createGenXDeadVectorRemovalPass() +{ + initializeGenXDeadVectorRemovalPass(*PassRegistry::getPassRegistry()); + return new GenXDeadVectorRemoval(); +} + +void GenXDeadVectorRemoval::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.setPreservesCFG(); +} + 
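+// As a small illustration of what the analysis below achieves: if a wrregion
+// writes elements 8..15 of a <16 x i32> value and the only uses of its result
+// are rdregions reading elements 0..7, then no element of its "new value"
+// input is live, and nullOutInstructions() bypasses the wrregion with its
+// "old value" input; an instruction with no live elements at all instead has
+// its uses replaced by undef.
+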
+/*********************************************************************** + * isRootInst : check if this is a "root" instruction, one that we want to + * keep even if unused + */ +static bool isRootInst(Instruction *Inst) { + if (isa(Inst) || isa(Inst) || + Inst->isTerminator() || Inst->mayHaveSideEffects()) + return true; + if (auto CI = dyn_cast(Inst)) + return !CI->onlyReadsMemory(); + return false; +} + +/*********************************************************************** + * GenXDeadVectorRemoval::runOnFunction : process one function + */ +bool GenXDeadVectorRemoval::runOnFunction(Function &F) +{ + // First scan all the code to compute the initial live set + WorkListPhase = false; + for (po_iterator i = po_begin(&F.getEntryBlock()), + e = po_end(&F.getEntryBlock()); i != e; ++i) { + BasicBlock *BB = *i; + for (Instruction *Inst = BB->getTerminator(); Inst;) { + if (isRootInst(Inst)) + processInst(Inst); + else if (WorkListSet.count(Inst)) { + if (!isa(Inst)) + WorkListSet.erase(Inst); + processInst(Inst); + } + Inst = (Inst == &BB->front()) ? nullptr : Inst->getPrevNode(); + } + } + + WorkListPhase = true; + // initialize the worklist + for (auto Inst : WorkListSet) { + WorkList.push(Inst); + } + // process until the work list is empty. + LLVM_DEBUG(dbgs() << "GenXDeadVectorRemoval: process work list\n"); + while (!WorkList.empty()) { + Instruction *Inst = WorkList.front(); + WorkList.pop(); + WorkListSet.erase(Inst); + processInst(Inst); + } + // Null out unused instructions so the subsequent dead code removal pass + // removes them. + LLVM_DEBUG(dbgs() << "GenXDeadVectorRemoval: null out instructions\n"); + bool Modified = nullOutInstructions(&F); + clear(); + return Modified; +} + +/*********************************************************************** + * nullOutInstructions : null out unused instructions so the subsequent dead + * code removal pass removes them + * + * For wrregion, there are two special cases: + * - when no elements in the "new value" input of a wrregion are use, + * then bypass the wrregion with the "old value". + * - when no elements in the "old value" input of a wrregion are used, + * then changes the input to undef. + */ +bool GenXDeadVectorRemoval::nullOutInstructions(Function *F) +{ + static unsigned Count = 0; + bool Modified = false; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + for (auto bi = fi->begin(), be = fi->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + // Ignore "root" instructions. + if (isRootInst(Inst)) + continue; + // See if the instruction has no used elements. If so, null out its uses. + auto LB = getLiveBits(Inst); + if (LB.isAllZero()) { + if (++Count > LimitGenXDeadVectorRemoval) + return Modified; + if (LimitGenXDeadVectorRemoval != UINT_MAX) + dbgs() << "-limit-genx-dead-vector-removal " << Count << "\n"; + LLVM_DEBUG(if (!Inst->use_empty()) + dbgs() << "nulled out uses of " << *Inst << "\n"); + while (!Inst->use_empty()) { + Use *U = &*Inst->use_begin(); + *U = UndefValue::get((*U)->getType()); + } + Modified = true; + } else if (GenXIntrinsic::isWrRegion(Inst)) { + if (!Inst->use_empty()) { + auto *SI = dyn_cast(Inst->user_back()); + if (SI && genx::isGlobalStore(SI)) { + assert(Inst->hasOneUse() && + "Wrregion in gstore bale has more than one use"); + continue; + } + } + // Otherwise, for a wrregion, check if it is in the old input used set. 
+ // If not, then no element of the "old value" input is used by this + // instruction (even if it has bits set from other uses), and we can + // undef out the input. + Use *U = &Inst->getOperandUse(GenXIntrinsic::GenXRegion::OldValueOperandNum); + if (WrRegionsWithUsedOldInput.find(Inst) + == WrRegionsWithUsedOldInput.end()) { + if (!isa(*U)) { + if (++Count > LimitGenXDeadVectorRemoval) + return Modified; + if (LimitGenXDeadVectorRemoval != UINT_MAX) + dbgs() << "-limit-genx-dead-vector-removal " << Count << "\n"; + *U = UndefValue::get((*U)->getType()); + LLVM_DEBUG(dbgs() << "null out old value input in " << *Inst << "\n"); + Modified = true; + } + } + // when no elements in the "new value" input of a wrregion are use, + // then bypass the wrregion with the "old value". + bool bypass = true; + Region R(Inst, BaleInfo()); + if (R.Mask || R.Indirect) + bypass = false; + else { + for (unsigned RowIdx = R.Offset / R.ElementBytes, Row = 0, + NumRows = R.NumElements / R.Width; Row != NumRows && bypass; + RowIdx += R.VStride, ++Row) { + for (unsigned Idx = RowIdx, Col = 0; Col != R.Width && bypass; + Idx += R.Stride, ++Col) { + if (Idx < LB.getNumElements() && LB.get(Idx)) + bypass = false; + } + } + } + if (bypass) { + Inst->replaceAllUsesWith(Inst->getOperandUse(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + Modified = true; + } + } + } + } + return Modified; +} + +/*********************************************************************** + * processInst : process an instruction in the dead element removal pass + */ +void GenXDeadVectorRemoval::processInst(Instruction *Inst) +{ + LLVM_DEBUG(dbgs() << " " << *Inst << "\n has bits " << getLiveBits(Inst) << "\n"); + if (isRootInst(Inst)) { + // This is a "root" instruction. Mark its inputs as wholly live. + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) + markWhollyLive(Inst->getOperand(oi)); + return; + } + // Check for the result of the instruction not being used at all. + auto LB = getLiveBits(Inst); + if (!LB.getNumElements()) + return; + // Handle phi node. + if (auto Phi = dyn_cast(Inst)) { + processElementwise(Phi, LB); + return; + } + // Special case for bitcast. + if (auto BC = dyn_cast(Inst)) { + processBitCast(BC, LB); + return; + } + // Check for element-wise instructions. + if (isa(Inst) || isa(Inst) + || isa(Inst) || isa(Inst)) { + processElementwise(Inst, LB); + return; + } + // Check for rdregion and wrregion. + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_rdregionf: + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdpredregion: + processRdRegion(Inst, LB); + return; + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrconstregion: + case GenXIntrinsic::genx_wrpredregion: + processWrRegion(Inst, LB); + return; + default: + break; + } + // For any other instruction, just mark all operands as wholly live. 
+  for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi)
+    markWhollyLive(Inst->getOperand(oi));
+}
+
+/***********************************************************************
+ * processRdRegion : process a rdregion instruction for element liveness
+ */
+void GenXDeadVectorRemoval::processRdRegion(Instruction *Inst, LiveBits LB)
+{
+  auto InInst = dyn_cast<Instruction>(
+      Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum));
+  Region R(Inst, BaleInfo());
+  if (R.Indirect) {
+    markWhollyLive(InInst);
+    markWhollyLive(Inst->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum));
+    return;
+  }
+  if (!InInst)
+    return;
+  // Set bits in InLB (InInst's livebits) for live elements read by the
+  // rdregion.
+  bool Modified = false;
+  LiveBits InLB = getLiveBits(InInst, /*Create=*/true);
+  for (unsigned RowIdx = R.Offset / R.ElementBytes, Row = 0,
+      NumRows = R.NumElements / R.Width; Row != NumRows;
+      RowIdx += R.VStride, ++Row)
+    for (unsigned Idx = RowIdx, Col = 0; Col != R.Width; Idx += R.Stride, ++Col)
+      if (LB.get(Row * R.Width + Col))
+        if (Idx < InLB.getNumElements())
+          Modified |= InLB.set(Idx);
+  if (Modified)
+    addToWorkList(InInst);
+}
+
+/***********************************************************************
+ * processWrRegion : process a wrregion instruction for element liveness
+ */
+void GenXDeadVectorRemoval::processWrRegion(Instruction *Inst, LiveBits LB)
+{
+  Region R(Inst, BaleInfo());
+  if (R.Mask)
+    markWhollyLive(Inst->getOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum));
+  auto NewInInst = dyn_cast<Instruction>(
+      Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum));
+  if (R.Indirect) {
+    markWhollyLive(NewInInst);
+    markWhollyLive(Inst->getOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum));
+  } else if (NewInInst) {
+    // Set bits in NewInLB (NewInInst's livebits) for live elements read by
+    // the wrregion in the "new value" input.
+    bool Modified = false;
+    LiveBits NewInLB = getLiveBits(NewInInst, /*Create=*/true);
+    for (unsigned RowIdx = R.Offset / R.ElementBytes, Row = 0,
+        NumRows = R.NumElements / R.Width; Row != NumRows;
+        RowIdx += R.VStride, ++Row)
+      for (unsigned Idx = RowIdx, Col = 0; Col != R.Width;
+          Idx += R.Stride, ++Col)
+        if (Idx < LB.getNumElements() && LB.get(Idx))
+          Modified |= NewInLB.set(Row * R.Width + Col);
+    if (Modified)
+      addToWorkList(NewInInst);
+  }
+  // For the "old value" input, we want to see if any elements are used even if
+  // the input is a constant, since we want to be able to turn it into undef
+  // later on if it is not used. In the non-instruction case, OldInLB is left
+  // in a state where it contains no bits and OldInLB.getNumElements() is 0.
+  LiveBits OldInLB;
+  auto OldInInst = dyn_cast<Instruction>(
+      Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum));
+  if (OldInInst)
+    OldInLB = getLiveBits(OldInInst, /*Create=*/true);
+  bool Modified = false;
+  bool UsedOldInput = false;
+  if (R.Indirect) {
+    if (OldInLB.getNumElements())
+      Modified = OldInLB.orBits(LB);
+    UsedOldInput = true;
+  } else {
+    // Set bits in OldInLB (OldInInst's livebits) for live elements read by the
+    // wrregion in the "old value" input, excluding ones that come from the
+    // "new value" input.
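+    // For example (illustrative): a non-predicated wrregion writing a
+    // width 2, stride 1 region starting at element 2 of an <8 x i32> value
+    // takes elements 2 and 3 of its result from the "new value" input and
+    // the rest from the "old value" input, so of the result's live bits only
+    // 0, 1 and 4..7 propagate to the "old value" input here.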
+ unsigned NextRow = 0, NextCol = 0, NextIdx = R.Offset / R.ElementBytes, + NextRowIdx = NextIdx, NumRows = R.NumElements / R.Width; + for (unsigned Idx = 0, End = LB.getNumElements(); Idx != End; ++Idx) { + if (Idx == NextIdx) { + // This element comes from the "new value" input, unless the wrregion + // is predicated in which case it could come from either. + if (R.Mask && LB.get(Idx)) { + UsedOldInput = true; + if (OldInLB.getNumElements()) + Modified |= OldInLB.set(Idx); + } + if (++NextCol == R.Width) { + if (++NextRow == NumRows) + NextIdx = End; + else + NextIdx = NextRowIdx += R.VStride; + NextCol = 0; + } else + NextIdx += R.Stride; + } else { + // This element comes from the "old value" input. + if (LB.get(Idx)) { + UsedOldInput = true; + if (OldInLB.getNumElements()) + Modified |= OldInLB.set(Idx); + } + } + } + } + if (Modified) + addToWorkList(OldInInst); + if (UsedOldInput) { + // We know that at least one element of the "old value" input is used, + // so add the wrregion to the used old input set. + WrRegionsWithUsedOldInput.insert(Inst); + } +} + +/*********************************************************************** + * processBitCast : process a bitcast instruction for element liveness + */ +void GenXDeadVectorRemoval::processBitCast(Instruction *Inst, LiveBits LB) +{ + auto InInst = dyn_cast(Inst->getOperand(0)); + if (!InInst) + return; + LiveBits InLB = getLiveBits(InInst, /*Create=*/true); + bool Modified = false; + if (InLB.getNumElements() == LB.getNumElements()) + Modified = InLB.orBits(LB); + else if (InLB.getNumElements() > LB.getNumElements()) { + assert((InLB.getNumElements() % LB.getNumElements()) == 0); + int Scale = InLB.getNumElements() / LB.getNumElements(); + // Input element is smaller than result element. + for (unsigned Idx = 0, End = LB.getNumElements(); Idx != End; ++Idx) + if (LB.get(Idx)) + Modified |= InLB.setRange(Idx * Scale, Scale); + } else { + assert((LB.getNumElements() % InLB.getNumElements()) == 0); + int Scale = LB.getNumElements() / InLB.getNumElements(); + // Input element is bigger than result element. 
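+    // e.g. (illustrative) for %r = bitcast <2 x i64> %v to <8 x i16>, Scale
+    // is 4 and element 1 of %v is marked live if any of elements 4..7 of %r
+    // is live.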
+ for (unsigned Idx = 0, End = InLB.getNumElements(); Idx != End; ++Idx) { + bool IsSet = false; + for (unsigned Idx2 = 0; Idx2 != Scale; ++Idx2) + IsSet |= LB.get(Idx*Scale | Idx2); + if (IsSet) + Modified |= InLB.set(Idx); + } + } + if (Modified) + addToWorkList(InInst); +} + +/*********************************************************************** + * processElementwise : process an element-wise instruction such as add or + * a phi node + */ +void GenXDeadVectorRemoval::processElementwise(Instruction *Inst, LiveBits LB) +{ + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) { + auto OpndInst = dyn_cast(Inst->getOperand(oi)); + if (!OpndInst) + continue; + auto OpndLB = getLiveBits(OpndInst, /*Create=*/true); + if (isa(Inst) && oi == 0 && + !OpndInst->getType()->isVectorTy()) { + // First operand of select inst can be scalar, ignore it + markWhollyLive(OpndInst); + continue; + } + + if (OpndLB.orBits(LB)) + addToWorkList(OpndInst); + } +} + +/*********************************************************************** + * markWhollyLive : mark a value as wholly live (all elements live) + */ +void GenXDeadVectorRemoval::markWhollyLive(Value *V) +{ + auto Inst = dyn_cast_or_null(V); + if (!Inst) + return; + auto LB = getLiveBits(Inst, /*Create=*/true); + if (LB.setRange(0, LB.getNumElements())) + addToWorkList(Inst); +} + +/*********************************************************************** + * addToWorkList : add instruction to work list if not already there + * + * Enter: Inst = the instruction + * + * This does not actually add to the work list in the initial scan through + * the whole code. + */ +void GenXDeadVectorRemoval::addToWorkList(Instruction *Inst) +{ + LLVM_DEBUG(dbgs() << " " << Inst->getName() << " now " << getLiveBits(Inst) << "\n"); + if (WorkListSet.insert(Inst).second && WorkListPhase) { + LLVM_DEBUG(dbgs() << " adding " << Inst->getName() << " to work list\n"); + WorkList.push(Inst); + } +} + +/*********************************************************************** + * getLiveBits : get the bitmap of live elements for the given instruction + * + * Return: LiveBits object, which contains a pointer to the bitmap for + * this instruction, and a size which is set to 0 if there is no + * bitmap allocated yet for this instruction and Create is false + */ +LiveBits GenXDeadVectorRemoval::getLiveBits(Instruction *Inst, bool Create) +{ + unsigned NumElements = 1; + if (auto VT = dyn_cast(Inst->getType())) + NumElements = VT->getNumElements(); + LiveBitsStorage *LBS = nullptr; + if (!Create) { + auto i = InstMap.find(Inst); + if (i == InstMap.end()) + return LiveBits(); + LBS = &i->second; + } else { + auto Ret = InstMap.insert(std::map::value_type(Inst, LiveBitsStorage())); + LBS = &Ret.first->second; + if (Ret.second) { + // New entry. Set its number of elements. 
+ LBS->setNumElements(NumElements); + } + } + return LiveBits(LBS, NumElements); +} + +/*********************************************************************** + * LiveBits::isAllZero : return true if all bits zero + */ +bool LiveBits::isAllZero() const +{ + for (unsigned Idx = 0, End = (NumElements + BitsPerWord - 1) / BitsPerWord; + Idx != End; ++Idx) + if (P[Idx]) + return false; + return true; +} + +/*********************************************************************** + * LiveBits::set : set (or clear) bit + * + * Enter: Idx = element number + * Val = true to set, false to clear, default true + * + * Return: true if the bitmap changed + */ +bool LiveBits::set(unsigned Idx, bool Val) +{ + assert(Idx < NumElements); + uintptr_t *Ptr = P + Idx / BitsPerWord; + uintptr_t Bit = 1ULL << (Idx % BitsPerWord); + uintptr_t Entry = *Ptr; + if (Val) + Entry |= Bit; + else + Entry &= ~Bit; + bool Ret = Entry != *Ptr; + *Ptr = Entry; + return Ret; +} + +/*********************************************************************** + * LiveBits::copy : copy all bits from another LiveBits + */ +bool LiveBits::copy(LiveBits Src) +{ + assert(NumElements == Src.NumElements); + bool Modified = false; + for (unsigned Idx = 0, End = (NumElements + BitsPerWord - 1) / BitsPerWord; + Idx != End; ++Idx) { + Modified |= P[Idx] != Src.P[Idx]; + P[Idx] = Src.P[Idx]; + } + return Modified; +} + +/*********************************************************************** + * LiveBits::orBits : or all bits from another LiveBits into this one + */ +bool LiveBits::orBits(LiveBits Src) +{ + assert(NumElements == Src.NumElements); + bool Modified = false; + for (unsigned Idx = 0, End = (NumElements + BitsPerWord - 1) / BitsPerWord; + Idx != End; ++Idx) { + uintptr_t Word = P[Idx] | Src.P[Idx]; + Modified |= P[Idx] != Word; + P[Idx] = Word; + } + return Modified; +} + +/*********************************************************************** + * LiveBits::setRange : set range of bits, returning true if any changed + */ +bool LiveBits::setRange(unsigned Start, unsigned Len) +{ + bool Modified = false; + unsigned End = Start + Len; + assert(End <= NumElements); + while (Start != End) { + unsigned ThisLen = BitsPerWord - (Start & (BitsPerWord - 1)); + if (ThisLen > End - Start) + ThisLen = End - Start; + uintptr_t *Entry = P + (Start / BitsPerWord); + uintptr_t Updated = *Entry + | ((uintptr_t)-1LL >> (BitsPerWord - ThisLen)) + << (Start & (BitsPerWord - 1)); + if (Updated != *Entry) { + Modified = true; + *Entry = Updated; + } + Start += ThisLen; + } + return Modified; +} + +/*********************************************************************** + * LiveBits::print : debug print + */ +void LiveBits::print(raw_ostream &OS) const +{ + for (unsigned Idx = 0, End = getNumElements(); Idx != End; ++Idx) + OS << get(Idx); +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXDepressurizer.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXDepressurizer.cpp new file mode 100644 index 000000000000..9b9a24d89690 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXDepressurizer.cpp @@ -0,0 +1,1662 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies 
of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXDepressurizer +/// ----------------- +/// +/// GenXDepressurizer is a pass that identifies where register pressure is +/// excessive, and attempts to sink and/or clone definitions past that area to +/// reduce register pressure. +/// +/// Currently the pass is enabled to handle only flag (predicate) values. It is +/// supposed to work for general values, but that is not yet enabled and it may +/// require some bug fixing and fine tuning before it is. +/// +/// In fact this pass is now viewed as a dead end. The plan to replace it is a +/// pass that does register allocation as if into Gen's real registers, doing +/// live range splitting and rematerialization where required, to help undo the +/// register-pressure-increasing effects of CSE and LICM where it would cause a +/// spill. +/// +/// The basic idea of the existing GenXDepressurizer pass: +/// +/// 1. Scan the code backwards, keeping track of what values are live and what +/// the register pressure is (total size of all live values, also the total +/// size for flag (predicate) values). +/// +/// 2. Where register pressure becomes excessive, look at currently live values +/// to see if any is a definition that could profitably be sunk to below the +/// current point. +/// +/// 3. Sink any such instructions until register pressure is no longer +/// excessive. +/// +/// 4. For a flag value, "profitably be sunk" includes the case that it +/// decreases flag register pressure but increases overall register pressure +/// (by, for instance, lengthening the live ranges of the inputs to a cmp), +/// but general register pressure is not high at the current point. +/// +/// 5. A flag value that does not require cloning (all uses are dominated by the +/// current point) is sunk anyway, as long as it does not push an already +/// high general pressure up higher. +/// +/// Point 5 means that this pass replaces GenXCodeSinking, which sank any single +/// use flag value. +/// +/// There are some complications to the scheme: +/// +/// * How do we scan code backwards in a way that keeps track of pressure when +/// there is control flow, particularly loops? +/// +/// * When considering a definition to sink, we need to know whether a +/// particular use is reachable from the current point, and whether it is +/// dominated by it. +/// +/// Backwards scanning order and pseudo CFG +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// In order to keep track of liveness and pressure as we scan backwards, we +/// want to scan the basic blocks in an order that ensures that we do not scan +/// a particular basic block until we have scanned all its successors. 
In that +/// way we can easily gather the live out set of the basic block from the live +/// in of each successor, modified by the incoming for our block in the phi +/// nodes in the successor. (If there are phi nodes, there is only one +/// successor, because critical edges have been split.) +/// +/// A loop needs special consideration. We want to scan all of the blocks of a +/// loop (including inner loops) in one go, after scanning all possible +/// successors of the loop, and before scanning the predecessor(s) of the loop +/// header. Within the loop, we want to start at the backedge predecessor(s), +/// but we need to set up the liveness at the end of a backedge predecessor to +/// take account of +/// +/// a. any value that is live in to the loop and live out of the loop at some +/// loop exit, and +/// +/// b. any value that is defined in the loop and is live round the backedge. +/// +/// Superbales +/// ^^^^^^^^^^ +/// +/// Sinking is performed in units of a superbale. +/// +/// For a general value, a superbale is the bale that defines the value, and, +/// if that is a wrregion, the rest of the chain of wrregion bales that write +/// to other parts of that value and have the same inputs as the defining bale. +/// We consider such a superbale as a whole because considering and sinking +/// just the bale would not show any benefit, because it has an input to the +/// wrregion the same size as the result. Such a chain of wrregions typically +/// arises from legalization where vector decomposing has not subsequently been +/// able to split the big vector up. +/// +/// For a flag value, a superbale is a tree where each non-leaf node is an +/// and/or/xor/not instruction acting on predicates. Again this is done because +/// sinking just an and/or/xor/not instruction would not show any benefit to +/// flag pressure. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_DEPRESSURIZER" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXGotoJoin.h" +#include "GenXIntrinsics.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +static cl::opt + LimitGenXDepressurizer("limit-genx-depressurizer", cl::init(UINT_MAX), + cl::Hidden, cl::desc("Limit GenX depressurizer.")); + +STATISTIC(NumSunk, "Number of instructions sunk"); +STATISTIC(NumCloned, "Number of instructions cloned"); + +namespace { + +// PseudoCFG : the pseudo control flow graph for a function +// +// The pseudo CFG is a graph of the basic blocks in a function, similar to the +// real CFG, but with the following differences: +// * It is acyclic +// * Therefore there are no loop backedges +// * What was a loop backedge in the real CFG is replaced by special "loop +// exit" edges from what was the loop backedge predecessor of the loop +// header to each loop exit block, also pointing to the loop header. +// * This only works if the real CFG is reducible. Any unnatural loops in the +// real CFG are probably not properly represented in the pseudoCFG. 
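+//
+// As an illustrative example, for a simple natural loop
+//     entry -> head; head -> body; head -> exit; body -> head
+// the pseudo CFG drops the backedge body -> head and instead gives body a
+// "loop exit" edge to exit that also records head as the loop header, so the
+// graph becomes the acyclic entry -> head -> body -> exit (with exit still a
+// successor of head as well).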
+// +// The pseudo CFG also provides an ordering of blocks such that a block is not +// visited until all its predecessors have been. Because of the above changes +// in the pseudo CFG, this also has the property that, once we get to a +// (natural) loop header, all blocks in the loop are processed before anything +// else. +// +// The pseudo CFG provides a way to propagate liveness backwards through the +// function: +// * Visit blocks in the reverse of the pseudo CFG ordering, such that no +// block is visited until all its successors have been, and no part of a +// loop is visited until all of the loop exits have been. +// * For a block: +// 1. initialize the live out with (real CFG) successors' corresponding phi +// incomings; +// 2. for a normal edge, propagate the successor's live in to this block's +// live out; +// 3. for a "loop exit" edge, propagate the successor's live in to this +// block's live out, but only for values that are defined before the loop +// header, i.e. in a block that would be visited by the loop header in +// this visit order. +// This provides the correct liveness for any particular point within a loop +// for these cases: +// a. a value that is used after this point in the loop (from 2); +// b. a value that is live round any backedge reachable from this point +// (from 1); +// c. a value that is defined in the loop and used after the loop via a +// loop exit reachable from this point (from 2); +// d. a value that is defined before the loop and used after the loop, and +// is thus live through the whole loop (from 3). +// +// If the real CFG is irreducible, then this liveness information will be +// inaccurate. +// +class PseudoCFG { +public: + struct Node { + friend PseudoCFG; + SmallVector Preds; + SmallVector Succs; + BasicBlock *LoopHeader; + Node() : LoopHeader(nullptr) {} + + public: + void removeSucc(BasicBlock *Succ); + void removePred(BasicBlock *Pred); + // getLoopHeader : normally returns 0. If this is a backedge node, + // returns the corresponding loop header block + BasicBlock *getLoopHeader() { return LoopHeader; } + // pred and succ iterators + typedef SmallVectorImpl::iterator pred_iterator; + pred_iterator pred_begin() { return Preds.begin(); } + pred_iterator pred_end() { return Preds.end(); } + typedef SmallVectorImpl::iterator succ_iterator; + succ_iterator succ_begin() { return Succs.begin(); } + succ_iterator succ_end() { return Succs.end(); } + }; + +private: + std::vector Ordering; + std::map Nodes; + +public: + void clear() { + Ordering.clear(); + Nodes.clear(); + } + // compute : compute the pseudo CFG for the function. + // It is assumed that critical edges have been split. + void compute(Function *F, DominatorTree *DT, + LoopInfoBase *LI); + // getNode : get pseudo CFG node for basic block + Node *getNode(BasicBlock *BB) { return &Nodes[BB]; } + // iterators through the ordering + typedef std::vector::iterator iterator; + iterator begin() { return Ordering.begin(); } + iterator end() { return Ordering.end(); } + typedef std::vector::reverse_iterator reverse_iterator; + reverse_iterator rbegin() { return Ordering.rbegin(); } + reverse_iterator rend() { return Ordering.rend(); } + // Debug dump/print + void dump() { print(dbgs()); } + void print(raw_ostream &OS); +}; + +// Liveness : the liveness information at some point in the program +// This class is local to this source file and completely unrelated to +// GenXLiveness. 
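+// (For instance, an i1 vector mask counts towards FLAG pressure, the result
+// of a genx.convert.addr or genx.add.addr towards ADDR pressure, and
+// everything else towards GENERAL pressure, as isFlag and isAddr below
+// decide.)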
+class Liveness { +public: + enum Category { GENERAL, FLAG, ADDR, NUMCATS }; + +private: + std::set Values[NUMCATS]; + unsigned Pressure; // overall register pressure + unsigned Pressures[NUMCATS]; // pressure for each individual category +public: + Liveness() : Pressure(0) { + for (unsigned Cat = 0; Cat != NUMCATS; ++Cat) + Pressures[Cat] = 0; + } + static bool isFlag(Value *V) { + return V->getType()->getScalarType()->isIntegerTy(1); + } + static bool isAddr(Value *V) { + if (!V->getType()->getScalarType()->isIntegerTy(16)) + return false; + switch (GenXIntrinsic::getGenXIntrinsicID(V)) { + case GenXIntrinsic::genx_convert_addr: + case GenXIntrinsic::genx_add_addr: + return true; + default: + break; + } + return false; + } + static unsigned getValueSize(Value *V); + void copyFrom(Liveness *Other); + void addValue(Value *V); + bool removeValue(Value *V); + void copyValues(Liveness *Other); + unsigned getPressure(unsigned Cat) { return Pressures[Cat]; } + unsigned getPressure() { return Pressure; } + bool contains(Value *V) { + auto ValueSet = &Values[GENERAL]; + if (isFlag(V)) + ValueSet = &Values[FLAG]; + else if (isAddr(V)) + ValueSet = &Values[ADDR]; + return ValueSet->find(V) != ValueSet->end(); + } + // Iterator (over set of values) + typedef std::set::iterator iterator; + iterator begin(unsigned Cat) { return Values[Cat].begin(); } + iterator end(unsigned Cat) { return Values[Cat].end(); } + unsigned cat_begin() { return 0; } + unsigned cat_end() { return NUMCATS; } + // Debug print and dump + void print(raw_ostream &OS); + void dump() { print(dbgs()); dbgs() << '\n'; } +}; + +// Superbale : a sequence of bales where each is headed by a wrregion whose +// "old value of vector" input is the previous bale, and the other operands of +// the bales are all the same. +struct Superbale { + // Instruction number of head instruction of superbale + unsigned Number; + // Bale head instructions, stored in reverse of code order + SmallVector Bales; + // Operands (some entries can be nullptr) + SmallVector Operands; + Instruction *getHead() { return Bales[0]; } + void print(raw_ostream &OS); + void dump() { print(dbgs()); dbgs() << '\n'; } +}; + +// SinkCandidate : a candidate superbale for sinking +struct SinkCandidate { + Superbale *SB; + int Benefit; + bool AllUsesDominatedByHere; + SinkCandidate(Superbale *SB, int Benefit, bool AUDBH) + : SB(SB), Benefit(Benefit), AllUsesDominatedByHere(AUDBH) {} + // Sort by whether all uses are dominated by here, then by best benefit, then + // by latest definition point. + bool operator<(const SinkCandidate &Rhs) const { + if (AllUsesDominatedByHere != Rhs.AllUsesDominatedByHere) + return AllUsesDominatedByHere > Rhs.AllUsesDominatedByHere; + if (Benefit != Rhs.Benefit) + return Benefit > Rhs.Benefit; + if (SB == nullptr) + return false; + if (Rhs.SB == nullptr) + return true; + return SB->Number > Rhs.SB->Number; + } +}; + +// GenX depressurizer pass +class GenXDepressurizer : public FunctionGroupPass { + enum { FlagThreshold = 6, AddrThreshold = 32, GRFThreshold = 2560, + FlagGRFTolerance = 3840 }; + bool Modified; + GenXGroupBaling *Baling; + DominatorTree *DT; + LoopInfoBase *LI; + PseudoCFG *PCFG; + unsigned MaxPressure; + std::map SubroutinePressures; + std::map LiveIn; + std::map LiveOut; + Liveness *Live; + // A numbering of instructions. 
Because of the way the basic block ordering + // is constructed, if instruction I2 is reachable from instruction I1, then + // InstNumbers[I1] < InstNumbers[I2], unless the reachability is via a + // loop backedge. The converse is not necessarily true. + std::map InstNumbers; + +public: + static char ID; + explicit GenXDepressurizer() : FunctionGroupPass(ID) {} + StringRef getPassName() const override { + return "GenX register pressure reducer"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunctionGroup(FunctionGroup &FG) override; + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const override { + return createGenXGroupPrinterPass(O, Banner); + } + +private: + void processFunction(Function *F); + void orderAndNumber(Function *F); + void processBasicBlock(BasicBlock *BB); + void getLiveOut(BasicBlock *BB, Liveness *Live); + void processInstruction(Instruction *Inst); + void attemptSinking(Instruction *InsertBefore, std::set *Exclude, + Liveness::Category Cat, bool AllowClone); + bool sink(Instruction *InsertBefore, Superbale *SB, bool AllowClone = false); + BasicBlock *sinkOnce(Instruction *InsertBefore, Superbale *SB, + ArrayRef Uses); + bool modifyLiveness(Liveness *Live, Superbale *SB); + int getSuperbaleKillSize(Superbale *SB); + int getSinkBenefit(Superbale *SB, Liveness::Category Cat, unsigned Headroom); + bool fillSuperbale(Superbale *SB, Instruction *Inst, bool IsFlag); + void MergeCandidate(SinkCandidate &Lhs, SinkCandidate &Rhs); +}; + +} // end anonymous namespace + +char GenXDepressurizer::ID = 0; +namespace llvm { +void initializeGenXDepressurizerPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXDepressurizer, "GenXDepressurizer", "GenXDepressurizer", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_END(GenXDepressurizer, "GenXDepressurizer", "GenXDepressurizer", false, false) + +FunctionGroupPass *llvm::createGenXDepressurizerPass() +{ + initializeGenXDepressurizerPass(*PassRegistry::getPassRegistry()); + return new GenXDepressurizer(); +} + +void GenXDepressurizer::getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the register pressure reduction pass for + * this FunctionGroup + */ +bool GenXDepressurizer::runOnFunctionGroup(FunctionGroup &FG) { + if (skipOptWithLargeBlock(FG)) + return false; + + Modified = false; + Baling = &getAnalysis(); + // Process functions in the function group in reverse order, so we know the + // max pressure in a subroutine when we see a call to it. 
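+  // For example, if kernel K calls subroutine S, S is processed before K, so
+  // that when the backward scan of K reaches the call, the recorded
+  // SubroutinePressures[S] can be added on top of the pressure live across
+  // the call site (see processInstruction).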
+ for (auto fgi = FG.rbegin(), fge = FG.rend(); fgi != fge; ++fgi) { + Function *F = *fgi; + processFunction(F); + SubroutinePressures[F] = MaxPressure; + } + SubroutinePressures.clear(); + return Modified; +} + +/*********************************************************************** + * processFunction : run depressurizer on one function + */ +void GenXDepressurizer::processFunction(Function *F) { + LLVM_DEBUG(dbgs() << "GenXDepressurizer on function " << F->getName() << '\n'); + MaxPressure = 0; + DT = getAnalysis().getDomTree(F); + LI = new LoopInfoBase(); + LI->analyze(*DT); + // Calculate the pseudo CFG. + PCFG = new PseudoCFG(); + PCFG->compute(F, DT, LI); + // Order and number the instructions. + orderAndNumber(F); + // Visit each basic block. + MaxPressure = 0; + for (auto ri = PCFG->rbegin(), re = PCFG->rend(); ri != re; ++ri) { + processBasicBlock(*ri); + } + + delete PCFG; + delete LI; + LLVM_DEBUG(dbgs() << "max pressure " << MaxPressure << " for function " + << F->getName() << '\n'); + SubroutinePressures[F] = MaxPressure; +} + +/*********************************************************************** + * orderAndNumber : order and number the instructions + * + * This has three purposes: + * + * 1. ensure the instructions in a bale are adjacent; + * + * 2. for a boolean and/or, ensure that a tree of bales (where each bale has + * a single use that is its parent in the tree, in the same basic block) is + * adjacent and in depth first order to minimize flag pressure in a tree of + * boolean ops; + * + * 3. number the instructions, with each instruction in a bale given the same + * number. + * + * This scans the code backwards, so numbers backwards starting at a high + * number. + */ +void GenXDepressurizer::orderAndNumber(Function *F) { + unsigned InstNum = 1000000000; + for (auto fi = PCFG->rbegin(), fe = PCFG->rend(); fi != fe; ++fi) { + BasicBlock *BB = *fi; + auto Inst = &BB->back(); + for (;;) { + --InstNum; + if (isa(Inst)) + InstNumbers[Inst] = InstNum; + else { + Bale B; + Baling->buildBale(Inst, &B); + auto InsertBefore = Inst; + // Move the bale instructions to a contiguous lump, and number them. + Instruction *GotoJoin = nullptr; + for (auto ii = B.begin(), ie = B.end(); ii != ie; ++ii) { + Inst = ii->Inst; + InstNumbers[Inst] = InstNum; + if (Inst == InsertBefore) + continue; + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_simdcf_goto: + case GenXIntrinsic::genx_simdcf_join: + GotoJoin = Inst; + break; + default: + break; + } + Inst->removeFromParent(); + Inst->insertBefore(InsertBefore); + } + if (GotoJoin) { + // For a goto/join, check that its outside-bale uses are also moved, + // and number the instructions. + // This is the only case of an inside-bale instruction having + // outside-bale uses. + // This is a bit of a bodge, which we'll tolerate for now on the + // basis that this pass will go away once we have a better pass for + // detecting register pressure and alleviating it by moving code and + // rematerializing. + SmallVector Users; + for (auto ui = GotoJoin->use_begin(), ue = GotoJoin->use_end(); + ui != ue; ++ui) + Users.push_back(cast(ui->getUser())); + Instruction *InsertBefore = GotoJoin->getNextNode(); + for (auto ui = Users.begin(), ue = Users.end(); ui != ue; ++ui) { + Instruction *User = *ui; + if (!isa(User->getType())) { + // Skip the use that is in the bale. 
We are relying on the use in + // the bale being the only extractvalue that is scalar; the other + // two (for goto) or one (for join) are vector (the EM and RM + // values). + continue; + } + if (User->getParent() == GotoJoin->getParent()) { + // Only move the extractvalue if it is in the same basic block. + User->removeFromParent(); + User->insertBefore(InsertBefore); + InstNumbers[User] = InstNum; + } + } + } + Inst = B.getHead()->Inst; + if (Inst->getType()->getScalarType()->isIntegerTy(1) && + (Inst->getOpcode() == Instruction::And || + Inst->getOpcode() == Instruction::Or)) { + // Now look at the operands. Any that is a single use instruction in + // the same basic block is moved. The rest of its bale, and that + // bale's own operands, get moved when it is later processed in the + // loop. + InsertBefore = B.begin()->Inst; + for (auto ii = B.begin(), ie = B.end(); ii != ie; ++ii) { + Inst = ii->Inst; + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) { + if (ii->Info.isOperandBaled(oi)) + continue; // only consider out-of-bale operands + auto OpndInst = dyn_cast(Inst->getOperand(oi)); + if (!OpndInst) + continue; + if (OpndInst->getParent() != BB) + continue; + if (isa(OpndInst)) + continue; + if (!OpndInst->hasOneUse()) + continue; + OpndInst->removeFromParent(); + OpndInst->insertBefore(InsertBefore); + } + } + } + // On to the previous instruction, which is now the one before the first + // instruction in the current bale. + Inst = B.begin()->Inst; + } + if (Inst == &BB->front()) + break; + Inst = Inst->getPrevNode(); + } + } +} + +/*********************************************************************** + * processBasicBlock : process one basic block + */ +void GenXDepressurizer::processBasicBlock(BasicBlock *BB) { + // Create a new empty entry for this BB in the LiveIn map, and use it for + // keeping track of liveness as we scan backwards through the block. + Live = &LiveIn[BB]; + // Populate Live with the live out values. + getLiveOut(BB, Live); + // Scan backwards through the block, excluding phi nodes. + auto Inst = &BB->back(); + for (;;) { + if (isa(Inst)) + break; + processInstruction(Inst); + if (Inst == &BB->front()) + break; + Inst = Inst->getPrevNode(); + } + // Just before the first (non-phi) instruction, attempt sinking of flag + // values, as long as non-flag pressure is low, and as long as this is not a + // join label. + if (!GotoJoin::isJoinLabel(BB) && FlagGRFTolerance > Live->getPressure()) + attemptSinking(BB->getFirstNonPHI(), nullptr, Liveness::FLAG, + /*AllowClone=*/false); +} + +/*********************************************************************** + * getLiveOut : populate empty Liveness with the live out of the BB + */ +void GenXDepressurizer::getLiveOut(BasicBlock *BB, Liveness *Live) { + // Get each successor's live in values into our liveness. If getLoopHeader + // returns non-0, then we are looking at a loop backedge and we only want + // to get successors' live in values if they are defined before the loop + // header. 
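+  // For example, if this is a backedge block whose loop-exit successor has
+  // both %pre (defined before the loop header) and %in (defined inside the
+  // loop) live in, only %pre is taken from that successor here; values
+  // defined inside the loop reach this block's live out through the phi
+  // incomings of the real-CFG loop header successor, handled below.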
+ unsigned LoopHeaderNum = 0; + auto BBNode = PCFG->getNode(BB); + if (auto LoopHeader = BBNode->getLoopHeader()) + LoopHeaderNum = InstNumbers[&LoopHeader->front()]; + for (auto si = BBNode->succ_begin(), se = BBNode->succ_end(); si != se; + ++si) { + auto LI = &LiveIn[*si]; + for (auto ci = LI->cat_begin(), ce = LI->cat_end(); ci != ce; ++ci) + for (auto vi = LI->begin(ci), ve = LI->end(ci); vi != ve; ++vi) { + Value *V = *vi; + if (auto Inst = dyn_cast(V)) + if (LoopHeaderNum && LoopHeaderNum <= InstNumbers[Inst]) + continue; // Ignore instruction defined in loop from loop exit succ + Live->addValue(V); + } + } + // Now adjust the liveness for the phi nodes of each real CFG successor. This + // includes the case that this is a backedge and the real CFG successor is + // the loop header; this is how we get defs inside the loop into our + // liveness. + auto TI = BB->getTerminator(); + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { + auto Succ = TI->getSuccessor(i); + for (auto ii = Succ->begin();; ++ii) { + auto Phi = dyn_cast(&*ii); + if (!Phi) + break; + Live->removeValue(Phi); + Live->addValue(Phi->getIncomingValue(Phi->getBasicBlockIndex(BB))); + } + } + if (MaxPressure < Live->getPressure()) { + MaxPressure = Live->getPressure(); + LLVM_DEBUG(dbgs() << "max pressure now " << MaxPressure << '\n'); + } + LLVM_DEBUG(dbgs() << "getLiveOut(" << BB->getName() << "): "; Live->print(dbgs()); + dbgs() << '\n'); + // Copy the liveness to the LiveOut entry for this BB. + LiveOut[BB].copyFrom(Live); +} + +/*********************************************************************** + * processInstruction : process one instruction in backwards scan of BB + * + * Return: Prev = previous instruction, i.e. next one to scan + */ +void GenXDepressurizer::processInstruction(Instruction *Inst) { + if (!Inst) + return; + if (Baling->isBaled(Inst)) + return; // Not head of bale, ignore + if (isa(Inst)) + return; // Too confusing to consider sinking when we get to an extractvalue + // out of a goto/join, so ignore. + Bale B; + Baling->buildBale(Inst, &B); + LLVM_DEBUG(dbgs() << '[' << InstNumbers[Inst] << ']'; + if (!Inst->getDebugLoc()) + dbgs() << " {line " << Inst->getDebugLoc().getLine() << '}'; + B.print(dbgs())); + unsigned OldFlagPressure = Live->getPressure(Liveness::FLAG); + // Remove the result of the bale from liveness. + Live->removeValue(Inst); + // If this is a non-intrisic call, add the max pressure from inside the call. + if (auto CI = dyn_cast(Inst)) { + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(CI)) { + LLVM_DEBUG(dbgs() << "pressure inside subroutine: " + << SubroutinePressures[CI->getCalledFunction()] << '\n'); + unsigned AddedPressure = + Live->getPressure() + SubroutinePressures[CI->getCalledFunction()]; + if (MaxPressure < AddedPressure) { + MaxPressure = AddedPressure; + LLVM_DEBUG(dbgs() << "max pressure now " << MaxPressure << '\n'); + } + } + } + // Add operands from outside the bale to liveness. Also keep them in a + // separate set for the use of attemptSinking. 
+ std::set BaleOperands; + for (auto bi = B.rbegin(), be = B.rend(); bi != be; ++bi) { + BaleInst *BI = &*bi; + for (unsigned ii = 0, ie = BI->Inst->getNumOperands(); ii != ie; ++ii) { + if (!BI->Info.isOperandBaled(ii)) { + Value *Opnd = BI->Inst->getOperand(ii); + if (isa(Opnd) || isa(Opnd)) { + Live->addValue(Opnd); + BaleOperands.insert(Opnd); + } + } + } + } + LLVM_DEBUG(Live->print(dbgs()); dbgs() << '\n'); + if (Inst && Inst->isTerminator()) + return; // Do not attempt to sink past last instruction in block. + + // FIXME: This does not deal with a subroutine call instruction, where + // pressure goes up during the call and then comes back down again on + // return. I think the last remaining flag spill in HEVCEnc_PB is because + // of this; a CSEd flag is live over a subroutine call but we do not notice + // that increased flag pressure inside the call should force the flag def + // to be cloned and sunk. + + // Attempt sinking of flag values if necessary. Do not do that if non-flag + // pressure is already high. If flag pressure has just gone high, sink any + // flag value (with a benefit). Otherwise, only sink single use flag values. + if (FlagGRFTolerance > Live->getPressure()) { + bool AllowClone = OldFlagPressure <= FlagThreshold && + Live->getPressure(Liveness::FLAG) > FlagThreshold; + attemptSinking(Inst->getNextNode(), &BaleOperands, Liveness::FLAG, + AllowClone); + } + + // Attemp sinking of address values if necessary. + if (Live->getPressure(Liveness::ADDR) > AddrThreshold) + attemptSinking(Inst->getNextNode(), &BaleOperands, Liveness::ADDR, + /*AllowClone=*/false); + + // Attempt sinking of non-flag value(s) if necessary. + if (Live->getPressure() > GRFThreshold) + attemptSinking(Inst->getNextNode(), &BaleOperands, Liveness::GENERAL, + /*AllowClone=*/false); + + if (MaxPressure < Live->getPressure()) { + MaxPressure = Live->getPressure(); + LLVM_DEBUG(dbgs() << "max pressure up to " << MaxPressure << '\n'); + } +} + +/*********************************************************************** + * attemptSinking : attempt some sinking to reduce pressure + * + * Enter: InsertBefore = instruction to insert sunk instruction before + * Exclude = 0 else exclude any sink candidate in this set (used to + * exclude superbales used in the present bale) + * FlagSinking = true to sink flags + * AllowClone = true to sink anything suitable, false to only sink + * when cloning is not required, used to sink flag defs + * even when flag pressure is low. + * + * This is called in three different ways: + * + * FlagSinking, !AllowClone: sink any flag def whose uses are all dominated by + * the current position (quit if normal pressure gets too high) + * + * FlagSinking, AllowClone: sink any flag def, preferring ones that do not need + * a clone, but switch to !AllowClone mode once flag pressure is low enough + * (and quit if normal pressure gets too high) + * + * !FlagSinking, AllowClone: sink normal (non-flag) def if it provides a + * benefit to pressure, until pressure is low enough. + */ +void GenXDepressurizer::attemptSinking(Instruction *InsertBefore, + std::set *Exclude, + Liveness::Category Cat, + bool AllowClone) { + LLVM_DEBUG(dbgs() << "attemptSinking(Cat=" << (Cat == Liveness::FLAG ? "flag" : + Cat == Liveness::ADDR ? "addr" : + "general") + << ", AllowClone=" << AllowClone << ")\n"); + if (!InsertBefore) + return; + // Build two-addr operand -> instruction map for checking against two-addr + // instructions. 
+ std::map TwoAddrValueMap; + BasicBlock *BB = InsertBefore->getParent(); + if (InsertBefore != &BB->front()) { + for (auto I = InsertBefore->getPrevNode(); I != &BB->front(); + I = I->getPrevNode()) { + auto CI = dyn_cast(I); + if (!CI) + continue; + int OpndNum = getTwoAddressOperandNum(CI); + if (OpndNum < 0) + continue; + TwoAddrValueMap[I->getOperand(OpndNum)] = CI; + } + } + // Gather the currently live superbales with a sink benefit. + // Exclude any that is used in the present bale. + SmallVector Candidates; + SmallVector SecondRound; + std::map Superbales; + unsigned CurNumber = InstNumbers[InsertBefore]; + int Headroom = 0; + switch (Cat) { + case Liveness::FLAG: + Headroom = FlagGRFTolerance - Live->getPressure(); + break; + default: + break; + } + for (auto i = Live->begin(Cat), e = Live->end(Cat); i != e; ++i) { + if (Exclude && Exclude->find(*i) != Exclude->end()) + continue; + auto Inst = dyn_cast(*i); + if (!Inst) + continue; // only instructions can sink, not args + if (isa(Inst)) + continue; // cannot sink phi node + if (isa(Inst)) + continue; // Don't sink extractvalue from a goto/join. + if (Inst->mayHaveSideEffects() || Inst->mayReadOrWriteMemory()) + continue; + // For this candidate, determine where its uses are, one of these cases: + // + // 1. All uses are dominated by here. This is the preferred case as the + // instruction can simply be sunk, with no cloning. + // 2. Not all uses are reachable from here, but all uses that are reachable + // from here are dominated by here. This can be handled by a clone of + // the instruction, where the cloned instruction takes on the uses that + // are reachable from here. + // 3. Other cases. We do not handle that, although we could enhance it in + // the future to handle this case by finding multiple sites to clone + // the instruction to. + // + // We are using "has a higher instruction number than" as a proxy for "is + // reachable from", which in fact could include some uses that are not + // reachable. + bool AllUsesDominatedByHere = true; + bool AllReachableUsesDominatedByHere = true; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + Instruction *user = cast(ui->getUser()); + if (InstNumbers[user] < CurNumber) { + // Unreachable use. + AllUsesDominatedByHere = false; + continue; + } + if (InsertBefore->getParent() != user->getParent() && + !DT->dominates(InsertBefore->getParent(), user->getParent())) { + AllReachableUsesDominatedByHere = false; + break; + } + } + if (!AllReachableUsesDominatedByHere) + continue; // exclude case 3 + if (!AllowClone && !AllUsesDominatedByHere) + continue; // exclude case 2 if !AllowClone + bool IsFlag = Liveness::isFlag(Inst); + bool IsAddr = Liveness::isAddr(Inst); + if (!IsFlag && !IsAddr && + Inst->getType()->getPrimitiveSizeInBits() < 32 * 8) { + // don't bother with anything smaller than a GRF unless it is a flag + continue; + } + Superbale *SB = &Superbales[Inst]; + assert(SB->Bales.empty()); + if (!fillSuperbale(SB, Inst, IsFlag)) + continue; + // Check whether the sink of this SB will cross its operands' two-addr + // instructions, i.e. + // + // ... := use(v0); // SB.Head + // + // v1 := twoaddr(v0); // two-addr intruction. + // + // x <--- here this SB could be sunk to. + // + // In such case, sinking this SB should be avoided as it creates + // overlapping between v0 and v1; otherwise, additional copy of v0 has to + // be inserted. That won't alleviate the register pressure. 
+ bool CrossTwoAddr = false; + for (auto OI = SB->Operands.begin(), + OE = SB->Operands.end(); OI != OE; ++OI) { + Value *Opnd = *OI; + if (!TwoAddrValueMap.count(Opnd)) + continue; + unsigned TwoAddrNum = InstNumbers[TwoAddrValueMap[Opnd]]; + unsigned SBNum = InstNumbers[SB->getHead()]; + // Ignore the case where the SB itself is a two-addr instruction or part + // of chain of two-addr instructions. + if (TwoAddrNum <= SBNum) + continue; + // Skip sinking/cloning if the current sinking point is beyond where the + // two-addr instruction overwriting the same register. + if (CurNumber > TwoAddrNum) { + LLVM_DEBUG(dbgs() << "could not sink/clone as it will cross the two-addr " + << "instruction sharing the same operand!"); + CrossTwoAddr = true; + break; + } + } + if (CrossTwoAddr) + continue; + // Add the candidate. + int Benefit = getSinkBenefit(SB, Cat, Headroom); + LLVM_DEBUG(dbgs() << "candidate " << SB->getHead()->getName() + << " with benefit " << Benefit + << " and AllUsesDominatedByHere " << AllUsesDominatedByHere + << '\n'); + if (Benefit > 0) + Candidates.push_back(SinkCandidate(SB, Benefit, AllUsesDominatedByHere)); + else if (AllUsesDominatedByHere) + SecondRound.push_back(SinkCandidate(SB, Benefit, true)); + } + if (!Candidates.empty()) { + // Sort the candidates. + std::sort(Candidates.begin(), Candidates.end()); + // Try each candidate. + for (auto i = Candidates.begin(), e = Candidates.end(); i != e; ++i) { + if (!AllowClone && !i->AllUsesDominatedByHere) + continue; // Ignore candidate that needs cloning if AllowClone has + // switched to false (i.e. flag pressure is low) + if (sink(InsertBefore, i->SB)) { + switch (Cat) { + case Liveness::FLAG: + if (Live->getPressure(Liveness::FLAG) <= FlagThreshold) { + // Flag pressure is now low so we can stop sinking when a clone + // is needed. + AllowClone = false; + } + Headroom = FlagGRFTolerance - Live->getPressure(); + if (Headroom <= 0) + return; + break; + case Liveness::ADDR: + if (Live->getPressure(Liveness::ADDR) < AddrThreshold) + return; + break; + default: + if (Live->getPressure() < GRFThreshold) + return; + break; + } + } else if (i->AllUsesDominatedByHere) { + SecondRound.push_back(*i); + } + } + } + if (AllowClone) { + LLVM_DEBUG(dbgs() << "could not do enough sinking to alleviate pressure\n"); + if (Cat == Liveness::FLAG) { + for (auto i = Candidates.begin(), e = Candidates.end(); i != e; ++i) { + (void)sink(InsertBefore, i->SB, true); + } + } + } else { + // Try to sink a group of candidates to reduce register pressure. + // Do NOT Allow Clone for now. + for (auto i = SecondRound.begin(), ie = SecondRound.end(); i != ie; ++i) { + if (i->SB == nullptr) + continue; + auto SB = i->SB; + SmallSet OperandSet; + for (auto k = SB->Operands.begin(), ke = SB->Operands.end(); k != ke; ++k) + OperandSet.insert(*k); + // find a group that shares the same input + auto j = i; + for (++j; j != ie; ++j) { + if (j->SB == nullptr) + continue; + auto SB2 = j->SB; + bool EqualInputs = (SB2->Operands.size() == SB->Operands.size()); + for (auto k = SB2->Operands.begin(), ke = SB2->Operands.end(); + EqualInputs && k != ke; ++k) { + if (OperandSet.count(*k) == 0) + EqualInputs = false; + } + // merge superbale if i covers j + if (EqualInputs) { + MergeCandidate(*i, *j); + } + } + } + // Sort the candidates. + std::sort(SecondRound.begin(), SecondRound.end()); + // Try each candidate. 
+ for (auto i = SecondRound.begin(), e = SecondRound.end(); i != e; ++i) { + if (i->Benefit <= 0 || i->SB == nullptr) + break; + bool status = sink(InsertBefore, i->SB); + assert(status); + (void)status; + } + } +} + +// Merge the Rhs into the Lhs candidate assuming that Rhs input operands +// are covered by the Lhs candidate +void GenXDepressurizer::MergeCandidate(SinkCandidate &Lhs, SinkCandidate &Rhs) { + // update the benefit + Lhs.Benefit += getSuperbaleKillSize(Rhs.SB); + // merge superbale + SmallVector Merge; + auto a = Lhs.SB->Bales.begin(); + auto ae = Lhs.SB->Bales.end(); + auto b = Rhs.SB->Bales.begin(); + auto be = Rhs.SB->Bales.end(); + while (1) { + if (a == ae && b == be) + break; + if (b == be) { + Merge.push_back(*a); + ++a; + } else if (a == ae) { + Merge.push_back(*b); + ++b; + } else if (InstNumbers[*b] > InstNumbers[*a]) { + Merge.push_back(*b); + ++b; + } else { + Merge.push_back(*a); + ++a; + } + } + Lhs.SB->Number = InstNumbers[Merge[0]]; + std::swap(Lhs.SB->Bales, Merge); + Rhs.SB = nullptr; + Rhs.Benefit = (-1); +} + +/*********************************************************************** + * sink : sink the superbale if possible + * + * Enter: InsertBefore = instruction to insert before + * SB = the superbale to sink + * + * Return: whether succeeded + */ +bool GenXDepressurizer::sink(Instruction *InsertBefore, Superbale *SB, + bool AllowClone) { + static unsigned Count = 0; + if (++Count > LimitGenXDepressurizer) + return false; + if (LimitGenXDepressurizer != UINT_MAX) + dbgs() << "genx depressurizer " << Count << '\n'; + unsigned CurNumber = InstNumbers[InsertBefore]; + LLVM_DEBUG(dbgs() << "sink(" << SB->getHead()->getName() << ")\n"); + // Gather the uses that we are going to modify. + SmallVector UsesDominatedByHere; + for (auto ui = SB->getHead()->use_begin(), ue = SB->getHead()->use_end(); + ui != ue; ++ui) { + Use *U = &*ui; + Instruction *user = cast(U->getUser()); + LLVM_DEBUG(dbgs() << " used in [" << InstNumbers[user] << "] " + << user->getName() << '\n'); + unsigned UserNumber = InstNumbers[user]; + if (UserNumber < CurNumber) { + // Skip this user if cloning is allowed. + if (AllowClone) + continue; + LLVM_DEBUG(dbgs() << " rejecting: less than CurNumber " << CurNumber << '\n'); + // This code was originally designed to cope with some uses not being + // dominated by the sink site by cloning the superbale. But this gives an + // assert on frc_iteration6_4x8_ipa. So I am disabling the cloning + // functionality for now by rejecting the whole sink unless all uses are + // dominated by the sink site. This also gives a few minor code size + // improvements in examples too. + return false; + } + UsesDominatedByHere.push_back(U); + } + if (UsesDominatedByHere.empty()) + return false; + // Do the sinking. + BasicBlock *DefBB = sinkOnce(InsertBefore, SB, UsesDominatedByHere); + assert(DefBB == InsertBefore->getParent()); + (void)DefBB; + // We need to modify liveness at the current point. 
+ modifyLiveness(Live, SB); + LLVM_DEBUG(dbgs() << "Successfully sunk "<< SB->getHead()->getName() << '\n'; + Live->print(dbgs()); + dbgs() << '\n'); + return true; +} + +/*********************************************************************** + * sinkOnce : do one sinking of a superbale for a group of uses + * + * Enter: InsertBefore = instruction to insert before + * SB = superbale to sink + * Uses = uses in the group + * + * Return: basic block where sunk superbale was inserted + * + * Currently this only copes with the case that the uses are all dominated + * by InsertBefore, and the moved or cloned def is inserted before InsertBefore + * and the function returns the basic block containing InsertBefore. + * + * However it could be extended to sink for a group of uses that are not + * dominated by InsertBefore but are reachable from it. Then it would insert + * the def at a place that is a common dominator of the uses, and return that + * basic block. + */ +BasicBlock *GenXDepressurizer::sinkOnce(Instruction *InsertBefore, + Superbale *SB, ArrayRef Uses) { + LLVM_DEBUG(dbgs() << "sinkOnce with uses:"; + for (auto i = Uses.begin(), e = Uses.end(); i != e; ++i) + dbgs() << " [" + << InstNumbers[cast((*i)->getUser())] + << ']' << (*i)->getUser()->getName(); + dbgs() << '\n'); + // Insert after the current instruction. + BasicBlock *InsertBB = InsertBefore->getParent(); + unsigned InsertNum = InstNumbers[InsertBefore]; + assert(InsertNum != 0); + LLVM_DEBUG(dbgs() << "InsertBefore: " << InsertBefore->getName() << '\n'); + // Remove this group of uses from the superbale. + auto Undef = UndefValue::get(SB->getHead()->getType()); + for (auto i = Uses.begin(), e = Uses.end(); i != e; ++i) + **i = Undef; + Instruction *Changed = nullptr; + if (SB->getHead()->use_empty()) { + // The superbale now has no uses. So we can simply move the instructions. + for (auto i = SB->Bales.rbegin(), e = SB->Bales.rend(); i != e; ++i) { + Bale B; + Baling->buildBale(*i, &B); + for (auto j = B.begin(), je = B.end(); j != je; ++j) { + Changed = j->Inst; + Changed->removeFromParent(); + Changed->insertBefore(InsertBefore); + InstNumbers[Changed] = InsertNum - 1; + ++NumSunk; + } + } + } else { + // The superbale still has uses, so we need to clone it. + std::map ClonedInsts; + for (auto i = SB->Bales.rbegin(), e = SB->Bales.rend(); i != e; ++i) { + Bale B; + Baling->buildBale(*i, &B); + Instruction *InstToClone = nullptr; + for (auto j = B.begin(), je = B.end(); j != je; ++j) { + InstToClone = j->Inst; + Changed = InstToClone->clone(); + Changed->insertBefore(InsertBefore); + Changed->setName(InstToClone->getName() + ".cloned"); + // Ensure new instruction has the same baling. + Baling->setBaleInfo(Changed, j->Info); + for (unsigned k = 0, ke = Changed->getNumOperands(); k != ke; ++k) { + if (auto O = dyn_cast(Changed->getOperand(k))) { + auto it = ClonedInsts.find(O); + if (it != ClonedInsts.end()) + Changed->setOperand(k, it->second); + } + } + ClonedInsts[InstToClone] = Changed; + InstNumbers[Changed] = InsertNum - 1; + ++NumCloned; + } + } + } + // Change our uses to use the moved/cloned superbale. 
+ for (auto i = Uses.begin(), e = Uses.end(); i != e; ++i) + **i = Changed; + if (Changed) { + LLVM_DEBUG(dbgs() << "Sunk/cloned superbale head is " << Changed->getName() + << '\n'); + } else { + LLVM_DEBUG(dbgs() << "Warning: Changed is nullptr\n"); + } + return InsertBB; +} + +/*********************************************************************** + * modifyLiveness : modify liveness (at some point) to reflect the sinking + * of the superbale past it + * + * Enter: Live = the liveness to modify + * SB = the superbale + * + * Return: true if the result of the superbale was removed from liveness, + * false if it was not live already + */ +bool GenXDepressurizer::modifyLiveness(Liveness *Live, Superbale *SB) { + // Remove the superbale's result from liveness. + for (auto i = SB->Bales.begin(), e = SB->Bales.end(); i != e; ++i) { + Live->removeValue(*i); + } + for (auto i = SB->Operands.begin(), e = SB->Operands.end(); i != e; ++i) + if (*i) + Live->addValue(*i); + return true; +} + +int GenXDepressurizer::getSuperbaleKillSize(Superbale *SB) { + int sum = 0; + for (auto i = SB->Bales.rbegin(), e = SB->Bales.rend(); i != e; ++i) { + if (GenXIntrinsic::isWrRegion(*i)) + sum += Liveness::getValueSize((*i)->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + else + sum += Liveness::getValueSize(*i); + } + return sum; +} + +/*********************************************************************** + * getSinkBenefit : calculate the benefit of sinking this Superbale + * + * Enter: SB = superbale to consider + * Cat = category of value being sunk + * Headroom = if flag sinking, the headroom in normal register + * pressure + * + * For normal (non-flag) sinking, the benefit is the size of the result + * minus the total size of the superbale's operands. + * + * For flag sinking, the benefit is the size of the flag result minus the + * total size of any flag operands to the superbale. + */ +int GenXDepressurizer::getSinkBenefit(Superbale *SB, Liveness::Category Cat, + unsigned Headroom) { + int Benefit = getSuperbaleKillSize(SB); + unsigned FlagOperandSize = 0, AddrOperandSize = 0, OperandSize = 0; + for (auto i = SB->Operands.begin(), e = SB->Operands.end(); i != e; ++i) { + Value *Operand = *i; + if (!Operand || isa(Operand)) + continue; + if (Live->contains(Operand)) + continue; + if (Liveness::isFlag(Operand)) + FlagOperandSize += Liveness::getValueSize(Operand); + else if (Liveness::isAddr(Operand)) + AddrOperandSize += Liveness::getValueSize(Operand); + else + OperandSize += Liveness::getValueSize(Operand); + } + switch (Cat) { + case Liveness::FLAG: + return Benefit - FlagOperandSize; // Flag sinking. + case Liveness::ADDR: + return Benefit - AddrOperandSize; // Addr sinking. + default: + break; + } + return Benefit - OperandSize; +} + +/*********************************************************************** + * fillSuperbale : find a chain of instruction to move + * + * Return: false is the chain has side-effect, cannot be moved. + * + * For a vector-of-i1 and/or/not instruction, the superbale contains the + * tree of boolean and/or/not instructions plus the bales for the cmp + * instructions that created the booleans. + * + * For a wrregion, the superbale contains the bale for each wrregion in + * the chain of wrregion bales with the same inputs. + * + * Otherwise, it contains just the present instruction's bale. 
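+ *
+ * An illustrative example (not from the original comment): legalization of
+ * a 32-wide add of %a and %b into 8-wide pieces can leave a chain like
+ *
+ *   %w0 = wrregion(undef, add(rdr(%a,0),  rdr(%b,0)),  offset 0)
+ *   %w1 = wrregion(%w0,   add(rdr(%a,8),  rdr(%b,8)),  offset 8)
+ *   %w2 = wrregion(%w1,   add(rdr(%a,16), rdr(%b,16)), offset 16)
+ *   %w3 = wrregion(%w2,   add(rdr(%a,24), rdr(%b,24)), offset 24)
+ *
+ * The superbale headed by %w3 then contains all four wrregion bales, with
+ * %a and %b as the shared out-of-bale operands, so the whole chain is moved
+ * or cloned as a single unit.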
+ * + * A bale with an indirect operand also includes the address generating + * instruction(s) in the superbale, so that, where a superbale is cloned, + * we maintain the constraint that an address generating instruction has + * exactly one use between GenXCategory and GenXAddressCommoning. + */ +bool GenXDepressurizer::fillSuperbale(Superbale *SB, Instruction *Inst, + bool IsFlag) { + // This is a new Superbale. Gather the bale(s) that make the superbale, + // and record the operands. First get the out-of-bale operands of the bale + // headed by Inst. We do this in the order such that the "old value of + // vector" operand of any wrregion heading the bale is the first operand + // pushed into SB->Operands. + SB->Number = InstNumbers[Inst]; + SB->Bales.push_back(Inst); + SmallSet OperandSet; + Bale B; + Baling->buildBale(Inst, &B); + bool OnlyRdWrRegion = true; + for (auto bi = B.rbegin(), be = B.rend(); bi != be; ++bi) { + BaleInst *BI = &*bi; + if (BI->Inst->mayHaveSideEffects() || BI->Inst->mayReadOrWriteMemory()) + return false; // not safe to sink + if (OnlyRdWrRegion && // Only chk the following conds if still required. + !GenXIntrinsic::isWrRegion(BI->Inst) && !GenXIntrinsic::isRdRegion(BI->Inst) && + !isa(BI->Inst) && + GenXIntrinsic::getGenXIntrinsicID(BI->Inst) != GenXIntrinsic::genx_add_addr) + OnlyRdWrRegion = false; + for (unsigned oi = 0, oe = BI->Inst->getNumOperands(); oi != oe; ++oi) { + if (BI->Info.isOperandBaled(oi)) + continue; + Value *Opnd = BI->Inst->getOperand(oi); + if (!isa(Opnd) && !isa(Opnd)) + continue; + if (OperandSet.insert(Opnd).second) + SB->Operands.push_back(Opnd); + } + } + if (OnlyRdWrRegion || SB->Operands.empty()) { + return false; // moving this kind of bale may mess up coalescing + } + if (IsFlag) { + // Boolean operation. For any boolean input, include the instruction that + // generates it in the bale, as long as this is the only use. A superbale + // is then potentially a tree of boolean operations combined by and/or/not, + // and then at each leaf of the tree a cmp or a chain of cmps linked by + // wrpredregion (i.e. multiple cmps writing to different parts of the same + // flag register). + for (unsigned i = 0; i != SB->Operands.size(); ++i) { + Inst = dyn_cast_or_null(SB->Operands[i]); + if (!Inst) + continue; + if (!Liveness::isFlag(Inst)) + continue; + if (!Inst->hasOneUse()) + continue; + if (isa(Inst)) + continue; + Bale B2; + Baling->buildBale(Inst, &B2); + SB->Operands[i] = nullptr; + SB->Bales.push_back(Inst); + for (auto bi = B2.rbegin(), be = B2.rend(); bi != be; ++bi) { + BaleInst *BI = &*bi; + for (unsigned oi = 0, oe = BI->Inst->getNumOperands(); oi != oe; ++oi) { + if (BI->Info.isOperandBaled(oi)) + continue; + Value *Opnd = BI->Inst->getOperand(oi); + if (OperandSet.insert(Opnd).second) + SB->Operands.push_back(Opnd); + } + } + } + } else if (GenXIntrinsic::isWrRegion(Inst)) { + // Non-boolean operation headed by a wrregion. + Value *Opnd0 = SB->Operands[0]; + for (;;) { + if (!GenXIntrinsic::isWrRegion(Opnd0)) + break; + Inst = cast(Opnd0); + if (!Inst->hasOneUse()) + break; + // The "old value of vector" input is another wrregion. Check that all + // the operands are the same, except the "old value of vector" input + // to that one. 
+ Bale B2; + Baling->buildBale(Inst, &B2); + Opnd0 = Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + for (auto bi = B2.rbegin(), be = B2.rend(); bi != be; ++bi) { + BaleInst *BI = &*bi; + for (unsigned oi = 0, oe = BI->Inst->getNumOperands(); oi != oe; ++oi) { + if (BI->Info.isOperandBaled(oi)) + continue; + Value *Opnd = BI->Inst->getOperand(oi); + if (Opnd == Opnd0) + continue; + if (!isa(Opnd) && !isa(Opnd)) + continue; + if (OperandSet.insert(Opnd).second) + SB->Operands.push_back(Opnd); + } + } + SB->Bales.push_back(Inst); + // Replace the previous "old value of vector" in SB->Operands. + SB->Operands[0] = Opnd0; // Opnd0 could be "undef" + } + } + // Now check whether any operand is an address. If so, include the address + // generating instruction in the superbale, so that, where a superbale is + // cloned, we maintain the constraint that an address generating instruction + // has exactly one use between GenXCategory and GenXAddressCommoning. + for (unsigned oi = 0, oe = SB->Operands.size(); oi != oe; /*EMPTY*/) { + Value *Opnd = SB->Operands[oi]; + switch (GenXIntrinsic::getGenXIntrinsicID(Opnd)) { + case GenXIntrinsic::genx_convert_addr: + case GenXIntrinsic::genx_add_addr: { + auto Addr = cast(Opnd); + SB->Bales.push_back(Addr); + SB->Operands[oi] = Addr->getOperand(0); + continue; + } + default: + break; + } + ++oi; + } + return true; +} + +/*********************************************************************** + * Superbale::print : debug print + */ +void Superbale::print(raw_ostream &OS) { + OS << "Superbale[" << Number << ']'; + for (auto i = Bales.begin(), e = Bales.end(); i != e; ++i) + OS << ' ' << (*i)->getName(); + OS << ", operands:"; + for (auto i = Operands.begin(), e = Operands.end(); i != e; ++i) + if (*i) + OS << ' ' << (*i)->getName(); +} + +/*********************************************************************** + * copyFrom : copy this Liveness from the other one + */ +void Liveness::copyFrom(Liveness *Other) { + for (auto ci = cat_begin(), ce = cat_end(); ci != ce; ++ci) { + Values[ci].clear(); + Pressures[ci] = Other->Pressures[ci]; + } + Pressure = Other->Pressure; + copyValues(Other); +} + +/*********************************************************************** + * getValueSize : get the byte size of a value + * + * We round up to an even number of bytes as that's what we need for counting + * flag pressure, and we may as well do the same for normal pressure. 
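+ *
+ * For example, a single i1 flag (1 bit) counts as 2 bytes, a <16 x i1>
+ * predicate (16 bits) also counts as 2 bytes, and a <8 x i32> value
+ * (256 bits) counts as 32 bytes, while a genx.add.addr result counts as 0.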
+ */ +unsigned Liveness::getValueSize(Value *V) { + if (isAddr(V)) + switch (GenXIntrinsic::getGenXIntrinsicID(V)) { + case GenXIntrinsic::genx_add_addr: + return 0; + default: + break; + } + return (V->getType()->getPrimitiveSizeInBits() + 15) / 8U & -2U; +} + +/*********************************************************************** + * Liveness::addValue : add value to this liveness + */ +void Liveness::addValue(Value *V) { + auto Cat = GENERAL; + if (isFlag(V)) + Cat = FLAG; + else if (isAddr(V)) + Cat = ADDR; + if (Values[Cat].insert(V).second) { + Pressure += getValueSize(V); + Pressures[Cat] += getValueSize(V); + } +} + +/*********************************************************************** + * Liveness::removeValue : remove value from this liveness + * + * Return: true if the value was removed, false if it was not live anyway + */ +bool Liveness::removeValue(Value *V) { + auto Cat = GENERAL; + if (isFlag(V)) + Cat = FLAG; + else if (isAddr(V)) + Cat = ADDR; + if (!Values[Cat].erase(V)) + return false; + Pressure -= getValueSize(V); + Pressures[Cat] -= getValueSize(V); + return true; +} + +/*********************************************************************** + * Liveness::copyValues : copy values from Other into this liveness + */ +void Liveness::copyValues(Liveness *Other) { + for (auto ci = cat_begin(), ce = cat_end(); ci != ce; ++ci) { + for (auto i = Other->Values[ci].begin(), e = Other->Values[ci].end(); + i != e; ++i) + addValue(*i); + } +} + +/*********************************************************************** + * Liveness::print : debug print + */ +void Liveness::print(raw_ostream &OS) { + OS << "[addrpressure=" << Pressures[ADDR] + << ",flagpressure=" << Pressures[FLAG] << ",pressure=" << Pressure << ']'; + for (unsigned Cat = NUMCATS; Cat--; /*EMPTY*/) { + if (!Values[Cat].empty()) { + const char *CatName = (Cat == FLAG ? "flag." : + Cat == ADDR ? "addr." : ""); + OS << ' ' << CatName << "live:"; + for (auto i = begin(Cat), e = end(Cat); i != e; ++i) + OS << ' ' << (*i)->getName(); + } + } +} + +/*********************************************************************** + * PseudoCFG::Node::removeSucc : remove block from the node's successor + * list (if it is in the list at all) + * + * This is only used when removing edges to unstick ourselves when there is + * irreducible flow. + */ +void PseudoCFG::Node::removeSucc(BasicBlock *Succ) { + for (unsigned i = 0, e = Succs.size(); i != e; ++i) { + if (Succ == Succs[i]) { + Succs[i] = Succs[Succs.size() - 1]; + Succs.pop_back(); + break; + } + } +} + +/*********************************************************************** + * PseudoCFG::Node::removePred : remove block from the node's predecessor + * list (if it is in the list at all) + * + * The only case when this is possibly called with a Pred that is not on the + * list is when attempting to remove loop backedges but flow is irreducible. + * This happens in compute_first_def_bug_5. + */ +void PseudoCFG::Node::removePred(BasicBlock *Pred) { + for (unsigned i = 0, e = Preds.size(); i != e; ++i) { + if (Pred == Preds[i]) { + Preds[i] = Preds[Preds.size() - 1]; + Preds.pop_back(); + break; + } + } +} + +/*********************************************************************** + * PseudoCFG::compute : compute the pseudo CFG for the function + */ +void PseudoCFG::compute(Function *F, DominatorTree *DT, + LoopInfoBase *LI) { + clear(); + // Initialize the graph to the same as the CFG. While we're scanning the + // CFG, remember the natural loop backedges. 
+ SmallVector Backedges; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + auto TI = BB->getTerminator(); + // Remember BB if it is a backedge. + if (TI->getNumSuccessors() == 1 && DT->dominates(TI->getSuccessor(0), BB)) + Backedges.push_back(BB); + // Add the edges out of BB. + auto BBNode = getNode(BB); + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = TI->getSuccessor(i); + BBNode->Succs.push_back(Succ); + getNode(Succ)->Preds.push_back(BB); + } + } + // For each natural loop backedge, remove the backedge and add the loop + // exit edges. This is all the changes we need to make if the CFG is + // reducible. If it is irreducible, we will need to remove more edges + // when we derive the ordering. + for (unsigned i = 0, e = Backedges.size(); i != e; ++i) { + BasicBlock *BB = Backedges[i]; + auto BBNode = getNode(BB); + assert(BBNode->Succs.size() == 1 && + "expecting backedge to have one successor " + "as we have split critical edges"); + BasicBlock *Header = BBNode->Succs[0]; + BBNode->LoopHeader = Header; + BBNode->Succs.clear(); // This removes Header as BB's only successor. + getNode(Header)->removePred(BB); + Loop *L = LI->getLoopFor(Header); + SmallVector ExitBlocks; + assert(L); + L->getExitBlocks(ExitBlocks); + for (unsigned j = 0, je = ExitBlocks.size(); j != je; ++j) { + BasicBlock *Exit = ExitBlocks[j]; + BBNode->Succs.push_back(Exit); + getNode(Exit)->Preds.push_back(BB); + } + } + // Derive the ordering. + std::map Pending; + SmallVector Ready; + std::set Done; + Ready.push_back(&F->front()); + for (;;) { + if (Ready.empty()) { + if (Pending.empty()) + break; // finished + // We have got stuck. The CFG must be irreducible. Unstick ourselves + // by choosing the pending block that is earliest in the function, + // removing any pending edges from it, and making it ready. + BasicBlock *BB = nullptr; + for (auto fi = F->begin();; ++fi) { + BB = &*fi; + if (Pending.find(BB) != Pending.end()) + break; + } + std::set UnseenPreds; + for (auto ui = BB->use_begin(), ue = BB->use_end(); ui != ue; ++ui) { + auto Pred = cast(ui->getUser())->getParent(); + if (Done.find(Pred) != Done.end()) + continue; + UnseenPreds.insert(Pred); + } + for (auto i = UnseenPreds.begin(), e = UnseenPreds.end(); i != e; ++i) { + getNode(BB)->removePred(*i); + getNode(*i)->removeSucc(BB); + } + Pending.erase(BB); + Ready.push_back(BB); + continue; + } + // Pop a ready block off the stack. + auto BB = Ready.back(); + Ready.pop_back(); + Ordering.push_back(BB); + // For each successor, decrement the pending count. If it becomes 0, the + // successor becomes ready. + auto BBNode = getNode(BB); + for (auto si = BBNode->succ_begin(), se = BBNode->succ_end(); + si != se; ++si) { + BasicBlock *Succ = *si; + auto PendingEntry = &Pending[Succ]; + if (!*PendingEntry) { + // New entry in the pending map. Count the predecessors. + for (auto pi = getNode(Succ)->pred_begin(), + pe = getNode(Succ)->pred_end(); pi != pe; ++pi) + ++*PendingEntry; + } + if (--*PendingEntry) + continue; + // Successor needs to become ready. 
+ Pending.erase(Succ); + Ready.push_back(Succ); + } + } +} + +/*********************************************************************** + * PseudoCFG::print : print the pseudo-CFG + */ +void PseudoCFG::print(raw_ostream &OS) { + OS << "PseudoCFG:\n"; + for (auto i = Ordering.begin(), e = Ordering.end(); i != e; ++i) { + auto BB = *i; + auto Node = getNode(BB); + OS << BB->getName(); + if (Node->LoopHeader) + OS << " loop header " << Node->LoopHeader->getName(); + OS << "\n preds:"; + for (auto pi = Node->pred_begin(), pe = Node->pred_end(); pi != pe; ++pi) + OS << ' ' << (*pi)->getName(); + OS << "\n succs:"; + for (auto si = Node->succ_begin(), se = Node->succ_end(); si != se; ++si) + OS << ' ' << (*si)->getName(); + OS << '\n'; + } +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXEmulate.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXEmulate.cpp new file mode 100644 index 000000000000..3c6102f47e53 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXEmulate.cpp @@ -0,0 +1,174 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXEmulate +/// ----------- +/// +/// GenXEmulate is a mudule pass that emulates certain LLVM IR instructions. +/// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXSubtarget.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +using namespace llvm; +using namespace genx; + +namespace { + +class GenXEmulate : public ModulePass { + // Maps to its corresponding emulation function. + using OpType = std::pair; + std::map EmulationFuns; + const GenXSubtarget * ST = nullptr; + +public: + static char ID; + explicit GenXEmulate() : ModulePass(ID) {} + virtual StringRef getPassName() const { return "GenX emulation"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnModule(Module &M); + bool runOnFunction(Function &F); +private: + bool emulateInst(Instruction *Inst); + Function *getEmulationFunction(Instruction *Inst); + // Check if a function is to emulate instructions. + static bool isEmulationFunction(const Function* F) { + if (F->empty()) + return false; + if (F->hasFnAttribute("CMBuiltin")) + return true; + // FIXME: The above attribute is lost during SPIR-V translation. 
+ if (F->getName().contains("__cm_intrinsic_impl_")) + return true; + return false; + } +}; + +} // end namespace + +char GenXEmulate::ID = 0; +namespace llvm { +void initializeGenXEmulatePass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXEmulate, "GenXEmulate", "GenXEmulate", false, false) +INITIALIZE_PASS_END(GenXEmulate, "GenXEmulate", "GenXEmulate", false, false) + +ModulePass *llvm::createGenXEmulatePass() { + initializeGenXEmulatePass(*PassRegistry::getPassRegistry()); + return new GenXEmulate; +} + +void GenXEmulate::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); +} + +bool GenXEmulate ::runOnModule(Module &M) { + bool Changed = false; + EmulationFuns.clear(); + if (auto P = getAnalysisIfAvailable()) + ST = P->getSubtarget(); + + // Process non-builtin functions. + for (auto &F : M.getFunctionList()) { + if (!isEmulationFunction(&F)) + Changed |= runOnFunction(F); + } + + // Delete unuse builtins or make used builtins internal. + for (auto I = M.begin(); I != M.end();) { + Function &F = *I++; + if (isEmulationFunction(&F)) { + Changed = true; + if (F.use_empty()) + F.eraseFromParent(); + else + F.setLinkage(GlobalValue::InternalLinkage); + } + } + + return Changed; +} + +bool GenXEmulate::runOnFunction(Function &F) { + bool Changed = false; + for (auto &BB : F.getBasicBlockList()) { + for (auto I = BB.begin(); I != BB.end();) { + Instruction *Inst = &*I++; + Changed |= emulateInst(Inst); + } + } + return Changed; +} + +Function *GenXEmulate::getEmulationFunction(Instruction *Inst) { + unsigned Opcode = Inst->getOpcode(); + Type *Ty = Inst->getType(); + OpType OpAndType = std::make_pair(Opcode, Ty); + + // Check if this emulation function has been cached. + auto Iter = EmulationFuns.find(OpAndType); + if (Iter != EmulationFuns.end()) + return Iter->second; + + assert(ST && "subtarget expected"); + StringRef EmuFnName = ST->getEmulateFunction(Inst); + if (EmuFnName.empty()) + return nullptr; + + Module *M = Inst->getParent()->getParent()->getParent(); + for (auto &F : M->getFunctionList()) { + if (!isEmulationFunction(&F)) + continue; + if (F.getReturnType() != Inst->getType()) + continue; + StringRef FnName = F.getName(); + if (FnName.contains(EmuFnName)) { + EmulationFuns[OpAndType] = &F; + return &F; + } + } + + return nullptr; +} + +bool GenXEmulate::emulateInst(Instruction *Inst) { + Function *EmuFn = getEmulationFunction(Inst); + if (!EmuFn) + return false; + + assert(!isa(Inst) && "call emulation not supported yet"); + IRBuilder<> Builder(Inst); + SmallVector Args(Inst->operands()); + Value *EmuInst = Builder.CreateCall(EmuFn, Args); + Inst->replaceAllUsesWith(EmuInst); + Inst->eraseFromParent(); + return true; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXExtractVectorizer.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXExtractVectorizer.cpp new file mode 100644 index 000000000000..32f2cf3bfa02 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXExtractVectorizer.cpp @@ -0,0 +1,295 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + 
+The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXExtractVectorizer +/// --------------------- +/// +/// GenX extract vectorizer pass is stage 1 of the histogram optimization: if +/// there are multiple scalar rdregions from the same vector, all subject +/// to the same binary operator with constant rhs or the same trunc/zext/sext, +/// then they are combined into a vector version of the binary operator or +/// trunc/zext/sext, with scalar rdregions from the result of that. This is +/// designed to handle any trunc/zext/sext then scale of the index in the +/// histogram optimization, although it does also apply in a few other cases. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_ExtractVectorizer" + +#include "GenX.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +namespace { + +// GenX extract vectorizer pass +class GenXExtractVectorizer : public FunctionPass { + bool Modified = false; + DominatorTree *DT = nullptr; + SmallVector Extracted; + std::set ExtractedSet; + struct Extract { + Instruction *Inst; // the binary operator applied to the extracted element + int Offset; // constant offset from the rdregion + Extract(Instruction *Inst, int Offset) : Inst(Inst), Offset(Offset) {} + // Sort in offset order + bool operator<(const Extract &Other) const { return Offset < Other.Offset; } + }; + struct BucketIndex { + unsigned Opcode; + Type *CastTo; + Value *Indirect; + Type *ConvTy; + BucketIndex(unsigned Opcode, Type *CastTo, Value *Indirect, Type *ConvTy) + : Opcode(Opcode), CastTo(CastTo), Indirect(Indirect), ConvTy(ConvTy) {} + bool operator<(const BucketIndex &Other) const { + if (Opcode != Other.Opcode) + return Opcode < Other.Opcode; + if (CastTo != Other.CastTo) + return CastTo < Other.CastTo; + return Indirect < Other.Indirect; + } + }; +public: + static char ID; + explicit GenXExtractVectorizer() : FunctionPass(ID) { } + virtual StringRef getPassName() const { return "GenX Extract Vectorizer"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.setPreservesCFG(); + } + bool runOnFunction(Function &F); +private: + void processExtracted(Value *V); + void processBucket(const BucketIndex *BIdx, SmallVectorImpl *B); +}; + +}// end namespace llvm + + +char GenXExtractVectorizer::ID = 0; +namespace llvm { void initializeGenXExtractVectorizerPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXExtractVectorizer, "GenXExtractVectorizer", + "GenXExtractVectorizer", false, false) 
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(GenXExtractVectorizer, "GenXExtractVectorizer", + "GenXExtractVectorizer", false, false) + +// Publicly exposed interface to pass... +FunctionPass *llvm::createGenXExtractVectorizerPass() +{ + initializeGenXExtractVectorizerPass(*PassRegistry::getPassRegistry()); + return new GenXExtractVectorizer(); +} + +/*********************************************************************** + * runOnFunction : run the extract vectorizer for this Function + */ +bool GenXExtractVectorizer::runOnFunction(Function &F) +{ + DT = &getAnalysis().getDomTree(); + // Scan the code looking for vector values that have an extract (a rdregion + // of one element) applied. + for (auto fi = F.begin(), fe = F.end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (auto bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + if (!GenXIntrinsic::isRdRegion(Inst)) + continue; + if (isa(Inst->getType())) + continue; + Value *V = Inst->getOperand(0); + if (isa(V)) + continue; + if (ExtractedSet.insert(V).second) + Extracted.push_back(V); + } + } + ExtractedSet.clear(); + // Process each such vector. Processing a vector might result in another + // new vector being pushed onto Extracted, so that in turn will be processed. + while (!Extracted.empty()) { + Value *V = Extracted.back(); + Extracted.pop_back(); + processExtracted(V); + } + return Modified; +} + +/*********************************************************************** + * GenXExtractVectorizer::processExtracted : process an instruction or arg that + * has at least one scalar extracted from it (using rdregion), in the hope that + * we can vectorize it as the first stage of the histogram optimization + */ +void GenXExtractVectorizer::processExtracted(Value *V) +{ + // Gather the scalar extracting rdregion uses of V into buckets, one for + // each binaryoperator with constant rhs that the extracted value is used in. + std::map> Buckets; + for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (!GenXIntrinsic::isRdRegion(user)) + continue; // not rdregion + if (isa(user->getType())) + continue; // not rdregion with scalar result + if (!user->hasOneUse()) + continue; // rdregion not single use + auto Use2 = &*user->use_begin(); + auto User2 = cast(Use2->getUser()); + // We want User2 to be either a binary operator with constant rhs, + // or a trunc/zext/sext. + Type *CastTo = nullptr; + if (isa(User2)) { + if (!isa(User2->getOperand(1))) + continue; // binary operator has non-constant rhs + } else { + if (!isa(User2) || isa(User2)) + continue; // not trunc/zext/sext + CastTo = User2->getType(); + } + // Get the index, possibly as index+offset if the index is a balable add + // instruction. + Region R = Region::getWithOffset(user); + // Add to the bucket. The bucket is indexed by: + // - the opcode of the binaryoperator or trunc/zext/sext using the + // extracted value + // - the type being trunc/zext/sext to + // - any variable part of the rdregion index + // The Extract pushed into the bucket contains: + // - the binaryoperator itself (from which we can find the rdregion) + // - the constant offset part of the rdregion index. + Buckets[BucketIndex(User2->getOpcode(), CastTo, R.Indirect, User2->getType())] + .push_back(Extract(User2, R.Offset)); + } + // Now look at each bucket. Only bother with a bucket that has at least four + // scalar extracts in it. 
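+  // Illustrative sketch only (values, offsets and the abbreviated rdregion
+  // form are invented for the example): a bucket of four such extracts from
+  // <16 x i32> %v, each feeding a multiply by a constant, e.g.
+  //   %e0 = rdregion(%v, byte offset 0)  ; %m0 = mul i32 %e0, 4
+  //   %e1 = rdregion(%v, byte offset 4)  ; %m1 = mul i32 %e1, 4
+  //   %e2 = rdregion(%v, byte offset 8)  ; %m2 = mul i32 %e2, 4
+  //   %e3 = rdregion(%v, byte offset 12) ; %m3 = mul i32 %e3, 4
+  // is turned by processBucket into one 4-wide rdregion of %v, a single
+  // "mul <4 x i32> ..., <i32 4, i32 4, i32 4, i32 4>", and four scalar
+  // rdregions of that result which replace %m0..%m3.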
+ for (auto i = Buckets.begin(), e = Buckets.end(); i != e; ++i) { + auto Bucket = &i->second; + if (Bucket->size() < 4) + continue; + processBucket(&i->first, Bucket); + } +} + +/*********************************************************************** + * GenXExtractVectorizer::processBucket : process one bucket of extracts from + * the same vector + * + * The bucket contains at least 4 instances of a binary operator whose rhs + * is constant and whose lhs is an extract (a scalar rdregion) from the same + * vector. Either each index is constant, or each index is an add with constant + * rhs and with the same lhs. + */ +void GenXExtractVectorizer::processBucket(const BucketIndex *BIdx, + SmallVectorImpl *B) +{ + // Sort the extracts into offset order. + std::sort(B->begin(), B->end()); + // See if we have a sequence of offsets such that we can construct a + // 1D region. + int Diff = (*B)[1].Offset - (*B)[0].Offset; + for (unsigned j = 1, je = B->size() - 1; j != je; ++j) + if ((*B)[j + 1].Offset - (*B)[j].Offset != Diff) + return; + // Find the latest point that we can insert the vectorized instruction. + SmallVector Insts; + for (auto j = B->begin(), je = B->end(); j != je; ++j) + Insts.push_back(j->Inst); + auto InsertBefore = findClosestCommonDominator(DT, Insts); + // Create the new rdregion. + Extract *Extract0 = &(*B)[0]; + Region R(Extract0->Inst->getOperand(0)); + R.NumElements = R.Width = B->size(); + R.Stride = Diff / R.ElementBytes; + R.Indirect = BIdx->Indirect; + R.Offset = Extract0->Offset; + Value *OrigVector = cast(Extract0->Inst->getOperand(0)) + ->getOperand(0); + Value *NewRdRegion = OrigVector; + // Need to splat if Diff is 0, otherwise elements extracted are wrong. + if (Diff == 0 || R.Indirect || R.Offset || + R.NumElements != + cast(OrigVector->getType())->getNumElements()) { + // Not identity region. + NewRdRegion = R.createRdRegion(OrigVector, + Extract0->Inst->getName() + ".histogrammed", InsertBefore, + Extract0->Inst->getDebugLoc(), /*AllowScalar=*/false); + } + // Create the vectorized binary operator or trunc/zext/sext. + Instruction *NewInst = nullptr; + if (isa(Extract0->Inst)) { + // Create a vector of the constants used in the right side of the binary + // operators. + SmallVector RhsConsts; + for (auto j = B->begin(), je = B->end(); j != je; ++j) + RhsConsts.push_back(cast(j->Inst->getOperand(1))); + auto CV = ConstantVector::get(RhsConsts); + NewInst = BinaryOperator::Create( + (Instruction::BinaryOps)Extract0->Inst->getOpcode(), NewRdRegion, CV, + Extract0->Inst->getName() + ".histogrammed", InsertBefore); + } else { + // Create the vectorized trunc/zext/sext. + auto VT = VectorType::get(Extract0->Inst->getType(), B->size()); + NewInst = CastInst::Create((Instruction::CastOps)Extract0->Inst->getOpcode(), + NewRdRegion, VT, + Extract0->Inst->getName() + ".histogrammed", InsertBefore); + } + NewInst->setDebugLoc(Extract0->Inst->getDebugLoc()); + // For each original scalar binary operator or cast, create a rdregion to + // extract the equivalent scalar from the result of the vectorized binary + // operator, and use it to replace uses of the original binary operator. 
+ for (auto j = B->begin(), je = B->end(); j != je; ++j) { + Region R2(NewInst); + R2.NumElements = R2.Width = 1; + R2.Offset = (j - B->begin()) * R2.ElementBytes; + auto NewRdRegion2 = R2.createRdRegion(NewInst, "", + InsertBefore, j->Inst->getDebugLoc(), /*AllowScalar=*/true); + NewRdRegion2->takeName(j->Inst); + j->Inst->replaceAllUsesWith(NewRdRegion2); + } + for (auto j = B->begin(), je = B->end(); j != je; ++j) { + auto OldRdRegion = cast(j->Inst->getOperand(0)); + j->Inst->eraseFromParent(); + OldRdRegion->eraseFromParent(); + } + // Add the new vectorized binary operator or cast back into + // ExtractVectorizer so the extracts we added could in turn be vectorized. + Extracted.push_back(NewInst); + Modified = true; +} + + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXFuncPtrsLowering.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXFuncPtrsLowering.cpp new file mode 100644 index 000000000000..9558b6fddb37 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXFuncPtrsLowering.cpp @@ -0,0 +1,364 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// This pass lowers all function pointers related instructions +// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXIntrinsics.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvmWrapper/IR/InstrTypes.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +using namespace llvm; +using namespace genx; + +namespace { + +// Function pointers lowering consists of two stages: +// 1. Collect all instruction that use function pointers and their users that +// have to be modified +// 2. Actually modify the instructions collected: +// - reconstruct existing wrr/rdr instrinsics (remove internal casts, use i64 +// types) +// - create new wrr/rdrs where necessary, e.g. 
as a select args for further +// baling to succeed +// - reconstruct all funcptrs-related phis +// - update all users of the instruction modified (may insert additional +// casts where necessary, +// e.g. ptrtoint for wrr to +// indirect call) + +class GenXFunctionPointersLowering : public ModulePass { + SetVector InstToProcess; + std::map PhisIndex; + + const DataLayout *DL; + LLVMContext *Ctx; + + bool IsFuncPointerVec(Value *V, SetVector *Funcs = nullptr); + + void collectFuncUsers(User *U); + void collectFuncUsers(IGCLLVM::CallInst *CI); + void collectFuncUsers(PHINode *Phi); + void collectFuncUsers(CastInst *Phi); + void collectFuncUsers(SelectInst *SI); + + void reconstructCall(CallInst *CI); + void reconstructPhi(PHINode *Phi); + void reconstructSelect(SelectInst *SI); + + void replaceAllUsersCommon(Instruction *Old, Instruction *New); + + Value *transformFuncPtrVec(Value *V); + +public: + static char ID; + explicit GenXFunctionPointersLowering(); + StringRef getPassName() const override { + return "GenX function pointers lowering"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnModule(Module &M) override; +}; + +} // namespace + +char GenXFunctionPointersLowering::ID = 0; +namespace llvm { +void initializeGenXFunctionPointersLoweringPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXFunctionPointersLowering, + "GenXFunctionPointersLowering", + "GenXFunctionPointersLowering", false, false) +INITIALIZE_PASS_DEPENDENCY(FunctionGroupAnalysis) +INITIALIZE_PASS_DEPENDENCY(GenXModule) +INITIALIZE_PASS_END(GenXFunctionPointersLowering, + "GenXFunctionPointersLowering", + "GenXFunctionPointersLowering", false, false) + +GenXFunctionPointersLowering::GenXFunctionPointersLowering() : ModulePass(ID) { + initializeGenXFunctionPointersLoweringPass(*PassRegistry::getPassRegistry()); +} + +ModulePass *llvm::createGenXFunctionPointersLoweringPass() { + return new GenXFunctionPointersLowering(); +} + +void GenXFunctionPointersLowering::getAnalysisUsage(AnalysisUsage &AU) const { + ModulePass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesCFG(); +} + +bool GenXFunctionPointersLowering::runOnModule(Module &M) { + bool Modified = false; + + for (auto &F : M) + if (F.hasAddressTaken()) { + F.addFnAttr(genx::FunctionMD::CMStackCall); + F.addFnAttr(genx::FunctionMD::ReferencedIndirectly); + } + + for (auto &F : M) + if (F.hasFnAttribute("referenced-indirectly")) + for (auto *U : F.users()) + collectFuncUsers(U); + + Ctx = &M.getContext(); + DL = &M.getDataLayout(); + for (auto *TI : InstToProcess) { + if (auto *Phi = dyn_cast(TI)) + reconstructPhi(Phi); + else if (auto *CI = dyn_cast(TI)) + reconstructCall(CI); + else if (auto *SI = dyn_cast(TI)) + reconstructSelect(SI); + else + assert(0 && "Unsupported instruction to process"); + } + return Modified; +} + +void GenXFunctionPointersLowering::collectFuncUsers(User *U) { + if (auto *CI = dyn_cast(U)) + collectFuncUsers(CI); + else if (auto *C = dyn_cast(U)) + collectFuncUsers(C); + else if (auto *Phi = dyn_cast(U)) + collectFuncUsers(Phi); + else if (auto *SI = dyn_cast(U)) + collectFuncUsers(SI); + else if (auto *ICmp = dyn_cast(U)) { + // skip + } else if (auto *EE = dyn_cast(U)) { + collectFuncUsers(EE); + } else if (isa(U)) + for (auto *UU : U->users()) + collectFuncUsers(UU); + else { + assert(0 && "unsupported funcptr user"); + } +} + +void GenXFunctionPointersLowering::collectFuncUsers(IGCLLVM::CallInst *CI) { + if (!CI->isIndirectCall() && + 
(GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_rdregioni || + GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_wrregioni)) { + InstToProcess.insert(CI); + + for (auto *U : CI->users()) + collectFuncUsers(U); + } +} + +// do not process bitcast itself, after our transformations +// it should become dead and will be swept +void GenXFunctionPointersLowering::collectFuncUsers(CastInst *BC) { + for (auto *U : BC->users()) + collectFuncUsers(U); +} + +void GenXFunctionPointersLowering::collectFuncUsers(PHINode *Phi) { + InstToProcess.insert(Phi); + + for (auto *U : Phi->users()) + collectFuncUsers(U); +} + +void GenXFunctionPointersLowering::collectFuncUsers(SelectInst *SI) { + InstToProcess.insert(SI); + + if (!SI->getType()->getScalarType()->isIntegerTy(64)) + for (auto *U : SI->users()) + collectFuncUsers(U); +} + +void GenXFunctionPointersLowering::replaceAllUsersCommon(Instruction *Old, + Instruction *New) { + while (!Old->use_empty()) { + auto *U = Old->user_back(); + if (auto *CIU = dyn_cast(U)) { + if (CIU->getCalledOperand() == Old) { + auto *IntToPtr = CastInst::CreateBitOrPointerCast( + New, CIU->getCalledOperand()->getType(), "", CIU); + CIU->replaceUsesOfWith(Old, IntToPtr); + } else if (GenXIntrinsic::getAnyIntrinsicID(CIU->getCalledFunction()) == + GenXIntrinsic::genx_rdregioni || + GenXIntrinsic::getAnyIntrinsicID(CIU->getCalledFunction()) == + GenXIntrinsic::genx_wrregioni || + CIU->getCalledOperand() != Old) { + CIU->replaceUsesOfWith(Old, New); + } else + assert(0 && "unsupported call of a function pointer"); + } else if (isa(U) || isa(U)) { + U->replaceUsesOfWith(Old, New); + } else if (auto *Phi = dyn_cast(U)) { + Phi->replaceUsesOfWith(Old, New); + PhisIndex[Phi]++; + } else { + assert(0 && "Unsupported function pointer user\n"); + } + } + Old->eraseFromParent(); +} + +void GenXFunctionPointersLowering::reconstructCall(CallInst *CI) { + assert(GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_rdregioni || + GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_wrregioni); + Region R(Type::getInt64Ty(*Ctx)); + unsigned OffIdx = GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_rdregioni + ? 
4 + : 5; + if (!isa(CI->getOperand(OffIdx))) + R.Indirect = CI->getOperand(OffIdx); + else + R.Offset = cast(CI->getOperand(OffIdx))->getZExtValue(); + Instruction *Result = nullptr; + if (GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_rdregioni) { + Result = cast( + R.createRdRegion(transformFuncPtrVec(CI->getOperand(0)), CI->getName(), + CI, CI->getDebugLoc(), true)); + } else if (GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_wrregioni) + Result = cast( + R.createWrRegion(transformFuncPtrVec(CI->getOperand(0)), + transformFuncPtrVec(CI->getOperand(1)), CI->getName(), + CI, CI->getDebugLoc())); + if (Result->getType() == CI->getType()) + return; + replaceAllUsersCommon(CI, Result); +} + +void GenXFunctionPointersLowering::reconstructPhi(PHINode *Phi) { + for (auto *Op : Phi->operand_values()) { + auto *OpTr = transformFuncPtrVec(Op); + Phi->replaceUsesOfWith(Op, OpTr); + if (OpTr != Op) + PhisIndex[Phi]++; + } + assert(Phi->getNumOperands() > 0 && Phi->getNumOperands() == PhisIndex[Phi]); + Type *NewTy = Phi->value_op_begin()->getType(); + assert(std::all_of(Phi->value_op_begin(), Phi->value_op_end(), + [&NewTy](Value *V) { return V->getType() == NewTy; })); + auto *NewPhi = PHINode::Create(NewTy, 0, Phi->getName(), Phi); + for (unsigned i = 0; i < Phi->getNumIncomingValues(); ++i) + NewPhi->addIncoming(Phi->getIncomingValue(i), Phi->getIncomingBlock(i)); + while (!Phi->user_empty()) { + // already checked that this is only wrr/rdr + auto *U = Phi->user_back(); + U->replaceUsesOfWith(Phi, NewPhi); + } + Phi->eraseFromParent(); +} + +void GenXFunctionPointersLowering::reconstructSelect(SelectInst *SI) { + Value *TVal = nullptr, *FVal = nullptr; + Region R1(SI->getTrueValue()->getType(), DL), + R2(SI->getFalseValue()->getType(), DL); + auto *BCT = BitCastInst::CreateBitOrPointerCast( + transformFuncPtrVec(SI->getTrueValue()), Type::getInt64Ty(*Ctx), "", SI); + BCT = BitCastInst::CreateBitOrPointerCast( + BCT, VectorType::get(Type::getInt64Ty(*Ctx), 1), "", SI); + auto *BCF = BitCastInst::CreateBitOrPointerCast( + transformFuncPtrVec(SI->getFalseValue()), Type::getInt64Ty(*Ctx), "", SI); + BCF = BitCastInst::CreateBitOrPointerCast( + BCF, VectorType::get(Type::getInt64Ty(*Ctx), 1), "", SI); + R1.Width = (SI->getTrueValue()->getType()->isVectorTy()) + ? SI->getTrueValue()->getType()->getVectorNumElements() + : 1; + R1.Width = (SI->getFalseValue()->getType()->isVectorTy()) + ? 
SI->getFalseValue()->getType()->getVectorNumElements() + : 1; + R1.Stride = 0, R1.VStride = 0; + R2.Stride = 0, R2.VStride = 0; + TVal = R1.createRdRegion(BCT, SI->getName(), SI, SI->getDebugLoc(), true); + FVal = R2.createRdRegion(BCF, SI->getName(), SI, SI->getDebugLoc(), true); + auto *NewSI = SelectInst::Create(SI->getCondition(), TVal, FVal, "", SI); + if (SI->getType() == NewSI->getType()) + SI->replaceAllUsesWith(NewSI); + else + replaceAllUsersCommon(SI, NewSI); +} + +Value *GenXFunctionPointersLowering::transformFuncPtrVec(Value *V) { + // quite often wrr/rdr get bitcast of funcptrs to as input, + // here we simply don't need them and DCE will sweep them later + auto Int64Ty = Type::getInt64Ty(*Ctx); + if (isa(V)) { + assert(V->getType()->isSingleValueType()); + if (V->getType()->getScalarType()->isIntegerTy(64)) + return V; + else if (V->getType()->isVectorTy()) + return UndefValue::get( + VectorType::get(Int64Ty, V->getType()->getVectorNumElements())); + else + return UndefValue::get(Int64Ty); + } + if (isa(V) && + cast(V)->getOpcode() == Instruction::BitCast) + V = cast(V)->getOperand(0); + else if (auto *BC = dyn_cast(V)) + if (!(BC->getType()->isVectorTy() && + BC->getType()->getScalarType() == BC->getOperand(0)->getType())) + V = BC->getOperand(0); + SetVector Funcs; + if (!isFuncPointerVec(V, &Funcs)) + return V; + assert(Funcs.size() > 0); + + assert(V->getType()->isVectorTy()); + std::vector CF; + for (auto &Val : Funcs) + CF.push_back(ConstantExpr::getPtrToInt(cast(Val), Int64Ty)); + Value *NewVal = nullptr; + // generate i64 instead of <1 x i64> + if (CF.size() > 1) + NewVal = ConstantVector::get(CF); + else if (CF.size() == 1) + NewVal = CF.front(); + return NewVal; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXGEPLowering.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXGEPLowering.cpp new file mode 100644 index 000000000000..36a2a2b8475b --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXGEPLowering.cpp @@ -0,0 +1,324 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +/// GenXGEPLowering +/// --------------- +/// +/// GenXGEPLowering is a function pass that lowers GEP instructions into +/// primitive ones that the rest of the GenX backend can deal with. 
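+///
+/// As an illustrative sketch (not output copied from the pass), a GEP such as
+///
+///   %p = getelementptr i32, i32* %base, i64 %idx
+///
+/// is lowered to plain integer arithmetic on the pointer value:
+///
+///   %b = ptrtoint i32* %base to i64
+///   %o = shl i64 %idx, 2          ; element size 4 is a power of two
+///   %a = add i64 %b, %o
+///   %p = inttoptr i64 %a to i32*
+///
+/// Struct field indices become constant byte offsets, and an index of the
+/// form "a + b" where the base and "a" are loop-invariant may be
+/// reassociated into (base + (a << shift)) + (b << shift) so the invariant
+/// half can be hoisted later.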
+/// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXModule.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" + +using namespace llvm; +using namespace genx; + +namespace { +class GenXGEPLowering : public FunctionPass { + const DataLayout *DL = nullptr; + LoopInfo *LI = nullptr; + IRBuilder<> *Builder = nullptr; + +public: + static char ID; + + GenXGEPLowering() : FunctionPass(ID) {} + + virtual StringRef getPassName() const override { return "GenX GEP Lowering"; } + + virtual bool runOnFunction(Function &F) override; + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.setPreservesCFG(); + AU.addPreserved(); + AU.addPreserved(); + } + +private: + bool lowerGetElementPtrInst(GetElementPtrInst *GEP, + BasicBlock::iterator &BBI) const; + Value *truncExpr(Value *Val, Type *NewTy) const; + Value *getSExtOrTrunc(Value *, Type *) const; +}; + +} // namespace + +char GenXGEPLowering::ID = 0; +namespace llvm { void initializeGenXGEPLoweringPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXGEPLowering, "GenXGEPLowering", "GenXGEPLowering", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(GenXGEPLowering, "GenXGEPLowering", "GenXGEPLowering", false, false) + +FunctionPass *llvm::createGenXGEPLoweringPass() { + initializeGenXGEPLoweringPass(*PassRegistry::getPassRegistry()); + return new GenXGEPLowering; +} + +bool GenXGEPLowering::runOnFunction(Function &F) { + LI = &getAnalysis().getLoopInfo(); + DL = &F.getParent()->getDataLayout(); + + const TargetTransformInfo &TTI = + getAnalysis().getTTI(F); + auto FlatAddrSpace = TTI.getFlatAddressSpace(); + + assert(DL && "null datalayout"); +#if 0 + // a good place to fix block layout + if (LI->empty()) + LayoutBlocks(F); + else + LayoutBlocks(F, *LI); +#endif + IRBuilder<> TheBuilder(F.getContext()); + Builder = &TheBuilder; + + bool Changed = false; + for (auto &BB : F) { + for (auto BI = BB.begin(), BE = BB.end(); BI != BE;) { + Instruction *Inst = &(*BI++); + Builder->SetInsertPoint(Inst); + + switch (Inst->getOpcode()) { + default: // By default, DO NOTHING + break; + case Instruction::GetElementPtr: + Changed |= lowerGetElementPtrInst(cast(Inst), BI); + break; + case Instruction::PtrToInt: + auto PtrV = cast(Inst)->getPointerOperand(); + auto AddrSpace = cast(Inst)->getPointerAddressSpace(); + if (AddrSpace == FlatAddrSpace) { + if (auto PtrCast = dyn_cast(PtrV)) { + // this is no-op AddrSpaceCast, should be removed + // create a new PtrToInt from the original pointer + // bypass the AddrSpaceCast and PtrToInt + auto P2I = Builder->CreatePtrToInt(PtrCast->getOperand(0), Inst->getType()); + Inst->replaceAllUsesWith(P2I); + Inst->eraseFromParent(); + if (PtrCast->use_empty()) { + PtrCast->eraseFromParent(); + } + } + } + break; + } + } + } + Builder = nullptr; + + return Changed; +} + +bool GenXGEPLowering::lowerGetElementPtrInst(GetElementPtrInst *GEP, + BasicBlock::iterator &BBI) const { + assert(Builder); + Value *PtrOp = GEP->getPointerOperand(); + PointerType *PtrTy = dyn_cast(PtrOp->getType()); + assert(PtrTy && "Only accept scalar pointer!"); + + unsigned PtrSizeInBits = 
DL->getPointerSizeInBits(PtrTy->getAddressSpace()); + unsigned PtrMathSizeInBits = PtrSizeInBits; + auto IntPtrTy = IntegerType::get(Builder->getContext(), PtrSizeInBits); + auto PtrMathTy = IntegerType::get(Builder->getContext(), PtrMathSizeInBits); + + // Check if the pointer itself is created from IntToPtr. If it is, and if + // the int is the same size, we can use the int directly. Otherwise, we + // need to add PtrToInt. + Value *BasePointer = nullptr; + if (IntToPtrInst *I2PI = dyn_cast(PtrOp)) { + Value *IntOp = I2PI->getOperand(0); + if (IntOp->getType() == IntPtrTy) + BasePointer = IntOp; + } + if (!BasePointer) + BasePointer = Builder->CreatePtrToInt(PtrOp, IntPtrTy); + + // This is the value of the pointer, which will ultimately replace gep. + Value *PointerValue = BasePointer; + + Type *Ty = PtrTy; + gep_type_iterator GTI = gep_type_begin(GEP); + for (auto OI = GEP->op_begin() + 1, E = GEP->op_end(); OI != E; ++OI, ++GTI) { + Value *Idx = *OI; + if (StructType *StTy = GTI.getStructTypeOrNull()) { + unsigned Field = unsigned(cast(Idx)->getZExtValue()); + if (Field) { + uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field); + Value *OffsetVal = Builder->getInt(APInt(PtrMathSizeInBits, Offset)); + PointerValue = Builder->CreateAdd(PointerValue, OffsetVal); + } + Ty = StTy->getElementType(Field); + } else { + Ty = GTI.getIndexedType(); + if (const ConstantInt *CI = dyn_cast(Idx)) { + if (!CI->isZero()) { + uint64_t Offset = DL->getTypeAllocSize(Ty) * CI->getSExtValue(); + Value *OffsetVal = Builder->getInt(APInt(PtrMathSizeInBits, Offset)); + PointerValue = Builder->CreateAdd(PointerValue, OffsetVal); + } + } else { + Value *NewIdx = getSExtOrTrunc(Idx, PtrMathTy); + APInt ElementSize = APInt(PtrMathSizeInBits, DL->getTypeAllocSize(Ty)); + + if (BinaryOperator *BO = dyn_cast(NewIdx)) { + // Detect the pattern GEP base, a + b where base and a are both loop + // invariant (but not b), so we could rearrange the lowered code into + // (base + (a << shftAmt)) + (b << shftAmt). + Loop *L = LI ? LI->getLoopFor(BO->getParent()) : nullptr; + if (L && L->isLoopInvariant(PtrOp) && + BO->getOpcode() == Instruction::Add) { + + auto reassociate = [&](Value *A, Value *B) { + Value *InvVal = nullptr; + if (ElementSize == 1) + InvVal = A; + else if (ElementSize.isPowerOf2()) + InvVal = Builder->CreateShl( + A, APInt(PtrMathSizeInBits, ElementSize.logBase2())); + else + InvVal = Builder->CreateMul(A, Builder->getInt(ElementSize)); + PointerValue = Builder->CreateAdd(PointerValue, InvVal); + NewIdx = B; + }; + + Value *LHS = BO->getOperand(0); + Value *RHS = BO->getOperand(1); + bool isLHSLI = L->isLoopInvariant(LHS); + bool isRHSLI = L->isLoopInvariant(RHS); + if (isLHSLI && !isRHSLI) + reassociate(LHS, RHS); + else if (!isLHSLI && isRHSLI) + reassociate(RHS, LHS); + } + } + if (ElementSize == 1) { + // DO NOTHING. 
+ } else if (ElementSize.isPowerOf2()) { + APInt ShiftAmount = APInt(PtrMathSizeInBits, ElementSize.logBase2()); + NewIdx = Builder->CreateShl(NewIdx, ShiftAmount); + } else + NewIdx = Builder->CreateMul(NewIdx, Builder->getInt(ElementSize)); + + PointerValue = Builder->CreateAdd(PointerValue, NewIdx); + } + } + } + + PointerValue = Builder->CreateIntToPtr(PointerValue, GEP->getType()); + GEP->replaceAllUsesWith(PointerValue); + GEP->eraseFromParent(); + if (Instruction *I = dyn_cast(PointerValue)) { + BBI = BasicBlock::iterator(I); + ++BBI; + } + + return true; +} + +Value *GenXGEPLowering::getSExtOrTrunc(Value *Val, Type *NewTy) const { + assert(Builder); + Type *OldTy = Val->getType(); + unsigned OldWidth = OldTy->getIntegerBitWidth(); + unsigned NewWidth = NewTy->getIntegerBitWidth(); + + if (OldWidth < NewWidth) // SExt + return Builder->CreateSExt(Val, NewTy); + if (OldWidth > NewWidth) // Trunc + return truncExpr(Val, NewTy); + return Val; +} + +Value *GenXGEPLowering::truncExpr(Value *Val, Type *NewTy) const { + assert(Builder); + // Truncation on Gen could be as cheap as NOP by creating proper regions. + // Instead of truncating the value itself, truncate how it's calculated. + if (Constant *C = dyn_cast(Val)) + return Builder->CreateIntCast(C, NewTy, false); + + if (!isa(Val)) + return Builder->CreateTrunc(Val, NewTy); + + Instruction *I = cast(Val); + unsigned Opc = I->getOpcode(); + switch (Opc) { + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + BinaryOperator *BO = cast(I); + Value *LHS = truncExpr(BO->getOperand(0), NewTy); + Value *RHS = truncExpr(BO->getOperand(1), NewTy); + return Builder->CreateBinOp(BO->getOpcode(), LHS, RHS); + } + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: { + Value *Opnd = I->getOperand(0); + if (Opnd->getType() == NewTy) + return Opnd; + return Builder->CreateIntCast(Opnd, NewTy, Opc == Instruction::SExt); + } + case Instruction::Select: { + Value *TVal = truncExpr(I->getOperand(1), NewTy); + Value *FVal = truncExpr(I->getOperand(2), NewTy); + return Builder->CreateSelect(I->getOperand(0), TVal, FVal); + } +#if 0 + // TODO: Rewrite truncExpr into iterative one instead of recursive one to + // easily found the loop due to phi-node. + case Instruction::PHI: { + PHINode *PN = cast(I); + PHINode *Res = PHINode::Create(NewTy, PN->getNumIncomingValues()); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + Value *V = truncExpr(PN->getIncomingValue(i), NewTy); + Res->addIncoming(V, PN->getIncomingBlock(i)); + } + return Res; + } +#endif + default: + // Don't know truncate its calculation safely, fall back to the regular way. 
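+    // (Illustrative: truncating "add i64 (sext i32 %a to i64), 16" to i32 is
+    // handled by the cases above and rebuilt as "add i32 %a, 16"; only
+    // expressions not matched above take this fallback trunc.)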
+ break; + } + + return Builder->CreateTrunc(Val, NewTy); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.cpp new file mode 100644 index 000000000000..388b3297a293 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.cpp @@ -0,0 +1,332 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// Utility functions relating to SIMD CF goto/join. +// +//===----------------------------------------------------------------------===// +#include "GenXGotoJoin.h" +#include "GenX.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/ADT/SetVector.h" + +using namespace llvm; +using namespace genx; + +/*********************************************************************** + * isEMValue : detect whether a value is an EM (execution mask) + * + * It is an EM value if it is an extractvalue instruction extracting element + * 0 from the struct returned by goto/join. + */ +bool GotoJoin::isEMValue(Value *V) +{ + if (auto EI = dyn_cast(V)) { + if (EI->getIndices()[0] == 0/* element number of EM in goto/join struct */) { + switch (GenXIntrinsic::getGenXIntrinsicID(EI->getAggregateOperand())) { + case GenXIntrinsic::genx_simdcf_goto: + case GenXIntrinsic::genx_simdcf_join: + return true; + default: + break; + } + } + } + return false; +} + +/*********************************************************************** + * findJoin : given a goto, find the join whose RM it modifies + * + * Return: the join instruction, 0 if join not found + */ +CallInst *GotoJoin::findJoin(CallInst *Goto) +{ + // Find the RM value from the goto. We know that the only + // uses of the goto are extracts. + ExtractValueInst *RM = nullptr; + for (auto ui = Goto->use_begin(), ue = Goto->use_end(); ui != ue; ++ui) { + auto Extract = dyn_cast(ui->getUser()); + if (Extract && Extract->getIndices()[0] == 1/* RM index in struct */) { + RM = Extract; + break; + } + } + if (!RM) + return nullptr; + // Find the single use of the RM in a join, possibly via phi nodes and + // other goto instructions. 
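+  // Illustrative shape of the chain being followed (not literal IR):
+  //   %g0  = goto(..., %RM, ...)        ; the goto we started from
+  //   %rm0 = extractvalue %g0, 1        ; its updated RM
+  //   %g1  = goto(..., %rm0, ...)       ; possibly more gotos and/or phis
+  //   %rm1 = extractvalue %g1, 1
+  //   %j   = join(..., %rm1, ...)       ; the join being searched for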
+ CallInst *Join = nullptr; + SetVector RMVals; + RMVals.insert(RM); + for (unsigned ri = 0; !Join && ri != RMVals.size(); ++ri) { + auto RM = RMVals[ri]; + for (auto ui = RM->use_begin(), ue = RM->use_end(); + !Join && ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (isa(User)) + RMVals.insert(User); + else switch (GenXIntrinsic::getGenXIntrinsicID(User)) { + case GenXIntrinsic::genx_simdcf_join: + // We have found the join the RM is for. + Join = cast(User); + break; + case GenXIntrinsic::genx_simdcf_goto: { + // This is another goto that modifies the same RM. Find the + // extractvalue for the updated RM value. + ExtractValueInst *Extract = nullptr; + for (auto gui = User->use_begin(), gue = User->use_end(); + gui != gue; ++gui) { + auto ThisExtract = dyn_cast(gui->getUser()); + if (ThisExtract + && ThisExtract->getIndices()[0] == 1/*RM index in struct*/) { + Extract = ThisExtract; + break; + } + } + if (Extract) + RMVals.insert(Extract); + break; + } + default: + return nullptr; // unexpected use of RM + } + } + } + return Join; +} + +/*********************************************************************** + * isValidJoin : check that a join is valid + * + * In a block that is a join label (the "true" successor of a goto/join), there + * must be a join at the start of the block, ignoring phi nodes and bitcasts + * (which generate no code). + * + */ +bool GotoJoin::isValidJoin(CallInst *Join) +{ + assert(GenXIntrinsic::getGenXIntrinsicID(Join) == GenXIntrinsic::genx_simdcf_join); + auto BB = Join->getParent(); + // If this block has a goto/join predecessor of which it is "true" successor, + // check that this block starts with a join -- not necessarily the join we + // were given. + if (!isJoinLabel(BB)) + return true; + auto Inst = BB->getFirstNonPHIOrDbg(); + while (isa(Inst)) + Inst = Inst->getNextNode(); + if (GenXIntrinsic::getGenXIntrinsicID(Inst) == GenXIntrinsic::genx_simdcf_join) + return true; + return false; +} + +/*********************************************************************** + * isBranchingJoinLabelBlock : check whether a block has a single join and + * is both a join label and a branching join + * + * This only works after GenXLateSimdCFConformance. + * + * For a block for which this returns true, a pass must not insert code. + */ +bool GotoJoin::isBranchingJoinLabelBlock(BasicBlock *BB) +{ + auto Join = isBranchingJoinBlock(BB); + if (!Join || Join != BB->getFirstNonPHIOrDbg()) + return false; + return isJoinLabel(BB); +} + +/*********************************************************************** + * getBranchingBlockForBB : if this block is "true" successor of branching + * goto/join then return this branching block. Otherwise return nullptr. + * + * Enter: BB = the basic block + * SkipCriticalEdgeSplitter = if true, skip a critical edge splitter + * block when trying to find a branching goto/join + * + * SkipCriticalEdgeSplitter only needs to be set when used from inside + * GenXSimdCFConformance, before it has removed critical edge splitter blocks + * that separate a branching goto/join and the join label. + * + * "true" successor of branching block has to be a join label if it is not + * empty. This function does not test that. 
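+ *
+ * Illustrative CFG shape (block and value names invented for the sketch):
+ *
+ *   PredBB:                      ; branching goto/join block
+ *     %g = goto/join ...
+ *     %c = extractvalue ...      ; scalar branch condition from %g
+ *     br i1 %c, label %BB, label %Fallthru
+ *
+ *   BB:                          ; "true" successor, i.e. the join label
+ *     %j = join ...              ; must start with a join (see isValidJoin)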
+ * + */ +BasicBlock *GotoJoin::getBranchingBlockForBB(BasicBlock *BB, + bool SkipCriticalEdgeSplitter) { + for (auto ui = BB->use_begin(), ue = BB->use_end(); ui != ue; ++ui) { + auto PredBr = dyn_cast(ui->getUser()); + if (!PredBr || ui->getOperandNo() != PredBr->getNumOperands() - 1) + continue; + // PredBr is a branch that has BB as its "true" successor. First skip a + // critical edge splitter. + auto PredBB = PredBr->getParent(); + if (SkipCriticalEdgeSplitter && PredBr->getNumSuccessors() == 1 + && PredBr == PredBB->getFirstNonPHIOrDbg() && PredBB->hasOneUse()) { + auto ui2 = PredBB->use_begin(); + PredBr = dyn_cast(ui2->getUser()); + if (!PredBr || ui2->getOperandNo() != PredBr->getNumOperands() - 1) + continue; + PredBB = PredBr->getParent(); + } + // Check to see if it is a goto/join. + if (isBranchingGotoJoinBlock(PredBB)) + return PredBB; + } + return nullptr; +} + +/*********************************************************************** + * isJoinLabel : check whether this block needs to be a join label, because + * it is the "true" successor of at least one goto/join branch + * + * See getBranchingBlockForBB for details. + * + */ +bool GotoJoin::isJoinLabel(BasicBlock *BB, bool SkipCriticalEdgeSplitter) { + return getBranchingBlockForBB(BB, SkipCriticalEdgeSplitter); +} + +/*********************************************************************** + * isGotoBlock : see if a basic block is a goto block (hence branching), + * returning the goto if so + * + * See the comment at the top of isBranchingGotoJoinBlock regarding the case + * of a goto with an unconditional branch. + */ +CallInst *GotoJoin::isGotoBlock(BasicBlock *BB) +{ + auto Goto = isBranchingGotoJoinBlock(BB); + if (GenXIntrinsic::getGenXIntrinsicID(Goto) != GenXIntrinsic::genx_simdcf_goto) + Goto = nullptr; + return Goto; +} + +/*********************************************************************** + * isBranchingJoinBlock : see if a basic block is a branching + * join block, returning the join if so + */ +CallInst *GotoJoin::isBranchingJoinBlock(BasicBlock *BB) +{ + auto Join = isBranchingGotoJoinBlock(BB); + if (GenXIntrinsic::getGenXIntrinsicID(Join) != GenXIntrinsic::genx_simdcf_join) + Join = nullptr; + return Join; +} + +/*********************************************************************** + * isBranchingGotoJoinBlock : see if a basic block is a branching + * goto/join block, returning the goto/join if so + * + * This includes the case of a goto with an unconditional branch, as long as + * this is after GenXLateSimdCFConformance (or during GenX*SimdCFConformance + * after it has run moveCodeInGotoBlocks), because it relies on + * moveCodeInGotoBlocks having sunk the goto and its extracts to the end of the + * block. + */ +CallInst *GotoJoin::isBranchingGotoJoinBlock(BasicBlock *BB) +{ + auto Br = dyn_cast(BB->getTerminator()); + if (!Br) + return nullptr; + if (!Br->isConditional()) { + // Unconditional branch. Check for the block ending with a goto or an + // extract from a goto. + if (Br == &BB->front()) + return nullptr; + Value *LastInst = Br->getPrevNode(); + if (auto EV = dyn_cast(LastInst)) + LastInst = EV->getOperand(0); + if (GenXIntrinsic::getGenXIntrinsicID(LastInst) == GenXIntrinsic::genx_simdcf_goto) + return cast(LastInst); + return nullptr; + } + // Conditional branch. Check for the condition being an extractvalue from a + // goto/join. 
+ auto EV = dyn_cast(Br->getCondition()); + if (!EV) + return nullptr; + auto GotoJoin = dyn_cast(EV->getOperand(0)); + if (!GotoJoin || GotoJoin->getParent() != BB) + return nullptr; + switch (GenXIntrinsic::getGenXIntrinsicID(GotoJoin)) { + case GenXIntrinsic::genx_simdcf_goto: + case GenXIntrinsic::genx_simdcf_join: + return GotoJoin; + default: + break; + } + return nullptr; +} + +/*********************************************************************** + * getLegalInsertionPoint : ensure an insertion point is legal in the presence + * of SIMD CF + * + * This is used by a pass that inserts or moves code after + * GenXLateSimdCFConformance. + * + * A branching join label block is not allowed any other code. If the insertion + * point is in one of those, move up to its immediate dominator. + * + * A goto or branching join is not allowed code after the goto/join. If the + * insertion point is there, move to just before the goto/join. + */ +Instruction *GotoJoin::getLegalInsertionPoint(Instruction *InsertBefore, + DominatorTree *DomTree) +{ + auto *InsertPoint = InsertBefore; + auto *InsertBB = InsertBefore->getParent(); + while (isBranchingJoinLabelBlock(InsertBB)) { + auto Node = DomTree->getNode(InsertBB); + assert(Node); + auto IDom = Node->getIDom(); + assert(IDom); + InsertBB = IDom->getBlock(); + InsertPoint = InsertBB->getTerminator(); + } + if (auto GotoJoin = isBranchingGotoJoinBlock(InsertBB)) + InsertPoint = GotoJoin; + + if (InsertBB == InsertBefore->getParent()) { + // If this is the same BB check that our InsertPoint + // goes before than InsertBefore + auto *TermInst = InsertBB->getTerminator(); + Instruction *t = InsertPoint; + while (t != InsertBefore) { + if (t == TermInst) { + InsertPoint = InsertBefore; + break; + } + t = t->getNextNode(); + } + } + return InsertPoint; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.h new file mode 100644 index 000000000000..6fd8535376ce --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXGotoJoin.h @@ -0,0 +1,83 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#ifndef TARGET_GENXGOTOJOIN_H +#define TARGET_GENXGOTOJOIN_H + +namespace llvm { + +class BasicBlock; +class CallInst; +class DominatorTree; +class Instruction; +class Value; + +namespace genx { + +// GotoJoin : class containing goto/join related utility functions +class GotoJoin { +public: + + // isEMValue : detect whether a value is an EM (execution mask) + static bool isEMValue(Value *V); + + // findJoin : given a goto, find the join whose RM it modifies + static CallInst *findJoin(CallInst *Goto); + + // isValidJoin : check that the block containing a join is valid + static bool isValidJoin(CallInst *Join); + + // isBranchingJoinLabelBlock : check whether a block has a single join and + // is both a join label and a branching join + static bool isBranchingJoinLabelBlock(BasicBlock *BB); + + // getBranchingBlockForJoinLabel : if BB is "true" successor of branching + // block, return this branching block. If SkipCriticalEdgeSplitter is set, + // empty critical edge splitter blocks are skipped. + static BasicBlock *getBranchingBlockForBB(BasicBlock *BB, + bool SkipCriticalEdgeSplitter); + + // isJoinLabel : see if the block is a join label + static bool isJoinLabel(BasicBlock *BB, bool SkipCriticalEdgeSplitter = false); + + // isGotoBlock : see if a basic block is a goto block (hence branching), returning the goto if so + static CallInst *isGotoBlock(BasicBlock *BB); + + // isBranchingJoinBlock : see if a basic block is a branching join block + static CallInst *isBranchingJoinBlock(BasicBlock *BB); + + // isBranchingGotoJoinBlock : see if a basic block is a branching goto/join block + static CallInst *isBranchingGotoJoinBlock(BasicBlock *BB); + + // getLegalInsertionPoint : ensure an insertion point is legal in the presence of SIMD CF + static Instruction *getLegalInsertionPoint(Instruction *InsertBefore, DominatorTree *DomTree); + +}; + +} // End genx namespace +} // End llvm namespace + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXIMadPostLegalization.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXIMadPostLegalization.cpp new file mode 100644 index 000000000000..a12c3b5d1a21 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXIMadPostLegalization.cpp @@ -0,0 +1,390 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +/// GenXIMadLegalization +/// -------------------- +/// +/// This pass performs the legalization on integer mad to ensure additive +/// operand is alway single-used so that it could be mapped to accumulator +/// register. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_IMAD_POST_LEGALIZATION" + +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace genx; + +namespace { + +class GenXIMadPostLegalization : public FunctionPass { + DominatorTree *DT; + GenXBaling *Baling; +public: + static char ID; + + explicit GenXIMadPostLegalization() : + FunctionPass(ID), DT(nullptr), Baling(nullptr) {} + + StringRef getPassName() const override { + return "GenX IMAD post-legalization pass"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + } + + bool runOnFunction(Function &F) override; + +protected: + bool fixMadChain(BasicBlock *); +}; + +} // end anonymous namespace + +char GenXIMadPostLegalization::ID = 0; + +namespace llvm { +void initializeGenXIMadPostLegalizationPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(GenXIMadPostLegalization, "GenXIMadLegalization", "GenXIMadLegalization", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXFuncBaling) +INITIALIZE_PASS_END(GenXIMadPostLegalization, "GenXIMadLegalization", "GenXIMadLegalization", false, false) + +FunctionPass *llvm::createGenXIMadPostLegalizationPass() { + initializeGenXIMadPostLegalizationPass(*PassRegistry::getPassRegistry()); + return new GenXIMadPostLegalization(); +} + +static bool isIntegerMadIntrinsic(Value *V) { + switch (GenXIntrinsic::getGenXIntrinsicID(V)) { + default: break; + case GenXIntrinsic::genx_ssmad: + case GenXIntrinsic::genx_sumad: + case GenXIntrinsic::genx_usmad: + case GenXIntrinsic::genx_uumad: + case GenXIntrinsic::genx_ssmad_sat: + case GenXIntrinsic::genx_sumad_sat: + case GenXIntrinsic::genx_usmad_sat: + case GenXIntrinsic::genx_uumad_sat: + return true; + } + return false; +} + +static bool isIntegerMulIntrinsic(Value *V) { + switch (GenXIntrinsic::getGenXIntrinsicID(V)) { + default: break; + case GenXIntrinsic::genx_ssmul: + case GenXIntrinsic::genx_sumul: + case GenXIntrinsic::genx_usmul: + case GenXIntrinsic::genx_uumul: + return true; + } + return false; +} + +static std::tuple +findNearestInsertPt(DominatorTree *DT, ArrayRef Users) { + DenseMap BBs; + for (auto U : Users) { + auto UseBB = U->getParent(); + auto MI = BBs.end(); + bool New = false; + std::tie(MI, New) = BBs.insert(std::make_pair(UseBB, U)); + if (New) + continue; + // Find the earliest user if more than one users are in the same block. 
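+      // (If both the cached user and U live in UseBB, the scan below keeps
+      // whichever of the two appears first from the start of the block.)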
+ auto BI = UseBB->begin(); + for (; &*BI != U && &*BI != MI->second; ++BI) + /* EMPTY */; + MI->second = &*BI; + } + + assert(BBs.size() != 0 && "At least one BB should be found!"); + + auto MI = BBs.begin(); + if (BBs.size() == 1) + return std::make_tuple(MI->first, MI->second); + + auto BB = MI->first; + auto ME = BBs.end(); + for (++MI; MI != ME; ++MI) + BB = DT->findNearestCommonDominator(BB, MI->first); + + MI = BBs.find(BB); + if (MI != BBs.end()) + return std::make_tuple(MI->first, MI->second); + + return std::make_tuple(BB, nullptr); +} + +bool GenXIMadPostLegalization::runOnFunction(Function &F) { + DT = &getAnalysis().getDomTree(); + Baling = &getAnalysis(); + bool Changed = false; + + // After this point, we should not do constant folding. + Changed |= breakConstantExprs(&F); + + // The following alorithm runs very slowly on large blocks. + if (skipOptWithLargeBlock(F)) + return Changed; + + SmallVector Deads; + for (auto &BB : F) { + for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /* EMPTY */) { + Instruction *I = &*BI++; + if (!isIntegerMadIntrinsic(I)) + continue; + auto II = cast(I); + // Check src2 and duplicate if necessary. + Value *S2 = II->getOperand(2); + if (S2->hasOneUse()) { + // Sink S2 closer to user to shorten acc live ranges. + // This is particular important when 32 bit integer multiplications + // are not native and acc registers will be used to emulate them. + auto I2 = dyn_cast(S2); + if (I2 == nullptr || I2->getParent() != I->getParent()) + continue; + if (I2->mayHaveSideEffects() || isa(I2) || + I2->getNextNode() == I) + continue; + I2->moveBefore(I); + Changed = true; + continue; + } + // Only duplicate on selective instructions. + if (!GenXIntrinsic::isRdRegion(S2) && !isIntegerMulIntrinsic(S2)) + continue; + Instruction *RII = cast(S2); + SmallVector Others; + for (auto UI = S2->use_begin(), + UE = S2->use_end(); UI != UE; /* EMPTY */) { + Use &U = *UI++; + auto InsertPt = cast(U.getUser()); + if (!isIntegerMadIntrinsic(InsertPt) || U.getOperandNo() != 2) { + Others.push_back(InsertPt); + continue; + } + auto NewInst = RII->clone(); + NewInst->setName(RII->getName() + ".postimad"); + NewInst->insertBefore(InsertPt); + U.set(NewInst); + } + if (!Others.empty()) { + // Find a new place for RII. + BasicBlock *NBB = nullptr; + Instruction *Pt = nullptr; + std::tie(NBB, Pt) = findNearestInsertPt(DT, Others); + Pt = Pt ? Pt : NBB->getTerminator(); + RII->moveBefore(Pt); + } else + Deads.push_back(RII); + Changed = true; + } + } + for (auto I : Deads) + I->eraseFromParent(); + + for (auto &BB : F) + Changed |= fixMadChain(&BB); + + + return Changed; +} + +bool GenXIMadPostLegalization::fixMadChain(BasicBlock *BB) { + + // Given the bale 'B', collect all its operand instructions in the same basic + // block. + auto collectUnbaledOpndInsts = [](BasicBlock *BB, Bale &B) { + std::vector Opnds; + Instruction *In = nullptr; + // Collect operand instructions not baled yet. + for (auto I = B.begin(), E = B.end(); I != E; ++I) { + bool isFMA = GenXIntrinsic::getAnyIntrinsicID(I->Inst) == Intrinsic::fma; + for (unsigned i = 0, e = I->Inst->getNumOperands(); i != e; ++i) { + // Skip if that operand is baled. + if (I->Info.isOperandBaled(i)) + continue; + auto Op = dyn_cast(I->Inst->getOperand(i)); + // Skip if it's not an instruction or from the same BB. + if (Op && Op->getParent() == BB) { + Opnds.push_back(Op); + if (isFMA && i == 2) + In = Op; + } + } + // Bail out once 'maininst' is processed. 
The 'maininst' is usually baled + // in 'wrregion', 'sat' and similar stuffs, which usually doesn't require + // additional operands. + if (I->Info.Type == BaleInfo::MAININST) + break; + } + return std::make_pair(In, Opnds); + }; + + // Given two instructions, 'A' and 'B', in the same basic block, check + // whether 'A' dominates 'B'. + auto dominates = [](const Instruction *A, const Instruction *B) { + const BasicBlock *BB = A->getParent(); + assert(BB == B->getParent()); + + BasicBlock::const_iterator BI = BB->begin(); + for (; &*BI != A && &*BI != B; ++BI) + /*EMPTY*/; + + return &*BI == A; + }; + + bool Changed = false; + std::set FMAs; // 'fma' already handled. + for (auto BI = BB->rbegin(), BE = BB->rend(); BI != BE; ++BI) { + auto Inst = &*BI; + Bale OutB; + Baling->buildBale(Inst, &OutB); + // Skip bale non-FMA bale. + if (!OutB.getMainInst()) + continue; + auto CandidateInsn = OutB.getMainInst()->Inst; + assert(CandidateInsn); + if (GenXIntrinsic::getAnyIntrinsicID(CandidateInsn) != Intrinsic::fma) + continue; + // Skip if it's already handled. + if (FMAs.count(CandidateInsn)) + continue; + + // Collection of all inputs for the chain curently discovered. + std::set Inputs; + // The mad chain itself. + std::vector Chain; + Chain.push_back(OutB); + FMAs.insert(CandidateInsn); + do { + auto &OutB = Chain.back(); + Instruction *In = nullptr; + std::vector Opnds; + // Collect all operands so that we could grow the chain through the + // chain-in. + std::tie(In, Opnds) = collectUnbaledOpndInsts(BB, OutB); + if (!In || !In->hasOneUse()) + break; + // Check whether all inputs collected so far dominates 'In' so that we + // won't add extra register pressure. + for (auto &I : Inputs) { + if (dominates(I, In)) + continue; + In = nullptr; + break; + } + // Skip chain building if there are inputs won't be dominated by the new + // chain-in. + if (!In) + break; + // Check inputs from the tip of chain, i.e. the current chain-out. + for (auto &OpI : Opnds) { + // Skip the chain-in. + if (OpI == In) + continue; + // Skip if that input dominates the chain-in but record it as inputs. + // + // FIXME: revisit the following check. This stops sinking non-mad bales + // which may increase register pressure and inserts non-mad instructions + // among mads. + if (true || !OpI->hasOneUse() || dominates(OpI, In)) { + Inputs.insert(OpI); + continue; + } + // TODO: So far, only traverse one step further from that chain-out + // operands. + Bale OpB; + Baling->buildBale(OpI, &OpB); + std::vector SubOpnds; + std::tie(std::ignore, SubOpnds) = collectUnbaledOpndInsts(BB, OpB); + for (auto &SubI : SubOpnds) { + if (dominates(SubI, In)) { + Inputs.insert(SubI); + continue; + } + // Stop chaining as 'SubI' intervenes between 'In' and 'Out'. + In = nullptr; + break; + } + if (!In) + break; + Chain.push_back(OpB); + } + if (!In) + break; + // Grow the chain by appending this chain-in. + Bale InB; + Baling->buildBale(In, &InB); + Chain.push_back(InB); + // Stop chaining if it's not mad any more. + if (!InB.getMainInst()) + break; + auto CandidateInst = InB.getMainInst()->Inst; + assert(CandidateInst); + if (GenXIntrinsic::getAnyIntrinsicID(CandidateInst) != Intrinsic::fma) + break; + FMAs.insert(CandidateInst); + } while (1); + // Cluster the discovered chain together. 
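+    // The chain was collected from the chain-out backwards, so moving each
+    // bale's instructions in front of the previously placed ones packs the
+    // FMAs in the chain next to each other (keeping accumulator candidates
+    // adjacent); phis are left in place.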
+ if (FMAs.size() > 1) { + Instruction *Pos = nullptr; + for (auto I = Chain.begin(), E = Chain.end(); I != E; ++I) { + for (auto II = I->rbegin(), IE = I->rend(); II != IE; ++II) { + if (!Pos) { + Pos = II->Inst; + continue; + } + // Skip phi which is not movable. + if (isa(II->Inst)) + break; + II->Inst->moveBefore(Pos); + Pos = II->Inst; + Changed = true; + } + } + } + } + return Changed; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXInlineAsmLowering.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXInlineAsmLowering.cpp new file mode 100644 index 000000000000..22cd57501ee5 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXInlineAsmLowering.cpp @@ -0,0 +1,345 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXInlineAsmLowering +/// ------------ +/// This pass recreates VISA inline assembly with new types +/// if 'cr' constraint is used. Also pass inserts constraints +/// information as metadata in order not to parse constraints +/// string every time in each pass where this information is needed. 
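+/// Each parsed constraint is recorded as one metadata entry holding the
+/// constraint type, the matching-input index and an is-output flag (see
+/// createInlineAsmMetadata below). The resulting tuple is attached to the
+/// call under the genx::MD_genx_inline_asm_info kind; an illustrative
+/// rendering of one entry (not taken from real compiler output) is:
+///
+///   !{i32 <constraint type>, i32 <matching input>, i1 <is output>}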
+/// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXGotoJoin.h" +#include "GenXIntrinsics.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "GenXVisa.h" +#include "visa_igc_common_header.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; +using namespace genx; + +namespace { + +class GenXInlineAsmLowering : public FunctionPass { + using ConstraintInfoVector = InlineAsm::ConstraintInfoVector; + using ConstraintInfo = InlineAsm::ConstraintInfo; + using GenXConstraintInfoVector = std::vector; + +private: + LLVMContext *Context = nullptr; + SmallVector ToErase; + + MDNode *createInlineAsmMetadata( + CallInst *CI, + const InlineAsm::ConstraintInfoVector &ConstraintsInfo) const; + + Type *rewriteTypeForConstraintIfNeeded( + Type *Ty, const GenXInlineAsmInfo &ConstraintInfo) const; + Type *rewriteTypeForCR(Type *CRType) const; + + FunctionType *rewriteFunctionTypeForInlineAsmIfNeeded( + CallInst *CI, const GenXConstraintInfoVector &ConstraintsInfo) const; + + void replaceInlineAsmUses(CallInst *Of, CallInst *With, + const GenXConstraintInfoVector &ConstraintsInfo); + + CallInst * + recreateInlineAsmWithCR(CallInst *CI, + const GenXConstraintInfoVector &ConstraintsInfo); + +public: + static char ID; + explicit GenXInlineAsmLowering() : FunctionPass(ID) {} + StringRef getPassName() const override { + return "GenX VISA inline asm lowering"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; +}; + +} // end namespace + +char GenXInlineAsmLowering::ID = 0; +namespace llvm { +void initializeGenXInlineAsmLoweringPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXInlineAsmLowering, "GenXInlineAsmLowering", + "GenXInlineAsmLowering", false, false) +INITIALIZE_PASS_END(GenXInlineAsmLowering, "GenXInlineAsmLowering", + "GenXInlineAsmLowering", false, false) + +FunctionPass *llvm::createGenXInlineAsmLoweringPass() { + initializeGenXInlineAsmLoweringPass(*PassRegistry::getPassRegistry()); + return new GenXInlineAsmLowering; +} + +void GenXInlineAsmLowering::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); +} + +bool GenXInlineAsmLowering::runOnFunction(Function &F) { + auto GM = getAnalysisIfAvailable(); + if (GM && !GM->HasInlineAsm()) + return false; + + // Collect inline asm worklist + auto InlineAsmsToProcess = + make_filter_range(instructions(&F), [&](Instruction &I) { + auto *CI = dyn_cast(&I); + // No need to process inline asm with empty constraint string + return CI && CI->isInlineAsm() && + !cast(CI->getCalledValue()) + ->getConstraintString() + .empty(); + }); + + SmallVector InlineAsms; + llvm::transform(InlineAsmsToProcess, std::back_inserter(InlineAsms), + [&](Instruction &I) { return cast(&I); }); + + if (InlineAsms.empty()) + return false; + + Context = &InlineAsms[0]->getContext(); + for (auto *CI : InlineAsms) { + auto *IA = 
cast(CI->getCalledValue()); + InlineAsm::ConstraintInfoVector ConstraintsInfo = IA->ParseConstraints(); + MDNode *ConstraintsMD = createInlineAsmMetadata(CI, ConstraintsInfo); + GenXConstraintInfoVector GenXConstraintsInfo = + genx::getGenXInlineAsmInfo(ConstraintsMD); + + // No need to recreate asm expression if + // there is no 'cr' constraint. Set created metadata and return. + if (!genx::hasConstraintOfType(GenXConstraintsInfo, + ConstraintType::Constraint_cr)) { + CI->setMetadata(genx::MD_genx_inline_asm_info, ConstraintsMD); + continue; + } + + // Create new inline asm and don't forget to set + // earlier created metadata. + CallInst *NewCI = recreateInlineAsmWithCR(CI, GenXConstraintsInfo); + NewCI->setMetadata(genx::MD_genx_inline_asm_info, ConstraintsMD); + } + + for (auto *I : ToErase) + I->eraseFromParent(); + ToErase.clear(); + + return true; +} + +// Creating metadata for inline asm constraints +// in order not to parse constraints string every time in each pass +// where this information is needed. +MDNode *GenXInlineAsmLowering::createInlineAsmMetadata( + CallInst *CI, const ConstraintInfoVector &ConstraintsInfo) const { + assert(!ConstraintsInfo.empty() && "Non empty constraints expected"); + assert(CI->isInlineAsm() && "Inline asm expected"); + + Type *Int1Ty = Type::getInt1Ty(*Context); + Type *Int32Ty = Type::getInt32Ty(*Context); + std::vector Entries; + for (auto &&Info : ConstraintsInfo) { + std::string Codes; + if (genx::isInlineAsmMatchingInputConstraint(Info)) + Codes = genx::getInlineAsmCodes( + ConstraintsInfo[genx::getInlineAsmMatchedOperand(Info)]); + else + Codes = genx::getInlineAsmCodes(Info); + + genx::ConstraintType CTy = genx::getInlineAsmConstraintType(Codes); + if (CTy == ConstraintType::Constraint_unknown) + Context->emitError(CI, "Unsupported constraint '" + Codes + + "' in inline assembly"); + + Metadata *EntryMD[3] = { + ConstantAsMetadata::get( + ConstantInt::get(Int32Ty, static_cast(CTy))), + ConstantAsMetadata::get(ConstantInt::get(Int32Ty, Info.MatchingInput)), + ConstantAsMetadata::get(ConstantInt::get( + Int1Ty, (Info.Type == InlineAsm::ConstraintPrefix::isOutput)))}; + Entries.push_back(MDNode::get(*Context, EntryMD)); + } + return MDTuple::get(*Context, Entries); +} + +Type *GenXInlineAsmLowering::rewriteTypeForConstraintIfNeeded( + Type *Ty, const GenXInlineAsmInfo &ConstraintInfo) const { + switch (ConstraintInfo.getConstraintType()) { + default: + return Ty; + case ConstraintType::Constraint_cr: + return rewriteTypeForCR(Ty); + } +} + +Type *GenXInlineAsmLowering::rewriteTypeForCR(Type *CRType) const { + assert(CRType->isIntOrIntVectorTy() && + "Expected integer inputs for 'cr' constraint"); + Type *Int1Ty = Type::getInt1Ty(*Context); + return CRType->isVectorTy() + ? 
VectorType::get(Int1Ty, CRType->getVectorNumElements()) + : Int1Ty; +} + +// If there exist 'cr' for now output a new result type must be constructed +FunctionType *GenXInlineAsmLowering::rewriteFunctionTypeForInlineAsmIfNeeded( + CallInst *CI, const GenXConstraintInfoVector &ConstraintsInfo) const { + // Rewriting return type + unsigned NumOutputs = genx::getInlineAsmNumOutputs(CI); + std::vector NewResultsTypes; + if (NumOutputs == 1) { + NewResultsTypes.push_back( + rewriteTypeForConstraintIfNeeded(CI->getType(), ConstraintsInfo[0])); + } else if (NumOutputs > 1) { + auto *ST = cast(CI->getType()); + std::transform(ST->element_begin(), ST->element_end(), + ConstraintsInfo.begin(), std::back_inserter(NewResultsTypes), + [&](Type *Ty, const GenXInlineAsmInfo &Info) { + return rewriteTypeForConstraintIfNeeded(Ty, Info); + }); + } + + // New return type: struct for multiple outputs, + // void for no outputs, and one exact type for single output + Type *NewRetType; + if (NewResultsTypes.empty()) + NewRetType = Type::getVoidTy(*Context); + else if (NewResultsTypes.size() == 1) + NewRetType = NewResultsTypes[0]; + else + NewRetType = StructType::get(*Context, NewResultsTypes); + + // Rewritng params types + std::vector NewParamsTypes; + std::transform(CI->arg_begin(), CI->arg_end(), + ConstraintsInfo.begin() + NumOutputs, + std::back_inserter(NewParamsTypes), + [&](Value *V, const GenXInlineAsmInfo &Info) { + return rewriteTypeForConstraintIfNeeded(V->getType(), Info); + }); + return FunctionType::get(NewRetType, NewParamsTypes, false); +} + +// If result type differs than iterate over all +// users of original call and replace it's +// uses with new outputs. Thus new extractelements and +// zero exstensions might be created. Existing extracts should be eliminated. +void GenXInlineAsmLowering::replaceInlineAsmUses( + CallInst *Of, CallInst *With, + const GenXConstraintInfoVector &ConstraintsInfo) { + if (Of->getType() == With->getType()) { + Of->replaceAllUsesWith(With); + return; + } + IRBuilder<> Builder(*Context); + Builder.SetInsertPoint(With->getNextNode()); + unsigned NumOutputs = genx::getInlineAsmNumOutputs(Of); + if (NumOutputs == 1) { + Value *NewResZExt = Builder.CreateZExt(With, Of->getType(), ".asm.zext.cr"); + Of->replaceAllUsesWith(NewResZExt); + return; + } + + // Create new extractvalues and replace all uses + for (auto *U : Of->users()) { + Value *ToZext = With; + auto *EV = cast(U); + ToErase.push_back(EV); + unsigned OutputConstraintIdx = EV->getIndices()[0]; + ToZext = + Builder.CreateExtractValue(ToZext, OutputConstraintIdx, "asmresult.cr"); + // Zero extension needed only for 'cr' output + if (ConstraintsInfo[OutputConstraintIdx].getConstraintType() == + genx::ConstraintType::Constraint_cr) + ToZext = Builder.CreateZExt(ToZext, U->getType(), ".asmresult.zext.cr"); + U->replaceAllUsesWith(ToZext); + } +} + +// If inline assembly uses 'cr' constraints (for now) +// all types should be converted to i1. So inserting +// truncations for inputs and zero extensions for outputs. 
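+//
+// A minimal sketch of the intended rewrite for a single 'cr' input and
+// output (the asm string is hypothetical; the value-name suffixes match the
+// ones used in the code below):
+//
+//   %r = call i32 asm "...", "=cr,cr"(i32 %mask)
+//
+// becomes
+//
+//   %mask.trunc.cr = trunc i32 %mask to i1
+//   %r.asm.cr = call i1 asm "...", "=cr,cr"(i1 %mask.trunc.cr)
+//   %r.asm.zext.cr = zext i1 %r.asm.cr to i32
+//
+// and all uses of the original call are rewired to the zero-extended value.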
+CallInst *GenXInlineAsmLowering::recreateInlineAsmWithCR(
+    CallInst *CI, const GenXConstraintInfoVector &ConstraintsInfo) {
+  assert(!ConstraintsInfo.empty() && "Non-empty constraints expected");
+  assert(CI->isInlineAsm() && "Inline asm expected");
+
+  // If a 'cr' output exists, a new result type must be constructed.
+  FunctionType *NewFTy =
+      rewriteFunctionTypeForInlineAsmIfNeeded(CI, ConstraintsInfo);
+
+  // New types for 'cr':
+  // any_int -> i1
+  // <N x any_int> -> <N x i1>
+  //
+  // Create truncation for input args if needed.
+  IRBuilder<> Builder(CI);
+  std::vector<Value *> NewArgs;
+  std::transform(CI->arg_begin(), CI->arg_end(), NewFTy->param_begin(),
+                 std::back_inserter(NewArgs), [&](Value *Arg, Type *NewArgTy) {
+                   if (Arg->getType() != NewArgTy)
+                     Arg = Builder.CreateTrunc(Arg, NewArgTy, ".trunc.cr");
+                   return Arg;
+                 });
+
+  // Create exactly the same inline assembly but with the new function type.
+  auto *IA = cast<InlineAsm>(CI->getCalledValue());
+  InlineAsm *NewIA = InlineAsm::get(
+      NewFTy, IA->getAsmString(), IA->getConstraintString(),
+      IA->hasSideEffects(), IA->isAlignStack(), IA->getDialect());
+  CallInst *NewCI = Builder.CreateCall(NewIA, NewArgs, ".asm.cr");
+  NewCI->setAttributes(CI->getAttributes());
+  NewCI->setDebugLoc(CI->getDebugLoc());
+
+  replaceInlineAsmUses(CI, NewCI, ConstraintsInfo);
+  ToErase.push_back(CI);
+
+  return NewCI;
+}
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXInstCombineCleanup.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXInstCombineCleanup.cpp
new file mode 100644
index 000000000000..a3d179cdaba3
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXInstCombineCleanup.cpp
@@ -0,0 +1,141 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+/// GenXInstCombineCleanupPass
+/// --------------------------
+///
+/// For switch instructions, llvm 7.0 instcombine aggressively shrinks the type
+/// of the condition variable. This can introduce types which are unsupported
+/// in GenX IR (like i2, i27, etc.).
+/// The pass tries to detect such switch instructions and modify them to use
+/// the original condition instead of a truncated one.
+/// The idea is to do it using standard llvm passes, so we just try to do the
+/// opposite of the instcombine change and expect irbuilder folding or other
+/// passes to restore the code as it was before.
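+///
+/// For example (illustrative only, following the i27 case mentioned above),
+/// instcombine may have shrunk a switch to
+///
+///   %c = trunc i32 %x to i27
+///   switch i27 %c, label %def [ i27 1, label %bb1 ]
+///
+/// and this pass widens it back to a legal integer type, sign-extending the
+/// condition and the case values:
+///
+///   %c.wide = sext i27 %c to i32
+///   switch i32 %c.wide, label %def [ i32 1, label %bb1 ]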
+ +#define DEBUG_TYPE "GENX_INSTCOMBCLEANUP" + +#include "GenX.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +namespace { + +class GenXInstCombineCleanup : public FunctionPass { +public: + static char ID; + + explicit GenXInstCombineCleanup() : FunctionPass(ID) { } + + StringRef getPassName() const override { return "GenX InstCombineCleanup"; } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunction(Function &F) override; +}; + +} // end anonymous namespace + +char GenXInstCombineCleanup::ID = 0; +namespace llvm { void initializeGenXInstCombineCleanupPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXInstCombineCleanup, "GenXInstCombineCleanup", "GenXInstCombineCleanup", false, false) +INITIALIZE_PASS_END(GenXInstCombineCleanup, "GenXInstCombineCleanup", "GenXInstCombineCleanup", false, false) + +FunctionPass *llvm::createGenXInstCombineCleanup() +{ + initializeGenXInstCombineCleanupPass(*PassRegistry::getPassRegistry()); + return new GenXInstCombineCleanup(); +} + +void GenXInstCombineCleanup::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.setPreservesCFG(); +} + +bool typeMustBeChanged(Type *Ty) { + assert(Ty); + if (!Ty->isIntegerTy()) + return false; + unsigned Size = Ty->getPrimitiveSizeInBits(); + // Possible sizes are 1, 8, 16, 32, ... (2 and 4 must be excluded) + if (isPowerOf2_32(Size) && !(genx::BoolBits < Size && Size < genx::ByteBits)) + return false; + return true; +} + +bool GenXInstCombineCleanup::runOnFunction(Function &F) +{ + bool Modified = false; + +#if (LLVM_VERSION_MAJOR <= 7) + LLVM_DEBUG(dbgs() << "running GenXInstCombineCleanup on " << F.getName() << "\n"); + + LLVMContext &Ctx = F.getContext(); + IRBuilder<> Builder(Ctx); + + for (auto I = inst_begin(F), E = inst_end(F); I != E; ++I) { + auto Switch = dyn_cast(&*I); + if (!Switch) + continue; + + auto Cond = Switch->getCondition(); + Type *CondTy = Cond->getType(); + if (!typeMustBeChanged(CondTy)) + continue; + + unsigned CondSize = CondTy->getPrimitiveSizeInBits(); + assert(CondSize != genx::BoolBits && + "CondSize == 1 is not expected here. See typeMustBeChanged"); + // Round up to the next power of 2 skipping i2 and i4 (i3 -> i8, i2 -> i8, + // etc) + unsigned Size = + CondSize < genx::ByteBits ? 
genx::ByteBits : NextPowerOf2(CondSize); + + Type *NewTy = Type::getIntNTy(Ctx, Size); + + Builder.SetInsertPoint(Switch); + Value *NewCond = + Builder.CreateSExt(Cond, NewTy, Switch->getName() + ".condSExt"); + Switch->setCondition(NewCond); + + for (auto Case : Switch->cases()) { + APInt UpdatedCase = Case.getCaseValue()->getValue().sext(Size); + Case.setValue(ConstantInt::get(Ctx, UpdatedCase)); + } + + Modified = true; + } +#endif + + return Modified; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.cpp new file mode 100644 index 000000000000..df8c4e93244c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.cpp @@ -0,0 +1,201 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file contains a table of extra information about the llvm.genx.* +// intrinsics, used by the vISA register allocator and function writer to +// decide exactly what operand type to use. The more usual approach in an LLVM +// target is to have an intrinsic map to an instruction in instruction +// selection, then have register category information on the instruction. But +// we are not using the target independent code generator, we are generating +// code directly from LLVM IR. +// +//===----------------------------------------------------------------------===// +#include "GenXIntrinsics.h" +#include "IsaDescription.h" +#include "visa_igc_common_header.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; + +// In this table: +// +// Each ALU and shared function intrinsic has a record giving information +// about its operands, and how it is written as a vISA instruction. The +// record has an initial field giving the intrinsic ID, then a number of +// fields where each corresponds to a field in the vISA instruction. +// +// A field may be several values combined with the | operator. The first +// value is the operand category (GENERAL etc), or one of a set of +// non-register operand categories (LITERAL, BYTE), or END to terminate +// the record. Other modifier values may be combined, such as SIGNED. +// The LLVM IR argument index plus 1 is also combined in, or 0 for the +// return value. 
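+//
+// Consumers look an intrinsic up by ID and then query the per-field
+// information through GenXIntrinsicInfo; roughly (a sketch only, not code
+// from this file):
+//
+//   GenXIntrinsicInfo II(GenXIntrinsic::getAnyIntrinsicID(CalledF));
+//   if (II.isNotNull()) {
+//     auto RetInfo = II.getRetInfo();   // field describing the result
+//     auto Arg0 = II.getArgInfo(0);     // field describing IR argument 0
+//     bool NeedsSigned = Arg0.needsSigned();
+//   }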
+ +// Video Analytics intrinsic helper macros, mainly to avoid large blocks +// of near-identical code in the intrinsics look-up table and also to +// aid readability. + +const GenXIntrinsicInfo::DescrElementType GenXIntrinsicInfo::Table[] = { + +// Region access intrinsics do not appear in this table + +#include "GenXIntrinsicInfoTable.inc" + + END}; + +GenXIntrinsicInfo::GenXIntrinsicInfo(unsigned IntrinId) : Args(0) { + const auto *p = Table; + for (;;) { + if (*p == END) + break; // intrinsic not found; leave Args pointing at END field + if (IntrinId == *p++) + break; + // Scan past the rest of this entry. + while (*p++ != END) + ; + } + // We have found the right entry. + Args = p; +} + +// Get the category and modifier for an arg idx (-1 means return value). +// The returned ArgInfo struct contains just the short read from the table, +// and has methods for accessing the various fields. +GenXIntrinsicInfo::ArgInfo GenXIntrinsicInfo::getArgInfo(int Idx) { + // Read through the fields in the table to find the one with the right + // arg index... + for (const auto *p = Args; *p; p++) { + ArgInfo AI(*p); + if (AI.isRealArgOrRet() && AI.getArgIdx() == Idx) + return AI; + } + // Field with requested arg index was not found. + return END; +} + +// Return the starting point of any trailing null (zero) arguments +// for this call. If the intrinsic does not have a ARGCOUNT descriptor +// this will always return the number of operands to the call (ie, there +// is no trailing null zone), even if there are some trailing nulls. +unsigned GenXIntrinsicInfo::getTrailingNullZoneStart(CallInst *CI) { + unsigned TrailingNullStart = CI->getNumArgOperands(); + + const auto *p = Args; + for (; *p; p++) { + ArgInfo AI(*p); + if (AI.getCategory() == ARGCOUNT) + break; + } + + if (*p) { + ArgInfo ACI(*p); + unsigned BaseArg = ACI.getArgIdx(); + + TrailingNullStart = BaseArg; + for (unsigned Idx = BaseArg; Idx < CI->getNumArgOperands(); ++Idx) { + if (auto CA = dyn_cast(CI->getArgOperand(Idx))) { + if (CA->isNullValue()) + continue; + } + TrailingNullStart = Idx + 1; + } + + if (TrailingNullStart < BaseArg + ACI.getArgCountMin()) + TrailingNullStart = BaseArg + ACI.getArgCountMin(); + } + + return TrailingNullStart; +} + +/*********************************************************************** + * getExecSizeAllowedBits : get bitmap of which execsize values are allowed + * for this intrinsic + * + * Return: bit N set if execution size 1<getCalledFunction(); + assert(CalledF); + auto ID = GenXIntrinsic::getGenXIntrinsicID(CalledF); + + switch (ID) { + default: + break; + // Exec size of intrinsics with channels are inferred from address operand. 
+ case GenXIntrinsic::genx_gather4_scaled2: + return CI->getArgOperand(4)->getType()->getVectorNumElements(); + case GenXIntrinsic::genx_raw_send: + case GenXIntrinsic::genx_raw_sends: + case GenXIntrinsic::genx_raw_send_noresult: + case GenXIntrinsic::genx_raw_sends_noresult: + return 16; + } + + return 0; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.h new file mode 100644 index 000000000000..9162c57e6126 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXIntrinsics.h @@ -0,0 +1,324 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file declares a class to access a table of extra information about the +// llvm.genx.* intrinsics, used by the vISA register allocator and function +// writer to decide exactly what operand type to use. The more usual approach +// in an LLVM target is to have an intrinsic map to an instruction in +// instruction selection, then have register category information on the +// instruction. But we are not using the target independent code generator, we +// are generating code directly from LLVM IR. +// +//===----------------------------------------------------------------------===// +#ifndef GENXINTRINSICS_H +#define GENXINTRINSICS_H +#include "GenXVisa.h" + +#define GENX_ITR_CATVAL(v) ((v) << CATBASE) +#define GENX_ITR_FLAGENUM(o, v) ((v) << ((o) + FLAGBASE)) +#define GENX_ITR_FLAGMASK(o, w) (((1 << (w)) - 1) << ((o) + FLAGBASE)) +#define GENX_ITR_FLAGVAL(o) GENX_ITR_FLAGENUM(o, 1) + +namespace llvm { + class CallInst; + +class GenXIntrinsicInfo { +public: + typedef uint32_t DescrElementType; +private: + const DescrElementType *Args; + static const DescrElementType Table[]; +public: + enum { + // General format of intrinsic descriptor words: + // Bits 31..24: Category enumeration + // Bits 23..8: Flags, if any, meaning and layout depends on category + // Bits 7..0: Operand or literal, if any + // + // One exception to the above is LITERAL, where everything that isn't + // the category field is assumed to be the literal value. 
+ // + // If you want to re-apportion space in the descriptor word (typically + // because you need another flag and you can't express what you need to + // do without creating one) then just modify FLAGBASE and FLAGWIDTH + // below, and everything else will shake itself out appropriately. + // Currently 8 bits are allocated for the category enumaration bitfield, + // although the actual enumeration values defined only require 6 bits - + // and there is still plenty of space left over even within that. + // Similarly, there are 8 bits allocated to the operand bitfield, and + // currently the maximum needed is 5. + // + // At the moment, the GENERAL category has 5 unused flag bits available + // to it, the RAW category has 13 unused bits, and the ARGCOUNT category + // has 13 unused bits. No other categories make use of the flags yet, + // so it should be a good while yet before it's necessary to resize + // the bitfields. + + FLAGBASE = 8, + FLAGWIDTH = 16, + CATBASE = FLAGBASE + FLAGWIDTH, + + CATMASK = ~((1 << CATBASE) - 1), + FLAGMASK = ((~((1 << FLAGBASE) - 1)) ^ CATMASK), + OPNDMASK = ~(CATMASK | FLAGMASK), + + // A field that does not contain an operand number or literal value: + END = 0, // end of instruction description + IMPLICITPRED = GENX_ITR_CATVAL(0x01), // implicit predication field + NULLRAW = GENX_ITR_CATVAL(0x02), // null raw operand + ISBARRIER = GENX_ITR_CATVAL(0x03), // intrinsic is barrier: suppress nobarrier attribute + + EXECSIZE = GENX_ITR_CATVAL(0x04), // execution size + EXECSIZE_GE2 = GENX_ITR_CATVAL(0x05), // execution size (must be >= 2) + EXECSIZE_GE4 = GENX_ITR_CATVAL(0x06), // execution size (must be >= 4) + EXECSIZE_GE8 = GENX_ITR_CATVAL(0x07), // execution size (must be >= 8) + EXECSIZE_NOT2 = GENX_ITR_CATVAL(0x08), // execution size (cannot be 2) + + // A field that contains a literal value the operand field + LITERAL = GENX_ITR_CATVAL(0x09), // literal byte (usually opcode) + LITMASK = ~CATMASK, + + // A field that contains an operand number, other than general: + FIRST_OPERAND = GENX_ITR_CATVAL(0x10), + LOG2OWORDS = GENX_ITR_CATVAL(0x10), // log2 number of owords + NUMGRFS = GENX_ITR_CATVAL(0x11), // rounded up number of GRFs + EXECSIZE_FROM_ARG = GENX_ITR_CATVAL(0x12), // exec_size field inferred from width of + // predication arg + SVMGATHERBLOCKSIZE = GENX_ITR_CATVAL(0x13), // svm gather block size, inferred from data type + LOG2OWORDS_PLUS_8 = GENX_ITR_CATVAL(0x14), // log2 number of owords, plus 8 + GATHERNUMELTS = GENX_ITR_CATVAL(0x15), // gather/scatter "num elements" field + TRANSPOSEHEIGHT = GENX_ITR_CATVAL(0x16), // block_height field in transpose + LOG2ELTSIZE = GENX_ITR_CATVAL(0x17), // log2 element size in gather/scatter + ARGCOUNT = GENX_ITR_CATVAL(0x18), // Byte containing number of non-undef operands + EXECSIZE_FROM_BYTE = GENX_ITR_CATVAL(0x19), // exec_size specified in byte + ARGCOUNTMASK = GENX_ITR_FLAGMASK(0, 3), // Space for minumum argument count + ARGCOUNTMIN1 = GENX_ITR_FLAGENUM(0, 1), // Must have at least one argument + + // A field that contains an operand number, other than general, and it + // is the "real" use of the operand, rather than an auxiliary use + // such as a "number of GRFs" field relating to this operand. 
+ FIRST_REAL_OPERAND = GENX_ITR_CATVAL(0x20), + BYTE = GENX_ITR_CATVAL(0x20), // constant byte operand + SHORT = GENX_ITR_CATVAL(0x21), // constant short operand + INT = GENX_ITR_CATVAL(0x22), // constant int operand + ADDRESS = GENX_ITR_CATVAL(0x23), // address operand + PREDICATE = GENX_ITR_CATVAL(0x24), // predicate operand + PREDICATE_ZEROED = GENX_ITR_FLAGVAL(0), + Z_PREDICATE = PREDICATE | PREDICATE_ZEROED, + SAMPLER = GENX_ITR_CATVAL(0x25), // sampler operand + SURFACE = GENX_ITR_CATVAL(0x26), // surface operand + VME = GENX_ITR_CATVAL(0x27), // vme operand + // byte height of media 2D block, inferred from the width operand + // pointed at and the size of the return type or final operand type + MEDIAHEIGHT = GENX_ITR_CATVAL(0x28), + // predication control field from explicit predicate arg + PREDICATION = GENX_ITR_CATVAL(0x29), + // chmask field in load/sample, with exec size bit + SAMPLECHMASK = GENX_ITR_CATVAL(0x2a), + // does not appear in the vISA output, but needs to be two address + // coalesced with result + TWOADDR = GENX_ITR_CATVAL(0x2b), + CONSTVI1ASI32 = GENX_ITR_CATVAL(0x2c), // constant vXi1 written as i32 (used in setp) + RAW = GENX_ITR_CATVAL(0x2d), // raw operand or result, + // Raw descriptor flags, 3 bits used + RAW_UNSIGNED = GENX_ITR_FLAGVAL(0), // raw operand/result must be unsigned + RAW_SIGNED = GENX_ITR_FLAGVAL(1), // raw operand/result must be signed + RAW_NULLALLOWED = GENX_ITR_FLAGVAL(2), // raw operand or result can be null (V0) + URAW = RAW | RAW_UNSIGNED, + SRAW = RAW | RAW_SIGNED, + EXECSIZE_NOMASK = GENX_ITR_CATVAL(0x2e), // execution size with NoMask + + // A general operand + GENERAL = GENX_ITR_CATVAL(0x30), + // Modifiers for destination or source, 7 bits used + UNSIGNED = GENX_ITR_FLAGVAL(0), // int type forced to unsigned + SIGNED = GENX_ITR_FLAGVAL(1), // int type forced to signed + OWALIGNED = GENX_ITR_FLAGVAL(2), // must be oword aligned + GRFALIGNED = GENX_ITR_FLAGVAL(3), // must be grf aligned + RESTRICTION = GENX_ITR_FLAGMASK(4, 3), // field with operand width restriction + FIXED4 = GENX_ITR_FLAGENUM(4, 1), // operand is fixed size 4 vector and contiguous + CONTIGUOUS = GENX_ITR_FLAGENUM(4, 2), // operand must be contiguous + SCALARORCONTIGUOUS = GENX_ITR_FLAGENUM(4, 3), // operand must be stride 0 or contiguous + TWICEWIDTH = GENX_ITR_FLAGENUM(4, 4), // operand is twice the execution width + STRIDE1 = GENX_ITR_FLAGENUM(4, 5), // horizontal stride must be 1 + // Modifiers for destination only, 2 bits used + SATURATION = GENX_ITR_FLAGMASK(7, 2), + SATURATION_DEFAULT = GENX_ITR_FLAGENUM(7, 0), // saturation default: not saturated, fp is + // allowed to bale in to saturate inst + SATURATION_SATURATE = GENX_ITR_FLAGENUM(7, 1), // saturated + SATURATION_NOSAT = GENX_ITR_FLAGENUM(7, 2), // fp not allowed to bale in to saturate inst + SATURATION_INTALLOWED = GENX_ITR_FLAGENUM(7, 3), // int is allowed to bale in to saturate, + // because inst cannot overflow so + // saturation only required on destination + // truncation + // Modifiers for source only, 3 bits used + NOIMM = GENX_ITR_FLAGVAL(7), // source not allowed to be immediate + MODIFIER = GENX_ITR_FLAGMASK(8, 2), + MODIFIER_DEFAULT = GENX_ITR_FLAGENUM(8, 0), // src modifier default: none + MODIFIER_ARITH = GENX_ITR_FLAGENUM(8, 1), // src modifier: arithmetic + MODIFIER_LOGIC = GENX_ITR_FLAGENUM(8, 2), // src modifier: logic + MODIFIER_EXTONLY = GENX_ITR_FLAGENUM(8, 3), // src modifier: extend only + DIRECTONLY = GENX_ITR_FLAGVAL(10), // indirect region not allowed + }; + struct ArgInfo { + 
unsigned Info; + // Default constructor, used in GenXBaling to construct an ArgInfo that + // represents an arg of a non-call instruction. + ArgInfo() : Info(GENERAL) {} + // Construct from a field read from the intrinsics info table. + ArgInfo(unsigned Info) : Info(Info) {} + // getCategory : return field category + unsigned getCategory() { return Info & CATMASK; } + // getLogAlignment : get any special alignment requirement, else 0 + unsigned getLogAlignment() { + if (isGeneral()) { + if (Info & GRFALIGNED) + return 5; + if (Info & OWALIGNED) + return 4; + return 0; + } + if (isRaw()) + return 5; + return 0; + } + // isGeneral : test whether this is a general operand + bool isGeneral() { return getCategory() == GENERAL; } + bool needsSigned() { + if (isGeneral()) + return Info & SIGNED; + if (isRaw()) + return Info & RAW_SIGNED; + return false; + } + bool needsUnsigned() { + if (isGeneral()) + return Info & UNSIGNED; + if (isRaw()) + return Info & RAW_UNSIGNED; + return false; + } + bool rawNullAllowed() { + assert(isRaw()); + return Info & RAW_NULLALLOWED; + } + // isArgOrRet : test whether this field has an arg index + bool isArgOrRet() { + if (isGeneral()) return true; + if ((Info & CATMASK) >= FIRST_OPERAND) + return true; + return false; + } + // isRealArgOrRet : test whether this field has an arg index, and is + // a "real" use of the arg + bool isRealArgOrRet() { + if (isGeneral()) return true; + if ((Info & CATMASK) >= FIRST_REAL_OPERAND) + return true; + return false; + } + // getArgCountMin : return minimum number of arguments + int getArgCountMin() { + assert(getCategory() == ARGCOUNT); + return (Info & ARGCOUNTMASK) >> FLAGBASE; + } + // getArgIdx : return argument index for this field, or -1 for return value + // (assuming isArgOrRet()) + int getArgIdx() { assert(isArgOrRet()); return (Info & OPNDMASK) - 1; } + // getLiteral : for a LITERAL or EXECSIZE field, return the literal value + unsigned getLiteral() { return Info & LITMASK; } + // isRet : test whether this is the field for the return value + // (assuming isArgOrRet()) + bool isRet() { return getArgIdx() < 0; } + // isRaw : test whether this is a raw arg or return value + bool isRaw() { return getCategory() == RAW; } + // getSaturation : return saturation info for the arg + unsigned getSaturation() { return Info & SATURATION; } + // getRestriction : return operand width/region restriction, one of + // 0 (no restriction), FIXED4, CONTIGUOUS, TWICEWIDTH + unsigned getRestriction() { return Info & RESTRICTION; } + // isImmediateDisallowed : test whether immediate disallowed + // (assuming isArgOrRet()) + bool isImmediateDisallowed() { + assert(isArgOrRet()); + if (isGeneral()) + return Info & NOIMM; + if (isRaw()) + return true; + switch (Info & CATMASK) { + case TWOADDR: + case PREDICATION: + case SURFACE: + case SAMPLER: + case VME: + return true; + default: break; + } + return false; + } + // getModifier : get what source modifier is allowed + unsigned getModifier() { + assert(isGeneral() && isArgOrRet() && !isRet()); + return Info & MODIFIER; + } + }; + // GenXIntrinsics::iterator : iterate through the fields + class iterator { + const DescrElementType *p; + public: + iterator(const DescrElementType *p) : p(p) {} + iterator &operator++() { ++p; if (*p == END) p = 0; return *this; } + ArgInfo operator*() { return ArgInfo(*p); } + bool operator!=(iterator i) { return p != i.p; } + }; + iterator begin() { + assert(isNotNull() && "iterating an intrinsic without info"); + return iterator(Args); + } + iterator end() { 
return iterator(0); } + // Construct a GenXIntrinsicInfo for a particular intrinsic + GenXIntrinsicInfo(unsigned IntrinId); + bool isNull() const { return *getInstDesc() == GenXIntrinsicInfo::END; } + bool isNotNull() const { return !isNull(); } + // Return instruction description. + const DescrElementType *getInstDesc() const { return Args; } + // Get the category and modifier for an arg idx + ArgInfo getArgInfo(int Idx); + // Get the trailing null zone, if any. + unsigned getTrailingNullZoneStart(CallInst *CI); + // Get the category and modifier for the return value + ArgInfo getRetInfo() { return getArgInfo(-1); } + // Get bitmap of allowed execution sizes + unsigned getExecSizeAllowedBits(); + // Determine if a predicated destination mask is permitted + bool getPredAllowed(); + // Get The overrided execution size or 0. + static unsigned getOverridedExecSize(CallInst *CI, + const GenXSubtarget *ST = nullptr); +}; + +} // namespace llvm +#endif // ndef GENXINTRINSICS_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLayoutBlocks.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLayoutBlocks.cpp new file mode 100644 index 000000000000..485635a8268c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLayoutBlocks.cpp @@ -0,0 +1,126 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXLayoutBlocks +/// ------------------- +/// +/// This pass tidies the control flow in the following way: +/// +/// It reorders blocks to increase fallthrough generally, and specifically +/// to ensure that SIMD CF goto and join have the required structure: the +/// "false" successor must be fallthrough and the "true" successor must be +/// forward. (The '"true" successor must be forward' requirement is a vISA +/// requirement, because vISA goto/join does not specify JIP, and the +/// finalizer reconstructs it on this assumption.) +/// +/// This pass is invoked in ISPC flow to ensure SIMD CF conformance. 
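+///
+/// A small illustration (block names are hypothetical): for a block A ending
+/// in a SIMD CF goto with "false" successor B and "true" successor C, the
+/// pass emits the layout
+///
+///   A:  goto ...   ; falls through to B, branches forward to C
+///   B:  ...
+///   C:  join ...
+///
+/// so that B is the fallthrough successor and C is reached by a forward
+/// branch, as the finalizer assumes when it reconstructs JIP.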
+// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_LAYOUTBLOCKS" + +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXGotoJoin.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXNumbering.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +/*********************************************************************** + * GenXLayoutBlocks pass declaration + */ +namespace { +class GenXLayoutBlocks : public FunctionPass { +public: + static char ID; + explicit GenXLayoutBlocks() : FunctionPass(ID) {} + virtual StringRef getPassName() const { return "GenX layout blocks"; } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addRequired(); + } + + bool runOnFunction(Function &F); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const { + return createGenXPrinterPass(O, Banner); + } +}; +} // end anonymous namespace. + +char GenXLayoutBlocks::ID = 0; +namespace llvm { +void initializeGenXLayoutBlocksPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXLayoutBlocks, "GenXLayoutBlocks", "GenXLayoutBlocks", + false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_END(GenXLayoutBlocks, "GenXLayoutBlocks", "GenXLayoutBlocks", + false, false) +namespace llvm { + FunctionPass *createGenXLayoutBlocksPass() { + initializeGenXLayoutBlocksPass(*PassRegistry::getPassRegistry()); + return new GenXLayoutBlocks; + } +} // namespace llvm + +/*********************************************************************** + * GenXLayoutBlocks::runOnFunction: + * reorder blocks to increase fallthrough, + * and specifically to satisfy the requirements of SIMD control flow + */ +bool GenXLayoutBlocks::runOnFunction(Function &F) { + if (F.empty()) + return false; + LoopInfo &LI = getAnalysis().getLoopInfo(); + if (LI.empty()) + LayoutBlocks(F); + else + LayoutBlocks(F, LI); + return true; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp new file mode 100644 index 000000000000..339cb22e5900 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLegalization.cpp @@ -0,0 +1,2613 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXLegalization +/// ---------------- +/// +/// GenXLegalization is a function pass that splits vector instructions +/// up to make execution widths legal, and to ensure that the GRF crossing rules +/// are satisfied. +/// +/// This pass makes the LLVM IR closer to legal vISA by +/// splitting up any instruction that has an illegal vector width (too big or +/// non power of two) or an illegal region (illegal vstride/width/stride or +/// illegal GRF crossing). +/// +/// **IR restriction**: After this pass, LLVM IR represents vISA instructions +/// with legal execution width and region parameters, and with any particular +/// instruction's region restrictions adhered to. +/// +/// The pass uses the instruction baling information to tell which +/// regions an instruction has. Splitting an instruction and its regions needs +/// to be done with reference to all the regions at the same time, as they may +/// need splitting at different points. +/// +/// For general values, an illegal width instruction is split by +/// creating narrower instructions, each of which uses a rdregion to extract the +/// subregion for each source operand, and then uses a wrregion to insert the +/// resulting subregion into the original destination value. The original +/// illegal width values survive, and that is OK because a vISA register can +/// have any vector width. +/// +/// The pass uses the hasIndirectGRFCrossing feature from GenXSubtarget when +/// calculating whether a region is legal, or how a region needs to be split, in +/// the case that the region is indirect. +/// +/// The legalization pass considers a bale of instructions as a separate +/// entity which can be split without reference to other bales. This works +/// because the overhead of splitting, which is an extra rdregion per operand +/// and an extra wrregion on the result, is pretty much free in that these extra +/// region accesses are baled in to the split instruction. +/// +/// There are some cases where we decide we need to unbale an instruction, i.e. +/// remove it (or rather the subtree of instructions in the bale rooted at it) +/// from the bale, and then re-start the analysis for the bale. This happens +/// when there are two conflicting requirements in the bale, for example a main +/// instruction that needs at least simd4 but a rdregion that can only manage +/// simd2. +/// +/// The pass scans backwards through the code, which makes this unbaling a bit +/// easier. An unbaled instruction will be encountered again a bit later, and +/// be processed as its own bale. +/// +/// If a source operand being split is already an rdregion, then that rdregion +/// is split, so the new split rdregions read from the original rdregion's +/// input. +/// +/// Similarly, if the bale is already headed by an wrregion, it is replaced by +/// the new split wrregions used to join the splits back together. +/// +/// BitCast is not split in this pass. 
A non-category-converting BitCast is +/// always coalesced in GenXCoalescing, so never generates actual code. Thus it +/// does not matter if it has an illegal size. +/// +/// Predicate legalization +/// ^^^^^^^^^^^^^^^^^^^^^^ +/// +/// Predicates (vector of i1) are more complex. A general vISA value can be any +/// vector width, but a predicate can only be a power of two up to 32. Thus the +/// actual predicate values need to be split, not just the reads from and writes +/// to the values. +/// +/// Furthermore, although it is possible to read and write a region within a +/// predicate, using H1/H2/Q1..Q4 flags, there are restrictions: the start +/// offset must be 8 aligned (4 aligned for a select or cmp with 64-bit +/// operands), and the size must be no more than the misalignment of the start +/// offset (e.g. for a start offset of 8, the size can be 8 but not 16). +/// +/// So this pass splits an arbitrary size predicate value (including predicate +/// phi nodes) into as many as possible 32 bit parts, then descending power of +/// two parts. For example, a predicate of size 37 is split into 32,4,1. +/// +/// Then, within each part, a read or write of the predicate can be further +/// split as long as it fits the restrictions above, e.g. a 32 bit part can be +/// read/written in 8 or 16 bit subregions. +/// +/// This is achieved in two steps: +/// +/// 1. Predicates take part in the main code of GenXLegalization. When deciding +/// how to split a read or write of a predicate, we determine how the predicate +/// value will be split into parts (e.g. the 37 split into 32,4,1 example +/// above), then decides how a part could be subregioned if necessary (e.g. +/// the 32 could have a 16 aligned 16 bit region, or an 8 aligned 8 bit +/// region). As well as a maximum, this usually gives a minimum size region. +/// If the rest of the bale cannot achieve that minimum size, then we unbale +/// to avoid the problem and restart the analysis of the bale. +/// +/// 2. Then, fixIllegalPredicates() actually divides the illegally sized +/// predicate values, including phi nodes. The splitting in the main part of +/// GenXLegalization ensures that no read or write of a predicate value +/// crosses a part boundary, so it is straightforward to split the values +/// into those parts. +/// +/// This is complicated by the case that the IR before legalization has an +/// rdpredregion. This typically happens when a CM select has odd size operands +/// but an i32 mask. Clang codegen bitcasts the i32 mask to v32i1, then does a +/// shufflevector to extract the correct size predicate. GenXLowering turns the +/// shufflevector into rdpredregion. The main code in GenXLegalization splits +/// the rdpredregion into several rdpredregions. +/// +/// In that case, we cannot guarantee that fixIllegalPredicates will find legal +/// rdpredregions. For example, suppose the original rdpredregion has a v32i1 as +/// input, and v13i1 as result. It is determined that the 13 bit predicate will +/// be split into 8,4,1 parts. The main GenXLegalization code will generate +/// an rdpredregion from the 32 bit predicate for each part of the 13 bit +/// predicate. However, the rdpredregion for the 1 bit part is illegal, because +/// its start offset is not 8 aligned. +/// +/// We currently do not cope with that (it will probably assert somewhere). If +/// we do find a need to cope with it, then the illegal rdpredregion will need +/// to be lowered to bit twiddling code. 
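+///
+/// A minimal sketch of the part decomposition described above (the real
+/// logic lives in getPredPart/getLegalPredSize; this loop is illustrative
+/// only):
+///
+///   // 37 -> 32, 4, 1    13 -> 8, 4, 1
+///   SmallVector<unsigned, 4> Parts;
+///   for (unsigned Rest = NumElts; Rest != 0;) {
+///     unsigned Part = std::min(32u, PowerOf2Floor(Rest));
+///     Parts.push_back(Part);
+///     Rest -= Part;
+///   }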
+/// +/// Other tasks of GenXLegalization +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// An additional task of this pass is to lower an any/all intrinsic that is +/// used anywhere other than as the predicate of a scalar wrregion by inserting +/// such a scalar wrregion with a byte 0/1 result and then a compare of that +/// to give an i1. +/// +/// A further task of this pass is to lower any predicated wrregion where the +/// value to write is a vector wider than 1 but the predicate is a scalar i1 +/// (other than the value 1, which means unpredicated). It inserts code to splat +/// the scalar i1 predicate to v16i1 or v32i1. This is really part of lowering, +/// but we need to do it here because in GenXLowering the value to write might +/// be wider than 32. +/// +/// An extra optimization performed in this pass is to transform a move (that +/// is, a lone wrregion or lone rdregion or a rdregion+wrregion baled together) +/// with a byte element type into the equivalent short or int move. This saves +/// the jitter having to split the byte move into even and odd halves. This +/// optimization needs to be done when baling info is available, so legalization +/// is a handy place to put it. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_LEGALIZATION" + +#include "GenX.h" +#include "GenXAlignmentInfo.h" +#include "GenXBaling.h" +#include "GenXIntrinsics.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "KillAnalysis.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" + +#include + +using namespace llvm; +using namespace genx; + +namespace { + +// Information on a part of a predicate. +struct PredPart { + unsigned Offset; + unsigned Size; + unsigned PartNum; +}; + +// min and max legal size for a predicate split +struct LegalPredSize { + unsigned Min; + unsigned Max; +}; + +// GenXLegalization : legalize execution widths and GRF crossing +class GenXLegalization : public FunctionPass { + enum { DETERMINEWIDTH_UNBALE = 0, DETERMINEWIDTH_NO_SPLIT = 256 }; + GenXBaling *Baling = nullptr; + const GenXSubtarget *ST = nullptr; + ScalarEvolution *SE = nullptr; + bool EnableTransformByteMove = true; + // Work variables when in the process of splitting a bale. + // The Bale being split. (Also info on whether it has FIXED4 and TWICEWIDTH + // operands.) + Bale B; + Use *Fixed4 = nullptr; + Use *TwiceWidth = nullptr; + // Map from the original instruction to the split one for the current index. + std::map SplitMap; + + // Consider reading from and writing to the same region in this bale, + // bale { + // W1 = rdr(V0, R) + // W2 = op(W1, ...) + // V1 = wrd(V0, W2, R) + // } + // if splitting the above bale into two bales + // bale { + // W1.0 = rdr(V0, R.0) + // W2.0 = op(W1.0, ...) + // V1.0 = wrr(V0, W2.0, R.0) + // } + // bale { + // W1.1 = rdr(V0, R.1) + // W2.1 = op(W1.1, ...) + // V1.1 = wrr(V1.0, W2.1, R1) + // } + // V1.0 and V0 are live at the same time. This makes copy-coalescing + // fail and also increases rp by the size of V0. 
+ // + // If we can prove that + // (*) rdr(V0, R.1) == rdr(V1.0, R.1) = rdr(wrr(V0, W2.0, R.0), R.1) + // then we could split the bale slightly differently: + // bale { + // W1.0 = rdr(V0, R.0) + // W2.0 = op(W1.0, ...) + // V1.0 = wrr(V0, W2.0, R.0) + // } + // bale { + // W1.1 = rdr(V1.0, R.1) + // W2.1 = op(W1.1, ...) + // V1.1 = wrr(V1.0, W2.1, R1) + // } + // If V0 is killed after this bale, then V1.0, V1.1 and V0 + // could be coalesced into a single variable. This is the pattern + // for in-place operations. + // + // To satisfy equation (*), it suffices to prove there is no overlap for any + // two neighbor subregions. This holds for the following two cases: + // (1) 1D direct regions or indirect regions with single offset + // (2) 2D direct regions with VStride >= Width, or indirect regions with + // single offset. + // + // While legalizing a bale ends with a g_store instruction, we produce the + // following code sequences. + // bale { + // V1 = rdr(V0, 0, 32) + // V2 = fadd V1, 1 + // store V2, p + // } + // ===> + // bale { + // V1.0 = rdr(V0, 0, 16) + // V2.0 = fadd V1.0, 1 + // V3.0 = wrr(load(p), V2.0, 0, 16) + // store V3.0, p + // } + // bale { + // V1.1 = rdr(V0, 16, 32) + // V2.1 = fadd V1.1, 1 + // V3.1 = wrr(load(p), V2.1, 16, 32) + // store V3.1, p + // } + // The instruction stream looks like: + // + // V1.0 = rdr(V0, 0, 16) + // V1.1 = rdr(V0, 16, 32) + // V2.0 = fadd V1.0, 1 + // V2.1 = fadd V1.1, 1 + // V3.0 = wrr(load(p), V2.0, 0, 16) + // store V3.0, p + // V3.1 = wrr(load(p), V2.1, 16, 32) + // store V3.1, p + // + // That is, this process does not produce region joins. + // + enum SplitKind { + SplitKind_Normal, // split bales without propagation. + SplitKind_Propagation, // split bales with propagation. + SplitKind_GStore // split bales end with g_store. + }; + SplitKind CurSplitKind = SplitKind_Normal; + // Current instruction in loop in runOnFunction, which gets adjusted if that + // instruction is erased. + Instruction *CurrentInst = nullptr; + // Illegally sized predicate values that need splitting at the end of + // processing the function. 
+ SetVector IllegalPredicates; + +public: + static char ID; + explicit GenXLegalization() : FunctionPass(ID) { clearBale(); } + virtual StringRef getPassName() const { + return "GenX execution width and GRF crossing legalization"; + } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const { + return createGenXPrinterPass(O, Banner); + } + +private: + void clearBale() { + B.clear(); + Fixed4 = nullptr; + TwiceWidth = nullptr; + } + unsigned getExecSizeAllowedBits(Instruction *Inst); + bool processInst(Instruction *Inst); + bool processBale(Instruction *InsertBefore); + bool noSplitProcessing(); + bool processAllAny(Instruction *Inst, Instruction *InsertBefore); + bool processBitCastFromPredicate(Instruction *Inst, + Instruction *InsertBefore); + bool processBitCastToPredicate(Instruction *Inst, Instruction *InsertBefore); + unsigned getExecutionWidth(); + unsigned determineWidth(unsigned WholeWidth, unsigned StartIdx); + unsigned determineNonRegionWidth(Instruction *Inst, unsigned StartIdx); + LegalPredSize getLegalPredSize(Value *Pred, Type *ElementTy, + unsigned StartIdx, unsigned RemainingSize = 0); + PredPart getPredPart(Value *V, unsigned Offset); + Value *splitBale(Value *Last, unsigned StartIdx, unsigned Width, + Instruction *InsertBefore); + Value *joinBaleInsts(Value *Last, unsigned StartIdx, + unsigned Width, Instruction *InsertBefore); + Value *joinBaleResult(Value *Last, Value *LastSplitInst, unsigned StartIdx, + unsigned Width, Instruction *InsertBefore); + Value *joinGStore(Value *Last, BaleInst GStore, BaleInst WrRegion, + unsigned StartIdx, unsigned Width, + Instruction *InserBefore); + Value *joinWrRegion(Value *Last, BaleInst BInst, unsigned StartIdx, + unsigned Width, Instruction *InserBefore); + Value *joinPredPredWrRegion(Value *Last, BaleInst BInst, unsigned StartIdx, + unsigned Width, Instruction *InserBefore); + Value *joinAnyWrRegion(Value *Last, BaleInst BInst, unsigned StartIdx, + unsigned Width, Instruction *InserBefore); + Value *splitInst(Value *Last, BaleInst BInst, unsigned StartIdx, + unsigned Width, Instruction *InsertBefore, + const DebugLoc &DL); + Value *getSplitOperand(Instruction *Inst, unsigned OperandNum, + unsigned StartIdx, unsigned Size, + Instruction *InsertBefore, const DebugLoc &DL); + Instruction *convertToMultiIndirect(Instruction *Inst, Value *LastJoinVal, + Region *R, Instruction *InsertBefore); + Instruction *transformByteMove(Bale *B); + Value *splatPredicateIfNecessary(Value *V, Type *ValueToWriteTy, + Instruction *InsertBefore, + const DebugLoc &DL); + Value *splatPredicateIfNecessary(Value *V, unsigned Width, + Instruction *InsertBefore, + const DebugLoc &DL); + void eraseInst(Instruction *Inst); + void removingInst(Instruction *Inst); + void fixIllegalPredicates(Function *F); + void fixIntrinsicCalls(Function *F); + SplitKind checkBaleSplittingKind(); +}; + +static const unsigned MaxPredSize = 32; + +} // end anonymous namespace + +char GenXLegalization::ID = 0; +namespace llvm { +void initializeGenXLegalizationPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXLegalization, "GenXLegalization", "GenXLegalization", + false, false) +INITIALIZE_PASS_DEPENDENCY(GenXFuncBaling) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(GenXLegalization, "GenXLegalization", "GenXLegalization", + false, false) + 
+FunctionPass *llvm::createGenXLegalizationPass() {
+  initializeGenXLegalizationPass(*PassRegistry::getPassRegistry());
+  return new GenXLegalization;
+}
+
+void GenXLegalization::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<GenXFuncBaling>();
+  AU.addRequired<ScalarEvolutionWrapperPass>();
+  AU.addPreserved();
+}
+
+/***********************************************************************
+ * GenXLegalization::runOnFunction : process one function to
+ *    legalize execution width and GRF crossing
+ */
+bool GenXLegalization::runOnFunction(Function &F) {
+  Baling = &getAnalysis<GenXFuncBaling>();
+  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+  auto P = getAnalysisIfAvailable();
+  ST = P ? P->getSubtarget() : nullptr;
+  // Check args for illegal predicates.
+  for (Function::arg_iterator fi = F.arg_begin(), fe = F.arg_end(); fi != fe;
+       ++fi) {
+    Argument *Arg = &*fi;
+    if (auto VT = dyn_cast<VectorType>(Arg->getType()))
+      if (VT->getElementType()->isIntegerTy(1))
+        assert(getPredPart(Arg, 0).Size == VT->getNumElements() &&
+               "function arg not allowed to be illegally sized predicate");
+  }
+
+  // TODO. remove this restriction.
+  for (auto &GV : F.getParent()->getGlobalList()) {
+    if (std::any_of(GV.user_begin(), GV.user_end(), [](Value *U) {
+          return isa<LoadInst>(U) || isa<StoreInst>(U);
+        })) {
+      EnableTransformByteMove = false;
+      break;
+    }
+  }
+
+  // Legalize instructions. This does a postordered depth first traversal of the
+  // CFG, and scans backwards in each basic block, to ensure that, if we unbale
+  // anything, it then gets processed subsequently.
+  for (po_iterator<BasicBlock *> i = po_begin(&F.getEntryBlock()),
+                                 e = po_end(&F.getEntryBlock());
+       i != e; ++i) {
+    BasicBlock *BB = *i;
+    // The effect of this loop is that we process the instructions in reverse
+    // order, and we re-process anything inserted before the instruction
+    // being processed. CurrentInst is a field in the GenXLegalization object,
+    // which gets updated if the instruction it points at is removed.
+    for (CurrentInst = BB->getTerminator(); CurrentInst;) {
+      // If processInst returns true, re-process the same instruction. This is
+      // used when unbaling.
+      while (processInst(CurrentInst))
+        LLVM_DEBUG(dbgs() << "reprocessing\n");
+      CurrentInst =
+          CurrentInst == &BB->front() ? nullptr : CurrentInst->getPrevNode();
+    }
+  }
+  fixIntrinsicCalls(&F);
+  fixIllegalPredicates(&F);
+  IllegalPredicates.clear();
+
+  return true;
+}
+
+/***********************************************************************
+ * getExecSizeAllowedBits : get bitmap of allowed execution sizes
+ *
+ * Enter:   Inst = main instruction of bale
+ *
+ * Return:  bit N set if execution size 1<<N is allowed
+ */
+unsigned GenXLegalization::getExecSizeAllowedBits(Instruction *Inst) {
+  switch (Inst->getOpcode()) {
+  default:
+    break;
+  case BinaryOperator::SDiv:
+  case BinaryOperator::UDiv:
+  case BinaryOperator::SRem:
+  case BinaryOperator::URem:
+    return ST->emulateIDivRem() ? 0x3f : 0x1f;
+  }
+
+  unsigned ID = GenXIntrinsic::getAnyIntrinsicID(Inst);
+  switch (ID) {
+  case GenXIntrinsic::genx_ssmad:
+  case GenXIntrinsic::genx_sumad:
+  case GenXIntrinsic::genx_usmad:
+  case GenXIntrinsic::genx_uumad:
+  case GenXIntrinsic::genx_ssmad_sat:
+  case GenXIntrinsic::genx_sumad_sat:
+  case GenXIntrinsic::genx_usmad_sat:
+  case GenXIntrinsic::genx_uumad_sat:
+  case Intrinsic::fma:
+    // Do not emit simd32 mad for pre-CNL.
+    return ST->isCNLplus() ? 0x3f : 0x1f;
+  default:
+    break;
+  }
+
+  if (CallInst *CI = dyn_cast<CallInst>(Inst)) {
+    // We have a call instruction, so we can assume it is an intrinsic since
+    // otherwise processInst would not have got as far as calling us as
+    // a non-intrinsic call forces isSplittable() to be false.
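The masks returned above (0x1f, 0x3f) follow the convention stated in the function's header comment: bit N set means execution size 1<<N is allowed. A small self-contained sketch of decoding such a mask, for illustration only:

    #include <cstdio>

    // Print the execution widths permitted by a GenX-style allowed-bits mask.
    // 0x1f prints "1 2 4 8 16"; 0x3f additionally prints "32".
    static void printAllowedWidths(unsigned AllowedBits) {
      for (unsigned N = 0; N < 6; ++N)
        if (AllowedBits & (1u << N))
          std::printf("%u ", 1u << N);
      std::printf("\n");
    }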
+ auto CalledF = CI->getCalledFunction(); + assert(CalledF); + GenXIntrinsicInfo II(GenXIntrinsic::getAnyIntrinsicID(CalledF)); + // While we have the intrinsic info, we also spot whether we have a FIXED4 + // operand and/or a TWICEWIDTH operand. + for (auto i = II.begin(), e = II.end(); i != e; ++i) { + auto ArgInfo = *i; + if (ArgInfo.isArgOrRet()) { + switch (ArgInfo.getRestriction()) { + case GenXIntrinsicInfo::FIXED4: + Fixed4 = &CI->getOperandUse(ArgInfo.getArgIdx()); + break; + case GenXIntrinsicInfo::TWICEWIDTH: + TwiceWidth = &CI->getOperandUse(ArgInfo.getArgIdx()); + break; + } + } + } + return II.getExecSizeAllowedBits(); + } + return 0x3f; +} + +/*********************************************************************** + * processInst : process one instruction to legalize execution width and GRF + * crossing + * + * Return: true to re-process same instruction (typically after unbaling + * something from it) + */ +bool GenXLegalization::processInst(Instruction *Inst) { + LLVM_DEBUG(dbgs() << "processInst: " << *Inst << "\n"); + if (Inst->isTerminator()) + return false; // ignore terminator + // Prepare to insert split code after current instruction. + auto InsertBefore = Inst->getNextNode(); + if (isa(Inst)) + return false; // ignore phi node + // Sanity check for illegal operand type + if ((Inst->getType()->getScalarType()->getPrimitiveSizeInBits() == 64) && + !(ST->hasLongLong())) + report_fatal_error( + "'double' and 'long long' type are not supported by this target"); + if (ST->isICLLP() || ST->isTGLLP()) { + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_ssad2: + case GenXIntrinsic::genx_sssad2add: + case GenXIntrinsic::genx_sssad2add_sat: + case GenXIntrinsic::genx_susad2add: + case GenXIntrinsic::genx_susad2add_sat: + case GenXIntrinsic::genx_usad2: + case GenXIntrinsic::genx_ussad2add: + case GenXIntrinsic::genx_ussad2add_sat: + case GenXIntrinsic::genx_uusad2add: + case GenXIntrinsic::genx_uusad2add_sat: + report_fatal_error("'sad2' and 'sada2' are not supported by this target"); + default: + break; + } + } + + if (!isa(Inst->getType())) { + if (Inst->getOpcode() == Instruction::BitCast && + Inst->getOperand(0)->getType()->getScalarType()->isIntegerTy(1)) { + // Special processing for bitcast from predicate to scalar int. + return processBitCastFromPredicate(Inst, InsertBefore); + } + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_all: + case GenXIntrinsic::genx_any: + return processAllAny(Inst, + InsertBefore); // Special processing for all/any + default: + break; + } + if (!isa(Inst)) + return false; // no splitting needed for other scalar op. + } + if (isa(Inst)) + return false; + if (isa(Inst)) { + if (Inst->getType()->getScalarType()->isIntegerTy(1)) { + // Special processing for bitcast from scalar int to predicate. + return processBitCastToPredicate(Inst, InsertBefore); + } + // Ignore any other bitcast. + return false; + } + + if (Baling->isBaled(Inst)) { + LLVM_DEBUG(dbgs() << "is baled\n"); + return false; // not head of bale, ignore + } + // No need to split an llvm.genx.constant with an undef value. + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_constanti: + case GenXIntrinsic::genx_constantf: + if (isa(Inst->getOperand(0))) + return false; + break; + default: + break; + } + clearBale(); + Baling->buildBale(Inst, &B); + // Get the main inst from the bale and decide whether it is something we do + // not split. If there is no main inst, the bale is splittable. 
+ if (auto MainInst = B.getMainInst()) { + if (isa(MainInst->Inst)) { + // No legalization for inline asm + if (cast(MainInst->Inst)->isInlineAsm()) + return false; + unsigned IntrinID = GenXIntrinsic::getAnyIntrinsicID(MainInst->Inst); + switch (IntrinID) { + case GenXIntrinsic::not_any_intrinsic: + return false; // non-intrinsic call, ignore + case GenXIntrinsic::genx_constantpred: + break; // these intrinsics can be split + default: + if (GenXIntrinsicInfo(IntrinID).getRetInfo().getCategory() != + GenXIntrinsicInfo::GENERAL) { + // This is not an ALU intrinsic (e.g. cm_add). + // We have a non-splittable intrinsic. Such an intrinsic can + // have a scalar arg with a baled in rdregion, which does not + // need legalizing. It never has a vector arg with a baled in + // rdregion. So no legalization needed. + return false; + } + break; + } + } else if (isa(MainInst->Inst)) { + // BitCast is not splittable in here. A non-category-converting BitCast + // is always coalesced in GenXCoalescing, so never generates actual + // code. Thus it does not matter if it has an illegal size. + return false; + } else if (auto LI = dyn_cast(MainInst->Inst)) { + (void)LI; + // Do not split a (global) load as it does not produce code. + return false; + } else if (isa(MainInst->Inst)) { + // If EV is main than it's related to inline assembly with + // multiple outputs, no legalization + return false; + } + // Any other instruction: split. + } + // Check if it is a byte move that we want to transform into a short/int move. + if (EnableTransformByteMove && transformByteMove(&B)) { + // Successfully transformed. Run legalization on the new instruction (which + // got inserted before the existing one, so will be processed next). + LLVM_DEBUG(dbgs() << "done transformByteMove\n"); + return false; + } + // Normal instruction splitting. + LLVM_DEBUG(dbgs() << "processBale: "; B.print(dbgs())); + + if (B.isGstoreBale() && !B.isGStoreBaleLegal()) { +#ifdef _DEBUG + dbgs() << "processBale: "; + B.print(dbgs()); +#endif + report_fatal_error("this g_store bale is not supported yet!"); + } + + return processBale(InsertBefore); +} + +/*********************************************************************** + * processBale : process one bale to legalize execution width and GRF crossing + * + * Return: true to re-process same head of bale + */ +bool GenXLegalization::processBale(Instruction *InsertBefore) { + // Get the current execution width. + unsigned WholeWidth = getExecutionWidth(); + if (WholeWidth == 1) + return false; // No splitting of scalar or 1-vector + + // Check the bale split kind if do splitting. + CurSplitKind = checkBaleSplittingKind(); + + // We will be generating a chain of joining wrregions. The initial "old + // value" input is undef. If the bale is headed by a wrregion or + // wrpredpredregion that is being split, code inside splitInst uses the + // original operand 0 for split 0 instead. + Value *Joined = nullptr; + // For bales ending with g_store, joining is not through wrr, but through + // g_load and g_store. + if (CurSplitKind != SplitKind::SplitKind_GStore) + Joined = UndefValue::get(B.getHeadIgnoreGStore()->Inst->getType()); + + // Do the splits. + for (unsigned StartIdx = 0; StartIdx != WholeWidth;) { + // Determine the width of the next split. + unsigned Width = determineWidth(WholeWidth, StartIdx); + if (Width == DETERMINEWIDTH_UNBALE) { + // determineWidth wants us to re-start processing from the head of the + // bale, because it did some unbaling. 
First erase any newly added + // instructions. + for (;;) { + Instruction *Erase = InsertBefore->getPrevNode(); + if (Erase == B.getHead()->Inst) + break; + eraseInst(Erase); + } + return true; // ask to re-start processing + } + if (Width == DETERMINEWIDTH_NO_SPLIT) + return noSplitProcessing(); // no splitting required + // Some splitting is required. This includes the case that there will be + // only one split (i.e. no splitting really required), but: + // * it includes an indirect rdregion that is converted to multi indirect; + // Create the next split. + Joined = splitBale(Joined, StartIdx, Width, InsertBefore); + StartIdx += Width; + } + if (!B.endsWithGStore()) + B.getHead()->Inst->replaceAllUsesWith(Joined); + // Erase the original bale. We erase in reverse order so erasing each one + // removes the uses of earlier ones. However we do not erase an instruction + // that still has uses; that happens for a FIXED4 operand. + InsertBefore = B.getHead()->Inst->getNextNode(); + for (auto bi = B.rbegin(), be = B.rend(); bi != be; ++bi) { + if (bi->Inst->use_empty()) + eraseInst(bi->Inst); + else { + // Do not erase this one as it still has a use; it must be a FIXED4 + // operand so it is used by the new split bales. Instead move it so it + // does not get re-processed by the main loop of this pass. + removingInst(bi->Inst); + bi->Inst->removeFromParent(); + bi->Inst->insertBefore(InsertBefore); + InsertBefore = bi->Inst; + } + } + return false; +} + +/*********************************************************************** + * noSplitProcessing : processing of a splttable bale in the case + * that it is not split + * + * Return: true to re-process same head of bale + */ +bool GenXLegalization::noSplitProcessing() { + if (auto SI = dyn_cast(B.getHeadIgnoreGStore()->Inst)) { + // Handle the case that a vector select has a scalar condition. + SI->setOperand(0, + splatPredicateIfNecessary(SI->getCondition(), SI->getType(), + SI, SI->getDebugLoc())); + } + return false; +} + +/*********************************************************************** + * processAllAny : legalize all/any + * + * Return: true to re-process same head of bale + */ +bool GenXLegalization::processAllAny(Instruction *Inst, + Instruction *InsertBefore) { + // See if the all/any is already legally sized. + Value *Pred = Inst->getOperand(0); + unsigned WholeSize = Pred->getType()->getVectorNumElements(); + if (getPredPart(Pred, 0).Size == WholeSize) { + // Already legally sized. We need to check whether it is used just in a + // branch or select, possibly via a not; if not we need to convert the + // result to a non-predicate then back to a predicate with a cmp, as there + // is no way of expressing a non-baled-in all/any in the generated code. + if (Inst->hasOneUse()) { + auto User = cast(Inst->use_begin()->getUser()); + if (isNot(User)) { + if (!User->hasOneUse()) + User = nullptr; + else + User = cast(User->use_begin()->getUser()); + } + if (User && (isa(User) || isa(User))) + return false; + } + // Do that conversion. 
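Stepping back to the slicing loop in processBale above: each iteration asks determineWidth for the largest width the bale tolerates at StartIdx and advances by that amount. A simplified standalone model, assuming the per-slice limit is just a power-of-two cap (the real determineWidth also folds in predicate and region constraints):

    #include <algorithm>
    #include <cstdio>

    // Model of the processBale slicing loop: take the largest power-of-two
    // width that fits both the remaining elements and the cap. For
    // WholeWidth=24 and CapWidth=16 this prints slices of 16 and 8.
    static void modelSplits(unsigned WholeWidth, unsigned CapWidth) {
      for (unsigned StartIdx = 0; StartIdx != WholeWidth;) {
        unsigned Remaining = WholeWidth - StartIdx;
        unsigned Width = 1;
        while (Width * 2 <= std::min(Remaining, CapWidth))
          Width *= 2;
        std::printf("slice at %u, width %u\n", StartIdx, Width);
        StartIdx += Width;
      }
    }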
+ const DebugLoc &DL = Inst->getDebugLoc(); + auto I16Ty = Type::getInt16Ty(Inst->getContext()); + auto V1I16Ty = VectorType::get(I16Ty, 1); + Region R(V1I16Ty); + R.Mask = Inst; + auto NewWr = cast(R.createWrRegion( + Constant::getNullValue(V1I16Ty), ConstantInt::get(I16Ty, 1), + Inst->getName() + ".allany_lowered", InsertBefore, DL)); + auto NewBC = CastInst::Create(Instruction::BitCast, NewWr, I16Ty, + NewWr->getName(), InsertBefore); + auto NewPred = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, NewBC, + Constant::getNullValue(I16Ty), + NewBC->getName(), InsertBefore); + NewPred->setDebugLoc(DL); + NewWr->setOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum, + UndefValue::get(Inst->getType())); + Inst->replaceAllUsesWith(NewPred); + NewWr->setOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum, Inst); + return false; + } + // It needs to be split. For each part, we have an all/any on that part, and + // use it to do a select on a scalar that keeps track of whether all/any set + // bits have been found. + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(Inst); + Type *I16Ty = Type::getInt16Ty(Inst->getContext()); + Value *Zero = Constant::getNullValue(I16Ty); + Value *One = ConstantInt::get(I16Ty, 1); + Value *Result = IID == GenXIntrinsic::genx_all ? One : Zero; + const DebugLoc &DL = Inst->getDebugLoc(); + for (unsigned StartIdx = 0; StartIdx != WholeSize;) { + auto PP = getPredPart(Pred, StartIdx); + auto Part = Region::createRdPredRegionOrConst( + Pred, StartIdx, PP.Size, Pred->getName() + ".split" + Twine(StartIdx), + InsertBefore, DL); + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = + GenXIntrinsic::getAnyDeclaration(M, IID, Part->getType()); + Instruction *NewAllAny = nullptr; + if (PP.Size != 1) + NewAllAny = CallInst::Create(Decl, Part, + Inst->getName() + ".split" + Twine(StartIdx), + InsertBefore); + else { + // Part is v1i1. All we need to do is bitcast it to i1, which does not + // generate any code. + NewAllAny = CastInst::Create( + Instruction::BitCast, Part, Part->getType()->getScalarType(), + Inst->getName() + ".split" + Twine(StartIdx), InsertBefore); + } + NewAllAny->setDebugLoc(DL); + SelectInst *Sel = nullptr; + if (IID == GenXIntrinsic::genx_all) + Sel = SelectInst::Create(NewAllAny, Result, Zero, + Inst->getName() + ".join" + Twine(StartIdx), + InsertBefore); + else + Sel = SelectInst::Create(NewAllAny, One, Result, + Inst->getName() + ".join" + Twine(StartIdx), + InsertBefore); + Sel->setDebugLoc(DL); + Result = Sel; + StartIdx += PP.Size; + } + // Add a scalar comparison to get the final scalar bool result. + auto Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, Result, Zero, + Inst->getName() + ".joincmp", InsertBefore); + // Replace and erase the old all/any. + Inst->replaceAllUsesWith(Cmp); + eraseInst(Inst); + return false; +} + +/*********************************************************************** + * processBitCastFromPredicate : legalize bitcast from predicate (vector of + * i1) to scalar int + */ +bool GenXLegalization::processBitCastFromPredicate(Instruction *Inst, + Instruction *InsertBefore) { + Value *Pred = Inst->getOperand(0); + unsigned SplitWidth = getPredPart(Pred, 0).Size; + if (SplitWidth == 0) + return false; +#if _DEBUG + unsigned WholeWidth = Pred->getType()->getVectorNumElements(); + assert(!(WholeWidth % SplitWidth) && "does not handle odd predicate sizes"); +#endif + // Bitcast each split predicate into an element of an int vector. 
+ // For example, if the split size is 16, then the result is a vector + // of i16. Then bitcast that to the original result type. + Type *IntTy = Type::getIntNTy(Inst->getContext(), SplitWidth); + unsigned NumSplits = Inst->getType()->getPrimitiveSizeInBits() / SplitWidth; + if (NumSplits == 1) + return false; + const DebugLoc &DL = Inst->getDebugLoc(); + Type *IntVecTy = VectorType::get(IntTy, NumSplits); + Value *Result = UndefValue::get(IntVecTy); + // For each split... + for (unsigned i = 0; i != NumSplits; ++i) { + // Bitcast that split of the predicate. + auto *NewBitCast = + CastInst::Create(Instruction::BitCast, + getSplitOperand(Inst, /*OperandNum=*/0, i * SplitWidth, + SplitWidth, InsertBefore, DL), + IntTy, Inst->getName() + ".split", InsertBefore); + NewBitCast->setDebugLoc(DL); + // Write it into the element of the vector. + Region R(Result); + R.getSubregion(i, 1); + Result = R.createWrRegion(Result, NewBitCast, + Inst->getName() + ".join" + Twine(i * SplitWidth), + InsertBefore, DL); + } + // Bitcast the vector to the original type. + auto *NewBitCast = + CastInst::Create(Instruction::BitCast, Result, Inst->getType(), + Inst->getName() + ".cast", InsertBefore); + NewBitCast->setDebugLoc(DL); + // Change uses and erase original. + Inst->replaceAllUsesWith(NewBitCast); + eraseInst(Inst); + return false; +} + +/*********************************************************************** + * processBitCastToPredicate : legalize bitcast to predicate (vector of + * i1) from scalar int + */ +bool GenXLegalization::processBitCastToPredicate(Instruction *Inst, + Instruction *InsertBefore) { + unsigned WholeWidth = Inst->getType()->getVectorNumElements(); + unsigned SplitWidth = getPredPart(Inst, 0).Size; + assert(!(WholeWidth % SplitWidth) && "does not handle odd predicate sizes"); + unsigned NumSplits = WholeWidth / SplitWidth; + if (NumSplits == 1) + return false; + // Bitcast the scalar int input to a vector of ints each with a number of + // bits matching the predicate split size. + const DebugLoc &DL = Inst->getDebugLoc(); + auto IVTy = VectorType::get(Type::getIntNTy(Inst->getContext(), SplitWidth), + WholeWidth / SplitWidth); + auto IntVec = CastInst::Create(Instruction::BitCast, Inst->getOperand(0), + IVTy, Inst->getName() + ".cast", InsertBefore); + IntVec->setDebugLoc(DL); + Value *Result = UndefValue::get(Inst->getType()); + Type *SplitPredTy = + VectorType::get(Inst->getType()->getScalarType(), SplitWidth); + // For each predicate split... + for (unsigned i = 0; i != NumSplits; ++i) { + // Get the element of the vector using rdregion. + Region R(IntVec); + R.getSubregion(i, 1); + auto NewRd = R.createRdRegion( + IntVec, Inst->getName() + ".rdsplit" + Twine(i), InsertBefore, DL); + // Bitcast that element of the int vector to a predicate. + auto NewPred = + CastInst::Create(Instruction::BitCast, NewRd, SplitPredTy, + Inst->getName() + ".split" + Twine(i), InsertBefore); + NewPred->setDebugLoc(DL); + // Join into the overall result using wrpredregion. + auto NewWr = Region::createWrPredRegion( + Result, NewPred, i * SplitWidth, Inst->getName() + ".join" + Twine(i), + InsertBefore, DL); + // If this is the first wrpredregion, add it to IllegalPredicates so it gets + // processed later in fixIllegalPredicates. + if (!i) + IllegalPredicates.insert(NewWr); + Result = NewWr; + } + // Change uses and erase original. 
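As a scalar model of the two lowerings above (processBitCastFromPredicate and processBitCastToPredicate), assuming a legal predicate part size of 16: each 16-wide part maps to one 16-bit lane of the integer, and the lanes concatenate back into the original 32-bit value:

    #include <cstdint>

    // Forward direction: two predicate parts become the two 16-bit lanes of
    // the i32 result (the wrregion joins in the pass play this role).
    static uint32_t joinPredicateParts(uint16_t Part0, uint16_t Part1) {
      return static_cast<uint32_t>(Part0) | (static_cast<uint32_t>(Part1) << 16);
    }

    // Reverse direction: slice the 32-bit value back into two 16-bit parts
    // (the rdregion/bitcast pairs in processBitCastToPredicate).
    static void splitPredicateParts(uint32_t Whole, uint16_t &Part0,
                                    uint16_t &Part1) {
      Part0 = static_cast<uint16_t>(Whole & 0xffffu);
      Part1 = static_cast<uint16_t>(Whole >> 16);
    }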
+ Inst->replaceAllUsesWith(Result); + eraseInst(Inst); + return false; +} + +/*********************************************************************** + * getExecutionWidth : get the execution width of the bale + * + * If there is no wrregion at the head of the bale, then the execution width is + * the width of the head. If there is a wrregion or wrpredpredregion, then the + * execution width is the width of the subregion input to the wrregion. + */ +unsigned GenXLegalization::getExecutionWidth() { + BaleInst *Head = B.getHeadIgnoreGStore(); + Value *Dest = Head->Inst; + if (Head->Info.Type == BaleInfo::WRREGION || + Head->Info.Type == BaleInfo::WRPREDREGION || + Head->Info.Type == BaleInfo::WRPREDPREDREGION) + Dest = Head->Inst->getOperand(1); + VectorType *VT = dyn_cast(Dest->getType()); + if (!VT) + return 1; + return VT->getNumElements(); +} + +/*********************************************************************** + * determineWidth : determine width of the next split + * + * Enter: WholeWidth = whole execution width of the bale before splitting + * StartIdx = start index of this split + * + * Return: width of next split, DETERMINEWIDTH_UNBALE if unbaling occurred, + * DETERMINEWIDTH_NO_SPLIT if no split required + * + * If this function returns WholeWidth rather than DETERMINEWIDTH_NO_SPLIT, it + * means that there is an indirect rdregion that needs to be converted to multi + * indirect. This is different to the condition of not needing a split at all, + * which causes this function to return DETERMINEWIDTH_NO_SPLIT. + */ +unsigned GenXLegalization::determineWidth(unsigned WholeWidth, + unsigned StartIdx) { + // Prepare to keep track of whether an instruction with a minimum width + // (e.g. dp4) would be split too small, and whether we need to unbale. + unsigned ExecSizeAllowedBits = 0x3f; + if (auto Main = B.getMainInst()) + ExecSizeAllowedBits = getExecSizeAllowedBits(Main->Inst); + unsigned MainInstMinWidth = + 1 << countTrailingZeros(ExecSizeAllowedBits, ZB_Undefined); + // Determine the vector width that we need to split into. + bool IsReadSameVector = false; + unsigned Width = WholeWidth - StartIdx; + unsigned PredMinWidth = 1; + Value *WrRegionInput = nullptr; + auto Head = B.getHeadIgnoreGStore(); + if (Head->Info.Type == BaleInfo::WRREGION) + WrRegionInput = + Head->Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + bool MustSplit = false; + for (Bale::iterator i = B.begin(), InstWithMinWidth = i, e = B.end(); i != e; + ++i) { + unsigned ThisWidth = Width; + // Determine the width we need for this instruction. + switch (i->Info.Type) { + case BaleInfo::WRREGION: { + bool Unbale = false; + Region R(i->Inst, i->Info); + if (R.Mask && + !i->Info.isOperandBaled(GenXIntrinsic::GenXRegion::PredicateOperandNum)) { + // We have a predicate, and it is not a baled in rdpredregion. (A + // baled in rdpredregion is handled when this loop reaches that + // instruction.) Get the min and max legal predicate size. + auto PredWidths = getLegalPredSize(R.Mask, R.ElementTy, StartIdx); + ThisWidth = std::min(ThisWidth, PredWidths.Max); + PredMinWidth = PredWidths.Min; + } + if (PredMinWidth > Width) { + // The min predicate size is bigger than the legal size for the rest + // of the bale other than the wrregion. Unbale the main instruction. + Unbale = true; + } + // Get the max legal size for the wrregion. 
+ ThisWidth = std::min( + ThisWidth, + R.getLegalSize( + StartIdx, false /*Allow2D*/, + i->Inst->getOperand(0)->getType()->getVectorNumElements(), ST, + &(Baling->AlignInfo))); + if (!Unbale && R.Mask && PredMinWidth > ThisWidth) { + // The min predicate size (from this wrregion) is bigger than the + // legal size for this wrregion. We have to rewrite the wrregion as: + // rdregion of the region out of the old value + // predicated wrregion, which now has a contiguous region + // wrregion (the original wrregion but with no predicate) + // then set DETERMINEWIDTH_UNBALE to restart. + auto DL = i->Inst->getDebugLoc(); + auto NewRd = R.createRdRegion( + i->Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum), + i->Inst->getName() + ".separatepred.rd", i->Inst, DL, false); + Baling->setBaleInfo(NewRd, BaleInfo(BaleInfo::RDREGION)); + Region R2(NewRd); + R2.Mask = R.Mask; + auto NewWr = cast(R2.createWrRegion( + NewRd, + i->Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum), + i->Inst->getName() + ".separatepred.wr", i->Inst, DL)); + auto NewBI = i->Info; + NewBI.clearOperandBaled(GenXIntrinsic::GenXRegion::WrIndexOperandNum); + Baling->setBaleInfo(NewWr, NewBI); + i->Inst->setOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum, NewWr); + i->Inst->setOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum, + Constant::getAllOnesValue(R.Mask->getType())); + i->Info.clearOperandBaled(GenXIntrinsic::GenXRegion::PredicateOperandNum); + i->Info.clearOperandBaled(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Baling->setBaleInfo(i->Inst, i->Info); + ThisWidth = DETERMINEWIDTH_UNBALE; + break; + } + if (PredMinWidth > ThisWidth) { + // The min predicate size (from a select baled into this wrregion) is + // bigger than the legal size for this wrregion. Unbale the select. + Unbale = true; + } + if (ThisWidth < MainInstMinWidth) { + // The wrregion is split too small for the main instruction. Unbale + // the main instruction. + Unbale = true; + } + if (Unbale) { + i->Info.clearOperandBaled(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Baling->setBaleInfo(i->Inst, i->Info); + ThisWidth = DETERMINEWIDTH_UNBALE; + } + break; + } + case BaleInfo::RDREGION: { + if (i->Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum) == + WrRegionInput) + IsReadSameVector = true; // See use of this flag below. + // Determine the max region width. If this rdregion is baled into a + // TWICEWIDTH operand, double the start index and half the resulting + // size. + Region R(i->Inst, i->Info); + unsigned Doubling = TwiceWidth && i->Inst == *TwiceWidth; + unsigned ModifiedStartIdx = StartIdx << Doubling; + if (Fixed4 && i->Inst == *Fixed4) + ModifiedStartIdx = 0; + ThisWidth = R.getLegalSize( + ModifiedStartIdx, true /*Allow2D*/, + i->Inst->getOperand(0)->getType()->getVectorNumElements(), ST, + &(Baling->AlignInfo)); + if (ThisWidth == 1 && + R.Indirect && !R.isMultiIndirect()) { + // This is a single indirect rdregion where we failed to make the + // valid size any more than one. If possible, increase the valid size + // to 4 or 8 on the assumption that we are going to convert it to a + // multi indirect. + auto NewThisWidth = 1 << genx::log2(R.Width - StartIdx % R.Width); + if (NewThisWidth >= 4) { + ThisWidth = std::min(NewThisWidth, 8); + MustSplit = true; + } + } + ThisWidth >>= Doubling; + if (ThisWidth < MainInstMinWidth) { + // The rdregion is split too small for the main instruction. + // Unbale the rdregion from its user (must be exactly one user as + // it is baled). 
Note that the user is not necessarily the main + // inst, it might be a modifier baled in to the main inst. + Value::use_iterator UI = i->Inst->use_begin(); + Instruction *User = cast(UI->getUser()); + BaleInfo BI = Baling->getBaleInfo(User); + BI.clearOperandBaled(UI->getOperandNo()); + Baling->setBaleInfo(User, BI); + ThisWidth = DETERMINEWIDTH_UNBALE; + } + break; + } + case BaleInfo::NOTP: + // Only process notp + // - if predicate is a vector and + // - if it does not have rdpredregion baled in. + if (!i->Info.isOperandBaled(0) && i->Inst->getType()->isVectorTy()) { + // Get the min and max legal predicate size. First get the element type + // from the wrregion or select that the notp is baled into. + Type *ElementTy = nullptr; + auto Head = B.getHeadIgnoreGStore()->Inst; + if (Head != i->Inst) + ElementTy = Head->getOperand(1)->getType()->getScalarType(); + auto PredWidths = + getLegalPredSize(i->Inst->getOperand(0), ElementTy, StartIdx); + // If the min legal predicate size is more than the remaining size in + // the predicate that the rdpredregion extracts, ignore it. This results + // in an illegal rdpredregion from splitInst, which then has to be + // lowered to less efficient code by fixIllegalPredicates. This + // situation arises when the original unsplit bale has an odd size + // rdpredregion out of a v32i1, from a CM select() where the mask is an + // i32. + if (PredWidths.Min <= WholeWidth - StartIdx) + PredMinWidth = PredWidths.Min; + ThisWidth = std::min(ThisWidth, PredWidths.Max); + } + break; + case BaleInfo::RDPREDREGION: { + unsigned RdPredStart = + cast(i->Inst->getOperand(1))->getZExtValue(); + // Get the min and max legal predicate size. + auto PredWidths = getLegalPredSize( + i->Inst->getOperand(0), // the input predicate + cast(i->Inst->use_begin()->getUser()) + ->getOperand(1) + ->getType() + ->getScalarType(), // the wrregion/select element type + RdPredStart + StartIdx); + // If the min legal predicate size is more than the remaining size in + // the predicate that the rdpredregion extracts, ignore it. This results + // in an illegal rdpredregion from splitInst, which then has to be + // lowered to less efficient code by fixIllegalPredicates. This situation + // arises when the original unsplit bale has an odd size rdpredregion + // out of a v32i1, from a CM select() where the mask is an i32. + if (PredWidths.Min <= WholeWidth - StartIdx) + PredMinWidth = PredWidths.Min; + ThisWidth = std::min(ThisWidth, PredWidths.Max); + break; + } + case BaleInfo::SHUFFLEPRED: { + // If shufflepred is baled with load with channels then it is always legal. + if (const BaleInst *BI = B.getMainInst()) { + unsigned IID = GenXIntrinsic::getGenXIntrinsicID(BI->Inst); + switch (IID) { + default: + break; + case GenXIntrinsic::genx_gather4_scaled2: + continue; + } + } + + // In other case we need to legalize it using rdpredregion. + // Probably later rdpredregion will be legalized further. + auto *SI = cast(i->Inst); + return ShuffleVectorAnalyzer::getReplicatedSliceDescriptor(SI).SliceSize; + } + case BaleInfo::ADDRADD: + case BaleInfo::ADDROR: + case BaleInfo::GSTORE: + break; + default: { + ThisWidth = determineNonRegionWidth(i->Inst, StartIdx); + Value *Pred = nullptr; + if (auto SI = dyn_cast(i->Inst)) { + Pred = SI->getCondition(); + if (!isa(Pred->getType())) { + // For a select with a scalar predicate, the predicate will be + // splatted by splatPredicateIfNecessary. We need to limit the legal + // width to the max predicate width. 
+ ThisWidth = std::min(ThisWidth, MaxPredSize); + Pred = nullptr; + } + } else if (isa(i->Inst)) + Pred = i->Inst; + if (Pred && isa(Pred->getType())) { + // For a select (with a vector predicate) or cmp, we need to take the + // predicate into account. Get the min and max legal predicate size. + auto PredWidths = getLegalPredSize( + Pred, i->Inst->getOperand(1)->getType()->getVectorElementType(), + StartIdx); + // If the min legal predicate size is more than the remaining size in + // the predicate that the rdpredregion extracts, ignore it. This results + // in an illegal rdpredregion from splitInst, which then has to be + // lowered to less efficient code by fixIllegalPredicates. This + // situation arises when the original unsplit bale has an odd size + // rdpredregion out of a v32i1, from a CM select() where the mask is an + // i32. + if (PredWidths.Min <= WholeWidth - StartIdx) + PredMinWidth = PredWidths.Min; + if (PredMinWidth > Width) { + // The min predicate size is bigger than the legal size for the + // rest of the bale so far. There must be a rdregion that needs to + // be split too much. Unbale it. + assert(InstWithMinWidth->Info.Type == BaleInfo::RDREGION); + Instruction *RdToUnbale = InstWithMinWidth->Inst; + Use *U = &*RdToUnbale->use_begin(); + auto User = cast(U->getUser()); + BaleInfo BI = Baling->getBaleInfo(User); + BI.clearOperandBaled(U->getOperandNo()); + Baling->setBaleInfo(User, BI); + ThisWidth = DETERMINEWIDTH_UNBALE; + } + ThisWidth = std::min(ThisWidth, PredWidths.Max); + } + break; + } + } + if (ThisWidth < Width) { + InstWithMinWidth = i; + Width = ThisWidth; + } + if (Width == DETERMINEWIDTH_UNBALE) + return DETERMINEWIDTH_UNBALE; + } + while (!(ExecSizeAllowedBits & Width)) { + // This width is disallowed by the main instruction. We have already + // dealt with the case where there is a minimum width above; the + // code here is for when there is a particular disallowed width + // (e.g. bfi disallows width 2 but allows 1). Try a smaller width. + assert(Width != 1); + Width >>= 1; + } + if (Width != WholeWidth && IsReadSameVector && + CurSplitKind == SplitKind_Normal) { + // Splitting required, and the bale contains a rdregion from the same + // vector as the wrregion's old value input, and we're not already + // unbaling. Splitting that would result + // in the original value of the vector and a new value being live at the + // same time, so we avoid it by unbaling the wrregion. The resulting + // code will use an intermediate smaller register for the result of the + // main inst before writing that back in to a region of the vector. + // + // Note that this unbaling is necessary despite pretty much the same + // thing being done in second baling in GenXBaling::unbaleBadOverlaps. + // Not doing the unbaling here results in code where the split rdregions + // and wrregions are interleaved, so the unbaling in + // GenXBaling::unbaleBadOverlaps does not actually stop the bad live range + // overlap. (This might change if we had a pass to schedule to reduce + // register pressure.) + auto Head = B.getHeadIgnoreGStore(); + Head->Info.clearOperandBaled(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Baling->setBaleInfo(Head->Inst, Head->Info); + LLVM_DEBUG( + dbgs() + << "GenXLegalization unbaling when rdr and wrr use same vector\n"); + return DETERMINEWIDTH_UNBALE; + } + if (Width == WholeWidth && !MustSplit) { + // No split required, so return that to the caller, which then just + // returns. 
However we do not do that if MustSplit is set, because there + // is some reason we need to go through splitting code anyway, one of: + // 1. there is an rdregion that needs to be converted to multi indirect; + // 2. there is an rdpredregion. + return DETERMINEWIDTH_NO_SPLIT; + } + + // If join is generated after splitting, need to check destination region rule + { + auto Head = B.getHeadIgnoreGStore(); + if (Head->Info.Type != BaleInfo::WRREGION && + Head->Info.Type != BaleInfo::WRPREDPREDREGION) { + auto VT = cast(Head->Inst->getType()); + unsigned VecSize = VT->getNumElements(); + if (VecSize != Width) { + if (!VT->getElementType()->isIntegerTy(1)) { + Region R(Head->Inst); + auto ThisWidth = R.getLegalSize(StartIdx, false /*no 2d for dst*/, + VecSize, ST, &(Baling->AlignInfo)); + if (ThisWidth < Width) { + Width = ThisWidth; + } + } + } + } + } + + return Width; +} + +/*********************************************************************** + * determineNonRegionWidth : determine max valid width of non-region instruction + * + * Enter: Inst = the instruction + * StartIdx = start index + * + * Return: max valid width + */ +unsigned GenXLegalization::determineNonRegionWidth(Instruction *Inst, + unsigned StartIdx) { + VectorType *VT = dyn_cast(Inst->getType()); + if (!VT) + return 1; + unsigned Width = VT->getNumElements() - StartIdx; + unsigned BytesPerElement = VT->getElementType()->getPrimitiveSizeInBits() / 8; + // Check whether the operand element size is bigger than the result operand + // size. Normally we just check operand 0. This won't work on a select, and + // we don't need to do the check on a select anyway as its operand and result + // type are the same. + if (!isa(Inst)) { + unsigned NumOperands = Inst->getNumOperands(); + if (CallInst *CI = dyn_cast(Inst)) + NumOperands = CI->getNumArgOperands(); + if (NumOperands) { + assert(isa(Inst->getOperand(0)->getType()) && + "instruction not supported"); + unsigned InBytesPerElement = + cast(Inst->getOperand(0)->getType()) + ->getElementType() + ->getPrimitiveSizeInBits() / + 8; + if (InBytesPerElement > BytesPerElement) + BytesPerElement = InBytesPerElement; + } + } + unsigned int TwoGRFWidth = ST ? (2 * ST->getGRFWidth()) : 64; + if (BytesPerElement) { + // Non-predicate result. + if (Width * BytesPerElement > TwoGRFWidth) + Width = TwoGRFWidth / BytesPerElement; + Width = 1 << genx::log2(Width); + } else { + // Predicate result. This is to handle and/or/xor/not of predicates; cmp's + // def of a predicate is handled separately where this function is called + // in determineWidth(). + Width = getPredPart(Inst, StartIdx).Size; + } + return Width; +} + +/*********************************************************************** + * getLegalPredSize : get legal predicate size + * + * Enter: Pred = predicate value + * ElementTy = element type, 0 to assume not 64 bit + * StartIdx = start index in that predicate + * RemainingSize = remaining size from StartIdx in whole vector + * operation being split, or 0 to imply from the + * number of elements in the type of Pred + * + * Return: Min = min legal size + * Max = max legal size + */ +LegalPredSize GenXLegalization::getLegalPredSize(Value *Pred, Type *ElementTy, + unsigned StartIdx, + unsigned RemainingSize) { + // Get details of the part containing StartIdx. + auto PP = getPredPart(Pred, StartIdx); + // Set Min to 8, or 4 if the element type of the operation using the + // intrinsic is 64 bit. Doing this ensures that the next split in the same + // part is on a legal offset. 
The offset of a split within a part must be 8 + // aligned, or 4 aligned if the element type is 64 bit. + LegalPredSize Ret; + Ret.Min = !ElementTy ? 8 : ElementTy->getPrimitiveSizeInBits() != 64 ? 8 : 4; + // Set Max to the remaining size left in this part, rounded down to a power + // of two. + unsigned LogMax = Log2_32(PP.Size - StartIdx + PP.Offset); + // However, Max cannot be any bigger than the misalignment of the offset into + // the part. For example. if the offset is 4 or 12, the size must be 4, not 8 + // or 16. + LogMax = std::min(LogMax, findFirstSet(StartIdx - PP.Offset)); + Ret.Max = 1 << LogMax; + // If Min>Max, then we're at the end of that part and we don't need to ensure + // that the next split in the same part is legally aligned. + Ret.Min = std::min(Ret.Min, Ret.Max); + return Ret; +} + +/*********************************************************************** + * getPredPart : get info on which part of a predicate an index is in + * + * Enter: V = a value of predicate type + * Offset = offset to get info on + * + * Return: PredPart struct with + * Offset = start offset of the part + * Size = size of the part + * PartNum = part number + * + * On entry, Offset is allowed to be equal to the total size of V, in which + * case the function returns PartNum = the number of parts and Size = 0. + * + * This function is what determines how an illegally sized predicate is divided + * into parts. It is constrained by vISA only allowing a power of two size for + * each part. Therefore it divides into zero or more 32 bit parts (currently 16 + * bit), then descending powers of two to fill up any odd size end. + * + * These parts correspond to how predicate values in the IR are divided up, not + * just how instructions that use or define them get legalized. Thus a + * predicate of size 13 actually gets divided into parts of 8,4,1 as vISA + * predicate registers P1,P2,P3 (for example). + */ +PredPart GenXLegalization::getPredPart(Value *V, unsigned Offset) { + unsigned WholeSize = V->getType()->getVectorNumElements(); + PredPart Ret; + if (Offset == WholeSize && !(WholeSize & (MaxPredSize - 1))) { + Ret.Offset = Offset; + Ret.Size = 0; + Ret.PartNum = Offset / MaxPredSize; + return Ret; + } + if ((Offset ^ WholeSize) & -MaxPredSize) { + // This is in one of the 32 bit parts. + Ret.Offset = Offset & -MaxPredSize; + Ret.Size = MaxPredSize; + Ret.PartNum = Offset / MaxPredSize; + return Ret; + } + // This is in the odd less-than-32 section at the end. + Ret.Offset = WholeSize & -MaxPredSize; + Ret.PartNum = WholeSize / MaxPredSize; + for (unsigned Pwr2 = MaxPredSize / 2U;; Pwr2 >>= 1) { + if (Pwr2 <= Offset - Ret.Offset) { + Ret.Offset += Pwr2; + ++Ret.PartNum; + if (Offset == WholeSize && Ret.Offset == Offset) { + Ret.Size = 0; + break; + } + } + if (Pwr2 <= WholeSize - Ret.Offset && Pwr2 > Offset - Ret.Offset) { + Ret.Size = Pwr2; + break; + } + } + return Ret; +} + +/************************************************************************ + * SplittableInsts : takes Bale and constructs the range of splittable + * instructions of this bale + * + * Splittable are those instructions that later will be split. By current design + * it is all instruction except last wrregion or wrregion+gstore. + * + * Usage: for (auto BI : SplittableInsts(B)), SplittableInst(B).begin(),... 
+ */ +class SplittableInsts { + Bale::iterator Begin; + Bale::iterator End; + +public: + SplittableInsts(Bale &SomeBale) : Begin(SomeBale.begin()) { + auto HeadIt = SomeBale.getHeadIgnoreGStoreIt(); + // Only WRREGION, WRPREDPREDREGION, GSTORE should be joined, thus the + // instructions before them should be split + if (HeadIt->Info.Type == BaleInfo::WRREGION || + HeadIt->Info.Type == BaleInfo::WRPREDPREDREGION) + End = HeadIt; + else { + assert(HeadIt->Info.Type != BaleInfo::GSTORE && + "GSTORE must have been considered before"); + End = SomeBale.end(); + } + } + Bale::iterator begin() { return Begin; } + Bale::iterator end() { return End; } +}; + +/*********************************************************************** + * joinBaleInsts : create join instructions in bale + * (2 in case of gstore, 1 - otherwise) + */ +Value *GenXLegalization::joinBaleInsts(Value *PrevSliceRes, unsigned StartIdx, + unsigned Width, + Instruction *InsertBefore) { + assert(SplittableInsts(B).end() != B.end() && + "must have some instructions to join in the bale"); + if (B.endsWithGStore()) { + assert(SplittableInsts(B).end() == B.getPreHeadIt() && + "a bale is considered to have only 1 dst, in case of GSTORE it's " + "represented by the last 2 instructions"); + return joinGStore(PrevSliceRes, *B.getHead(), *B.getPreHead(), StartIdx, + Width, InsertBefore); + } else { + assert(SplittableInsts(B).end() == B.getHeadIt() && + "a bale is considered to have only 1 dst, in common case it's " + "represented by the last instruction"); + return joinAnyWrRegion(PrevSliceRes, *B.getHead(), StartIdx, Width, + InsertBefore); + } +} + +/*********************************************************************** + * If the last instruction in the created bale is a split instruction, + * need to join this result into the overall result with a wrregion or + * wrpredregion. Do not generate the join if it is a write into the whole + * of the overall result, which can happen when going through the split + * code even when no split is required other than conversion to multi + * indirect. + */ +Value *GenXLegalization::joinBaleResult(Value *PrevSliceRes, + Value *LastSplitInst, unsigned StartIdx, + unsigned Width, + Instruction *InsertBefore) { + assert(PrevSliceRes && LastSplitInst && InsertBefore && "wrong arguments"); + auto Head = B.getHeadIgnoreGStore()->Inst; + auto VT = cast(Head->getType()); + assert(VT->getNumElements() != Width && + "there's no need to join results if they have the proper type"); + if (VT->getElementType()->isIntegerTy(1)) { + auto NewWr = Region::createWrPredRegion( + PrevSliceRes, LastSplitInst, StartIdx, + LastSplitInst->getName() + ".join" + Twine(StartIdx), InsertBefore, + Head->getDebugLoc()); + // If this is the first wrpredregion into an illegally sized predicate, + // save it for processing later. (Only the first one could possibly be + // the root of a tree of wrpredregions, and only the roots of + // wrpredregion trees need to be in IllegalPredicates.) 
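Referring back to getPredPart above: an illegally sized predicate is carved into whole MaxPredSize (32) parts first, then descending powers of two for the odd tail, so a 13-element predicate becomes parts of 8, 4 and 1. A small model of that documented decomposition, not the pass code itself:

    #include <cstdio>

    // Enumerate the parts described in getPredPart's comment. For
    // WholeSize=13 this prints parts of size 8, 4 and 1 at offsets 0, 8, 12.
    static void printPredParts(unsigned WholeSize) {
      unsigned Offset = 0;
      while (WholeSize - Offset >= 32) { // whole 32-wide parts
        std::printf("part at %u, size 32\n", Offset);
        Offset += 32;
      }
      for (unsigned Pwr2 = 16; Pwr2 >= 1 && Offset != WholeSize; Pwr2 >>= 1)
        if (WholeSize - Offset >= Pwr2) { // descending powers of two
          std::printf("part at %u, size %u\n", Offset, Pwr2);
          Offset += Pwr2;
        }
    }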
+ if (!StartIdx) { + auto PredSize = getLegalPredSize(NewWr, nullptr, 0); + if (PredSize.Max != NewWr->getType()->getVectorNumElements()) + IllegalPredicates.insert(NewWr); + } + return NewWr; + } else { + Region R(Head); + R.Width = R.NumElements = Width; + R.Offset = StartIdx * R.ElementBytes; + return R.createWrRegion(PrevSliceRes, LastSplitInst, + LastSplitInst->getName() + ".join" + + Twine(StartIdx), + InsertBefore, Head->getDebugLoc()); + } +} + +/*********************************************************************** + * splitBale : create one slice of the bale + * + * Enter: PrevSliceRes = result of previously created bale slice, + * undef if this is the first one + * StartIdx = element start index for this slice + * Width = number of elements in this slice + * InsertBefore = insert new inst before this point + * + * Return: result of this split + */ +Value *GenXLegalization::splitBale(Value *PrevSliceRes, unsigned StartIdx, + unsigned Width, Instruction *InsertBefore) { + Value *LastCreatedInst = nullptr; + auto SplittableInstsRange = SplittableInsts(B); + for (auto BI : SplittableInstsRange) + // Split the instruction. + SplitMap[BI.Inst] = LastCreatedInst = + splitInst(PrevSliceRes, BI, StartIdx, Width, InsertBefore, + BI.Inst->getDebugLoc()); + if (SplittableInstsRange.end() != B.end()) + LastCreatedInst = + joinBaleInsts(PrevSliceRes, StartIdx, Width, InsertBefore); + else { + assert(LastCreatedInst && "must have at least some split inst"); + auto Head = B.getHeadIgnoreGStore()->Inst; + if (cast(Head->getType())->getNumElements() != Width) + LastCreatedInst = joinBaleResult(PrevSliceRes, LastCreatedInst, StartIdx, + Width, InsertBefore); + } + SplitMap.clear(); + return LastCreatedInst; +} + +// joins both gstore inst and the wrregion which gstore stores +// more info at joinAnyWrRegion +Value *GenXLegalization::joinGStore(Value *PrevSliceRes, BaleInst GStore, + BaleInst WrRegion, unsigned StartIdx, + unsigned Width, Instruction *InsertBefore) { + assert(GStore.Info.Type == BaleInfo::GSTORE && "wrong argument"); + Value *Op = + joinAnyWrRegion(PrevSliceRes, WrRegion, StartIdx, Width, InsertBefore); + return new StoreInst(Op, GStore.Inst->getOperand(1), /*volatile*/ true, + InsertBefore); +} + +// specialized join function for wrregion instruction +// more info at joinAnyWrRegion +Value *GenXLegalization::joinWrRegion(Value *PrevSliceRes, BaleInst BInst, + unsigned StartIdx, unsigned Width, + Instruction *InsertBefore) { + assert(BInst.Info.Type == BaleInfo::WRREGION && "wrong argument"); + Region R(BInst.Inst, BInst.Info); + R.getSubregion(StartIdx, Width); + if (R.Mask && isa(R.Mask->getType())) + R.Mask = getSplitOperand( + BInst.Inst, GenXIntrinsic::GenXRegion::PredicateOperandNum, StartIdx, + Width, InsertBefore, BInst.Inst->getDebugLoc()); + // For SplitIdx==0, the old vector value comes from the original + // wrregion. Otherwise it comes from the split wrregion created + // last time round. + Value *In = !StartIdx ? 
BInst.Inst->getOperand(0) : PrevSliceRes; + if (CurSplitKind == SplitKind::SplitKind_GStore && StartIdx != 0) { + Instruction *ST = B.getHead()->Inst; + assert(isa(ST)); + Value *GV = ST->getOperand(1); + In = new LoadInst(GV, ".gload", /*volatile*/ true, InsertBefore); + } + Value *NewWrRegion = + R.createWrRegion(In, + getSplitOperand(BInst.Inst, 1, StartIdx, Width, + InsertBefore, BInst.Inst->getDebugLoc()), + BInst.Inst->getName() + ".join" + Twine(StartIdx), + InsertBefore, BInst.Inst->getDebugLoc()); + return NewWrRegion; +} + +// specialized join function for wrpredpredregion instruction +// more info at joinAnyWrRegion +Value *GenXLegalization::joinPredPredWrRegion(Value *PrevSliceRes, + BaleInst BInst, unsigned StartIdx, + unsigned Width, + Instruction *InsertBefore) { + assert(BInst.Info.Type == BaleInfo::WRPREDPREDREGION && "wrong argument"); + unsigned WrPredStart = + cast(BInst.Inst->getOperand(2))->getZExtValue(); + Value *WrPredNewVal = getSplitOperand( + BInst.Inst, 1, StartIdx, Width, InsertBefore, BInst.Inst->getDebugLoc()); + // For SplitIdx==0, the old vector value comes from the original + // wrregion. Otherwise it comes from the split wrregion created + // last time round. + Value *In = !StartIdx ? BInst.Inst->getOperand(0) : PrevSliceRes; + // Create the split wrpredpredregion. Note that the mask is passed in + // its original unsplit form; the spec of wrpredpredregion is that the + // mask is the same size as the result, and the index is used to slice + // the mask as well as to determine the slice where the value is written + // in the result. + return Region::createWrPredPredRegion( + In, WrPredNewVal, StartIdx + WrPredStart, BInst.Inst->getOperand(3), + BInst.Inst->getName() + ".join" + Twine(StartIdx), InsertBefore, + BInst.Inst->getDebugLoc()); +} + +/*********************************************************************** + * joinAnyWrRegion : join any wrregion instruction in the bale + * + * Enter: PrevSliceRes = result of previously created bale slice, + * undef if this is the first one + * BInst = the BaleInst to join + * StartIdx = element start index for this slice + * Width = number of elements in this slice + * InsertBefore = insert new inst before this point + * + * Return: the new join value. Join value/instruction has original ("illegal") + * width elements. Each bale slice writes its own part of the value. + */ +Value *GenXLegalization::joinAnyWrRegion(Value *PrevSliceRes, BaleInst BInst, + unsigned StartIdx, unsigned Width, + Instruction *InsertBefore) { + switch (BInst.Info.Type) { + case BaleInfo::WRREGION: + return joinWrRegion(PrevSliceRes, BInst, StartIdx, Width, InsertBefore); + break; + case BaleInfo::WRPREDPREDREGION: + return joinPredPredWrRegion(PrevSliceRes, BInst, StartIdx, Width, + InsertBefore); + break; + default: + llvm_unreachable("unexpected/unsupported instruction"); + } +} + +/*********************************************************************** + * splitInst : split an instruction in the bale + * + * Enter: PrevSliceRes = result of previous bale slice, + * undef if this is the first one + * BInst = the BaleInst to split + * StartIdx = element start index for this slice + * Width = number of elements in this slice + * InsertBefore = insert new inst before this point + * DL = debug location to give new instruction(s) + * + * Return: the new split value + * Split value/instruction has Width elements. 
+ */ +Value *GenXLegalization::splitInst(Value *PrevSliceRes, BaleInst BInst, + unsigned StartIdx, unsigned Width, + Instruction *InsertBefore, + const DebugLoc &DL) { + switch (BInst.Info.Type) { + case BaleInfo::GSTORE: + case BaleInfo::WRREGION: + case BaleInfo::WRPREDPREDREGION: + llvm_unreachable("these instructions must be processed in join functions"); + break; + case BaleInfo::RDREGION: { + // Allow for this being a rdregion baled in to a TWICEWIDTH operand. + // If it is, double the start index and width. + unsigned Doubling = TwiceWidth && BInst.Inst == *TwiceWidth; + StartIdx <<= Doubling; + Width <<= Doubling; + // Get the subregion. + Region R(BInst.Inst, BInst.Info); + // Check whether this is an indirect operand that was allowed only + // because we assumed that we are going to convert it to a multi + // indirect. + bool ConvertToMulti = + R.Indirect && Width != 1 && + R.getLegalSize( + StartIdx, true /*Allow2D*/, + BInst.Inst->getOperand(0)->getType()->getVectorNumElements(), ST, + &(Baling->AlignInfo)) == 1; + + R.getSubregion(StartIdx, Width); + // The region to read from. This is normally from the input region baled + // in. If this is reading from and writing to the same region and + // split progapation is on, then just reading from the last joined value + // (but not the initial undef). + // + Value *OldVal = BInst.Inst->getOperand(0); + if (PrevSliceRes && !isa(PrevSliceRes) && + CurSplitKind == SplitKind_Propagation) { + auto Head = B.getHeadIgnoreGStore(); + if (Head->Info.Type == BaleInfo::WRREGION) { + Value *WrRegionInput = Head->Inst->getOperand(0); + if (OldVal == WrRegionInput) + OldVal = PrevSliceRes; + } + } + if (!ConvertToMulti) { + // Not converting to multi indirect. + return R.createRdRegion( + OldVal, BInst.Inst->getName() + ".split" + Twine(StartIdx), + InsertBefore, DL); + } + // Converting to multi indirect. + return convertToMultiIndirect(BInst.Inst, OldVal, &R, InsertBefore); + } + case BaleInfo::RDPREDREGION: { + unsigned RdPredStart = + cast(BInst.Inst->getOperand(1))->getZExtValue(); + Value *RdPredInput = BInst.Inst->getOperand(0); + return Region::createRdPredRegionOrConst( + RdPredInput, RdPredStart + StartIdx, Width, + BInst.Inst->getName() + ".split" + Twine(StartIdx), InsertBefore, DL); + } + case BaleInfo::SHUFFLEPRED: { + // If we need to split predication shuffle vector, then we definitely failed to + // bale it with channel instruction. In this case we do not need such complicated + // predication logic anymore and can fallback to rdpredregions. + auto *SI = cast(BInst.Inst); + auto RS = ShuffleVectorAnalyzer::getReplicatedSliceDescriptor(SI); + assert(RS.SliceSize == Width && "Unexpected width for predicate shuffle split"); + Value *Pred = SI->getOperand(0); + return Region::createRdPredRegionOrConst( + Pred, RS.InitialOffset, Width, + SI->getName() + ".split" + Twine(StartIdx), InsertBefore, DL); + } + } + // Splitting non-region instruction. 
+ assert(!isa(BInst.Inst) && "not expecting to split phi node"); + if (CastInst *CI = dyn_cast(BInst.Inst)) { + Type *CastToTy = VectorType::get( + cast(CI->getType())->getElementType(), Width); + Instruction *NewInst = CastInst::Create( + CI->getOpcode(), + getSplitOperand(CI, 0, StartIdx, Width, InsertBefore, DL), CastToTy, + CI->getName() + ".split" + Twine(StartIdx), InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; + } + if (BinaryOperator *BO = dyn_cast(BInst.Inst)) { + Instruction *NewInst = BinaryOperator::Create( + BO->getOpcode(), + getSplitOperand(BO, 0, StartIdx, Width, InsertBefore, DL), + getSplitOperand(BO, 1, StartIdx, Width, InsertBefore, DL), + BO->getName() + ".split" + Twine(StartIdx), InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; + } +#if (LLVM_VERSION_MAJOR > 8) + if (UnaryOperator *UO = dyn_cast(BInst.Inst)) { + Instruction *NewInst = UnaryOperator::Create( + UO->getOpcode(), + getSplitOperand(UO, 0, StartIdx, Width, InsertBefore, DL), + UO->getName() + ".split" + Twine(StartIdx), InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; + } +#endif + if (CmpInst *CI = dyn_cast(BInst.Inst)) { + Instruction *NewInst = CmpInst::Create( + CI->getOpcode(), CI->getPredicate(), + getSplitOperand(CI, 0, StartIdx, Width, InsertBefore, DL), + getSplitOperand(CI, 1, StartIdx, Width, InsertBefore, DL), + CI->getName() + ".split" + Twine(StartIdx), InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; + } + if (auto SI = dyn_cast(BInst.Inst)) { + Value *Selector = getSplitOperand(SI, 0, StartIdx, Width, InsertBefore, DL); + Selector = splatPredicateIfNecessary(Selector, Width, InsertBefore, DL); + auto Split1 = getSplitOperand(SI, 1, StartIdx, Width, InsertBefore, DL); + auto Split2 = getSplitOperand(SI, 2, StartIdx, Width, InsertBefore, DL); + auto NewInst = SelectInst::Create( + Selector, Split1, Split2, SI->getName() + ".split" + Twine(StartIdx), + InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; + } + // Must be a splittable intrinsic. + CallInst *CI = dyn_cast(BInst.Inst); + assert(CI); + auto CalledF = CI->getCalledFunction(); + assert(CalledF); + unsigned IntrinID = GenXIntrinsic::getAnyIntrinsicID(CalledF); + assert(GenXIntrinsic::isAnyNonTrivialIntrinsic(IntrinID)); + if (IntrinID == GenXIntrinsic::genx_constanti || + IntrinID == GenXIntrinsic::genx_constantf) { + // This is the constant loading intrinsic. + // We don't need to load the split constants, since a constant value-to- + // write operand is valid in the wrregions that will be used to link + // the values back together. + return getSplitOperand(BInst.Inst, 0, StartIdx, Width, InsertBefore, DL); + } + + // Some other splittable intrinsic. 
+ SmallVector Args; + SmallVector OverloadedTypes; + OverloadedTypes.push_back( + VectorType::get(cast(BInst.Inst->getType())->getElementType(), + Width)); // RetTy + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) { + Use *U = &CI->getOperandUse(i); + if (U == Fixed4) { + Args.push_back(CI->getArgOperand(i)); + } else if (U == TwiceWidth) { + // TWICEWIDTH: operand is twice the width of other operand and result + Args.push_back(getSplitOperand(BInst.Inst, i, StartIdx * 2, Width * 2, + InsertBefore, DL)); + } else + Args.push_back( + getSplitOperand(BInst.Inst, i, StartIdx, Width, InsertBefore, DL)); + if (GenXIntrinsic::isOverloadedArg((GenXIntrinsic::ID)IntrinID, i)) + OverloadedTypes.push_back(Args[i]->getType()); + } + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = + GenXIntrinsic::getAnyDeclaration(M, IntrinID, OverloadedTypes); + Instruction *NewInst = CallInst::Create( + Decl, Args, CI->getName() + ".split" + Twine(StartIdx), InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; +} + +/*********************************************************************** + * getSplitOperand : get a possibly split operand + * + * Enter: Inst = original non-split instruction + * OperandNum = operand number we want + * StartIdx = element start index for this split + * Size = number of elements in this split + * InsertBefore = where to insert any added rdregion + * DL = debug location to give new instruction(s) + * + * If the requested operand is a constant, it splits the constant. + * Otherwise it creates an rdregion from the original operand. + */ +Value *GenXLegalization::getSplitOperand(Instruction *Inst, unsigned OperandNum, + unsigned StartIdx, unsigned Size, + Instruction *InsertBefore, + const DebugLoc &DL) { + Value *V = Inst->getOperand(OperandNum); + if (!isa(V->getType())) + return V; // operand not vector, e.g. variable index in region + if (auto C = dyn_cast(V)) + return getConstantSubvector(C, StartIdx, Size); + // Split a non-constant vector. + if (Instruction *OperandInst = dyn_cast(V)) { + auto i = SplitMap.find(OperandInst); + if (i != SplitMap.end()) { + // Operand is another instruction in the bale being split. + return i->second; + } + } + // Non-constant operand not baled in. + // Create an rdregion for the operand. + if (!V->getType()->getScalarType()->isIntegerTy(1)) { + Region R(V); + R.getSubregion(StartIdx, Size); + return R.createRdRegion(V, V->getName() + ".split" + Twine(StartIdx), + InsertBefore, DL); + } + // Predicate version. 
+  return Region::createRdPredRegion(V, StartIdx, Size,
+                                    V->getName() + ".split" + Twine(StartIdx),
+                                    InsertBefore, DL);
+}
+
+/***********************************************************************
+ * convertToMultiIndirect : convert a rdregion into multi-indirect
+ *
+ * Enter:   Inst = original rdregion
+ *          LastJoinVal = the actual value to read from
+ *          R = region for it, already subregioned if applicable
+ *
+ * Return:  new rdregion instruction (old one has not been erased)
+ */
+Instruction *
+GenXLegalization::convertToMultiIndirect(Instruction *Inst, Value *LastJoinVal,
+                                         Region *R, Instruction *InsertBefore) {
+  assert(!R->is2D() && (R->NumElements == 4 || R->NumElements == 8));
+  Value *Indirect = R->Indirect;
+  assert(Indirect);
+  const DebugLoc &DL = Inst->getDebugLoc();
+
+  // scalar indirect index
+  if (R->Stride == 1 && !R->is2D() && !isa<VectorType>(Indirect->getType()) &&
+      ST->hasIndirectGRFCrossing()) {
+    Instruction *NewInst =
+        R->createRdRegion(LastJoinVal, Inst->getName(), InsertBefore, DL);
+    return NewInst;
+  }
+
+  // 1. Splat the address. (We will get multiple copies of this
+  // instruction, one per split, but they will be CSEd away.)
+  Instruction *SplattedIndirect = CastInst::Create(
+      Instruction::BitCast, Indirect, VectorType::get(Indirect->getType(), 1),
+      Twine(Indirect->getName()) + ".splat", InsertBefore);
+  SplattedIndirect->setDebugLoc(DL);
+  Region AddrR(SplattedIndirect);
+  AddrR.Stride = 0;
+  AddrR.Width = AddrR.NumElements = R->NumElements;
+  SplattedIndirect = AddrR.createRdRegion(
+      SplattedIndirect, SplattedIndirect->getName(), InsertBefore, DL);
+  // 2. Add the constant vector <0,1,2,3,4,5,6,7> to it (adjusted
+  // for stride in bytes).
+  uint16_t OffsetValues[8];
+  for (unsigned i = 0; i != 8; ++i)
+    OffsetValues[i] = i * (R->Stride * R->ElementBytes);
+  Constant *Offsets = ConstantDataVector::get(
+      InsertBefore->getContext(),
+      ArrayRef<uint16_t>(OffsetValues).slice(0, R->NumElements));
+  SplattedIndirect =
+      BinaryOperator::Create(Instruction::Add, SplattedIndirect, Offsets,
+                             SplattedIndirect->getName(), InsertBefore);
+  SplattedIndirect->setDebugLoc(DL);
+  // 3. Create the multi indirect subregion.
+  R->Indirect = SplattedIndirect;
+  R->VStride = R->Stride;
+  R->Stride = 1;
+  R->Width = 1;
+  Instruction *NewInst =
+      R->createRdRegion(LastJoinVal, Inst->getName(), InsertBefore, DL);
+  return NewInst;
+}
+
+/***********************************************************************
+ * transformByteMove : transform a byte move into short or int move
+ *
+ * Enter:   B = bale (not necessarily a byte move)
+ *
+ * Return:  0 if nothing changed, else the new head of bale (ignoring the
+ *          bitcasts inserted either side)
+ *
+ * If the bale is a byte move (a lone wrregion or lone rdregion or
+ * rdregion+wrregion where the element type is byte), and the region parameters
+ * are suitably aligned, we turn it into a short or int move. This saves the
+ * jitter having to split the byte move into an even half and an odd half.
+ *
+ * If the code is modified, it updates bale info.
+ *
+ * This optimization needs to be done when baling info is available, so
+ * legalization is a handy place to put it.
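+ *
+ * Illustrative example (not from the original comment): a bale that moves
+ * <16 x i8> data where the element counts, widths, vstrides and offsets of
+ * both regions are all multiples of 4 is rewritten as a <4 x i32> move
+ * between bitcasts; if they are only even, it becomes an <8 x i16> move.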
+ */ +Instruction *GenXLegalization::transformByteMove(Bale *B) { + auto HeadBI = B->getHead(); + Instruction *Head = HeadBI->Inst; + if (!Head->getType()->getScalarType()->isIntegerTy(8)) + return nullptr; + Instruction *Wr = nullptr, *Rd = nullptr; + if (HeadBI->Info.Type == BaleInfo::WRREGION) { + Wr = Head; + if (HeadBI->Info.isOperandBaled( + GenXIntrinsic::GenXRegion::NewValueOperandNum)) { + Rd = dyn_cast( + Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + if (!GenXIntrinsic::isRdRegion(Rd)) + return nullptr; + } + } else { + if (HeadBI->Info.Type != BaleInfo::RDREGION) + return nullptr; + Rd = Head; + } + // Now Rd is the rdregion and Wr is the wrregion, and one of them might be 0. + if (Rd && !isa(Rd->getType())) + return nullptr; + if (Wr && !isa(Wr->getOperand(1)->getType())) + return nullptr; + assert(Rd || Wr); + Value *In = Rd ? Rd->getOperand(0) : Wr->getOperand(1); + Region WrR; + if (Wr) { + WrR = Region(Wr, BaleInfo()); + if (WrR.Stride != 1 || WrR.Indirect || WrR.Mask) + return nullptr; + } else + WrR = Region(Rd); // representing just the result of the rd, not the region + Region RdR; + if (Rd) { + RdR = Region(Rd, BaleInfo()); + if (RdR.Stride != 1 || RdR.Indirect) + return nullptr; + } else + RdR = Region(Wr->getOperand(0)); // representing just the value being + // written in to the region + unsigned InNumElements = In->getType()->getVectorNumElements(); + assert(Wr || Rd); + unsigned OutNumElements = (Wr ? Wr : Rd)->getType()->getVectorNumElements(); + unsigned Misalignment = InNumElements | OutNumElements | RdR.NumElements | + RdR.Width | RdR.VStride | RdR.Offset | + WrR.NumElements | WrR.Width | WrR.VStride | + WrR.Offset; + if (Misalignment & 1) + return nullptr; + unsigned LogAlignment = Misalignment & 2 ? 1 : 2; + auto InTy = + VectorType::get(Type::getIntNTy(Head->getContext(), 8 << LogAlignment), + InNumElements >> LogAlignment); + // Create the bitcast of the input if necessary. (We do that even if the input + // is constant, on the basis that EarlyCSE will simplify it.) + Value *BCIn = nullptr; + if (BitCastInst *InCast = dyn_cast(In)) { + if (InCast->getSrcTy() == InTy) + BCIn = InCast->getOperand(0); + } + if (BCIn == nullptr) { + BCIn = CastInst::Create(Instruction::BitCast, In, InTy, "bytemov", Head); + cast(BCIn)->setDebugLoc(Head->getDebugLoc()); + } + Value *Val = BCIn; + if (Rd) { + // Create the new rdregion. + RdR.NumElements >>= LogAlignment; + RdR.VStride >>= LogAlignment; + RdR.Width >>= LogAlignment; + auto NewRd = RdR.createRdRegion(Val, "", Head, Rd->getDebugLoc(), + /*AllowScalar=*/false); + NewRd->takeName(Rd); + Baling->setBaleInfo(NewRd, BaleInfo(BaleInfo::RDREGION)); + Val = NewRd; + } + if (Wr) { + // Create the bitcast of the old value of the vector. (Or just reuse + // the first bitcast if it is of the same value -- I saw this in + // Boxfilter.) + Value *BCOld = BCIn; + if (In != Wr->getOperand(0)) { + Value *OV = Wr->getOperand(0); + BCOld = nullptr; + auto ResTy = VectorType::get( + Type::getIntNTy(Head->getContext(), 8 << LogAlignment), + OutNumElements >> LogAlignment); + if (BitCastInst *OVCast = dyn_cast(OV)) { + if (OVCast->getSrcTy() == ResTy) + BCOld = OVCast->getOperand(0); + } + if (BCOld == nullptr) { + BCOld = + CastInst::Create(Instruction::BitCast, OV, ResTy, "bytemov", Head); + cast(BCOld)->setDebugLoc(Wr->getDebugLoc()); + } + } + // Create the new wrregion. 
+ WrR.NumElements >>= LogAlignment; + WrR.VStride >>= LogAlignment; + WrR.Width >>= LogAlignment; + auto NewWr = cast( + WrR.createWrRegion(BCOld, Val, "", Head, Wr->getDebugLoc())); + NewWr->takeName(Wr); + BaleInfo BI(BaleInfo::WRREGION); + if (Rd) + BI.setOperandBaled(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Baling->setBaleInfo(NewWr, BI); + Val = NewWr; + } + + bool NeedBC = true; + if (Head->hasOneUse()) { + auto U = Head->use_begin()->getUser(); + if (BitCastInst *UBC = dyn_cast(U)) { + if (UBC->getDestTy() == Val->getType()) { + UBC->replaceAllUsesWith(Val); + eraseInst(UBC); + NeedBC = false; + } + } + } + if (NeedBC) { + // Create the bitcast back to the original type. + auto BCOut = CastInst::Create(Instruction::BitCast, Val, Head->getType(), + "bytemov", Head); + BCOut->setDebugLoc(Head->getDebugLoc()); + // Replace and erase the original rdregion and wrregion. We do not need + // to do anything with their baling info as that is a ValueMap and they get + // removed automatically. + Head->replaceAllUsesWith(BCOut); + } + if (Wr) + eraseInst(Wr); + if (Rd) + eraseInst(Rd); + // Return the new wrregion if any, else the new rdregion. Do not return + // BCOut as it is not part of the bale for the move. + assert(dyn_cast(Val)); + return cast(Val); +} + +/*********************************************************************** + * splatPredicateIfNecessary : splat a wrregion/select predicate if necessary + * + * Enter: V = the predicate + * Width = width it needs to be splatted to + * InsertBefore = where to insert new instructions + * DL = debug loc for new instructions + * + * Return: the predicate, possibly a new instruction + * + * From GenXLegalization onwards, the predicate (mask) in a wrregion must + * either be scalar constant 1, or have the same vector width as the value + * being written by the wrregion. Similarly for the selector in a vector + * select, except that is not allowed to be scalar constant 1. + * + * It might make more sense to do this in GenXLowering, except that the + * predicate might be wider than 32 at that point. So we have to do it here. + */ +Value *GenXLegalization::splatPredicateIfNecessary(Value *V, + Type *ValueToWriteTy, + Instruction *InsertBefore, + const DebugLoc &DL) { + if (auto VT = dyn_cast(ValueToWriteTy)) + return splatPredicateIfNecessary(V, VT->getNumElements(), InsertBefore, DL); + return V; +} + +Value *GenXLegalization::splatPredicateIfNecessary(Value *V, unsigned Width, + Instruction *InsertBefore, + const DebugLoc &DL) { + if (Width == 1) + return V; + if (auto C = dyn_cast(V)) + if (C->isAllOnesValue()) + return V; + if (isa(V->getType())) + return V; + // Round Width up to 16 or 32. (No point in using up a 32 bit predicate + // register if we only need 16.) + unsigned RoundedWidth = Width > 16 ? 32 : 16; + // Use a select to turn the predicate into 0 or -1. + auto ITy = Type::getIntNTy(InsertBefore->getContext(), RoundedWidth); + auto Sel = SelectInst::Create( + V, Constant::getAllOnesValue(ITy), Constant::getNullValue(ITy), + InsertBefore->getName() + ".splatpredicate", InsertBefore); + Sel->setDebugLoc(DL); + // Bitcast that to v16i1 or v32i1 predicate (which becomes a setp + // instruction). + Instruction *Res = CastInst::Create( + Instruction::BitCast, Sel, + VectorType::get(Type::getInt1Ty(InsertBefore->getContext()), + RoundedWidth), + InsertBefore->getName() + ".splatpredicate", InsertBefore); + Res->setDebugLoc(DL); + // If the required size is smaller, do an rdpredregion. 
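+  // (Illustrative note: for Width == 8 the select and bitcast above produce
+  // a v16i1 value, and the rdpredregion below then extracts elements 0..7.)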
+ if (Width == RoundedWidth) + return Res; + return Region::createRdPredRegionOrConst( + Res, 0, Width, Res->getName() + ".rdpredregion", InsertBefore, DL); +} + +/*********************************************************************** + * eraseInst : erase instruction, updating CurrentInst if we're erasing that + */ +void GenXLegalization::eraseInst(Instruction *Inst) { + removingInst(Inst); + // If the result is a predicate, ensure it is removed from IllegalPredicates, + // just in case it is a wrpredregion that was in IllegalPredicates. + if (auto VT = dyn_cast(Inst->getType())) + if (VT->getElementType()->isIntegerTy(1)) + IllegalPredicates.remove(Inst); + Inst->eraseFromParent(); +} + +void GenXLegalization::removingInst(Instruction *Inst) { + if (Inst == CurrentInst) + CurrentInst = Inst->getNextNode(); +} + +/*********************************************************************** + * fixIllegalPredicates : fix illegally sized predicate values + */ +struct StackEntry { + Instruction *Wr; // the wrpredregion this stack entry is for + Instruction *Parent; // its parent wrpredregion in the tree + SmallVector Parts; + // Constructor given wrpredregion and parent. + StackEntry(Instruction *Wr, Instruction *Parent) : Wr(Wr), Parent(Parent) {} +}; + +void GenXLegalization::fixIllegalPredicates(Function *F) { + // First fix illegal size predicate phi nodes, replacing each with multiple + // phi nodes with rdpredregion on the incomings and wrpredregion on the + // result. These rdpredregions and wrpredregions then get removed with other + // illegal size predicates in the code below. + SmallVector PhisToErase; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + auto BB = &*fi; + Instruction *FirstNonPhi = BB->getFirstNonPHI(); + for (auto Phi = dyn_cast(BB->begin()); Phi; + Phi = dyn_cast(Phi->getNextNode())) { + if (!Phi->getType()->getScalarType()->isIntegerTy(1)) + continue; + // We have a predicate phi. Get the first part of it, which might show + // that we do not need to split it at all. + auto VT = dyn_cast(Phi->getType()); + if (!VT) + continue; + unsigned WholeSize = VT->getNumElements(); + auto PP = getPredPart(Phi, 0); + if (PP.Size == WholeSize) + continue; + // We do need to split. + Value *Joined = UndefValue::get(Phi->getType()); + unsigned NumIncoming = Phi->getNumIncomingValues(); + for (unsigned StartIdx = 0; StartIdx != WholeSize;) { + // Create a split phi node. + PP = getPredPart(Phi, StartIdx); + auto NewPhi = PHINode::Create( + VectorType::get(Phi->getType()->getScalarType(), PP.Size), + NumIncoming, Phi->getName() + ".split" + Twine(StartIdx), Phi); + // Do a rdpredregion for each incoming. + for (unsigned ii = 0; ii != NumIncoming; ++ii) { + BasicBlock *IncomingBlock = Phi->getIncomingBlock(ii); + Value *Incoming = Phi->getIncomingValue(ii); + auto NewRd = Region::createRdPredRegionOrConst( + Incoming, StartIdx, PP.Size, + Incoming->getName() + ".split" + Twine(StartIdx), + IncomingBlock->getTerminator(), DebugLoc()); + NewPhi->addIncoming(NewRd, IncomingBlock); + } + // Join with previous new phis for this original phi. + Joined = Region::createWrPredRegion(Joined, NewPhi, StartIdx, + Phi->getName() + ".join" + + Twine(StartIdx), + FirstNonPhi, DebugLoc()); + // If that was the first join, add it to the IllegalPredicates list for + // processing its tree of wrpredregions below. + if (!StartIdx) + IllegalPredicates.insert(cast(Joined)); + StartIdx += PP.Size; + } + // Replace the original phi and mark it for erasing. 
Also undef out its + // incomings so it doesn't matter what order we do the erases in. + auto Undef = UndefValue::get(Phi->getType()); + for (unsigned ii = 0; ii != NumIncoming; ++ii) + Phi->setIncomingValue(ii, Undef); + Phi->replaceAllUsesWith(Joined); + PhisToErase.push_back(Phi); + } + } + for (auto i = PhisToErase.begin(), e = PhisToErase.end(); i != e; ++i) + (*i)->eraseFromParent(); + // For each entry in IllegalPredicates that is the root of a tree of + // wrpredregions... + SmallVector ToErase; + for (auto ipi = IllegalPredicates.begin(), ipe = IllegalPredicates.end(); + ipi != ipe; ++ipi) { + std::vector Stack; + auto Root = *ipi; + if (GenXIntrinsic::getGenXIntrinsicID(Root->getOperand(0)) == + GenXIntrinsic::genx_wrpredregion) + continue; // not root of tree + assert(isa(Root->getOperand(0)) && + "expecting undef input to root of tree"); + // See if it really is illegally sized. + if (getPredPart(Root, 0).Size == Root->getType()->getVectorNumElements()) + continue; + // For traversing the tree, create a stack where each entry represents a + // value in the tree, and contains the values of the parts. Create an + // initial entry for the root of the tree. + Stack.push_back(StackEntry(Root, nullptr)); + // Process stack entries. + while (!Stack.empty()) { + auto Entry = &Stack.back(); + if (!Entry->Parts.empty()) { + // This stack entry has already been processed; we are on the way back + // down having processed its children. Just pop the stack entry, and + // mark the wrpredregion for erasing. We do not erase it now because it + // might be yet to visit in the IllegalPredicates vector. + ToErase.push_back(Entry->Wr); + Stack.pop_back(); + continue; + } + // Populate Parts with the value of each part from the parent. + if (!Entry->Parent) { + // No parent. All parts are undef. + auto Ty = Entry->Wr->getType(); + unsigned WholeSize = Ty->getVectorNumElements(); + for (unsigned Offset = 0; Offset != WholeSize;) { + auto PP = getPredPart(Entry->Wr, Offset); + Entry->Parts.push_back( + UndefValue::get(VectorType::get(Ty->getScalarType(), PP.Size))); + Offset += PP.Size; + } + } else { + // Inherit from parent. + for (auto i = (Entry - 1)->Parts.begin(), e = (Entry - 1)->Parts.end(); + i != e; ++i) + Entry->Parts.push_back(*i); + } + // For this wrpredregion, determine the part that it writes to, and see + // if it is the whole part. (It cannot overlap more than one part, + // because getLegalPredSize ensured that all splits were within parts.) + unsigned WrOffset = + cast(Entry->Wr->getOperand(2))->getZExtValue(); + unsigned WrSize = + Entry->Wr->getOperand(1)->getType()->getVectorNumElements(); + auto PP = getPredPart(Entry->Wr, WrOffset); + assert(WrOffset + WrSize <= PP.Offset + PP.Size && + "overlaps multiple parts"); + Value *Part = Entry->Parts[PP.PartNum]; + if (WrSize != PP.Size) { + // Not the whole part. We need to write into the previous value of this + // part. + auto NewWr = Region::createWrPredRegion( + Part, Entry->Wr->getOperand(1), WrOffset - PP.Offset, "", Entry->Wr, + Entry->Wr->getDebugLoc()); + NewWr->takeName(Entry->Wr); + Part = NewWr; + } else + Part = Entry->Wr->getOperand(1); + // Store the new value of this part. + Entry->Parts[PP.PartNum] = Part; + // Gather uses in rdpredregion. 
+ SmallVector Rds; + for (auto ui = Entry->Wr->use_begin(), ue = Entry->Wr->use_end(); + ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (GenXIntrinsic::getGenXIntrinsicID(User) == + GenXIntrinsic::genx_rdpredregion) + Rds.push_back(User); + } + // For each rdpredregion, turn it into a read from the appropriate + // part. + for (auto ri = Rds.begin(), re = Rds.end(); ri != re; ++ri) { + Instruction *Rd = *ri; + unsigned RdOffset = + cast(Rd->getOperand(1))->getZExtValue(); + unsigned RdSize = Rd->getType()->getVectorNumElements(); + auto PP = getPredPart(Entry->Wr, RdOffset); + assert(RdOffset + RdSize <= PP.Offset + PP.Size && + "overlaps multiple parts"); + Value *Part = Entry->Parts[PP.PartNum]; + if (RdSize != PP.Size) { + // Only reading a subregion of a part. + // Assert that the rdpredregion is legal. In fact we will probably + // have to cope with an illegal one, by generating code to bitcast + // the predicate to a scalar int (or finding code where it is already + // bitcast from a scalar int), using bit twiddling to get the + // required subregion, and bitcasting back. I think this situation + // will arise where the input to legalization had an odd size + // rdpredregion in a wrregion where the input predicate is a v32i1 + // from an odd size CM select using an i32 as the mask. +#if _DEBUG + if (RdOffset) { + unsigned RdMisalignment = 1U << findFirstSet(RdOffset); + assert((RdMisalignment >= 8 || + (RdMisalignment == 4 && Rd->hasOneUse() && + cast(Rd->use_begin()->getUser()) + ->getOperand(1) + ->getType() + ->getScalarType() + ->getPrimitiveSizeInBits() == 64)) && + !((RdOffset - PP.Offset) % RdSize) && + "illegal rdpredregion"); + } +#endif + // Create a new rdpredregion. + auto NewRd = Region::createRdPredRegion( + Part, RdOffset - PP.Offset, RdSize, "", Rd, Rd->getDebugLoc()); + NewRd->takeName(Rd); + Part = NewRd; + } + // Replace the original rdpredregion with the value of the part. + Rd->replaceAllUsesWith(Part); + Rd->eraseFromParent(); + } + // All remaining uses must be wrpredregion. Push them onto the stack. + for (auto ui = Entry->Wr->use_begin(), ue = Entry->Wr->use_end(); + ui != ue; ++ui) { + auto User = cast(ui->getUser()); + assert(GenXIntrinsic::getGenXIntrinsicID(User) == + GenXIntrinsic::genx_wrpredregion && + !ui->getOperandNo() && "expecting only wrpredregion uses"); + Stack.push_back(StackEntry(User, Entry->Wr)); + } + } + } + // Erase the old wrpredregions. + for (auto i = ToErase.begin(), e = ToErase.end(); i != e; ++i) + (*i)->eraseFromParent(); +} + +GenXLegalization::SplitKind GenXLegalization::checkBaleSplittingKind() { + if (B.endsWithGStore()) + return SplitKind::SplitKind_GStore; + + auto Head = B.getHeadIgnoreGStore(); + SplitKind Kind = SplitKind::SplitKind_Normal; + + if (Head->Info.Type == BaleInfo::WRREGION) { + Value *WrRegionInput = Head->Inst->getOperand(0); + Region R1(Head->Inst, Head->Info); + for (auto &I : B) { + if (I.Info.Type != BaleInfo::RDREGION) + continue; + if (I.Inst->getOperand(0) != WrRegionInput) + continue; + Region R2(I.Inst, I.Info); + if (R1 != R2) { + // Check if R1 overlaps with R2. Create a new region for R1 as we are + // rewriting region offsets if their difference is a constant. + Region R(Head->Inst, Head->Info); + + // Analyze dynamic offset difference, but only for a scalar offset. 
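+        // (Illustrative example: if the wrregion is indexed by %addr and the
+        // rdregion by %addr + 16, the SCEV difference below folds to the
+        // constant 16, so both regions can be compared with constant
+        // offsets.)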
+        if (R1.Indirect && R2.Indirect) {
+          if (R1.Indirect->getType()->isVectorTy() ||
+              R2.Indirect->getType()->isVectorTy())
+            return SplitKind::SplitKind_Normal;
+
+          // Strip truncation from bitcast followed by a region read.
+          auto stripConv = [](Value *Val) {
+            if (GenXIntrinsic::isRdRegion(Val)) {
+              CallInst *CI = cast<CallInst>(Val);
+              Region R(CI, BaleInfo());
+              if (R.Offset == 0 && R.Width == 1)
+                Val = CI->getOperand(0);
+              if (auto BI = dyn_cast<BitCastInst>(Val))
+                Val = BI->getOperand(0);
+            }
+            return Val;
+          };
+
+          Value *Offset1 = stripConv(R.Indirect);
+          Value *Offset2 = stripConv(R2.Indirect);
+          if (Offset1->getType() == Offset2->getType()) {
+            auto S1 = SE->getSCEV(Offset1);
+            auto S2 = SE->getSCEV(Offset2);
+            auto Diff = SE->getMinusSCEV(S1, S2);
+            assert(R.Indirect);
+            Diff = SE->getTruncateOrNoop(Diff, R.Indirect->getType());
+            if (auto SCC = dyn_cast<SCEVConstant>(Diff)) {
+              ConstantInt *CI = SCC->getValue();
+              int OffsetDiff = std::abs(static_cast<int>(CI->getSExtValue()));
+              R.Offset = 0;
+              R.Indirect = nullptr;
+              R2.Offset = OffsetDiff;
+              R2.Indirect = nullptr;
+            }
+          }
+        }
+
+        // Ignore the mask and adjust both offsets by a common dynamic
+        // value if one exists. If the resulting regions do not overlap, then
+        // the two original regions do not overlap.
+        R.Mask = nullptr;
+        R2.Mask = nullptr;
+
+        // As both R and R2 have constant offsets, the overlap function
+        // should check their footprints accurately.
+        if (R.overlap(R2))
+          return SplitKind::SplitKind_Normal;
+        Kind = SplitKind::SplitKind_Propagation;
+        continue;
+      }
+
+      // (1) 1D direct regions or indirect regions with single offset.
+      // (2) 2D direct regions with VStride >= Width, or indirect regions with
+      //     single offset.
+      bool IsMultiAddr = R1.Indirect && R1.Indirect->getType()->isVectorTy();
+      if (!R1.is2D()) {
+        if (IsMultiAddr)
+          return SplitKind::SplitKind_Normal;
+        Kind = SplitKind::SplitKind_Propagation;
+      } else {
+        if (R1.VStride < (int)R1.Width || IsMultiAddr)
+          return SplitKind::SplitKind_Normal;
+        Kind = SplitKind::SplitKind_Propagation;
+      }
+    }
+  }
+
+  return Kind;
+}
+
+// This function deals with intrinsic calls with special restrictions.
+// - Certain intrinsic calls should be placed in the entry block:
+//     llvm.genx.predefined.surface
+//
+void GenXLegalization::fixIntrinsicCalls(Function *F) {
+  auto PF = F->getParent()->getFunction("llvm.genx.predefined.surface");
+  if (!PF)
+    return;
+
+  // Collect all calls to PF in this function.
+  std::map<int64_t, std::vector<Instruction *>> Calls;
+  for (auto U : PF->users()) {
+    if (auto UI = dyn_cast<Instruction>(U)) {
+      BasicBlock *BB = UI->getParent();
+      if (BB->getParent() != F)
+        continue;
+      if (auto CI = dyn_cast<ConstantInt>(UI->getOperand(0))) {
+        int64_t Arg = CI->getSExtValue();
+        Calls[Arg].push_back(UI);
+      }
+    }
+  }
+
+  BasicBlock *EntryBB = &F->getEntryBlock();
+  Instruction *InsertPos = &*EntryBB->getFirstInsertionPt();
+
+  for (auto I = Calls.begin(), E = Calls.end(); I != E; ++I) {
+    Instruction *EntryDef = nullptr;
+    for (auto Inst : I->second) {
+      if (Inst->getParent() == EntryBB) {
+        EntryDef = Inst;
+        break;
+      }
+    }
+
+    // No entry definition found, so clone one.
+    if (EntryDef == nullptr) {
+      EntryDef = I->second.front()->clone();
+      EntryDef->insertBefore(InsertPos);
+    } else
+      EntryDef->moveBefore(InsertPos);
+
+    // Now replace all uses with this new definition.
+ for (auto Inst : I->second) { + std::vector WorkList{Inst}; + while (!WorkList.empty()) { + Instruction *CurI = WorkList.back(); + WorkList.pop_back(); + + for (auto UI = CurI->use_begin(); UI != CurI->use_end();) { + Use &U = *UI++; + // Skip if this use just comes from EntryDef. + if (EntryDef == U.get()) + continue; + // All uses of this PHI will be replaced as well. + if (auto PHI = dyn_cast(U.getUser())) + WorkList.push_back(PHI); + U.set(EntryDef); + } + if (CurI->use_empty()) + CurI->eraseFromParent(); + } + } + } +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveRanges.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveRanges.cpp new file mode 100644 index 000000000000..0c772f3ba69e --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveRanges.cpp @@ -0,0 +1,215 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXLiveRanges +/// -------------- +/// +/// GenXLiveRanges calculates the actual live range information (the segments) +/// on the LiveRange object for each value. See the comment at the top of +/// GenXLiveness.h for details of how the live range information works. This +/// pass calls GenXLiveness::buildLiveRange to do the work for each value. +/// +/// The LiveRange object for each value already existed before this pass, as it +/// was created by GenXCategory. In the case of a value that we can now see does +/// not want a LiveRange, because it is an Instruction baled in to something, +/// we erase the LiveRange here. 
+///
+//===----------------------------------------------------------------------===//
+#define DEBUG_TYPE "GENX_LIVERANGES"
+
+#include "FunctionGroup.h"
+#include "GenX.h"
+#include "GenXBaling.h"
+#include "GenXIntrinsics.h"
+#include "GenXLiveness.h"
+#include "GenXNumbering.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace genx;
+
+namespace {
+
+class GenXLiveRanges : public FunctionGroupPass {
+  FunctionGroup *FG;
+  GenXBaling *Baling;
+  GenXLiveness *Liveness;
+public:
+  static char ID;
+  explicit GenXLiveRanges() : FunctionGroupPass(ID) { }
+  virtual StringRef getPassName() const { return "GenX live ranges analysis"; }
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+  bool runOnFunctionGroup(FunctionGroup &FG);
+  // createPrinterPass : get a pass to print the IR, together with the GenX
+  // specific analyses
+  virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const
+  { return createGenXGroupPrinterPass(O, Banner); }
+
+private:
+  void buildLiveRanges();
+
+  bool isPredefinedVariable(Value *) const;
+};
+
+} // end anonymous namespace
+
+namespace llvm { void initializeGenXLiveRangesPass(PassRegistry &); }
+char GenXLiveRanges::ID = 0;
+INITIALIZE_PASS_BEGIN(GenXLiveRanges, "GenXLiveRanges", "GenXLiveRanges", false, false)
+INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling)
+INITIALIZE_PASS_DEPENDENCY(GenXLiveness)
+INITIALIZE_PASS_DEPENDENCY(GenXNumbering)
+INITIALIZE_PASS_DEPENDENCY(FunctionGroupAnalysis)
+INITIALIZE_PASS_END(GenXLiveRanges, "GenXLiveRanges", "GenXLiveRanges", false, false)
+
+FunctionGroupPass *llvm::createGenXLiveRangesPass()
+{
+  initializeGenXLiveRangesPass(*PassRegistry::getPassRegistry());
+  return new GenXLiveRanges();
+}
+
+void GenXLiveRanges::getAnalysisUsage(AnalysisUsage &AU) const
+{
+  FunctionGroupPass::getAnalysisUsage(AU);
+  AU.addRequired<GenXGroupBaling>();
+  AU.addRequired<GenXLiveness>();
+  AU.addRequired<GenXNumbering>();
+  AU.addRequired<FunctionGroupAnalysis>();
+  AU.setPreservesAll();
+}
+
+/***********************************************************************
+ * runOnFunctionGroup : run the liveness analysis for this FunctionGroup
+ */
+bool GenXLiveRanges::runOnFunctionGroup(FunctionGroup &ArgFG)
+{
+  FG = &ArgFG;
+  Baling = &getAnalysis<GenXGroupBaling>();
+  Liveness = &getAnalysis<GenXLiveness>();
+  Liveness->setBaling(Baling);
+  Liveness->setNumbering(&getAnalysis<GenXNumbering>());
+  // Build the live ranges.
+  Liveness->buildSubroutineLRs();
+  buildLiveRanges();
+#ifndef NDEBUG
+  // Check we don't have any leftover empty live ranges. If we do, it means
+  // that a pass between GenXCategory and here has erased a value and failed
+  // to erase its LiveRange, or alternatively this pass has failed to erase
+  // the LiveRange for a value that does not need it because it is a baled
+  // in instruction.
+  for (GenXLiveness::iterator i = Liveness->begin(), e = Liveness->end(); i != e; ++i) {
+    LiveRange *LR = i->second;
+    assert(LR->size()); // Check the LR has at least one segment.
+  }
+#endif // ndef NDEBUG
+  return false;
+}
+
+/***********************************************************************
+ * isPredefinedVariable : check if it's translated into predefined
+ * variables in vISA.
+ */ +bool GenXLiveRanges::isPredefinedVariable(Value *V) const { + switch (GenXIntrinsic::getGenXIntrinsicID(V)) { + case GenXIntrinsic::genx_predefined_surface: + return true; + default: + break; + } + return false; +} + +/*********************************************************************** + * buildLiveRanges : build live ranges for all args and instructions + */ +void GenXLiveRanges::buildLiveRanges() +{ + // Build live ranges for global variables; + for (auto &G : FG->getModule()->globals()) + Liveness->buildLiveRange(&G); + for (auto i = FG->begin(), e = FG->end(); i != e; ++i) { + Function *Func = *i; + // Build live ranges for args. + for (auto fi = Func->arg_begin(), fe = Func->arg_end(); fi != fe; ++fi) + Liveness->buildLiveRange(&*fi); + if (i != FG->begin() && !Func->getReturnType()->isVoidTy()) { + // Build live range(s) for unified return value. + Liveness->buildLiveRange(Liveness->getUnifiedRet(Func)); + } + // Build live ranges for code. + for (Function::iterator fi = Func->begin(), fe = Func->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (BasicBlock::iterator bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + // Skip building live range for instructions + // - has no destination + // - is already baled, or + // - is predefined variable in vISA. + if (!Inst->getType()->isVoidTy() && !Baling->isBaled(Inst) && + !isPredefinedVariable(Inst)) { + // Instruction is not baled in to anything. + // First check if the result is unused and it is an intrinsic whose + // result is marked RAW_NULLALLOWED. If so, don't create a live range, + // so no register gets allocated. + if (Inst->use_empty()) { + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(Inst); + switch (IID) { + case GenXIntrinsic::not_any_intrinsic: + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrconstregion: + break; + default: { + GenXIntrinsicInfo::ArgInfo AI + = GenXIntrinsicInfo(IID).getRetInfo(); + if (AI.isRaw() && AI.rawNullAllowed()) { + // Unused RAW_NULLALLOWED result. + Liveness->eraseLiveRange(Inst); + continue; + } + break; + } + } + } + // Build its live range. + Liveness->buildLiveRange(Inst); + } else { + // Instruction is baled in to something. Erase its live range so the + // register allocator does not try and allocate it something. + Liveness->eraseLiveRange(Inst); + } + } + } + } +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.cpp new file mode 100644 index 000000000000..ea4d871f2038 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.cpp @@ -0,0 +1,1872 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// GenXLiveness is an analysis that contains the liveness information for the +// values in the code. See the comment at the top of GenXLiveness.h for further +// details. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_LIVENESS" + +#include "GenXLiveness.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXIntrinsics.h" +#include "GenXNumbering.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/Utils/RegCategory.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" +#include "llvmWrapper/IR/InstrTypes.h" + +#include + +using namespace llvm; +using namespace genx; + +char GenXLiveness::ID = 0; +INITIALIZE_PASS_BEGIN(GenXLiveness, "GenXLiveness", "GenXLiveness", false, false) +INITIALIZE_PASS_END(GenXLiveness, "GenXLiveness", "GenXLiveness", false, false) + +FunctionGroupPass *llvm::createGenXLivenessPass() +{ + initializeGenXLivenessPass(*PassRegistry::getPassRegistry()); + return new GenXLiveness(); +} + +void GenXLiveness::getAnalysisUsage(AnalysisUsage &AU) const +{ + FunctionGroupPass::getAnalysisUsage(AU); + AU.setPreservesAll(); +} + +/*********************************************************************** + * runOnFunctionGroup : do nothing + */ +bool GenXLiveness::runOnFunctionGroup(FunctionGroup &ArgFG) +{ + clear(); + FG = &ArgFG; + return false; +} + +/*********************************************************************** + * clear : clear the GenXLiveness + */ +void GenXLiveness::clear() +{ + while (!LiveRangeMap.empty()) { + LiveRange *LR = LiveRangeMap.begin()->second; + for (auto i = LR->value_begin(), e = LR->value_end(); i != e; ++i) { + SimpleValue V = *i; + LiveRangeMap.erase(V); + } + delete LR; + } + FG = 0; + delete CG; + CG = 0; + for (auto i = UnifiedRets.begin(), e = UnifiedRets.end(); i != e; ++i) + i->second->deleteValue(); + UnifiedRets.clear(); + UnifiedRetToFunc.clear(); + ArgAddressBaseMap.clear(); +} + +/*********************************************************************** + * setLiveRange : add a SimpleValue to a LiveRange + * + * This: + * 1. adds the SimpleValue to the LiveRange's value list; + * 2. sets the SimpleValue's entry in the map to point to the LiveRange. 
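+ * 3. sets the live range's alignment from the value's type
+ *    (via setAlignmentFromValue).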
+ */ +void GenXLiveness::setLiveRange(SimpleValue V, LiveRange *LR) +{ + assert(LiveRangeMap.find(V) == LiveRangeMap.end() && "Attempting to set LiveRange for Value that already has one"); + LR->addValue(V); + LiveRangeMap[V] = LR; + LR->setAlignmentFromValue(V); +} + +/*********************************************************************** + * setAlignmentFromValue : set a live range's alignment from a value + */ +void LiveRange::setAlignmentFromValue(SimpleValue V) +{ + Type *Ty = IndexFlattener::getElementType( + V.getValue()->getType(), V.getIndex()); + if (Ty->isPointerTy()) + Ty = Ty->getPointerElementType(); + unsigned SizeInBits = Ty->getScalarType()->getPrimitiveSizeInBits(); + if (auto VT = dyn_cast(Ty)) + SizeInBits *= VT->getNumElements(); + unsigned LogAlign = Log2_32(SizeInBits) - 3; + // Set max alignment to GRF + if (LogAlign > 5) + LogAlign = 5; + setLogAlignment(LogAlign); +} + +/*********************************************************************** + * rebuildCallGraph : rebuild GenXLiveness's call graph + */ +void GenXLiveness::rebuildCallGraph() +{ + delete CG; + CG = new CallGraph(FG); + CG->build(this); +} + +/*********************************************************************** + * buildSubroutineLRs : build the subroutine LRs + * + * If the FunctionGroup has subroutines, then each one (each Function other + * than the head one) gets a "subroutine LR", giving the live range + * of the whole subroutine plus any other subroutines it can call. + * Then, when building a real live range later, if it goes over a call, + * we can add the subroutine LR. + * + * The subroutine LR has weak liveness, as that's what we want to add to + * anything live over a call to the subroutine. + */ +void GenXLiveness::buildSubroutineLRs() +{ + if (FG->size() == 1) + return; // no subroutines + // Build a call graph for the FunctionGroup. It is acyclic because there is + // no recursion. + rebuildCallGraph(); + // Depth-first walk the graph to propagate live ranges upwards. + visitPropagateSLRs(FG->getHead()); +} + +/*********************************************************************** + * visitPropagateSLRs : visit a callgraph node to propagate subroutine LR + * + * This is recursive. + */ +LiveRange *GenXLiveness::visitPropagateSLRs(Function *F) +{ + LiveRange *LR = getOrCreateLiveRange(F); + // Add a segment for just this function. + LR->push_back(Segment(Numbering->getNumber(F), + Numbering->getNumber(F->back().getTerminator()) + 1, Segment::WEAK)); + // For each child... + CallGraph::Node *N = CG->getNode(F); + for (auto i = N->begin(), e = N->end(); i != e; ++i) { + // Visit the child to calculate its LR. + LiveRange *ChildLR = visitPropagateSLRs(i->Call->getCalledFunction()); + // Merge it into ours. + LR->addSegments(ChildLR); + } + LR->sortAndMerge(); + return LR; +} + +/*********************************************************************** + * buildLiveRange : build live range for one value (arg or non-baled inst) + * + * For a struct value, each element's live range is built separately, even + * though they are almost identical. They are not exactly identical, + * differing at the def if it is the return value of a call, and at a use + * that is a call arg. 
+ */ +void GenXLiveness::buildLiveRange(Value *V) +{ + auto ST = dyn_cast(V->getType()); + if (!ST) { + buildLiveRange(SimpleValue(V)); + return; + } + for (unsigned i = 0, e = IndexFlattener::getNumElements(ST); i != e; ++i) + buildLiveRange(SimpleValue(V, i)); +} + +/*********************************************************************** + * buildLiveRange : build live range for one SimpleValue + * + * rebuildLiveRange : rebuild live range for a LiveRange struct + * + * The BBs[] array, one entry per basic block, is temporarily used here to + * store the live range for the value within that block. We start by + * registering the short live range for the definition, then, for each use, + * create a live range in the use's block then recursively scan back + * through predecessors until we meet a block where there is already a + * live range. This is guaranteed to terminate because of the dominance + * property of SSA. + * + * See Appel "Modern Compiler Implementation in C" 19.6. + * + * rebuildLiveRange can be called from later passes to rebuild the segments + * for a particular live range. If used after coalescing, the live range might + * have more than one value, in which case segments are added for each value + * and then merged. Thus we assume that, after whatever code change a pass made + * to require rebuilding the live range, the coalesced values can still be + * validly coalesced, without having any way of checking that. + * + */ +LiveRange *GenXLiveness::buildLiveRange(SimpleValue V) +{ + LiveRange *LR = getOrCreateLiveRange(V); + rebuildLiveRange(LR); + return LR; +} + +void GenXLiveness::rebuildLiveRange(LiveRange *LR) +{ + LR->getOrDefaultCategory(); + LR->Segments.clear(); + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) + rebuildLiveRangeForValue(LR, *vi); + LR->sortAndMerge(); +} + +void GenXLiveness::rebuildLiveRangeForValue(LiveRange *LR, SimpleValue SV) +{ + Value *V = SV.getValue(); + + // This value is a global variable. Its live range is the entire kernel. + if (auto GV = getUnderlyingGlobalVariable(V)) { + (void)GV; + LR->push_back(0, Numbering->getLastNumber()); + return; + } + + std::map BBRanges; + if (auto Func = isUnifiedRet(V)) { + // This value is the unified return value of the function Func. Its live + // range is from the call to where its post-copy would go just afterwards + // for each call site, also from the site of the pre-copy to the return + // instruction. + for (auto ui = Func->use_begin(), ue = Func->use_end(); ui != ue; ++ui) { + if (auto CI = dyn_cast(ui->getUser())) + LR->push_back(Numbering->getNumber(CI), + Numbering->getRetPostCopyNumber(CI, SV.getIndex())); + } + for (auto fi = Func->begin(), fe = Func->end(); fi != fe; ++fi) + if (auto RI = dyn_cast(fi->getTerminator())) + LR->push_back(Numbering->getRetPreCopyNumber(RI, SV.getIndex()), + Numbering->getNumber(RI)); + return; + } + + // Mark the value as live and then almost immediately dead again at the + // point where it is defined. + unsigned StartNum = 0, EndNum = 0; + Function *Func = 0; + auto Arg = dyn_cast(V); + BasicBlock *BB = nullptr; + if (Arg) { + Func = Arg->getParent(); + StartNum = Numbering->getNumber(Func); + EndNum = StartNum + 1; + BB = &Func->front(); + } else if (auto Phi = dyn_cast(V)) { + // Phi node. Treat as defined at the start of the block. 
+ EndNum = Numbering->getNumber(Phi) + 1; + BB = Phi->getParent(); + StartNum = Numbering->getNumber(BB); + // For a phi node, we also need to register an extra little live range at + // the end of each predecessor, from where we will insert a copy to the + // end. This is done lower down in this function. + } else { + StartNum = Numbering->getNumber(V); + auto Inst = cast(V); + BB = Inst->getParent(); + auto CI = dyn_cast(V); + if (CI) { + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(V)) { + // For the return value from a call, move the definition point to the ret + // post-copy slot after the call, where the post-copy will be inserted if + // it fails to be coalesced with the function's unified return value. + StartNum = Numbering->getRetPostCopyNumber(CI, SV.getIndex()); + } + } + EndNum = StartNum + 1; + if (CI && getTwoAddressOperandNum(CI) >= 0) { + // Two address op. Move the definition point one earlier, to where + // GenXCoalescing will need to insert a copy if coalescing fails. + --StartNum; + } + } + BBRanges[BB] = Segment(StartNum, EndNum); + // The stack for predecessors that need to be processed: + std::vector Stack; + // Process each use. + for (Value::use_iterator i = V->use_begin(), e = V->use_end(); + i != e; ++i) { + BasicBlock *BB = nullptr; + Instruction *user = cast(i->getUser()); + unsigned Num; + if (PHINode *Phi = dyn_cast(user)) { + // Use in a phi node. We say that the use is where the phi copy will be + // placed in the predecessor block. + BB = Phi->getIncomingBlock(*i); + Num = Numbering->getPhiNumber(Phi, BB); + } else { + // Normal use. + // For live range purposes, an instruction is considered to be at the + // same place as the head of its bale. We need to use getBaleHead to + // ensure that we consider it to be there. + Instruction *UserHead = Baling->getBaleHead(user); + BB = UserHead->getParent(); + Num = Numbering->getNumber(UserHead); + if (auto CI = dyn_cast(user)) { + if (CI->isInlineAsm() || CI->isIndirectCall()) + Num = Numbering->getNumber(UserHead); + else { + switch (GenXIntrinsic::getAnyIntrinsicID(CI)) { + case GenXIntrinsic::not_any_intrinsic: + // Use as a call arg. We say that the use is at the arg pre-copy + // slot, where the arg copy will be inserted in coalescing. This + // assumes that the copies will be in the same order as args in the + // call, with struct elements in order too. + Num = Numbering->getArgPreCopyNumber(CI, i->getOperandNo(), + SV.getIndex()); + break; + default: + if (getTwoAddressOperandNum(CI) == (int)i->getOperandNo()) { + // The use is the two address operand in a two address op. Move + // the use point one earlier, to where GenXCoalescing will need + // to insert a copy if coalescing fails. If there is any other + // use of this value in the same bale, that will not have its use + // point one number earlier. The unnecessary interference that + // would cause is fixed in the way that twoAddrInterfere() + // detects interference. + --Num; + } + break; + case GenXIntrinsic::genx_simdcf_goto: + // Use in a goto. Treat it as at the branch, as GenXVisaFuncWriter + // writes the goto just before the branch, after any intervening IR. + Num = Numbering->getNumber(CI->getParent()->getTerminator()); + break; + } + } + } else if (auto RI = dyn_cast(user)) { + // Use in a return. We say that the use is where the ret value + // pre-copy will be inserted in coalescing. This assumes that the + // copies will be in the same order as the struct elements in the + // return value. 
+ Num = Numbering->getRetPreCopyNumber(RI, SV.getIndex()); + } + } + auto BBRange = &BBRanges[BB]; + if (BBRange->getEnd()) { + // There is already a live range in this block. Extend it if + // necessary. No need to scan back from here, so we're done with + // this use. + if (BBRange->getEnd() < Num) + BBRange->setEnd(Num); + continue; + } + // Add a new live range from the start of this block, and remember the + // range of blocks that contain a live range (so we don't have to scan + // all of them at the end). + *BBRange = Segment(Numbering->getNumber(BB), Num); + // Push this block's predecessors onto the stack. + // (A basic block's predecessors are those blocks containing a + // TerminatorInst that uses the basic block.) + for (Value::use_iterator i = BB->use_begin(), e = BB->use_end(); + i != e; ++i) { + Instruction *TI = dyn_cast(i->getUser()); + assert(TI); + if (TI->isTerminator()) + Stack.push_back(TI->getParent()); + } + // Process stack until empty. + while (Stack.size()) { + BB = Stack.back(); + Stack.pop_back(); + BBRange = &BBRanges[BB]; + auto BBNum = Numbering->getBBNumber(BB); + if (BBRange->getEnd()) { + // There is already a live range in this block. Extend it to the end. + // No need to scan back from here. + BBRange->setEnd(BBNum->EndNumber); + continue; + } + // Add a new live range through the whole of this block, and remember the + // range of blocks that contain a live range (so we don't have to scan + // all of them at the end). + BBRange->setStartEnd(Numbering->getNumber(BB), BBNum->EndNumber); + // Push this block's predecessors onto the stack. + // (A basic block's predecessors are those blocks containing a + // TerminatorInst that uses the basic block.) + for (Value::use_iterator i = BB->use_begin(), e = BB->use_end(); + i != e; ++i) { + Instruction *TI = dyn_cast(i->getUser()); + assert(TI); + if (TI->isTerminator()) + Stack.push_back(TI->getParent()); + } + } + } + // Now we can build the live range. + for (auto bri = BBRanges.begin(), bre = BBRanges.end(); bri != bre; ++bri) { + auto BBRange = &bri->second; + LR->push_back(*BBRange); + } + if (PHINode *Phi = dyn_cast(V)) { + // For a phi node, we also need to register an extra little live range at + // the end of each predecessor, from where we will insert a copy to the + // end. + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + auto Pred = Phi->getIncomingBlock(i); + auto BBNum = Numbering->getBBNumber(Pred); + LR->push_back(Segment(Numbering->getPhiNumber(Phi, Pred), + BBNum->EndNumber, Segment::PHICPY)); + } + } + LR->sortAndMerge(); + if (CG) { + // Check if the live range crosses any call instruction. If so, add the + // appropriate subroutine live range. + bool NeedSort = false; + auto N = CG->getNode(Func); + for (auto i = N->begin(), e = N->end(); i != e; ++i) { + auto E = &*i; + // See if this call is in a segment of the LR. + auto Seg = LR->find(E->Number); + if (Seg != LR->end() && Seg->getStart() <= E->Number && Seg->getEnd() > E->Number) { + // Yes it is. Merge the subroutine LR of the callee into our LR. + if (!E->Call->getCalledFunction()->hasFnAttribute("CMStackCall")) + LR->addSegments(getLiveRange(E->Call->getCalledFunction())); + NeedSort = true; + } + } + if (NeedSort) + LR->sortAndMerge(); + } + if (Arg) { + // For a function arg, for each call site, add a segment from the arg + // pre-copy site, the point just before the call at which it will be copied + // into, up to the call. 
We assume that any copies before the call + // inserted by coalescing will be in the obvious order of args and elements + // within args. + Function *F = Arg->getParent(); + if (*FG->begin() != F) { // is a subroutine + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) { + if (auto CI = dyn_cast(ui->getUser())) { + LR->push_back( + Numbering->getArgPreCopyNumber(CI, Arg->getArgNo(), SV.getIndex()), + Numbering->getNumber(CI)); + } + } + } + } +} + +void GenXLiveness::removeBale(Bale &B) { + for (auto bi = B.begin(), be = B.end(); bi != be; ++bi) + removeValue(bi->Inst); +} + +/*********************************************************************** + * removeValue : remove the supplied value from its live range, and delete + * the range if it now has no values + * + * removeValueNoDelete : same, but do not delete the LR if it is now + * valueless + * + * Calling this with a value that does not have a live range is silently + * ignored. + */ +void GenXLiveness::removeValue(Value *V) +{ + for (unsigned i = 0, e = IndexFlattener::getNumElements(V->getType()); i != e; ++i) + removeValue(SimpleValue(V, i)); +} + +void GenXLiveness::removeValue(SimpleValue V) +{ + LiveRange *LR = removeValueNoDelete(V); + if (LR && !LR->Values.size()) { + // V was the only value in LR. Remove LR completely. + delete LR; + } +} + +LiveRange *GenXLiveness::removeValueNoDelete(SimpleValue V) +{ + LiveRangeMap_t::iterator i = LiveRangeMap.find(V); + if (i == LiveRangeMap.end()) + return nullptr; + LiveRange *LR = i->second; + LiveRangeMap.erase(i); + // Remove V from LR. + unsigned j; + for (j = 0; LR->Values[j].get() != V; ++j) { + assert(j != LR->Values.size()); + } + if (&LR->Values[j] != &LR->Values.back()) + LR->Values[j] = LR->Values.back(); + LR->Values.pop_back(); + return LR; +} + +/*********************************************************************** + * removeValuesNoDelete : remove all values from the live range, but do not + * delete the LR + */ +void GenXLiveness::removeValuesNoDelete(LiveRange *LR) +{ + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) + LiveRangeMap.erase(*vi); + LR->value_clear(); +} + +/*********************************************************************** + * replaceValue : update liveness such that NewVal has OldVal's live range, + * and OldVal does not have one at all. + */ +void GenXLiveness::replaceValue(Value *OldVal, Value *NewVal) +{ + for (unsigned i = 0, e = IndexFlattener::getNumElements(OldVal->getType()); + i != e; ++i) + replaceValue(SimpleValue(OldVal, i), SimpleValue(NewVal, i)); +} + +void GenXLiveness::replaceValue(SimpleValue OldVal, SimpleValue NewVal) +{ + LiveRangeMap_t::iterator i = LiveRangeMap.find(OldVal); + assert(i != LiveRangeMap.end()); + LiveRange *LR = i->second; + LiveRangeMap.erase(i); + LiveRangeMap[NewVal] = LR; + unsigned j = 0; + assert(!LR->Values.empty()); + for (j = 0; LR->Values[j].get() != OldVal; ++j) + assert(j != LR->Values.size()); + LR->Values[j] = NewVal; +} + +/*********************************************************************** + * getOrCreateLiveRange : get live range for a value, creating if necessary + */ +LiveRange *GenXLiveness::getOrCreateLiveRange(SimpleValue V) +{ + LiveRangeMap_t::iterator i = LiveRangeMap.insert( + LiveRangeMap_t::value_type(V, 0)).first; + LiveRange *LR = i->second; + if (!LR) { + // Newly created map entry. Create the LiveRange for it. 
+ LR = new LiveRange; + LR->Values.push_back(V); + i->second = LR; + LR->setAlignmentFromValue(V); + } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + // Give the Value a name if it doesn't already have one. + if (!V.getValue()->getName().size()) { + std::string NameBuf; + StringRef Name = "arg"; + if (auto Inst = dyn_cast(V.getValue())) { + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(V.getValue()); + if (GenXIntrinsic::isAnyNonTrivialIntrinsic(IID)) { + // For an intrinsic call, use the intrinsic name after the + // final period. + NameBuf = GenXIntrinsic::getAnyName(IID, None); + Name = NameBuf; + size_t Period = Name.rfind('.'); + if (Period != StringRef::npos) + Name = Name.slice(Period + 1, Name.size()); + } else + Name = Inst->getOpcodeName(); + } + V.getValue()->setName(Name); + } +#endif + return LR; +} + +LiveRange *GenXLiveness::getOrCreateLiveRange(SimpleValue V, unsigned Cat, unsigned LogAlign) { + auto LR = getOrCreateLiveRange(V); + LR->setCategory(Cat); + LR->setLogAlignment(LogAlign); + return LR; +} + +/*********************************************************************** + * eraseLiveRange : get rid of live range for a Value, possibly multiple + * ones if it is a struct value + */ +void GenXLiveness::eraseLiveRange(Value *V) +{ + auto ST = dyn_cast(V->getType()); + if (!ST) { + eraseLiveRange(SimpleValue(V)); + return; + } + for (unsigned i = 0, e = IndexFlattener::getNumElements(ST); i != e; ++i) + eraseLiveRange(SimpleValue(V, i)); +} + +/*********************************************************************** + * eraseLiveRange : get rid of live range for a Value, if any + */ +void GenXLiveness::eraseLiveRange(SimpleValue V) +{ + auto LR = getLiveRangeOrNull(V); + if (LR) + eraseLiveRange(LR); +} + +/*********************************************************************** + * eraseLiveRange : get rid of the specified live range, and remove its + * values from the map + */ +void GenXLiveness::eraseLiveRange(LiveRange *LR) +{ + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) + LiveRangeMap.erase(*vi); + delete LR; +} + +/*********************************************************************** + * getLiveRangeOrNull : get live range for value, or 0 if none + * + * The returned LiveRange pointer is valid only until the next time the + * live ranges are modified, including the case of coalescing. + */ +const LiveRange *GenXLiveness::getLiveRangeOrNull(SimpleValue V) const +{ + auto i = LiveRangeMap.find(V); + if (i == LiveRangeMap.end()) + return nullptr; + return i->second; +} + +LiveRange *GenXLiveness::getLiveRangeOrNull(SimpleValue V) +{ + return const_cast(static_cast(this)->getLiveRangeOrNull(V)); +} + +/*********************************************************************** + * getLiveRange : get live range for value + * + * The returned LiveRange pointer is valid only until the next time the + * live ranges are modified, including the case of coalescing. + */ +LiveRange *GenXLiveness::getLiveRange(SimpleValue V) +{ + LiveRange *LR = getLiveRangeOrNull(V); + assert(LR && "no live range found"); + return LR; +} + +/*********************************************************************** + * getUnifiedRet : get/create unified return value for a function + * + * Returns already created unified value, or creates new one + * if there was no such. 
+ */ +Value *GenXLiveness::getUnifiedRet(Function *F) +{ + auto RetIt = UnifiedRets.find(F); + if (RetIt == UnifiedRets.end()) + return createUnifiedRet(F); + return RetIt->second; +} + +/*********************************************************************** + * createUnifiedRet : create unified return value for a function + * + * To allow all returns in a function and all results of calls to that + * function to use the same register, we have a dummy "unified return + * value". + * + * Cannot be called on a function with void return type. + * + * This also creates the LiveRange for the unified return value, or + * multiple ones if it is struct type, and sets the category to the same as in + * one of the return instructions. + */ +Value *GenXLiveness::createUnifiedRet(Function *F) { + assert(!F->isDeclaration() && "must be a function definition"); + assert(UnifiedRets.find(F) == UnifiedRets.end() && + "Unified ret must not have been already created"); + Type *Ty = F->getReturnType(); + assert(!Ty->isVoidTy()); + auto URet = genx::createUnifiedRet(Ty, "", F->getParent()); + // Find some return inst. + ReturnInst *Ret = nullptr; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) + if ((Ret = dyn_cast(fi->getTerminator()))) + break; + assert(Ret && "must find return instruction"); + Value *RetVal = Ret->getOperand(0); + // Use the categories of its operand to set the categories of the unified + // return value. + for (unsigned StructIdx = 0, NumElements = IndexFlattener::getNumElements(Ty); + StructIdx != NumElements; ++StructIdx) { + int Cat = getOrCreateLiveRange(SimpleValue(RetVal, StructIdx)) + ->getOrDefaultCategory(); + SimpleValue SV(URet, StructIdx); + getOrCreateLiveRange(SV)->setCategory(Cat); + } + + UnifiedRets[F] = URet; + UnifiedRetToFunc[URet] = F; + return URet; +} + +/*********************************************************************** + * isUnifiedRet : test whether a value is a unified return value + * + * A unified ret value is a call instruction that is + * not attached to any BB, and is in the UnifiedRetFunc map. + */ +Function *GenXLiveness::isUnifiedRet(Value *V) +{ + // Quick checks first. + auto Inst = dyn_cast(V); + if (!Inst) + return nullptr; + if (Inst->getParent()) + return nullptr; + // Then map lookup. + auto i = UnifiedRetToFunc.find(V); + if (i == UnifiedRetToFunc.end()) + return nullptr; + return i->second; +} + +/*********************************************************************** + * moveUnifiedRet : move a function's unified return value to another function + * + * This is used when replacing a function with a new one in GenXArgIndirection. + */ +void GenXLiveness::moveUnifiedRet(Function *OldF, Function *NewF) +{ + auto i = UnifiedRets.find(OldF); + if (i == UnifiedRets.end()) + return; + Value *UR = i->second; + UnifiedRets[NewF] = UR; + UnifiedRets.erase(i); + UnifiedRetToFunc[UR] = NewF; +} + +/*********************************************************************** + * find : given an instruction number, find a segment in a live range + * + * If the number is within a segment, or is just on its end point, that + * segment is returned. If the number is in a hole, the next segment + * after the hole is returned. If the number is before the first + * segment, the first segment is returned. If the number is after the + * last segment, end() is returned. 
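+ *
+ * Illustrative example: with segments [4,8) and [12,20), find(8) returns the
+ * [4,8) segment (the end point counts as inside), find(10) returns [12,20),
+ * and find(25) returns end().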
+ */ +LiveRange::iterator LiveRange::find(unsigned Pos) +{ + size_t Len = size(); + if (!Len) + return end(); + if (Pos > Segments[Len - 1].getEnd()) + return end(); + iterator I = begin(); + do { + size_t Mid = Len >> 1; + if (Pos <= I[Mid].getEnd()) + Len = Mid; + else + I += Mid + 1, Len -= Mid + 1; + } while (Len); + assert(I->getEnd() >= Pos); + return I; +} + +/*********************************************************************** + * getOrDefaultCategory : get category; if none, set default + * + * The default category is PREDICATE for i1 or a vector of i1, or GENERAL + * for anything else. + */ +unsigned LiveRange::getOrDefaultCategory() +{ + unsigned Cat = getCategory(); + if (Cat != RegCategory::NONE) + return Cat; + assert(!value_empty()); + SimpleValue SV = *value_begin(); + Type *Ty = IndexFlattener::getElementType( + SV.getValue()->getType(), SV.getIndex()); + if (Ty->getScalarType()->isIntegerTy(1)) + Cat = RegCategory::PREDICATE; + else + Cat = RegCategory::GENERAL; + setCategory(Cat); + return Cat; +} + +/*********************************************************************** + * interfere : check whether two live ranges interfere + * + * Two live ranges interfere if there is a segment from each that overlap + * and they are considered to cause interference by + * checkIfOverlappingSegmentsInterfere below. + */ +bool GenXLiveness::interfere(LiveRange *LR1, LiveRange *LR2) +{ + return getSingleInterferenceSites(LR1, LR2, nullptr); +} + +/*********************************************************************** + * twoAddrInterfere : check whether two live ranges interfere, allowing for + * single number interference sites at two address ops + * + * Return: true if they interfere + * + * Two live ranges interfere if there is a segment from each that overlap + * and are not both weak. + * + * But, if each interfering segment is a single number that is the precopy + * site of a two address op, and the result of the two address op is in one LR + * and the two address operand is in the other, then that is not counted as + * interference. + * + * That provision allows for coalescing at a two address op where the two + * address operand has already been copy coalesced with, or is the same value + * as, a different operand in the same bale, as follows: + * + * Suppose the two address op a has number N, and it has two address operand b + * and some other operand c in the same bale: + * + * N-1: (space for precopy) + * N: a = op(b, c) + * + * with live ranges + * a:[N-1,..) + * b:[..,N-1) + * c:[..,N) + * + * Then a and b can coalesce. + * + * But suppose b and c are the same value, or had previously been copy coalesced. + * Then we have live ranges + * a:[N-1,..) + * b,c:[..,N) + * + * and a and b now interfere needlessly. + * + * This function is called on an attempt to coalesce a and b (or rather the + * live range containing a and the live range containing b). In it, we see + * that there is a single number segment of interference [N-1,N), where a is + * the result and b is the two address operand of the two address op at N. Thus + * we discount that segment of interference, and a and b can still coalesce. + * + * Note that this formulation allows for there to be multiple such sites because + * of multiple two address results being already coalesced together through phi + * nodes. 
+ */ +bool GenXLiveness::twoAddrInterfere(LiveRange *LR1, LiveRange *LR2) +{ + SmallVector Sites; + if (getSingleInterferenceSites(LR1, LR2, &Sites)) + return true; // interferes, not just single number sites + if (Sites.empty()) + return false; // does not interfere at all + // Put the single number sites in a set. + SmallSet SitesSet; + LLVM_DEBUG(dbgs() << "got single number interference sites:"); + for (auto i = Sites.begin(), e = Sites.end(); i != e; ++i) { + LLVM_DEBUG(dbgs() << " " << *i); + SitesSet.insert(*i); + } + LLVM_DEBUG(dbgs() << "\nbetween:\n" << *LR1 << "\n" << *LR2 << "\n"); + Sites.clear(); + // Check each def in LR1 and LR2 for being a two address op that causes us to + // discount a single number interference site. + for (auto LR = LR1, OtherLR = LR2; LR; + LR = LR == LR1 ? LR2 : nullptr, OtherLR = LR1) { + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) { + auto CI = dyn_cast(vi->getValue()); + if (!CI) + continue; + int OperandNum = getTwoAddressOperandNum(CI); + if (OperandNum < 0) + continue; + // Got a two addr op. Check whether the two addr operand is in the other + // LR. + if (getLiveRangeOrNull(CI->getOperand(OperandNum)) != OtherLR) + continue; + // Discount the single number interference site here, if there is one. + SitesSet.erase(getNumbering()->getNumber(CI) - 1); + } + } + // If we have discounted all sites, then there is no interference. + return !SitesSet.empty(); +} + +/*********************************************************************** + * getSingleInterferenceSites : check whether two live ranges interfere, + * returning single number interference sites + * + * Enter: LR1, LR2 = live ranges to check + * Sites = vector in which to store single number interference sites, + * or 0 if we do not want to collect them + * + * Return: true if the live ranges interfere other than as reflected in Sites + * + * Two live ranges interfere if there is a segment from each that overlap + * and are not both weak. + * + * If Sites is 0 (the caller does not want the Sites list), then the function + * returns true if there is any interference. + * + * If Sites is not 0, then any interference in a single number segment, for + * example [19,20), causes the start number to be pushed into Sites. The + * function returns true only if there is interference that cannot be described + * in Sites. + */ +bool GenXLiveness::getSingleInterferenceSites(LiveRange *LR1, LiveRange *LR2, + SmallVectorImpl *Sites) +{ + // Swap if necessary to make LR1 the one with more segments. + if (LR1->size() < LR2->size()) + std::swap(LR1, LR2); + auto Idx2 = LR2->begin(), End2 = LR2->end(); + // Find segment in LR1 that contains or is the next after the start + // of the first segment in LR2, including the case that the start of + // the LR2 segment abuts the end of the LR1 segment. + auto Idx1 = LR1->find(Idx2->getStart()), End1 = LR1->end(); + if (Idx1 == End1) + return false; + for (;;) { + // Check for overlap. + if (Idx1->getStart() < Idx2->getStart()) { + if (Idx1->getEnd() > Idx2->getStart()) + if (checkIfOverlappingSegmentsInterfere(LR1, Idx1, LR2, Idx2)) { + // Idx1 overlaps Idx2. Check if it is a single number overlap that can + // be pushed into Sites. + if (!Sites || Idx1->getEnd() != Idx2->getStart() + 1) + return true; + Sites->push_back(Idx2->getStart()); + } + } else { + if (Idx1->getStart() < Idx2->getEnd()) + if (checkIfOverlappingSegmentsInterfere(LR1, Idx1, LR2, Idx2)) { + // Idx2 overlaps Idx1. 
Check if it is a single number overlap that can + // be pushed into Sites. + if (!Sites || Idx2->getEnd() != Idx1->getStart() + 1) + return true; + Sites->push_back(Idx1->getStart()); + } + } + // Advance whichever one has the lowest End. + if (Idx1->getEnd() < Idx2->getEnd()) { + if (++Idx1 == End1) + return false; + } else { + if (++Idx2 == End2) + return false; + } + } +} + +/*********************************************************************** + * checkIfOverlappingSegmentsInterfere : given two segments that have been + * shown to overlap, check whether their strengths make them interfere + * + * If both segments are weak, they do not interfere. + * + * Interference between a normal segment in one LR and a phicpy segment in the + * other LR is ignored, as long as the phicpy segment relates to a phi incoming + * where the phi node is in the LR with the phicpy segment and the incoming + * value is in the LR with the strong segment. This is used to avoid + * unnecessary interference for a phi incoming through a critical edge, where + * the incoming is likely to be used in the other successor as well. + */ +bool GenXLiveness::checkIfOverlappingSegmentsInterfere( + LiveRange *LR1, Segment *S1, LiveRange *LR2, Segment *S2) +{ + if (S1->isWeak() && S2->isWeak()) + return false; // both segments weak + if (S2->Strength == Segment::PHICPY) { + // Swap so that if either segment is phicpy, then it is S1 for the check + // below. + std::swap(LR1, LR2); + std::swap(S1, S2); + } + if (S1->Strength != Segment::PHICPY) + return true; + // S1 is phicpy. If its corresponding phi cpy insertion point is for a phi + // node in LR1 and an incoming in LR2, then this does not cause interference. + auto PhiIncoming = Numbering->getPhiIncomingFromNumber(S1->getStart()); + assert(PhiIncoming.first && "phi incoming not found"); + if (getLiveRange(PhiIncoming.first) != LR1) + return true; // phi not in LR1, interferes + if (getLiveRangeOrNull( + PhiIncoming.first->getIncomingValue(PhiIncoming.second)) != LR2) + return true; // phi incoming not in LR2, interferes + // Conditions met -- does not cause interference. + return false; +} + +/*********************************************************************** + * coalesce : coalesce two live ranges that do not interfere + * + * Enter: LR1 = first live range + * LR2 = second live range + * DisallowCASC = true to disallow call arg special coalescing + * into the resulting live range + * + * Return: new live range (LR1 and LR2 now invalid) + */ +LiveRange *GenXLiveness::coalesce(LiveRange *LR1, LiveRange *LR2, + bool DisallowCASC) +{ + assert(LR1 != LR2 && "cannot coalesce an LR to itself"); + assert(LR1->Category == LR2->Category && "cannot coalesce two LRs with different categories"); + // Make LR1 the one with the longer list of segments. + if (LR2->Segments.size() > LR1->Segments.size()) { + LiveRange *temp = LR1; + LR1 = LR2; + LR2 = temp; + } + LLVM_DEBUG( + dbgs() << "Coalescing \""; + LR1->print(dbgs()); + dbgs() << "\" and \""; + LR2->print(dbgs()); + dbgs() << "\"\n" + ); + // Do the merge of the segments. + merge(LR1, LR2); + // Copy LR2's values across to LR1. + for (auto i = LR2->value_begin(), e = LR2->value_end(); i != e; ++i) + LiveRangeMap[LR1->addValue(*i)] = LR1; + // Use the largest alignment from the two LRs. + LR1->LogAlignment = std::max(LR1->LogAlignment, LR2->LogAlignment); + // If either LR has a non-zero offset, use it. + assert(!LR1->Offset || !LR2->Offset); + LR1->Offset |= LR2->Offset; + // Set DisallowCASC. 
+ LR1->DisallowCASC |= LR2->DisallowCASC | DisallowCASC; + delete LR2; + LLVM_DEBUG( + dbgs() << " giving \""; + LR1->print(dbgs()); + dbgs() << "\"\n" + ); + return LR1; +} + +/*********************************************************************** + * copyInterfere : check whether two live ranges copy-interfere + * + * Two live ranges LR1 and LR2 copy-interfere (a non-commutative relation) + * if LR1 includes a value that is a phi node whose definition is within + * LR2. + */ +bool GenXLiveness::copyInterfere(LiveRange *LR1, LiveRange *LR2) +{ + // Find a phi node value in LR1. It can have at most one, because only + // copy coalescing has occurred up to now, and copy coalescing does not + // occur at a phi node. + for (unsigned i = 0, e = LR1->Values.size(); i != e; ++i) { + auto Phi = dyn_cast(LR1->Values[i].getValue()); + if (!Phi) + continue; + // Found a phi node in LR1. A phi node has multiple instruction numbers, + // one for each incoming block. See if any one of those is in LR2's + // live range. + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) + if (LR2->contains(Numbering->getPhiNumber(Phi, Phi->getIncomingBlock(i)))) + return true; + break; + } + return false; // no phi node found +} + +/*********************************************************************** + * wrapsAround : detects if V1 is a phi node and V2 wraps round to a use + * in a phi node in the same basic block as V1 and after it + */ +bool GenXLiveness::wrapsAround(Value *V1, Value *V2) +{ + auto PhiDef = dyn_cast(V1); + if (!PhiDef) + return false; + for (auto ui = V2->use_begin(), ue = V2->use_end(); ui != ue; ++ui) { + if (auto PhiUse = dyn_cast(ui->getUser())) { + if (PhiUse->getParent() == PhiDef->getParent()) { + // Phi use in the same BB. Scan until we find PhiDef or the end + // of the phi nodes. + while (PhiUse != PhiDef) { + PhiUse = dyn_cast(PhiUse->getNextNode()); + if (!PhiUse) + return true; // reach end of phi nodes + } + } + } + } + return false; +} + +/*********************************************************************** + * insertCopy : insert a copy of a non-struct value + * + * Enter: InputVal = value to copy + * LR = live range to add the new value to (0 to avoid adjusting + * live ranges) + * InsertBefore = insert copy before this inst + * Name = name to give the new value + * Number = number to give the new instruction(s), 0 for none + * + * Return: The new copy instruction + * + * This inserts multiple copies if the input value is a vector that is + * bigger than two GRFs or a non power of two size. + * + * This method is mostly used from GenXCoalescing, which passes an LR to + * add the new copied value to. + * + * It is also used from GenXLiveRange if it needs to add a copy to break an + * overlapping circular phi value, in which case LR is 0 as we do not want to + * adjust live ranges. Also at this stage there is no baling info to update. + */ +Instruction *GenXLiveness::insertCopy(Value *InputVal, LiveRange *LR, + Instruction *InsertBefore, const Twine &Name, unsigned Number) +{ + assert(!isa(InputVal)); + bool AdjustLRs = LR != nullptr; + LiveRange *SourceLR = nullptr; + if (AdjustLRs) + SourceLR = getLiveRange(InputVal); + auto InputTy = InputVal->getType(); + if (InputTy->getScalarType()->isIntegerTy(1)) { + // The input is a predicate. + if (!isa(InputVal)) { + // The predicate input is not a constant. + // There is no way in vISA of copying from one + // predicate to another, so we copy all 0s into the destination + // then "or" the source into it. 
+ Instruction *NewInst = CastInst::Create(Instruction::BitCast, + Constant::getNullValue(InputTy), InputTy, Name, InsertBefore); + Numbering->setNumber(NewInst, Number); + if (AdjustLRs) + setLiveRange(SimpleValue(NewInst), LR); + NewInst = BinaryOperator::Create(Instruction::Or, NewInst, InputVal, Name, + InsertBefore); + Numbering->setNumber(NewInst, Number); + if (AdjustLRs) + setLiveRange(SimpleValue(NewInst), LR); + return NewInst; + } + // Predicate input is constant. + auto NewInst = CastInst::Create(Instruction::BitCast, InputVal, + InputTy, Name, InsertBefore); + Numbering->setNumber(NewInst, Number); + if (AdjustLRs) + setLiveRange(SimpleValue(NewInst), LR); + return NewInst; + } + Instruction *NewInst = nullptr; + if (InputTy->isPointerTy()) { + // BitCast used to represent a normal copy. + NewInst = CastInst::Create(Instruction::BitCast, InputVal, + InputVal->getType(), Name, InsertBefore); + if (Number) + Numbering->setNumber(NewInst, Number); + if (AdjustLRs) + setLiveRange(SimpleValue(NewInst), LR); + return NewInst; + } + + Region R(InputVal); + unsigned MaxNum = R.ElementBytes == 1 ? 32 : 64 / R.ElementBytes; + if (exactLog2(R.NumElements) >= 0 && R.NumElements <= MaxNum) { + // Can be done with a single copy. + if (SourceLR && (SourceLR->Category != RegCategory::GENERAL + || (LR && LR->Category != RegCategory::GENERAL))) { + // Need a category conversion (including the case that the two + // categories are the same but not GENERAL). + NewInst = createConvert(InputVal, Name, InsertBefore); + } else { + // BitCast used to represent a normal copy. + NewInst = CastInst::Create(Instruction::BitCast, InputVal, + InputVal->getType(), Name, InsertBefore); + } + if (Number) + Numbering->setNumber(NewInst, Number); + if (AdjustLRs) + setLiveRange(SimpleValue(NewInst), LR); + return NewInst; + } + + auto collectFragment = [](Value *V, unsigned MaxFrag, + SmallVectorImpl>& Frag, + unsigned MaxElt) { + while (!isa(V)) { + if (!GenXIntrinsic::isWrRegion(V)) + return true; + IntrinsicInst *WII = cast(V); + Region R(WII, BaleInfo()); + if (R.Indirect || !R.isContiguous() || !R.isWholeNumRows()) + return true; + if ((R.Offset % R.ElementBytes) != 0) + return true; + unsigned Base = R.Offset / R.ElementBytes; + for (unsigned Offset = 0; Offset < R.NumElements; /*EMPTY*/) { + unsigned NumElts = std::min(MaxElt, R.NumElements - Offset); + // Round NumElts down to power of 2. That is how many elements we + // are copying this time round the loop. + NumElts = 1 << genx::log2(NumElts); + Frag.push_back(std::make_pair(Base + Offset, NumElts)); + Offset += NumElts; + } + V = WII->getOperand(0); + } + if (Frag.size() > MaxFrag) + return true; + std::sort(Frag.begin(), Frag.end()); + return false; + }; + + unsigned NumElements = R.NumElements; + SmallVector, 8> Fragments; + unsigned MaxCopies = (NumElements + MaxNum - 1) / MaxNum; + if (collectFragment(InputVal, MaxCopies, Fragments, MaxNum)) { + Fragments.clear(); + for (unsigned Offset = 0; Offset < NumElements; /*EMPTY*/) { + unsigned NumElts = std::min(MaxNum, NumElements - Offset); + // Round NumElts down to power of 2. That is how many elements we + // are copying this time round the loop. + NumElts = 1 << genx::log2(NumElts); + Fragments.push_back(std::make_pair(Offset, NumElts)); + Offset += NumElts; + } + } + // Need to split the copy up. Start with an undef destination. + Value *Res = UndefValue::get(InputVal->getType()); + for (auto &I : Fragments) { + unsigned Offset = I.first; + // Set the subregion. 
+ R.NumElements = I.second; + R.Width = R.NumElements; + R.Offset = Offset * R.ElementBytes; + // Create the rdregion. Do not add this to a live range because it is + // going to be baled in to the wrregion. + Instruction *RdRegion = R.createRdRegion(InputVal, Name, InsertBefore, + DebugLoc(), true/*AllowScalar*/); + if (Baling) + Baling->setBaleInfo(RdRegion, BaleInfo(BaleInfo::RDREGION, 0)); + if (Number) + Numbering->setNumber(RdRegion, Number); + // Create the wrregion, and mark that it bales in the rdregion (operand 1). + NewInst = cast(R.createWrRegion(Res, RdRegion, Name, + InsertBefore, DebugLoc())); + if (Number) + Numbering->setNumber(NewInst, Number); + if (Baling) { + BaleInfo BI(BaleInfo::WRREGION); + BI.setOperandBaled(1); + Baling->setBaleInfo(NewInst, BI); + } + if (AdjustLRs) { + // Add the last wrregion to the live range (thus coalescing them all + // together and in with the phi node or two address op that we're doing + // the copy for). + setLiveRange(SimpleValue(NewInst), LR); + } + Res = NewInst; + } + return NewInst; +} + +/*********************************************************************** + * merge : merge segments of LR2 into LR1 + * + * This is equivalent to addSegments followed by sortAndMerge. + * + * Previously there was some code here that attempted to optimize on the + * assumption that the caller passed the one with the longer list of segments + * as LR1. However that became too complicated once we introduced weak and + * strong liveness. + * + * One day we could re-introduce some simple optimized paths, such as when + * LR2 has a single segment. + */ +void GenXLiveness::merge(LiveRange *LR1, LiveRange *LR2) +{ + LR1->addSegments(LR2); + LR1->sortAndMerge(); +} + +/*********************************************************************** + * eraseUnusedTree : erase unused tree of instructions + * + * Enter: Inst = root of unused tree + * + * This erases Inst, then recursively erases other instructions that become + * unused. Erased instructions are also removed from liveness. + * + * Other than the given Inst, this does not erase a non-intrinsic call, or + * an intrinsic call with a side effect. + * + * Instead of erasing as we go, we undef operands to make them unused and then + * erase everything at the end. This is required for the case that we have an + * unused DAG of instructions rather than just an unused tree, for example + * where we have a rd-wr sequence and all the rds use the same input. 
+ */ +void GenXLiveness::eraseUnusedTree(Instruction *TopInst) +{ + SmallVector Stack; + std::set ToErase; + Stack.push_back(TopInst); + while (!Stack.empty()) { + auto Inst = Stack.back(); + Stack.pop_back(); + if (!Inst->use_empty()) + continue; + if (TopInst != Inst) { + if (auto CI = dyn_cast(Inst)) { + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(CI)) + continue; + if (!CI->doesNotAccessMemory()) + continue; + } + } + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) + if (auto OpndInst = dyn_cast(Inst->getOperand(oi))) { + Stack.push_back(OpndInst); + Inst->setOperand(oi, UndefValue::get(OpndInst->getType())); + } + removeValue(Inst); + ToErase.insert(Inst); + } + for (auto i = ToErase.begin(), e = ToErase.end(); i != e; ++i) + (*i)->eraseFromParent(); +} + +/*********************************************************************** + * getAddressBase : get the base register of an address + * + * Enter: Addr = address conversion (genx.convert.addr instruction) + * + * Return: The Value for the base that the address is used with, or some + * other Value that is coalesced with that + */ +Value *GenXLiveness::getAddressBase(Value *Addr) +{ + // Get the base register from the rdregion/wrregion that the index is used + // in. This might involve going via an add or an rdregion. + Use *U = &*Addr->use_begin(); + auto user = cast(U->getUser()); + while (!U->getOperandNo()) { + U = &*user->use_begin(); + user = cast(U->getUser()); + } + if (GenXIntrinsic::isRdRegion(user)) + return user->getOperand(0); + if (GenXIntrinsic::isWrRegion(user)) { + auto Head = Baling->getBaleHead(user); + if (Head && isa(Head)) { + Value *V = Head->getOperand(1); + V = getUnderlyingGlobalVariable(V); + assert(V && "null base not expected"); + return V; + } + return user; + } + // The above scheme does not work for an address conversion added by + // GenXArgIndirection. Instead we have AddressBaseMap to provide the mapping. + auto i = ArgAddressBaseMap.find(Addr); + assert(i != ArgAddressBaseMap.end() && "base register not found for address"); + Value *BaseV = i->second; + LiveRange *LR = getLiveRange(BaseV); + // Find a SimpleValue in the live range that is not a struct member. + for (auto vi = LR->value_begin(), ve = LR->value_end(); vi != ve; ++vi) { + Value *V = vi->getValue(); + if (!isa(V->getType())) + return V; + } + llvm_unreachable("non-struct value not found"); +} + +/*********************************************************************** + * isBitCastCoalesced : see if the bitcast has been coalesced away + * + * This handles the case that the input and result of the bitcast are coalesced + * in to the same live range. + */ +bool GenXLiveness::isBitCastCoalesced(BitCastInst *BCI) +{ + return getLiveRangeOrNull(BCI) == getLiveRangeOrNull(BCI->getOperand(0)); +} + +/*********************************************************************** + * dump, print : dump the liveness info + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void GenXLiveness::dump() +{ + print(errs()); errs() << '\n'; +} +void LiveRange::dump() const +{ + print(errs()); errs() << '\n'; +} +#endif + +void GenXLiveness::print(raw_ostream &OS) const +{ + OS << "GenXLiveness for FunctionGroup " << FG->getName() << "\n"; + for (const_iterator i = begin(), e = end(); i != e; ++i) { + LiveRange *LR = i->second; + // Only show an LR if the map iterator is on the value that appears first + // in the LR. That avoids printing the same LR multiple times. 
+ if (i->first == *LR->value_begin()) { + LR->print(OS); + OS << "\n"; + } + } + OS << "\n"; +} + +#ifndef NDEBUG +/*********************************************************************** + * LiveRange::assertOk : assert that no segments abut or overlap or are + * in the wrong order + */ +void LiveRange::assertOk() +{ + // Assert that no segments abut or overlap or are in the wrong order. + iterator Idx1 = begin(), End1 = end(); + Idx1++; + for (; Idx1 != End1; ++Idx1) + assert(((Idx1 - 1)->Strength != Idx1->Strength || + (Idx1 - 1)->getEnd() < Idx1->getStart()) && + "invalid live range"); +} +#endif + +/*********************************************************************** + * LiveRange::addSegment : add a segment to a live range + * + * The segment might already be covered by an existing segment, in which + * case nothing changes. + * + * It would be inefficient to implement coalesce() in terms of this, because + * it might have to shuffle lots of elements along by one each time. + * This function only gets called when adding a single segment to a live + * range when inserting a copy in coalescing. + */ +void LiveRange::addSegment(Segment Seg) +{ + iterator i = find(Seg.getStart()), e = end(); + if (i == e) { + // New segment off end. + Segments.push_back(Seg); + } else if (i->getStart() <= Seg.getStart()) { + // New segment is covered by or overlaps the end of old segment i. + if (i->getEnd() < Seg.getEnd()) { + i->setEnd(Seg.getEnd()); + // See if it covers or overlaps any later segments. + iterator j = i + 1; + while (j != e) { + if (j->getStart() > Seg.getEnd()) + break; + i->setEnd(j->getEnd()); + if (j->getEnd() >= Seg.getEnd()) + break; + ++j; + } + Segments.erase(i + 1, j); + } + } else if (i->getStart() == Seg.getEnd()) { + // New segment abuts start of old segment i, without abutting or + // overlapping anything before. + i->setStart(Seg.getStart()); + } else { + // New segment is completely in a hole just before i. 
+ Segments.insert(i, Seg); + } + assertOk(); +} + +/*********************************************************************** + * LiveRange::setSegmentsFrom : for this live range, clear out its segments + * and copy them from the other live range + */ +void LiveRange::setSegmentsFrom(LiveRange *Other) +{ + Segments.clear(); + Segments.append(Other->Segments.begin(), Other->Segments.end()); +} + +/*********************************************************************** + * LiveRange::addSegments : add segments of LR2 into this + */ +void LiveRange::addSegments(LiveRange *LR2) +{ + Segments.append(LR2->Segments.begin(), LR2->Segments.end()); +} + +/*********************************************************************** + * LiveRange::sortAndMerge : after doing some push_backs, sort the segments, + * and merge overlapping/adjacent ones + */ +void LiveRange::sortAndMerge() { + std::sort(Segments.begin(), Segments.end()); + + // Ensure that there are no duplicate segments: + Segments_t::iterator ip; + ip = std::unique(Segments.begin(), Segments.end()); + Segments.resize(std::distance(Segments.begin(), ip)); + + Segments_t SegmentsSortedEnd = Segments; + std::sort(SegmentsSortedEnd.begin(), SegmentsSortedEnd.end(), + [](Segment L, Segment R) { + if (L.getEnd() != R.getEnd()) + return L.getEnd() < R.getEnd(); + return L.getStart() < R.getStart(); + }); + + Segments_t NewSegments; + std::unordered_set OpenedSegments; + Segment *SS = Segments.begin(); + Segment *ES = SegmentsSortedEnd.begin(); + unsigned prevBorder = 0; + unsigned curBorder = 0; + bool isStartBorder; + + // Split & Merge + while (ES != SegmentsSortedEnd.end()) { + if (SS != Segments.end() && SS->getStart() < ES->getEnd()) { + isStartBorder = true; + curBorder = SS->getStart(); + } else { + isStartBorder = false; + curBorder = ES->getEnd(); + } + + // To create or extend segment, first check that there are + // open segments or that we haven't already created or extended one + if (OpenedSegments.size() > 0 && prevBorder < curBorder) { + Segment NS = + *std::max_element(OpenedSegments.begin(), OpenedSegments.end(), + [](Segment L, Segment R) { + return L.Strength < R.Strength; + }); // New segment + if (NewSegments.size() > 0 && + NewSegments.rbegin()->getEnd() == prevBorder && + // This segment and previous segment abut or overlap. Merge + // as long as they have the same strength. + (NS.Strength == NewSegments.rbegin()->Strength || + // Also allow for the case that the first one is strong and the + // second one is phicpy. The resulting merged segment is strong, + // because a phicpy segment is valid only if it starts in the + // same place as when it was originally created and there is no + // liveness just before it. + (NS.Strength == Segment::PHICPY && + NewSegments.rbegin()->Strength == Segment::STRONG))) { + // In these cases we can extend + NewSegments.rbegin()->setEnd(curBorder); + } else { + NS.setStart(prevBorder); + NS.setEnd(curBorder); + NewSegments.push_back(NS); + } + } + prevBorder = curBorder; + if (isStartBorder) + OpenedSegments.insert(*SS++); + else + OpenedSegments.erase(*ES++); + } + Segments = NewSegments; +} + +/*********************************************************************** + * LiveRange::prepareFuncs : fill the Funcs set with kernel or stack functions + * which this LR is alive in + * + * To support RegAlloc for function groups that consist of kernel and stack + * functions we have to track which kernel/stack functions the LR spans across. 
+ * + */ +void LiveRange::prepareFuncs(FunctionGroupAnalysis *FGA) { + for (auto &val : getValues()) { + auto Inst = dyn_cast(val.getValue()); + Function *DefFunc = nullptr; + if (Inst && Inst->getParent()) + DefFunc = Inst->getFunction(); + else if (auto Arg = dyn_cast(val.getValue())) + DefFunc = Arg->getParent(); + + if (DefFunc) + Funcs.insert(FGA->getSubGroup(DefFunc) + ? FGA->getSubGroup(DefFunc)->getHead() + : FGA->getGroup(DefFunc)->getHead()); + + for (auto U : val.getValue()->users()) + if (Instruction *userInst = dyn_cast(U)) { + auto F = userInst->getFunction(); + Funcs.insert(FGA->getSubGroup(F) ? FGA->getSubGroup(F)->getHead() + : FGA->getGroup(F)->getHead()); + } + } +} + +/*********************************************************************** + * LiveRange::getLength : add up the number of instructions covered by this LR + */ +unsigned LiveRange::getLength(bool WithWeak) +{ + unsigned Length = 0; + for (auto i = begin(), e = end(); i != e; ++i) { + if (i->isWeak() && !WithWeak) + continue; + Length += i->getEnd() - i->getStart(); + } + return Length; +} + +/*********************************************************************** + * LiveRange::print : print the live range + */ +void LiveRange::print(raw_ostream &OS) const +{ + auto vi = Values.begin(), ve = Values.end(); + assert(vi != ve); + for (;;) { + vi->printName(OS); + if (++vi == ve) + break; + OS << ","; + } + OS << ":"; + printSegments(OS); + const char *Cat = "???"; + switch (Category) { + case RegCategory::NONE: Cat = "none"; break; + case RegCategory::GENERAL: Cat = "general"; break; + case RegCategory::ADDRESS: Cat = "address"; break; + case RegCategory::PREDICATE: Cat = "predicate"; break; + case RegCategory::EM: Cat = "em"; break; + case RegCategory::RM: Cat = "rm"; break; + case RegCategory::SAMPLER: Cat = "sampler"; break; + case RegCategory::SURFACE: Cat = "surface"; break; + case RegCategory::VME: Cat = "vme"; break; + } + OS << "{" << Cat << ",align" << (1U << LogAlignment); + if (Offset) + OS << ",offset" << Offset; + OS << "}"; +} + +/*********************************************************************** + * LiveRange::printSegments : print the live range's segments + */ +void LiveRange::printSegments(raw_ostream &OS) const +{ + for (auto ri = Segments.begin(), re = Segments.end(); + ri != re; ++ri) { + OS << "["; + switch (ri->Strength) { + case Segment::WEAK: OS << "w"; break; + case Segment::PHICPY: OS << "ph"; break; + } + OS << ri->getStart() << "," << ri->getEnd() << ")"; + } +} + +/*********************************************************************** + * IndexFlattener::flatten : convert struct indices into a flattened index + * + * This has a special case of Indices having a single element that is the + * number of elements in ST, which returns the total number of flattened + * indices in the struct. + * + * This involves scanning through the struct layout each time it is called. + * If it is used a lot, it might benefit from some cacheing of the results. 
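+ *
+ * For illustration (a hypothetical struct type): for {i32, {i8, i16}, float}
+ * the flattened indices are 0 (i32), 1 (i8), 2 (i16) and 3 (float), so
+ * flatten(ST, {1,1}) returns 2 and flatten(ST, {2}) returns 3, while the
+ * special case flatten(ST, {3}) returns the total of 4.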
+ */ +unsigned IndexFlattener::flatten(StructType *ST, ArrayRef Indices) +{ + if (!Indices.size()) + return 0; + unsigned Flattened = 0; + unsigned i = 0; + for (; i != Indices[0]; ++i) { + Type *ElTy = ST->getElementType(i); + if (auto ElST = dyn_cast(ElTy)) + Flattened += flatten(ElST, ElST->getNumElements()); + else + ++Flattened; + } + if (i == ST->getNumElements()) + return Flattened; // handle special case noted at the top + Type *ElTy = ST->getElementType(i); + if (auto ElST = dyn_cast(ElTy)) + Flattened += flatten(ElST, Indices.slice(1)); + return Flattened; +} + +/*********************************************************************** + * IndexFlattener::unflatten : convert flattened index into struct indices + * + * Enter: Indices = vector to put unflattened indices into + * + * Return: number left over from flattened index if it goes off the end + * of the struct (used internally when recursing). If this is + * non-zero, nothing has been pushed into Indices + * + * This involves scanning through the struct layout each time it is called. + * If it is used a lot, it might benefit from some cacheing of the results. + */ +unsigned IndexFlattener::unflatten(StructType *ST, unsigned Flattened, + SmallVectorImpl *Indices) +{ + for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { + Type *ElTy = ST->getElementType(i); + if (auto ElST = dyn_cast(ElTy)) { + Indices->push_back(i); + Flattened = unflatten(ElST, Flattened, Indices); + if (!Flattened) + return 0; + Indices->pop_back(); + } else if (!Flattened--) { + Indices->push_back(i); + return 0; + } + } + return Flattened; +} + +/*********************************************************************** + * IndexFlattener::getElementType : get type of struct element from + * flattened index + * + * Enter: Ty = type, possibly struct type + * FlattenedIndex = flattened index in the struct, 0 if not struct + * + * Return: type of that element + */ +Type *IndexFlattener::getElementType(Type *Ty, unsigned FlattenedIndex) +{ + auto ST = dyn_cast(Ty); + if (!ST) + return Ty; + SmallVector Indices; + IndexFlattener::unflatten(ST, FlattenedIndex, &Indices); + Type *T = 0; + for (unsigned i = 0;;) { + T = ST->getElementType(Indices[i]); + if (++i == Indices.size()) + return T; + ST = cast(T); + } +} + +/*********************************************************************** + * IndexFlattener::flattenArg : flatten an arg in a function or call + * + * This calculates the total number of flattened indices used up by previous + * args. If all previous args are not struct type, then this just returns the + * arg index. 
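+ *
+ * For illustration (a hypothetical signature): for void f(i32, {float, float},
+ * i8), flattenArg(FT, 2) returns 3, since the first two args use 1 + 2
+ * flattened indices.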
+ */ +unsigned IndexFlattener::flattenArg(FunctionType *FT, unsigned ArgIndex) +{ + unsigned FlattenedIndex = 0; + while (ArgIndex--) { + Type *ArgTy = FT->getParamType(ArgIndex); + FlattenedIndex += getNumElements(ArgTy); + } + return FlattenedIndex; +} + +/*********************************************************************** + * SimpleValue::getType : get the type of the SimpleValue + */ +Type *SimpleValue::getType() +{ + return IndexFlattener::getElementType(V->getType(), Index); +} + +/*********************************************************************** + * dump, print : debug print a SimpleValue + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void SimpleValue::dump() const +{ + print(errs()); errs() << '\n'; +} +#endif +void SimpleValue::print(raw_ostream &OS) const +{ + OS << V->getName(); + if (Index || isa(V->getType())) + OS << "#" << Index; +} +void SimpleValue::printName(raw_ostream &OS) const +{ + OS << V->getName(); + if (Index || isa(V->getType())) + OS << "#" << Index; +} + +/*********************************************************************** + * CallGraph::build : build the call graph for the FunctionGroup + * + * The call graph is acyclic because no recursive edges added here + * CM supports recursion though + */ +void CallGraph::build(GenXLiveness *Liveness) +{ + Nodes.clear(); + // Create a node for each Function. + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + (void)Nodes[F]; + } + // For each Function, find its call sites and add edges for them. + for (auto fgi = FG->begin() + 1, fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + for (Value::use_iterator ui = F->use_begin(), ue = F->use_end(); + ui != ue; ++ui) { + // TODO: deduce possible callsites thru cast chains + if (isa(ui->getUser())) { + auto Call = cast(ui->getUser()); + auto Caller = Call->getParent()->getParent(); + // do not add edges for recursive calls + if (Caller != F) + Nodes[Caller].insert( + Edge(Liveness->getNumbering()->getNumber(Call), Call)); + } + } + } +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.h new file mode 100644 index 000000000000..1ce86c0dbbdd --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLiveness.h @@ -0,0 +1,666 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +/// GenXLiveness +/// ------------ +/// +/// GenXLiveness is an analysis that contains the liveness information for the +/// values in the code. Unlike the usual LLVM liveness analysis, the values +/// are in LLVM IR rather than machine IR. +/// +/// This GenXLiveness pass is a container for the data structures required +/// for liveness analysis, plus methods to perform the analysis. The pass itself +/// does nothing; later passes manipulate it: +/// +/// * GenXCategory creates a LiveRange and sets the category on it for each +/// value. +/// +/// * GenXLiveRanges calls GenXLiveness to set up the LiveRange for each +/// value that needs one (a non-baled instruction or a function argument), +/// and erases the LiveRange for a value that does not need one (a baled +/// in instruction). +/// +/// GenXLiveness is a FunctionGroupPass, because we want to share liveness +/// information between all the Functions in a FunctionGroup (i.e. between a +/// GenX kernel/function and its subroutines). Any pass that uses GenXLiveness, +/// which is almost all passes that run after it, must itself be a +/// FunctionGroupPass. +/// +/// Here is what a LiveRange might look like if you dump() it in the debugger, +/// or see it as part of the liveness info in a -print-after-all: +/// +/// ``add12.split48172:[145,199){general,align32}`` +/// +/// * ``add12.split48172`` is the Value attached to the LiveRange. As outlined below, +/// a LiveRange actually has SimpleValues rather than Values; if the attached +/// SimpleValue had been an element of a struct rather than a scalar value in +/// its own right, the name would have had # then the flattened index appended. +/// +/// * A LiveRange can have more than one value attached after GenXCoalescing. +/// This would be shown by multiple comma-separated names. +/// +/// * ``[145,199)`` is the segment in which the LiveRange is live. A LiveRange can +/// have multiple segments. This one is a normal (strong) segment; a weak one has +/// the start number prefixed with 'w' and a phicpy one has the start number +/// prefixed with 'ph'. +/// +/// * ``general`` is the register category of the LiveRange. +/// +/// * ``align32`` shows that the LiveRange has been marked as needing to be 32 +/// byte (i.e. GRF) aligned. +/// +/// * If the LiveRange was a kernel argument, its allocated offset would have +/// been shown with the word 'offset'. +/// +/// SimpleValue +/// ^^^^^^^^^^^ +/// +/// Liveness information deals with SimpleValues rather than Values. +/// SimpleValue (a GenX backend specific class) is the entity that can have +/// a live range attached and a register allocated. A SimpleValue is either a +/// non-struct Value, or a non-struct element of a struct Value (where the +/// struct can contain nested structs). +/// +/// A SimpleValue is represented by a pair: +/// +/// - a Value * +/// - a flattened index for a non-struct element of a struct, otherwise 0 +/// +/// Having a flattened index (as generated by IndexFlattener::flatten()) allows +/// us to encode an element in multiply nested structs with a single index. +/// +/// The idea of SimpleValue is that, where the LLVM IR contains a struct value, +/// which is unavoidable when a function has multiple return values, we want +/// to allocate a register to each non-struct element, not the whole struct. 
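+///
+/// For illustration (a hypothetical subroutine, not taken from the passes
+/// above): a call whose result has type ``{<8 x float>, i32}`` gives rise to
+/// two SimpleValues, (``%call``, 0) for the vector and (``%call``, 1) for the
+/// i32, so each element can be given its own live range and register.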
+/// +/// Segments +/// ^^^^^^^^ +/// +/// A live range consists of one or more non-overlapping *segments*, where each +/// segment has a start (inclusive) and end (exclusive) instruction number, and a +/// strength, which is strong (normal), weak (see below) or phicpy (see below). +/// Two segments cannot be abutting if they have the same +/// strength. Later passes can interrogate this information to find out whether +/// two live ranges interfere, and can modify it by coalescing (merging) two +/// live ranges. After coalescing, multiple SimpleValues share the same live +/// range. +/// +/// The numbering of instructions is handled in GenXNumbering. +/// +/// Weak liveness +/// ^^^^^^^^^^^^^ +/// +/// A live range that extends over a call has the entire range of the called +/// subroutine, and any subroutines it can call, added to it. This makes that +/// live range interfere with any live range inside the subroutine, and thus +/// stops them using the same register. +/// +/// However, because a subroutine has a single range in instruction numbering, +/// rather than one range per call site, this scheme means that two values A +/// and B that are live over two *different* call sites of the same subroutine +/// both include the subroutine's range, and thus look like they interfere. +/// This could stop A and B being coalesced, and thus add extra code and +/// register pressure. +/// +/// To fix this, we have the concept of *weak liveness*. The values A and B +/// are only weakly live inside the subroutine. Two values are considered to +/// interfere only if there is some point where both are live, and at least +/// one of them is not weakly live at that point. +/// +/// Thus, in our A and B example, A and B each interferes with any value inside +/// the subroutine, but not with each other. +/// +/// Phicpy liveness +/// ^^^^^^^^^^^^^^^ +/// +/// A phi node has a short segment of liveness (a *phicpy segment*) at the end +/// of each of its incoming blocks, from the phi copy insertion point up to the +/// end of the block. The use of the incoming value in the phi node is counted +/// as being at that phi copy insertion point. +/// +/// Normally, we split critical edges, so an incoming block to a phi node has +/// only the one successor, and the use of the incoming value at the phi copy +/// insertion point is a kill use. Often, the phi node and the incoming can be +/// coalesced, unless there is some interference elsewhere due to other values +/// previously coalesced into the two live ranges. +/// +/// However, in one case (a goto/join branching to a join), we cannot split the +/// critical edge. Thus the phi copy insertion point is before the conditional +/// branch in a block with two successors, and the incoming value is likely to +/// be used in the other successor too. Then, there is interference between the +/// phi node and the incoming value, even though they could be coalesced. +/// +/// To avoid this problem, each phicpy segment in a live range is marked as +/// such. A phicpy segment is valid only if there is no segment abutting it +/// before; if there is an abutting before segment, the coalescing code turns it +/// into a normal strong segment and merges the two together. +/// +/// Then, interference between two live ranges LR1 and LR2 is ignored if: +/// +/// 1. the interference arises between a phicpy segment in LR1 and a normal +/// (strong) segment in LR2; and +/// +/// 2. 
the start of the phicpy segment is the phi copy insertion point where the +/// phi node is in LR1 and the incoming value is in LR2. +/// +/// This then allows the incoming value and the phi node to be coalesced, even +/// if the incoming value is also used in the branch's other successor. +/// +//===----------------------------------------------------------------------===// +#ifndef GENXLIVENESS_H +#define GENXLIVENESS_H + +#include "FunctionGroup.h" +#include "IgnoreRAUWValueMap.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueHandle.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/MapVector.h" +#include +#include +#include +#include + +namespace llvm { + +class BasicBlock; +class BitCastInst; +class CallInst; +class Function; +class FunctionPass; +class GenXBaling; +class GenXLiveness; +class GenXNumbering; +class Instruction; +class PHINode; +class raw_ostream; +class ReturnInst; +class Value; + +FunctionGroupPass *createGenXGroupPrinterPass(raw_ostream &O, const std::string &Banner); + +namespace genx { + +class Bale; + +/*********************************************************************** + * IndexFlattener : a class containing some (static) utility functions to + * convert between struct indices (as found in an extractelement instruction) + * and a flattened index, in which a struct containing further structs is + * flattened as if it is a single struct containing just the non-struct + * elements. + * + * SimpleValue uses this to encode and decode its flattened index. + * Liveness and coalescing use flattenArg and getNumArgElements to calculate + * live ranges for function args at the call sites. + */ +struct IndexFlattener { + // flatten : convert struct indices into a flattened index + static unsigned flatten(StructType *ST, ArrayRef Indices); + // getNumElements : get the number of non-struct elements in the flattened + // struct. Returns 1 if it is not a struct type, but 0 for void type. + static unsigned getNumElements(Type *Ty) { + if (auto ST = dyn_cast(Ty)) + return flatten(ST, ST->getNumElements()); + return !Ty->isVoidTy(); + } + // unflatten : convert a flattened index back into normal struct indices + static unsigned unflatten(StructType *ST, unsigned Unflattened, SmallVectorImpl *Indices); + // getElementType : get type of struct element from flattened index + static Type *getElementType(Type *Ty, unsigned FlattenedIndex); + // flattenArg : flatten an arg in a function or call, i.e. calculate the + // total number of flattened indices used up by previous args. If all + // previous args are not struct type, then this just returns the arg + // index + static unsigned flattenArg(FunctionType *FT, unsigned ArgIndex); + // getNumArgElements : get the number of non-struct elements in all args + // of the function + static unsigned getNumArgElements(FunctionType *FT) { + return flattenArg(FT, FT->getNumParams()); + } +}; + +class AssertingSV; + +/*********************************************************************** + * SimpleValue : a non-struct value, possibly inside a struct + * See comment at the top of the file. 
+ */ +class SimpleValue { + Value *V; + unsigned Index; // flattened struct index +public: + SimpleValue() : V(nullptr), Index(0) {} + // Constructor from a non-struct value + SimpleValue(Value *V) : V(V), Index(0) {} + // Constructor from a struct value and an already flattened index + SimpleValue(Value *V, unsigned Index) : V(V), Index(Index) {} + // Constructor from a struct value and unflattened indices (as found in extractelement) + SimpleValue(Value *V, ArrayRef Indices) : V(V), + Index(IndexFlattener::flatten(cast(V->getType()), Indices)) {} + // Accessors + Value *getValue() const { return V; } + unsigned getIndex() const { return Index; } + // getType : get the type of the (element) value + Type *getType(); + // Comparisons + bool operator==(SimpleValue Rhs) const { return V == Rhs.V && Index == Rhs.Index; } + bool operator!=(SimpleValue Rhs) const { return !(*this == Rhs); } + bool operator<(SimpleValue Rhs) const { + if (V != Rhs.V) + return V < Rhs.V; + return Index < Rhs.Index; + } + // Debug dump/print +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const; +#endif + void print(raw_ostream &OS) const; + void printName(raw_ostream &OS) const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, SimpleValue V) { + V.print(OS); + return OS; +} + +// AssertingSV : like a SimpleValue, but contains an AssertingVH +class AssertingSV { + AssertingVH V; + unsigned Index; +public: + AssertingSV(SimpleValue SV) : V(SV.getValue()), Index(SV.getIndex()) {} + SimpleValue get() const { return SimpleValue(V, Index); } + Value *getValue() const { return V; } + unsigned getIndex() const { return Index; } + Type *getType() const { return get().getType(); } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const { get().dump(); } +#endif + void print(raw_ostream &OS) const { get().print(OS); } + void printName(raw_ostream &OS) const { get().printName(OS); } +}; + +// Segment : a single range of instruction numbers in which a value is +// live +struct Segment { + enum { WEAK, PHICPY, STRONG }; + unsigned Strength : 2; // whether it is a weak or phicpy or strong segment + +private: + unsigned Start : 30; // inclusive start of range + unsigned End : 30; // exclusive end of range +public: + Segment() : Strength(STRONG), Start(0), End(0) {} + Segment(unsigned S, unsigned E, unsigned Strength = STRONG) + : Strength(Strength) { + assert(E >= S); + Start = S; + End = E; + } + unsigned getStart() const noexcept { return Start; } + void setStart(unsigned S) noexcept { + assert(End >= S); + Start = S; + } + unsigned getEnd() const noexcept{ return End; } + void setEnd(unsigned E) noexcept{ + assert(E >= Start); + End = E; + } + void setStartEnd(unsigned S, unsigned E) noexcept{ + assert(E >= S); + Start = S; + End = E; + } + bool operator<(Segment Rhs) const noexcept{ + if (Start != Rhs.Start) + return Start < Rhs.Start; + return End < Rhs.End; + } + + // use this via std::hash (see end of this file) + size_t hash() const noexcept { + return hash_combine(Start, End, Strength); + } + bool operator==(Segment Rhs) const noexcept{ + return (Start == Rhs.Start) && (End == Rhs.End) && + (Strength == Rhs.Strength); + } + bool isWeak() const noexcept{ return Strength == WEAK; } +}; + +// LiveRange : a collection of Segment structs, in order, describing +// all points in the program in which a value is live. +// Also contains a list of each SimpleValue that points to this LiveRange. +// Also a bitmap of register classes (general, surface, etc) that +// its def and uses need. 
+class LiveRange { + friend class llvm::GenXLiveness; + typedef SmallVector Segments_t; + Segments_t Segments; + typedef SmallVector Values_t; + Values_t Values; +public: + // kernel/stack functions that this LR spans across + std::set Funcs; + unsigned Category :8; + unsigned LogAlignment :7; + bool DisallowCASC: 1; // disallow call arg special coalescing + unsigned Offset :12; // kernel arg offset, else 0 + LiveRange() : Category(0), LogAlignment(0), DisallowCASC(false), Offset(0) {} + // Iterator forwarders for Segments + typedef Segments_t::iterator iterator; + typedef Segments_t::const_iterator const_iterator; + iterator begin() { return Segments.begin(); } + iterator end() { return Segments.end(); } + const_iterator begin() const { return Segments.begin(); } + const_iterator end() const { return Segments.end(); } + unsigned size() { return Segments.size(); } + void resize(unsigned len) { Segments.resize(len); } + // Iterator forwarders for Values. + // This is complicated by the Values vector containing AssertingSV, but the + // iterator wants to dereference to a Simplevalue. + class value_iterator { + Values_t::iterator i; + public: + value_iterator(Values_t::iterator i) : i(i) {} + SimpleValue operator*() { return i->get(); } + AssertingSV *operator->() { return i; } + bool operator==(const value_iterator &Rhs) const { return i == Rhs.i; } + bool operator!=(const value_iterator &Rhs) const { return !(*this == Rhs); } + value_iterator &operator++() { ++i; return *this; } + }; + Values_t& getValues() { return Values; } + value_iterator value_begin() { return Values.begin(); } + value_iterator value_end() { return Values.end(); } + unsigned value_size() { return Values.size(); } + bool value_empty() { return Values.empty(); } + // find : return iterator to segment containing Num (including the case + // of being equal to the segment's End), or, if in a hole, the + // iterator of the next segment, or, if at end, end(). 
+ iterator find(unsigned Num); + void clear() { Segments.clear(); Values.clear(); } + void push_back(Segment Seg) { Segments.push_back(Seg); } + void push_back(unsigned S, unsigned E) { Segments.push_back(Segment(S, E)); } + SimpleValue addValue(SimpleValue V) { Values.push_back(V); return V; } + // contains : test whether live range contains instruction number + bool contains(unsigned Num) { + iterator i = find(Num); + return i != end() && i->getEnd() != Num && i->getStart() <= Num; + } + // getCategory : get the LR's register category + unsigned getCategory() const { return Category; } + // setCategory : set the LR's register category + void setCategory(unsigned Cat) { Category = Cat; } + // getOrDefaultCategory : return category; if none, set default + unsigned getOrDefaultCategory(); + // getLogAlignment : get log alignment + unsigned getLogAlignment() const { return LogAlignment; } + // setAlignmentFromValue : increase alignment if necessary from a value + void setAlignmentFromValue(SimpleValue V); + // setLogAlignment : set log alignment to greater than implied by the LR's values + void setLogAlignment(unsigned Align) { LogAlignment = std::max(LogAlignment, Align); } + // addSegment : add a segment to a live range + void addSegment(Segment Seg); + // setSegmentsFrom : for this live range, clear out its segments + // and copy them from the other live range + void setSegmentsFrom(LiveRange *Other); + // addSegments : add segments from another LR to this one + void addSegments(LiveRange *LR2); + // sortAndMerge : after doing some push_backs, sort the segments + // and merge overlapping/adjacent ones + void sortAndMerge(); + // prepareFuncs : fill the Funcs set with kernel or stack functions which this + // LR is alive in + void prepareFuncs(FunctionGroupAnalysis *FGA); + // getLength : add up the number of instructions covered by this LR + unsigned getLength(bool WithWeak); + // debug dump/print + void dump() const; + void print(raw_ostream &OS) const; + void printSegments(raw_ostream &OS) const; +private: + void value_clear() { Values.clear(); } +#ifndef NDEBUG + // assertOk : assert that live range's segments are well formed + void assertOk(); +#else + void assertOk() {} +#endif +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const LiveRange &LR) { + LR.print(OS); + return OS; +} + +// CallGraph : the call graph within a FunctionGroup +class CallGraph { + FunctionGroup *FG; +public: + class Node; + struct Edge { + unsigned Number; + CallInst *Call; + Node *Callee; + bool operator==(Edge Rhs) const { return Number == Rhs.Number; } + bool operator!=(Edge Rhs) const { return !(*this == Rhs); } + bool operator<(Edge Rhs) const { return Number < Rhs.Number; } + Edge() : Number(0), Call(0) {} + Edge(unsigned Number, CallInst *Call) : Number(Number), Call(Call) {} + }; + class Node { + std::set Edges; + public: + typedef std::set::iterator iterator; + iterator begin() { return Edges.begin(); } + iterator end() { return Edges.end(); } + void insert(Edge E) { Edges.insert(E); } + }; +private: + std::map Nodes; +public: + // constructor from FunctionGroup + CallGraph(FunctionGroup *FG) : FG(FG) {} + // build : build the call graph from the FunctionGroup + void build(GenXLiveness *Liveness); + + // getRoot : get the root node + Node *getRoot() { return &Nodes[FG->getHead()]; } + // getNode : get the node for a Function + Node *getNode(Function *F) { return &Nodes[F]; } +}; + +} // end namespace genx + +class GenXLiveness : public FunctionGroupPass { + FunctionGroup *FG; + using 
LiveRangeMap_t = MapVector; + LiveRangeMap_t LiveRangeMap; + genx::CallGraph *CG; + GenXBaling *Baling; + GenXNumbering *Numbering; + std::map UnifiedRets; + std::map UnifiedRetToFunc; + std::map, Value *> ArgAddressBaseMap; +public: + static char ID; + explicit GenXLiveness() + : FunctionGroupPass(ID), CG(nullptr), Baling(nullptr), + Numbering(nullptr) {} + ~GenXLiveness() { clear(); } + virtual StringRef getPassName() const override { return "GenX liveness analysis"; } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunctionGroup(FunctionGroup &FG) override; + // setBaling : tell GenXLiveness where GenXBaling is + void setBaling(GenXBaling *B) { Baling = B; } + // Iterator forwarders. + // This gives you an iterator of LiveRangeMap. The ->first field is the + // value, and you only get each value once. The ->second field is the + // LiveRange pointer, and you may get each one multiple times because + // a live range may contain multiple values. + typedef LiveRangeMap_t::iterator iterator; + typedef LiveRangeMap_t::const_iterator const_iterator; + iterator begin() { return LiveRangeMap.begin(); } + iterator end() { return LiveRangeMap.end(); } + const_iterator begin() const { return LiveRangeMap.begin(); } + const_iterator end() const { return LiveRangeMap.end(); } + // getLiveRange : get the live range for a Value of non-struct type + genx::LiveRange *getLiveRange(Value *V) { return getLiveRange(genx::SimpleValue(V)); } + // getLiveRange : get the live range for a genx::SimpleValue + genx::LiveRange *getLiveRange(genx::SimpleValue V); + // getLiveRangeOrNull : get the live range for a Value, or 0 if none + genx::LiveRange *getLiveRangeOrNull(genx::SimpleValue V); + const genx::LiveRange *getLiveRangeOrNull(genx::SimpleValue V) const; + // getOrCreateLiveRange : get the live range for a Value, or create + // a new one if none + genx::LiveRange *getOrCreateLiveRange(genx::SimpleValue V); + genx::LiveRange *getOrCreateLiveRange(genx::SimpleValue V, unsigned Cat, unsigned LogAlign); + // eraseLiveRange : get rid of live range for a Value, possibly multiple + // ones if it is a struct value + void eraseLiveRange(Value *V); + // eraseLiveRange : get rid of live range for a SimpleValue, if any. + // It is assumed that the LiveRange (if any) has no other value atached. 
+ void eraseLiveRange(genx::SimpleValue V); + // eraseLiveRange : get rid of the specified live range, and remove its + // values from the map + void eraseLiveRange(genx::LiveRange *LR); + // twoAddrInterfere : check whether two live ranges interfere, allowing for single number interference sites at two address ops + bool twoAddrInterfere(genx::LiveRange *LR1, genx::LiveRange *LR2); + // interfere : test whether two live ranges interfere + bool interfere(genx::LiveRange *LR1, genx::LiveRange *LR2); + // getSingleInterferenceSites : check whether two live ranges interfere, returning single number interference sites + bool getSingleInterferenceSites(genx::LiveRange *LR1, genx::LiveRange *LR2, SmallVectorImpl *Sites); + // checkIfOverlappingSegmentsInterfere : given two segments that have been + // shown to overlap, check whether their strengths make them interfere + bool checkIfOverlappingSegmentsInterfere(genx::LiveRange *LR1, genx::Segment *S1, genx::LiveRange *LR2, genx::Segment *S2); + // coalesce : coalesce two live ranges + genx::LiveRange *coalesce(genx::LiveRange *LR1, genx::LiveRange *LR2, bool DisallowCASC); + // Set the GenXNumbering pointer for use by live range building + void setNumbering(GenXNumbering *N) { Numbering = N; } + GenXNumbering *getNumbering() { return Numbering; } + // rebuildCallGraph : rebuild GenXLiveness's call graph + void rebuildCallGraph(); + // buildSubroutineLRs : build an LR for each subroutine. Must be called + // before the first BuildLiveRange + void buildSubroutineLRs(); + // buildLiveRange : build live range for given value if it is simple, + // or one for each flattened index if it is struct type + void buildLiveRange(Value *V); + // buildLiveRange : build live range for given value + genx::LiveRange *buildLiveRange(genx::SimpleValue V); + // rebuildLiveRange : rebuild a live range that only has one value + void rebuildLiveRange(genx::LiveRange *LR); + // removeBale : remove the bale from its live range, and delete the range if + // it now has no values. + void removeBale(genx::Bale &B); + // removeValue : remove the value from its live range, and delete the + // range if it now has no values + void removeValue(Value *V); + void removeValue(genx::SimpleValue V); + // removeValue : remove the value from its live range. Do not delete the + // LR if it now has no values. + genx::LiveRange *removeValueNoDelete(genx::SimpleValue V); + // removeValuesNoDelete : remove all values from the live range, but do not + // delete the LR + void removeValuesNoDelete(genx::LiveRange *LR); + // replaceValue : update liveness such that NewVal has OldVal's live range, + // and OldVal does not have one at all. + void replaceValue(Value *OldVal, Value *NewVal); + void replaceValue(genx::SimpleValue OldVal, genx::SimpleValue(NewVal)); + // Set the LiveRange for a value in the map + void setLiveRange(genx::SimpleValue V, genx::LiveRange *LR); + // Get/create the unified return value for a function + Value *getUnifiedRet(Function *F); + Value *createUnifiedRet(Function *F); + // Test whether a value is a unified return value (and return its Function). + Function *isUnifiedRet(Value *V); + // Move unified return value from OldF to NewF. 
+ void moveUnifiedRet(Function *OldF, Function *NewF); + // copyInterfere : test whether two live ranges copy-interfere + bool copyInterfere(genx::LiveRange *LR1, genx::LiveRange *LR2); + // See if V1 is a phi node and V2 wraps round to a phi use in the same BB after V1's def + static bool wrapsAround(Value *V1, Value *V2); + // Insert a copy of a non-struct value. + Instruction *insertCopy(Value *InputVal, genx::LiveRange *LR, Instruction *InsertBefore, const Twine &Name, unsigned Number); + // eraseUnusedTree : erase unused tree of instructions, and remove from GenXLiveness + void eraseUnusedTree(Instruction *Inst); + // setArgAddressBase : set the base value of an argument indirect address + void setArgAddressBase(Value *Addr, Value *Base) { ArgAddressBaseMap[Addr] = Base; } + // getAddressBase : get the base register of an address + Value *getAddressBase(Value *Addr); + // isBitCastCoalesced : see if the bitcast has been coalesced away + bool isBitCastCoalesced(BitCastInst *BCI); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const override + { return createGenXGroupPrinterPass(O, Banner); } + // Debug dump + void dump(); + using Pass::print; // Indicates we aren't replacing base class version of print + virtual void print(raw_ostream &OS) const; + virtual void releaseMemory() override { clear(); } + +private: + void clear(); + unsigned numberInstructionsInFunc(Function *Func, unsigned Num); + unsigned getPhiOffset(PHINode *Phi) const; + void rebuildLiveRangeForValue(genx::LiveRange *LR, genx::SimpleValue SV); + genx::LiveRange *visitPropagateSLRs(Function *F); + void merge(genx::LiveRange *LR1, genx::LiveRange *LR2); +}; + +void initializeGenXLivenessPass(PassRegistry &); + +// Specialize DenseMapInfo for SimpleValue. 
+template <> struct DenseMapInfo { + static inline genx::SimpleValue getEmptyKey() { + return genx::SimpleValue(DenseMapInfo::getEmptyKey()); + } + static inline genx::SimpleValue getTombstoneKey() { + return genx::SimpleValue(DenseMapInfo::getTombstoneKey()); + } + static unsigned getHashValue(const genx::SimpleValue &SV) { + return DenseMapInfo::getHashValue(SV.getValue()) ^ + DenseMapInfo::getHashValue(SV.getIndex()); + } + static bool isEqual(const genx::SimpleValue &LHS, + const genx::SimpleValue &RHS) { + return LHS == RHS; + } +}; + +} // end namespace llvm +namespace std { +template <> struct hash { + size_t operator()(llvm::genx::Segment const &x) const noexcept { + return x.hash(); + } +}; +} // end namespace std +#endif // GENXLIVENESS_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.cpp new file mode 100644 index 000000000000..a66bb7785bba --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.cpp @@ -0,0 +1,200 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// \file +// Lower aggregate copies, memset, memcpy, memmov intrinsics into loops when +// the size is large or is not a compile-time constant. 
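+//
+// A minimal before/after sketch (illustrative only; the generated IR depends
+// on the expand limit and on pointer address spaces):
+//
+//   before: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 32, i1 false)
+//   after:  %v = load <32 x i8>, <32 x i8>* %src.cast
+//           store <32 x i8> %v, <32 x i8>* %dst.cast
+//
+// Lengths above the expand limit, or lengths that are not compile-time
+// constants, are expanded into an explicit copy loop instead.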
+// +//===----------------------------------------------------------------------===// + +#include "GenXLowerAggrCopies.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/StackProtector.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LowerMemIntrinsics.h" + +#define DEBUG_TYPE "GENX_LOWERAGGRCOPIES" + +using namespace llvm; + +// 8 * 8 * 16 = 8 instructions each read 8 OWords +static cl::opt + ExpandLimitOpt("lower-aggr-copies-expand-limit", + cl::desc("max memcpy/memset/memmove length (in bytes) that " + "is lowered as scalar code"), + cl::init(8 * 8 * 16)); + +namespace { + +// actual analysis class, which is a functionpass +struct GenXLowerAggrCopies : public FunctionPass { + // TODO: more advance analysis + // (at least different values for different arch) + const int ExpandLimit; + static char ID; + + GenXLowerAggrCopies() : FunctionPass(ID), ExpandLimit(ExpandLimitOpt) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved(); + AU.addRequired(); + } + + bool runOnFunction(Function &F) override; + + static const unsigned MaxAggrCopySize = 1; // 128; + + StringRef getPassName() const override { + return "Lower aggregate copies/intrinsics into loops"; + } + + template void expandMemMov2VecLoadStore(T *MemCall); +}; + +char GenXLowerAggrCopies::ID = 0; + +bool GenXLowerAggrCopies::runOnFunction(Function &F) { + SmallVector MemCalls; + + const TargetTransformInfo &TTI = + getAnalysis().getTTI(F); + + // Collect all aggregate loads and mem* calls. + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { + for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; + ++II) { + if (MemIntrinsic *IntrCall = dyn_cast(II)) { + // Convert intrinsic calls with variable size or with constant size + // larger than the MaxAggrCopySize threshold. + if (ConstantInt *LenCI = dyn_cast(IntrCall->getLength())) { + if (LenCI->getZExtValue() >= MaxAggrCopySize) { + MemCalls.push_back(IntrCall); + } + } else { + MemCalls.push_back(IntrCall); + } + } + } + } + + if (MemCalls.size() == 0) { + return false; + } + + // Transform mem* intrinsic calls. 
+ for (MemIntrinsic *MemCall : MemCalls) { + bool doLinearExpand = !MemCall->isVolatile() && isa(MemCall->getLength()) && + cast(MemCall->getLength())->getSExtValue() <= ExpandLimit; + if (MemCpyInst *Memcpy = dyn_cast(MemCall)) { + if (doLinearExpand) { + expandMemMov2VecLoadStore(Memcpy); + } else { + expandMemCpyAsLoop(Memcpy, TTI); + } + } else if (MemMoveInst *Memmove = dyn_cast(MemCall)) { + if (doLinearExpand) { + expandMemMov2VecLoadStore(Memmove); + } else { + expandMemMoveAsLoop(Memmove); + } + } else if (MemSetInst *Memset = dyn_cast(MemCall)) { + if (doLinearExpand) { + llvm::Value *SetVal = Memset->getValue(); + llvm::Value *LenVal = Memset->getLength(); + assert(isa(LenVal)); + assert(SetVal->getType()->getScalarSizeInBits() == 8); + auto Len = (unsigned)cast(LenVal)->getZExtValue(); + auto VecTy = VectorType::get(SetVal->getType(), Len); + Value *WriteOut = UndefValue::get(VecTy); + IRBuilder<> IRB(Memset); + for (unsigned i = 0; i < Len; ++i) { + WriteOut = IRB.CreateInsertElement(WriteOut, SetVal, IRB.getInt32(i)); + } + auto DstAddr = Memset->getRawDest(); + unsigned dstAS = cast(DstAddr->getType())->getAddressSpace(); + auto StorePtrV = + IRB.CreateBitCast(DstAddr, VecTy->getPointerTo(dstAS)); + IRB.CreateStore(WriteOut, StorePtrV); + } else { + expandMemSetAsLoop(Memset); + } + } + MemCall->eraseFromParent(); + } + + return true; +} + +template +void GenXLowerAggrCopies::expandMemMov2VecLoadStore(T *MemCall) { + IRBuilder<> IRB(MemCall); + llvm::Value *LenVal = MemCall->getLength(); + assert(isa(LenVal)); + auto Len = (unsigned)cast(LenVal)->getZExtValue(); + auto DstPtrV = MemCall->getRawDest(); + assert(DstPtrV->getType()->isPointerTy()); + auto I8Ty = cast(DstPtrV->getType())->getElementType(); + assert(I8Ty->isIntegerTy(8)); + auto VecTy = VectorType::get(I8Ty, Len); + auto SrcAddr = MemCall->getRawSource(); + unsigned srcAS = cast(SrcAddr->getType())->getAddressSpace(); + auto LoadPtrV = IRB.CreateBitCast(SrcAddr, VecTy->getPointerTo(srcAS)); + auto ReadIn = IRB.CreateLoad(LoadPtrV); + auto DstAddr = MemCall->getRawDest(); + unsigned dstAS = cast(DstAddr->getType())->getAddressSpace(); + auto StorePtrV = IRB.CreateBitCast(DstAddr, VecTy->getPointerTo(dstAS)); + IRB.CreateStore(ReadIn, StorePtrV); +} + +} // namespace + +namespace llvm { +void initializeGenXLowerAggrCopiesPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(GenXLowerAggrCopies, "genx-lower-aggr-copies", + "Lower aggregate copies, and llvm.mem* intrinsics into loops", + false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(GenXLowerAggrCopies, "genx-lower-aggr-copies", + "Lower aggregate copies, and llvm.mem* intrinsics into loops", + false, false) + +FunctionPass *llvm::createGenXLowerAggrCopiesPass() { + return new GenXLowerAggrCopies(); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.h new file mode 100644 index 000000000000..540aaf32614f --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowerAggrCopies.h @@ -0,0 +1,41 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the 
Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file contains the declaration of the VC specific lowering of +// aggregate copies +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_GENX_GENXLOWERAGGRCOPIES_H +#define LLVM_LIB_TARGET_GENX_GENXLOWERAGGRCOPIES_H + +namespace llvm { +class FunctionPass; + +FunctionPass *createGenXLowerAggrCopiesPass(); +} + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowering.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowering.cpp new file mode 100644 index 000000000000..26e1b4bd8233 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXLowering.cpp @@ -0,0 +1,3071 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXLowering +/// ------------ +/// +/// GenXLowering is a function pass that lowers certain LLVM IR instructions +/// that the rest of the GenX backend cannot deal with, or to implement peephole +/// optimizations. +/// +/// It also performs a few other tasks: +/// +/// 1. It implements add sinking for a variable index in a region/element +/// access. This ensures that, in a sequence of operations to calculate a +/// variable index for a region/element access, any add constant is sunk to +/// the end, such that it can become a constant offset in an indirect +/// operand, and give GenXAddressCommoning more chance to common up address +/// calculations. +/// +/// 2. 
It splits struct values where possible, by splitting all struct phi nodes +/// before running the main pass, then removing an extractvalue by using the +/// corresponding insertvalue's input instead. Any struct value used as an +/// arg or return value still remains, and needs to be dealt with by register +/// allocation. +/// +/// 3. It widens some byte vector operations to short vector. +/// +/// Gen has restrictions on byte operands. The jitter copes with that, but +/// sometimes it needs to do even-odd splitting, which can lead to suboptimal +/// code if cmps and predicates are involved. +/// Here we attempt to pick up the common cases by converting a byte +/// operation to short. +/// +/// Note that we might end up with the extends being baled into the +/// instruction anyway, resulting in a byte operation in vISA. +/// +/// 4. Certain uses of shufflevector are lowered: +/// +/// a. a splat (copy of one element across a vector); +/// b. a boolean slice (extract of a subvector) becomes rdpredregion; +/// c. a boolean unslice (insert subvector) becomes wrpredregion. +/// d. non-boolean shufflevector is lowered to sequence of rd/wrregions +/// +/// The only one case of shufflevector allowed is shufflevector of +/// predicate and undef with replicated mask. +/// +/// 5. A Trunc is lowered to a bitcast then a region/element read with a stride. +/// GenXCoalescing will coalesce the bitcast, and possibly bale in the region +/// read, so this will hopefully save an instruction or two. +/// +/// 6. Certain floating point comparison instructions are lowered. +/// +/// **IR restriction**: LLVM IR instructions not supported after this pass: +/// +/// * insertelement +/// * extractelement +/// * trunc +/// * zext/sext/uitofp from (vector of) i1 +/// * select on vector of i1 +/// * ``llvm.uadd.with.overflow`` (the other +/// overflowing arithmetic intrinsics are not allowed by the GenX backend +/// anyway.) +/// +/// +/// **IR restriction**: all gather/scatter/atomic must have the width supported +/// by the hardware target. +/// +/// **IR restriction**: rdpredregion intrinsic (which is generated by this pass +/// from certain cases of shufflevector, and represents a use of part of a +/// predicate) can only be used in select, wrregion, wrpredpredregion. +/// +/// **IR restriction**: wrpredregion intrinsic (which is generated by this pass +/// from certain cases of shufflevector, and represents the write of part of a +/// predicate) must have a compare as its "new value" input. +/// +/// **IR restriction**: No phi node of struct type after this pass. This is only +/// a general rule; subsequent passes have been known to reintroduce them so +/// GenXLiveness has another go at splitting them up. 
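+///
+/// A minimal before/after sketch of add sinking (case 1 above; names are
+/// illustrative, not taken from a test):
+///
+///   before: %idx = add i16 %base, 12
+///           %v   = rdregion(..., indirect index %idx, ...)
+///   after:  %v   = rdregion(..., indirect index %base, constant offset +12, ...)
+///
+/// so the constant part of the index becomes the constant offset of the
+/// indirect operand, and %base is easier to common up across accesses.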
+/// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXGotoJoin.h" +#include "GenXIntrinsics.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "GenXVisa.h" +#include "visa_igc_common_header.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +#include +#include +#include + +using namespace llvm; +using namespace genx; + +static cl::opt + EnableGenXByteWidening("enable-genx-byte-widening", cl::init(true), + cl::Hidden, cl::desc("Enable GenX byte widening.")); + +namespace { + +// GenXLowering : legalize execution widths and GRF crossing +class GenXLowering : public FunctionPass { + DominatorTree *DT = nullptr; + const GenXSubtarget *ST = nullptr; + SmallVector ToErase; + +public: + static char ID; + explicit GenXLowering() : FunctionPass(ID), DT(nullptr) {} + virtual StringRef getPassName() const { return "GenX lowering"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F); + static bool splitStructPhi(PHINode *Phi); + +private: + bool splitGatherScatter(CallInst *CI, unsigned IID); + bool processTwoAddressOpnd(CallInst *CI); + bool processInst(Instruction *Inst); + bool lowerRdRegion(Instruction *Inst); + bool lowerWrRegion(Instruction *Inst); + bool lowerRdPredRegion(Instruction *Inst); + bool lowerWrPredRegion(Instruction *Inst); + bool lowerInsertElement(Instruction *Inst); + bool lowerExtractElement(Instruction *Inst); + Value *scaleInsertExtractElementIndex(Value *IdxVal, Type *ElTy, + Instruction *InsertBefore); + bool lowerTrunc(Instruction *Inst); + bool lowerCast(Instruction *Inst); + bool lowerBoolScalarSelect(SelectInst *SI); + bool lowerBoolVectorSelect(SelectInst *SI); + bool lowerBoolShuffle(ShuffleVectorInst *Inst); + bool lowerBoolSplat(ShuffleVectorInst *SI, Value *In, unsigned Idx); + bool lowerSelect(SelectInst* SI); + bool lowerShuffle(ShuffleVectorInst *Inst); + void lowerShuffleSplat(ShuffleVectorInst *SI, + ShuffleVectorAnalyzer::SplatInfo Splat); + bool lowerShuffleToSelect(ShuffleVectorInst *Inst); + void lowerShuffleToMove(ShuffleVectorInst *SI); + bool lowerShr(Instruction *Inst); + bool lowerExtractValue(ExtractValueInst *Inst); + bool lowerInsertValue(InsertValueInst *Inst); + bool lowerUAddWithOverflow(CallInst *CI); + bool lowerCtpop(CallInst *CI); + bool lowerFCmpInst(FCmpInst *Inst); + bool widenByteOp(Instruction *Inst); + bool lowerLoadStore(Instruction *Inst); + bool lowerMul64(Instruction *Inst); + bool lowerTrap(CallInst *CI); +}; + +} // end namespace + +char GenXLowering::ID = 0; +namespace llvm { +void initializeGenXLoweringPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXLowering, "GenXLowering", "GenXLowering", false, + false) +INITIALIZE_PASS_END(GenXLowering, "GenXLowering", "GenXLowering", false, false) + +FunctionPass *llvm::createGenXLoweringPass() { + initializeGenXLoweringPass(*PassRegistry::getPassRegistry()); + return new GenXLowering; +} + +void 
GenXLowering::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); +} + +/*********************************************************************** + * GenXLowering::runOnFunction : process one function to + * lower instructions as required for GenX backend. + * + * This does a postordered depth first traversal of the CFG, + * processing instructions within a basic block in reverse, to + * ensure that we see a def after its uses (ignoring phi node uses). + * This helps peephole optimizations which generally want to be + * approached from the top down. For example, add sinking in the index + * of an indirect region/element wants to see the trunc before the trunc + * is lowered to a bitcast and an element access. + */ +bool GenXLowering::runOnFunction(Function &F) { + auto *DTWP = getAnalysisIfAvailable(); + DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto P = getAnalysisIfAvailable(); + ST = P ? P->getSubtarget() : nullptr; + // First split any phi nodes with struct type. + splitStructPhis(&F); + // Create a list of basic blocks in the order we want to process them, before + // we start the lowering. This is because lowering can split a basic block. + SmallVector BBs; + for (auto i = po_begin(&F.getEntryBlock()), e = po_end(&F.getEntryBlock()); + i != e; ++i) + BBs.push_back(*i); + // Process each basic block. + for (auto i = BBs.begin(), e = BBs.end(); i != e; ++i) { + BasicBlock *BB = *i; + // The effect of this loop is that we process the instructions in reverse + // order, and we re-process anything inserted before the instruction + // being processed. + for (Instruction *Inst = BB->getTerminator();;) { + processInst(Inst); + BasicBlock *Parent = Inst->getParent(); + if (Inst != &Parent->front()) + Inst = Inst->getPrevNode(); + else { + if (Parent == BB) + break; + // We have reached the start of the basic block, but it is a different + // basic block to BB, so lowering must have split a BB. Just go back to + // the end of the previous one. + Inst = Parent->getPrevNode()->getTerminator(); + } + } + } + // Erase the instructions that we saved in ToErase. + for (SmallVectorImpl::iterator i = ToErase.begin(), + e = ToErase.end(); + i != e; ++i) + (*i)->eraseFromParent(); + ToErase.clear(); + return true; +} + +// Optimize two address operands if any. +// +// An instruction with a two address opernd should be predicated. If predicate +// is a constant splat, then the old value will be over-written. In this case, +// replace the old value with undef which allows more optimizations to kick in. +// +bool GenXLowering::processTwoAddressOpnd(CallInst *CI) { + int OpNum = getTwoAddressOperandNum(CI); + // Skip write regions whose OpNum is 0. + if (OpNum > 0) { + Type *Ty = CI->getArgOperand(OpNum)->getType(); + assert(Ty == CI->getType() && "two address op type out of sync"); + + for (unsigned i = 0; i < CI->getNumArgOperands(); ++i) { + auto Op = dyn_cast(CI->getArgOperand(i)); + // Check if the predicate operand is all true. + if (Op && Op->getType()->getScalarSizeInBits() == 1) { + if (Op->getType()->isVectorTy()) + Op = Op->getSplatValue(); + if (Op && Op->isOneValue()) { + CI->setOperand(OpNum, UndefValue::get(Ty)); + return true; + } + return false; + } + } + } + + return false; +} + +// Check whether given intrinsic is new load +// without predicate and old value arguments. 
+static bool isNewLoadInst(CallInst *Inst) { + unsigned IID = GenXIntrinsic::getGenXIntrinsicID(Inst); + switch (IID) { + case GenXIntrinsic::genx_gather4_scaled2: + case GenXIntrinsic::genx_gather_scaled2: + return true; + default: + return false; + } +} + +// Find single wrregion user of load instruction. +// Returns nullptr on failure. +static CallInst *getLoadWrregion(CallInst *Inst) { + assert(isNewLoadInst(Inst) && "Expected new load intrinsics"); + if (Inst->getNumUses() != 1) + return nullptr; + + auto *WrR = dyn_cast(Inst->user_back()); + if (!WrR) + return nullptr; + return GenXIntrinsic::isWrRegion(WrR) ? WrR : nullptr; +} + +// Find single select user of load instruction. +// Returns nullptr on failure. +// TODO: maybe just lower every select to wrregion in lowerSelect? +static SelectInst *getLoadSelect(CallInst *Inst) { + assert(isNewLoadInst(Inst) && "Expected new load intrinsics"); + if (Inst->getNumUses() != 1) + return nullptr; + + auto *SI = dyn_cast(Inst->user_back()); + if (!SI) + return nullptr; + // TODO: handle inverted selects. + // Need to regenerate mask in this case. + if (SI->getTrueValue() != Inst) + return nullptr; + return SI; +} + +// Generate predicate for wrregion of splitted load. +// Returns new predicate. +static Value *generatePredicateForLoadWrregion( + Value *OldPred, unsigned Offset, unsigned Width, unsigned NumChannels, + Instruction *InsertBefore, const DebugLoc &DL, const Twine &Name) { + if (isa(OldPred)) + return OldPred; + + Value *Pred = OldPred; + // If old predicate is result of rdpredregion or shufflevector then + // we can reuse their predicate and offset to avoid double read of predicate. + if (GenXIntrinsic::getGenXIntrinsicID(OldPred) == GenXIntrinsic::genx_rdpredregion) { + auto *OldPredInst = cast(OldPred); + Offset += cast(OldPredInst->getArgOperand(1))->getZExtValue(); + Pred = OldPredInst->getArgOperand(0); + } else if (auto *SVI = dyn_cast(OldPred)) { + Offset += + ShuffleVectorAnalyzer::getReplicatedSliceDescriptor(SVI).InitialOffset; + Pred = SVI->getOperand(0); + } + + // Replicate mask across channels. + SmallVector NewMaskVals(Width); + unsigned ChannelWidth = Width / NumChannels; + Type *Int32Ty = IntegerType::getInt32Ty(Pred->getContext()); + for (unsigned i = 0; i < NumChannels; ++i) + std::generate_n(NewMaskVals.begin() + ChannelWidth * i, ChannelWidth, + [Int32Ty, Offset]() mutable { + return ConstantInt::get(Int32Ty, Offset++); + }); + Constant *NewMask = ConstantVector::get(NewMaskVals); + + Value *Undef = UndefValue::get(Pred->getType()); + auto *Res = new ShuffleVectorInst(Pred, Undef, NewMask, Name, InsertBefore); + Res->setDebugLoc(DL); + return Res; +} + +// Generate partial write for result of splitted 1-channel load instruction. +// Initially we could have following sequence for illegal load (on gather_scaled example): +// res = gather_scaled <32> +// mask = rdpredregion <32> pred, offset +// newV = wrregion <32> oldV, res, wroffset, mask +// After splitting we want to get as less extra code as possible. +// To achieve this we generate following pattern: +// bale { +// res1 = gather_scaled <16> +// mask1 = rdpredregion <16> pred, offset +// partialV = wrregion <16> oldV, res1, mask1 +// } +// bale { +// res2 = gather_scaled <16> +// mask2 = rdpredregion <16> pred, offset + 16 +// newV = wrregion <16> partialV, res2, wroffset + 16 * elemsize, mask2 +// } +// Bale markers show how this will be baled later. 
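+// How the replicated predicate mask is built (a plain-C++ model of the
+// std::generate_n loop in generatePredicateForLoadWrregion above; names are
+// illustrative):
+//
+//   unsigned ChannelWidth = Width / NumChannels;
+//   for (unsigned C = 0; C < NumChannels; ++C)
+//     for (unsigned I = 0; I < ChannelWidth; ++I)
+//       Mask[C * ChannelWidth + I] = Offset + I; // same slice per channel
+//
+// i.e. every enabled channel selects the same ChannelWidth-wide slice of the
+// original predicate, starting at element Offset.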
+static Value *generate1ChannelWrrregion(Value *Target, unsigned InitialOffset, + CallInst *Load, Value *OldPred, + unsigned SplitNum, + Instruction *InsertBefore) { + const DebugLoc &DL = Load->getDebugLoc(); + Type *LoadType = Load->getType(); + unsigned LoadWidth = LoadType->getVectorNumElements(); + + Value *Pred = + generatePredicateForLoadWrregion(OldPred, LoadWidth * SplitNum, LoadWidth, + 1, InsertBefore, DL, "load1.pred.split"); + Region WrR(LoadType); + WrR.Mask = Pred; + WrR.Offset = InitialOffset + + LoadWidth * SplitNum * (LoadType->getScalarSizeInBits() / 8); + return WrR.createWrRegion(Target, Load, "load1.join", InsertBefore, DL); +} + +// Generate partial write for result of splitted N-channel load. +// For channelled loads we need to also shuffle result of splitted +// instructions to write back them to destination in expected order. +// Temporary splits should always be predicated in case of atomics +// because latter load and store at the same time. +// Example for gather4_scaled (with two channels enabled). Before: +// res = gather4_scaled <32> RG +// mask = rdpredregion <64> pred, offset ; mask is replicated across channels +// newV = wrregion <64> oldV, res, wroffset, mask +// After: +// bale { +// res1temp = gather4_scaled <16> RG ; create temporary (unnecessary in case of non-atomics) +// splitmask1 = rdpredregion <32> pred, offset ; replicated +// res1 = wrregion <32> undef, res1temp, 0, splitmask1 +// } +// bale { +// res1R = rdregion <16> res1, 0 +// mask1R = rdpredregion <16> pred, offset ; same for all channels +// partialVR = wrregion <16> oldV, res1R, wroffset, mask1R +// } +// bale { +// res1G = rdregion <16> res1, 16 * elemsize +// mask1G = rdpredregion <16> pred, offset +// partialV = wrregion <16> partialVR, res1G, wroffset + 32 * elemsize, mask1G +// } +// bale { +// res2temp = gather4_scaled <16> RG ; second temporary +// splitmask2 = rdpredregion <32> pred, offset + 16 +// res2 = wrregion <32> undef, res2temp, 0, splitmask2 +// } +// bale { +// res2R = rdregion <16> res2, 0 +// mask2R = rdpredregion <16> pred, offset + 16 +// newVR = wrregion <16> partialV, res2R, wroffset + 16 * elemsize, mask2R +// } +// bale { +// res2G = rdregion <16> res2, 16 * elemsize +// mask2G = rdpredregion <16> pred, offset + 16 +// newV = wrregion <16> newVR, res2G, wroffset + 48 * elemsize, mask2G +// } +// As it can be noticed, splitting of channeled loads is quite expensive. +// We should hope that later passes (like region collapsing) can optimize it +// by analyzing how resulting value was assembled. +static Value *generateNChannelWrregion(Value *Target, unsigned InitialOffset, + CallInst *Load, Value *OldPred, + unsigned SplitNum, unsigned NumSplits, + unsigned NumChannels, + Instruction *InsertBefore) { + const DebugLoc &DL = Load->getDebugLoc(); + Type *LoadType = Load->getType(); + unsigned LoadWidth = LoadType->getVectorNumElements(); + unsigned ChannelWidth = LoadWidth / NumChannels; + unsigned MaskOffset = ChannelWidth * SplitNum; + + // Generate temporary for load. + Value *Pred = generatePredicateForLoadWrregion( + OldPred, MaskOffset, LoadWidth, NumChannels, InsertBefore, DL, "loadN.pred.split"); + Region WrR(LoadType); + WrR.Mask = Pred; + Value *SplitRes = WrR.createWrRegion(UndefValue::get(LoadType), Load, + "loadN.split", InsertBefore, DL); + + // Generate shuffle writes to the target. 
+ unsigned ElemByteSize = LoadType->getScalarSizeInBits() / 8; + Type *ShuffleType = VectorType::get(LoadType->getScalarType(), ChannelWidth); + Region ChannelRdR(ShuffleType); + Region ChannelWrR(ShuffleType); + Value *ResChannel = nullptr; + for (unsigned i = 0; i < NumChannels; ++i) { + ChannelRdR.Offset = ChannelWidth * i * ElemByteSize; + ResChannel = ChannelRdR.createRdRegion(SplitRes, "loadN.channel.read.join", + InsertBefore, DL); + Pred = generatePredicateForLoadWrregion(OldPred, MaskOffset, ChannelWidth, + 1, InsertBefore, DL, + "loadN.channel.pred.join"); + ChannelWrR.Offset = + InitialOffset + + (ChannelWidth * SplitNum + ChannelWidth * NumSplits * i) * ElemByteSize; + ChannelWrR.Mask = Pred; + Target = ChannelWrR.createWrRegion(Target, ResChannel, "loadN.channel.join", + InsertBefore, DL); + } + return Target; +} + +// Get target for wrregions of splitted load. +// Returns tuple consisted of: +// 1. Target for wrregions +// 2. Predicate +// 3. Initial offset of target +// 4. Instruction to replace later +static std::tuple +getLoadTarget(CallInst *Load, const GenXSubtarget *ST) { + Value *LoadPred; + if (CallInst *LoadWrr = getLoadWrregion(Load)) { + // If we found wrregion user, then use its predicate for splitted instructions. + LoadPred = + LoadWrr->getArgOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum); + + // If wrregion can be represented as raw operand, we can reuse its target and offset. + if (genx::isValueRegionOKForRaw(LoadWrr, true /* IsWrite */, ST)) { + // TODO: mark wrregion to be erased once issue with ToErase and + // iteration order will be resolved. + Value *Target = + LoadWrr->getArgOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + Value *Offset = + LoadWrr->getArgOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum); + unsigned InitialOffset = cast(Offset)->getZExtValue(); + return {Target, LoadPred, InitialOffset, LoadWrr}; + } + } else if (SelectInst *SI = getLoadSelect(Load)) { + LoadPred = SI->getCondition(); + Value *Target = SI->getFalseValue(); + return {Target, LoadPred, 0, SI}; + } else { + // No wrregion user, load is not predicated. + LoadPred = ConstantInt::get(IntegerType::getInt1Ty(Load->getContext()), 1); + } + + // Create new target for load. + Value *Target = UndefValue::get(Load->getType()); + return {Target, LoadPred, 0, Load}; +} + +/*********************************************************************** + * splitGatherScatter : lower gather/scatter/atomic to the width support + * by the hardware platform. + * + * This performs two functions: + * + * 1. If the operation is wider than what hardware can support, splits it + * into the legal width. + * + * 2. For typed gather4/scatter4, when r or both v and r are zero, replace + * with undef so that they are not encoded in the vISA instruction and the + * message skips them. 
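+ *
+ * A concrete sketch of case 1 (widths chosen for illustration): a 32-wide
+ * genx.svm.gather on a target that executes at most 16 lanes per message is
+ * rewritten as two 16-wide gathers; each split reads a 16-wide slice of the
+ * predicate and address operands, and its result is written back into the
+ * 32-wide destination with a wrregion at the matching element offset.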
+ */ +bool GenXLowering::splitGatherScatter(CallInst *CI, unsigned IID) { + enum { + MASK_IDX = 0, + PRED_IDX = 1, + SURF_IDX = 2, + U_IDX = 3, + DATA_IDX = 6, + NONEED = 11 + }; + + unsigned MaskIdx = NONEED; + unsigned PredIdx = NONEED; + unsigned AddrIdx = NONEED; + unsigned DataIdx = NONEED; + unsigned AtomicSrcIdx = NONEED; + bool IsTyped = false; + int AtomicNumSrc = (-1); // -1 means not-an-atomic + + switch (IID) { + case GenXIntrinsic::genx_typed_atomic_add: + case GenXIntrinsic::genx_typed_atomic_and: + case GenXIntrinsic::genx_typed_atomic_fmax: + case GenXIntrinsic::genx_typed_atomic_fmin: + case GenXIntrinsic::genx_typed_atomic_imax: + case GenXIntrinsic::genx_typed_atomic_imin: + case GenXIntrinsic::genx_typed_atomic_max: + case GenXIntrinsic::genx_typed_atomic_min: + case GenXIntrinsic::genx_typed_atomic_or: + case GenXIntrinsic::genx_typed_atomic_sub: + case GenXIntrinsic::genx_typed_atomic_xchg: + case GenXIntrinsic::genx_typed_atomic_xor: + AtomicSrcIdx = 2; + PredIdx = 0; + AddrIdx = 3; + IsTyped = true; + AtomicNumSrc = 1; + break; + case GenXIntrinsic::genx_typed_atomic_dec: + case GenXIntrinsic::genx_typed_atomic_inc: + PredIdx = 0; + AddrIdx = 2; + IsTyped = true; + AtomicNumSrc = 0; + break; + case GenXIntrinsic::genx_typed_atomic_cmpxchg: + case GenXIntrinsic::genx_typed_atomic_fcmpwr: + AtomicSrcIdx = 2; + PredIdx = 0; + AddrIdx = 4; + IsTyped = true; + AtomicNumSrc = 2; + break; + case GenXIntrinsic::genx_scatter4_typed: + case GenXIntrinsic::genx_gather4_typed: + DataIdx = DATA_IDX; + MaskIdx = MASK_IDX; + PredIdx = PRED_IDX; + AddrIdx = U_IDX; + IsTyped = true; + break; + case GenXIntrinsic::genx_scatter4_scaled: + case GenXIntrinsic::genx_gather4_scaled: + DataIdx = 6; + PredIdx = 0; + MaskIdx = 1; + AddrIdx = 5; + break; + case GenXIntrinsic::genx_gather4_scaled2: + MaskIdx = 0; + AddrIdx = 4; + break; + case GenXIntrinsic::genx_svm_scatter4_scaled: + case GenXIntrinsic::genx_svm_gather4_scaled: + DataIdx = 5; + PredIdx = 0; + MaskIdx = 1; + AddrIdx = 4; + break; + case GenXIntrinsic::genx_scatter_scaled: + case GenXIntrinsic::genx_gather_scaled: + DataIdx = 6; + PredIdx = 0; + AddrIdx = 5; + break; + case GenXIntrinsic::genx_gather_scaled2: + AddrIdx = 4; + break; + case GenXIntrinsic::genx_svm_scatter: + case GenXIntrinsic::genx_svm_gather: + DataIdx = 3; + PredIdx = 0; + AddrIdx = 2; + break; + case GenXIntrinsic::genx_svm_atomic_dec: + case GenXIntrinsic::genx_svm_atomic_inc: + DataIdx = 2; + PredIdx = 0; + AddrIdx = 1; + AtomicNumSrc = 0; + case GenXIntrinsic::genx_svm_atomic_add: + case GenXIntrinsic::genx_svm_atomic_and: + case GenXIntrinsic::genx_svm_atomic_fmax: + case GenXIntrinsic::genx_svm_atomic_fmin: + case GenXIntrinsic::genx_svm_atomic_imax: + case GenXIntrinsic::genx_svm_atomic_imin: + case GenXIntrinsic::genx_svm_atomic_max: + case GenXIntrinsic::genx_svm_atomic_min: + case GenXIntrinsic::genx_svm_atomic_or: + case GenXIntrinsic::genx_svm_atomic_sub: + case GenXIntrinsic::genx_svm_atomic_xchg: + case GenXIntrinsic::genx_svm_atomic_xor: + DataIdx = 3; + PredIdx = 0; + AddrIdx = 1; + AtomicSrcIdx = 2; + AtomicNumSrc = 1; + break; + case GenXIntrinsic::genx_svm_atomic_cmpxchg: + case GenXIntrinsic::genx_svm_atomic_fcmpwr: + DataIdx = 4; + PredIdx = 0; + AddrIdx = 1; + AtomicSrcIdx = 2; + AtomicNumSrc = 2; + break; + case GenXIntrinsic::genx_dword_atomic_add: + case GenXIntrinsic::genx_dword_atomic_and: + case GenXIntrinsic::genx_dword_atomic_fmax: + case GenXIntrinsic::genx_dword_atomic_fmin: + case GenXIntrinsic::genx_dword_atomic_imax: + case 
GenXIntrinsic::genx_dword_atomic_imin: + case GenXIntrinsic::genx_dword_atomic_max: + case GenXIntrinsic::genx_dword_atomic_min: + case GenXIntrinsic::genx_dword_atomic_or: + case GenXIntrinsic::genx_dword_atomic_sub: + case GenXIntrinsic::genx_dword_atomic_xchg: + case GenXIntrinsic::genx_dword_atomic_xor: + DataIdx = 4; + PredIdx = 0; + AddrIdx = 2; + AtomicSrcIdx = 3; + AtomicNumSrc = 1; + break; + case GenXIntrinsic::genx_dword_atomic_cmpxchg: + case GenXIntrinsic::genx_dword_atomic_fcmpwr: + DataIdx = 5; + PredIdx = 0; + AddrIdx = 2; + AtomicSrcIdx = 3; + AtomicNumSrc = 2; + break; + case GenXIntrinsic::genx_dword_atomic_dec: + case GenXIntrinsic::genx_dword_atomic_inc: + DataIdx = 3; + PredIdx = 0; + AddrIdx = 2; + AtomicNumSrc = 0; + break; + + default: + return false; + } + + // nulling unused inputs for typed gather/scatter/atomic + if (IsTyped) { + Constant *V = dyn_cast(CI->getArgOperand(AddrIdx + 1)); + Constant *R = dyn_cast(CI->getArgOperand(AddrIdx + 2)); + // Only continue when R is known to be zero. + if (R && R->isNullValue()) { + CI->setOperand(AddrIdx + 2, UndefValue::get(R->getType())); + if (V && V->isNullValue()) + CI->setOperand(AddrIdx + 1, UndefValue::get(V->getType())); + } + // check if LOD is zero for atomic + if (AtomicNumSrc >= 0) { + Constant *LOD = dyn_cast(CI->getArgOperand(AddrIdx + 3)); + if (LOD && LOD->isNullValue()) + CI->setOperand(AddrIdx + 3, UndefValue::get(LOD->getType())); + } + } + // Deduce intrinsic width: check predicate if exists, then check address vector. + unsigned WidthOperand; + if (PredIdx != NONEED) + WidthOperand = PredIdx; + else if (AddrIdx != NONEED) + WidthOperand = AddrIdx; + else + llvm_unreachable("Cannot infer execution width of intrinsic (checked pred and addr operands)"); + auto Width = CI->getArgOperand(WidthOperand)->getType()->getVectorNumElements(); + unsigned TargetWidth = IsTyped ? 
8 : 16; + if (Width <= TargetWidth) + return false; + assert((Width % TargetWidth) == 0); + auto NumSplits = Width / TargetWidth; + assert(NumSplits == 2 || NumSplits == 4); + unsigned NumChannels = 1; + if (MaskIdx != NONEED) { + NumChannels = (unsigned)cast(CI->getArgOperand(MaskIdx)) + ->getZExtValue(); + NumChannels = (NumChannels & 1) + ((NumChannels & 2) >> 1) + + ((NumChannels & 4) >> 2) + ((NumChannels & 8) >> 3); + } + + unsigned NumBlks = 1; + if (IID == GenXIntrinsic::genx_svm_scatter || + IID == GenXIntrinsic::genx_svm_gather) { + NumBlks = (unsigned)cast(CI->getArgOperand(1))->getZExtValue(); + NumBlks = (1 << NumBlks); + auto ElmSz = CI->getArgOperand(DataIdx)->getType()->getScalarSizeInBits() / 8; + if (ElmSz == 1 && NumBlks < 4) + NumBlks = 4; + else if (ElmSz == 2 && NumBlks < 2) + NumBlks = 2; + } + const DebugLoc &DL = CI->getDebugLoc(); + Value *NewResult = nullptr; + if (CI->getType() && + CI->getType()->isVectorTy() && + CI->getType()->getVectorNumElements() >= Width * NumChannels * NumBlks) { + if (DataIdx != NONEED) + NewResult = CI->getArgOperand(DataIdx); + else + NewResult = UndefValue::get(CI->getType()); + } + + bool IsNewLoad = isNewLoadInst(CI); + Value *LoadPred = nullptr; + unsigned InitialOffset = 0; + Instruction *InstToReplace = CI; + if (IsNewLoad) + std::tie(NewResult, LoadPred, InitialOffset, InstToReplace) = + getLoadTarget(CI, ST); + + for (unsigned i = 0; i < NumSplits; ++i) { + SmallVector Args; + // initialize the args with the old values + for (unsigned ArgI = 0; ArgI < CI->getNumArgOperands(); ++ArgI) + Args.push_back(CI->getArgOperand(ArgI)); + // Predicate + if (PredIdx != NONEED) { + Value *V = CI->getArgOperand(PredIdx); + if (auto C = dyn_cast(V)) + Args[PredIdx] = getConstantSubvector(C, i * TargetWidth, TargetWidth); + else + Args[PredIdx] = Region::createRdPredRegion( + V, i * TargetWidth, TargetWidth, "predsplit", CI, DL); + } + // address source + unsigned NumAddrs = 1; + if (IsTyped) + NumAddrs = (AtomicNumSrc >= 0) ? 4 : 3; + for (unsigned AddrI = 0; AddrI < NumAddrs; ++AddrI) { + Value *V = CI->getArgOperand(AddrIdx + AddrI); + Region R(V); + R.Width = R.NumElements = TargetWidth; + R.Offset = i * TargetWidth * V->getType()->getScalarSizeInBits()/8; // in bytes + Args[AddrIdx + AddrI] = R.createRdRegion(V, "addrsplit", CI, DL); + } + // data source + // We need to construct a new vector with 8 elements per enabled + // color. 
+ if (DataIdx != NONEED) { + Value *V = CI->getArgOperand(DataIdx); + auto DataTy = VectorType::get(V->getType()->getScalarType(), + TargetWidth * NumChannels * NumBlks); + auto ElmSz = V->getType()->getScalarSizeInBits() / 8; + Value *NewVec = UndefValue::get(DataTy); + if (!isa(V)) { + for (unsigned Channel = 0; Channel < NumChannels; ++Channel) { + Region RdR(V); + RdR.Width = RdR.NumElements = TargetWidth * NumBlks; + RdR.Offset = 4 * (Width * NumBlks * Channel + TargetWidth * NumBlks * i); + auto Rd = RdR.createRdRegion(V, "datasplit", CI, DL); + if (NumChannels > 1) { + Region WrR(DataTy); + WrR.Width = WrR.NumElements = TargetWidth * NumBlks; + WrR.Offset = ElmSz * TargetWidth * NumBlks * Channel; + NewVec = WrR.createWrRegion(NewVec, Rd, "datasplit", CI, DL); + } else + NewVec = Rd; + } + } + Args[DataIdx] = NewVec; + } + // atomic source operands + if (AtomicSrcIdx != NONEED) { + for (int SrcI = 0; SrcI < AtomicNumSrc; ++SrcI) { + Value *V = CI->getArgOperand(AtomicSrcIdx + SrcI); + Region R(V); + R.Width = R.NumElements = TargetWidth; + R.Offset = i * TargetWidth * V->getType()->getScalarSizeInBits()/8; // in bytes + Args[AtomicSrcIdx + SrcI] = R.createRdRegion(V, "addrsplit", CI, DL); + } + } + // now create the new narrower instruction + if (NewResult) { + Type *DstTy = nullptr; + if (DataIdx != NONEED) + DstTy = Args[DataIdx]->getType(); + else { + DstTy = VectorType::get(CI->getType()->getScalarType(), + TargetWidth * NumBlks * NumChannels); + } + SmallVector Tys = {DstTy}; + if (PredIdx != NONEED) + Tys.push_back(Args[PredIdx]->getType()); + if (AddrIdx != NONEED) + Tys.push_back(Args[AddrIdx]->getType()); + auto Decl = GenXIntrinsic::getAnyDeclaration( + CI->getParent()->getParent()->getParent(), IID, Tys); + auto *Gather = CallInst::Create(Decl, Args, CI->getName() + ".split", CI); + Gather->setDebugLoc(DL); + if (IsNewLoad) { + if (NumChannels == 1) + NewResult = generate1ChannelWrrregion(NewResult, InitialOffset, + Gather, LoadPred, i, CI); + else + NewResult = + generateNChannelWrregion(NewResult, InitialOffset, Gather, + LoadPred, i, NumSplits, NumChannels, CI); + continue; + } + // Join the results together, starting with the old value. + auto ElmSz = DstTy->getScalarSizeInBits() / 8; + if (NumChannels > 1) { + Region RdR(Gather); + RdR.Width = RdR.NumElements = TargetWidth * NumBlks; + Region WrR(NewResult); + WrR.Width = WrR.NumElements = TargetWidth * NumBlks; + WrR.Mask = Args[PredIdx]; + for (unsigned Channel = 0; Channel != NumChannels; ++Channel) { + RdR.Offset = ElmSz * TargetWidth * NumBlks * Channel; + auto Rd = RdR.createRdRegion(Gather, "joint", CI, DL); + WrR.Offset = 4 * (Width * NumBlks * Channel + TargetWidth * NumBlks * i); + NewResult = WrR.createWrRegion(NewResult, Rd, "join", CI, DL); + } + } else { + Region WrR(NewResult); + WrR.Width = WrR.NumElements = TargetWidth * NumBlks; + WrR.Offset = ElmSz * TargetWidth * NumBlks * i; + WrR.Mask = Args[PredIdx]; + NewResult = WrR.createWrRegion(NewResult, Gather, "join", CI, DL); + } + } else { + assert(CI->use_empty()); + assert(DataIdx != NONEED); + // Create the target-wide scatter instructions. 
+ Type *Tys[] = {Args[PredIdx]->getType(), Args[AddrIdx]->getType(), + Args[DataIdx]->getType()}; + auto Decl = GenXIntrinsic::getAnyDeclaration( + CI->getParent()->getParent()->getParent(), IID, Tys); + auto NewInst = CallInst::Create(Decl, Args, "", CI); + NewInst->setDebugLoc(DL); + } + } + + if (NewResult) + InstToReplace->replaceAllUsesWith(NewResult); + + if (InstToReplace != CI) + ToErase.push_back(InstToReplace); + ToErase.push_back(CI); + return true; +} + + +/*********************************************************************** + * processInst : process one instruction in GenXLowering + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + */ +bool GenXLowering::processInst(Instruction *Inst) { + if (isa(Inst)) + return lowerInsertElement(Inst); + if (isa(Inst)) + return lowerExtractElement(Inst); + if (isa(Inst)) + return lowerTrunc(Inst); + if (isa(Inst)) + return lowerCast(Inst); + if (auto SI = dyn_cast(Inst)) { + if (SI->getType()->getScalarType()->isIntegerTy(1)) { + if (SI->getType() == SI->getCondition()->getType()) + return lowerBoolVectorSelect(SI); + return lowerBoolScalarSelect(SI); + } + // Try lowering a non-bool select to wrregion. If lowerSelect decides + // not to, and it is a byte operation, widen it if necessary. + return lowerSelect(SI) || widenByteOp(SI); + } + if (auto SI = dyn_cast(Inst)) { + if (SI->getType()->getScalarType()->isIntegerTy(1)) + return lowerBoolShuffle(SI); + return lowerShuffle(SI); + } + if (isa(Inst)) { + if (widenByteOp(Inst)) + return true; + if (Inst->getOpcode() == Instruction::AShr || + Inst->getOpcode() == Instruction::LShr) + return lowerShr(Inst); + if (Inst->getOpcode() == Instruction::Mul) + return lowerMul64(Inst); + return false; + } + if (Inst->getOpcode() == Instruction::ICmp) + return widenByteOp(Inst); + else if (auto CI = dyn_cast(Inst)) + return lowerFCmpInst(CI); + if (CallInst *CI = dyn_cast(Inst)) { + if (CI->isInlineAsm()) + return false; + processTwoAddressOpnd(CI); + unsigned IntrinsicID = GenXIntrinsic::not_any_intrinsic; + if (Function *Callee = CI->getCalledFunction()) { + IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(Callee); + assert(CI->getNumArgOperands() < GenXIntrinsicInfo::OPNDMASK); + } + // split gather/scatter/atomic into the width legal to the target + if (splitGatherScatter(CI, IntrinsicID)) + return true; + switch (IntrinsicID) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + return lowerRdRegion(Inst); + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + return lowerWrRegion(Inst); + case GenXIntrinsic::genx_rdpredregion: + return lowerRdPredRegion(Inst); + case GenXIntrinsic::genx_wrpredregion: + return lowerWrPredRegion(Inst); + case GenXIntrinsic::not_any_intrinsic: + break; + case Intrinsic::dbg_value: + case GenXIntrinsic::genx_absf: + case GenXIntrinsic::genx_absi: + break; + default: + case GenXIntrinsic::genx_constantpred: + case GenXIntrinsic::genx_constanti: + case GenXIntrinsic::genx_constantf: + break; // ignore + case GenXIntrinsic::genx_vload: { + if (!Inst->use_empty()) { + Value *Ptr = Inst->getOperand(0); + LoadInst *LI = new LoadInst(Ptr, "", /*volatile*/ true, Inst); + LI->takeName(Inst); + LI->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(LI); + } + ToErase.push_back(Inst); + return true; + } + case GenXIntrinsic::genx_vstore: { + Value *Val = Inst->getOperand(0); + Value *Ptr = Inst->getOperand(1); + auto ST = new StoreInst(Val, Ptr, /*volatile*/ 
true, Inst); + ST->setDebugLoc(Inst->getDebugLoc()); + ToErase.push_back(Inst); + return true; + } + case Intrinsic::trap: + return lowerTrap(CI); + case Intrinsic::ctpop: + return lowerCtpop(CI); + case Intrinsic::uadd_with_overflow: + return lowerUAddWithOverflow(CI); + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + Inst->getContext().emitError( + Inst, "GenX backend cannot handle overflowing intrinsics yet"); + break; + } + return false; + } + if (ExtractValueInst *EV = dyn_cast(Inst)) + return lowerExtractValue(EV); + if (InsertValueInst *IV = dyn_cast(Inst)) + return lowerInsertValue(IV); + if (isa(Inst) || isa(Inst)) + return lowerLoadStore(Inst); + if (isa(Inst)) + Inst->getContext().emitError(Inst, + "GenX backend cannot handle allocas yet"); + return false; +} + +/*********************************************************************** + * lowerRdRegion : handle read region instruction + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * 1. If index is variable do add sinking on it. (This in itself does not + * cause this function to return true, because it does not cause the + * original instruction to be replaced.) + */ +bool GenXLowering::lowerRdRegion(Instruction *Inst) { + // Sink add in address calculation. + Use *U = &Inst->getOperandUse(GenXIntrinsic::GenXRegion::RdIndexOperandNum); + *U = sinkAdd(*U); + return false; +} + +/*********************************************************************** + * lowerWrRegion : handle write region instruction + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * 1. If index is variable do add sinking on it. (This in itself does not + * cause this function to return true, because it does not cause the + * original instruction to be replaced.) + * + * 2. If it is a predicated byte wrregion, see if it can be widened. + */ +bool GenXLowering::lowerWrRegion(Instruction *Inst) { + // Sink add in address calculation. + Use *U = &Inst->getOperandUse(GenXIntrinsic::GenXRegion::WrIndexOperandNum); + *U = sinkAdd(*U); + // See if a predicated byte wrregion can be widened. + return widenByteOp(Inst); +} + +/*********************************************************************** + * lowerRdPredRegion : handle read predicate region instruction + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * rdpredregion is a GenX backend internal intrinsic, and was thus created + * within this GenXLowering pass. However it is considered legal only if its + * uses are all in select or wrregion or wrpredpredregion; if not we lower + * it further here. If a use is in rdpredregion, we need to combine the two + * rdpredregions into one. 
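+ *
+ * Sketch of the fallback lowering (illustrative, assuming a read of Size
+ * elements starting at Start from a 32-wide predicate %in):
+ *   %wide = zext <32 x i1> %in to <32 x i16>
+ *   %rd   = rdregion of %wide reading Size elements from element Start
+ *   %res  = icmp ne %rd, zeroinitializer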
+ */ +bool GenXLowering::lowerRdPredRegion(Instruction *Inst) { + SmallVector RdPredRegionUsers; + bool Ok = true; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (isa(User)) + continue; + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(User); + if (GenXIntrinsic::isWrRegion(IID)) + continue; + if (IID == GenXIntrinsic::genx_wrpredpredregion) + continue; + if (IID == GenXIntrinsic::genx_rdpredregion) { + RdPredRegionUsers.push_back(cast(User)); + continue; + } + if (IID == GenXIntrinsic::not_any_intrinsic) { + Ok = false; + break; + } + if (cast(User)->doesNotAccessMemory()) { + Ok = false; + break; + } + } + unsigned Start = cast(Inst->getOperand(1))->getZExtValue(); + unsigned Size = Inst->getType()->getVectorNumElements(); + if (Ok) { + // All uses in select/wrregion/rdpredregion/non-ALU intrinsic, so we can + // keep the rdpredregion. Check for uses in another rdpredregion; we need + // to combine those. + for (auto ui = RdPredRegionUsers.begin(), ue = RdPredRegionUsers.end(); + ui != ue; ++ui) { + auto User = *ui; + unsigned UserStart = + cast(User->getOperand(1))->getZExtValue(); + unsigned UserSize = User->getType()->getVectorNumElements(); + auto Combined = + Region::createRdPredRegion(Inst->getOperand(0), Start + UserStart, + UserSize, "", User, User->getDebugLoc()); + Combined->takeName(User); + User->replaceAllUsesWith(Combined); + ToErase.push_back(User); + } + return false; + } + // Need to lower it further. + const DebugLoc &DL = Inst->getDebugLoc(); + // Convert input to vector of short. + auto In = Inst->getOperand(0); + Type *I16Ty = Type::getInt16Ty(Inst->getContext()); + Type *InI16Ty = VectorType::get(I16Ty, In->getType()->getVectorNumElements()); + auto InI16 = CastInst::Create(Instruction::ZExt, In, InI16Ty, + Inst->getName() + ".lower1", Inst); + InI16->setDebugLoc(DL); + // Use rdregion to extract the region. + Region R(InI16); + R.getSubregion(Start, Size); + auto Rd = R.createRdRegion(InI16, Inst->getName() + ".lower3", Inst, DL); + // Convert back to predicate. + auto Res = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, Rd, + Constant::getNullValue(Rd->getType()), + Inst->getName() + ".lower4", Inst); + Res->setDebugLoc(DL); + // Replace uses and erase. + Inst->replaceAllUsesWith(Res); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * lowerWrPredRegion : handle write predicate region instruction + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * wrpredregion is a GenX backend internal intrinsic, and was thus created + * within this GenXLowering pass. However it is considered legal only if its + * "new value" input is a compare; if not we lower it further here. + */ +bool GenXLowering::lowerWrPredRegion(Instruction *Inst) { + auto NewVal = Inst->getOperand(1); + if (isa(NewVal)) + return false; + // Need to lower it further. + const DebugLoc &DL = Inst->getDebugLoc(); + // Convert "old value" input to vector of short. + auto OldVal = Inst->getOperand(0); + Type *I16Ty = Type::getInt16Ty(Inst->getContext()); + Type *OldValI16Ty = + VectorType::get(I16Ty, OldVal->getType()->getVectorNumElements()); + auto OldValI16 = CastInst::Create(Instruction::ZExt, OldVal, OldValI16Ty, + Inst->getName() + ".lower1", Inst); + OldValI16->setDebugLoc(DL); + // Convert "new value" input to vector of short. 
+ Type *NewValI16Ty = + VectorType::get(I16Ty, NewVal->getType()->getVectorNumElements()); + auto NewValI16 = CastInst::Create(Instruction::ZExt, NewVal, NewValI16Ty, + Inst->getName() + ".lower2", Inst); + NewValI16->setDebugLoc(DL); + // Use wrregion to write the new value into the old value. + Region R(OldValI16); + R.getSubregion(cast(Inst->getOperand(2))->getZExtValue(), + NewValI16Ty->getVectorNumElements()); + auto Wr = R.createWrRegion(OldValI16, NewValI16, Inst->getName() + ".lower3", + Inst, DL); + // Convert back to predicate. + auto Res = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, Wr, + Constant::getNullValue(Wr->getType()), + Inst->getName() + ".lower4", Inst); + Res->setDebugLoc(DL); + // Replace uses and erase. + Inst->replaceAllUsesWith(Res); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * lowerInsertElement : lower InsertElement to wrregion, multiplying the + * index by the element size + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + */ +bool GenXLowering::lowerInsertElement(Instruction *Inst) { + Instruction *NewInst = NULL; + // Special case - if the result has 1 element (usually turning scalar into 1 + // element vector) then simply transform the insert element into a bitcast We + // don't need to worry about the index since if it is not zero the result is + // undef anyway (and can be set to anything we like) We also don't need to + // worry about what the original vector is (usually undef) since it will be + // overwritten or undef + VectorType *VT = dyn_cast(Inst->getType()); + assert(VT); + unsigned NumElements = VT->getNumElements(); + const DebugLoc &DL = Inst->getDebugLoc(); + if (NumElements == 1) { + Value *ToInsert = Inst->getOperand(1); + NewInst = CastInst::Create(Instruction::BitCast, ToInsert, VT, + Inst->getName(), Inst); + NewInst->setDebugLoc(DL); + } else if (!Inst->getType()->getScalarType()->isIntegerTy(1)) { + // Cast and scale the index. + Value *IdxVal = scaleInsertExtractElementIndex( + Inst->getOperand(2), Inst->getOperand(1)->getType(), Inst); + // Sink adds in the address calculation. + IdxVal = sinkAdd(IdxVal); + // Create the new wrregion + Value *Src = Inst->getOperand(1); + Region R(Src); + R.Indirect = IdxVal; + NewInst = cast(R.createWrRegion( + Inst->getOperand(0), Src, Inst->getName(), Inst /*InsertBefore*/, DL)); + } else { + // Boolean insertelement. We have to cast everything to i16, do the + // insertelement, and cast it back again. All this gets further lowered + // subsequently. + auto I16Ty = Type::getIntNTy(Inst->getContext(), 16); + auto VecTy = + VectorType::get(I16Ty, Inst->getType()->getVectorNumElements()); + auto CastVec = + CastInst::Create(Instruction::ZExt, Inst->getOperand(0), VecTy, + Inst->getOperand(0)->getName() + ".casti16", Inst); + CastVec->setDebugLoc(DL); + auto CastEl = + CastInst::Create(Instruction::ZExt, Inst->getOperand(1), I16Ty, + Inst->getOperand(1)->getName() + ".casti16", Inst); + CastEl->setDebugLoc(DL); + auto NewInsert = InsertElementInst::Create(CastVec, CastEl, + Inst->getOperand(2), "", Inst); + NewInsert->takeName(Inst); + NewInsert->setDebugLoc(DL); + NewInst = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, NewInsert, + Constant::getNullValue(VecTy), + NewInsert->getName() + ".casti1", Inst); + NewInst->setDebugLoc(DL); + } + // Change uses and mark the old inst for erasing. 
+ Inst->replaceAllUsesWith(NewInst); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * lowerExtractElement : lower ExtractElement to rdregion, multiplying the + * index by the element size + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + */ +bool GenXLowering::lowerExtractElement(Instruction *Inst) { + Instruction *NewInst = nullptr; + if (!Inst->getType()->isIntegerTy(1)) { + // Cast and scale the index. + Type *ElTy = Inst->getType(); + Value *IdxVal = + scaleInsertExtractElementIndex(Inst->getOperand(1), ElTy, Inst); + // Sink adds in the address calculation. + IdxVal = sinkAdd(IdxVal); + // Create the new rdregion. + Region R(Inst); + R.Indirect = IdxVal; + NewInst = R.createRdRegion(Inst->getOperand(0), Inst->getName(), + Inst /*InsertBefore*/, Inst->getDebugLoc(), + true /*AllowScalar*/); + } else { + // Boolean extractelement. We have to cast everything to i16, do the + // extractelement, and cast it back again. All this gets further lowered + // subsequently. + auto I16Ty = Type::getIntNTy(Inst->getContext(), 16); + auto VecTy = VectorType::get( + I16Ty, Inst->getOperand(0)->getType()->getVectorNumElements()); + auto CastVec = + CastInst::Create(Instruction::ZExt, Inst->getOperand(0), VecTy, + Inst->getOperand(0)->getName() + ".casti16", Inst); + const DebugLoc &DL = Inst->getDebugLoc(); + CastVec->setDebugLoc(DL); + auto NewExtract = + ExtractElementInst::Create(CastVec, Inst->getOperand(1), "", Inst); + NewExtract->takeName(Inst); + NewExtract->setDebugLoc(DL); + NewInst = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_NE, NewExtract, + Constant::getNullValue(I16Ty), + NewExtract->getName() + ".casti1", Inst); + NewInst->setDebugLoc(DL); + } + // Change uses and mark the old inst for erasing. + Inst->replaceAllUsesWith(NewInst); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * scaleInsertExtractElementIndex : scale index by element byte size, + * and ensure it is an i16 + */ +Value *GenXLowering::scaleInsertExtractElementIndex(Value *IdxVal, Type *ElTy, + Instruction *InsertBefore) { + // Do the cast and multiply. + unsigned ElementBytes = ElTy->getPrimitiveSizeInBits() / 8; + IntegerType *I16Ty = Type::getInt16Ty(IdxVal->getContext()); + if (ConstantInt *CI = dyn_cast(IdxVal)) + return ConstantInt::get(I16Ty, CI->getSExtValue() * ElementBytes); + // Ensure the variable offset is i16. + Instruction *IdxInst = CastInst::CreateIntegerCast( + IdxVal, I16Ty, false /*isSigned*/, "cast", InsertBefore); + IdxInst->setDebugLoc(InsertBefore->getDebugLoc()); + // Multiply it by the element size in bytes. + if (ElementBytes != 1) { + IdxInst = BinaryOperator::Create( + Instruction::Shl, IdxInst, + ConstantInt::get(I16Ty, genx::log2(ElementBytes)), "scale", + InsertBefore); + IdxInst->setDebugLoc(InsertBefore->getDebugLoc()); + } + return IdxInst; +} + +/*********************************************************************** + * lowerTrunc : lower a TruncInst + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * A Trunc is lowered to a bitcast then a region/element read with a stride. + * GenXCoalescing will coalesce the bitcast, so this will hopefully save + * an instruction. 
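+ * (Illustratively, "trunc <8 x i32> %v to <8 x i16>" becomes a bitcast of %v
+ * to <16 x i16> followed by an rdregion of 8 elements with a stride of 2.)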
+ */ +bool GenXLowering::lowerTrunc(Instruction *Inst) { + Value *InValue = Inst->getOperand(0); + // Check for the trunc's input being a sext/zext where the original element + // size is the same as the result of the trunc. We can just remove the + // whole thing then. (This can arise from GenXReduceIntSize.) + if (auto CI = dyn_cast(InValue)) { + if ((isa(CI) || isa(CI)) && + CI->getOperand(0)->getType() == Inst->getType()) { + // Just replace uses with the original unextended value. + Inst->replaceAllUsesWith(CI->getOperand(0)); + ToErase.push_back(Inst); + return true; + } + } + + // Lower "trunc i8 %v to i1" into "cmp.ne (%v & 1), 0" + if (Inst->getType()->isIntOrIntVectorTy(1)) { + IRBuilder<> Builder(Inst); + auto V = + Builder.CreateAnd(InValue, ConstantInt::get(InValue->getType(), 1)); + V = Builder.CreateICmpNE(V, ConstantInt::get(V->getType(), 0)); + if (auto I = dyn_cast(V)) + I->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(V); + ToErase.push_back(Inst); + return true; + } + + Type *InElementTy = InValue->getType(); + Type *OutElementTy = Inst->getType(); + unsigned NumElements = 1; + if (VectorType *VT = dyn_cast(InElementTy)) { + InElementTy = VT->getElementType(); + OutElementTy = cast(OutElementTy)->getElementType(); + NumElements = VT->getNumElements(); + } + + // Lower "trunc <32 x i16> %v to <32 x i1>" into "cmp.ne (%v & 1), 0" + if (NumElements > 1 && OutElementTy->isIntegerTy(1)) { + IRBuilder<> Builder(Inst); + unsigned N = NumElements; + Value *Os = ConstantVector::getSplat(N, ConstantInt::get(InElementTy, 1)); + Value *Zs = ConstantVector::getSplat(N, ConstantInt::get(InElementTy, 0)); + auto V = Builder.CreateAnd(InValue, Os); + if (auto I = dyn_cast(V)) + I->setDebugLoc(Inst->getDebugLoc()); + V = Builder.CreateICmpNE(V, Zs); + if (auto I = dyn_cast(V)) + I->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(V); + ToErase.push_back(Inst); + return true; + } + + assert(OutElementTy->getPrimitiveSizeInBits()); + unsigned Stride = InElementTy->getPrimitiveSizeInBits() / + OutElementTy->getPrimitiveSizeInBits(); + // Create the new bitcast. + Instruction *BC = + CastInst::Create(Instruction::BitCast, InValue, + VectorType::get(OutElementTy, Stride * NumElements), + Inst->getName(), Inst /*InsertBefore*/); + BC->setDebugLoc(Inst->getDebugLoc()); + // Create the new rdregion. + Region R(BC); + R.NumElements = NumElements; + R.Stride = Stride; + R.Width = NumElements; + R.VStride = R.Stride * R.Width; + Instruction *NewInst = R.createRdRegion( + BC, Inst->getName(), Inst /*InsertBefore*/, Inst->getDebugLoc(), + !isa(Inst->getType()) /*AllowScalar*/); + // Change uses and mark the old inst for erasing. + Inst->replaceAllUsesWith(NewInst); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * lowerCast : lower a CastInst + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + */ +bool GenXLowering::lowerCast(Instruction *Inst) { + // If it is zext/sext/UIToFP from (vector of) i1, turn into a select. 
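+  // (Illustratively, "zext <8 x i1> %p to <8 x i32>" becomes a select of
+  // splat 1 against splat 0 under %p; sext selects -1 instead of 1.)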
+ if (Inst->getOperand(0)->getType()->getScalarType()->isIntegerTy(1) && + Inst->getOpcode() != Instruction::BitCast) { + int OneVal = 0; + switch (Inst->getOpcode()) { + case Instruction::ZExt: + OneVal = 1; + break; + case Instruction::SExt: + OneVal = -1; + break; + case Instruction::UIToFP: + OneVal = 1; + break; + default: + assert(0 && "unknown opcode in lowerCast"); + } + + Instruction *NewInst; + if (Inst->getType()->isFPOrFPVectorTy()) + NewInst = SelectInst::Create( + Inst->getOperand(0), ConstantFP::get(Inst->getType(), OneVal), + ConstantFP::get(Inst->getType(), 0), Inst->getName(), Inst); + else + NewInst = SelectInst::Create( + Inst->getOperand(0), ConstantInt::get(Inst->getType(), OneVal), + ConstantInt::get(Inst->getType(), 0), Inst->getName(), Inst); + NewInst->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(NewInst); + ToErase.push_back(Inst); + return true; + } + return false; +} + +/*********************************************************************** + * lowerSelect : lower a non-i1 select + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * Lower select into predicated wrr. This transform is profitable + * if we can bale into resulting wrr later + */ +bool GenXLowering::lowerSelect(SelectInst *SI) { + assert(SI); + + if (!isa(SI->getOperand(0)->getType())) + return false; // scalar selector + + // Do not lower byte select, because byte wrr then will be widened + if (SI->getTrueValue()->getType()->getScalarType()->isIntegerTy(8)) + return false; + + Value *Cond = SI->getCondition(); + Value *TrueVal = SI->getTrueValue(); + Value *FalseVal = SI->getFalseValue(); + + // Do not transform if one of the sources is constant. + // Now post-legalization generarates redundant moves for constants. + // It's also required for correct baling of function pointers' PtrToInts + // into select. + // This check can be relaxed. + if (isa(TrueVal) || isa(FalseVal)) + return false; + + // If select is used by unmasked wrr than we do not apply transformation too + // because wrr+wrr is not optimal. In this case select itself will bale into + // wrr. There might be some cases where wrr user of + // select can be eliminated too. + if (SI->hasOneUse() && GenXIntrinsic::isWrRegion(SI->user_back())) { + auto *I = cast(SI->user_back()); + if ((I->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum) == SI) && + !Region(I, BaleInfo()).Mask) + return false; + } + + // GenXPatternMatch tries to convert cmp + select + // into min/max instructions. So do not transform in this case + // This check can be relaxed too. 
+ if (isa(Cond)) + return false; + + bool TrueValUsedOnce = TrueVal->hasOneUse(); + bool FalseValUsedOnce = FalseVal->hasOneUse(); + + // Baling produces better code if balable + // value has single use + if (!FalseValUsedOnce && !TrueValUsedOnce) + return false; + + // So select this value + bool InvertPred = false; + Value *OldWrrVal = FalseVal; + Value *NewWrrVal = TrueVal; + if (GotoJoin::isEMValue(Cond) && !TrueValUsedOnce) { + // Conversion only for true val if EM since + // EM is implicit, inverting it will require extra instructions + return false; + } + + if (FalseValUsedOnce && !TrueValUsedOnce) { + std::swap(OldWrrVal, NewWrrVal); + InvertPred = true; + } + + // Main check: profitable only if we can bale later + Region R(SI); + R.Mask = Cond; + if (!GenXBaling::isBalableNewValueIntoWrr(NewWrrVal, R, ST)) + return false; + + // Inverting predicate if false value of select was choosen + // as new value for wrr + if (InvertPred) { + R.Mask = BinaryOperator::Create( + Instruction::Xor, R.Mask, Constant::getAllOnesValue(R.Mask->getType()), + SI->getName() + ".invertpred", SI); + cast(R.Mask)->setDebugLoc(SI->getDebugLoc()); + } + + auto NewWrRegion = cast(R.createWrRegion( + OldWrrVal, NewWrrVal, SI->getName() + ".lower", SI, SI->getDebugLoc())); + SI->replaceAllUsesWith(NewWrRegion); + ToErase.push_back(SI); + return true; +} + +/*********************************************************************** + * lowerBoolScalarSelect : lower a SelectInst on vector of i1 + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * This is a select on vector of i1 where the condition is scalar. This only + * happens in simd control flow where an LLVM pass has optimized away the + * conditional branch. We restore the conditional branch and create an + * if..else..endif. + */ +bool GenXLowering::lowerBoolScalarSelect(SelectInst *SI) { + // BB1 + // / | + // false / | true + // / | + // BB2 | + // \ | + // \ | + // \ | + // BB4 + // + auto BB1 = SI->getParent(); + auto BB2 = SplitBlock(BB1, SI, DT); + auto BB4 = SplitEdge(BB1, BB2, DT); + BB2->setName("select.false"); + BB4->setName("select.true"); + + auto OldTerm = BB1->getTerminator(); + BranchInst::Create(BB4, BB2, SI->getCondition(), OldTerm); + OldTerm->eraseFromParent(); + // Since additional edge is added between BB1 and BB4 instead of through BB2 + // only. BB4 is not immediately dominated by BB2 anymore. Instead, BB4 is + // dominated by BB1 immediately. + if (DT) + DT->changeImmediateDominator(BB4, BB1); + // Replace 'select' with 'phi' + auto Phi = PHINode::Create(SI->getType(), /*NumReservedValues=*/2, "", + &BB4->front()); + Phi->takeName(SI); + Phi->addIncoming(SI->getTrueValue(), BB1); + Phi->addIncoming(SI->getFalseValue(), BB2); + SI->replaceAllUsesWith(Phi); + ToErase.push_back(SI); + // Split the (critical) edge from BB1 to BB4 to avoid having critical edge. + auto BB3 = SplitEdge(BB1, BB4, DT); + BB3->setName("select.crit"); + return true; +} + +/*********************************************************************** + * lowerBoolVectorSelect : lower a SelectInst on (vector of) i1 + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * A select on (vector of) i1 is lowered to the equivalent and/or/xor + * instructions. No simplification is done even if an input is a constant. 
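+ * (That is, illustratively, "select %c, %a, %b" becomes
+ * "(%a & %c) | (%b & ~%c)".)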
+ * + * However, if the selector looks like an EM value, and the "true" operand is + * a cmp, it is instead lowered to an llvm.genx.wrpredpredregion. Baling will + * bale the cmp into it, resulting in a masked cmp instruction that sets bits + * of the flag only if the corresponding EM bit is set. + * + * FIXME: I have seen a case where the two inputs are all false and all true. + * Rather than try and simplify that here in the GenX backend, we should + * try and work out how to stop LLVM generating it in the first place. + */ +bool GenXLowering::lowerBoolVectorSelect(SelectInst *Inst) { + if (isa(Inst->getTrueValue())) { + // Check for the condition being an EM value. It might be a shufflevector + // that slices the EM value at index 0. + bool IsEM = GotoJoin::isEMValue(Inst->getCondition()); + if (!IsEM) { + if (auto SV = dyn_cast(Inst->getCondition())) { + ShuffleVectorAnalyzer SVA(SV); + if (!SVA.getAsSlice()) { + // Slice at index 0. + IsEM = GotoJoin::isEMValue(SV->getOperand(0)); + } + } + } + if (IsEM) { + // Can be lowered to llvm.genx.wrpredpredregion. It always has an index of + // 0 and the "new value" operand the same vector width as the whole vector + // here. That might get changed if it is split up in legalization. + auto NewInst = Region::createWrPredPredRegion( + Inst->getFalseValue(), Inst->getTrueValue(), 0, Inst->getCondition(), + "", Inst, Inst->getDebugLoc()); + NewInst->takeName(Inst); + Inst->replaceAllUsesWith(NewInst); + ToErase.push_back(Inst); + return true; + } + } + // Normal lowering to some bit twiddling. + Instruction *NewInst1 = + BinaryOperator::Create(BinaryOperator::And, Inst->getOperand(0), + Inst->getOperand(1), Inst->getName(), Inst); + NewInst1->setDebugLoc(Inst->getDebugLoc()); + Instruction *NewInst2 = BinaryOperator::Create( + BinaryOperator::Xor, Inst->getOperand(0), + Constant::getAllOnesValue(Inst->getType()), Inst->getName(), Inst); + NewInst2->setDebugLoc(Inst->getDebugLoc()); + Instruction *NewInst3 = + BinaryOperator::Create(BinaryOperator::And, Inst->getOperand(2), NewInst2, + Inst->getName(), Inst); + NewInst3->setDebugLoc(Inst->getDebugLoc()); + Instruction *NewInst4 = BinaryOperator::Create( + BinaryOperator::Or, NewInst1, NewInst3, Inst->getName(), Inst); + NewInst4->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(NewInst4); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * lowerBoolShuffle : lower a shufflevector (element type i1) + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * We handle three cases: + * + * 1. A slice of the vector, which can be turned into rdpredregion. + * + * 2. A splat. By default we need to lower that to a select to + * 0 or -1 then a bitcast to the vector of i1. But if the input is the + * result of a cmp then we can splat the cmp as an optimization. + * + * 3. An unslice of the vector, which can be turned into wrpredregion. + */ +bool GenXLowering::lowerBoolShuffle(ShuffleVectorInst *SI) { + ShuffleVectorAnalyzer SVA(SI); + // 1. Check for a slice. + int SliceStart = SVA.getAsSlice(); + if (SliceStart >= 0) { + unsigned Width = SI->getType()->getVectorNumElements(); + auto RPR = Region::createRdPredRegion(SI->getOperand(0), SliceStart, Width, + "", SI, SI->getDebugLoc()); + RPR->takeName(SI); + SI->replaceAllUsesWith(RPR); + ToErase.push_back(SI); + return true; + } + // 2. Check for a splat. 
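+  // (Illustratively, a shufflevector of <8 x i1> whose mask is all zeros
+  // broadcasts element 0 of its first operand.)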
+ auto Splat = SVA.getAsSplat(); + if (Splat.Input) + return lowerBoolSplat(SI, Splat.Input, Splat.Index); + // 3. Check for an unslice. The "old value" input is operand 0 of the + // shufflevector; the "new value" input is operand 0 of the shufflevector + // that is operand 1 of SI. We create a wrpredregion, but GenXLowering might + // subsequently decide that it is illegal because its "new value" input is not + // a compare, in which case it is further lowered. + int UnsliceStart = SVA.getAsUnslice(); + if (UnsliceStart >= 0) { + auto InnerSI = cast(SI->getOperand(1)); + auto WPR = + Region::createWrPredRegion(SI->getOperand(0), InnerSI->getOperand(0), + UnsliceStart, "", SI, SI->getDebugLoc()); + WPR->takeName(SI); + SI->replaceAllUsesWith(WPR); + // Undef out the operand for InnerSI in SI, so we can directly erase InnerSI + // if SI was its only use. + SI->setOperand(1, UndefValue::get(InnerSI->getType())); + ToErase.push_back(SI); + if (InnerSI->use_empty()) + InnerSI->eraseFromParent(); + return true; + } + + // Do not lower replicated slices. + if (SVA.isReplicatedSlice()) + return false; + + // No other cases handled. + SI->getContext().emitError( + SI, "general bool shuffle vector instruction not implemented"); + return false; +} + +/*********************************************************************** + * lowerBoolSplat : lower a shufflevector (element type i1) that is a splat + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + */ +bool GenXLowering::lowerBoolSplat(ShuffleVectorInst *SI, Value *In, + unsigned Idx) { + unsigned Width = SI->getType()->getVectorNumElements(); + if (isa(In->getType())) { + IRBuilder<> B(SI); + Constant *C1 = ConstantVector::getSplat(Width, B.getInt16(1)); + Constant *C0 = ConstantVector::getSplat(Width, B.getInt16(0)); + Value *V = B.CreateSelect(In, C1, C0); + Region R(V); + R.NumElements = Width; + R.Stride = 0; + R.VStride = 0; + R.Offset = (int)Idx; + V = R.createRdRegion(V, "splat", SI, SI->getDebugLoc()); + V = B.CreateICmpNE(V, C0); + SI->replaceAllUsesWith(V); + ToErase.push_back(SI); + return true; + } + // This is a splat. See if the input is a cmp, possibly via a bitcast. + if (auto BC = dyn_cast(In)) + In = BC->getOperand(0); + if (auto Cmp = dyn_cast(In)) { + // Create a splatted version of the cmp. + Value *CmpOpnds[2]; + Region R(Cmp->getOperand(0)); + R.NumElements = Width; + R.Width = R.NumElements; + R.Stride = 0; + R.VStride = 0; + for (unsigned i = 0; i != 2; ++i) { + auto Opnd = Cmp->getOperand(i); + if (auto C = dyn_cast(Opnd)) { + CmpOpnds[i] = ConstantVector::getSplat(R.NumElements, C); + continue; + } + if (!isa(Opnd->getType())) { + auto NewBC = CastInst::Create(Instruction::BitCast, Opnd, + VectorType::get(Opnd->getType(), 1), + Opnd->getName() + ".bc", Cmp); + NewBC->setDebugLoc(Cmp->getDebugLoc()); + Opnd = NewBC; + } + CmpOpnds[i] = + R.createRdRegion(Opnd, Cmp->getOperand(i)->getName() + ".splat", + Cmp /*InsertBefore*/, Cmp->getDebugLoc()); + } + auto NewCmp = CmpInst::Create( + Cmp->getOpcode(), Cmp->getPredicate(), CmpOpnds[0], CmpOpnds[1], + Cmp->getName() + ".splat", Cmp /*InsertBefore*/); + NewCmp->setDebugLoc(Cmp->getDebugLoc()); + SI->replaceAllUsesWith(NewCmp); + ToErase.push_back(SI); + return true; + } + // Default code. Select int and bitcast to vector of i1. + if (isa(In->getType())) { + // First convert v1i1 to i1. 
+ auto NewBC = CastInst::Create(Instruction::BitCast, In, + In->getType()->getScalarType(), + In->getName() + ".scalar", SI); + NewBC->setDebugLoc(SI->getDebugLoc()); + In = NewBC; + } + if (Width == 8 || Width == 16 || Width == 32) { + auto IntTy = Type::getIntNTy(SI->getContext(), Width); + auto Sel = SelectInst::Create(In, Constant::getAllOnesValue(IntTy), + Constant::getNullValue(IntTy), + SI->getName() + ".sel", SI); + Sel->setDebugLoc(SI->getDebugLoc()); + auto NewBC = + CastInst::Create(Instruction::BitCast, Sel, SI->getType(), "", SI); + NewBC->takeName(SI); + NewBC->setDebugLoc(SI->getDebugLoc()); + SI->replaceAllUsesWith(NewBC); + ToErase.push_back(SI); + return true; + } + + IRBuilder<> Builder(SI); + auto Val = Builder.CreateSelect(In, Builder.getInt16(1), Builder.getInt16(0), + SI->getName() + ".sel"); + if (auto Inst = dyn_cast(Val)) + Inst->setDebugLoc(SI->getDebugLoc()); + Val = Builder.CreateBitCast(Val, VectorType::get(Builder.getInt16Ty(), 1)); + if (auto Inst = dyn_cast(Val)) + Inst->setDebugLoc(SI->getDebugLoc()); + + Region R(Val); + R.Offset = 0; + R.Width = 1; + R.Stride = R.VStride = 0; + R.NumElements = Width; + Val = R.createRdRegion(Val, "", SI, SI->getDebugLoc()); + Val = Builder.CreateICmpNE(Val, ConstantVector::getNullValue(Val->getType())); + Val->takeName(SI); + if (auto Inst = dyn_cast(Val)) + Inst->setDebugLoc(SI->getDebugLoc()); + SI->replaceAllUsesWith(Val); + ToErase.push_back(SI); + return true; +} + +/*********************************************************************** + * lowerShuffleSplat : lower a ShuffleInst (element type not i1) when it is + * a splat (repetition of the same element) + */ +void GenXLowering::lowerShuffleSplat(ShuffleVectorInst *SI, + ShuffleVectorAnalyzer::SplatInfo Splat) { + // This is a splat. Turn it into a splatting rdregion. + if (!isa(Splat.Input->getType())) { + // The input is a scalar rather than a 1-vector. Bitcast it to a 1-vector. + auto *BC = CastInst::Create(Instruction::BitCast, Splat.Input, + VectorType::get(Splat.Input->getType(), 1), + SI->getName(), SI); + BC->setDebugLoc(SI->getDebugLoc()); + Splat.Input = BC; + } + // Create a rdregion with a stride of 0 to represent this splat + Region R(Splat.Input); + R.NumElements = SI->getType()->getVectorNumElements(); + R.Width = R.NumElements; + R.Stride = 0; + R.VStride = 0; + R.Offset = Splat.Index * R.ElementBytes; + Instruction *NewInst = + R.createRdRegion(Splat.Input, "", SI /*InsertBefore*/, SI->getDebugLoc()); + NewInst->takeName(SI); + NewInst->setDebugLoc(SI->getDebugLoc()); + SI->replaceAllUsesWith(NewInst); + ToErase.push_back(SI); +} + +/*********************************************************************** + * lowerShuffle : lower a ShuffleInst (element type not i1) + * + * Mostly these are splats. These are lowered to a rdregion + * Any other shuffle is currently unsupported + */ +bool GenXLowering::lowerShuffle(ShuffleVectorInst *SI) { + auto Splat = ShuffleVectorAnalyzer(SI).getAsSplat(); + if (Splat.Input) { + lowerShuffleSplat(SI, Splat); + return true; + } + if (lowerShuffleToSelect(SI)) + return true; + lowerShuffleToMove(SI); + return true; +} + +// Lower those shufflevector that can be implemented efficiently as select. 
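+// (Illustratively, with 4-element operands a mask of <0, 5, 2, 7>, where each
+// result lane i comes from lane i of one of the two operands, becomes a select
+// with the constant predicate <1, 0, 1, 0>.)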
+bool GenXLowering::lowerShuffleToSelect(ShuffleVectorInst *SI) { + int NumElements = SI->getType()->getVectorNumElements(); + int NumOpnd = SI->getNumOperands(); + for (int i = 0; i < NumOpnd; ++i) { + if (SI->getOperand(i)->getType()->getVectorNumElements() != NumElements) + return false; + } + for (int i = 0; i < NumElements; ++i) { + int idx = SI->getMaskValue(i); + // undef index returns -1. + if (idx < 0) + continue; + if (idx != i && idx != i + NumElements) + return false; + } + IRBuilder<> Builder(SI); + Type *Int1Ty = Builder.getInt1Ty(); + SmallVector MaskVec; + MaskVec.reserve(NumElements); + for (int i = 0; i < NumElements; ++i) { + int idx = SI->getMaskValue(i); + // undef index returns -1. + if (idx == i || idx < 0) + MaskVec.push_back(ConstantInt::get(Int1Ty, 1)); + else + MaskVec.push_back(ConstantInt::get(Int1Ty, 0)); + } + Value *Mask = ConstantVector::get(MaskVec); + auto NewSel = + SelectInst::Create(Mask, SI->getOperand(0), SI->getOperand(1), "", SI); + NewSel->takeName(SI); + NewSel->setDebugLoc(SI->getDebugLoc()); + SI->replaceAllUsesWith(NewSel); + ToErase.push_back(SI); + return true; +} + +template Iter skipUndefs(Iter First, Iter Last) { + return std::find_if(First, Last, [](int MaskVal) { return MaskVal != -1; }); +} + +/*********************************************************************** + * lowerShuffleToMove : lower a ShuffleInst (element type not i1) to a + * sequence of rd/wrregion intrinsics + */ +void GenXLowering::lowerShuffleToMove(ShuffleVectorInst *SI) { + ShuffleVectorAnalyzer Analyzer(SI); + std::vector RdRegions; + std::vector WrRegions; + auto MaskVals = SI->getShuffleMask(); + + // Filling read and write regions based on shuffle mask. + for (auto It = skipUndefs(MaskVals.begin(), MaskVals.end()); + It != MaskVals.end(); + It = skipUndefs(std::next(It, RdRegions.back().R.NumElements), + MaskVals.end())) { + int Idx = It - MaskVals.begin(); + auto OpRegion = Analyzer.getMaskRegionPrefix(Idx); + assert(OpRegion.R.NumElements > 0 && + "should've match at least 1 element region"); + Region WrRegion(SI); + WrRegion.Offset = Idx * WrRegion.ElementBytes; + WrRegion.NumElements = WrRegion.Width = OpRegion.R.NumElements; + RdRegions.push_back(std::move(OpRegion)); + WrRegions.push_back(std::move(WrRegion)); + } + + // Building rdregion intrinsics or promoting the operand if possible. + std::vector RdRegionInsts; + std::transform( + RdRegions.begin(), RdRegions.end(), std::back_inserter(RdRegionInsts), + [SI](ShuffleVectorAnalyzer::OperandRegionInfo &OpRegion) -> Value * { + if (OpRegion.Op->getType()->getVectorNumElements() == + OpRegion.R.NumElements) + return OpRegion.Op; + return OpRegion.R.createRdRegion( + OpRegion.Op, SI->getName() + ".shuffle.rd", SI, SI->getDebugLoc()); + }); + + // Obtaining SI replacement (sequence of wrregion intrinsics in the + // most common case). 
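+  // (Each rdregion result is written into the accumulated value at the offset
+  // where its contiguous run of the shuffle mask starts.)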
+ Value *Result; + if (WrRegions.size() == 0) + Result = UndefValue::get(SI->getType()); + else if (WrRegions.size() == 1 && WrRegions.front().NumElements == + SI->getType()->getVectorNumElements()) + Result = RdRegionInsts.back(); + else { + auto WrRegionArgs = zip(WrRegions, RdRegionInsts); + Result = std::accumulate( + WrRegionArgs.begin(), WrRegionArgs.end(), + static_cast(UndefValue::get(SI->getType())), + [SI](Value *PrevWrRegionInst, + const std::tuple &Args) { + return std::get<0>(Args).createWrRegion( + PrevWrRegionInst, std::get<1>(Args), + SI->getName() + ".shuffle.wr", SI, SI->getDebugLoc()); + }); + } + + SI->replaceAllUsesWith(Result); + ToErase.push_back(SI); +} + +/*********************************************************************** + * lowerShr : lower Shl followed by AShr/LShr by the same amount + * into trunc+sext/zext + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * See convertShlShr below. + */ +bool GenXLowering::lowerShr(Instruction *Inst) { + Instruction *NewInst = convertShlShr(Inst); + if (!NewInst) + return false; // no conversion done + ToErase.push_back(Inst); + auto Shl = cast(Inst->getOperand(0)); + if (Shl->hasOneUse()) + ToErase.push_back(Shl); + return true; +} + +/*********************************************************************** + * convertShlShr : convert Shl followed by AShr/LShr by the same amount + * into trunc+sext/zext + * + * Enter: Inst = the AShr or LShr instruction + * + * Return: 0 if no conversion done, else the new SExt/ZExt instruction. + * The original AShr/LShr is now unused, but neither original + * instruction is erased. + * + * This is the opposite to what instruction combining does! We want to change + * it back to trunc then extend because the trunc can then be lowered into + * a region, and the extend can sometimes be baled into whatever uses it. + * + * This is a separately callable global function so it can also be used + * from GenXReduceIntSize, which for other reasons of convenience runs before + * GenXLowering. + */ +Instruction *llvm::genx::convertShlShr(Instruction *Inst) { + unsigned NumBits = Inst->getType()->getScalarType()->getPrimitiveSizeInBits(); + auto C = dyn_cast(Inst->getOperand(1)); + if (!C) + return nullptr; + auto Shl = dyn_cast(Inst->getOperand(0)); + if (!Shl) + return nullptr; + if (Shl->getOpcode() != Instruction::Shl) + return nullptr; + if (Shl->getOperand(1) != C) + return nullptr; + if (isa(C->getType())) { + C = C->getSplatValue(); + if (!C) + return nullptr; + } + unsigned ShiftBits = cast(C)->getSExtValue(); + unsigned RemainingBits = NumBits - ShiftBits; + if (RemainingBits != 8 && RemainingBits != 16) + return nullptr; + // We have Shl+AShr or Shl+LShr that can be turned into trunc+sext/zext. + Type *ConvTy = Type::getIntNTy(Inst->getContext(), RemainingBits); + if (auto VT = dyn_cast(Inst->getType())) + ConvTy = VectorType::get(ConvTy, VT->getNumElements()); + auto Trunc = CastInst::Create(Instruction::Trunc, Shl->getOperand(0), ConvTy, + "", Inst); + Trunc->takeName(Shl); + Trunc->setDebugLoc(Shl->getDebugLoc()); + auto Ext = CastInst::Create(Inst->getOpcode() == Instruction::AShr + ? 
Instruction::SExt + : Instruction::ZExt, + Trunc, Inst->getType(), "", Inst); + Ext->takeName(Inst); + Ext->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(Ext); + return Ext; +} + +/*********************************************************************** + * splitStructPhis : find struct phi nodes and split them + * + * Return: whether code modified + * + * Each struct phi node is split into a separate phi node for each struct + * element. This is needed because the GenX backend's liveness and coalescing + * code cannot cope with a struct phi. + * + * This is run in two places: firstly in GenXLowering, so that pass can then + * simplify any InsertElement and ExtractElement instructions added by the + * struct phi splitting. But then it needs to be run again in GenXLiveness, + * because other passes can re-insert a struct phi. The case I saw in + * hevc_speed was something commoning up the struct return from two calls in an + * if..else..endif. + */ +bool genx::splitStructPhis(Function *F) { + bool Modified = false; + for (Function::iterator fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (BasicBlock::iterator bi = BB->begin();;) { + PHINode *Phi = dyn_cast(&*bi); + if (!Phi) + break; + ++bi; // increment here as splitStructPhi removes old phi node + if (isa(Phi->getType())) + Modified |= GenXLowering::splitStructPhi(Phi); + } + } + return Modified; +} + +/*********************************************************************** + * splitStructPhi : split a phi node with struct type by splitting into + * struct elements + */ +bool GenXLowering::splitStructPhi(PHINode *Phi) { + StructType *Ty = cast(Phi->getType()); + // Find where we need to insert the combine instructions. + Instruction *CombineInsertBefore = Phi->getParent()->getFirstNonPHI(); + // Now split the phi. + Value *Combined = UndefValue::get(Ty); + // For each struct element... + for (unsigned Idx = 0, e = Ty->getNumElements(); Idx != e; ++Idx) { + Type *ElTy = Ty->getTypeAtIndex(Idx); + // Create the new phi node. + PHINode *NewPhi = + PHINode::Create(ElTy, Phi->getNumIncomingValues(), + Phi->getName() + ".element" + Twine(Idx), Phi); + NewPhi->setDebugLoc(Phi->getDebugLoc()); + // Combine the new phi. + Instruction *Combine = InsertValueInst::Create( + Combined, NewPhi, Idx, NewPhi->getName(), CombineInsertBefore); + Combine->setDebugLoc(Phi->getDebugLoc()); + Combined = Combine; + // For each incoming... + for (unsigned In = 0, InEnd = Phi->getNumIncomingValues(); In != InEnd; + ++In) { + // Create an extractelement to get the individual element value. + // This needs to go before the terminator of the incoming block. + BasicBlock *IncomingBB = Phi->getIncomingBlock(In); + Value *Incoming = Phi->getIncomingValue(In); + Instruction *Extract = ExtractValueInst::Create( + Incoming, Idx, Phi->getName() + ".element" + Twine(Idx), + IncomingBB->getTerminator()); + Extract->setDebugLoc(Phi->getDebugLoc()); + // Add as an incoming of the new phi node. + NewPhi->addIncoming(Extract, IncomingBB); + } + } + Phi->replaceAllUsesWith(Combined); + Phi->eraseFromParent(); + return true; +} + +/*********************************************************************** + * lowerExtractValue : remove extractvalue if possible + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * If we can trace the input of the extractvalue to the point where the + * value was inserted, use that value instead. 
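+ * (Illustratively, for "%s = insertvalue {i32, float} undef, i32 %v, 0"
+ * followed by "extractvalue {i32, float} %s, 0", the extractvalue is simply
+ * replaced by %v.)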
+ * + * Because we have already split struct phi nodes, we should just be left + * with insertvalue/extractvalue pairs that we can remove here. The + * exception is when a struct is passed in to or returned from a call. + * Then we leave the extractvalue for later handling in the register + * allocator. + */ +bool GenXLowering::lowerExtractValue(ExtractValueInst *Inst) { + ArrayRef EVIndices = Inst->getIndices(); + ArrayRef Indices = EVIndices; + Value *V = Inst->getAggregateOperand(); + for (;;) { + InsertValueInst *IV = dyn_cast(V); + if (!IV) { + // If we used up any indices, create a new extractvalue for the + // remaining ones. + if (Indices.size() != EVIndices.size()) { + Instruction *NewIV = ExtractValueInst::Create( + Inst->getAggregateOperand(), Indices, Inst->getName(), Inst); + NewIV->setDebugLoc(Inst->getDebugLoc()); + Inst->replaceAllUsesWith(NewIV); + ToErase.push_back(Inst); + return true; + } + return false; + } + // We have an insertvalue. See how many of the indices agree. + ArrayRef IVIndices = IV->getIndices(); + unsigned Match = 0; + while (Match < Indices.size() && Match < IVIndices.size() && + Indices[Match] == IVIndices[Match]) + ++Match; + if (!Match) { + // No match at all. Go back to the previous insertvalue. + V = IV->getAggregateOperand(); + continue; + } + // Use the inserted value here. + V = IV->getInsertedValueOperand(); + // Chop off the indices we have used up. If none left, we have finished. + Indices = Indices.slice(Match); + if (!Indices.size()) + break; + } + // We have found the struct element value V. + Inst->replaceAllUsesWith(V); + ToErase.push_back(Inst); + return true; +} + +/*********************************************************************** + * lowerInsertValue : remove insertvalue if possible + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * In most cases, by the time we get to an insertvalue, it will be unused + * because of extractvalue removal. + * + * In a case where it is still used (probably because this function has an + * arg or return value that is a struct, or we call a function like that), + * the struct value is dealt with in register allocation. + */ +bool GenXLowering::lowerInsertValue(InsertValueInst *Inst) { + if (Inst->use_empty()) { + ToErase.push_back(Inst); + return true; + } + return false; +} + +/*********************************************************************** + * lowerUAddWithOverflow : lower llvm.uadd.with.overflow + * + * This could potentially be implemented with the vISA addc instruction. + * However an intrinsic for that would need extra GenX backend support for + * returning a struct containing two vectors, and that support does not exist + * now. + * + * So for now we use the old DEC Alpha trick of comparing the result with + * one of the operands. + */ +bool GenXLowering::lowerUAddWithOverflow(CallInst *CI) { + const DebugLoc &DL = CI->getDebugLoc(); + // Do the add. + auto Add = + BinaryOperator::Create(Instruction::Add, CI->getArgOperand(0), + CI->getArgOperand(1), CI->getName() + ".add", CI); + Add->setDebugLoc(DL); + // Do the comparison. (An unsigned add has overflowed if the result is + // smaller than one of the operands, and, if it has overflowed, the result + // is smaller than both of the operands. So it doesn't matter which operand + // we use for the comparison.) 
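+  // (E.g., illustratively in 8 bits: 200 + 100 wraps to 44, which is smaller
+  // than either operand, so an unsigned less-than compare detects the carry.)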
+ auto Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULT, Add, + CI->getArgOperand(1), CI->getName() + ".cmp", CI); + Cmp->setDebugLoc(DL); + // For any extractvalue use of the result of the original add with overflow, + // replace it directly. + SmallVector Extracts; + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) + if (auto EVI = dyn_cast(ui->getUser())) + Extracts.push_back(EVI); + for (auto ei = Extracts.begin(), ee = Extracts.end(); ei != ee; ++ei) { + auto EVI = *ei; + EVI->replaceAllUsesWith(EVI->getIndices()[0] ? (Value *)Cmp : (Value *)Add); + EVI->setOperand(0, UndefValue::get(CI->getType())); + ToErase.push_back(EVI); + } + // If any uses of the original intrinsic remain, recreate the struct value. + if (!CI->use_empty()) { + auto Insert = InsertValueInst::Create(UndefValue::get(CI->getType()), Add, + 0, CI->getName() + ".insertadd", CI); + Insert->setDebugLoc(DL); + Insert = InsertValueInst::Create(Insert, Cmp, 1, + CI->getName() + ".insertcmp", CI); + Insert->setDebugLoc(DL); + // ... and use it to replace the original intrinsic. + CI->replaceAllUsesWith(Insert); + } + ToErase.push_back(CI); + return true; +} + +bool GenXLowering::lowerTrap(CallInst *CI) { + Module *M = CI->getModule(); + IRBuilder<> Builder(CI); + auto &Ctx = CI->getContext(); + unsigned EMWidth = 32; + Type *ArgTypes[] = {VectorType::get(Type::getInt1Ty(Ctx), EMWidth), + VectorType::get(Type::getInt16Ty(Ctx), EMWidth)}; + auto Fn = GenXIntrinsic::getGenXDeclaration(M, + GenXIntrinsic::genx_raw_send_noresult, ArgTypes); + SmallVector Args; + // send + Args.push_back(ConstantInt::get(Type::getInt32Ty(Ctx), 0)); + // predicate all lanes + Args.push_back(ConstantVector::getSplat(EMWidth, ConstantInt::getTrue(Ctx))); + // EOT + Args.push_back(ConstantInt::get(Type::getInt32Ty(Ctx), 0x27)); + Args.push_back(ConstantInt::get(Type::getInt32Ty(Ctx), 0x02000010)); + Args.push_back(ConstantVector::getSplat(EMWidth, Constant::getNullValue(Type::getInt16Ty(Ctx)))); + Builder.CreateCall(Fn, Args); + ToErase.push_back(CI); + + return true; +} + +bool GenXLowering::lowerCtpop(CallInst *CI) { + Module *M = CI->getModule(); + IRBuilder<> Builder(CI); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + Type *Int32Ty = IntegerType::getInt32Ty(CI->getContext()); + Type *RetTy = nullptr; + if (auto *VT = dyn_cast(CI->getType())) + RetTy = VectorType::get(Int32Ty, VT->getNumElements()); + else + RetTy = Int32Ty; + + auto *CBitDecl = GenXIntrinsic::getGenXDeclaration( + M, GenXIntrinsic::genx_cbit, {RetTy, CI->getType()}); + Value *CBitInst = + Builder.CreateCall(CBitDecl, CI->getOperand(0), CI->getName()); + + CBitInst = Builder.CreateZExtOrTrunc(CBitInst, CI->getType()); + CI->replaceAllUsesWith(CBitInst); + ToErase.push_back(CI); + + return true; +} + +// Lower cmp instructions that GenX cannot deal with. 
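+// (Currently this expands the FCMP_ORD and FCMP_UNO predicates in terms of
+// self-comparisons, relying on the fact that x == x is false only for NaN.)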
+bool GenXLowering::lowerFCmpInst(FCmpInst *Inst) { + IRBuilder<> Builder(Inst); + Builder.SetCurrentDebugLocation(Inst->getDebugLoc()); + Value *Ops[] = {Inst->getOperand(0), Inst->getOperand(1)}; + + switch (Inst->getPredicate()) { + default: + break; + case CmpInst::FCMP_ORD: // True if ordered (no nans) + { + // %c = fcmp ord %a %b + // => + // %1 = fcmp oeq %a %a + // %2 = fcmp oeq %b %b + // %c = and %1 %2 + Value *LHS = Builder.CreateFCmpOEQ(Ops[0], Ops[0]); + Value *RHS = Builder.CreateFCmpOEQ(Ops[1], Ops[1]); + Value *New = Builder.CreateAnd(LHS, RHS); + Inst->replaceAllUsesWith(New); + ToErase.push_back(Inst); + return true; + } + case CmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y) + // %c = fcmp uno %a %b + // => + // %1 = fcmp une %a %a + // %2 = fcmp une %b %b + // %c = or %1 %2 + Value *LHS = Builder.CreateFCmpUNE(Ops[0], Ops[0]); + Value *RHS = Builder.CreateFCmpUNE(Ops[1], Ops[1]); + Value *New = Builder.CreateOr(LHS, RHS); + Inst->replaceAllUsesWith(New); + ToErase.push_back(Inst); + return true; + } + + return false; +} + +// Lower cmp instructions that GenX cannot deal with. +bool GenXLowering::lowerMul64(Instruction *Inst) { + IRBuilder<> Builder(Inst); + Builder.SetCurrentDebugLocation(Inst->getDebugLoc()); + auto Src0 = Inst->getOperand(0); + auto Src1 = Inst->getOperand(1); + auto ETy = Src0->getType(); + auto Len = 1; + if (ETy->isVectorTy()) { + Len = ETy->getVectorNumElements(); + ETy = ETy->getVectorElementType(); + } + if (!ETy->isIntegerTy() || ETy->getPrimitiveSizeInBits() != 64) + return false; + auto VTy = VectorType::get(ETy->getInt32Ty(Inst->getContext()), Len * 2); + // create src0 bitcast, then the low and high part + auto Src0V = Builder.CreateBitCast(Src0, VTy); + Region R(Inst); + R.Offset = 0; + R.Width = Len; + R.NumElements = Len; + R.Stride = 2; + R.VStride = 0; + auto Src0L = R.createRdRegion(Src0V, "", Inst, Inst->getDebugLoc()); + R.Offset = 4; + auto Src0H = R.createRdRegion(Src0V, "", Inst, Inst->getDebugLoc()); + // create src1 bitcast, then the low and high part + auto Src1V = Builder.CreateBitCast(Src1, VTy); + R.Offset = 0; + auto Src1L = R.createRdRegion(Src1V, "", Inst, Inst->getDebugLoc()); + R.Offset = 4; + auto Src1H = R.createRdRegion(Src1V, "", Inst, Inst->getDebugLoc()); + // create muls and adds + auto ResL = Builder.CreateMul(Src0L, Src1L); + // create the mulh intrinsic to the get the carry-part + Type *tys[2]; + SmallVector args; + // build type-list + tys[0] = ResL->getType(); + tys[1] = Src0L->getType(); + // build argument list + args.push_back(Src0L); + args.push_back(Src1L); + auto M = Inst->getParent()->getParent()->getParent(); + Function *IntrinFunc = + GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_umulh, tys); + Instruction *Cari = CallInst::Create(IntrinFunc, args, "", Inst); + Cari->setDebugLoc(Inst->getDebugLoc()); + auto Temp0 = Builder.CreateMul(Src0L, Src1H); + auto Temp1 = Builder.CreateAdd(Cari, Temp0); + auto Temp2 = Builder.CreateMul(Src0H, Src1L); + auto ResH = Builder.CreateAdd(Temp2, Temp1); + // create the write-regions + auto UndefV = UndefValue::get(VTy); + R.Offset = 0; + auto WrL = R.createWrRegion(UndefV, ResL, "WrLow", Inst, Inst->getDebugLoc()); + R.Offset = 4; + auto WrH = R.createWrRegion(WrL, ResH, "WrHigh", Inst, Inst->getDebugLoc()); + // create the bitcast to the destination-type + auto Replace = Builder.CreateBitCast(WrH, Inst->getType(), "mul64"); + Inst->replaceAllUsesWith(Replace); + ToErase.push_back(Inst); + return true; +} 
+/*********************************************************************** + * widenByteOp : widen a vector byte operation to short if that might + * improve code + * + * Return: whether any change was made, and thus the current instruction + * is now marked for erasing + * + * Gen has restrictions on byte operands. The jitter copes with that, but + * sometimes it needs to do even-odd splitting, which can lead to suboptimal + * code if cmps and predicates are involved. + * Here we attempt to pick up the common cases by converting a byte operation + * to short. + * + * Note that we might end up with the extends being baled into the instruction + * anyway, resulting in a byte operation in vISA. + */ +bool GenXLowering::widenByteOp(Instruction *Inst) { + if (!EnableGenXByteWidening) + return false; + Type *Ty = Inst->getType(); + if (isa(Inst)) + Ty = Inst->getOperand(0)->getType(); + if (!isa(Ty) || !Ty->getScalarType()->isIntegerTy(8)) + return false; // not byte operation + if (Inst->use_empty()) + return false; // result unused + // check use, if use is a phi, stop widenning + if (!isa(Inst)) { + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (isa(User)) + return false; + } + } + // For a predicated wrregion, widen by separating the predication into a + // rdregion and select, which can then be widened. + if (GenXIntrinsic::isWrRegion(Inst)) { + Region R(Inst, BaleInfo()); + if (R.NumElements == 1 || !R.Mask) + return false; + // Can only do this if the predicate is the right size. (We could handle + // the wrong size case by adding an rdpredregion, but then we would need + // to ensure that GenXLegalization can cope with an arbitrary size + // rdpredregion.) + if (R.Mask->getType()->getVectorNumElements() != R.NumElements) + return false; + // Create the rdregion and select. + auto NewRd = + R.createRdRegion(Inst->getOperand(0), Inst->getName() + ".byteselrdr", + Inst, Inst->getDebugLoc()); + auto NewSel = + SelectInst::Create(R.Mask, Inst->getOperand(1), NewRd, "", Inst); + NewSel->takeName(Inst); + NewSel->setDebugLoc(Inst->getDebugLoc()); + // Modify the existing wrregion. + Inst->setName(NewSel->getName() + ".byteselwrr"); + Inst->setOperand(1, NewSel); + Inst->setOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum, + Constant::getAllOnesValue(R.Mask->getType())); + // Fall through for the select to get widened. + Inst = NewSel; + } + // Do the widening for: + // 1. a compare or select + // 2. used in a zext that indicates that the user has probably already been + // widened by this code. + bool Widen = false; + if (isa(Inst) || isa(Inst)) + Widen = true; + else { + auto user = cast(Inst->use_begin()->getUser()); + if (isa(user)) + Widen = true; + } + if (!Widen) + return false; + // Widen to short. + // Decide whether to zero or sign extend. Also decide whether the result is + // guaranteed to have all 0 bits in the extended part. + Instruction::CastOps ExtOpcode = Instruction::ZExt; + bool ExtendedIsZero = false; + switch (Inst->getOpcode()) { + case Instruction::SDiv: + case Instruction::AShr: + ExtOpcode = Instruction::SExt; + break; + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::LShr: + ExtendedIsZero = true; + break; + case Instruction::ICmp: + if (cast(Inst)->isSigned()) + ExtOpcode = Instruction::SExt; + break; + default: + break; + } + // Get the range of operands to process. 
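+  // (Calls widen only their argument operands; selects skip the i1 condition
+  // in operand 0.)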
+ unsigned StartIdx = 0, EndIdx = Inst->getNumOperands(); + if (auto CI = dyn_cast(Inst)) + EndIdx = CI->getNumArgOperands(); + else if (isa(Inst)) + StartIdx = 1; + // Extend the operands. + auto ExtTy = VectorType::get( + Type::getInt16Ty(Inst->getContext()), + Inst->getOperand(StartIdx)->getType()->getVectorNumElements()); + SmallVector Opnds; + for (unsigned Idx = 0; Idx != EndIdx; ++Idx) { + Value *Opnd = Inst->getOperand(Idx); + if (Idx >= StartIdx) { + if (auto C = dyn_cast(Opnd)) + Opnd = ConstantExpr::getCast(ExtOpcode, C, ExtTy); + else { + auto NewExt = CastInst::Create(ExtOpcode, Opnd, ExtTy, + Inst->getName() + ".byteext", Inst); + NewExt->setDebugLoc(Inst->getDebugLoc()); + Opnd = NewExt; + } + } + Opnds.push_back(Opnd); + } + // Create the replacement instruction. + Instruction *NewInst = nullptr; + if (isa(Inst)) + NewInst = BinaryOperator::Create((Instruction::BinaryOps)Inst->getOpcode(), + Opnds[0], Opnds[1], "", Inst); + else if (auto CI = dyn_cast(Inst)) + NewInst = CmpInst::Create(CI->getOpcode(), CI->getPredicate(), Opnds[0], + Opnds[1], "", CI); + else if (isa(Inst)) + NewInst = SelectInst::Create(Opnds[0], Opnds[1], Opnds[2], "", Inst); + else + llvm_unreachable("unhandled instruction in widenByteOp"); + NewInst->takeName(Inst); + NewInst->setDebugLoc(Inst->getDebugLoc()); + if (ExtendedIsZero) { + // We know that the extended part of the result contains 0 bits. If we + // find that any use is a zext (probably from also being byte widened + // in this code), we can replace the use directly and save the + // trunc/zext pair. First put the uses in a vector as the use list will + // change under our feet. + SmallVector Uses; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) + Uses.push_back(&*ui); + for (auto ui = Uses.begin(), ue = Uses.end(); ui != ue; ++ui) { + if (auto user = dyn_cast((*ui)->getUser())) { + if (user->getType() == NewInst->getType()) { + user->replaceAllUsesWith(NewInst); + ToErase.push_back(user); + // Remove the use of Inst from the trunc so we can tell whether there + // are any uses left below. + *(*ui) = UndefValue::get(Inst->getType()); + } + } + } + } + if (!Inst->use_empty()) { + // Truncate the result. + if (!isa(Inst)) { + NewInst = CastInst::Create(Instruction::Trunc, NewInst, Inst->getType(), + Inst->getName() + ".bytetrunc", Inst); + NewInst->setDebugLoc(Inst->getDebugLoc()); + } + // Replace uses. + Inst->replaceAllUsesWith(NewInst); + } + ToErase.push_back(Inst); + return true; +} + +static bool breakConstantVector(unsigned i, Instruction *CurInst, + Instruction *InsertPt) { + ConstantVector *CV = cast(CurInst->getOperand(i)); + + // Splat case. + if (auto S = dyn_cast_or_null(CV->getSplatValue())) { + // Turn element into an instruction + auto Inst = S->getAsInstruction(); + Inst->setDebugLoc(CurInst->getDebugLoc()); + Inst->insertBefore(InsertPt); + Type *NewTy = VectorType::get(Inst->getType(), 1); + Inst = CastInst::Create(Instruction::CastOps::BitCast, Inst, NewTy, "", + CurInst); + Inst->setDebugLoc(CurInst->getDebugLoc()); + + // Splat this value. + Region R(Inst); + R.Offset = 0; + R.Width = 1; + R.Stride = R.VStride = 0; + R.NumElements = CV->getNumOperands(); + Inst = R.createRdRegion(Inst, "", InsertPt /*InsertBefore*/, + Inst->getDebugLoc()); + + // Update i-th operand with newly created splat. 
+ CurInst->setOperand(i, Inst); + return true; + } + + SmallVector Vals; + bool HasConstExpr = false; + for (unsigned j = 0, N = CV->getNumOperands(); j < N; ++j) { + Value *Elt = CV->getOperand(j); + if (auto CE = dyn_cast(Elt)) { + auto Inst = CE->getAsInstruction(); + Inst->setDebugLoc(CurInst->getDebugLoc()); + Inst->insertBefore(InsertPt); + Vals.push_back(Inst); + HasConstExpr = true; + } else + Vals.push_back(Elt); + } + + if (HasConstExpr) { + Value *Val = UndefValue::get(CV->getType()); + for (unsigned j = 0, N = CV->getNumOperands(); j < N; ++j) { + Region R(Vals[j]); + R.Offset = j * R.ElementBytes; + Val = + R.createWrRegion(Val, Vals[j], "", InsertPt, CurInst->getDebugLoc()); + } + CurInst->setOperand(i, Val); + return true; + } + + return false; +} + +bool genx::breakConstantExprs(Function *F) { + bool Modified = false; + for (po_iterator i = po_begin(&F->getEntryBlock()), + e = po_end(&F->getEntryBlock()); + i != e; ++i) { + BasicBlock *BB = *i; + // The effect of this loop is that we process the instructions in reverse + // order, and we re-process anything inserted before the instruction + // being processed. + for (Instruction *CurInst = BB->getTerminator(); CurInst;) { + PHINode *PN = dyn_cast(CurInst); + for (unsigned i = 0, e = CurInst->getNumOperands(); i < e; ++i) { + Instruction *InsertPt = + PN ? PN->getIncomingBlock(i)->getTerminator() : CurInst; + Value *Op = CurInst->getOperand(i); + if (getUnderlyingGlobalVariable(Op) != nullptr) + continue; + if (ConstantExpr *CE = dyn_cast(Op)) { + Instruction *NewInst = CE->getAsInstruction(); + NewInst->setDebugLoc(CurInst->getDebugLoc()); + NewInst->insertBefore(CurInst); + CurInst->setOperand(i, NewInst); + Modified = true; + } else if (isa(Op)) + Modified |= breakConstantVector(i, CurInst, InsertPt); + } + CurInst = CurInst == &BB->front() ? nullptr : CurInst->getPrevNode(); + } + } + return Modified; +} + +namespace { + +// Helper class to translate load/store into proper GenX intrinsic calls. +class LoadStoreResolver { + Instruction *Inst; + const GenXSubtarget *ST; + IRBuilder<> Builder; + +public: + LoadStoreResolver(Instruction *Inst, const GenXSubtarget *ST) + : Inst(Inst), ST(ST), Builder(Inst) {} + + // Resolve this instruction and return true on success. + bool resolve(); + +private: + bool isLoad() const { return isa(Inst); } + bool isStore() const { return isa(Inst); } + + const DataLayout &getDL() const { + Function *F = Inst->getParent()->getParent(); + return F->getParent()->getDataLayout(); + } + + // Find a proper GenX intrinsic ID for this load/store instruction. + GenXIntrinsic::ID getGenXIntrinsicID() const; + + unsigned getPointerSizeInBits() const { + unsigned AddrSp = 0; + if (auto LI = dyn_cast(Inst)) + AddrSp = LI->getPointerAddressSpace(); + else if (auto SI = dyn_cast(Inst)) + AddrSp = SI->getPointerAddressSpace(); + return getDL().getPointerSizeInBits(AddrSp); + } + + unsigned getValueSizeInBits(Type *T) const { + if (auto PT = dyn_cast(T)) { + unsigned AddrSp = PT->getAddressSpace(); + return getDL().getPointerSizeInBits(AddrSp); + } + return T->getPrimitiveSizeInBits(); + } + + // Return true if this load/store can be translated. + bool isSupported() const; + + // Emit actual intrinsic calls. + bool emitGather(); + bool emitScatter(); + bool emitSVMGather(); + bool emitSVMScatter(); +}; + +} // namespace + +// Translate store instructions into genx builtins. +bool GenXLowering::lowerLoadStore(Instruction *Inst) { + auto ST = getAnalysisIfAvailable(); + LoadStoreResolver Resolver(Inst, ST ? 
ST->getSubtarget() : nullptr); + if (Resolver.resolve()) { + ToErase.push_back(Inst); + return true; + } + return false; +} + +bool LoadStoreResolver::resolve() { + if (!isSupported()) + return false; + + GenXIntrinsic::ID ID = getGenXIntrinsicID(); + switch (ID) { + case GenXIntrinsic::genx_gather_scaled: + return emitGather(); + case GenXIntrinsic::genx_scatter_scaled: + return emitScatter(); + case GenXIntrinsic::genx_svm_gather: + return emitSVMGather(); + case GenXIntrinsic::genx_svm_scatter: + return emitSVMScatter(); + default: + break; + } + + return false; +} + +// Return true if this load/store can be translated. +bool LoadStoreResolver::isSupported() const { + auto IsGlobalLoadStore = [=]() { + Value *Ptr = nullptr; + if (auto LI = dyn_cast(Inst)) + Ptr = LI->getPointerOperand(); + if (auto SI = dyn_cast(Inst)) + Ptr = SI->getPointerOperand(); + return getUnderlyingGlobalVariable(Ptr) != nullptr; + }; + + if (IsGlobalLoadStore()) + return false; + + Type *ValTy = Inst->getType(); + if (auto SI = dyn_cast(Inst)) + ValTy = SI->getValueOperand()->getType(); + + // Only scalar data types. + if (!ValTy->isFloatingPointTy() && !ValTy->isIntegerTy() && + !ValTy->isPointerTy()) { + Inst->getContext().emitError(Inst, "unsupported type for load/store"); + return false; + } + + // Only legal types: float, double, half, i8, i16, 132, i64, pointer types. + unsigned NumBits = getValueSizeInBits(ValTy); + if (NumBits < 8 || NumBits > 64 || !isPowerOf2_32(NumBits)) { + Inst->getContext().emitError("unsupported integer type for load/store"); + return false; + } + + // Translate this instruction. + return true; +} + +// Find a proper GenX intrinsic ID for this load/store instruction. +GenXIntrinsic::ID LoadStoreResolver::getGenXIntrinsicID() const { + // A32 byte scattered stateless messages only work on CNL+. + unsigned NBits = getPointerSizeInBits(); + if (NBits == 32 && ST && !ST->WaNoA32ByteScatteredStatelessMessages()) + return isLoad() ? GenXIntrinsic::genx_gather_scaled + : GenXIntrinsic::genx_scatter_scaled; + return isLoad() ? GenXIntrinsic::genx_svm_gather + : GenXIntrinsic::genx_svm_scatter; +} + +bool LoadStoreResolver::emitGather() { + unsigned NBits = getPointerSizeInBits(); + Type *IntTy = IntegerType::get(Inst->getContext(), NBits); + auto LI = cast(Inst); + + // Global offset. + Value *Addr = LI->getPointerOperand(); + Addr = Builder.CreatePtrToInt(Addr, IntTy); + + unsigned NBlocks = getValueSizeInBits(LI->getType()) / 8; + unsigned NBlocksLog2 = llvm::Log2_32(NBlocks); + + // If this is more than 4 bytes, use a larger SIMD size. + unsigned SIMD = 1; + if (NBlocks > 4) { + SIMD = NBlocks / 4; + NBlocksLog2 = 2; + } + + // The old value is undef. + Type *ValTy = LI->getType(); + if (ValTy->isPointerTy()) + ValTy = Builder.getIntNTy(getValueSizeInBits(ValTy)); + Type *DataTy = VectorType::get(ValTy, 1); + if (SIMD > 1) + DataTy = VectorType::get(Builder.getInt32Ty(), SIMD); + Value *OldVal = UndefValue::get(DataTy); + + // Offset. + Type *EltOffsetTy = VectorType::get(Builder.getInt32Ty(), SIMD); + Value *EltOffset = Constant::getNullValue(EltOffsetTy); + if (SIMD > 1) { + SmallVector Offsets(SIMD); + for (unsigned i = 0; i < SIMD; ++i) + // Increase offset by 4 bytes for each lane. + Offsets[i] = i * 4; + EltOffset = ConstantDataVector::get(Inst->getContext(), Offsets); + } + + // Arguments. 
+ Value *Args[] = { + Constant::getAllOnesValue(VectorType::get(Builder.getInt1Ty(), SIMD)), + Builder.getInt32(NBlocksLog2), // log[2](NBlocks) + Builder.getInt16(0), // scale + Builder.getInt32(visa::getReservedSurfaceIndex( + PreDefined_Surface::PREDEFINED_SURFACE_T255)), // surface + Addr, // global offset + EltOffset, // element offset + OldVal // old value + }; + + // Overload with return type, predicate type and element offset type + Type *Tys[] = {OldVal->getType(), Args[0]->getType(), EltOffsetTy}; + Module *M = Inst->getParent()->getParent()->getParent(); + auto Fn = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_gather_scaled, Tys); + + Value *NewVal = Builder.CreateCall(Fn, Args); + NewVal = Builder.CreateBitCast(NewVal, ValTy); + LI->replaceAllUsesWith(NewVal); + return true; +} + +bool LoadStoreResolver::emitScatter() { + unsigned NBits = getPointerSizeInBits(); + Type *IntTy = IntegerType::get(Inst->getContext(), NBits); + auto SI = cast(Inst); + + // Global offset. + Value *Addr = SI->getPointerOperand(); + Addr = Builder.CreatePtrToInt(Addr, IntTy); + + Value *Val = SI->getValueOperand(); + unsigned NBlocks = getValueSizeInBits(Val->getType()) / 8; + unsigned NBlocksLog2 = llvm::Log2_32(NBlocks); + + // If this is more than 4 bytes, use a larger SIMD size. + unsigned SIMD = 1; + if (NBlocks > 4) { + SIMD = NBlocks / 4; + NBlocksLog2 = 2; + } + + // Value to write. + Type *ValTy = (SIMD > 1) ? Builder.getInt32Ty() : Val->getType(); + if (ValTy->isPointerTy()) + ValTy = Builder.getIntNTy(getValueSizeInBits(ValTy)); + Val = Builder.CreateBitCast(Val, VectorType::get(ValTy, SIMD)); + + // Offset. + Type *EltOffsetTy = VectorType::get(Builder.getInt32Ty(), SIMD); + Value *EltOffset = Constant::getNullValue(EltOffsetTy); + if (SIMD > 1) { + SmallVector Offsets(SIMD); + // Increase offset by 4 bytes for each lane. + for (unsigned i = 0; i < SIMD; ++i) + Offsets[i] = i * 4; + EltOffset = ConstantDataVector::get(Inst->getContext(), Offsets); + } + + // Arguments. + Value *Args[] = { + Constant::getAllOnesValue(VectorType::get(Builder.getInt1Ty(), SIMD)), + Builder.getInt32(NBlocksLog2), // log[2](NBlocks) + Builder.getInt16(0), // scale + Builder.getInt32(visa::getReservedSurfaceIndex( + PreDefined_Surface::PREDEFINED_SURFACE_T255)), // surface + Addr, // global offset + EltOffset, // element offset + Val // value to write + }; + + // Overload with predicate type, element offset type, value to write type. + Type *Tys[] = {Args[0]->getType(), EltOffsetTy, Val->getType()}; + Module *M = Inst->getParent()->getParent()->getParent(); + auto Fn = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_scatter_scaled, Tys); + Builder.CreateCall(Fn, Args); + return true; +} + +// Compute the block size and the number of blocks for svm gather/scatter. +// +// Block_Size, 1, 4, 8 +// Num_Blocks, 1, 2, 4, +// 8 only valid for 4 byte blocks and execution size 8. +// +static unsigned getBlockCount(Type *Ty) { + unsigned NumBytes = Ty->getPrimitiveSizeInBits() / 8; + assert(NumBytes <= 8 && "out of sync"); + + // If this is N = 2 byte data, use 2 blocks; + // otherwise, use 1 block of N bytes. + return (NumBytes == 2) ? NumBytes : 1U; +} + +// Translate store to svm scatter. +bool LoadStoreResolver::emitSVMGather() { + unsigned NBits = getPointerSizeInBits(); + Type *IntTy = IntegerType::get(Inst->getContext(), NBits); + auto LI = cast(Inst); + + // Address. 
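+  // svm_gather works on 64-bit addresses: a 32-bit pointer is zero-extended
+  // below and then wrapped into a one-element address vector.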
+ Value *Addr = LI->getPointerOperand(); + Addr = Builder.CreatePtrToInt(Addr, IntTy); + if (NBits == 32) + Addr = Builder.CreateZExt(Addr, Builder.getInt64Ty()); + Addr = Builder.CreateBitCast(Addr, VectorType::get(Addr->getType(), 1)); + + // The old value is undef. + Type *ValTy = LI->getType(); + if (ValTy->isPointerTy()) + ValTy = Builder.getIntNTy(getValueSizeInBits(ValTy)); + Type *DataTy = VectorType::get(ValTy, 1); + Value *OldVal = UndefValue::get(DataTy); + + // Num of blocks. + unsigned NBlocks = getBlockCount(OldVal->getType()); + unsigned NBlocksLog2 = llvm::Log2_32(NBlocks); + + Value *Args[] = { + Constant::getAllOnesValue(VectorType::get(Builder.getInt1Ty(), 1)), + Builder.getInt32(NBlocksLog2), // log2(num_of_blocks) + Addr, // addresses + OldVal // old value + }; + + // Overload with return type, predicate type and address vector type + Type *Tys[] = {OldVal->getType(), Args[0]->getType(), Addr->getType()}; + Module *M = Inst->getParent()->getParent()->getParent(); + auto Fn = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_svm_gather, Tys); + + Value *NewVal = Builder.CreateCall(Fn, Args); + NewVal = Builder.CreateBitCast(NewVal, ValTy); + if (LI->getType()->isPointerTy()) + NewVal = Builder.CreateIntToPtr(NewVal, LI->getType()); + LI->replaceAllUsesWith(NewVal); + return true; +} + +bool LoadStoreResolver::emitSVMScatter() { + unsigned NBits = getPointerSizeInBits(); + Type *IntTy = IntegerType::get(Inst->getContext(), NBits); + auto SI = cast(Inst); + + // Address + Value *Addr = SI->getPointerOperand(); + Addr = Builder.CreatePtrToInt(Addr, IntTy); + if (NBits == 32) + Addr = Builder.CreateZExt(Addr, Builder.getInt64Ty()); + Addr = Builder.CreateBitCast(Addr, VectorType::get(Addr->getType(), 1)); + + // data to write. + Value *Val = SI->getValueOperand(); + Type *ValTy = Val->getType(); + if (ValTy->isPointerTy()) { + ValTy = Builder.getIntNTy(getValueSizeInBits(ValTy)); + Val = Builder.CreatePtrToInt(Val, ValTy); + } + Val = Builder.CreateBitCast(Val, VectorType::get(ValTy, 1)); + + // Num of blocks. 
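+  // As in emitSVMGather, getBlockCount() encodes 2-byte data as two one-byte
+  // blocks and any other supported size as a single block.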
+ unsigned NBlocks = getBlockCount(Val->getType()); + unsigned NBlocksLog2 = llvm::Log2_32(NBlocks); + + Value *Args[] = { + Constant::getAllOnesValue(VectorType::get(Builder.getInt1Ty(), 1)), + Builder.getInt32(NBlocksLog2), // log2(num_of_blocks) + Addr, // addresses + Val // value to write + }; + + // Overload with predicate type, address vector type, and data type + Type *Tys[] = {Args[0]->getType(), Addr->getType(), Val->getType()}; + Module *M = Inst->getParent()->getParent()->getParent(); + auto Fn = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_svm_scatter, Tys); + + Builder.CreateCall(Fn, Args); + return true; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.cpp new file mode 100644 index 000000000000..8cec949b1f55 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.cpp @@ -0,0 +1,140 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// GenXModule is a module pass whose purpose is to store information +// about the GenX module being written, such as the built kernels and functions. +// See the comment in GenXModule.h. 
+// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_MODULE" + +#include "GenXModule.h" +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXSubtarget.h" +#include "GenXWATable.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" + +#include + +using namespace llvm; + +char GenXModule::ID = 0; +INITIALIZE_PASS_BEGIN(GenXModule, "GenXModule", "GenXModule", false, + true /*analysis*/) +INITIALIZE_PASS_DEPENDENCY(FunctionGroupAnalysis) +INITIALIZE_PASS_DEPENDENCY(GenXWATable) +INITIALIZE_PASS_END(GenXModule, "GenXModule", "GenXModule", false, + true /*analysis*/) + +ModulePass *llvm::createGenXModulePass() { + initializeGenXModulePass(*PassRegistry::getPassRegistry()); + return new GenXModule; +} + +void GenXModule::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); +} + +bool GenXModule::CheckForInlineAsm(Module &M) const { + for (auto &F : M) + for (auto &BB : F) + for (auto &I : BB) { + CallInst *CI = dyn_cast(&I); + if (CI && CI->isInlineAsm()) + return true; + } + return false; +} + +/*********************************************************************** + * runOnModule : run GenXModule analysis + * + * This populates FunctionGroupAnalysis such that each FunctionGroup + * corresponds to a GenX kernel/function and its subroutines. If any + * subroutine would be used in more than one FunctionGroup, it is + * cloned. + * + * The FunctionGroup is populated in an order such that a function appears + * after all its callers. + */ +bool GenXModule::runOnModule(Module &M) { + auto FGA = &getAnalysis(); + auto P = getAnalysisIfAvailable(); + ST = P ? P->getSubtarget() : nullptr; + WaTable = getAnalysis().getWATable(); + Ctx = &M.getContext(); + + InlineAsm = CheckForInlineAsm(M); + + // Iterate, processing each Function that is not yet assigned to a + // FunctionGroup. 
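+  // The loops below first record, for every function, the functions it calls
+  // (the CG map), and then ask FunctionGroupAnalysis to build a group rooted
+  // at each non-internal function definition.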
+  bool ModuleModified = false;
+
+  // build callgraph and process subgroups
+  std::map<Function *, std::vector<Function *>> CG;
+  // TODO: this is a temporary workaround for ArgIndirection problems that
+  // depend on the order of functions within a group.
+  // It should be removed once indirection is fixed.
+  std::map<Function *, std::set<Function *>> Visited;
+
+  for (auto T : FGA->TypesToProcess) {
+    for (auto &F : M) {
+      for (auto *U : F.users()) {
+        auto *Inst = dyn_cast<Instruction>(U);
+        if (!Inst) {
+          continue;
+        }
+        if (!F.empty() && Visited[Inst->getFunction()].count(&F) == 0) {
+          CG[Inst->getFunction()].push_back(&F);
+          Visited[Inst->getFunction()].insert(&F);
+        }
+        // recursive funcs must use stack
+        if (Inst->getFunction() == &F)
+          assert(F.hasFnAttribute(genx::FunctionMD::CMStackCall) &&
+                 "Found recursive function without CMStackCall attribute");
+      }
+    }
+
+    for (auto &F : M) {
+      if (F.empty() || F.getLinkage() == GlobalValue::InternalLinkage)
+        continue;
+      ModuleModified |= FGA->buildGroup(CG, &F, nullptr, T);
+    }
+
+    FGA->clearVisited();
+    CG.clear();
+    Visited.clear();
+  }
+
+  return ModuleModified;
+}
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.h
new file mode 100644
index 000000000000..bf95364457e4
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXModule.h
@@ -0,0 +1,185 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+/// GenXModule
+/// ----------
+///
+/// GenXModule is a module pass whose purpose is to store information
+/// about the module being written, such as the built kernels and functions.
+///
+/// A vISA kernel or function can call a *subroutine*, which can
+/// then call further subroutines. All called subroutines are considered part of
+/// the kernel or function, which means that a subroutine used by two different
+/// kernels needs to have a copy in each. The two copies may be treated differently
+/// by the backend passes, so there do actually need to be two copies of the
+/// subroutine in the LLVM IR in the backend, one called by each kernel.
+///
+/// The GenXModule pass performs any necessary copying of subroutines, and
+/// populates FunctionGroupAnalysis such that each kernel and its subroutines
+/// make one FunctionGroup.
+///
+/// Subsequent passes are mostly FunctionGroupPasses, so they process one
+/// FunctionGroup at a time.
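+///
+/// For example, if subroutine S is called from kernels K1 and K2, this pass
+/// leaves two copies of S in the IR and FunctionGroupAnalysis ends up with
+/// two groups, {K1, S} and {K2, S-copy}.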
+/// +/// GenXModule is also an analysis, preserved through subsequent passes to +/// GenXVisaWriter at the end, that is used to store each written vISA kernel. +/// +/// **IR restriction**: After this pass, the lead function in a FunctionGroup is +/// a kernel (or function in the vISA sense), and other functions in the same +/// FunctionGroup are its subroutines. A (non-intrinsic) call must be to a +/// function in the same FunctionGroup, and not the lead function. +/// +//===----------------------------------------------------------------------===// +#ifndef GENXMODULE_H +#define GENXMODULE_H + +#include "GenX.h" +#include "GenXBaling.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" + +#include + +#include +#include +#include + +class VISABuilder; +class VISAKernel; + +namespace llvm { + class raw_pwrite_stream; + class GenXSubtarget; + + namespace genx { + + // Stream : a class for streaming byte data, and then writing out to a + // formatted_output_stream. + class Stream { + std::vector V; + public: + void push_back(const void *Data, unsigned Size) { + unsigned Pos = V.size(); + V.resize(Pos + Size); + std::copy_n((const unsigned char *)Data, Size, V.begin() + Pos); + } + template void push_back(T Val) { push_back(&Val, sizeof(Val)); } + unsigned size() { return V.size(); } + void write(raw_pwrite_stream &Out); + void setData(unsigned Offset, const void *Data, unsigned Size) { + assert(Offset + Size <= size()); + std::copy_n((const unsigned char *)Data, Size, V.begin() + Offset); + } + }; + + // FuncWriter : a class to write the output for a GenX kernel or function + class FuncWriter { + public: + FuncWriter() {} + virtual ~FuncWriter() {} + // isKernel : true if the Func is a kernel + virtual bool isKernel() = 0; + // setOffset : set the offset field in the header + // For a kernel, it also sets the input_offset field in the header + virtual void setOffset(uint32_t O) = 0; + // get header/body size + virtual unsigned getHeaderSize() = 0; + virtual unsigned getBodySize() = 0; + // write header/body + virtual void writeHeader(raw_pwrite_stream &Out) = 0; + virtual void writeBody(raw_pwrite_stream &Out) = 0; + }; + + } // end namespace genx + + + //-------------------------------------------------------------------- + // GenXModule pass. 
Stores the information from various parts of the + // GenX writing process + class GenXModule : public ModulePass { + typedef std::vector FuncWriters_t; + FuncWriters_t FuncWriters; + const GenXSubtarget *ST; + LLVMContext *Ctx = nullptr; + WA_TABLE *WaTable = nullptr; + + void collectFinalizerArgs(std::vector &Owner) const; + void clearFinalizerArgs(std::vector& Owner) const; + + VISABuilder *CisaBuilder = nullptr; + std::vector CISA_Args; + void InitCISABuilder(); + + VISABuilder *VISAAsmTextReader = nullptr; + std::vector VISA_Args; + void InitVISAAsmReader(); + + bool InlineAsm = false; + bool CheckForInlineAsm(Module &M) const; + + std::map VisaKernelMap; + + public: + static char ID; + explicit GenXModule() : ModulePass(ID) {} + ~GenXModule() { + clearFinalizerArgs(VISA_Args); + clearFinalizerArgs(CISA_Args); + for (unsigned i = 0; i != FuncWriters.size(); i++) + delete FuncWriters[i]; + } + virtual StringRef getPassName() const { return "GenX module"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnModule(Module &M); + const GenXSubtarget *getSubtarget() { return ST; } + // iterator for FuncWriters list + typedef FuncWriters_t::iterator iterator; + iterator begin() { return FuncWriters.begin(); } + iterator end() { return FuncWriters.end(); } + void push_back(genx::FuncWriter *VF) { FuncWriters.push_back(VF); } + bool HasInlineAsm() const { return InlineAsm; } + VISABuilder *GetCisaBuilder(); + VISABuilder *GetVISAAsmReader(); + void DestroyCISABuilder(); + void DestroyVISAAsmReader(); + LLVMContext &getContext(); + + // Save and retrieve VISAKernels for given function. + void saveVisaKernel(const Function *F, VISAKernel *Kernel) { + assert(VisaKernelMap.count(F) == 0 && "Attempt to save kernel twice"); + VisaKernelMap[F] = Kernel; + } + // Valid only on GenXFinalizer stage until visa builder destructors called. + VISAKernel *getVISAKernel(const Function *F) const { + return VisaKernelMap.at(F); + } + }; + + void initializeGenXModulePass(PassRegistry &); + +} // end namespace llvm +#endif // ndef GENXMODULE_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.cpp new file mode 100644 index 000000000000..f9368a06d411 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.cpp @@ -0,0 +1,392 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +// GenXNumbering is an analysis that provides a numbering of the instructions +// for use by live range segments. See GenXNumbering.h. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_NUMBERING" + +#include "GenXNumbering.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXLiveness.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" + +#include "llvmWrapper/IR/InstrTypes.h" + +using namespace llvm; +using namespace genx; + +char GenXNumbering::ID = 0; +INITIALIZE_PASS_BEGIN(GenXNumbering, "GenXNumbering", "GenXNumbering", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_END(GenXNumbering, "GenXNumbering", "GenXNumbering", false, false) + +FunctionGroupPass *llvm::createGenXNumberingPass() +{ + initializeGenXNumberingPass(*PassRegistry::getPassRegistry()); + return new GenXNumbering(); +} + +void GenXNumbering::getAnalysisUsage(AnalysisUsage &AU) const +{ + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.setPreservesAll(); +} + +/*********************************************************************** + * runOnFunctionGroup : run pass + */ +bool GenXNumbering::runOnFunctionGroup(FunctionGroup &ArgFG) +{ + clear(); + FG = &ArgFG; + Baling = &getAnalysis(); + unsigned Num = 0; + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) + Num = numberInstructionsInFunc(*fgi, Num); + LastNum = Num; + return false; +} + +/*********************************************************************** + * clear : clear the GenXNumbering + */ +void GenXNumbering::clear() +{ + BBNumbers.clear(); + Numbers.clear(); + NumberToPhiIncomingMap.clear(); +} + +/*********************************************************************** + * numberInstructionsInFunc : number the instructions in a function + */ +unsigned GenXNumbering::numberInstructionsInFunc(Function *Func, unsigned Num) +{ + // Number the function, reserving one number for the args. + Numbers[Func] = Num++; + for (Function::iterator fi = Func->begin(), fe = Func->end(); fi != fe; ++fi) { + BasicBlock *Block = &*fi; + // Number the basic block. + auto BBNumber = &BBNumbers[Block]; + BBNumber->Index = BBNumbers.size() - 1; + Numbers[Block] = Num++; + // If this is the first block of a kernel, reserve kernel arg copy slots. + if (Block == &Func->front() && isKernel(Func)) + for (auto ai = Func->arg_begin(), ae = Func->arg_end(); ai != ae; ++ai) + ++Num; + // Iterate the instructions. + Instruction *Inst; + for (BasicBlock::iterator bi = Block->begin(); ; ++bi) { + Inst = &*bi; + if (Inst->isTerminator()) + break; + // For most instructions, reserve one number for any pre-copy that + // coalescing needs to insert, and nothing after. + unsigned PreReserve = 1, PostReserve = 0; + if (auto CI = dyn_cast(Inst)) { + if (!GenXIntrinsic::isAnyNonTrivialIntrinsic(CI) && + !CI->isInlineAsm()) { + // For a non-intrinsic call, reserve enough numbers before the call + // for: + // - a slot for each element of the args, two numbers per element: + // 1. one for the address setup in case it is an address arg added + // by arg indirection (as returned by getArgIndirectionNumber()); + // 2. 
one for a pre-copy inserted if coalescing fails (as returned + // by getArgPreCopyNumber()); + // + // - a similar slot with two numbers for any address arg added by + // arg indirection (also as returned by getArgIndirectionNumber() + // and getArgPreCopyNumber()). + // + // Reserve enough numbers after the call for: + // - post-copies of (elements of) the return value, as returned by + // getRetPostCopyNumber(). + // + // Note that numbers get wasted because most call args do not need + // two slots, and most calls never have address args added by arg + // indirection. But treating all call args the same is easier, and + // wasting numbers does not really matter. + PreReserve = 2 * IndexFlattener::getNumArgElements( + CI->getFunctionType()); + PreReserve += 2 * CI->getNumArgOperands(); // extra for pre-copy addresses of args + unsigned NumRetVals = IndexFlattener::getNumElements(CI->getType()); + PreReserve += NumRetVals; // extra for pre-copy addresses of retvals + PostReserve = NumRetVals; + // Set the start number of the call so users of numbering can work out + // where the pre-copies are assumed to start, even if the call gets + // modified later by GenXArgIndirection. + setStartNumber(CI, Num); + } + } + // Number the instruction, reserving PreReserve. + Num += PreReserve; + Numbers[Inst] = Num; + Num += 1 + PostReserve; + } + // We have reached the terminator instruction but not yet numbered it. + // Reserve a number for each phi node in the successor. If there is + // more than one successor (this is a critical edge), then allow for + // whichever successor has the most phi nodes. + BBNumber->PhiNumber = Num; + auto TI = cast(Block->getTerminator()); + unsigned MaxPhis = 0; + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = TI->getSuccessor(i); + unsigned NumPhis = 0; + for (BasicBlock::iterator sbi = Succ->begin(), sbe = Succ->end(); sbi != sbe; ++sbi) { + if (!isa(&*sbi)) + break; + NumPhis++; + } + if (NumPhis > MaxPhis) + MaxPhis = NumPhis; + } + Num += MaxPhis; + // Now number the terminator instruction. Doing it here ensures that any + // input to the terminator instruction interferes with the results of the + // phi nodes of the successor. + unsigned PreReserve = 1; + if (isa(Inst)) { + // For a return, reserve enough numbers before for pre-copies of + // (elements of) the return value. 
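+      // getRetPreCopyNumber() recovers these slots from the return's own
+      // number, so this reservation and that calculation must stay in sync.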
+ PreReserve = IndexFlattener::getNumElements(Func->getReturnType()); + } + Num += PreReserve; + Numbers[Inst] = Num++; + BBNumber->EndNumber = Num; + } + return Num; +} + +/*********************************************************************** + * getBaleNumber : get instruction number for head of bale, 0 if none + */ +unsigned GenXNumbering::getBaleNumber(Instruction *Inst) +{ + Inst = Baling->getBaleHead(Inst); + return getNumber(Inst); +} + +/*********************************************************************** + * getNumber : get instruction number, or 0 if none + */ +unsigned GenXNumbering::getNumber(Value *V) +{ + auto i = Numbers.find(V), e = Numbers.end(); + if (i == e) + return 0; + return i->second; +} + +/*********************************************************************** + * setNumber : get instruction number + */ +void GenXNumbering::setNumber(Value *V, unsigned Number) +{ + Numbers[V] = Number; +} + +/*********************************************************************** + * getArgIndirectionNumber : get number of arg indirection slot for call arg + * + * Enter: CI = CallInst + * OperandNum = operand (arg) number + * Index = flattened index in the struct + * + * Each flattened index in each call arg has an arg indirection slot before the + * call instruction, where a copy will be inserted if coalescing fails. Each + * slot in fact has two numbers, and this returns the first one. (The second + * one is used for arg pre-copy when coalescing fails.) + */ +unsigned GenXNumbering::getArgIndirectionNumber(CallInst *CI, unsigned OperandNum, + unsigned Index) +{ + auto FT = cast(CI->getFunctionType()); + return getStartNumber(CI) + 2 * (IndexFlattener::flattenArg(FT, OperandNum) + + Index); +} + +/*********************************************************************** + * getKernelArgCopyNumber : get number of kernel arg copy slot + */ +unsigned GenXNumbering::getKernelArgCopyNumber(Argument *Arg) +{ + assert(isKernel(Arg->getParent())); + return Numbers[&Arg->getParent()->front()] + 1 + Arg->getArgNo(); +} + +/*********************************************************************** + * getArgPreCopyNumber : get number of pre-copy slot for call arg + * + * Enter: CI = CallInst + * OperandNum = operand (arg) number + * Index = flattened index in the struct + * + * Each flattened index in each call arg has an arg pre-copy slot before the + * call instruction, where a copy will be inserted if coalescing fails. Each + * slot in fact has two numbers, and this returns the second one. (The first + * one is used for address loading in arg indirection.) + */ +unsigned GenXNumbering::getArgPreCopyNumber(CallInst *CI, unsigned OperandNum, + unsigned Index) +{ + return getArgIndirectionNumber(CI, OperandNum, Index) + 1; +} + +/*********************************************************************** + * getRetPreCopyNumber : get number of pre-copy slot for return value + * + * Enter: RI = ReturnInst + * Index = flattened index in the struct + * + * For each flattened index in the return type, there is one slot before the + * return instruction. 
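+ *
+ * For example, with a return type that flattens to two elements, Index 0 and
+ * Index 1 map to the two numbers immediately before the return's own number.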
+ */ +unsigned GenXNumbering::getRetPreCopyNumber(ReturnInst *RI, unsigned Index) +{ + return getNumber(RI) + - IndexFlattener::getNumElements(RI->getOperand(0)->getType()) + Index; +} + +/*********************************************************************** + * getRetPostCopyNumber : get number of post-copy slot for return value + * + * Enter: CI = CallInst + * Index = flattened index in the struct + * + * For each flattened index in the return type, there is one slot after the call + * instruction. + */ +unsigned GenXNumbering::getRetPostCopyNumber(CallInst *CI, unsigned Index) +{ + return getNumber(CI) + 1 + Index; +} + +/*********************************************************************** + * getPhiNumber : get instruction number for phi node for particular predecessor + * + * The non-const version caches the result in NumberToPhiIncomingMap, for the + * later use of getPhiIncomingFromNumber. + */ +unsigned GenXNumbering::getPhiNumber(PHINode *Phi, BasicBlock *BB) const +{ + // The instruction number is the count of phi nodes before it added to the + // PhiNumber for the predecessor. + return BBNumbers.find(BB)->second.PhiNumber + getPhiOffset(Phi); +} + +unsigned GenXNumbering::getPhiNumber(PHINode *Phi, BasicBlock *BB) +{ + unsigned Number = ((const GenXNumbering *)this)->getPhiNumber(Phi, BB); + NumberToPhiIncomingMap[Number] + = std::pair(Phi, Phi->getBasicBlockIndex(BB)); + return Number; +} + +/*********************************************************************** + * getPhiIncomingFromNumber : get the phi incoming for a number returned from + * getPhiNumber + * + * This returns the phi node and incoming index corresponding to the supplied + * instruction number. + */ +std::pair GenXNumbering::getPhiIncomingFromNumber( + unsigned Number) +{ + auto i = NumberToPhiIncomingMap.find(Number); + if (i == NumberToPhiIncomingMap.end()) + return std::pair(nullptr, 0); + return i->second; +} + +/*********************************************************************** + * getPhiOffset : get phi node offset (the 0 based index within its block) + */ +unsigned GenXNumbering::getPhiOffset(PHINode *Phi) const +{ + // Count phi nodes from start of basic block to here. 
+ unsigned Count = 0; + for (BasicBlock::const_iterator bi = Phi->getParent()->begin(); &*bi != Phi; ++bi) + ++Count; + return Count; +} + +/*********************************************************************** + * dump, print : dump the instruction numbering + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void GenXNumbering::dump() +{ + print(errs()); errs() << '\n'; +} +#endif + +void GenXNumbering::print(raw_ostream &OS) const +{ + OS << "GenXNumbering for FunctionGroup " << FG->getName() << "\n"; + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *Func = *fgi; + if (FG->size() != 1) + OS << Func->getName() << ":\n"; + for (Function::iterator fi = Func->begin(), fe = Func->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + OS << "\n" << Numbers.find(BB)->second << " " << BB->getName() << ":\n"; + for (BasicBlock::iterator bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + if (Numbers.find(Inst) == Numbers.end()) + OS << " - "; + else + OS << Numbers.find(Inst)->second; + OS << " "; + Inst->print(OS); + OS << "\n"; + } + auto TI = cast(BB->getTerminator()); + if (TI->getNumSuccessors()) { + BasicBlock *Succ = TI->getSuccessor(0); + for (BasicBlock::iterator sbi = Succ->begin(), sbe = Succ->end(); sbi != sbe; ++sbi) { + if (PHINode *Phi = dyn_cast(&*sbi)) { + OS << "(" << getPhiNumber(Phi, BB) << ") "; + Phi->print(OS); + OS << "\n"; + } else + break; + } + } + } + } + OS << "\n"; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.h new file mode 100644 index 000000000000..dd92bbaaf24e --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXNumbering.h @@ -0,0 +1,166 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXNumbering +/// ------------- +/// +/// GenXNumbering is an analysis that provides a numbering of the instructions +/// for use by live ranges. +/// +/// The numbering is done such that slots are reserved for where GenXCoalescing +/// might need to insert copies. +/// +/// Generally, an instruction gets a slot in the numbering for itself, and +/// another slot just before, in case it is a two address instruction where +/// GenXCoalescing might want to insert a copy. +/// +/// Every instruction gets a number, even if it is baled in. 
However, for the +/// purposes of live range segments, every instruction in a bale is assumed +/// to have the same number as the head instruction of the bale. +/// +/// A non-intrinsic call has N slots reserved +/// before it for pre-copies, where N is the number of SimpleValues in the +/// (possibly struct) args, allowing for extra args that might be added later by +/// GenXArgIndirection. +/// +/// Similarly, a non-intrinsic call has N slots reserved after it for +/// post-copies, where N is the number of SimpleValues in the (possibly struct) +/// return value. The definition of each SimpleValue in the result of the call +/// is considered to be in its slot, and the corresponding SimpleValue in the +/// unified return value has an extra segment of live range from the call up to +/// that slot. +/// +/// A return instruction in a subroutine has N slots reserved before it for +/// pre-copies, where N is the number of SimpleValues in the (possibly struct) +/// return value. The use of each SimpleValue in the return is considered to be +/// in its slot, and the corresponding SimpleValue in the unified return value +/// has an extra segment of live range from the slot up to the return. +/// +/// A kernel has a slot for each kernel arg copy. A copy is inserted into such a slot in +/// GenXCoalescing if the kernel arg offset is not aligned enough for the uses +/// of the value. +/// +/// **IR restriction**: After this pass, it is very difficult to modify code +/// other than by inserting copies in the reserved slots above, as it would +/// disturb the numbering. +/// +//===----------------------------------------------------------------------===// +#ifndef GENXNUMBERING_H +#define GENXNUMBERING_H + +#include "FunctionGroup.h" +#include "IgnoreRAUWValueMap.h" +#include "llvm/IR/Value.h" + +namespace llvm { + +class CallInst; +class GenXBaling; +class PHINode; +class ReturnInst; + +FunctionGroupPass *createGenXGroupPrinterPass(raw_ostream &O, const std::string &Banner); + +class GenXNumbering : public FunctionGroupPass { + FunctionGroup *FG; + GenXBaling *Baling; + struct BBNumber { + unsigned Index; // 0-based index in list of basic blocks + unsigned PhiNumber; // instruction number of first phi node in successor + unsigned EndNumber; // instruction number of end of block + }; + // BBNumbers : The 0-based number (index) of each basic block. + ValueMap> BBNumbers; + // Numbers : The map of instruction numbers. + ValueMap> Numbers; + // StartNumbers : for a CallInst, the start number of where arg pre-copies + // are considered to be. This is stored, instead of being calculated from + // the CallInst's number, so that a CallInst can change number of args, as + // happens in GenXArgIndirection. + ValueMap> StartNumbers; + // NumberToPhiIncomingMap : map from instruction number to the phi incoming (phi + // node plus incoming index) it represents. We assume that a phi node is + // never deleted after GenXNumbering. + std::map> NumberToPhiIncomingMap; + + // The number for the entire fucntion group. All live ranges are included in + // live-range [0, LastNum]. 
+ unsigned LastNum = 0; + +public: + static char ID; + explicit GenXNumbering() : FunctionGroupPass(ID), Baling(0) { } + ~GenXNumbering() { clear(); } + virtual StringRef getPassName() const { return "GenX numbering"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunctionGroup(FunctionGroup &FG); + // get BBNumber struct for a basic block + const BBNumber *getBBNumber(BasicBlock *BB) { return &BBNumbers[BB]; } + // get and set instruction number + unsigned getBaleNumber(Instruction *Inst); + unsigned getNumber(Value *V); + unsigned getLastNumber() const { return LastNum; } + void setNumber(Value *V, unsigned Number); + // get and set "start instruction number" for a CallInst + unsigned getStartNumber(Value *V) { return StartNumbers[V]; } + void setStartNumber(Value *V, unsigned Number) { StartNumbers[V] = Number; } + // get number for kernel arg copy, arg pre-copy, ret pre-copy and ret post-copy sites + unsigned getArgIndirectionNumber(CallInst *CI, unsigned OperandNum, unsigned Index); + unsigned getKernelArgCopyNumber(Argument *Arg); + unsigned getArgPreCopyNumber(CallInst *CI, unsigned OperandNum, unsigned Index); + unsigned getRetPreCopyNumber(ReturnInst *RI, unsigned Index); + unsigned getRetPostCopyNumber(CallInst *CI, unsigned Index); + // get the number of a phi incoming, where its copy will be inserted + // if necessary + unsigned getPhiNumber(PHINode *Phi, BasicBlock *BB) const; + unsigned getPhiNumber(PHINode *Phi, BasicBlock *BB); + // getPhiIncomingFromNumber : get the phi incoming for a number returned from getPhiNumber + std::pair getPhiIncomingFromNumber(unsigned Number); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, + const std::string &Banner) const { + return createGenXGroupPrinterPass(O, Banner); + } + // Debug dump + void dump(); + using llvm::Pass::print; // enables overloading of print in this class rather + // than override (and stops compiler warnings) + virtual void print(raw_ostream &OS) const; + +private: + void clear(); + unsigned numberInstructionsInFunc(Function *Func, unsigned Num); + unsigned getPhiOffset(PHINode *Phi) const; +}; + +void initializeGenXNumberingPass(PassRegistry &); + +} // end namespace llvm +#endif //ndef GENXNUMBERING_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLInfoExtractor.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLInfoExtractor.cpp new file mode 100644 index 000000000000..446409cf99ee --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLInfoExtractor.cpp @@ -0,0 +1,77 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "GenX.h" +#include "GenXOCLRuntimeInfo.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace llvm { +void initializeGenXOCLInfoExtractorPass(PassRegistry &PR); +} + +class GenXOCLInfoExtractor : public ModulePass { +public: + static char ID; + +private: + std::vector *Dest = nullptr; + +public: + StringRef getPassName() const override { return "GenX OCL Info Extractor"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } + + GenXOCLInfoExtractor() : ModulePass(ID) {} + + GenXOCLInfoExtractor(std::vector &Dst) + : ModulePass(ID), Dest(&Dst) { + initializeGenXOCLInfoExtractorPass(*PassRegistry::getPassRegistry()); + } + + bool runOnModule(Module &M) override { + assert(Dest && "Expected dest to be initialized"); + auto &Info = getAnalysis(); + *Dest = Info.stealCompiledKernels(); + return false; + } +}; + +char GenXOCLInfoExtractor::ID = 0; + +INITIALIZE_PASS_BEGIN(GenXOCLInfoExtractor, "GenXOCLInfoExtractor", + "GenXOCLInfoExtractor", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXOCLRuntimeInfo) +INITIALIZE_PASS_END(GenXOCLInfoExtractor, "GenXOCLInfoExtractor", + "GenXOCLInfoExtractor", false, false) + +ModulePass *llvm::createGenXOCLInfoExtractorPass( + std::vector &Dest) { + return new GenXOCLInfoExtractor(Dest); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.cpp new file mode 100644 index 000000000000..a0f9990d1461 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.cpp @@ -0,0 +1,292 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#include "GenXOCLRuntimeInfo.h" +#include "GenX.h" +#include "GenXSubtarget.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/DataLayout.h" + +#include +#include +#include + +using namespace llvm; + +char GenXOCLRuntimeInfo::ID = 0; + +//===----------------------------------------------------------------------===// +// +// Kernel info implementation. +// +//===----------------------------------------------------------------------===// +// Just perform linear instructions scan to find usage stats. +// Intrinsic set copied from igcmc. +void GenXOCLRuntimeInfo::KernelInfo::setInstructionUsageProperties( + FunctionGroup &FG, const GenXSubtarget &ST) { + for (Function *F : FG) { + for (BasicBlock &BB : *F) { + for (Instruction &I : BB) { + switch (GenXIntrinsic::getGenXIntrinsicID(&I)) { + default: + break; + case GenXIntrinsic::genx_group_id_x: + case GenXIntrinsic::genx_group_id_y: + case GenXIntrinsic::genx_group_id_z: + UsesGroupId = true; + break; + case GenXIntrinsic::genx_barrier: + UsesBarriers = true; + break; + case GenXIntrinsic::genx_ssdp4a: + case GenXIntrinsic::genx_sudp4a: + case GenXIntrinsic::genx_usdp4a: + case GenXIntrinsic::genx_uudp4a: + case GenXIntrinsic::genx_ssdp4a_sat: + case GenXIntrinsic::genx_sudp4a_sat: + case GenXIntrinsic::genx_usdp4a_sat: + case GenXIntrinsic::genx_uudp4a_sat: + break; + case GenXIntrinsic::genx_alloca: + ThreadPrivateMemSize = ST.stackSurfaceMaxSize(); + break; + } + } + } + } +} + +void GenXOCLRuntimeInfo::KernelInfo::setMetadataProperties( + genx::KernelMetadata &KM, const GenXSubtarget &ST) { + Name = KM.getName(); + SLMSize = KM.getSLMSize(); + // will be replaced to metadata usage once + // useGlobalMem option is removed from GenXSubtarget + // FIXME: replace with 8k * simdSize * numDispatchedThreads + if (ST.useGlobalMem()) + StatelessPrivateMemSize = 16 * 8192; + +} + +void GenXOCLRuntimeInfo::KernelInfo::setArgumentProperties( + const Function &Kernel, genx::KernelMetadata &KM) { + assert(Kernel.arg_size() == KM.getNumArgs() && + "Expected same number of arguments"); + // Some arguments are part of thread payload and do not require + // entries in arguments info for OCL runtime. 
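+  // Local IDs and group/local sizes are delivered in the thread payload, so
+  // they are filtered out here and only the remaining arguments are reported.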
+ auto NonPayloadArgs = + make_filter_range(Kernel.args(), [&KM](const Argument &Arg) { + uint32_t ArgKind = KM.getArgKind(Arg.getArgNo()); + genx::KernelArgInfo KAI(ArgKind); + return !(KAI.isLocalIDX() || KAI.isLocalIDY() || KAI.isLocalIDZ() || + KAI.isGroupOrLocalSize() || KAI.isLocalIDs()); + }); + const DataLayout &DL = Kernel.getParent()->getDataLayout(); + transform(NonPayloadArgs, std::back_inserter(ArgInfos), + [&KM, &DL](const Argument &Arg) { + return KernelArgInfo{Arg, KM, DL}; + }); + UsesReadWriteImages = std::any_of( + ArgInfos.begin(), ArgInfos.end(), [](const KernelArgInfo &AI) { + return AI.isImage() && + AI.getAccessKind() == KernelArgInfo::AccessKindType::ReadWrite; + }); +} + +void GenXOCLRuntimeInfo::KernelInfo::setPrintStrings( + const Module &KernelModule) { + const auto *StringsMeta = KernelModule.getNamedMetadata("cm_print_strings"); + if (!StringsMeta) + return; + std::transform(StringsMeta->op_begin(), StringsMeta->op_end(), + std::back_inserter(PrintStrings), [](const auto *StringMeta) { + StringRef Str = + cast(StringMeta->getOperand(0))->getString(); + return std::string{Str.begin(), Str.end()}; + }); +} + +GenXOCLRuntimeInfo::KernelInfo::KernelInfo(FunctionGroup &FG, + const GenXSubtarget &ST) { + setInstructionUsageProperties(FG, ST); + + GRFSizeInBytes = ST.getGRFWidth(); + + genx::KernelMetadata KM{FG.getHead()}; + assert(KM.isKernel() && "Expected kernel as head of function group"); + setMetadataProperties(KM, ST); + setArgumentProperties(*FG.getHead(), KM); + setPrintStrings(*FG.getHead()->getParent()); +} + +//===----------------------------------------------------------------------===// +// +// Kernel argument info implementation. +// +//===----------------------------------------------------------------------===// +// Supported kernel argument attributes. +// Copied from igcmc.h. +struct OCLAttributes { + static constexpr auto ReadOnly = + "read_only"; // This resource is for read only. + static constexpr auto WriteOnly = + "write_only"; // This resource is for write only. + static constexpr auto ReadWrite = + "read_write"; // This resource is for read and write. + static constexpr auto Buffer = "buffer_t"; // This resource is a buffer. + static constexpr auto SVM = "svmptr_t"; // This resource is a SVM buffer. + static constexpr auto Sampler = "sampler_t"; // This resource is a sampler. + static constexpr auto Image1d = "image1d_t"; // This resource is a 1D surface. + static constexpr auto Image1d_buffer = "image1d_buffer_t"; // This resource is a 1D surface. + static constexpr auto Image2d = "image2d_t"; // This resource is a 2D surface. + static constexpr auto Image3d = "image3d_t"; // This resource is a 3D surface. +}; + +using ArgKindType = GenXOCLRuntimeInfo::KernelArgInfo::KindType; + +static auto GetStrPred = [](const char *Attr) { + return [Attr](StringRef Token) { return Token == Attr; }; +}; + +static ArgKindType getOCLArgKind(const SmallVectorImpl &Tokens, + unsigned ArgNo, genx::KernelMetadata &KM) { + unsigned RawKind = KM.getArgKind(ArgNo); + + // Implicit arguments. + genx::KernelArgInfo KAI{RawKind}; + if (KAI.isLocalSize()) + return ArgKindType::LocalSize; + if (KAI.isGroupCount()) + return ArgKindType::GroupCount; + if (KAI.isPrintBuffer()) + return ArgKindType::PrintBuffer; + if (KAI.isPrivateBase()) + return ArgKindType::PrivateBase; + + // Explicit arguments. 
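+  // The explicit kind is derived from the argument's register category plus
+  // its textual attributes: general values may be SVM pointers, surfaces map
+  // to images or buffers depending on the image*_t attribute, and samplers
+  // map directly.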
+ switch (KM.getArgCategory(ArgNo)) { + default: + return ArgKindType::General; + case genx::RegCategory::GENERAL: + if (any_of(Tokens, GetStrPred(OCLAttributes::SVM))) + return ArgKindType::SVM; + return ArgKindType::General; + case genx::RegCategory::SURFACE: + if (any_of(Tokens, GetStrPred(OCLAttributes::Image1d))) + return ArgKindType::Image1D; + if (any_of(Tokens, GetStrPred(OCLAttributes::Image1d_buffer))) + return ArgKindType::Image1D; + if (any_of(Tokens, GetStrPred(OCLAttributes::Image2d))) + return ArgKindType::Image2D; + if (any_of(Tokens, GetStrPred(OCLAttributes::Image3d))) + return ArgKindType::Image3D; + return ArgKindType::Buffer; + case genx::RegCategory::SAMPLER: + return ArgKindType::Sampler; + } +} + +using ArgAccessKindType = GenXOCLRuntimeInfo::KernelArgInfo::AccessKindType; + +static ArgAccessKindType +getOCLArgAccessKind(const SmallVectorImpl &Tokens, + ArgKindType Kind) { + // As in igcmc.cpp. + switch (Kind) { + case ArgKindType::Buffer: + case ArgKindType::Image1D: + case ArgKindType::Image2D: + case ArgKindType::Image3D: + case ArgKindType::SVM: + if (any_of(Tokens, GetStrPred(OCLAttributes::ReadOnly))) + return ArgAccessKindType::ReadOnly; + if (any_of(Tokens, GetStrPred(OCLAttributes::WriteOnly))) + return ArgAccessKindType::WriteOnly; + return ArgAccessKindType::ReadWrite; + default: + return ArgAccessKindType::None; + } +} + +// Initialize Kind and AccessKind from given ArgTypeDesc in metadata. +void GenXOCLRuntimeInfo::KernelArgInfo::translateArgDesc( + genx::KernelMetadata &KM) { + std::string Translated{KM.getArgTypeDesc(Index)}; + // Transform each separator to space. + std::transform(Translated.begin(), Translated.end(), Translated.begin(), + [](char C) { + if (C != '-' && C != '_' && C != '=' && !std::isalnum(C)) + return ' '; + return C; + }); + + // Split and delete duplicates. + SmallVector Tokens; + StringRef(Translated) + .split(Tokens, ' ', -1 /* MaxSplit */, false /* AllowEmpty */); + std::sort(Tokens.begin(), Tokens.end()); + Tokens.erase(std::unique(Tokens.begin(), Tokens.end()), Tokens.end()); + + Kind = getOCLArgKind(Tokens, Index, KM); + AccessKind = getOCLArgAccessKind(Tokens, Kind); +} + +static unsigned getArgSizeInBytes(const Argument &Arg, genx::KernelMetadata &KM, + const DataLayout &DL) { + Type *ArgTy = Arg.getType(); + if (ArgTy->isPointerTy()) + return DL.getPointerTypeSize(ArgTy); + if (KM.isBufferType(Arg.getArgNo())) + return DL.getPointerSize(); + return ArgTy->getPrimitiveSizeInBits() / genx::ByteBits; +} + +GenXOCLRuntimeInfo::KernelArgInfo::KernelArgInfo(const Argument &Arg, + genx::KernelMetadata &KM, + const DataLayout &DL) + : Index(Arg.getArgNo()) { + translateArgDesc(KM); + Offset = KM.getArgOffset(Index); + SizeInBytes = getArgSizeInBytes(Arg, KM, DL); + BTI = KM.getBTI(Index); +} + +//===----------------------------------------------------------------------===// +// +// Compiled kernel implementation. 
+// +//===----------------------------------------------------------------------===// +GenXOCLRuntimeInfo::CompiledKernel::CompiledKernel(KernelInfo &&KI, + const FINALIZER_INFO &JI, + ArrayRef GenBin) + : CompilerInfo(std::move(KI)), JitterInfo(JI), + GenBinary(GenBin.begin(), GenBin.end()) {} + +INITIALIZE_PASS_BEGIN(GenXOCLRuntimeInfo, "GenXOCLRuntimeInfo", + "GenXOCLRuntimeInfo", false, true) +INITIALIZE_PASS_END(GenXOCLRuntimeInfo, "GenXOCLRuntimeInfo", + "GenXOCLRuntimeInfo", false, true) diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.h new file mode 100644 index 000000000000..438a6e3fea02 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXOCLRuntimeInfo.h @@ -0,0 +1,256 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef VCOPT_LIB_GENXCODEGEN_GENXOCLRUNTIMEINFO_H +#define VCOPT_LIB_GENXCODEGEN_GENXOCLRUNTIMEINFO_H + +#include "FunctionGroup.h" +#include "JitterDataStruct.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "llvm/ADT/Optional.h" +#include "llvm/Pass.h" +#include +#include + +namespace llvm { +class Function; +class GenXSubtarget; + +void initializeGenXOCLRuntimeInfoPass(PassRegistry &PR); + +// This is an immutable pass to allow it creation once in the beginning of +// pipeline since creating it before actual place of need (cisa builder) +// will invalidate every other analyses required by builder. 
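+// The pass itself only stores CompiledKernel descriptors: the CISA builder
+// fills them in through saveCompiledKernel(), and GenXOCLInfoExtractor later
+// moves them out with stealCompiledKernels().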
+class GenXOCLRuntimeInfo : public ImmutablePass { +public: + class KernelArgInfo { + public: + enum class KindType { + General, + LocalSize, + GroupCount, + Buffer, + SVM, + Sampler, + Image1D, + Image2D, + Image3D, + PrintBuffer, + PrivateBase + }; + + enum class AccessKindType { None, ReadOnly, WriteOnly, ReadWrite }; + + private: + unsigned Index; + KindType Kind; + AccessKindType AccessKind; + unsigned Offset; + unsigned SizeInBytes; + unsigned BTI; + + private: + void translateArgDesc(genx::KernelMetadata &KM); + + public: + KernelArgInfo(const Argument &Arg, genx::KernelMetadata &KM, + const DataLayout &DL); + + unsigned getIndex() const { return Index; } + KindType getKind() const { return Kind; } + AccessKindType getAccessKind() const { return AccessKind; } + unsigned getOffset() const { return Offset; } + unsigned getSizeInBytes() const { return SizeInBytes; } + unsigned getBTI() const { return BTI; } + + bool isImage() const { + switch (Kind) { + case KindType::Image1D: + case KindType::Image2D: + case KindType::Image3D: + return true; + default: + return false; + } + } + }; + + struct TableInfo { + void *Buffer = nullptr; + unsigned Size = 0; + unsigned Entries = 0; + }; + + // Additional kernel info that are not provided by finalizer + // but still required for runtime. + struct KernelInfo { + private: + std::string Name; + + bool UsesGroupId = false; + + + // Jitter info contains similar field. + // Whom should we believe? + bool UsesBarriers = false; + + bool UsesReadWriteImages = false; + + unsigned SLMSize = 0; + unsigned ThreadPrivateMemSize = 0; + unsigned StatelessPrivateMemSize = 0; + + unsigned GRFSizeInBytes; + + using ArgInfoStorageTy = std::vector; + using PrintStringStorageTy = std::vector; + ArgInfoStorageTy ArgInfos; + PrintStringStorageTy PrintStrings; + + TableInfo ReloTable; + TableInfo SymbolTable; + + private: + void setInstructionUsageProperties(FunctionGroup &FG, + const GenXSubtarget &ST); + void setMetadataProperties(genx::KernelMetadata &KM, + const GenXSubtarget &ST); + void setArgumentProperties(const Function &Kernel, + genx::KernelMetadata &KM); + void setPrintStrings(const Module &KernelModule); + + public: + using arg_iterator = ArgInfoStorageTy::iterator; + using arg_const_iterator = ArgInfoStorageTy::const_iterator; + using arg_size_type = ArgInfoStorageTy::size_type; + + public: + // Creates kernel info for given function group. + KernelInfo(FunctionGroup &FG, const GenXSubtarget &ST); + + const std::string &getName() const { return Name; } + + // These are considered to always be true (at least in igcmc). + // Preserve this here. + bool usesLocalIdX() const { return true; } + bool usesLocalIdY() const { return true; } + bool usesLocalIdZ() const { return true; } + + // Deduced from actual function instructions. + bool usesGroupId() const { return UsesGroupId; } + + // SIMD size is always set by igcmc to one. Preserve this here. + unsigned getSIMDSize() const { return 1; } + unsigned getSLMSize() const { return SLMSize; } + + // Deduced from actual function instructions. + unsigned getTPMSize() const { return ThreadPrivateMemSize; } + unsigned getStatelessPrivMemSize() const { return StatelessPrivateMemSize; } + + unsigned getGRFSizeInBytes() const { return GRFSizeInBytes; } + + + bool usesBarriers() const { return UsesBarriers; } + bool usesReadWriteImages() const { return UsesReadWriteImages; } + + // Arguments accessors. 
+ arg_iterator arg_begin() { return ArgInfos.begin(); } + arg_iterator arg_end() { return ArgInfos.end(); } + arg_const_iterator arg_begin() const { return ArgInfos.begin(); } + arg_const_iterator arg_end() const { return ArgInfos.end(); } + iterator_range args() { return {arg_begin(), arg_end()}; } + iterator_range args() const { + return {arg_begin(), arg_end()}; + } + arg_size_type arg_size() const { return ArgInfos.size(); } + bool arg_empty() const { return ArgInfos.empty(); } + const PrintStringStorageTy &getPrintStrings() const { return PrintStrings; } + TableInfo &getRelocationTable() { return ReloTable; } + const TableInfo &getRelocationTable() const { return ReloTable; } + TableInfo &getSymbolTable() { return SymbolTable; } + const TableInfo &getSymbolTable() const { return SymbolTable; } + }; + + + class CompiledKernel { + KernelInfo CompilerInfo; + FINALIZER_INFO JitterInfo; + std::string GenBinary; + + public: + CompiledKernel(KernelInfo &&KI, const FINALIZER_INFO &JI, + ArrayRef GenBin); + + const KernelInfo &getKernelInfo() const { return CompilerInfo; } + const FINALIZER_INFO &getJitterInfo() const { return JitterInfo; } + const std::string &getGenBinary() const { return GenBinary; } + }; + +public: + using KernelStorageTy = std::vector; + + using kernel_iterator = KernelStorageTy::iterator; + using kernel_const_iterator = KernelStorageTy::const_iterator; + using kernel_size_type = KernelStorageTy::size_type; + +private: + KernelStorageTy Kernels; + +public: + static char ID; + + GenXOCLRuntimeInfo() : ImmutablePass(ID) { + initializeGenXOCLRuntimeInfoPass(*PassRegistry::getPassRegistry()); + } + + // Save kernel info and jit info for given function in this pass. + void saveCompiledKernel(CompiledKernel &&KD) { + Kernels.push_back(std::move(KD)); + } + + // Move compiled kernels out of this pass. + KernelStorageTy stealCompiledKernels() { return std::move(Kernels); } + + // Kernel descriptor accessors. 
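A rough usage sketch, not part of the patch: a producer (the CISA builder) records one CompiledKernel per function group, and a consumer later drains the pass exactly once. The free functions are hypothetical, and the ArrayRef element type is assumed to be char since the stored binary is a std::string.

// Producer side: called once per compiled kernel.
void recordKernel(llvm::GenXOCLRuntimeInfo &RI,
                  llvm::GenXOCLRuntimeInfo::KernelInfo &&KI,
                  const FINALIZER_INFO &JitInfo,
                  llvm::ArrayRef<char> GenBinary) {
  RI.saveCompiledKernel(
      llvm::GenXOCLRuntimeInfo::CompiledKernel(std::move(KI), JitInfo, GenBinary));
}

// Consumer side: takes ownership of everything accumulated so far; the pass
// is left empty afterwards.
void emitProgram(llvm::GenXOCLRuntimeInfo &RI) {
  for (const auto &K : RI.stealCompiledKernels()) {
    (void)K.getKernelInfo();  // per-kernel metadata for the runtime
    (void)K.getJitterInfo();  // finalizer statistics
    (void)K.getGenBinary();   // raw Gen ISA bytes
  }
}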
+ kernel_iterator kernel_begin() { return Kernels.begin(); } + kernel_iterator kernel_end() { return Kernels.end(); } + kernel_const_iterator kernel_begin() const { return Kernels.begin(); } + kernel_const_iterator kernel_end() const { return Kernels.end(); } + iterator_range kernels() { + return {kernel_begin(), kernel_end()}; + } + iterator_range kernels() const { + return {kernel_begin(), kernel_end()}; + } + kernel_size_type kernel_size() const { return Kernels.size(); } + bool kernel_empty() const { return Kernels.empty(); } +}; + +ModulePass *createGenXOCLInfoExtractorPass( + std::vector &Dest); +} // namespace llvm + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPatternMatch.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPatternMatch.cpp new file mode 100644 index 000000000000..9cb4239c2472 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPatternMatch.cpp @@ -0,0 +1,2640 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXPatternMatch +/// ---------------- +/// +/// This pass performs a small number of GenX-specific peephole optimizations. +/// +/// It is named *pattern match* with the idea that it is analogous to the +/// pattern matching pass in IGC. However IGC's pattern matching is more +/// extensive, and I believe some of its functionality is covered by GenXBaling +/// in the GenX backend. +/// +/// * Turns fp and integer mul+add into mad, if it decides it is profitable. +/// +/// For an integer mul+add, the pass looks at the inputs after accounting for +/// extends that will get baled into the operation in the GenX backend, or +/// folded into the instruction in the finalizer, and it uses mad only if both +/// inputs are short or byte. Our experience on HSW was that using int mad +/// where the inputs are actually 32 bit ints is counterproductive because of +/// the way that the finalizer has to implement it using the hardware's 32x16 +/// multiply. +/// +/// However, this criterion could probably be looser on any arch that has a +/// 32x32 multiply (BDW+, but excluding some later LP variants). This is +/// something to investigate. +/// +/// To implement this, the pass would need to use GenXSubtarget, and there +/// would need to be a has32x32Multiply flag in GenXSubtarget. +/// +/// * Turns cmp+sel into min/max if possible. 
+/// +/// * Flips a boolean not if profitable. +/// +/// * Cleanup predicate region reads if possible. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "GENX_PATTERN_MATCH" +#include "GenX.h" +#include "GenXConstants.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "GenXVectorDecomposer.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/TargetFolder.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/GenXIntrinsics/GenXIntrinsicInst.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/KnownBits.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Utils/Local.h" + +#include +#include + +using namespace llvm; +using namespace llvm::PatternMatch; +using namespace genx; + +STATISTIC(NumOfMadMatched, "Number of mad instructions matched"); +STATISTIC(NumOfMinMaxMatched, "Number of min/max instructions matched"); + +static cl::opt EnableMadMatcher("enable-mad", cl::init(true), cl::Hidden, + cl::desc("Enable mad matching.")); + +static cl::opt EnableMinMaxMatcher("enable-minmax", cl::init(true), + cl::Hidden, + cl::desc("Enable min/max matching.")); + +namespace { + +class GenXPatternMatch : public FunctionPass, + public InstVisitor { + DominatorTree *DT = nullptr; + LoopInfo *LI = nullptr; + const DataLayout *DL = nullptr; + const TargetOptions *Options; + // Indicates whether there is any change. + bool Changed = false; + +public: + static char ID; + GenXPatternMatch(const TargetOptions *Options = nullptr) + : FunctionPass(ID), Options(Options) {} + + StringRef getPassName() const override { return "GenX pattern match"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.setPreservesCFG(); + } + + void visitBinaryOperator(BinaryOperator &I); + + void visitCallInst(CallInst &I); + + void visitSelectInst(SelectInst &I); + + void visitFDiv(BinaryOperator &I); + + void visitICmpInst(ICmpInst &I); + + bool runOnFunction(Function &F) override; + + bool isFpMadEnabled() const { + return EnableMadMatcher && + (!Options || Options->AllowFPOpFusion != FPOpFusion::Strict); + } + +private: + // flipBoolNot : flip a (vector) bool not instruction if beneficial + bool flipBoolNot(Instruction *Inst); + // foldBoolAnd : fold a (vector) bool and into sel/wrregion if beneficial + bool foldBoolAnd(Instruction *Inst); + bool simplifyPredRegion(CallInst *Inst); + bool simplifyWrRegion(CallInst *Inst); + bool simplifyRdRegion(CallInst* Inst); + bool simplifyTruncSat(CallInst *Inst); + bool simplifySelect(Function *F); + bool simplifyVolatileGlobals(Function *F); + bool decomposeSelect(Function *F); + // Preprocessing to help generate integer MAD. 
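A condensed sketch, not part of the patch, of the profitability rule the pass header describes for the integer case: a mul feeding an add is a mad candidate only if, after looking through sign/zero extends, both multiply operands are narrower than 32 bits. The function name is hypothetical; the matchers are from llvm::PatternMatch.

#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"

static bool looksLikeNarrowIntegerMad(llvm::Instruction *I) {
  using namespace llvm;
  using namespace llvm::PatternMatch;
  Value *A = nullptr, *B = nullptr, *C = nullptr;
  // Match (A * B) + C with the mul as either operand of the add.
  if (!match(I, m_Add(m_Mul(m_Value(A), m_Value(B)), m_Value(C))) &&
      !match(I, m_Add(m_Value(C), m_Mul(m_Value(A), m_Value(B)))))
    return false;
  // Look through extends that would be baled or folded anyway.
  auto StripExt = [](Value *V) {
    Value *Src = nullptr;
    if (match(V, m_SExt(m_Value(Src))) || match(V, m_ZExt(m_Value(Src))))
      return Src;
    return V;
  };
  A = StripExt(A);
  B = StripExt(B);
  // Profitable only when both multiply inputs are byte or word sized.
  return A->getType()->getScalarSizeInBits() < 32 &&
         B->getType()->getScalarSizeInBits() < 32;
}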
+ bool distributeIntegerMul(Function *F); + bool propagateFoldableRegion(Function *F); + bool reassociateIntegerMad(Function *F); + bool vectorizeConstants(Function *F); + bool placeConstants(Function *F); + bool simplifyCmp(CmpInst *Cmp); + CmpInst *reduceCmpWidth(CmpInst *Cmp); + bool simplifyNullDst(CallInst *Inst); + // Transform logic operation with a mask from to + bool extendMask(BinaryOperator *BO); +}; + +} // namespace + +char GenXPatternMatch::ID = 0; + +namespace llvm { +void initializeGenXPatternMatchPass(PassRegistry &); +} +INITIALIZE_PASS_BEGIN(GenXPatternMatch, "GenXPatternMatch", "GenXPatternMatch", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(GenXPatternMatch, "GenXPatternMatch", "GenXPatternMatch", + false, false) + +FunctionPass *llvm::createGenXPatternMatchPass(const TargetOptions *Options) { + initializeGenXPatternMatchPass(*PassRegistry::getPassRegistry()); + return new GenXPatternMatch(Options); +} + +bool GenXPatternMatch::runOnFunction(Function &F) { + DT = &getAnalysis().getDomTree(); + LI = &getAnalysis().getLoopInfo(); + DL = &F.getParent()->getDataLayout(); + + // Before we get the simd-control-flow representation right, + // we avoid dealing with predicate constants + loadPhiConstants(&F, DT, true); + Changed |= distributeIntegerMul(&F); + Changed |= propagateFoldableRegion(&F); + Changed |= reassociateIntegerMad(&F); + Changed |= placeConstants(&F); + Changed |= vectorizeConstants(&F); + + visit(F); + + Changed |= simplifyVolatileGlobals(&F); + Changed |= simplifySelect(&F); + // Break big predicate variables and run after min/max pattern match. + Changed |= decomposeSelect(&F); + + return Changed; +} + +namespace { + +// Helper class to share common code. +class MadMatcher { +public: + explicit MadMatcher(Instruction *I) + : AInst(I), MInst(nullptr), ID(GenXIntrinsic::not_any_intrinsic), NegIndex(-1) { + assert(I && "null instruction"); + Srcs[0] = Srcs[1] = Srcs[2] = nullptr; + } + + // Match mads with floating point operands. + bool matchFpMad(); + + // Match integer mads that starts with binary operators. + bool matchIntegerMad(); + + // Match integer mads that starts with genx_*add intrinsic calls. + bool matchIntegerMad(unsigned IID); + +private: + // Return true if changes are made. + bool emit(); + + // Check whether it is profitable to emit a mad. + // + // Each mad out of add implies a duplicated mul and jitter usually can not + // remove it in the end. + // + // It is a bit more subtle for the integer case. Since 32 bit mul is not well + // supported in HW, it may lead to worse code if a 32 bit integer mad cannot + // be emitted as mac in the end and mul + mach could be emitted. + bool isProfitable() const; + + // Checks whether a fp mad is being matched or not. + bool isFpMad() const { return ID == Intrinsic::fma; } + + void setMInst(Instruction *I) { MInst = I; } + + // Checks whether 'MInst' is an integer shift, which could be turned back to + // an integer muliplication. + bool isLShift() const { return MInst->getOpcode() == Instruction::Shl; } + + std::tuple getNarrowI16Vector(IRBuilder<> &, Instruction *, + Value *, unsigned) const; + +private: + // The instruction starts the mad matching: + // * fadd/fsub + // * add/sub + // * genx_*add + Instruction *AInst; + + // The instruction being sinked into: + // * fmul + // * mul/shl + // * genx_*mul + Instruction *MInst; + + // The mad intrinsic ID. 
+ unsigned ID; + + // Source operands for the mad intrinsic call, representing mad as + // srcs[0] * srcs[1] + srcs[2]. + Value *Srcs[3]; + + // Indicates whether Srcs[NegIndex] needs to be negated. Value -1 means no + // negation is needed. + int NegIndex; +}; + + + +// Class to identify cases where a comparison and select are equivalent to a +// min or max operation. These are replaced by a min/max intrinsic which allows +// the jitter to produce better code for these cases. +class MinMaxMatcher { +public: + explicit MinMaxMatcher(Instruction *I) + : SelInst(I), CmpInst(nullptr), ID(GenXIntrinsic::not_any_intrinsic) { + assert(I && "null instruction"); + Srcs[0] = Srcs[1] = nullptr; + Annotation = 0; + } + + // Match select instruction that are equivalent to min/max + bool matchMinMax(); + + bool valuesMatch(llvm::Value *Op1, llvm::Value *Op2); + + static bool isEnabled() { return EnableMinMaxMatcher; } + +private: + // Return true if changes are made. + bool emit(); + + void setSelInst(Instruction *I) { SelInst = I; } + +private: + // The select instruction + Instruction *SelInst; + + // The compare instruction + llvm::CmpInst *CmpInst; + + // The min/max intrinsic ID. + unsigned ID; + + // Source operands for the min/max intrinsic call + Value *Srcs[2]; + + // Effective operands for the cmp ignoring some casts + Value *CmpSrcs[2]; + + // Annotation for the min/max call + const char *Annotation; +}; + +} // namespace + +void GenXPatternMatch::visitBinaryOperator(BinaryOperator &I) { + auto P = getAnalysisIfAvailable(); + const GenXSubtarget *ST = P ? P->getSubtarget() : nullptr; + if (isPredNot(&I)) + Changed |= flipBoolNot(&I); + else + switch (I.getOpcode()) { + default: + break; + case Instruction::FAdd: + case Instruction::FSub: + Changed |= isFpMadEnabled() && MadMatcher(&I).matchFpMad(); + break; + case Instruction::Add: + case Instruction::Sub: + if (EnableMadMatcher && MadMatcher(&I).matchIntegerMad()) + Changed = true; + break; + case Instruction::And: + if (I.getType()->getScalarType()->isIntegerTy(1)) { + if (foldBoolAnd(&I)) + Changed = true; + } else if (extendMask(&I)) + Changed = true; + break; + case Instruction::Or: + case Instruction::Xor: + if (extendMask(&I)) + Changed = true; + break; + } +} + +void GenXPatternMatch::visitCallInst(CallInst &I) { + auto P = getAnalysisIfAvailable(); + const GenXSubtarget *ST = P ? 
P->getSubtarget() : nullptr; + switch (unsigned ID = GenXIntrinsic::getGenXIntrinsicID(&I)) { + default: + break; + case GenXIntrinsic::genx_ssadd_sat: + case GenXIntrinsic::genx_suadd_sat: + case GenXIntrinsic::genx_usadd_sat: + case GenXIntrinsic::genx_uuadd_sat: + if (EnableMadMatcher && MadMatcher(&I).matchIntegerMad(ID)) + Changed = true; + break; + case GenXIntrinsic::genx_rdpredregion: + Changed |= simplifyPredRegion(&I); + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + Changed |= simplifyWrRegion(&I); + break; + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + Changed |= simplifyRdRegion(&I); + break; + case GenXIntrinsic::genx_sstrunc_sat: + case GenXIntrinsic::genx_sutrunc_sat: + case GenXIntrinsic::genx_ustrunc_sat: + case GenXIntrinsic::genx_uutrunc_sat: + Changed |= simplifyTruncSat(&I); + break; + case GenXIntrinsic::genx_dword_atomic_add: + case GenXIntrinsic::genx_dword_atomic_and: + case GenXIntrinsic::genx_dword_atomic_cmpxchg: + case GenXIntrinsic::genx_dword_atomic_dec: + case GenXIntrinsic::genx_dword_atomic_fcmpwr: + case GenXIntrinsic::genx_dword_atomic_fmax: + case GenXIntrinsic::genx_dword_atomic_fmin: + case GenXIntrinsic::genx_dword_atomic_imax: + case GenXIntrinsic::genx_dword_atomic_imin: + case GenXIntrinsic::genx_dword_atomic_max: + case GenXIntrinsic::genx_dword_atomic_min: + case GenXIntrinsic::genx_dword_atomic_or: + case GenXIntrinsic::genx_dword_atomic_sub: + case GenXIntrinsic::genx_dword_atomic_xchg: + case GenXIntrinsic::genx_dword_atomic_xor: + Changed |= simplifyNullDst(&I); + break; + } +} + +void GenXPatternMatch::visitICmpInst(ICmpInst &I) { + // Ignore dead comparison. + if (I.use_empty()) + return; + + Value *V0 = nullptr; + Constant *C1 = nullptr; + Constant *C2 = nullptr; + ICmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + + // Transform icmp (V0 & 65535), C2 ==> icmp (trunc V0 to i16), C2. + // TODO: Only consider unsigned comparisons so do not inspect the sign bit. + if (I.isUnsigned() && + match(&I, m_ICmp(Pred, m_OneUse(m_And(m_Value(V0), m_Constant(C1))), + m_Constant(C2))) && + C1->getType()->isVectorTy()) { + Type *Ty = V0->getType(); + if (auto Elt = dyn_cast_or_null(C1->getSplatValue())) { + auto Known = computeKnownBits(C2, *DL); + unsigned NBits = Known.Zero.countLeadingOnes(); + + IRBuilder<> Builder(&I); + uint64_t Int16Mask = std::numeric_limits::max(); + uint64_t Int8Mask = std::numeric_limits::max(); + + // Check if it is safe to truncate to lower type without loss of bits. + Type *DstTy = nullptr; + uint64_t Val = Elt->getZExtValue(); + unsigned NElts = Ty->getVectorNumElements(); + unsigned BitWidth = Elt->getType()->getPrimitiveSizeInBits(); + if (Val == Int16Mask && NBits + 16 >= BitWidth) + DstTy = VectorType::get(Builder.getInt16Ty(), NElts); + else if (Val == Int8Mask && NBits + 8 >= BitWidth) + DstTy = VectorType::get(Builder.getInt8Ty(), NElts); + + // Lower trunc to bitcast followed by a region read + // as such bitcast is not support after IR lowering. + if (DstTy) { + Type *InEltTy = Ty->getVectorElementType(); + Type *OutEltTy = DstTy->getVectorElementType(); + assert(OutEltTy->getPrimitiveSizeInBits()); + unsigned Stride = InEltTy->getPrimitiveSizeInBits() / + OutEltTy->getPrimitiveSizeInBits(); + // Create the new bitcast. + Instruction *BC = CastInst::Create( + Instruction::BitCast, V0, VectorType::get(OutEltTy, Stride * NElts), + ".bc", &I /*InsertBefore*/); + BC->setDebugLoc(I.getDebugLoc()); + + // Create the new rdregion. 
+ Region R(BC); + R.NumElements = NElts; + R.Stride = Stride; + R.Width = NElts; + R.VStride = R.Stride * R.Width; + Value *LHS = R.createRdRegion(BC, "", &I /*InsertBefore*/, + I.getDebugLoc(), false /*AllowScalar*/); + Value *RHS = Builder.CreateTrunc(C2, DstTy); + assert(isa(RHS)); + Value *NewICmp = Builder.CreateICmp(Pred, LHS, RHS); + if (auto Inst = dyn_cast(NewICmp)) + Inst->setDebugLoc(I.getDebugLoc()); + I.replaceAllUsesWith(NewICmp); + Changed = true; + } + } + } + + // Explore (icmp.ne V0, 0) where V0 is promoted from i1. + if (match(&I, m_ICmp(Pred, m_Value(V0), m_Zero())) && + Pred == CmpInst::ICMP_NE) { + // V0 is calculated from AND, OR, NOT, and (select (cmp ...), 0, 1) + SmallVector WorkList; + SmallVector PreOrder; + bool Profitable = true; + WorkList.push_back(V0); + while (!WorkList.empty()) { + Value *V = WorkList.pop_back_val(); + Value *LHS = nullptr, *RHS = nullptr; + if (match(V, m_OneUse(m_Or(m_Value(LHS), m_Value(RHS))))) { + WorkList.push_back(LHS); + WorkList.push_back(RHS); + PreOrder.push_back(V); + continue; + } + if (match(V, m_OneUse(m_And(m_Value(LHS), m_Value(RHS))))) { + WorkList.push_back(LHS); + WorkList.push_back(RHS); + PreOrder.push_back(V); + continue; + } + if (match(V, m_OneUse(m_Not(m_Value(LHS))))) { + WorkList.push_back(LHS); + PreOrder.push_back(V); + continue; + } + Value *Cond = nullptr; + if (match(V, m_OneUse(m_Select(m_Value(Cond), m_One(), m_Zero())))) { + PreOrder.push_back(Cond); + continue; + } + Profitable = false; + break; + } + if (Profitable) { + IRBuilder<> Builder(&I); + // For simplicity, a stack is used to reconstruct tree. With a next + // pointer, that stack is not necessary. + SmallVector OpStack; + while (!PreOrder.empty()) { + Value *V = PreOrder.pop_back_val(); + if (V->getType()->getScalarType()->isIntegerTy(1)) { + OpStack.push_back(V); + continue; + } + Value *LHS, *RHS; + if (match(V, m_Or(m_Value(LHS), m_Value(RHS)))) { + assert(OpStack.size() >= 2); + RHS = OpStack.pop_back_val(); + LHS = OpStack.pop_back_val(); + OpStack.push_back(Builder.CreateOr(LHS, RHS)); + continue; + } + if (match(V, m_And(m_Value(LHS), m_Value(RHS)))) { + assert(OpStack.size() >= 2); + RHS = OpStack.pop_back_val(); + LHS = OpStack.pop_back_val(); + OpStack.push_back(Builder.CreateAnd(LHS, RHS)); + continue; + } + if (match(V, m_Not(m_Value(LHS)))) { + assert(OpStack.size() >= 1); + LHS = OpStack.pop_back_val(); + OpStack.push_back(Builder.CreateNot(LHS)); + } + assert(false && "Unhandled logic op!"); + } + assert(OpStack.size() == 1); + I.replaceAllUsesWith(OpStack.pop_back_val()); + Changed = true; + return; + } + } + + // Skip the following optimization specific to scalar comparison. + if (!I.getType()->isIntegerTy(1)) + return; + + // Transform the evaluation of flag == 0 into (~flag).all(). + // TODO: Transform flag != 0 into flag.any(). + if (match(&I, m_ICmp(Pred, m_OneUse(m_BitCast(m_OneUse(m_Value(V0)))), + m_Zero())) && + Pred == CmpInst::ICMP_EQ && isa(V0) && + V0->getType()->isVectorTy() && + V0->getType()->getScalarType()->isIntegerTy(1)) { + VectorType *VTy = cast(V0->getType()); + unsigned NumElts = VTy->getNumElements(); + if (NumElts == 2 || NumElts == 4 || NumElts == 8 || NumElts == 16) { + IRBuilder<> Builder(&I); + auto Cmp = cast(V0); + // Inverse the evaluation of flag. + Cmp->setPredicate(Cmp->getInversePredicate()); + if (auto NewCmp = reduceCmpWidth(Cmp)) { + // Once the cmp could be reduced into narrower one (with the assumption + // that the reduced part is always TRUE), reduce it into narrow one. 
+ Cmp = NewCmp; + VTy = cast(Cmp->getType()); + } + simplifyCmp(Cmp); + // Call 'all'. + auto M = I.getParent()->getParent()->getParent(); + auto Fn = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_all, VTy); + auto NewVal = Builder.CreateCall(Fn, Cmp); + I.replaceAllUsesWith(NewVal); + Changed = true; + return; + } + } +} + +// Simplify the sequence of (cmp.eq (and (wrregion zero v), 1), 0) to +// (cmp.eq (and v, 1), 0) with a narrow vector length with the assumption that +// the reduced part will be always TRUE. +CmpInst *GenXPatternMatch::reduceCmpWidth(CmpInst *Cmp) { + ICmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE; + Value *V0 = nullptr; + if (!Cmp->hasOneUse() || !Cmp->getType()->isVectorTy() || + !match(Cmp, m_ICmp(Pred, m_And(m_Value(V0), m_One()), m_Zero())) || + Pred != CmpInst::ICMP_EQ || !GenXIntrinsic::isWrRegion(V0)) + return nullptr; + + GenXIntrinsicInst *WII = cast(V0); + if (!match(WII->getOperand(0), m_Zero())) + return nullptr; + + V0 = WII->getOperand(1); + VectorType *VTy = cast(V0->getType()); + unsigned NumElts = VTy->getNumElements(); + + Region R(WII, BaleInfo()); + if (R.Indirect || R.Offset || R.VStride || R.Stride != 1 || + R.Width != NumElts) + return nullptr; + if (R.Width != 2 && R.Width != 4 && R.Width != 8 && R.Width != 16) + return nullptr; + + // As the rest parts of the original vector are all zeros, the sequence could + // be reduced into a narrower one (R.Width) and skip the wrregion. + IRBuilder<> Builder(Cmp); + + auto One = ConstantInt::get(VTy, 1); + auto Zero = Constant::getNullValue(VTy); + + auto V1 = Builder.CreateAnd(V0, One); + auto V2 = Builder.CreateICmp(Pred, V1, Zero); + + return cast(V2); +} + +// Simplify the sequence of (cmp (and (select (cmp ...) 1, 0), 1), 0) +bool GenXPatternMatch::simplifyCmp(CmpInst *Cmp) { + ICmpInst::Predicate P0 = ICmpInst::BAD_ICMP_PREDICATE; + ICmpInst::Predicate P1 = ICmpInst::BAD_ICMP_PREDICATE; + Value *LHS = nullptr; + Value *RHS = nullptr; + if (!match(Cmp, m_ICmp(P0, + m_And(m_Select(m_ICmp(P1, m_Value(LHS), m_Value(RHS)), + m_One(), m_Zero()), + m_One()), + m_Zero()))) + return false; + if (P0 != ICmpInst::ICMP_EQ && P0 != ICmpInst::ICMP_NE) + return false; + if (P0 == ICmpInst::ICMP_EQ) + P1 = ICmpInst::getInversePredicate(P1); + Cmp->setPredicate(P1); + Cmp->setOperand(0, LHS); + Cmp->setOperand(1, RHS); + return true; +} + +/*********************************************************************** + * notHasRealUse : detect whether an instruction has a use that counts as + * a "real" use of a bool not, that is one where it would need to be + * calculated rather than just baled in + */ +static bool notHasRealUse(Instruction *Inst) { + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (isPredNot(user)) + continue; + if (isa(user)) + continue; + if (user->use_empty()) + continue; // ignore dead instruction + switch (GenXIntrinsic::getGenXIntrinsicID(user)) { + case GenXIntrinsic::genx_any: + case GenXIntrinsic::genx_all: + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + continue; + default: + return true; + } + } + return false; +} + +/*********************************************************************** + * GenXPatternMatch::flipBoolNot : attempt to flip (vector) bool not + * + * A vector bool not is bad if its value actually needs to be calculated, + * as opposed to just baling it into a predicate field. 
In gen code, + * calculating it involves using a sel to get it into a GRF, then doing + * an xor that sets flags. Here we call any use that requires it to be + * calculated a "real" use. + * + * This code detects the case that: + * 1. the not has at least one "real" use + * 2. the input to the not is the result of a cmp and does not have any + * "real" use. + * If these conditions hold, then we flip the not by inverting the + * cmp and replacing uses of the not with the new inverted cmp. If the + * original cmp has any uses other than the original not, then we create + * a new not and change uses to that. + * + * In this way we save an actual calculation of the original not. + * + * We only do this for a v16i1 or smaller. + */ +bool GenXPatternMatch::flipBoolNot(Instruction *Inst) { + if (Inst->getType()->getPrimitiveSizeInBits() > 16) + return false; // too big + auto Input = dyn_cast(Inst->getOperand(0)); + if (!Input) + return false; // input not cmp + if (!notHasRealUse(Inst)) + return false; // result of not has no "real" use + if (notHasRealUse(Input)) + return false; // input has a "real" use, so we don't want to flip + // We want to flip the not by inverting the comparison that generates its + // input. + auto NewCmp = CmpInst::Create( + Input->getOpcode(), Input->getInversePredicate(), Input->getOperand(0), + Input->getOperand(1), Input->getName() + ".inverted", Input); + NewCmp->setDebugLoc(Input->getDebugLoc()); + Inst->replaceAllUsesWith(NewCmp); + if (!Input->use_empty()) { + auto NewNot = BinaryOperator::Create( + Instruction::Xor, NewCmp, Constant::getAllOnesValue(NewCmp->getType()), + "", Input); + NewNot->setDebugLoc(Input->getDebugLoc()); + NewNot->takeName(Inst); + Input->replaceAllUsesWith(NewNot); + } + return true; +} + +/*********************************************************************** + * foldBoolAnd : fold a (vector) bool and into sel/wrregion if beneficial + * + * A bool and takes a sequence of 3 gen instructions. Here we detect if + * a bool and has a single use in a select or wrregion, and if so we fold + * it in to have two selects or rdregion, select, wrregion respectively. + * + * We only do this for a v16i1 or smaller. + */ +bool GenXPatternMatch::foldBoolAnd(Instruction *Inst) { + if (Inst->getType()->getPrimitiveSizeInBits() > 16) + return false; // too big + if (!isa(Inst->getType())) + return false; // too small + if (!Inst->hasOneUse()) + return false; // more than one use + auto user = cast(Inst->use_begin()->getUser()); + if (auto Sel = dyn_cast(user)) { + // Fold and into sel. + auto NewSel1 = SelectInst::Create(Inst->getOperand(0), Sel->getOperand(1), + Sel->getOperand(2), + Sel->getName() + ".foldand", Sel); + NewSel1->setDebugLoc(Sel->getDebugLoc()); + auto NewSel2 = SelectInst::Create(Inst->getOperand(1), NewSel1, + Sel->getOperand(2), "", Sel); + NewSel2->takeName(Sel); + NewSel2->setDebugLoc(Sel->getDebugLoc()); + Sel->replaceAllUsesWith(NewSel2); + return true; + } + if (!GenXIntrinsic::isWrRegion(user)) + return false; + // Fold and into wrregion, giving rdregion, select and wrregion, as long + // as the original wrregion is not indirect. 
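A plain C++ sanity check, illustrative only and not part of the patch: the two-select rewrite performed above relies on the identity sel(a && b, x, y) == sel(b, sel(a, x, y), y), which holds for every combination of booleans a and b.

#include <cassert>

static void checkFoldBoolAndIdentity(int X, int Y) {
  for (bool A : {false, true})
    for (bool B : {false, true}) {
      int Folded = (A && B) ? X : Y; // original: select on (and a, b)
      int Sel1 = A ? X : Y;          // NewSel1
      int Sel2 = B ? Sel1 : Y;       // NewSel2 replaces the original select
      assert(Folded == Sel2);
    }
}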
+ Region R(user, BaleInfo()); + if (R.Indirect) + return false; + auto NewRdRegion = + R.createRdRegion(user->getOperand(0), user->getName() + ".foldand1", user, + user->getDebugLoc(), false); + auto NewSel = + SelectInst::Create(Inst->getOperand(0), user->getOperand(1), NewRdRegion, + user->getName() + ".foldand2", user); + NewSel->setDebugLoc(user->getDebugLoc()); + R.Mask = Inst->getOperand(1); + auto NewWrRegion = cast(R.createWrRegion( + user->getOperand(0), NewSel, "", user, user->getDebugLoc())); + NewWrRegion->takeName(user); + user->replaceAllUsesWith(NewWrRegion); + return true; +} + +void GenXPatternMatch::visitSelectInst(SelectInst &I) { + Changed |= MinMaxMatcher::isEnabled() && MinMaxMatcher(&I).matchMinMax(); +} + +// Trace the def-use chain and return the first non up-cast related value. +static Value *getEffectiveValueUp(Value *V) { + if (isa(V) || isa(V) || isa(V)) + return getEffectiveValueUp(cast(V)->getOperand(0)); + + return V; +} + +// Determine whether it is profitable to match a mad. This function assumes +// that it is valid to match. +bool MadMatcher::isProfitable() const { + // Do not match unused instructions. + if (AInst->use_empty()) + return false; + + // For the following case, + // %m = mul %a, %b + // %a1 = add %m, %c1 + // %a2 = add %m, %c2 + // + // If we match them into two mads as + // + // %m1 = mad(%a, %b, %c1) + // %m2 = mad(%a, %b, %c2) + // + // and it fails to emit two mac/mads then there are redundant instructions in + // the end. Conservatively, only match when there is a single use for MInst. + // + // Update: There are enough cases where this transformation helps spilling + // (particularly for long sequences) that mean it is of more value to enable + // multiple use cases. May need to revisit. if (!MInst->hasOneUse()) + // return false; + + // Do not match x * y +/- 0.0f + // FIXME: specify fp mode. ICL certainly is not strict in general. + if (Constant *C = dyn_cast(Srcs[2])) + if (C->isZeroValue()) + return false; + + // Ignores upward or bit casts, which usually will be performed by copy + // propagation within jitter. + Value *Vals[] = {getEffectiveValueUp(Srcs[0]), getEffectiveValueUp(Srcs[1]), + getEffectiveValueUp(Srcs[2])}; + + auto isIndirectRdRegion = [](Value *V) -> bool { + if (!GenXIntrinsic::isRdRegion(V)) + return false; + Region R(cast(V), BaleInfo()); + return R.Indirect; + }; + + auto isIndirectWrRegion = [](User *U) -> bool { + if (!GenXIntrinsic::isWrRegion(U)) + return false; + Region R(cast(U), BaleInfo()); + return R.Indirect; + }; + + // If the result of this mad used solely in an indirect + // region write, count it as an indirect access. + bool IsIndirectDst = false; + if (AInst->hasOneUse()) { + User *U = AInst->use_begin()->getUser(); + IsIndirectDst = isIndirectWrRegion(U); + } + + if (isFpMad()) { + // Agressive on floating point types since there are fewer constraints, + // considering up to one indirect region access to be worthwhile. + // For non-FP mads, any indirect region accesses make it not worth + // bothering. + unsigned IndirectCount = 0; + if (isIndirectRdRegion(Vals[0])) + IndirectCount++; + if (isIndirectRdRegion(Vals[1])) + IndirectCount++; + if (isIndirectRdRegion(Vals[2])) + IndirectCount++; + if (IsIndirectDst) + IndirectCount++; + return IndirectCount <= 1; + } + + if (IsIndirectDst || isIndirectRdRegion(Vals[2]) || + (isIndirectRdRegion(Vals[0]) && isIndirectRdRegion(Vals[1]))) + // For integer mad, we only support indirect access on one of + // multiplicative operands. 
+ return false; + + // This is an integer mad. + // Do not match constant add. I was getting bad results from allowing this, + // although it may have been largely from scalar address computations. + if (isa(Srcs[2])) + return false; + + // Do not match unless both of multiplicants are of type *B/*W. + bool IsProfitable = true; + + auto Checker = [](Value *V) -> bool { + // TODO, handle constants more accurately. + if (isa(V)) + return true; + const unsigned DWordSizeInBits = 32; + return (V->getType()->getScalarSizeInBits() < DWordSizeInBits); + }; + + auto HasKnownShAmtLT16 = [](Value *V) -> bool { + ConstantInt *C = dyn_cast(V); + if (!C) { + if (!isa(V)) + return false; + C = dyn_cast(cast(V)->getSplatValue()); + if (!C) + return false; + } + return C->getValue().ult(16); + }; + + IsProfitable = Checker(Vals[0]); + if (!IsProfitable) + return false; + + IsProfitable = isLShift() ? HasKnownShAmtLT16(Vals[1]) : Checker(Vals[1]); + if (!IsProfitable) + return false; + + // Safety check on indirect access if any. + GenXIntrinsicInst *RII = nullptr; + if (isIndirectRdRegion(Vals[0])) + RII = cast(Vals[0]); + else if (isIndirectRdRegion(Vals[1])) + RII = cast(Vals[1]); + + // Always profitable if there's no indirect access. + if (!RII) + return true; + // Assume not profitable if the indirect access is defined in another BB to + // avoid expensive alias analysis. + if (RII->getParent() != AInst->getParent()) + return false; + + return IsProfitable; +} + +static Value *getBroadcastFromScalar(Value *V) { + VectorType *VTy = dyn_cast(V->getType()); + // Skip if it's not vector type. + if (!VTy) + return nullptr; + // Skip if it's not from rdregion. + if (!GenXIntrinsic::isRdRegion(V)) + return nullptr; + GenXIntrinsicInst *RII = cast(V); + Region R(RII, BaleInfo()); + if (!R.isScalar() || R.Width != 1 || R.Offset != 0) + return nullptr; + Value *Src = RII->getArgOperand(0); + auto *BC = dyn_cast(Src); + if (!BC) + return nullptr; + VTy = dyn_cast(BC->getType()); + if (!VTy || VTy->getNumElements() != 1 || + VTy->getScalarType() != BC->getOperand(0)->getType()) + return nullptr; + return BC->getOperand(0); +} + +class FAddOperator + : public ConcreteOperator {}; + +class FSubOperator + : public ConcreteOperator {}; + +class FMulOperator + : public ConcreteOperator {}; + +class ExtOperator : public Operator { +public: + static bool isExtOpcode(unsigned Opc) { + return Opc == Instruction::SExt || Opc == Instruction::ZExt; + } + static inline bool classof(const Instruction *I) { + return isExtOpcode(I->getOpcode()); + } + static inline bool classof(const ConstantExpr *CE) { + return isExtOpcode(CE->getOpcode()); + } + static inline bool classof(const Value *V) { + return (isa(V) && classof(cast(V))) || + (isa(V) && classof(cast(V))); + } +}; + +class MulLikeOperator : public Operator { +public: + static bool isMulLikeOpcode(unsigned Opc) { + return Opc == Instruction::Mul || Opc == Instruction::Shl; + } + static inline bool classof(const Instruction *I) { + return isMulLikeOpcode(I->getOpcode()); + } + static inline bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } +}; + +std::tuple +MadMatcher::getNarrowI16Vector(IRBuilder<> &Builder, Instruction *AInst, + Value *V, unsigned NumElts) const { + assert(V->getType()->getScalarType()->isIntegerTy(32) && "I32 is expected!"); + if (auto Ext = dyn_cast(V)) { + V = Ext->getOperand(0); + if (V->getType()->getScalarType()->isIntegerTy(8)) { + Type *DstTy = Builder.getInt16Ty(); + if (auto VTy = dyn_cast(V->getType())) + DstTy = 
VectorType::get(DstTy, VTy->getNumElements()); + // Extend to i16 first. + V = Builder.CreateCast(Instruction::CastOps(Ext->getOpcode()), V, DstTy); + } + if (!V->getType()->isVectorTy()) { + // Broadcast through rdregion. + Type *NewTy = VectorType::get(V->getType(), 1); + V = Builder.CreateBitCast(V, NewTy); + Region R(V); + R.Offset = 0; + R.Width = 1; + R.Stride = R.VStride = 0; + R.NumElements = NumElts; + V = R.createRdRegion(V, ".splat", AInst, AInst->getDebugLoc()); + } + return std::make_tuple(V, Ext->getOpcode() == Instruction::SExt); + } + if (auto CI = dyn_cast(V)) { + const APInt &Val = CI->getValue(); + if (Val.isIntN(16)) { + V = ConstantVector::getSplat(NumElts, + Builder.getIntN(16, Val.getZExtValue())); + return std::make_tuple(V, Val.isSignedIntN(16)); + } + } + return std::make_tuple(nullptr, false); +} + +// The floating point case is relatively simple. Only need to match with fmul. +bool MadMatcher::matchFpMad() { + assert(AInst->getOpcode() == Instruction::FAdd || + AInst->getOpcode() == Instruction::FSub); + Value *Ops[2] = {AInst->getOperand(0), AInst->getOperand(1)}; + + for (unsigned Idx = 0; Idx != 2; ++Idx) { + Value *Op0 = Ops[Idx]; + Value *Op1 = Ops[1 - Idx]; + if (BinaryOperator *BO = dyn_cast(Op0)) { + // Case +/-(X * Y) +/- Z + if (BO->getOpcode() == Instruction::FMul) { + Srcs[0] = BO->getOperand(0); + Srcs[1] = BO->getOperand(1); + Srcs[2] = Op1; + + setMInst(BO); + if (AInst->getOpcode() == Instruction::FSub) + NegIndex = 2 - Idx; + break; + } + } + if (!MInst) { + if (BinaryOperator *BO = dyn_cast(Op1)) { + // Case Z +/- X * Y + if (BO->getOpcode() == Instruction::FMul) { + Srcs[0] = BO->getOperand(0); + Srcs[1] = BO->getOperand(1); + Srcs[2] = Op0; + + setMInst(BO); + if (AInst->getOpcode() == Instruction::FSub) + NegIndex = 1; + break; + } + } + } + } + + // No genx intrinsic mad for the fp case. + ID = Intrinsic::fma; + + // Emit mad if matched and profitable. + return emit(); +} + +bool MadMatcher::matchIntegerMad() { + assert(AInst->getOpcode() == Instruction::Add || + AInst->getOpcode() == Instruction::Sub); + Value *Ops[2] = {AInst->getOperand(0), AInst->getOperand(1)}; + + if (auto BI = dyn_cast(Ops[0])) { + // Case X * Y +/- Z + Srcs[2] = Ops[1]; + Srcs[1] = BI->getOperand(1); + Srcs[0] = BI->getOperand(0); + setMInst(cast(BI)); + if (isProfitable()) { + if (AInst->getOpcode() == Instruction::Sub) + NegIndex = 2; + } else + setMInst(nullptr); + } + + if (!MInst) { + if (auto BI = dyn_cast(Ops[1])) { + // Case Z +/- X * Y + Srcs[2] = Ops[0]; + Srcs[1] = BI->getOperand(1); + Srcs[0] = BI->getOperand(0); + setMInst(cast(BI)); + if (isProfitable()) { + if (AInst->getOpcode() == Instruction::Sub) + NegIndex = 1; + } else + setMInst(nullptr); + } + } + + if (!MInst) { // Check if operand 0 is broadcasted from scalar. + if (auto S = getBroadcastFromScalar(Ops[0])) { + if (auto BI = dyn_cast(S)) { + // Case X * Y +/- Z + Srcs[2] = Ops[1]; + Srcs[1] = BI->getOperand(1); + Srcs[0] = BI->getOperand(0); + setMInst(cast(BI)); + if (isProfitable()) { + if (AInst->getOpcode() == Instruction::Sub) + NegIndex = 2; + } else + setMInst(nullptr); + } + } + } + + if (!MInst) { // Check if operand 1 is broadcasted from scalar. 
+ if (auto S = getBroadcastFromScalar(Ops[1])) { + if (auto BI = dyn_cast(S)) { + // Case X * Y +/- Z + Srcs[2] = Ops[0]; + Srcs[1] = BI->getOperand(1); + Srcs[0] = BI->getOperand(0); + setMInst(cast(BI)); + if (isProfitable()) { + if (AInst->getOpcode() == Instruction::Sub) + NegIndex = 1; + } else + setMInst(nullptr); + } + } + } + + // Always use ssmad. + ID = GenXIntrinsic::genx_ssmad; + + // Emit mad if matched and profitable. + return emit(); +} + +bool MadMatcher::matchIntegerMad(unsigned IID) { + assert((GenXIntrinsic::getAnyIntrinsicID(AInst) == IID) && "input out of sync"); + Value *Ops[2] = {AInst->getOperand(0), AInst->getOperand(1)}; + + // TODO: handle cases like: cm_add(cm_mul(u, v), w). + if (BinaryOperator *BI = dyn_cast(Ops[0])) { + if (BI->getOpcode() == Instruction::Mul || + BI->getOpcode() == Instruction::Shl) { + // Case X * Y +/- Z + Srcs[2] = Ops[1]; + Srcs[1] = BI->getOperand(1); + Srcs[0] = BI->getOperand(0); + setMInst(BI); + if (!isProfitable()) + setMInst(nullptr); + } + } + if (!MInst) { + if (BinaryOperator *BI = dyn_cast(Ops[1])) { + // Case Z +/- X * Y + if (BI->getOpcode() == Instruction::Mul || + BI->getOpcode() == Instruction::Shl) { + Srcs[2] = Ops[0]; + Srcs[1] = BI->getOperand(1); + Srcs[0] = BI->getOperand(0); + setMInst(BI); + if (!isProfitable()) + setMInst(nullptr); + } + } + } + + switch (IID) { + default: + llvm_unreachable("unexpected intrinsic ID"); + case GenXIntrinsic::genx_ssadd_sat: + ID = GenXIntrinsic::genx_ssmad_sat; + break; + case GenXIntrinsic::genx_suadd_sat: + ID = GenXIntrinsic::genx_sumad_sat; + break; + case GenXIntrinsic::genx_usadd_sat: + ID = GenXIntrinsic::genx_usmad_sat; + break; + case GenXIntrinsic::genx_uuadd_sat: + ID = GenXIntrinsic::genx_uumad_sat; + break; + } + + // Emit mad if matched and profitable. + return emit(); +} + +bool MadMatcher::emit() { + if (MInst == nullptr || !isProfitable()) + return false; + + IRBuilder<> Builder(AInst); + + VectorType *VTy = dyn_cast(Srcs[2]->getType()); + if (!isFpMad() && VTy && VTy->getScalarType()->isIntegerTy(32)) { + Value *V = getBroadcastFromScalar(Srcs[2]); + if (!V) + V = Srcs[2]; + auto BO = dyn_cast(V); + if (BO && BO->getOpcode() == Instruction::Mul) { + // If both operands could be reduced to narrow integer types, use 'mul' + // intrinsic. + Value *V0 = nullptr, *V1 = nullptr; + bool S0 = false, S1 = false; + std::tie(V0, S0) = getNarrowI16Vector(Builder, AInst, BO->getOperand(0), + VTy->getNumElements()); + std::tie(V1, S1) = getNarrowI16Vector(Builder, AInst, BO->getOperand(1), + VTy->getNumElements()); + if (V0 && V1) { + GenXIntrinsic::ID IID = + S0 ? (S1 ? GenXIntrinsic::genx_ssmul : GenXIntrinsic::genx_sumul) + : (S1 ? 
GenXIntrinsic::genx_usmul : GenXIntrinsic::genx_uumul); + Module *M = AInst->getParent()->getParent()->getParent(); + Type *Tys[2] = {VTy, V0->getType()}; + Function *Fn = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + Value *Vals[2] = {V0, V1}; + CallInst *CI = Builder.CreateCall(Fn, Vals, "mul"); + Srcs[2] = CI; + } + } + } + + Value *Vals[3] = {Srcs[0], Srcs[1], Srcs[2]}; + + if (isa(AInst)) { + ExtOperator *E0 = dyn_cast(Vals[0]); + ExtOperator *E1 = dyn_cast(Vals[1]); + if (E0 && E1 && + E0->getOperand(0)->getType() == E1->getOperand(0)->getType()) { + if (E0->getOpcode() == Instruction::SExt) { + if (E1->getOpcode() == Instruction::SExt) + ID = GenXIntrinsic::genx_ssmad; + else + ID = GenXIntrinsic::genx_sumad; + } else { + if (E1->getOpcode() == Instruction::SExt) + ID = GenXIntrinsic::genx_usmad; + else + ID = GenXIntrinsic::genx_uumad; + } + Vals[0] = E0->getOperand(0); + Vals[1] = E1->getOperand(0); + } + } + + if (auto VTy = dyn_cast(Vals[2]->getType())) { + // Splat scalar sources if necessary. + for (unsigned i = 0; i != 2; ++i) { + Value *V = Vals[i]; + if (V->getType()->isVectorTy()) + continue; + if (auto C = dyn_cast(V)) { + Vals[i] = ConstantVector::getSplat(VTy->getNumElements(), C); + continue; + } + auto Ext = dyn_cast(V); + if (Ext) + V = Ext->getOperand(0); + Type *NewTy = VectorType::get(V->getType(), 1); + V = Builder.CreateBitCast(V, NewTy); + // Broadcast through rdregin. + Region R(V); + R.Offset = 0; + R.Width = 1; + R.Stride = R.VStride = 0; + R.NumElements = VTy->getNumElements(); + V = R.createRdRegion(V, ".splat", AInst, AInst->getDebugLoc()); + if (Ext) + V = Builder.CreateCast(Instruction::CastOps(Ext->getOpcode()), V, VTy); + Vals[i] = V; + } + } + + if (isLShift()) { + Type *Ty = Vals[0]->getType(); + Constant *Base = ConstantInt::get(Ty->getScalarType(), 1); + if (Ty->isVectorTy()) + Base = ConstantVector::getSplat(Ty->getVectorNumElements(), Base); + Vals[1] = Builder.CreateShl(Base, Vals[1]); + } + + // Perform source operand negation if necessary. + if (NegIndex >= 0) { + if (AInst->getType()->isFPOrFPVectorTy()) + Vals[NegIndex] = Builder.CreateFNeg(Vals[NegIndex], "fneg"); + else + Vals[NegIndex] = Builder.CreateNeg(Vals[NegIndex], "neg"); + } + + Function *Fn = nullptr; + { + Module *M = AInst->getParent()->getParent()->getParent(); + if (AInst->getType()->isFPOrFPVectorTy()) + Fn = GenXIntrinsic::getAnyDeclaration(M, ID, AInst->getType()); + else { + Type *Tys[2] = {AInst->getType(), Vals[0]->getType()}; + Fn = GenXIntrinsic::getAnyDeclaration(M, ID, Tys); + } + } + CallInst *CI = Builder.CreateCall(Fn, Vals, "mad"); + CI->setDebugLoc(AInst->getDebugLoc()); + AInst->replaceAllUsesWith(CI); + + NumOfMadMatched++; + return true; +} + + + +bool MinMaxMatcher::valuesMatch(llvm::Value *Op1, llvm::Value *Op2) { + // Handle casts for instructions. + bool ZExt = false; + if (CastInst *CI = dyn_cast(Op1)) { + Op1 = CI->getOperand(0); + if (CI->getOpcode() == Instruction::ZExt) + ZExt = true; + } + if (CastInst *CI = dyn_cast(Op2)) { + Op2 = CI->getOperand(0); + if (CI->getOpcode() == Instruction::ZExt && !ZExt) + return false; + } + + // the easy case - the operands match + if (Op1 == Op2) + return true; + + // Handle constant zeros before data vectors. 
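A plain C++ analogue of the cast handling just above, not part of the patch: when the compare is done on zero-extended copies while the select picks the original narrow values, the result is still a max on the narrow type, which is why the matcher may look through the zext when pairing cmp and select operands.

#include <algorithm>
#include <cstdint>

static uint8_t selectAfterWideningCompare(uint8_t A, uint8_t B) {
  bool C = static_cast<uint32_t>(A) > static_cast<uint32_t>(B); // cmp on widened values
  uint8_t R = C ? A : B;                                        // select on originals
  // R == std::max(A, B) for all inputs, since zero extension preserves
  // unsigned ordering.
  return R;
}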
+ if (isa(Op1) && isa(Op2)) { + ConstantAggregateZero *C1 = cast(Op1); + ConstantAggregateZero *C2 = cast(Op2); + if (C1->getNumElements() != C2->getNumElements()) + return false; + Type *C1Ty = C1->getType(); + Type *C2Ty = C2->getType(); + if (C1Ty->isVectorTy()) { + C1Ty = C1Ty->getSequentialElementType(); + C2Ty = C2Ty->getSequentialElementType(); + } + + return (C1Ty->isIntegerTy() && C2Ty->isIntegerTy()) || + (C1Ty->isFloatingPointTy() && C2Ty->isFloatingPointTy()); + } + + // ConstantDataVectors aren't always matched as different instances are + // constructed containing the same values, so we'll compare the values to + // catch this case. + llvm::ConstantDataVector *C1 = dyn_cast(Op1); + llvm::ConstantDataVector *C2 = dyn_cast(Op2); + if (!C1 || !C2 || (C1->getNumElements() != C2->getNumElements())) + return false; + + Type *C1Ty = C1->getElementType(); + Type *C2Ty = C2->getElementType(); + if (C1Ty->isIntegerTy() && C2Ty->isIntegerTy()) { + for (unsigned i = 0, e = C1->getNumElements(); i < e; ++i) + if (C1->getElementAsInteger(i) != C2->getElementAsInteger(i)) + return false; + return true; + } + + if (C1Ty->isFloatingPointTy() && C2Ty->isFloatingPointTy()) { + for (unsigned i = 0, e = C1->getNumElements(); i < e; ++i) { + double C1Val = C1Ty->isFloatTy() ? C1->getElementAsFloat(i) + : C1->getElementAsDouble(i); + double C2Val = C2Ty->isFloatTy() ? C2->getElementAsFloat(i) + : C2->getElementAsDouble(i); + if (C1Val != C2Val) + return false; + } + return true; + } + + return false; +} + +bool MinMaxMatcher::matchMinMax() { + assert(SelInst->getOpcode() == Instruction::Select && "expected SelectInst"); + if ((CmpInst = dyn_cast(SelInst->getOperand(0)))) { + Srcs[0] = SelInst->getOperand(1); + Srcs[1] = SelInst->getOperand(2); + CmpSrcs[0] = CmpInst->getOperand(0); + CmpSrcs[1] = CmpInst->getOperand(1); + + bool Inverse = false; + if (valuesMatch(CmpSrcs[1], Srcs[0]) && valuesMatch(CmpSrcs[0], Srcs[1])) + Inverse = true; + else if (!(valuesMatch(CmpSrcs[0], Srcs[0]) && + valuesMatch(CmpSrcs[1], Srcs[1]))) + return false; + + // We choose the min/max intrinsic based on the condition and whether the + // operand ordering is the same in the cmp and select. 
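A scalar worked example of the mapping chosen below, illustrative and not part of the patch: with a signed greater-than predicate, select(cmp, a, b) is a signed max, while the operand-swapped form select(cmp, b, a) is a signed min, which is exactly what the Inverse flag captures.

#include <algorithm>
#include <cassert>

static void checkMinMaxInversion(int A, int B) {
  bool Cmp = A > B;                        // icmp sgt A, B
  assert((Cmp ? A : B) == std::max(A, B)); // same operand order    -> smax
  assert((Cmp ? B : A) == std::min(A, B)); // swapped operand order -> smin
}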
+ switch (CmpInst->getPredicate()) { + default: + // this is not a candidate for min/max + return false; + case llvm::CmpInst::FCMP_OGE: + case llvm::CmpInst::FCMP_OGT: + if (Inverse) { + ID = GenXIntrinsic::genx_fmin; + Annotation = "min"; + } else { + ID = GenXIntrinsic::genx_fmax; + Annotation = "max"; + } + break; + case llvm::CmpInst::FCMP_OLE: + case llvm::CmpInst::FCMP_OLT: + if (Inverse) { + ID = GenXIntrinsic::genx_fmax; + Annotation = "max"; + } else { + ID = GenXIntrinsic::genx_fmin; + Annotation = "min"; + } + break; + case llvm::CmpInst::ICMP_SGE: + case llvm::CmpInst::ICMP_SGT: + if (Inverse) { + ID = GenXIntrinsic::genx_smin; + Annotation = "min"; + } else { + ID = GenXIntrinsic::genx_smax; + Annotation = "max"; + } + break; + case llvm::CmpInst::ICMP_SLE: + case llvm::CmpInst::ICMP_SLT: + if (Inverse) { + ID = GenXIntrinsic::genx_smax; + Annotation = "max"; + } else { + ID = GenXIntrinsic::genx_smin; + Annotation = "min"; + } + break; + case llvm::CmpInst::ICMP_UGE: + case llvm::CmpInst::ICMP_UGT: + if (Inverse) { + ID = GenXIntrinsic::genx_umin; + Annotation = "min"; + } else { + ID = GenXIntrinsic::genx_umax; + Annotation = "max"; + } + break; + case llvm::CmpInst::ICMP_ULE: + case llvm::CmpInst::ICMP_ULT: + if (Inverse) { + ID = GenXIntrinsic::genx_umax; + Annotation = "max"; + } else { + ID = GenXIntrinsic::genx_umin; + Annotation = "min"; + } + break; + } + } + + // Emit min/max if matched + return emit(); +} + +bool MinMaxMatcher::emit() { + if ((ID == GenXIntrinsic::not_any_intrinsic) || (Srcs[0] == nullptr) || + (Srcs[1] == nullptr)) + return false; + + IRBuilder<> Builder(SelInst); + Module *M = SelInst->getParent()->getParent()->getParent(); + Type *Tys[2] = {SelInst->getType(), Srcs[0]->getType()}; + Function *Fn = GenXIntrinsic::getAnyDeclaration(M, ID, Tys); + CallInst *CI = Builder.CreateCall(Fn, Srcs, Annotation); + CI->setDebugLoc(SelInst->getDebugLoc()); + SelInst->replaceAllUsesWith(CI); + + NumOfMinMaxMatched++; + return true; +} + +// For a given instruction, find the insertion position which is the closest +// to all the similar users to the specified reference user. +static std::tuple +findOptimalInsertionPos(Instruction *I, Instruction *Ref, DominatorTree *DT, + std::function IsSimilar) { + assert(!isa(Ref) && "PHINode is not expected!"); + + // Shortcut case. If it's single-used, insert just before that user. + if (I->hasOneUse()) + return std::make_tuple(nullptr, Ref); + + DenseMap BBs; + for (auto U : I->users()) { + Instruction *User = dyn_cast(U); + if (!User || !IsSimilar(User)) + continue; + BasicBlock *UseBB = User->getParent(); + DenseMap::iterator MI; + bool New = false; + std::tie(MI, New) = BBs.insert(std::make_pair(UseBB, User)); + if (New) + continue; + // Find the earliest user if they are in the same block. + BasicBlock::iterator BI = UseBB->begin(); + for (; &*BI != User && &*BI != MI->second; ++BI) + /* EMPTY */; + MI->second = &*BI; + } + + assert(BBs.size() != 0 && "Must find at least one BB!"); + + auto MI = BBs.begin(); + // Another shortcut case. 
If it's only used in a single BB, + if (BBs.size() == 1) + return std::make_tuple(MI->first, MI->second); + + BasicBlock *BB = MI->first; + for (++MI; MI != BBs.end(); ++MI) + BB = DT->findNearestCommonDominator(BB, MI->first); + + MI = BBs.find(BB); + Instruction *Pos = nullptr; + if (MI != BBs.end()) { + BB = MI->first; + Pos = MI->second; + } + assert(BB); + return std::make_tuple(BB, Pos); +} + +// For the specified constant, calculate its reciprocal if it's safe; +// otherwise, return null. +static Constant *getReciprocal(Constant *C, bool HasAllowReciprocal) { + assert(C->getType()->isFPOrFPVectorTy() && + "Floating point value is expected!"); + + // TODO: remove this and use ConstantExpr::getFDiv. + + // Reciprocal of undef can be undef. + if (isa(C)) + return C; + + if (ConstantFP *CFP = dyn_cast(C)) { + // Compute the reciprocal of C. + const APFloat &Divisor = CFP->getValueAPF(); + APFloat Rcp(Divisor.getSemantics(), 1U); + APFloat::opStatus Status = + Rcp.divide(Divisor, APFloat::rmNearestTiesToEven); + // Only fold it if it's safe. + if (Status == APFloat::opOK || + (HasAllowReciprocal && Status == APFloat::opInexact)) + return ConstantFP::get(C->getType()->getContext(), Rcp); + return nullptr; + } + + VectorType *VTy = cast(C->getType()); + IntegerType *ITy = Type::getInt32Ty(VTy->getContext()); + + SmallVector Result; + for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) { + Constant *Elt = + ConstantExpr::getExtractElement(C, ConstantInt::get(ITy, i)); + Constant *Rcp = getReciprocal(Elt, HasAllowReciprocal); + // Skip if any of elements fails to be folded as reciprocal. + if (!Rcp) + return nullptr; + Result.push_back(Rcp); + } + return ConstantVector::get(Result); +} + +// For the given value, calculate its reciprocal and performance constant +// folding if allowed. +static Value *getReciprocal(IRBuilder<> &IRB, Value *V, + bool HasAllowReciprocal = true) { + if (Constant *C = dyn_cast(V)) + return getReciprocal(C, HasAllowReciprocal); + + if (!HasAllowReciprocal) + return nullptr; + + Module *M = IRB.GetInsertBlock()->getParent()->getParent(); + Twine Name = V->getName() + ".inv"; + auto Func = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_inv, + V->getType()); + auto Inv = IRB.CreateCall(Func, V, Name); + return Inv; +} + +/// visitFDiv : reduce fdiv strength. +/// +/// If fast-math is present, perform the following transforms: +/// +/// (fdiv x, y) -> (fmul x0, (fdiv 1., x1)) +/// (fdiv 1., x) -> (rcp x) +/// (fdiv 1., (sqrt x)) -> (rsqrt x) +/// +/// Otherwise, try to reduce fdiv with constant divisor to fmul if the +/// reciprocal is exact. +/// +void GenXPatternMatch::visitFDiv(BinaryOperator &I) { + if (isInstructionTriviallyDead(&I)) { + // Clean up dead 'fdiv', which may be left due to the limitation of + // iterator used in instruction visitor, where only the instruction being + // visited could be safely erased/removed. + I.eraseFromParent(); + Changed |= true; + return; + } + + IRBuilder<> IRB(&I); + + Value *Op0 = I.getOperand(0); + Value *Op1 = I.getOperand(1); + // Constant folding Op1 if it's safe. + if (Constant *C1 = dyn_cast(Op1)) { + Constant *Rcp = getReciprocal(C1, I.hasAllowReciprocal()); + if (!Rcp) + return; + IRB.setFastMathFlags(I.getFastMathFlags()); + Value *FMul = IRB.CreateFMul(Op0, Rcp); + I.replaceAllUsesWith(FMul); + I.eraseFromParent(); + Changed |= true; + return; + } + + // Skip if reciprocal optimization is not allowed. 
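A small numeric illustration of the "only fold it if it's safe" rule above, not part of the patch: a power-of-two divisor has an exactly representable reciprocal, so the division can always become a multiply, while 1/3 is inexact and is only used when reciprocal fast-math is allowed.

#include <cassert>

static void reciprocalExactnessExample() {
  // 1.0f / 4.0f is exactly 0.25f, so x / 4.0f can always be rewritten as
  // x * 0.25f without changing any result.
  assert(1.0f / 4.0f == 0.25f);
  // 1.0f / 3.0f is rounded, so x / 3.0f -> x * (1.0f / 3.0f) can change the
  // last bit of the quotient and is only done when hasAllowReciprocal() holds.
}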
+ if (!I.hasAllowReciprocal()) + return; + + Instruction *Divisor = dyn_cast(Op1); + if (!Divisor) + return; + + auto IsSimilar = [](Instruction *User) { + return User->getOpcode() == Instruction::FDiv && User->hasAllowReciprocal(); + }; + + BasicBlock *BB = nullptr; + Instruction *Pos = nullptr; + std::tie(BB, Pos) = findOptimalInsertionPos(Divisor, &I, DT, IsSimilar); + if (Pos) + IRB.SetInsertPoint(Pos); + else + IRB.SetInsertPoint(BB); + auto Rcp = getReciprocal(IRB, Divisor); + cast(Rcp)->setDebugLoc(I.getDebugLoc()); + + for (auto U : Divisor->users()) { + Instruction *User = dyn_cast(U); + if (!User || User == Rcp || !IsSimilar(User)) + continue; + Op0 = User->getOperand(0); + Value *NewVal = Rcp; + if (!match(Op0, m_FPOne())) { + IRB.SetInsertPoint(User); + IRB.setFastMathFlags(User->getFastMathFlags()); + NewVal = IRB.CreateFMul(Op0, Rcp); + } + User->replaceAllUsesWith(NewVal); + // Skip removing dead instruction if it's the current instruction being + // visited as that might invalidate the iterator of this BB. These dead + // 'fdiv' will be removed when they are visited then. + if (User == &I) + User->eraseFromParent(); + } + Changed |= true; + return; +} + +namespace { + +class MulLike { +public: + virtual ~MulLike() {} + static MulLike &get(Instruction *I); + + virtual Instruction *getMul(Instruction *) const { return nullptr; } + virtual bool isAdd(User *) const { return false; } +}; + +class FPMulLike : public MulLike { +public: + Instruction *getMul(Instruction *I) const override { + if (isa(I)) + return I; + return nullptr; + } + bool isAdd(User *U) const override { + return isa(U) || isa(U); + } +}; + +class IntMulLike : public MulLike { +public: + Instruction *getMul(Instruction *I) const override { + if (isa(I) || isa(I)) + return I; + return nullptr; + } + bool isAdd(User *U) const override { + if (isa(U) || isa(U)) + return true; + if (CallInst *CI = dyn_cast(U)) { + switch (GenXIntrinsic::getGenXIntrinsicID(CI)) { + // Keep this list consistent with the one used for matchIntegerMad(IID). + case GenXIntrinsic::genx_ssadd_sat: + case GenXIntrinsic::genx_suadd_sat: + case GenXIntrinsic::genx_usadd_sat: + case GenXIntrinsic::genx_uuadd_sat: + return true; + default: + break; + } + } + return false; + } +}; + +MulLike &MulLike::get(Instruction *I) { + Type *Ty = I->getType()->getScalarType(); + if (Ty->isFloatingPointTy()) { + static FPMulLike FPMul; + return FPMul; + } + if (Ty->isIntegerTy()) { + static IntMulLike IntMul; + return IntMul; + } + static MulLike Null; + return Null; +} +} // End anonymous namespace + +bool GenXPatternMatch::propagateFoldableRegion(Function *F) { + ReversePostOrderTraversal RPOT(F); + bool Changed = false; + for (auto *BB : RPOT) + for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) { + MulLike &Ring = MulLike::get(&*BI); + Instruction *Mul = Ring.getMul(&*BI); + if (!Mul) + continue; + // Traverse each wrregion use of mul. + for (auto *User : Mul->users()) { + if (!GenXIntrinsic::isWrRegion(User)) + continue; + GenXIntrinsicInst *WII = cast(User); + if (WII->getOperand(1) != Mul) + continue; + Region W(WII, BaleInfo()); + Region V(Mul); + // TODO: Consider the broadcast and similar cases. + if (!W.isStrictlySimilar(V)) + continue; + // Check if all rdregion usage could be folded. + SmallVector Rds; + SmallVector Wrs; // Assume just 1 live wrregion. 
+ Wrs.push_back(WII); + bool HasUnsafeUse = false; + while (!HasUnsafeUse && !Wrs.empty()) { + GenXIntrinsicInst *II = Wrs.back(); + Wrs.pop_back(); + for (auto *U : II->users()) { + if (GenXIntrinsic::isRdRegion(U)) { + GenXIntrinsicInst *RII = cast(U); + Region R(RII, BaleInfo()); + if (R == W) { + for (auto *U2 : RII->users()) + if (!Ring.isAdd(U2)) { + HasUnsafeUse = true; + break; + } + if (HasUnsafeUse) + break; + Rds.push_back(RII); + } else if (R.overlap(W)) { + HasUnsafeUse = true; + break; + } + } else if (GenXIntrinsic::isWrRegion(U)) { + GenXIntrinsicInst *WII2 = cast(U); + Region W2(WII2, BaleInfo()); + if (W2 == W) { + // No more wrregion needs tracing. DO NOTHING. + } else if (W2.overlap(W)) { + HasUnsafeUse = true; + break; + } else // Otherwise, look over that non-overlapping wrregion. + Wrs.push_back(WII2); + } else { + HasUnsafeUse = true; + break; + } + } + } + // Skip if there is any unsafe use. + if (HasUnsafeUse) + continue; + auto *ScalarOrVectorMul = scalarizeOrVectorizeIfNeeded(Mul, Rds.begin(), Rds.end()); + // Fold mul directly into its use after wrregion/rdregion pair. + for (auto *II : Rds) { + if (II->getType() != Mul->getType()) + II->replaceAllUsesWith(ScalarOrVectorMul); + else + II->replaceAllUsesWith(Mul); + Changed = true; + } + // Collapse wrregion if there are rdregion folded away. + if (!Rds.empty()) { + WII->replaceAllUsesWith(WII->getArgOperand(0)); + Changed = true; + } + } + } + return Changed; +} + +// Simplify: +// %1 = zext i8 %0 to i32> +// %2 = bitcast i32 %2 to <32 x i1> +// %3 = call <8 x i1> @llvm.genx.rdpredregion.v8i1.v32i1(<32 x i1> %2, i32 0) +// into +// %1 = bitcast i8 %0 to <8 x i1> +// RAUW %1 +// +bool GenXPatternMatch::simplifyPredRegion(CallInst *CI) { + assert(GenXIntrinsic::getGenXIntrinsicID(CI) == GenXIntrinsic::genx_rdpredregion); + bool Changed = false; + + unsigned NElts = CI->getType()->getVectorNumElements(); + ConstantInt *C = dyn_cast(CI->getArgOperand(1)); + assert(C && "constant integer expected"); + unsigned Offset = (unsigned)C->getZExtValue(); + assert(Offset % NElts == 0); + + // The number of actual bits required. 
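A worked example of the "number of actual bits required" computed just below, illustrative and not part of the patch: the element count plus the read offset is rounded up to a power of two, and the fold to a plain bitcast only applies when the zext input already has exactly that many bits.

#include "llvm/Support/MathExtras.h"
#include <cassert>

static void predRegionBitsExample() {
  // Reading 8 predicate elements at offset 0 needs the low 8 source bits,
  // so a zext from i8 can be folded into a direct bitcast to <8 x i1>.
  assert((1u << llvm::Log2_32_Ceil(8u + 0u)) == 8u);
  // Reading 8 elements at offset 8 needs 16 bits, so only an i16-sized input
  // would qualify for the same fold.
  assert((1u << llvm::Log2_32_Ceil(8u + 8u)) == 16u);
}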
+ unsigned NBits = NElts + Offset; + NBits = 1U << llvm::Log2_32_Ceil(NBits); + + Value *Src = CI->getArgOperand(0); + Value *Input = nullptr; + if (match(Src, m_BitCast(m_ZExt(m_Value(Input))))) { + unsigned InputBits = Input->getType()->getPrimitiveSizeInBits(); + if (NBits == InputBits) { + IRBuilder<> Builder(CI); + auto BC = Builder.CreateBitCast(Input, CI->getType(), "bitcast"); + if (auto Inst = dyn_cast(BC)) + Inst->setDebugLoc(CI->getDebugLoc()); + CI->replaceAllUsesWith(BC); + Changed = true; + } + } + return Changed; +} + +bool GenXPatternMatch::simplifyRdRegion(CallInst* Inst) { + assert(GenXIntrinsic::isRdRegion(Inst)); + auto NewVTy = Inst->getType(); + // rewrite indirect rdregion with constant offsets + auto R = Region::getWithOffset(Inst, false /*ParentWidth*/); + if (R.Indirect && R.IndirectIdx == 0 && R.IndirectAddrOffset == 0) { + int64_t starti = 0; + int64_t diffi = 0; + if (IsLinearVectorConstantInts(R.Indirect, starti, diffi)) { + R.Indirect = nullptr; + R.Width = NewVTy->getVectorNumElements(); + R.Offset += starti; + R.Stride = (diffi * 8) / NewVTy->getVectorElementType()->getPrimitiveSizeInBits(); + R.VStride = 0; + Value* OldV = Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + auto NewInst = R.createRdRegion(OldV, Inst->getName(), + Inst /*InsertBefore*/, Inst->getDebugLoc()); + Inst->replaceAllUsesWith(NewInst); + return true; + } + } + return false; +} + +bool GenXPatternMatch::simplifyWrRegion(CallInst *Inst) { + assert(GenXIntrinsic::isWrRegion(Inst)); + Value *NewV = Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + Type *NewVTy = NewV->getType(); + + // Rewrite a single element insertion to undef as a region splat. + auto check1 = [=]() { + Value *OldV = Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + if (!isa(OldV)) + return false; + if (NewVTy->isVectorTy() && NewVTy->getVectorNumElements() > 1) + return false; + // Do not rewrite if input is another region read, as two region reads + // cannot be groupped into a single bale. + if (GenXIntrinsic::isRdRegion(NewV)) + return false; + for (auto U : Inst->users()) { + if (auto BC = dyn_cast(U)) { + for (auto User : BC->users()) + if (GenXIntrinsic::isWrRegion(User)) + return false; + } + + if (GenXIntrinsic::isWrRegion(U)) + return false; + } + + // OK, rewrite it! + return true; + }; + + if (check1()) { + if (!NewVTy->isVectorTy()) { + IRBuilder<> B(Inst); + NewV = B.CreateBitCast(NewV, VectorType::get(NewVTy, 1)); + } + Region R(Inst->getType()); + R.Width = R.NumElements; + R.Stride = 0; + NewV = R.createRdRegion(NewV, "splat", Inst, Inst->getDebugLoc(), false); + Inst->replaceAllUsesWith(NewV); + return true; + } + + // rewrite indirect wrregion with constant offsets + auto R = Region::getWithOffset(Inst, false/*ParentWidth*/); + if (R.Indirect && R.IndirectIdx == 0 && R.IndirectAddrOffset == 0) { + int64_t starti = 0; + int64_t diffi = 0; + if (IsLinearVectorConstantInts(R.Indirect, starti, diffi)) { + R.Indirect = nullptr; + R.Width = NewVTy->getVectorNumElements(); + R.Offset += starti; + R.Stride = (diffi * 8) / NewVTy->getVectorElementType()->getPrimitiveSizeInBits(); + R.VStride = 0; + Value* OldV = Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + auto NewInst = R.createWrRegion(OldV, NewV, Inst->getName(), + Inst /*InsertBefore*/, Inst->getDebugLoc()); + Inst->replaceAllUsesWith(NewInst); + return true; + } + } + return false; +} + +// Simplify (trunc.sat (ext V)) to (trunc.sat V). 
Even if the source and +// destination has the same type, it's incorrect to fold them into V directly +// as the saturation is necessary. +bool GenXPatternMatch::simplifyTruncSat(CallInst *Inst) { + assert(GenXIntrinsic::isIntegerSat(Inst) && "Unexpected integer saturation intrinsic!"); + + GenXIntrinsicInst *II = cast(Inst); + ExtOperator *Ext = dyn_cast(Inst->getOperand(0)); + if (!Ext) + return false; + + auto IID = GenXIntrinsic::getGenXIntrinsicID(II); + Value *Src = Ext->getOperand(0); + bool isZExt = (Ext->getOpcode() == Instruction::ZExt); + + switch (IID) { + case GenXIntrinsic::genx_sstrunc_sat: + IID = isZExt ? GenXIntrinsic::genx_sutrunc_sat + : GenXIntrinsic::genx_sstrunc_sat; + break; + case GenXIntrinsic::genx_sutrunc_sat: + IID = isZExt ? GenXIntrinsic::genx_sutrunc_sat + : GenXIntrinsic::genx_sstrunc_sat; + break; + case GenXIntrinsic::genx_ustrunc_sat: + IID = isZExt ? GenXIntrinsic::genx_uutrunc_sat + : GenXIntrinsic::genx_ustrunc_sat; + break; + case GenXIntrinsic::genx_uutrunc_sat: + IID = isZExt ? GenXIntrinsic::genx_uutrunc_sat + : GenXIntrinsic::genx_ustrunc_sat; + break; + default: + llvm_unreachable("Unknown intrinsic!"); + } + + Module *M = Inst->getParent()->getParent()->getParent(); + Type *Tys[2] = {Inst->getType(), Src->getType()}; + Function *Fn = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + + Inst->setCalledFunction(Fn); + Inst->setOperand(0, Src); + + return true; +} + +// Merge select into a write region if possible. +// +// a = rrd(x, R); a = rrd(x, R) +// c = a op b ==> c = a op b +// d = select p, c, a +// wrr(x, d, R) wrr(x, c, R, p) +// +bool GenXPatternMatch::simplifySelect(Function *F) { + using namespace GenXIntrinsic::GenXRegion; + + bool Changed = false; + for (auto &BB : *F) { + for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*empty*/) { + SelectInst *Inst = dyn_cast(&*BI++); + if (!Inst || !Inst->hasOneUse() || !Inst->getType()->isVectorTy() || + !Inst->getCondition()->getType()->isVectorTy()) + continue; + if (!GenXIntrinsic::isWrRegion(Inst->user_back())) + continue; + CallInst *Wr = cast(Inst->user_back()); + if (Wr->getOperand(NewValueOperandNum) != Inst) + continue; + + auto match = [](Instruction *Wr, Value *V) -> bool { + if (!GenXIntrinsic::isRdRegion(V)) + return false; + CallInst *Rd = cast(V); + if (Wr->getOperand(OldValueOperandNum) != + Rd->getOperand(OldValueOperandNum)) + return false; + + Region WrReg(Wr, BaleInfo()); + Region RdReg(Rd, BaleInfo()); + if (WrReg != RdReg || WrReg.Indirect) + return false; + + if (WrReg.Mask == nullptr) + return true; + if (auto C = dyn_cast(WrReg.Mask)) + if (C->isAllOnesValue()) + return true; + + return false; + }; + + for (int i = 1; i <= 2; ++i) { + Value *Op = Inst->getOperand(i); + if (match(Wr, Op)) { + Value *Mask = Inst->getCondition(); + if (i == 1) { + IRBuilder<> B(Inst); + Mask = B.CreateNot(Mask, "not"); + } + + Region WrReg(Wr, BaleInfo()); + WrReg.Mask = Mask; + Value *NewWr = WrReg.createWrRegion( + Wr->getOperand(OldValueOperandNum), Inst->getOperand(3 - i), + Wr->getName(), Wr, Wr->getDebugLoc()); + Wr->replaceAllUsesWith(NewWr); + Changed = true; + + if (Wr == &*BI) + ++BI; + Wr->eraseFromParent(); + Inst->eraseFromParent(); + break; + } + } + } + } + + return Changed; +} + +// Perform volatile global related simplifications. 
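+// Loads from volatile globals are normalized, and instructions left trivially
+// dead by that rewrite are erased afterwards.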
+bool GenXPatternMatch::simplifyVolatileGlobals(Function *F) { + bool Changed = false; + for (auto &BB : F->getBasicBlockList()) { + for (auto I = BB.begin(); I != BB.end(); /*empty*/) { + Instruction *Inst = &*I++; + if (isa(Inst)) + Changed |= normalizeGloads(Inst); + } + for (auto I = BB.rbegin(); I != BB.rend(); /*empty*/) { + Instruction *Inst = &*I++; + if (isInstructionTriviallyDead(Inst)) + Inst->eraseFromParent(); + } + } + return Changed; +} + +// Decompose predicate operand for large vector selects. +bool GenXPatternMatch::decomposeSelect(Function *F) { + auto P = getAnalysisIfAvailable(); + const GenXSubtarget *ST = P ? P->getSubtarget() : nullptr; + SelectDecomposer SD(ST); + for (auto &BB : F->getBasicBlockList()) + for (auto &Inst : BB.getInstList()) + if (isa(Inst)) + SD.addStartSelect(&Inst); + + return SD.run(); +} + +bool GenXPatternMatch::reassociateIntegerMad(Function *F) { + auto isSingleUsedAdd = [](Value *V) -> bool { + auto BO = dyn_cast(V); + if (!BO || !BO->hasOneUse()) + return false; + // FIXME: Consider 'sub' as well. + return BO->getOpcode() == Instruction::Add; + }; + + auto isSingleUsedMul = [](Value *V) -> bool { + auto BO = dyn_cast(V); + if (!BO || !BO->hasOneUse()) + return false; + return (BO->getOpcode() == Instruction::Mul || + BO->getOpcode() == Instruction::Shl); + }; + + bool Changed = false; + for (auto &BB : *F) { + for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) { + if (!isSingleUsedAdd(&*BI)) { + ++BI; + continue; + } + + auto BO = cast(&*BI); + if (!isSingleUsedMul(BO->getOperand(0)) || + !isSingleUsedMul(BO->getOperand(1))) { + ++BI; + continue; + } + + // Found (a0 * b0) + (a1 * b1), track through the chain to check it is + // + // (a0 * b0) + (a1 * b1) + ... + c + // + // and transform it into + // + // c + (a0 * b0) + (a1 * b1) + ... + // + SmallVector AccChain; + AccChain.push_back(BO); + bool Found = false; + unsigned OpndNo = 0; + while (!Found) { + Use &U = *BO->use_begin(); + if (!isSingleUsedAdd(U.getUser())) + break; + BO = cast(U.getUser()); + if (BO->getParent() != &BB) + break; + if (!isSingleUsedMul(BO->getOperand(1 - U.getOperandNo()))) { + OpndNo = 1 - U.getOperandNo(); + Found = true; + } + AccChain.push_back(BO); + } + if (!Found) { + ++BI; + continue; + } + + BO = AccChain.back(); + AccChain.pop_back(); + + IRBuilder<> IRB(BO); + // Reconstruct a new accumulation chain. + Instruction *Acc = cast(IRB.CreateAdd( + BO->getOperand(OpndNo), AccChain.front()->getOperand(0))); + OpndNo = 1; + for (auto CI = AccChain.begin(), CE = AccChain.end(); CI != CE; ++CI) { + auto BO2 = *CI; + Value *Opnd = BO2->getOperand(OpndNo); + Acc = cast(IRB.CreateAdd(Acc, Opnd)); + Acc->setDebugLoc(BO2->getDebugLoc()); + Use &U = *BO2->use_begin(); + OpndNo = 1 - U.getOperandNo(); + } + BO->replaceAllUsesWith(Acc); + + // Erase old accumulation chain. + BI = std::next(BasicBlock::iterator(BO)); + BO->eraseFromParent(); + while (!AccChain.empty()) { + BO = AccChain.back(); + AccChain.pop_back(); + BI = std::next(BasicBlock::iterator(BO)); + BO->eraseFromParent(); + } + Changed = true; + } + } + + return Changed; +} + +bool GenXPatternMatch::distributeIntegerMul(Function *F) { + bool Changed = false; + for (auto &BB : *F) { + for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) { + auto Mul = dyn_cast(&*BI++); + if (!Mul || Mul->getType()->getScalarSizeInBits() < 32) + continue; + // Find the following pattern + // + // A * (B + C) and all components are extended from 8-/16-bit integers. 
+ // + // and transform it to + // + // A * B + A * C. + // + // This transformation won't bring two much difference on SKL but could + // improve code quality a lot on platforms without multiplication of + // D * D -> D, e.g. CNL. + Value *LHS = Mul->getOperand(0); + Value *RHS = Mul->getOperand(1); + if (!isa(LHS)) + std::swap(LHS, RHS); + // Skip if both LHS & RHS are not ext operators. + if (!isa(LHS)) + continue; + // Skip if both LHS & RHS are already operands extended from narrow + // types. + if (isa(RHS)) + continue; + + auto collect = [](Value *V, SmallVectorImpl &Ops) -> bool { + SmallVector CheckList; + CheckList.push_back(V); + + while (!CheckList.empty()) { + V = CheckList.pop_back_val(); + // Collect values if they are extended from narrow types. + if (isa(V)) { + Ops.push_back(V); + continue; + } + // FIXME: Add 'sub' support. + AddOperator *Add = dyn_cast(V); + if (!Add || !Add->hasOneUse()) + return true; + // DFT that 'add' tree. + CheckList.push_back(Add->getOperand(1)); + CheckList.push_back(Add->getOperand(0)); + } + + return false; + }; + + SmallVector Ops; + if (collect(RHS, Ops)) + continue; + + assert(!Ops.empty() && "There's no operands collected!"); + + IRBuilder<> Builder(cast(Mul)); + Value *Sum = nullptr; + for (auto V : Ops) { + Value *Prod = Builder.CreateMul(LHS, V); + if (!Sum) + Sum = Prod; + else + Sum = Builder.CreateAdd(Sum, Prod); + } + Mul->replaceAllUsesWith(Sum); + RecursivelyDeleteTriviallyDeadInstructions(Mul); + + Changed = true; + } + } + return Changed; +} + +// The shift pattern: +// V[0:7] = ShtAmt[0] +// V[8:15] = ShtAmt[0] + ShtAmt[1] +// V[16:23] = ShtAmt[0] + ShtAmt[2] +// V[24:31] = ShtAmt[0] + ShtAmt[3] +// where ShtAmt[0] is a constant vector and ShtAmt[i] are constant splats. +static bool analyzeForShiftPattern(Constant *C, + SmallVectorImpl &ShtAmt, + const DataLayout &DL) { + unsigned Width = 8; + VectorType *VT = dyn_cast(C->getType()); + if (!VT || VT->getVectorNumElements() <= Width || + VT->getScalarSizeInBits() == 1) + return false; + unsigned NElts = VT->getVectorNumElements(); + if (NElts % Width != 0) + return false; + + SmallVector Elts(Width, nullptr); + for (unsigned i = 0; i < Width; ++i) { + Constant *Elt = C->getAggregateElement(i); + if (isa(Elt)) + return false; + Elts[i] = Elt; + } + Constant *Base = ConstantVector::get(Elts); + ShtAmt.push_back(Base); + + for (unsigned i = Width; i < NElts; i += Width) { + SmallVector Elts(Width, nullptr); + for (unsigned j = 0; j < Width; ++j) { + Constant *Elt = C->getAggregateElement(i + j); + if (isa(Elt)) + return false; + Elts[j] = Elt; + } + unsigned Op = Base->getType()->isFPOrFPVectorTy() ? Instruction::FSub + : Instruction::Sub; + Constant *A[] = {ConstantVector::get(Elts), Base}; + auto X = ConstantFoldBinaryOpOperands(Op, A[0], A[1], DL); + if (!X) + return false; + if (!X->getSplatValue()) { + // This is not a splat and it is an integer vector. + if (!Base->getType()->isFPOrFPVectorTy()) + return false; + + // Check if A and B are within a few ULPs. + auto isWithinMaxULP = [](APFloat A, APFloat B, unsigned NSteps) { + APFloat::cmpResult cmpRes = A.compare(B); + if (cmpRes == APFloat::cmpEqual) + return true; + if (cmpRes == APFloat::cmpUnordered) + return false; + + unsigned MAX_ULP = 3 * NSteps; + bool nextDown = cmpRes == APFloat::cmpGreaterThan; + for (unsigned i = 0; i < MAX_ULP; ++i) { + A.next(nextDown); + if (A.compare(B) == APFloat::cmpEqual) + return true; + } + return false; + }; + + // This is not an exact splat fp vector. 
We check if they are within a few + // ULPs, as divisions are actually not correctly rounded during folding. + ConstantFP *X0 = dyn_cast_or_null(X->getAggregateElement(0U)); + if (!X0) + return false; + for (unsigned j = 1; j < Width; ++j) { + ConstantFP *Xj = + dyn_cast_or_null(X->getAggregateElement(j)); + unsigned NSteps = NElts / Width; + if (!Xj || + !isWithinMaxULP(Xj->getValueAPF(), X0->getValueAPF(), NSteps)) + return false; + } + X = ConstantDataVector::getSplat(Width, X0); + } + ShtAmt.push_back(X); + } + return true; +} + +bool GenXPatternMatch::vectorizeConstants(Function *F) { + bool Changed = false; + for (auto &BB : F->getBasicBlockList()) { + for (auto I = BB.begin(); I != BB.end();) { + Instruction *Inst = &*I++; + if (isa(Inst)) + continue; + unsigned NumOpnds = Inst->getNumOperands(); + auto CI = dyn_cast(Inst); + if (CI) + NumOpnds = CI->getNumArgOperands(); + for (unsigned i = 0, e = NumOpnds; i != e; ++i) { + auto C = dyn_cast(Inst->getOperand(i)); + if (!C || isa(C)) + continue; + if (opMustBeConstant(Inst, i)) + continue; + auto Ty = C->getType(); + if (!Ty->isVectorTy() || Ty->getVectorNumElements() < 16 || + C->getSplatValue()) + continue; + SmallVector ShtAmt; + if (analyzeForShiftPattern(C, ShtAmt, *DL)) { + // W1 = wrrregion(undef, ShtAmt[0], 0); + // V2 = fadd ShtAmt[0], ShtAmt[1] + // W2 = wrregion(W1, V2, Width) + // V3 = fadd ShtAmt[0], ShtAmt[2] + // W2 = wrregion(W2, V3, Width * 2) + // ... + Value *Base = nullptr; + { + Value *Args[] = {ShtAmt[0]}; + Type *Tys[] = {ShtAmt[0]->getType()}; + auto ID = C->getType()->isFPOrFPVectorTy() + ? GenXIntrinsic::genx_constantf + : GenXIntrinsic::genx_constanti; + Module *M = F->getParent(); + Function *Decl = GenXIntrinsic::getGenXDeclaration(M, ID, Tys); + auto NewInst = CallInst::Create(Decl, Args, "constant", Inst); + NewInst->setDebugLoc(Inst->getDebugLoc()); + Base = NewInst; + } + + IRBuilder<> Builder(Inst); + unsigned Width = ShtAmt[0]->getType()->getVectorNumElements(); + Region R(C->getType()); + R.getSubregion(0, Width); + Value *Val = UndefValue::get(C->getType()); + Val = R.createWrRegion(Val, Base, "", Inst, Inst->getDebugLoc()); + for (unsigned j = 1; j < (unsigned)ShtAmt.size(); ++j) { + auto Opc = C->getType()->isFPOrFPVectorTy() ? Instruction::FAdd + : Instruction::Add; + auto Input = Builder.CreateBinOp(Opc, Base, ShtAmt[j]); + Region R1(C->getType()); + R1.getSubregion(Width * j, Width); + Val = R1.createWrRegion(Val, Input, "", Inst, Inst->getDebugLoc()); + } + + // Update this operand with newly vectorized constant. 
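+          // If the visited instruction is itself a genx.constant{f,i}, replace
+          // it wholesale and erase it; otherwise only rewrite this operand.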
+ auto ID = GenXIntrinsic::getGenXIntrinsicID(Inst); + if (ID == GenXIntrinsic::genx_constantf || + ID == GenXIntrinsic::genx_constanti) { + Inst->replaceAllUsesWith(Val); + Inst->eraseFromParent(); + } else + Inst->setOperand(i, Val); + + Changed = true; + } + } + } + } + + return Changed; +} + +static Instruction *insertConstantLoad(Constant *C, Instruction *InsertBefore) { + assert(!C->getType()->getScalarType()->isIntegerTy(1)); + Value *Args[] = {C}; + Type *Ty[] = {C->getType()}; + auto IntrinsicID = GenXIntrinsic::genx_constanti; + if (C->getType()->isFPOrFPVectorTy()) + IntrinsicID = GenXIntrinsic::genx_constantf; + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *F = GenXIntrinsic::getGenXDeclaration(M, IntrinsicID, Ty); + Instruction *Inst = CallInst::Create(F, Args, "constant", InsertBefore); + Inst->setDebugLoc(InsertBefore->getDebugLoc()); + return Inst; +} + +bool GenXPatternMatch::placeConstants(Function *F) { + bool Changed = false; + for (auto &BB : F->getBasicBlockList()) { + for (auto I = BB.begin(); I != BB.end();) { + Instruction *Inst = &*I++; + auto ID = GenXIntrinsic::getGenXIntrinsicID(Inst); + if (ID == GenXIntrinsic::genx_constantf || + ID == GenXIntrinsic::genx_constanti) + continue; + + for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { + auto C = dyn_cast(Inst->getOperand(i)); + if (!C || isa(C)) + continue; + if (opMustBeConstant(Inst, i)) + continue; + auto Ty = C->getType(); + if (!Ty->isVectorTy() || C->getSplatValue()) + continue; + if (Ty->getScalarSizeInBits() == 1) + continue; + + // Counting the bit size of non-undef values. + unsigned NBits = 0; + for (unsigned i = 0, e = Ty->getVectorNumElements(); i != e; ++i) { + Constant *Elt = C->getAggregateElement(i); + if (Elt && !isa(Elt)) + NBits += Ty->getScalarSizeInBits(); + } + if (NBits <= 256) + continue; + + // Collect uses inside this function. + SmallVector ConstantUses; + std::set ConstantUsers; + + for (auto &U : C->uses()) { + auto I = dyn_cast(U.getUser()); + if (!I || I->getParent()->getParent() != F) + continue; + ConstantUses.push_back(&U); + ConstantUsers.insert(I); + } + if (ConstantUsers.empty()) + continue; + + // Single use in a loop. + if (ConstantUsers.size() == 1) { + // Do not lift this constant, for now, to avoid spills. +#if 0 + Use *U = ConstantUses.back(); + Instruction *UseInst = cast(U->getUser()); + BasicBlock *UseBB = UseInst->getParent(); + if (Loop *L = LI->getLoopFor(UseBB)) { + if (BasicBlock *Preheader = L->getLoopPreheader()) { + if (Preheader != UseBB) { + // Insert constant initialization in loop preheader. + Instruction *InsertBefore = Preheader->getTerminator(); + Value *Val = insertConstantLoad(C, InsertBefore); + U->set(Val); + Changed = true; + } + } + } +#endif + continue; // skip to the next constant + } + + // It is profitable to use a common constant pool in register. + assert(ConstantUses.size() >= 2); + BasicBlock *InsertBB = nullptr; + for (auto U : ConstantUses) { + auto UseInst = cast(U->getUser()); + auto UseBB = UseInst->getParent(); + if (InsertBB == nullptr) + InsertBB = UseBB; + else if (InsertBB != UseBB) { + InsertBB = DT->findNearestCommonDominator(InsertBB, UseBB); + } + } + + // InsertBlock is in a loop. + if (Loop *L = LI->getLoopFor(InsertBB)) + if (BasicBlock *Preheader = L->getLoopPreheader()) + if (Preheader != InsertBB) + InsertBB = Preheader; + + // If the insert block is the same as some use block, find the first + // use instruction as the insert point. 
Otherwise, use the terminator of + // the insert block. + Instruction *InsertBefore = InsertBB->getTerminator(); + for (auto UseInst : ConstantUsers) { + if (InsertBB == UseInst->getParent()) { + for (auto &I : InsertBB->getInstList()) { + if (ConstantUsers.find(&I) != ConstantUsers.end()) { + InsertBefore = &I; + goto Found; + } + } + } + } + Found: + assert(!isa(InsertBefore)); + Value *Val = insertConstantLoad(C, InsertBefore); + for (auto U : ConstantUses) + U->set(Val); + Changed = true; + } + } + } + + return Changed; +} + +bool GenXPatternMatch::simplifyNullDst(CallInst *Inst) { + if (Inst->getNumUses() != 1) + return false; + + PHINode *Phi = dyn_cast(Inst->use_begin()->getUser()); + if (Phi == nullptr) + return false; + + if (Phi->getNumUses() == 1 && Phi->use_begin()->getUser() == Inst) { + Phi->replaceAllUsesWith(UndefValue::get(Phi->getType())); + Phi->eraseFromParent(); + return true; + } + + return false; +} + +bool canExtendMask(BinaryOperator *BO) { + Type *InstTy = BO->getType(); + auto Op0 = dyn_cast(BO->getOperand(0)); + auto Op1 = dyn_cast(BO->getOperand(1)); + return InstTy->isVectorTy() && + (InstTy->getScalarSizeInBits() == genx::ByteBits) && (Op0 || Op1); +} + +bool GenXPatternMatch::extendMask(BinaryOperator *BO) { + if (!canExtendMask(BO)) + return false; + + Type *InstTy = BO->getType(); + Type *I32Ty = Type::getInt32Ty(InstTy->getContext()); + unsigned SizeInBits = InstTy->getScalarSizeInBits(); + unsigned Scale = I32Ty->getPrimitiveSizeInBits() / SizeInBits; + unsigned NumElts = InstTy->getVectorNumElements(); + + // Cannot bitcast to + if (NumElts % Scale != 0) + return false; + NumElts /= Scale; + + Type *NewTy = VectorType::get(I32Ty, NumElts); + IRBuilder Builder(BO->getParent(), BasicBlock::iterator(BO), + TargetFolder(*DL)); + StringRef Name = BO->getName(); + + Value *Op0 = + Builder.CreateBitCast(BO->getOperand(0), NewTy, Name + ".extend.mask.op"); + Value *Op1 = + Builder.CreateBitCast(BO->getOperand(1), NewTy, Name + ".extend.mask.op"); + Value *NewAnd = Builder.CreateAnd(Op0, Op1, Name + ".extend.mask"); + NewAnd = Builder.CreateBitCast(NewAnd, InstTy, Name + ".extend.mask.trunc"); + + BO->replaceAllUsesWith(NewAnd); + + return true; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPostLegalization.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPostLegalization.cpp new file mode 100644 index 000000000000..608c60571d38 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPostLegalization.cpp @@ -0,0 +1,171 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXPostLegalization +/// -------------------- +/// +/// GenXPostLegalization is a function pass run after legalization with the +/// following purposes: +/// +/// 1. It inserts a constant load for most constants that are not representable +/// as a constant operand in GenX code. See the GenXConstants section below. +// (in the file GenXConstants.cpp) +/// +/// 2. It calls GenXVectorDecomposer to perform vector decomposition. See the +/// GenXVectorDecomposer section below. +// (in the file GenXVectorDecomposer.h) +/// +/// Both of these things are done here because the results of them (constant +/// loads and decomposed vector operations) may benefit from CSE run after +/// this pass. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_POST_LEGALIZATION" + +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXConstants.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "GenXVectorDecomposer.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" + +#include + +using namespace llvm; +using namespace genx; +using namespace GenXIntrinsic::GenXRegion; + +namespace { + +// GenXPostLegalization : post-legalization pass +class GenXPostLegalization : public FunctionPass { + DominatorTree *DT = nullptr; + VectorDecomposer VD; + const DataLayout *DL = nullptr; + const GenXSubtarget *ST = nullptr; +public: + static char ID; + explicit GenXPostLegalization() : FunctionPass(ID) { } + virtual StringRef getPassName() const { return "GenX post-legalization pass"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F); +}; + +} // end namespace llvm + + +char GenXPostLegalization::ID = 0; +namespace llvm { void initializeGenXPostLegalizationPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXPostLegalization, "GenXPostLegalization", "GenXPostLegalization", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(GenXPostLegalization, "GenXPostLegalization", "GenXPostLegalization", false, false) + +FunctionPass *llvm::createGenXPostLegalizationPass() +{ + initializeGenXPostLegalizationPass(*PassRegistry::getPassRegistry()); + return new GenXPostLegalization; +} + +void GenXPostLegalization::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.addRequired(); + AU.setPreservesCFG(); +} + +/*********************************************************************** + * GenXPostLegalization::runOnFunction : process one function + */ +bool GenXPostLegalization::runOnFunction(Function &F) +{ + DL = &F.getParent()->getDataLayout(); + auto P = getAnalysisIfAvailable(); + if (P) + ST = P->getSubtarget(); + else + return false; + DT = &getAnalysis().getDomTree(); + + bool Modified = false; + Modified |= breakConstantExprs(&F); + + for (Function::iterator fi = F.begin(), fe = F.end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (BasicBlock::iterator bi = 
BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + switch (GenXIntrinsic::getAnyIntrinsicID(Inst)) { + default: + // Lower non-simple constant operands. + Modified |= loadNonSimpleConstants(Inst, nullptr, ST); + break; + case Intrinsic::fma: + Modified |= loadConstants(Inst, ST); + break; + } + + // If this is a wrregion with constant input, or phi node input, give it + // to the vector decomposer. (We could just give it all wrregions, but we + // are trying to minimize the amount of work it has to do.) + if (!ST->disableVectorDecomposition()) { + if (GenXIntrinsic::isWrRegion(Inst)) { + if (isa(Inst->getOperand(0))) + VD.addStartWrRegion(Inst); + else if (isa(Inst->getOperand(0))) + VD.addStartWrRegion(Inst); + } + } + } + } + // Run the vector decomposer for this function. + Modified |= VD.run(DT); + // Cleanup region reads and writes. + Modified |= simplifyRegionInsts(&F, DL); + // Cleanup redundant global loads. + Modified |= cleanupLoads(&F); + // Legalize constants in return. + for (auto FI = F.begin(), FE = F.end(); FI != FE; ++FI) { + BasicBlock *BB = &*FI; + for (auto BI = BB->begin(), BE = BB->end(); BI != BE; ++BI) { + Instruction *Inst = &*BI; + if (isa(Inst)) { + Modified |= loadNonSimpleConstants(Inst, nullptr, ST); + Modified |= loadConstants(Inst, ST); + } + } + } + + return Modified; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.cpp new file mode 100644 index 000000000000..e32798ea829a --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.cpp @@ -0,0 +1,211 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#include "GenXPressureTracker.h" +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXLiveness.h" +#include "GenXRegion.h" +#include "vc/GenXOpts/Utils/RegCategory.h" + +using namespace llvm; +using namespace genx; + +namespace { + +struct LiveRangeAndLength { + LiveRange *LR; + unsigned Length; + LiveRangeAndLength(LiveRange *LR, unsigned Length) : LR(LR), Length(Length) {} + bool operator<(const LiveRangeAndLength &Rhs) const { + return Length > Rhs.Length; + } +}; + +} // namespace + +unsigned PressureTracker::getSizeInBytes(LiveRange *LR, bool AllowWidening) { + SimpleValue SV = *LR->value_begin(); + Value *V = SV.getValue(); + Type *Ty = IndexFlattener::getElementType(V->getType(), SV.getIndex()); + unsigned Bytes = (Ty->getPrimitiveSizeInBits() + 15U) / 8U & -2U; + if (!AllowWidening) + return Bytes; + + // Check if this will be a live range to be promoted to a word vector: + // - this is a byte vector + // - non-of values will be used in indirect regions + // - all uses are in the same block (local variables only) + // + auto toWiden = [=]() -> bool { + if (!Ty->isVectorTy() || !Ty->getVectorElementType()->isIntegerTy(8)) + return false; + + BasicBlock *DefBB = nullptr; + for (auto I = LR->value_begin(), E = LR->value_end(); I != E; ++I) { + auto Inst = dyn_cast((*I).getValue()); + if (!Inst) + return false; + if (!DefBB) + DefBB = Inst->getParent(); + if (DefBB != Inst->getParent() || Inst->isUsedOutsideOfBlock(DefBB)) + return false; + for (auto UI : Inst->users()) { + if (GenXIntrinsic::isRdRegion(UI) || GenXIntrinsic::isWrRegion(UI)) { + Region R(cast(UI), BaleInfo()); + if (R.Indirect) + return false; + } + } + } + + // OK, this is a candidate for widening. + return true; + }; + + if (toWiden()) { + WidenCandidates.push_back(LR); + Bytes *= 2; + } + return Bytes; +} + +// Decrease pressure assuming no widening on variable for LR. +void PressureTracker::decreasePressure(LiveRange *LR) { + if (!LR || LR->getCategory() != RegCategory::GENERAL) + return; + +#if _DEBUG + auto I = std::find(WidenCandidates.begin(), WidenCandidates.end(), LR); + assert(I != WidenCandidates.end()); +#endif + + unsigned Bytes = getSizeInBytes(LR, /*AllowWidening*/ false); + for (auto SI = LR->begin(), SE = LR->end(); SI != SE; ++SI) { + for (unsigned i = SI->getStart(); i != SI->getEnd(); ++i) { + assert(i < Pressure.size()); + assert(Pressure[i] >= Bytes); + Pressure[i] -= Bytes; + } + } + calculateRedSegments(); +} + +void PressureTracker::calculate() { + std::vector LRs; + getLiveRanges(LRs); + std::vector LRLs; + for (auto LR : LRs) + LRLs.emplace_back(LR, LR->getLength(/*WithWeak*/ false)); + LRs.clear(); + std::sort(LRLs.begin(), LRLs.end()); + + // Keep count of the rp at each instruction number. + Pressure.clear(); + for (auto &I : LRLs) { + LiveRange *LR = I.LR; + unsigned Bytes = getSizeInBytes(LR, WithByteWidening); + for (auto SI = LR->begin(), SE = LR->end(); SI != SE; ++SI) { + if (SI->getEnd() >= Pressure.size()) + Pressure.resize(SI->getEnd() + 1, 0); + for (unsigned i = SI->getStart(); i != SI->getEnd(); ++i) + Pressure[i] += Bytes; + } + } +} + +// Calculate high pressure segments. 
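+// A segment is a maximal run of instruction numbers whose estimated register
+// pressure stays at or above THRESHOLD.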
+void PressureTracker::calculateRedSegments() { + HighPressureSegments.clear(); + unsigned UNDEF = std::numeric_limits::max(); + unsigned B = UNDEF; + unsigned E = UNDEF; + for (unsigned i = 0; i < Pressure.size(); ++i) { + if (Pressure[i] >= THRESHOLD) { + if (B == UNDEF) + B = i; + else + E = i; + } else { + if (B != UNDEF && E != UNDEF) + HighPressureSegments.emplace_back(B, E); + else if (B != UNDEF) + HighPressureSegments.emplace_back(B, B); + B = E = UNDEF; + } + } +} + +// Check if segment [B, E] intersects with a high pressure region or not. +bool PressureTracker::intersectWithRedRegion(unsigned B, unsigned E) const { + for (auto S : HighPressureSegments) { + unsigned B1 = S.Begin; + unsigned E1 = S.End; + if (B > E1) + continue; + return E >= B1; + } + return false; +} + +bool PressureTracker::intersectWithRedRegion(LiveRange *LR) const { + if (!LR || LR->getCategory() == RegCategory::NONE) + return false; + for (auto I = LR->begin(), E = LR->end(); I != E; ++I) + if (intersectWithRedRegion(I->getStart(), I->getEnd())) + return true; + return false; +} + +void PressureTracker::getLiveRanges(std::vector &LRs) { + for (auto I = FG.begin(), E = FG.end(); I != E; ++I) { + Function *F = *I; + for (auto &Arg : F->args()) + getLiveRangesForValue(&Arg, LRs); + if (I != FG.begin() && !F->getReturnType()->isVoidTy()) + getLiveRangesForValue(Liveness->getUnifiedRet(F), LRs); + for (auto &BB : F->getBasicBlockList()) + for (auto &Inst : BB.getInstList()) + getLiveRangesForValue(&Inst, LRs); + } +} + +void PressureTracker::getLiveRangesForValue( + Value *V, std::vector &LRs) const { + auto Ty = V->getType(); + for (unsigned i = 0, e = IndexFlattener::getNumElements(Ty); i != e; ++i) { + SimpleValue SV(V, i); + LiveRange *LR = Liveness->getLiveRangeOrNull(SV); + if (!LR || LR->getCategory() == RegCategory::NONE) + continue; + // Only process an LR if the map iterator is on the value that appears + // first in the LR. That avoids processing the same LR multiple times. + if (SV != *LR->value_begin()) + continue; + LRs.push_back(LR); + } +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.h new file mode 100644 index 000000000000..00f922561c68 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPressureTracker.h @@ -0,0 +1,91 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +#ifndef TARGET_GENX_PRESSURE_TRACKER_H +#define TARGET_GENX_PRESSURE_TRACKER_H + +#include + +namespace llvm { + +class Value; +class GenXLiveness; +class FunctionGroup; + +namespace genx { + +class LiveRange; + +class PressureTracker { + FunctionGroup &FG; + GenXLiveness *Liveness; + // Flag to widen byte vectors to word vectors if applicable. + bool WithByteWidening; + // Candidate variable for widening. + std::vector WidenCandidates; + std::vector Pressure; + + static const unsigned THRESHOLD = sizeof(float) * 8 * 120; + struct Segment { + unsigned Begin; + unsigned End; + Segment(unsigned B, unsigned E) : Begin(B), End(E) {} + }; + std::vector HighPressureSegments; + +public: + PressureTracker(FunctionGroup &FG, GenXLiveness *L, + bool WithByteWidening = false) + : FG(FG), Liveness(L), WithByteWidening(WithByteWidening) { + calculate(); + calculateRedSegments(); + } + + // Estimate the register pressure for each Instruction number. + void calculate(); + + // Calculate high pressure segments. + void calculateRedSegments(); + + // Check if segment [B, E] intersects with a high pressure region or not. + bool intersectWithRedRegion(unsigned B, unsigned E) const; + bool intersectWithRedRegion(LiveRange *LR) const; + + // Return the list of variables that are likely to be widened. + const std::vector &getWidenVariables() { return WidenCandidates; } + + // Decrease pressure assuming no widening on variable for LR. + void decreasePressure(LiveRange *LR); + +private: + void getLiveRanges(std::vector &LRs); + void getLiveRangesForValue(Value *V, std::vector &LRs) const; + unsigned getSizeInBytes(LiveRange *LR, bool AllowWidening); +}; + +} // namespace genx +} // namespace llvm + +#endif // TARGET_GENX_PRESSURE_TRACKER_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrinter.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrinter.cpp new file mode 100644 index 000000000000..c458bae44d0f --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPrinter.cpp @@ -0,0 +1,243 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// GenXPrinter is a pass that prints the LLVM IR for a function, together +// GenX specific analyses (instruction baling, liveness, register allocation). 
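+// When the corresponding analyses are available, each unbaled instruction is
+// printed with its register assignment, its instruction number and the bale
+// it heads.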
+// +//===----------------------------------------------------------------------===// + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXLiveness.h" +#include "GenXNumbering.h" +#include "GenXVisaRegAlloc.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace genx; + +namespace { + +// GenXPrinter : an analysis to print a Function, with GenX specific analyses +class GenXPrinter : public FunctionPass { + raw_ostream &OS; + const std::string Banner; +public: + static char ID; + explicit GenXPrinter(raw_ostream &OS, const std::string &Banner) + : FunctionPass(ID), OS(OS), Banner(Banner) { } + virtual StringRef getPassName() const { return "GenX printer pass"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + } + bool runOnFunction(Function &F); +}; + +// GenXGroupPrinter : an analysis to print a FunctionGroup, with GenX specific analyses +class GenXGroupPrinter : public FunctionGroupPass { + raw_ostream &OS; + const std::string Banner; +public: + static char ID; + explicit GenXGroupPrinter(raw_ostream &OS, const std::string &Banner) + : FunctionGroupPass(ID), OS(OS), Banner(Banner) { } + virtual StringRef getPassName() const { return "GenX FunctionGroup printer pass"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.setPreservesAll(); + } + bool runOnFunctionGroup(FunctionGroup &FG); +}; + +} // end namespace llvm + +char GenXPrinter::ID = 0; + +FunctionPass *llvm::createGenXPrinterPass(raw_ostream &O, const std::string &Banner) +{ + return new GenXPrinter(O, Banner); +} + +char GenXGroupPrinter::ID = 0; + +FunctionGroupPass *llvm::createGenXGroupPrinterPass(raw_ostream &O, const std::string &Banner) +{ + return new GenXGroupPrinter(O, Banner); +} + +/*********************************************************************** + * printFunction : print function with GenX analyses + */ +static void printFunction(raw_ostream &OS, Function &F, GenXBaling *Baling, + GenXLiveness *Liveness, GenXNumbering *Numbering, GenXVisaRegAlloc *RA) +{ + // This code is a downmarket version of AssemblyWriter::printFunction. + // We have our own version so we can show bales. + OS << "\ndefine "; + cast(cast(F.getType())->getElementType())->getReturnType()->print(OS); + OS << " @" << F.getName() << "("; + for (Function::arg_iterator fb = F.arg_begin(), fi = fb, fe = F.arg_end(); + fi != fe; ) { + if (fi != fb) + OS << ", "; + Argument *Arg = &*fi; + ++fi; + Arg->getType()->print(OS); + OS << " "; + // Only show register number if there is a register allocator. + GenXVisaRegAlloc::Reg* Reg = nullptr; + if (RA) + Reg = RA->getRegForValueOrNull(&F, SimpleValue(Arg)); + if (Reg) { + OS << "["; + Reg->print(OS); + OS << "]"; + } + OS << "%" << Arg->getName(); + } + OS << ") {\n"; + for (Function::iterator fi = F.begin(), fe = F.end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + if (!BB->use_empty()) + OS << BB->getName() << ":\n"; + for (BasicBlock::iterator bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + if (!Baling || !Baling->isBaled(Inst)) { + if (RA && !Inst->getType()->isVoidTy()) { + // Show allocated register in brackets. If it is struct type, + // we show the multiple registers. For an alias, show its base + // register in braces as well. 
+ for (unsigned i = 0, + e = IndexFlattener::getNumElements(Inst->getType()); + i != e; ++i) { + auto Reg = RA->getRegForValueOrNull(&F, SimpleValue(Inst, i)); + if (Reg && Reg->Category) { + OS << (!i ? "[" : ","); + Reg->print(OS); + auto BaseReg = RA->getRegForValueUntyped(&F, SimpleValue(Inst, i)); + if (BaseReg != Reg) { + OS << "{"; + assert(BaseReg); + BaseReg->print(OS); + OS << "}"; + } + if (i + 1 == e) + OS << "]"; + } + } + } + // Show instruction number in brackets. + unsigned Num = 0; + if (Numbering) + Num = Numbering->getNumber(Inst); + if (Num) + OS << "[" << Num << "]"; + if (!Baling) { + Inst->print(OS); + OS << "\n"; + } else { + Bale B; + Baling->buildBale(Inst, &B); + if (B.size() == 1) { + Inst->print(OS); + OS << "\n"; + } else { + OS << " bale {\n"; + for (Bale::iterator i = B.begin(), + e = B.end(); i != e; ++i) { + unsigned Num = 0; + if (Numbering) + Num = Numbering->getNumber(i->Inst); + if (Num) + OS << "[" << Num << "]"; + OS << " "; + i->Inst->print(OS); + switch (i->Info.Type) { + case BaleInfo::MAININST: break; + default: OS << " {" << i->Info.getTypeString() << "}"; break; + } + OS << "\n"; + } + if (Num) + OS << "[" << Num << "]"; + OS << " }\n"; + } + } + } + } + } + OS << "}\n"; +} + +/*********************************************************************** + * GenXPrinter::runOnFunction : dump function with GenX analyses + */ +bool GenXPrinter::runOnFunction(Function &F) +{ + GenXVisaRegAlloc *RA = getAnalysisIfAvailable(); + GenXLiveness *Liveness = nullptr; + GenXNumbering *Numbering = nullptr; + if (!RA) { + Liveness = getAnalysisIfAvailable(); + Numbering = getAnalysisIfAvailable(); + } + GenXBaling *Baling = getAnalysisIfAvailable(); + OS << Banner; + printFunction(OS, F, Baling, Liveness, Numbering, RA); + return false; +} + +/*********************************************************************** + * GenXGroupPrinter::runOnFunctionGroup : dump functions with GenX analyses + */ +bool GenXGroupPrinter::runOnFunctionGroup(FunctionGroup &FG) +{ + GenXVisaRegAlloc *RA = getAnalysisIfAvailable(); + GenXLiveness *Liveness = nullptr; + GenXNumbering *Numbering = nullptr; + if (!RA) { + Liveness = getAnalysisIfAvailable(); + Numbering = getAnalysisIfAvailable(); + } + GenXBaling *Baling = getAnalysisIfAvailable(); + if (!Baling) + Baling = getAnalysisIfAvailable(); + OS << Banner; + if (Liveness) + OS << " (see below for GenXLiveness)"; + for (auto i = FG.begin(), e = FG.end(); i != e; ++i) + printFunction(OS, **i, Baling, Liveness, Numbering, RA); + if (Liveness) { + Liveness->print(OS); + OS << "\n"; + } + OS << "\n"; + return false; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPromoteArray.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPromoteArray.cpp new file mode 100644 index 000000000000..be5427272229 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPromoteArray.cpp @@ -0,0 +1,1081 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies 
or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXPromoteArray +/// -------------------- +/// +/// GenXPromoteArray is an optimization pass that converts load/store +/// from an allocated private array into vector loads/stores followed by +/// read-region and write-region. Then we can apply standard llvm optimization +/// to promote the entire array into virtual registers, and remove those +/// loads and stores +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/ADT/SmallVector.h" + +#include + +#define MAX_ALLOCA_PROMOTE_GRF_NUM 96 + +using namespace llvm; +using namespace genx; + +namespace { + +// Diagnostic information for error/warning relating array promotion. +class DiagnosticInfoPromoteArray : public DiagnosticInfo { +private: + std::string Description; + +public: + // Initialize from description + DiagnosticInfoPromoteArray(const Twine &Desc, + DiagnosticSeverity Severity = DS_Error) + : DiagnosticInfo(llvm::getNextAvailablePluginDiagnosticKind(), Severity), + Description(Desc.str()) {} + + void print(DiagnosticPrinter &DP) const override { + DP << "GenXPromoteArray: " << Description; + } +}; + +class TransposeHelper { +public: + TransposeHelper(bool vectorIndex, const llvm::DataLayout *DL, + uint64_t baseTypeAllocSize) + : m_vectorIndex(vectorIndex), m_pDL(DL), + m_baseTypeAllocSize(baseTypeAllocSize) {} + void HandleAllocaSources(llvm::Instruction *v, llvm::Value *idx); + void handleGEPInst(llvm::GetElementPtrInst *pGEP, llvm::Value *idx); + void handlePHINode(llvm::PHINode *pPhi, llvm::Value *pScalarizedIdx, + llvm::BasicBlock *pIncomingBB); + virtual void handleLoadInst(llvm::LoadInst *pLoad, + llvm::Value *pScalarizedIdx) = 0; + virtual void handleStoreInst(llvm::StoreInst *pStore, + llvm::Value *pScalarizedIdx) = 0; + virtual void handlePrivateGather(llvm::IntrinsicInst *pInst, + llvm::Value *pScalarizedIdx) = 0; + virtual void handlePrivateScatter(llvm::IntrinsicInst *pInst, + llvm::Value *pScalarizedIdx) = 0; + virtual void handleLLVMGather(llvm::IntrinsicInst *pInst, + llvm::Value *pScalarizedIdx) = 0; + virtual void handleLLVMScatter(llvm::IntrinsicInst *pInst, + llvm::Value *pScalarizedIdx) = 0; + void EraseDeadCode(); + +private: + bool m_vectorIndex = false; + std::vector m_toBeRemoved; + ValueMap m_phiReplacement; + +protected: + const llvm::DataLayout *m_pDL = nullptr; + uint64_t m_baseTypeAllocSize = 0; +}; + +/// @brief TransformPrivMem pass is used for lowering the allocas identified +/// while visiting the alloca instructions +/// and then inserting insert/extract elements instead of load stores. 
+/// This allows us to store the data in registers instead of propagating +/// it to scratch space. +class TransformPrivMem : public llvm::FunctionPass, + public llvm::InstVisitor { +public: + TransformPrivMem(); + + ~TransformPrivMem() {} + + virtual llvm::StringRef getPassName() const override { + return "TransformPrivMem"; + } + + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } + + virtual bool runOnFunction(llvm::Function &F) override; + + void visitAllocaInst(llvm::AllocaInst &I); + + void visitStore(llvm::StoreInst &St); + + unsigned int extractAllocaSize(llvm::AllocaInst *pAlloca); + +private: + llvm::AllocaInst *createVectorForAlloca(llvm::AllocaInst *pAlloca, + llvm::Type *pBaseType); + void handleAllocaInst(llvm::AllocaInst *pAlloca); + + bool CheckIfAllocaPromotable(llvm::AllocaInst *pAlloca); + + bool replaceSingleAggrStore(llvm::StoreInst *StI); + + bool replaceAggregatedStore(llvm::StoreInst *StI); + +public: + static char ID; + +private: + std::queue m_StoresToHandle; + const llvm::DataLayout *m_pDL = nullptr; + LLVMContext *m_ctx = nullptr; + std::vector m_allocasToPrivMem; + llvm::Function *m_pFunc = nullptr; +}; +} // namespace + +// Register pass to igc-opt +namespace llvm { +void initializeTransformPrivMemPass(PassRegistry &); +} +#define PASS_FLAG "transform-priv-mem" +#define PASS_DESCRIPTION \ + "transform private arrays for promoting them to registers" +#define PASS_CFG_ONLY false +#define PASS_ANALYSIS false +INITIALIZE_PASS_BEGIN(TransformPrivMem, PASS_FLAG, PASS_DESCRIPTION, + PASS_CFG_ONLY, PASS_ANALYSIS) +INITIALIZE_PASS_END(TransformPrivMem, PASS_FLAG, PASS_DESCRIPTION, + PASS_CFG_ONLY, PASS_ANALYSIS) + +char TransformPrivMem::ID = 0; + +FunctionPass *llvm::createTransformPrivMemPass() { + return new TransformPrivMem(); +} + +namespace { + +class TransposeHelperPromote : public TransposeHelper { +public: + void handleLoadInst(LoadInst *pLoad, Value *pScalarizedIdx); + void handleStoreInst(StoreInst *pStore, Value *pScalarizedIdx); + void handlePrivateGather(IntrinsicInst *pInst, Value *pScalarizedIdx); + void handlePrivateScatter(IntrinsicInst *pInst, Value *pScalarizedIdx); + void handleLLVMGather(IntrinsicInst *pInst, Value *pScalarizedIdx); + void handleLLVMScatter(IntrinsicInst *pInst, Value *pScalarizedIdx); + + AllocaInst *pVecAlloca; + + TransposeHelperPromote(AllocaInst *pAI, const llvm::DataLayout *DL, + uint64_t baseTypeAllocSize) + : TransposeHelper(false, DL, baseTypeAllocSize) { + pVecAlloca = pAI; + } +}; + +TransformPrivMem::TransformPrivMem() : FunctionPass(ID), m_pFunc(nullptr) { + initializeTransformPrivMemPass(*PassRegistry::getPassRegistry()); +} + +llvm::AllocaInst * +TransformPrivMem::createVectorForAlloca(llvm::AllocaInst *pAlloca, + llvm::Type *pBaseType) { + IRBuilder<> IRB(pAlloca); + + unsigned int totalSize = extractAllocaSize(pAlloca) / + (unsigned int)(m_pDL->getTypeAllocSize(pBaseType)); + + llvm::VectorType *pVecType = llvm::VectorType::get(pBaseType, totalSize); + + AllocaInst *pAllocaValue = IRB.CreateAlloca(pVecType); + return pAllocaValue; +} + +bool TransformPrivMem::replaceSingleAggrStore(StoreInst *StI) { + IRBuilder<> Builder(StI); + + Value *ValueOp = StI->getValueOperand(); + Value *Ptr = StI->getPointerOperand(); + unsigned AS = StI->getPointerAddressSpace(); + Value *ValToStore = Builder.CreateExtractValue(ValueOp, 0); + ValToStore->setName(ValueOp->getName() + ".noAggr"); + + StoreInst *NewStI = Builder.CreateAlignedStore(ValToStore, + Builder.CreateBitCast(Ptr, 
ValToStore->getType()->getPointerTo(AS)), + StI->getAlignment(), StI->isVolatile()); + m_StoresToHandle.push(NewStI); + StI->eraseFromParent(); + + return true; +} + +bool TransformPrivMem::replaceAggregatedStore(StoreInst *StI) { + IRBuilder<> Builder(StI); + Value *ValueOp = StI->getValueOperand(); + Type *ValueOpTy = ValueOp->getType(); + auto *ST = dyn_cast(ValueOpTy); + auto *AT = dyn_cast(ValueOpTy); + + assert(StI->isSimple()); + assert(AT || ST); + + uint64_t Count = ST ? ST->getNumElements() : AT->getNumElements(); + if (Count == 1) { + return replaceSingleAggrStore(StI); + } + + auto *IdxType = Type::getInt32Ty(*m_ctx); + auto *Zero = ConstantInt::get(IdxType, 0); + for (uint64_t i = 0; i < Count; ++i) { + Value *Indices[2] = { + Zero, + ConstantInt::get(IdxType, i) + }; + + Value *Ptr = nullptr; + auto *PtrOp = StI->getPointerOperand(); + if (ST) { + Ptr = Builder.CreateInBoundsGEP(ST, + PtrOp, makeArrayRef(Indices)); + } else { + Ptr = Builder.CreateInBoundsGEP(AT, + PtrOp, makeArrayRef(Indices)); + } + Ptr->setName(PtrOp->getName() + ".noAggrGEP"); + auto *Val = Builder.CreateExtractValue(ValueOp, i); + Val->setName(ValueOp->getName() + ".noAggr"); + StoreInst *NewStI = Builder.CreateStore(Val, Ptr, StI->isVolatile()); + + m_StoresToHandle.push(NewStI); + } + + StI->eraseFromParent(); + + return true; +} + +bool TransformPrivMem::runOnFunction(llvm::Function &F) { + m_pFunc = &F; + m_ctx = &(m_pFunc->getContext()); + + m_pDL = &F.getParent()->getDataLayout(); + m_allocasToPrivMem.clear(); + + visit(F); + + bool AggrRemoved = false; + while (!m_StoresToHandle.empty()) { + StoreInst *StI = m_StoresToHandle.front(); + m_StoresToHandle.pop(); + if (StI->getValueOperand()->getType()->isAggregateType()) + AggrRemoved |= replaceAggregatedStore(StI); + } + + std::vector &allocaToHandle = m_allocasToPrivMem; + + for (auto pAlloca : allocaToHandle) { + handleAllocaInst(pAlloca); + } + + // Last remove alloca instructions + for (auto pInst : allocaToHandle) { + if (pInst->use_empty()) { + pInst->eraseFromParent(); + } + } + // IR changed only if we had alloca instruction to optimize or + // if aggregated stores were replaced + return !allocaToHandle.empty() || AggrRemoved; +} + +unsigned int TransformPrivMem::extractAllocaSize(llvm::AllocaInst *pAlloca) { + unsigned int arraySize = + (unsigned int)(cast(pAlloca->getArraySize()) + ->getZExtValue()); + unsigned int totalArrayStructureSize = + (unsigned int)(m_pDL->getTypeAllocSize(pAlloca->getAllocatedType()) * + arraySize); + + return totalArrayStructureSize; +} + +static Type *GetBaseType(Type *pType, Type *pBaseType) { + while (pType->isStructTy() || pType->isArrayTy() || pType->isVectorTy()) { + if (pType->isStructTy()) { + int num_elements = pType->getStructNumElements(); + for (int i = 0; i < num_elements; ++i) { + Type *structElemBaseType = + GetBaseType(pType->getStructElementType(i), pBaseType); + // can support only homogeneous structures + if (pBaseType != nullptr && + (structElemBaseType == nullptr || + structElemBaseType->getTypeID() != pBaseType->getTypeID())) + return nullptr; + pBaseType = structElemBaseType; + } + return pBaseType; + } else if (pType->isArrayTy()) { + pType = pType->getArrayElementType(); + } else if (pType->isVectorTy()) { + pType = pType->getVectorElementType(); + } else { + assert(0); + } + } + if (pType->isPointerTy() && pType->getPointerElementType()->isFunctionTy()) + pType = IntegerType::getInt8Ty(pType->getContext()); + return pType; +} + +static bool CheckAllocaUsesInternal(Instruction *I) { + for 
(Value::user_iterator use_it = I->user_begin(), use_e = I->user_end(); + use_it != use_e; ++use_it) { + if (GetElementPtrInst *gep = dyn_cast(*use_it)) { + auto PtrV = gep->getPointerOperand(); + // we cannot support a vector of pointers as the base of the GEP + if (PtrV->getType()->isPointerTy()) { + if (CheckAllocaUsesInternal(gep)) + continue; + } + return false; + } + if (llvm::LoadInst *pLoad = llvm::dyn_cast(*use_it)) { + if (!pLoad->isSimple()) + return false; + } else if (llvm::StoreInst *pStore = + llvm::dyn_cast(*use_it)) { + if (!pStore->isSimple()) + return false; + llvm::Value *pValueOp = pStore->getValueOperand(); + if (pValueOp == I) { + // GEP instruction is the stored value of the StoreInst (not supported + // case) + return false; + } + } else if (llvm::BitCastInst *pBitCast = + llvm::dyn_cast(*use_it)) { + if (pBitCast->use_empty()) + continue; + Type *baseT = + GetBaseType(pBitCast->getType()->getPointerElementType(), nullptr); + Type *sourceType = GetBaseType( + pBitCast->getOperand(0)->getType()->getPointerElementType(), nullptr); + assert(sourceType); + // either the point-to-element-type is the same or + // the point-to-element-type is the byte or a function pointer + if (baseT != nullptr && + (baseT->getScalarSizeInBits() == 8 || + baseT->getScalarSizeInBits() == sourceType->getScalarSizeInBits() || + (baseT->isPointerTy() && + baseT->getPointerElementType()->isFunctionTy()))) { + if (CheckAllocaUsesInternal(pBitCast)) + continue; + } + // Not a candidate. + return false; + } else if (IntrinsicInst *intr = dyn_cast(*use_it)) { + auto IID = GenXIntrinsic::getAnyIntrinsicID(intr); + if (IID == llvm::Intrinsic::lifetime_start || + IID == llvm::Intrinsic::lifetime_end || + IID == GenXIntrinsic::genx_gather_private || + IID == GenXIntrinsic::genx_scatter_private || + IID == llvm::Intrinsic::masked_gather || + IID == llvm::Intrinsic::masked_scatter) { + continue; + } + return false; + } else if (PHINode *phi = dyn_cast(*use_it)) { + // Only GEPs with same base and bitcasts with same src yet supported + Value *pPtrOp = nullptr; + if (auto BC = dyn_cast(I)) + pPtrOp = BC->getOperand(0); + else if (auto GEP = dyn_cast(I)) + pPtrOp = GEP->getPointerOperand(); + else + return false; + + if (all_of(phi->incoming_values(), [&](Value *V) { + if (auto GEP = dyn_cast(V)) + return GEP->getPointerOperand() == pPtrOp; + else if (auto BC = dyn_cast(V)) + return BC->getOperand(0) == pPtrOp; + return false; + })) + if (CheckAllocaUsesInternal(phi)) + continue; + // Not a candidate. + return false; + } else { + // This is some other instruction. Right now we don't want to handle these + return false; + } + } + return true; +} + +bool TransformPrivMem::CheckIfAllocaPromotable(llvm::AllocaInst *pAlloca) { + unsigned int allocaSize = extractAllocaSize(pAlloca); + unsigned int allowedAllocaSizeInBytes = MAX_ALLOCA_PROMOTE_GRF_NUM * 32; + + // if alloca size exceeds alloc size threshold, emit warning + // and discard promotion + if (allocaSize > allowedAllocaSizeInBytes) { + DiagnosticInfoPromoteArray Warn( + m_pFunc->getName() + " allocation size is too big: using TPM", + DS_Warning); + m_pFunc->getContext().diagnose(Warn); + return false; + } + + // Don't even look at non-array or non-struct allocas. 
+ // (extractAllocaDim can not handle them anyway, causing a crash) + llvm::Type *pType = pAlloca->getAllocatedType(); + if ((!pType->isStructTy() && !pType->isArrayTy() && !pType->isVectorTy()) || + pAlloca->isArrayAllocation()) + return false; + + Type *baseType = GetBaseType(pType, nullptr); + if (baseType == nullptr) + return false; + auto Ty = baseType->getScalarType(); + // only handle case with a simple base type + if (!(Ty->isFloatingPointTy() || Ty->isIntegerTy()) && + !(Ty->isPointerTy() && Ty->getPointerElementType()->isFunctionTy())) + return false; + + return CheckAllocaUsesInternal(pAlloca); +} + +void TransformPrivMem::visitStore(StoreInst &I) { + if (I.getValueOperand()->getType()->isAggregateType()) + m_StoresToHandle.push(&I); +} + +void TransformPrivMem::visitAllocaInst(AllocaInst &I) { + // find those allocas that can be promoted as a whole-vector + if (!CheckIfAllocaPromotable(&I)) { + return; + } + m_allocasToPrivMem.push_back(&I); +} + +void TransformPrivMem::handleAllocaInst(llvm::AllocaInst *pAlloca) { + // Extract the Alloca size and the base Type + Type *pType = pAlloca->getType()->getPointerElementType(); + Type *pBaseType = GetBaseType(pType, nullptr); + if (!pBaseType) + return; + pBaseType = pBaseType->getScalarType(); + llvm::AllocaInst *pVecAlloca = createVectorForAlloca(pAlloca, pBaseType); + if (!pVecAlloca) + return; + // skip processing of allocas that are already fine + if (pVecAlloca->getType() == pAlloca->getType()) + return; + + IRBuilder<> IRB(pVecAlloca); + Value *idx = IRB.getInt32(0); + TransposeHelperPromote helper(pVecAlloca, m_pDL, + m_pDL->getTypeAllocSize(pBaseType)); + helper.HandleAllocaSources(pAlloca, idx); + helper.EraseDeadCode(); +} + +void TransposeHelper::EraseDeadCode() { + for (Instruction *I : m_toBeRemoved) + I->dropAllReferences(); + for (Instruction *I : m_toBeRemoved) + I->eraseFromParent(); +} + +void TransposeHelper::HandleAllocaSources(Instruction *v, Value *idx) { + SmallVector instructions; + for (Value::user_iterator it = v->user_begin(), e = v->user_end(); it != e; + ++it) { + Value *inst = cast(*it); + instructions.push_back(inst); + } + + for (auto instruction : instructions) { + if (GetElementPtrInst *pGEP = dyn_cast(instruction)) { + handleGEPInst(pGEP, idx); + } else if (BitCastInst *bitcast = dyn_cast(instruction)) { + m_toBeRemoved.push_back(bitcast); + Type *baseT = + GetBaseType(bitcast->getType()->getPointerElementType(), nullptr); + Type *sourceType = GetBaseType( + bitcast->getOperand(0)->getType()->getPointerElementType(), nullptr); + assert(baseT && sourceType); + // either the point-to-element-type is the same or + // the point-to-element-type is the byte + if (baseT->getScalarSizeInBits() == sourceType->getScalarSizeInBits()) + HandleAllocaSources(bitcast, idx); + else if (baseT->isPointerTy() && baseT->getPointerElementType()->isFunctionTy()) + HandleAllocaSources(bitcast, idx); + else { + assert(baseT->getScalarSizeInBits() == 8); + IRBuilder<> IRB(bitcast); + auto ElementSize = + sourceType->getScalarSizeInBits() / baseT->getScalarSizeInBits(); + Value * Scale = nullptr; + if (idx->getType()->isVectorTy()) { + auto Width = idx->getType()->getVectorNumElements(); + Scale = ConstantVector::getSplat(Width, IRB.getInt32(ElementSize)); + } + else + Scale = IRB.getInt32(ElementSize); + auto NewIdx = IRB.CreateMul(idx, Scale); + HandleAllocaSources(bitcast, NewIdx); + } + } else if (StoreInst *pStore = llvm::dyn_cast(instruction)) { + handleStoreInst(pStore, idx); + } else if (LoadInst *pLoad = 
llvm::dyn_cast(instruction)) { + handleLoadInst(pLoad, idx); + } else if (PHINode *pPhi = llvm::dyn_cast(instruction)) { + handlePHINode(pPhi, idx, v->getParent()); + } else if (IntrinsicInst *inst = dyn_cast(instruction)) { + auto IID = GenXIntrinsic::getAnyIntrinsicID(inst); + if (IID == llvm::Intrinsic::lifetime_start || + IID == llvm::Intrinsic::lifetime_end) + inst->eraseFromParent(); + else if (IID == GenXIntrinsic::genx_gather_private) + handlePrivateGather(inst, idx); + else if (IID == GenXIntrinsic::genx_scatter_private) + handlePrivateScatter(inst, idx); + else if (inst->getIntrinsicID() == llvm::Intrinsic::masked_gather) + handleLLVMGather(inst, idx); + else if (inst->getIntrinsicID() == llvm::Intrinsic::masked_scatter) + handleLLVMScatter(inst, idx); + } + } +} + + +void TransposeHelper::handleGEPInst(llvm::GetElementPtrInst *GEP, + llvm::Value *idx) { + m_toBeRemoved.push_back(GEP); + Value *PtrOp = GEP->getPointerOperand(); + PointerType *PtrTy = dyn_cast(PtrOp->getType()); + assert(PtrTy && "Only accept scalar pointer!"); + int IdxWidth = 1; + for (auto OI = GEP->op_begin() + 1, E = GEP->op_end(); OI != E; ++OI) { + Value * Idx = *OI; + if (Idx->getType()->isVectorTy()) { + auto Width = Idx->getType()->getVectorNumElements(); + if (Width > 1) { + if (IdxWidth <= 1) + IdxWidth = Width; + else + assert(IdxWidth == Width && "GEP has inconsistent vector-index width"); + } + } + } + Type *Ty = PtrTy; + gep_type_iterator GTI = gep_type_begin(GEP); + IRBuilder<> IRB(GEP); + Value * pScalarizedIdx = (IdxWidth == 1) ? IRB.getInt32(0) : + ConstantVector::getSplat(IdxWidth, IRB.getInt32(0)); + for (auto OI = GEP->op_begin() + 1, E = GEP->op_end(); OI != E; ++OI, ++GTI) { + Value *Idx = *OI; + if (StructType *StTy = GTI.getStructTypeOrNull()) { + unsigned Field = unsigned(cast(Idx)->getZExtValue()); + if (Field) { + Constant *OffsetVal = + IRB.getInt32(m_pDL->getStructLayout(StTy)->getElementOffset(Field) / + m_baseTypeAllocSize); + if (IdxWidth > 1) + OffsetVal = ConstantVector::getSplat(IdxWidth, OffsetVal); + pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, OffsetVal); + } + Ty = StTy->getElementType(Field); + } else { + Ty = GTI.getIndexedType(); + if (const ConstantInt *CI = dyn_cast(Idx)) { + if (!CI->isZero()) { + Constant *OffsetVal = + IRB.getInt32(m_pDL->getTypeAllocSize(Ty) * CI->getZExtValue() / + m_baseTypeAllocSize); + if (IdxWidth > 1) + OffsetVal = ConstantVector::getSplat(IdxWidth, OffsetVal); + pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, OffsetVal); + } + } + else if (!Idx->getType()->isVectorTy() && IdxWidth <= 1) { + Value *NewIdx = IRB.CreateZExtOrTrunc(Idx, IRB.getInt32Ty()); + auto ElementSize = m_pDL->getTypeAllocSize(Ty) / m_baseTypeAllocSize; + NewIdx = IRB.CreateMul(NewIdx, IRB.getInt32(ElementSize)); + pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, NewIdx); + } else { + // the input idx is a vector or the one of the GEP index is vector + Value * NewIdx = nullptr; + auto ElementSize = m_pDL->getTypeAllocSize(Ty) / m_baseTypeAllocSize; + if (Idx->getType()->isVectorTy()) { + assert(Idx->getType()->getVectorNumElements() == IdxWidth); + NewIdx = IRB.CreateZExtOrTrunc(Idx, pScalarizedIdx->getType()); + NewIdx = IRB.CreateMul(NewIdx, + ConstantVector::getSplat(IdxWidth, IRB.getInt32(ElementSize))); + } + else { + Value * NewIdx = IRB.CreateZExtOrTrunc(Idx, IRB.getInt32Ty()); + NewIdx = IRB.CreateMul(NewIdx, IRB.getInt32(ElementSize)); + // splat the new-idx into a vector + NewIdx = IRB.CreateVectorSplat(IdxWidth, NewIdx); + } + pScalarizedIdx = 
IRB.CreateAdd(pScalarizedIdx, NewIdx); + } + } + } + if (!idx->getType()->isVectorTy() && IdxWidth <= 1) { + pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, idx); + } + else if (idx->getType()->isVectorTy()) { + assert(idx->getType()->getVectorNumElements() == IdxWidth); + pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, idx); + } + else { + auto SplatIdx = IRB.CreateVectorSplat(IdxWidth, idx); + pScalarizedIdx = IRB.CreateAdd(pScalarizedIdx, SplatIdx); + } + HandleAllocaSources(GEP, pScalarizedIdx); +} + +// Pass acummulated idx through new phi +void TransposeHelper::handlePHINode(PHINode *pPhi, Value *idx, + BasicBlock *pIncomingBB) { + PHINode *NewPhi = nullptr; + // If phi is not yet visited + if (!m_phiReplacement.count(pPhi)) { + IRBuilder<> IRB(pPhi); + NewPhi = IRB.CreatePHI(idx->getType(), pPhi->getNumIncomingValues(), "idx"); + m_phiReplacement.insert(std::make_pair(pPhi, NewPhi)); + m_toBeRemoved.push_back(pPhi); + } else + NewPhi = m_phiReplacement[pPhi]; + NewPhi->addIncoming(idx, pIncomingBB); + HandleAllocaSources(pPhi, NewPhi); +} + +void TransposeHelperPromote::handleLoadInst(LoadInst *pLoad, + Value *pScalarizedIdx) { + assert(pLoad->isSimple()); + IRBuilder<> IRB(pLoad); + Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca); + auto LdTy = pLoad->getType()->getScalarType(); + auto VETy = pLoadVecAlloca->getType()->getScalarType(); + auto ReadIn = pLoadVecAlloca; + bool IsFuncPointer = pLoad->getPointerOperandType()->isPointerTy() && + pLoad->getPointerOperandType()->getPointerElementType()->isPointerTy() && + pLoad->getPointerOperandType()->getPointerElementType()->getPointerElementType()->isFunctionTy(); + // do the type-casting if necessary + if (VETy != LdTy && !IsFuncPointer) { + auto VLen = pLoadVecAlloca->getType()->getVectorNumElements(); + assert(VETy->getScalarSizeInBits() >= LdTy->getScalarSizeInBits()); + assert((VETy->getScalarSizeInBits() % LdTy->getScalarSizeInBits()) == 0); + VLen = VLen * (VETy->getScalarSizeInBits() / LdTy->getScalarSizeInBits()); + ReadIn = IRB.CreateBitCast(ReadIn, VectorType::get(LdTy, VLen)); + } + if (IsFuncPointer) { + Region R(VectorType::get( + pVecAlloca->getType() + ->getPointerElementType() + ->getVectorElementType(), + m_pDL->getTypeSizeInBits(LdTy) / + m_pDL->getTypeSizeInBits(pVecAlloca->getType() + ->getPointerElementType() + ->getVectorElementType())), + m_pDL); + if (!pScalarizedIdx->getType()->isIntegerTy(16)) { + pScalarizedIdx = IRB.CreateZExtOrTrunc(pScalarizedIdx, Type::getInt16Ty(pLoad->getContext())); + } + R.Indirect = pScalarizedIdx; + auto *Result = R.createRdRegion(pLoadVecAlloca, pLoad->getName(), pLoad, + pLoad->getDebugLoc(), true); + if (!Result->getType()->isPointerTy()) { + auto *BC = + IRB.CreateBitCast(Result, Type::getInt64Ty(pLoad->getContext())); + auto *PtrToI = IRB.CreateIntToPtr(BC, pLoad->getType(), pLoad->getName()); + pLoad->replaceAllUsesWith(PtrToI); + } else + pLoad->replaceAllUsesWith(Result); + } + else if (pLoad->getType()->isVectorTy()) { + // A vector load + // %v = load <2 x float>* %ptr + // becomes + // %w = load <32 x float>* %ptr1 + // %v0 = extractelement <32 x float> %w, i32 %idx + // %v1 = extractelement <32 x float> %w, i32 %idx+1 + // replace all uses of %v with <%v0, %v1> + auto Len = pLoad->getType()->getVectorNumElements(); + Value *Result = UndefValue::get(pLoad->getType()); + for (unsigned i = 0; i < Len; ++i) { + Value *VectorIdx = ConstantInt::get(pScalarizedIdx->getType(), i); + auto Idx = IRB.CreateAdd(pScalarizedIdx, VectorIdx); + auto Val = 
IRB.CreateExtractElement(ReadIn, Idx); + Result = IRB.CreateInsertElement(Result, Val, VectorIdx); + } + pLoad->replaceAllUsesWith(Result); + } else { + auto Result = IRB.CreateExtractElement(ReadIn, pScalarizedIdx); + pLoad->replaceAllUsesWith(Result); + } + pLoad->eraseFromParent(); +} + +void TransposeHelperPromote::handleStoreInst(llvm::StoreInst *pStore, + llvm::Value *pScalarizedIdx) { + // Add Store instruction to remove list + assert(pStore->isSimple()); + IRBuilder<> IRB(pStore); + llvm::Value *pStoreVal = pStore->getValueOperand(); + llvm::Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca); + llvm::Value *WriteOut = pLoadVecAlloca; + auto StTy = pStoreVal->getType()->getScalarType(); + auto VETy = pLoadVecAlloca->getType()->getScalarType(); + // do the type-casting if necessary + + bool IsFuncPointerStore = + (isFuncPointerVec(pStoreVal) || + (pStoreVal->getType()->isPointerTy() && + pStoreVal->getType()->getPointerElementType()->isFunctionTy())); + if (VETy != StTy && !IsFuncPointerStore) { + auto VLen = pLoadVecAlloca->getType()->getVectorNumElements(); + assert(VETy->getScalarSizeInBits() >= StTy->getScalarSizeInBits()); + assert((VETy->getScalarSizeInBits()%StTy->getScalarSizeInBits()) == 0); + VLen = VLen * (VETy->getScalarSizeInBits() / StTy->getScalarSizeInBits()); + WriteOut = IRB.CreateBitCast(WriteOut, VectorType::get(StTy, VLen)); + } + if (IsFuncPointerStore) { + auto *NewStoreVal = pStoreVal; + assert(pVecAlloca->getType()->getPointerElementType()->getVectorElementType()->isIntegerTy(8)); + if (NewStoreVal->getType()->isPointerTy() && + NewStoreVal->getType()->getPointerElementType()->isFunctionTy()) { + NewStoreVal = IRB.CreatePtrToInt(NewStoreVal, IntegerType::get(pStore->getContext(), 64)); + NewStoreVal = IRB.CreateBitCast(NewStoreVal, VectorType::get(VETy, 8)); + } + Region R(NewStoreVal, m_pDL); + if (!pScalarizedIdx->getType()->isIntegerTy(16)) { + pScalarizedIdx = IRB.CreateZExtOrTrunc(pScalarizedIdx, Type::getInt16Ty(pStore->getContext())); + } + R.Indirect = pScalarizedIdx; + WriteOut = R.createWrRegion(WriteOut, NewStoreVal, pStore->getName(), pStore, + pStore->getDebugLoc()); + } else if (pStoreVal->getType()->isVectorTy()) { + // A vector store + // store <2 x float> %v, <2 x float>* %ptr + // becomes + // %w = load <32 x float> *%ptr1 + // %v0 = extractelement <2 x float> %v, i32 0 + // %w0 = insertelement <32 x float> %w, float %v0, i32 %idx + // %v1 = extractelement <2 x float> %v, i32 1 + // %w1 = insertelement <32 x float> %w0, float %v1, i32 %idx+1 + // store <32 x float> %w1, <32 x float>* %ptr1 + auto Len = pStoreVal->getType()->getVectorNumElements(); + for (unsigned i = 0; i < Len; ++i) { + Value *VectorIdx = ConstantInt::get(pScalarizedIdx->getType(), i); + auto Val = IRB.CreateExtractElement(pStoreVal, VectorIdx); + auto Idx = IRB.CreateAdd(pScalarizedIdx, VectorIdx); + WriteOut = IRB.CreateInsertElement(WriteOut, Val, Idx); + } + } else { + WriteOut = IRB.CreateInsertElement(WriteOut, pStoreVal, pScalarizedIdx); + } + // cast the vector type back if necessary + if (VETy != StTy) + WriteOut = IRB.CreateBitCast(WriteOut, pLoadVecAlloca->getType()); + IRB.CreateStore(WriteOut, pVecAlloca); + pStore->eraseFromParent(); +} + +void TransposeHelperPromote::handlePrivateGather(IntrinsicInst *pInst, + Value *pScalarizedIdx) { + IRBuilder<> IRB(pInst); + assert(pInst->getType()->isVectorTy()); + Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca); + auto N = pInst->getType()->getVectorNumElements(); + auto ElemType = 
pInst->getType()->getVectorElementType(); + + // A vector load + // %v = <2 x float> gather %pred, %ptr, %offset, %old_value + // becomes + // %w = load <32 x float>* %ptr1 + // %v0 = <2 x float> rdregion <32 x float> %w, i32 %offsets, %stride + // + // replace all uses of %v with <%v0, %v1> + Region R(pInst); + int64_t v0 = 0; + int64_t diff = 0; + ConstantInt *CI = dyn_cast(pScalarizedIdx); + PointerType *GatherPtrTy = + dyn_cast(pInst->getArgOperand(1)->getType()); + // pScalarizedIdx is an indice of element, so + // count byte offset depending on the type of pointer in gather + assert(GatherPtrTy); + unsigned GatherPtrNumBytes = + GatherPtrTy->getElementType()->getPrimitiveSizeInBits() / 8; + if (CI != nullptr && + IsLinearVectorConstantInts(pInst->getArgOperand(2), v0, diff)) { + R.Indirect = nullptr; + R.Width = N; + int BytesOffset = CI->getSExtValue() * GatherPtrNumBytes; + R.Offset = v0 + BytesOffset; + R.Stride = (diff * 8) / ElemType->getPrimitiveSizeInBits(); + R.VStride = 0; + } else { + auto OffsetType = + VectorType::get(IntegerType::getInt16Ty(pInst->getContext()), N); + auto Offsets = IRB.CreateIntCast(pInst->getArgOperand(2), OffsetType, true); + auto Cast = IRB.CreateIntCast( + pScalarizedIdx, IntegerType::getInt16Ty(pInst->getContext()), true); + auto Scale = IRB.CreateMul(IRB.getInt16(GatherPtrNumBytes), Cast); + auto vec = VectorType::get(IntegerType::getInt16Ty(pInst->getContext()), 1); + auto GEPOffsets = + IRB.CreateInsertElement(UndefValue::get(vec), Scale, IRB.getInt32(0)); + GEPOffsets = IRB.CreateShuffleVector( + GEPOffsets, UndefValue::get(vec), + ConstantAggregateZero::get( + VectorType::get(IntegerType::getInt32Ty(pInst->getContext()), N))); + Offsets = IRB.CreateAdd(GEPOffsets, Offsets); + R.Indirect = Offsets; + R.Width = 1; + R.Stride = 0; + R.VStride = 0; + } + Value *Result = + R.createRdRegion(pLoadVecAlloca, pInst->getName(), pInst /*InsertBefore*/, + pInst->getDebugLoc(), true /*AllowScalar*/); + + // if old-value is not undefined and predicate is not all-one, + // create a select auto OldVal = pInst->getArgOperand(3); + auto PredVal = pInst->getArgOperand(0); + bool PredAllOne = false; + if (auto C = dyn_cast(PredVal)) { + if (auto B = C->getSplatValue()) + PredAllOne = B->isOneValue(); + } + auto OldVal = pInst->getArgOperand(3); + if (!PredAllOne && !isa(OldVal)) { + Result = IRB.CreateSelect(PredVal, Result, OldVal); + } + + pInst->replaceAllUsesWith(Result); + pInst->eraseFromParent(); +} + +void TransposeHelperPromote::handlePrivateScatter(llvm::IntrinsicInst *pInst, + llvm::Value *pScalarizedIdx) { + // Add Store instruction to remove list + IRBuilder<> IRB(pInst); + llvm::Value *pStoreVal = pInst->getArgOperand(3); + llvm::Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca); + if (pStoreVal->getType()->isVectorTy() == false) { + assert(false); + return; + } + auto N = pStoreVal->getType()->getVectorNumElements(); + auto ElemType = pStoreVal->getType()->getVectorElementType(); + // A vector scatter + // scatter %pred, %ptr, %offset, %newvalue + // becomes + // %w = load <32 x float> *%ptr1 + // %w1 = <32 x float> wrregion %w, newvalue, %offset, %pred + // store <32 x float> %w1, <32 x float>* %ptr1 + + // Create the new wrregion + Region R(pStoreVal); + int64_t v0 = 0; + int64_t diff = 0; + ConstantInt *CI = dyn_cast(pScalarizedIdx); + PointerType* ScatterPtrTy = + dyn_cast(pInst->getArgOperand(1)->getType()); + // pScalarizedIdx is an indice of element, so + // count byte offset depending on the type of pointer in scatter + 
assert(ScatterPtrTy); + unsigned ScatterPtrNumBytes = + ScatterPtrTy->getElementType()->getPrimitiveSizeInBits() / 8; + if (CI != nullptr && IsLinearVectorConstantInts(pInst->getArgOperand(2), v0, diff)) { + R.Indirect = nullptr; + R.Width = N; + int BytesOffset = CI->getSExtValue() * ScatterPtrNumBytes; + R.Offset = v0 + BytesOffset; + R.Stride = (diff * 8) / ElemType->getPrimitiveSizeInBits(); + R.VStride = 0; + } else { + auto OffsetType = + VectorType::get(IntegerType::getInt16Ty(pInst->getContext()), N); + auto Offsets = IRB.CreateIntCast(pInst->getArgOperand(2), OffsetType, true); + auto Cast = IRB.CreateIntCast( + pScalarizedIdx, IntegerType::getInt16Ty(pInst->getContext()), true); + auto Scale = IRB.CreateMul(IRB.getInt16(ScatterPtrNumBytes), Cast); + auto vec = VectorType::get(IntegerType::getInt16Ty(pInst->getContext()), 1); + auto GEPOffsets = + IRB.CreateInsertElement(UndefValue::get(vec), Scale, IRB.getInt32(0)); + GEPOffsets = IRB.CreateShuffleVector( + GEPOffsets, UndefValue::get(vec), + ConstantAggregateZero::get( + VectorType::get(IntegerType::getInt32Ty(pInst->getContext()), N))); + Offsets = IRB.CreateAdd(GEPOffsets, Offsets); + R.Indirect = Offsets; + R.Width = 1; + R.Stride = 0; + R.VStride = 0; + } + R.Mask = pInst->getArgOperand(0); + auto NewInst = cast( + R.createWrRegion(pLoadVecAlloca, pStoreVal, pInst->getName(), + pInst /*InsertBefore*/, pInst->getDebugLoc())); + + IRB.CreateStore(NewInst, pVecAlloca); + pInst->eraseFromParent(); +} + +void TransposeHelperPromote::handleLLVMGather(IntrinsicInst *pInst, + Value *pScalarizedIdx) { + IRBuilder<> IRB(pInst); + assert(pInst->getType()->isVectorTy()); + Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca); + auto N = pInst->getType()->getVectorNumElements(); + auto ElemType = pInst->getType()->getVectorElementType(); + + // A vector load + // %v = <2 x float> gather %pred, %vector_of_ptr, %old_value + // becomes + // %w = load <32 x float>* %ptr1 + // %v0 = <2 x float> rdregion <32 x float> %w, i32 %offsets, %stride + // + // replace all uses of %v with <%v0, %v1> + Region R(pInst); + int64_t v0 = 0; + int64_t diff = 0; + // count byte offset depending on the type of pointer in gather + unsigned ElemNumBytes = ElemType->getPrimitiveSizeInBits() / 8; + if (IsLinearVectorConstantInts(pScalarizedIdx, v0, diff)) { + R.Indirect = nullptr; + R.Width = N; + R.Offset = v0; + R.Stride = (diff * 8) / ElemType->getPrimitiveSizeInBits(); + R.VStride = 0; + } + else { + auto OffsetType = + VectorType::get(IntegerType::getInt16Ty(pInst->getContext()), N); + auto Offsets = IRB.CreateIntCast(pScalarizedIdx, OffsetType, false); + auto ScaleVec = + IRB.CreateInsertElement(UndefValue::get(OffsetType), IRB.getInt16(ElemNumBytes), IRB.getInt32(0)); + ScaleVec = IRB.CreateShuffleVector( + ScaleVec, UndefValue::get(OffsetType), + ConstantAggregateZero::get( + VectorType::get(IntegerType::getInt32Ty(pInst->getContext()), N))); + Offsets = IRB.CreateMul(Offsets, ScaleVec); + R.Indirect = Offsets; + R.Width = 1; + R.Stride = 0; + R.VStride = 0; + } + Value *Result = + R.createRdRegion(pLoadVecAlloca, pInst->getName(), pInst /*InsertBefore*/, + pInst->getDebugLoc(), true /*AllowScalar*/); + + // if old-value is not undefined and predicate is not all-one, + // create a select auto OldVal = pInst->getArgOperand(3); + auto PredVal = pInst->getArgOperand(2); + bool PredAllOne = false; + if (auto C = dyn_cast(PredVal)) { + if (auto B = C->getSplatValue()) + PredAllOne = B->isOneValue(); + } + auto OldVal = pInst->getArgOperand(3); + if (!PredAllOne 
&& !isa(OldVal)) { + Result = IRB.CreateSelect(PredVal, Result, OldVal); + } + + pInst->replaceAllUsesWith(Result); + pInst->eraseFromParent(); +} + +void TransposeHelperPromote::handleLLVMScatter(llvm::IntrinsicInst *pInst, + llvm::Value *pScalarizedIdx) { + // Add Store instruction to remove list + IRBuilder<> IRB(pInst); + llvm::Value *pStoreVal = pInst->getArgOperand(3); + llvm::Value *pLoadVecAlloca = IRB.CreateLoad(pVecAlloca); + if (pStoreVal->getType()->isVectorTy() == false) { + assert(false); + return; + } + auto N = pStoreVal->getType()->getVectorNumElements(); + auto ElemType = pStoreVal->getType()->getVectorElementType(); + // A vector scatter + // scatter %pred, %ptr, %offset, %newvalue + // becomes + // %w = load <32 x float> *%ptr1 + // %w1 = <32 x float> wrregion %w, newvalue, %offset, %pred + // store <32 x float> %w1, <32 x float>* %ptr1 + + // Create the new wrregion + Region R(pStoreVal); + int64_t v0 = 0; + int64_t diff = 0; + // pScalarizedIdx is an indice of element, so + // count byte offset depending on the type of pointer in scatter + unsigned ElemNumBytes = ElemType->getPrimitiveSizeInBits() / 8; + if (IsLinearVectorConstantInts(pScalarizedIdx, v0, diff)) { + R.Indirect = nullptr; + R.Width = N; + R.Offset = v0; + R.Stride = (diff * 8) / ElemType->getPrimitiveSizeInBits(); + R.VStride = 0; + } + else { + auto OffsetType = + VectorType::get(IntegerType::getInt16Ty(pInst->getContext()), N); + auto Offsets = IRB.CreateIntCast(pScalarizedIdx, OffsetType, false); + auto ScaleVec = IRB.CreateInsertElement(UndefValue::get(OffsetType), + IRB.getInt16(ElemNumBytes), + IRB.getInt32(0)); + ScaleVec = IRB.CreateShuffleVector( + ScaleVec, UndefValue::get(OffsetType), + ConstantAggregateZero::get( + VectorType::get(IntegerType::getInt32Ty(pInst->getContext()), N))); + Offsets = IRB.CreateMul(Offsets, ScaleVec); + R.Indirect = Offsets; + R.Width = 1; + R.Stride = 0; + R.VStride = 0; + } + R.Mask = pInst->getArgOperand(0); + auto NewInst = cast( + R.createWrRegion(pLoadVecAlloca, pStoreVal, pInst->getName(), + pInst /*InsertBefore*/, pInst->getDebugLoc())); + + IRB.CreateStore(NewInst, pVecAlloca); + pInst->eraseFromParent(); +} + +} // namespace diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPromotePredicate.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPromotePredicate.cpp new file mode 100644 index 000000000000..c28809807f66 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPromotePredicate.cpp @@ -0,0 +1,204 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+/// GenXPromotePredicate
+/// --------------------
+///
+/// GenXPromotePredicate is an optimization pass that promotes vector operations
+/// on predicates (n x i1) to operations on wider integer types (n x i16).
+/// This often reduces flag register pressure and improves code quality.
+///
+//===----------------------------------------------------------------------===//
+
+#include "GenX.h"
+#include "GenXModule.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace genx;
+
+static cl::opt<unsigned>
+LogicOpsThreshold("logical-ops-threshold", cl::init(2), cl::Hidden,
+                  cl::desc("Number of logical operations"));
+
+namespace {
+
+class GenXPromotePredicate : public FunctionPass {
+public:
+  static char ID;
+  GenXPromotePredicate() : FunctionPass(ID) {}
+  bool runOnFunction(Function &F) override;
+  StringRef getPassName() const override { return "GenXPromotePredicate"; }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addPreserved<GenXModule>();
+    AU.setPreservesCFG();
+  }
+
+private:
+  bool matchOpnds(llvm::BasicBlock *UseBB, Value *V, unsigned &NumLogicOps);
+  Value *rewriteTree(Instruction *Inst);
+};
+
+} // namespace
+
+char GenXPromotePredicate::ID = 0;
+
+namespace llvm {
+void initializeGenXPromotePredicatePass(PassRegistry &);
+}
+INITIALIZE_PASS_BEGIN(GenXPromotePredicate, "GenXPromotePredicate",
+                      "GenXPromotePredicate", false, false)
+INITIALIZE_PASS_END(GenXPromotePredicate, "GenXPromotePredicate",
+                    "GenXPromotePredicate", false, false)
+
+FunctionPass *llvm::createGenXPromotePredicatePass() {
+  initializeGenXPromotePredicatePass(*PassRegistry::getPassRegistry());
+  return new GenXPromotePredicate;
+}
+
+// This matches a common pattern like
+//
+//   v1.merge(v2, (v3 > 0) | (v4 < 9))
+//
+// The Or operation will be performed on n x i1 predicates, which may cause
+// flag spills when n is large. We promote such computations into n x i16.
+//
+bool GenXPromotePredicate::runOnFunction(Function &F) {
+  // Collect candidates.
+  SmallVector<Instruction *, 8> Candidates;
+  for (auto &BB : F.getBasicBlockList()) {
+    for (auto &Inst : BB.getInstList()) {
+      auto SI = dyn_cast<SelectInst>(&Inst);
+      if (SI == nullptr || SI->use_empty())
+        continue;
+
+      // Match conditions with at least 32 elements.
+      auto Cond = dyn_cast<Instruction>(SI->getCondition());
+      if (!Cond || !Cond->getType()->isVectorTy())
+        continue;
+      if (Cond->getType()->getVectorNumElements() < 32)
+        continue;
+
+      // TODO: analyze when it is beneficial to promote.
+      unsigned NumLogicOps = 0;
+      if (matchOpnds(SI->getParent(), Cond, NumLogicOps) &&
+          NumLogicOps >= LogicOpsThreshold)
+        Candidates.push_back(Cond);
+    }
+  }
+
+  // Do promotions. This is a tree rewrite, with candidates as roots and
+  // comparisons or constants as leaf nodes.
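+  //
+  // As a minimal illustration (the operand names and the <32 x ...> widths
+  // below are assumed for the example, not taken from a real kernel), a
+  // candidate condition such as
+  //
+  //   %c1 = icmp sgt <32 x i32> %v3, zeroinitializer
+  //   %c2 = icmp slt <32 x i32> %v4, %nine
+  //   %c3 = icmp ne  <32 x i32> %v5, zeroinitializer
+  //   %t  = or  <32 x i1> %c1, %c2
+  //   %c  = and <32 x i1> %t, %c3
+  //   %m  = select <32 x i1> %c, <32 x float> %v2, <32 x float> %v1
+  //
+  // is rewritten so the logical operations happen on i16 elements:
+  //
+  //   %c1.sext = sext <32 x i1> %c1 to <32 x i16>
+  //   %c2.sext = sext <32 x i1> %c2 to <32 x i16>
+  //   %c3.sext = sext <32 x i1> %c3 to <32 x i16>
+  //   %t16     = or  <32 x i16> %c1.sext, %c2.sext
+  //   %c16     = and <32 x i16> %t16, %c3.sext
+  //   %c       = trunc <32 x i16> %c16 to <32 x i1>
+  //   %m  = select <32 x i1> %c, <32 x float> %v2, <32 x float> %v1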
+ for (auto Inst : Candidates) { + assert(Inst->hasOneUse()); + Instruction *UI = Inst->user_back(); + Value *V = rewriteTree(Inst); + assert(isa(V)); + auto TI = TruncInst::Create(CastInst::Trunc, V, Inst->getType()); + TI->insertAfter(cast(V)); + TI->setDebugLoc(Inst->getDebugLoc()); + UI->replaceUsesOfWith(Inst, TI); + RecursivelyDeleteTriviallyDeadInstructions(Inst); + } + + return !Candidates.empty(); +} + +bool GenXPromotePredicate::matchOpnds(llvm::BasicBlock *UseBB, Value *V, + unsigned &NumLogicOps) { + auto Inst = dyn_cast(V); + // Constants are OK. + if (Inst == nullptr) + return isa(V); + + unsigned Opc = Inst->getOpcode(); + switch (Opc) { + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + ++NumLogicOps; + // Match local definitions only. + if (!Inst->hasOneUse() || Inst->getParent() != UseBB) + return false; + + // Recurse on its operands. + return matchOpnds(UseBB, Inst->getOperand(0), NumLogicOps) && + matchOpnds(UseBB, Inst->getOperand(1), NumLogicOps); + case Instruction::ICmp: + case Instruction::FCmp: + // Matching stops at local comparison operands. + return Inst->hasOneUse() && Inst->getParent() == UseBB; + default: + break; + } + + // Not a match. + return false; +} +Value *GenXPromotePredicate::rewriteTree(Instruction *Inst) { + IRBuilder<> Builder(Inst); + unsigned N = Inst->getType()->getVectorNumElements(); + VectorType *VT = VectorType::get(Builder.getInt16Ty(), N); + unsigned Opc = Inst->getOpcode(); + switch (Opc) { + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + Value *Ops[] = {nullptr, nullptr}; + for (unsigned i : {0, 1}) { + Value *Op = Inst->getOperand(i); + if (auto C = dyn_cast(Op)) + Ops[i] = Builder.CreateSExt(C, VT, ".sext"); + else if (auto I = dyn_cast(Op)) + Ops[i] = rewriteTree(I); + else + llvm_unreachable("out of sync"); + } + + Value *V = Builder.CreateBinOp(Instruction::BinaryOps(Opc), Ops[0], Ops[1]); + V->takeName(Inst); + if (auto I = dyn_cast(V)) + I->setDebugLoc(Inst->getDebugLoc()); + return V; + } + case Instruction::ICmp: + case Instruction::FCmp: { + auto V = Builder.CreateSExt(Inst, VT, ".sext"); + if (auto I = dyn_cast(V)) { + I->setDebugLoc(Inst->getDebugLoc()); + Inst->moveBefore(I); + } + return V; + } + default: + break; + } + + llvm_unreachable("out of sync"); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXRawSendRipper.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRawSendRipper.cpp new file mode 100644 index 000000000000..48b1172c4eb4 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRawSendRipper.cpp @@ -0,0 +1,96 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+/// GenXRawSendRipper
+/// -----------------
+///
+/// This pass tears down chains of raw sends that are linked through the
+/// old-value operand, when it is safe to do so.
+//===----------------------------------------------------------------------===//
+//
+
+#define DEBUG_TYPE "GENX_RAWSENDRIPPER"
+#include "GenX.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace genx;
+
+namespace {
+
+class GenXRawSendRipper : public FunctionPass {
+
+public:
+  static char ID;
+  explicit GenXRawSendRipper() : FunctionPass(ID) {}
+
+  StringRef getPassName() const override {
+    return "GenX RAW send ripper";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+} // End anonymous namespace
+
+namespace llvm {
+void initializeGenXRawSendRipperPass(PassRegistry &);
+} // End namespace llvm
+
+char GenXRawSendRipper::ID = 0;
+INITIALIZE_PASS(GenXRawSendRipper, "GenXRawSendRipper",
+                "Rip chain of raw send", false, false)
+
+FunctionPass *llvm::createGenXRawSendRipperPass() {
+  initializeGenXRawSendRipperPass(*PassRegistry::getPassRegistry());
+  return new GenXRawSendRipper();
+}
+
+bool GenXRawSendRipper::runOnFunction(Function &F) {
+  bool Changed = false;
+  Value *True = ConstantInt::getTrue(F.getContext());
+  for (auto &BB : F)
+    for (auto &I : BB) {
+      if (GenXIntrinsic::getGenXIntrinsicID(&I) != GenXIntrinsic::genx_raw_send)
+        continue;
+      auto II = cast<CallInst>(&I);
+      if (II->getOperand(1) != True)
+        continue;
+      Value *Old = II->getOperand(5);
+      if (isa<UndefValue>(Old))
+        continue;
+      II->setOperand(5, UndefValue::get(Old->getType()));
+      Changed = true;
+    }
+  return Changed;
+}
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXReduceIntSize.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXReduceIntSize.cpp
new file mode 100644
index 000000000000..2d56603ff330
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXReduceIntSize.cpp
@@ -0,0 +1,1038 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXReduceIntSize +/// ----------------- +/// +/// GenXReduceIntSize is a function pass that reduces the size of vector int +/// values where it can. +/// +/// The semantics of the source language usually involve an operator such as + +/// promoting its operands before performing the calculation. Typically, the +/// front end compiler generates IR for the promotion without bothering to work +/// out if it is unnecessary, as it is easier to work out if it is unnecessary +/// in a later LLVM pass. +/// +/// For scalar operations, LLVM already contains passes to do this. But it does +/// not seem to for vectors, possibly because OpenCL does not have C-like +/// promotion rules for vectors. CM does have C-like promotion rules for vectors, +/// so we need to cope with unnecessarily promoted operations. +/// +/// Operation of the pass +/// ^^^^^^^^^^^^^^^^^^^^^ +/// +/// First it does a backwards scan, spotting where an instruction can be +/// converted to a smaller int size because its result is used in other +/// instructions that only use the lower part of the value (trunc, or an "and" +/// with e.g. 0xff). The modified instruction with a smaller int size then +/// needs a trunc inserting for each operand. When the pass reaches the +/// instruction that is the input to that new trunc, it may be able to +/// modify that one too. Thus a reduced int size gets propagated backwards. +/// +/// Then it does a forwards scan, spotting where an instruction can be converted +/// to a smaller int size because the operands have only the lower part of the +/// value set (zext/sext, or an "and" with e.g. 0xff). The modified instruction with +/// a smaller int size then needs a ZExt/SExt inserting. Thus the reduced int size +/// is propagated forwards. 
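+///
+/// As a minimal worked example (the types below are illustrative, not taken
+/// from a real kernel), given
+///
+///   %a32 = zext <8 x i8> %a to <8 x i32>
+///   %b32 = zext <8 x i8> %b to <8 x i32>
+///   %s32 = add <8 x i32> %a32, %b32
+///   %s   = trunc <8 x i32> %s32 to <8 x i16>
+///
+/// the add is only used through a trunc to 16 bits, so the pass can perform
+/// it on <8 x i16> operands and drop the i32 promotion entirely.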
+/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_REDUCEINTSIZE" + +#include "GenX.h" +#include "GenXIntrinsics.h" +#include "GenXModule.h" +#include "GenXUtil.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + + +using namespace llvm; +using namespace llvm::PatternMatch; +using namespace genx; + +namespace { + +// GenXReduceIntSize : reduce integer size +class GenXReduceIntSize : public FunctionPass { + struct ValueNumBits { + unsigned NumBits; + bool IsSignExtended; + ValueNumBits(unsigned NumBits) : NumBits(NumBits), IsSignExtended(false) {} + ValueNumBits(unsigned NumBits, bool IsSignExtended) + : NumBits(NumBits), IsSignExtended(IsSignExtended) {} + }; + bool Modified; +public: + static char ID; + explicit GenXReduceIntSize() : FunctionPass(ID) { } + virtual StringRef getPassName() const { return "GenX reduce integer size"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunction(Function &F); +private: + Instruction *reverseProcessInst(Instruction *Inst); + Value *truncValue(Value *V, unsigned NumBits, Instruction *InsertBefore, + const DebugLoc &DL); + Instruction *forwardProcessInst(Instruction *Inst); + ValueNumBits getValueNumBits(Value *V, bool PreferSigned = false); + Value *getSplatValue(ShuffleVectorInst *SVI) const; +}; + +} // end anonymous namespace + +char GenXReduceIntSize::ID = 0; +namespace llvm { void initializeGenXReduceIntSizePass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXReduceIntSize, "GenXReduceIntSize", "GenXReduceIntSize", false, false) +INITIALIZE_PASS_END(GenXReduceIntSize, "GenXReduceIntSize", "GenXReduceIntSize", false, false) + +class ExtOperator : public Operator { +public: + static bool isExtOpcode(unsigned Opc) { + return Opc == Instruction::SExt || Opc == Instruction::ZExt; + } + static inline bool classof(const Instruction *I) { + return isExtOpcode(I->getOpcode()); + } + static inline bool classof(const ConstantExpr *CE) { + return isExtOpcode(CE->getOpcode()); + } + static inline bool classof(const Value *V) { + return (isa(V) && classof(cast(V))) || + (isa(V) && classof(cast(V))); + } +}; + +FunctionPass *llvm::createGenXReduceIntSizePass() +{ + initializeGenXReduceIntSizePass(*PassRegistry::getPassRegistry()); + return new GenXReduceIntSize(); +} + +void GenXReduceIntSize::getAnalysisUsage(AnalysisUsage &AU) const +{ + AU.setPreservesCFG(); +} + +/*********************************************************************** + * GenXReduceIntSize::runOnFunction : process one function to + * reduce integer size where possible + */ +bool GenXReduceIntSize::runOnFunction(Function &F) +{ + // Reverse scan: This does a postordered depth first traversal of the CFG, + // processing instructions within a basic block in reverse, to ensure that we + // see a def after its uses (ignoring phi node uses). 
+ Modified = false; + for (po_iterator i = po_begin(&F.getEntryBlock()), + e = po_end(&F.getEntryBlock()); i != e; ++i) { + BasicBlock *BB = *i; + // This loop scans the BB in reverse, and allows processReverseInst to + // erase Inst and other instructions. + for (auto Inst = &BB->back(); Inst; ) + Inst = reverseProcessInst(Inst); + } + // Forward scan: This does a preordered depth first traversal of the CFG to + // ensure that we see a def before its uses (ignoring phi node uses). + for (df_iterator i = df_begin(&F.getEntryBlock()), + e = df_end(&F.getEntryBlock()); i != e; ++i) { + BasicBlock *BB = *i; + // This loop scans the BB forward, and allows processForwardInst to erase + // Inst and other instructions. + for (auto Inst = &BB->front(); Inst; ) + Inst = forwardProcessInst(Inst); + } + return Modified; +} + +/*********************************************************************** + * getAndNumBits : get the number of lower bits set by an "and" instruction + */ +static unsigned getAndNumBits(Instruction *Inst) +{ + if (auto C = dyn_cast(Inst->getOperand(1))) { + if ((C = C->getSplatValue())) { + uint64_t Val = cast(C)->getZExtValue(); + return 64 - countLeadingZeros(Val, ZB_Width); + } + } + return Inst->getType()->getScalarType()->getPrimitiveSizeInBits(); +} + +/*********************************************************************** + * getPrev : get the previous instruction, or 0 if at start of BB + * getNext : get the next instruction, or 0 if at end of BB + */ +static Instruction *getPrev(Instruction *Inst) +{ + if (&Inst->getParent()->front() == Inst) + return nullptr; + return Inst->getPrevNode(); +} + +static Instruction *getNext(Instruction *Inst) +{ + if (&Inst->getParent()->back() == Inst) + return nullptr; + return Inst->getNextNode(); +} + +/*********************************************************************** + * reverseProcessInst : process one instruction in GenXReduceIntSize's + * reverse scan + * + * Enter: Inst = the instruction to process + * + * Return: the previous instruction (after any erases done in here), 0 if + * at start of block + */ +Instruction *GenXReduceIntSize::reverseProcessInst(Instruction *Inst) +{ + Instruction *Prev = getPrev(Inst); + // Ignore if not at least a 4 vector. + auto VT = dyn_cast(Inst->getType()); + if (!VT) + return Prev; + if (!VT->getElementType()->isIntegerTy()) + return Prev; + unsigned NumBits = VT->getElementType()->getPrimitiveSizeInBits(); + if (NumBits == 1) + return Prev; + unsigned TruncBits = 0; + // See if the value is only used in instructions that use fewer bits (trunc, + // and, shl). Get the max truncated size. + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + unsigned ThisTruncBits = NumBits; + auto user = cast(ui->getUser()); + switch (user->getOpcode()) { + case Instruction::Trunc: + ThisTruncBits = user->getType()->getScalarType()->getPrimitiveSizeInBits(); + break; + case Instruction::And: + ThisTruncBits = getAndNumBits(user); + break; + default: + ThisTruncBits = NumBits; + break; + } + TruncBits = std::max(TruncBits, ThisTruncBits); + if (TruncBits == NumBits) + break; + } + if (!TruncBits) + return Prev; // Inst is unused + // Round TruncBits up to next power of two no smaller than 8. + TruncBits = std::max(8, 1 << genx::log2(TruncBits * 2 - 1)); + // If the instruction is not min/max, truncate to no smaller than 16. 
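+  // (Illustration of the rounding above, assuming genx::log2 is a
+  // floor-of-log2 helper: TruncBits = 5 rounds up to 8 and TruncBits = 12
+  // rounds up to 16, while exact powers of two are left unchanged; a
+  // non-min/max value whose uses only need 12 bits is therefore recomputed
+  // at 16 bits.)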
+ switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_smin: + case GenXIntrinsic::genx_umin: + case GenXIntrinsic::genx_smax: + case GenXIntrinsic::genx_umax: + break; + default: + TruncBits = std::max(TruncBits, 16U); + break; + } + if (TruncBits >= NumBits) + return Prev; // Inst is used somewhere that cannot truncate. + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::reverse: can truncate to " + << TruncBits << " bits: " << *Inst << "\n"); + Value *NewVal = nullptr; + Instruction *NewInst = nullptr; + // Put new code _after_ original instruction, so we don't see it again in + // this backwards pass. + Instruction *InsertBefore = Inst->getNextNode(); + const DebugLoc &DL = Inst->getDebugLoc(); + switch (Inst->getOpcode()) { + case Instruction::LShr: + case Instruction::AShr: + // An shr by constant needs N more bits, where N is the constant. + // That might still allow some truncation. + if (auto C = dyn_cast(Inst->getOperand(1))) { + if ((C = C->getSplatValue())) { + TruncBits += cast(C)->getSExtValue(); + // Round TruncBits up to next power of two no smaller than 8. + TruncBits = std::max(8, 1 << genx::log2(TruncBits * 2 - 1)); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::reverse: actually can only truncate right shift to " + << TruncBits << " bits\n"); + if (TruncBits < NumBits) { + NewInst = BinaryOperator::Create( + (Instruction::BinaryOps)Inst->getOpcode(), + truncValue(Inst->getOperand(0), TruncBits, InsertBefore, DL), + truncValue(Inst->getOperand(1), TruncBits, InsertBefore, DL), + "", InsertBefore); + break; + } + } + } + // Other shr cannot truncate. + return Prev; + case Instruction::And: + // An "and" by constant might be completely removable if the rhs truncates + // to all ones. + if (auto C = dyn_cast(Inst->getOperand(1))) { + if (cast(truncValue(C, TruncBits, InsertBefore, DL)) + ->isAllOnesValue()) { + // Remove the "and". + NewVal = truncValue(Inst->getOperand(0), TruncBits, InsertBefore, DL); + break; + } + } + // Otherwise, fall through to treat "and" like the other truncatable + // binary ops. + case Instruction::Or: + case Instruction::Xor: + case Instruction::Add: + case Instruction::Sub: + case Instruction::Mul: + case Instruction::Shl: + // These binary operators can just truncate. + NewInst = BinaryOperator::Create( + (Instruction::BinaryOps)Inst->getOpcode(), + truncValue(Inst->getOperand(0), TruncBits, InsertBefore, DL), + truncValue(Inst->getOperand(1), TruncBits, InsertBefore, DL), + "", InsertBefore); + break; + case Instruction::ZExt: + case Instruction::SExt: { + NewVal = Inst->getOperand(0); + unsigned NewBits = NewVal->getType()->getScalarType() + ->getPrimitiveSizeInBits(); + if (TruncBits != NewBits) { + // The value still needs extending, just not as much as before. Or it + // might need to be truncated. + unsigned NumElements = cast(Inst->getType()) + ->getNumElements(); + int Opcode = Instruction::Trunc; + if (TruncBits > NewBits) + Opcode = Inst->getOpcode(); + auto ElTy = Type::getIntNTy(InsertBefore->getContext(), TruncBits); + auto Ty = VectorType::get(ElTy, NumElements); + NewInst = CastInst::Create((Instruction::CastOps)Opcode, NewVal, + Ty, "", InsertBefore); + } + } + break; + case Instruction::ShuffleVector: + if (!cast(Inst->getOperand(2))->isNullValue()) + return Prev; + if (cast(Inst->getOperand(0)->getType()) + ->getNumElements() == 1) { + // This shufflevector is a splat from a 1-vector. 
+ auto TruncatedInput = truncValue(Inst->getOperand(0), TruncBits, + InsertBefore, DL); + NewInst = new ShuffleVectorInst(TruncatedInput, + UndefValue::get(TruncatedInput->getType()), Inst->getOperand(2), "", + InsertBefore); + break; + } + // Detect when the shufflevector is the second half of an + // insertelement+shufflevector sequence being used to implement + // a splat (and the insertelement has no other use). For example: + // %splat.splatinsert.i = insertelement <16 x i32> undef, i32 %direction, i32 0, !dbg !355 + // %splat.splat.i = shufflevector <16 x i32> %splat.splatinsert.i, <16 x i32> undef, <16 x i32> zeroinitializer, !dbg !355 + if (auto IE = dyn_cast(Inst->getOperand(0))) { + if (IE->hasOneUse()) { + if (auto C = dyn_cast(IE->getOperand(2))) { + if (C->isNullValue()) { + // This is a splat, and we can truncate it by creating new + // insertelement and shufflevector instructions. + unsigned NumElements = cast(Inst->getType()) + ->getNumElements(); + auto ElTy = Type::getIntNTy(InsertBefore->getContext(), + TruncBits); + auto Ty = VectorType::get(ElTy, NumElements); + auto NewScalar = CastInst::Create(Instruction::Trunc, + IE->getOperand(1), ElTy, + IE->getOperand(1)->getName() + ".reduceintsize", InsertBefore); + NewScalar->setDebugLoc(IE->getDebugLoc()); + auto NewIE = InsertElementInst::Create(UndefValue::get(Ty), + NewScalar, IE->getOperand(2), "", InsertBefore); + NewIE->setDebugLoc(IE->getDebugLoc()); + NewIE->takeName(IE); + NewInst = new ShuffleVectorInst(NewIE, UndefValue::get(Ty), + Inst->getOperand(2), "", InsertBefore); + break; + } + } + } + } + return Prev; + default: + return Prev; + } + if (NewInst) { + NewInst->setDebugLoc(DL); + NewInst->takeName(Inst); + NewVal = NewInst; + } + assert(NewVal); + // NewVal is the replacement for Inst with a smaller int size. + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::reverse: NewVal: " << *NewVal << "\n"); + // Replace the uses of Inst, which we know are all things that + // have a reduced size requirement (trunc, and). + while (!Inst->use_empty()) { + Instruction *user = cast(Inst->use_begin()->getUser()); + unsigned ThisTruncBits = + user->getType()->getScalarType()->getPrimitiveSizeInBits(); + switch (user->getOpcode()) { + case Instruction::Trunc: { + auto ThisNewVal = NewVal; + if (ThisTruncBits != TruncBits) { + // We need a new trunc. + auto NewTI = CastInst::Create(Instruction::Trunc, NewVal, user->getType(), + "", user); + NewTI->takeName(user); + NewTI->setDebugLoc(user->getDebugLoc()); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::reverse: NewTI: " << *NewTI << "\n"); + ThisNewVal = NewTI; + } + user->replaceAllUsesWith(ThisNewVal); + user->eraseFromParent(); + } + break; + case Instruction::And: { + auto ThisNewVal = NewVal; + unsigned AndBits = getAndNumBits(user); + if (AndBits != TruncBits) { + // We need a replacement "and" instruction with a different type. + auto NewAnd = BinaryOperator::Create(Instruction::And, NewVal, + truncValue(user->getOperand(1), TruncBits, + user, user->getDebugLoc()), + "", user); + NewAnd->takeName(user); + NewAnd->setDebugLoc(user->getDebugLoc()); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::reverse: NewAnd: " << *NewAnd << "\n"); + ThisNewVal = NewAnd; + } + if (ThisTruncBits != TruncBits) { + // Need to trunc or extend our new instruction's result to match + // the result of the "and". + assert(ThisNewVal); + auto NewCast = CastInst::Create( + ThisTruncBits > TruncBits ? 
Instruction::ZExt : Instruction::Trunc, + ThisNewVal, user->getType(), "", user); + if (NewVal == ThisNewVal) + NewCast->takeName(user); + else + NewCast->setName(ThisNewVal->getName() + ".cast"); + NewCast->setDebugLoc(user->getDebugLoc()); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::reverse: NewCast: " << *NewCast << "\n"); + ThisNewVal = NewCast; + } + user->replaceAllUsesWith(ThisNewVal); + user->eraseFromParent(); + } + break; + default: + assert(0 && "unexpected use"); + break; + } + } + // Erase Inst. Its operands may now become unused, in which case remove + // those too. + auto Opnd0Inst = dyn_cast(Inst->getOperand(0)); + Instruction *Opnd1Inst = nullptr; + if (Inst->getNumOperands() >= 2) + Opnd1Inst = dyn_cast(Inst->getOperand(1)); + Inst->eraseFromParent(); + if (Opnd0Inst && Opnd0Inst->use_empty()) { + if (Opnd0Inst == Prev) + Prev = getPrev(Prev); + Opnd0Inst->eraseFromParent(); + if (Opnd0Inst == Opnd1Inst) + Opnd1Inst = nullptr; + } + if (Opnd1Inst && Opnd1Inst->use_empty()) { + if (Opnd1Inst == Prev) + Prev = getPrev(Prev); + Opnd1Inst->eraseFromParent(); + } + Modified = true; + return Prev; +} + +/*********************************************************************** + * truncValue : get truncated version of value + * + * Enter: V = value to truncate (might be constant) + * NumBits = integer bit size to truncate to + * InsertBefore = insert any new instruction before here + * DL = debug loc for any new instruction + */ +Value *GenXReduceIntSize::truncValue(Value *V, unsigned NumBits, + Instruction *InsertBefore, const DebugLoc &DL) +{ + unsigned NumElements = cast(V->getType())->getNumElements(); + auto ElTy = Type::getIntNTy(InsertBefore->getContext(), NumBits); + auto Ty = VectorType::get(ElTy, NumElements); + if (Ty == V->getType()) + return V; + if (auto C = dyn_cast(V)) { + if (isa(C)) + return UndefValue::get(Ty); + if (auto SV = C->getSplatValue()) { + auto AI = cast(SV)->getValue(); + AI = AI.trunc(NumBits); + C = Constant::getIntegerValue(Ty, AI); + return C; + } + SmallVector Vals; + if (auto CV = dyn_cast(C)) { + for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) + Vals.push_back(CV->getOperand(i)); + return ConstantVector::get(Vals); + } else if (auto CDV = dyn_cast(C)) { + for (unsigned i = 0, e = CDV->getNumElements(); i != e; ++i) + Vals.push_back(Constant::getIntegerValue(ElTy, + APInt(NumBits, CDV->getElementAsInteger(i)))); + return ConstantVector::get(Vals); + } + } + // Not a constant. + if (auto Inst = dyn_cast(V)) { + switch (Inst->getOpcode()) { + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: { + // The value is the result of a truncate or extend. + // See if the input is already the right size. + Value *Input = Inst->getOperand(0); + if (Input->getType() == Ty) + return Input; + // Instead of truncating the value, truncate or extend the input. + auto NewInst = CastInst::Create( + Input->getType()->getScalarType()->getPrimitiveSizeInBits() + < NumBits ? 
(Instruction::CastOps)Inst->getOpcode() + : Instruction::Trunc, + Input, Ty, Inst->getName() + ".reduceintsize", InsertBefore); + NewInst->setDebugLoc(DL); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::truncVal: " << *NewInst << "\n"); + return NewInst; + } + case Instruction::And: + if (auto C = dyn_cast(Inst->getOperand(1))) { + auto VNB = getValueNumBits(C); + if (!VNB.IsSignExtended && VNB.NumBits >= NumBits) { + C = C->getSplatValue(); + if (C) { + APInt Mask = C->getUniqueInteger(); + if (Mask.isMask(NumBits)) + // The value is the result of an "and" that only keeps bits + // within the truncated size. Just use its input. + return + truncValue(Inst->getOperand(0), NumBits, InsertBefore, DL); + } + } + } + break; + default: + break; + } + } + // Create a new trunc instruction. + auto NewInst = CastInst::Create(Instruction::Trunc, V, Ty, + V->getName() + ".reduceintsize", InsertBefore); + NewInst->setDebugLoc(DL); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::truncVal: " << *NewInst << "\n"); + return NewInst; +} + +/*********************************************************************** + * forwardProcessInst : process one instruction in GenXReduceIntSize's + * forward scan + * + * Enter: Inst = the instruction to process + * + * Return: the next instruction (after any erases done in here), 0 if + * at end of block + */ +Instruction *GenXReduceIntSize::forwardProcessInst(Instruction *Inst) { + Instruction *Next = getNext(Inst); + // Ignore if not at least a 4 vector. + auto VT = dyn_cast(Inst->getType()); + if (!VT) { + Type *Ty = Inst->getType(); + Value *A; + const APInt *Val; + // Transform (add zext(A), Val) to (zext (add zext(A), Val)). + if (Ty->isIntegerTy(32) && + match(Inst, m_Add(m_ZExt(m_Value(A)), m_APInt(Val)))) + if (A->getType()->isIntegerTy(8) && Val->isNonNegative() && Val->isIntN(8)) { + IRBuilder<> Builder(Inst); + IntegerType *I16Ty = Builder.getInt16Ty(); + APInt NVal = Val->trunc(16); + Instruction *NewInst = cast( + Builder.CreateZExt( + Builder.CreateAdd(Builder.CreateZExt(A, I16Ty), + ConstantInt::get(I16Ty, NVal)), Ty)); + NewInst->takeName(Inst); + Inst->replaceAllUsesWith(NewInst); + Inst->eraseFromParent(); + Modified = true; + } + return Next; + } + if (!VT->getElementType()->isIntegerTy()) + return Next; + unsigned NumBits = VT->getElementType()->getPrimitiveSizeInBits(); + if (NumBits == 1) + return Next; + unsigned TruncBits = NumBits; + bool NeedSignExtend = false; + Instruction *InsertBefore = Inst; + Instruction *NewInst = nullptr; + Value *NewVal = nullptr; + const DebugLoc &DL = Inst->getDebugLoc(); + switch (Inst->getOpcode()) { + case Instruction::LShr: + case Instruction::AShr: + // Convert shl+shr pair back into trunc+ext here, because it makes it + // easier to handle an op that uses the result of it. + if (auto NewInst = convertShlShr(Inst)) { + auto Shl = cast(Inst->getOperand(0)); + Inst->eraseFromParent(); + if (Shl->use_empty()) + Shl->eraseFromParent(); + Inst = NewInst; + } + break; + default: + break; + } + auto IID = GenXIntrinsic::not_any_intrinsic; + switch (Inst->getOpcode()) { + case Instruction::ShuffleVector: + if (Value *V = getSplatValue(cast(Inst))) { + // Transform "splat (ext v)" to "ext (splat v)". 
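+      // For example (illustrative types, not from a real kernel):
+      //   %e = zext i8 %x to i32
+      //   %s = splat %e over <16 x i32>    ; insertelement + shufflevector
+      // becomes
+      //   %v = splat %x over <16 x i8>
+      //   %s = zext <16 x i8> %v to <16 x i32>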
+ if (auto Ext = dyn_cast(V)) { + unsigned NumElts = Inst->getType()->getVectorNumElements(); + IntegerType *I32Ty = Type::getInt32Ty(Inst->getContext()); + VectorType *MaskTy = VectorType::get(I32Ty, NumElts); + Value *Mask = Constant::getNullValue(MaskTy); + Value *Src = Ext->getOperand(0); + if (!isa(Src->getType())) { + VectorType *VTy = VectorType::get(Src->getType(), NumElts); + Src = + InsertElementInst::Create(UndefValue::get(VTy), Src, + Constant::getNullValue(I32Ty), "", + InsertBefore); + } + NewInst = + new ShuffleVectorInst(Src, UndefValue::get(Src->getType()), + Mask, "", InsertBefore); + if (Ext->getOpcode() == Instruction::ZExt) + NewInst = new ZExtInst(NewInst, Inst->getType(), "", InsertBefore); + else + NewInst = new SExtInst(NewInst, Inst->getType(), "", InsertBefore); + } + } + break; + case Instruction::LShr: { + // LShr can just truncate as long as it does not need sign extending. + auto VNB0 = getValueNumBits(Inst->getOperand(0)); + if (!VNB0.IsSignExtended) + TruncBits = VNB0.NumBits; + } + goto binop; + case Instruction::AShr: { + // AShr can just truncate as long as it does need sign extending. + auto VNB0 = getValueNumBits(Inst->getOperand(0), + /*PreferSigned=*/true); + if (VNB0.IsSignExtended) { + TruncBits = VNB0.NumBits; + NeedSignExtend = true; + } + } + goto binop; + case Instruction::And: + { + Value *A; + const APInt *Val; + if (match(Inst, m_And(m_Value(A), m_APInt(Val))) && + Val->isMask(Val->getActiveBits())) { + TruncBits = std::max(16, 1 << genx::log2(Val->getActiveBits() * 2 - 1)); + NeedSignExtend = false; + goto binop; + } + // "And" can just truncate, if both operands are truncated, and need the + // same kind of extension. + auto VNB0 = getValueNumBits(Inst->getOperand(0)); + auto VNB1 = getValueNumBits(Inst->getOperand(1), + /*PreferSigned=*/VNB0.IsSignExtended); + if (VNB0.IsSignExtended == VNB1.IsSignExtended) { + TruncBits = std::max(VNB0.NumBits, VNB1.NumBits); + NeedSignExtend = VNB0.IsSignExtended; + // Round TruncBits up to next power of two no smaller than 8. + TruncBits = std::max(8, 1 << genx::log2(TruncBits * 2 - 1)); + if (TruncBits < NumBits) { + auto Opnd1 = truncValue(Inst->getOperand(1), TruncBits, InsertBefore, DL); + if (auto C = dyn_cast(Opnd1)) { + if (C->isAllOnesValue()) { + // An "and" with constant that is now all ones can be omitted. + // This bypasses the usual rule that an "and", like most other + // operators, should not be truncated smaller than 16. + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::forward: can truncate to " << TruncBits + << " bits and remove completely: " << *Inst << "\n"); + NewVal = truncValue(Inst->getOperand(0), TruncBits, InsertBefore, DL); + break; + } + } + } + } + } + goto binop; + case Instruction::Or: + case Instruction::Xor: + // These binary operators can just truncate, if both operands are + // truncated, and need the same kind of extension. + { + auto VNB0 = getValueNumBits(Inst->getOperand(0)); + auto VNB1 = getValueNumBits(Inst->getOperand(1), + /*PreferSigned=*/VNB0.IsSignExtended); + if (VNB0.IsSignExtended == VNB1.IsSignExtended) { + TruncBits = std::max(VNB0.NumBits, VNB1.NumBits); + NeedSignExtend = VNB0.IsSignExtended; + } + } + goto binop; + case Instruction::Sub: { + Value *A; + const APInt *Val; + // Transforms (sub (zext A), (zext B)) to (zext (sub A, B)) if A is proved + // to be greater than B. 
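+    // Worked example (exposition only): for
+    //   %r = sub <N x i32> <255, 255, ...>, (zext <N x i8> %a to <N x i32>)
+    // every lane of %a is at most 255, so the subtraction can never go
+    // negative; it can be done as an i16 operation (TruncBits = 16) and the
+    // result zero-extended back to i32.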
+ if (match(Inst, m_Sub(m_APInt(Val), m_ZExt(m_Value(A))))) { + unsigned ASize = A->getType()->getScalarSizeInBits(); + if (ASize <= 16 && Val->trunc(ASize).isMaxValue()) { + TruncBits = 16; + goto binop; + } + } + break; + } + case Instruction::Call: + IID = GenXIntrinsic::getGenXIntrinsicID(Inst); + switch (IID) { + case GenXIntrinsic::genx_umin: + case GenXIntrinsic::genx_umax: + case GenXIntrinsic::genx_smin: + case GenXIntrinsic::genx_smax: { + // umin/umax/smin/smax can just truncate as long as both operands + // have the same type of extension. The type of extension (zero + // or signed) determines whether the truncated op is umin/umax or + // smin/smax: + // + // a = zext i16 1 to i32 = 0x00000001 + // b = zext i16 -1 to i32 = 0x0000FFFF + // umax(a, b) = b = umax(trunc(a), trunc(b)) + // smax(a, b) = b = umax(trunc(a), trunc(b)) + // + // c = sext i16 1 to i32 = 0x00000001 + // d = sext i16 -1 to i32 = 0xFFFFFFFF + // umax(c, d) = d = smax(trunc(c), trunc(d)) + // smax(c, d) = c = smax(trunc(c), trunc(d)) + // + auto VNB0 = getValueNumBits(Inst->getOperand(0)); + auto VNB1 = getValueNumBits(Inst->getOperand(1), + /*PreferSigned=*/VNB0.IsSignExtended); + if (VNB0.IsSignExtended == VNB1.IsSignExtended) { + // Round TruncBits up to next power of two no smaller than 8. + // For min and max, allow byte operands. + TruncBits = std::max(VNB0.NumBits, VNB1.NumBits); + TruncBits = std::max(8, 1 << genx::log2(TruncBits * 2 - 1)); + + Type *SrcTy = Inst->getOperand(0)->getType(); + unsigned SrcBits = SrcTy->getScalarSizeInBits(); + // Only update IID when there is truncation in the source. + if (TruncBits < SrcBits) { + switch (IID) { + case GenXIntrinsic::genx_smax: + case GenXIntrinsic::genx_umax: + IID = VNB0.IsSignExtended ? GenXIntrinsic::genx_smax + : GenXIntrinsic::genx_umax; + break; + case GenXIntrinsic::genx_smin: + case GenXIntrinsic::genx_umin: + IID = VNB0.IsSignExtended ? GenXIntrinsic::genx_smin + : GenXIntrinsic::genx_umin; + break; + default: + break; + } + } + } + } + goto binop_truncate; + default: + break; + } + break; + + binop: + // Round TruncBits up to next power of two no smaller than 16. + // Truncating to 8 bits often makes worse gen code because of the + // restrictions on byte operands in gen. + TruncBits = std::max(16, 1 << genx::log2(TruncBits * 2 - 1)); + binop_truncate: + if (TruncBits < NumBits) { + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::forward: can truncate to " << TruncBits + << " bits: " << *Inst << "\n"); + auto Opnd0 = truncValue(Inst->getOperand(0), TruncBits, InsertBefore, DL); + auto Opnd1 = truncValue(Inst->getOperand(1), TruncBits, InsertBefore, DL); + if (isa(Inst)) { + // Create the replacement instruction: binary operator. + NewInst = BinaryOperator::Create( + (Instruction::BinaryOps)Inst->getOpcode(), + Opnd0, Opnd1, "", InsertBefore); + } else { + // Create the replacement instruction: intrinsic. + // If it is not the case that all uses trunc to TruncBits, then + // use the original size as the result type. + Type *ResTy = Opnd0->getType(); + bool IsOneEltVecTy = false; + if (auto VTy = dyn_cast(ResTy)) + IsOneEltVecTy = VTy->getNumElements() == 1; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); + ui != ue; ++ui) { + auto User = cast(ui->getUser()); + // Trace through 'extractelement' on single-element vector values. 
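+        // For example (exposition only), with a <1 x i32> intrinsic result:
+        //   %v = <1 x i32> min/max intrinsic result
+        //   %s = extractelement <1 x i32> %v, i32 0
+        //   %t = trunc i32 %s to i16
+        // the use that matters for the result-type decision is the trunc, so
+        // we step through the single-use extractelement to reach it.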
+ if (IsOneEltVecTy && + User->getOpcode() == Instruction::ExtractElement && + User->hasOneUse()) + User = User->user_back(); + switch (User->getOpcode()) { + case Instruction::Trunc: + if (User->getType()->getScalarType() + ->getPrimitiveSizeInBits() == TruncBits) { + // Use is trunc to TruncBits: allow truncated result type + // for intrinsic. + continue; + } + break; + case Instruction::And: + if (auto C = dyn_cast(User->getOperand(1))) { + auto VNB = getValueNumBits(C); + if (!VNB.IsSignExtended && VNB.NumBits <= TruncBits) { + // Use is and with no bits remaining outside bottom + // TruncBits: allow truncated result type for intrinsic. + continue; + } + } + break; + } + // Other cases: use the original size as the result type. + ResTy = Inst->getType(); + } + TruncBits = ResTy->getScalarType()->getPrimitiveSizeInBits(); + Type *Tys[] = { ResTy, Opnd0->getType() }; + Function *Decl = GenXIntrinsic::getGenXDeclaration( + Inst->getParent()->getParent()->getParent(), + IID, Tys); + Value *Args[] = { Opnd0, Opnd1 }; + NewInst = CallInst::Create(Decl, Args, "", InsertBefore); + } + } + break; + default: + break; + } + if (NewInst) { + NewInst->takeName(Inst); + NewInst->setDebugLoc(DL); + NewVal = NewInst; + } + if (!NewVal) + return Next; + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::forward: NewVal: " << *NewVal << "\n"); + // Replace uses of Inst. The default is that we zero/sign extend back to the + // original size. However, if the use is in a trunc or zext/sext, then we can + // combine. + Instruction *Extended = nullptr; + while (!Inst->use_empty()) { + auto user = cast(Inst->use_begin()->getUser()); + auto ThisNewVal = NewVal; + switch (user->getOpcode()) { + case Instruction::ZExt: + if (NeedSignExtend) + break; + goto combine; + case Instruction::SExt: + if (!NeedSignExtend) + break; + goto combine; + case Instruction::Trunc: + combine: { + unsigned TargetNumBits = user->getType()->getScalarType() + ->getPrimitiveSizeInBits(); + if (TargetNumBits != TruncBits) { + auto NewCast = CastInst::Create( + TargetNumBits > TruncBits + ? (NeedSignExtend ? Instruction::SExt : Instruction::ZExt) + : Instruction::Trunc, + NewVal, user->getType(), "", user); + NewCast->takeName(user); + NewCast->setDebugLoc(user->getDebugLoc()); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::forward: NewCast: " + << *NewCast << "\n"); + ThisNewVal = NewCast; + } + user->replaceAllUsesWith(ThisNewVal); + if (user == Next) + Next = getNext(Next); + user->eraseFromParent(); + } + continue; + } + // Default case. + if (!Extended && NewVal->getType() == Inst->getType()) + Extended = NewInst; + if (!Extended) { + Extended = CastInst::Create( + NeedSignExtend ? Instruction::SExt : Instruction::ZExt, NewVal, + Inst->getType(), NewVal->getName() + ".reduceintsize_extend", + Inst->getNextNode()); + Extended->setDebugLoc(Inst->getDebugLoc()); + LLVM_DEBUG(dbgs() << "GenXReduceIntSize::forward: Extended: " + << *Extended << "\n"); + } + *Inst->use_begin() = Extended; + } + // Erase Inst. Its operands may now become unused, in which case remove + // those too. 
+ auto Opnd0Inst = dyn_cast(Inst->getOperand(0)); + Instruction *Opnd1Inst = nullptr; + if (Inst->getNumOperands() >= 2) + Opnd1Inst = dyn_cast(Inst->getOperand(1)); + Inst->eraseFromParent(); + if (Opnd0Inst && Opnd0Inst->use_empty()) { + if (Opnd0Inst == Next) + Next = getPrev(Next); + Opnd0Inst->eraseFromParent(); + } + if (Opnd1Inst && Opnd1Inst->use_empty()) { + if (Opnd1Inst == Next) + Next = getPrev(Next); + Opnd1Inst->eraseFromParent(); + } + Modified = true; + return Next; +} + +/*********************************************************************** + * getValueNumBits : get the number of bits needed for the vector int value + * + * Enter: PreferSigned = return ValueNumBits with IsSignExtended true + * (and NumBits one greater) for a non-negative + * constant + * + * This just returns the number of bits in an element of the value, except + * for these special cases: + * + * 1. A splatted constant returns the number of bits required to represent + * an element of the constant. + * + * 2. A ZExt returns the number of bits in an element of the _input_. + * + * 3. A SExt returns the number of bits in an element of the _input_, with the + * flag to say it needs sign extending. + * + * 4. An "and" with splatted constant returns the number of bits required + * to represent an element of that constant. + * + * This function returns a ValueNumBits, which contains: + * - NumBits, number of bits required + * - IsSignExtended, true if the missing bits are derived by sign extending + * rather than zero extending + */ +GenXReduceIntSize::ValueNumBits GenXReduceIntSize::getValueNumBits( + Value *V, bool PreferSigned) +{ + unsigned NumBits = V->getType()->getScalarType()->getPrimitiveSizeInBits(); + if (auto C = dyn_cast(V)) { + if (C->getType()->isVectorTy()) + C = C->getSplatValue(); + if (C) { + int64_t Val = cast(C)->getSExtValue(); + if (Val >= 0) + return ValueNumBits(64 - countLeadingZeros((uint64_t)Val, ZB_Width) + + PreferSigned, /*IsSignExtended=*/PreferSigned); + return ValueNumBits(63 - countLeadingZeros((uint64_t)-Val, ZB_Undefined), + /*IsSignExtended=*/true); + } + return NumBits; + } + auto Inst = dyn_cast(V); + if (!Inst) + return NumBits; + switch (Inst->getOpcode()) { + case Instruction::ZExt: + return static_cast(Inst->getOperand(0) + ->getType() + ->getScalarType() + ->getPrimitiveSizeInBits()); + case Instruction::SExt: + return ValueNumBits(Inst->getOperand(0)->getType()->getScalarType() + ->getPrimitiveSizeInBits(), /*IsSignExtended=*/true); + case Instruction::And: + if (auto C = dyn_cast(Inst->getOperand(1))) { + ValueNumBits VNB = getValueNumBits(C); + if (!VNB.IsSignExtended) + return VNB; + } + break; + } + return NumBits; +} + +Value *GenXReduceIntSize::getSplatValue(ShuffleVectorInst *SVI) const { + if (!SVI->getMask()->isNullValue()) + return nullptr; + + Value *Src = SVI->getOperand(0); + + if (auto IEI = dyn_cast(Src)) { + auto C = dyn_cast(IEI->getOperand(2)); + if (C && C->isNullValue()) + return IEI->getOperand(1); + } + + if (cast(Src->getType())->getNumElements() == 1) + return Src; + + return nullptr; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.cpp new file mode 100644 index 000000000000..b79788e0945c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.cpp @@ -0,0 +1,954 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy 
of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// Implementation of methods for Region class +// +//===----------------------------------------------------------------------===// + +#include "GenXRegion.h" +#include "GenXAlignmentInfo.h" +#include "GenXBaling.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/GenXAnalysis.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Support/Debug.h" +#include + +using namespace llvm; +using namespace genx; + +/*********************************************************************** + * getWithOffset : get a Region given a rdregion/wrregion, baling in + * constant add of offset + * + * This constructs the Region with a variable index that is a constant add + * baled in (i.e. Region::Indirect and Region::Offset both set to the + * operands of the add). It is for use when baling information is not + * available, but the caller wants the constant offset separated out like + * that. + */ +Region Region::getWithOffset(Instruction *Inst, bool WantParentWidth) +{ + unsigned OperandNum = 0; + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + OperandNum = GenXIntrinsic::GenXRegion::RdIndexOperandNum; + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrconstregion: + OperandNum = GenXIntrinsic::GenXRegion::WrIndexOperandNum; + break; + default: + llvm_unreachable("not rdregion or wrregion"); + break; + } + BaleInfo BI; + if (GenXBaling::isBalableIndexAdd(Inst->getOperand(OperandNum))) + BI.setOperandBaled(OperandNum); + return Region(Inst, BI, WantParentWidth); +} + +/*********************************************************************** + * Region constructor from a rd/wr region and its BaleInfo + * This also works with rdpredregion and wrpredregion, with Offset in + * bits rather than bytes, and with ElementBytes set to 1. + */ +Region::Region(Instruction *Inst, const BaleInfo &BI, bool WantParentWidth) + : CMRegion() +{ + // Determine where to get the subregion value from and which arg index + // the region parameters start at. 
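+  // For orientation (exposition only): a rdregion with VStride=8, Width=4,
+  // Stride=1, Offset=0 on a <16 x i32> input reads elements
+  // {0,1,2,3, 8,9,10,11}, i.e. two rows of four contiguous elements whose
+  // row starts are 8 elements apart.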
+ unsigned ArgIdx = 0; + Value *Subregion = 0; + assert(isa(Inst)); + auto CallI = cast(Inst); + assert(CallI->getCalledFunction()); + switch (GenXIntrinsic::getGenXIntrinsicID(CallI->getCalledFunction())) { + case GenXIntrinsic::genx_rdpredregion: + NumElements = Inst->getType()->getVectorNumElements(); + Width = NumElements; + Offset = cast(Inst->getOperand(1))->getZExtValue(); + ElementBytes = 1; + return; + case GenXIntrinsic::genx_wrpredregion: + NumElements = Inst->getOperand(1)->getType()->getVectorNumElements(); + Width = NumElements; + Offset = cast(Inst->getOperand(2))->getZExtValue(); + ElementBytes = 1; + return; + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + ArgIdx = 1; + // The size/type of the region is given by the return value: + Subregion = Inst; + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrconstregion: + ArgIdx = 2; + // The size/type of the region is given by the "subregion value to + // write" operand: + Subregion = Inst->getOperand(1); + // For wrregion, while we're here, also get the mask. We set mask to NULL + // if the mask operand is constant 1 (i.e. not predicated). + Mask = Inst->getOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum); + if (auto C = dyn_cast(Mask)) + if (C->isAllOnesValue()) + Mask = 0; + break; + default: + assert(0); + } + // Get the region parameters. + assert(Subregion); + ElementTy = Subregion->getType(); + if (VectorType *VT = dyn_cast(ElementTy)) { + ElementTy = VT->getElementType(); + NumElements = VT->getNumElements(); + } + const DataLayout &DL = Inst->getModule()->getDataLayout(); + assert(DL.getTypeSizeInBits(ElementTy) % genx::ByteBits == 0); + ElementBytes = DL.getTypeSizeInBits(ElementTy) / genx::ByteBits; + VStride = cast(Inst->getOperand(ArgIdx))->getSExtValue(); + Width = cast(Inst->getOperand(ArgIdx + 1))->getSExtValue(); + Stride = cast(Inst->getOperand(ArgIdx + 2))->getSExtValue(); + ArgIdx += 3; + // Get the start index. + Value *V = Inst->getOperand(ArgIdx); + assert(V->getType()->getScalarType()->isIntegerTy(16) && + "region index must be i16 or vXi16 type"); +#if _DEBUG + if (VectorType *VT = dyn_cast(V->getType())) + assert(VT->getNumElements() * Width == NumElements && + "vector region index size mismatch"); +#endif + if (ConstantInt *CI = dyn_cast(V)) + Offset = CI->getSExtValue(); // Constant index. + else { + Indirect = V; // Index is variable; assume no baled in add. + if (BI.isOperandBaled(ArgIdx)) { + Instruction *Operator = cast(V); + // The index is variable and has something baled in. We want to process + // a baled in add or add_addr, and ignore a baled in rdregion. + if(!GenXIntrinsic::isRdRegion(Operator)) { + // The index is variable and has a baled in or/add/sub/add_addr. 
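+        // For example (exposition only): an index of the form
+        //   %idx.add = add i16 %idx, 64
+        // is split here so that Indirect = %idx and Offset = 64, letting the
+        // constant part be treated as an ordinary region offset.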
+ assert((Operator->getOpcode() == Instruction::Add || + Operator->getOpcode() == Instruction::Sub || + Operator->getOpcode() == Instruction::Or || + GenXIntrinsic::getGenXIntrinsicID(Operator) == GenXIntrinsic::genx_add_addr) + && "error: your offset seems to be calculated not through 'add' or 'or' "); + Constant *C = cast(Operator->getOperand(1)); + ConstantInt *CI = dyn_cast(C); + if (!CI) + CI = cast(C->getSplatValue()); + + // check for or could be changed to add + if (Operator->getOpcode() == Instruction::Or && + !haveNoCommonBitsSet(Operator->getOperand(0), Operator->getOperand(1), + Operator->getModule()->getDataLayout())) + { + assert(false && "or should be changed to add without any errors"); + } + + + Offset = CI->getSExtValue(); + + if (Operator->getOpcode() == Instruction::Sub) + Offset = -Offset; + + Indirect = Operator->getOperand(0); + } + } + // For a variable index, get the parent width arg. + ConstantInt *PW = dyn_cast(Inst->getOperand(ArgIdx + 1)); + if (PW) + ParentWidth = PW->getZExtValue(); + } + // We do some trivial legalization here. The legalization pass does not + // make these changes; instead we do them here so they are not permanently + // written back into the IR but are made on the fly each time some other + // pass uses this code to get the region info. + if (NumElements == 1) { + Width = Stride = 1; + VStride = 0; + } else { + if (NumElements <= Width) { + Width = NumElements; + VStride = 0; + } else if ((unsigned)VStride == Width * Stride) { + // VStride == Width * Stride, so we can canonicalize to a 1D region, + // but only if not indirect or not asked to preserve parentwidth, + // and never if multi-indirect. + if (!Indirect + || (!isa(Indirect->getType()) && !WantParentWidth)) { + Width = NumElements; + VStride = 0; + ParentWidth = 0; + } + } else if (Width == 1) { + // We can turn a 2D width 1 region into a 1D region, but if it is + // indirect it invalidates ParentWidth. So only do it if not asked + // to keep ParentWidth. Also we cannot do it if it is multi-indirect. + if (!Indirect + || (!isa(Indirect->getType()) && !WantParentWidth)) { + Width = NumElements; + Stride = VStride; + VStride = 0; + ParentWidth = 0; + } + } + if (Stride == 0 && Width == NumElements) { + // Canonical scalar region. 
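+      // For example (exposition only): a region with Stride 0 whose single
+      // row covers all 8 elements (a broadcast of one element) is rewritten
+      // as the canonical scalar form Width=1, VStride=0, which later passes
+      // recognize as a splat.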
+ Width = 1; + VStride = 0; + } + } +} + +/*********************************************************************** + * Region::getLegalSize : get the max legal size of a region + * + * Enter: Idx = start index into the subregion + * Allow2D = whether to allow 2D region + * InputNumElements = number of elements in whole input vector (so + * we can tell if it is small enough that it cannot possibly + * cross a GRF boundary) + * ST = GenXSubtarget (so we can get gen specific crossing rules) + * AI = 0 else AlignmentInfo (to determine alignment of indirect index) + */ +unsigned Region::getLegalSize(unsigned Idx, bool Allow2D, + unsigned InputNumElements, const GenXSubtarget *ST, AlignmentInfo *AI) +{ + Alignment Align; + if (Indirect) { + Align = Alignment::getUnknown(); + if (AI) + Align = AI->get(Indirect); + } + return getLegalSize(Idx, Allow2D, InputNumElements, ST, Align); +} + +/*********************************************************************** + * Region::getLegalSize : get the max legal size of a region + * + * Enter: Idx = start index into the subregion + * Allow2D = whether to allow 2D region + * InputNumElements = number of elements in whole input vector (so + * we can tell if it is small enough that it cannot possibly + * cross a GRF boundary) + * ST = GenXSubtarget (so we can get gen specific crossing rules) + * Align = alignment of indirect index if any + * + * The setting of Indirect is used as follows: + * + * 0: not indirect + * anything of scalar type: single indirect + * anything of vector type: multi indirect + */ +unsigned Region::getLegalSize(unsigned Idx, bool Allow2D, + unsigned InputNumElements, const GenXSubtarget *ST, Alignment Align) +{ + // Determine the max valid width. + unsigned ValidWidth = 1; + unsigned GRFWidth = ST ? ST->getGRFWidth() : 32; + int MaxStride = 4; + unsigned LogGRFWidth = genx::log2(GRFWidth); + if ((!Stride || exactLog2(Stride) >= 0) && (Allow2D || Stride <= MaxStride)) { + // The stride is legal, so we can potentially do more than one element at a + // time. + // Disallow 2D if the stride is too large for a real Gen region. For a + // source operand (Allow2D is true), we allow a 1D region with stride too + // large, because the vISA writer turns it into a 2D region with width 1. + bool StrideValid = (Stride <= MaxStride); + + if (Indirect && isa(Indirect->getType())) { + // Multi indirect. + if (!Allow2D) { + // Multi indirect not allowed in wrregion. + if (!Stride) + ValidWidth = 1 << genx::log2(Width); + } else if (Width == 1 || !Stride) { + // Multi indirect with width 1 or stride 0. + // Return the max power of two number of elements that: + // 1. fit in 2 GRFs; and + // 2. fit in the whole region; and + // 3. fit in a row if the width is not legal + // 4. no more than 8 elements in multi indirect (because there + // are only 8 elements in an address register). + unsigned LogWidth = genx::log2(Width); + if (1U << LogWidth == Width) + LogWidth = genx::log2(NumElements); // legal width + unsigned LogElementBytes = genx::log2(ElementBytes); + if (LogWidth + LogElementBytes > (LogGRFWidth + 1)) + LogWidth = LogGRFWidth + 1 - LogElementBytes; + ValidWidth = 1 << LogWidth; + if (ValidWidth > 8) + ValidWidth = 8; + } + // Other multi indirect can only do one element. + } else { + // Calculate number of elements up to the boundary imposed by GRF + // crossing rules. 
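+      // Worked example (exposition only): with a 32-byte GRF and 4-byte
+      // (i32) elements, ElementsPerGRF is 8. A direct region whose first
+      // element starts 2 elements into a GRF can therefore extend at most
+      // 2*8 - 2 = 14 elements before reaching the next-but-one GRF boundary.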
+ unsigned ElementsPerGRF = GRFWidth / ElementBytes; + unsigned OffsetElements = Offset / ElementBytes; + unsigned ElementsToBoundary = 1; + unsigned RealIdx = Idx / Width * VStride + Idx % Width * Stride; + if (!Indirect) { + // For a direct operand, just use the constant offset of the + // region and the index so far to calculate how far into a GRF this + // subregion starts, and set the boundary at the next-but-one GRF + // boundary. + unsigned NumGRF = 2; + ElementsToBoundary = (NumGRF * ElementsPerGRF) - + ((RealIdx + OffsetElements) % ElementsPerGRF); + } else if (InputNumElements <= ElementsPerGRF) { + // Indirect region but the whole vector is no bigger than a GRF, so + // there is no limit imposed by GRF crossing. + ElementsToBoundary = ElementsPerGRF; + } else { + // For an indirect region, calculate the min and max index (including + // offset) from the region parameters, and add on the current start + // index to both. + // For <= BDW: + // 1. If the min and max then are in the same GRF, then the distance + // from max to the next GRF boundary is the allowed size. + // For >= SKL: + // 1. If the min and max then are in the same GRF, then the distance + // from max to the next-but-one GRF boundary is the allowed size. + // 2. If the max is in the next GRF after min, then the distance + // from max to the next GRF boundary is the allowed size. + // However vISA imposes the restriction that, in a source indirect + // region, a row cannot cross a GRF, unless the region is contiguous. + // Pending a proper fix, we have a temporary fix here that we disallow + // GRF crossing completely unless the original region is a destination + // operand or is a 1D source operand (so GenXVisaFuncWriter can turn it + // into Nx1 instead of 1xN). We use Allow2D as a proxy for "is source + // operand". + unsigned GRFsPerIndirect = 1; + assert(ST); + if (ST->hasIndirectGRFCrossing() && + // SKL+. See if we can allow GRF crossing. + (Allow2D || !is2D())) { + GRFsPerIndirect = 2; + } + unsigned Last = (NumElements / Width - 1) * VStride + (Width - 1) * Stride; + unsigned Max = InputNumElements - Last - 1 + RealIdx; + unsigned Min = RealIdx; + unsigned MinMaxGRFDiff = (Max & -ElementsPerGRF) - (Min & -ElementsPerGRF); + if (!MinMaxGRFDiff) // min and max in same GRF + ElementsToBoundary = ElementsPerGRF * GRFsPerIndirect + - (Max & (ElementsPerGRF - 1)); + else if (MinMaxGRFDiff == 1 && GRFsPerIndirect > 1) + ElementsToBoundary = ElementsPerGRF - (Max & (ElementsPerGRF - 1)); + // We may be able to refine an indirect region legal width further... + if (exactLog2(ParentWidth) >= 0 + && ParentWidth <= ElementsPerGRF) { + // ParentWidth tells us that a row of our region cannot cross a GRF + // boundary. Say that the boundary is at the next multiple of + // ParentWidth. + ElementsToBoundary = std::max(ParentWidth - RealIdx % ParentWidth, + ElementsToBoundary); + } else if (!isa(Indirect->getType())) { + // Use the alignment+offset of the single indirect index, with alignment + // limited to one GRF. + if (!Align.isUnknown()) { + unsigned LogAlign = Align.getLogAlign(); + unsigned ExtraBits = Align.getExtraBits(); + ExtraBits += (Offset + RealIdx * ElementBytes); + ExtraBits &= ((1 << LogAlign) - 1); + if (LogAlign >= LogGRFWidth && !ExtraBits) { + // Start is GRF aligned, so legal width is 1 GRF for <=BDW or + // 2 GRFs for >=SKL. 
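+              // For example (exposition only): a 32-byte-aligned indirect
+              // index with i32 elements gives ElementsPerGRF = 8, so
+              // ElementsToBoundary here becomes 8 on <=BDW (GRFsPerIndirect
+              // is 1) or 16 on >=SKL when GRF crossing is allowed
+              // (GRFsPerIndirect is 2).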
+ ElementsToBoundary = ElementsPerGRF * GRFsPerIndirect; + } else if (LogAlign > (unsigned)genx::log2(ElementBytes) || + (LogAlign == (unsigned)genx::log2(ElementBytes) && + ExtraBits == 0)) { + LogAlign = std::min(LogGRFWidth, LogAlign) - genx::log2(ElementBytes); + ExtraBits = (ExtraBits & (GRFWidth-1)) >> genx::log2(ElementBytes); + // We have some alignment, so we can say that the next GRF boundary + // is (at least) that many elements away, minus the offset from that + // alignment. + // For SKL+, we can cross one GRF boundary, so add on one GRF's + // worth. + unsigned ElementsToBoundaryFromAlign = (1U << LogAlign) - ExtraBits; + ElementsToBoundaryFromAlign += (GRFsPerIndirect - 1) * ElementsPerGRF; + ElementsToBoundary = std::max(ElementsToBoundaryFromAlign, + ElementsToBoundary); + } + } + } + } + + // Now calculate what subregion we can fit in before the boundary + // calculated above. + if (Allow2D && StrideValid) { + if ((!VStride || exactLog2(VStride) >= 0) && exactLog2(Width) >= 0 + && Width <= 16 && !(Idx % Width) + && ElementsToBoundary >= (Width - 1) * Stride + 1) { + // The vstride and width are legal, and we're at the start of a + // row, and ElementsToBoundary is big enough for at least one + // whole row, so we can potentially do more than one whole row at a + // time. See how many we can fit, without including the "slack" + // at the end of the last row. + unsigned NumRows = 0; + if (VStride == 0) // Avoid divide by 0 + NumRows = (NumElements - Idx) / Width; + else { + unsigned LastElementOfRow = (Width - 1) * Stride; + unsigned Slack = VStride - (LastElementOfRow + 1); + NumRows = (ElementsToBoundary + Slack) / VStride; + if (NumRows) { + if (NumRows * Width + Idx > NumElements) + NumRows = (NumElements - Idx) / Width; + } + } + ValidWidth = (1 << genx::log2(NumRows)) * Width; + } + if (ValidWidth == 1 && Idx % Width) { + // That failed. See if we can legally get to the end of the row then + // the same number of elements again at the start of the next row. + unsigned ToEndOfRow = Width - Idx % Width; + if (exactLog2(ToEndOfRow) >= 0 && ToEndOfRow <= 16) { + unsigned NewVStride = VStride + (ToEndOfRow - Width) * Stride; + if (exactLog2(NewVStride) >= 0 + && NewVStride + (ToEndOfRow - 1) * Stride < ElementsToBoundary) { + // Yes, we can do the end of one row and the same size start of + // the next row. + ValidWidth = 2 * ToEndOfRow; + } + } + } + } + if (ValidWidth == 1) { + // That failed. See how many elements we can get, no further than the + // next end of row. + ValidWidth = Width - Idx % Width; + if (ValidWidth * Stride - (Stride - 1) > ElementsToBoundary) + ValidWidth = (ElementsToBoundary + Stride - 1) / Stride; + ValidWidth = 1 << genx::log2(ValidWidth); + } + // If the RStride is 0 (which is seen in splat operations) then the + // above logic tends to determine that all of the elements can fit, + // irrespective of vector size and type. This is usually incorrect + // in the wider context, so clamp it here to whatever fits in 2GRF if + // necessary + if (ValidWidth > (2 * ElementsPerGRF)) + ValidWidth = 2 * ElementsPerGRF; + + } + } + return ValidWidth; +} + +/*********************************************************************** + * RdWrRegionSequence::buildFromStartWr: detect a split (legalized) + * sequence rdregion-wrregion from the start, and populate the + * RdWrRegionSequence object with its details + * + * This fails if there is any predication. It succeeds with a sequence length + * of one (i.e. a single rdregion-wrregion pair). 
+ * + * On success, if the WaitingFor field matches one of the wrregions in the + * sequence, then WaitingFor is reset to 0. This is used by buildFromWr to + * check that the sequence includes the wrregion originally passed to it. + * + * On failure, EndWr is left as is, which means that isNull() continues to + * be true. + */ +bool RdWrRegionSequence::buildFromStartWr(Instruction *ArgStartWr, + GenXBaling *Baling) +{ + StartWr = ArgStartWr; + auto Wr = StartWr; + assert(GenXIntrinsic::isWrRegion(Wr)); + Region TotalWrR(Wr, Baling->getBaleInfo(Wr)); + WrR = TotalWrR; + if (TotalWrR.Mask) + return false; + OldVal = Wr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + auto RdVal = Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + if (auto Rd = dyn_cast(RdVal)) { + // Handle the case that the start wrregion has a rdregion, so we look for + // a sequence of rd-wr pairs. + if (!GenXIntrinsic::isRdRegion(Rd)) + return false; + Region TotalRdR(Rd, Baling->getBaleInfo(Rd)); + RdR = TotalRdR; + Input = Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + EndWr = Wr; + if (Wr == WaitingFor) + WaitingFor = nullptr; + bool SeenWaitingFor = false; + for (;;) { + if (!Wr->hasOneUse() || Wr->use_begin()->getOperandNo() + != GenXIntrinsic::GenXRegion::OldValueOperandNum) + break; + Wr = cast(Wr->use_begin()->getUser()); + if (!GenXIntrinsic::isWrRegion(Wr)) + break; + Value *In = Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + if (!GenXIntrinsic::isRdRegion(In)) + break; + auto Rd = cast(In); + if (Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum) != Input) + break; + // Append to the regions. Give up if either fails. + if (!TotalRdR.append(Region(Rd, Baling->getBaleInfo(Rd))) + || !TotalWrR.append(Region(Wr, Baling->getBaleInfo(Wr)))) + break; + SeenWaitingFor |= Wr == WaitingFor; + // If both regions are now legal (have a whole number of rows), then + // save the current position. + if (TotalRdR.isWholeNumRows() && TotalWrR.isWholeNumRows()) { + RdR = TotalRdR; + WrR = TotalWrR; + EndWr = Wr; + if (SeenWaitingFor) + WaitingFor = nullptr; + } + } + return true; + } + if (!isa(Wr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum))) + return false; + auto TotalC = dyn_cast(RdVal); + if (!TotalC) + return false; + // Handle the case that the start wrregion has a constant "new value" operand + // and an undef "old value" operand. + // We look for a sequence of wrregions where the "new value" operands are all + // constant and we derive the overall constant. + Region TotalRdR(TotalC); + RdR = TotalRdR; + Input = TotalC; + EndWr = Wr; + if (Wr == WaitingFor) + WaitingFor = nullptr; + bool SeenWaitingFor = false; + for (;;) { + if (!Wr->hasOneUse() || Wr->use_begin()->getOperandNo() + != GenXIntrinsic::GenXRegion::OldValueOperandNum) + break; + Wr = cast(Wr->use_begin()->getUser()); + if (!GenXIntrinsic::isWrRegion(Wr)) + break; + auto In = dyn_cast(Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + if (!In) + break; + // Append to the regions. Give up if either fails. + Region InR(In); + InR.Offset = TotalRdR.NumElements * TotalRdR.ElementBytes; + if (!TotalRdR.append(InR) + || !TotalWrR.append(Region(Wr, Baling->getBaleInfo(Wr)))) + break; + SeenWaitingFor |= Wr == WaitingFor; + // Append the constant. + TotalC = concatConstants(TotalC, In); + // If both regions are now legal (have a whole number of rows), then save + // the current position. 
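+      // For example (exposition only): a 16-element write legalized into two
+      // 8-wide rd/wr pairs is accepted one pair at a time; after the second
+      // pair is appended, the accumulated read and write regions again cover
+      // whole rows, so EndWr advances to that wrregion.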
+ if (TotalRdR.isWholeNumRows() && TotalWrR.isWholeNumRows()) { + RdR = TotalRdR; + WrR = TotalWrR; + EndWr = Wr; + Input = TotalC; + if (SeenWaitingFor) + WaitingFor = nullptr; + } + } + return true; +} + +/*********************************************************************** + * RdWrRegionSequence::buildFromWr: detect a split (legalized) rdregion-wrregion + * sequence starting from any wrregion within it, and populate the + * RdWrRegionSequence object with its details + * + * This fails if there is any predication. It succeeds with a sequence length + * of one (i.e. a single rdregion-wrregion pair). + * + * On failure, EndWr is left as is, which means that isNull() continues to + * be true. + */ +bool RdWrRegionSequence::buildFromWr(Instruction *Wr, GenXBaling *Baling) +{ + // Remember that our sequence needs to contain Wr. + WaitingFor = Wr; + // Scan back to what looks like the start of the sequence. + assert(GenXIntrinsic::isWrRegion(Wr)); + StartWr = Wr; + auto RdVal = Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + auto Rd = dyn_cast(RdVal); + bool ConstSequence = false; + if (!Rd) { + if (!isa(RdVal)) + return 0; + ConstSequence = true; + } else + Input = Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + for (;;) { + Wr = dyn_cast( + Wr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + if (!GenXIntrinsic::isWrRegion(Wr)) + break; + assert(Wr); + if (!Wr->hasOneUse()) + break; + RdVal = Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + if (ConstSequence) { + if (!isa(RdVal)) + break; + } else { + Rd = dyn_cast( + Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + if (!Rd) + break; + if (Input != Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)) + break; + } + StartWr = Wr; + } + // Try detecting a split rdregion-wrregion starting at StartWr. + for (;;) { + if (!buildFromStartWr(StartWr, Baling)) { + EndWr = nullptr; + return false; + } + if (!WaitingFor) + return true; // success + // The detected sequence did not include the wrregion this function + // started with. Retry with the following sequence. + StartWr = cast(EndWr->use_begin()->getUser()); + if (GenXIntrinsic::isWrRegion(StartWr)) + return false; + } +} + +/*********************************************************************** + * RdWrRegionSequence::buildFromRd: detect a split (legalized) rdregion-wrregion + * sequence starting from any rdregion within it, and populate the + * RdWrRegionSequence object with its details + * + * This fails if there is any predication. It succeeds with a sequence length + * of one (i.e. a single rdregion-wrregion pair). 
+ */ +bool RdWrRegionSequence::buildFromRd(Instruction *Rd, GenXBaling *Baling) +{ + assert(GenXIntrinsic::isRdRegion(Rd)); + if (!Rd->hasOneUse()) + return false; + if (Rd->use_begin()->getOperandNo() != GenXIntrinsic::GenXRegion::NewValueOperandNum) + return false; + auto Wr = cast(Rd->use_begin()->getUser()); + if (!GenXIntrinsic::isWrRegion(Wr)) + return false; + return buildFromWr(Wr, Baling); +} + +/*********************************************************************** + * RdWrRegionSequence::size : get number of rdregion-wrregion pairs in the + * sequence + */ +unsigned RdWrRegionSequence::size() const +{ + unsigned Size = 1; + Instruction *Wr = EndWr; + for ( ; Wr != StartWr; ++Size) + Wr = cast( + Wr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + return Size; +} + +/*********************************************************************** + * RdWrRegionSequence::isOnlyUseOfInput : check whether the sequence is the + * only use of its input + */ +bool RdWrRegionSequence::isOnlyUseOfInput() const +{ + unsigned Count = 0; + for (auto ui = Input->use_begin(), ue = Input->use_end(); + ui != ue; ++ui) + ++Count; + return Count == size(); +} + +/*********************************************************************** + * RdWrRegionSequence::getRdIndex : get the index of the legalized rdregion + */ +Value *RdWrRegionSequence::getRdIndex() const +{ + if (isa(Input)) + return ConstantInt::get(Type::getInt16Ty(StartWr->getContext()), 0); + auto Rd = cast( + StartWr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + assert(GenXIntrinsic::isRdRegion(Rd)); + return Rd->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum); +} + +/*********************************************************************** + * RdWrRegionSequence::getWrIndex : get the index of the legalized wrregion + */ +Value *RdWrRegionSequence::getWrIndex() const +{ + return StartWr->getOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum); +} + +/*********************************************************************** + * RdWrRegionSequence::getInputUse : get some use of Input in the sequence + * + * This only works if the RdWrRegionSequence is a sequence of rd-wr pairs, + * rather than a sequence of wrregions with constant input. In the latter + * case, this returns 0. 
+ */ +Use *RdWrRegionSequence::getInputUse() const +{ + auto Rd = dyn_cast( + StartWr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + if (!GenXIntrinsic::isRdRegion(Rd)) + return nullptr; + assert(Rd && Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum) == Input); + return &Rd->getOperandUse(GenXIntrinsic::GenXRegion::OldValueOperandNum); +} + +/*********************************************************************** + * RdWrRegionSequence::print : debug dump/print + */ +void RdWrRegionSequence::print(raw_ostream &OS) const +{ + if (isNull()) + OS << "null"; + else { + OS << "sequence"; + if (OldVal) + dbgs() << " OldVal=" << OldVal->getName(); + dbgs() << " Input=" << Input->getName() + << " StartWr=" << StartWr->getName() + << " EndWr=" << EndWr->getName() + << " RdR=" << RdR + << " WrR=" << WrR; + } +} + +static Value *simplifyRegionWrite(Instruction *Inst) { + assert(GenXIntrinsic::isWrRegion(Inst)); + Value *NewVal = Inst->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + + // Replace C with A + // C = wrregion(A, undef, R) + if (isa(NewVal)) + return Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + + // When A and undef have the same type, replace C with A + // B = rdregion(A, R) + // C = wrregion(undef, B, R) + // + // or replace C by A + // + // B = rdregion(A, R) + // C = wrregion(A, B, R) + // + if (GenXIntrinsic::isRdRegion(NewVal)) { + Instruction *B = cast(NewVal); + Region InnerR(B, BaleInfo()); + Region OuterR(Inst, BaleInfo()); + if (OuterR != InnerR) + return nullptr; + + auto OldValB = B->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + auto OldValC = Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + if ((isa(OldValC) && + OldValB->getType() == OldValC->getType()) || + OldValB == OldValC) + return OldValB; + } + + return nullptr; +} + +static Value *simplifyRegionRead(Instruction *Inst) { + assert(GenXIntrinsic::isRdRegion(Inst)); + Value *Input = Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + if (isa(Input)) + return UndefValue::get(Inst->getType()); + else if (auto C = dyn_cast(Input)) { + if (auto Splat = C->getSplatValue()) { + Type *Ty = Inst->getType(); + if (Ty->isVectorTy()) + Splat = ConstantVector::getSplat(Ty->getVectorNumElements(), Splat); + return Splat; + } + } else if (GenXIntrinsic::isWrRegion(Input) && Input->hasOneUse()) { + // W = wrr(A, B, R) + // C = rdr(W, R) + // => + // replace C by B + Instruction *WI = cast(Input); + Region R1(WI, BaleInfo()); + Region R2(Inst, BaleInfo()); + if (R1 == R2) { + Value *B = WI->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + if (B->getType() == Inst->getType()) + return B; + } + } + return nullptr; +} + +// Simplify a region read or write. 
+Value *llvm::genx::simplifyRegionInst(Instruction *Inst, const DataLayout *DL) { + if (Inst->use_empty()) + return nullptr; + + if (Constant *C = ConstantFoldGenX(Inst, *DL)) + return C; + + unsigned ID = GenXIntrinsic::getGenXIntrinsicID(Inst); + switch (ID) { + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrregioni: + return simplifyRegionWrite(Inst); + case GenXIntrinsic::genx_rdregionf: + case GenXIntrinsic::genx_rdregioni: + return simplifyRegionRead(Inst); + default: + break; + } + return nullptr; +} + +bool llvm::genx::simplifyRegionInsts(Function *F, const DataLayout *DL) { + bool Changed = false; + for (auto &BB : F->getBasicBlockList()) { + for (auto I = BB.begin(); I != BB.end();) { + Instruction *Inst = &*I++; + if (auto V = simplifyRegionInst(Inst, DL)) { + Inst->replaceAllUsesWith(V); + Inst->eraseFromParent(); + Changed = true; + } + } + } + return Changed; +} + +// Cleanup loads. +// %load1 = load *m +// %load2 = load *m +// no store to m +// use(load1, load2) +// +bool llvm::genx::cleanupLoads(Function *F) { + bool Changed = false; + for (auto &BB : F->getBasicBlockList()) { + // The dominating loads (may have different types) for each variable. + std::unordered_map> DomLoads; + for (auto I = BB.begin(); I != BB.end();) { + Instruction *Inst = &*I++; + if (auto SI = dyn_cast(Inst)) { + auto GV = getUnderlyingGlobalVariable(SI->getPointerOperand()); + if (!GV) + continue; + // Kill all live loads on this variable. + DomLoads[GV].clear(); + } else if (auto LI = dyn_cast(Inst)) { + auto GV = getUnderlyingGlobalVariable(LI->getPointerOperand()); + if (!GV) + continue; + auto &Loads = DomLoads[GV]; + LoadInst *DomLI = nullptr; + for (auto LI1 : Loads) { + if (LI1->getType() == LI->getType()) { + DomLI = LI1; + break; + } + } + if (DomLI == nullptr) + Loads.push_back(LI); + else { + LI->replaceAllUsesWith(DomLI); + LI->eraseFromParent(); + Changed = true; + } + } + } + } + return Changed; +} + +bool +llvm::genx::IsLinearVectorConstantInts(Value* v, int64_t& start, int64_t& stride) { + auto cv = dyn_cast(v); + if (!cv) + return false; + // Flatten the vector out into the elements array + llvm::SmallVector elements; + auto vectorLength = cv->getType()->getVectorNumElements(); + for (unsigned i = 0; i < vectorLength; ++i) + elements.push_back(cv->getElementAsConstant(i)); + + llvm::ConstantInt* ci = llvm::dyn_cast(elements[0]); + if (ci == NULL) + return false; // Not a vector of integers + + int64_t val0 = ci->getSExtValue(); + if (vectorLength == 1) { + start = val0; + stride = 0; + return true; + } + ci = llvm::dyn_cast(elements[1]); + if (ci == NULL) + return false; // Not a vector of integers + int64_t prevVal = ci->getSExtValue(); + int64_t diff = prevVal - val0; + + // For each element in the array, see if it is both a ConstantInt and + // if the difference between it and the value of the previous element + // is stride. If not, fail. 
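+  // For example (exposition only): <0, 4, 8, 12> yields start = 0 and
+  // stride = 4, while <0, 4, 9, 12> fails at the third element because
+  // 4 + 4 != 9.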
+ for (int i = 2; i < (int)vectorLength; ++i) { + ci = llvm::dyn_cast(elements[i]); + if (ci == NULL) + return false; + + int64_t nextVal = ci->getSExtValue(); + if (prevVal + diff != nextVal) + return false; + + prevVal = nextVal; + } + start = val0; + stride = diff; + return true; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.h new file mode 100644 index 000000000000..6e312bf90a31 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegion.h @@ -0,0 +1,197 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXRegion : region information +/// ------------------------------- +/// +/// Refer to the comments in the base class CMRegion defined in +/// llvm/Transform/Scalar. +/// +/// Function added for the GenXRegion +/// +/// * Construct from a rdregion/wrregion intrinsic, setting the GenXRegion +/// to the region described by the intrinsic. This constructor also takes the +/// BaleInfo as an argument, allowing a variable index that is a baled in +/// constant add to be considered as a separate variable index and constant +/// offset. +/// +/// GenXLegalization uses GenXRegion to determine whether a region is legal, +/// and split it up if necessary. First it constructs a GenXRegion, then it +/// has a loop to split it into legal regions. Each loop iteration calls: +/// +/// * the getLegalSize method (see below) to determine the split size; then +/// * getSubregion to modify the GenXRegion for the split size; then +/// * one of the methods to create a new rdregion or wrregion intrinsic. +/// +/// GenXRegion::getLegalSize +/// ^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// The ``getLegalSize`` method is used by GenXLegalization and some other +/// passes to determine whether a region is legal, and if not how small +/// a split is required to make it legal. +/// +/// It takes the GenXSubtarget as an argument, because it needs to know +/// architecture-specific details, currently just whether a single GRF +/// crossing is allowed in an indirect region. +/// +/// It also takes either an AlignmentInfo object, or the actual alignment +/// of the indirect index (if any). Knowing the alignment of the indirect +/// index can help allow a larger legal region, and avoid needing to split +/// into simd1. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef GENXREGION_H +#define GENXREGION_H + +#include "GenXAlignmentInfo.h" +#include "vc/GenXOpts/Utils/CMRegion.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallBitVector.h" + +namespace llvm { + class Constant; + class DataLayout; + class Value; + class Function; + class GenXBaling; + class GenXSubtarget; + class Module; + class Type; + class Instruction; + class raw_ostream; + class Twine; + class DebugLoc; + class TargetLibraryInfo; + +namespace genx { + struct BaleInfo; + +// Region : description of an operand's region +class Region : public CMRegion { +public: + static Region getWithOffset(Instruction *Inst, bool WantParentWith = false); + // Default constructor: assume single element + Region() : CMRegion() {} + // Construct from a type. + Region(Type *Ty, const DataLayout *DL = nullptr) : CMRegion(Ty, DL) {}; + // Construct from a value. + Region(Value *V, const DataLayout *DL = nullptr) : CMRegion(V, DL) {}; + // Construct from a rd/wr region/element and its BaleInfo + Region(Instruction *Inst, const BaleInfo &BI, bool WantParentWidth = false); + // Construct from a bitmap of which elements to set (legal 1D region) + Region(unsigned Bits, unsigned ElementBytes) + : CMRegion(Bits, ElementBytes) {}; + // getLegalSize : get the max legal size of a region + unsigned getLegalSize(unsigned Idx, bool Allow2D, unsigned InputNumElements, + const GenXSubtarget *ST, AlignmentInfo *AI = nullptr); + unsigned getLegalSize(unsigned Idx, bool Allow2D, unsigned InputNumElements, + const GenXSubtarget *ST, Alignment Align); +}; + +// RdWrRegionSequence : a sequence of rdregion-wrregion pairs probably +// created by legalization or coalescing, conforming to the following +// rules: +// +// 1. It is a sequence of wrregions, each one (other than the last) +// having the next one's "old value" input as its only use. +// +// 2. Each wrregion's "new value" input is a single-use rdregion. +// +// 3. All the rdregions have the same "old value" input. +// +// 4. If the rdregions have a variable index, the index is the same for each +// one, other than the constant offset from a baled in genx.add.addr. +// +// 5. The rdregion regions are such that they can be combined to give the +// region parameters of the original unsplit rdregion. Those rdregion +// parameters are stored in the RdR field. +// +// 6. If the wrregions have a variable index, the index is the same for each +// one, other than the constant offset from a baled in genx.add.addr. +// +// 7. The wrregion regions are such that they can be combined to give the +// region parameters of the original unsplit wrregion. Those wrregion +// parameters are stored in the WrR field. +// +// Alternatively, a RdWrRegionSequence can represent a sequence of wrregion +// instructions with undef "old value" input to the first one and constant +// "new value" input to each one, forming a legalized constant load. +// +class RdWrRegionSequence { + Instruction *WaitingFor = nullptr; +public: + Value *Input = nullptr; + Value *OldVal = nullptr; + Instruction *StartWr = nullptr; + Instruction *EndWr = nullptr; + Region RdR; + Region WrR; + // Default constructor + RdWrRegionSequence() : Input(nullptr), EndWr(nullptr) {} + // isNull : true if the RdWrRegionSequence has not been initialized + bool isNull() const { return !EndWr && !Input; } + // Scan for sequence from the start wrregion instruction. + // Returns false if not even a single rdregion-wrregion pair found. 
+ bool buildFromStartWr(Instruction *Wr, GenXBaling *Baling); + // Scan for sequence from any wrregion instruction in the sequence. + // Returns false if not even a single rdregion-wrregion pair found. + bool buildFromWr(Instruction *Wr, GenXBaling *Baling); + // Scan for sequence from any rdregion instruction in the sequence. + // Returns false if not even a single rdregion-wrregion pair found. + bool buildFromRd(Instruction *Rd, GenXBaling *Baling); + // Get number of rdregion-wrregion pairs in the sequence + unsigned size() const; + // Check whether the sequence is the only use of its input + bool isOnlyUseOfInput() const; + // Get the index of the legalized rdregion + Value *getRdIndex() const; + // Get the index of the legalized wrregion + Value *getWrIndex() const; + // Get some use of Input in the sequence + Use *getInputUse() const; + // Debug dump/print + void dump() const; + void print(raw_ostream &OS) const; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const RdWrRegionSequence &RWS) { + RWS.print(OS); + return OS; +} + +Value *simplifyRegionInst(Instruction *Inst, const DataLayout *DL); +bool simplifyRegionInsts(Function *F, const DataLayout *DL); + +bool cleanupLoads(Function *F); + +bool IsLinearVectorConstantInts(Value* v, int64_t& start, int64_t& stride); + +} // end namespace genx + +} // end namespace llvm + +#endif /* GENXREGION_H */ diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegionCollapsing.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegionCollapsing.cpp new file mode 100644 index 000000000000..9bbdce584635 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRegionCollapsing.cpp @@ -0,0 +1,1460 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXRegionCollapsing +/// -------------------- +/// +/// GenX region collapsing pass is function pass that collapses nested +/// read regions or nested write regions. +/// +/// Nested region accesses can occur in two ways (or a mixture of both): +/// +/// 1. The front end compiler deliberately generates nested region access. The +/// CM compiler does this for a matrix select, generating a region access for +/// the rows and another one for the columns, safe in the knowledge that this +/// pass will combine them where it can. +/// +/// 2. Two region accesses in different source code constructs (e.g. 
two select() +/// calls, either in the same or different source statements). +/// +/// The combineRegions() function is what makes the decisions on whether two +/// regions can be collapsed, depending on whether they are 1D or 2D, how the +/// rows of one fit in the rows of the other, whether each is indirect, etc. +/// +/// This pass makes an effort to combine two region accesses even if there are +/// multiple bitcasts (from CM format()) or up to one SExt/ZExt (from a cast) in +/// between. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_RegionCollapsing" + +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; +using namespace genx; + +namespace { + +// GenX region collapsing pass +class GenXRegionCollapsing : public FunctionPass { + const DataLayout *DL = nullptr; + DominatorTree *DT = nullptr; + bool Modified = false; +public: + static char ID; + explicit GenXRegionCollapsing() : FunctionPass(ID) { } + virtual StringRef getPassName() const { return "GenX Region Collapsing"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.setPreservesCFG(); + } + bool runOnFunction(Function &F); +private: + void runOnBasicBlock(BasicBlock *BB); + void processBitCast(BitCastInst *BC); + void processRdRegion(Instruction *InnerRd); + void splitReplicatingIndirectRdRegion(Instruction *Rd, Region *R); + void processWrRegionElim(Instruction *OuterWr); + Instruction *processWrRegionBitCast(Instruction *WrRegion); + void processWrRegionBitCast2(Instruction *WrRegion); + Instruction *processWrRegion(Instruction *OuterWr); + Instruction *processWrRegionSplat(Instruction *OuterWr); + bool normalizeElementType(Region *R1, Region *R2, bool PreferFirst = false); + bool combineRegions(const Region *OuterR, const Region *InnerR, + Region *CombinedR); + void calculateIndex(const Region *OuterR, const Region *InnerR, + Region *CombinedR, Value *InnerIndex, const Twine &Name, + Instruction *InsertBefore, const DebugLoc &DL); + Value *insertOp(Instruction::BinaryOps Opcode, Value *Lhs, unsigned Rhs, + const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL); + Value *insertOp(Instruction::BinaryOps Opcode, Value *Lhs, Value *Rhs, + const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL); +}; + +}// end namespace llvm + + +char GenXRegionCollapsing::ID = 0; +namespace llvm { void initializeGenXRegionCollapsingPass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXRegionCollapsing, "GenXRegionCollapsing", + "GenXRegionCollapsing", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(GenXRegionCollapsing, "GenXRegionCollapsing", + "GenXRegionCollapsing", false, false) + +// Publicly exposed interface to pass... 
+FunctionPass *llvm::createGenXRegionCollapsingPass() +{ + initializeGenXRegionCollapsingPass(*PassRegistry::getPassRegistry()); + return new GenXRegionCollapsing(); +} + +/*********************************************************************** + * runOnFunction : run the region collapsing pass for this Function + */ +bool GenXRegionCollapsing::runOnFunction(Function &F) +{ + DL = &F.getParent()->getDataLayout(); + DT = &getAnalysis().getDomTree(); + + // Track if there is any modification to the function. + bool Changed = false; + + // This does a postordered depth first traversal of the CFG, processing + // instructions within a basic block in reverse, to ensure that we see a def + // after its uses (ignoring phi node uses). + for (po_iterator i = po_begin(&F.getEntryBlock()), + e = po_end(&F.getEntryBlock()); + i != e; ++i) { + // Iterate until there is no modification. + BasicBlock *BB = *i; + do { + Modified = false; + runOnBasicBlock(BB); + if (Modified) + Changed = true; + } while (Modified); + } + + return Changed; +} + +static bool lowerTrunc(TruncInst *Inst) { + Value *InValue = Inst->getOperand(0); + if (!GenXIntrinsic::isRdRegion(InValue)) + return false; + + Type *InElementTy = InValue->getType(); + Type *OutElementTy = Inst->getType(); + unsigned NumElements = 1; + if (VectorType *VT = dyn_cast(InElementTy)) { + InElementTy = VT->getElementType(); + OutElementTy = cast(OutElementTy)->getElementType(); + NumElements = VT->getNumElements(); + } + unsigned OutBitSize = OutElementTy->getPrimitiveSizeInBits(); + assert(OutBitSize); + // Do not touch truncations to i1 or vector of i1 types. + if (OutBitSize == 1) + return false; + unsigned Stride = InElementTy->getPrimitiveSizeInBits() / OutBitSize; + + // Create the new bitcast. + Instruction *BC = + CastInst::Create(Instruction::BitCast, InValue, + VectorType::get(OutElementTy, Stride * NumElements), + Inst->getName(), Inst /*InsertBefore*/); + BC->setDebugLoc(Inst->getDebugLoc()); + + // Create the new rdregion. + Region R(BC); + R.NumElements = NumElements; + R.Stride = Stride; + R.Width = NumElements; + R.VStride = R.Stride * R.Width; + Instruction *NewInst = R.createRdRegion( + BC, Inst->getName(), Inst /*InsertBefore*/, Inst->getDebugLoc(), + !isa(Inst->getType()) /*AllowScalar*/); + + // Change uses and mark the old inst for erasing. + Inst->replaceAllUsesWith(NewInst); + return true; +} + +void GenXRegionCollapsing::runOnBasicBlock(BasicBlock *BB) { + // Code simplification in block first. + for (auto BI = BB->begin(), E = --BB->end(); BI != E;) { + assert(!BI->isTerminator()); + Instruction *Inst = &*BI++; + if (Inst->use_empty()) + continue; + + // Turn trunc into bitcast followed by rdr. This helps region collapsing in + // a later stage. + if (auto TI = dyn_cast(Inst)) { + Modified |= lowerTrunc(TI); + continue; + } + + // Simplify + // %1 = call <1 x i32> @rdr(...) + // %2 = extractelement <1 x i32> %1, i32 0 + // into + // %2 = call i32 @rdr(...) + // + if (auto EEI = dyn_cast(Inst)) { + Value *Src = EEI->getVectorOperand(); + if (GenXIntrinsic::isRdRegion(Src) && Src->getType()->getVectorNumElements() == 1) { + // Create a new region with scalar output. 
+ Region R(Inst); + Instruction *NewInst = + R.createRdRegion(Src, Inst->getName(), Inst /*InsertBefore*/, + Inst->getDebugLoc(), true /*AllowScalar*/); + Inst->replaceAllUsesWith(NewInst); + Modified = true; + continue; + } + } + + if (Value *V = simplifyRegionInst(Inst, DL)) { + Inst->replaceAllUsesWith(V); + Modified = true; + continue; + } + + // sink index calculation before region collapsing. For collapsed regions, + // it is more difficult to lift constant offsets. + static const unsigned NOT_INDEX = 255; + unsigned Index = NOT_INDEX; + + unsigned IID = GenXIntrinsic::getGenXIntrinsicID(Inst); + if (GenXIntrinsic::isRdRegion(IID)) + Index = GenXIntrinsic::GenXRegion::RdIndexOperandNum; + else if (GenXIntrinsic::isWrRegion(IID)) + Index = GenXIntrinsic::GenXRegion::WrIndexOperandNum; + else if (isa(Inst)) + Index = 2; + else if (isa(Inst)) + Index = 1; + + if (Index != NOT_INDEX) { + Use *U = &Inst->getOperandUse(Index); + Value *V = sinkAdd(*U); + if (V != U->get()) { + *U = V; + Modified = true; + } + } + } + Modified |= SimplifyInstructionsInBlock(BB); + + // This loop processes instructions in reverse, tolerating an instruction + // being removed during its processing, and not re-processing any new + // instructions added during the processing of an instruction. + for (Instruction *Prev = BB->getTerminator(); Prev;) { + Instruction *Inst = Prev; + Prev = nullptr; + if (Inst != &BB->front()) + Prev = Inst->getPrevNode(); + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + processRdRegion(Inst); + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + processWrRegionElim(Inst); + if (!Inst->use_empty()) { + if (auto NewInst = processWrRegionBitCast(Inst)) { + Modified = true; + Inst = NewInst; + } + auto NewInst1 = processWrRegionSplat(Inst); + if (Inst != NewInst1) { + Modified = true; + Inst = NewInst1; + } + + auto NewInst = processWrRegion(Inst); + processWrRegionBitCast2(NewInst); + if (Inst != NewInst && NewInst->use_empty()) { + NewInst->eraseFromParent(); + Modified = true; + } + } + if (Inst->use_empty()) { + Inst->eraseFromParent(); + Modified = true; + } + break; + default: + if (auto BC = dyn_cast(Inst)) + processBitCast(BC); + if (isa(Inst) && Inst->use_empty()) { + // Remove bitcast that has become unused due to changes in this pass. 
+ Inst->eraseFromParent(); + Modified = true; + } + break; + } + } +} + +/*********************************************************************** + * createBitCast : create a bitcast, combining bitcasts where applicable + */ +static Value *createBitCast(Value *Input, Type *Ty, const Twine &Name, + Instruction *InsertBefore, const DebugLoc &DL) { + if (Input->getType() == Ty) + return Input; + if (auto BC = dyn_cast(Input)) + Input = BC->getOperand(0); + if (Input->getType() == Ty) + return Input; + auto NewBC = CastInst::Create(Instruction::BitCast, Input, Ty, + Name, InsertBefore); + NewBC->setDebugLoc(DL); + return NewBC; +} + +/*********************************************************************** + * createBitCastToElementType : create a bitcast to a vector with the + * specified element type, combining bitcasts where applicable + */ +static Value *createBitCastToElementType(Value *Input, Type *ElementTy, + const Twine &Name, + Instruction *InsertBefore, + const DataLayout *DL, + const DebugLoc &DbgLoc) { + unsigned ElBytes = ElementTy->getPrimitiveSizeInBits() / 8U; + if (!ElBytes) { + assert(ElementTy->isPointerTy() && ElementTy->getPointerElementType()->isFunctionTy()); + ElBytes = DL->getTypeSizeInBits(ElementTy) / 8; + } + unsigned InputBytes = Input->getType()->getPrimitiveSizeInBits() / 8U; + if (!InputBytes) { + Type *T = Input->getType(); + if (T->isVectorTy()) + T = T->getVectorElementType(); + assert(T->isPointerTy() && T->getPointerElementType()->isFunctionTy()); + InputBytes = DL->getTypeSizeInBits(T) / 8; + } + assert(!(InputBytes & (ElBytes - 1)) && "non-integral number of elements"); + auto Ty = VectorType::get(ElementTy, InputBytes / ElBytes); + return createBitCast(Input, Ty, Name, InsertBefore, DbgLoc); +} + +/*********************************************************************** + * combineBitCastWithUser : if PossibleBC is a bitcast, and it has a single + * user that is also a bitcast, then combine them + * + * If combined, the two bitcast instructions are erased. + * + * This can happen because combining two rdregions with a bitcast between + * them can result in the bitcast being used by another bitcast that was + * already there. + */ +static void combineBitCastWithUser(Value *PossibleBC) +{ + if (auto BC1 = dyn_cast(PossibleBC)) { + if (BC1->hasOneUse()) { + if (auto BC2 = dyn_cast(BC1->use_begin()->getUser())) { + Value *CombinedBC = BC1->getOperand(0); + if (CombinedBC->getType() != BC2->getType()) + CombinedBC = createBitCast(BC1->getOperand(0), BC2->getType(), + BC2->getName(), BC2, BC2->getDebugLoc()); + BC2->replaceAllUsesWith(CombinedBC); + BC2->eraseFromParent(); + BC1->eraseFromParent(); + } + } + } +} + +/*********************************************************************** + * processBitCast : process a bitcast whose input is rdregion + * + * We put the bitcast before the rdregion, in the hope that it will enable + * the rdregion to be baled in to something later on. + */ +void GenXRegionCollapsing::processBitCast(BitCastInst *BC) +{ + if (BC->getType()->getScalarType()->isIntegerTy(1)) + return; + auto Rd = dyn_cast(BC->getOperand(0)); + + // check if skipping this optimization. + auto skip = [=] { + // Skip if this is not rdregion + if (!Rd || !GenXIntrinsic::isRdRegion(Rd)) + return true; + + // Single use, do optimization. + if (Rd->hasOneUse()) + return false; + + // More than one uses, we check if rdr is reading from a global. + // If yes, still do such conversion, as bitcast could be folded into g_load. 
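+    // Illustrative pattern (hypothetical IR): if the rdregion's source is
+    //   %v = load <32 x i32>, <32 x i32>* @g
+    // (possibly behind casts), the bitcast we are about to hoist in front of
+    // the rdregion can later be folded into the load of the global @g, so the
+    // rewrite is still worthwhile even with several users of the rdregion.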
+ Value *Op0 = Rd->getOperand(0); + while (auto CI = dyn_cast(Op0)) + Op0 = CI->getOperand(0); + auto LI = dyn_cast(Op0); + if (LI && getUnderlyingGlobalVariable(LI->getPointerOperand())) + return false; + + // skip otherwise; + return true; + }; + + if (skip()) + return; + + // skip call above shall check for RdRegion among other things + assert(Rd && GenXIntrinsic::isRdRegion(Rd)); + + // We have a single use rdregion as the input to the bitcast. + // Adjust the region parameters if possible so the element type is that of + // the result of the bitcast, instead of the input. + Region ROrig(Rd, BaleInfo()); + Region R(Rd, BaleInfo()); + auto ElTy = BC->getType()->getScalarType(); + if (!R.changeElementType(ElTy)) + return; + + // we do not want this optimization to be applied if resulting indirect + // region will have non-zero stride or non-single width + // this will require ineffective legalization in those cases + bool OrigCorr = ((ROrig.Width == 1) || (ROrig.Stride == 0)); + bool ChangedWrong = ((R.Width != 1) && (R.Stride != 0)); + if (OrigCorr && ChangedWrong && R.Indirect) + return; + + // Create the new bitcast. + assert(ElTy->getPrimitiveSizeInBits()); + auto Input = Rd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + auto NewBCTy = VectorType::get(ElTy, + Input->getType()->getPrimitiveSizeInBits() / ElTy->getPrimitiveSizeInBits()); + auto NewBC = CastInst::Create(Instruction::BitCast, Input, NewBCTy, "", Rd); + NewBC->takeName(BC); + NewBC->setDebugLoc(BC->getDebugLoc()); + // Create the new rdregion. + auto NewRd = R.createRdRegion(NewBC, "", Rd, Rd->getDebugLoc(), + /*AllowScalar=*/!isa(BC->getType())); + NewRd->takeName(Rd); + // Replace uses. + BC->replaceAllUsesWith(NewRd); + // Caller removes BC. + Modified = true; +} + +/*********************************************************************** + * processRdRegion : process a rdregion + * + * 1. If this rdregion is unused, it probably became so in the processing + * of a later rdregion. Erase it. + * + * 2. Otherwise, see if the input to this rdregion is the result of + * an earlier rdregion, and if so see if they can be combined. This can + * work even if there are bitcasts and up to one sext/zext between the + * two rdregions. + */ +void GenXRegionCollapsing::processRdRegion(Instruction *InnerRd) +{ + if (InnerRd->use_empty()) { + InnerRd->eraseFromParent(); + Modified = true; + return; + } + + // We use Region::getWithOffset to get a Region object for a rdregion/wrregion + // throughout this pass, in order to ensure that, with an index that is + // V+const, we get the V and const separately (in Region::Indirect and + // Region::Offset). Then our index calculations can ensure that the constant + // add remains the last thing that happens in the calculation. 
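+  // For illustration (names hypothetical): if the rdregion's index operand is
+  //   %idx = add i16 %base, 32
+  // then getWithOffset records Indirect = %base and Offset = 32 (bytes) rather
+  // than Indirect = %idx and Offset = 0, so a rebuilt index can keep the
+  // "+ 32" as its final constant add.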
+ Region InnerR = Region::getWithOffset(InnerRd, /*WantParentWidth=*/true); + + // Prevent region collapsing for specific src replication pattern, + // in order to enable swizzle optimization for Align16 instruction + if (InnerRd->hasOneUse()) { + if (auto UseInst = dyn_cast(InnerRd->use_begin()->getUser())) { + if (UseInst->getOpcode() == Instruction::FMul) { + auto NextInst = dyn_cast(UseInst->use_begin()->getUser()); + if (NextInst && + (NextInst->getOpcode() == Instruction::FAdd || + NextInst->getOpcode() == Instruction::FSub) && + InnerR.ElementTy->getPrimitiveSizeInBits() == 64U && + InnerR.Width == 2 && + InnerR.Stride == 0 && + InnerR.VStride == 2) + return; + } + } + } + + for (;;) { + Instruction *OuterRd = dyn_cast(InnerRd->getOperand(0)); + // Go through any bitcasts and up to one sext/zext if necessary to find the + // outer rdregion. + Instruction *Extend = nullptr; + bool HadElementTypeChange = false; + for (;;) { + if (!OuterRd) + break; // input not result of earlier rdregion + if (GenXIntrinsic::isRdRegion(OuterRd)) + break; // found the outer rdregion + if (isa(OuterRd) || isa(OuterRd)) { + if (OuterRd->getOperand(0)->getType()->getScalarType()->isIntegerTy(1)) { + OuterRd = nullptr; + break; // input not result of earlier rdregion + } + if (Extend || HadElementTypeChange) { + OuterRd = nullptr; + break; // can only have one sext/zext between the rdregions, and + // sext/zext not allowed if it is then subject to a bitcast + // that changes the element type + } + // Remember the sext/zext instruction. + Extend = OuterRd; + } else if (isa(OuterRd)) { + if (OuterRd->getType()->getScalarType() + != OuterRd->getOperand(0)->getType()->getScalarType()) + HadElementTypeChange = true; + } else { + OuterRd = nullptr; + break; // input not result of earlier rdregion + } + OuterRd = dyn_cast(OuterRd->getOperand(0)); + } + if (!OuterRd) + break; // no outer rdregion that we can combine with + Region OuterR = Region::getWithOffset(OuterRd); + // There was a sext/zext. Because we are going to put that after the + // collapsed region, we want to modify the inner region to the + // extend's input element type without changing the region parameters + // (other than scaling the offset). We know that there is no element + // type changing bitcast between the extend and the inner rdregion. + if (Extend) { + if (InnerR.Indirect) + return; // cannot cope with indexed inner region and sext/zext + InnerR.ElementTy = Extend->getOperand(0)->getType()->getScalarType(); + unsigned ExtInputElementBytes + = InnerR.ElementTy->getPrimitiveSizeInBits() / 8U; + InnerR.Offset = InnerR.Offset / InnerR.ElementBytes * ExtInputElementBytes; + InnerR.ElementBytes = ExtInputElementBytes; + } + // See if the regions can be combined. We call normalizeElementType with + // InnerR as the first arg so it prefers to normalize to that region's + // element type if possible. That can avoid a bitcast being put after the + // combined rdregion, which can help baling later on. 
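+    // Illustration (hypothetical types): if OuterRd reads i32 elements while
+    // InnerRd, reached through a bitcast, reads i16 elements, preferring
+    // InnerR's element type re-expresses the outer region in i16 terms, so the
+    // combined rdregion already yields the i16 vector the original InnerRd
+    // produced and no trailing bitcast is needed.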
+ LLVM_DEBUG(dbgs() << "GenXRegionCollapsing::processRdRegion:\n" + " OuterRd (line " << OuterRd->getDebugLoc().getLine() << "): " << *OuterRd << "\n" + " InnerRd (line " << InnerRd->getDebugLoc().getLine() << "): " << *InnerRd << "\n"); + if (!normalizeElementType(&InnerR, &OuterR, /*PreferFirst=*/true)) { + LLVM_DEBUG(dbgs() << "Cannot normalize element type\n"); + return; + } + Region CombinedR; + if (!combineRegions(&OuterR, &InnerR, &CombinedR)) + return; // cannot combine + + // If the combined region is both indirect and splat, then do not combine. + // Otherwise, this leads to an infinite loop as later on we split such + // region reads. + auto isIndirectSplat = [](const Region &R) { + if (!R.Indirect) + return false; + if (R.Width != R.NumElements && !R.VStride && + !isa(R.Indirect->getType())) + return true; + if (R.Width == 1 || R.Stride) + return false; + return true; + }; + if (isIndirectSplat(CombinedR)) + return; + + // Calculate index if necessary. + if (InnerR.Indirect) { + calculateIndex(&OuterR, &InnerR, &CombinedR, + InnerRd->getOperand(GenXIntrinsic::GenXRegion::RdIndexOperandNum), + InnerRd->getName() + ".indexcollapsed", + InnerRd, InnerRd->getDebugLoc()); + } + // If the element type of the combined region does not match that of the + // outer region, we need to do a bitcast first. + Value *Input = OuterRd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + // InnerR.ElementTy not always equal to InnerRd->getType()->getScalarType() (look above) + if (InnerR.ElementTy != OuterRd->getType()->getScalarType()) + Input = createBitCastToElementType(Input, InnerR.ElementTy, + Input->getName() + + ".bitcast_before_collapse", + OuterRd, DL, OuterRd->getDebugLoc()); + // Create the combined rdregion. + Instruction *CombinedRd = CombinedR.createRdRegion(Input, + InnerRd->getName() + ".regioncollapsed", InnerRd, InnerRd->getDebugLoc(), + !isa(InnerRd->getType())); + // If we went through sext/zext, re-instate it here. + Value *NewVal = CombinedRd; + if (Extend) { + auto NewCI = CastInst::Create((Instruction::CastOps)Extend->getOpcode(), + NewVal, InnerRd->getType(), Extend->getName(), InnerRd); + NewCI->setDebugLoc(Extend->getDebugLoc()); + NewVal = NewCI; + } + // If we still don't have the right type due to bitcasts in the original + // code, add a bitcast here. + NewVal = createBitCast(NewVal, InnerRd->getType(), + NewVal->getName() + ".bitcast_after_collapse", InnerRd, + InnerRd->getDebugLoc()); + // Replace the inner read with the new value, and erase the inner read. + // any other instructions between it and the outer read (inclusive) that + // become unused. + InnerRd->replaceAllUsesWith(NewVal); + InnerRd->eraseFromParent(); + Modified = true; + // Check whether we just created a bitcast that can be combined with its + // user. If so, combine them. + combineBitCastWithUser(NewVal); + InnerRd = CombinedRd; + InnerR = Region::getWithOffset(InnerRd, /*WantParentWidth=*/true); + // Because the loop in runOnFunction does not re-process the new rdregion, + // loop back here to re-process it. + } + // InnerRd and InnerR are now the combined rdregion (or the original one if + // no combining was done). + // Check whether we have a rdregion that is both indirect and replicating, + // that we want to split. 
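+  // For example (hypothetical region parameters): an indirect rdregion with
+  // NumElements = 16, Width = 4, VStride = 0 reads the same 4 indirectly
+  // addressed elements four times over; splitting it lets the indirect access
+  // fetch those 4 elements once and a cheap direct region do the replication.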
+ splitReplicatingIndirectRdRegion(InnerRd, &InnerR); +} + +/*********************************************************************** + * splitReplicatingIndirectRdRegion : if the rdregion is both indirect and + * replicating, split out the indirect part so it is read only once + */ +void GenXRegionCollapsing::splitReplicatingIndirectRdRegion( + Instruction *Rd, Region *R) +{ + if (!R->Indirect) + return; + if (R->Width != R->NumElements && !R->VStride + && !isa(R->Indirect->getType())) { + // Replicating rows. We want an indirect region that just reads + // one row + Region IndirR = *R; + IndirR.NumElements = IndirR.Width; + auto Indir = IndirR.createRdRegion(Rd->getOperand(0), + Rd->getName() + ".split_replicated_indir", Rd, Rd->getDebugLoc()); + // ... and a direct region that replicates the row. + R->Indirect = nullptr; + R->Offset = 0; + R->Stride = 1; + auto NewRd = R->createRdRegion(Indir, "", Rd, Rd->getDebugLoc()); + NewRd->takeName(Rd); + Rd->replaceAllUsesWith(NewRd); + Rd->eraseFromParent(); + Modified = true; + return; + } + if (R->Width == 1 || R->Stride) + return; + // Replicating columns. We want an indirect region that just reads + // one column + Region IndirR = *R; + IndirR.NumElements = IndirR.NumElements / IndirR.Width; + IndirR.Width = 1; + auto Indir = IndirR.createRdRegion(Rd->getOperand(0), + Rd->getName() + ".split_replicated_indir", Rd, Rd->getDebugLoc()); + // ... and a direct region that replicates the column. + R->Indirect = nullptr; + R->Offset = 0; + R->VStride = 1; + auto NewRd = R->createRdRegion(Indir, "", Rd, Rd->getDebugLoc()); + NewRd->takeName(Rd); + Rd->replaceAllUsesWith(NewRd); + Rd->eraseFromParent(); +} + +/*********************************************************************** + * processWrRegionElim : process a wrregion and eliminate redundant writes + * + * This detects the following code: + * + * B = wrregion(A, V1, R) + * C = wrregion(B, V2, R) + * + * (where "R" is a region that is identical in the two versions + * this can be collapsed to + * + * D = wrregion(A, V2, R) + * + */ +void GenXRegionCollapsing::processWrRegionElim(Instruction *OuterWr) +{ + auto InnerWr = dyn_cast( + OuterWr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + if (!GenXIntrinsic::isWrRegion(InnerWr)) + return; + // Only perform this optimisation if the only use is with outer - otherwise + // this seems to make the code spill more + assert(InnerWr); + if (!InnerWr->hasOneUse()) + return; + Region InnerR(InnerWr, BaleInfo(), /*WantParentWidth=*/true); + Region OuterR(OuterWr, BaleInfo()); + if (OuterR != InnerR) + return; + // Create the combined wrregion. + Instruction *CombinedWr = cast(OuterR.createWrRegion( + InnerWr->getOperand(0), + OuterWr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum), + OuterWr->getName() + ".regioncollapsed", OuterWr, + OuterWr->getDebugLoc())); + OuterWr->replaceAllUsesWith(CombinedWr); + // Do not erase OuterWr here -- it gets erased by the caller. + Modified = true; +} + +/*********************************************************************** + * processWrRegionBitCast : handle a wrregion whose "new value" is a + * bitcast (before processing wrregion for region collapsing) + * + * Enter: Inst = the wrregion + * + * Return: replacement wrregion if any, else 0 + * + * If the "new value" operand of the wrregion is a bitcast from scalar to + * 1-vector, or vice versa, then we can replace the wrregion with one that + * uses the input to the bitcast directly. 
This may enable later baling + * that would otherwise not happen. + * + * The bitcast typically arises from GenXLowering lowering an insertelement. + */ +Instruction *GenXRegionCollapsing::processWrRegionBitCast(Instruction *WrRegion) +{ + assert(GenXIntrinsic::isWrRegion(WrRegion)); + if (auto BC = dyn_cast(WrRegion->getOperand( + GenXIntrinsic::GenXRegion::NewValueOperandNum))) { + if (BC->getType()->getScalarType() + == BC->getOperand(0)->getType()->getScalarType()) { + // The bitcast is from scalar to 1-vector, or vice versa. + Region R(WrRegion, BaleInfo()); + auto NewInst = cast(R.createWrRegion(WrRegion->getOperand(0), + BC->getOperand(0), "", WrRegion, WrRegion->getDebugLoc())); + NewInst->takeName(WrRegion); + WrRegion->replaceAllUsesWith(NewInst); + WrRegion->eraseFromParent(); + return NewInst; + } + } + return nullptr; +} + +/*********************************************************************** + * processWrRegionBitCast2 : handle a wrregion whose "new value" is a + * bitcast (after processing wrregion for region collapsing) + * + * Enter: WrRegion = the wrregion + * + * This does not erase WrRegion even if it becomes unused. + * + * + * If the "new value" operand of the wrregion is some other bitcast, then we + * change the wrregion to the pre-bitcast type and add new bitcasts for the + * "old value" input and the result. This makes it possible for the new value + * to be baled in to the wrregion. + */ +void GenXRegionCollapsing::processWrRegionBitCast2(Instruction *WrRegion) +{ + auto BC = dyn_cast(WrRegion->getOperand( + GenXIntrinsic::GenXRegion::NewValueOperandNum)); + if (!BC) + return; + Type *BCInputElementType = BC->getOperand(0)->getType()->getScalarType(); + if (BCInputElementType->isIntegerTy(1)) + return; + // Get the region params for the replacement wrregion, checking if that + // fails. + Region R(WrRegion, BaleInfo()); + if (!R.changeElementType(BCInputElementType)) + return; + // Bitcast the "old value" input. + Value *OldVal = createBitCastToElementType( + WrRegion->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum), + BCInputElementType, WrRegion->getName() + ".precast", WrRegion, DL, + WrRegion->getDebugLoc()); + // Create the replacement wrregion. + auto NewInst = cast(R.createWrRegion(OldVal, BC->getOperand(0), + "", WrRegion, WrRegion->getDebugLoc())); + NewInst->takeName(WrRegion); + // Cast it. + Value *Res = createBitCast(NewInst, WrRegion->getType(), + WrRegion->getName() + ".postcast", WrRegion, WrRegion->getDebugLoc()); + WrRegion->replaceAllUsesWith(Res); +} + +static bool hasMemoryDeps(CallInst *L1, CallInst *L2, Value *Addr, + DominatorTree *DT) { + + auto isKill = [=](Instruction &I) { + Instruction *Inst = &I; + if (GenXIntrinsic::isVStore(Inst) && + (Addr == Inst->getOperand(1) || + Addr == getUnderlyingGlobalVariable(Inst->getOperand(1)))) + return true; + // OK. + return false; + }; + + // vloads from the same block. + if (L1->getParent() == L2->getParent()) { + BasicBlock::iterator I = L1->getParent()->begin(); + for (; &*I != L1 && &*I != L2; ++I) + /*empty*/; + assert(&*I == L1 || &*I == L2); + auto IEnd = (&*I == L1) ? L2->getIterator() : L1->getIterator(); + return std::any_of(I->getIterator(), IEnd, isKill); + } + + // vloads are from different blocks. 
+ // + // BB1 (L1) + // / \ + // BB3 BB2 (L2) + // \ / + // BB4 + // + auto BB1 = L1->getParent(); + auto BB2 = L2->getParent(); + if (!DT->properlyDominates(BB1, BB2)) { + std::swap(BB1, BB2); + std::swap(L1, L2); + } + if (DT->properlyDominates(BB1, BB2)) { + // As BB1 dominates BB2, we can recursively check BB2's predecessors, until + // reaching BB1. + // + // check BB1 && BB2 + if (std::any_of(BB2->begin(), L2->getIterator(), isKill)) + return true; + if (std::any_of(L1->getIterator(), BB1->end(), isKill)) + return true; + std::set Visited{BB1, BB2}; + std::vector BBs; + for (auto I = pred_begin(BB2), E = pred_end(BB2); I != E; ++I) { + BasicBlock *BB = *I; + if (!Visited.count(BB)) + BBs.push_back(BB); + } + + // This visits the subgraph dominated by BB1, originated from BB2. + while (!BBs.empty()) { + BasicBlock *BB = BBs.back(); + BBs.pop_back(); + Visited.insert(BB); + + // check if there is any store kill in this block. + if (std::any_of(BB->begin(), BB->end(), isKill)) + return true; + + // Populate not visited predecessors. + for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) + if (!Visited.count(*I)) + BBs.push_back(*I); + } + + // no mem deps. + return false; + } + + return true; +} + +// Check whether two values are bitwise identical. +static bool isBitwiseIdentical(Value *V1, Value *V2, DominatorTree *DT) { + assert(V1 && V2 && "null value"); + if (V1 == V2) + return true; + if (BitCastInst *BI = dyn_cast(V1)) + V1 = BI->getOperand(0); + if (BitCastInst *BI = dyn_cast(V2)) + V2 = BI->getOperand(0); + + // Special case arises from vload/vstore. + if (GenXIntrinsic::isVLoad(V1) && GenXIntrinsic::isVLoad(V2)) { + auto L1 = cast(V1); + auto L2 = cast(V2); + + // Loads from global variables. + auto GV1 = getUnderlyingGlobalVariable(L1->getOperand(0)); + auto GV2 = getUnderlyingGlobalVariable(L2->getOperand(0)); + Value *Addr = L1->getOperand(0); + if (GV1 && GV1 == GV2) + // OK. + Addr = GV1; + else if (L1->getOperand(0) != L2->getOperand(0)) + // Check if loading from the same location. + return false; + else if (!isa(Addr)) + // Check if this pointer is local and only used in vload/vstore. + return false; + + // Check if there is no store to the same location in between. + return !hasMemoryDeps(L1, L2, Addr, DT); + } + + // Cannot prove. + return false; +} + +/*********************************************************************** + * processWrRegion : process a wrregion + * + * Enter: OuterWr = the wrregion instruction that we will attempt to use as + * the outer wrregion and collapse with inner ones + * + * Return: the replacement wrregion if any, otherwise OuterWr + * + * This detects the following code: + * + * B = rdregion(A, OuterR) + * C = wrregion(B, V, InnerR) + * D = wrregion(A, C, OuterR) + * + * (where "InnerR" and "OuterR" are the region parameters). This code can + * be collapsed to + * + * D = wrregion(A, V, CombinedR) + * + * We want to do innermost wrregion combining first, but this pass visits + * instructions in the wrong order for that. So, when we see a wrregion + * here, we use recursion to scan back to find the innermost one and then work + * forwards to where we started. + */ +Instruction *GenXRegionCollapsing::processWrRegion(Instruction *OuterWr) +{ + assert(OuterWr); + // Find the inner wrregion, skipping bitcasts. 
+ auto InnerWr = dyn_cast( + OuterWr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + while (InnerWr && isa(InnerWr)) + InnerWr = dyn_cast(InnerWr->getOperand(0)); + if (!GenXIntrinsic::isWrRegion(InnerWr)) + return OuterWr; + // Process inner wrregions first, recursively. + InnerWr = processWrRegion(InnerWr); + // Now process this one. + // Find the associated rdregion of the outer region, skipping bitcasts, + // and check it has the right region parameters. + assert(InnerWr); + auto OuterRd = dyn_cast(InnerWr->getOperand(0)); + while (OuterRd && isa(OuterRd)) + OuterRd = dyn_cast(OuterRd->getOperand(0)); + if (!GenXIntrinsic::isRdRegion(OuterRd)) + return OuterWr; + assert(OuterRd); + if (!isBitwiseIdentical(OuterRd->getOperand(0), OuterWr->getOperand(0), DT)) + return OuterWr; + Region InnerR = Region::getWithOffset(InnerWr, /*WantParentWidth=*/true); + Region OuterR = Region::getWithOffset(OuterWr); + if (OuterR != Region::getWithOffset(OuterRd)) + return OuterWr; + // See if the regions can be combined. + LLVM_DEBUG(dbgs() << "GenXRegionCollapsing::processWrRegion:\n" + " OuterWr (line " << OuterWr->getDebugLoc().getLine() << "): " << *OuterWr << "\n" + " InnerWr (line " << InnerWr->getDebugLoc().getLine() << "): " << *InnerWr << "\n"); + if (!normalizeElementType(&OuterR, &InnerR)) { + LLVM_DEBUG(dbgs() << "Cannot normalize element type\n"); + return OuterWr; + } + Region CombinedR; + if (!combineRegions(&OuterR, &InnerR, &CombinedR)) + return OuterWr; // cannot combine + // Calculate index if necessary. + if (InnerR.Indirect) { + calculateIndex(&OuterR, &InnerR, &CombinedR, + InnerWr->getOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum), + InnerWr->getName() + ".indexcollapsed", OuterWr, InnerWr->getDebugLoc()); + } + // Bitcast inputs if necessary. + Value *OldValInput = OuterRd->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + OldValInput = createBitCastToElementType(OldValInput, InnerR.ElementTy, + OldValInput->getName() + ".bitcast_before_collapse", OuterWr, DL, OuterWr->getDebugLoc()); + Value *NewValInput = InnerWr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum); + NewValInput = createBitCastToElementType(NewValInput, InnerR.ElementTy, + NewValInput->getName() + ".bitcast_before_collapse", OuterWr, DL, OuterWr->getDebugLoc()); + // Create the combined wrregion. + Instruction *CombinedWr = cast(CombinedR.createWrRegion( + OldValInput, + NewValInput, + InnerWr->getName() + ".regioncollapsed", OuterWr, + InnerWr->getDebugLoc())); + // Bitcast to the original type if necessary. + Value *Res = createBitCast(CombinedWr, OuterWr->getType(), + CombinedWr->getName() + ".cast", OuterWr, + InnerWr->getDebugLoc()); + // Replace all uses. + OuterWr->replaceAllUsesWith(Res); + // Do not erase OuterWr here, as (if this function recursed to process an + // inner wrregion first) OuterWr might be the same as Prev in the loop in + // runOnFunction(). For a recursive call of processWrRegion, it will + // eventually get visited and then erased as it has no uses. For an outer + // call of processWrRegion, OuterWr is erased by the caller. 
+ Modified = true; + return CombinedWr; +} + +/*********************************************************************** + * processWrRegionSplat : process a wrregion + * + * Enter: OuterWr = the wrregion instruction that we will attempt to use as + * the outer wrregion and collapse with inner ones + * + * Return: the replacement wrregion if any, otherwise OuterWr + * + * This detects the following code: + * + * C = wrregion(undef, V, InnerR) + * D = wrregion(undef, C, OuterR) + * + * (where "InnerR" and "OuterR" are the region parameters). This code can + * be collapsed to + * + * D = wrregion(undef, V, CombinedR) + * + * We want to do innermost wrregion combining first, but this pass visits + * instructions in the wrong order for that. So, when we see a wrregion + * here, we use recursion to scan back to find the innermost one and then work + * forwards to where we started. + */ +Instruction *GenXRegionCollapsing::processWrRegionSplat(Instruction *OuterWr) +{ + assert(OuterWr); + // Find the inner wrregion, skipping bitcasts. + auto InnerWr = dyn_cast( + OuterWr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + while (InnerWr && isa(InnerWr)) + InnerWr = dyn_cast(InnerWr->getOperand(0)); + if (!GenXIntrinsic::isWrRegion(InnerWr)) + return OuterWr; + // Process inner wrregions first, recursively. + InnerWr = processWrRegionSplat(InnerWr); + + // Now process this one. + auto InnerSrc = dyn_cast(InnerWr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + if (!InnerSrc) + return OuterWr; + // Ensure that the combined region is well-defined. + if (InnerSrc->getType()->getScalarSizeInBits() != + OuterWr->getType()->getScalarSizeInBits()) + return OuterWr; + + auto OuterSrc = dyn_cast(OuterWr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + if (!OuterSrc) + return OuterWr; + if (isa(InnerSrc)) { + // OK. + } else { + auto InnerSplat = InnerSrc->getSplatValue(); + auto OuterSplat = OuterSrc->getSplatValue(); + if (!InnerSplat || !OuterSplat || InnerSplat != OuterSplat) + return OuterWr; + } + + Region InnerR = Region::getWithOffset(InnerWr, /*WantParentWidth=*/true); + Region OuterR = Region::getWithOffset(OuterWr); + Region CombinedR; + if (!combineRegions(&OuterR, &InnerR, &CombinedR)) + return OuterWr; // cannot combine + // Calculate index if necessary. + if (InnerR.Indirect) { + calculateIndex(&OuterR, &InnerR, &CombinedR, + InnerWr->getOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum), + InnerWr->getName() + ".indexcollapsed", OuterWr, InnerWr->getDebugLoc()); + } + // Bitcast inputs if necessary. + Value *OldValInput = OuterSrc; + Value *NewValInput = InnerWr->getOperand(1); + NewValInput = createBitCastToElementType(NewValInput, OuterWr->getType()->getScalarType(), + NewValInput->getName() + ".bitcast_before_collapse", OuterWr, DL, OuterWr->getDebugLoc()); + // Create the combined wrregion. + Instruction *CombinedWr = cast(CombinedR.createWrRegion( + OldValInput, + NewValInput, + InnerWr->getName() + ".regioncollapsed", OuterWr, + InnerWr->getDebugLoc())); + // Bitcast to the original type if necessary. + Value *Res = createBitCast(CombinedWr, OuterWr->getType(), + CombinedWr->getName() + ".cast", OuterWr, + InnerWr->getDebugLoc()); + // Replace all uses. + OuterWr->replaceAllUsesWith(Res); + // Do not erase OuterWr here, as (if this function recursed to process an + // inner wrregion first) OuterWr might be the same as Prev in the loop in + // runOnFunction(). 
For a recursive call of processWrRegionSplat, it will + // eventually get visited and then erased as it has no uses. For an outer + // call of processWrRegionSplat, OuterWr is erased by the caller. + Modified = true; + return CombinedWr; +} + +/*********************************************************************** + * normalizeElementType : where two regions have different element size, + * make them the same if possible + * + * Enter: R1 = first region + * R2 = second region + * PreferFirst = true to prefer the first region's element type + * + * Return: false if failed + * + * If PreferFirst is false, this uses the larger element size if everything is + * suitably aligned and the region with the smaller element size can be + * converted to the larger element size. + * + * Otherwise, it uses the smaller element size if the region with the + * larger element size can be converted to the smaller element size. + */ +bool GenXRegionCollapsing::normalizeElementType(Region *R1, Region *R2, + bool PreferFirst) +{ + if (R1->ElementBytes == R2->ElementBytes) + return true; // nothing to do + LLVM_DEBUG(dbgs() << "Before normalizeElementType:\n" + " R1: " << *R1 << "\n" + " R2: " << *R2 << "\n"); + // Set BigR to the region with the bigger element size, and SmallR to the + // region with the smaller element size. + bool PreferSmall = false; + Region *BigR = nullptr, *SmallR = nullptr; + if (R1->ElementBytes > R2->ElementBytes) { + BigR = R1; + SmallR = R2; + } else { + BigR = R2; + SmallR = R1; + PreferSmall = PreferFirst; + } + // Try the smaller element size first if it is preferred by the caller. + if (PreferSmall) + if (!BigR->Indirect) // big region not indirect + if (BigR->changeElementType(SmallR->ElementTy)) + return true; + // Then try the bigger element size. + if (!SmallR->Indirect) // small region not indirect + if (SmallR->changeElementType(BigR->ElementTy)) + return true; + // Then try the smaller element size. + if (!PreferSmall) + if (!BigR->Indirect) // big region not indirect + if (BigR->changeElementType(SmallR->ElementTy)) + return true; + return false; +} + +/*********************************************************************** + * combineRegions : combine two regions if possible + * + * Enter: OuterR = Region struct for outer region + * InnerR = Region struct for inner region + * CombinedR = Region struct to write combined region into + * + * Return: true if combining is possible + * + * If combining is possible, this function sets up CombinedR. However, + * CombinedR->Offset and CombinedR->Indirect are set assuming that the + * inner region is direct. + * + * If OuterR->ElementTy != InnerR->ElementTy, this algo cannot determine + * CombinedR->ElementTy, as the type depends on the order of respective + * wr/rd regions (it should be the type of the last one). 
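+ *
+ * As an illustrative summary of the composition performed below (assuming
+ * both regions use the same element size): the combined strides are the inner
+ * strides scaled by OuterR->Stride, and for a 2D outer region the combined
+ * byte offset is
+ *
+ *   OuterR->Offset + ((InnerEl / OuterR->Width) * OuterR->VStride
+ *                     + (InnerEl % OuterR->Width) * OuterR->Stride)
+ *                    * InnerR->ElementBytes
+ *
+ * where InnerEl = InnerR->Offset / InnerR->ElementBytes. The case analysis
+ * below then decides whether such a combined region is actually expressible.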
+ */ +bool GenXRegionCollapsing::combineRegions(const Region *OuterR, + const Region *InnerR, Region *CombinedR) +{ + LLVM_DEBUG(dbgs() << "GenXRegionCollapsing::combineRegions\n" + " OuterR: " << *OuterR << "\n" + " InnerR: " << *InnerR << "\n"); + if (InnerR->Indirect && isa(InnerR->Indirect->getType())) + return false; // multi indirect not supported + if (OuterR->Indirect && isa(OuterR->Indirect->getType())) + return false; // multi indirect not supported + if (OuterR->Mask) + return false; // outer region predicated, cannot combine + *CombinedR = *InnerR; + CombinedR->Indirect = OuterR->Indirect; + CombinedR->Stride *= OuterR->Stride; + CombinedR->VStride *= OuterR->Stride; + unsigned ElOffset = InnerR->Offset / InnerR->ElementBytes; + if (OuterR->is2D()) { + // Outer region is 2D: create the combined offset. For outer 2D + // and inner indirect, what CombinedR->Offset is set to here is + // ignored and overwritten by calculateIndex(), so it does not matter + // that it is incorrect in that case. + ElOffset = ElOffset / OuterR->Width * OuterR->VStride + + ElOffset % OuterR->Width * OuterR->Stride; + } else { + // Outer region is 1D: create the combined offset. For the benefit + // of inner indirect, where InnerR->Offset is just an offset from + // InnerR->Indirect, we cope with InnerR->Offset being apparently + // out of range (negative or too big). + ElOffset *= OuterR->Stride; + } + CombinedR->Offset = OuterR->Offset + ElOffset * InnerR->ElementBytes; + if (!OuterR->is2D()) { + LLVM_DEBUG(dbgs() << "outer 1D: CombinedR: " << *CombinedR << "\n"); + return true; // outer region is 1D, can always combine + } + if (InnerR->isScalar()) { + LLVM_DEBUG(dbgs() << "inner scalar/splat: CombinedR: " << *CombinedR << "\n"); + return true; // inner region is scalar/splat, can always combine + } + if (InnerR->Indirect) { + // Indirect inner region. Can combine as long as inner vstride is a + // multiple of outer width, and it in turn is a multiple of inner parent + // width. + if (InnerR->ParentWidth && !(InnerR->VStride % (int)OuterR->Width) + && !(OuterR->Width % InnerR->ParentWidth)) { + CombinedR->VStride = InnerR->VStride / OuterR->Width * OuterR->VStride; + LLVM_DEBUG(dbgs() << "inner indirect: CombinedR: " << *CombinedR << "\n"); + return true; + } + LLVM_DEBUG(dbgs() << "inner indirect: failed\n"); + return false; + } + // Inner region is not indirect. + unsigned StartEl = InnerR->Offset / InnerR->ElementBytes; + unsigned StartRow = StartEl / OuterR->Width; + if (!InnerR->is2D()) { + // Inner region is 1D but outer region is 2D. + unsigned EndEl = StartEl + (InnerR->NumElements - 1) * InnerR->Stride; + unsigned EndRow = EndEl / OuterR->Width; + if (StartRow == EndRow) { + // The whole 1D inner region fits in a row of the outer region. + LLVM_DEBUG(dbgs() << "inner 1D outer 2D, fits in row: CombinedR: " << *CombinedR << "\n"); + return true; + } + if (EndRow == StartRow + 1 && !(InnerR->NumElements % 2)) { + unsigned MidEl = StartEl + InnerR->NumElements / 2 * InnerR->Stride; + if (InnerR->Stride > 0 && (unsigned)(MidEl - (EndRow * OuterR->Width)) + < (unsigned)InnerR->Stride) { + // The 1D inner region is evenly split between two adjacent rows of + // the outer region. 
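+        // Worked example (hypothetical numbers): OuterR = <Width 4, VStride 8,
+        // Stride 1> and a direct 1D InnerR with NumElements 4, Stride 1,
+        // starting at outer element 2. Then StartRow = 0, EndRow = 1 and
+        // MidEl = 4, so the test above passes; the combined region gets
+        // Width = 2 and VStride = (0 - 2) * 1 + 8 = 6, i.e. it reads source
+        // elements {2,3} and {8,9} -- exactly what the nested pair read.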
+ CombinedR->VStride = (MidEl % OuterR->Width - StartEl % OuterR->Width) + * OuterR->Stride + OuterR->VStride; + CombinedR->Width = InnerR->NumElements / 2; + LLVM_DEBUG(dbgs() << "inner 1D outer 2D, split between two rows: CombinedR: " << *CombinedR << "\n"); + return true; + } + } + unsigned BeyondEndEl = EndEl + InnerR->Stride; + if (BeyondEndEl % OuterR->Width == StartEl % OuterR->Width + && !(OuterR->Width % InnerR->Stride)) { + // The 1D inner region is evenly split between N adjacent rows of the + // outer region, starting in the same column for each row. + CombinedR->Width = OuterR->Width / InnerR->Stride; + CombinedR->VStride = OuterR->VStride; + LLVM_DEBUG(dbgs() << "inner 1D outer 2D, split between N rows: CombinedR: " << *CombinedR << "\n"); + return true; + } + LLVM_DEBUG(dbgs() << "inner 1D outer 2D, fail\n"); + return false; // All other 1D inner region cases fail. + } + if (!(InnerR->VStride % (int)OuterR->Width)) { + // Inner vstride is a whole number of outer rows. + CombinedR->VStride = OuterR->VStride * InnerR->VStride / (int)OuterR->Width; + if (!InnerR->Indirect) { + // For a direct inner region, calculate whether we can combine. + unsigned StartEl = InnerR->Offset / InnerR->ElementBytes; + unsigned StartRow = StartEl / OuterR->Width; + unsigned EndRowOfFirstRow = (StartEl + (InnerR->Width - 1) * InnerR->Stride) + / OuterR->Width; + if (StartRow == EndRowOfFirstRow) { + // Each row of inner region is within a row of outer region, starting + // at the same column. + LLVM_DEBUG(dbgs() << "row within row: CombinedR: " << *CombinedR << "\n"); + return true; + } + } else { + // For an indirect inner region, use parent width to tell whether we can + // combine. + if (InnerR->ParentWidth && !(OuterR->Width % InnerR->ParentWidth)) { + LLVM_DEBUG(dbgs() << "inner indirect, parentwidth ok: CombinedR: " << *CombinedR << "\n"); + return true; + } + } + } + // We could handle other cases like: + // - each row of inner region enclosed in a row of outer region + // but with a different column offset + LLVM_DEBUG(dbgs() << "failed\n"); + return false; +} + +/*********************************************************************** + * calculateIndex : calculate index in the case that the inner region is + * indirect + * + * Enter: OuterR, InnerR = outer and inner regions + * CombinedR = combined region set up by combineRegions() + * InnerIndex = variable index for inner region, including the + * constant offset add that was extracted by the Region + * constructor into InnerR->Offset + * Name = name for new instruction(s) + * InsertBefore = insert before this instruction + * DL = debug loc for new instruction(s) + * + * This sets up CombinedR->Indirect and CombinedR->Offset. + * + * A Region has the offset set up as follows: + * + * - For a direct region, R.Offset is the constant offset in bytes and + * R.Indirect is 0. + * + * - Normally, for an indirect region, R.Offset is 0 and R.Indirect is the + * Value used for the offset (in bytes). + * + * - But if the Value used for the offset is an add constant, then R.Offset + * is the constant offset and R.Indirect is the other operand of the add. + * + * In some code paths, this function needs the actual index of the inner region, + * rather than the R.Offset and R.Indirect parts separated out by the Region + * constructor. Thus it is passed InnerIndex, which is that actual index value. 
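+ *
+ * Worked example (hypothetical values): with a 2D outer region of Width 8,
+ * VStride 16, Stride 2 and 4-byte elements, an inner byte index of 40 (outer
+ * result element 10) splits into Col = 40 % 32 = 8 and Row = 40 / 32 = 1 and
+ * recombines as (Row * (16 * 4 / 2) + Col) * 2 = 80 bytes, which is the
+ * source byte offset of row 1, column 2 of the outer region.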
+ */ +void GenXRegionCollapsing::calculateIndex(const Region *OuterR, + const Region *InnerR, Region *CombinedR, Value *InnerIndex, + const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL) +{ + if (!OuterR->is2D()) { + // Outer region is 1D. We can leave CombinedR->Offset as + // set by combineRegions, but we need to add the indices together, scaling + // the inner one by the outer region's stride. + Value *Idx = InnerR->Indirect; + if (OuterR->Stride != 1) { + Idx = insertOp(Instruction::Mul, Idx, OuterR->Stride, Name, + InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + } + if (OuterR->Indirect) { + Idx = insertOp(Instruction::Add, Idx, OuterR->Indirect, Name, + InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + } + CombinedR->Indirect = Idx; + LLVM_DEBUG(dbgs() << " calculateIndex result(1d): CombinedR: " << *CombinedR << "\n"); + return; + } + // Outer region is 2D. We need to split the inner region's index into row + // and column of the outer region, then recombine. We are using InnerIndex, + // which includes any constant offset add, so we need to adjust + // CombinedR->Offset so it does not include InnerR->Offset. + CombinedR->Offset = OuterR->Offset; + LLVM_DEBUG(dbgs() << " calculateIndex: Offset now " << CombinedR->Offset << "\n"); + Value *Col = insertOp(Instruction::URem, InnerIndex, + OuterR->Width * OuterR->ElementBytes, + Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Col << "\n"); + Value *Row = insertOp(Instruction::UDiv, InnerIndex, + OuterR->Width * OuterR->ElementBytes, + Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Row << "\n"); + Value *Idx = nullptr; + if (!(OuterR->VStride % OuterR->Stride)) { + // We need to multply Row by VStride and Col by Stride. However, Stride + // divides VStride evenly, so we can common up the multiply by Stride. + Idx = insertOp(Instruction::Mul, Row, + OuterR->VStride * OuterR->ElementBytes / OuterR->Stride, + Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + Idx = insertOp(Instruction::Add, Idx, Col, Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + Idx = insertOp(Instruction::Mul, Idx, OuterR->Stride, Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + } else { + // Need to do Row*VStride and Col*Stride separately. 
+ Idx = insertOp(Instruction::Mul, Row, + OuterR->VStride * OuterR->ElementBytes, Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + Col = insertOp(Instruction::Mul, Col, OuterR->Stride, Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Col << "\n"); + Idx = insertOp(Instruction::Add, Idx, Col, Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + } + if (OuterR->Indirect) { + Idx = insertOp(Instruction::Add, Idx, OuterR->Indirect, + Name, InsertBefore, DL); + LLVM_DEBUG(dbgs() << " calculateIndex: " << *Idx << "\n"); + } + CombinedR->Indirect = Idx; + LLVM_DEBUG(dbgs() << " calculateIndex result(2d): CombinedR: " << *CombinedR << "\n"); +} + +/*********************************************************************** + * insertOp : insert a binary op + */ +Value *GenXRegionCollapsing::insertOp(Instruction::BinaryOps Opcode, Value *Lhs, + unsigned Rhs, const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL) +{ + auto I16Ty = Type::getInt16Ty(InsertBefore->getContext()); + return insertOp(Opcode, Lhs, + Constant::getIntegerValue(I16Ty, APInt(16, Rhs)), + Name, InsertBefore, DL); +} + +Value *GenXRegionCollapsing::insertOp(Instruction::BinaryOps Opcode, Value *Lhs, + Value *Rhs, const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL) +{ + if (auto C = dyn_cast(Rhs)) { + int RhsVal = C->getZExtValue(); + int LogVal = genx::exactLog2(RhsVal); + if (LogVal >= 0) { + switch (Opcode) { + case Instruction::Mul: + // multiply by power of 2 -> shl + if (!LogVal) + return Lhs; + Rhs = Constant::getIntegerValue(C->getType(), APInt(16, LogVal)); + Opcode = Instruction::Shl; + break; + case Instruction::UDiv: + // divide by power of 2 -> lshr + if (!LogVal) + return Lhs; + Rhs = Constant::getIntegerValue(C->getType(), APInt(16, LogVal)); + Opcode = Instruction::LShr; + break; + case Instruction::URem: + // remainder by power of 2 -> and + Rhs = Constant::getIntegerValue(C->getType(), APInt(16, RhsVal - 1)); + Opcode = Instruction::And; + break; + default: + break; + } + } + } + auto Inst = BinaryOperator::Create(Opcode, Lhs, Rhs, Name, InsertBefore); + Inst->setDebugLoc(DL); + return Inst; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXRematerialization.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRematerialization.cpp new file mode 100644 index 000000000000..247442968078 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXRematerialization.cpp @@ -0,0 +1,146 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXRematerialization +/// --------------------- +/// +/// This pass performs rematerialization to reduce register pressure. +/// +//===----------------------------------------------------------------------===// +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXNumbering.h" +#include "GenXPressureTracker.h" +#include "GenXUtil.h" +#include "llvm/Pass.h" + +using namespace llvm; +using namespace genx; + +namespace { + +class GenXRematerialization : public FunctionGroupPass { + GenXBaling *Baling = nullptr; + GenXLiveness *Liveness = nullptr; + GenXNumbering *Numbering = nullptr; + bool Modified = false; + +public: + static char ID; + explicit GenXRematerialization() : FunctionGroupPass(ID) {} + StringRef getPassName() const override { + return "GenX rematerialization pass"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunctionGroup(FunctionGroup &FG) override; + +private: + void remat(Function *F, PressureTracker &RP); +}; + +} // namespace + +namespace llvm { void initializeGenXRematerializationPass(PassRegistry &); } +char GenXRematerialization::ID = 0; +INITIALIZE_PASS_BEGIN(GenXRematerialization, "GenXRematerialization", "GenXRematerialization", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXNumbering) +INITIALIZE_PASS_END(GenXRematerialization, "GenXRematerialization", "GenXRematerialization", false, false) + +FunctionGroupPass *llvm::createGenXRematerializationPass() { + initializeGenXRematerializationPass(*PassRegistry::getPassRegistry()); + return new GenXRematerialization; +} + +void GenXRematerialization::getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); +} + +bool GenXRematerialization::runOnFunctionGroup(FunctionGroup &FG) { + if (skipOptWithLargeBlock(FG)) + return false; + + Modified = false; + Baling = &getAnalysis(); + Liveness = &getAnalysis(); + Numbering = &getAnalysis(); + PressureTracker RP(FG, Liveness); + for (auto fgi = FG.begin(), fge = FG.end(); fgi != fge; ++fgi) + remat(*fgi, RP); + return Modified; +} + +void GenXRematerialization::remat(Function *F, PressureTracker &RP) { + // Collect rematerialization candidates. 
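+  // An illustrative candidate (hypothetical types): a widening cast such as
+  //   %f = uitofp <16 x i16> %v to <16 x float>
+  // that stays inside one block but has several uses spanning a high-pressure
+  // range. Cloning the cast right before each such use means the wide
+  // <16 x float> value no longer needs to stay live across the whole range;
+  // only the narrower %v does.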
+ std::vector Candidates; + for (auto &BB : F->getBasicBlockList()) { + for (auto &Inst : BB.getInstList()) { + // (1) upward cast + if (auto CI = dyn_cast(&Inst)) { + if (CI->getOpcode() != Instruction::UIToFP && + CI->getOpcode() != Instruction::SIToFP) + continue; + if (!CI->getType()->isVectorTy()) + continue; + if (CI->getSrcTy()->getScalarSizeInBits() >= + CI->getDestTy()->getScalarSizeInBits()) + continue; + if (Inst.isUsedOutsideOfBlock(&BB) || Inst.getNumUses() <= 2) + continue; + LiveRange *LR = Liveness->getLiveRangeOrNull(CI); + if (!LR || LR->value_size() != 1) + continue; + assert(*LR->value_begin() == CI); + unsigned B = Numbering->getNumber(CI); + for (auto &U : CI->uses()) { + auto UI = U.getUser(); + unsigned E = Numbering->getNumber(UI); + if (E > B && RP.intersectWithRedRegion(B, E)) + Candidates.push_back(&U); + } + } + } + } + + // Do rematerialization. + for (auto U : Candidates) { + Instruction *Inst = cast(U->get()); + Instruction *UI = cast(U->getUser()); + Instruction *Clone = Inst->clone(); + Clone->insertBefore(UI); + U->set(Clone); + Modified = true; + } +} \ No newline at end of file diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXSimdCFConformance.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXSimdCFConformance.cpp new file mode 100644 index 000000000000..6bd1ef7cf26b --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXSimdCFConformance.cpp @@ -0,0 +1,3698 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXSimdCFConformance +/// --------------------- +/// +/// This pass checks that the use of SIMD control flow (llvm.genx.simdcf.goto +/// and llvm.genx.simdcf.join) conforms to the rules required to allow us to +/// generate actual goto and join instructions. If not, the intrinsics are +/// lowered to code that implements the defined semantics for the intrinsics, +/// but does not use SIMD CF instructions, so is usually less efficient. +/// +/// It also makes certain transformations to make goto/join legal in terms of its +/// position in the basic block. These can fail silently, in which case the +/// conformance check will fail on the goto/join in question: +/// +/// * A goto and its extractvalues must be at the end of the block. 
(Actually, if +/// the !any result of the goto is used in a conditional branch at the end of +/// the block, then the goto being baled into the branch means that it is +/// treated as being at the end of the block anyway. The only reason we need +/// to sink it here is to ensure that isGotoBlock works.) +/// +/// * For a join label block (a block that is the JIP of other gotos/joins), a +/// join must come at the start of the block. +/// +/// * For a branching join block (one whose conditional branch condition is the +/// !any result from a join), the join must be at the end of the block. +/// +/// * For a block that has one join with both of the above true, we need to move +/// all other code out of the block. +/// +/// The pass is run twice: an "early SIMD CF conformance pass" (a module pass) +/// just before GenXLowering, and a "late SIMD CF conformance pass" (a function +/// group pass) just before second baling. +/// +/// The early pass is the one that checks for conformance, and lowers the goto +/// and join intrinsics if the code is not conformant. The conformance checks +/// implement the rules listed in the documentation for the goto and join +/// intrinsics. +/// +/// Lowering a goto issues a "failed to optimize SIMD control flow" warning. No +/// clue is given in the warning as to what caused the conformance failure, +/// however you (a compiler developer) can find out (for a test case submitted +/// by a compiler user) by turning on -debug and looking at the output from this +/// pass. +/// +/// The late pass checks again for conformance, but if the code is not +/// conformant, it just errors. We could lower the gotos and joins there too, +/// but it would be more fiddly as we would have to ensure that the code +/// conforms with what is expected at that stage of compilation, and there is +/// no further chance to optimize it there. +/// +/// We are not expecting this error to happen. +/// +/// Otherwise, the late pass sets the register category of the EM and RM values +/// to "EM" and "RM", so they do not get any register allocated. +/// +/// Conformance rules +/// ^^^^^^^^^^^^^^^^^ +/// +/// If the goto and join intrinsics are not used in a way that conforms to the +/// rules, then they will still have the semantics in their spec, but this pass +/// will lower at least some of them to equivalent but less efficient code. +/// +/// The rules are: +/// +/// 1. Because the hardware has a single EM (execution mask) register, all EM +/// values input to and generated by these intrinsics must not interfere with +/// each other; that is, they must have disjoint live ranges. For the +/// purposes of determining interference, if any EM value is a phi node +/// with incoming constant all ones, then the constant all ones value is +/// counted as being live from the start of the function and is not allowed +/// to interfere with other EM values (although it can interfere with other +/// such constant all ones values). +/// +/// 2. An EM value is allowed to be defined: +/// +/// a. as part of the struct returned by one of these intrinsics; +/// +/// b. by a phi node, as long as each incoming is either an EM value or +/// a constant all ones; +/// +/// c. by an extractvalue extracting it from a struct containing an EM value; +/// +/// d. as a function argument, as long as an EM value is also returned by the +/// function (perhaps as part of a struct); +/// +/// e. by an insertvalue as part of a return value struct; +/// +/// f. 
as the return value of a non-intrinsic call (perhaps as part of a struct), +/// as long as there is also a call arg that is an EM value, and the called +/// function has the corresponding function arg and return value as EM values; +/// +/// g. since shufflevector from EM does not change EM and only makes it shorter +/// to create implicit predication of desired width, it's also considered +/// as an EM definition, but it can only be used by wrregion and select; +/// +/// 3. An EM value is allowed to be used: +/// +/// a. as the OldEM input to one of these intrinsics; +/// +/// b. in a phi node, as long as the result of the phi node is an EM value; +/// +/// c. as the condition in a wrregion or select; +/// +/// d. as the input to a shufflevector whose effect is to slice part of the EM +/// value starting at index 0, as long as the result of that slice is only +/// used as the condition in a wrregion or select; +/// +/// e. as a call argument, as long as the corresponding function argument is an +/// EM value, and the call has an EM return value; +/// +/// f. in a return (perhaps as part of a struct), as long as the function also +/// has an argument that is an EM value. +/// +/// For an EM value defined in a goto, or a join whose scalar BranchCond result +/// is used in a conditional branch, or in an extractvalue out of +/// the result of such a goto or join, the only use allowed in the same basic block +/// as the goto/join is such an extractvalue. +/// +/// 4. The OldEM input to the two intrinsics must be either an EM value or +/// constant all ones. In the latter case, and in the case of a constant incoming +/// to an EM phi node, its live range is considered to reach +/// back through all paths to the function entry for the purposes of rule (1). +/// +/// 5. Each join point has a web of RM (resume mask) values, linked as by rules (6) +/// and (7). All RM values within one join point's web must not interfere with +/// each other; that is, they must have disjoint live ranges. For the +/// purposes of determining interference, if an RM value is a phi node with +/// incoming constant all zeros, then the constant all zeros value is +/// counted as being live from the start of the function and is not allowed +/// to interfere with other RM values for this join (although it can +/// interfere with other such constant all zeros values). +/// +/// 6. An RM value is allowed to be defined: +/// +/// a. as part of the struct returned by ``llvm.genx.simdcf.goto``; +/// +/// b. by a phi node, as long as each incoming is either an RM value or +/// a constant all zeros. +/// +/// 7. An RM value is allowed to be used: +/// +/// a. as the OldRM input to ``llvm.genx.simdcf.goto``; +/// +/// b. as the RM input to ``llvm.genx.simdcf.join``, but only to one join in the +/// whole web; +/// +/// c. in a phi node, as long as the result of the phi node is an RM value. +/// +/// 8. The OldRM input to ``llvm.genx.simdcf.goto``, or the RM input to +/// ``llvm.genx.simdcf.join``, must be either an RM value, or constant all +/// zeros. In the latter case, and in the case of a constant incoming to an RM +/// phi node, its live range is considered to reach back through all paths +/// to the function entry or to the web's ``llvm.genx.simdcf.join`` for the +/// purposes of rule (5). +/// +/// 9. 
The BranchCond struct element of the result of ``llvm.genx.simdcf.goto`` +/// must either be unused (unextracted), or, after being extractvalued, +/// must have exactly one use, which is in a +/// conditional branch terminating the same basic block. In the unused case, +/// the basic block must end with an unconditional branch. (This is a goto +/// that is immediately followed by a join.) +/// +/// 10. The BranchCond struct element of the result of ``llvm.genx.simdcf.join`` +/// must either be unused (unextracted), or, after being extractvalued, +/// have exactly one use, which is in a conditional branch terminating the +/// same basic block. +/// +/// 11. It must be possible to derive an ordering for the basic blocks in a +/// function such that, in the conditional branch using the result of any goto +/// or join, the "false" successor is fall-through and the "true" successor is +/// to a join later on in the sequence. For a goto followed by an +/// unconditional branch, the successor is fall-through _and_ the next join +/// in sequence. +/// +/// **IR restriction**: goto and join intrinsics must conform to these rules +/// (since this pass lowers any that do not). +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_SIMDCFCONFORMANCE" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXConstants.h" +#include "GenXGotoJoin.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "vc/GenXOpts/Utils/RegCategory.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/Local.h" + +#include "llvmWrapper/IR/InstrTypes.h" + +using namespace llvm; +using namespace genx; + +static cl::opt EnableGenXGotoJoin("enable-genx-goto-join", cl::init(true), cl::Hidden, + cl::desc("Enable use of Gen goto/join instructions for SIMD control flow.")); + +namespace { + +// Diagnostic information for error/warning relating to SIMD control flow. +class DiagnosticInfoSimdCF : public DiagnosticInfoOptimizationBase { +private: + static int KindID; + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } +public: + static void emit(Instruction *Inst, StringRef Msg, DiagnosticSeverity Severity = DS_Error); + DiagnosticInfoSimdCF(DiagnosticSeverity Severity, const Function &Fn, + const DebugLoc &DLoc, StringRef Msg) + : DiagnosticInfoOptimizationBase((DiagnosticKind)getKindID(), Severity, + /*PassName=*/nullptr, Msg, Fn, DLoc) {} + // This kind of message is always enabled, and not affected by -rpass. 
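+  // (These diagnostics are not gated by the -rpass remark machinery; the kind
+  // ID above is obtained from getNextAvailablePluginDiagnosticKind(), LLVM's
+  // hook for diagnostic kinds defined outside the core enum.)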
+ virtual bool isEnabled() const override { return true; } + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } + + // TODO: consider changing format + void print(DiagnosticPrinter &DP) const override { DP << "GenXSimdCFConformance: " << RemarkName; } +}; +int DiagnosticInfoSimdCF::KindID = 0; + +// GenX SIMD control flow conformance pass -- common data between early and +// late passes. +class GenXSimdCFConformance { +protected: + Module *M; + FunctionGroup *FG; + FunctionGroupAnalysis *FGA; + DominatorTreeGroupWrapperPass *DTWrapper; + std::map DTs; + GenXLiveness *Liveness; + bool Modified; + SetVector EMVals; + std::map> RMVals; + bool lowerSimdCF; +private: + + // GotoJoinEVs: container for goto/join Extract Value (EV) info. Also + // allowes to remove duplication of EVs. Performs it in construction + // and moves EVs right after goto/join. Hoisting can be performed + // again with hoistEVs method. For instance, it is used on join + // hoisting to save correct EM liveranges. + class GotoJoinEVs { + private: + enum ValPos { + EMPos = 0, + RMPos = 1, + JoinCondPos = 1, + GotoCondPos = 2, + PosNum + }; + + ExtractValueInst *EVs[PosNum] = { nullptr, nullptr, nullptr }; + bool IsGoto; + Value *GotoJoin; + + void CollectEVs(); + + public: + + GotoJoinEVs(Value *GJ = nullptr); + ExtractValueInst *getEMEV() const; + ExtractValueInst *getRMEV() const; + ExtractValueInst *getCondEV() const; + Value *getGotoJoin() const; + Instruction *getSplitPoint() const; + void setCondEV(ExtractValueInst *CondEV); + bool isGoto() const; + bool isJoin() const; + void hoistEVs() const; + + }; + + SetVector EMValsStack; + std::map GotoJoinMap; + std::map EMProducers; + std::map GotoJoinEVsMap; +protected: + GenXSimdCFConformance() : + M(0), FG(0), FGA(0), DTWrapper(0), Liveness(0), lowerSimdCF(false) {} + void gatherEMVals(); + void gatherRMVals(); + void removeFromEMRMVals(Value *V); + void moveCodeInGotoBlocks(bool hoistGotoUsers = false); + void moveCodeInJoinBlocks(); + void ensureConformance(); + void lowerAllSimdCF(); + void canonicalizeEM(); + void splitGotoJoinBlocks(); + void lowerUnsuitableGetEMs(); + void clear() { + DTs.clear(); + EMVals.clear(); + RMVals.clear(); + GotoJoinMap.clear(); + GotoJoinEVsMap.clear(); + EMProducers.clear(); + } +private: + bool isLatePass() { return FG != nullptr; } + void emptyBranchingJoinBlocksInFunc(Function *F); + void emptyBranchingJoinBlock(CallInst *Join); + DominatorTree *getDomTree(Function *F); + bool hoistJoin(CallInst *Join); + bool checkEMVal(SimpleValue EMVal); + bool checkGoto(SimpleValue EMVal); + bool checkJoin(SimpleValue EMVal); + bool checkGotoJoin(SimpleValue EMVal); + void removeBadEMVal(SimpleValue EMVal); + void pushValues(Value *V); + bool getConnectedVals(SimpleValue Val, int Cat, bool IncludeOptional, CallInst *OkJoin, SmallVectorImpl *ConnectedVals, bool LowerBadUsers = false); + void checkEMInterference(); + void checkInterference(SetVector *Vals, SetVector *BadDefs, Instruction *ConstStop); + bool hoistGotoUser(Instruction *Inst, CallInst *Goto, unsigned operandNo); + void gatherGotoJoinEMVals(bool IncludeIncoming = true); + void handleEVs(); + void resolveBitCastChains(); + Value *eliminateBitCastPreds(Value *Val, std::set &DeadInst, std::set &Visited); + Value *getEMProducer(Value *Inst, std::set &Visited, bool BitCastAllowed = false); + void handleCondValue(Value *GotoJoin); + void handleNoCondEVCase(GotoJoinEVs &GotoJoinData); + void handleOptimizedBranchCase(GotoJoinEVs &GotoJoinData, BasicBlock 
*&TrueSucc, BasicBlock *&FalseSucc); + void handleExistingBranchCase(GotoJoinEVs &GotoJoinData, BasicBlock *&TrueSucc, BasicBlock *&FalseSucc, BranchInst *ExistingBranch); + void addNewPhisIncomings(BasicBlock *BranchingBlock, BasicBlock *TrueSucc, BasicBlock *FalseSucc); + void collectCondEVUsers(ExtractValueInst *CondEV, std::vector &BadUsers, BranchInst *&CorrectUser); + void updateBadCondEVUsers(GotoJoinEVs &GotoJoinData, std::vector &BadUsers, BasicBlock *TrueSucc, BasicBlock *FalseSucc); + Value *findGotoJoinVal(int Cat, BasicBlock *Loc, Instruction *CondEV, BasicBlockEdge &TrueEdge, BasicBlockEdge &FalseEdge, Value *TrueVal, + Value *FalseVal, std::map &foundVals); + bool canUseLoweredEM(Instruction *Val); + void replaceUseWithLoweredEM(Instruction *Val, unsigned opNo, SetVector &ToRemove); + Value *insertCond(Value *OldVal, Value *NewVal, const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL); + Value *truncateCond(Value *In, Type *Ty, const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL); + void lowerGoto(CallInst *Goto); + void lowerJoin(CallInst *Join); + void replaceGotoJoinUses(CallInst *GotoJoin, ArrayRef Vals); +}; + +// GenX early SIMD control flow conformance pass +class GenXEarlySimdCFConformance + : public GenXSimdCFConformance, public ModulePass { +public: + static char ID; + explicit GenXEarlySimdCFConformance() : ModulePass(ID) { } + virtual StringRef getPassName() const { return "GenX early SIMD control flow conformance"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + ModulePass::getAnalysisUsage(AU); + } + bool runOnModule(Module &M); +}; + +// GenX late SIMD control flow conformance pass +class GenXLateSimdCFConformance + : public GenXSimdCFConformance, public FunctionGroupPass { +public: + static char ID; + explicit GenXLateSimdCFConformance() : FunctionGroupPass(ID) { } + virtual StringRef getPassName() const { return "GenX late SIMD control flow conformance"; } + void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + } + bool runOnFunctionGroup(FunctionGroup &FG); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXGroupPrinterPass(O, Banner); } +private: + void setCategories(); + void modifyEMUses(Value *EM); +}; + +} // end anonymous namespace + +char GenXEarlySimdCFConformance::ID = 0; +namespace llvm { void initializeGenXEarlySimdCFConformancePass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXEarlySimdCFConformance, "GenXEarlySimdCFConformance", "GenXEarlySimdCFConformance", false, false) +INITIALIZE_PASS_END(GenXEarlySimdCFConformance, "GenXEarlySimdCFConformance", "GenXEarlySimdCFConformance", false, false) + +ModulePass *llvm::createGenXEarlySimdCFConformancePass() +{ + initializeGenXEarlySimdCFConformancePass(*PassRegistry::getPassRegistry()); + return new GenXEarlySimdCFConformance(); +} + +char GenXLateSimdCFConformance::ID = 0; +namespace llvm { void initializeGenXLateSimdCFConformancePass(PassRegistry &); } +INITIALIZE_PASS_BEGIN(GenXLateSimdCFConformance, "GenXLateSimdCFConformance", "GenXLateSimdCFConformance", false, false) +INITIALIZE_PASS_DEPENDENCY(FunctionGroupAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXModule) 
+INITIALIZE_PASS_END(GenXLateSimdCFConformance, "GenXLateSimdCFConformance", "GenXLateSimdCFConformance", false, false)
+
+FunctionGroupPass *llvm::createGenXLateSimdCFConformancePass()
+{
+  initializeGenXLateSimdCFConformancePass(*PassRegistry::getPassRegistry());
+  return new GenXLateSimdCFConformance();
+}
+
+/***********************************************************************
+ * runOnModule : run the early SIMD control flow conformance pass for this
+ * module
+ */
+bool GenXEarlySimdCFConformance::runOnModule(Module &ArgM)
+{
+  LLVM_DEBUG(dbgs() << "Early SIMD CF Conformance starts\n");
+
+  Modified = false;
+  M = &ArgM;
+  FG = nullptr;
+  FGA = nullptr;
+  DTWrapper = nullptr;
+  // Perform actions to create correct DF for EM
+  canonicalizeEM();
+  // Gather the EM values, both from goto/join and phi nodes.
+  gatherEMVals();
+  // Gather the RM values from gotos and phi nodes.
+  gatherRMVals();
+  // Hoist instructions that do not depend on Goto's result.
+  // This is needed to perform a correct split.
+  moveCodeInGotoBlocks();
+  // Split Goto/Join blocks to recreate actual SIMD CF
+  splitGotoJoinBlocks();
+  // Handle instructions that depend on Goto's result
+  moveCodeInGotoBlocks(true);
+  // Handle Joins to create correct SIMD CF structure
+  moveCodeInJoinBlocks();
+  // TODO: currently all SIMD CF is lowered if there is
+  // an unmask construct in the module. It is very suboptimal.
+  if (lowerSimdCF)
+    lowerAllSimdCF();
+  else
+    // Repeatedly check the code for conformance and lower non-conformant gotos
+    // and joins until the code stabilizes.
+    ensureConformance();
+  // Perform check for genx_simdcf_get_em intrinsics and remove redundant ones.
+  lowerUnsuitableGetEMs();
+  clear();
+
+  LLVM_DEBUG(dbgs() << "Early SIMD CF Conformance ends\n");
+
+  return Modified;
+}
+
+/***********************************************************************
+ * runOnFunctionGroup : run the late SIMD control flow conformance pass for this
+ * FunctionGroup
+ */
+bool GenXLateSimdCFConformance::runOnFunctionGroup(FunctionGroup &ArgFG)
+{
+  LLVM_DEBUG(dbgs() << "Late SIMD CF Conformance starts\n");
+
+  Modified = false;
+  FG = &ArgFG;
+  M = FG->getModule();
+  // Get analyses that we use and/or modify.
+  FGA = &getAnalysis<FunctionGroupAnalysis>();
+  DTWrapper = &getAnalysis<DominatorTreeGroupWrapperPass>();
+  Liveness = &getAnalysis<GenXLiveness>();
+  // Gather the EM values, both from goto/join and phi nodes.
+  gatherEMVals();
+  // Gather the RM values from gotos and phi nodes.
+  gatherRMVals();
+  // Move code in goto and join blocks as necessary.
+  moveCodeInGotoBlocks();
+  moveCodeInJoinBlocks();
+  // Check the code for conformance. In this late pass, we do not expect to
+  // find non-conformance.
+  ensureConformance();
+  // For remaining unlowered gotos and joins (the ones that will become SIMD
+  // control flow instructions), mark the webs of EM and RM values as
+  // category EM or RM respectively. For EM, this also modifies uses as needed.
+  setCategories();
+  clear();
+
+  LLVM_DEBUG(dbgs() << "Late SIMD CF Conformance ends\n");
+
+  return Modified;
+}
+
+/***********************************************************************
+ * gatherGotoJoinEMVals : gather the EM values for gotos/joins only
+ *
+ * IncludeIncoming is used for adding goto/join def to EMVals
+ */
+void GenXSimdCFConformance::gatherGotoJoinEMVals(bool IncludeIncoming)
+{
+  // We find gotos and joins by scanning all uses of the intrinsics and (in the
+  // case of the late pass) ignoring ones not in this function group, rather
+  // than scanning the whole IR.
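+  // The goto/join intrinsics are overloaded: the EM operand is fixed at
+  // <32 x i1> while the condition/RM operand exists for each power-of-two
+  // width from 1 to 32, so there is one declaration per width. The loop below
+  // therefore walks the use list of each declaration in turn.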
+ Type *I1Ty = Type::getInt1Ty(M->getContext()); + for (auto IID : { GenXIntrinsic::genx_simdcf_goto, GenXIntrinsic::genx_simdcf_join }) { + Type *EMTy = VectorType::get(I1Ty, 32); + for (unsigned Width = 1; Width <= 32; Width <<= 1) { + Type *Tys[] = { EMTy, VectorType::get(I1Ty, Width) }; + auto GotoJoinFunc = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + for (auto ui = GotoJoinFunc->use_begin(), ue = GotoJoinFunc->use_end(); + ui != ue; ++ui) { + auto GotoJoin = dyn_cast(ui->getUser()); + if (!GotoJoin) + continue; + if (FG && (FGA->getGroup(GotoJoin->getParent()->getParent()) != FG + || ui->getOperandNo() != GotoJoin->getNumArgOperands())) + continue; + // We have a goto/join (in our function group in the case of the late + // pass). Add the EM value (struct index 0) to EMVals. + EMVals.insert(SimpleValue(GotoJoin, 0)); + // Also add its EM input to EMVals, if not a constant. + if (IncludeIncoming && !isa(GotoJoin->getOperand(0))) + EMVals.insert(SimpleValue(GotoJoin->getOperand(0), 0)); + } + } + } +} + +/*********************************************************************** + * gatherEMVals : gather the EM values, including phi nodes + */ +void GenXSimdCFConformance::gatherEMVals() +{ + // Collect gotos/joins and their defs + gatherGotoJoinEMVals(true); + + Type *I1Ty = Type::getInt1Ty(M->getContext()); + Type *EMTy = VectorType::get(I1Ty, 32); + Type *Tys[] = { EMTy }; + auto SavemaskFunc = GenXIntrinsic::getGenXDeclaration( + M, GenXIntrinsic::genx_simdcf_savemask, Tys); + for (auto ui = SavemaskFunc->use_begin(), ue = SavemaskFunc->use_end(); ui != ue; + ++ui) { + auto Savemask = dyn_cast(ui->getUser()); + if (!Savemask) + continue; + if (FG && (FGA->getGroup(Savemask->getParent()->getParent()) != FG || + ui->getOperandNo() != Savemask->getNumArgOperands())) + continue; + lowerSimdCF = true; + // Add its EM input to EMVals, if not a constant. + if (!isa(Savemask->getOperand(0))) + EMVals.insert(SimpleValue(Savemask->getOperand(0), 0)); + } + + auto UnmaskFunc = GenXIntrinsic::getGenXDeclaration( + M, GenXIntrinsic::genx_simdcf_unmask, Tys); + for (auto ui = UnmaskFunc->use_begin(), ue = UnmaskFunc->use_end(); ui != ue; + ++ui) { + auto Unmask = dyn_cast(ui->getUser()); + if (!Unmask) + continue; + if (FG && (FGA->getGroup(Unmask->getParent()->getParent()) != FG || + ui->getOperandNo() != Unmask->getNumArgOperands())) + continue; + lowerSimdCF = true; + // We have a unmask (in our function group in the case of the late + EMVals.insert(SimpleValue(Unmask)); + } + auto RemaskFunc = GenXIntrinsic::getGenXDeclaration( + M, GenXIntrinsic::genx_simdcf_remask, Tys); + for (auto ui = RemaskFunc->use_begin(), ue = RemaskFunc->use_end(); ui != ue; + ++ui) { + auto Remask = dyn_cast(ui->getUser()); + if (!Remask) + continue; + if (FG && (FGA->getGroup(Remask->getParent()->getParent()) != FG || + ui->getOperandNo() != Remask->getNumArgOperands())) + continue; + lowerSimdCF = true; + // We have a remask (in our function group in the case of the late + // pass). Add the EM value (struct index 0) to EMVals. + EMVals.insert(SimpleValue(Remask)); + // Also add its EM input to EMVals, if not a constant. 
+ if (!isa(Remask->getOperand(0))) + EMVals.insert(SimpleValue(Remask->getOperand(0))); + } + // delete useless cm_unmask_begin and cm_unmask_end + auto UnmaskEF = GenXIntrinsic::getGenXDeclaration( + M, GenXIntrinsic::genx_unmask_end); + for (auto ui = UnmaskEF->use_begin(), ue = UnmaskEF->use_end(); ui != ue;) { + auto u = ui->getUser(); + ++ui; + if (auto UnmaskEnd = dyn_cast(u)) + UnmaskEnd->eraseFromParent(); + } + auto UnmaskBF = GenXIntrinsic::getGenXDeclaration( + M, GenXIntrinsic::genx_unmask_begin); + for (auto ui = UnmaskBF->use_begin(), ue = UnmaskBF->use_end(); ui != ue;) { + auto u = ui->getUser(); + ++ui; + if (auto UnmaskBeg = dyn_cast(u)) + UnmaskBeg->eraseFromParent(); + } + // Find related phi nodes and values related by insertvalue/extractvalue/call + // using EMVal as a worklist. + for (unsigned i = 0; i != EMVals.size(); ++i) { + SimpleValue EMVal = EMVals[i]; + // For this EM value, get the connected values. + SmallVector ConnectedVals; + getConnectedVals(EMVal, RegCategory::EM, /*IncludeOptional=*/true, + /*OkJoin=*/nullptr, &ConnectedVals); + // Add the connected values to EMVals. + for (auto j = ConnectedVals.begin(), je = ConnectedVals.end(); + j != je; ++j) + if (!isa(j->getValue())) + EMVals.insert(*j); + } +} + +/*********************************************************************** + * gatherRMVals : gather RM values for each join + */ +void GenXSimdCFConformance::gatherRMVals() +{ + for (auto ji = EMVals.begin(), je = EMVals.end(); ji != je; ++ji) { + auto EMVal = *ji; + if (GenXIntrinsic::getGenXIntrinsicID(EMVal.getValue()) != GenXIntrinsic::genx_simdcf_join) + continue; + auto Join = cast(EMVal.getValue()); + // We have a join. Gather its web of RM values. + auto RMValsEntry = &RMVals[Join]; + if (!isa(Join->getOperand(1))) + RMValsEntry->insert(Join->getOperand(1)); + for (unsigned rvi = 0; rvi != RMValsEntry->size(); ++rvi) { + SimpleValue RM = (*RMValsEntry)[rvi]; + // RM is a value in this join's RM web. Get other values related by phi + // nodes and extractvalues and gotos. + SmallVector ConnectedVals; + getConnectedVals(RM, RegCategory::RM, /*IncludeOptional=*/true, + Join, &ConnectedVals); + for (auto j = ConnectedVals.begin(), je = ConnectedVals.end(); + j != je; ++j) + if (!isa(j->getValue())) + RMValsEntry->insert(*j); + } + } +} + +/*********************************************************************** + * findGotoJoinVal : find goto/join that should be applied at the + * specified location + * + * It uses dominator tree to find the value needed. Category is used to + * set proper name for instruction and doesn't affect reg category + * that is used in reg alloc. It only shows what we are dealing with. 
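+ *
+ * In outline: if Loc is dominated by the true edge the true-path value is
+ * returned, if it is dominated by the false edge the false-path value is
+ * returned; otherwise a phi is created at Loc's immediate dominator (or at
+ * Loc itself when the immediate dominator is the block defining the EV) and
+ * each predecessor is resolved recursively, with foundVals memoizing blocks
+ * already handled so that cycles terminate.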
+ */ +Value *GenXSimdCFConformance::findGotoJoinVal(int Cat, BasicBlock *Loc, Instruction *GotoJoinEV, + BasicBlockEdge &TrueEdge, BasicBlockEdge &FalseEdge, Value *TrueVal, Value *FalseVal, std::map& foundVals) +{ + assert(TrueEdge.getStart() == FalseEdge.getStart()); + assert(TrueEdge.getEnd() != FalseEdge.getEnd()); + assert((Cat == RegCategory::EM || Cat == RegCategory::PREDICATE) && "Handling only EM and Cond!"); + + LLVM_DEBUG(dbgs() << "Entering " << Loc->getName() << "\n"); + + // Check if value were found before + auto ResIt = foundVals.find(Loc); + if (ResIt != foundVals.end()) + return ResIt->second; + + DominatorTree *DomTree = getDomTree(Loc->getParent()); + if (DomTree->dominates(TrueEdge, Loc)) { + LLVM_DEBUG(dbgs() << "Dominated by True Edge\n"); + foundVals[Loc] = TrueVal;; + return TrueVal; + } + if (DomTree->dominates(FalseEdge, Loc)) { + LLVM_DEBUG(dbgs() << "Dominated by False Edge\n"); + foundVals[Loc] = FalseVal; + return FalseVal; + } + + // Need to create phi somewhere. + // Try to get IDom. If we found CondEV's BB then we are + // already in the final block + auto Node = DomTree->getNode(Loc); + auto IDom = Node->getIDom(); + assert(IDom && "No IDom found!"); + BasicBlock *PhiLoc = nullptr; + PhiLoc = IDom->getBlock(); + if (IDom->getBlock() == GotoJoinEV->getParent()) + PhiLoc = Loc; + + std::string Name = (Cat == RegCategory::EM) ? "ExecMaskEV" : "CondEV"; + auto PHI = PHINode::Create(GotoJoinEV->getType(), pred_size(PhiLoc), Name, &PhiLoc->front()); + foundVals[PhiLoc] = PHI; + if (PhiLoc != Loc) + foundVals[Loc] = PHI; + + for (auto pi = pred_begin(PhiLoc), pe = pred_end(PhiLoc); pi != pe; ++pi) { + BasicBlock *Pred = *pi; + Value *Val = nullptr; + + // Don't check dominators for def since we are looking for + // edges that are located after it + if (Pred == TrueEdge.getStart()) { + // This happens when we enter def block from join block + // w/o any intermediate blocks (actually we expect this + // situation to happen always). Check that we came through + // true branch. + if (Pred->getTerminator()->getSuccessor(0) == PhiLoc) { + Val = TrueVal; + LLVM_DEBUG(dbgs() << "Usual case\n"); + } else { + // This situation shouldn't happen, but if so, we can handle it + Val = FalseVal; + LLVM_DEBUG(dbgs() << "Strange case\n"); + } + } else { + Val = findGotoJoinVal(Cat, Pred, GotoJoinEV, TrueEdge, FalseEdge, TrueVal, FalseVal, foundVals); + } + + PHI->addIncoming(Val, Pred); + } + + LLVM_DEBUG(dbgs() << "Built PHI for EV:" << *PHI << "\n"); + return PHI; +} + +/** + * collectCondEVUsers : gather Cond EV users + * + * Bad users: they should not use cond EV. + * Correct user: conditional branch CondEV's BB. This + * is the only possible conformant user. + */ +void GenXSimdCFConformance::collectCondEVUsers(ExtractValueInst *CondEV, std::vector &BadUsers, BranchInst *&CorrectUser) +{ + // Bad users: they should not use cond EV. 
Make a real value for them + // Correct user: conditional branch in this BB + for (auto ui = CondEV->use_begin(), ue = CondEV->use_end(); ui != ue; ++ui) { + BranchInst *Br = dyn_cast(ui->getUser()); + + // If cond EV is used by wrong branch, we can simply consider + // it as non-baled conditional branch + if (!Br || Br->getParent() != CondEV->getParent()) { + LLVM_DEBUG(dbgs() << "Found bad CondEV user:\n" << *ui->getUser() << "\n"); + BadUsers.push_back(ui->getUser()); + } else if (Br) { + assert(!CorrectUser && "Found another correct user!"); + LLVM_DEBUG(dbgs() << "Found correct user:\n" << *Br << "\n"); + CorrectUser = Br; + } + } +} + +/** + * updateBadCondEVUsers : update bad cond EV users + * + * It replaces cond EV uses by values that can be + * obtained on true and false pathes + */ +void GenXSimdCFConformance::updateBadCondEVUsers(GenXSimdCFConformance::GotoJoinEVs &GotoJoinData, + std::vector &BadUsers, BasicBlock *TrueSucc, BasicBlock *FalseSucc) +{ + ExtractValueInst *CondEV = GotoJoinData.getCondEV(); + assert(CondEV && "Expected valid CondEV!"); + + BasicBlockEdge TrueEdge(CondEV->getParent(), TrueSucc); + BasicBlockEdge FalseEdge(CondEV->getParent(), FalseSucc); + Constant *TrueVal = Constant::getAllOnesValue(CondEV->getType()); + Constant *FalseVal = Constant::getNullValue(CondEV->getType()); + + // Update users + std::map FoundCondEV; + for (auto bi = BadUsers.begin(), be = BadUsers.end(); bi != be; ++bi) { + Instruction *User = cast(*bi); + for (unsigned idx = 0, opNum = User->getNumOperands(); idx < opNum; ++idx) { + if (CondEV != User->getOperand(idx)) + continue; + + User->setOperand(idx, findGotoJoinVal(RegCategory::PREDICATE, User->getParent(), CondEV, TrueEdge, FalseEdge, TrueVal, FalseVal, FoundCondEV)); + } + } +} + +/** + * addNewPhisIncomings : add new incomings after split + * + * It is needed to update phis after turning unconditional + * branch into conditional one. True successor is assumed to + * be correct join point, but the only thing we know here + * is that FalseSucc branches to TrueSucc. Branching Block's + * successors are TrueSucc and FalseSucc. + */ +void GenXSimdCFConformance::addNewPhisIncomings(BasicBlock *BranchingBlock, BasicBlock *TrueSucc, BasicBlock *FalseSucc) +{ + for (auto Inst = &TrueSucc->front(); + auto PN = dyn_cast(Inst); + Inst = Inst->getNextNode()) { + Value* CurrVal = PN->getIncomingValueForBlock(BranchingBlock); + PN->addIncoming(CurrVal, FalseSucc); + } +} + +/** + * handleNoCondEVCase : handle case when there is no + * CondEV for goto/join. + * + * It performs split for goto in order to prepare + * goto for possible EM lower. Goto is branch itself + * so such transformation doesn't introduce any + * overhead in case of conformant SIMD CF. + * + * TODO: this transformation can be reverted in case of + * non-conformant SIMD CF if necessary data was saved. + * It is not done now because no non-conformant cases + * were found so far. 
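+ *
+ * For example, a goto whose !any result was never extracted,
+ *
+ *   %gt = call ... @llvm.genx.simdcf.goto(...)   ; names/types illustrative
+ *   br label %next
+ *
+ * becomes
+ *
+ *   %gt = call ... @llvm.genx.simdcf.goto(...)
+ *   %missing_extractcond = extractvalue ... %gt, 2
+ *   br i1 %missing_extractcond, label %next, label %goto_split
+ * goto_split:
+ *   br label %next
+ *
+ * so every goto ends up with an explicit branch on its !any result, which
+ * later handling (e.g. hoistGotoUser) relies on.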
+ */ +void GenXSimdCFConformance::handleNoCondEVCase(GenXSimdCFConformance::GotoJoinEVs &GotoJoinData) +{ + assert(!GotoJoinData.getCondEV() && "Unexpected CondEV!"); + + // Handle only goto + if (GotoJoinData.isJoin()) + return; + auto SplitPoint = GotoJoinData.getSplitPoint(); + + // Skip possible goto users + for (;; SplitPoint = SplitPoint->getNextNode()) { + if (SplitPoint->isTerminator()) + break; + if (auto CI = dyn_cast(SplitPoint)) { + // We need to perform split before next goto/join to save their conformance + if (GenXIntrinsic::getGenXIntrinsicID(CI) == GenXIntrinsic::genx_simdcf_goto || + GenXIntrinsic::getGenXIntrinsicID(CI) == GenXIntrinsic::genx_simdcf_join) + break; + } + } + + Value *GotoJoin = GotoJoinData.getGotoJoin(); + ExtractValueInst *CondEV = ExtractValueInst::Create(GotoJoin, { 2 }, "missing_extractcond", SplitPoint); + GotoJoinData.setCondEV(CondEV); + + if (auto Br = dyn_cast(SplitPoint)) { + if (Br->isConditional()) { + // This CF is non-conformant: there should be a join point + // before this branch, but it wasn't found. Skip it. + return; + } + + // We are turning unconditional branch into conditional one + BasicBlock *Split = BasicBlock::Create(CondEV->getContext(), "goto_split", CondEV->getParent()->getParent(), Br->getSuccessor(0)); + BranchInst::Create(Br->getSuccessor(0), Split); + BranchInst::Create(Br->getSuccessor(0), Split, CondEV, Br); + + // Update phis in TrueSucc + addNewPhisIncomings(CondEV->getParent(), Br->getSuccessor(0), Split); + + Br->eraseFromParent(); + } else { + // Split point is in the middle of BB. We assume that there is a join point + // after it. + // TODO: consider adding this check. No such cases were found now. + BasicBlock *TrueSucc = CondEV->getParent()->splitBasicBlock(SplitPoint, "cond_ev_true_split"); + CondEV->getParent()->getTerminator()->eraseFromParent(); + LLVM_DEBUG(dbgs() << "Created " << TrueSucc->getName() << " to handle missing conditional branch\n"); + + // False block: need to create new one + BasicBlock *FalseSucc = BasicBlock::Create(CondEV->getContext(), "cond_ev_false_split", CondEV->getParent()->getParent(), + TrueSucc); + LLVM_DEBUG(dbgs() << "Created " << FalseSucc->getName() << " to handle missing conditional branch\n"); + + // Link blocks + BranchInst::Create(TrueSucc, FalseSucc, CondEV, CondEV->getParent()); + BranchInst::Create(TrueSucc, FalseSucc); + } + + // CFG changed: update DomTree. + // TODO: there must be workaround to do it in a more optimal way + DominatorTree *domTree = getDomTree(CondEV->getParent()->getParent()); + domTree->recalculate(*CondEV->getParent()->getParent()); +} + +/** + * handleOptimizedBranchCase : perform split for optimized branch case + * + * TODO: this make sence only in case when the true successor is a + * join block, otherwise it will introduce more overhead due to + * goto/join lowering. Also there should be check that this + * join really uses current EM and RM. This issue is resolved + * at the end of this pass in EM/RM liveness analysis and cannot + * be done easy at this point. For now assume that everything OK + * with it here. + * + * TODO: It is possible to undo this transformation if we store + * all necessery data here. Currently it is not done: + * no non-conformant cases found for now. + * + * Due to earlier transformations we can split BB after the last + * goto/join EV. It will solve issue with join located in this + * basic block. Code movements to sink goto/join will be performed + * further, we don't need to focus on it here. 
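+ *
+ * Concretely: the block is split right after the last goto/join EV into a
+ * "cond_ev_true_split" successor, a fresh "cond_ev_false_split" block that
+ * simply branches to that successor is created, and the original block is
+ * re-terminated with a conditional branch on CondEV to the two new blocks.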
+ */ +void GenXSimdCFConformance::handleOptimizedBranchCase(GenXSimdCFConformance::GotoJoinEVs &GotoJoinData, BasicBlock *&TrueSucc, BasicBlock *&FalseSucc) +{ + // Look for the first non-goto/join user inst + auto SplitPoint = GotoJoinData.getSplitPoint(); + + ExtractValueInst *CondEV = GotoJoinData.getCondEV(); + assert(CondEV && "Expected valid CondEV!"); + + // Split: this is true succ which is join point (at least we assume that) + TrueSucc = CondEV->getParent()->splitBasicBlock(SplitPoint, "cond_ev_true_split"); + LLVM_DEBUG(dbgs() << "Created " << TrueSucc->getName() << " to handle missing conditional branch\n"); + CondEV->getParent()->getTerminator()->eraseFromParent(); + // False block: need to create new one + FalseSucc = BasicBlock::Create(CondEV->getContext(), "cond_ev_false_split", CondEV->getParent()->getParent(), + TrueSucc); + LLVM_DEBUG(dbgs() << "Created " << FalseSucc->getName() << " to handle missing conditional branch\n"); + // Link blocks + BranchInst::Create(TrueSucc, FalseSucc, CondEV, CondEV->getParent()); + BranchInst::Create(TrueSucc, FalseSucc); + + // CFG changed: update DomTree. + // TODO: there must be workaround to do it in a more optimal way + DominatorTree *domTree = getDomTree(CondEV->getParent()->getParent()); + domTree->recalculate(*CondEV->getParent()->getParent()); +} + +/** + * handleExistingBranchCase : perform actions needed to + * handle case when branch wasn't optimized + * + * It stores True/False successors and adds new BB + * in case when both successors are the same BB. + */ +void GenXSimdCFConformance::handleExistingBranchCase(GenXSimdCFConformance::GotoJoinEVs &GotoJoinData, + BasicBlock *&TrueSucc, BasicBlock *&FalseSucc, BranchInst *ExistingBranch) +{ + ExtractValueInst *CondEV = GotoJoinData.getCondEV(); + assert(CondEV && "Expected valid CondEV!"); + assert(ExistingBranch->isConditional() && "Expected conditional branch!"); + + TrueSucc = ExistingBranch->getSuccessor(0); + FalseSucc = ExistingBranch->getSuccessor(1); + + if (TrueSucc == FalseSucc) { + // We need to simply introduce new BB to get CondEV + FalseSucc = BasicBlock::Create(CondEV->getContext(), "cond_ev_split", CondEV->getParent()->getParent(), + TrueSucc); + BranchInst::Create(TrueSucc, FalseSucc); + ExistingBranch->setSuccessor(1, FalseSucc); + + LLVM_DEBUG(dbgs() << "Created " << FalseSucc->getName() << " to handle always taken CONDITIONAL branch\n"); + + // Update phis in TrueSucc + addNewPhisIncomings(CondEV->getParent(), TrueSucc, FalseSucc); + + // CFG changed: update DomTree. + // TODO: there must be workaround to do it in a more optimal way + DominatorTree *domTree = getDomTree(CondEV->getParent()->getParent()); + domTree->recalculate(*CondEV->getParent()->getParent()); + } +} + +/** + * handleCondValue : perform analysis on Cond EV usage and fix + * it if needed + * + * The basic use case is optimized False Successor. That + * often happens in standard SimplifyCFG pass. + */ +void GenXSimdCFConformance::handleCondValue(Value *GotoJoin) +{ + GotoJoinEVs &GotoJoinData = GotoJoinEVsMap[GotoJoin]; + ExtractValueInst *CondEV = GotoJoinData.getCondEV(); + + // No cond EV - nothing to handle. Here we create branch for goto + // to make it easier to handle possible bad EM users. 
Goto is a + // branch itself and it won't introduce any overhead in case + // of conformant SIMD CF + if (!CondEV) { + handleNoCondEVCase(GotoJoinData); + return; + } + + // Collect Cond EV users + std::vector BadUsers; + BranchInst *CorrectUser = nullptr; + collectCondEVUsers(CondEV, BadUsers, CorrectUser); + + // Nothing needs to be fixed. However, allow this algorithm to fix + // case with TrueSucc == FalseSucc for goto in order to simplify further + // analysis. + if (BadUsers.empty() && GotoJoinData.isJoin()) + return; + + BasicBlock *TrueSucc = nullptr; + BasicBlock *FalseSucc = nullptr; + + if (!CorrectUser) { + // Branch was optimized by some pass. We need to create it again. + handleOptimizedBranchCase(GotoJoinData, TrueSucc, FalseSucc); + } else { + // Branch is still here. Perform actions needed. + handleExistingBranchCase(GotoJoinData, TrueSucc, FalseSucc, CorrectUser); + } + + // Update users + updateBadCondEVUsers(GotoJoinData, BadUsers, TrueSucc, FalseSucc); +} + +/*********************************************************************** + * splitGotoJoinBlocks : split Basic Blocks that contains goto/join + * + * This is used to solve problems that can be introduced by some + * standard LLVM passes: one of them is simplified CFG that lead to + * goto/join's condition usage by non-branch instruction. After this + * transformation each BB will contain only one goto or join instruction + * (or none of them), that fact allows us to make further changes simplier. + */ +void GenXSimdCFConformance::splitGotoJoinBlocks() { + + LLVM_DEBUG(dbgs() << "Splitting GotoJoin Blocks\n"); + + for (auto &Elem : GotoJoinEVsMap) { + + Value *GotoJoin = Elem.first; + auto &GotoJoinData = Elem.second; + + LLVM_DEBUG(dbgs() << "Trying to split BB for:\n" << *GotoJoin << "\n"); + + handleCondValue(GotoJoin); + + if (GotoJoinData.isJoin()) { + auto SplitPoint = GotoJoinData.getSplitPoint(); + if (SplitPoint->isTerminator()) + continue; + SplitPoint->getParent()->splitBasicBlock(SplitPoint, "split_for_join"); + // CFG changed: update DomTree. + // TODO: there must be workaround to do it in a more optimal way + DominatorTree *domTree = getDomTree(SplitPoint->getParent()->getParent()); + domTree->recalculate(*SplitPoint->getParent()->getParent()); + } + } + + LLVM_DEBUG(dbgs() << "Done splitting\n\n" << *M << "\n\n"); +} + +/*********************************************************************** + * removeFromEMRMVals : remove a value from EMVals or RMVals + * + * This is used just before erasing a phi node in moveCodeInJoinBlocks. + */ +void GenXSimdCFConformance::removeFromEMRMVals(Value *V) +{ + auto VT = dyn_cast(V->getType()); + if (!VT || !VT->getElementType()->isIntegerTy(1)) + return; + if (EMVals.remove(SimpleValue(V, 0))) + return; + for (auto i = RMVals.begin(), e = RMVals.end(); i != e; ++i) { + auto RMValsEntry = &i->second; + if (RMValsEntry->remove(SimpleValue(V, 0))) + return; + } +} + +/*********************************************************************** + * hoistGotoUser : hoist instruction that uses goto's EV and is located + * after it in the same basic block. + * + * Since goto must be at the end of basic block, we have to solve + * this problem somehow. Current approach is to duplicate instruction + * on both paths (true and false) and update uses. + * + * It is always possible to perform such transformation even if there + * is a chain of users: we just can duplicate them all. 
Since we know + * all values on the true pass, it should be possible to perform full + * calculation in this case. However, it is not done now because it can + * lead to much worse code when SIMD CF is not conformant (we are not + * sure that it is conformant at this point). + */ +bool GenXSimdCFConformance::hoistGotoUser(Instruction *Inst, CallInst *Goto, unsigned operandNo) +{ + // Find branch for goto + ExtractValueInst *CondEV = GotoJoinEVsMap[Goto].getCondEV(); + auto BrIt = std::find_if(CondEV->use_begin(), CondEV->use_end(), + [&Goto](const Use& u) { + auto Br = dyn_cast(u.getUser()); + return (Br && Br->getParent() == Goto->getParent() && Br->isConditional()); + }); + assert(BrIt != CondEV->use_end() && "All gotos should become branching earlier!"); + + BranchInst *Br = cast(BrIt->getUser()); + BasicBlock *TrueSucc = Br->getSuccessor(0); + BasicBlock *FalseSucc = Br->getSuccessor(1); + + // Handle FallThrough block with phis. + // + // TODO: it is redundant in some cases. For example, there can be Phi that + // uses bitcasts from EM from two paths. In this case we can use one + // GetEM from Phi with EM. Currently there is no trivial mechanism + // to check for that because in this case Phi arguments are supposed to use + // different Exectution Masks according to DF. + // + // Temporary solution for that is to place a splitter block that branches to + // such bb directly. Examples of that case can be found in local-atomics + // tests in ISPC. + if (isa(&FalseSucc->front())) { + BasicBlock *Splitter = BasicBlock::Create(FalseSucc->getContext(), "phi_fallthrough_splitter", FalseSucc->getParent()); + Splitter->moveAfter(Goto->getParent()); + BranchInst::Create(FalseSucc, Splitter); + Br->setSuccessor(1, Splitter); + // Update phis + for (auto CurrInst = &FalseSucc->front(); + auto PN = dyn_cast(CurrInst); + CurrInst = CurrInst->getNextNode()) { + for (unsigned idx = 0, num = PN->getNumIncomingValues(); idx < num; ++idx) { + if (PN->getIncomingBlock(idx) == Goto->getParent()) + PN->setIncomingBlock(idx, Splitter); + } + } + FalseSucc = Splitter; + // CFG changed: update DomTree. + // TODO: there must be workaround to do it in a more optimal way + DominatorTree *domTree = getDomTree(CondEV->getParent()->getParent()); + domTree->recalculate(*CondEV->getParent()->getParent()); + } + + // Copy instruction and set the value for true block. Place it before goto. + Instruction *TrueVal = Inst->clone(); + TrueVal->insertBefore(Goto); + TrueVal->setOperand(operandNo, Constant::getNullValue(Inst->getOperand(operandNo)->getType())); + + // Copy instruction and place it in the false successor. Get EM will be + // created later to handle its goto use. + Instruction *FalseVal = Inst->clone(); + FalseVal->insertBefore(FalseSucc->getFirstNonPHI()); + + // Handle all users + BasicBlockEdge TrueEdge(Goto->getParent(), TrueSucc); + BasicBlockEdge FalseEdge(Goto->getParent(), FalseSucc); + std::map foundVals; + std::vector newOperands; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto User = dyn_cast(ui->getUser()); + // TODO: it can be solved with duplicated instructions. + // Currently we are not going to duplicate them. 
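+    // A user in the goto's own block would itself have to be duplicated along
+    // both paths, so in that case the two clones made above are erased and
+    // hoisting is abandoned.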
+ if (User->getParent() == Inst->getParent()) { + TrueVal->eraseFromParent(); + FalseVal->eraseFromParent(); + return false; + } + + BasicBlock *Loc = User->getParent(); + if (auto PN = dyn_cast(User)) + Loc = PN->getIncomingBlock(ui->getOperandNo()); + + // Store new value + Value *NewOperand = nullptr; + if (Loc == Goto->getParent()) + NewOperand = TrueVal; + else + NewOperand = findGotoJoinVal(RegCategory::EM, Loc, Inst, TrueEdge, FalseEdge, + TrueVal, FalseVal, foundVals); + + newOperands.push_back(NewOperand); + } + + // Update uses + unsigned i = 0; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue;) { + auto User = dyn_cast(ui->getUser()); + unsigned opNo = ui->getOperandNo(); + ++ui; + User->setOperand(opNo, newOperands[i++]); + } + + return true; +} + +/*********************************************************************** + * moveCodeInGotoBlocks : move code in goto blocks + * + * A goto and its extractvalues must be at the end of the block. (Actually, if + * the !any result of the goto is used in a conditional branch at the end of + * the block, then the goto being baled into the branch means that it is + * treated as being at the end of the block anyway. The only reason we need to + * sink it here is to ensure that isGotoBlock works.) + * + * This can silently fail to sink a goto, in which case checkGoto will spot that + * the goto is not conformant. + */ +void GenXSimdCFConformance::moveCodeInGotoBlocks(bool hoistGotoUsers) +{ + for (auto gi = EMVals.begin(), ge = EMVals.end(); gi != ge; ++gi) { + auto EMVal = *gi; + if (GenXIntrinsic::getGenXIntrinsicID(EMVal.getValue()) != GenXIntrinsic::genx_simdcf_goto) + continue; + auto Goto = cast(EMVal.getValue()); + // We want to sink the goto and its extracts. In fact we hoist any other + // instruction, checking that it does not use the extracts. + // With hoistGotoUsers, we are trying to hoist them, too. + // We are skipping all instructions that use skipped instructions + // in order to save dominance. + std::set Skipping; + for (Instruction *NextInst = Goto->getNextNode();;) { + auto Inst = NextInst; + if (Inst->isTerminator()) + break; + assert(Inst); + NextInst = Inst->getNextNode(); + if (auto Extract = dyn_cast(Inst)) + if (Extract->getOperand(0) == Goto) + continue; + bool Failed = false; + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) { + if (auto I = dyn_cast(Inst->getOperand(oi))) + if (Skipping.count(I)) { + LLVM_DEBUG(dbgs() << "Skipping " << Inst->getName() << " due to use of skipped inst\n"); + Skipping.insert(Inst); + Failed = true; + break; + } + if (auto Extract = dyn_cast(Inst->getOperand(oi))) + if (Extract->getOperand(0) == Goto) { + // This is used after splitting basic blocks. + // To perform this all gotos must be branching since EM + // is changed by goto. + if (hoistGotoUsers && hoistGotoUser(Inst, Goto, oi)) { + continue; + } + LLVM_DEBUG(dbgs() << "moveCodeInGotoBlocks: " << Goto->getName() << " failed\n"); + LLVM_DEBUG(dbgs() << "Could not hoist " << Inst->getName() << "\n"); + Failed = true; + Skipping.insert(Inst); + break; // Intervening instruction uses extract of goto; abandon + } + } + if (Failed) + continue; + if (Inst->getNumUses() == 0) { + Inst->eraseFromParent(); + continue; + } + // Hoist the instruction. + Inst->removeFromParent(); + Inst->insertBefore(Goto); + } + } +} + +/*********************************************************************** + * moveCodeInJoinBlocks : move code in join blocks as necessary + * + * 1. 
For a join label block (a block that is the JIP of other gotos/joins), a + * join must come at the start of the block. + * + * 2. For a branching join block (one whose conditional branch condition is the + * !any result from a join), the join must be at the end of the block. + * + * 3. For a block that has one join with both of the above true, we need to move + * all other code out of the block. + * + * We achieve this as follows: + * + * a. First handle case 3. For any such block, hoist any other code to the end + * of its immediate dominator. To allow for the immediate dominator also + * being a case 3 join, we process blocks in post-order depth first search + * order, so we visit a block before its dominator. Thus code from a case 3 + * join block eventually gets moved up to its closest dominating block that + * is not a case 3 join block. + * + * Because it is more convenient and does not hurt, we also hoist the code + * before the first join in a block that initially looks like it is case 3, + * even if it then turns out not to be a case 3 join because it has multiple + * joins. + * + * b. Then scan all joins handling case 1. + * + * c. No need to handle case 2 here, as it (together with a similar requirement + * to sink a goto in a branching goto block) is checked in checkConformance + * and treated as sunk subsequently by virtue of getting baled in to the + * branch. + * + * This happens in both SIMD CF conformance passes, in case constant loading + * etc sneaks code back into the wrong place in a join block. Any pass after + * the late SIMD CF conformance pass needs to be careful not to sneak code back + * into a join block. + * + * Any failure to do the above is not flagged here, but it will be spotted when + * checking the join for conformance. + * + * moveCodeInGotoBlocks needs to run first, as we rely on its sinking of an + * unconditional branch goto for isBranchingGotoJoinBlock to work. + */ +void GenXSimdCFConformance::moveCodeInJoinBlocks() +{ + // a. Handle case 3 join blocks. + if (!FG) { + // Early pass: iterate all funcs in the module. + for (auto mi = M->begin(), me = M->end(); mi != me; ++mi) { + Function *F = &*mi; + if (!F->empty()) + emptyBranchingJoinBlocksInFunc(F); + } + } else { + // Late pass: iterate all funcs in the function group. + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + emptyBranchingJoinBlocksInFunc(F); + } + } + // b. Process all other joins (in fact all joins, but ones successfully + // processed above will not need anything doing). + // Get the joins into a vector first, because the code below modifies EMVals. + SmallVector Joins; + for (auto ji = EMVals.begin(), je = EMVals.end(); ji != je; ++ji) { + auto EMVal = *ji; + if (GenXIntrinsic::getGenXIntrinsicID(EMVal.getValue()) != GenXIntrinsic::genx_simdcf_join) + continue; + Joins.push_back(cast(EMVal.getValue())); + } + for (auto ji = Joins.begin(), je = Joins.end(); ji != je; ++ji) { + auto Join = *ji; + auto JoinBlock = Join->getParent(); + if (GotoJoin::isJoinLabel(JoinBlock, /*SkipCriticalEdgeSplitter=*/true)) + hoistJoin(Join); + else { + // The join is in a block that is not a join label. Also check the case + // that there is a predecessor that: + // 1. has one successor; and + // 2. is empty other than phi nodes; and + // 3. is a join label. + // In that case we merge the two blocks, merging phi nodes. + // I have seen this situation arise where LLVM decides to add a loop + // pre-header block. 
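+      // A simplified illustration (block and value names made up):
+      //
+      //   preheader:                         ; empty apart from the branch
+      //     br label %join_bb
+      //   join_bb:
+      //     %v = phi [ %a, %preheader ], [ %b, %latch ]
+      //     %j = call @llvm.genx.simdcf.join(...)
+      //
+      // is merged into
+      //
+      //   join_bb:
+      //     %v = phi [ %a, %entry ], [ %b, %latch ]
+      //     %j = call @llvm.genx.simdcf.join(...)
+      //
+      // where %entry is preheader's own predecessor.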
+ BasicBlock *PredBlock = nullptr; + for (auto ui = JoinBlock->use_begin(), ue = JoinBlock->use_end(); ui != ue; ++ui) { + auto Br = dyn_cast(ui->getUser()); + if (!Br || Br->isConditional()) + continue; + auto BB = Br->getParent(); + if (BB->getFirstNonPHIOrDbg() != Br) + continue; + if (GotoJoin::isJoinLabel(BB, /*SkipCriticalEdgeSplitter=*/true)) { + PredBlock = BB; + break; + } + } + if (PredBlock) { + // We have such a predecessor block. First hoist the join in our block. + if (hoistJoin(Join)) { + // Join hoisting succeeded. Now merge the blocks. + LLVM_DEBUG(dbgs() << "moveCodeInJoinBlocks: merging " << PredBlock->getName() + << " into " << JoinBlock->getName() << "\n"); + // First adjust the phi nodes to include both blocks' incomings. + for (auto Phi = dyn_cast(&JoinBlock->front()); Phi; + Phi = dyn_cast(Phi->getNextNode())) { + int Idx = Phi->getBasicBlockIndex(PredBlock); + if (Idx >= 0) { + Value *Incoming = Phi->getIncomingValue(Idx); + auto PredPhi = dyn_cast(Incoming); + if (PredPhi && PredPhi->getParent() != PredBlock) + PredPhi = nullptr; + if (PredPhi) { + // The incoming in JoinBlock is a phi node in PredBlock. Add its + // incomings. + Phi->removeIncomingValue(Idx, /*DeletePHIIfEmpty=*/false); + for (unsigned oi = 0, oe = PredPhi->getNumIncomingValues(); + oi != oe; ++oi) + Phi->addIncoming(PredPhi->getIncomingValue(oi), + PredPhi->getIncomingBlock(oi)); + } else { + // Otherwise, add the predecessors of PredBlock to the phi node + // in JoinBlock. + for (auto ui2 = PredBlock->use_begin(), + ue2 = PredBlock->use_end(); ui2 != ue2; ++ui2) { + Instruction *Term = dyn_cast(ui2->getUser()); + assert(Term); + if (Term->isTerminator()) { + auto PredPred = Term->getParent(); + if (Idx >= 0) { + Phi->setIncomingBlock(Idx, PredPred); + Idx = -1; + } else + Phi->addIncoming(Incoming, PredPred); + } + } + } + } + } + // Any phi in PredBlock that is not used in a phi in JoinBlock (and + // so still has at least one use after the code above) needs to be + // moved to JoinBlock, with itself added as the extra incomings. The + // incoming blocks to JoinBlock other than PredBlock must be loop + // back edges. + for (;;) { + auto Phi = dyn_cast(&PredBlock->front()); + if (!Phi) + break; + if (Phi->use_empty()) { + removeFromEMRMVals(Phi); + Phi->eraseFromParent(); + continue; + } + for (auto ui = JoinBlock->use_begin(), ue = JoinBlock->use_end(); + ui != ue; ++ui) { + auto Term = dyn_cast(ui->getUser()); + assert(Term); + if (!Term->isTerminator()) + continue; + auto TermBB = Term->getParent(); + if (TermBB == PredBlock) + continue; + Phi->addIncoming(Phi, TermBB); + } + Phi->removeFromParent(); + Phi->insertBefore(&JoinBlock->front()); + } + // Adjust branches targeting PredBlock to target JoinBlock instead. + PredBlock->replaceAllUsesWith(JoinBlock); + // Remove PredBlock. + PredBlock->eraseFromParent(); + } + } + } + } +} + +/*********************************************************************** + * emptyBranchingJoinBlocksInFunc : empty other instructions out of each + * block in a function that is both a join label and a branching join block + * + * See comment for moveCodeInJoinBlocks above. 
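+ *
+ * Blocks are visited in post order so that, where such blocks are nested, an
+ * inner one is emptied before the block that dominates it and hoisted code
+ * can cascade upwards, as described in point (a) of that comment.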
+ */ +void GenXSimdCFConformance::emptyBranchingJoinBlocksInFunc(Function *F) +{ + for (auto i = po_begin(&F->getEntryBlock()), e = po_end(&F->getEntryBlock()); + i != e; ++i) { + BasicBlock *BB = *i; + CallInst *Join = GotoJoin::isBranchingJoinBlock(BB); + if (!Join) + continue; + emptyBranchingJoinBlock(Join); + } +} + +/*********************************************************************** + * emptyBranchingJoinBlock : empty instructions other than the join (and its + * extracts) from this branching join block + */ +void GenXSimdCFConformance::emptyBranchingJoinBlock(CallInst *Join) +{ + BasicBlock *BB = Join->getParent(); + Instruction *InsertBefore = nullptr; + for (Instruction *NextInst = BB->getFirstNonPHIOrDbg();;) { + auto Inst = NextInst; + if (Inst->isTerminator()) + break; + NextInst = Inst->getNextNode(); + if (Inst == Join) + continue; // do not hoist the join itself + if (GenXIntrinsic::getGenXIntrinsicID(Inst) == GenXIntrinsic::genx_simdcf_join) + break; // we have encountered another join; there must be more than one + if (auto EV = dyn_cast(Inst)) + if (EV->getOperand(0) == Join) + continue; // do not hoist an extract of the join + if (Inst->getNumUses() == 0) { + Inst->eraseFromParent(); + Modified = true; + continue; + } + // Check that the instruction's operands do not use anything in this block + // (the phi nodes, or the join and extracts being left behind). + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) { + auto Opnd = dyn_cast(Inst->getOperand(oi)); + if (Opnd && Opnd->getParent() == BB) { + LLVM_DEBUG(dbgs() << "Failed to empty branching join label for join " << Join->getName() << "\n"); + return; // Instruction uses something in this block: abandon. + } + } + if (!InsertBefore) { + // Lazy determination of the insert point. If it is a branching goto/join + // block, insert before the goto/join. + auto DomTree = getDomTree(BB->getParent()); + assert(DomTree); + auto BBNode = DomTree->getNode(BB); + assert(BBNode); + auto InsertBB = BBNode->getIDom()->getBlock(); + InsertBefore = GotoJoin::isBranchingGotoJoinBlock(InsertBB); + if (!InsertBefore) + InsertBefore = InsertBB->getTerminator(); + } + // Hoist the instruction. + Inst->removeFromParent(); + Inst->insertBefore(InsertBefore); + Modified = true; + } +} + +/*********************************************************************** + * getDomTree : get dominator tree for a function + */ +DominatorTree *GenXSimdCFConformance::getDomTree(Function *F) +{ + if (!DTWrapper) { + // In early pass, which is a module pass. + if (!DTs[F]) { + auto DT = new DominatorTree; + DT->recalculate(*F); + DTs[F] = DT; + } + return DTs[F]; + } + // In late pass, use the DominatorTreeGroupWrapper. + return DTWrapper->getDomTree(F); +} + +/*********************************************************************** + * hoistJoin : hoist a join to the top of its basic block if possible + * + * Return: whether succeeded + * + * This is used for a join in a block that is a join label, but not a branching + * join block. See comment for emptyJoinBlocks above. + * + * There might be multiple joins in the function, and the one supplied is not + * necessarily the first one. If it is a later one, this function will silently + * fail, which is harmless. If it silently fails for the first join, then we + * end up with a join label block that does not start with a join, which + * checkConformance will spot later on. 
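+ *
+ * In the successful case the effect is simply (illustrative):
+ *
+ *   before:  %p = phi ... ; %x = add ... ; %j = call @llvm.genx.simdcf.join(...)
+ *   after:   %p = phi ... ; %j = call @llvm.genx.simdcf.join(...) ; %x = add ...
+ *
+ * provided no operand of the join is defined by a non-phi instruction earlier
+ * in the block (an EV of a goto/join from another block is instead sunk to
+ * just after that goto/join).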
+ * + * This function does return whether it has succeeded, which is used in + * moveCodeInJoinBlocks in the case that it wants to merge a loop pre-header + * back into the join block. + */ +bool GenXSimdCFConformance::hoistJoin(CallInst *Join) +{ + // This only works if no operand of the join uses one of the instructions + // before it in the block, other than phi nodes. + // However, if we find such an instruction and it is an extractvalue from the + // result of an earlier goto/join in a different block, we can just move it + // to after that goto/join. + for (unsigned oi = 0, oe = Join->getNumArgOperands(); oi != oe; ++oi) { + auto Opnd = dyn_cast(Join->getOperand(oi)); + if (!Opnd || isa(Opnd)) + continue; + if (Opnd->getParent() == Join->getParent()) { + if (auto EV = dyn_cast(Opnd)) { + unsigned IID = GenXIntrinsic::getGenXIntrinsicID(EV->getOperand(0)); + if (IID == GenXIntrinsic::genx_simdcf_goto + || IID == GenXIntrinsic::genx_simdcf_join) { + auto GotoJoin = cast(EV->getOperand(0)); + if (GotoJoin->getParent() != Join->getParent()) { + LLVM_DEBUG(dbgs() << "moving out of join block: " << *EV << "\n"); + EV->removeFromParent(); + EV->insertBefore(GotoJoin->getNextNode()); + continue; + } + } + } + LLVM_DEBUG(dbgs() << "hoistJoin: " << Join->getName() << " failed\n"); + return false; // failed -- join uses non-phi instruction before it + } + } + // Hoist the join. + auto BB = Join->getParent(); + auto InsertBefore = BB->getFirstNonPHIOrDbg(); + if (InsertBefore == Join) + return true; // already at start + Join->removeFromParent(); + Join->insertBefore(InsertBefore); + GotoJoinEVsMap[Join].hoistEVs(); + Modified = true; + return true; +} + +/*********************************************************************** + * ensureConformance : check for conformance, and lower any non-conformant + * gotos and joins + */ +void GenXSimdCFConformance::ensureConformance() +{ + // Push all EM values onto the stack for checking. Push the joins last, since + // we want to process those before their corresponding gotos, so that + // GotoJoinMap is set for a goto by the time we process a valid goto. + for (auto i = EMVals.begin(), e = EMVals.end(); i != e; ++i) { + auto IID = GenXIntrinsic::getGenXIntrinsicID(i->getValue()); + if (IID != GenXIntrinsic::genx_simdcf_join && + IID != GenXIntrinsic::genx_simdcf_unmask && + IID != GenXIntrinsic::genx_simdcf_remask) + EMValsStack.insert(*i); + } + for (auto i = EMVals.begin(), e = EMVals.end(); i != e; ++i) { + auto IID = GenXIntrinsic::getGenXIntrinsicID(i->getValue()); + if (IID == GenXIntrinsic::genx_simdcf_join) + EMValsStack.insert(*i); + } // Process the stack. + SmallVector GotosToLower; + SmallVector JoinsToLower; + for (;;) { + if (!EMValsStack.empty()) { + // Remove and process the top entry on the stack. + auto EMVal = EMValsStack.back(); + EMValsStack.pop_back(); + if (checkEMVal(EMVal)) + continue; + removeBadEMVal(EMVal); + if (!EMVal.getIndex()) { + if (auto CI = dyn_cast(EMVal.getValue())) { + switch (GenXIntrinsic::getGenXIntrinsicID(EMVal.getValue())) { + case GenXIntrinsic::genx_simdcf_goto: + GotosToLower.push_back(CI); + break; + case GenXIntrinsic::genx_simdcf_join: + JoinsToLower.push_back(CI); + break; + default: + break; + } + } + } + continue; + } + // The stack is empty. Check for EM values interfering with each other. + checkEMInterference(); + if (EMValsStack.empty()) { + // Stack still empty; we have finished. 
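+      // (If checkEMInterference removed any interfering value,
+      // removeBadEMVal pushed its neighbours back onto the stack and we
+      // would go round the loop again instead of finishing.)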
+ break; + } + } + // In the late pass, we are not expecting to have found any non-conformant + // gotos and joins that need lowering. All such gotos and joins should have + // been identified in the early pass, unless passes in between have + // transformed the code in an unexpected way that has made the simd CF + // non-conformant. Give an error here if this has happened. + if (isLatePass() && (!GotosToLower.empty() || !JoinsToLower.empty())) + llvm_unreachable("unexpected non-conformant SIMD CF in late SIMD CF conformance pass"); + // Lower gotos and joins that turned out to be non-conformant. + for (auto i = GotosToLower.begin(), e = GotosToLower.end(); i != e; ++i) + lowerGoto(*i); + for (auto i = JoinsToLower.begin(), e = JoinsToLower.end(); i != e; ++i) + lowerJoin(*i); +} + +/*********************************************************************** + * getEMProducer : perform recurrent check for EM terms. + * + * It goes through all phis and bitcasts (when BitCastAllowed is true) + * and determines whether the EM is correct in DF terms. It doesn't + * check live range interference, but can spot non-conformant usage + * in case when EM from bad instruction is being used. + * + * This approach is used when we need to perform some actions on full + * EM data flow, for example, to insert phis when eliminating redundant + * bitcasts. + * + * All found EM producers are stored in EMProducers and can be used + * later without performing full search. + * + * TODO: currently returns User if it deals with EM. It is done in + * this way as workaround for possible future changes (for example, + * getConnectedVals refactor). The idea of such approach is to be + * able to update info if something changes. + */ +Value *GenXSimdCFConformance::getEMProducer(Value *User, std::set &Visited, bool BitCastAllowed) +{ + LLVM_DEBUG(dbgs() << "Looking for EM producer for value:\n" << *User << "\n"); + + if (Visited.count(User)) { + if (dyn_cast(User)) + return User; + return nullptr; + } + + // Check for previously found value + auto It = EMProducers.find(User); + if (It != EMProducers.end()) { + LLVM_DEBUG(dbgs() << "Using previously found value:\n" << *It->second << "\n"); + return It->second; + } + + if (auto C = dyn_cast(User)) { + // All one is considered as EM at entry point + if (C->isAllOnesValue()) { + LLVM_DEBUG(dbgs() << "EMProducer is an AllOne constant\n"); + EMProducers[C] = C; + return C; + } + } else if (auto PN = dyn_cast(User)) { + // For phi node, check all its preds. They all must be EMs + Visited.insert(PN); + for (unsigned idx = 0, opNo = PN->getNumOperands(); idx < opNo; ++idx) { + Value *Pred = PN->getOperand(idx); + + if (!getEMProducer(Pred, Visited, BitCastAllowed)) { + LLVM_DEBUG(dbgs() << "!!! Bad phi pred detected for:\n" << *PN << "\n"); + EMProducers[PN] = nullptr; + return nullptr; + } + } + + LLVM_DEBUG(dbgs() << "EMProducer is phi itself:\n" << *PN << "\n"); + EMProducers[PN] = PN; + return PN; + } else if (auto EVI = dyn_cast(User)) { + // Extract value can be an EV from goto/join or from callee that + // returned it. For the second case we check that the pred is + // still in EM values since it could be lowered. + CallInst *CI = dyn_cast(EVI->getOperand(0)); + if (CI) { + // Goto/join check + if (GenXIntrinsic::getGenXIntrinsicID(CI) == GenXIntrinsic::genx_simdcf_goto || + GenXIntrinsic::getGenXIntrinsicID(CI) == GenXIntrinsic::genx_simdcf_join) { + LLVM_DEBUG(dbgs() << "Reached goto/join\n"); + EMProducers[EVI] = EVI; + return EVI; + } + + // EV from other calls. 
+ if (EMVals.count(SimpleValue(CI, EVI->getIndices()[0]))) { + LLVM_DEBUG(dbgs() << "Value from return\n"); + EMProducers[EVI] = EVI; + return EVI; + } + } + } else if (auto Arg = dyn_cast(User)){ + // For argument we need to ensure that it is still in EM values + // since it could be lowered. + if (EMVals.count(SimpleValue(Arg, Arg->getArgNo()))) { + LLVM_DEBUG(dbgs() << "Input argument\n"); + EMProducers[Arg] = Arg; + return Arg; + } + } else if (auto IVI = dyn_cast(User)) { + // Insert value prepares structure for return. Check the + // value that is being inserted + Visited.insert(IVI); + if (auto EMProd = getEMProducer(IVI->getInsertedValueOperand(), Visited, BitCastAllowed)) { + LLVM_DEBUG(dbgs() << "Insert for return\n"); + EMProducers[IVI] = EMProd; + return IVI; + } + } else if (BitCastAllowed) { + if (auto BCI = dyn_cast(User)) { + // BitCast doesn't produce new EM. Just go through it. + Visited.insert(BCI); + if (auto EMProd = getEMProducer(BCI->getOperand(0), Visited, BitCastAllowed)) { + LLVM_DEBUG(dbgs() << "Bitcast from EM producer\n"); + EMProducers[BCI] = EMProd; + return BCI; + } + } + } + + // All other instructions cannot be treated as EM producers + LLVM_DEBUG(dbgs() << "!!! IT IS NOT A EM PRODUCER !!!\n"); + return nullptr; +} + +/*********************************************************************** + * lowerUnsuitableGetEMs : remove all unsuitable get_em intrinsics. + * + * This intrinsic is unsuitable if: + * - It uses constant value: it is simply redundant + * - The EM argument is not actually a EM: this may happen if + * SIMD CF was non-conformant and this EM was lowered. + */ +void GenXSimdCFConformance::lowerUnsuitableGetEMs() +{ + Type *I1Ty = Type::getInt1Ty(M->getContext()); + Function *GetEMDecl = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_simdcf_get_em, + { VectorType::get(I1Ty, 32) }); + for (auto ui = GetEMDecl->use_begin(); ui != GetEMDecl->use_end();) { + std::set Visited; + auto GetEM = dyn_cast(ui->getUser()); + ++ui; + auto GetEMPred = GetEM->getOperand(0); + + // Constants and non-EM values should be used directly + if (dyn_cast(GetEMPred) || !getEMProducer(dyn_cast(GetEMPred), Visited)) { + GetEM->replaceAllUsesWith(GetEM->getOperand(0)); + GetEM->eraseFromParent(); + } + } +} + +/*********************************************************************** + * lowerAllSimdCF : do NOT check for conformance, and simply lower + * all any gotos, joins, and unmasks + */ +void GenXSimdCFConformance::lowerAllSimdCF() +{ + for (auto i = EMVals.begin(), e = EMVals.end(); i != e; ++i) { + if (auto CI = dyn_cast(i->getValue())) { + auto IID = GenXIntrinsic::getGenXIntrinsicID(i->getValue()); + if (IID == GenXIntrinsic::genx_simdcf_join) + lowerJoin(CI); + else if (IID == GenXIntrinsic::genx_simdcf_goto) + lowerGoto(CI); + else if (IID == GenXIntrinsic::genx_simdcf_unmask) { + auto SaveMask = CI->getArgOperand(0); + if (auto CI0 = dyn_cast(SaveMask)) { + IRBuilder<> Builder(CI0); + auto Replace = Builder.CreateBitCast(CI0->getArgOperand(0), CI0->getType()); + CI0->replaceAllUsesWith(Replace); + CI0->eraseFromParent(); + } + IRBuilder<> Builder(CI); + auto Replace = Builder.CreateBitCast(CI->getArgOperand(1), CI->getType()); + CI->replaceAllUsesWith(Replace); + CI->eraseFromParent(); + } + else if (IID == GenXIntrinsic::genx_simdcf_remask) { + IRBuilder<> Builder(CI); + auto Replace = Builder.CreateBitCast(CI->getArgOperand(1), CI->getType()); + CI->replaceAllUsesWith(Replace); + CI->eraseFromParent(); + } + } + } +} + 
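+/***********************************************************************
+ * Note on lowerAllSimdCF above: gotos and joins go through
+ * lowerGoto/lowerJoin below, while unmask/remask (and any feeding
+ * savemask) collapse to plain bitcasts of their mask operand, e.g.
+ * (illustrative, operand types elided):
+ *
+ *   %s = call @llvm.genx.simdcf.savemask(%m)     -> bitcast %m
+ *   %e = call @llvm.genx.simdcf.unmask(%s, %v)   -> bitcast %v
+ *   %r = call @llvm.genx.simdcf.remask(%e, %v2)  -> bitcast %v2
+ */
+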
+/*********************************************************************** + * checkEMVal : check an EM value for conformance + * + * Return: true if ok, false if the EM value needs to be removed + */ +bool GenXSimdCFConformance::checkEMVal(SimpleValue EMVal) +{ + LLVM_DEBUG(dbgs() << "checkEMVal " << *EMVal.getValue() << "#" << EMVal.getIndex() << "\n"); + if (!EnableGenXGotoJoin) + return false; // use of goto/join disabled + SmallVector ConnectedVals; + // Check connected values. Do not lower bad users in Late Pass because + // current SIMD CF Conformance check approach expects that SIMD CF must + // be OK at this point if it wasn't lowered during Early Pass. + if (!getConnectedVals(EMVal, RegCategory::EM, /*IncludeOptional=*/true, + /*OkJoin=*/nullptr, &ConnectedVals, /*LowerBadUsers=*/!FG)) { + LLVM_DEBUG(dbgs() << "invalid def or uses\n"); + return false; // something invalid about the EM value itself + } + // Check that all connected values are EM values. + for (auto i = ConnectedVals.begin(), e = ConnectedVals.end(); i != e; ++i) { + SimpleValue ConnectedVal = *i; + if (auto C = dyn_cast(ConnectedVal.getValue())) { + if (!C->isAllOnesValue()) { + LLVM_DEBUG(dbgs() << "ConnectedVal is constant that is not all ones\n"); + return false; // uses constant that is not all ones, invalid + } + } else if (!EMVals.count(ConnectedVal)) { + LLVM_DEBUG(dbgs() << "ConnectedVal is not in EMVals\n"); + return false; // connected value is not in EMVals + } + } + switch (GenXIntrinsic::getGenXIntrinsicID(EMVal.getValue())) { + case GenXIntrinsic::genx_simdcf_goto: + return checkGoto(EMVal); + case GenXIntrinsic::genx_simdcf_join: + return checkJoin(EMVal); + default: + break; + } + return true; +} + +/*********************************************************************** + * checkGotoJoinSunk : check whether a goto/join is sunk to the bottom of + * its basic block, other than extractvalues from its result + */ +static bool checkGotoJoinSunk(CallInst *GotoJoin) +{ + for (Instruction *Inst = GotoJoin;;) { + Inst = Inst->getNextNode(); + if (Inst->isTerminator()) { + if (!isa(Inst)) + return false; + break; + } + auto EV = dyn_cast(Inst); + if (!EV || EV->getOperand(0) != GotoJoin) + return false; + } + return true; +} + +/*********************************************************************** + * checkGoto : check conformance of an actual goto instruction + */ +bool GenXSimdCFConformance::checkGoto(SimpleValue EMVal) +{ + if (!checkGotoJoin(EMVal)) + return false; + // Check that there is a linked join. (We do not need to check here that the + // linked join is an EM value; that happened in checkEMVal due to the join + // being treated as a linked value in getConnectedVals.) + auto Goto = cast(EMVal.getValue()); + if (!GotoJoinMap[Goto]) { + LLVM_DEBUG(dbgs() << "checkGoto: no linked join\n"); + return false; + } + // Check that the goto is sunk to the end of the block, other than extracts + // from its result, and a branch. moveCodeInGotoBlocks ensures that if + // possible; if that failed, this conformance check fails. + if (!checkGotoJoinSunk(Goto)) { + LLVM_DEBUG(dbgs() << "checkGoto: not sunk\n"); + return false; + } + return true; +} + +/*********************************************************************** + * checkJoin : check conformance of an actual join instruction + */ +bool GenXSimdCFConformance::checkJoin(SimpleValue EMVal) +{ + if (!checkGotoJoin(EMVal)) + return false; + // Check that the join is at the start of the block. 
emptyJoinBlock should + // have ensured this, unless the code was such that it could not. + auto Join = cast(EMVal.getValue()); + if (!GotoJoin::isValidJoin(Join)) { + LLVM_DEBUG(dbgs() << "not valid join\n"); + return false; + } + // If the !any result of this join is used in a conditional branch at the + // end, check that the join is sunk to the end of the block, other than + // extracts from its result, and a branch. moveCodeInJoinBlocks ensures that + // if possible; if that failed, this conformance check fails. + if (auto Br = dyn_cast(Join->getParent()->getTerminator())) + if (Br->isConditional()) + if (auto EV = dyn_cast(Br->getCondition())) + if (EV->getOperand(0) == Join) + if (!checkGotoJoinSunk(Join)) { + LLVM_DEBUG(dbgs() << "checkJoin: not sunk\n"); + return false; + } + // Gather the web of RM values. + auto RMValsEntry = &RMVals[Join]; + RMValsEntry->clear(); + LLVM_DEBUG(dbgs() << "gather web of RM vals for " << *Join << "\n"); + if (!isa(Join->getOperand(1))) + RMValsEntry->insert(Join->getOperand(1)); + for (unsigned rvi = 0; rvi != RMValsEntry->size(); ++rvi) { + SimpleValue RM = (*RMValsEntry)[rvi]; + // RM is a value in this join's RM web. Get other values related by phi + // nodes and extractvalues and gotos. + SmallVector ConnectedVals; + bool Ok = getConnectedVals(RM, RegCategory::RM, /*IncludeOptional=*/false, + Join, &ConnectedVals); + LLVM_DEBUG( + dbgs() << "getConnectedVals: " << RM.getValue()->getName() << "#" << RM.getIndex() << "\n"; + for (auto i = ConnectedVals.begin(), e = ConnectedVals.end(); i != e; ++i) + dbgs() << " " << i->getValue()->getName() << "#" << i->getIndex() << "\n" + ); + if (!Ok) { + LLVM_DEBUG(dbgs() << "illegal RM value in web\n"); + return false; + } + for (auto j = ConnectedVals.begin(), je = ConnectedVals.end(); + j != je; ++j) { + SimpleValue ConnectedVal = *j; + if (auto C = dyn_cast(ConnectedVal.getValue())) { + // A constant in the RM web must be all zeros. + if (!C->isNullValue()) { + LLVM_DEBUG(dbgs() << "non-0 constant in RM web\n"); + return false; + } + } else { + // Insert the non-constant value. If it is a goto with struct index + // other than 1, it is illegal. + if (RMValsEntry->insert(ConnectedVal)) { + LLVM_DEBUG(dbgs() << "New one: " << ConnectedVal.getValue()->getName() << "#" << ConnectedVal.getIndex() << "\n"); + switch (GenXIntrinsic::getGenXIntrinsicID(ConnectedVal.getValue())) { + case GenXIntrinsic::genx_simdcf_join: + LLVM_DEBUG(dbgs() << "multiple joins in RM web\n"); + return false; + case GenXIntrinsic::genx_simdcf_goto: + if (ConnectedVal.getIndex() != 1/* struct index of RM result */) { + LLVM_DEBUG(dbgs() << "wrong struct index in goto\n"); + return false; + } + break; + default: + break; + } + } + } + } + } + // Check whether the RM values interfere with each other. + SetVector BadDefs; + checkInterference(RMValsEntry, &BadDefs, Join); + if (!BadDefs.empty()) { + LLVM_DEBUG(dbgs() << "RMs interfere\n"); + return false; + } + // Set GotoJoinMap for each goto in the RM web. 
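+  // (GotoJoinMap is what checkGoto consults above to verify that each
+  // conformant goto has a linked join, and getConnectedVals uses it to
+  // connect a goto's EM result to that join.)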
+ for (unsigned rvi = 0; rvi != RMValsEntry->size(); ++rvi) { + SimpleValue RM = (*RMValsEntry)[rvi]; + if (GenXIntrinsic::getGenXIntrinsicID(RM.getValue()) == GenXIntrinsic::genx_simdcf_goto) + GotoJoinMap[cast(RM.getValue())] = Join; + } + return true; +} + +/*********************************************************************** + * getEmptyCriticalEdgeSplitterSuccessor : if BB is an empty critical edge + * splitter block (one predecessor and one successor), then return the + * single successor + */ +static BasicBlock *getEmptyCriticalEdgeSplitterSuccessor(BasicBlock *BB) +{ + if (!BB->hasOneUse()) + return nullptr; // not exactly one predecessor + auto Term = dyn_cast(BB->getFirstNonPHIOrDbg()); + if (!Term->isTerminator()) + return nullptr; // not empty + auto TI = cast(Term); + if (TI->getNumSuccessors() != 1) + return nullptr; // not exactly one successor + return TI->getSuccessor(0); +} + +/*********************************************************************** + * checkGotoJoin : common code to check conformance of an actual goto or join + * instruction + */ +bool GenXSimdCFConformance::checkGotoJoin(SimpleValue EMVal) +{ + auto CI = cast(EMVal.getValue()); + // If there is an extract of the scalar result of the goto/join, check that + // it is used in the conditional branch at the end of the block. + ExtractValueInst *ExtractScalar = nullptr; + for (auto ui = CI->use_begin(), ue = CI->use_end(); ui != ue; ++ui) + if (auto EV = dyn_cast(ui->getUser())) + if (!isa(EV->getType())) { + if (ExtractScalar) { + LLVM_DEBUG(dbgs() << "goto/join has more than one extract of its !any result\n"); + return false; + } + ExtractScalar = EV; + } + if (ExtractScalar) { + if (!ExtractScalar->hasOneUse()) { + LLVM_DEBUG(dbgs() << "goto/join's !any result does not have exactly one use\n"); + return false; + } + auto Br = dyn_cast(ExtractScalar->use_begin()->getUser()); + if (!Br || Br->getParent() != CI->getParent()) { + LLVM_DEBUG(dbgs() << "goto/join's !any result not used in conditional branch in same block\n"); + return false; + } + // For a goto/join with a conditional branch, check that the "true" + // successor is a join label. We also tolerate there being an empty + // critical edge splitter block in between; this will get removed in + // setCategories in this pass. + BasicBlock *TrueSucc = Br->getSuccessor(0); + Instruction *First = TrueSucc->getFirstNonPHIOrDbg(); + if (GenXIntrinsic::getGenXIntrinsicID(First) != GenXIntrinsic::genx_simdcf_join) { + // "True" successor is not a join label. Check for an empty critical edge + // splitter block in between. + TrueSucc = getEmptyCriticalEdgeSplitterSuccessor(TrueSucc); + if (!TrueSucc) { + LLVM_DEBUG(dbgs() << "goto/join true successor not join label\n"); + return false; // Not empty critical edge splitter + } + if (GenXIntrinsic::getGenXIntrinsicID(TrueSucc->getFirstNonPHIOrDbg()) + != GenXIntrinsic::genx_simdcf_join) { + LLVM_DEBUG(dbgs() << "goto/join true successor not join label\n"); + return false; // Successor is not join label + } + } + } + return true; +} + +/*********************************************************************** + * removeBadEMVal : remove a bad EM value + * + * This removes a non-conformant EM value, and pushes any connected EM value + * onto the stack so it gets re-checked for conformance. + */ +void GenXSimdCFConformance::removeBadEMVal(SimpleValue EMVal) +{ + LLVM_DEBUG( + dbgs() << "removeBadEMVal "; + EMVal.print(dbgs()); + dbgs() << "\n" + ); + // Remove the EM value. 
+ if (!EMVals.remove(EMVal)) + return; // was not in EMVals + // Push anything related to it onto the stack for re-checking. + SmallVector ConnectedVals; + getConnectedVals(EMVal, RegCategory::EM, /*IncludeOptional=*/true, + /*OkJoin=*/nullptr, &ConnectedVals); + for (auto i = ConnectedVals.begin(), e = ConnectedVals.end(); i != e; ++i) { + SimpleValue ConnectedVal = *i; + if (EMVals.count(ConnectedVal)) + EMValsStack.insert(ConnectedVal); + } +} + +/*********************************************************************** + * pushValues : push EM struct elements in a value onto EMValsStack + */ +void GenXSimdCFConformance::pushValues(Value *V) +{ + for (unsigned si = 0, se = IndexFlattener::getNumElements(V->getType()); + si != se; ++si) { + SimpleValue SV(V, si); + if (EMVals.count(SV)) + EMValsStack.insert(SV); + } +} + +/*********************************************************************** + * checkAllUsesAreSelectOrWrRegion : check that all uses of a value are the + * condition in select or wrregion or wrpredpredregion (or a predicate + * in a non-ALU intrinsic) + * + * This is used in getConnectedVals below for the result of a use of an EM + * value in an rdpredregion, or a shufflevector that is a slice so will be + * lowered to rdpredregion. + */ +static bool checkAllUsesAreSelectOrWrRegion(Value *V) +{ + for (auto ui2 = V->use_begin(); ui2 != V->use_end(); /*empty*/) { + auto User2 = cast(ui2->getUser()); + unsigned OpNum = ui2->getOperandNo(); + ++ui2; + + if (isa(User2)) + continue; + + // Matches uses that can be turned into select. + if (auto BI = dyn_cast(User2)) { + auto Opc = BI->getOpcode(); + Constant *AllOne = Constant::getAllOnesValue(V->getType()); + Constant *AllNul = Constant::getNullValue(V->getType()); + + // EM && X -> sel EM X 0 + // EM || X -> sel EM 1 X + if (Opc == BinaryOperator::And || + Opc == BinaryOperator::Or) { + Value *Ops[3] = {V, nullptr, nullptr}; + if (Opc == BinaryOperator::And) { + Ops[1] = BI->getOperand(1 - OpNum); + Ops[2] = AllNul; + } else if (Opc == BinaryOperator::Or) { + Ops[1] = AllOne; + Ops[2] = BI->getOperand(1 - OpNum); + } + auto SI = SelectInst::Create(Ops[0], Ops[1], Ops[2], ".revsel", BI, BI); + BI->replaceAllUsesWith(SI); + BI->eraseFromParent(); + continue; + } + + // ~EM || X ==> sel EM, X, 1 + using namespace PatternMatch; + if (BI->hasOneUse() && + BI->user_back()->getOpcode() == BinaryOperator::Or && + match(BI, m_Xor(m_Specific(V), m_Specific(AllOne)))) { + Instruction *OrInst = BI->user_back(); + Value *Op = OrInst->getOperand(0) != BI ? OrInst->getOperand(0) + : OrInst->getOperand(1); + auto SI = SelectInst::Create(V, Op, AllOne, ".revsel", OrInst, OrInst); + OrInst->replaceAllUsesWith(SI); + OrInst->eraseFromParent(); + BI->eraseFromParent(); + continue; + } + + // ~EM && X ==> sel EM, 0, X + using namespace PatternMatch; + if (BI->hasOneUse() && + BI->user_back()->getOpcode() == BinaryOperator::And && + match(BI, m_Xor(m_Specific(V), m_Specific(AllOne)))) { + Instruction *AndInst = BI->user_back(); + Value *Op = AndInst->getOperand(0) != BI ? AndInst->getOperand(0) + : AndInst->getOperand(1); + auto SI = SelectInst::Create(V, AllNul, Op, ".revsel", AndInst, AndInst); + AndInst->replaceAllUsesWith(SI); + AndInst->eraseFromParent(); + BI->eraseFromParent(); + continue; + } + } else if (auto CI = dyn_cast(User2)) { + // Turn zext/sext to select. 
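+      // e.g. (illustrative):  zext <N x i1> %EM to <N x i32>
+      //   ==>  select %EM, <N x i32> splat(1), <N x i32> zeroinitializer
+      // and sext likewise, with splat(-1) as the true value.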
+ if (CI->getOpcode() == Instruction::CastOps::ZExt || + CI->getOpcode() == Instruction::CastOps::SExt) { + unsigned NElts = V->getType()->getVectorNumElements(); + unsigned NBits = CI->getType()->getScalarSizeInBits(); + int Val = (CI->getOpcode() == Instruction::CastOps::ZExt) ? 1 : -1; + APInt One(NBits, Val); + Constant *LHS = ConstantVector::getSplat( + NElts, ConstantInt::get(CI->getType()->getScalarType(), One)); + Constant *AllNul = Constant::getNullValue(CI->getType()); + auto SI = SelectInst::Create(V, LHS, AllNul, ".revsel", CI, CI); + CI->replaceAllUsesWith(SI); + CI->eraseFromParent(); + continue; + } + } + + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(User2); + if (GenXIntrinsic::isWrRegion(IID)) + continue; + if (IID == GenXIntrinsic::genx_wrpredpredregion + && OpNum == cast(User2)->getNumArgOperands() - 1) + continue; + if (GenXIntrinsic::isAnyNonTrivialIntrinsic(IID) + && !cast(User2)->doesNotAccessMemory()) + continue; + return false; + } + return true; +} + +/*********************************************************************** + * getConnectedVals : for a SimpleValue, get other SimpleValues connected to + * it through phi nodes, insertvalue, extractvalue, goto/join, and maybe + * args and return values + * + * Enter: Val = SimpleValue to start at + * Cat = RegCategory::EM to do EM connections + * RegCategory::RM to do RM connections + * IncludeOptional = for EM connections, include optional connections + * where Val is a function arg and it is connected to call args, + * and where Val is the operand to return and it is connected to + * the returned value at call sites + * OkJoin = for RM connections, error if a use in a join other than + * this one is found + * ConnectedVals = vector to store connected values in + * + * Return: true if ok, false if def or some use is not suitable for EM/RM + * + * The provided value must be non-constant, but the returned connected values + * may include constants. Duplicates may be stored in ConnectedVals. + * + * This function is used in three different ways by its callers: + * + * 1. to gather a web of putative EM values or RM values starting at goto/join + * instructions; + * + * 2. to test whether a putative EM/RM value is valid by whether its connected + * neighbors are EM/RM values; + * + * 3. when removing a value from the EM/RM values list, to find its connected + * neighbors to re-run step 2 on each of them. + * + * TODO: some refactoring should be performed here due to quite big + * CF with many different actions. Also some of these actions + * are repeated in different situations. + */ +bool GenXSimdCFConformance::getConnectedVals(SimpleValue Val, int Cat, + bool IncludeOptional, CallInst *OkJoin, + SmallVectorImpl *ConnectedVals, bool LowerBadUsers) +{ + // Check the def first. + if (auto Arg = dyn_cast(Val.getValue())) { + if (Cat != RegCategory::EM) + return false; // can't have RM argument + // Connected to some return value. There is a problem here in that it might + // find another predicate return value that is nothing to do with SIMD CF, + // and thus stop SIMD CF being optimized. But passing a predicate in and + // out of a function is rare outside of SIMD CF, so we do not worry about + // that. + // It is possible that EM was optimized from ret. In this case the ret type + // is void. Allow such situation. 
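+    // Illustration (names invented): for
+    //   define <32 x i1> @sub(<32 x i1> %em, ...)
+    // the EM argument %em is connected to the value operand of each
+    // 'ret' in @sub and, with IncludeOptional, to the corresponding
+    // call argument at every call site of @sub.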
+ Function *F = Arg->getParent(); + unsigned RetIdx = 0; + auto RetTy = F->getReturnType(); + auto ValTy = IndexFlattener::getElementType( + Val.getValue()->getType(), Val.getIndex()); + if (auto ST = dyn_cast(RetTy)) { + for (unsigned End = IndexFlattener::getNumElements(ST); ; ++RetIdx) { + if (RetIdx == End) + return false; // no predicate ret value found + if (IndexFlattener::getElementType(ST, RetIdx) == ValTy) + break; + } + } else if (RetTy != ValTy && !RetTy->isVoidTy()) + return false; // no predicate ret value found + if (!RetTy->isVoidTy()) + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) + if (auto Ret = dyn_cast(fi->getTerminator())) + ConnectedVals->push_back(SimpleValue(Ret->getOperand(0), RetIdx)); + if (IncludeOptional) { + // With IncludeOptional, also add the corresponding arg at each call + // site. + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) + if (auto CI = dyn_cast(ui->getUser())) + ConnectedVals->push_back( + SimpleValue(CI->getArgOperand(Arg->getArgNo()), Val.getIndex())); + } + } else if (auto Phi = dyn_cast(Val.getValue())) { + // phi: add (the corresponding struct element of) each incoming + for (unsigned oi = 0, oe = Phi->getNumIncomingValues(); oi != oe; ++oi) + ConnectedVals->push_back( + SimpleValue(Phi->getIncomingValue(oi), Val.getIndex())); + } else if (auto EVI = dyn_cast(Val.getValue())) { + // extractvalue: add the appropriate struct element of the input + ConnectedVals->push_back(SimpleValue(EVI->getOperand(0), + Val.getIndex() + IndexFlattener::flatten( + cast(EVI->getOperand(0)->getType()), + EVI->getIndices()))); + } else if (auto IVI = dyn_cast(Val.getValue())) { + // insertvalue: add the appropriate struct element in either the + // aggregate input or the value to insert input + unsigned InsertedIndex = Val.getIndex() - IndexFlattener::flatten( + cast(IVI->getType()), IVI->getIndices()); + unsigned NumElements = IndexFlattener::getNumElements( + IVI->getOperand(1)->getType()); + SimpleValue SV; + if (InsertedIndex < NumElements) + SV = SimpleValue(IVI->getOperand(1), InsertedIndex); + else + SV = SimpleValue(IVI->getOperand(0), Val.getIndex()); + ConnectedVals->push_back(SV); + } else if (auto SVI = dyn_cast(Val.getValue())) { + // shufflevector: add the EM use + ConnectedVals->push_back(SimpleValue(SVI->getOperand(0), 0)); + } else if (auto CI = dyn_cast(Val.getValue())) { + switch (GenXIntrinsic::getAnyIntrinsicID(CI)) { + case GenXIntrinsic::genx_simdcf_goto: + // goto: invalid unless it is the EM/RM result of goto as applicable + if (Val.getIndex() != (Cat == RegCategory::EM ? 0U : 1U)) + return false; + // Add the corresponding input. + ConnectedVals->push_back(CI->getOperand(Val.getIndex())); + // If doing EM connections, add the corresponding join. This does + // nothing if checkJoin has not yet run for the corresponding join, + // since GotoJoinMap has not yet been set up for our goto. We tolerate + // that situation; if the goto really has no linked join, that is + // picked up later in checkGoto. + if (Cat == RegCategory::EM) + if (auto Join = GotoJoinMap[cast(Val.getValue())]) + ConnectedVals->push_back( + SimpleValue(Join, 0/* struct idx of EM result */)); + break; + case GenXIntrinsic::genx_simdcf_join: { + // join: invalid unless it is the EM result + if (Val.getIndex() || Cat != RegCategory::EM) + return false; + // Add the corresponding input. + ConnectedVals->push_back(CI->getOperand(Val.getIndex())); + // Add the corresponding gotos. 
This does nothing if checkJoin has not + // yet run for this join, since RMVals has not yet been set up for it. + // That is OK, because adding the corresponding gotos here is required + // only when we are called by removeBadEMVal to remove the join, so the + // gotos get re-checked and found to be invalid. + auto RMValsEntry = &RMVals[cast(Val.getValue())]; + for (auto i = RMValsEntry->begin(), e = RMValsEntry->end(); i != e; ++i) + if (GenXIntrinsic::getGenXIntrinsicID(i->getValue()) == GenXIntrinsic::genx_simdcf_goto) + ConnectedVals->push_back( + SimpleValue(i->getValue(), 0/* struct idx of EM result */)); + break; + } + case GenXIntrinsic::genx_simdcf_savemask: + case GenXIntrinsic::genx_simdcf_remask: + case GenXIntrinsic::genx_simdcf_get_em: + // Add the corresponding input. + ConnectedVals->push_back(CI->getOperand(0)); + return true; + case GenXIntrinsic::genx_constantpred: + // constantpred: add the constant. Don't add any other uses of it, + // because it might be commoned up with other RM webs. + ConnectedVals->push_back(CI->getOperand(0)); + return true; + case GenXIntrinsic::not_any_intrinsic: { + // Value returned from a call. + if (Cat != RegCategory::EM) + return false; // invalid for RM + // Add the corresponding value at each return in the called function. + auto CalledFunc = CI->getCalledFunction(); + for (auto fi = CalledFunc->begin(), fe = CalledFunc->end(); + fi != fe; ++fi) + if (auto Ret = dyn_cast(fi->getTerminator())) + if (!Ret->getType()->isVoidTy()) + ConnectedVals->push_back( + SimpleValue(Ret->getOperand(0), Val.getIndex())); + // Connected to some call arg. There is a problem here in that it might + // find another predicate arg that is nothing to do with SIMD CF, and + // thus stop SIMD CF being optimized. But passing a predicate in and + // out of a function is rare outside of SIMD CF, so we do not worry + // about that. + auto ValTy = IndexFlattener::getElementType( + Val.getType(), Val.getIndex()); + for (unsigned Idx = 0, End = CI->getNumArgOperands(); ; ++Idx) { + if (Idx == End) + return false; // no corresponding call arg found + if (CI->getArgOperand(Idx)->getType() == ValTy) { + ConnectedVals->push_back(SimpleValue(CI->getArgOperand(Idx), 0)); + break; + } + } + break; + } + default: + return false; // unexpected call as def + } + } else + return false; // unexpected instruction as def + // Check the uses. + std::vector UsersToLower; + for (auto ui = Val.getValue()->use_begin(), + ue = Val.getValue()->use_end(); ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (auto Phi = dyn_cast(User)) { + // Use in phi node. Add the phi result. + ConnectedVals->push_back(SimpleValue(Phi, Val.getIndex())); + continue; + } + if (auto EVI = dyn_cast(User)) { + // Use in extractvalue. + // If extracting the right index, add the result. + unsigned StartIndex = IndexFlattener::flatten( + cast(EVI->getOperand(0)->getType()), EVI->getIndices()); + unsigned NumIndices = IndexFlattener::getNumElements(EVI->getType()); + unsigned ExtractedIndex = Val.getIndex() - StartIndex; + if (ExtractedIndex < NumIndices) + ConnectedVals->push_back(SimpleValue(EVI, ExtractedIndex)); + continue; + } + if (auto IVI = dyn_cast(User)) { + // Use in insertvalue. Could be either the aggregate input or the value + // to insert. 
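+    // Illustration: for
+    //   %s = insertvalue {<32 x i1>, <32 x i1>} %agg, <32 x i1> %v, 1
+    // a use as %v maps to flattened element 1 of %s, while a use as
+    // %agg only propagates the elements that the insertvalue does not
+    // overwrite.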
+ unsigned StartIndex = IndexFlattener::flatten( + cast(IVI->getType()), IVI->getIndices()); + unsigned NumIndices = IndexFlattener::getNumElements( + IVI->getOperand(1)->getType()); + if (!ui->getOperandNo()) { + // Use in insertvalue as the aggregate input. Add the corresponding + // element in the result, as long as it is not overwritten by the + // insertvalue. + if (Val.getIndex() - StartIndex >= NumIndices) + ConnectedVals->push_back(SimpleValue(IVI, Val.getIndex())); + } else { + // Use in insertvalue as the value to insert. Add the corresponding + // element in the result. + ConnectedVals->push_back(SimpleValue(IVI, StartIndex + Val.getIndex())); + } + continue; + } + if (isa(User)) { + // Use in a return. + if (Cat != RegCategory::EM) + return false; // invalid for RM + // Connected to some function arg. There is a problem here in that it might + // find another predicate arg that is nothing to do with SIMD CF, and + // thus stop SIMD CF being optimized. But passing a predicate in and + // out of a function is rare outside of SIMD CF, so we do not worry + // about that. + auto ValTy = IndexFlattener::getElementType( + Val.getType(), Val.getIndex()); + auto F = User->getParent()->getParent(); + bool Lower = false; + for (auto ai = F->arg_begin(), ae = F->arg_end(); ; ++ai) { + if (ai == ae) { + // no arg of the right type found + Lower = true; + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + break; + } + auto Arg = &*ai; + if (Arg->getType() == ValTy) { + ConnectedVals->push_back(SimpleValue(Arg, 0)); + break; + } + } + if (IncludeOptional && !Lower) { + // With IncludeOptional, also add the values connected by being the + // return value at each call site. + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) + if (auto CI = dyn_cast(ui->getUser())) + ConnectedVals->push_back(SimpleValue(CI, Val.getIndex())); + } + continue; + } + if (isa(User)) { + // A use in a select is allowed only for EM used as the condition. + if (Cat != RegCategory::EM || ui->getOperandNo() != 0) + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + continue; + } + if (auto SVI = dyn_cast(User)) { + if (!ShuffleVectorAnalyzer(SVI).isReplicatedSlice()) { + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + continue; + } + // This is a shufflevector that is a replicated slice, so it can be + // lowered to rdpredregion or baled with instruction with channels. + // (We only see this in the early pass; GenXLowering has + // turned it into rdpredregion by the late pass.) Check that all its uses + // are select or wrregion. + if (!checkAllUsesAreSelectOrWrRegion(SVI)) { + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + continue; + } + // Shufflevector produces EM for value baled inst, so this is a (almost) real EM def: + // add it here to perform correct EM interference check + ConnectedVals->push_back(SimpleValue(SVI, ui->getOperandNo())); + continue; + } + if (auto CI = dyn_cast(User)) { + switch (GenXIntrinsic::getAnyIntrinsicID(CI)) { + case GenXIntrinsic::genx_simdcf_get_em: + assert(Cat == RegCategory::EM); + // Skip it if the category is right. This + // intrinsic doesn't produce EM + break; + case GenXIntrinsic::genx_simdcf_unmask: + case GenXIntrinsic::genx_simdcf_remask: + assert(Cat == RegCategory::EM); + ConnectedVals->push_back(SimpleValue(CI, 0)); + break; + case GenXIntrinsic::genx_simdcf_goto: + // use in goto: valid only if arg 0 (EM) or 1 (RM) + if (ui->getOperandNo() != (Cat == RegCategory::EM ? 
0U : 1U)) + return false; + // Add corresponding result. + ConnectedVals->push_back(SimpleValue(CI, ui->getOperandNo())); + break; + case GenXIntrinsic::genx_simdcf_join: + // use in join: valid only if arg 0 (EM) or 1 (RM) + if (ui->getOperandNo() != (Cat == RegCategory::EM ? 0U : 1U)) + return false; + // If EM, add corresponding result. + if (Cat == RegCategory::EM) + ConnectedVals->push_back(SimpleValue(CI, 0)); + else if (OkJoin && OkJoin != CI) { + // RM value used in a join other than OkJoin. That is illegal, as we + // can only have one join per RM web. + LLVM_DEBUG(dbgs() << "getConnectedVals: found illegal join: " << CI->getName() << "\n"); + return false; + } + break; + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrregioni: + break; // Use as wrregion predicate is allowed. + case GenXIntrinsic::genx_rdpredregion: + // We only see rdpredregion in the late pass; in the early pass it is + // still a shufflevector. Check that all its uses are select or + // wrregion. + if (!checkAllUsesAreSelectOrWrRegion(CI)) + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + break; + case GenXIntrinsic::genx_wrpredpredregion: + // Use in wrpredpredregion allowed as the last arg. + if (ui->getOperandNo() + 1 != CI->getNumArgOperands()) + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + break; + default: + // Allowed as an predicate in a non-ALU intrinsic. + if (CI->getCalledFunction()->doesNotAccessMemory()) + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + break; + case GenXIntrinsic::not_any_intrinsic: { + // Use in subroutine call. Add the corresponding function arg. + Function *CalledFunc = CI->getCalledFunction(); + assert(CalledFunc); + auto ai = CalledFunc->arg_begin(); + for (unsigned Count = ui->getOperandNo(); Count; --Count, ++ai) + ; + Argument *Arg = &*ai; + ConnectedVals->push_back(SimpleValue(Arg, Val.getIndex())); + // Connected to some return value from the call. There is a problem + // here in that it might find another predicate return value that is + // nothing to do with SIMD CF, and thus stop SIMD CF being optimized. + // But passing a predicate in and out of a function is rare outside + // of SIMD CF, so we do not worry about that. + unsigned RetIdx = 0; + auto ValTy = IndexFlattener::getElementType( + Val.getValue()->getType(), Val.getIndex()); + if (auto ST = dyn_cast(CI->getType())) { + for (unsigned End = IndexFlattener::getNumElements(ST); ; ++RetIdx) { + if (RetIdx == End) + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); // no predicate ret value found + if (IndexFlattener::getElementType(ST, RetIdx) == ValTy) { + ConnectedVals->push_back(SimpleValue(CI, RetIdx)); + break; + } + } + } else if (CI->getType() == ValTy) + ConnectedVals->push_back(SimpleValue(CI, 0)); + else if (!CI->getType()->isVoidTy()) + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); // no predicate ret value found + break; + } + } + continue; + } + UsersToLower.push_back(SimpleValue(User, ui->getOperandNo())); + } + + if (LowerBadUsers) { + SetVector ToRemove; + for (auto BadUser : UsersToLower) { + replaceUseWithLoweredEM(dyn_cast(BadUser.getValue()), + BadUser.getIndex(), ToRemove); + } + for (auto Inst : ToRemove) { + removeFromEMRMVals(Inst); + } + } else { + if (!UsersToLower.empty()) + return false; + } + + return true; +} + +// check if this is an EM value or part of an EM value. 
+static bool isEM(Value *V) { + if (auto SI = dyn_cast(V)) + return isEM(SI->getOperand(0)) || isEM(SI->getOperand(1)); + return GotoJoin::isEMValue(V); +} + +// canonicalizeEM : canonicalize EM uses so that EM uses will not +// stop SIMD-CF conformance. +void GenXSimdCFConformance::canonicalizeEM() { + using namespace PatternMatch; + std::vector DeadInstructions; + + for (auto &F : M->getFunctionList()) + for (auto &BB : F.getBasicBlockList()) { + for (Instruction *Inst = BB.getTerminator(); Inst;) { + // select(C0&C1, a, b) -> select(C0, select(C1, a, b), b) + // select(C0|C1, a, b) -> select(C0, a, select(C1, a, b)) + Value *C0, *C1, *A, *B; + if (match(Inst, m_Select(m_BinOp(m_Value(C0), m_Value(C1)), m_Value(A), + m_Value(B)))) { + bool C1IsEM = isEM(C1); + if (C1IsEM || isEM(C0)) { + Value *Cond = Inst->getOperand(0); + if (Cond->getType()->isVectorTy()) { + BinaryOperator *BO = cast(Cond); + // Set Inst as insert point in order to save dominance + IRBuilder<> Builder(Inst); + if (C1IsEM) + std::swap(C0, C1); + if (BO->getOpcode() == BinaryOperator::And) { + Value *V = Builder.CreateSelect(C1, A, B); + V = Builder.CreateSelect(C0, V, B); + Inst->replaceAllUsesWith(V); + DeadInstructions.push_back(Inst); + } else if (BO->getOpcode() == BinaryOperator::Or) { + Value *V = Builder.CreateSelect(C1, A, B); + V = Builder.CreateSelect(C0, A, V); + Inst->replaceAllUsesWith(V); + DeadInstructions.push_back(Inst); + } + } + } + } + + Inst = (Inst == &BB.front()) ? nullptr : Inst->getPrevNode(); + } + } + + for (Instruction *I : DeadInstructions) + RecursivelyDeleteTriviallyDeadInstructions(I); + + // Collect data for gotos/joins EVs + handleEVs(); + // Resolve bitcast chains so they don't break conformance + resolveBitCastChains(); +} + +/*********************************************************************** + * handleEVs : collect goto/join EVs and perform some transformations + * on them. + * + * All transformations are done in GotoJoinEVs constructor. + */ +void GenXSimdCFConformance::handleEVs() +{ + // Collect gotos/joins + gatherGotoJoinEMVals(false); + for (auto val : EMVals) { + Value *GotoJoin = val.getValue(); + assert(GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_goto || + GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_join); + GotoJoinEVsMap[GotoJoin] = GotoJoinEVs(GotoJoin); + } + EMVals.clear(); +} + +/*********************************************************************** + * eliminateBitCastPreds : perform bitcast elimination on EM DF + * + * GetEMPred should be called earlier to check if Val is actually + * a EM producer. + */ +Value *GenXSimdCFConformance::eliminateBitCastPreds(Value *Val, std::set &DeadInst, std::set &Visited) +{ + Type *EMType = VectorType::get(Type::getInt1Ty(M->getContext()), 32); + + if (Visited.count(Val)) + { + return EMProducers[Val]; + } + + Visited.insert(Val); + + if (auto BCI = dyn_cast(Val)) { + assert(EMProducers[BCI] == BCI->getOperand(0) && "Bad EM producer was saved!"); + + DeadInst.insert(BCI); + return eliminateBitCastPreds(BCI->getOperand(0), DeadInst, Visited); + } else if (auto PN = dyn_cast(Val)) { + assert(EMProducers[PN] == PN && "Bad EM producer was saved!"); + + PHINode *NewPN = nullptr; + if (PN->getType() != EMType) { + // Different type at phi. This may happen if its incoming value + // became bitcast. 
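+      // e.g. (illustrative) a phi of type i32 whose incoming is
+      // 'bitcast <32 x i1> %em to i32': a replacement phi of the
+      // canonical <32 x i1> EM type is created and the bitcast is
+      // bypassed below.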
+ LLVM_DEBUG(dbgs() << "Creating new PHI for:\n" << *PN << "\n"); + NewPN = PHINode::Create(EMType, PN->getNumIncomingValues(), "EMTerm", PN); + EMProducers[NewPN] = NewPN; + // In case of cycle, we will return newly created phi + EMProducers[PN] = NewPN; + // Phi can become redundant after it + DeadInst.insert(PN); + } + + for (unsigned oi = 0, on = PN->getNumIncomingValues(); oi < on; ++oi) { + auto EMProd = eliminateBitCastPreds(PN->getIncomingValue(oi), DeadInst, Visited); + if (!NewPN) { + PN->setIncomingValue(oi, EMProd); + PN->setIncomingBlock(oi, PN->getIncomingBlock(oi)); + } else { + NewPN->addIncoming(EMProd, PN->getIncomingBlock(oi)); + } + } + + return NewPN ? NewPN : PN; + } else if (auto C = dyn_cast(Val)) { + assert(C->isAllOnesValue() && "Should be checked before!"); + assert(EMProducers[C] == C && "Bad EM producer was saved!"); + + return Constant::getAllOnesValue(EMType); + } else { + assert(Val && EMProducers[Val] == Val && "Bad EM producer was saved!"); + assert(Val->getType() == EMType && "Unexpected final EM producer!"); + + return Val; + } +} + +/*********************************************************************** + * resolveBitCastChains : resolve EM -> (bitcast) -> EM chains + * + * Standard LLVM passes create such chains sometimes and it makes + * SIMD CF non-conformant. Here we check this and make changes to + * resolve it if possible. If it is not, SIMD CF remains non-conformant + * and is lowered later. + */ +void GenXSimdCFConformance::resolveBitCastChains() +{ + LLVM_DEBUG(dbgs() << "Resolving Bitcast chains:\n"); + + // We don't have EM values here so we have to gather them + // here, too. This is because we can change EM values set + // during these transformations. + gatherEMVals(); + + std::set DeadInst; + for (auto Val : EMVals) { + if (auto PN = dyn_cast(Val.getValue())) { + LLVM_DEBUG(dbgs() << "Found phi:\n" << *PN << "\n"); + } else if (auto BCI = dyn_cast(Val.getValue())) { + LLVM_DEBUG(dbgs() << "Found bitcast:\n" << *BCI << "\n"); + } else + continue; + + std::set Visited; + Instruction *I = dyn_cast(Val.getValue()); + Value *EMProd = getEMProducer(I, Visited, true); + + if (!EMProd) { + LLVM_DEBUG(dbgs() << "!!! Not EM producer was detected when resolving bitcast chains !!!\n"); + continue; + } + + Visited.clear(); + Value *NewEMProd = eliminateBitCastPreds(EMProd, DeadInst, Visited); + if (NewEMProd != EMProd) { + EMProd->replaceAllUsesWith(NewEMProd); + } + } + + EMVals.clear(); + + for (auto DI : DeadInst) { + if (auto I = dyn_cast(DI)) + RecursivelyDeleteTriviallyDeadInstructions(I); + } + + // TODO: since we are using EMProducers only here and during get_em check, + // clean it after these transformation sinse it may contain dead data. + EMProducers.clear(); + + LLVM_DEBUG(dbgs() << "Done resolving bitcast chains:\n"); +} + +/*********************************************************************** + * checkEMInterference : check for EM values interfering with each other, + * lowering gotos/joins as necessary + * + * There is only one EM in the hardware, and we need to model that by ensuring + * that our multiple EM values, including phi nodes, do not interfere with each + * other. This is effectively a register allocator with only one register. + */ +void GenXSimdCFConformance::checkEMInterference() +{ + // Do an interference check, returning a list of defs that appear in the live + // range of other values. 
+ SetVector BadDefs; + checkInterference(&EMVals, &BadDefs, nullptr); + for (auto i = BadDefs.begin(), e = BadDefs.end(); i != e; ++i) + removeBadEMVal(*i); +} + +/*********************************************************************** + * replaceUseWithLoweredEM : lower incoming EM for user. + * + * EM is being lowered via genx_simdcf_get_em intrinsic. + */ +void GenXSimdCFConformance::replaceUseWithLoweredEM(Instruction *Val, unsigned operandNo, SetVector &ToRemove) +{ + Value *EM = Val->getOperand(operandNo); + + LLVM_DEBUG(dbgs() << "Replacing EM use:\n" << *EM << "\nwith lowered EM for:\n" << *Val << "\n"); + + if (auto EVI = dyn_cast(EM)) { + CallInst *GotoJoin = dyn_cast(EVI->getOperand(0)); + assert(GotoJoin && (GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_goto || + GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_join)); + Type *Tys[] = { EVI->getType() }; + Function *GetEMDecl = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_simdcf_get_em, Tys); + // The CFG was corrected for SIMD CF by earlier transformations + // so isBranchingGotoJoinBlock works correctly here. + if (GotoJoin::isBranchingGotoJoinBlock(GotoJoin->getParent()) == GotoJoin) { + // For branching case, we need to create false and true value + BasicBlock *DefBB = GotoJoin->getParent(); + BasicBlock *TrueBlock = DefBB->getTerminator()->getSuccessor(0); + BasicBlock *FalseBlock = DefBB->getTerminator()->getSuccessor(1); + + Value *TrueVal = Constant::getNullValue(EVI->getType()); + Value *FalseVal = CallInst::Create(GetEMDecl, { EVI }, "getEM", FalseBlock->getFirstNonPHI()); + + LLVM_DEBUG(dbgs() << "Built GetEM for Branching goto/join:\n" << *FalseVal << "\n"); + + std::map foundVals; + BasicBlockEdge TrueEdge(DefBB, TrueBlock); + BasicBlockEdge FalseEdge(DefBB, FalseBlock); + auto newPred = findGotoJoinVal(RegCategory::EM, Val->getParent(), EVI, + TrueEdge, FalseEdge, TrueVal, FalseVal, foundVals); + Val->setOperand(operandNo, newPred); + } else { + // Non-branching case: must be join. Insert get_em right after join's EM + assert(GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_join && + "Gotos should be turned into branching earlier!"); + auto GetEM = CallInst::Create(GetEMDecl, { EVI }, "getEM", EVI->getParent()); + LLVM_DEBUG(dbgs() << "Built GetEM for simple join:\n" << *GetEM << "\n"); + GetEM->moveAfter(EVI); + Val->setOperand(operandNo, GetEM); + } + } else if (auto SVI = dyn_cast(EM)) { + // Shuffle vector: got through it and lower its pred + replaceUseWithLoweredEM(SVI, 0, ToRemove); + } else if (auto PN = dyn_cast(EM)) { + // The saddest case: for phi we need to lower all its preds + auto newPN = PN->clone(); + newPN->insertAfter(PN); + for (unsigned idx = 0, op_no = newPN->getNumOperands(); idx < op_no; ++idx) { + replaceUseWithLoweredEM(newPN, idx, ToRemove); + } + + Val->setOperand(operandNo, newPN); + } else if (auto Arg = dyn_cast(EM)) { + // Create get_em at function enter. This may happen if argument's user + // is moved under SIMD CF due to some reason. 
+ Type *Tys[] = { Arg->getType() }; + Function *GetEMDecl = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_simdcf_get_em, Tys); + auto GetEM = CallInst::Create(GetEMDecl, { Arg }, "getEM", Arg->getParent()->front().getFirstNonPHI()); + Val->setOperand(operandNo, GetEM); + } else + // All other instructions should not be EM producers with correct DF + assert("Failed to lower EM!"); + + ToRemove.insert(Val); +} + +/*********************************************************************** + * canUseLoweredEM : check whether instruction can use lowered EM + * + * Lowered EM is an explicit value that can be consumed by any + * instruction except of goto and join because they take implicit EM. + */ +bool GenXSimdCFConformance::canUseLoweredEM(Instruction *Val) +{ + if (GenXIntrinsic::getGenXIntrinsicID(Val) == GenXIntrinsic::genx_simdcf_goto || + GenXIntrinsic::getGenXIntrinsicID(Val) == GenXIntrinsic::genx_simdcf_join) + return false; + + // For phi, check that it does not deal with goto or join. + if (auto PN = dyn_cast(Val)) { + for (unsigned idx = 0, opNo = PN->getNumIncomingValues(); idx < opNo; ++idx) { + auto Inst = dyn_cast(PN->getOperand(idx)); + if (Inst) { + auto Pred = Inst->getOperand(0); + if (GenXIntrinsic::getGenXIntrinsicID(Pred) == GenXIntrinsic::genx_simdcf_goto || + GenXIntrinsic::getGenXIntrinsicID(Pred) == GenXIntrinsic::genx_simdcf_join) + return false; + } + } + } + + return true; +} + +/*********************************************************************** + * checkInterference : check for a list of values interfering with each other + * + * Enter: Vals = values to check (not constants) + * BadDefs = SetVector in which to store any def that is found in the + * live range of another def + * ConstStop = instruction to treat as the def point of a constantpred, + * nullptr to treat the start of the function as the def + * point + * + * This code finds interference by scanning back from uses, finding other defs, + * relying on the dominance property of SSA. Having found that two EM values A + * and B interfere due to the def of A appearing in the live range of B, we + * could choose either one to lower its goto and join. In fact we choose A (the + * found def), as that tends to lower inner SIMD CF, giving a chance for the + * outer SIMD CF to become legal. + * + * Because GenXSimdCFConformance runs before live ranges are determined, so + * that it can modify code as it wants, we cannot use the normal interference + * testing code in GenXLiveness. + * + * The idea of ConstStop is different depending on whether we are testing + * interference of all EM values, or all RM values for a particular join: + * + * * For interference between all EM values, any constant (input to + * constantpred intrinsic) must be all ones, which is checked elsewhere. It + * represents the state of the execution mask at the start of the function, + * therefore we need to pretend that the constantpred's live range extends + * back to the start of the function. This is done by the caller setting + * ConstStop to 0. + * + * * For interference between all RM values for one particular join, any + * constant must be all zeros, which is checked elsewhere. It represents the + * state of that join's resume mask on entry to the function, and just after + * executing the join. Therefore we need to pretend that the constantpred's + * live range extends back to those two places. This is done by the caller + * setting ConstStop to the join instruction. 
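+ *
+ * Sketch of the scan (illustrative): for each value V in Vals, start at
+ * every use of V and walk backwards over instructions (and into
+ * predecessor blocks) until V's own def (or ConstStop, for a
+ * constantpred) is reached; any other value from Vals whose def is met
+ * on the way is recorded in BadDefs.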
+ */ +void GenXSimdCFConformance::checkInterference(SetVector *Vals, + SetVector *BadDefs, Instruction *ConstStop) +{ + // Scan the live range of each value, looking for a def of another value. + // Finding such a def indicates interference. + SetVector ToRemove; + for (auto evi = Vals->begin(), eve = Vals->end(); evi != eve; ++evi) { + Value *EMVal = evi->getValue(); + bool IsConstantPred = GenXIntrinsic::getGenXIntrinsicID(EMVal) == GenXIntrinsic::genx_constantpred; + // Set of blocks where we know the value is live out. + SmallSet LiveOut; + // Start from each use and scan backwards. + for (auto ui = EMVal->use_begin(), ue = EMVal->use_end(); ui != ue;) { + auto User = cast(ui->getUser()); + auto OpNo = ui->getOperandNo(); + ++ui; + if (auto EVI = dyn_cast(User)) { + // Ignore a use that is an extractvalue not involving the right struct + // index. + unsigned StartIndex = IndexFlattener::flatten( + cast(EVI->getOperand(0)->getType()), EVI->getIndices()); + unsigned NumIndices = IndexFlattener::getNumElements(EVI->getType()); + if (evi->getIndex() - StartIndex >= NumIndices) + continue; + } + BasicBlock *PhiPred = nullptr; + if (auto Phi = dyn_cast(User)) + PhiPred = Phi->getIncomingBlock(OpNo); + auto Inst = User; + SmallVector PendingBBStack; + for (;;) { + if (!Inst) { + // Go on to the next pending predecessor. + if (PendingBBStack.empty()) + break; + Inst = PendingBBStack.back()->getTerminator(); + PendingBBStack.pop_back(); + } + if (&Inst->getParent()->front() == Inst) { + // Reached the start of the block. Make all unprocessed predecessors + // pending. Except if the use is in a phi node and this is the first + // time we reach the start of a block: in that case, mark only the + // corresponding phi block is pending. + if (PhiPred) { + if (LiveOut.insert(PhiPred).second) + PendingBBStack.push_back(PhiPred); + PhiPred = nullptr; + } else { + for (auto bui = Inst->getParent()->use_begin(), + bue = Inst->getParent()->use_end(); bui != bue; ++bui) { + auto Pred = cast(bui->getUser())->getParent(); + if (LiveOut.insert(Pred).second) + PendingBBStack.push_back(Pred); + } + } + Inst = nullptr; + continue; + } + // Go back to the previous instruction. (This happens even when + // starting at the end of a new block, thus skipping scanning the uses + // of the terminator, but that's OK because the terminator never uses + // our EM or RM values.) + Inst = Inst->getPrevNode(); + if (Inst == EMVal && !IsConstantPred) { + // Reached the def of the value. Stop scanning, unless the def is + // constantpred, in which case we pretend it was live from the + // ConstStop. + Inst = nullptr; + continue; + } + if (Inst == ConstStop && IsConstantPred) { + // For a constantpred value, we have reached the point that we want + // to treat as its definition point. Stop scanning. + Inst = nullptr; + continue; + } + // Check if this is the def of some other EM value. + if (auto VT = dyn_cast(Inst->getType())) + if (VT->getElementType()->isIntegerTy(1)) + if (Vals->count(Inst) && !ToRemove.count(Inst)) { + // It is the def of some other EM value. Mark that one as + // interfering. However do not mark it if both values are + // constantpred, since we pretend all of those are defined at the + // start of the function. 
+              if (!IsConstantPred
+                  || GenXIntrinsic::getGenXIntrinsicID(Inst) != GenXIntrinsic::genx_constantpred) {
+                LLVM_DEBUG(dbgs() << "GenXSimdCFConformance::checkInterference: def of "
+                                  << Inst->getName() << " found in live range of "
+                                  << EMVal->getName() << "\n");
+                auto SVI = dyn_cast<ShuffleVectorInst>(Inst);
+                if (SVI && SVI->getOperand(0) == EMVal) {
+                  // The shuffle vector is baled in as an EM of another size:
+                  // this check ensures that the EM in SVI is still valid.
+                  LLVM_DEBUG(dbgs() << "\tShuffle vector with correct arg, skipping it\n");
+                } else if (canUseLoweredEM(User) && !FG) {
+                  // Lower EM in the early pass.
+                  replaceUseWithLoweredEM(User, OpNo, ToRemove);
+                  LLVM_DEBUG(dbgs() << "\tSucceeded to lower EM for that use\n");
+                } else {
+                  LLVM_DEBUG(dbgs() << "\t!!! Failed to lower EM for that use: def will be lowered\n");
+                  BadDefs->insert(Inst);
+                }
+                // Done for that use.
+                break;
+              }
+            }
+      }
+    }
+  }
+
+  for (auto Inst : ToRemove) {
+    removeFromEMRMVals(Inst);
+  }
+}
+
+/***********************************************************************
+ * insertCond : insert a vector-of-i1 value into the start of another one
+ *
+ * Enter:   OldVal = value to insert into
+ *          NewVal = value to insert, at index 0
+ *          Name = name for any new instruction
+ *          InsertBefore = where to insert any new instruction
+ *          DL = debug loc to give any new instruction
+ *
+ * Return:  value, possibly the same as the input value
+ */
+Value *GenXSimdCFConformance::insertCond(Value *OldVal, Value *NewVal,
+    const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL)
+{
+  unsigned OldWidth = OldVal->getType()->getVectorNumElements();
+  unsigned NewWidth = NewVal->getType()->getVectorNumElements();
+  if (OldWidth == NewWidth)
+    return NewVal;
+  // Do the insert with shufflevector. We need two shufflevectors, one to extend
+  // NewVal to OldVal's width, and one to combine them.
+  // GenXLowering decides whether this is suitable to lower to wrpredregion, or
+  // needs to be lowered to something less efficient.
+  SmallVector<Constant *, 8> Indices;
+  Type *I32Ty = Type::getInt32Ty(InsertBefore->getContext());
+  unsigned i;
+  for (i = 0; i != NewWidth; ++i)
+    Indices.push_back(ConstantInt::get(I32Ty, i));
+  auto UndefIndex = UndefValue::get(I32Ty);
+  for (; i != OldWidth; ++i)
+    Indices.push_back(UndefIndex);
+  auto SV1 = new ShuffleVectorInst(NewVal, UndefValue::get(NewVal->getType()),
+      ConstantVector::get(Indices), NewVal->getName() + ".extend", InsertBefore);
+  SV1->setDebugLoc(DL);
+  if (isa<UndefValue>(OldVal))
+    return SV1;
+  Indices.clear();
+  for (i = 0; i != NewWidth; ++i)
+    Indices.push_back(ConstantInt::get(I32Ty, i + OldWidth));
+  for (; i != OldWidth; ++i)
+    Indices.push_back(ConstantInt::get(I32Ty, i));
+  // SV2 takes lanes 0..NewWidth-1 from SV1 (the extended NewVal) and the
+  // remaining lanes from OldVal.
+  auto SV2 = new ShuffleVectorInst(OldVal, SV1, ConstantVector::get(Indices),
+      Name, InsertBefore);
+  SV2->setDebugLoc(DL);
+  return SV2;
+}
+
+/***********************************************************************
+ * truncateCond : truncate a vector-of-i1 value
+ *
+ * Enter:   In = input value
+ *          Ty = type to truncate to
+ *          Name = name for any new instruction
+ *          InsertBefore = where to insert any new instruction
+ *          DL = debug loc to give any new instruction
+ *
+ * Return:  value, possibly the same as the input value
+ */
+Value *GenXSimdCFConformance::truncateCond(Value *In, Type *Ty,
+    const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL)
+{
+  unsigned InWidth = In->getType()->getVectorNumElements();
+  unsigned TruncWidth = Ty->getVectorNumElements();
+  if (InWidth == TruncWidth)
+    return In;
+  // Do the truncate with shufflevector.
GenXLowering lowers it to rdpredregion. + SmallVector Indices; + Type *I32Ty = Type::getInt32Ty(InsertBefore->getContext()); + unsigned i; + for (i = 0; i != TruncWidth; ++i) + Indices.push_back(ConstantInt::get(I32Ty, i)); + auto SV = new ShuffleVectorInst(In, UndefValue::get(In->getType()), + ConstantVector::get(Indices), Name, InsertBefore); + SV->setDebugLoc(DL); + return SV; +} + +/*********************************************************************** + * lowerGoto : lower a llvm.genx.simdcf.goto + * + * This also outputs a warning that we failed to optimize a SIMD branch. + * We always output it, rather than including it in the -rpass mechanism + * to enable or disable the warning, as it is an unexpected situation that + * we want our users to report. + */ +void GenXSimdCFConformance::lowerGoto(CallInst *Goto) +{ + LLVM_DEBUG(dbgs() << "lowerGoto: " << *Goto << "\n"); + const DebugLoc &DL = Goto->getDebugLoc(); + if (EnableGenXGotoJoin && !lowerSimdCF) + DiagnosticInfoSimdCF::emit(Goto, "failed to optimize SIMD branch", DS_Warning); + Value *Results[3]; + auto EM = Goto->getOperand(0); + auto Cond = Goto->getOperand(2); + // EM is always 32 bit. Extract SubEM, of the same width as Cond, from it. + auto OldSubEM = truncateCond(EM, Cond->getType(), + EM->getName() + ".sub", Goto, DL); + // Result 1: NewRM = OldRM | (SubEM & ~Cond) + auto NotCond = BinaryOperator::Create(Instruction::Xor, Cond, + Constant::getAllOnesValue(Cond->getType()), + Goto->getName() + ".notcond", Goto); + NotCond->setDebugLoc(DL); + auto NotCondAndSubEM = BinaryOperator::Create(Instruction::And, NotCond, + OldSubEM, Goto->getName() + ".disabling", Goto); + NotCondAndSubEM->setDebugLoc(DL); + Value *OldRM = Goto->getArgOperand(1); + auto NewRM = BinaryOperator::Create(Instruction::Or, OldRM, NotCondAndSubEM, + Goto->getName() + ".newRM", Goto); + NewRM->setDebugLoc(DL); + Results[1] = NewRM; + // And SubEM with Cond. + auto SubEM = BinaryOperator::Create(Instruction::And, OldSubEM, Cond, + Goto->getName() + ".subEM", Goto); + SubEM->setDebugLoc(DL); + // Insert that back into EM. That is result 0. + Results[0] = EM = insertCond(EM, SubEM, Goto->getName() + ".EM", Goto, DL); + // Result 2: BranchCond = !any(SubEM) + Function *AnyFunc = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_any, + SubEM->getType()); + auto Any = CallInst::Create(AnyFunc, SubEM, + SubEM->getName() + ".any", Goto); + Any->setDebugLoc(DL); + auto Not = BinaryOperator::Create(Instruction::Xor, Any, + Constant::getAllOnesValue(Any->getType()), + Any->getName() + ".not", Goto); + Not->setDebugLoc(DL); + Results[2] = Not; + // Replace uses. + replaceGotoJoinUses(Goto, Results); + Goto->eraseFromParent(); + Modified = true; +} + +/*********************************************************************** + * lowerJoin : lower a llvm.genx.simdcf.join + */ +void GenXSimdCFConformance::lowerJoin(CallInst *Join) +{ + LLVM_DEBUG(dbgs() << "lowerJoin: " << *Join << "\n"); + const DebugLoc &DL = Join->getDebugLoc(); + Value *Results[2]; + auto EM = Join->getOperand(0); + auto RM = Join->getOperand(1); + // EM is always 32 bit. Extract SubEM, of the same width as RM, from it. + auto OldSubEM = truncateCond(EM, RM->getType(), EM->getName() + ".sub", + Join, DL); + // Or it with RM. + auto SubEM = BinaryOperator::Create(Instruction::Or, OldSubEM, RM, + Join->getName() + ".subEM", Join); + SubEM->setDebugLoc(DL); + // Insert that back into EM. That is result 0. 
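+  // As a scalar model of the mask algebra in lowerGoto/lowerJoin (a sketch
+  // only: each mask is modelled as one bit per lane in a plain integer, and
+  // the names are illustrative rather than taken from this pass):
+  //
+  //   void gotoModel(uint32_t &EM, uint32_t &RM, uint32_t Cond, bool &Branch) {
+  //     RM |= EM & ~Cond;    // NewRM = OldRM | (SubEM & ~Cond)
+  //     EM &= Cond;          // NewEM = SubEM & Cond
+  //     Branch = (EM == 0);  // BranchCond = !any(NewEM)
+  //   }
+  //   void joinModel(uint32_t &EM, uint32_t RM, bool &Branch) {
+  //     EM |= RM;            // NewEM = SubEM | RM
+  //     Branch = (EM == 0);  // BranchCond = !any(NewEM)
+  //   }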
+ Results[0] = EM = insertCond(EM, SubEM, Join->getName() + ".EM", Join, DL); + // Result 1: BranchCond = !any(SubEM) + Function *AnyFunc = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_any, + SubEM->getType()); + auto Any = CallInst::Create(AnyFunc, SubEM, + SubEM->getName() + ".any", Join); + Any->setDebugLoc(DL); + auto Not = BinaryOperator::Create(Instruction::Xor, Any, + Constant::getAllOnesValue(Any->getType()), + Any->getName() + ".not", Join); + Not->setDebugLoc(DL); + Results[1] = Not; + // Replace uses. + replaceGotoJoinUses(Join, Results); + Join->eraseFromParent(); + Modified = true; +} + +/*********************************************************************** + * replaceGotoJoinUses : replace uses of goto/join + * + * The goto and join intrinsics have multiple return values in a struct. + * This attempts to find the extractvalues and replace those directly. + * It also spots where a value is unused. + */ +void GenXSimdCFConformance::replaceGotoJoinUses(CallInst *GotoJoin, + ArrayRef Vals) +{ + SmallVector Extracts; + for (auto ui = GotoJoin->use_begin(), ue = GotoJoin->use_end(); + ui != ue; ++ui) { + auto Extract = dyn_cast(ui->getUser()); + if (Extract) + Extracts.push_back(Extract); + } + for (auto ei = Extracts.begin(), ee = Extracts.end(); ei != ee; ++ei) { + auto Extract = *ei; + unsigned Index = Extract->getIndices()[0]; + if (Index >= Vals.size()) + continue; + Extract->replaceAllUsesWith(Vals[Index]); + Extract->eraseFromParent(); + } + if (!GotoJoin->use_empty()) { + // There are still some uses of the original goto/join. We need to + // aggregate the result values into a struct. + Value *StructVal = UndefValue::get(GotoJoin->getType()); + Instruction *InsertBefore = GotoJoin->getNextNode(); + for (unsigned Index = 0, + End = cast(GotoJoin->getType())->getNumElements(); + Index != End; ++Index) + StructVal = InsertValueInst::Create(StructVal, Vals[Index], + Index, "", InsertBefore); + GotoJoin->replaceAllUsesWith(StructVal); + } else { + // Remove code for unused value. This is particularly useful at an outer + // join, where the !any(NewEM) is unused, so we don't need to compute it. + for (unsigned vi = 0; vi != Vals.size(); ++vi) { + Value *V = Vals[vi]; + while (V && V->use_empty()) { + auto I = dyn_cast(V); + if (I == nullptr) + continue; + unsigned NumOperands = I->getNumOperands(); + if (auto CI = dyn_cast(I)) + NumOperands = CI->getNumArgOperands(); + V = nullptr; + if (NumOperands == 1) + V = I->getOperand(0); + I->eraseFromParent(); + } + } + } +} + +/*********************************************************************** + * setCategories : set webs of EM and RM values to category EM or RM + * + * This also modifies EM uses as needed. + */ +void GenXLateSimdCFConformance::setCategories() +{ + // First the EM values. + for (auto ei = EMVals.begin(); ei != EMVals.end(); /* empty */) { + SimpleValue EMVal = *ei; + ei++; + // For this EM value, set its category and modify its uses. + Liveness->getOrCreateLiveRange(EMVal)->setCategory(RegCategory::EM); + LLVM_DEBUG(dbgs() << "Set category for:\n" << *EMVal.getValue() << "\n"); + if (!isa(EMVal.getValue()->getType())) + modifyEMUses(EMVal.getValue()); + switch (GenXIntrinsic::getGenXIntrinsicID(EMVal.getValue())) { + case GenXIntrinsic::genx_simdcf_join: { + // For a join, set the category of each RM value. 
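+      // (EM and RM webs get their own register categories so that the later
+      // passes -- coalescing, liveness and vISA register allocation -- keep
+      // them in the hardware execution-mask/resume-mask state rather than
+      // treating them as ordinary general values.)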
+ auto RMValsEntry = &RMVals[cast(EMVal.getValue())]; + for (auto vi = RMValsEntry->begin(), ve = RMValsEntry->end(); vi != ve; ++vi) { + SimpleValue RMVal = *vi; + // For this RM value, set its category. + Liveness->getOrCreateLiveRange(RMVal)->setCategory(RegCategory::RM); + } + } + // Fall through... + case GenXIntrinsic::genx_simdcf_goto: { + // See if this is a branching goto/join where the "true" successor is + // an empty critical edge splitter block. + auto CI = cast(EMVal.getValue()); + BasicBlock *BB = CI->getParent(); + if (GotoJoin::isBranchingGotoJoinBlock(BB) == CI) { + BasicBlock *TrueSucc = BB->getTerminator()->getSuccessor(0); + if (BasicBlock *TrueSuccSucc + = getEmptyCriticalEdgeSplitterSuccessor(TrueSucc)) { + for (auto i = TrueSucc->begin(); i != TrueSucc->end(); /*empty*/) { + Instruction *Inst = &*i++; + auto Phi = dyn_cast(Inst); + if (!Phi) + break; + if (Phi->getNumIncomingValues() == 1) { + Phi->replaceAllUsesWith(Phi->getIncomingValue(0)); + Liveness->eraseLiveRange(Phi); + removeFromEMRMVals(Phi); + Phi->eraseFromParent(); + } + } + // now BB should be truely empty + assert(TrueSucc->front().isTerminator() && + "BB is not empty for removal"); + // For a branching goto/join where the "true" successor is an empty + // critical edge splitter block, remove the empty block, to ensure + // that the "true" successor is a join label. + // Adjust phi nodes in TrueSuccSucc. + adjustPhiNodesForBlockRemoval(TrueSuccSucc, TrueSucc); + // Replace the use (we know there is only the one). + BB->getTerminator()->setSuccessor(0, TrueSuccSucc); + // Erase the critical edge splitter block. + TrueSucc->eraseFromParent(); + Modified = true; + } + } + break; + } + default: + break; + } + } +} + +/*********************************************************************** + * modifyEMUses : modify EM uses as needed + */ +void GenXLateSimdCFConformance::modifyEMUses(Value *EM) +{ + LLVM_DEBUG(dbgs() << "modifyEMUses: " << EM->getName() << "\n"); + // Gather the selects we need to modify, at the same time as handling other + // uses of the EM values. + SmallVector Selects; + SmallVector EMs; + EMs.push_back(EM); + for (unsigned ei = 0; ei != EMs.size(); ++ei) { + EM = EMs[ei]; + // Scan EM's uses. + for (auto ui = EM->use_begin(), ue = EM->use_end(); ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (auto Sel = dyn_cast(User)) { + assert(!ui->getOperandNo()); + Selects.push_back(Sel); + } else switch (GenXIntrinsic::getAnyIntrinsicID(User)) { + case GenXIntrinsic::genx_rdpredregion: + // An rdpredregion of the EM. Find its uses in select too. + EMs.push_back(User); + break; +#ifndef NDEBUG + case GenXIntrinsic::genx_simdcf_goto: + case GenXIntrinsic::genx_simdcf_join: + case GenXIntrinsic::genx_simdcf_get_em: + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + assert(ui->getOperandNo() == GenXIntrinsic::GenXRegion::PredicateOperandNum); + break; + case GenXIntrinsic::genx_wrpredpredregion: + break; + default: + if (isa(User) || isa(User) + || isa(User)) + break; + assert(!cast(User)->getCalledFunction()->doesNotAccessMemory() + && "unexpected ALU intrinsic use of EM"); + break; + case GenXIntrinsic::not_any_intrinsic: + assert((isa(User) || isa(User) || + isa(User) || isa(User) || + isa(User)) && + "unexpected use of EM"); +#endif + } + } + } + // Modify each select into a predicated wrregion. 
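+  // Roughly, for each gathered select the loop below turns
+  //     %res = select <N x i1> %EM, <N x T> %true, <N x T> %false
+  // into a predicated wrregion "%false with the %EM-enabled lanes overwritten
+  // by %true", so the EM use becomes a legal predicate operand (the vector
+  // width N and the value names here are illustrative).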
+ for (auto si = Selects.begin(), se = Selects.end(); si != se; ++si) { + auto Sel = *si; + Value *FalseVal = Sel->getFalseValue(); + if (auto C = dyn_cast(FalseVal)) { + if (!isa(C)) { + // The false value needs loading if it is a constant other than + // undef. + SmallVector AddedInstructions; + FalseVal = ConstantLoader(C, nullptr, &AddedInstructions).loadBig(Sel); + // ConstantLoader generated at least one instruction. Ensure that + // each one has debug loc and category. + for (auto aii = AddedInstructions.begin(), aie = AddedInstructions.end(); + aii != aie; ++aii) { + Instruction *I = *aii; + I->setDebugLoc(Sel->getDebugLoc()); + } + } + } + Region R(Sel); + R.Mask = Sel->getCondition(); + assert(FalseVal); + Value *Wr = R.createWrRegion(FalseVal, Sel->getTrueValue(), + Sel->getName(), Sel, Sel->getDebugLoc()); + Sel->replaceAllUsesWith(Wr); + Liveness->eraseLiveRange(Sel); + Sel->eraseFromParent(); + Modified = true; + } +} + +/*********************************************************************** + * GotoJoinEVs::GotoJoinEVs : collects and handle EVs. See CollectEVs + * for more info. + */ +GenXSimdCFConformance::GotoJoinEVs::GotoJoinEVs(Value* GJ) { + GotoJoin = GJ; + + if (!GotoJoin) + return; + + switch (GenXIntrinsic::getGenXIntrinsicID(GotoJoin)) { + case GenXIntrinsic::genx_simdcf_goto: + IsGoto = true; + break; + case GenXIntrinsic::genx_simdcf_join: + IsGoto = false; + break; + default: + assert(false && "Expected goto or join!"); + break; + } + + CollectEVs(); +} + +/*********************************************************************** + * GotoJoinEVs::getEMEV : get EV for goto/join Execution Mask + */ +ExtractValueInst *GenXSimdCFConformance::GotoJoinEVs::getEMEV() const { + assert(GotoJoin && "Uninitialized GotoJoinEVs Data!"); + return EVs[EMPos]; +} + +/*********************************************************************** + * GotoJoinEVs::getRMEV : get EV for goto/join Resume Mask + */ +ExtractValueInst *GenXSimdCFConformance::GotoJoinEVs::getRMEV() const { + assert(GotoJoin && "Uninitialized GotoJoinEVs Data!"); + assert(IsGoto && "Only goto returns RM!"); + return EVs[RMPos]; +} + +/*********************************************************************** + * GotoJoinEVs::getCondEV : get EV for goto/join condition + */ +ExtractValueInst *GenXSimdCFConformance::GotoJoinEVs::getCondEV() const { + assert(GotoJoin && "Uninitialized GotoJoinEVs Data!"); + return IsGoto ? EVs[GotoCondPos] : EVs[JoinCondPos]; +} + +Value *GenXSimdCFConformance::GotoJoinEVs::getGotoJoin() const { + assert(GotoJoin && "Uninitialized GotoJoinEVs Data!"); + return GotoJoin; +} + +/*********************************************************************** + * GotoJoinEVs::getSplitPoint : find first instruction that is not + * a EV or doesn't use Goto/Join. Such instruction always exists + * in a correct IR - BB terminator is a such instruction. + */ + Instruction *GenXSimdCFConformance::GotoJoinEVs::getSplitPoint() const { + assert(GotoJoin && "Uninitialized GotoJoinEVs Data!"); + Instruction *SplitPoint = cast(GotoJoin)->getNextNode(); + for (; isa(SplitPoint) && SplitPoint->getOperand(0) == GotoJoin; + SplitPoint = SplitPoint->getNextNode()); + return SplitPoint; + } + +/*********************************************************************** + * GotoJoinEVs::setCondEV : set EV for goto/join condition. It is + * needed on basic block splitting to handle bad Cond EV user. 
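+ *
+ * (For reference: the EV positions used throughout GotoJoinEVs mirror the
+ * return-struct layout of the intrinsics -- goto returns {EM, RM, BranchCond},
+ * giving EMPos = 0, RMPos = 1, GotoCondPos = 2, while join returns
+ * {EM, BranchCond}, so JoinCondPos shares index 1 with RMPos.)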
+ */
+void GenXSimdCFConformance::GotoJoinEVs::setCondEV(ExtractValueInst *CondEV) {
+  assert(GotoJoin && "Uninitialized GotoJoinEVs Data!");
+  assert(!getCondEV() && "CondEV is already set!");
+  if (IsGoto)
+    EVs[GotoCondPos] = CondEV;
+  else
+    EVs[JoinCondPos] = CondEV;
+}
+
+/***********************************************************************
+ * GotoJoinEVs::isGoto : check whether this EVs info belongs to a goto
+ */
+bool GenXSimdCFConformance::GotoJoinEVs::isGoto() const {
+  assert(GotoJoin && "Uninitialized GotoJoinEVs Data!");
+  return IsGoto;
+}
+
+/***********************************************************************
+ * GotoJoinEVs::isJoin : check whether this EVs info belongs to a join
+ */
+bool GenXSimdCFConformance::GotoJoinEVs::isJoin() const {
+  assert(GotoJoin && "Uninitialized GotoJoinEVs Data!");
+  return !IsGoto;
+}
+
+/***********************************************************************
+ * GotoJoinEVs::CollectEVs : handle and store goto/join EVs
+ *
+ * This does the following steps:
+ *  - Locate EVs. If a duplicate is found, just replace its users.
+ *  - Move EVs right after the goto/join.
+ *  - Add missing EM and RM EVs. This is needed for correct live range
+ *    interference analysis.
+ */
+void GenXSimdCFConformance::GotoJoinEVs::CollectEVs() {
+  assert(GotoJoin && "Uninitialized GotoJoinEVs Data!");
+  assert((GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_goto ||
+          GenXIntrinsic::getGenXIntrinsicID(GotoJoin) == GenXIntrinsic::genx_simdcf_join) &&
+         "Expected goto or join!");
+
+  auto GotoJoinInst = dyn_cast<Instruction>(GotoJoin);
+
+  // Collect EVs, hoist them, resolve duplications.
+  for (auto ui = GotoJoin->use_begin(), ue = GotoJoin->use_end(); ui != ue;) {
+
+    auto EV = dyn_cast<ExtractValueInst>(ui->getUser());
+    ++ui;
+
+    assert(EV && "Bad user of goto/join!");
+    assert(EV->getNumIndices() == 1 && "Expected 1 index in ExtractValue for goto/join!");
+
+    unsigned idx = EV->getIndices()[0];
+#ifndef NDEBUG
+    switch (idx) {
+    case EMPos:
+    case RMPos: // same as JoinCondPos
+      break;
+    case GotoCondPos:
+      if (IsGoto)
+        break;
+      // The cond index is only valid for goto: fall through to the assert.
+    default:
+      assert(false && "Bad index in ExtractValue for goto/join!");
+      break;
+    }
+#endif
+
+    LLVM_DEBUG(dbgs() << "Found EV:\n" << *EV << "\n");
+    if (EVs[idx]) {
+      LLVM_DEBUG(dbgs() << "Duplication: replacing users with:\n" << *EVs[idx] << "\n");
+      EV->replaceAllUsesWith(EVs[idx]);
+      EV->eraseFromParent();
+    } else {
+      LLVM_DEBUG(dbgs() << "Saving it.\n");
+      EVs[idx] = EV;
+    }
+  }
+
+  // Add missing EVs for masks
+  for (unsigned idx = 0, end = IsGoto ?
RMPos : EMPos; idx <= end; ++idx) { + if (EVs[idx]) + continue; + + std::string Name = "missing"; + switch (idx) { + case EMPos: + Name += "EMEV"; + break; + case RMPos: + Name += "RMEV"; + break; + case GotoCondPos: + Name += "CondEV"; + break; + } + + auto EV = ExtractValueInst::Create(GotoJoin, { idx }, Name, GotoJoinInst->getParent()); + EVs[idx] = EV; + } + + hoistEVs(); +} + +/*********************************************************************** + * GotoJoinEVs::hoistEVs : move EVs right after goto/join + */ +void GenXSimdCFConformance::GotoJoinEVs::hoistEVs() const{ + assert(GotoJoin && "Uninitialized GotoJoinEVs Data!"); + + LLVM_DEBUG(dbgs() << "Moving EV users after:\n" << *GotoJoin << "\n"); + + for (unsigned idx = 0, num = PosNum; idx < num; ++idx) { + if (EVs[idx]) + EVs[idx]->moveAfter(dyn_cast(GotoJoin)); + } +} + +/*********************************************************************** + * DiagnosticInfoSimdCF::emit : emit an error or warning + */ +void DiagnosticInfoSimdCF::emit(Instruction *Inst, StringRef Msg, + DiagnosticSeverity Severity) +{ + DiagnosticInfoSimdCF Err(Severity, *Inst->getParent()->getParent(), + Inst->getDebugLoc(), Msg); + Inst->getContext().diagnose(Err); +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.cpp new file mode 100644 index 000000000000..33454d6780d7 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.cpp @@ -0,0 +1,145 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file implements the GenX specific subclass of TargetSubtargetInfo. 
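+// The subtarget variant is picked from the CPU name handed in by the driver
+// (for example "SKL" or "TGLLP"): resetSubtargetFeatures() maps the name onto
+// a GenXTag via a StringSwitch, falling back to GENX_SKL for unknown names,
+// and then lets the tablegen-generated ParseSubtargetFeatures() apply the
+// feature string.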
+// +//===----------------------------------------------------------------------===// + +#include "GenXSubtarget.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; + +#define DEBUG_TYPE "subtarget" + +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#define GET_SUBTARGETINFO_MC_DESC +#include "GenXGenSubtargetInfo.inc" + +static cl::opt + StackScratchMem("stack-scratch-mem", + cl::desc("Specify what surface should be used for stack"), + cl::init(true)); +static cl::opt StackMemSize("stack-mem-size", + cl::desc("Available space for stack"), + cl::init(8 * 1024)); + +void GenXSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) { + + DumpRegAlloc = false; + EmitCisa = false; + HasLongLong = false; + DisableJmpi = false; + DisableVectorDecomposition = false; + WarnCallable = false; + OCLRuntime = false; + + if (StackScratchMem) + StackSurf = PreDefined_Surface::PREDEFINED_SURFACE_T255; + else + StackSurf = PreDefined_Surface::PREDEFINED_SURFACE_STACK; + StackSurfMaxSize = StackMemSize; + UseGlobalMem = false; + + GenXVariant = llvm::StringSwitch(CPU) + .Case("HSW", GENX_HSW) + .Case("BDW", GENX_BDW) + .Case("CHV", GENX_CHV) + .Case("SKL", GENX_SKL) + .Case("BXT", GENX_BXT) + .Case("KBL", GENX_KBL) + .Case("GLK", GENX_GLK) + .Case("CNL", GENX_CNL) + .Case("ICLLP", GENX_ICLLP) + .Case("TGLLP", GENX_TGLLP) + .Default(GENX_SKL); + + std::string CPUName = CPU; + if (CPUName.empty()) + CPUName = "generic"; + + ParseSubtargetFeatures(CPUName, FS); +} + +GenXSubtarget::GenXSubtarget(const Triple &TT, const std::string &CPU, + const std::string &FS) + : GenXGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT) { + + resetSubtargetFeatures(CPU, FS); +} + +StringRef GenXSubtarget::getEmulateFunction(const Instruction *Inst) const { + StringRef EmuFnName; + if (emulateIDivRem()) { + unsigned Opcode = Inst->getOpcode(); + switch (Opcode) { + default: + break; + case BinaryOperator::SDiv: + EmuFnName = "__cm_intrinsic_impl_sdiv"; + break; + case BinaryOperator::SRem: + EmuFnName = "__cm_intrinsic_impl_srem"; + break; + case BinaryOperator::UDiv: + EmuFnName = "__cm_intrinsic_impl_udiv"; + break; + case BinaryOperator::URem: + EmuFnName = "__cm_intrinsic_impl_urem"; + break; + } + } + return EmuFnName; +} + +GenXSubtargetPass::GenXSubtargetPass() : ImmutablePass(ID), ST(nullptr) {} +GenXSubtargetPass::GenXSubtargetPass(GenXSubtarget &ST) + : ImmutablePass(ID), ST(&ST) {} +GenXSubtargetPass::~GenXSubtargetPass() {} + +char GenXSubtargetPass::ID = 0; + +namespace llvm { + +void initializeGenXSubtargetPassPass(PassRegistry &); + +ImmutablePass *createGenXSubtargetPass(GenXSubtarget &ST) { + initializeGenXSubtargetPassPass(*PassRegistry::getPassRegistry()); + return new GenXSubtargetPass(ST); +} + +} // namespace llvm + +INITIALIZE_PASS_BEGIN(GenXSubtargetPass, "GenXSubtargetPass", "GenXSubtargetPass", false, true) +INITIALIZE_PASS_END(GenXSubtargetPass, "GenXSubtargetPass", "GenXSubtargetPass", false, true) diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.h new file mode 100644 index 000000000000..2f7b158faa8d --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXSubtarget.h @@ -0,0 +1,293 @@ +/*===================== begin_copyright_notice 
================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXSubtarget : subtarget information +/// ------------------------------------- +/// +/// GenXSubtarget is the GenX-specific subclass of TargetSubtargetInfo. It takes +/// features detected by the front end (what the Gen architecture is), +/// and exposes flags to the rest of the GenX backend for +/// various features (e.g. whether 64 bit operations are supported). +/// +/// Where subtarget features are used is noted in the documentation of GenX +/// backend passes. +/// +/// The flags exposed to the rest of the GenX backend are as follows. Most of +/// these are currently not used. +/// +//===----------------------------------------------------------------------===// + +#ifndef GENXSUBTARGET_H +#define GENXSUBTARGET_H + +#include "llvm/ADT/StringSwitch.h" +#include "llvm/ADT/Triple.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/Pass.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "visa_igc_common_header.h" +#include + +#define GET_SUBTARGETINFO_HEADER +#define GET_SUBTARGETINFO_ENUM +#include "GenXGenSubtargetInfo.inc" + +namespace llvm { +class GlobalValue; +class Instruction; +class StringRef; +class TargetMachine; + +class GenXSubtarget final : public GenXGenSubtargetInfo { + +protected: + // TargetTriple - What processor and OS we're targeting. + Triple TargetTriple; + + enum GenXTag { + GENX_GENERIC, + GENX_HSW, + GENX_BDW, + GENX_CHV, + GENX_SKL, + GENX_BXT, + GENX_KBL, + GENX_GLK, + GENX_CNL, + GENX_ICLLP, + GENX_TGLLP, + }; + + // GenXVariant - GenX Tag identifying the variant to compile for + GenXTag GenXVariant; + +private: + // DumpRegAlloc - True if we should dump register allocation information + bool DumpRegAlloc; + + // EmitCisa Builder - True if we should generate CISA instead of VISA + bool EmitCisa; + + // HasLongLong - True if subtarget supports long long type + bool HasLongLong; + + // DisableJmpi - True if jmpi is disabled. + bool DisableJmpi; + + // DisableVectorDecomposition - True if vector decomposition is disabled. + bool DisableVectorDecomposition; + + // Only generate warning when callable is used in the middle of the kernel + bool WarnCallable; + + // True if codegenerating for OCL runtime. 
+ bool OCLRuntime; + + // Shows which surface should we use for stack + PreDefined_Surface StackSurf; + // Limit in bytes for stack purposes + unsigned StackSurfMaxSize; + + bool UseGlobalMem; + +public: + // This constructor initializes the data members to match that + // of the specified triple. + // + GenXSubtarget(const Triple &TT, const std::string &CPU, + const std::string &FS); + + // hasLongLong - true for Gen8+ + bool hasLongLong() { return HasLongLong; } + + unsigned getGRFWidth() const { return 32; } + + bool isOCLRuntime() const { return OCLRuntime; } + + // ParseSubtargetFeatures - Parses features string setting specified + // subtarget options. Definition of function is auto generated by tblgen. + void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + + // \brief Reset the features for the GenX target. + void resetSubtargetFeatures(StringRef CPU, StringRef FS); + +public: + + /// * isHSW - true if target is HSW + bool isHSW() const { return GenXVariant == GENX_HSW; } + + /// * isBDW - true if target is BDW + bool isBDW() const { return GenXVariant == GENX_BDW; } + + /// * isBDWplus - true if target is BDW or later + bool isBDWplus() const { return GenXVariant >= GENX_BDW; } + + /// * isCHV - true if target is CHV + bool isCHV() const { return GenXVariant == GENX_CHV; } + + /// * isSKL - true if target is SKL + bool isSKL() const { return GenXVariant == GENX_SKL; } + + /// * isSKLplus - true if target is SKL or later + bool isSKLplus() const { return GenXVariant >= GENX_SKL; } + + /// * isBXT - true if target is BXT + bool isBXT() const { return GenXVariant == GENX_BXT; } + + + /// * isKBL - true if target is KBL + bool isKBL() const { return GenXVariant == GENX_KBL; } + + /// * isGLK - true if target is GLK + bool isGLK() const { return GenXVariant == GENX_GLK; } + + /// * isCNL - true if target is CNL + bool isCNL() const { return GenXVariant == GENX_CNL; } + + /// * isCNLplus - true if target is CNL or later + bool isCNLplus() const { return GenXVariant >= GENX_CNL; } + + /// * isICLLP - true if target is ICL LP + bool isICLLP() const { return GenXVariant == GENX_ICLLP; } + /// * isTGLLP - true if target is TGL LP + bool isTGLLP() const { return GenXVariant == GENX_TGLLP; } + + /// * emulateIDivRem - true if emulates integer division and reminder. + bool emulateIDivRem() const { return GenXVariant >= GENX_TGLLP; } + + /// * dumpRegAlloc - true if we should dump Reg Alloc info + bool dumpRegAlloc() const { return DumpRegAlloc; } + + /// * hasLongLong - true if target supports long long + bool hasLongLong() const { return HasLongLong; } + + /// * disableJmpi - true if jmpi is disabled. + bool disableJmpi() const { return DisableJmpi; } + + /// * WaNoA32ByteScatteredStatelessMessages - true if there is no A32 byte + /// scatter stateless message. + bool WaNoA32ByteScatteredStatelessMessages() const { return !isCNLplus(); } + + /// * disableVectorDecomposition - true if vector decomposition is disabled. 
+ bool disableVectorDecomposition() const { return DisableVectorDecomposition; } + + /// * warnCallable() - true if compiler only generate warning for + /// callable in the middle + bool warnCallable() const { return WarnCallable; } + + /// * hasIndirectGRFCrossing - true if target supports an indirect region + /// crossing one GRF boundary + bool hasIndirectGRFCrossing() const { return isSKLplus(); } + + bool useGlobalMem() const { return UseGlobalMem; } + + void setUseGlobalMem() { + assert(hasLongLong() && isOCLRuntime() && + "Global mem stack can't be used on 32-bit targets or on CMRT"); + UseGlobalMem = true; + } + + /// * getEmulateFunction - return the corresponding emulation function name, + /// empty string if no emulation is needed. + StringRef getEmulateFunction(const Instruction *Inst) const; + + // Generic helper functions... + const Triple &getTargetTriple() const { return TargetTriple; } + + bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } + bool isTargetLinux() const { return TargetTriple.isOSLinux(); } + + bool isTargetWindowsMSVC() const { + return TargetTriple.isWindowsMSVCEnvironment(); + } + + bool isTargetKnownWindowsMSVC() const { + return TargetTriple.isKnownWindowsMSVCEnvironment(); + } + + bool isTargetWindowsCygwin() const { + return TargetTriple.isWindowsCygwinEnvironment(); + } + + bool isTargetWindowsGNU() const { + return TargetTriple.isWindowsGNUEnvironment(); + } + + bool isTargetCygMing() const { return TargetTriple.isOSCygMing(); } + + bool isOSWindows() const { return TargetTriple.isOSWindows(); } + + TARGET_PLATFORM getVisaPlatform() const { + switch (GenXVariant) { + case GENX_BDW: + return TARGET_PLATFORM::GENX_BDW; + case GENX_CHV: + return TARGET_PLATFORM::GENX_CHV; + case GENX_SKL: + return TARGET_PLATFORM::GENX_SKL; + case GENX_BXT: + return TARGET_PLATFORM::GENX_BXT; + case GENX_CNL: + return TARGET_PLATFORM::GENX_CNL; + case GENX_ICLLP: + return TARGET_PLATFORM::GENX_ICLLP; + case GENX_TGLLP: + return TARGET_PLATFORM::GENX_TGLLP; + // TODO: Unfortunately, the finalizer doesn't support all platforms, so we + // map any unsupported platforms to the most appropriate supported one. + // See also getFinalizerPlatform function in GenX.cpp + case GENX_KBL: + return TARGET_PLATFORM::GENX_SKL; + case GENX_GLK: + return TARGET_PLATFORM::GENX_BXT; + default: + return TARGET_PLATFORM::GENX_NONE; + } + } + + /// * stackSurface - return a surface that should be used for stack. + PreDefined_Surface stackSurface() const { return StackSurf; } + + /// * stackSurfaceMaxSize - return available space in bytes for stack + /// purposes. 
+ unsigned stackSurfaceMaxSize() const { return StackSurfMaxSize; } +}; + +class GenXSubtargetPass : public ImmutablePass { + GenXSubtarget *ST; +public: + GenXSubtargetPass(); + GenXSubtargetPass(GenXSubtarget &ST); + ~GenXSubtargetPass(); + GenXSubtarget *getSubtarget() { return ST; } + static char ID; +}; + +ImmutablePass *createGenXSubtargetPass(GenXSubtarget &ST); + +} // End llvm namespace + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.cpp new file mode 100644 index 000000000000..76cc1f1b2054 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.cpp @@ -0,0 +1,546 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file defines the GenX specific subclass of TargetMachine. +// +/// Non-pass classes +/// ================ +/// +/// This section documents some GenX backend classes and abstractions that are not +/// in themselves passes, but are used by the passes. +/// +/// .. include:: GenXAlignmentInfo.h +/// +/// .. include:: GenXRegion.h +/// +/// .. include:: GenXSubtarget.h +/// +/// Pass documentation +/// ================== +/// +/// The GenX backend runs the following passes on LLVM IR: +/// +/// .. 
contents:: +/// :local: +/// :depth: 1 +/// +// +//===----------------------------------------------------------------------===// + +#include "GenXTargetMachine.h" +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXModule.h" +#include "GenXOCLRuntimeInfo.h" +#include "vc/GenXOpts/GenXOpts.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/AlwaysInliner.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" + +using namespace llvm; + +static cl::opt DumpRegAlloc("genx-dump-regalloc", cl::init(false), cl::Hidden, + cl::desc("Enable dumping of GenX liveness and register allocation to a file.")); + +static cl::opt EmitVLoadStore( + "genx-emit-vldst", cl::init(true), cl::Hidden, + cl::desc("Emit load/store intrinsic calls for pass-by-ref arguments")); + +// There's another copy of DL string in clang/lib/Basic/Targets.cpp +static std::string getDL(bool Is64Bit) { + return Is64Bit ? "e-p:64:64-i64:64-n8:16:32" : "e-p:32:32-i64:64-n8:16:32"; +} + +namespace llvm { +//===----------------------------------------------------------------------===// +// This function is required to add GenX passes to opt tool +//===----------------------------------------------------------------------===// +void initializeGenXPasses(PassRegistry ®istry) { + initializeFunctionGroupAnalysisPass(registry); + initializeGenXAddressCommoningPass(registry); + initializeGenXArgIndirectionPass(registry); + initializeGenXCategoryPass(registry); + initializeGenXCFSimplificationPass(registry); + initializeGenXCisaBuilderPass(registry); + initializeGenXCoalescingPass(registry); + initializeGenXDeadVectorRemovalPass(registry); + initializeGenXDepressurizerPass(registry); + initializeGenXEarlySimdCFConformancePass(registry); + initializeGenXEmulatePass(registry); + initializeGenXExtractVectorizerPass(registry); + initializeGenXFuncBalingPass(registry); + initializeGenXGEPLoweringPass(registry); + initializeGenXGroupBalingPass(registry); + initializeGenXIMadPostLegalizationPass(registry); + initializeGenXLateSimdCFConformancePass(registry); + initializeGenXLayoutBlocksPass(registry); + initializeGenXLegalizationPass(registry); + initializeGenXLiveRangesPass(registry); + initializeGenXLivenessPass(registry); + initializeGenXLivenessPass(registry); + initializeGenXLowerAggrCopiesPass(registry); + initializeGenXLoweringPass(registry); + initializeGenXModulePass(registry); + initializeGenXNumberingPass(registry); + initializeGenXPatternMatchPass(registry); + initializeGenXPostLegalizationPass(registry); + initializeGenXPromotePredicatePass(registry); + initializeGenXRawSendRipperPass(registry); + initializeGenXReduceIntSizePass(registry); + initializeGenXRegionCollapsingPass(registry); + initializeGenXRematerializationPass(registry); + initializeGenXSubtargetPassPass(registry); + initializeGenXThreadPrivateMemoryPass(registry); + initializeGenXUnbalingPass(registry); + initializeGenXVisaRegAllocPass(registry); + initializeTransformPrivMemPass(registry); + 
initializeGenXFunctionPointersLoweringPass(registry); + + // WRITE HERE MORE PASSES IF IT'S NEEDED; +} + +TargetTransformInfo GenXTargetMachine::getTargetTransformInfo(const Function &F) { + GenXTTIImpl GTTI(F.getParent()->getDataLayout()); + return TargetTransformInfo(GTTI); +} + +} // namespace llvm + +GenXTargetMachine::GenXTargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Optional RM, + Optional CM, + CodeGenOpt::Level OL, bool Is64Bit) + : IGCLLVM::TargetMachine(T, getDL(Is64Bit), TT, CPU, FS, Options), + Is64Bit(Is64Bit), Subtarget(TT, CPU, FS) {} + +GenXTargetMachine::~GenXTargetMachine() = default; + +void GenXTargetMachine32::anchor() {} + +GenXTargetMachine32::GenXTargetMachine32(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Optional RM, + Optional CM, + CodeGenOpt::Level OL, bool JIT) + : GenXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} + +void GenXTargetMachine64::anchor() {} + +GenXTargetMachine64::GenXTargetMachine64(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Optional RM, + Optional CM, + CodeGenOpt::Level OL, bool JIT) + : GenXTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} + +//===----------------------------------------------------------------------===// +// External Interface declaration +//===----------------------------------------------------------------------===// +extern "C" void LLVMInitializeGenXTarget() { + // Register the target. + RegisterTargetMachine X(getTheGenXTarget32()); + RegisterTargetMachine Y(getTheGenXTarget64()); +} + +//===----------------------------------------------------------------------===// +// Pass Pipeline Configuration +//===----------------------------------------------------------------------===// + +bool GenXTargetMachine::addPassesToEmitFile(PassManagerBase &PM, + raw_pwrite_stream &o, + raw_pwrite_stream * pi, + CodeGenFileType FileType, + bool DisableVerify, + MachineModuleInfo *) { + // We can consider the .isa file to be an object file, or an assembly file + // which may later be converted to GenX code by the Finalizer. If we're + // asked to produce any other type of file return true to indicate an error. + if ((FileType != IGCLLVM::TargetMachine::CodeGenFileType::CGFT_ObjectFile) && + (FileType != IGCLLVM::TargetMachine::CodeGenFileType::CGFT_AssemblyFile)) + return true; + + // GenXSubtargetPass is a wrapper pass to query features or options. + // This adds it explicitly to allow passes access the subtarget object using + // method getAnalysisIfAvailable. + PM.add(createGenXSubtargetPass(Subtarget)); + + // Wrapper structure for collecting information related to OCL runtime. + // Can be used by external caller by adding extractor pass in the end + // of compilation pipeline. 
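+  // (With GenXSubtargetPass registered above, a later pass in this pipeline
+  // can query the subtarget roughly like this -- a sketch, with the variable
+  // names being illustrative only:
+  //   if (auto *SP = getAnalysisIfAvailable<GenXSubtargetPass>()) {
+  //     const GenXSubtarget *ST = SP->getSubtarget();
+  //     // ... use ST->hasLongLong(), ST->getGRFWidth(), etc.
+  //   }
+  // The OCL runtime info added below is likewise read back by an extractor
+  // pass that the caller appends after this pipeline.)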
+ if (Subtarget.isOCLRuntime()) + PM.add(new GenXOCLRuntimeInfo()); + + // Install GenX-specific TargetTransformInfo for passes such as + // LowerAggrCopies and InfoAddressSpace + PM.add(createTargetTransformInfoWrapperPass(getTargetIRAnalysis())); + + PM.add(createSROAPass()); + PM.add(createEarlyCSEPass()); + PM.add(createCFGSimplificationPass()); + PM.add(createInstructionCombiningPass()); + + PM.add(createGlobalDCEPass()); + PM.add(createGenXLowerAggrCopiesPass()); + PM.add(createInferAddressSpacesPass()); + PM.add(createTransformPrivMemPass()); + PM.add(createPromoteMemoryToRegisterPass()); + // All passes which modify the LLVM IR are now complete; run the verifier + // to ensure that the IR is valid. + if (!DisableVerify) + PM.add(createVerifierPass()); + // Run passes to generate vISA. + + /// .. include:: GenXGEPLowering.cpp + PM.add(createGenXGEPLoweringPass()); + PM.add(createGenXThreadPrivateMemoryPass()); + + /// BasicAliasAnalysis + /// ------------------ + /// This is a standard LLVM analysis pass to provide basic AliasAnalysis + /// support. + PM.add(createBasicAAWrapperPass()); + /// SROA + /// ---- + /// This is a standard LLVM pass, used at this point in the GenX backend. + /// Normally all alloca variables have been + /// removed by now by earlier LLVM passes, unless ``-O0`` was specified. + /// We run this pass here to cover that case. + /// + /// **IR restriction**: alloca, load, store not supported after this pass. + /// + PM.add(createSROAPass()); + + /// .. include:: GenXSwitchFixup.cpp + PM.add(createGenXInstCombineCleanup()); + + /// LowerSwitch + /// ----------- + /// This is a standard LLVM pass to lower a switch instruction to a chain of + /// conditional branches. + /// + /// **IR restriction**: switch not supported after this pass. + /// + // TODO: keep some switch instructions and lower them to JMPSWITCH vISA ops. + PM.add(createLowerSwitchPass()); + /// .. include:: GenXCFSimplification.cpp + PM.add(createGenXCFSimplificationPass()); + /// CFGSimplification + /// ----------------- + /// This is a standard LLVM pass, used at this point in the GenX backend. + /// + PM.add(createCFGSimplificationPass()); + /// .. include:: GenXInlineAsmLowering.cpp + PM.add(createGenXInlineAsmLoweringPass()); + /// .. include:: GenXReduceIntSize.cpp + PM.add(createGenXReduceIntSizePass()); + /// .. include:: GenXAggregatePseudoLowering.cpp + PM.add(createGenXAggregatePseudoLoweringPass()); + /// InstructionCombining + /// -------------------- + /// This is a standard LLVM pass, used at this point in the GenX backend. + /// + PM.add(createInstructionCombiningPass()); + // Run integer reduction again to revert some trunc/ext patterns transformed + // by instcombine. + PM.add(createGenXReduceIntSizePass()); + /// .. include:: GenXSimdCFConformance.cpp + PM.add(createGenXEarlySimdCFConformancePass()); + /// .. include:: GenXPromotePredicate.cpp + PM.add(createGenXPromotePredicatePass()); + // Run GEP lowering again to remove possible GEPs after instcombine. + PM.add(createGenXGEPLoweringPass()); + /// .. include:: GenXLowering.cpp + PM.add(createGenXLoweringPass()); + if (!DisableVerify) PM.add(createVerifierPass()); + PM.add(createGenXFunctionPointersLoweringPass()); + /// .. include:: GenXRegionCollapsing.cpp + PM.add(createGenXRegionCollapsingPass()); + /// EarlyCSE + /// -------- + /// This is a standard LLVM pass, run at this point in the GenX backend. 
+ /// It commons up common subexpressions, but only in the case that two common + /// subexpressions are related by one dominating the other. + /// + PM.add(createEarlyCSEPass()); + /// BreakCriticalEdges + /// ------------------ + /// In the control flow graph, a critical edge is one from a basic block with + /// multiple successors (a conditional branch) to a basic block with multiple + /// predecessors. + /// + /// We use this standard LLVM pass to split such edges, to ensure that + /// constant loader and GenXCoalescing have somewhere to insert a phi copy if + /// needed. + /// + PM.add(createBreakCriticalEdgesPass()); + /// .. include:: GenXPatternMatch.cpp + PM.add(createGenXPatternMatchPass(&Options)); + if (!DisableVerify) PM.add(createVerifierPass()); + /// .. include:: GenXExtractVectorizer.cpp + PM.add(createGenXExtractVectorizerPass()); + /// .. include:: GenXRawSendRipper.cpp + PM.add(createGenXRawSendRipperPass()); + /// DeadCodeElimination + /// ------------------- + /// This is a standard LLVM pass, run at this point in the GenX backend. It + /// removes code that has been made dead by other passes. + /// + PM.add(createDeadCodeEliminationPass()); + /// .. include:: GenXBaling.h + PM.add(createGenXFuncBalingPass(BalingKind::BK_Legalization, &Subtarget)); + /// .. include:: GenXLegalization.cpp + PM.add(createGenXLegalizationPass()); + /// .. include:: GenXEmulate.cpp + PM.add(createGenXEmulatePass()); + /// .. include:: GenXDeadVectorRemoval.cpp + PM.add(createGenXDeadVectorRemovalPass()); + /// DeadCodeElimination + /// ------------------- + /// This is a standard LLVM pass, run at this point in the GenX backend. It + /// removes code that has been made dead by other passes. + /// + PM.add(createDeadCodeEliminationPass()); + /// .. include:: GenXPostLegalization.cpp + /// .. include:: GenXConstants.cpp + /// .. include:: GenXVectorDecomposer.h + PM.add(createGenXPostLegalizationPass()); + if (!DisableVerify) PM.add(createVerifierPass()); + /// EarlyCSE + /// -------- + /// This is a standard LLVM pass, run at this point in the GenX backend. + /// It commons up common subexpressions, but only in the case that two common + /// subexpressions are related by one dominating the other. + /// + PM.add(createEarlyCSEPass()); + /// LICM + /// ---- + /// This is a standard LLVM pass to hoist/sink the loop invariant code after + /// legalization. + PM.add(createLICMPass()); + /// DeadCodeElimination + /// ------------------- + /// This is a standard LLVM pass, run at this point in the GenX backend. It + /// removes code that has been made dead by other passes. + /// + PM.add(createDeadCodeEliminationPass()); + PM.add(createGenXIMadPostLegalizationPass()); + /// GlobalDCE + /// --------- + /// This is a standard LLVM pass, run at this point in the GenX backend. It + /// eliminates unreachable internal globals. + /// + PM.add(createGlobalDCEPass()); + /// .. include:: FunctionGroup.h + /// .. include:: GenXModule.h + PM.add(createGenXModulePass()); + /// .. include:: GenXLiveness.h + PM.add(createGenXLivenessPass()); + PM.add(createGenXGroupBalingPass(BalingKind::BK_Analysis, &Subtarget)); + PM.add(createGenXNumberingPass()); + PM.add(createGenXLiveRangesPass()); + /// .. include:: GenXRematerialization.cpp + PM.add(createGenXRematerializationPass()); + /// .. 
include:: GenXCategory.cpp + PM.add(createGenXCategoryPass()); + /// Late SIMD CF conformance pass + /// ----------------------------- + /// This is the same pass as GenXSimdCFConformance above, but run in a + /// slightly different way. See above. + /// + /// **IR restriction**: After this pass, the EM values must have EM register + /// category. The RM values must have RM register category. The !any result of + /// a goto/join must have NONE register category. + /// + PM.add(createGenXLateSimdCFConformancePass()); + /// CodeGen baling pass + /// ------------------- + /// This is the same pass as GenXBaling above, but run in a slightly different + /// way. See above. + /// + /// **IR restriction**: Any pass after this needs to be careful when modifying + /// code, as it also needs to update baling info. + /// + PM.add(createGenXGroupBalingPass(BalingKind::BK_CodeGen, &Subtarget)); + + /// .. include:: GenXNumbering.h + PM.add(createGenXNumberingPass()); + /// .. include:: GenXLiveRanges.cpp + PM.add(createGenXLiveRangesPass()); + /// .. include:: GenXUnbaling.cpp + PM.add(createGenXUnbalingPass()); + /// .. include:: GenXDepressurizer.cpp + PM.add(createGenXDepressurizerPass()); + /// .. include:: GenXNumbering.h + PM.add(createGenXNumberingPass()); + /// .. include:: GenXLiveRanges.cpp + PM.add(createGenXLiveRangesPass()); + /// .. include:: GenXCoalescing.cpp + PM.add(createGenXCoalescingPass()); + /// .. include:: GenXAddressCommoning.cpp + PM.add(createGenXAddressCommoningPass()); + /// .. include:: GenXArgIndirection.cpp + PM.add(createGenXArgIndirectionPass()); + /// .. include:: GenXTidyControlFlow.cpp + //initializeLoopInfoPass(*PassRegistry::getPassRegistry()); + PM.add(createGenXTidyControlFlowPass()); + /// .. include:: GenXVisaRegAlloc.h + auto RegAlloc = createGenXVisaRegAllocPass(); + PM.add(RegAlloc); + if (DumpRegAlloc || Subtarget.dumpRegAlloc()) + PM.add(createGenXGroupAnalysisDumperPass(RegAlloc, ".regalloc")); + + /// .. include:: GenXCisaBuilder.cpp + PM.add(createGenXCisaBuilderPass()); + PM.add(createGenXFinalizerPass(o)); + + return false; +} + +void GenXTargetMachine::adjustPassManager(PassManagerBuilder &PMBuilder) { + // Lower aggr copies. + PMBuilder.addExtension( + PassManagerBuilder::EP_EarlyAsPossible, + [](const PassManagerBuilder &Builder, PassManagerBase &PM) { + PM.add(createGenXLowerAggrCopiesPass()); + }); + + // Packetize. + auto AddPacketize = [](const PassManagerBuilder &Builder, + PassManagerBase &PM) { + PM.add(createGenXPacketizePass()); + PM.add(createAlwaysInlinerLegacyPass()); + PM.add(createGlobalDCEPass()); + PM.add(createPromoteMemoryToRegisterPass()); + PM.add(createInferAddressSpacesPass()); + PM.add(createEarlyCSEPass(true)); + PM.add(createCFGSimplificationPass()); + PM.add(createInstructionCombiningPass()); + PM.add(createDeadCodeEliminationPass()); + PM.add(createSROAPass()); + PM.add(createInferAddressSpacesPass()); + PM.add(createEarlyCSEPass(true)); + PM.add(createCFGSimplificationPass()); + PM.add(createInstructionCombiningPass()); + PM.add(createDeadCodeEliminationPass()); + }; + PMBuilder.addExtension(PassManagerBuilder::EP_ModuleOptimizerEarly, + AddPacketize); + PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, + AddPacketize); + + // vldst. 
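+  // ("vldst" refers to the CM vector load/store intrinsics used for
+  // pass-by-reference arguments; when EmitVLoadStore is set, the extension
+  // below first runs a small simplification/unroll pipeline and then
+  // CMLowerVLoadVStore to lower those intrinsics.)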
+ if (EmitVLoadStore) { + auto AddLowerLoadStore = [](const PassManagerBuilder &Builder, + PassManagerBase &PM) { + if (Builder.OptLevel > 0) { + // Inline + PM.add(createSROAPass()); + PM.add(createEarlyCSEPass()); + PM.add(createJumpThreadingPass()); + PM.add(createCFGSimplificationPass()); + PM.add(createCorrelatedValuePropagationPass()); + PM.add(createGenXReduceIntSizePass()); + PM.add(createInstructionCombiningPass()); + PM.add(createAlwaysInlinerLegacyPass()); + PM.add(createGlobalDCEPass()); + PM.add(createInstructionCombiningPass()); + // Unroll + PM.add(createCFGSimplificationPass()); + PM.add(createReassociatePass()); + PM.add(createLoopRotatePass()); + PM.add(createLICMPass()); + PM.add(createInstructionCombiningPass()); + PM.add(createIndVarSimplifyPass()); + PM.add(createLoopIdiomPass()); + PM.add(createLoopDeletionPass()); + PM.add(createSimpleLoopUnrollPass()); + PM.add(createInstructionCombiningPass()); + // Simplify region accesses. + PM.add(createGenXRegionCollapsingPass()); + PM.add(createEarlyCSEPass()); + PM.add(createDeadCodeEliminationPass()); + } + PM.add(createCMLowerVLoadVStorePass()); + }; + PMBuilder.addExtension(PassManagerBuilder::EP_ModuleOptimizerEarly, + AddLowerLoadStore); + PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, + AddLowerLoadStore); + } + + // CM implicit parameters. + auto AddCMImpParam = [this](const PassManagerBuilder &Builder, + PassManagerBase &PM) { + PM.add(createCMImpParamPass(!Subtarget.isOCLRuntime())); + }; + PMBuilder.addExtension(PassManagerBuilder::EP_ModuleOptimizerEarly, + AddCMImpParam); + PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, + AddCMImpParam); + + // CM ABI. + auto AddCMABI = [](const PassManagerBuilder &Builder, PassManagerBase &PM) { + PM.add(createIPSCCPPass()); + PM.add(createCMABIPass()); + }; + PMBuilder.addExtension(PassManagerBuilder::EP_ModuleOptimizerEarly, AddCMABI); + PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, AddCMABI); + + // CM kernel argument offset. 
+ auto AddCMKernelArgOffset = [this](const PassManagerBuilder &Builder, + PassManagerBase &PM) { + unsigned Width = 32; + PM.add(createCMKernelArgOffsetPass(Width, Subtarget.isOCLRuntime())); + }; + PMBuilder.addExtension(PassManagerBuilder::EP_ModuleOptimizerEarly, + AddCMKernelArgOffset); + PMBuilder.addExtension(PassManagerBuilder::EP_EnabledOnOptLevel0, + AddCMKernelArgOffset); + + auto AddGenXPeephole = [](const PassManagerBuilder &Builder, + PassManagerBase &PM) { + PM.add(createGenXSimplifyPass()); + }; + PMBuilder.addExtension(PassManagerBuilder::EP_Peephole, AddGenXPeephole); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.h new file mode 100644 index 000000000000..da752c93f61e --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXTargetMachine.h @@ -0,0 +1,183 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file declares the TargetMachine that is used by the GenX backend. +// +// Unlike a normal CPU backend, the GenX backend does not use CodeGen (the +// LLVM target independent code generator). 
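+// Instead, addPassesToEmitFile() runs an IR-level pass pipeline that ends in
+// GenXCisaBuilder, emitting vISA which the finalizer then turns into Gen ISA,
+// so there are no MachineFunction-level passes here.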
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef GENXTARGETMACHINE_H
+#define GENXTARGETMACHINE_H
+
+#include "llvmWrapper/Target/TargetMachine.h"
+
+#include "GenXIntrinsics.h"
+#include "GenXSubtarget.h"
+#include "TargetInfo/GenXTargetInfo.h"
+
+#include "llvm/IR/DataLayout.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+
+namespace llvm {
+
+class raw_pwrite_stream;
+class MachineModuleInfo;
+
+class GenXTargetMachine : public IGCLLVM::TargetMachine {
+  bool Is64Bit;
+  GenXSubtarget Subtarget;
+
+public:
+  GenXTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
+                    StringRef FS, const TargetOptions &Options,
+                    Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                    CodeGenOpt::Level OL, bool Is64Bit);
+
+  ~GenXTargetMachine() override;
+
+  bool addPassesToEmitFile(PassManagerBase &PM, raw_pwrite_stream &o,
+                           raw_pwrite_stream *pi, CodeGenFileType FileType,
+                           bool /*DisableVerify*/ = true,
+                           MachineModuleInfo *MMI = nullptr) override;
+
+  void adjustPassManager(PassManagerBuilder &PMBuilder) override;
+
+  virtual const DataLayout *getDataLayout() const { return &DL; }
+
+  virtual const TargetSubtargetInfo *getSubtargetImpl(const Function &) const override {
+    return &Subtarget;
+  }
+  TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+};
+
+class GenXTargetMachine32 : public GenXTargetMachine {
+  virtual void anchor();
+
+public:
+  GenXTargetMachine32(const Target &T, const Triple &TT, StringRef CPU,
+                      StringRef FS, const TargetOptions &Options,
+                      Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                      CodeGenOpt::Level OL, bool JIT);
+};
+
+class GenXTargetMachine64 : public GenXTargetMachine {
+  virtual void anchor();
+
+public:
+  GenXTargetMachine64(const Target &T, const Triple &TT, StringRef CPU,
+                      StringRef FS, const TargetOptions &Options,
+                      Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+                      CodeGenOpt::Level OL, bool JIT);
+};
+
+// This implementation allows us to define our own costs for the GenX backend.
+// BasicTTIImplBase is not used because its overloaded constructors take a
+// TargetMachine argument, so we inherit from its parent, which only needs the
+// DataLayout.
+class GenXTTIImpl : public TargetTransformInfoImplCRTPBase<GenXTTIImpl>
+{
+  typedef TargetTransformInfoImplCRTPBase<GenXTTIImpl> BaseT;
+  typedef TargetTransformInfo TTI;
+  friend BaseT;
+public:
+  GenXTTIImpl(const DataLayout& DL) : BaseT(DL) {}
+
+  bool shouldBuildLookupTables() { return false; }
+  unsigned getFlatAddressSpace() { return 4; }
+
+  int getUserCost(const User *U, ArrayRef<const Value *> Operands) {
+    if (auto EV = dyn_cast<ExtractValueInst>(U)) {
+      switch (GenXIntrinsic::getGenXIntrinsicID(EV->getOperand(0))) {
+      case GenXIntrinsic::genx_simdcf_goto:
+      case GenXIntrinsic::genx_simdcf_join:
+        // Do not allow such EVs to be TCC_Free.
+        return TTI::TCC_Basic;
+      default:
+        break;
+      }
+    }
+
+    return BaseT::getUserCost(U, Operands);
+  }
+
+  bool isProfitableToHoist(Instruction *I) const {
+    // genx_vload and genx_vstore are related to g_store bales and they
+    // shouldn't be hoisted from then/else blocks in front of the branch.
+    auto IntrinsicID = GenXIntrinsic::getGenXIntrinsicID(I);
+    return IntrinsicID != GenXIntrinsic::genx_vload &&
+           IntrinsicID != GenXIntrinsic::genx_vstore;
+  }
+};
+
+/// Initialize all GenX passes for opt tool.
+void initializeGenXPasses(PassRegistry &); + +void initializeFunctionGroupAnalysisPass(PassRegistry &); +void initializeGenXAddressCommoningPass(PassRegistry &); +void initializeGenXArgIndirectionPass(PassRegistry &); +void initializeGenXCategoryPass(PassRegistry &); +void initializeGenXCFSimplificationPass(PassRegistry &); +void initializeGenXCisaBuilderPass(PassRegistry &); +void initializeGenXCoalescingPass(PassRegistry &); +void initializeGenXDeadVectorRemovalPass(PassRegistry &); +void initializeGenXDepressurizerPass(PassRegistry &); +void initializeGenXEarlySimdCFConformancePass(PassRegistry &); +void initializeGenXEmulatePass(PassRegistry &); +void initializeGenXExtractVectorizerPass(PassRegistry &); +void initializeGenXFuncBalingPass(PassRegistry &); +void initializeGenXGEPLoweringPass(PassRegistry &); +void initializeGenXGroupBalingPass(PassRegistry &); +void initializeGenXInstCombineCleanup(PassRegistry &); +void initializeGenXIMadPostLegalizationPass(PassRegistry &); +void initializeGenXLateSimdCFConformancePass(PassRegistry &); +void initializeGenXLayoutBlocksPass(PassRegistry &); +void initializeGenXLegalizationPass(PassRegistry &); +void initializeGenXLiveRangesPass(PassRegistry &); +void initializeGenXLivenessPass(PassRegistry &); +void initializeGenXLowerAggrCopiesPass(PassRegistry &); +void initializeGenXLoweringPass(PassRegistry &); +void initializeGenXModulePass(PassRegistry &); +void initializeGenXNumberingPass(PassRegistry &); +void initializeGenXPatternMatchPass(PassRegistry &); +void initializeGenXPostLegalizationPass(PassRegistry &); +void initializeGenXPostLegalizationPass(PassRegistry &); +void initializeGenXPromotePredicatePass(PassRegistry &); +void initializeGenXRawSendRipperPass(PassRegistry &); +void initializeGenXReduceIntSizePass(PassRegistry &); +void initializeGenXRegionCollapsingPass(PassRegistry &); +void initializeGenXRematerializationPass(PassRegistry &); +void initializeGenXSubtargetPassPass(PassRegistry &); +void initializeGenXThreadPrivateMemoryPass(PassRegistry &); +void initializeGenXUnbalingPass(PassRegistry &); +void initializeGenXVisaRegAllocPass(PassRegistry &); +void initializeTransformPrivMemPass(PassRegistry &); +void initializeGenXFunctionPointersLoweringPass(PassRegistry &); +} // End llvm namespace + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXThreadPrivateMemory.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXThreadPrivateMemory.cpp new file mode 100644 index 000000000000..4483645f0fdc --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXThreadPrivateMemory.cpp @@ -0,0 +1,1023 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// This pass lowers alloca instructions to genx.alloca intrinsics and changes +/// pointer from alloca to offset in predefined stack surface +// +//===----------------------------------------------------------------------===// + +#include "GenX.h" +#include "GenXModule.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "GenXVisa.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvmWrapper/IR/InstrTypes.h" +#include +#include + +using namespace llvm; +using namespace genx; + +namespace { + +// This actually should've been a FunctionGroupPass, +// but due to the FGPassManager hack we can't run GenXModule twice +// so for now we can't insert module pass that invalidate FGA betw FGPasses +class GenXThreadPrivateMemory : public ModulePass, + public InstVisitor { +public: + GenXThreadPrivateMemory(); + + virtual StringRef getPassName() const override { + return "GenXThreadPrivateMemory"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + ModulePass::getAnalysisUsage(AU); + AU.setPreservesCFG(); + } + + bool runOnModule(Module &M) override; + bool runOnFunction(Function &F); + + void visitAllocaInst(AllocaInst &I); + +private: + bool replacePhi(PHINode *Phi); + bool preparePhiForReplacement(PHINode *Phi); + bool replaceScatterPrivate(CallInst *CI); + bool replaceGatherPrivate(CallInst *CI); + bool replacePTI(PtrToIntInst *PTI); + bool replaceStore(StoreInst *StI); + bool replaceLoad(LoadInst *LdI); + bool replaceSelect(SelectInst *Sel); + bool replaceAddrSpaceCast(AddrSpaceCastInst * AddrCast); + Value *lookForPtrReplacement(Value *Ptr) const; + void addUsers(Instruction *I); + void collectEachAllocaUsers(); + void addUsersIfNeeded(Instruction *I); + std::pair NormalizeVector(Value *From, Type *To, + Instruction *InsertBefore); + Instruction *RestoreVectorAfterNormalization(Instruction *From, Type *To); + +public: + static char ID; + +private: + LLVMContext *m_ctx; + GenXSubtarget *m_ST; + const DataLayout *m_DL; + std::vector m_alloca; + std::vector m_gather; + std::vector m_scatter; + std::map m_allocaToIntrinsic; + std::queue m_AIUsers; + std::set m_AlreadyAdded; + PreDefined_Surface m_stack; +}; +} // namespace + +// Register pass to igc-opt +namespace llvm { +void initializeGenXThreadPrivateMemoryPass(PassRegistry &); +} + +INITIALIZE_PASS_BEGIN(GenXThreadPrivateMemory, "GenXThreadPrivateMemory", + "GenXThreadPrivateMemory", false, false) +INITIALIZE_PASS_END(GenXThreadPrivateMemory, "GenXThreadPrivateMemory", + "GenXThreadPrivateMemory", false, false) + +char GenXThreadPrivateMemory::ID = 0; + +ModulePass *llvm::createGenXThreadPrivateMemoryPass() { + return new GenXThreadPrivateMemory; +} + +GenXThreadPrivateMemory::GenXThreadPrivateMemory() : ModulePass(ID) { + initializeGenXThreadPrivateMemoryPass(*PassRegistry::getPassRegistry()); +} + +static Value *ZExtOrTruncIfNeeded(Value *From, Type *To, + Instruction *InsertBefore) { + unsigned FromTySz = From->getType()->getPrimitiveSizeInBits(); + 
unsigned ToTySz = To->getPrimitiveSizeInBits(); + Value *Res = From; + if (From->getType()->isVectorTy() && + From->getType()->getVectorNumElements() == 1) { + Res = CastInst::CreateBitOrPointerCast( + Res, From->getType()->getVectorElementType(), "", InsertBefore); + } + if (FromTySz < ToTySz) + Res = CastInst::CreateZExtOrBitCast(Res, To, "", InsertBefore); + else if (FromTySz > ToTySz) + Res = CastInst::CreateTruncOrBitCast(Res, To, "", InsertBefore); + return Res; +} + +// If data is a vector of double/int64, bitcast each element to 2 int32. +// If data is a vector of type < 32bit, extend each element in order to create +// proper send instruction in the finalizer. +std::pair +GenXThreadPrivateMemory::NormalizeVector(Value *From, Type *To, + Instruction *InsertBefore) { + Type *I32Ty = Type::getInt32Ty(InsertBefore->getContext()); + Value *Res = From; + Type *FromTy = From->getType(); + assert(isa(FromTy)); + unsigned NumElts = FromTy->getVectorNumElements(); + unsigned EltSz = + m_DL->getTypeSizeInBits(FromTy->getScalarType()) / genx::ByteBits; + assert(EltSz > 0); + if (To->getScalarType()->isPointerTy() && + To->getScalarType()->getPointerElementType()->isFunctionTy()) { + Type *I64Ty = Type::getInt64Ty(InsertBefore->getContext()); + To = VectorType::get(I64Ty, NumElts); + Res = CastInst::Create(Instruction::PtrToInt, From, To, "", InsertBefore); + NumElts *= 2; + To = VectorType::get(I32Ty, NumElts); + EltSz = I32Ty->getPrimitiveSizeInBits() / genx::ByteBits; + Res = CastInst::Create(Instruction::BitCast, Res, To, "", InsertBefore); + } else if (To->getVectorElementType()->getPrimitiveSizeInBits() < + genx::DWordBits) { + To = VectorType::get(I32Ty, NumElts); + + Res = CastInst::Create(Instruction::ZExt, From, To, "", InsertBefore); + } else if (To->getVectorElementType()->getPrimitiveSizeInBits() == + genx::QWordBits) { + NumElts *= 2; + EltSz = I32Ty->getPrimitiveSizeInBits() / genx::ByteBits; + To = VectorType::get(I32Ty, NumElts); + + Res = CastInst::Create(Instruction::BitCast, From, To, "", InsertBefore); + } + + return std::make_pair(Res, EltSz); +} + +Instruction * +GenXThreadPrivateMemory::RestoreVectorAfterNormalization(Instruction *From, + Type *To) { + Instruction *Restored = From; + unsigned EltSz = m_DL->getTypeSizeInBits(To->getScalarType()); + assert(EltSz > 0); + if (To->getScalarType()->isPointerTy() && + To->getScalarType()->getPointerElementType()->isFunctionTy()) { + Restored = PtrToIntInst::Create(Instruction::IntToPtr, From, To); + } else if (EltSz < genx::DWordBits) { + Restored = CastInst::Create(Instruction::Trunc, From, To, ""); + } else if (EltSz == genx::QWordBits) { + auto *NewFrom = From; + if (!From->getType()->getScalarType()->isPointerTy() && + To->getScalarType()->isPointerTy()) { + assert(From->getType()->getScalarType()->isIntegerTy(genx::DWordBits)); + Type *NewTy = + VectorType::get(Type::getInt64Ty(*m_ctx), + From->getType()->getVectorNumElements() / 2); + NewFrom = CastInst::CreateBitOrPointerCast(From, NewTy); + NewFrom->insertAfter(From); + Restored = CastInst::Create(CastInst::IntToPtr, NewFrom, To); + } else + Restored = CastInst::CreateBitOrPointerCast(NewFrom, To); + } + if (Restored != From) + Restored->insertAfter(From); + return Restored; +} + +static Value *DoubleVector(Value *OrigVector, unsigned ShiftVal, + Instruction *InsertPoint) { + IRBuilder<> Builder(InsertPoint); + Type *I32Ty = Type::getInt32Ty(InsertPoint->getContext()); + unsigned NumElts = OrigVector->getType()->getVectorNumElements() * 2; + Type *OrigVectorEltTy = 
OrigVector->getType()->getVectorElementType(); + Value *NewElts = UndefValue::get(VectorType::get(OrigVectorEltTy, NumElts)); + for (unsigned CurEltNum = 0; CurEltNum * 2 < NumElts; ++CurEltNum) { + Value *OldIdx = ConstantInt::get(I32Ty, CurEltNum); + Value *NewIdx = ConstantInt::get(I32Ty, CurEltNum * 2); + Value *EltOld = Builder.CreateExtractElement(OrigVector, OldIdx); + NewElts = Builder.CreateInsertElement(NewElts, EltOld, NewIdx); + NewIdx = ConstantInt::get(I32Ty, CurEltNum * 2 + 1); + if (ShiftVal) { + Value *TyShift = ConstantInt::get(I32Ty, ShiftVal); + EltOld = Builder.CreateAdd(EltOld, TyShift); + } + NewElts = Builder.CreateInsertElement(NewElts, EltOld, NewIdx); + } + + return NewElts; +} + +static Value *FormEltsOffsetVector(unsigned NumElts, unsigned TySz, + Instruction *InsertBefore) { + IRBuilder<> Builder(InsertBefore); + Type *I32Ty = Type::getInt32Ty(InsertBefore->getContext()); + Value *EltsOffset = UndefValue::get(VectorType::get(I32Ty, NumElts)); + for (unsigned CurElt = 0; CurElt < NumElts; ++CurElt) { + Value *Idx = ConstantInt::get(I32Ty, CurElt); + Value *EltOffset = ConstantInt::get(I32Ty, CurElt * TySz); + EltsOffset = Builder.CreateInsertElement(EltsOffset, EltOffset, Idx); + } + + return EltsOffset; +} + +static Value *FormEltsOffsetVectorForSVM(unsigned NumElts, + Instruction *InsertBefore, Value *Offset) { + IRBuilder<> Builder(InsertBefore); + Type *I32Ty = Type::getInt32Ty(InsertBefore->getContext()); + Type *I64Ty = Type::getInt64Ty(InsertBefore->getContext()); + Value *EltsOffset = UndefValue::get(VectorType::get(I64Ty, NumElts)); + if (Offset->getType()->isVectorTy()) { + assert(Offset->getType()->getVectorNumElements() == 1); + Offset = CastInst::CreateZExtOrBitCast(Offset, I64Ty, "", InsertBefore); + } + for (unsigned CurElt = 0; CurElt < NumElts; ++CurElt) { + Value *Idx = ConstantInt::get(I32Ty, CurElt); + EltsOffset = Builder.CreateInsertElement(EltsOffset, Offset, Idx); + } + + return EltsOffset; +} + + +Value *GenXThreadPrivateMemory::lookForPtrReplacement(Value *Ptr) const { + assert(Ptr->getType()->isPtrOrPtrVectorTy()); + + if (auto BC = dyn_cast(Ptr)) + return lookForPtrReplacement(BC->getOperand(0)); + else if (auto ITP = dyn_cast(Ptr)) + return ITP->getOperand(0); + else if (auto AI = dyn_cast(Ptr)) { + auto AllocaIntr = m_allocaToIntrinsic.find(AI); + assert(AllocaIntr != m_allocaToIntrinsic.end() && + "Each alloca must be here"); + return AllocaIntr->second; + } else if (auto *EEI = dyn_cast(Ptr)) { + // support a case when load/gather addr goes from svm.ld + extract_elem + auto *CI = dyn_cast(EEI->getVectorOperand()); + if (CI && !CI->isIndirectCall() && + GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_svm_block_ld) { + if (Ptr->getType()->isPointerTy()) { + auto *Cast = CastInst::Create(Instruction::PtrToInt, Ptr, + Type::getInt32Ty(*m_ctx)); + Cast->insertAfter(EEI); + return Cast; + } else + return Ptr; + } else + report_fatal_error("Cannot find pointer replacement for extractelem"); + } else if (isa(Ptr)) + return ConstantInt::get(Type::getInt32Ty(*m_ctx), 0); + else + report_fatal_error("Cannot find pointer replacement"); +} + +bool GenXThreadPrivateMemory::replaceAddrSpaceCast( + AddrSpaceCastInst* AddrCast) { + auto NewAlloca = lookForPtrReplacement(AddrCast->getPointerOperand()); + + auto IntToPtr = IntToPtrInst::Create( + llvm::Instruction::CastOps::IntToPtr, NewAlloca, + AddrCast->getPointerOperand()->getType(), "", AddrCast); + auto NewAddrCast = + 
AddrSpaceCastInst::Create(llvm::Instruction::CastOps::AddrSpaceCast, + IntToPtr, AddrCast->getType(), "", AddrCast); + + AddrCast->replaceAllUsesWith(NewAddrCast); + AddrCast->eraseFromParent(); + + return true; +} + +bool GenXThreadPrivateMemory::replaceLoad(LoadInst *LdI) { + IRBuilder<> Builder(LdI); + Type *LdTy = LdI->getType(); + Type *LdEltTy = LdTy; + if (isa(LdEltTy)) + LdEltTy = LdEltTy->getVectorElementType(); + else + LdTy = VectorType::get(LdTy, 1); + + unsigned NumEltsToLoad = LdTy->getVectorNumElements(); + unsigned LdEltTySz = m_DL->getTypeSizeInBits(LdEltTy); + if (LdEltTySz == genx::QWordBits) + NumEltsToLoad *= 2; + + Value *PredVal = ConstantInt::get(Type::getInt1Ty(*m_ctx), 1); + Value *Pred = Builder.CreateVectorSplat(NumEltsToLoad, PredVal); + + Type *I32Ty = Type::getInt32Ty(*m_ctx); + Type *I64Ty = Type::getInt64Ty(*m_ctx); + Type *TyToLoad = I32Ty; + if (LdEltTy->isFloatTy()) + TyToLoad = LdEltTy; + Type *RealTyToLoad = LdEltTy; + if (m_DL->getTypeSizeInBits(RealTyToLoad) == genx::QWordBits) + RealTyToLoad = I32Ty; + unsigned RealTyToLoadSz = + m_DL->getTypeSizeInBits(RealTyToLoad) / genx::ByteBits; + Value *OldValOfTheDataRead = + Builder.CreateVectorSplat(NumEltsToLoad, UndefValue::get(TyToLoad)); + + + Value *PointerOp = LdI->getPointerOperand(); + Value *Offset = lookForPtrReplacement(PointerOp); + Offset = + ZExtOrTruncIfNeeded(Offset, m_ST->useGlobalMem() ? I64Ty : I32Ty, LdI); + auto IID = m_ST->useGlobalMem() + ? llvm::GenXIntrinsic::genx_svm_gather + : llvm::GenXIntrinsic::genx_gather_scaled; + + Value *EltsOffset = FormEltsOffsetVector(NumEltsToLoad, RealTyToLoadSz, LdI); + + unsigned SrcSize = genx::log2(RealTyToLoadSz); + Value *logNumBlocks = ConstantInt::get(I32Ty, m_ST->useGlobalMem() ? 0 : SrcSize); + Value *Scale = ConstantInt::get(Type::getInt16Ty(*m_ctx), 0); + Value *Surface = ConstantInt::get(I32Ty, + visa::getReservedSurfaceIndex(m_stack)); + if (m_ST->useGlobalMem() && NumEltsToLoad > 1) { + assert(Offset->getType()->getScalarType()->isIntegerTy(64)); + auto *BaseOff = FormEltsOffsetVectorForSVM(NumEltsToLoad, LdI, Offset); + auto *ZextOff = CastInst::CreateZExtOrBitCast( + EltsOffset, + VectorType::get(I64Ty, EltsOffset->getType()->getVectorNumElements()), + "", LdI); + Offset = BinaryOperator::CreateAdd(BaseOff, ZextOff, "", LdI); + } + Function *F = GenXIntrinsic::getGenXDeclaration( + LdI->getModule(), IID, + {OldValOfTheDataRead->getType(), + Pred->getType(), + (m_ST->useGlobalMem() ? Offset : EltsOffset)->getType()}); + CallInst *Gather = + m_ST->useGlobalMem() + ? 
IntrinsicInst::Create( + F, {Pred, logNumBlocks, Offset, OldValOfTheDataRead}) + : IntrinsicInst::Create(F, {Pred, logNumBlocks, Scale, Surface, + Offset, EltsOffset, OldValOfTheDataRead}); + Gather->insertAfter(LdI); + m_gather.push_back(Gather); + Instruction *ProperGather = RestoreVectorAfterNormalization(Gather, LdTy); + + if (!isa(LdI->getType()) && + isa(ProperGather->getType())) { + Instruction *LdVal = CastInst::CreateBitOrPointerCast(ProperGather, LdI->getType()); + LdVal->insertAfter(ProperGather); + ProperGather = LdVal; + } + + LdI->replaceAllUsesWith(ProperGather); + LdI->eraseFromParent(); + + return true; +} + +bool GenXThreadPrivateMemory::replaceStore(StoreInst *StI) { + IRBuilder<> Builder(StI); + Value *ValueOp = StI->getValueOperand(); + Type *ValueOpTy = ValueOp->getType(); + if (ValueOpTy->isIntOrPtrTy() || ValueOpTy->isFloatingPointTy()) { + ValueOp = Builder.CreateVectorSplat(1, ValueOp); + ValueOpTy = ValueOp->getType(); + } + assert(ValueOp->getType()->isVectorTy()); + + unsigned ValueEltSz = 0; + std::tie(ValueOp, ValueEltSz) = NormalizeVector(ValueOp, ValueOpTy, StI); + unsigned ValueNumElts = ValueOp->getType()->getVectorNumElements(); + + Value *PointerOp = StI->getPointerOperand(); + Value *Offset = lookForPtrReplacement(PointerOp); + Type *I32Ty = Type::getInt32Ty(*m_ctx); + Type *I64Ty = Type::getInt64Ty(*m_ctx); + Offset = + ZExtOrTruncIfNeeded(Offset, m_ST->useGlobalMem() ? I64Ty : I32Ty, StI); + + auto IID = m_ST->useGlobalMem() + ? llvm::GenXIntrinsic::genx_svm_scatter + : llvm::GenXIntrinsic::genx_scatter_scaled; + + Value *PredVal = ConstantInt::get(Type::getInt1Ty(*m_ctx), 1); + Value *Pred = Builder.CreateVectorSplat(ValueNumElts, PredVal); + Value *EltsOffset = FormEltsOffsetVector(ValueNumElts, ValueEltSz, StI); + + if (m_ST->useGlobalMem() && ValueNumElts > 1) { + assert(Offset->getType()->getScalarType()->isIntegerTy(64)); + auto *BaseOff = FormEltsOffsetVectorForSVM(ValueNumElts, StI, Offset); + auto *ZextOff = CastInst::CreateZExtOrBitCast( + EltsOffset, + VectorType::get(I64Ty, EltsOffset->getType()->getVectorNumElements()), + "", StI); + Offset = BinaryOperator::CreateAdd(BaseOff, ZextOff, "", StI); + } + + Function *F = GenXIntrinsic::getGenXDeclaration( + StI->getModule(), IID, + {Pred->getType(), + (m_ST->useGlobalMem() ? Offset : EltsOffset)->getType(), + ValueOp->getType()}); + Value *logNumBlocks = ConstantInt::get(I32Ty, m_ST->useGlobalMem() ? 0 : genx::log2(ValueEltSz)); + Value *Scale = ConstantInt::get(Type::getInt16Ty(*m_ctx), 0); + Value *Surface = ConstantInt::get(I32Ty, + visa::getReservedSurfaceIndex(m_stack)); + auto *Scatter = + m_ST->useGlobalMem() + ? IntrinsicInst::Create(F, {Pred, logNumBlocks, Offset, ValueOp}) + : IntrinsicInst::Create(F, {Pred, logNumBlocks, Scale, Surface, + Offset, EltsOffset, ValueOp}); + Scatter->insertAfter(StI); + StI->eraseFromParent(); + + m_scatter.push_back(Scatter); + + return true; +} + +bool GenXThreadPrivateMemory::replacePTI(PtrToIntInst *PTI) { + Value *PointerOp = PTI->getPointerOperand(); + Value *Offset = lookForPtrReplacement(PointerOp); + + Offset = ZExtOrTruncIfNeeded(Offset, PTI->getDestTy(), PTI); + PTI->replaceAllUsesWith(Offset); + PTI->eraseFromParent(); + + return true; +} + +bool GenXThreadPrivateMemory::replaceGatherPrivate(CallInst *CI) { + auto IID = m_ST->useGlobalMem() + ? 
llvm::GenXIntrinsic::genx_svm_gather + : llvm::GenXIntrinsic::genx_gather_scaled; + + Type *OrigDstTy = CI->getType(); + assert(isa(OrigDstTy)); + Type *NewDstTy = OrigDstTy; + Value *OldValue = CI->getArgOperand(3); + unsigned ValueEltSz = 0; + + // Check gather.private invariant. + assert(NewDstTy == OldValue->getType()); + + // Cast data type to legal. + std::tie(OldValue, ValueEltSz) = NormalizeVector(OldValue, NewDstTy, CI); + NewDstTy = OldValue->getType(); + unsigned ValueNumElts = NewDstTy->getVectorNumElements(); + + Value *Pred = CI->getArgOperand(0); + Value *EltsOffset = CI->getArgOperand(2); + if (OrigDstTy->getVectorElementType()->getPrimitiveSizeInBits() == + genx::QWordBits) { + assert(ValueNumElts == EltsOffset->getType()->getVectorNumElements() * 2); + EltsOffset = DoubleVector(EltsOffset, ValueEltSz, CI); + Pred = DoubleVector(Pred, 0, CI); + } + + Type *I32Ty = Type::getInt32Ty(*m_ctx); + Value *PointerOp = CI->getOperand(1); + Value *Offset = lookForPtrReplacement(PointerOp); + Offset = ZExtOrTruncIfNeeded(Offset, I32Ty, CI); + + Function *F = GenXIntrinsic::getGenXDeclaration( + CI->getModule(), IID, + {NewDstTy, Pred->getType(), + (m_ST->useGlobalMem() ? Offset : EltsOffset)->getType()}); + + Value *logNumBlocks = ConstantInt::get(I32Ty, genx::log2(ValueEltSz)); + Value *Scale = ConstantInt::get(Type::getInt16Ty(*m_ctx), 0); + Value *Surface = ConstantInt::get(I32Ty, + visa::getReservedSurfaceIndex(m_stack)); + + CallInst *Gather = + m_ST->useGlobalMem() + ? IntrinsicInst::Create(F, {Pred, logNumBlocks, Offset, OldValue}) + : IntrinsicInst::Create(F, {Pred, logNumBlocks, Scale, Surface, + Offset, EltsOffset, OldValue}); + Gather->insertAfter(CI); + m_gather.push_back(Gather); + + Instruction *ProperGather = + RestoreVectorAfterNormalization(Gather, OrigDstTy); + CI->replaceAllUsesWith(ProperGather); + CI->eraseFromParent(); + + return true; +} + +bool GenXThreadPrivateMemory::replaceScatterPrivate(CallInst *CI) { + auto IID = m_ST->useGlobalMem() + ? llvm::GenXIntrinsic::genx_svm_scatter + : llvm::GenXIntrinsic::genx_scatter_scaled; + Value *ValueOp = CI->getArgOperand(3); + Type *OrigValueTy = ValueOp->getType(); + assert(isa(OrigValueTy)); + unsigned EltSz = 0; + std::tie(ValueOp, EltSz) = NormalizeVector(ValueOp, ValueOp->getType(), CI); + + Value *Pred = CI->getArgOperand(0); + Value *EltsOffset = CI->getArgOperand(2); + if (OrigValueTy->getVectorElementType()->getPrimitiveSizeInBits() == + genx::QWordBits) { + EltsOffset = DoubleVector(EltsOffset, EltSz, CI); + Pred = DoubleVector(Pred, 0, CI); + } + + Value *ScatterPtr = CI->getArgOperand(1); + Type *I32Ty = Type::getInt32Ty(*m_ctx); + Value *Offset = lookForPtrReplacement(ScatterPtr); + Offset = ZExtOrTruncIfNeeded(Offset, I32Ty, CI); + + Function *F = GenXIntrinsic::getGenXDeclaration( + CI->getModule(), IID, + {Pred->getType(), (m_ST->useGlobalMem() ? Offset : EltsOffset)->getType(), + ValueOp->getType()}); + + unsigned logNumBlocks = genx::log2(EltSz); + unsigned Scale = 0; // scale is always 0 + Value *Surface = ConstantInt::get(I32Ty, + visa::getReservedSurfaceIndex(m_stack)); + CallInst *ScatterStScaled = + m_ST->useGlobalMem() + ? 
IntrinsicInst::Create( + F, + {Pred, ConstantInt::get(I32Ty, logNumBlocks), Offset, ValueOp}) + : IntrinsicInst::Create( + F, {Pred, ConstantInt::get(I32Ty, logNumBlocks), + ConstantInt::get(Type::getInt16Ty(*m_ctx), Scale), Surface, + Offset, EltsOffset, ValueOp}); + ScatterStScaled->insertAfter(CI); + m_scatter.push_back(ScatterStScaled); + CI->replaceAllUsesWith(ScatterStScaled); + CI->eraseFromParent(); + + return true; +} + +bool GenXThreadPrivateMemory::replacePhi(PHINode *Phi) { + SmallVector PhiOps; + for (auto &IncVal : Phi->incoming_values()) + PhiOps.push_back(lookForPtrReplacement(static_cast(IncVal.get()))); + + assert(!PhiOps.empty()); + + Type *OffsetTy = PhiOps[0]->getType(); + auto TypeChecker = [OffsetTy](Value *V) { return OffsetTy == V->getType(); }; + assert(std::all_of(PhiOps.begin(), PhiOps.end(), TypeChecker)); + + PHINode *NewPhi = PHINode::Create(OffsetTy, PhiOps.size()); + for (unsigned i = 0; i < PhiOps.size(); ++i) + NewPhi->addIncoming(PhiOps[i], Phi->getIncomingBlock(i)); + + NewPhi->insertAfter(Phi); + + // Create temporary cast instruction to satisfy old phi users. Types must be + // different due to replacement pointer by integer offset. + assert(NewPhi->getType() != Phi->getType()); + CastInst *TempCast = CastInst::CreateBitOrPointerCast(NewPhi, Phi->getType()); + TempCast->insertAfter(NewPhi->getParent()->getFirstNonPHI()); + + Phi->replaceAllUsesWith(TempCast); + Phi->eraseFromParent(); + + return true; +} + +// |--%1 = PHI(%2, ...) +// | ^ +// | | +// | | +// | %2 = PHI(%1, ...) +// |---------^ +// +// In this situation, it's difficult to find the origin of the pointer. PtrToInt +// and IntToPtr break the process of searching (see lookForPtrReplacement) and +// it helps to 'emulate' phi in TPM +bool GenXThreadPrivateMemory::preparePhiForReplacement(PHINode *Phi) { + if (!isa(Phi->getType())) + return false; + + Type *I64Ty = Type::getInt64Ty(Phi->getContext()); + StringRef Name = Phi->getName(); + Instruction *TempPtrToInt = CastInst::Create( + Instruction::PtrToInt, Phi, I64Ty, Name + ".tpm.temp.pti", + Phi->getParent()->getFirstNonPHI()); + Instruction *TempIntToPtr = + CastInst::Create(Instruction::IntToPtr, TempPtrToInt, Phi->getType(), + Name + ".tpm.temp.itp"); + TempIntToPtr->insertAfter(TempPtrToInt); + Phi->replaceAllUsesWith(TempIntToPtr); + + // Replacement here was incorrect + TempPtrToInt->replaceUsesOfWith(TempIntToPtr, Phi); + + return true; +} + +bool GenXThreadPrivateMemory::replaceSelect(SelectInst *Sel) { + Value *Cond = Sel->getCondition(); + Value *TrueValue = lookForPtrReplacement(Sel->getTrueValue()); + Value *FalseValue = lookForPtrReplacement(Sel->getFalseValue()); + + SelectInst *NewSel = SelectInst::Create(Cond, TrueValue, FalseValue); + NewSel->insertAfter(Sel); + NewSel->setDebugLoc(Sel->getDebugLoc()); + + CastInst *TempCast = CastInst::CreateBitOrPointerCast(NewSel, Sel->getType()); + TempCast->insertAfter(NewSel); + TempCast->setDebugLoc(Sel->getDebugLoc()); + + Sel->replaceAllUsesWith(TempCast); + Sel->eraseFromParent(); + + return true; +} + +static Value *GetUndefVec(Type *Ty, unsigned NumElts) { + return UndefValue::get(VectorType::get(Ty, NumElts)); +} + +static std::pair GetUndefPair(Type *Ty, unsigned NumElts) { + return std::make_pair(GetUndefVec(Ty, NumElts), GetUndefVec(Ty, NumElts)); +} + +static Value *FillVecWithSeqVals(Value *Vec, unsigned Start, + Instruction *InsertBefore) { + IRBuilder<> Builder(InsertBefore); + Builder.SetInsertPoint(InsertBefore); + + Type *I32Ty = 
Type::getInt32Ty(InsertBefore->getContext()); + unsigned NumElts = Vec->getType()->getVectorNumElements(); + for (unsigned i = 0; i < NumElts; ++i) { + Value *Idx = ConstantInt::get(I32Ty, i); + Value *Val = ConstantInt::get(I32Ty, i + Start); + Vec = Builder.CreateInsertElement(Vec, Val, Idx); + } + return Vec; +} + +static std::pair +SplitVec(Value *Vec, unsigned NumElts, Instruction *InsertBefore, + std::pair Splitters) { + IRBuilder<> Builder(InsertBefore); + Builder.SetInsertPoint(InsertBefore); + + Type *EltTy = Vec->getType()->getVectorElementType(); + Value *First = Builder.CreateShuffleVector(Vec, GetUndefVec(EltTy, NumElts), + Splitters.first); + Value *Second = Builder.CreateShuffleVector(Vec, GetUndefVec(EltTy, NumElts), + Splitters.second); + return std::make_pair(First, Second); +} + +void SplitScatter(CallInst *CI) { + assert(GenXIntrinsic::getAnyIntrinsicID(CI) == + llvm::GenXIntrinsic::genx_scatter_scaled); + Type *DataTy = CI->getArgOperand(5)->getType(); + unsigned NumElts = DataTy->getVectorNumElements(); + assert(NumElts % 2 == 0); + + Type *I32Ty = Type::getInt32Ty(CI->getContext()); + std::pair Splitters = GetUndefPair(I32Ty, NumElts / 2); + Splitters.first = FillVecWithSeqVals(Splitters.first, 0, CI); + Splitters.second = FillVecWithSeqVals(Splitters.second, NumElts / 2, CI); + + Value *Pred = CI->getArgOperand(0); + std::pair NewPreds = SplitVec(Pred, NumElts, CI, Splitters); + + Value *EltOffsets = CI->getArgOperand(5); + std::pair NewEltOffsets = + SplitVec(EltOffsets, NumElts, CI, Splitters); + + Value *OldVal = CI->getArgOperand(6); + std::pair OldVals = + SplitVec(OldVal, NumElts, CI, Splitters); + + auto IID = llvm::GenXIntrinsic::genx_scatter_scaled; + Function *F = GenXIntrinsic::getGenXDeclaration(CI->getModule(), IID, + {NewPreds.first->getType(), + NewEltOffsets.first->getType(), + OldVals.first->getType()}); + + Value *LogNumBlock = CI->getArgOperand(1); + Value *Scale = CI->getArgOperand(2); + Value *Surface = CI->getArgOperand(3); + Value *Offset = CI->getArgOperand(4); + + CallInst *FirstScatter = + IntrinsicInst::Create(F, {NewPreds.first, LogNumBlock, Scale, Surface, + Offset, NewEltOffsets.first, OldVals.first}); + CallInst *SecondScatter = + IntrinsicInst::Create(F, {NewPreds.second, LogNumBlock, Scale, Surface, + Offset, NewEltOffsets.second, OldVals.second}); + + FirstScatter->insertAfter(CI); + SecondScatter->insertAfter(FirstScatter); + + CI->eraseFromParent(); +} + +void SplitGather(CallInst *CI) { + assert(GenXIntrinsic::getAnyIntrinsicID(CI) == + llvm::GenXIntrinsic::genx_gather_scaled); + Type *DstTy = CI->getType(); + unsigned NumElts = DstTy->getVectorNumElements(); + assert(NumElts % 2 == 0); + + Type *I32Ty = Type::getInt32Ty(CI->getContext()); + std::pair Splitters = GetUndefPair(I32Ty, NumElts / 2); + Splitters.first = FillVecWithSeqVals(Splitters.first, 0, CI); + Splitters.second = FillVecWithSeqVals(Splitters.second, NumElts / 2, CI); + + Value *Pred = CI->getArgOperand(0); + std::pair NewPreds = SplitVec(Pred, NumElts, CI, Splitters); + + Value *EltOffsets = CI->getArgOperand(5); + std::pair NewEltOffsets = + SplitVec(EltOffsets, NumElts, CI, Splitters); + + Value *OldVal = CI->getArgOperand(6); + std::pair OldVals = + SplitVec(OldVal, NumElts, CI, Splitters); + auto IID = llvm::GenXIntrinsic::genx_gather_scaled; + Function *F = GenXIntrinsic::getGenXDeclaration(CI->getModule(), IID, + {OldVals.first->getType(), + NewPreds.first->getType(), + NewEltOffsets.first->getType()}); + + Value *LogNumBlock = CI->getArgOperand(1); + Value 
*Scale = CI->getArgOperand(2); + Value *Surface = CI->getArgOperand(3); + Value *Offset = CI->getArgOperand(4); + + CallInst *FirstGather = + IntrinsicInst::Create(F, {NewPreds.first, LogNumBlock, Scale, Surface, + Offset, NewEltOffsets.first, OldVals.first}); + CallInst *SecondGather = + IntrinsicInst::Create(F, {NewPreds.second, LogNumBlock, Scale, Surface, + Offset, NewEltOffsets.second, OldVals.second}); + + FirstGather->insertAfter(CI); + SecondGather->insertAfter(FirstGather); + + Value *Joiner = FillVecWithSeqVals(GetUndefVec(I32Ty, NumElts), 0, CI); + IRBuilder<> Builder(CI); + Builder.SetInsertPoint(SecondGather->getNextNode()); + Value *JointGather = + Builder.CreateShuffleVector(FirstGather, SecondGather, Joiner); + + CI->replaceAllUsesWith(JointGather); + CI->eraseFromParent(); +} + +void GenXThreadPrivateMemory::addUsers(Instruction *I) { + assert(I); + for (auto Usr : I->users()) { + Instruction *ToAdd = cast(Usr); + auto Found = m_AlreadyAdded.find(ToAdd); + if (Found == m_AlreadyAdded.end()) { + m_AlreadyAdded.insert(ToAdd); + m_AIUsers.push(ToAdd); + } + } +} + +void GenXThreadPrivateMemory::collectEachAllocaUsers() { + assert(m_AIUsers.empty()); + m_AlreadyAdded.clear(); + for (auto B = m_allocaToIntrinsic.begin(), E = m_allocaToIntrinsic.end(); + B != E; ++B) { + Instruction *I = dyn_cast(B->first); + assert(I); + addUsers(I); + } +} + +void GenXThreadPrivateMemory::addUsersIfNeeded(Instruction *I) { + bool isGatherScatterPrivate = false; + if (IntrinsicInst *CI = dyn_cast(I)) { + unsigned ID = GenXIntrinsic::getAnyIntrinsicID(CI); + switch (ID) { + case GenXIntrinsic::genx_gather_private: + case GenXIntrinsic::genx_scatter_private: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + isGatherScatterPrivate = true; + break; + default: + break; + } + } + if (m_ST->useGlobalMem() || + (!isa(I) && !isa(I) && !isGatherScatterPrivate)) + addUsers(I); +} + +// pre-transformation analysis to determine +// which kind of mem should we place TPM at +static bool checkSVMNecessary(Instruction *Inst, int LoadsMet = 0) { + // do not handle ConstExprs for now + if (!Inst) + return false; + if (isa(Inst)) { + if (LoadsMet > 0) + return true; + else + ++LoadsMet; + } else if (auto *CI = dyn_cast(Inst)) { + auto IID = GenXIntrinsic::getAnyIntrinsicID(CI); + if (IID == GenXIntrinsic::genx_gather_private || + IID == GenXIntrinsic::genx_scatter_private || + IID == GenXIntrinsic::not_any_intrinsic) { + // do not process users of priv mem intrinsics + // or calls to other functions + return false; + } + } else if (isa(Inst)) { + // do not go thru phi as loops may appear and + // it doesn't seem necessary for the analysis now + return false; + } + bool Result = false; + for (auto *U : Inst->users()) { + Result |= checkSVMNecessary(dyn_cast(U), LoadsMet); + if (Result) + break; + } + return Result; +} + +// required to pass find_if's typecheck +static bool checkSVMNecessaryPred(Instruction *Inst) { + return checkSVMNecessary(Inst); +} + +bool GenXThreadPrivateMemory::runOnModule(Module &M) { + auto STP = getAnalysisIfAvailable(); + assert(STP); + m_ST = STP->getSubtarget(); + for (auto &F : M) + visit(F); + if (std::find_if(m_alloca.begin(), m_alloca.end(), checkSVMNecessaryPred) != + m_alloca.end()) + m_ST->setUseGlobalMem(); + bool Result = false; + for (auto &F : M) + Result |= runOnFunction(F); + return Result; +} + +bool GenXThreadPrivateMemory::runOnFunction(Function &F) { + m_DL = &F.getParent()->getDataLayout(); + m_stack = m_ST->stackSurface(); + + m_ctx = 
&F.getContext(); + m_DL = &F.getParent()->getDataLayout(); + m_alloca.clear(); + m_gather.clear(); + m_scatter.clear(); + m_allocaToIntrinsic.clear(); + m_AIUsers = {}; + m_AlreadyAdded.clear(); + + visit(F); + + for (auto Alloca : m_alloca) { + Type *AllocaTy = Alloca->getAllocatedType(); + + auto IID = llvm::GenXIntrinsic::genx_alloca; + Function *IntrDecl = GenXIntrinsic::getGenXDeclaration(Alloca->getModule(), IID, AllocaTy); + CallInst *AllocaIntr = + IntrinsicInst::Create(IntrDecl, {Constant::getNullValue(AllocaTy)}); + AllocaIntr->insertAfter(Alloca); + m_allocaToIntrinsic[Alloca] = AllocaIntr; + } + + // Firstly, we resolve dependencies in PHI nodes (see comments in + // preparePhiForReplacement). + collectEachAllocaUsers(); + bool Changed = false; + while (!m_AIUsers.empty()) { + Instruction *I = m_AIUsers.front(); + m_AIUsers.pop(); + + addUsersIfNeeded(I); + + if (PHINode *Phi = dyn_cast(I)) + Changed |= preparePhiForReplacement(Phi); + } + + // Main loop where instructions are replaced one by one. + collectEachAllocaUsers(); + while (!m_AIUsers.empty()) { + Instruction *I = m_AIUsers.front(); + m_AIUsers.pop(); + + addUsersIfNeeded(I); + + if (auto *LdI = dyn_cast(I)) + Changed |= replaceLoad(LdI); + else if (auto *StI = dyn_cast(I)) + Changed |= replaceStore(StI); + else if (auto *PTI = dyn_cast(I)) + Changed |= replacePTI(PTI); + else if (auto* AddrCast = dyn_cast(I)) + Changed |= replaceAddrSpaceCast(AddrCast); + else if (isa(I) || isa(I)) { + // resolve all IntToPtr users and remove it. + if (I->use_empty()) { + I->eraseFromParent(); + Changed = true; + } + } else if (IntrinsicInst *CI = dyn_cast(I)) { + unsigned ID = GenXIntrinsic::getAnyIntrinsicID(CI); + if (ID == GenXIntrinsic::genx_gather_private) + Changed |= replaceGatherPrivate(CI); + else if (ID == GenXIntrinsic::genx_scatter_private) + Changed |= replaceScatterPrivate(CI); + else if (ID == Intrinsic::lifetime_start || + ID == Intrinsic::lifetime_end) { + CI->eraseFromParent(); + Changed = true; + } + } else if (PHINode *Phi = dyn_cast(I)) { + if (isa(Phi->getType())) + Changed |= replacePhi(Phi); + } else if (SelectInst *Sel = dyn_cast(I)) { + if (isa(Sel->getType())) + Changed |= replaceSelect(Sel); + } + + if (m_AIUsers.empty()) { + if (!Changed) + report_fatal_error("Thread private memory: cannot resolve all alloca uses"); + Changed = false; + collectEachAllocaUsers(); + } + } + + for (auto AllocaPair : m_allocaToIntrinsic) { + assert(AllocaPair.first->use_empty() && + "uses of replaced alloca aren't empty"); + AllocaPair.first->eraseFromParent(); + } + + // TODO: Rewrite split conditions due to possible exec sizes are 1, 2, 4, 8, + // 16 and 32. + for (auto CI : m_gather) { + Type *DstTy = CI->getType(); + unsigned NumElts = DstTy->getVectorNumElements(); + unsigned EltSz = DstTy->getVectorElementType()->getPrimitiveSizeInBits(); + unsigned ExecSz = NumElts * EltSz; + + if (ExecSz > 2 * genx::GRFBits || NumElts > 32) + SplitGather(CI); + } + + for (auto CI : m_scatter) { + Type *DataTy = + CI->getArgOperand(m_ST->useGlobalMem() ? 
3 : 5)->getType(); + unsigned NumElts = DataTy->getVectorNumElements(); + unsigned EltSz = DataTy->getVectorElementType()->getPrimitiveSizeInBits(); + unsigned ExecSz = NumElts * EltSz; + + if (ExecSz > 2 * genx::GRFBits || NumElts > 32) + SplitScatter(CI); + } + + return !m_allocaToIntrinsic.empty(); +} + +void GenXThreadPrivateMemory::visitAllocaInst(AllocaInst &I) { + m_alloca.push_back(&I); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXTidyControlFlow.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXTidyControlFlow.cpp new file mode 100644 index 000000000000..4f905d4eaf61 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXTidyControlFlow.cpp @@ -0,0 +1,302 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXTidyControlFlow +/// ------------------- +/// +/// This pass tidies the control flow in the following ways: +/// +/// 1. It removes empty blocks (a block is empty if all it contains is an +/// unconditional branch), and thus reduces branch chains in the generated +/// code. It is needed because often a block inserted by critical edge +/// splitting is not needed for any phi copies. +/// +/// 2. It reorders blocks to increase fallthrough generally, and specifically +/// to ensure that SIMD CF goto and join have the required structure: the +/// "false" successor must be fallthrough and the "true" successor must be +/// forward. (The '"true" successor must be forward' requirement is a vISA +/// requirement, because vISA goto/join does not specify JIP, and the +/// finalizer reconstructs it on this assumption.) +/// +/// 3. fixGotoOverBranch: The pass spots where there is a SIMD CF goto over an +/// unconditional branch, and turns the combination into a backwards goto. +/// +/// After reordering blocks, we know that any simd goto has its "false" successor as +/// the following block. If all of the following are true: +/// +/// a. its "true" successor just branches over that same block; +/// +/// b. that block contains only an unconditional branch; +/// +/// c. the UIP of the goto (the join whose RM it updates) is the same as the +/// "true" successor; +/// +/// d. 
the goto condition is not constant 0 (this condition is because we +/// cannot represent a backwards simd goto with this, and it is too late to +/// allocate it a register); +/// +/// then we have the end of a simd do..while loop, and we can optimize to a +/// backwards simd goto. +/// +/// We represent a backwards simd goto in the IR by having the "true" +/// successor as the following block. GenXVisaFuncWriter can then spot that it +/// is a backwards simd goto, and it needs its condition inverting. +/// +/// 4. Ensure that there is a single return block and it is the last block. +/// These are required by the vISA's structurizer. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_TIDYCONTROLFLOW" + +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXGotoJoin.h" +#include "GenXLiveness.h" +#include "GenXModule.h" +#include "GenXNumbering.h" +#include "GenXSubtarget.h" +#include "GenXUtil.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; + +/*********************************************************************** + * GenXTidyControlFlow pass declaration + */ +namespace { + class GenXTidyControlFlow : public FunctionPass { + const GenXSubtarget *ST = nullptr; + bool Modified; + public: + static char ID; + explicit GenXTidyControlFlow() : FunctionPass(ID), Modified(false) {} + virtual StringRef getPassName() const { return "GenX tidy control flow"; } + + void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addRequired(); + } + + bool runOnFunction(Function &F); + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXPrinterPass(O, Banner); } + private: + void removeEmptyBlocks(Function *F); + void reorderBlocks(Function *F); + void fixGotoOverBranch(Function *F); + void fixReturns(Function *F); + }; +} // end anonymous namespace. + +char GenXTidyControlFlow::ID = 0; + +FunctionPass *llvm::createGenXTidyControlFlowPass() { + return new GenXTidyControlFlow; +} + +/*********************************************************************** + * GenXTidyControlFlow::runOnFunction : process a function + */ +bool GenXTidyControlFlow::runOnFunction(Function &F) +{ + auto P = getAnalysisIfAvailable(); + ST = P ? P->getSubtarget() : nullptr; + Modified = false; + removeEmptyBlocks(&F); + reorderBlocks(&F); + fixGotoOverBranch(&F); + fixReturns(&F); + return Modified; +} + +/*********************************************************************** + * removeEmptyBlocks + */ +void GenXTidyControlFlow::removeEmptyBlocks(Function *F) +{ + Function::iterator fi = F->begin(), fe = F->end(); + // Don't consider the entry block. + for (++fi; fi != fe; ) { + BasicBlock *BB = &*fi; + // Increment iterator here as we may be removing this block. + ++fi; + // FIXME: By claiming preserving liveness, we cannot remove phi(s) in empty + // blocks. Need to adjust the pass order if such phi(s) really need + // eliminating. 
+ BranchInst *BI = dyn_cast(&BB->front()); + if (!BI || !BI->isUnconditional()) + continue; + // Do not remove BB if it has more than one predecessor. + if (!BB->hasOneUse()) + continue; + // Check if this is a critical edge splitting block whose predecessor is + // the "false" leg of a goto/join. In that case we do not remove the + // block, as reorderBlocks below may rely on it to ensure that the "false" + // successor of a goto/join can be made fallthrough. + if (BB->hasOneUse() + && BB->use_begin()->getOperandNo() == 1 /*false successor*/ + && GotoJoin::isBranchingGotoJoinBlock(cast( + BB->use_begin()->getUser())->getParent())) { + LLVM_DEBUG(dbgs() << "removeEmptyBlocks: not removing " << BB->getName() << "\n"); + continue; + } + // We are removing this block. First adjust phi nodes in the successor. + auto Succ = BI->getSuccessor(0); + adjustPhiNodesForBlockRemoval(Succ, BB); + // Change all of BB's uses to use its successor instead. + assert(BB->getSinglePredecessor() != BB && "self loop"); + BB->replaceAllUsesWith(BI->getSuccessor(0)); + BI->eraseFromParent(); + BB->eraseFromParent(); + Modified = true; + } +} + +/*********************************************************************** + * reorderBlocks : reorder blocks to increase fallthrough, and specifically + * to satisfy the requirements of SIMD control flow + */ +void GenXTidyControlFlow::reorderBlocks(Function *F) +{ + LoopInfo& LI = getAnalysis().getLoopInfo(); + if (LI.empty()) + LayoutBlocks(*F); + else + LayoutBlocks(*F, LI); + Modified = true; +} + +/*********************************************************************** + * fixGotoOverBranch : fix a (simd) goto over a branch into a backwards goto + * + * See the comment at the top of the file. + */ +void GenXTidyControlFlow::fixGotoOverBranch(Function *F) +{ + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + auto Goto = GotoJoin::isGotoBlock(BB); + if (!Goto) + continue; + auto Br = cast(BB->getTerminator()); + if (!Br->isConditional()) + continue; + // We have a block ending with a conditional branch that is a goto. + // Now check whether it branches over an unconditional branch. + auto Succ = BB->getNextNode(); + if (!Succ || !Succ->hasOneUse()) + continue; + if (Br->getSuccessor(0)->getPrevNode() != Succ) + continue; + auto SuccBr = dyn_cast(Succ->getFirstNonPHIOrDbg()); + if (!SuccBr || SuccBr->isConditional()) + continue; + // The goto branches over just an unconditional branch. + // Check whether its UIP is the same as the branch target. + auto Join = GotoJoin::findJoin(Goto); + if (!Join || Join->getParent() != Br->getSuccessor(0)) + continue; + // Check that the goto condition is not constant. + if (isa(Goto->getOperand(2))) + continue; + // Change the goto's "false" successor to the target of the unconditional + // branch, and remove Succ so the goto's "true" successor becomes + // fallthrough. This then represents a backward goto. + adjustPhiNodesForBlockRemoval(SuccBr->getSuccessor(0), Succ); + Br->setSuccessor(1, SuccBr->getSuccessor(0)); + Succ->eraseFromParent(); + Modified = true; + } +} + +/****************************************************************************** + * fixReturns : only keep a single return block and ensure it is the last block + * of a function. + */ +void GenXTidyControlFlow::fixReturns(Function *F) { + // Loop over all of the blocks in a function, tracking all of the blocks + // that return. 
+ SmallVector ReturningBlocks; + for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) + if (isa(I->getTerminator())) + ReturningBlocks.push_back(&*I); + + // We need to insert a new basic block into the function, + // add a PHI nodes (if the function returns values), and convert + // all of the return instructions into unconditional branches. + // + if (ReturningBlocks.size() == 1) { + BasicBlock *RetBlock = ReturningBlocks.front(); + BasicBlock *LastBlock = &F->back(); + if (LastBlock != RetBlock) { + RetBlock->moveAfter(LastBlock); + Modified = true; + } + } else if (ReturningBlocks.size() > 1) { + BasicBlock *NewRetBlock = + BasicBlock::Create(F->getContext(), "UnifiedReturnBlock", F); + PHINode *PN = nullptr; + if (F->getReturnType()->isVoidTy()) + ReturnInst::Create(F->getContext(), nullptr, NewRetBlock); + else { + // If the function doesn't return void, add a PHI node to the block. + PN = PHINode::Create(F->getReturnType(), ReturningBlocks.size(), + "UnifiedRetVal"); + NewRetBlock->getInstList().push_back(PN); + ReturnInst::Create(F->getContext(), PN, NewRetBlock); + } + + // Loop over all of the blocks, replacing the return instruction with an + // unconditional branch. + for (auto BB : ReturningBlocks) { + // Add an incoming element to the PHI node for every return instruction + // that is merging into this new block. + if (PN) + PN->addIncoming(BB->getTerminator()->getOperand(0), BB); + + BB->getInstList().pop_back(); // Remove the return inst. + BranchInst::Create(NewRetBlock, BB); + } + Modified = true; + } +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXUnbaling.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXUnbaling.cpp new file mode 100644 index 000000000000..ffa5220fe2be --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXUnbaling.cpp @@ -0,0 +1,1204 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXUnbaling +/// ------------ +/// +/// After live range building, GenXUnbaling spots cases where baling is harmful +/// due to extending the live range of a big vector. 
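+///
+/// As a simplified illustration (hypothetical IR with made-up names, not
+/// taken from this pass), legalization may leave a sequence such as:
+///
+///   %w1 = wrregion(%old, %x0, ...)  ; %old is the two address operand
+///   %r1 = rdregion(%old, ...)       ; baled into the next wrregion
+///   %w2 = wrregion(%w1, %r1, ...)
+///
+/// The baled rdregion keeps %old live past %w1, so coalescing %old with %w1
+/// fails and a copy of the whole vector has to be inserted before %w1.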
+/// +/// The need for the unbaling pass +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// A *two address operation* (mainly wrregion, but also a few intrinsics that +/// do a predicated read from a shared function unit such as memory) is one +/// where the result needs to be in the same register as one operand, the *two +/// address operand*. GenXCoalescing attempts to coalesce the two together, but +/// it fails if the live range of the two address operand extends beyond the +/// two address instruction. Failure of coalescing means that you get extra +/// code inserted before to copy the whole big vector, and increased register +/// pressure because two values of the big vector are live at the same time. +/// +/// Similarly, with a phi node incoming, GenXCoalescing attempts to coalesce +/// the incoming with the phi node result. Failure means that you get extra +/// code inserted to copy the value at the end of the incoming block. +/// +/// The existence of this problem is due to our use of SSA. Both the input and +/// the output of the wrregion (or the phi incoming and result) are probably +/// the same big vector variable in the source code, and a more traditional +/// compiler would treat the variable as a single (non-SSA) value assigned to +/// its own register, avoiding the need to treat the wrregion specially as a +/// two address operation. +/// +/// With the traditional approach, code motion is more difficult, as an +/// instruction cannot be moved past any other instruction that modifies any of +/// the potentially moving instruction's uses. +/// +/// With our SSA approach, code motion (of an instruction with no memory +/// interaction) is much easier, and we use that in GenXBaling to bale an +/// instruction into another one without needing to check anything in between. +/// (Even though GenXBaling often does not actually move the baled in +/// instruction in IR, it must be treated as if it is at the position of the +/// head of the bale.) +/// +/// The price we pay for that flexibility is that sometimes we move code even +/// when it is harmful to do so. +/// +/// The most common situation where it would fail to coalesce is where +/// legalization has created a sequence of wrregions, and the "old value" input +/// to the first one is also used in a rdregion baled in to each one of the +/// wrregions. +/// +/// Other situations include where some other rdregion use of the two address +/// operand is user code that has been baled to after the instruction, and +/// where the user code actually takes a copy of the big vector and then uses +/// one or more regions out of it after the two address instruction. +/// +/// The GenXUnbaling pass implements two transformations: the non-overlapping +/// region optimization, and the unbaling transformation. +/// +/// Non-overlapping region optimization +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// A common cause of the two address operand, the "old value" input, of a +/// wrregion extending beyond the wrregion is that the wrregion is the first in +/// a sequence created by GenXLegalization, and the same vector is used as an +/// input to rdregions baled in to subsequent bales in the sequence. +/// +/// In this case, a baled in rdregion is reading a part of the vector that has +/// not been overwritten by any earlier wrregion in the sequence. +/// +/// The non-overlapping region optimization detects this case by checking which +/// regions of the vector have been overwritten by earlier wrregions in the +/// sequence. 
If the region read by the rdregion has not been overwritten, then +/// the optimization can change the input to the rdregion to the result of the +/// previous wrregion in the sequence without changing the semantics. +/// +/// If this succeeds for all the rdregions from the same vector in the +/// sequence, then the live range no longer reaches beyond the first wrregion +/// and it can be two address coalesced. +/// +/// The non-overlapping region optimization also handles a similar case where +/// the "old value" input to the first wrregion in the sequence is undef, but +/// of the same type as the input to rdregions through the sequence. As well as +/// modifying each rdregion input to the result of the previous wrregion, it +/// changes the undef input to the first wrregion to the same input vector. +/// This also stops the live range of the inputs to the rdregions overlapping +/// the result, and thus saves register pressure. However it can make the code +/// worse if there are further uses of the input after the sequence, so it only +/// makes the transformation if there are no further uses. +/// +/// Unbaling transformation +/// ^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// At its simplest, the unbaling transformation looks at each two address +/// instruction and phi node incoming, and then looks at the uses of the "old +/// value" input: +/// +/// * A use before the original two address instruction can be ignored as it +/// does not cause the "old value" input to be live beyond that instruction. +/// +/// * A use after the original two address instruction that is not a rdregion +/// cannot be handled, so causes the pass to abandon processing this original +/// two address operation. +/// +/// * A rdregion use after the original two address instruction is unbaled so +/// it regains its pre-baling position before the original two address +/// instruction. +/// +/// Thus the use of the "old value" input in the two address instruction +/// becomes a kill use, and coalescing at that instruction will succeed. Or the +/// phi incoming becomes a kill use, and coalescing it with the phi result will +/// succeed. +/// +/// But there are complications: +/// +/// Moving the unbaled instruction +/// """""""""""""""""""""""""""""" +/// +/// Unbaling an instruction means that its position in the code is now +/// considered to be the same as its position in the IR. Sometimes that is +/// where we want it (before the original two address instruction), since +/// baling tends not to move code. But sometimes it is still after the original +/// two address instruction, most likely because of the order of code split by +/// GenXLegalization. +/// +/// Thus we may need to move the unbaled instruction up to before the original +/// two address instruction. In fact we need to move the whole sub-bale (the +/// new bale headed by the instruction we are unbaling). A rdregion can have an +/// llvm.genx.add.address intrinsic baled in if it has a variable index. +/// +/// If the unbaled instruction is dominated by the original two address +/// instruction, we move it to just before that. Otherwise, we move it to the +/// end of the basic block that is the nearest common dominator of the two +/// locations. +/// +/// To move a bale up, we need to ensure that all outside-bale operands are +/// defined before where we are going to move it to. If that is not the case, +/// then unbaling for the original two address instruction fails. 
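Whether the sub-bale may be moved reduces to one question: is every outside-bale operand already defined at the chosen insertion point? A standalone sketch under the simplification that instructions are identified by linear program points (the pass itself answers this with the InstSeen set and dominator queries rather than explicit numbering; the names below are illustrative):

    #include <vector>

    // An instruction of the sub-bale, reduced to the program points at which
    // its outside-bale operands are defined.
    struct BaleInstModel {
      std::vector<unsigned> OperandDefPoints;
    };

    // The sub-bale can be moved to InsertPoint only if every outside-bale
    // operand of every instruction in it is defined strictly before that point.
    static bool canMoveBaleTo(const std::vector<BaleInstModel> &SubBale,
                              unsigned InsertPoint) {
      for (const BaleInstModel &I : SubBale)
        for (unsigned DefPoint : I.OperandDefPoints)
          if (DefPoint >= InsertPoint)
            return false; // an operand would not yet be available
      return true;
    }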
+/// +/// Moving when there is a variable index +/// """"""""""""""""""""""""""""""""""""" +/// +/// For a rdregion with a variable index, there is an llvm.genx.conv.address +/// intrinsic, which represents the setting of an address register relative to +/// the base register that the rdregion will access. GenXCategory ensures that +/// there is one llvm.genx.conv.address intrinsic for each variable index +/// rdregion or wrregion, since it does not know which region accesses are +/// going to be in the same register. Commoning up of address conversions is +/// done later, after coalescing has decided which ones are in the same base +/// register. +/// +/// The problem for GenXUnbaling is that the llvm.genx.conv.address is likely +/// to be just before the rdregion, which stops the rdregion being moved to +/// before the original two address instruction. +/// +/// The solution is to pretend that the llvm.genx.conv.address (and anything it +/// bales in, probably a zext/sext) is part of the rdregion's bale, just for +/// GenXUnbaling's purposes of telling whether it is OK to move it, and then +/// actually moving it. GenXBaling::buildBale() has an extra IncludeAddr flag +/// to enable this behavior. +/// +/// What is before and after? +/// """"""""""""""""""""""""" +/// +/// The notion of whether an instruction is before or after the original two +/// address instruction is more complex in the presence of control flow. +/// +/// This pass distinguishes the following cases: +/// +/// * Before: The instruction dominates the original two address instruction, +/// so can be considered before it. No use in the instruction reaches back to +/// the original two address instruction. +/// +/// * After: The original two address instruction dominates the instruction, so +/// can be considered after it. A use in the instruction causes liveness to +/// reach back to the original two address instruction (as long as the use's +/// definition is before that). +/// +/// * Reaches: Neither dominates the other, but a use in the instruction causes +/// liveness to reach back to the original two address instruction anyway. +/// This is determined by actually tracing back all the branches through the +/// control graph, abandoning a branch when it rejoins with another one or +/// reaches the definition. +/// +/// * Not reaches: Neither dominates the other, but we can prove that a use in +/// the instruction does not cause liveness to reach back to the original two +/// address instruction. +/// +/// When processing a phi incoming rather than a two address instruction, it is +/// considered to be at the end of the corresponding incoming block, rather +/// than at the site of the phi node. +/// +/// If we have "not reaches", then the use can be ignored in the same way as a +/// "before" use. +/// +/// If we have "reaches", then we can still unbale it. If the new sub-bale +/// needs moving, then we move it to the end of the block that is the nearest +/// common dominator of its old location and the original two address +/// instruction. +/// +/// A use in a phi node is considered to be at the end of the incoming block +/// for the purposes of determining its position. +/// +/// Commoning up unbaled sub-bales +/// """""""""""""""""""""""""""""" +/// +/// It is often the case that baling has caused the same rdregion to be cloned +/// (because a baled in instruction can only have a single use), so unbaling +/// the baled in rdregions causes duplicate instructions. 
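The REACHES/NOTREACHES distinction from the "What is before and after?" subsection above is decided by a backward walk over predecessor blocks that stops at already-visited blocks and at the defining block. A standalone sketch over an adjacency-list CFG with blocks as indices (getReachability() later in this file additionally handles the dominance cases and caches results per block):

    #include <set>
    #include <vector>

    // Does liveness of a use in UseBlock reach back to CurBlock without passing
    // through DefBlock?  Preds[B] lists the predecessors of block B.
    static bool reachesCurrentBlock(const std::vector<std::vector<int>> &Preds,
                                    int UseBlock, int CurBlock, int DefBlock) {
      std::vector<int> Stack{UseBlock};
      std::set<int> Seen;
      while (!Stack.empty()) {
        int Block = Stack.back();
        Stack.pop_back();
        if (!Seen.insert(Block).second)
          continue; // already traced through this block
        if (Block == CurBlock)
          return true; // REACHES: liveness gets back to the current position
        if (Block == DefBlock)
          continue; // hit the definition, stop tracing this branch
        for (int Pred : Preds[Block])
          Stack.push_back(Pred);
      }
      return false; // NOTREACHES
    }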
No CSE is run after +/// this point, as that would cause various problems including messing up the +/// baling and the address conversion. +/// +/// Therefore, this pass needs to spot when it is unbaling duplicate sub-bales +/// and common them up. +/// +/// Unbaling the main instruction instead of the rdregion +/// """"""""""""""""""""""""""""""""""""""""""""""""""""" +/// +/// In some cases, the rdregion is in a bale that also contains another +/// rdregion of the same big vector. Unbaling the two rdregions separately +/// would create two extra instructions. We can reduce that to one extra +/// instruction by instead unbaling the main instruction from the wrregion at +/// the head, so only the wrregion is left at its original position in the code +/// and the rest of the bale is moved up. +/// +/// The pass only does that if it detects more than one use of the big vector +/// in the bale. +/// +/// When trying to do this, and the proposed sub-bale needs to be moved rather +/// than just unbaled, we may see that not all outside-bale operands are +/// defined before the original two address instruction. In that case, we +/// abandon the attempt to unbale the main instruction, and instead go back to +/// unbaling just the rdregion, which may succeed. +/// +/// Bitcasts +/// """""""" +/// +/// Because GenXCoalescing does "copy coalescing" of bitcasts first, we need to +/// consider not just the rdregion uses of the input to the original two +/// address instruction, but also uses of the whole tree of bitcasts containing +/// it. Not doing that stops the optimization working when the source CM code +/// contains format() functions. +/// +/// Such bitcasts may need to be moved up to just before the original two +/// address instruction, in case any use of it is moved. In fact we just move +/// the whole tree of bitcasts to just after the definition of the root of the +/// tree. This does not worsen code quality because the bitcasts will all be +/// copy coalesced together anyway. +/// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_UNBALING" + +#include "FunctionGroup.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXGotoJoin.h" +#include "GenXIntrinsics.h" +#include "GenXLiveness.h" +#include "GenXNumbering.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/Debug.h" + +#include + +using namespace llvm; +using namespace genx; + +namespace { + +class GenXUnbaling : public FunctionGroupPass { + enum { UNKNOWN, BEFORE, AFTER, NOTREACHES, REACHES }; + + GenXBaling *Baling; + GenXLiveness *Liveness; + GenXNumbering *Numbering; + DominatorTree *DT; + bool Modified; + BasicBlock *CurBlock; + std::map ReachabilityCache; + std::set InstSeen; + ValueMap InstSeenInProcessNonOverlappingRegion; + SmallVector ToErase; + // Fields used to process a single two address instruction. 
+ struct ToUnbaleEntry { + Instruction *Inst; // instruction to unbale + Instruction *InsertBefore; // where to move it to, 0 if no move + ToUnbaleEntry(Instruction *Inst, Instruction *InsertBefore) + : Inst(Inst), InsertBefore(InsertBefore) {} + }; + SmallVector ToUnbale; + std::map CommonBaleMap; +public: + static char ID; + explicit GenXUnbaling() : FunctionGroupPass(ID) {} + StringRef getPassName() const override { return "GenX unbaling"; } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnFunctionGroup(FunctionGroup &FG) override; + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + override { return createGenXGroupPrinterPass(O, Banner); } +private: + void processFunc(Function *F); + void shortenLiveRanges(Function *F); + bool interfere(Value *V1, Value *V2); + void processTwoAddrOrPhi(Instruction *Inst, unsigned TwoAddrOperandNum); + bool scanUsesForUnbaleAndMove(Instruction *Inst, Value *TwoAddrOperand); + int getReachability(Instruction *Inst, Instruction *Def); + void processNonOverlappingRegion(CallInst *Wr); +}; + +} // end anonymous namespace + +namespace llvm { void initializeGenXUnbalingPass(PassRegistry &); } +char GenXUnbaling::ID = 0; +INITIALIZE_PASS_BEGIN(GenXUnbaling, "GenXUnbaling", "GenXUnbaling", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeGroupWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GenXGroupBaling) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXNumbering) +INITIALIZE_PASS_END(GenXUnbaling, "GenXUnbaling", "GenXUnbaling", false, false) + +FunctionGroupPass *llvm::createGenXUnbalingPass() { + initializeGenXUnbalingPass(*PassRegistry::getPassRegistry()); + return new GenXUnbaling(); +} + +void GenXUnbaling::getAnalysisUsage(AnalysisUsage &AU) const { + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + AU.setPreservesCFG(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the liveness analysis for this FunctionGroup + */ +bool GenXUnbaling::runOnFunctionGroup(FunctionGroup &FG) { + Baling = &getAnalysis(); + Liveness = &getAnalysis(); + Numbering = &getAnalysis(); + Modified = false; + for (auto fgi = FG.begin(), fge = FG.end(); fgi != fge; ++fgi) + processFunc(*fgi); + return Modified; +} + +/*********************************************************************** + * processFunc : process one function in GenXUnbaling + * + * This does a postordered depth first traversal of the CFG, processing + * instructions within a basic block in reverse, to ensure that we see a def + * after its uses (ignoring phi node uses). That is required for the + * non-overlapping region optimization, as we need to perform that on a bale + * before an earlier wrregion sees the use in the rdregion and unbales it. + */ +void GenXUnbaling::processFunc(Function *F) { + LLVM_DEBUG(dbgs() << "GenXUnbaling on " << F->getName() << "\n"); + DT = getAnalysis().getDomTree(F); + for (po_iterator i = po_begin(&F->getEntryBlock()), + e = po_end(&F->getEntryBlock()); i != e; ++i) { + CurBlock = *i; + // Process our incomings of successors' phi nodes. 
+ auto TI = CurBlock->getTerminator(); + for (unsigned si = 0, se = TI->getNumSuccessors(); si != se; ++si) { + BasicBlock *Succ = TI->getSuccessor(si); + for (auto bi = Succ->begin(); ; ++bi) { + auto Phi = dyn_cast(bi); + if (!Phi) + break; + unsigned IncomingNum = Phi->getBasicBlockIndex(CurBlock); + processTwoAddrOrPhi(Phi, IncomingNum); + } + } + for (auto Inst = &CurBlock->back(); Inst; + Inst = Inst == &CurBlock->front() ? nullptr : Inst->getPrevNode()) { + // Process a two address instruction. (All two address instructions are + // intrinsics and thus calls.) + if (auto CI = dyn_cast(Inst)) { + int TwoAddrOperandNum = getTwoAddressOperandNum(CI); + if (TwoAddrOperandNum >= 0) { + processTwoAddrOrPhi(CI, TwoAddrOperandNum); + if (GenXIntrinsic::isWrRegion(CI)) + processNonOverlappingRegion(CI); + } + } + // Mark the instruction as seen. + InstSeen.insert(Inst); + } + InstSeen.clear(); + InstSeenInProcessNonOverlappingRegion.clear(); + ReachabilityCache.clear(); + for (auto i = ToErase.begin(), e = ToErase.end(); i != e; ++i) + (*i)->eraseFromParent(); + ToErase.clear(); + } + + shortenLiveRanges(F); +} + +/*********************************************************************** + * shortenLiveRanges : hoist rdregions if this helps to avoid copy coalescing. + * + * %1 = wrregion ... + * ... + * %2 = wrregion(%1, ...) + * ... + * %3 = rdregion (%1, ...) + * no other uses of %1 except rdregions + * + * In this situation, compiler will do copy coalescing(See GenXCoalescing) %2 + * from %1. If %1 is a big region, we will have a lot of movs. But if %3 reads + * a small region, it's cheaper to hoist it between %1 and %2. Compiler will + * generate a copy for this small region, but %2 will be coalesced without + * copying. + */ +void GenXUnbaling::shortenLiveRanges(Function *F) { + for (po_iterator i = po_begin(&F->getEntryBlock()), + e = po_end(&F->getEntryBlock()); + i != e; ++i) { + BasicBlock *BB = *i; + for (Instruction &Inst : *BB) { + auto DstRegion = dyn_cast(&Inst); + if (DstRegion && GenXIntrinsic::isWrRegion(DstRegion)) { + // now we've found %2 = wrregion. Firstly, let's check that %1 and %2 + // interfere and after search for rdregions(%3 and others). + auto SrcRegion = dyn_cast(DstRegion->getOperand(0)); + if (!SrcRegion || !GenXIntrinsic::isWrRegion(SrcRegion) || + !interfere(SrcRegion, DstRegion)) + continue; + + // Collect all %1 users that are "under" %2. + unsigned DstNumber = Numbering->getNumber(DstRegion); + SmallVector ToHoist; + std::copy_if(SrcRegion->user_begin(), SrcRegion->user_end(), + std::back_inserter(ToHoist), + [DstNumber, N = Numbering](User *U) { + return DstNumber < N->getNumber(U); + }); + bool CanHoist = + std::all_of(ToHoist.begin(), ToHoist.end(), [BB](User *U) { + return U->isUsedInBasicBlock(BB) && GenXIntrinsic::isRdRegion(U); + }); + if (!CanHoist || ToHoist.empty()) + continue; + + // Is it reasonable to hoist rdregions? Let's compare the number of + // elements to copy in both cases. 
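The comparison asked for by the comment above amounts to summing the element counts of the rdregions that would be hoisted and checking the total against the element count of the whole source vector. A standalone sketch of that profitability test (the name is illustrative):

    #include <numeric>
    #include <vector>

    // Hoisting pays off only if copying the hoisted rdregions' (small) regions
    // is cheaper than the whole-vector copy that failed coalescing would cause.
    static bool hoistingIsProfitable(const std::vector<unsigned> &RdRegionSizes,
                                     unsigned SrcVectorElements) {
      unsigned EltsToCopy =
          std::accumulate(RdRegionSizes.begin(), RdRegionSizes.end(), 0u);
      return EltsToCopy < SrcVectorElements;
    }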
+ unsigned NumEltsToCopy = std::accumulate( + ToHoist.begin(), ToHoist.end(), 0u, [](unsigned Init, User *U) { + return Init + cast(U->getType())->getNumElements(); + }); + if (NumEltsToCopy >= + cast(SrcRegion->getType())->getNumElements()) + continue; + + // Unbale and hoist + for (User *U : ToHoist) { + auto RdR = dyn_cast(U); + assert(RdR && GenXIntrinsic::isRdRegion(RdR)); + Instruction *InsertBefore = DstRegion; + if (auto UnbaleFrom = Baling->getBaleParent(RdR)) { + BaleInfo BI = Baling->getBaleInfo(UnbaleFrom); + BI.clearOperandBaled(RdR->use_begin()->getOperandNo()); + Baling->setBaleInfo(UnbaleFrom, BI); + } + RdR->moveBefore(InsertBefore); + Modified = true; + } + } + } + } +} + +bool GenXUnbaling::interfere(Value *V1, Value *V2) { + assert(V1); + assert(V2); + + LiveRange *V1LR = Liveness->getLiveRangeOrNull(V1); + LiveRange *V2LR = Liveness->getLiveRangeOrNull(V2); + // We cannot analyze without LR. + if (!V1LR || !V2LR) + return false; + return Liveness->twoAddrInterfere(V1LR, V2LR); +} + +/*********************************************************************** + * processTwoAddrOrPhi : process a two address instruction or phi node + * incoming + * + * Enter: Inst = two address inst or phi node + * TwoAddrOperandNum = two address operand number (incoming number + * for phi) + * + * For a phi node incoming, this is called when CurBlock and InstSeen reflect + * that processing has reached the end of the incoming's block, rather than the + * start of the block containing the phi node itself. + */ +void GenXUnbaling::processTwoAddrOrPhi(Instruction *Inst, + unsigned TwoAddrOperandNum) { + Value *TwoAddrOperand = Inst->getOperand(TwoAddrOperandNum); + if (isa(TwoAddrOperand)) + return; + LLVM_DEBUG(dbgs() << "\nGenXUnbaling::processTwoAddrOrPhi[" << TwoAddrOperandNum + << "]: " << *Inst << "\n"); + if (!scanUsesForUnbaleAndMove(Inst, TwoAddrOperand)) + return; + // Move the tree of bitcasts containing TwoAddrOperand to just after its def. + // (If that would be before a phi node, because the def is a phi node other + // than the last in its block, then insert just before first non-phi in the + // block. If the def is an Argument, insert at the start of the code.) We may + // need to move some of them earlier if their uses are going to be moved, and + // just moving them all as early as possible is easiest. That does not + // affect register pressure or code size as a bitcast generates no code and + // is copy coalesced together. + // + // We do not worry about the possibility of moving the bitcasts into a join + // label block. Although a join label block must start with a join after the + // phi nodes, bitcasts are allowed as they generate no code. + Value *Root = TwoAddrOperand; + while (auto BC = dyn_cast(Root)) + Root = BC->getOperand(0); + Value *V = Root; + Instruction *InsertBefore = nullptr; + if (auto I = dyn_cast(Root)) { + InsertBefore = I->getNextNode(); + if (isa(InsertBefore)) + InsertBefore = InsertBefore->getParent()->getFirstNonPHI(); + } else + InsertBefore = Inst->getParent()->getParent()->front().getFirstNonPHI(); + SmallVector BitCastQueue; + for (unsigned bci = 0;;) { + // For this value, find uses that are bitcast and save them. + for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) + if (auto BC = dyn_cast(ui->getUser())) + BitCastQueue.push_back(BC); + // Go on to the next bitcast in the queue. + if (bci == BitCastQueue.size()) + break; + auto BC = BitCastQueue[bci++]; + // Move this bitcast. 
+ if (BC == InsertBefore) + InsertBefore = BC->getNextNode(); + else + BC->moveBefore(InsertBefore); + V = BC; + } + // Unbale and/or move uses found in scanUsesForUnbaleAndMove(). + for (auto ti = ToUnbale.begin(), te = ToUnbale.end(); ti != te; ++ti) { + Instruction *Unbale = ti->Inst; + Instruction *InsertBefore = ti->InsertBefore; + LLVM_DEBUG(dbgs() << "Unbaling and/or moving " << Unbale->getName() + << " (or removing if it is a duplicate)\n"); + // Unbale from its bale parent (if any). + if (auto UnbaleFrom = Baling->getBaleParent(Unbale)) { + LLVM_DEBUG(dbgs() << "Unbaling " << Unbale->getName() << " from " + << UnbaleFrom->getName() << " in bale " + << Baling->getBaleHead(UnbaleFrom)->getName() << "\n"); + BaleInfo BI = Baling->getBaleInfo(UnbaleFrom); + BI.clearOperandBaled(Unbale->use_begin()->getOperandNo()); + Baling->setBaleInfo(UnbaleFrom, BI); + } + auto Found = CommonBaleMap.find(Unbale); + if (Found != CommonBaleMap.end()) { + LLVM_DEBUG(dbgs() << "Duplicate of " << Found->second->getName() + << ", removing\n"); + Unbale->replaceAllUsesWith(Found->second); + Bale B; + Baling->buildBale(Unbale, &B, /*IncludeAddr=*/true); + Liveness->removeBale(B); + B.eraseFromParent(); + } else { + // Move it if necessary. + if (InsertBefore) { + LLVM_DEBUG(dbgs() << "Moving bale at " << Unbale->getName() + << " to before " << InsertBefore->getName() + << " in " << InsertBefore->getParent()->getName() << "\n"); + Bale B; + Baling->buildBale(Unbale, &B, /*IncludeAddr=*/true); + for (auto bi = B.begin(), be = B.end(); bi != be; ++bi) { + auto MoveInst = bi->Inst; + LLVM_DEBUG(dbgs() << " moving " << MoveInst->getName() << "\n"); + MoveInst->moveBefore(InsertBefore); + } + } + } + } + Modified = true; +} + +/*********************************************************************** + * scanUsesForUnbaleAndMove : scan uses of TwoAddrOperand to see if we can + * unbale and/or move them to before the current position + * + * Enter: Inst = instruction at current position + * TwoAddrOperand : value whose uses we scan + * + * Return: true if we want to unbale/move some uses + * + * This function clears then populates the following GenXUnbaling fields: + * + * ToUnbale = vector to store instructions that want to be unbaled and/or moved. + * CommonBaleMap = map to store mapping for common bales. + * + * A duplicate instruction is also in ToUnbale, but after the instruction it + * duplicates. + * + * The function spots the following cases (picking the first that applies): + * + * 1. All uses already before Inst. Returns false. + * 2. There is some use whose liveness reaches back to Inst, but is not + * dominated by Inst, so we cannot do anything. Returns false. + * 3. There is some use in an instruction after Inst which we cannot unbale + * and/or move so it is before Inst because it has an outside-bale operand + * whose def is not before Inst. Returns false. + * 4. All uses after Inst can be unbaled and/or moved, but (after commoning + * them up) that would result in a number of extra instructions that + * outweights the number saved by failing to coalesce Inst. Returns false. + * 5. There is some use in an instruction after Inst that is not a rdregion + * use. We cannot do anything with that. Returns false. + * 6. Otherwise, return true to tell the caller to go ahead and unbale/move + * the instructions in ToUnbale (or common up with another one if it is + * in CommonBaleMap). + * + * We also need to look at uses of a tree of bitcasts of TwoAddrOperand, as + * they will be copy coalesced. 
+ */ +bool GenXUnbaling::scanUsesForUnbaleAndMove(Instruction *Inst, + Value *TwoAddrOperand) { + ToUnbale.clear(); + CommonBaleMap.clear(); + std::set UseSeen; + std::set CommonBales; + unsigned UnbaleCount = 0; + // Scan uses of TwoAddrOperand, and, if any use is a bitcast, scan its uses, + // and so on through the tree of bitcasts. If TwoAddrOperand is itself the + // result of a bitcast, scan up to the root of the bitcast tree first. + SmallVector BitCasts; + Value *Root = TwoAddrOperand; + while (auto BC = dyn_cast(Root)) + Root = BC->getOperand(0); + for (unsigned bci = 0;;) { + for (auto ui = Root->use_begin(), ue = Root->use_end(); + ui != ue; ++ui) { + auto User = cast(ui->getUser()); + if (auto Phi = dyn_cast(User)) { + if (Phi == Inst) + continue; // Ignore use in phi node that we started at. + // For a phi node, determine the use's position relative to the current + // position as if it is at the end of the incoming block. + int Position = getReachability( + Phi->getIncomingBlock(*ui)->getTerminator(), + dyn_cast(TwoAddrOperand)); + LLVM_DEBUG(dbgs() << "phi use in " << User->getName() << " is " + << (Position == BEFORE ? "before" : (Position == AFTER ? "after" + : (Position == REACHES ? "reaches" : (Position == NOTREACHES + ? "notreaches" : "unknown")))) << "\n"); + if (Position == BEFORE || Position == NOTREACHES) + continue; + return false; + } + auto UserHead = Baling->getBaleHead(User); + if (UserHead == Inst) + continue; // Ignore use in wrregion Inst that we started at. + LLVM_DEBUG(dbgs() << "use in " << *User << "\n"); + if (!UseSeen.insert(User).second) { + LLVM_DEBUG(dbgs() << "use in " << User->getName() + << " has already been accounted for\n"); + continue; + } + // Determine the use's position relative to the current position. We use + // the bale head's position. + int Position = + getReachability(UserHead, dyn_cast(TwoAddrOperand)); + LLVM_DEBUG(dbgs() << "use in " << User->getName() << " is " + << (Position == BEFORE ? "before" : (Position == AFTER ? "after" + : (Position == REACHES ? "reaches" : (Position == NOTREACHES + ? "notreaches" : "unknown")))) << "\n"); + if (Position == NOTREACHES) + continue; // ignore use unreachable from Inst + if (isa(User)) { + // This is a bitcast -- add it to BitCasts so we use it as a Root later + // and scan its uses (even if it is before Inst, as its uses might + // still be after Inst). + LLVM_DEBUG(dbgs() << "use in " << User->getName() << " is bitcast\n"); + BitCasts.push_back(User); + continue; + } + if (Position == BEFORE) + continue; // Ignore use that is already before Inst. + // Check that the use is operand 0 of rdregion. + if (ui->getOperandNo() || !GenXIntrinsic::isRdRegion(User)) { + LLVM_DEBUG(dbgs() << "use in " << User->getName() + << " is after but is not rdregion\n"); + return false; + } + // If the result of the rdregion is too big (more than 32 elements or + // more than 2 GRFs), we cannot unbale it. This happens with an rdregion + // baled in to a raw operand of a shared function intrinsic. Unbaling it + // would result in an illegally wide instruction. + if (auto VT = dyn_cast(User->getType())) { + if (VT->getNumElements() > 32U + || VT->getPrimitiveSizeInBits() > 512U) { + LLVM_DEBUG(dbgs() << User->getName() << " is too wide to unbale\n"); + return false; + } + } + // We have decided that this use needs unbaling and/or moving. Decide how + // we are going to do it, without actually doing it yet. First assume + // that we're going to unbale User from its bale parent, if it is baled + // at all. 
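One step implemented further down in this loop is the commoning-up of duplicate sub-bales: because a baled-in instruction can only have a single use, baling clones rdregions, so identical sub-bales are keyed in the CommonBales set and later copies are redirected through CommonBaleMap to the first one. A standalone sketch of that de-duplication, using a plain string signature in place of the real bale hash (types and names here are illustrative):

    #include <map>
    #include <string>
    #include <vector>

    // Stand-in for an unbaled sub-bale: Id identifies its head instruction and
    // Signature is a canonical description of its contents.
    struct SubBaleModel {
      int Id;
      std::string Signature;
    };

    // For each sub-bale return the id of its representative: itself if it is
    // the first of its kind, otherwise the earlier identical sub-bale it
    // should be replaced with.
    static std::vector<int>
    commonUpDuplicates(const std::vector<SubBaleModel> &Bales) {
      std::map<std::string, int> Seen; // signature -> representative id
      std::vector<int> Representative;
      for (const SubBaleModel &B : Bales)
        Representative.push_back(Seen.insert({B.Signature, B.Id}).first->second);
      return Representative;
    }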
+ Instruction *Unbale = User; + Bale B; + if (GenXIntrinsic::isWrRegion(UserHead)) { + // The bale head is a wrregion. Unbale the main instruction from it, + // rather than just the user of the overlapping vector, as long as the + // resulting smaller bale contains at least two uses of TwoAddrOperand + // (or a bitcast thereof), and each outside-bale operand in the bale is + // defined before Inst. + Unbale = dyn_cast( + UserHead->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + if (Unbale) { + // We use IncludeAddr=true on the buildBale. That makes it include + // any address calculation (convert.addr and add.addr ops), even + // though they are not baled in. What that gives us is: + // + // 1. When comparing bales in the CommonUses set to find another bale + // that we can common up with, it makes two rdregions look the + // same even though they have separate copies of their address + // calculation. + // + // 2. The code here that checks if all the outside-bale operands are + // defined early enough and then moves the bale also moves the + // address calculation, which is what we want. + Baling->buildBale(Unbale, &B, /*IncludeAddr=*/true); + B.hash(); + LLVM_DEBUG(B.print(dbgs())); + // Check for multiple uses. (A use is always in operand 0 of + // rdregion.) + unsigned UseCount = 0; + for (auto bi = B.begin(), be = B.end(); bi != be; ++bi) { + if (bi->Info.Type != BaleInfo::RDREGION) + continue; + Value *Opnd = bi->Inst->getOperand(0); + if (Opnd == Root) + ++UseCount; + else + for (auto ri = BitCasts.begin(), + re = BitCasts.end(); ri != re; ++ri) + if (bi->Inst->getOperand(0) == *ri) + ++UseCount; + } + assert(UseCount >= 1); + if (UseCount <= 1) { + // Did not get multiple uses. Just unbale the rdregion use. + if (Unbale != User) { + B.clear(); + Unbale = User; + } + } else { + LLVM_DEBUG(dbgs() << "Trying unbale from wrregion\n"); + if (!UseSeen.insert(Unbale).second) { + LLVM_DEBUG(dbgs() << "use (unbale from wrregion) in " + << User->getName() + << " has already been accounted for\n"); + continue; + } + } + } + } + if (!Unbale) + return false; + // Loop to try unbaling from wrregion first, then try just unbaling the + // rdregion. + Instruction *InsertBefore = nullptr; // start assuming not moving sub-bale + for (;;) { + // Build the sub-bale we are proposing to unbale (if not already built + // in the code above). See comment above about using IncludeAddr=true. + if (B.empty()) { + Baling->buildBale(Unbale, &B, /*IncludeAddr=*/true); + B.hash(); + LLVM_DEBUG(B.print(dbgs())); + } + // Get the position relative to Inst of the sub-bale we propose to + // unbale. If it is already BEFORE, then we don't need to check for all + // outside-bale operands being before Inst. + int UnbalePos = getReachability(Unbale, + dyn_cast(TwoAddrOperand)); + LLVM_DEBUG(dbgs() << "proposed unbale " << Unbale->getName() << " is " + << (Position == BEFORE ? "before" : (Position == AFTER ? "after" + : (Position == REACHES ? "reaches" : (Position == NOTREACHES + ? "notreaches" : "unknown")))) << "\n"); + if (UnbalePos == BEFORE) { + InsertBefore = nullptr; // no need to move instruction + break; // ok to unbale here + } + // We need to move the unbaled instruction. Work out where we need to + // move it to. + if (UnbalePos == AFTER && !isa(Inst)) + InsertBefore = Inst; // insert before original two addr inst + else { + // The instruction to be unbaled is not dominated by the original two + // addr inst, or we were processing a phi incoming rather than a two + // addr inst. 
We want to find the nearest common dominator and insert + // at the end of that block. + InsertBefore = DT->findNearestCommonDominator( + CurBlock, Unbale->getParent())->getTerminator(); + // Ensure we have a legal insertion point in the presence of SIMD CF. + InsertBefore = GotoJoin::getLegalInsertionPoint(InsertBefore, DT); + } + // We will need to move the unbaled instruction to before Inst. Check + // that each outside-bale operand in the bale is defined before the + // insert point. + bool IsBeforeInst = true; + for (auto bi = B.begin(), be = B.end(); bi != be; ++bi) { + for (unsigned oi = 0, oe = bi->Inst->getNumOperands(); + oi != oe && IsBeforeInst; ++oi) { + if (!bi->Info.isOperandBaled(oi)) { + auto Opnd = bi->Inst->getOperand(oi); + // Check for Opnd's definition being before the insert point: + // + // 1. If it is an Argument rather than an Instruction, it is + // before. + if (auto OpndInst = dyn_cast(Opnd)) { + // 2. If in same basic block: + // 2a. If insert point is Inst (the original two addr inst), + // use InstSeen to work out if it is before or after. + // 2b. Otherwise, it is always before because InsertBefore is + // at the end of its basic block. + if (OpndInst->getParent() == InsertBefore->getParent()) { + if (InsertBefore == Inst) + IsBeforeInst &= OpndInst != Inst + && InstSeen.find(OpndInst) == InstSeen.end(); + } else + // 3. If in different basic block, check dominance. + IsBeforeInst &= DT->dominates( + OpndInst->getParent(), InsertBefore->getParent()); + } + if (!IsBeforeInst) { + LLVM_DEBUG(dbgs() << " outside-bale operand " << Opnd->getName() + << " is not before Inst\n"); + break; + } + } + } + } + if (IsBeforeInst) { + // OK to unbale and move to InsertBefore. + break; + } + // We have failed, either by Unbale's position being REACHES so we + // can't move it, or by its position being AFTER so we need to move it + // but there is an outside-bale operand that is not before Inst. + if (Unbale != User) { + // This is the case that we were trying to unbale out of the + // wrregion. This has now failed, and we re-try unbaling just the + // rdregion use. + LLVM_DEBUG(dbgs() << "Failed to unbale out of wrregion; " + << "retrying at rdregion\n"); + Unbale = User; + B.clear(); + continue; + } + // We have found an outside-bale operand that is not defined before + // Inst, presumably an operand to the address calculation of the + // rdregion. We have to give up at this point. + LLVM_DEBUG(dbgs() << "Failed to unbale rdregion; abandon\n"); + return false; + } + LLVM_DEBUG(dbgs() << "Can unbale and/or move\n"); + // See if we already have a common bale. If so, point this use at it. + auto Found = CommonBales.find(B); + if (Found != CommonBales.end()) { + LLVM_DEBUG(dbgs() << "Found common bale " + << Found->getHead()->Inst->getName() << "\n"); + CommonBaleMap[Unbale] = Found->getHead()->Inst; + } else { + CommonBales.insert(B); + // If there will actually be an unbale, count it. + UnbaleCount += Baling->isBaled(Unbale); + } + // Add this bale to the list of bales to unbale and/or move. + LLVM_DEBUG( + if (!InsertBefore) + dbgs() << "Adding " << Unbale->getName() << " to ToUnbale list\n"; + else + dbgs() << "Adding " << Unbale->getName() << " (with move to before " + << InsertBefore->getName() << " in " + << InsertBefore->getParent()->getName() << ") to Unbale list\n"; + ); + ToUnbale.push_back(ToUnbaleEntry(Unbale, InsertBefore)); + } + // Also look at uses of bitcasts in the bitcast tree. 
+ if (bci == BitCasts.size()) + break; + Root = BitCasts[bci++]; + } + if (ToUnbale.empty()) { + LLVM_DEBUG(dbgs() << "Nothing to unbale/move, " + << "must already be kill use at Inst\n"); + return false; + } + // Calculate how many instructions would be needed for the copy caused by + // TwoAddrOperand failing to coalesce with Inst, and compare that with the + // number of extra instructions caused by the unbaling that we propose to do + // to avoid it. + unsigned NumBytes = TwoAddrOperand->getType()->getPrimitiveSizeInBits() / 8U; + unsigned NumCopies = NumBytes / 64U; // one copy per 2 GRFs + NumBytes -= NumCopies * 64U; + NumCopies += countPopulation(NumBytes); // extra copy per power of 2 + LLVM_DEBUG(dbgs() << NumCopies << " copy insts, vs " + << UnbaleCount << " unbales\n"); + if (NumCopies < UnbaleCount) { + LLVM_DEBUG(dbgs() << "Too many new instructions, code would be worse.\n"); + return false; + } + LLVM_DEBUG(dbgs() << "We have uses to unbale/move.\n"); + return true; +} + +/*********************************************************************** + * getReachability : determine relationship of Inst with current position + * + * Enter: Inst = instruction to get position of + * Def = 0 else instruction that defines use whose liveness we are + * interested in + * + * Return: BEFORE: Inst is before current pos (Inst dominates current pos) + * AFTER: Inst is after current pos (current pos dominates Inst) + * REACHES: no dominance, and liveness of use in Inst reaches back to + * current pos without passing through Def + * NOTREACHES: no dominance, and liveness of use in Inst does not reach + * back to current pos without passing through Def + * + * In the case that there is no simple dominance relationship between Inst and + * the current position, Def is used to stop the backwards scan. For a value + * defined inside a loop, if you don't supply def then this function will + * always return REACHES as it will trace backwards round the loop. + * + * The current position is represented by CurBlock and which already seen + * instructions in that block are in InstSeen. + * + * We keep a cache of results. This is cleared when the current basic block + * changes. + */ +int GenXUnbaling::getReachability(Instruction *Inst, Instruction *Def) +{ + auto Block = Inst->getParent(); + // Check simple case of same basic block. + if (CurBlock == Block) + return InstSeen.find(Inst) != InstSeen.end() ? AFTER : BEFORE; + // Check ReachabilityCache. + auto It = ReachabilityCache.insert( + std::pair(Block, UNKNOWN)).first; + if (It->second != UNKNOWN) + return It->second; + // Check dominance. + if (DT->dominates(Block, CurBlock)) + return It->second = BEFORE; + if (DT->dominates(CurBlock, Block)) + return It->second = AFTER; + // Trace liveness of use in Inst backwards and see if we reach CurBlock. + BasicBlock *DefBlock = nullptr; + if (Def) + DefBlock = Def->getParent(); + SmallVector Stack; + std::set BlockSeen; + Stack.push_back(Block); + while (!Stack.empty()) { + Block = Stack.back(); + Stack.pop_back(); + if (!BlockSeen.insert(Block).second) + continue; // already seen, terminate this branch of the scan + if (Block == CurBlock) + return It->second = REACHES; // reached current pos + if (Block == DefBlock) + continue; // reached def, terminate this branch of the scan + // Add the predecessors of this block to the stack. 
+ for (auto ui = Block->use_begin(), ue = Block->use_end(); ui != ue; ++ui) + Stack.push_back(cast(ui->getUser())->getParent()); + } + return It->second = NOTREACHES; +} + +/*********************************************************************** + * processNonOverlappingRegion : perform the non-overlapping region optimization + * + * Enter: EndWr = wrregion instruction for possible end of wrregion sequence + * + * If EndWr is head of a bale that includes a rdregion, and it is part of a + * sequence of wrregions whose first "old value" input is the same as the input + * to the rdregion, then check whether the rdregion's region has been + * overwritten in the sequence. If not, change the rdregion's input to the same + * as that of Wr. + * + * The idea is that we can avoid overlapping live ranges and hence unbaling. + * + * This also handles the case that the "old value" input to the start wrregion + * is undef, and we want to make the transformation (and change that start + * wrregion input too) to save a live range overlap in the sequence. However, + * we only do that if we can prove that it does not make the code worse, which + * it does if the rdregion input is still live after the sequence. + */ +void GenXUnbaling::processNonOverlappingRegion(CallInst *EndWr) +{ + // Avoid processing a sequence of N wrregions N times, giving O(N^2) + // complexity -- only process when we see the end of the sequence. + if (InstSeenInProcessNonOverlappingRegion.find(EndWr) + != InstSeenInProcessNonOverlappingRegion.end()) + return; + // Find the sequence of wrregions, each except the last having the next as + // its only use. + CallInst *StartWr = EndWr; + Value *StartWrInput = nullptr; + bool WrVariableIndex = false; + for (;;) { + WrVariableIndex |=!isa( + StartWr->getOperand(GenXIntrinsic::GenXRegion::WrIndexOperandNum)); + StartWrInput = + StartWr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + if (!GenXIntrinsic::isWrRegion(StartWrInput)) + break; + if (!StartWrInput->hasOneUse()) + break; + StartWr = cast(StartWrInput); + InstSeenInProcessNonOverlappingRegion[StartWr] = true; + } + if (StartWr == EndWr) + return; // no sequence + if (WrVariableIndex) + return; // Can't deal with variable index + Value *RdInput = StartWrInput; + if (isa(StartWrInput)) { + // In the case that the input to the start wrregion is undef, we need to + // find a rdregion input that is the same type. + RdInput = nullptr; + Bale B; + Baling->buildBale(StartWr, &B); + for (auto bi = B.begin(), be = B.end(); bi != be; ++bi) { + if (bi->Info.Type != BaleInfo::RDREGION) + continue; + Value *Input = bi->Inst->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + if (Input->getType() != StartWrInput->getType()) + continue; + RdInput = Input; + if (isa(Input)) { + // Prefer to save a live-range on Phi, which may help to + // save phi copies. This is observed on Histogram1. + break; + } + } + if (!RdInput) + return; // no such input found + // We need to check that RdInput is not used again after this sequence, + // otherwise we could be making the code worse. The use of RdInput is + // counted as being at its user's bale head. 
+    auto Def = dyn_cast<Instruction>(RdInput);
+    for (auto ui = RdInput->use_begin(), ue = RdInput->use_end();
+         ui != ue; ++ui) {
+      auto User = cast<Instruction>(ui->getUser());
+      auto UserHead = Baling->getBaleHead(User);
+      switch (getReachability(UserHead, Def)) {
+      case AFTER:
+      case REACHES:
+        return;
+      }
+    }
+  }
+  // Scan forwards through the wrregion sequence, keeping track of which
+  // elements of the vector keep their original values. Then for each one see
+  // if it has a rdregion whose input is the same as the first wrregion's "old
+  // value" input. If so, and the region has not been overwritten by wrregions
+  // so far, remember it as one that we want to change. We calculate which
+  // regions have been overwritten by starting with a vector of all 0s and then
+  // simulating the writes by writing -1s. If the region we want at the end is
+  // still all 0s, then it has not been overwritten.
+  SmallVector<std::pair<Instruction *, Value *>, 4> RdsToModify;
+  Constant *C = Constant::getNullValue(EndWr->getType());
+  for (auto ThisWr = StartWr;;) {
+    // For elements overwritten by ThisWr, change the corresponding elements
+    // in C to all ones.
+    Region R(ThisWr, BaleInfo());
+    C = R.evaluateConstantWrRegion(C,
+        Constant::getAllOnesValue(ThisWr->getOperand(1)->getType()));
+    // Move on to next wrregion.
+    if (ThisWr == EndWr)
+      break;
+    ThisWr = cast<CallInst>(ThisWr->use_begin()->getUser());
+    // Scan the rdregions in ThisWr's bale.
+    Bale B;
+    Baling->buildBale(ThisWr, &B);
+    for (auto bi = B.begin(), be = B.end(); bi != be; ++bi) {
+      if (bi->Info.Type != BaleInfo::RDREGION)
+        continue;
+      if (bi->Inst->getOperand(0) != RdInput)
+        continue;
+      Instruction *Rd = bi->Inst;
+      // See if the rdregion only reads a region that has not been overwritten
+      // by any wrregion up to now.
+      Region RdR(Rd, BaleInfo());
+      if (RdR.Indirect)
+        return; // Fail if rdregion is indirect
+      Constant *SubC = RdR.evaluateConstantRdRegion(C, /*AllowScalar=*/false);
+      if (!SubC->isNullValue())
+        return; // Fail if reads overwritten region
+      // Remember this rdregion for modifying.
+      RdsToModify.push_back(
+          std::pair<Instruction *, Value *>(Rd, ThisWr->getOperand(0)));
+    }
+  }
+  // No failures, so do the modification.
+  if (RdsToModify.empty())
+    return;
+  Modified = true;
+  SmallVector<Instruction *, 4> UselessWrRegions;
+  for (auto ri = RdsToModify.begin(), re = RdsToModify.end(); ri != re; ++ri) {
+    // Change the input to the rdregion.
+    auto Rd = ri->first;
+    auto RdInput = ri->second;
+    Rd->setOperand(0, RdInput);
+    // Check for the case that we have a rdregion-wrregion bale that is now
+    // useless because it reads and writes the same region.
+    auto Wr = Baling->getBaleParent(Rd);
+    if (GenXIntrinsic::isWrRegion(Wr)
+        && Region(Wr, BaleInfo()) == Region(Rd, BaleInfo())) {
+      UselessWrRegions.push_back(Wr);
+      continue;
+    }
+    // We already know that the rdregion's position in generated code (as
+    // reflected by the order of heads of bales) is after the instruction
+    // generating its new input. However, ignoring baling, it might actually be
+    // _before_ that instruction in the IR, which causes the verifier pass to
+    // complain. We work around that by moving the rdregion (and any other
+    // instruction in the bale between it and the head) to just before the head
+    // of its bale.
+ SmallVector BaleTrace; + BaleTrace.push_back(Rd); + for (;;) { + auto Parent = Baling->getBaleParent(BaleTrace.back()); + if (!Parent) + break; + BaleTrace.push_back(Parent); + } + for (unsigned i = 0, e = BaleTrace.size() - 1; i != e; ++i) { + auto InstToMove = BaleTrace[i]; + InstToMove->moveBefore(BaleTrace.back()); + } + } + // For the undef input case, also modify that. + if (isa(StartWrInput)) + StartWr->setOperand(0, RdInput); + // Now remove the useless wrregions found above. + for (auto i = UselessWrRegions.begin(), e = UselessWrRegions.end(); + i != e; ++i) { + auto Wr = *i; + auto Rd = cast( + Wr->getOperand(GenXIntrinsic::GenXRegion::NewValueOperandNum)); + Wr->replaceAllUsesWith( + Wr->getOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum)); + Liveness->removeValue(Wr); + Liveness->removeValue(Rd); + ToErase.push_back(Wr); + ToErase.push_back(Rd); + } +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.cpp new file mode 100644 index 000000000000..eb7dab05963a --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.cpp @@ -0,0 +1,1446 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// Utility functions for the GenX backend. 
+// +//===----------------------------------------------------------------------===// +#include "GenXUtil.h" +#include "FunctionGroup.h" +#include "GenXIntrinsics.h" +#include "GenXRegion.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" + +#include + +using namespace llvm; +using namespace genx; + +namespace { +struct InstScanner { + Instruction *Original; + Instruction *Current; + InstScanner(Instruction *Inst) : Original(Inst), Current(Inst) {} +}; + +} // namespace + +/*********************************************************************** + * createConvert : create a genx_convert intrinsic call + * + * Enter: In = value to convert + * Name = name to give convert instruction + * InsertBefore = instruction to insert before else 0 + * M = Module (can be 0 as long as InsertBefore is not 0) + */ +CallInst *genx::createConvert(Value *In, const Twine &Name, + Instruction *InsertBefore, Module *M) +{ + if (!M) + M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_convert, + In->getType()); + return CallInst::Create(Decl, In, Name, InsertBefore); +} + +/*********************************************************************** + * createConvertAddr : create a genx_convert_addr intrinsic call + * + * Enter: In = value to convert + * Offset = constant offset + * Name = name to give convert instruction + * InsertBefore = instruction to insert before else 0 + * M = Module (can be 0 as long as InsertBefore is not 0) + */ +CallInst *genx::createConvertAddr(Value *In, int Offset, const Twine &Name, + Instruction *InsertBefore, Module *M) +{ + if (!M) + M = InsertBefore->getParent()->getParent()->getParent(); + auto OffsetVal = ConstantInt::get(In->getType()->getScalarType(), Offset); + Function *Decl = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_convert_addr, + In->getType()); + Value *Args[] = { In, OffsetVal }; + return CallInst::Create(Decl, Args, Name, InsertBefore); +} + +/*********************************************************************** + * createAddAddr : create a genx_add_addr intrinsic call + * + * InsertBefore can be 0 so the new instruction is not inserted anywhere, + * but in that case M must be non-0 and set to the Module. + */ +CallInst *genx::createAddAddr(Value *Lhs, Value *Rhs, const Twine &Name, + Instruction *InsertBefore, Module *M) +{ + if (!M) + M = InsertBefore->getParent()->getParent()->getParent(); + Value *Args[] = {Lhs, Rhs}; + Type *Tys[] = {Rhs->getType(), Lhs->getType()}; + Function *Decl = GenXIntrinsic::getGenXDeclaration(M, GenXIntrinsic::genx_add_addr, Tys); + return CallInst::Create(Decl, Args, Name, InsertBefore); +} + +/*********************************************************************** + * createUnifiedRet : create a dummy instruction that produces dummy + * unified return value. 
+ * + * %Name.unifiedret = call Ty @llvm.ssa_copy(Ty undef) + */ +CallInst *genx::createUnifiedRet(Type *Ty, const Twine &Name, Module *M) { + assert(Ty && M && "wrong arguments"); + auto G = Intrinsic::getDeclaration(M, Intrinsic::ssa_copy, Ty); + return CallInst::Create(G, UndefValue::get(Ty), Name + ".unifiedret", + static_cast(nullptr)); +} + +/*********************************************************************** + * getPredicateConstantAsInt : get an i1 or vXi1 constant's value as a single integer + */ +unsigned genx::getPredicateConstantAsInt(Constant *C) +{ + if (auto CI = dyn_cast(C)) + return CI->getZExtValue(); // scalar + unsigned Bits = 0; + unsigned NumElements = cast(C->getType())->getNumElements(); + for (unsigned i = 0; i != NumElements; ++i) { + auto El = C->getAggregateElement(i); + if (!isa(El)) + Bits |= (cast(El)->getZExtValue() & 1) << i; + } + return Bits; +} + +/*********************************************************************** + * getConstantSubvector : get a contiguous region from a vector constant + */ +Constant *genx::getConstantSubvector(Constant *V, + unsigned StartIdx, unsigned Size) +{ + Type *ElTy = cast(V->getType())->getElementType(); + Type *RegionTy = VectorType::get(ElTy, Size); + if (isa(V)) + V = UndefValue::get(RegionTy); + else if (isa(V)) + V = ConstantAggregateZero::get(RegionTy); + else { + SmallVector Val; + for (unsigned i = 0; i != Size; ++i) + Val.push_back(V->getAggregateElement(i + StartIdx)); + V = ConstantVector::get(Val); + } + return V; +} + +/*********************************************************************** + * concatConstants : concatenate two possibly vector constants, giving a + * vector constant + */ +Constant *genx::concatConstants(Constant *C1, Constant *C2) +{ + assert(C1->getType()->getScalarType() == C2->getType()->getScalarType()); + Constant *CC[] = { C1, C2 }; + SmallVector Vec; + bool AllUndef = true; + for (unsigned Idx = 0; Idx != 2; ++Idx) { + Constant *C = CC[Idx]; + if (auto VT = dyn_cast(C->getType())) { + for (unsigned i = 0, e = VT->getNumElements(); i != e; ++i) { + Constant *El = C->getAggregateElement(i); + Vec.push_back(El); + AllUndef &= isa(El); + } + } else { + Vec.push_back(C); + AllUndef &= isa(C); + } + } + auto Res = ConstantVector::get(Vec); + if (AllUndef) + Res = UndefValue::get(Res->getType()); + return Res; +} + +/*********************************************************************** + * findClosestCommonDominator : find closest common dominator of some instructions + * + * Enter: DT = dominator tree + * Insts = the instructions + * + * Return: The one instruction that dominates all the others, if any. + * Otherwise the terminator of the closest common dominating basic + * block. + */ +Instruction *genx::findClosestCommonDominator(DominatorTree *DT, + ArrayRef Insts) +{ + assert(!Insts.empty()); + SmallVector InstScanners; + // Find the closest common dominating basic block. + Instruction *Inst0 = Insts[0]; + BasicBlock *NCD = Inst0->getParent(); + InstScanners.push_back(InstScanner(Inst0)); + for (unsigned ii = 1, ie = Insts.size(); ii != ie; ++ii) { + Instruction *Inst = Insts[ii]; + if (Inst->getParent() != NCD) { + auto NewNCD = DT->findNearestCommonDominator(NCD, Inst->getParent()); + if (NewNCD != NCD) + InstScanners.clear(); + NCD = NewNCD; + } + if (NCD == Inst->getParent()) + InstScanners.push_back(Inst); + } + // Now we have NCD = the closest common dominating basic block, and + // InstScanners populated with the instructions from Insts that are + // in that block. 
+ if (InstScanners.empty()) { + // No instructions in that block. Return the block's terminator. + return NCD->getTerminator(); + } + if (InstScanners.size() == 1) { + // Only one instruction in that block. Return it. + return InstScanners[0].Original; + } + // Create a set of the original instructions. + std::set OrigInsts; + for (auto i = InstScanners.begin(), e = InstScanners.end(); i != e; ++i) + OrigInsts.insert(i->Original); + // Scan back one instruction at a time for each scanner. If a scanner reaches + // another original instruction, the scanner can be removed, and when we are + // left with one scanner, that must be the earliest of the original + // instructions. If a scanner reaches the start of the basic block, that was + // the earliest of the original instructions. + // + // In the worst case, this algorithm could scan all the instructions in a + // basic block, but it is designed to be better than that in the common case + // that the original instructions are close to each other. + for (;;) { + for (auto i = InstScanners.begin(), e = InstScanners.end(); i != e; ++i) { + if (i->Current == &i->Current->getParent()->front()) + return i->Original; // reached start of basic block + i->Current = i->Current->getPrevNode(); + if (OrigInsts.find(i->Current) != OrigInsts.end()) { + // Scanned back to another instruction in our original set. Remove + // this scanner. + *i = InstScanners.back(); + InstScanners.pop_back(); + if (InstScanners.size() == 1) + return InstScanners[0].Original; // only one scanner left + break; // restart loop so as not to confuse the iterator + } + } + } +} + +/*********************************************************************** + * getTwoAddressOperandNum : get operand number of two address operand + * + * If an intrinsic has a "two address operand", then that operand must be + * in the same register as the result. This function returns the operand number + * of the two address operand if any, or -1 if not. 
+ */ +int genx::getTwoAddressOperandNum(CallInst *CI) +{ + auto IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(CI); + if (IntrinsicID == GenXIntrinsic::not_any_intrinsic) + return -1; // not intrinsic + if (GenXIntrinsic::isWrRegion(IntrinsicID) || + IntrinsicID == GenXIntrinsic::genx_wrpredregion || + IntrinsicID == GenXIntrinsic::genx_wrpredpredregion) + return 0; // wr(pred(pred))region has operand 0 as two address operand + if (CI->getType()->isVoidTy()) + return -1; // no return value + GenXIntrinsicInfo II(IntrinsicID); + unsigned Num = CI->getNumArgOperands(); + if (!Num) + return -1; // no args + --Num; // Num = last arg number, could be two address operand + if (isa(CI->getOperand(Num))) + return -1; // operand is undef, must be RAW_NULLALLOWED + if (II.getArgInfo(Num).getCategory() != GenXIntrinsicInfo::TWOADDR) + return -1; // not two addr operand + if (CI->use_empty() && II.getRetInfo().rawNullAllowed()) + return -1; // unused result will be V0 + return Num; // it is two addr +} + +/*********************************************************************** + * isNot : test whether an instruction is a "not" instruction (an xor with + * constant all ones) + */ +bool genx::isNot(Instruction *Inst) +{ + if (Inst->getOpcode() == Instruction::Xor) + if (auto C = dyn_cast(Inst->getOperand(1))) + if (C->isAllOnesValue()) + return true; + return false; +} + +/*********************************************************************** + * isPredNot : test whether an instruction is a "not" instruction (an xor + * with constant all ones) with predicate (i1 or vector of i1) type + */ +bool genx::isPredNot(Instruction *Inst) +{ + if (Inst->getOpcode() == Instruction::Xor) + if (auto C = dyn_cast(Inst->getOperand(1))) + if (C->isAllOnesValue() && C->getType()->getScalarType()->isIntegerTy(1)) + return true; + return false; +} + +/*********************************************************************** + * isIntNot : test whether an instruction is a "not" instruction (an xor + * with constant all ones) with non-predicate type + */ +bool genx::isIntNot(Instruction *Inst) +{ + if (Inst->getOpcode() == Instruction::Xor) + if (auto C = dyn_cast(Inst->getOperand(1))) + if (C->isAllOnesValue() && !C->getType()->getScalarType()->isIntegerTy(1)) + return true; + return false; +} + +/*********************************************************************** + * ShuffleVectorAnalyzer::getAsSlice : see if the shufflevector is a slice on + * operand 0, and if so return the start index, or -1 if it is not a slice + */ +int ShuffleVectorAnalyzer::getAsSlice() +{ + unsigned WholeWidth = SI->getOperand(0)->getType()->getVectorNumElements(); + Constant *Selector = cast(SI->getOperand(2)); + unsigned Width = SI->getType()->getVectorNumElements(); + unsigned StartIdx = cast( + Selector->getAggregateElement((unsigned)0))->getZExtValue(); + if (StartIdx >= WholeWidth) + return -1; // start index beyond operand 0 + unsigned SliceWidth; + for (SliceWidth = 1; SliceWidth != Width; ++SliceWidth) { + auto CI = dyn_cast(Selector->getAggregateElement(SliceWidth)); + if (!CI) + break; + if (CI->getZExtValue() != StartIdx + SliceWidth) + return -1; // not slice + } + return StartIdx; +} + +/*********************************************************************** + * ShuffleVectorAnalyzer::isReplicatedSlice : check if the shufflevector + * is a replicated slice on operand 0. 
+ */ +bool ShuffleVectorAnalyzer::isReplicatedSlice() const { + const auto MaskVals = SI->getShuffleMask(); + auto Begin = MaskVals.begin(); + auto End = MaskVals.end(); + + // Check for undefs. + if (std::find(Begin, End, -1) != End) + return false; + + if (MaskVals.size() == 1) + return true; + + // Slice should not touch second operand. + auto MaxIndex = static_cast(MaskVals.back()); + if (MaxIndex >= SI->getOperand(0)->getType()->getVectorNumElements()) + return false; + + // Find first non-one difference. + auto SliceEnd = + std::adjacent_find(Begin, End, + [](int Prev, int Next) { return Next - Prev != 1; }); + // If not found, then it is simple slice. + if (SliceEnd == End) + return true; + + // Compare slice with parts of sequence to prove that it is periodic. + ++SliceEnd; + unsigned SliceSize = std::distance(Begin, SliceEnd); + // Slice should be replicated. + if (MaskVals.size() % SliceSize != 0) + return false; + + for (auto It = SliceEnd; It != End; std::advance(It, SliceSize)) + if (!std::equal(Begin, SliceEnd, It)) + return false; + + return true; +} + +// Based on the value of a shufflevector mask element defines in which of +// 2 operands it points. The operand is returned. +static Value *getOperandByMaskValue(const ShuffleVectorInst &SI, + int MaskValue) { + assert(MaskValue >= 0 && "invalid index"); + int FirstOpSize = SI.getOperand(0)->getType()->getVectorNumElements(); + if (MaskValue < FirstOpSize) + return SI.getOperand(0); + else { + int SecondOpSize = SI.getOperand(1)->getType()->getVectorNumElements(); + assert(MaskValue < FirstOpSize + SecondOpSize && "invalid index"); + return SI.getOperand(1); + } +} + +// safe advance +// If adding \p N results in bound violation, \p Last is written to \p It +template void advanceSafe(Iter &It, Iter Last, int N) { + if (N > std::distance(It, Last)) { + It = Last; + return; + } + std::advance(It, N); +} + +// Returns operand and its region of 1 element that is referenced by +// \p MaskVal element of shufflevector mask. +static ShuffleVectorAnalyzer::OperandRegionInfo +matchOneElemRegion(const ShuffleVectorInst &SI, int MaskVal) { + ShuffleVectorAnalyzer::OperandRegionInfo Init; + Init.Op = getOperandByMaskValue(SI, MaskVal); + Init.R = Region(Init.Op); + Init.R.NumElements = Init.R.Width = 1; + if (Init.Op == SI.getOperand(0)) + Init.R.Offset = MaskVal * Init.R.ElementBytes; + else { + auto FirstOpSize = SI.getOperand(0)->getType()->getVectorNumElements(); + Init.R.Offset = (MaskVal - FirstOpSize) * Init.R.ElementBytes; + } + return Init; +} + +// Takes shufflevector mask indexes from [\p FirstIt, \p LastIt), +// converts them to the indexes of \p Operand of \p SI instruction +// and writes them to \p OutIt. +// Invalid indexes become negative numbers. +template +void makeSVIIndexesOperandIndexes(const ShuffleVectorInst &SI, + const Value &Operand, ForwardIter FirstIt, + ForwardIter LastIt, OutputIter OutIt) { + int FirstOpSize = SI.getOperand(0)->getType()->getVectorNumElements(); + if (&Operand == SI.getOperand(0)) { + std::transform(FirstIt, LastIt, OutIt, [FirstOpSize](int MaskVal) { + if (MaskVal >= FirstOpSize) + return -1; + return MaskVal; + }); + return; + } + assert(&Operand == SI.getOperand(1) && + "wrong argument: a shufflevector operand was expected"); + std::transform(FirstIt, LastIt, OutIt, + [FirstOpSize](int MaskVal) { return MaskVal - FirstOpSize; }); +} + +// Matches "vector" region (with vstride == 0) pattern in +// [\p FirstIt, \p LastIt) indexes. 
+// Uses info in \p FirstElemRegion, adds defined Width, Stride and +// new NumElements to \p FirstElemRegion and returns resulting region. +// +// Arguments: +// [\p FirstIt, \p LastIt) is the range of indexes into some vector. +// Negative index means invalid index. +// \p FirstElemRegion describes one element region with only one index +// *FirstIt. +template +Region matchVectorRegionByIndexes(Region FirstElemRegion, ForwardIter FirstIt, + ForwardIter LastIt) { + assert(FirstIt != LastIt && "the range must contain at least 1 element"); + + if (std::distance(FirstIt, LastIt) == 1) + return FirstElemRegion; + int Stride = *std::next(FirstIt) - *FirstIt; + if (Stride < 0) + return FirstElemRegion; + auto NewRowIt = + std::adjacent_find(FirstIt, LastIt, [Stride](int First, int Second) { + return Second < 0 || Second - First != Stride; + }); + if (NewRowIt != LastIt) { + ++NewRowIt; + } + int Width = std::distance(FirstIt, NewRowIt); + assert(Width > 0 && "should be at least 1 according to algorithm"); + if (Width == 1) + // Stride doesn't play role when the Width is 1. + // Also it prevents from writing to big value in the region. + Stride = 0; + FirstElemRegion.Stride = Stride; + FirstElemRegion.Width = Width; + FirstElemRegion.NumElements = Width; + return FirstElemRegion; +} + +// Matches "matrix" region (vstride may not equal to 0) pattern in +// [\p FirstIt, \p LastIt) index. +// Uses info in \p FirstRowRegion, adds defined VStride and new NumElements to +// \p FirstRowRegion and returns resulting region. +// +// Arguments: +// [\p FirstIt, \p LastIt) is the range of indexes into some vector. +// Negative index means invalid index. +// \p FirstRowRegion describes "vector" region (with vstride == 0), +// which is formed by first 'FirstRowRegion.NumElements' elements +// of the range. +template +Region matchMatrixRegionByIndexes(Region FirstRowRegion, ForwardIter FirstIt, + ForwardIter LastIt) { + assert(FirstRowRegion.NumElements == FirstRowRegion.Width && + FirstRowRegion.VStride == 0 && + "wrong argunent: vector region (with no vstride) was expected"); + +// TODO: rewrite this assert to remove VS build error +// assert(std::distance(FirstIt, LastIt) >= FirstRowRegion.Width && +// "wrong argument: number of indexes must be at least equal to region " +// "width"); + + auto FirstRowEndIt = std::next(FirstIt, FirstRowRegion.Width); + if (FirstRowEndIt == LastIt) + return FirstRowRegion; + int VStride = *FirstRowEndIt - *FirstIt; + if (VStride < 0) + return FirstRowRegion; + + int Width = FirstRowRegion.Width; + int VDistance = VStride; + int NumElements = Width; + for (auto It = FirstRowEndIt; It != LastIt; advanceSafe(It, LastIt, Width), + NumElements += Width, VDistance += VStride) { + if (std::distance(It, LastIt) < Width || + !std::equal(FirstIt, FirstRowEndIt, It, + [VDistance](int Reference, int Current) { + return Current - Reference == VDistance && Current >= 0; + })) + break; + } + if (NumElements == Width) + // VStride doesn't play role when the Width is equal to NumElements. + // Also it prevents from writing to big value in the region. + VStride = 0; + FirstRowRegion.VStride = VStride; + FirstRowRegion.NumElements = NumElements; + return FirstRowRegion; +} + +// Analyzes shufflevector mask starting from \p StartIdx element of it. +// Finds the longest prefix of the cutted shufflevector mask that can be +// represented as a region of one operand of the instruction. +// Returns the operand and its region. 
+// +// For example: +// {0, 1, 3, 4, 25, 16 ...} -> first 4 elements form a region: +// <3;2,1> vstride=3, width=2, stride=1 +ShuffleVectorAnalyzer::OperandRegionInfo +ShuffleVectorAnalyzer::getMaskRegionPrefix(int StartIdx) { + assert(StartIdx >= 0 && + StartIdx < static_cast(SI->getShuffleMask().size()) && + "Start index is out of bound"); + + auto MaskVals = SI->getShuffleMask(); + auto StartIt = std::next(MaskVals.begin(), StartIdx); + OperandRegionInfo Res = matchOneElemRegion(*SI, *StartIt); + + if (StartIdx == MaskVals.size() - 1) + return Res; + + makeSVIIndexesOperandIndexes(*SI, *Res.Op, StartIt, MaskVals.end(), StartIt); + + Res.R = matchVectorRegionByIndexes(std::move(Res.R), StartIt, MaskVals.end()); + Res.R = matchMatrixRegionByIndexes(std::move(Res.R), StartIt, MaskVals.end()); + return Res; +} + +/*********************************************************************** + * ShuffleVectorAnalyzer::getAsUnslice : see if the shufflevector is an + * unslice where the "old value" is operand 0 and operand 1 is another + * shufflevector and operand 0 of that is the "new value" + * + * Return: start index, or -1 if it is not an unslice + */ +int ShuffleVectorAnalyzer::getAsUnslice() +{ + auto SI2 = dyn_cast(SI->getOperand(1)); + if (!SI2) + return -1; + Constant *MaskVec = cast(SI->getOperand(2)); + // Find prefix of undef or elements from operand 0. + unsigned OldWidth = SI2->getType()->getVectorNumElements(); + unsigned NewWidth = SI2->getOperand(0)->getType()->getVectorNumElements(); + unsigned Prefix = 0; + for (;; ++Prefix) { + if (Prefix == OldWidth - NewWidth) + break; + Constant *IdxC = MaskVec->getAggregateElement(Prefix); + if (isa(IdxC)) + continue; + unsigned Idx = cast(IdxC)->getZExtValue(); + if (Idx == OldWidth) + break; // found end of prefix + if (Idx != Prefix) + return -1; // not part of prefix + } + // Check that the whole of SI2 operand 0 follows + for (unsigned i = 1; i != NewWidth; ++i) { + Constant *IdxC = MaskVec->getAggregateElement(Prefix + i); + if (isa(IdxC)) + continue; + if (cast(IdxC)->getZExtValue() != i + OldWidth) + return -1; // not got whole of SI2 operand 0 + } + // Check that the remainder is undef or elements from operand 0. + for (unsigned i = Prefix + NewWidth; i != OldWidth; ++i) { + Constant *IdxC = MaskVec->getAggregateElement(i); + if (isa(IdxC)) + continue; + if (cast(IdxC)->getZExtValue() != i) + return -1; + } + // Check that the first Prefix elements of SI2 come from its operand 1. + Constant *MaskVec2 = cast(SI2->getOperand(2)); + for (unsigned i = 0; i != Prefix; ++i) { + Constant *IdxC = MaskVec2->getAggregateElement(Prefix + i); + if (isa(IdxC)) + continue; + if (cast(IdxC)->getZExtValue() != i) + return -1; + } + // Success. + return Prefix; +} + +/*********************************************************************** + * ShuffleVectorAnalyzer::getAsSplat : if shufflevector is a splat, get the + * splatted input, with its vector index if the input is a vector + */ +ShuffleVectorAnalyzer::SplatInfo ShuffleVectorAnalyzer::getAsSplat() +{ + Value *InVec1 = SI->getOperand(0); + Value *InVec2 = SI->getOperand(1); + Constant *MaskVec = cast(SI->getOperand(2)); + ConstantInt *IdxVal = dyn_cast_or_null(MaskVec->getSplatValue()); + if (!IdxVal) + return SplatInfo(0, 0); + // The mask is a splat. Work out which element of which input vector + // it refers to. 
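+  // Illustrative sketch (example values): with a 4-element %InVec1 and a
+  // splat mask value of 5, ShuffleIdx 5 lies beyond operand 0, so below it
+  // is rebased to element 1 of operand 1.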
+ unsigned ShuffleIdx = IdxVal->getSExtValue(); + unsigned InVec1NumElements = InVec1->getType()->getVectorNumElements(); + if (ShuffleIdx >= InVec1NumElements) { + ShuffleIdx -= InVec1NumElements; + InVec1 = InVec2; + } + if (auto IE = dyn_cast(InVec1)) { + if (InVec1NumElements == 1 || isa(IE->getOperand(0))) + return SplatInfo(IE->getOperand(1), 0); + // Even though this is a splat, the input vector has more than one + // element. IRBuilder::CreateVectorSplat does this. See if the input + // vector is the result of an insertelement at the right place, and + // if so return that. Otherwise we end up allocating + // an unnecessarily large register. + if (auto ConstIdx = dyn_cast(IE->getOperand(2))) + if (ConstIdx->getSExtValue() == ShuffleIdx) + return SplatInfo(IE->getOperand(1), 0); + } + return SplatInfo(InVec1, ShuffleIdx); +} + +Value *ShuffleVectorAnalyzer::serialize() { + unsigned Cost0 = getSerializeCost(0); + unsigned Cost1 = getSerializeCost(1); + + Value *Op0 = SI->getOperand(0); + Value *Op1 = SI->getOperand(1); + Value *V = Op0; + bool UseOp0AsBase = Cost0 <= Cost1; + if (!UseOp0AsBase) + V = Op1; + + // Expand or shink the initial value if sizes mismatch. + unsigned NElts = SI->getType()->getVectorNumElements(); + unsigned M = V->getType()->getVectorNumElements(); + bool SkipBase = true; + if (M != NElts) { + if (auto C = dyn_cast(V)) { + SmallVector Vals; + for (unsigned i = 0; i < NElts; ++i) { + Type *Ty = C->getType()->getVectorElementType(); + Constant *Elt = + (i < M) ? C->getAggregateElement(i) : UndefValue::get(Ty); + Vals.push_back(Elt); + } + V = ConstantVector::get(Vals); + } else { + // Need to insert individual elements. + V = UndefValue::get(SI->getType()); + SkipBase = false; + } + } + + IRBuilder<> Builder(SI); + for (unsigned i = 0; i < NElts; ++i) { + // Undef index returns -1. + int idx = SI->getMaskValue(i); + if (idx < 0) + continue; + if (SkipBase) { + if (UseOp0AsBase && idx == i) + continue; + if (!UseOp0AsBase && idx == i + M) + continue; + } + + Value *Vi = nullptr; + if (idx < (int)M) + Vi = Builder.CreateExtractElement(Op0, idx, ""); + else + Vi = Builder.CreateExtractElement(Op1, idx - M, ""); + if (!isa(Vi)) + V = Builder.CreateInsertElement(V, Vi, i, ""); + } + + return V; +} + +unsigned ShuffleVectorAnalyzer::getSerializeCost(unsigned i) { + unsigned Cost = 0; + Value *Op = SI->getOperand(i); + if (!isa(Op) && Op->getType() != SI->getType()) + Cost += Op->getType()->getVectorNumElements(); + + unsigned NElts = SI->getType()->getVectorNumElements(); + for (unsigned j = 0; j < NElts; ++j) { + // Undef index returns -1. + int idx = SI->getMaskValue(j); + if (idx < 0) + continue; + // Count the number of elements out of place. + unsigned M = Op->getType()->getVectorNumElements(); + if ((i == 0 && idx != j) || (i == 1 && idx != j + M)) + Cost++; + } + + return Cost; +} + +/*********************************************************************** + * adjustPhiNodesForBlockRemoval : adjust phi nodes when removing a block + * + * Enter: Succ = the successor block to adjust phi nodes in + * BB = the block being removed + * + * This modifies each phi node in Succ as follows: the incoming for BB is + * replaced by an incoming for each of BB's predecessors. + */ +void genx::adjustPhiNodesForBlockRemoval(BasicBlock *Succ, BasicBlock *BB) +{ + for (auto i = Succ->begin(), e = Succ->end(); i != e; ++i) { + auto Phi = dyn_cast(&*i); + if (!Phi) + break; + // For this phi node, get the incoming for BB. 
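+    // Sketch of the rewrite done below (hypothetical names): if BB has
+    // predecessors %P1 and %P2, an incoming [ %v, %BB ] becomes
+    // [ %v, %P1 ], and a new incoming [ %v, %P2 ] is added.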
+ int Idx = Phi->getBasicBlockIndex(BB); + assert(Idx >= 0); + Value *Incoming = Phi->getIncomingValue(Idx); + // Iterate through BB's predecessors. For the first one, replace the + // incoming block with the predecessor. For subsequent ones, we need + // to add new phi incomings. + auto pi = pred_begin(BB), pe = pred_end(BB); + assert(pi != pe); + Phi->setIncomingBlock(Idx, *pi); + for (++pi; pi != pe; ++pi) + Phi->addIncoming(Incoming, *pi); + } +} + +/*********************************************************************** + * sinkAdd : sink add(s) in address calculation + * + * Enter: IdxVal = the original index value + * + * Return: the new calculation for the index value + * + * This detects the case when a variable index in a region or element access + * is one or more constant add/subs then some mul/shl/truncs. It sinks + * the add/subs into a single add after the mul/shl/truncs, so the add + * stands a chance of being baled in as a constant offset in the region. + * + * If add sinking is successfully applied, it may leave now unused + * instructions behind, which need tidying by a later dead code removal + * pass. + */ +Value *genx::sinkAdd(Value *V) { + Instruction *IdxVal = dyn_cast(V); + if (!IdxVal) + return V; + // Collect the scale/trunc/add/sub/or instructions. + int Offset = 0; + SmallVector ScaleInsts; + Instruction *Inst = IdxVal; + int Scale = 1; + bool NeedChange = false; + for (;;) { + if (isa(Inst)) + ScaleInsts.push_back(Inst); + else { + if (!isa(Inst)) + break; + if (ConstantInt *CI = dyn_cast(Inst->getOperand(1))) { + if (Inst->getOpcode() == Instruction::Mul) { + Scale *= CI->getSExtValue(); + ScaleInsts.push_back(Inst); + } else if (Inst->getOpcode() == Instruction::Shl) { + Scale <<= CI->getSExtValue(); + ScaleInsts.push_back(Inst); + } else if (Inst->getOpcode() == Instruction::Add) { + Offset += CI->getSExtValue() * Scale; + if (V != Inst) + NeedChange = true; + } else if (Inst->getOpcode() == Instruction::Sub) { + Offset -= CI->getSExtValue() * Scale; + if (IdxVal != Inst) + NeedChange = true; + } else if(Inst->getOpcode() == Instruction::Or) { + if (!haveNoCommonBitsSet(Inst->getOperand(0), + Inst->getOperand(1), + Inst->getModule()->getDataLayout())) + break; + Offset += CI->getSExtValue() * Scale; + if (V != Inst) + NeedChange = true; + } else + break; + } else + break; + } + Inst = dyn_cast(Inst->getOperand(0)); + if (!Inst) + return V; + } + if (!NeedChange) + return V; + // Clone the scale and trunc instructions, starting with the value that + // was input to the add(s). + for (SmallVectorImpl::reverse_iterator i = ScaleInsts.rbegin(), + e = ScaleInsts.rend(); + i != e; ++i) { + Instruction *Clone = (*i)->clone(); + Clone->insertBefore(IdxVal); + Clone->setName((*i)->getName()); + Clone->setOperand(0, Inst); + Inst = Clone; + } + // Create a new add instruction. 
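+  // Illustrative sketch of the overall rewrite (hypothetical values): for an
+  // index computed as
+  //   %t = add i32 %x, 3
+  //   %idx = shl i32 %t, 2
+  // the scale chain is cloned as %idx.new = shl i32 %x, 2, and the
+  // "addr_add" created here adds the constant offset 3 << 2 = 12.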
+ Inst = BinaryOperator::Create( + Instruction::Add, Inst, + ConstantInt::get(Inst->getType(), (int64_t)Offset, true /*isSigned*/), + Twine("addr_add"), IdxVal); + Inst->setDebugLoc(IdxVal->getDebugLoc()); + return Inst; +} + +/*********************************************************************** +* reorderBlocks : reorder blocks to increase fallthrough, and specifically +* to satisfy the requirements of SIMD control flow +*/ +#define SUCCSZANY (true) +#define SUCCHASINST (succ->size() > 1) +#define SUCCNOINST (succ->size() <= 1) +#define SUCCANYLOOP (true) + +#define PUSHSUCC(BLK, C1, C2) \ + for(succ_iterator succIter = succ_begin(BLK), succEnd = succ_end(BLK); \ + succIter!=succEnd; ++succIter) { \ + llvm::BasicBlock *succ = *succIter; \ + if (!visitSet.count(succ) && C1 && C2) { \ + visitVec.push_back(succ); \ + visitSet.insert(succ); \ + break; \ + } \ + } + +static bool HasSimdGotoJoinInBlock(BasicBlock *BB) +{ + for (BasicBlock::iterator BBI = BB->begin(), + BBE = BB->end(); + BBI != BBE; ++BBI) { + auto IID = GenXIntrinsic::getGenXIntrinsicID(&*BBI); + if (IID == GenXIntrinsic::genx_simdcf_goto || + IID == GenXIntrinsic::genx_simdcf_join) + return true; + } + return false; +} + +void genx::LayoutBlocks(Function &func, LoopInfo &LI) +{ + std::vector visitVec; + std::set visitSet; + // Insertion Position per loop header + std::map InsPos; + + llvm::BasicBlock* entry = &(func.getEntryBlock()); + visitVec.push_back(entry); + visitSet.insert(entry); + InsPos[entry] = entry; + + while (!visitVec.empty()) { + llvm::BasicBlock* blk = visitVec.back(); + llvm::Loop *curLoop = LI.getLoopFor(blk); + if (curLoop) { + auto hd = curLoop->getHeader(); + if (blk == hd && InsPos.find(hd) == InsPos.end()) { + InsPos[blk] = blk; + } + } + // push: time for DFS visit + PUSHSUCC(blk, SUCCANYLOOP, SUCCNOINST); + if (blk != visitVec.back()) + continue; + // push: time for DFS visit + PUSHSUCC(blk, SUCCANYLOOP, SUCCHASINST); + // pop: time to move the block to the right location + if (blk == visitVec.back()) { + visitVec.pop_back(); + if (curLoop) { + auto hd = curLoop->getHeader(); + if (blk != hd) { + // move the block to the beginning of the loop + auto insp = InsPos[hd]; + assert(insp); + if (blk != insp) { + blk->moveBefore(insp); + InsPos[hd] = blk; + } + } + else { + // move the entire loop to the beginning of + // the parent loop + auto LoopStart = InsPos[hd]; + assert(LoopStart); + auto PaLoop = curLoop->getParentLoop(); + auto PaHd = PaLoop ? 
PaLoop->getHeader() : entry; + auto insp = InsPos[PaHd]; + if (LoopStart == hd) { + // single block loop + hd->moveBefore(insp); + } + else { + // loop-header is not moved yet, so should be at the end + // use splice + llvm::Function::BasicBlockListType& BBList = func.getBasicBlockList(); + BBList.splice(insp->getIterator(), BBList, LoopStart->getIterator(), + hd->getIterator()); + hd->moveBefore(LoopStart); + } + InsPos[PaHd] = hd; + } + } + else { + auto insp = InsPos[entry]; + if (blk != insp) { + blk->moveBefore(insp); + InsPos[entry] = blk; + } + } + } + } + + // fix the loop-exit pattern, put break-blocks into the loop + for (llvm::Function::iterator blkIter = func.begin(), blkEnd = func.end(); + blkIter != blkEnd; ++blkIter) { + llvm::BasicBlock *blk = &(*blkIter); + llvm::Loop *curLoop = LI.getLoopFor(blk); + bool allPredLoopExit = true; + unsigned numPreds = 0; + llvm::SmallPtrSet predSet; + for (pred_iterator predIter = pred_begin(blk), predEnd = pred_end(blk); + predIter != predEnd; ++predIter) { + llvm::BasicBlock *pred = *predIter; + numPreds++; + llvm::Loop *predLoop = LI.getLoopFor(pred); + if (curLoop == predLoop) { + llvm::BasicBlock *predPred = pred->getSinglePredecessor(); + if (predPred) { + llvm::Loop *predPredLoop = LI.getLoopFor(predPred); + if (predPredLoop != curLoop && + (!curLoop || curLoop->contains(predPredLoop))) { + if (!HasSimdGotoJoinInBlock(pred)) { + predSet.insert(pred); + } else { + allPredLoopExit = false; + break; + } + } + } + } else if (!curLoop || curLoop->contains(predLoop)) + continue; + else { + allPredLoopExit = false; + break; + } + } + if (allPredLoopExit && numPreds > 1) { + for (SmallPtrSet::iterator predIter = predSet.begin(), + predEnd = predSet.end(); + predIter != predEnd; ++predIter) { + llvm::BasicBlock *pred = *predIter; + llvm::BasicBlock *predPred = pred->getSinglePredecessor(); + assert(predPred); + pred->moveAfter(predPred); + } + } + } +} + +void genx::LayoutBlocks(Function &func) +{ + std::vector visitVec; + std::set visitSet; + // Reorder basic block to allow more fall-through + llvm::BasicBlock* entry = &(func.getEntryBlock()); + visitVec.push_back(entry); + visitSet.insert(entry); + + while (!visitVec.empty()) { + llvm::BasicBlock* blk = visitVec.back(); + // push in the empty successor + PUSHSUCC(blk, SUCCANYLOOP, SUCCNOINST); + if (blk != visitVec.back()) + continue; + // push in the other successor + PUSHSUCC(blk, SUCCANYLOOP, SUCCHASINST); + // pop + if (blk == visitVec.back()) { + visitVec.pop_back(); + if (blk != entry) { + blk->moveBefore(entry); + entry = blk; + } + } + } +} + +// normalize g_load with bitcasts. +// +// When a single g_load is being bitcast'ed to different types, clone g_loads. +bool genx::normalizeGloads(Instruction *Inst) { + assert(isa(Inst)); + auto LI = cast(Inst); + if (getUnderlyingGlobalVariable(LI->getPointerOperand()) == nullptr) + return false; + + // collect all uses connected by bitcasts. + std::set Visited; + // Uses of this loads groupped by the use type. + llvm::MapVector> Uses; + // The working list. + std::vector Insts; + + for (auto UI : LI->users()) + if (auto BI = dyn_cast(UI)) + Insts.push_back(BI); + + while (!Insts.empty()) { + BitCastInst *BCI = Insts.back(); + Insts.pop_back(); + if (Visited.count(BCI)) + continue; + + Uses[BCI->getType()].push_back(BCI); + for (auto UI : BCI->users()) + if (auto BI = dyn_cast(UI)) + Insts.push_back(BI); + } + + // There are more than two uses; clone loads that can fold bitcasts. 
+ if (Uses.size() <= 1) + return false; + + // %0 = load gv + // %1 = bitcast %0 to t1 + // %2 - bitcast %1 to t2 + // + // ==> + // %0 = load gv + // %0.1 = load gv + // %1 = bitcast %0 to t1 + // %2 - bitcast %0.1 to t2 + Instruction *LInst = LI; + for (auto I = Uses.begin(); I != Uses.end(); ++I) { + Type *Ty = I->first; + if (LInst == nullptr) { + LInst = LI->clone(); + LInst->insertAfter(LI); + } + Instruction *NewCI = new BitCastInst(LInst, Ty, ".clone", LInst); + NewCI->moveAfter(LInst); + auto &BInsts = I->second; + for (auto BI : BInsts) + BI->replaceAllUsesWith(NewCI); + LInst = nullptr; + } + return true; +} + +// fold bitcast instruction into Store by change pointer type. +Instruction *genx::foldBitCastInst(Instruction *Inst) { + assert(isa(Inst) || isa(Inst)); + auto LI = dyn_cast(Inst); + auto SI = dyn_cast(Inst); + + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + GlobalVariable *GV = getUnderlyingGlobalVariable(Ptr); + if (!GV) + return nullptr; + + if (SI) { + Value *Val = SI->getValueOperand(); + if (auto CI = dyn_cast(Val)) { + auto SrcTy = CI->getSrcTy(); + auto NewPtrTy = PointerType::get(SrcTy, SI->getPointerAddressSpace()); + auto NewPtr = ConstantExpr::getBitCast(GV, NewPtrTy); + StoreInst *NewSI = new StoreInst(CI->getOperand(0), NewPtr, + /*volatile*/ SI->isVolatile(), Inst); + NewSI->takeName(SI); + NewSI->setDebugLoc(Inst->getDebugLoc()); + Inst->eraseFromParent(); + return NewSI; + } + } else if (LI && LI->hasOneUse()) { + if (auto CI = dyn_cast(LI->user_back())) { + auto NewPtrTy = PointerType::get(CI->getType(), LI->getPointerAddressSpace()); + auto NewPtr = ConstantExpr::getBitCast(GV, NewPtrTy); + auto NewLI = new LoadInst(NewPtr, "", + /*volatile*/ LI->isVolatile(), Inst); + NewLI->takeName(LI); + NewLI->setDebugLoc(LI->getDebugLoc()); + CI->replaceAllUsesWith(NewLI); + LI->replaceAllUsesWith(UndefValue::get(LI->getType())); + LI->eraseFromParent(); + return NewLI; + } + } + + return nullptr; +} + +const GlobalVariable *genx::getUnderlyingGlobalVariable(const Value *V) { + while (auto CE = dyn_cast_or_null(V)) { + if (CE->getOpcode() == CastInst::BitCast) + V = CE->getOperand(0); + else + break; + } + return dyn_cast_or_null(V); +} + +GlobalVariable *genx::getUnderlyingGlobalVariable(Value *V) { + return const_cast( + getUnderlyingGlobalVariable(const_cast(V))); +} + +bool genx::isGlobalStore(StoreInst *ST) { + assert(ST); + return getUnderlyingGlobalVariable(ST->getPointerOperand()) != nullptr; +} + +bool genx::isGlobalLoad(LoadInst *LI) { + assert(LI); + return getUnderlyingGlobalVariable(LI->getPointerOperand()) != nullptr; +} + +bool genx::isLegalValueForGlobalStore(Value *V, Value *StorePtr) { + // Value should be wrregion. + auto *Wrr = dyn_cast(V); + if (!Wrr || !GenXIntrinsic::isWrRegion(Wrr)) + return false; + + // With old value obtained from load instruction with StorePtr. + Value *OldVal = + Wrr->getArgOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum); + auto *LI = dyn_cast(OldVal); + return LI && (getUnderlyingGlobalVariable(LI->getPointerOperand()) == + getUnderlyingGlobalVariable(StorePtr)); +} + +bool genx::isGlobalStoreLegal(StoreInst *ST) { + assert(isGlobalStore(ST)); + return isLegalValueForGlobalStore(ST->getValueOperand(), + ST->getPointerOperand()); +} + +// The following bale will produce identity moves. 
+// %a0 = load m +// %b0 = load m +// bale { +// %a1 = rrd %a0, R +// %b1 = wrr %b0, %a1, R +// store %b1, m +// } +// +bool genx::isIdentityBale(const Bale &B) { + if (!B.endsWithGStore()) + return false; + + StoreInst *ST = cast(B.getHead()->Inst); + if (B.size() == 1) { + // The value to be stored should be a load from the same global. + auto LI = dyn_cast(ST->getOperand(0)); + return LI && getUnderlyingGlobalVariable(LI->getOperand(0)) == + getUnderlyingGlobalVariable(ST->getOperand(1)); + } + if (B.size() != 3) + return false; + + CallInst *B1 = dyn_cast(ST->getValueOperand()); + GlobalVariable *GV = getUnderlyingGlobalVariable(ST->getPointerOperand()); + if (!GenXIntrinsic::isWrRegion(B1) || !GV) + return false; + assert(B1); + auto B0 = dyn_cast(B1->getArgOperand(0)); + if (!B0 || GV != getUnderlyingGlobalVariable(B0->getPointerOperand())) + return false; + + CallInst *A1 = dyn_cast(B1->getArgOperand(1)); + if (!GenXIntrinsic::isRdRegion(A1)) + return false; + assert(A1); + LoadInst *A0 = dyn_cast(A1->getArgOperand(0)); + if (!A0 || GV != getUnderlyingGlobalVariable(A0->getPointerOperand())) + return false; + + Region R1(A1, BaleInfo()); + Region R2(B1, BaleInfo()); + return R1 == R2; +} + +// Check that region can be represented as raw operand. +bool genx::isValueRegionOKForRaw(Value *V, bool IsWrite, + const GenXSubtarget *ST) { + assert(V); + switch (GenXIntrinsic::getGenXIntrinsicID(V)) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + if (IsWrite) + return false; + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + if (!IsWrite) + return false; + break; + default: + return false; + } + Region R(cast(V), BaleInfo()); + return isRegionOKForRaw(R, ST); +} + +bool genx::isRegionOKForRaw(const genx::Region &R, const GenXSubtarget *ST) { + unsigned GRFWidth = ST ? 
ST->getGRFWidth() : 32; + if (R.Indirect) + return false; + else if (R.Offset & (GRFWidth - 1)) // GRF boundary check + return false; + if (R.Width != R.NumElements) + return false; + if (R.Stride != 1) + return false; + return true; +} + +bool genx::skipOptWithLargeBlock(FunctionGroup &FG) { + for (auto fgi = FG.begin(), fge = FG.end(); fgi != fge; ++fgi) { + auto F = *fgi; + if (skipOptWithLargeBlock(*F)) + return true; + } + return false; +} + +std::string genx::getInlineAsmCodes(const InlineAsm::ConstraintInfo &Info) { + return Info.Codes.front(); +} + +bool genx::isInlineAsmMatchingInputConstraint( + const InlineAsm::ConstraintInfo &Info) { + return isdigit(Info.Codes.front()[0]); +} + +genx::ConstraintType genx::getInlineAsmConstraintType(StringRef Codes) { + return llvm::StringSwitch(Codes) + .Case("r", ConstraintType::Constraint_r) + .Case("rw", ConstraintType::Constraint_rw) + .Case("i", ConstraintType::Constraint_i) + .Case("n", ConstraintType::Constraint_n) + .Case("F", ConstraintType::Constraint_F) + .Case("cr", ConstraintType::Constraint_cr) + .Case("a", ConstraintType::Constraint_a) + .Default(ConstraintType::Constraint_unknown); +} + +unsigned +genx::getInlineAsmMatchedOperand(const InlineAsm::ConstraintInfo &Info) { + assert(genx::isInlineAsmMatchingInputConstraint(Info) && + "Matching input expected"); + int OperandValue = std::stoi(Info.Codes.front()); + assert(OperandValue >= 0); + return OperandValue; +} + +std::vector genx::getGenXInlineAsmInfo(MDNode *MD) { + std::vector Result; + for (auto &MDOp : MD->operands()) { + auto EntryMD = dyn_cast(MDOp); + assert(EntryMD && EntryMD->getNumOperands() == 3 && + "error setting metadata for inline asm"); + ConstantAsMetadata *Op0 = + dyn_cast(EntryMD->getOperand(0)); + ConstantAsMetadata *Op1 = + dyn_cast(EntryMD->getOperand(1)); + ConstantAsMetadata *Op2 = + dyn_cast(EntryMD->getOperand(2)); + assert(Op0 && Op1 && Op2 && "error setting metadata for inline asm"); + auto CTy = static_cast( + cast(Op0->getValue())->getZExtValue()); + Result.emplace_back(CTy, cast(Op1->getValue())->getSExtValue(), + cast(Op2->getValue())->getZExtValue()); + } + return Result; +} + +std::vector genx::getGenXInlineAsmInfo(CallInst *CI) { + assert(CI->isInlineAsm() && "Inline asm expected"); + MDNode *MD = CI->getMetadata(genx::MD_genx_inline_asm_info); + // empty constraint info + if (!MD) { + auto *IA = cast(CI->getCalledValue()); + assert(IA->getConstraintString().empty() && + "No info only for empty constraint string"); + (void)IA; + return std::vector(); + } + return genx::getGenXInlineAsmInfo(MD); +} + +bool genx::hasConstraintOfType( + const std::vector &ConstraintsInfo, + genx::ConstraintType CTy) { + return llvm::any_of(ConstraintsInfo, [&](const GenXInlineAsmInfo &Info) { + return Info.getConstraintType() == CTy; + }); +} + +unsigned genx::getInlineAsmNumOutputs(CallInst *CI) { + assert(CI->isInlineAsm() && "Inline asm expected"); + unsigned NumOutputs; + if (CI->getType()->isVoidTy()) + NumOutputs = 0; + else if (auto ST = dyn_cast(CI->getType())) + NumOutputs = ST->getNumElements(); + else + NumOutputs = 1; + return NumOutputs; +} + +/* for <1 x Ty> returns Ty + * for Ty returns <1 x Ty> + * other cases are unsupported + */ +Type *genx::getCorrespondingVectorOrScalar(Type *Ty) { + if (Ty->isVectorTy()) { + assert(Ty->getVectorNumElements() == 1 && + "wrong argument: scalar or degenerate vector is expected"); + return Ty->getScalarType(); + } + return VectorType::get(Ty, 1); +} + +// info is at main template function +CastInst 
*genx::scalarizeOrVectorizeIfNeeded(Instruction *Inst, Type *RefType) { + return scalarizeOrVectorizeIfNeeded(Inst, &RefType, std::next(&RefType)); +} + +// info is at main template function +CastInst *genx::scalarizeOrVectorizeIfNeeded(Instruction *Inst, + Instruction *InstToReplace) { + return scalarizeOrVectorizeIfNeeded(Inst, &InstToReplace, std::next(&InstToReplace)); +} + +Value *genx::getFunctionPointer(Value *V) { + Instruction *I = nullptr; + while (I = dyn_cast(V)) { + if (isa(I)) + V = I->getOperand(1); + else if (isa(I) || isa(I)) + V = I->getOperand(0); + else + break; + } + ConstantExpr *CE = nullptr; + while ((CE = dyn_cast(V)) && + (CE->getOpcode() == Instruction::ExtractElement || + CE->isCast())) + V = CE->getOperand(0); + if (isa(V) && V->getType()->isPointerTy() && + V->getType()->getPointerElementType()->isFunctionTy()) { + return V; + } + return nullptr; +} + +bool genx::isFuncPointerVec(Value *V, SetVector *Funcs) { + bool Res = true; + if (V->getType()->isVectorTy() && isa(V) && + cast(V)->getOpcode() == Instruction::BitCast) { + Res = getFunctionPointer(cast(V)->getOperand(0)); + } else if (ConstantVector *Vec = dyn_cast(V)) { + for (auto it = Vec->op_begin(), ie = Vec->op_end(); it != ie; it++) { + auto *F = getFunctionPointer(*it); + if (F && Funcs) { + Funcs->insert(cast(F)); + } else if (!F) { + Res = false; + break; + } + } + } else + Res = false; + return Res; +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.h new file mode 100644 index 000000000000..467a62fec367 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXUtil.h @@ -0,0 +1,429 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +#ifndef GENX_UTIL_H +#define GENX_UTIL_H + +#include "FunctionGroup.h" +#include "GenXRegion.h" +#include "GenXSubtarget.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" + +namespace llvm { +namespace genx { + +// Utility function to get the integral log base 2 of an integer, or -1 if +// the input is not a power of 2. 
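+// For example, exactLog2(64) == 6, while exactLog2(48) == -1 because 48 is
+// not a power of 2.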
+inline int exactLog2(unsigned Val) +{ + unsigned CLZ = countLeadingZeros(Val, ZB_Width); + if (CLZ != 32 && 1U << (31 - CLZ) == Val) + return 31 - CLZ; + return -1; +} + +// Utility function to get the log base 2 of an integer, truncated to an +// integer, or -1 if the number is 0 or negative. +template +inline int log2(T Val) +{ + if (Val <= 0) + return -1; + unsigned CLZ = countLeadingZeros((uint32_t)Val, ZB_Width); + return 31 - CLZ; +} + +// createConvert : create a genx_convert intrinsic call +CallInst *createConvert(Value *In, const Twine &Name, Instruction *InsertBefore, + Module *M = nullptr); + +// createConvertAddr : create a genx_convert_addr intrinsic call +CallInst *createConvertAddr(Value *In, int Offset, const Twine &Name, + Instruction *InsertBefore, Module *M = nullptr); + +// createAddAddr : create a genx_add_addr intrinsic call +CallInst *createAddAddr(Value *Lhs, Value *Rhs, const Twine &Name, + Instruction *InsertBefore, Module *M = nullptr); + +CallInst *createUnifiedRet(Type *Ty, const Twine &Name, Module *M); + +// getPredicateConstantAsInt : get a vXi1 constant's value as a single integer +unsigned getPredicateConstantAsInt(Constant *C); + +// getConstantSubvector : get a contiguous region from a vector constant +Constant *getConstantSubvector(Constant *V, unsigned StartIdx, unsigned Size); + +// concatConstants : concatenate two possibly vector constants, giving a vector +// constant +Constant *concatConstants(Constant *C1, Constant *C2); + +// findClosestCommonDominator : find latest common dominator of some +// instructions +Instruction *findClosestCommonDominator(DominatorTree *DT, + ArrayRef Insts); + +// convertShlShr : convert Shl followed by AShr/LShr by the same amount into +// trunc+sext/zext +Instruction *convertShlShr(Instruction *Inst); + +// splitStructPhis : split all struct phis in a function +bool splitStructPhis(Function *F); + +// breakConstantExprs : break constant expressions in a function. +bool breakConstantExprs(Function *F); + +// normalize g_load with bitcasts. +// +// When a single g_load is being bitcast'ed to different types, clone g_loads. +bool normalizeGloads(Instruction *Inst); + +// fold bitcast instruction to store/load pointer operand if possible. +// Return this new instruction or nullptr. +Instruction *foldBitCastInst(Instruction *Inst); + +// Return the underlying global variable. Return nullptr if it does not exist. +GlobalVariable *getUnderlyingGlobalVariable(Value *V); +const GlobalVariable *getUnderlyingGlobalVariable(const Value *V); + +class Bale; + +bool isGlobalStore(StoreInst *ST); + +bool isGlobalLoad(LoadInst* LI); + +// Check that V is correct as value for global store to StorePtr. +// This implies: +// 1) V is wrregion W; +// 2) Old value of W is result of gload L; +// 3) Pointer operand of L is derived from global variable of StorePtr. +bool isLegalValueForGlobalStore(Value *V, Value *StorePtr); + +// Check that global store ST operands meet condition of +// isLegalValueForGlobalStore. +bool isGlobalStoreLegal(StoreInst *ST); + +bool isIdentityBale(const Bale &B); + +// Check if region of value is OK for baling in to raw operand +// +// Enter: V = value that is possibly rdregion/wrregion +// IsWrite = true if caller wants to see wrregion, false for rdregion +// +// The region must be constant indexed, contiguous, and start on a GRF +// boundary. 
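+// For example (a sketch assuming the default 32-byte GRF): a rdregion that
+// reads 8 contiguous dwords (stride 1) from byte offset 32 is OK as a raw
+// operand, while one starting at byte offset 16, or using stride 2, is not.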
+bool isValueRegionOKForRaw(Value *V, bool IsWrite, const GenXSubtarget *ST); + +// Check if region is OK for baling in to raw operand +// +// The region must be constant indexed, contiguous, and start on a GRF +// boundary. +bool isRegionOKForRaw(const genx::Region &R, const GenXSubtarget *ST); + +// Skip optimizations on functions with large blocks. +inline bool skipOptWithLargeBlock(const Function &F) { + return std::any_of(F.begin(), F.end(), + [](const BasicBlock &BB) { return BB.size() >= 5000; }); +} + +bool skipOptWithLargeBlock(FunctionGroup &FG); + +// getTwoAddressOperandNum : get operand number of two address operand +int getTwoAddressOperandNum(CallInst *CI); + +// isNot : test whether an instruction is a "not" instruction (an xor with +// constant all ones) +bool isNot(Instruction *Inst); + +// isPredNot : test whether an instruction is a "not" instruction (an xor +// with constant all ones) with predicate (i1 or vector of i1) type +bool isPredNot(Instruction *Inst); + +// isIntNot : test whether an instruction is a "not" instruction (an xor +// with constant all ones) with non-predicate type +bool isIntNot(Instruction *Inst); + +// if V is a function pointer return function it points to, +// nullptr otherwise +Value *getFunctionPointer(Value *V); + +// return true if V is a const vector of function pointers, +// fill Funcs with the functions pointed to if provided +bool isFuncPointerVec(Value *V, SetVector *Funcs = nullptr); + +// ShuffleVectorAnalyzer : class to analyze a shufflevector +class ShuffleVectorAnalyzer { + ShuffleVectorInst *SI; + +public: + ShuffleVectorAnalyzer(ShuffleVectorInst *SI) : SI(SI) {} + // getAsSlice : return start index of slice, or -1 if shufflevector is not + // slice + int getAsSlice(); + + // Replicated slice descriptor. + // Replicated slice (e.g. 1 2 3 1 2 3) can be parametrized by + // initial offset (1), slice size (3) and replication count (2). + struct ReplicatedSlice { + unsigned InitialOffset; + unsigned SliceSize; + unsigned ReplicationCount; + ReplicatedSlice(unsigned Offset, unsigned Size, unsigned Count) + : InitialOffset(Offset), SliceSize(Size), ReplicationCount(Count) {} + }; + + // isReplicatedSlice : check whether shufflevector is replicated slice. + // Example of replicated slice: + // shufflevector <3 x T> x, undef, <6 x i32> <1, 2, 1, 2, 1, 2>. + bool isReplicatedSlice() const; + + static bool isReplicatedSlice(ShuffleVectorInst *SI) { + return ShuffleVectorAnalyzer(SI).isReplicatedSlice(); + } + + // When we have replicated slice, its parameters are ealisy deduced + // from first and last elements of mask. This function decomposes + // replicated slice to its parameters. 
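+  // For example (illustrative values): for the 6-element mask
+  // <1, 2, 3, 1, 2, 3>, SliceStart = 1 and SliceEnd = 3, giving
+  // SliceSize = 3 and ReplicationCount = 6 / 3 = 2, i.e.
+  // ReplicatedSlice(1, 3, 2).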
+ ReplicatedSlice getReplicatedSliceDescriptor() const { + assert(isReplicatedSlice() && "Expected replicated slice"); + const unsigned TotalSize = SI->getType()->getVectorNumElements(); + const unsigned SliceStart = SI->getMaskValue(0); + const unsigned SliceEnd = SI->getMaskValue(TotalSize - 1); + const unsigned SliceSize = SliceEnd - SliceStart + 1; + const unsigned ReplicationCount = TotalSize / SliceSize; + return ReplicatedSlice(SliceStart, SliceSize, ReplicationCount); + } + + static ReplicatedSlice getReplicatedSliceDescriptor(ShuffleVectorInst *SI) { + return ShuffleVectorAnalyzer(SI).getReplicatedSliceDescriptor(); + } + + // getAsUnslice : see if the shufflevector is an + // unslice where the "old value" is operand 0 and operand 1 is another + // shufflevector and operand 0 of that is the "new value" Returns start + // index, or -1 if it is not an unslice + int getAsUnslice(); + // getAsSplat : if shufflevector is a splat, get the splatted input, with the + // element's vector index if the input is a vector + struct SplatInfo { + Value *Input; + unsigned Index; + SplatInfo(Value *Input, unsigned Index) : Input(Input), Index(Index) {} + }; + SplatInfo getAsSplat(); + + // Serialize this shuffulevector instruction. + Value *serialize(); + + // Compute the cost in terms of number of insertelement instructions needed. + unsigned getSerializeCost(unsigned i); + + // To describe the region of one of two shufflevector instruction operands. + struct OperandRegionInfo { + Value *Op; + Region R; + }; + OperandRegionInfo getMaskRegionPrefix(int StartIdx); +}; + +// adjustPhiNodesForBlockRemoval : adjust phi nodes when removing a block +void adjustPhiNodesForBlockRemoval(BasicBlock *Succ, BasicBlock *BB); + +/*********************************************************************** + * sinkAdd : sink add(s) in address calculation + * + * Enter: IdxVal = the original index value + * + * Return: the new calculation for the index value + * + * This detects the case when a variable index in a region or element access + * is one or more constant add/subs then some mul/shl/truncs. It sinks + * the add/subs into a single add after the mul/shl/truncs, so the add + * stands a chance of being baled in as a constant offset in the region. + * + * If add sinking is successfully applied, it may leave now unused + * instructions behind, which need tidying by a later dead code removal + * pass. + */ +Value *sinkAdd(Value *V); + +// Check if this is a mask packing operation, i.e. a bitcast from Vxi1 to +// integer i8, i16 or i32. 
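+// For example, a bitcast from <16 x i1> to i16 is a mask packing operation,
+// while a bitcast from <4 x i1> to i4 is not, since only 8, 16 and 32
+// elements are accepted.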
+static inline bool isMaskPacking(const Value *V) {
+  if (auto BC = dyn_cast<BitCastInst>(V)) {
+    auto SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
+    if (!SrcTy || !SrcTy->getScalarType()->isIntegerTy(1))
+      return false;
+    unsigned NElts = SrcTy->getVectorNumElements();
+    if (NElts != 8 && NElts != 16 && NElts != 32)
+      return false;
+    return V->getType()->getScalarType()->isIntegerTy(NElts);
+  }
+  return false;
+}
+
+void LayoutBlocks(Function &func, LoopInfo &LI);
+void LayoutBlocks(Function &func);
+
+// Metadata name for inline assembly instruction
+constexpr const char *MD_genx_inline_asm_info = "genx.inlasm.constraints.info";
+
+// Inline assembly available constraints
+enum class ConstraintType : uint32_t {
+  Constraint_r,
+  Constraint_rw,
+  Constraint_i,
+  Constraint_n,
+  Constraint_F,
+  Constraint_a,
+  Constraint_cr,
+  Constraint_unknown
+};
+
+// Represents info about an inline assembly operand
+class GenXInlineAsmInfo {
+  genx::ConstraintType CTy = ConstraintType::Constraint_unknown;
+  int MatchingInput = -1;
+  bool IsOutput = false;
+
+public:
+  GenXInlineAsmInfo(genx::ConstraintType Ty, int MatchingInput, bool IsOutput)
+      : CTy(Ty), MatchingInput(MatchingInput), IsOutput(IsOutput) {}
+  bool hasMatchingInput() const { return MatchingInput != -1; }
+  int getMatchingInput() const { return MatchingInput; }
+  bool isOutput() const { return IsOutput; }
+  genx::ConstraintType getConstraintType() const { return CTy; }
+};
+
+// True if an input constraint has a matching output operand with the same
+// constraint
+bool isInlineAsmMatchingInputConstraint(const InlineAsm::ConstraintInfo &Info);
+
+// Get matched output operand number for input operand
+unsigned getInlineAsmMatchedOperand(const InlineAsm::ConstraintInfo &Info);
+
+// Get joined string representation of constraints
+std::string getInlineAsmCodes(const InlineAsm::ConstraintInfo &Info);
+
+// Get constraint type
+genx::ConstraintType getInlineAsmConstraintType(StringRef Codes);
+
+// Get vector of inline asm info for inline assembly instruction.
+// Return empty vector if no constraint string in inline asm or
+// if called before GenXInlineAsmLowering pass.
+std::vector<GenXInlineAsmInfo> getGenXInlineAsmInfo(CallInst *CI);
+
+// Get vector of inline asm info from MDNode
+std::vector<GenXInlineAsmInfo> getGenXInlineAsmInfo(MDNode *MD);
+
+bool hasConstraintOfType(const std::vector<GenXInlineAsmInfo> &ConstraintsInfo,
+                         genx::ConstraintType CTy);
+
+// Get number of outputs for inline assembly instruction
+unsigned getInlineAsmNumOutputs(CallInst *CI);
+
+Type *getCorrespondingVectorOrScalar(Type *Ty);
+
+/* scalarizeOrVectorizeIfNeeded : scalarize or vectorize \p Inst if required
+ *
+ * The result of some instructions (e.g. rdregion) can be either Ty or
+ * <1 x Ty>. It is sometimes required to replace uses of instructions with
+ * types [\p FirstType, \p LastType) with \p Inst. If the types do not
+ * correspond, this function places a BitCastInst (<1 x Ty> to Ty, or Ty to
+ * <1 x Ty>) after \p Inst and returns a pointer to it. If no cast is
+ * required, nullptr is returned.
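+ *
+ * For example (an illustrative sketch): if \p Inst has type <1 x i32> and a
+ * type in the range is i32, a bitcast from <1 x i32> to i32 is created
+ * right after \p Inst and returned; if all types already match, nullptr is
+ * returned.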
+ */ +template < + typename ConstIter, + typename std::enable_if< + std::is_base_of< + Type, typename std::remove_pointer::value_type>::type>::value, + int>::type = 0> +CastInst *scalarizeOrVectorizeIfNeeded(Instruction *Inst, ConstIter FirstType, + ConstIter LastType) { + assert(Inst && "wrong argument"); + assert(std::all_of(FirstType, LastType, + [Inst](Type *Ty) { + return Ty == Inst->getType() || + Ty == getCorrespondingVectorOrScalar( + Inst->getType()); + }) && + "wrong arguments: type of instructions must correspond"); + + if (Inst->getType()->isVectorTy() && + Inst->getType()->getVectorNumElements() > 1) + return nullptr; + bool needBitCast = std::any_of( + FirstType, LastType, [Inst](Type *Ty) { return Ty != Inst->getType(); }); + if (!needBitCast) + return nullptr; + auto *CorrespondingTy = getCorrespondingVectorOrScalar(Inst->getType()); + auto *BC = CastInst::Create(Instruction::BitCast, Inst, CorrespondingTy); + BC->insertAfter(Inst); + return BC; +} +/* scalarVectorizeIfNeeded: scalarize of vectorize \p Inst if it is required + * + * Result of some instructions can be both Ty and <1 x Ty> value e.g. rdregion. + * It is sometimes required to replace uses of instructions of [\p + * FirstInstToReplace, \p LastInstToReplace) with \p Inst. If types don't + * correspond this function places BitCastInst <1 x Ty> to Ty, or Ty to <1 x Ty> + * after \p Inst and returns the pointer to the instruction. If no cast is + * required, nullptr is returned. + */ +template ::value_type>::type>::value, + int>::type = 0> +CastInst *scalarizeOrVectorizeIfNeeded(Instruction *Inst, + ConstIter FirstInstToReplace, + ConstIter LastInstToReplace) { + std::vector Types; + std::transform(FirstInstToReplace, LastInstToReplace, + std::back_inserter(Types), + [](Instruction *Inst) { return Inst->getType(); }); + return scalarizeOrVectorizeIfNeeded(Inst, Types.begin(), Types.end()); +} + +CastInst *scalarizeOrVectorizeIfNeeded(Instruction *Inst, Type *RefType); + +CastInst *scalarizeOrVectorizeIfNeeded(Instruction *Inst, + Instruction *InstToReplace); + +} // namespace genx +} // namespace llvm + +#endif // GENX_UTIL_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.cpp new file mode 100644 index 000000000000..1f95f5c98814 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.cpp @@ -0,0 +1,1177 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// The GenXVectorDecomposer class is called by by the GenXPostLegalization pass +// to perform vector decomposition. See comment in GenXVectorDecomposer.h. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_POST_LEGALIZATION" + +#include "GenXVectorDecomposer.h" +#include "GenX.h" +#include "GenXBaling.h" +#include "GenXRegion.h" +#include "GenXUtil.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; +using namespace genx; +using namespace GenXIntrinsic::GenXRegion; + +static cl::opt + LimitGenXVectorDecomposer("limit-genx-vector-decomposer", + cl::init(UINT_MAX), cl::Hidden, + cl::desc("Limit GenX vector decomposer.")); + +static cl::opt GenXReportVectorDecomposerFailureThreshold( + "genx-report-vector-decomposer-failure-threshold", cl::init(UINT_MAX), + cl::Hidden, + cl::desc("Byte size threshold for reporting failure of GenX vector " + "decomposer.")); + +static cl::opt GenXDefaultSelectPredicateWidth( + "genx-sel-width", cl::init(32), cl::Hidden, + cl::desc("The default width for select predicate splitting.")); + +namespace { + +class DiagnosticVectorDecomposition : public DiagnosticInfo { +private: + const Twine &Description; + Instruction *Inst; + + static int KindID; + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + DiagnosticVectorDecomposition(Instruction *I, const Twine &Desc, + DiagnosticSeverity Severity = DS_Error) + : DiagnosticInfo(getKindID(), Severity), Description(Desc), Inst(I) {} + + void print(DiagnosticPrinter &P) const override { + std::string Str; + raw_string_ostream OS(Str); + + auto DL = Inst->getDebugLoc(); + DL.print(OS); + + OS << ' ' << Description; + OS << '\n'; + OS.flush(); + P << Str; + } + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; + +int DiagnosticVectorDecomposition::KindID = 0; + +} // end anonymous namespace + +/*********************************************************************** + * VectorDecomposer::run : run the vector decomposer on the start wrregion + * instructions added with addStartWrRegion() + * + * Return: true if code modified + */ +bool VectorDecomposer::run(DominatorTree *ArgDT) +{ + DT = ArgDT; + DL = &DT->getRoots().front()->getModule()->getDataLayout(); + bool Modified = false; + // Process each start wrregion added with addStartWrRegion(). 
+ for (auto swi = StartWrRegions.begin(), swe = StartWrRegions.end(); + swi != swe; ++swi) { + Instruction *Inst = *swi; + Modified |= processStartWrRegion(Inst); + clearOne(); + } + for (auto i = ToDelete.begin(), e = ToDelete.end(); i != e; ++i) + (*i)->deleteValue(); + clear(); + return Modified; +} + +/*********************************************************************** + * VectorDecomposer::processStartWrRegion : process one start wrregion + * + * Enter: Inst = the start wrregion. Note that this might have already + * been erased if it was part of an already processed web, + * so the first thing we have to do is check that. + * + * This processes one start wrregion (a wrregion with constant input). If it has + * not already been seen as part of another web, this processes the web + * containing the start wrregion. + * + * Return: true if code modified + */ +bool VectorDecomposer::processStartWrRegion(Instruction *Inst) +{ + // Determine the web of vectors related by wrregion, phi nodes, bitcast, + // and determine the decomposition that we can do to the web. + if (!determineDecomposition(Inst)) + return false; + static unsigned Count = 0; + if (++Count > LimitGenXVectorDecomposer) + return false; + if (LimitGenXVectorDecomposer != UINT_MAX) + dbgs() << "genx vector decomposer " << Count << "\n"; + decompose(); + clearOne(); + return true; +} + +/*********************************************************************** + * VectorDecomposer::determineDecomposition : determine the web of vectors + * related by wrregion, phi nodes, bitcast, and determine the decomposition + * that we can do to the web + * + * Enter: Inst = the start wrregion. Note that this might have already + * been erased if it was part of an already processed web, + * so the first thing we have to do is check that. + * + * Return: true if decomposition possible; Decomposition and Offsets set up + * as described in the comment near the end of this function + */ +bool VectorDecomposer::determineDecomposition(Instruction *Inst) +{ + if (Seen.find(Inst) != Seen.end()) + return false; // This start wrregion already processed in some other web + // (and may have been erased). + NotDecomposingReportInst = Inst; + Web.clear(); + Decomposition.clear(); + unsigned NumGrfs = alignTo<256>(DL->getTypeSizeInBits(Inst->getType())) / 256; + if (NumGrfs == 1) + return false; // Ignore single GRF vector. + LLVM_DEBUG(dbgs() << "VectorDecomposer::determineDecomposition(" << Inst->getName() << ")\n"); + NotDecomposing = false; + for (unsigned i = 0; i != NumGrfs; ++i) + Decomposition.push_back(i); + addToWeb(Inst); + for (unsigned Idx = 0; Idx != Web.size(); ++Idx) { + Inst = Web[Idx]; + // Look at the def of this value. + if (GenXIntrinsic::isWrRegion(Inst)) { + // wrregion. If the "old value of vector" input is not constant, include + // it in the web. + addToWeb(Inst->getOperand(0), Inst); + } else if (auto Phi = dyn_cast(Inst)) { + // Phi node. Add all incomings to the web. + for (unsigned j = 0, je = Phi->getNumIncomingValues(); j != je; ++j) + addToWeb(Phi->getIncomingValue(j), Phi); + } else if (isa(Inst)) { + // Bitcast. Add the input to the web. But a bitcast with non-instruction + // input confuses this algorithm, so in that case disable it. We're not + // really expecting a bitcast with constant input anyway, although we + // might get one with arg input. 
+ if (isa(Inst->getOperand(0))) + addToWeb(Inst->getOperand(0), Inst); + else + setNotDecomposing(Inst, "use of function argument or constant"); + } else { + // Any other def. This stops decomposition. + if ((isa(Inst) && !GenXIntrinsic::isAnyNonTrivialIntrinsic(Inst)) + || isa(Inst)) + setNotDecomposing(Inst, "return value from call"); + else + setNotDecomposing(Inst, "other non-decomposable definition"); + } + // Look at the uses of this value. + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (auto Phi = dyn_cast(user)) { + // Use in a phi node. Add the result of the phi node and all the other + // incomings to the web. + addToWeb(Phi); + for (unsigned j = 0, je = Phi->getNumIncomingValues(); j != je; ++j) { + auto Incoming = dyn_cast(Phi->getIncomingValue(j)); + if (Incoming && Incoming != Inst) + addToWeb(Incoming, Phi); + } + continue; + } + if ((GenXIntrinsic::isWrRegion(user) && !ui->getOperandNo()) + || isa(user)) { + // Use as the "old value of vector" operand of a wrregion, or in a + // bitcast. Add the result of the wrregion to the web. + addToWeb(user); + continue; + } + if (GenXIntrinsic::isRdRegion(user) && !ui->getOperandNo()) { + // Use as the vector value in rdregion. Adjust decomposition. + adjustDecomposition(user); + continue; + } + // We have some other use that stops us decomposing this web. (We + // continue gathering the web anyway so that all values in it get put + // in the Seen set.) + if (isa(user) || isa(user)) + setNotDecomposing(user, "use as return value"); + else if (isa(user) && !GenXIntrinsic::isAnyNonTrivialIntrinsic(user)) + setNotDecomposing(user, "use as call argument"); + else + setNotDecomposing(user, "other non-decomposable use"); + } + } + if (NotDecomposing) + return false; + // Now we have Decomposition[] set to reflect how we can decompose the GRFs + // of the vector. A range of Decomposition[i] with the same value need to + // be kept together in the same vector. Further, for the start of such a + // range, Decomposition[i] == i. So for example the array might be set to + // { 0, 0, 2, 2, 4, 4, 4, 4, 8, 8 }. + // + // Change Decomposition[] so the indices used are contiguous, changing the + // example above to { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3 }, and create the Offsets[] + // array to translate a value from Decomposition[] into the GRF offset, so + // for this example { 0, 2, 4, 8 }. 
+ Offsets.clear(); + for (unsigned Last = UINT_MAX, i = 0, e = Decomposition.size(); i != e; ++i) { + if (Decomposition[i] != Last) { + Offsets.push_back(Decomposition[i]); + Last = Decomposition[i]; + } + Decomposition[i] = Offsets.size() - 1; + } + LLVM_DEBUG( + dbgs() << "decompose to"; + for (unsigned i = 0; i != Decomposition.size(); ++i) + dbgs() << " " << Decomposition[i]; + dbgs() << ":"; + for (unsigned i = 0; i != Offsets.size(); ++i) + dbgs() << " " << Offsets[i]; + dbgs() << ":"; + for (unsigned i = 0; i != Web.size(); ++i) + dbgs() << " " << Web[i]->getName(); + dbgs() << "\n" + ); + if (Offsets.size() == 1) { + setNotDecomposing(0, "reads and writes in overlapping regions"); + LLVM_DEBUG(dbgs() << "no decomposition\n"); + return false; + } + return true; +} + +/*********************************************************************** + * addToWeb : add value to current vector web, adjusting decompose size + * if it is a wrregion + * + * Enter: V = value to add (if it is an instruction) + * User = instruction V is used in, for reporting failure to + * decompose if V is an Argument + */ +void VectorDecomposer::addToWeb(Value *V, Instruction *User) +{ + if (isa(V)) + return; + auto Inst = dyn_cast(V); + if (!Inst) { + // Cannot decompose with an arg in the web. + setNotDecomposing(User, "use of function argument"); + return; + } + if (Seen.find(Inst) != Seen.end()) + return; // already in the web + // Add to the web. + LLVM_DEBUG(dbgs() << " addToWeb(" << V->getName() << ")\n"); + Seen.insert(Inst); + Web.push_back(Inst); + if (!GenXIntrinsic::isWrRegion(Inst)) + return; + // It is a wrregion. Adjust decomposition. + adjustDecomposition(Inst); +} + +/*********************************************************************** + * adjustDecomposition : adjust web decomposition for region + * + * Enter: Inst = rdregion or wrregion instruction + * + * The vector will be decomposed into contiguous blocks of GRFs. This + * detects if the region accesses multiple GRFs currently slated to be in + * different decomposed vectors, and if so marks them as needing to be + * in the same decomposed vector. + */ +void VectorDecomposer::adjustDecomposition(Instruction *Inst) +{ + if (Decomposition.empty()) + return; // Decomposition[] not set up yet + Region R(Inst, BaleInfo()); + if (R.Indirect) { + setNotDecomposing(Inst, "indirect region"); + return; // cannot decompose if indirect + } + if (NotDecomposing) + return; // decomposition of this vector already disabled + // Compute byte offset of last byte accessed in the region. (This is after + // legalization so we can assume that strides are non-negative.) + unsigned Last = 0; + if (R.Width != R.NumElements) + Last = (R.NumElements / R.Width - 1) * R.VStride; + Last += (R.Width - 1) * R.Stride; + Last = R.Offset + Last * R.ElementBytes; + // Compute the GRF number of the first and last byte of the region. + unsigned First = R.Offset / 32U; + Last /= 32U; + if ((First >= Decomposition.size()) || (Last >= Decomposition.size())) { + setNotDecomposing(Inst, "out-of-bounds"); + return; // don't attempt to decompose out-of-bounds accesses + } + if (First != Last) { + // This region spans more than one GRF. Ensure they are all in the same + // decomposed vector. 
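+    //
+    // For illustration: with Decomposition == { 0, 0, 2, 2 } and a region
+    // covering GRFs 1..2 (First == 1, Last == 2), the first loop below also
+    // pulls in GRF 3 (already grouped with Last), and the second rewrites
+    // entries First+1..Last, leaving Decomposition == { 0, 0, 0, 0 }.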
+    for (unsigned i = Last + 1;
+        i != Decomposition.size() && Decomposition[i] == Decomposition[Last];
+        ++i)
+      Decomposition[i] = Decomposition[First];
+    for (unsigned i = First + 1; i != Last + 1; ++i)
+      Decomposition[i] = Decomposition[First];
+  }
+}
+
+/***********************************************************************
+ * reportLocation : report location of a DebugLoc, with nested inline funcs
+ */
+static void reportLocation(const LLVMContext &Ctx, const DebugLoc &DL, raw_ostream &OS)
+{
+  if (auto InlinedAt = DL.getInlinedAt()) {
+    reportLocation(Ctx, DebugLoc(InlinedAt), OS);
+    OS << ": in function inlined here:\n";
+  }
+  StringRef Filename = "";
+  unsigned Line = 0;
+  unsigned Col = 0;
+  if (DL) {
+    Filename = DL->getFilename();
+    Line = DL.getLine();
+    Col = DL.getCol();
+  }
+  OS << Filename;
+  if (Line) {
+    OS << ":" << Line;
+    if (Col)
+      OS << ":" << Col;
+  }
+}
+
+static DILocalVariable *getVariable(IntrinsicInst *II) {
+  do {
+    Value *V = II->getOperand(0);
+    Metadata *M = ValueAsMetadata::get(V);
+    if (auto DbgNode = MetadataAsValue::getIfExists(V->getContext(), M))
+      for (auto *U : DbgNode->users())
+        if (auto DVI = dyn_cast<DbgValueInst>(U))
+          return DVI->getVariable();
+    if (!GenXIntrinsic::isWrRegion(V))
+      break;
+    II = cast<IntrinsicInst>(V);
+  } while (1);
+
+  return nullptr;
+}
+
+/***********************************************************************
+ * setNotDecomposing : set NotDecomposing flag and report to user
+ *
+ * Enter:   Inst = instruction to report at (0 to use same location as
+ *                 NotDecomposingReportInst, the "first write" to the web)
+ *          Text = message
+ */
+void VectorDecomposer::setNotDecomposing(Instruction *Inst, const char *Text)
+{
+  NotDecomposing = true;
+  if (NotDecomposingReportInst) {
+    unsigned Bytes = NotDecomposingReportInst->getType()
+        ->getPrimitiveSizeInBits() / 8U;
+    if (Bytes < GenXReportVectorDecomposerFailureThreshold)
+      return;
+    reportLocation(Inst->getContext(),
+        NotDecomposingReportInst->getDebugLoc(), dbgs());
+    dbgs() << ": in decomposition candidate ("
+        << Bytes << " byte vector/matrix) written to here:\n";
+    NotDecomposingReportInst = nullptr;
+  }
+  if (!Inst)
+    Inst = NotDecomposingReportInst;
+  assert(Inst);
+  if (!Inst->getDebugLoc())
+    Inst = Inst->getParent()->getFirstNonPHI();
+  reportLocation(Inst->getContext(), Inst->getDebugLoc(), dbgs());
+  dbgs() << ": vector decomposition failed because: " << Text << "\n";
+}
+
+/***********************************************************************
+ * VectorDecomposer::decompose : decompose web of vectors in Web based on
+ *      Decomposition[] and Offsets[]
+ */
+void VectorDecomposer::decompose()
+{
+  // For each phi node in the web, create a phi node for each decomposed
+  // part, with all incomings set to the decomposed part of the original
+  // incoming if it was constant, otherwise undef.
+  for (auto wi = Web.begin(), we = Web.end(); wi != we; ++wi) {
+    auto Phi = dyn_cast<PHINode>(*wi);
+    if (!Phi)
+      continue;
+    auto PhiPartsEntry = &PhiParts[Phi];
+    auto Undef = UndefValue::get(Phi->getType());
+    unsigned NumIncomings = Phi->getNumIncomingValues();
+    for (unsigned PartIndex = 0; PartIndex != Offsets.size(); ++PartIndex) {
+      auto PartTy = getPartType(Phi->getType(), PartIndex);
+      auto NewPhi = PHINode::Create(PartTy, NumIncomings,
+          Phi->getName() + ".decomp."
+ Twine(PartIndex), Phi); + for (unsigned ii = 0; ii != NumIncomings; ++ii) { + auto Incoming = dyn_cast(Phi->getIncomingValue(ii)); + if (!Incoming) + Incoming = Undef; + Incoming = getConstantPart(Incoming, PartIndex); + NewPhi->addIncoming(Incoming, Phi->getIncomingBlock(ii)); + } + NewInsts.push_back(NewPhi); + PhiPartsEntry->push_back(NewPhi); + } + } + // Shorten the list of instructions in Web so it only includes phi nodes + // and start wrregions (ones with constant input). We need to do this first + // because other instructions in the web may become erased so checking them + // in the "decompose each tree of values" loop is invalid. + unsigned NewLen = 0; + for (unsigned wi = 0, we = Web.size(); wi != we; ++wi) { + Instruction *Inst = Web[wi]; + if (isa(Inst) || (GenXIntrinsic::isWrRegion(Inst) + && isa(Inst->getOperand(0)))) + Web[NewLen++] = Inst; + } + Web.resize(NewLen); + // Decompose each tree of values in the web rooted at a start wrregion (one + // with constant input) or at each use of a phi node. Each tree can be + // done independently, as we have already put the phi nodes in place to link + // them together. + for (auto wi = Web.begin(), we = Web.end(); wi != we; ++wi) { + Instruction *Inst = *wi; + if (auto Phi = dyn_cast(Inst)) { + auto Parts = &PhiParts[Phi]; + // decomposeTree removes the use, so we repeatedly process the first use + // until they have all gone. + while (!Phi->use_empty()) + decomposeTree(&*Phi->use_begin(), Parts); + } else { + assert (GenXIntrinsic::isWrRegion(Inst) && isa(Inst->getOperand(0))); + decomposeTree(&Inst->getOperandUse(0), nullptr); + } + } + // Erase original phi nodes. (The other original instructions in the web have + // been erased already.) + for (auto pi = PhiParts.begin(), pe = PhiParts.end(); pi != pe; ++pi) + eraseInst(pi->first); + // Do an aggressive dead code removal pass on instructions that we have added. + removeDeadCode(); +} + +/*********************************************************************** + * VectorDecomposer::decomposeTree : decompose vectors in a tree + * + * Enter: U = use at the root of the tree, one of: + * - the "old value" operand of wrregion (might be constant) + * - the "old value" operand of rdregion + * - the input of bitcast + * - a phi incoming + * PartsIn = decomposed parts of input (not modifiable) + * (0 if *U is constant) + * + * This is a tree of wrregion and bitcast instructions, with phi node uses + * and rdregions at the leaves. + * + * This function traverses the tree using self recursion. + */ +void VectorDecomposer::decomposeTree(Use *U, + const SmallVectorImpl *PartsIn) +{ + auto Inst = cast(U->getUser()); + if (auto Phi = dyn_cast(Inst)) { + decomposePhiIncoming(Phi, U->getOperandNo(), PartsIn); + return; + } + assert(!U->getOperandNo()); + if (GenXIntrinsic::isRdRegion(Inst)) { + decomposeRdRegion(Inst, PartsIn); + return; + } + // Set up the decomposed parts of the incoming value. + SmallVector Parts; + if (PartsIn) + for (unsigned i = 0, e = PartsIn->size(); i != e; ++i) + Parts.push_back((*PartsIn)[i]); + else + for (unsigned i = 0, e = Offsets.size(); i != e; ++i) + Parts.push_back(getConstantPart(cast(*U), i)); + // Handle bitcast. + if (isa(Inst)) { + decomposeBitCast(Inst, &Parts); + return; + } + // Handle wrregion. 
+ assert(GenXIntrinsic::isWrRegion(Inst)); + decomposeWrRegion(Inst, &Parts); +} + +/*********************************************************************** + * VectorDecomposer::decomposePhiIncoming : decompose a use in a phi node + * + * Enter: Phi = the phi node + * OperandNum = operand number in the phi node + * PartsIn = decomposed parts of input (not modifiable) + */ +void VectorDecomposer::decomposePhiIncoming(PHINode *Phi, unsigned OperandNum, + const SmallVectorImpl *PartsIn) +{ + // For each part, find the decomposed phi node and set its + // corresponding incoming. + auto PhiPartsEntry = &PhiParts[Phi]; + for (unsigned PartIndex = 0, NumParts = PartsIn->size(); + PartIndex != NumParts; ++PartIndex) { + auto PhiPart = cast((*PhiPartsEntry)[PartIndex]); + PhiPart->setIncomingValue(OperandNum, (*PartsIn)[PartIndex]); + } + // Set the incoming in the original phi node to undef, to remove the use. + Phi->setIncomingValue(OperandNum, UndefValue::get(Phi->getType())); +} + +/*********************************************************************** + * VectorDecomposer::decomposeRdRegion : decompose a rdregion + * + * Enter: RdRegion = the rdregion instruction + * PartsIn = decomposed parts of input (not modifiable) + */ +void VectorDecomposer::decomposeRdRegion(Instruction *RdRegion, + const SmallVectorImpl *PartsIn) +{ + Region RdR(RdRegion, BaleInfo()); + unsigned PartIndex = getPartIndex(&RdR); + Value *Part = (*PartsIn)[PartIndex]; + if (isa(Part)) { + // Check if this region read is used as a two addr operand. + auto isUsedInTwoAddr = [](Value *V) { + for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (auto CI = dyn_cast(user)) { + if (getTwoAddressOperandNum(CI) == (int)ui->getOperandNo()) + return true; + } + } + return false; + }; + + // Do not emit a warning if this undef is being used as old value + // in a two-addr instruction. + if (!isUsedInTwoAddr(RdRegion)) { + if (auto N = getVariable(cast(RdRegion))) { + emitWarning(RdRegion, "undefined value from '" + N->getName() + + "' is referenced after decomposition"); + } else + emitWarning(RdRegion, + "undefined value is referenced after decomposition"); + } + } + if (RdRegion->getType() == Part->getType() && RdR.isContiguous() + && isa(RdRegion->getType())) { + // The rdregion reads the whole of the decomposed part of the vector (and + // has a vector result even if single element). + // Just replace uses and erase. + RdRegion->replaceAllUsesWith(Part); + eraseInst(RdRegion); + return; + } + // The rdregion reads only some of the decomposed part of the vector. + // Create a new rdregion to replace the old one, taking its name. 
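+  // (RdR.Offset was relative to the start of the whole vector; subtracting
+  // the part's byte offset below rebases it so the new rdregion reads from
+  // the decomposed part instead.)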
+ RdR.Offset -= getPartOffset(PartIndex); + auto NewRdRegion = RdR.createRdRegion(Part, + "", RdRegion, RdRegion->getDebugLoc(), + /*AllowScalar=*/!isa(RdRegion->getType())); + NewRdRegion->takeName(RdRegion); + RdRegion->replaceAllUsesWith(NewRdRegion); + assert(Seen.find(RdRegion) == Seen.end()); + eraseInst(RdRegion); +} + +/*********************************************************************** + * VectorDecomposer::decomposeWrRegion : decompose a wrregion + * + * Enter: WrRegion = the wrregion instruction + * Parts = decomposed parts of input (modifiable) + */ +void VectorDecomposer::decomposeWrRegion(Instruction *WrRegion, + SmallVectorImpl *Parts) +{ + Region WrR(WrRegion, BaleInfo()); + unsigned PartIndex = getPartIndex(&WrR); + Value *Part = (*Parts)[PartIndex]; + if (WrRegion->getOperand(NewValueOperandNum)->getType() == Part->getType() + && !WrR.Mask) { + // The wrregion writes the whole of the decomposed part of the vector. + // We can just directly replace the part. + (*Parts)[PartIndex] = WrRegion->getOperand(NewValueOperandNum); + } else { + // The wrregion writes only some of the decomposed part of the vector. + // Create a new wrregion. + WrR.Offset -= getPartOffset(PartIndex); + auto NewInst = cast(WrR.createWrRegion(Part, + WrRegion->getOperand(NewValueOperandNum), + WrRegion->getName() + ".decomp." + Twine(PartIndex), + WrRegion, WrRegion->getDebugLoc())); + (*Parts)[PartIndex] = NewInst; + NewInsts.push_back(NewInst); + } + // Decompose its uses. decomposeTree removes the use, so we repeatedly process + // the first use until they have all gone. + while (!WrRegion->use_empty()) + decomposeTree(&*WrRegion->use_begin(), Parts); + // Now the original wrregion has no uses, and we can remove it. + eraseInst(WrRegion); +} + +/*********************************************************************** + * VectorDecomposer::decomposeBitCast : decompose a bitcast + * + * Enter: Inst = the bitcast instruction + * Parts = decomposed parts of input (modifiable) + */ +void VectorDecomposer::decomposeBitCast(Instruction *Inst, + SmallVectorImpl *Parts) +{ + // Create a new bitcast for each decomposed part, other than when the part + // is undef. (We handle the undef case as it is common, when only some of the + // vector has been set up. Other constant cases we leave to the EarlyCSE pass + // that comes after this pass.) + for (unsigned PartIndex = 0, NumParts = Parts->size(); + PartIndex != NumParts; ++PartIndex) { + Type *NewTy = getPartType(Inst->getType(), PartIndex); + if (isa((*Parts)[PartIndex])) + (*Parts)[PartIndex] = UndefValue::get(NewTy); + else { + auto NewInst = CastInst::Create(Instruction::BitCast, (*Parts)[PartIndex], + NewTy, Inst->getName() + ".decomp." + Twine(PartIndex), Inst); + NewInst->setDebugLoc(Inst->getDebugLoc()); + NewInsts.push_back(NewInst); + (*Parts)[PartIndex] = NewInst; + } + } + // Decompose its uses. decomposeTree removes the use, so we repeatedly process + // the first use until they have all gone. + while (!Inst->use_empty()) + decomposeTree(&*Inst->use_begin(), Parts); + // Now the original wrregion has no uses, and we can remove it. 
+ eraseInst(Inst); +} + +/*********************************************************************** + * VectorDecomposer::getPartIndex : get the part index for the region + */ +unsigned VectorDecomposer::getPartIndex(Region *R) +{ + return Decomposition[R->Offset / 32U]; +} + +/*********************************************************************** + * VectorDecomposer::getPartOffset : get the byte offset of a part + */ +unsigned VectorDecomposer::getPartOffset(unsigned PartIndex) +{ + // Offsets[] has the index in GRFs. + return Offsets[PartIndex] * 32; +} + +/*********************************************************************** + * VectorDecomposer::getPartNumBytes : get the size of a part in bytes + */ +unsigned VectorDecomposer::getPartNumBytes(Type *WholeTy, unsigned PartIndex) +{ + if (PartIndex + 1 != Offsets.size()) { + // Not the last part. We can use the offset (in GRFs) difference. + return 32 * (Offsets[PartIndex + 1] - Offsets[PartIndex]); + } + // For the last part, we need to get the total size from WholeTy. + return DL->getTypeSizeInBits(WholeTy) / 8U - 32 * Offsets[PartIndex]; +} + +/*********************************************************************** + * VectorDecomposer::getPartNumElements : get the size of a part in elements + */ +unsigned VectorDecomposer::getPartNumElements(Type *WholeTy, unsigned PartIndex) +{ + Type *ElementTy = WholeTy->getScalarType(); + return getPartNumBytes(WholeTy, PartIndex) + / (DL->getTypeSizeInBits(ElementTy) >> 3); +} + +/*********************************************************************** + * VectorDecomposer::getPartType : get the type of a part + */ +VectorType *VectorDecomposer::getPartType(Type *WholeTy, unsigned PartIndex) +{ + Type *ElementTy = WholeTy->getScalarType(); + return VectorType::get(ElementTy, getPartNumElements(WholeTy, PartIndex)); +} + +/*********************************************************************** + * VectorDecomposer::getConstantPart : get the decomposed part of a constant + */ +Constant *VectorDecomposer::getConstantPart(Constant *Whole, unsigned PartIndex) +{ + Region R(Whole, DL); + R.Offset = getPartOffset(PartIndex); + R.NumElements = R.Width = getPartNumElements(Whole->getType(), PartIndex); + return R.evaluateConstantRdRegion(Whole, /*AllowScalar=*/false); +} + +/*********************************************************************** + * VectorDecomposer::removeDeadCode : aggressive dead code removal on + * instructions added by the vector decomposer + * + * NewInsts contains the instructions added. + */ +void VectorDecomposer::removeDeadCode() +{ + SmallVector Stack; // the "to be processed" stack + std::set Unused; + // Put all newly added instructions into the Unused set. + for (auto i = NewInsts.begin(), e = NewInsts.end(); i != e; ++i) + Unused.insert(*i); + // Look at each newly added instruction. If it is used in anything other than + // one of our newly added instructions, add it to the "to be processed" stack + // and remove it from the Unused set. (It also counts as used an instruction + // that is used in another of our newly added instructions that happens to + // have already been seen as used. It doesn't matter either way that this + // happens.) 
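+  //
+  // In effect this is a small mark-and-sweep over just the newly added
+  // instructions: the loop below seeds the worklist with new instructions
+  // that have a use outside the new set, the worklist loop then marks their
+  // operands (or phi incomings) transitively, and whatever is still in the
+  // Unused set afterwards is erased.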
+ for (auto i = NewInsts.begin(), e = NewInsts.end(); i != e; ++i) { + Instruction *Inst = *i; + bool IsUsed = false; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (Unused.find(user) == Unused.end()) + IsUsed = true; + } + if (IsUsed) { + Stack.push_back(Inst); + Unused.erase(Inst); + } + } + // Process each entry on the stack. + while (!Stack.empty()) { + Instruction *Inst = Stack.back(); + Stack.pop_back(); + // Inst is used, perhaps indirectly, by something outside the web. + // Mark instructions it uses as used. For wrregion and bitcast, this + // is just operand 0. For a phi node, it is all incomings. + if (auto Phi = dyn_cast(Inst)) { + for (unsigned ii = 0, ie = Phi->getNumIncomingValues(); ii != ie; ++ii) { + auto Incoming = dyn_cast(Phi->getIncomingValue(ii)); + auto it = Unused.find(Incoming); + if (it == Unused.end()) + continue; + // Incoming is an instruction currently in the unused set. Remove it + // from the set, and add it to the "to be processed" stack. + Unused.erase(it); + Stack.push_back(Incoming); + } + } else { + auto Operand = dyn_cast(Inst->getOperand(0)); + auto it = Unused.find(Operand); + if (it != Unused.end()) { + // Operand is an instruction currently in the unused set. Remove it + // from the set, and add it to the "to be processed" stack. + Unused.erase(it); + Stack.push_back(Operand); + } + } + } + // Anything left in Unused is really unused, except for uses by other + // instructions in Unused (possibly circularly in the case of phi nodes). + // Erase them all forcibly, by changing all uses to undef first. + for (auto uui = Unused.begin(), uue = Unused.end(); uui != uue; ++uui) { + Instruction *Inst = *uui; + while (!Inst->use_empty()) + *Inst->use_begin() = UndefValue::get((*Inst->use_begin())->getType()); + eraseInst(Inst); + } +} + +/*********************************************************************** + * VectorDecomposer::eraseInst : erase an instruction + * + * This is used in the case that the instruction might be in the Seen + * set. So we delay actually deleting it until the end of processing the + * function. + */ +void VectorDecomposer::eraseInst(Instruction *Inst) +{ + Inst->removeFromParent(); + ToDelete.push_back(Inst); + // Remove all non-constant operands. 
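+  // (Setting the operands to undef below drops this instruction's uses of
+  // other values, so those values do not appear to still have users while the
+  // erased instruction sits on the ToDelete list.)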
+ for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { + Value *Opnd = Inst->getOperand(i); + if (isa(Opnd)) + continue; + Inst->setOperand(i, UndefValue::get(Opnd->getType())); + } +} + +void VectorDecomposer::emitWarning(Instruction *Inst, const Twine &Msg) { + DiagnosticVectorDecomposition Warn(Inst, Msg, DS_Warning); + Inst->getContext().diagnose(Warn); +} + +// Decompose +// +// %33 = fcmp une <24 x float> %25, zeroinitializer +// %34 = fcmp oeq <24 x float> %24, zeroinitializer +// %35 = and <24 x i1> %33, %34 +// %36 = select <24 x i1> %35, <24 x float> +// +// into +// +// %25.0 = rrd(%25, 16, 16, 1) +// %25.1 = rrd(%25, 8, 8, 1) +// %24.0 = rrd(%24, 16, 16, 1) +// %24.1 = rrd(%24, 8, 8, 1) +// %33.0 = fcmp une <16 x float> %25.0, zeroinitializer +// %33.1 = fcmp une <8 x float> %25.1, zeroinitializer +// %34.0 = fcmp oeq <16 x float> %24.0, zeroinitializer +// %34.1 = fcmp oeq <8 x float> %24.1, zeroinitializer +// %35.0 = and <16 x i1> %33.0, %34.0 +// %35.1 = and <8 x i1> %33.1, %34.1 +// $36.0 = select <16 x i1> %35.0, <16 x float> +// %36.1 = select <8 x i1> %35.1, <8 x float> +// %36.new.0 = wrr(<24 x float> undef, <16 x float> %36.0, 0) +// %36.new.1 = wwr(<24 x float> %36.new.0, <8 x float> %36.1, 16) +// +// This allows register pressure reducer to better reorder the above sequence. +// +bool SelectDecomposer::run() { + bool Modified = false; + for (auto Inst : StartSelects) { + Modified |= processStartSelect(Inst); + clear(); + } + return Modified; +} + +bool SelectDecomposer::processStartSelect(Instruction *Inst) { + auto SI = dyn_cast(Inst); + if (!SI || !determineDecomposition(Inst)) + return false; + + // Decompose it and its predicate computation recursively. + decompose(SI); + + // Merge components, starting with undef. + SmallVectorImpl &Parts = DMap[Inst]; + Value *NewInst = UndefValue::get(Inst->getType()); + for (unsigned Idx = 0, N = Decomposition.size(); Idx < N; ++Idx) { + Region R(NewInst); + R.getSubregion(getPartOffset(Idx), getPartNumElements(Idx)); + NewInst = R.createWrRegion(NewInst, Parts[Idx], ".join", Inst, + Inst->getDebugLoc()); + } + Inst->replaceAllUsesWith(NewInst); + return true; +} + +template bool isGlobalVarOperand(const Value *V) { + const T *Inst = dyn_cast(V); + return Inst && + getUnderlyingGlobalVariable(Inst->getPointerOperand()) != nullptr; +} + +bool SelectDecomposer::determineDecomposition(Instruction *Inst) { + auto SI = dyn_cast(Inst); + assert(SI && "select expected"); + VectorType *Ty = dyn_cast(SI->getCondition()->getType()); + if (!Ty) + return false; + unsigned NumElts = Ty->getNumElements(); + if (NumElts <= 16) + return false; + if (!isa(SI->getCondition())) + return false; + + // Disable select decomposition if this select may be used in g_store bale. + // Otherwise, g_store bale cannot be created correctly due to a missing load + // of a global that will be stored(it is one of the requirements to g_store + // bales). The change fixes FRC_global and FRC_MC_global tests. + if (std::any_of(Inst->user_begin(), Inst->user_end(), + isGlobalVarOperand) || + std::any_of(Inst->value_op_begin(), Inst->value_op_end(), + isGlobalVarOperand)) + return false; + + // Extra checks to avoid aggressive splitting. + auto BB = Inst->getParent(); + auto check = [=](Instruction *I) { + if (!I->hasOneUse() || I->getParent() != BB) { + setNotDecomposing(); + return false; + } + return true; + }; + + // This determines the width of predicate operands. 
+ // We consider the following two factors + // - The type size of sel + // - The input operands + unsigned Width = GenXDefaultSelectPredicateWidth; + if (Width > 32) + Width = 32; + else if (Width < 16) + Width = 16; + else if (SI->getType()->getScalarSizeInBits() >= 32) + Width = 16; + + // If there is a region read with a non-unit stride, + // then adjust the splitting width appropriately. + auto adjustWidth = [=, &Width](Value *V) { + // If this region read only supports up to 16, then do not split into + // simd 32. Otherwise it makes difficult to bale in this region read. + if (Width == 32 && GenXIntrinsic::isRdRegion(V)) { + CallInst *CI = cast(V); + Region R(CI, BaleInfo()); + unsigned LegalSize = R.getLegalSize( + 0, true /*Allow2D*/, + CI->getOperand(0)->getType()->getVectorNumElements(), ST); + if (LegalSize < 32) + Width = 16; + } + }; + + addToWeb(SI->getCondition()); + for (unsigned i = 0; i != Web.size(); ++i) { + Inst = Web[i]; + if (!check(Inst)) + break; + unsigned OpCode = Inst->getOpcode(); + switch (OpCode) + { + default: + setNotDecomposing(); + break; + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + addToWeb(Inst->getOperand(0)); + addToWeb(Inst->getOperand(1)); + adjustWidth(Inst->getOperand(0)); + adjustWidth(Inst->getOperand(1)); + break; + case Instruction::FCmp: + case Instruction::ICmp: + adjustWidth(Inst->getOperand(0)); + adjustWidth(Inst->getOperand(1)); + break; + } + } + + if (NotDecomposing) + return false; + + Offsets.clear(); + unsigned Offset = 0; + unsigned Remaining = NumElts; + while (Remaining > Width) { + Decomposition.push_back(Width); + Offsets.push_back(Offset); + Remaining -= Width; + Offset += Width; + } + if (Remaining > 0) { + Decomposition.push_back(Remaining); + Offsets.push_back(Offset); + } +#if _DEBUG + unsigned NumParts = (NumElts + Width - 1) / Width; + assert(NumParts == Decomposition.size()); + assert(NumParts == Offsets.size()); +#endif + + return true; +} + +void SelectDecomposer::addToWeb(Value *V) { + if (isa(V)) + return; + auto Inst = dyn_cast(V); + if (!Inst) { + // Cannot decompose with an argument in the web. 
+ setNotDecomposing(); + return; + } + if (Seen.find(Inst) != Seen.end()) + return; + + Seen.insert(Inst); + Web.push_back(Inst); +} + +void SelectDecomposer::decompose(Instruction *Inst) { + if (isa(Inst)) + decomposeSelect(Inst); + else if (isa(Inst)) + decomposeCmp(Inst); + else { + assert(Inst->getOpcode() == Instruction::And || + Inst->getOpcode() == Instruction::Or || + Inst->getOpcode() == Instruction::Xor); + decomposeBinOp(Inst); + } +} + +void SelectDecomposer::decomposeSelect(Instruction *Inst) { + SelectInst *SI = cast(Inst); + if (auto I = dyn_cast(SI->getCondition())) + decompose(I); + + unsigned N = Decomposition.size(); + SmallVector Parts(N); + IRBuilder<> B(Inst); + + Value *OpC = SI->getCondition(); + Value *OpT = SI->getTrueValue(); + Value *OpF = SI->getFalseValue(); + + for (unsigned Idx = 0; Idx < N; ++Idx) { + Value *OpC_I = getPart(OpC, Idx, Inst); + Value *OpT_I = getPart(OpT, Idx, Inst); + Value *OpF_I = getPart(OpF, Idx, Inst); + Value *NewInst = B.CreateSelect(OpC_I, OpT_I, OpF_I, Inst->getName()); + if (auto I = dyn_cast(NewInst)) + I->setDebugLoc(Inst->getDebugLoc()); + Parts[Idx] = NewInst; + } + + DMap[Inst].swap(Parts); +} + +void SelectDecomposer::decomposeBinOp(Instruction *Inst) { + Value *Op0 = Inst->getOperand(0); + Value *Op1 = Inst->getOperand(1); + if (auto I = dyn_cast(Op0)) + decompose(I); + if (auto I = dyn_cast(Op1)) + decompose(I); + + unsigned N = Decomposition.size(); + SmallVector Parts(N); + IRBuilder<> B(Inst); + + for (unsigned Idx = 0; Idx < N; ++Idx) { + Value *Op0_I = getPart(Op0, Idx, Inst); + Value *Op1_I = getPart(Op1, Idx, Inst); + Value *NewInst = B.CreateBinOp(Instruction::BinaryOps(Inst->getOpcode()), + Op0_I, Op1_I, Inst->getName()); + if (auto I = dyn_cast(NewInst)) + I->setDebugLoc(Inst->getDebugLoc()); + Parts[Idx] = NewInst; + } + + DMap[Inst].swap(Parts); +} + +void SelectDecomposer::decomposeCmp(Instruction *Inst) { + Value *Op0 = Inst->getOperand(0); + Value *Op1 = Inst->getOperand(1); + + unsigned N = Decomposition.size(); + SmallVector Parts(N); + IRBuilder<> B(Inst); + CmpInst *CI = cast(Inst); + + for (unsigned Idx = 0; Idx < N; ++Idx) { + Value *Op0_I = getPart(Op0, Idx, Inst); + Value *Op1_I = getPart(Op1, Idx, Inst); + Value *NewInst = nullptr; + if (isa(CI)) + NewInst = B.CreateICmp(CI->getPredicate(), Op0_I, Op1_I, Inst->getName()); + else + NewInst = B.CreateFCmp(CI->getPredicate(), Op0_I, Op1_I, Inst->getName()); + if (auto I = dyn_cast(NewInst)) + I->setDebugLoc(Inst->getDebugLoc()); + Parts[Idx] = NewInst; + } + + DMap[Inst].swap(Parts); +} + +Value *SelectDecomposer::getPart(Value *Whole, unsigned PartIndex, + Instruction *Inst) const { + auto I = DMap.find(Whole); + if (I != DMap.end()) { + assert(I->second.size() > PartIndex); + return I->second[PartIndex]; + } + + unsigned Offset = getPartOffset(PartIndex); + unsigned NumElts = getPartNumElements(PartIndex); + + if (Whole->getType()->getScalarType()->isIntegerTy(1)) { + auto C = dyn_cast(Whole); + assert(C && "constant expected"); + if (Constant *V = C->getSplatValue()) + return ConstantVector::getSplat(NumElts, V); + SmallVector Values; + for (unsigned Idx = Offset; Idx < Offset + NumElts; ++Idx) + Values.push_back(C->getAggregateElement(Idx)); + return ConstantVector::get(Values); + } + + const DataLayout &DL = Inst->getModule()->getDataLayout(); + Region R(Whole, &DL); + R.Offset = Offset * R.ElementBytes; + R.NumElements = R.Width = NumElts; + + if (auto C = dyn_cast(Whole)) + return R.evaluateConstantRdRegion(C, /*AllowScalar=*/false); + return 
R.createRdRegion(Whole, ".in", Inst, Inst->getDebugLoc());
+}
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.h
new file mode 100644
index 000000000000..c7bf04e31e14
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.h
@@ -0,0 +1,175 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+/// GenXVectorDecomposer
+/// --------------------
+///
+/// GenXVectorDecomposer is not a pass; instead it is a class that is called by
+/// the GenXPostLegalization pass to perform vector decomposition.
+///
+/// For a vector written by wrregion and read by rdregion, it finds the way that
+/// the vector can be divided into parts, with each part a range of one or more
+/// GRFs, such that no rdregion or wrregion crosses a part boundary. Then it
+/// decomposes the vector into those parts. A rdregion/wrregion that reads/writes
+/// a whole part can be removed completely; a rdregion/wrregion that reads/writes
+/// only some of the part is replaced to read/write just the applicable part.
+///
+/// In fact it does all this for a web of vectors linked by wrregion, phi nodes
+/// and bitcasts.
+///
+/// The idea is that having lots of small vectors instead of one big vector
+/// reduces register fragmentation in the finalizer's register allocator.
+///
+/// There is an option -limit-genx-vector-decomposer=N to aid debugging the code
+/// changes made by the vector decomposer.
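+///
+/// For example, a vector spanning two GRFs that is only ever written by two
+/// separate single-GRF wrregions and read back by two matching single-GRF
+/// rdregions can be decomposed into two single-GRF vectors; each of those
+/// accesses then covers a whole decomposed vector and can be removed, while
+/// an access covering only part of one would be rewritten against that part.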
+///
+//===----------------------------------------------------------------------===//
+#include "GenXRegion.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Instructions.h"
+#include <map>
+#include <set>
+
+namespace llvm {
+
+class Constant;
+class DominatorTree;
+class Instruction;
+class PHINode;
+class Type;
+class Use;
+
+namespace genx {
+class Region;
+}
+
+// VectorDecomposer : decomposes vectors in a function
+class VectorDecomposer {
+  DominatorTree *DT;
+  const DataLayout *DL = nullptr;
+  SmallVector<Instruction *, 8> StartWrRegions;
+  std::set<Instruction *> Seen;
+  SmallVector<Instruction *, 8> Web;
+  SmallVector<Instruction *, 8> ToDelete;
+  bool NotDecomposing;
+  Instruction *NotDecomposingReportInst;
+  SmallVector<unsigned, 8> Decomposition;
+  SmallVector<unsigned, 8> Offsets;
+  std::map<PHINode *, SmallVector<Value *, 8>> PhiParts;
+  SmallVector<Instruction *, 8> NewInsts;
+public:
+  // clear : clear anything stored
+  void clear() {
+    clearOne();
+    StartWrRegions.clear();
+    Seen.clear();
+    ToDelete.clear();
+  }
+  // addStartWrRegion : add a wrregion with undef input to the list
+  void addStartWrRegion(Instruction *Inst) { StartWrRegions.push_back(Inst); }
+  // run : run the vector decomposer on the stored StartWrRegions
+  bool run(DominatorTree *DT);
+private:
+  // clearOne : clear from processing one web
+  void clearOne() {
+    Web.clear();
+    Decomposition.clear();
+    Offsets.clear();
+    PhiParts.clear();
+    NewInsts.clear();
+  }
+  bool processStartWrRegion(Instruction *Inst);
+  bool determineDecomposition(Instruction *Inst);
+  void addToWeb(Value *V, Instruction *User = nullptr);
+  void adjustDecomposition(Instruction *Inst);
+  void setNotDecomposing(Instruction *Inst, const char *Text);
+  void decompose();
+  void decomposeTree(Use *U, const SmallVectorImpl<Value *> *PartsIn);
+  void decomposePhiIncoming(PHINode *Phi, unsigned OperandNum,
+      const SmallVectorImpl<Value *> *PartsIn);
+  void decomposeRdRegion(Instruction *RdRegion,
+      const SmallVectorImpl<Value *> *PartsIn);
+  void decomposeWrRegion(Instruction *WrRegion, SmallVectorImpl<Value *> *Parts);
+  void decomposeBitCast(Instruction *Inst, SmallVectorImpl<Value *> *Parts);
+  unsigned getPartIndex(genx::Region *R);
+  unsigned getPartOffset(unsigned PartIndex);
+  unsigned getPartNumBytes(Type *WholeTy, unsigned PartIndex);
+  unsigned getPartNumElements(Type *WholeTy, unsigned PartIndex);
+  VectorType *getPartType(Type *WholeTy, unsigned PartIndex);
+  Constant *getConstantPart(Constant *Whole, unsigned PartIndex);
+  void removeDeadCode();
+  void eraseInst(Instruction *Inst);
+
+  void emitWarning(Instruction *Inst, const Twine &Msg);
+};
+
+// Decompose predicate computation sequences for select
+// to reduce flag register pressure.
+class SelectDecomposer {
+  const GenXSubtarget *ST;
+  bool NotDecomposing = false;
+  SmallVector<Instruction *, 8> StartSelects;
+  SmallVector<Instruction *, 8> Web;
+  SmallVector<unsigned, 8> Decomposition;
+  SmallVector<unsigned, 8> Offsets;
+  std::set<Instruction *> Seen;
+
+  // Map each decomposed instruction to its corresponding part values.
+ SmallDenseMap> DMap; + +public: + explicit SelectDecomposer(const GenXSubtarget *ST) : ST(ST) {} + void addStartSelect(Instruction *Inst) { StartSelects.push_back(Inst); } + bool run(); + +private: + void clear() { + NotDecomposing = false; + Web.clear(); + Decomposition.clear(); + Offsets.clear(); + Seen.clear(); + DMap.clear(); + } + bool processStartSelect(Instruction *Inst); + bool determineDecomposition(Instruction* Inst); + void setNotDecomposing() { NotDecomposing = true; } + void addToWeb(Value *V); + void decompose(Instruction *Inst); + void decomposeSelect(Instruction *Inst); + void decomposeBinOp(Instruction *Inst); + void decomposeCmp(Instruction *Inst); + + unsigned getPartOffset(unsigned PartIndex) const { + return Offsets[PartIndex]; + } + unsigned getPartNumElements(unsigned PartIndex) const { + return Decomposition[PartIndex]; + } + Value *getPart(Value *Whole, unsigned PartIndex, Instruction *Inst) const; +}; + +} // end namespace llvm diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisa.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisa.h new file mode 100644 index 000000000000..7415a8411b2a --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisa.h @@ -0,0 +1,140 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file contains defines for vISA and the vISA writer. 
+// +//===----------------------------------------------------------------------===// +#ifndef GENXVISA_H +#define GENXVISA_H + +#include "GenX.h" +#include "GenXBaling.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Constants.h" +#include "llvm/Pass.h" +#include "llvm/PassRegistry.h" +#include +#include +#include +#include "GenXModule.h" + +namespace llvm { + namespace visa { + + // vISA relational operators + enum { EQ, NE, GT, GE, LT, LE }; + + + enum { + CLASS_GENERAL, CLASS_ADDRESS, CLASS_PREDICATE, CLASS_INDIRECT, + CLASS_IMMEDIATE = 5, CLASS_STATE }; + + // vISA vector operand modifiers + enum { MOD_ABS = 0x8, MOD_NEG = 0x10, MOD_NEGABS = 0x18, + MOD_SAT = 0x20, MOD_NOT = 0x28 }; + + enum { VISA_NUM_RESERVED_REGS = 32, + VISA_NUM_RESERVED_PREDICATES = 1, + VISA_NUM_RESERVED_SURFACES = 6 }; + + // These reserved indices are used by CM Frontend + // and some passes (like TPM) to create an stateless/slm/stack accesses + // TODO: consider introducing as set of new intrinsics with explicit + // specification of access (to get rid of the relevant hacky code base). + enum ReservedSurfaceIndex { + RSI_Stack = 253, // 253 is for stack access (T1), used by TPM pass + RSI_Slm = 254, // 254 is SLM, which is T0 in vISA + RSI_Stateless = 255 // 255 is stateless, which is T5 in vISA + }; + + // Extracts surface Index (which is expected to be constant) + // from llvm::Value + // TODO: consider replacing dync_cast_or_null to dyn_cast + // TODO: rename convert->extract + inline int convertToSurfaceIndex(const Value* ValueIdx) { + if (const auto CI = dyn_cast_or_null(ValueIdx)) { + int InputValue = static_cast(CI->getZExtValue()); + return InputValue; + } + return -1; + } + + inline ReservedSurfaceIndex getReservedSurfaceIndex(PreDefined_Surface Surface) { + switch(Surface) { + case PreDefined_Surface::PREDEFINED_SURFACE_STACK: + return RSI_Stack; + case PreDefined_Surface::PREDEFINED_SURFACE_SLM: + return RSI_Slm; + case PreDefined_Surface::PREDEFINED_SURFACE_T255: + return RSI_Stateless; + default: + // other types of prefefined surfaces are not used by CM backend + break; + } + llvm_unreachable("unsupported predefined surface"); + } + + inline bool isReservedSurfaceIndex(int SurfaceIndex) { + return SurfaceIndex == RSI_Stateless || SurfaceIndex == RSI_Slm || + SurfaceIndex == RSI_Stack; + } + + inline PreDefined_Surface getReservedSurface(int SurfaceIndex) { + assert(isReservedSurfaceIndex(SurfaceIndex)); + switch(SurfaceIndex) { + case RSI_Stack: + return PreDefined_Surface::PREDEFINED_SURFACE_STACK; + case RSI_Slm: + return PreDefined_Surface::PREDEFINED_SURFACE_SLM; + case RSI_Stateless: + return PreDefined_Surface::PREDEFINED_SURFACE_T255; + } + llvm_unreachable("unexpected surface index"); + } + + enum { VISA_MAX_GENERAL_REGS = 65536 * 256 - 1, + VISA_MAX_ADDRESS_REGS = 4096, + VISA_MAX_PREDICATE_REGS = 4096, + VISA_MAX_SAMPLER_REGS = 32 - 1, + VISA_MAX_SURFACE_REGS = 256, + VISA_MAX_VME_REGS = 16 }; + + enum { VISA_WIDTH_GENERAL_REG = 32 }; + + enum { VISA_ABI_INPUT_REGS_RESERVED = 1, + VISA_ABI_INPUT_REGS_MAX = 128 }; + + enum InputVarType { + VISA_INPUT_GENERAL = 0x0, + VISA_INPUT_SAMPLER = 0x1, + VISA_INPUT_SURFACE = 0x2, + VISA_INPUT_UNKNOWN + }; + + } // end namespace Visa + +} // end namespace llvm +#endif // ndef GENXVISA_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.cpp new file mode 100644 index 000000000000..fd3a70a78866 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.cpp @@ 
-0,0 +1,698 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// GenXVisaRegAlloc is a function group pass that allocates vISA registers to +// LLVM IR values. See GenXVisaRegAlloc.h. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "GENX_REGALLOC" + +#include "GenXVisaRegAlloc.h" +#include "GenX.h" +#include "GenXIntrinsics.h" +#include "GenXLiveness.h" +#include "GenXNumbering.h" +#include "GenXUtil.h" +#include "visa_igc_common_header.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; +using namespace genx; +using namespace visa; + +static cl::opt LimitGenXExtraCoalescing("limit-genx-extra-coalescing", cl::init(UINT_MAX), cl::Hidden, + cl::desc("Limit GenX extra coalescing.")); + + +char GenXVisaRegAlloc::ID = 0; +INITIALIZE_PASS_BEGIN(GenXVisaRegAlloc, "GenXVisaRegAlloc", "GenXVisaRegAlloc", false, false) +INITIALIZE_PASS_DEPENDENCY(GenXLiveness) +INITIALIZE_PASS_DEPENDENCY(GenXNumbering) +INITIALIZE_PASS_DEPENDENCY(FunctionGroupAnalysis) +INITIALIZE_PASS_END(GenXVisaRegAlloc, "GenXVisaRegAlloc", "GenXVisaRegAlloc", false, false) + +FunctionGroupPass *llvm::createGenXVisaRegAllocPass() +{ + initializeGenXVisaRegAllocPass(*PassRegistry::getPassRegistry()); + return new GenXVisaRegAlloc(); +} + +void GenXVisaRegAlloc::getAnalysisUsage(AnalysisUsage &AU) const +{ + FunctionGroupPass::getAnalysisUsage(AU); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.setPreservesAll(); +} + +/*********************************************************************** + * runOnFunctionGroup : run the register allocator for this FunctionGroup + * + * This is currently a trivial allocator that just gives a new vISA virtual + * register to every single Value. + */ +bool GenXVisaRegAlloc::runOnFunctionGroup(FunctionGroup &FGArg) +{ + FG = &FGArg; + Liveness = &getAnalysis(); + Numbering = &getAnalysis(); + FGA = &getAnalysis(); + BoolTy = Type::getInt1Ty(FG->getContext()); + // Empty out the analysis from the last function it was used on. 
+ RegMap.clear(); + RegStorage.clear(); + PredefinedSurfaceRegs.clear(); + for (unsigned i = 0; i != RegCategory::NUMREALCATEGORIES; ++i) { + CurrentRegId[i] = 0; + } + for (unsigned i = 0; i < VISA_NUM_RESERVED_SURFACES; ++i) { + RegStorage.emplace_back(RegCategory::SURFACE, i); + PredefinedSurfaceRegs.push_back(&RegStorage.back()); + } + for (auto &F : *FG) { + if (F->hasFnAttribute(genx::FunctionMD::CMGenXMain) || + F->hasFnAttribute(genx::FunctionMD::CMStackCall)) + RegMap[F] = KernRegMap_t(); + } + // Reserve the reserved registers. + CurrentRegId[RegCategory::GENERAL] = VISA_NUM_RESERVED_REGS; + CurrentRegId[RegCategory::PREDICATE] = VISA_NUM_RESERVED_PREDICATES; + CurrentRegId[RegCategory::SURFACE] = VISA_NUM_RESERVED_SURFACES; + // Do some extra coalescing. + extraCoalescing(); + // Get the live ranges in a reproducible order. + std::vector LRs; + getLiveRanges(&LRs); + // Allocate a register to each live range. + for (auto i = LRs.begin(), e = LRs.end(); i != e; ++i) + allocReg(*i); + if (CurrentRegId[RegCategory::GENERAL] > VISA_MAX_GENERAL_REGS) + report_fatal_error("Too many vISA general registers"); + if (CurrentRegId[RegCategory::ADDRESS] > VISA_MAX_ADDRESS_REGS) + report_fatal_error("Too many vISA address registers"); + if (CurrentRegId[RegCategory::PREDICATE] > VISA_MAX_PREDICATE_REGS) + report_fatal_error("Too many vISA predicate registers"); + if (CurrentRegId[RegCategory::SAMPLER] > VISA_MAX_SAMPLER_REGS) + report_fatal_error("Too many vISA sampler registers"); + if (CurrentRegId[RegCategory::SURFACE] > VISA_MAX_SURFACE_REGS) + report_fatal_error("Too many vISA surface registers"); + if (CurrentRegId[RegCategory::VME] > VISA_MAX_VME_REGS) + report_fatal_error("Too many vISA VME registers"); + return false; +} + +/*********************************************************************** + * getLiveRanges : get the live ranges in a reproducible order + * + * We scan the code to find the live ranges, rather than just walking the + * GenXLiveness map, to ensure that registers are allocated in a consistent + * order that does not depend on the layout of allocated memory. + * + * This ignores any live range with no category, so such a live range does not + * get allocated a register. GenXArgIndirection uses that to stop an indirected + * argument uselessly getting a register. + */ +void GenXVisaRegAlloc::getLiveRanges(std::vector *LRs) const +{ + // create LRs for global variables. + for (auto &GV : FG->getModule()->globals()) + getLiveRangesForValue(&GV, LRs); + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + for (auto ai = F->arg_begin(), ae = F->arg_end(); ai != ae; ++ai) + getLiveRangesForValue(&*ai, LRs); + if (fgi != FG->begin() && !F->getReturnType()->isVoidTy()) { + // allocate reg for unified return value + getLiveRangesForValue(Liveness->getUnifiedRet(F), LRs); + } + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (auto bi = BB->begin(), be = BB->end(); bi != be; ++bi) + getLiveRangesForValue(&*bi, LRs); + } + } + for (auto &LR : *LRs) + LR->prepareFuncs(FGA); +} + +void GenXVisaRegAlloc::getLiveRangesForValue(Value *V, + std::vector *LRs) const +{ + auto Ty = V->getType(); + for (unsigned i = 0, e = IndexFlattener::getNumElements(Ty); + i != e; ++i) { + SimpleValue SV(V, i); + LiveRange *LR = Liveness->getLiveRangeOrNull(SV); + if (!LR || LR->getCategory() == RegCategory::NONE) + continue; + // Only process an LR if the map iterator is on the value that appears + // first in the LR. 
That avoids processing the same LR multiple times. + if (SV != *LR->value_begin()) + continue; + LRs->push_back(LR); + } +} + +/*********************************************************************** + * extraCoalescing : do some extra coalescing over and above what + * GenXCoalescing does + * + * GenXCoalescing does coalescing where it saves a copy, for example for + * a two address operand. This function does coalescing that does not save + * a copy, but the two live ranges are related by being the operand (a + * kill use) and result of the same instruction. This is in the hope that + * the jitter's register allocator will be able to do a better job with it. + * + * A further case of extra coalescing is that multiple instances of a constant + * load of a surface variable are coalesced together. This allows the CM code + * to use lots of printfs without running out of surface variables. + */ +void GenXVisaRegAlloc::extraCoalescing() +{ + LiveRange *CommonSurface = nullptr; + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (auto bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + auto Inst = &*bi; + if (isa(Inst->getType())) + continue; + if (GenXIntrinsic::isWrRegion(Inst)) + continue; + auto LR = Liveness->getLiveRangeOrNull(Inst); + if (!LR || LR->Category != RegCategory::GENERAL) + continue; + // Check for convert of constant ot surface. + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_convert: + case GenXIntrinsic::genx_constanti: + if (LR->Category == RegCategory::SURFACE + && isa(Inst->getOperand(0))) { + // See if we can coalesce it with CommonSurface. + if (!CommonSurface) + CommonSurface = LR; + else if (!Liveness->interfere(CommonSurface, LR)) + CommonSurface = Liveness->coalesce(CommonSurface, LR, /*DisalowCASC=*/true); + } + break; + default: + break; + } + // We have a non-struct non-wrregion instruction whose result has a + // live range (it is not baled into anything else). + // Check all uses to see if there is one in a non-alu intrinsic. We + // don't want to coalesce that, because of the danger of the jitter + // needing to add an extra move in the send. + bool UseInNonAluIntrinsic = false; + for (auto ui = Inst->use_begin(), ue = Inst->use_end(); + ui != ue && !UseInNonAluIntrinsic; ++ui) { + auto user = dyn_cast(ui->getUser()); + assert(user); + if (user->getType()->isVoidTy()) { + UseInNonAluIntrinsic = true; + break; + } + unsigned IID = GenXIntrinsic::getAnyIntrinsicID(user); + switch (IID) { + case GenXIntrinsic::not_any_intrinsic: + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + break; + default: { + // It is an intrinsic. A non-alu intrinsic does not have a + // return value that is general. + GenXIntrinsicInfo II(IID); + if (!II.getRetInfo().isGeneral()) + UseInNonAluIntrinsic = true; + } + break; + } + } + if (UseInNonAluIntrinsic) + continue; + + // Do not coalesce when this is a two address instrinsic with undef + // input. Otherwise logic is broken on lifetime marker in vISA emission. 
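+        // (The lambda below walks the intrinsic's argument descriptors and
+        // reports true if any TWOADDR operand is undef, in which case the
+        // result is not coalesced with an operand.)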
+ // + auto skipTwoAddrCoalesce = [](Instruction *Inst) { + unsigned IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(Inst); + if (IntrinsicID == GenXIntrinsic::not_any_intrinsic) + return false; + GenXIntrinsicInfo Info(IntrinsicID); + const auto *descp = Info.getInstDesc(); + for (const auto *p = descp; *p; ++p) { + GenXIntrinsicInfo::ArgInfo AI(*p); + if (AI.getCategory() != GenXIntrinsicInfo::TWOADDR) + continue; + if (isa(Inst->getOperand(AI.getArgIdx()))) + return true; + } + return false; + }; + if (skipTwoAddrCoalesce(Inst)) + continue; + + // See if we can coalesce with any operand. + for (unsigned oi = 0, oe = Inst->getNumOperands(); oi != oe; ++oi) { + Value *Operand = Inst->getOperand(oi); + if (isa(Operand)) + continue; + if (Operand->getType() != Inst->getType()) + continue; + // Do not coalesce with kernel arguments as they are input variables. + if (FG->getHead() == F && isa(Operand)) + continue; + auto OperandLR = Liveness->getLiveRangeOrNull(Operand); + if (!OperandLR || OperandLR->Category != RegCategory::GENERAL) + continue; + if (Liveness->interfere(LR, OperandLR)) + continue; + // The two live ranges do not interfere, so we can coalesce them. + if (++CoalescingCount > LimitGenXExtraCoalescing) + continue; + if (LimitGenXExtraCoalescing != UINT_MAX) + dbgs() << "genx extra coalescing " << CoalescingCount << "\n"; + Liveness->coalesce(LR, OperandLR, /*DisalowCASC=*/true); + break; + } + } + } + } +} + +/*********************************************************************** + * allocReg : allocate a register for a LiveRange + */ +void GenXVisaRegAlloc::allocReg(LiveRange *LR) +{ + if (LR->value_empty()) + return; + if (LR->getCategory() >= RegCategory::NUMREALCATEGORIES) + return; // don't allocate register to EM or RM value + LLVM_DEBUG( + dbgs() << "Allocating "; + LR->print(dbgs()); + dbgs() << "\n" + ); + SimpleValue V = *LR->value_begin(); + Type *Ty = V.getType(); + if (auto GV = dyn_cast(V.getValue())) + if (GV->hasAttribute(genx::FunctionMD::GenXVolatile)) + Ty = Ty->getPointerElementType(); + assert(!Ty->isVoidTy()); + if (LR->Category == RegCategory::PREDICATE) { + VectorType *VT = dyn_cast(Ty); + assert((!VT || genx::exactLog2(VT->getNumElements()) >= 0) && "invalid predicate width"); + (void)VT; + } + // Allocate the register, also setting the alignment. + // Assign to the values. If any value is an input arg, ensure the register + // gets its type, to avoid needing an alias for an input arg. + for (auto &F : LR->Funcs) { + Reg *NewReg = + createReg(LR->Category, Ty, DONTCARESIGNED, LR->getLogAlignment()); + if (RegMap.count(F) > 0) { + for (LiveRange::value_iterator vi = LR->value_begin(), + ve = LR->value_end(); + vi != ve; ++vi) { + LLVM_DEBUG(dbgs() << "Allocating reg " << NewReg->Num << " to " + << *(vi->getValue()) << " in func " << F->getName() + << "\n";); + assert(RegMap.at(F).find(*vi) == RegMap.at(F).end()); + RegMap.at(F)[*vi] = NewReg; + if (isa(vi->getValue())) + NewReg->Ty = vi->getType(); + } + } + } +} + +/*********************************************************************** + * getRegForValueUntyped : get the vISA reg allocated to a particular + * value, ignoring signedness and type + * + * This is a const method so it can be called from print(). 
+ */ +GenXVisaRegAlloc::Reg* GenXVisaRegAlloc::getRegForValueUntyped(const Function *kernel, + SimpleValue V) const +{ + // is possible if called for GenXPrinter + if (RegMap.count(kernel) == 0) + return nullptr; + auto& KernMap = RegMap.at(kernel); + KernRegMap_t::const_iterator i = KernMap.find(V); + if (i == KernMap.end()) { + // Check if it's predefined variables. + if (GenXIntrinsic::getGenXIntrinsicID(V.getValue()) != GenXIntrinsic::genx_predefined_surface) + return nullptr; + auto CI = cast(V.getValue()); + unsigned Id = cast(CI->getArgOperand(0))->getZExtValue(); + assert(Id < 4 && "Invalid predefined surface ID!"); + assert(PredefinedSurfaceRegs.size() == VISA_NUM_RESERVED_SURFACES && + "Predefined surface registers have not been initialized"); + return PredefinedSurfaceRegs[Id]; + } + return i->second; +} + +/*********************************************************************** + * getRegForValueOrNull : get the vISA reg allocated to a particular Value + * + * Enter: V = value (Argument or Instruction) to get register for + * Signed = request for signed or unsigned + * OverrideType = 0 else override type of value (used for bitcast) + * + * Called from GenXVisaFunctionWriter to get the register for an + * operand. The operand type might not match the register type (say a + * bitcast has been coalesced, or the same integer value is used + * unsigned in one place and signed in another), in which case we + * find/create a vISA register alias. + */ +GenXVisaRegAlloc::Reg* GenXVisaRegAlloc::getRegForValueOrNull( + const Function* kernel, SimpleValue V, Signedness Signed, Type *OverrideType) +{ + if (!OverrideType) + OverrideType = V.getType(); + if (OverrideType->isPointerTy()) { + auto GV = dyn_cast(V.getValue()); + if (GV && GV->hasAttribute(genx::FunctionMD::GenXVolatile)) + OverrideType = OverrideType->getPointerElementType(); + } + Reg* R = getRegForValueUntyped(kernel, V); + if (!R) + return nullptr; // no register allocated + Reg* OriginalReg = R; + + if (R->Category == RegCategory::GENERAL) { + for (;;) { + Type *ExistingType = R->Ty; + if (VectorType *VT = dyn_cast(ExistingType)) + if (VT->getNumElements() == 1) + ExistingType = VT->getElementType(); + if (VectorType *VT = dyn_cast(OverrideType)) + if (VT->getNumElements() == 1) + OverrideType = VT->getElementType(); + if (ExistingType == OverrideType) { + if (R->Signed == Signed || Signed == DONTCARESIGNED) + break; // Match, use this alias. + } + // On to next alias. + auto Next = R->NextAlias[kernel]; + if (Next) { + R = Next; + continue; + } + // Run out of aliases. Add a new one. + Reg* NewReg = createReg(RegCategory::GENERAL, OverrideType, Signed, 0, OriginalReg); + R->NextAlias[kernel] = NewReg; + R = NewReg; + break; + } + } + return R; +} + +/*********************************************************************** + * getSigned : get the signedness of a register + * + * If the register has byte type and is currently don't care signedness, this + * arbitrarily picks unsigned. We do that because having a byte mov with + * different signedness between source and destination can make the jitter + * generate less efficient code. + */ +genx::Signedness GenXVisaRegAlloc::getSigned(Reg* R) +{ + return (R && R->Category == RegCategory::GENERAL) ? + R->Signed : DONTCARESIGNED; +} + +// addRetIPArgument : Add the RetIP argument required for caller kernels and +// their caller. 
+void GenXVisaRegAlloc::addRetIPArgument() { + RetIP = createReg(RegCategory::GENERAL, Type::getInt64Ty(FG->getContext())); +} + +/*********************************************************************** + * TypeDetails constructor + * + * Enter: Ty = LLVM type + * Signedness = whether signed type required + */ +TypeDetails::TypeDetails(const DataLayout &DL, Type *Ty, Signedness Signed) + : DL(DL) { + Type *ElementTy = Ty; + NumElements = 1; + if (VectorType *VT = dyn_cast(ElementTy)) { + ElementTy = VT->getElementType(); + NumElements = VT->getNumElements(); + } + if (IntegerType *IT = dyn_cast(ElementTy)) { + BytesPerElement = IT->getBitWidth() / 8; + if (Signed == UNSIGNED) { + switch (BytesPerElement) { + case 1: VisaType = ISA_TYPE_UB; break; + case 2: VisaType = ISA_TYPE_UW; break; + case 4: VisaType = ISA_TYPE_UD; break; + default: VisaType = ISA_TYPE_UQ; break; + } + } else { + switch (BytesPerElement) { + case 1: VisaType = ISA_TYPE_B; break; + case 2: VisaType = ISA_TYPE_W; break; + case 4: VisaType = ISA_TYPE_D; break; + default: VisaType = ISA_TYPE_Q; break; + } + } + } else if (ElementTy->isHalfTy()) { + VisaType = ISA_TYPE_HF; + BytesPerElement = 2; + } else if (ElementTy->isFloatTy()) { + VisaType = ISA_TYPE_F; + BytesPerElement = 4; + } else if (auto PT = dyn_cast(ElementTy)) { + BytesPerElement = DL.getPointerTypeSize(PT); + if (BytesPerElement == 4) + VisaType = ISA_TYPE_UD; + else if (BytesPerElement == 8) + VisaType = ISA_TYPE_UQ; + else + report_fatal_error("unsupported pointer type size"); + } else { + assert(ElementTy->isDoubleTy()); + VisaType = ISA_TYPE_DF; + BytesPerElement = 8; + } + if (NumElements > 16384 || NumElements * BytesPerElement > 16384 * 8) + report_fatal_error("Variable too big"); +} + + +/*********************************************************************** + * print : dump the state of the pass. This is used by -genx-dump-regalloc + */ +void GenXVisaRegAlloc::print(raw_ostream &OS, const Module *M) const +{ + // Get the live ranges in a reproducible order, and sort them by "length" + // (the total number of instructions that the live range covers). + struct LiveRangeAndLength { + LiveRange *LR; + unsigned Length; + LiveRangeAndLength(LiveRange *LR, unsigned Length) : LR(LR), Length(Length) {} + bool operator<(const LiveRangeAndLength &Rhs) const { return Length > Rhs.Length; } + }; + std::vector LRs; + getLiveRanges(&LRs); + std::vector LRLs; + for (auto i = LRs.begin(), e = LRs.end(); i != e; ++i) + LRLs.push_back(LiveRangeAndLength(*i, (*i)->getLength(/*WithWeak=*/ false))); + LRs.clear(); + std::sort(LRLs.begin(), LRLs.end()); + // Dump them. Also keep count of the register pressure at each + // instruction number. + std::vector Pressure; + std::vector FlagPressure; + for (auto i = LRLs.begin(), e = LRLs.end(); i != e; ++i) { + // Dump a single live range. + LiveRange *LR = i->LR; + SimpleValue SV = *LR->value_begin(); + Reg* RN = getRegForValueUntyped(&(*(M->begin())), SV); + assert(RN); + OS << "["; + RN->print(OS); + Type *ElTy = IndexFlattener::getElementType(SV.getValue()->getType(), + SV.getIndex()); + unsigned Bytes = (ElTy->getPrimitiveSizeInBits() + 15U) / 8U & -2U; + bool IsFlag = ElTy->getScalarType()->isIntegerTy(1); + OS << "] (" << Bytes << " bytes, length " << i->Length <<") "; + // Dump some indication of what the live range is. For a kernel argument, + // show its name. For an instruction with debug info, show the location. + // We try and find the earliest definition with debug info to show. 
+ unsigned BestNum = UINT_MAX; + Instruction *BestInst = nullptr; + Argument *KernelArg = nullptr; + for (auto i = LR->value_begin(), e = LR->value_end(); i != e; ++i) { + Value *V = i->getValue(); + if (auto Arg = dyn_cast(V)) { + if (Arg->getParent() == FG->getHead()) { + KernelArg = Arg; + break; + } + } else { + auto Inst = cast(V); + if (!isa(Inst)) { + unsigned Num = Numbering->getNumber(Inst); + if (Num < BestNum) { + auto DL = Inst->getDebugLoc(); + if (!DL) { + BestNum = Num; + BestInst = Inst; + } + } + } + } + } + if (KernelArg) + OS << KernelArg->getName(); + else if (BestInst) { + const DebugLoc &DL = BestInst->getDebugLoc(); + OS << DL->getFilename() << ":" << DL.getLine(); + } + // Dump the live range segments, and add each to the pressure score. + OS << ":"; + LR->printSegments(OS); + for (auto si = LR->begin(), se = LR->end(); si != se; ++si) { + if (si->getEnd() >= Pressure.size()) { + Pressure.resize(si->getEnd() + 1, 0); + FlagPressure.resize(si->getEnd() + 1, 0); + } + for (unsigned n = si->getStart(); n != si->getEnd(); ++n) { + Pressure[n] += Bytes; + if (IsFlag) + FlagPressure[n] += Bytes; + } + } + OS << "\n"; + } + OS << "\n"; + // Prepare to print register pressure info. First we need to compute a + // mapping from instruction number to instruction. Only bother with + // instructions with debug info. + std::vector Insts; + for (auto fgi = FG->begin(), fge = FG->end(); fgi != fge; ++fgi) { + Function *F = *fgi; + for (auto fi = F->begin(), fe = F->end(); fi != fe; ++fi) { + BasicBlock *BB = &*fi; + for (auto bi = BB->begin(), be = BB->end(); bi != be; ++bi) { + Instruction *Inst = &*bi; + if (!Inst->getDebugLoc()) { + unsigned Num = Numbering->getNumber(Inst); + if (Num >= Insts.size()) + Insts.resize(Num + 1, nullptr); + Insts[Num] = Inst; + } + } + } + } + OS << "Register pressure (bytes):\n"; + unsigned Last = 0; + bool HadInst = false; + Function *LastFunc = nullptr; + for (unsigned n = 0; n != Pressure.size(); ++n) { + if (Pressure[n]) { + Instruction *Inst = nullptr; + if (n < Insts.size()) + Inst = Insts[n]; + if (Pressure[n] != Last) + HadInst = false; + if (Pressure[n] != Last || (!HadInst && Inst)) { + if (Inst && Inst->getParent()->getParent() != LastFunc) { + LastFunc = Inst->getParent()->getParent(); + OS << "In " << LastFunc->getName() << "\n"; + } + Last = Pressure[n]; + OS << Pressure[n] << " at " << n; + if (Inst) { + HadInst = true; + OS << " "; + const DebugLoc &DL = Inst->getDebugLoc(); + DL.print(OS); + } + OS << "\n"; + } + } + } + OS << "Flag pressure (bytes):\n"; + Last = 0; + HadInst = false; + for (unsigned n = 0; n != FlagPressure.size(); ++n) { + Instruction *Inst = nullptr; + if (n < Insts.size()) + Inst = Insts[n]; + if (FlagPressure[n] != Last) + HadInst = false; + if (FlagPressure[n] != Last || (!HadInst && Inst)) { + Last = FlagPressure[n]; + OS << FlagPressure[n] << " at " << n; + if (Inst) { + HadInst = true; + const DebugLoc &DL = Inst->getDebugLoc(); + OS << " " << DL->getFilename() << ":" << DL.getLine(); + } + OS << "\n"; + } + } +} + +/*********************************************************************** + * RegNum::print : print a regnum + */ +void GenXVisaRegAlloc::Reg::print(raw_ostream &OS) const +{ + switch (Category) { + case RegCategory::NONE: OS << "-"; return; + case RegCategory::GENERAL: OS << "v"; break; + case RegCategory::ADDRESS: OS << "a"; break; + case RegCategory::PREDICATE: OS << "p"; break; + case RegCategory::SAMPLER: OS << "s"; break; + case RegCategory::SURFACE: OS << "t"; break; + case 
RegCategory::VME: OS << "vme"; break; + default: OS << "?"; break; + } + OS << Num; +} + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.h new file mode 100644 index 000000000000..bf89347ea5cf --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVisaRegAlloc.h @@ -0,0 +1,253 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXVisaRegAlloc +/// ---------------- +/// +/// GenXVisaRegAlloc is a function group pass that allocates vISA registers to +/// LLVM IR values. +/// +/// Before allocating registers, this pass does "extra coalescing", over and above +/// what GenXCoalescing does. Two otherwise independent live ranges that are +/// related by being an operand and the result of the same instruction (and are +/// the same type) get coalesced and thus allocated into the same register. +/// +/// However, extra coalescing is not performed when the result of the instruction +/// is used in a non-alu intrinsic, to try and avoid the danger of the jitter +/// needing to add an extra move in the send. +/// +/// Other than that, all this pass does is allocate a different vISA register to +/// each LiveRange. +/// +/// The pass is also an analysis for GenXKernelBuilder to query to find out +/// what vISA register is allocated to a particular Value. In fact, the query +/// from GenXKernelBuilder can specify what type it wants the register to be, +/// and it is at that point that an alias is allocated if there is no existing +/// alias of the requested type. +/// +/// Finally, there are callbacks in the analysis to generate the vISA variable +/// tables to put into the vISA file. 
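+///
+/// As a rough illustration of that query interface (an editorial sketch, not
+/// code from this patch; names such as KernelFunc, SV and Ctx are
+/// placeholders), a consumer running after this pass might ask for a typed
+/// view of a value's register like this:
+///
+///   auto &RA = getAnalysis<GenXVisaRegAlloc>();
+///   // SV is the genx::SimpleValue being emitted, KernelFunc its kernel.
+///   GenXVisaRegAlloc::Reg *R = RA.getRegForValue(
+///       KernelFunc, SV, genx::UNSIGNED, Type::getInt32Ty(Ctx));
+///   // If SV's register was allocated with a different type or signedness,
+///   // an alias register of the requested type is created and returned.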
+///
+//===----------------------------------------------------------------------===//
+#ifndef GENXVISAREGALLOC_H
+#define GENXVISAREGALLOC_H
+
+#include "FunctionGroup.h"
+#include "GenX.h"
+#include "GenXLiveness.h"
+#include "GenXModule.h"
+#include "vc/GenXOpts/Utils/RegCategory.h"
+#include "visaBuilder_interface.h"
+#include
+#include
+#include
+
+namespace llvm {
+
+  class Function;
+  class FunctionPass;
+  class raw_ostream;
+  class Type;
+  class Value;
+
+  FunctionGroupPass *createGenXGroupPrinterPass(raw_ostream &O, const std::string &Banner);
+
+  // GenXVisaRegAlloc : vISA virtual register allocator pass
+  class GenXVisaRegAlloc : public FunctionGroupPass {
+  public:
+
+    // Reg : a virtual register
+    class Reg {
+    public:
+      unsigned short Category = genx::RegCategory::NONE;
+      // Register ID. Its first value depends on the number of predefined
+      // variables in the category, e.g. for general variables it is 32.
+      unsigned short Num = 0;
+      // Pointer to the register that is aliased by this register.
+      Reg* AliasTo = nullptr;
+      // Singly linked list storing all aliases of the real register.
+      std::map<const Function*, Reg*> NextAlias;
+      genx::Signedness Signed = genx::DONTCARESIGNED;
+      Type *Ty = nullptr;
+      // log2 of the minimum alignment requested by users of the register.
+      unsigned Alignment;
+      // String representation of the register, normally a combination of
+      // Category and Num.
+      std::string NameStr;
+      // Attributes
+      std::vector<std::pair<unsigned, std::string>> Attributes;
+      // Pointer to the VISA variable. It is set by CisaBuilder when it creates
+      // VISA variables for all registers in RegMap.
+      std::map<VISAKernel*, void*> GenVar;
+
+      explicit Reg(
+          unsigned Category,
+          unsigned Num,
+          Type *Ty = 0,
+          genx::Signedness Signed = genx::DONTCARESIGNED,
+          unsigned LogAlignment = 0,
+          Reg* AliasTo = nullptr)
+          : Category(Category), Num(Num), AliasTo(AliasTo), Signed(Signed),
+            Ty(Ty), Alignment(LogAlignment) {
+        static const char* Prefix[] = { "ERR", "V", "A", "P", "S", "T", "VME" };
+        assert(Category && Category < genx::RegCategory::NUMREALCATEGORIES);
+        NameStr = Prefix[Category] + std::to_string(Num);
+      }
+
+      // Get the VISA variable assigned to the register.
+      // The template parameter T is only a cast for the return type. Ideally
+      // we would assert here that the requested type matches the real type
+      // stored in GenVar.
+      template <class T>
+      T* GetVar(VISAKernel* F) {
+        return reinterpret_cast<T*>(GenVar[F]);
+      }
+
+      // Set the VISA variable for the current register.
+      void SetVar(VISAKernel *F, void* Var) {
+        GenVar[F] = Var;
+      }
+
+      void addAttribute(unsigned AttrName, Twine AttrVal) {
+        Attributes.push_back(std::make_pair(AttrName, AttrVal.str()));
+      }
+
+      void print(raw_ostream &OS) const;
+    };
+
+    using RegPushHook = void(*)(void* Object, Reg&);
+    using KernRegMap_t = std::map<genx::SimpleValue, Reg*>;
+    using RegMap_t = std::map<const Function*, KernRegMap_t>;
+  private:
+    FunctionGroup *FG;
+    GenXLiveness *Liveness;
+    GenXNumbering *Numbering;
+    FunctionGroupAnalysis *FGA;
+
+    // pushReg callback that will be called once a new register is created.
+    RegPushHook TheRegPushHook = nullptr;
+    // Object that will be passed to the hook, typically the 'this' pointer of
+    // the hook's owner.
+    void* TheRegPushHookObject = nullptr;
+
+    // Storage for all created registers. A std::list is used because we keep
+    // pointers to the stored registers, so the storage must never reallocate.
+    std::list<Reg> RegStorage;
+    // Map from LLVM Value to the register associated with it.
+    RegMap_t RegMap;
+    // List of pointers to predefined surface registers.
+    std::vector<Reg*> PredefinedSurfaceRegs;
+
+    // Array of the current indices being assigned to new registers, one per
+    // register category.
+ unsigned CurrentRegId[genx::RegCategory::NUMREALCATEGORIES]; + + public: + static char ID; + explicit GenXVisaRegAlloc() : FunctionGroupPass(ID) { } + virtual StringRef getPassName() const { return "GenX vISA virtual register allocator"; } + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnFunctionGroup(FunctionGroup &FG); + + std::list& getRegStorage() { + return RegStorage; + } + // Get the vISA virtual register for a value (assert if none) + Reg* getRegForValue(const Function *kernel, genx::SimpleValue V, + genx::Signedness Signed = genx::DONTCARESIGNED, Type *OverrideType = 0) + { + Reg* R = getRegForValueOrNull(kernel, V, Signed, OverrideType); + assert(R && "no register allocated for this value"); + return R; + } + // Get the vISA virtual register for a value or nullptr if there is no + // register associated with given value. + Reg* getRegForValueOrNull(const Function *kernel, genx::SimpleValue V, + genx::Signedness Signed = genx::DONTCARESIGNED, Type *OverrideType = 0); + + // Get the vISA virtual register for a value (0 if none), ignoring type + // and signedness so it can be a const method usable from print(). + Reg* getRegForValueUntyped(const Function* kernel, genx::SimpleValue V) const; + + // Get the signedness of a register. + genx::Signedness getSigned(Reg* R); + + // Set callback that will be called each time new register is created. + // It is used in CisaBuilder when new aliases are created. + void SetRegPushHook(void* Object, RegPushHook Callback) { + TheRegPushHook = Callback; + TheRegPushHookObject = Object; + } + + // Create new register and push it in storage. + // If RegPushHook was specified it will be called with created register as + // parameter. Thus, all needed register's variables must be specified + // at this moment, for example AliasTo. + template + Reg* createReg(unsigned Category, Args&& ... args) { + RegStorage.emplace_back(Category, CurrentRegId[Category]++, + std::forward(args) ...); + Reg& R = RegStorage.back(); + if (TheRegPushHook) + TheRegPushHook(TheRegPushHookObject, R); + return &R; + } + + // createPrinterPass : get a pass to print the IR, together with the GenX + // specific analyses + virtual Pass *createPrinterPass(raw_ostream &O, const std::string &Banner) const + { return createGenXGroupPrinterPass(O, Banner); } + // print : dump the state of the pass. This is used by -genx-dump-regalloc + virtual void print(raw_ostream &O, const Module *M) const; + private: + Type *BoolTy; + void getLiveRanges(std::vector *LRs) const; + void getLiveRangesForValue(Value *V, std::vector *LRs) const; + void extraCoalescing(); + void allocReg(genx::LiveRange *LR); + public: + // Add special RetIP argument. + Reg* getRetIPArgument() const { return RetIP; } + void addRetIPArgument(); + private: + unsigned CoalescingCount = 0; + Reg* RetIP; + }; + + namespace visa { + // Details of a type required for a vISA general register declaration + // or an indirect operand. 
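+ // For example, an <8 x i32> value is described as NumElements = 8,
+ // BytesPerElement = 4 and VisaType = ISA_TYPE_D (ISA_TYPE_UD when an
+ // unsigned view is requested); see the TypeDetails constructor in
+ // GenXVisaRegAlloc.cpp.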
+ struct TypeDetails { + const DataLayout &DL; + unsigned NumElements; + unsigned BytesPerElement; + unsigned VisaType; + TypeDetails(const DataLayout &DL, Type *Ty, genx::Signedness Signed); + }; + } // end namespace visa + + void initializeGenXVisaRegAllocPass(PassRegistry &); + +} // end namespace llvm +#endif //ndef GENXVISAREGALLOC_H + diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.cpp new file mode 100644 index 000000000000..bda27d3d1859 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.cpp @@ -0,0 +1,34 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "GenXWATable.h" + +using namespace llvm; + +char GenXWATable::ID = 0; + +INITIALIZE_PASS_BEGIN(GenXWATable, "GenXWATable", "GenXWATable", false, true) +INITIALIZE_PASS_END(GenXWATable, "GenXWATable", "GenXWATable", false, true) diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.h new file mode 100644 index 000000000000..4274db163e0e --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXWATable.h @@ -0,0 +1,57 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#ifndef VCOPT_LIB_GENXCODEGEN_GENXWATABLE_H +#define VCOPT_LIB_GENXCODEGEN_GENXWATABLE_H + +#include + +#include + +namespace llvm { + +void initializeGenXWATablePass(PassRegistry &PR); + +// Transparent wrapper around driver WA_TABLE. +class GenXWATable : public ImmutablePass { + WA_TABLE *WaTable = nullptr; + +public: + static char ID; + + GenXWATable() : ImmutablePass(ID) {} + + GenXWATable(WA_TABLE *Table) : ImmutablePass(ID), WaTable{Table} { + initializeGenXWATablePass(*PassRegistry::getPassRegistry()); + } + + // This can return nullptr which means that we don't know + // workarounds for current platform. + WA_TABLE *getWATable() const { return WaTable; } +}; +} // namespace llvm + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXWrapper.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXWrapper.cpp new file mode 100644 index 000000000000..d80fb2907f2e --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/GenXWrapper.cpp @@ -0,0 +1,717 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#if defined(__linux__) +#include +#endif + +#include "GenXOCLRuntimeInfo.h" +#include "GenXWATable.h" + +#include "llvmWrapper/Target/TargetMachine.h" + +#include "vc/GenXCodeGen/GenXTarget.h" +#include "vc/GenXCodeGen/GenXWrapper.h" +#include "vc/GenXOpts/GenXOpts.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "vc/Support/Options.h" +#include "vc/Support/Status.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/GenXIntrinsics/GenXSPIRVReaderAdaptor.h" +#include "llvm/GenXIntrinsics/GenXIntrOpts.h" + +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/Triple.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/PassManagerBuilder.h" +#include "llvm/Transforms/Scalar.h" + +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +static Expected> translateSPIRVToIR(ArrayRef Input) { +#if defined(_WIN64) + //TODO: rename to SPIRVDLL64.dll when binary components are fixed + constexpr char *SpirvLibName = "SPIRVDLL.dll"; +#elif defined(_WIN32) + constexpr char *SpirvLibName = "SPIRVDLL32.dll"; +#else + constexpr char *SpirvLibName = "libSPIRVDLL.so"; +#endif + constexpr char *SpirvReadVerifyName = "spirv_read_verify_module"; + using SpirvReadVerifyType = + int(const char *pIn, size_t InSz, + void (*OutSaver)(const char *pOut, size_t OutSize, void *OutUserData), + void *OutUserData, + void (*ErrSaver)(const char *pErrMsg, void *ErrUserData), + void *ErrUserData); + +#if defined(__linux__) + // Hack to workaround cmoc crashes during loading of SPIRV library + static auto DeepBindHack = dlopen(SpirvLibName, RTLD_NOW | RTLD_DEEPBIND); +#endif // __linux__ + + using DL = sys::DynamicLibrary; + std::string ErrMsg; + DL DyLib = DL::getPermanentLibrary(SpirvLibName, &ErrMsg); + if (!DyLib.isValid()) + return make_error(ErrMsg); + + auto *SpirvReadVerifyFunc = reinterpret_cast( + DyLib.getAddressOfSymbol(SpirvReadVerifyName)); + if (!SpirvReadVerifyFunc) + return make_error(SpirvLibName, SpirvReadVerifyName); + + auto OutSaver = [](const char *pOut, size_t OutSize, void *OutData) { + auto *Vec = reinterpret_cast *>(OutData); + Vec->assign(pOut, pOut + OutSize); + }; + auto ErrSaver = [](const char *pErrMsg, void *ErrData) { + auto *ErrStr = reinterpret_cast(ErrData); + *ErrStr = pErrMsg; + }; + + std::vector Result; + int Status = SpirvReadVerifyFunc(Input.data(), Input.size(), OutSaver, + &Result, ErrSaver, &ErrMsg); + if (Status != 0) + return make_error(ErrMsg); + + return {std::move(Result)}; +} + +static Expected> getModule(ArrayRef Input, + LLVMContext &C) { 
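+ // Deserialize the SPIR-V blob: translate it to LLVM bitcode through the
+ // dynamically loaded SPIRV library, parse the bitcode into a Module and run
+ // the IR verifier on the result before returning it.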
+ auto ExpIR = translateSPIRVToIR(Input); + if (!ExpIR) + return ExpIR.takeError(); + + std::vector &IR = ExpIR.get(); + llvm::MemoryBufferRef BufferRef(llvm::StringRef(IR.data(), IR.size()), + "Deserialized SPIRV Module"); + auto ExpModule = llvm::parseBitcodeFile(BufferRef, C); + + if (!ExpModule) + return llvm::handleExpected( + std::move(ExpModule), + []() -> llvm::Error { + llvm_unreachable("Should create new error"); + // Without this dead return MSVC fails with ICE in release-32bit. + return llvm::Error::success(); + }, + [](const llvm::ErrorInfoBase &E) { + return make_error(E.message()); + }); + + if (verifyModule(*ExpModule.get())) + return make_error(); + + return ExpModule; +} + +static void dumpModuleToTemp(const Module &M, const char *Name) { + int FD = -1; + auto EC = sys::fs::openFileForWrite( + Name, FD, sys::fs::CD_CreateAlways, sys::fs::F_None); + if (EC) { + llvm::errs() << "Can not open file: " << Name << "\n"; + return; + } + + raw_fd_ostream O(FD, /*shouldClose=*/true); + M.print(O, nullptr); +} + +static void dumpDataToTemp(StringRef S, const char *Name) { + int FD = -1; + auto EC = sys::fs::openFileForWrite( + Name, FD, sys::fs::CD_CreateAlways, sys::fs::F_None); + if (EC) { + llvm::errs() << "Can not open file: " << Name << "\n"; + return; + } + + raw_fd_ostream O(FD, /*shouldClose=*/true); + O << S; +} + +static vc::ocl::ArgInfo +convertOCLArgInfo(const GenXOCLRuntimeInfo::KernelArgInfo &Info) { + vc::ocl::ArgInfo Converted; + + using ArgKind = GenXOCLRuntimeInfo::KernelArgInfo::KindType; + switch (Info.getKind()) { + case ArgKind::General: + Converted.Kind = vc::ocl::ArgKind::General; + break; + case ArgKind::LocalSize: + Converted.Kind = vc::ocl::ArgKind::LocalSize; + break; + case ArgKind::GroupCount: + Converted.Kind = vc::ocl::ArgKind::GroupCount; + break; + case ArgKind::Buffer: + Converted.Kind = vc::ocl::ArgKind::Buffer; + break; + case ArgKind::SVM: + Converted.Kind = vc::ocl::ArgKind::SVM; + break; + case ArgKind::Sampler: + Converted.Kind = vc::ocl::ArgKind::Sampler; + break; + case ArgKind::Image1D: + Converted.Kind = vc::ocl::ArgKind::Image1d; + break; + case ArgKind::Image2D: + Converted.Kind = vc::ocl::ArgKind::Image2d; + break; + case ArgKind::Image3D: + Converted.Kind = vc::ocl::ArgKind::Image3d; + break; + case ArgKind::PrintBuffer: + Converted.Kind = vc::ocl::ArgKind::PrintBuffer; + break; + case ArgKind::PrivateBase: + Converted.Kind = vc::ocl::ArgKind::PrivateBase; + break; + } + + using ArgAccessKind = GenXOCLRuntimeInfo::KernelArgInfo::AccessKindType; + switch (Info.getAccessKind()) { + case ArgAccessKind::None: + Converted.AccessKind = vc::ocl::ArgAccessKind::None; + break; + case ArgAccessKind::ReadOnly: + Converted.AccessKind = vc::ocl::ArgAccessKind::ReadOnly; + break; + case ArgAccessKind::WriteOnly: + Converted.AccessKind = vc::ocl::ArgAccessKind::WriteOnly; + break; + case ArgAccessKind::ReadWrite: + Converted.AccessKind = vc::ocl::ArgAccessKind::ReadWrite; + break; + } + + Converted.Index = Info.getIndex(); + Converted.Offset = Info.getOffset(); + Converted.SizeInBytes = Info.getSizeInBytes(); + Converted.BTI = Info.getBTI(); + + return Converted; +} + +static void convertOCLKernelInfo(vc::ocl::KernelInfo &Converted, + const GenXOCLRuntimeInfo::KernelInfo &Info) { + Converted.Name = Info.getName(); + std::transform(Info.arg_begin(), Info.arg_end(), + std::back_inserter(Converted.Args), + [](const GenXOCLRuntimeInfo::KernelArgInfo &ArgInfo) { + return convertOCLArgInfo(ArgInfo); + }); + Converted.PrintStrings = 
Info.getPrintStrings(); + Converted.HasGroupID = Info.usesGroupId(); + Converted.HasBarriers = Info.usesBarriers(); + Converted.SLMSize = Info.getSLMSize(); + Converted.ThreadPrivateMemSize = Info.getTPMSize(); + Converted.StatelessPrivateMemSize = Info.getStatelessPrivMemSize(); + Converted.GRFSizeInBytes = Info.getGRFSizeInBytes(); + + if (Info.getRelocationTable().Size > 0) { + Converted.RelocationTable.Buf = Info.getRelocationTable().Buffer; + Converted.RelocationTable.Size = Info.getRelocationTable().Size; + Converted.RelocationTable.NumEntries = + Info.getRelocationTable().Entries; + } + if (Info.getSymbolTable().Size > 0) { + Converted.SymbolTable.Buf = Info.getSymbolTable().Buffer; + Converted.SymbolTable.Size = Info.getSymbolTable().Size; + Converted.SymbolTable.NumEntries = Info.getSymbolTable().Entries; + } +} + + +static std::vector convertInternalOCLInfo( + const std::vector &CompiledKernels) { + std::vector Converted{CompiledKernels.size()}; + for (unsigned i = 0, e = CompiledKernels.size(); i != e; ++i) { + auto &Conv = Converted[i]; + auto &Orig = CompiledKernels[i]; + convertOCLKernelInfo(Conv.KernelInfo, Orig.getKernelInfo()); + Conv.JitInfo = Orig.getJitterInfo(); + Conv.GenBinary = Orig.getGenBinary(); + } + return Converted; +} + +static Triple overrideTripleWithVC(StringRef TripleStr) { + Triple T{TripleStr}; + // Normalize triple. + bool Is32Bit = T.isArch32Bit(); + if (TripleStr.startswith("genx32")) + Is32Bit = true; + return Triple{Is32Bit ? "genx32-unknown-unknown" : "genx64-unknown-unknown"}; +} + +static std::string getSubtargetFeatureString(const vc::CompileOptions &Opts) { + SubtargetFeatures Features; + if (Opts.NoVecDecomp) + Features.AddFeature("disable_vec_decomp"); + if (Opts.Runtime == vc::RuntimeKind::OpenCL) + Features.AddFeature("ocl_runtime"); + return Features.getString(); +} + +static CodeGenOpt::Level getCodeGenOptLevel(const vc::CompileOptions &Opts) { + if (Opts.OptLevel == vc::OptimizerLevel::None) + return CodeGenOpt::None; + return CodeGenOpt::Default; +} + +static Expected> +createTargetMachine(const vc::CompileOptions &Opts, Triple &TheTriple) { + std::string Error; + const Target *TheTarget = + TargetRegistry::lookupTarget(TheTriple.getArchName(), TheTriple, Error); + assert(TheTarget && "vc target was not registered"); + + const std::string FeaturesStr = getSubtargetFeatureString(Opts); + // These ones do not look useful for now. Maybe will be adjusted + // later to account for fp model. 
+ const TargetOptions Options; + CodeGenOpt::Level OptLevel = getCodeGenOptLevel(Opts); + std::unique_ptr TM{ + TheTarget->createTargetMachine(TheTriple.getTriple(), Opts.CPUStr, + FeaturesStr, Options, /*RelocModel=*/None, + /*CodeModel=*/None, OptLevel)}; + if (!TM) + return make_error(); + return {std::move(TM)}; +} + +static void optimizeIR(const vc::CompileOptions &Opts, TargetMachine &TM, + Module &M) { + legacy::PassManager PerModulePasses; + legacy::FunctionPassManager PerFunctionPasses(&M); + + PerModulePasses.add( + createTargetTransformInfoWrapperPass(TM.getTargetIRAnalysis())); + PerFunctionPasses.add( + createTargetTransformInfoWrapperPass(TM.getTargetIRAnalysis())); + + unsigned OptLevel; + if (Opts.OptLevel == vc::OptimizerLevel::None) + OptLevel = 0; + else + OptLevel = 2; + + PassManagerBuilder PMBuilder; + PMBuilder.Inliner = createFunctionInliningPass(2, 2, false); + PMBuilder.OptLevel = OptLevel; + PMBuilder.SizeLevel = OptLevel; + PMBuilder.SLPVectorize = false; + PMBuilder.LoopVectorize = false; + PMBuilder.DisableUnrollLoops = false; + PMBuilder.MergeFunctions = false; + PMBuilder.PrepareForThinLTO = false; + PMBuilder.PrepareForLTO = false; + PMBuilder.RerollLoops = true; + + TM.adjustPassManager(PMBuilder); + + PMBuilder.populateFunctionPassManager(PerFunctionPasses); + PMBuilder.populateModulePassManager(PerModulePasses); + + // Do we need per function passes at all? + PerFunctionPasses.doInitialization(); + for (Function &F : M) { + if (!F.isDeclaration()) + PerFunctionPasses.run(F); + } + PerFunctionPasses.doFinalization(); + + PerModulePasses.run(M); +} + +static void dumpFinalOutput(const vc::CompileOptions &Opts, const Module &M, + StringRef IsaBinary) { + if (Opts.DumpIR) + dumpModuleToTemp(M, "final.ll"); + if (Opts.DumpIsa) + dumpDataToTemp(IsaBinary, "final.isa"); +} + +static void populateCodeGenPassManager(const vc::CompileOptions &Opts, + TargetMachine &TM, raw_pwrite_stream &OS, + legacy::PassManager &PM) { + TargetLibraryInfoImpl TLII{TM.getTargetTriple()}; + PM.add(new TargetLibraryInfoWrapperPass(TLII)); + // Non-constant pointer. 
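+ // The driver-provided workaround table may legitimately be null; GenXWATable
+ // treats a null table as "no known workarounds for this platform".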
+ WA_TABLE *WaTable = Opts.WATable.get(); + PM.add(new GenXWATable(WaTable)); + + auto FileType = IGCLLVM::TargetMachine::CodeGenFileType::CGFT_AssemblyFile; + bool AddPasses = + TM.addPassesToEmitFile(PM, OS, nullptr, FileType, /*NoVerify*/ true); + assert(!AddPasses && "Bad filetype for vc-codegen"); +} + +static vc::ocl::CompileOutput runOclCodeGen(const vc::CompileOptions &Opts, + TargetMachine &TM, Module &M) { + legacy::PassManager PM; + + SmallString<32> IsaBinary; + raw_svector_ostream OS(IsaBinary); + raw_null_ostream NullOS; + if (Opts.DumpIsa) + populateCodeGenPassManager(Opts, TM, OS, PM); + else + populateCodeGenPassManager(Opts, TM, NullOS, PM); + + std::vector CompiledKernels; + PM.add(createGenXOCLInfoExtractorPass(CompiledKernels)); + + PM.run(M); + dumpFinalOutput(Opts, M, IsaBinary); + + vc::ocl::CompileOutput Output; + Output.Kernels = convertInternalOCLInfo(CompiledKernels); + Output.PointerSizeInBytes = M.getDataLayout().getPointerSize(); + return Output; +} + +static vc::cm::CompileOutput runCmCodeGen(const vc::CompileOptions &Opts, + TargetMachine &TM, Module &M) { + legacy::PassManager PM; + SmallString<32> IsaBinary; + raw_svector_ostream OS(IsaBinary); + populateCodeGenPassManager(Opts, TM, OS, PM); + PM.run(M); + dumpFinalOutput(Opts, M, IsaBinary); + vc::cm::CompileOutput Output; + Output.IsaBinary.assign(IsaBinary.begin(), IsaBinary.end()); + return Output; +} + +static vc::CompileOutput runCodeGen(const vc::CompileOptions &Opts, + TargetMachine &TM, Module &M) { + switch (Opts.Runtime) { + case vc::RuntimeKind::CM: + return runCmCodeGen(Opts, TM, M); + case vc::RuntimeKind::OpenCL: + return runOclCodeGen(Opts, TM, M); + } + llvm_unreachable("Unknown runtime kind"); +} + +Expected vc::Compile(ArrayRef Input, + const vc::CompileOptions &Opts) { + // Environment variable for additional options for debug purposes. + // This will exit with error if options is incorrect and should not + // be used to pass meaningful options required for compilation. 
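+ // For example, a debug build can be run with the environment variable set,
+ // e.g. IGC_VCCodeGenDebugOpts="-print-after-all", to forward extra LLVM cl
+ // options to the backend (the option value here is purely illustrative, not
+ // something this patch prescribes).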
+#ifndef NDEBUG + constexpr const char *DebugEnvVarName = "IGC_VCCodeGenDebugOpts"; + cl::ParseEnvironmentOptions("vc-codegen", DebugEnvVarName); +#endif + + LLVMContext Context; + LLVMInitializeGenXTarget(); + LLVMInitializeGenXTargetInfo(); + llvm::PassRegistry &Registry = *llvm::PassRegistry::getPassRegistry(); + llvm::initializeTarget(Registry); + + auto ExpModule = getModule(Input, Context); + if (!ExpModule) + return ExpModule.takeError(); + Module &M = *ExpModule.get(); + + legacy::PassManager PerModulePasses; + PerModulePasses.add(createGenXSPIRVReaderAdaptorPass()); + PerModulePasses.add(createGenXRestoreIntrAttrPass()); + PerModulePasses.run(M); + + Triple TheTriple = overrideTripleWithVC(M.getTargetTriple()); + M.setTargetTriple(TheTriple.getTriple()); + + auto ExpTargetMachine = createTargetMachine(Opts, TheTriple); + if (!ExpTargetMachine) + return ExpTargetMachine.takeError(); + TargetMachine &TM = *ExpTargetMachine.get(); + M.setDataLayout(TM.createDataLayout()); + + if (Opts.DumpIR) + dumpModuleToTemp(M, "start.ll"); + + optimizeIR(Opts, TM, M); + + if (Opts.DumpIR) + dumpModuleToTemp(M, "optimized.ll"); + + return runCodeGen(Opts, TM, M); +} + +static Expected +parseOptions(const SmallVectorImpl &Argv, + vc::options::Flags FlagsToInclude) { + const opt::OptTable &Options = vc::getOptTable(); + + const bool IsInternal = FlagsToInclude == vc::options::InternalOption; + + unsigned MissingArgIndex = 0; + unsigned MissingArgCount = 0; + opt::InputArgList InputArgs = + Options.ParseArgs(Argv, MissingArgIndex, MissingArgCount, FlagsToInclude); + if (MissingArgCount) + return make_error(Argv[MissingArgIndex], IsInternal); + + // ocloc uncoditionally passes opencl options to internal options. + // Skip checking of internal options for now. + if (!IsInternal) { + if (opt::Arg *A = InputArgs.getLastArg(vc::options::OPT_UNKNOWN, + vc::options::OPT_INPUT)) { + std::string BadOpt = A->getAsString(InputArgs); + return make_error(BadOpt, IsInternal); + } + } + + return {std::move(InputArgs)}; +} + +static Expected parseApiOptions(StringSaver &Saver, + StringRef ApiOptions) { + SmallVector Argv; + cl::TokenizeGNUCommandLine(ApiOptions, Saver, Argv); + + const opt::OptTable &Options = vc::getOptTable(); + // This can be rewritten to parse options and then check for + // OPT_vc_codegen, but it would be better to manually check for + // this option before any real parsing. If it is missing, + // then no parsing should be done at all. + auto HasOption = [&Argv](const std::string &Opt) { + return std::any_of(Argv.begin(), Argv.end(), + [&Opt](const char *ArgStr) { return Opt == ArgStr; }); + }; + const std::string VCCodeGenOptName = + Options.getOption(vc::options::OPT_vc_codegen).getPrefixedName(); + if (HasOption(VCCodeGenOptName)) + return parseOptions(Argv, vc::options::ApiOption); + // Deprecated -cmc parsing just for compatibility. 
+ const std::string IgcmcOptName = + Options.getOption(vc::options::OPT_igcmc).getPrefixedName(); + if (!sys::Process::GetEnv("IGC_VCAvoidCmcFlag") && HasOption(IgcmcOptName)) + return parseOptions(Argv, vc::options::IgcmcApiOption); + + return make_error(); +} + +static Expected +parseInternalOptions(StringSaver &Saver, StringRef InternalOptions) { + SmallVector Argv; + cl::TokenizeGNUCommandLine(InternalOptions, Saver, Argv); + return parseOptions(Argv, vc::options::InternalOption); +} + +static Error fillApiOptions(const opt::ArgList &ApiOptions, + vc::CompileOptions &Opts) { + if (ApiOptions.hasArg(vc::options::OPT_igcmc)) + Opts.OptLevel = vc::OptimizerLevel::None; + if (ApiOptions.hasArg(vc::options::OPT_no_vector_decomposition)) + Opts.NoVecDecomp = true; + + if (opt::Arg *A = ApiOptions.getLastArg(vc::options::OPT_optimize)) { + StringRef Val = A->getValue(); + auto MaybeLevel = StringSwitch>(Val) + .Case("none", vc::OptimizerLevel::None) + .Case("full", vc::OptimizerLevel::Full) + .Default(None); + if (!MaybeLevel) { + const std::string BadOpt = A->getAsString(ApiOptions); + return make_error(BadOpt, /*IsInternal=*/false); + } + Opts.OptLevel = MaybeLevel.getValue(); + } + + return Error::success(); +} + +static Error fillInternalOptions(const opt::ArgList &InternalOptions, + vc::CompileOptions &Opts) { + if (InternalOptions.hasArg(vc::options::OPT_dump_isa_binary)) + Opts.DumpIsa = true; + if (InternalOptions.hasArg(vc::options::OPT_dump_llvm_ir)) + Opts.DumpIR = true; + + if (opt::Arg *A = InternalOptions.getLastArg(vc::options::OPT_runtime)) { + StringRef Val = A->getValue(); + auto MaybeRuntime = StringSwitch>(Val) + .Case("cm", vc::RuntimeKind::CM) + .Case("ocl", vc::RuntimeKind::OpenCL) + .Default(None); + if (!MaybeRuntime) { + const std::string BadOpt = A->getAsString(InternalOptions); + return make_error(BadOpt, /*IsInternal=*/true); + } + Opts.Runtime = MaybeRuntime.getValue(); + } + + if (InternalOptions.hasArg(vc::options::OPT_help)) { + constexpr const char *Usage = "-options \"-vc-codegen [options]\""; + constexpr const char *Title = "Vector compiler options"; + constexpr unsigned FlagsToInclude = vc::options::ApiOption; + constexpr unsigned FlagsToExclude = 0; + constexpr bool ShowAllAliases = false; + vc::getOptTable().PrintHelp(llvm::errs(), Usage, Title, FlagsToInclude, + FlagsToExclude, ShowAllAliases); + } + if (InternalOptions.hasArg(vc::options::OPT_help_internal)) { + constexpr const char *Usage = + "-options \"-vc-codegen\" -internal_options \"[options]\""; + constexpr const char *Title = "Vector compiler internal options"; + constexpr unsigned FlagsToInclude = vc::options::InternalOption; + constexpr unsigned FlagsToExclude = 0; + constexpr bool ShowAllAliases = false; + vc::getOptTable().PrintHelp(llvm::errs(), Usage, Title, FlagsToInclude, + FlagsToExclude, ShowAllAliases); + } + + return Error::success(); +} + +static Expected +fillOptions(const opt::ArgList &ApiOptions, + const opt::ArgList &InternalOptions) { + vc::CompileOptions Opts; + Error Status = fillApiOptions(ApiOptions, Opts); + if (Status) + return {std::move(Status)}; + + Status = fillInternalOptions(InternalOptions, Opts); + if (Status) + return {std::move(Status)}; + + return {std::move(Opts)}; +} + +// Parse global llvm cl options. +// Parsing of cl codegen options should not fail under any circumstances. +static void parseLLVMOptions(const opt::ArgList &Args) { + // Need to control cl options as vector compiler still uses these ones + // to control compilation process. 
This will be addressed later. + llvm::cl::ResetAllOptionOccurrences(); + BumpPtrAllocator Alloc; + StringSaver Saver{Alloc}; + SmallVector Argv{"vc-codegen"}; + for (const std::string &ArgPart : + Args.getAllArgValues(vc::options::OPT_llvm_options)) + cl::TokenizeGNUCommandLine(ArgPart, Saver, Argv); + cl::ParseCommandLineOptions(Argv.size(), Argv.data()); +} + +// Derive llvm options from different API and internal options. +static opt::DerivedArgList +composeLLVMArgs(const opt::InputArgList &ApiArgs, + const opt::InputArgList &InternalArgs, + llvm::StringSaver &Saver) { + const opt::OptTable &Options = vc::getOptTable(); + const opt::Option LLVMOpt = Options.getOption(vc::options::OPT_llvm_options); + + // Pass through old value. + opt::DerivedArgList UpdatedArgs{InternalArgs}; + if (const opt::Arg *BaseArg = + InternalArgs.getLastArg(vc::options::OPT_llvm_options)) + UpdatedArgs.AddSeparateArg(BaseArg, LLVMOpt, BaseArg->getValue()); + + // Add visaopts if any. + if (opt::Arg *VisaArg = ApiArgs.getLastArg(vc::options::OPT_igcmc_visaopts)) { + StringRef WrappedVisaOpts = + Saver.save(Twine{"-finalizer-opts='"} + VisaArg->getValue() + "'"); + UpdatedArgs.AddSeparateArg(VisaArg, LLVMOpt, WrappedVisaOpts); + } + + + // Stack memory size. + if (opt::Arg *StackMemSizeArg = + ApiArgs.getLastArg(vc::options::OPT_igcmc_stack_size)) { + StringRef MemSizeRef = Saver.save(StackMemSizeArg->getAsString(ApiArgs)); + UpdatedArgs.AddSeparateArg(StackMemSizeArg, LLVMOpt, MemSizeRef); + } + + + return UpdatedArgs; +} + +llvm::Expected +vc::ParseOptions(llvm::StringRef ApiOptions, llvm::StringRef InternalOptions) { + llvm::BumpPtrAllocator Alloc; + llvm::StringSaver Saver{Alloc}; + auto ExpApiArgList = parseApiOptions(Saver, ApiOptions); + if (!ExpApiArgList) + return ExpApiArgList.takeError(); + const opt::InputArgList &ApiArgs = ExpApiArgList.get(); + + auto ExpInternalArgList = parseInternalOptions(Saver, InternalOptions); + if (!ExpInternalArgList) + return ExpInternalArgList.takeError(); + const opt::InputArgList &InternalArgs = ExpInternalArgList.get(); + + // Prepare additional llvm options (like finalizer args). + opt::DerivedArgList LLVMArgs = composeLLVMArgs(ApiArgs, InternalArgs, Saver); + + // This is a temporary solution until we remove all cl options that + // are accesible by user and affect compilation. + parseLLVMOptions(LLVMArgs); + + return fillOptions(ApiArgs, InternalArgs); +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/IgnoreRAUWValueMap.h b/IGC/VectorCompiler/lib/GenXCodeGen/IgnoreRAUWValueMap.h new file mode 100644 index 000000000000..0fb00d7050b6 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/IgnoreRAUWValueMap.h @@ -0,0 +1,42 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef IGNORERAUWVALUEMAP_H +#define IGNORERAUWVALUEMAP_H +#include "llvm/IR/ValueMap.h" + +namespace llvm { + +// Configuration for ValueMap that ignores RAUW, instead of moving the map +// entry. +template +struct IgnoreRAUWValueMapConfig : public ValueMapConfig { + enum { FollowRAUW = false }; +}; + +} // End llvm namespace + +#endif // ndef IGNORERAUWVALUEMAP_H diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/IsaDescription.h b/IGC/VectorCompiler/lib/GenXCodeGen/IsaDescription.h new file mode 100644 index 000000000000..86f5b0ec18ba --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/IsaDescription.h @@ -0,0 +1,254 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +#pragma once +#include + +/// +/// ISA Description +/// + +#define TYPE_INTEGER ISA_TYPE_UW|ISA_TYPE_W|ISA_TYPE_UB|ISA_TYPE_B|ISA_TYPE_D|ISA_TYPE_UD|ISA_TYPE_Q|ISA_TYPE_UQ +#define TYPE_FLOAT ISA_TYPE_DF|ISA_TYPE_F +#define TYPE_FLOAT_ALL ISA_TYPE_DF|ISA_TYPE_F|ISA_TYPE_HF +#define TYPE_ANY TYPE_INTEGER | TYPE_FLOAT + +#define SIZEOF_CISA_OPCODE sizeof(unsigned char) +#define OPND_DST_GEN 0x100 +#define OPND_SRC_GEN 0x200 +#define OPND_DST_INDIR 0x400 +#define OPND_SRC_INDIR 0x800 +#define OPND_DST_PRED 0x1000 +#define OPND_SRC_PRED 0x2000 +#define OPND_DST_ADDR 0x4000 +#define OPND_SRC_ADDR 0x8000 +#define OPND_ADDRESS_OF 0x10000 +#define OPND_SURFACE 0x20000 +#define OPND_SAMPLE 0x40000 +#define OPND_IMM 0x100000 +#define OPND_PRED 0x200000 +#define OPND_OTHER 0x400000 +#define OPND_RAW_SRC 0x800000 +#define OPND_RAW_DST 0x1000000 + +#define OPND_VECTOR_SRC_G_IMM_AO OPND_SRC_GEN | OPND_IMM | OPND_ADDRESS_OF +#define OPND_VECTOR_SRC_G_I_IMM_AO OPND_SRC_GEN | OPND_IMM |OPND_SRC_INDIR | OPND_ADDRESS_OF +#define OPND_VECTOR_SRC_G_I_IMM OPND_SRC_GEN | OPND_IMM |OPND_SRC_INDIR +#define OPND_VECTOR_SRC_G_I_IMM_A_AO OPND_SRC_GEN | OPND_IMM |OPND_SRC_INDIR | OPND_SRC_ADDR | OPND_ADDRESS_OF +#define OPND_VECTOR_SRC_G_I_IMM_P_AO OPND_SRC_GEN | OPND_IMM |OPND_SRC_INDIR | OPND_SRC_PRED | OPND_ADDRESS_OF +#define OPND_VECTOR_SRC_G_A_AO OPND_SRC_GEN | OPND_SRC_ADDR | OPND_ADDRESS_OF +#define OPND_VECTOR_SRC_G_I OPND_SRC_GEN | OPND_SRC_INDIR + +#define OPND_VECTOR_DST_G_I OPND_DST_GEN | OPND_DST_INDIR +#define OPND_VECTOR_DST_G_I_A OPND_DST_GEN | OPND_DST_INDIR | OPND_DST_ADDR +#define OPND_VECTOR_DST_G_I_P OPND_DST_GEN | OPND_DST_PRED | OPND_DST_INDIR + +#define OPND_VECTOR_SRC OPND_SRC_GEN | OPND_IMM |OPND_SRC_INDIR | OPND_SRC_ADDR | OPND_ADDRESS_OF | OPND_SRC_PRED +#define OPND_VECTOR_DST OPND_DST_GEN | OPND_DST_INDIR | OPND_DST_ADDR | OPND_DST_PRED + +#define OPND_SPECIAL OPND_SAMPLE | OPND_SURFACE + +#define SAME_DATA_TYPE 0x1 +#define SAME_SPECIAL_KIND 0x2 + +#define OPND_BLOCK_WIDTH OPND_IMM +#define OPND_BLOCK_HEIGHT OPND_IMM +#define OPND_PLANE OPND_IMM + +#define OPND_SIMB_INDEX OPND_IMM +#define OPND_NUM_OPNDS OPND_IMM +#define OPND_KIND OPND_IMM + +typedef enum { + SIZE_1 = 1, + SIZE_2 = 2, + SIZE_4 = 4, + SIZE_8 = 8 +} SpecificSize; + +typedef enum { + HORIZON_STRIDE_1 = 1, + HORIZON_VERTICAL_STRIDE_0, + HORIZON_STRIDE_2, + ELEM_NUM_2, + ELEM_NUM_4, + ELEM_NUM_8_16, + ELEM_NUM_96, + ELEM_NUM_128, + ELEM_NUM_224, + ELEM_NUM_GE_2, + ELEM_NUM_GE_16, + ELEM_NUM_GE_32, + ELEM_NUM_GE_128, + ELEM_NUM_GE_160, + ELEM_NUM_MC32, + ELEM_NUM_MC16, + SIZE_54, + SIZE_128, + SIZE_192, + SIZE_224, + SIZE_228, + SIZE_352, + SIZE_SIZE, + OWORD_SIZE, + GE_4, + VALUE_0_3, + VALUE_1_32, + VALUE_1_64, + SINGLE_DATA_TYPE, + PREDICATE_NONEPRED_OPND, + SCALAR_REGION, + LABEL_BLOCK_C, + LABEL_FUNC_C, + SIZE_GE_WIDTH_M_HIEGH, + GE_READSIZE, + GE_WRITESIZE, + SIZE_STREAM_MODE_DEPENDENT_1, + SIZE_STREAM_MODE_DEPENDENT_2, + SIZE_STREAM_MODE_DEPENDENT_3, + SIZE_STREAM_MODE_DEPENDENT_4, + LENGHT_LESS_256, + GRF_ALIGNED = 0x100, + SAT_C = 0x200, + SAT_FLOAT_ONLY = 0x400 + + //GATHER: UPPER_BITS_IGNORE, + // LINENUM: LARGE_THAN_0, + //SIZE_BLOCK_HEIGH_WIDTH, + //OWORD_LD_UNALIGNED: SIZE_SIZE_OWORD, + //Instruction specific features + //RIGHT_ALIGNED, + //MOVS: SINGLE_SPEC_OPND_TYPE, + //FILE NAME: LENGHT_LESS_256, + //ALL: WITHIN_SIMD_WIDTH +} OpndContraint; + +//Common_ISA_Opnd_Desc_Type +enum { + OPND_EXECSIZE = 1, + OPND_STRING, + 
OPND_LABEL, + OPND_ATOMIC_SUBOP, + OPND_EMASK_CTRL, + OPND_COND_MODE, + OPND_CHAN_PATT, + OPND_OWORD_SIZE, + OPND_IS_MODIFIED, + OPND_ELEM_NUM, + OPND_ELEM_SIZE, + OPND_SIMD_MODE, + OPND_CHANNEL_SIMD_MODE, + OPND_CMP_SUBOP, + OPND_VME_SUBOP, + OPND_STREAM_MODE, + OPND_SEARCH_CRTL, + OPND_MATRIX_MODE, + OPND_SUBMATRIX_SHAPE, + OPND_SUBPRE_SHAPE, + OPND_SPECIAL_KIND, + OPND_MEDIA_LD_MODIFIER, + OPND_MEDIA_ST_MODIFIER, + OPND_RAW, + OPND_SUBOPCODE, + OP_EXT +}; + +typedef enum +{ + ISA_Inst_Mov = 0x0, + ISA_Inst_Arith = 0x1, + ISA_Inst_Logic = 0x2, + ISA_Inst_Compare = 0x3, //CMP + ISA_Inst_Address = 0x4, //ADDROF, ADDR_ADD + ISA_Inst_Flow = 0x5, + ISA_Inst_Data_Port = 0x6, + ISA_Inst_Sampler = 0x7, + ISA_Inst_Misc = 0x8, // VME, etc. + ISA_Inst_SIMD_Flow = 0x9, + ISA_Inst_Sync = 0xA, + ISA_Inst_SVM = 0xB, + ISA_Inst_Reserved +} ISA_Inst_Type; + +struct ISA_Inst_Info +{ + ISA_Opcode op; + ISA_Inst_Type type; + const char* str; + uint8_t n_srcs; //for send messages, we count the surface as well as all the offsets to be sources + uint8_t n_dsts; +}; + +#define MAX_OPNDS_PER_INST 24 + +typedef struct OpndDesc +{ + unsigned opnd_type; //Common_ISA_Opnd_Desc_Type OR #defines like OPND_VECTOR_SRC_G_IMM_AO + unsigned data_type; //VISA_Type, overloaded to supported data types if it's a vector + unsigned opnd_constraint; +} OpndDesc; + + +typedef uint8_t ISA_SubOpcode; + +struct ISA_SubInst_Desc +{ + ISA_SubOpcode subOpcode; + ISA_Inst_Type type; + const char* name; + uint16_t opnd_num; + OpndDesc opnd_desc[MAX_OPNDS_PER_INST]; +}; + +struct VISA_INST_Desc +{ + TARGET_PLATFORM platf; + ISA_SubOpcode opcode; + ISA_Inst_Type type; + const char* name; + uint16_t opnd_num; + char attr; + OpndDesc opnd_desc[MAX_OPNDS_PER_INST]; + + const ISA_SubInst_Desc& getSubInstDesc(uint8_t subOpcode) const; + const ISA_SubInst_Desc& getSubInstDescByName(const char *symbol) const; +}; + +enum SVMSubOpcode +{ + SVM_BLOCK_LD = 0x1, + SVM_BLOCK_ST = 0x2, + SVM_GATHER = 0x3, + SVM_SCATTER = 0x4, + SVM_ATOMIC = 0x5, + SVM_GATHER4SCALED, + SVM_SCATTER4SCALED, + SVM_LASTOP +}; + + +extern struct ISA_Inst_Info ISA_Inst_Table[ISA_OPCODE_ENUM_SIZE]; + +extern VISA_INST_Desc CISA_INST_table[ISA_NUM_OPCODE]; diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.cpp new file mode 100644 index 000000000000..4472f78fec38 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.cpp @@ -0,0 +1,188 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// KillAnalysis is an object that can analyze which uses of a value are kills, +// and cache the result. +// +//===----------------------------------------------------------------------===// + +#include "KillAnalysis.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Debug.h" + +namespace { + +// BlockInfo : info for one basic block when calculating the live range for +// one value +struct BlockInfo { + llvm::Instruction *LastUser; + bool LiveOut; + BlockInfo() : LastUser(nullptr), LiveOut(false) {} +}; + +} // anonymous namespace + +using namespace llvm; + +/*********************************************************************** + * isKill : determine whether a use is a kill + * + * Enter: U = the use, which must be of an Instruction or Argument + * + * Return: true if this is a kill use (including the case that there are + * multiple uses in the same instruction, and no further reachable + * uses) + * + * This caches the information on which uses of the value are kills. If + * anything changes to do with the value, such as changing uses or moving + * code containing uses, or even completely removing the value, then the + * caller must invalidate the cached information by calling erase(V). + */ +bool KillAnalysis::isKill(Use *U) +{ + SmallVectorImpl *Kills = getKills(*U); + for (unsigned i = 0, e = Kills->size(); i != e; ++i) + if ((*Kills)[i] == U->getUser()) + return true; + return false; +} + +/*********************************************************************** + * getKills : get the kills vector for the value + * + * If there is no kills vector already cached for this value, we need to + * create one by determining its live range and remembering which is the + * last user in each basic block. Where a use is seen in a basic block, + * we recursively add its predecessor blocks to the live range, stopping + * when we get to an already seen block. + * + * This is pretty much the same as the algorithm in + * Appel "Modern Compiler Implementation in C" 19.6. + * + */ +SmallVectorImpl *KillAnalysis::getKills(Value *V) +{ + auto MapIter = Map.find(V); + if (MapIter != Map.end()) + return &MapIter->second; + // Need to construct live range for this value so we can find the kill uses. + std::map Blocks; + // If the value is an instruction, set up the def as the last user in its + // basic block. Don't do anything for an argument. + if (auto Inst = dyn_cast(V)) + Blocks[Inst->getParent()].LastUser = Inst; + // Trace back from each use. + for (auto ui = V->use_begin(), ue = V->use_end(); ui != ue; ++ui) { + auto user = cast(ui->getUser()); + if (auto Phi = dyn_cast(user)) { + // Use in a phi node. Just mark the incoming block as live out. + Blocks[Phi->getIncomingBlock(ui->getOperandNo())].LiveOut = true; + continue; + } + auto BB = user->getParent(); + auto BI = &Blocks[BB]; + if (BI->LiveOut) + continue; // already live out of this block + if (BI->LastUser == V) { + // This is the first time we have seen a use in this block, and it is + // the def block. It is tentatively the last user in the block, and + // no tracing back is required. 
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.h b/IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.h
new file mode 100644
index 000000000000..269b2d45605f
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/KillAnalysis.h
@@ -0,0 +1,51 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+//
+// KillAnalysis is an object that can analyze which uses of a value are kills,
+// and cache the result.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/ValueMap.h"
+
+namespace llvm {
+
+class Use;
+class Value;
+
+class KillAnalysis {
+  ValueMap<Value *, SmallVector<Instruction *, 4>> Map;
+public:
+  // erase : erase a value from the KillAnalysis
+  void erase(Value *V) { Map.erase(V); }
+  // isKill : determine whether a use is a kill
+  bool isKill(Use *U);
+private:
+  SmallVectorImpl<Instruction *> *getKills(Value *V);
+};
+
+} // namespace llvm
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/CMakeLists.txt b/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/CMakeLists.txt
new file mode 100644
index 000000000000..21f184ee94ac
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(INFO_SOURCES
+  GenXTargetInfo.cpp
+)
+
+add_library(VCTargetInfo ${INFO_SOURCES})
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.cpp
new file mode 100644
index 000000000000..e4b3b53f8cc3
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.cpp
@@ -0,0 +1,50 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ + +======================= end_copyright_notice ==================================*/ + +#include "GenXTargetInfo.h" + +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +Target &llvm::getTheGenXTarget32() { + static Target TheGenXTarget32; + return TheGenXTarget32; +} + +Target &llvm::getTheGenXTarget64() { + static Target TheGenXTarget64; + return TheGenXTarget64; +} + +extern "C" void LLVMInitializeGenXTargetInfo() { + RegisterTarget<> X(getTheGenXTarget32(), "genx32", "Intel GenX 32-bit", + "genx32"); + RegisterTarget<> Y(getTheGenXTarget64(), "genx64", "Intel GenX 64-bit", + "genx64"); +} + +extern "C" void LLVMInitializeGenXTargetMC() {} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.h b/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.h new file mode 100644 index 000000000000..205dfce2fc1c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/TargetInfo/GenXTargetInfo.h @@ -0,0 +1,39 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#ifndef LIB_GENXCODEGEN_TARGETINFO_GENXTARGETINFO_H +#define LIB_GENXCODEGEN_TARGETINFO_GENXTARGETINFO_H + +namespace llvm { + +class Target; + +Target &getTheGenXTarget32(); +Target &getTheGenXTarget64(); + +} + +#endif diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/Utils/CMakeLists.txt b/IGC/VectorCompiler/lib/GenXCodeGen/Utils/CMakeLists.txt new file mode 100644 index 000000000000..aadf0397f982 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/Utils/CMakeLists.txt @@ -0,0 +1,23 @@ + +set(CISA_GEN_INTRINSICS "${CMAKE_CURRENT_SOURCE_DIR}/cisa_gen_intrinsics.py") +set(CISA_JSON_FILE "${CMAKE_CURRENT_SOURCE_DIR}/cisa_gen_intrinsics.json") + +set(CISA_OUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/../") +set(CISA_OUT_FILES "${CISA_OUT_PATH}/GenXIntrinsicInfoTable.inc" + "${CISA_OUT_PATH}/GenXIntrinsicsBuildMap.inc") +message(" >>${CISA_OUT_PATH}<< -> ${CMAKE_CURRENT_BINARY_DIR}") +message(" COMMAND -> ${PYTHON_EXECUTABLE} ${CISA_GEN_INTRINSICS} ${CISA_JSON_FILE} ${CISA_OUT_PATH} <-") +message(" ${CMAKE_CURRENT_SOURCE_DIR}") +add_custom_command( + OUTPUT ${CISA_OUT_FILES} + COMMAND ${PYTHON_EXECUTABLE} ${CISA_GEN_INTRINSICS} ${CISA_JSON_FILE} ${CISA_OUT_PATH} + COMMENT "Building Cisa generators for GenXCisaBuilder." 
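+  # Note (illustrative, not from the original patch): the generator can also be
+  # invoked by hand as
+  #   python cisa_gen_intrinsics.py cisa_gen_intrinsics.json <output dir>
+  # which writes GenXIntrinsicInfoTable.inc and GenXIntrinsicsBuildMap.inc into
+  # <output dir>; the JSON below maps each genx_* intrinsic to a vISA opcode
+  # ("opc") plus an ordered list of operand descriptors consumed by
+  # GenXCisaBuilder.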
+ DEPENDS ${CISA_GEN_INTRINSICS} ${CISA_JSON_FILE} + VERBATIM) + +set_source_files_properties( + ${CISA_OUT_FILES} + PROPERTIES GENERATED TRUE + ) + +add_custom_target(GenXUtilBuild ALL DEPENDS ${CISA_OUT_FILES}) diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.json b/IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.json new file mode 100755 index 000000000000..d77e3e7a2e25 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.json @@ -0,0 +1,3674 @@ +{ + "DESCRIPTION": "See cisa_gen_intrinsics.py for description of this document", + "INTRINSICS": { + "genx_fptosi_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "MODIFIER_ARITH", 1 ] + }, + "genx_fptoui_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "MODIFIER_ARITH", 1 ] + }, + "genx_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "MODIFIER_ARITH", 1 ] + }, + "genx_uutrunc_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ] + }, + "genx_ustrunc_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ] + }, + "genx_sutrunc_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ] + }, + "genx_sstrunc_sat": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ] + }, + "genx_thread_x": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_X, 0, 0, 0, 1, 0)" + }, + "genx_thread_y": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_Y, 0, 0, 0, 1, 0)" + }, + "genx_group_id_x": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_GROUP_ID_X, 0, 0, 0, 1, 0)" + }, + "genx_group_id_y": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_GROUP_ID_Y, 0, 0, 0, 1, 0)" + }, + "genx_group_id_z": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_GROUP_ID_Z, 0, 0, 0, 1, 0)" + }, + "genx_timestamp": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_TSC, 0, 0, 1, 1, 0)" + }, + "genx_r0": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": 
"CreateOpndPredefinedSrc(PREDEFINED_R0, 0, 0, 1, 1, 0)" + }, + "genx_ce0": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE_NOMASK" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_CE0, 0, 0, 0, 1, 0)" + }, + "genx_sr0": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_SR0, 0, 0, 1, 1, 0)" + }, + "genx_set_sr0_2": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE_NOMASK" ], + "pred": [ "IMPLICITPRED" ], + "dst": "CreateOpndPredefinedDst(PREDEFINED_SR0, 0, 2, 1)", + "src0": [ "GENERAL", 1 ] + }, + "genx_get_color": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_COLOR, 0, 0, 1, 1, 0)" + }, + "genx_get_hwid": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": "CreateOpndPredefinedSrc(PREDEFINED_HW_TID, 0, 0, 0, 1, 0)" + }, + "genx_set_pause": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": "CreateOpndPredefinedDst(PREDEFINED_TSC, 0, 4, 1)", + "src0": [ "GENERAL", 1 ] + }, + "genx_dummy_mov": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": "CreateOpndPredefinedDst(PREDEFINED_NULL, 0, 0, 1)", + "src0": [ "GENERAL", 1 ] + }, + "genx_constanti": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_constantf": { + "opc": "ISA_MOV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_media_ld": { + "opc": "ISA_MEDIA_LD", + "modifiers": [ "BYTE", 1 ], + "surface": [ "SURFACE", 2 ], + "plane": [ "BYTE", 3 ], + "block_width": [ "BYTE", 4 ], + "block_height": [ "MEDIAHEIGHT", 4 ], + "x_offset": [ "GENERAL", "UNSIGNED", 5 ], + "y_offset": [ "GENERAL", "UNSIGNED", 6 ], + "dst": [ "RAW", 0 ] + }, + "genx_media_st": { + "opc": "ISA_MEDIA_ST", + "modifiers": [ "BYTE", 1 ], + "surface": [ "SURFACE", 2 ], + "plane": [ "BYTE", 3 ], + "block_width": [ "BYTE", 4 ], + "block_height": [ "MEDIAHEIGHT", 4 ], + "x_offset": [ "GENERAL", "UNSIGNED", 5 ], + "y_offset": [ "GENERAL", "UNSIGNED", 6 ], + "src": [ "RAW", 7 ] + }, + "genx_oword_ld": { + "opc": "ISA_OWORD_LD", + "log2_owords": [ "LOG2OWORDS", 0 ], + "is_modified": [ "BYTE", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "GENERAL", "UNSIGNED", 3 ], + "dst": [ "RAW", 0 ] + }, + "genx_oword_ld_unaligned": { + "opc": "ISA_OWORD_LD_UNALIGNED", + "gen_opc": "ISA_OWORD_LD", + "log2_owords": [ "LOG2OWORDS", 0 ], + "is_modified": [ "BYTE", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "GENERAL", "UNSIGNED", 3 ], + "dst": [ "RAW", 0 ] + }, + "genx_oword_st": { + "opc": "ISA_OWORD_ST", + "log2_owords": [ "LOG2OWORDS", 3 ], + "surface": [ "SURFACE", 1 ], + "offset": [ "GENERAL", "UNSIGNED", 2 ], + "src": [ "RAW", 3 ] + }, + "genx_dword_atomic_add": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_ADD" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_sub": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_SUB" ], + 
"exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_inc": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_INC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "NULLRAW" ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 4 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_dec": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_DEC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "NULLRAW" ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 4 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_min": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_MIN" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_max": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_MAX" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_xchg": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_XCHG" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_cmpxchg": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_CMPXCHG" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "URAW", 5 ], + "twoaddr": [ "TWOADDR", 6 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_and": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_AND" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_or": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_OR" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_xor": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_XOR" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "URAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_imin": { + "opc": 
"ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_IMIN" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "SRAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "SRAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_imax": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_IMAX" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "SRAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "SRAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_fmax": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_FMAX" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "RAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "RAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_fmin": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_FMIN" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "RAW", 4 ], + "src1": [ "NULLRAW" ], + "twoaddr": [ "TWOADDR", 5 ], + "dst": [ "RAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_dword_atomic_fcmpwr": { + "opc": "ISA_DWORD_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_FCMPWR" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "offset": [ "URAW", 3 ], + "src": [ "RAW", 4 ], + "src1": [ "RAW", 5 ], + "twoaddr": [ "TWOADDR", 6 ], + "dst": [ "RAW", "RAW_NULLALLOWED", 0 ] + }, + "fma": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "MODIFIER_ARITH", 3 ] + }, + "genx_ssmad": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "SIGNED", "CONTIGUOUS", 3 ] + }, + "genx_sumad": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "UNSIGNED", "CONTIGUOUS", 3 ] + }, + "genx_usmad": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "SIGNED", "CONTIGUOUS", 3 ] + }, + "genx_uumad": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "UNSIGNED", "CONTIGUOUS", 3 ] + }, + "genx_ssmad_sat": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ 
"GENERAL", "SIGNED", "CONTIGUOUS", 3 ] + }, + "genx_sumad_sat": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "UNSIGNED", "CONTIGUOUS", 3 ] + }, + "genx_usmad_sat": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "SIGNED", "CONTIGUOUS", 3 ] + }, + "genx_uumad_sat": { + "opc": "ISA_MAD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ], + "src2": [ "GENERAL", "UNSIGNED", "CONTIGUOUS", 3 ] + }, + "genx_constantpred": { + "opc": "ISA_SETP", + "exec_size": [ "EXECSIZE" ], + "dst": [ "PREDICATE", 0 ], + "src0": [ "CONSTVI1ASI32", 1 ] + }, + "genx_smax": { + "opc": "ISA_FMINMAX", + "exec_size": [ "EXECSIZE" ], + "flag_for_max": [ "LITERAL", 1 ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_INTALLOWED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_umax": { + "opc": "ISA_FMINMAX", + "exec_size": [ "EXECSIZE" ], + "flag_for_max": [ "LITERAL", 1 ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_INTALLOWED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_fmax": { + "opc": "ISA_FMINMAX", + "exec_size": [ "EXECSIZE" ], + "flag_for_max": [ "LITERAL", 1 ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "MODIFIER_ARITH", 2 ] + }, + "genx_smin": { + "opc": "ISA_FMINMAX", + "exec_size": [ "EXECSIZE" ], + "flag_for_max": [ "LITERAL", 0 ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_INTALLOWED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_umin": { + "opc": "ISA_FMINMAX", + "exec_size": [ "EXECSIZE" ], + "flag_for_max": [ "LITERAL", 0 ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_INTALLOWED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_fmin": { + "opc": "ISA_FMINMAX", + "exec_size": [ "EXECSIZE" ], + "flag_for_max": [ "LITERAL", 0 ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "MODIFIER_ARITH", 2 ] + }, + "genx_pow": { + "opc": "ISA_POW", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ], + "src1": [ "GENERAL", 2 ] + }, + "genx_add_addr": { + "opc": "ISA_ADDR_ADD", + "exec_size": [ "EXECSIZE" ], + "dst": [ "ADDRESS", 0 ], + "src0": [ "ADDRESS", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ] + }, + "genx_3d_sample": { + "opc": "ISA_3D_SAMPLE", + "sampling3d_opcode": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "channel_mask": [ "BYTE", 3 ], + "aoffimmi_value": [ "GENERAL", "UNSIGNED", 4 ], + "sampler": [ "SAMPLER", 5 ], + "surface": [ "SURFACE", 6 ], + "dst": [ "RAW", 0 ], + "number_of_additional_operands": [ "ARGCOUNT", 
"ARGCOUNTMIN1", 7 ], + "raw_operands": [ "RAW_OPERANDS", "RAW", 7 ] + }, + "genx_sqrt": { + "opc": "ISA_SQRT", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_rsqrt": { + "opc": "ISA_RSQRT", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_ieee_sqrt": { + "opc": "ISA_SQRTM", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_inv": { + "opc": "ISA_INV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_log": { + "opc": "ISA_LOG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_exp": { + "opc": "ISA_EXP", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_scatter_scaled": { + "opc": "ISA_SCATTER_SCALED", + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "1_byte_block_size_MBZ": [ "LITERAL", 0 ], + "log2_num_blocks": [ "BYTE", 2 ], + "scale": [ "SHORT", 3 ], + "surface": [ "SURFACE", 4 ], + "global_offset": [ "GENERAL", "UNSIGNED", 5 ], + "element_offset": [ "URAW", 6 ], + "src": [ "RAW", 7 ] + }, + "genx_scatter4_scaled": { + "opc": "ISA_SCATTER4_SCALED", + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "channel_mask": [ "BYTE", 2 ], + "scale": [ "SHORT", 3 ], + "surface": [ "SURFACE", 4 ], + "global_offset": [ "GENERAL", "UNSIGNED", 5 ], + "element_offset": [ "URAW", 6 ], + "src": [ "RAW", 7 ] + }, + "genx_scatter4_typed": { + "opc": "ISA_SCATTER4_TYPED", + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "channel_mask": [ "BYTE", 1 ], + "surface": [ "SURFACE", 3 ], + "U_pixel_address": [ "URAW", 4 ], + "V_pixel_address": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R_pixel_address": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "NULLRAW" ], + "src": [ "RAW", 7 ] + }, + "genx_gather_scaled": { + "opc": "ISA_GATHER_SCALED", + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "block_size_MBZ": [ "LITERAL", 0 ], + "log2_num_blocks": [ "BYTE", 2 ], + "scale": [ "SHORT", 3 ], + "surface": [ "SURFACE", 4 ], + "global_offset": [ "GENERAL", "UNSIGNED", 5 ], + "element_offset": [ "URAW", 6 ], + "skip__": [ "TWOADDR", 7 ], + "dst": [ "RAW", 0 ] + }, + "genx_gather_scaled2": { + "opc": "ISA_GATHER_SCALED", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "block_size_MBZ": [ "LITERAL", 0 ], + "log2_num_blocks": [ "BYTE", 1 ], + "scale": [ "SHORT", 2 ], + "surface": [ "SURFACE", 3 ], + "global_offset": [ "GENERAL", "UNSIGNED", 4 ], + "element_offset": [ "URAW", 5 ], + "dst": [ "RAW", 0 ] + }, + "genx_gather4_scaled": { + "opc": "ISA_GATHER4_SCALED", + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "channel_mask": [ "BYTE", 2 ], + "scale": [ "SHORT", 3 ], + "surface": [ "SURFACE", 4 ], + "global_offset": [ "GENERAL", "UNSIGNED", 5 ], + "element_offset": [ "URAW", 6 ], + "skip__": [ "TWOADDR", 7 ], + "dst": [ "RAW", 0 ] + }, + "genx_gather4_scaled2": { + "opc": "ISA_GATHER4_SCALED", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "channel_mask": [ "BYTE", 1 ], + "scale": [ "SHORT", 2 ], + "surface": [ "SURFACE", 3 ], + "global_offset": [ "GENERAL", "UNSIGNED", 4 ], + "element_offset": [ 
"URAW", 5 ], + "dst": [ "RAW", 0 ] + }, + "genx_gather4_typed": { + "opc": "ISA_GATHER4_TYPED", + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "channel_mask": [ "BYTE", 1 ], + "surface": [ "SURFACE", 3 ], + "U_pixel_address": [ "URAW", 4 ], + "V_pixel_address": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R_pixel_address": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 7 ], + "dst": [ "RAW", 0 ] + }, + "genx_typed_atomic_add": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_ADD" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_sub": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_SUB" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_inc": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_INC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 3 ], + "V": [ "URAW", "RAW_NULLALLOWED", 4 ], + "R": [ "URAW", "RAW_NULLALLOWED", 5 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 6 ], + "src0": [ "NULLRAW" ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_dec": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_DEC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 3 ], + "V": [ "URAW", "RAW_NULLALLOWED", 4 ], + "R": [ "URAW", "RAW_NULLALLOWED", 5 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 6 ], + "src0": [ "NULLRAW" ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_min": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_MIN" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_max": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_MAX" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_xchg": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_XCHG" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ 
"URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_cmpxchg": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_CMPXCHG" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 5 ], + "V": [ "URAW", "RAW_NULLALLOWED", 6 ], + "R": [ "URAW", "RAW_NULLALLOWED", 7 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 8 ], + "src0": [ "URAW", 3 ], + "src1": [ "URAW", 4 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_and": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_AND" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_or": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_OR" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_xor": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_XOR" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_imin": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_IMIN" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_imax": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_IMAX" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_fmax": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_FMAX" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ "URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_fmin": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_FMIN" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 4 ], + "V": [ "URAW", "RAW_NULLALLOWED", 5 ], + "R": [ 
"URAW", "RAW_NULLALLOWED", 6 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 7 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_typed_atomic_fcmpwr": { + "opc": "ISA_3D_TYPED_ATOMIC", + "sub_opc": [ "LITERAL", "ATOMIC_FCMPWR" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "surface": [ "SURFACE", 2 ], + "U": [ "URAW", 5 ], + "V": [ "URAW", "RAW_NULLALLOWED", 6 ], + "R": [ "URAW", "RAW_NULLALLOWED", 7 ], + "LOD": [ "URAW", "RAW_NULLALLOWED", 8 ], + "src0": [ "URAW", 3 ], + "src1": [ "URAW", 4 ], + "dst": [ "URAW", "RAW_NULLALLOWED", 0 ] + }, + "genx_sssad2add": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", 1 ], + "src1": [ "GENERAL", "SIGNED", 2 ], + "src2": [ "GENERAL", "SIGNED", 3 ] + }, + "genx_uusad2add": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ], + "src2": [ "GENERAL", "UNSIGNED", 3 ] + }, + "genx_susad2add": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ], + "src2": [ "GENERAL", "SIGNED", 3 ] + }, + "genx_ussad2add": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", 1 ], + "src1": [ "GENERAL", "SIGNED", 2 ], + "src2": [ "GENERAL", "UNSIGNED", 3 ] + }, + "genx_sssad2add_sat": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", 1 ], + "src1": [ "GENERAL", "SIGNED", 2 ], + "src2": [ "GENERAL", "SIGNED", 3 ] + }, + "genx_uusad2add_sat": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ], + "src2": [ "GENERAL", "UNSIGNED", 3 ] + }, + "genx_susad2add_sat": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ], + "src2": [ "GENERAL", "SIGNED", 3 ] + }, + "genx_ussad2add_sat": { + "opc": "ISA_SAD2ADD", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", 1 ], + "src1": [ "GENERAL", "SIGNED", 2 ], + "src2": [ "GENERAL", "UNSIGNED", 3 ] + }, + "genx_ssad2": { + "opc": "ISA_SAD2", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", 1 ], + "src1": [ "GENERAL", "SIGNED", 2 ] + }, + "genx_usad2": { + "opc": "ISA_SAD2", + "exec_size": [ "EXECSIZE_GE2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ] + }, + "genx_wait": { + "opc": "ISA_WAIT", + "mask": [ "GENERAL", "UNSIGNED", 1 ] + }, + "genx_avs": { + "opc": "ISA_AVS", + "channel_mask": [ "BYTE", 1 ], + "sampler": [ "SAMPLER", 2 ], + "surface": [ "SURFACE", 
3 ], + "U_pixel_address": [ "GENERAL", 4 ], + "V_pixel_address": [ "GENERAL", 5 ], + "deltaU": [ "GENERAL", 6 ], + "deltaV": [ "GENERAL", 7 ], + "u2d": [ "GENERAL", 8 ], + "groupID": [ "GENERAL", "UNSIGNED", 9 ], + "verticalBlockNumber": [ "GENERAL", "UNSIGNED", 10 ], + "output_format_control": [ "BYTE", 11 ], + "v2d": [ "GENERAL", 12 ], + "execMode": [ "BYTE", 13 ], + "IEFByPass": [ "GENERAL", "UNSIGNED", 14 ], + "dst": [ "RAW", 0 ] + }, + "genx_sample_unorm": { + "opc": "ISA_SAMPLE_UNORM", + "channel_mask": [ "BYTE", 1 ], + "sampler": [ "SAMPLER", 2 ], + "surface": [ "SURFACE", 3 ], + "U_pixel_address": [ "GENERAL", 4 ], + "V_pixel_address": [ "GENERAL", 5 ], + "deltaU": [ "GENERAL", 6 ], + "deltaV": [ "GENERAL", 7 ], + "dst": [ "RAW", 0 ] + }, + "genx_sin": { + "opc": "ISA_SIN", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_cos": { + "opc": "ISA_COS", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_ssavg": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_suavg": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usavg": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_uuavg": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_ssavg_sat": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_suavg_sat": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usavg_sat": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_uuavg_sat": { + "opc": "ISA_AVG", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_fence": { + "opc": "ISA_FENCE", + "mask": [ "BYTE", 1 ] + }, + "genx_ssadd_sat": { + "opc": "ISA_ADD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", 
"MODIFIER_ARITH", 2 ] + }, + "genx_suadd_sat": { + "opc": "ISA_ADD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usadd_sat": { + "opc": "ISA_ADD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_uuadd_sat": { + "opc": "ISA_ADD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_lzd": { + "opc": "ISA_LZD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ] + }, + "genx_raw_send": { + "opc": "ISA_RAW_SEND", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "extended_message_descriptor": [ "INT", 3 ], + "numsrc": [ "NUMGRFS", 5 ], + "numdst": [ "NUMGRFS", 0 ], + "desc": [ "GENERAL", "UNSIGNED", 4 ], + "src": [ "RAW", 5 ], + "skip__": [ "TWOADDR", 6 ], + "dst": [ "RAW", 0 ] + }, + "genx_raw_send_noresult": { + "opc": "ISA_RAW_SEND", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "extended_message_descriptor": [ "INT", 3 ], + "numsrc": [ "NUMGRFS", 5 ], + "numdst": [ "LITERAL", 0 ], + "desc": [ "GENERAL", "UNSIGNED", 4 ], + "src": [ "RAW", 5 ], + "dst": [ "NULLRAW" ] + }, + "genx_raw_sends": { + "opc": "ISA_RAW_SENDS", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "numsrc": [ "NUMGRFS", 6 ], + "numsrc2": [ "NUMGRFS", 7 ], + "numdst": [ "NUMGRFS", 0 ], + "FFID": [ "BYTE", 3 ], + "extended_message_descriptor": [ "GENERAL", "UNSIGNED", 4 ], + "desc": [ "GENERAL", "UNSIGNED", 5 ], + "src": [ "RAW", 6 ], + "src2": [ "RAW", 7 ], + "skip__": [ "TWOADDR", 8 ], + "dst": [ "RAW", 0 ] + }, + "genx_raw_sends_noresult": { + "opc": "ISA_RAW_SENDS", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "numsrc": [ "NUMGRFS", 6 ], + "numsrc2": [ "NUMGRFS", 7 ], + "numdst": [ "LITERAL", 0 ], + "FFID": [ "BYTE", 3 ], + "extended_message_descriptor": [ "GENERAL", "UNSIGNED", 4 ], + "desc": [ "GENERAL", "UNSIGNED", 5 ], + "src": [ "RAW", 6 ], + "src2": [ "RAW", 7 ], + "dst": [ "NULLRAW", 8 ] + }, + "genx_raw_send2": { + "opc": "ISA_RAW_SENDS", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_BYTE", 2 ], + "pred": [ "PREDICATION", 3 ], + "numsrc": [ "BYTE", 4 ], + "numsrc2": [ "LITERAL", 0 ], + "numdst": [ "BYTE", 5 ], + "FFID": [ "BYTE", 6 ], + "extended_message_descriptor": [ "GENERAL", "UNSIGNED", 7 ], + "desc": [ "GENERAL", "UNSIGNED", 8 ], + "src": [ "RAW", 9 ], + "src2": [ "NULLRAW", 10 ], + "skip__": [ "TWOADDR", 10 ], + "dst": [ "RAW", 0 ] + }, + "genx_raw_send2_noresult": { + "opc": "ISA_RAW_SENDS", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_BYTE", 2 ], + "pred": [ "PREDICATION", 3 ], + "numsrc": [ "BYTE", 4 ], + "numsrc2": [ "LITERAL", 0 ], + "numdst": [ "LITERAL", 0 ], + "FFID": [ "BYTE", 5 ], + "extended_message_descriptor": [ 
"GENERAL", "UNSIGNED", 6 ], + "desc": [ "GENERAL", "UNSIGNED", 7 ], + "src": [ "RAW", 8 ], + "src2": [ "NULLRAW", 9 ], + "dst": [ "NULLRAW", 10 ] + }, + "genx_raw_sends2": { + "opc": "ISA_RAW_SENDS", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_BYTE", 2 ], + "pred": [ "PREDICATION", 3 ], + "numsrc": [ "BYTE", 4 ], + "numsrc2": [ "BYTE", 5 ], + "numdst": [ "BYTE", 6 ], + "FFID": [ "BYTE", 7 ], + "extended_message_descriptor": [ "GENERAL", "UNSIGNED", 8 ], + "desc": [ "GENERAL", "UNSIGNED", 9 ], + "src": [ "RAW", 10 ], + "src2": [ "RAW", 11 ], + "skip__": [ "TWOADDR", 12 ], + "dst": [ "RAW", 0 ] + }, + "genx_raw_sends2_noresult": { + "opc": "ISA_RAW_SENDS", + "modifier_sendc": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_BYTE", 2 ], + "pred": [ "PREDICATION", 3 ], + "numsrc": [ "BYTE", 4 ], + "numsrc2": [ "BYTE", 5 ], + "numdst": [ "LITERAL", 0 ], + "FFID": [ "BYTE", 6 ], + "extended_message_descriptor": [ "GENERAL", "UNSIGNED", 7 ], + "desc": [ "GENERAL", "UNSIGNED", 8 ], + "src": [ "RAW", 9 ], + "src2": [ "RAW", 10 ], + "dst": [ "NULLRAW", 11 ] + }, + "genx_rndd": { + "opc": "ISA_RNDD", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_rnde": { + "opc": "ISA_RNDE", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_rndu": { + "opc": "ISA_RNDU", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_rndz": { + "opc": "ISA_RNDZ", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ] + }, + "genx_ssmul": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_sumul": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usmul": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_uumul": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_ssmul_sat": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_sumul_sat": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usmul_sat": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", 
"MODIFIER_ARITH", 2 ] + }, + "genx_uumul_sat": { + "opc": "ISA_MUL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_smulh": { + "opc": "ISA_MULH", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_umulh": { + "opc": "ISA_MULH", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_ssshl": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_sushl": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usshl": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_uushl": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_ssshl_sat": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_sushl_sat": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_usshl_sat": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "SIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_uushl_sat": { + "opc": "ISA_SHL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", "SATURATION_SATURATE", 0 ], + "src0": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 1 ], + "src1": [ "GENERAL", "UNSIGNED", "MODIFIER_ARITH", 2 ] + }, + "genx_rol": { + "opc": "ISA_ROL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ] + }, + "genx_ror": { + "opc": "ISA_ROR", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "UNSIGNED", 2 ] + }, + "genx_sbfe": { + "opc": "ISA_BFE", + "exec_size": [ "EXECSIZE_NOT2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "OWALIGNED", 
"SIGNED", 0 ], + "src0": [ "GENERAL", "OWALIGNED", "SIGNED", 1 ], + "src1": [ "GENERAL", "OWALIGNED", "SIGNED", 2 ], + "src2": [ "GENERAL", "OWALIGNED", "SIGNED", 3 ] + }, + "genx_ubfe": { + "opc": "ISA_BFE", + "exec_size": [ "EXECSIZE_NOT2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "OWALIGNED", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "OWALIGNED", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "OWALIGNED", "UNSIGNED", 2 ], + "src2": [ "GENERAL", "OWALIGNED", "UNSIGNED", 3 ] + }, + "genx_bfi": { + "opc": "ISA_BFI", + "exec_size": [ "EXECSIZE_NOT2" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "OWALIGNED", 0 ], + "src0": [ "GENERAL", "OWALIGNED", "UNSIGNED", 1 ], + "src1": [ "GENERAL", "OWALIGNED", "UNSIGNED", 2 ], + "src2": [ "GENERAL", "OWALIGNED", "UNSIGNED", 3 ], + "src3": [ "GENERAL", "OWALIGNED", "UNSIGNED", 4 ] + }, + "genx_va_minmax": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_MINMAX_FOPCODE", + "sub_opc": [ "LITERAL", "MINMAX_FOPCODE" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "Min_Max_Enable": [ "GENERAL", 4 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_minmax_filter": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_MINMAXFILTER_FOPCODE", + "sub_opc": [ "LITERAL", "MINMAXFILTER_FOPCODE" ], + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "output_size": [ "BYTE", 5 ], + "return_data_format": [ "BYTE", 6 ], + "Min_Max_Enable": [ "GENERAL", 7 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_centroid": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_Centroid_FOPCODE", + "sub_opc": [ "LITERAL", "Centroid_FOPCODE" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "vSize": [ "GENERAL", 4 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_bool_centroid": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_BoolCentroid_FOPCODE", + "sub_opc": [ "LITERAL", "BoolCentroid_FOPCODE" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "vSize": [ "GENERAL", 4 ], + "hSize": [ "GENERAL", 5 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_hdc_1pixel_convolve": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_1PIXELCONV", + "sub_opc": [ "LITERAL", "ISA_HDC_1PIXELCONV" ], + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "pixel_size": [ "BYTE", 5 ], + "offsets": [ "RAW", 6 ], + "destination_surface": [ "SURFACE", 7 ], + "destination_x_offset": [ "GENERAL", 8 ], + "destination_y_offset": [ "GENERAL", 9 ] + }, + "genx_va_hdc_convolve2d": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_CONV", + "sub_opc": [ "LITERAL", "ISA_HDC_CONV" ], + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "properties": [ "BYTE", 5 ], + "destination_surface": [ "SURFACE", 6 ], + "destination_x_offset": [ "GENERAL", 7 ], + "destination_y_offset": [ "GENERAL", 8 ] + }, + "genx_va_hdc_lbp_correlation": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_LBPCORRELATION", + "sub_opc": [ "LITERAL", "ISA_HDC_LBPCORRELATION" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + 
"normalized_y_co_ordinate": [ "GENERAL", 3 ], + "horizontal_disparity": [ "GENERAL", 4 ], + "destination_surface": [ "SURFACE", 5 ], + "destination_x_offset": [ "GENERAL", 6 ], + "destination_y_offset": [ "GENERAL", 7 ] + }, + "genx_va_hdc_lbp_creation": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_LBPCREATION", + "sub_opc": [ "LITERAL", "ISA_HDC_LBPCREATION" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "mode": [ "BYTE", 4 ], + "destination_surface": [ "SURFACE", 5 ], + "destination_x_offset": [ "GENERAL", 6 ], + "destination_y_offset": [ "GENERAL", 7 ] + }, + "genx_va_hdc_minmax_filter": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_MMF", + "sub_opc": [ "LITERAL", "ISA_HDC_MMF" ], + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "return_data_format": [ "BYTE", 5 ], + "minmax_enable_mode": [ "BYTE", 6 ], + "destination_surface": [ "SURFACE", 7 ], + "destination_x_offset": [ "GENERAL", 8 ], + "destination_y_offset": [ "GENERAL", 9 ] + }, + "genx_va_correlation_search": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_VA_OP_CODE_CORRELATION_SEARCH", + "sub_opc": [ "LITERAL", "VA_OP_CODE_CORRELATION_SEARCH" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "normalized_vertical_origin": [ "GENERAL", 4 ], + "normalized_horizontal_origin": [ "GENERAL", 5 ], + "x_direction_size": [ "GENERAL", 6 ], + "y_direction_size": [ "GENERAL", 7 ], + "x_direction_search_size": [ "GENERAL", 8 ], + "y_direction_search_size": [ "GENERAL", 9 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_flood_fill": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_VA_OP_CODE_FLOOD_FILL", + "sub_opc": [ "LITERAL", "VA_OP_CODE_FLOOD_FILL" ], + "Is8Connect": [ "BYTE", 1 ], + "pixel_mask_horizontal_direction": [ "RAW", 2 ], + "pixel_mask_vertical_direction_left": [ "GENERAL", 3 ], + "pixel_mask_vertical_direction_right": [ "GENERAL", 4 ], + "loop_count": [ "GENERAL", 5 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_lbp_correlation": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_VA_OP_CODE_LBP_CORRELATION", + "sub_opc": [ "LITERAL", "VA_OP_CODE_LBP_CORRELATION" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "horizontal_disparity": [ "GENERAL", 4 ], + "Destination": [ "RAW", 0 ] + }, + "genx_va_lbp_creation": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_VA_OP_CODE_LBP_CREATION", + "sub_opc": [ "LITERAL", "VA_OP_CODE_LBP_CREATION" ], + "surface": [ "SURFACE", 1 ], + "normalized_x_co_ordinate": [ "GENERAL", 2 ], + "normalized_y_co_ordinate": [ "GENERAL", 3 ], + "mode": [ "BYTE", 4 ], + "Destination": [ "RAW", 0 ] + }, + "genx_3d_load": { + "opc": "ISA_3D_LOAD", + "sampling3d_opcode": [ "BYTE", 1 ], + "exec_size": [ "EXECSIZE_FROM_ARG", 2 ], + "pred": [ "PREDICATION", 2 ], + "channel_mask": [ "BYTE", 3 ], + "aoffimmi_value": [ "GENERAL", "UNSIGNED", 4 ], + "surface": [ "SURFACE", 5 ], + "dst": [ "RAW", 0 ], + "number_of_additional_operands": [ "ARGCOUNT", "ARGCOUNTMIN1", 6 ], + "raw_operands": [ "RAW_OPERANDS", "RAW", 6 ] + }, + "genx_frc": { + "opc": "ISA_FRC", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "SATURATION_NOSAT", 0 ], + "src0": [ 
"GENERAL", 1 ] + }, + "genx_va_convolve2d": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_Convolve_FOPCODE", + "sub_opc": "Convolve_FOPCODE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "properties": [ "BYTE", 5 ], + "dst": [ "RAW", 0 ] + }, + "genx_va_erode": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_ERODE_FOPCODE", + "sub_opc": "ERODE_FOPCODE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "properties": [ "BYTE", 5 ], + "dst": [ "RAW", 0 ] + }, + "genx_va_dilate": { + "opc": "ISA_VA", + "gen_opc": "ISA_VA_ERODE_FOPCODE", + "sub_opc": "Dilate_FOPCODE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "properties": [ "BYTE", 5 ], + "dst": [ "RAW", 0 ] + }, + "genx_va_hdc_erode": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_ERODE", + "sub_opc": "ISA_HDC_ERODE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "dstSurface": [ "SURFACE", 5 ], + "xOffset": [ "GENERAL", 6 ], + "yOffset": [ "GENERAL", 7 ] + }, + "genx_va_hdc_dilate": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS_ISA_HDC_DILATE", + "sub_opc": "ISA_HDC_DILATE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "normalized_x_co_ordinate": [ "GENERAL", 3 ], + "normalized_y_co_ordinate": [ "GENERAL", 4 ], + "dstSurface": [ "SURFACE", 5 ], + "xOffset": [ "GENERAL", 6 ], + "yOffset": [ "GENERAL", 7 ] + }, + "genx_barrier": { + "opc": "ISA_BARRIER", + "nobarrier": [ "ISBARRIER" ] + }, + "genx_yield": { + "opc": "ISA_YIELD" + }, + "genx_cache_flush": { + "opc": "ISA_SAMPLR_CACHE_FLUSH" + }, + "genx_sbarrier": { + "opc": "ISA_SBARRIER", + "signal_flag": [ "BYTE", 1 ] + }, + "genx_bfrev": { + "opc": "ISA_BFREV", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ] + }, + "genx_cbit": { + "opc": "ISA_CBIT", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ] + }, + "genx_ieee_div": { + "opc": "ISA_DIVM", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", 1 ], + "src1": [ "GENERAL", 2 ] + }, + "genx_dp2": { + "opc": "ISA_DP2", + "exec_size": [ "EXECSIZE_GE4" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "STRIDE1", "OWALIGNED", 0 ], + "src0": [ "GENERAL", "STRIDE1", "OWALIGNED", 1 ], + "src1": [ "GENERAL", "STRIDE1", "OWALIGNED", 2 ] + }, + "genx_dp3": { + "opc": "ISA_DP3", + "exec_size": [ "EXECSIZE_GE4" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "STRIDE1", "OWALIGNED", 0 ], + "src0": [ "GENERAL", "STRIDE1", "OWALIGNED", 1 ], + "src1": [ "GENERAL", "STRIDE1", "OWALIGNED", 2 ] + }, + "genx_dp4": { + "opc": "ISA_DP4", + "exec_size": [ "EXECSIZE_GE4" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "STRIDE1", "OWALIGNED", 0 ], + "src0": [ "GENERAL", "STRIDE1", "OWALIGNED", 1 ], + "src1": [ "GENERAL", "STRIDE1", "OWALIGNED", 2 ] + }, + "genx_dph": { + "opc": "ISA_DPH", + "exec_size": [ "EXECSIZE_GE4" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "STRIDE1", "OWALIGNED", 0 ], + "src0": [ 
"GENERAL", "STRIDE1", "OWALIGNED", 1 ], + "src1": [ "GENERAL", "STRIDE1", "OWALIGNED", 2 ] + }, + "genx_sfbh": { + "opc": "ISA_FBH", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "SIGNED", 1 ] + }, + "genx_ufbh": { + "opc": "ISA_FBH", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ] + }, + "genx_fbl": { + "opc": "ISA_FBL", + "exec_size": [ "EXECSIZE" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "UNSIGNED", 0 ], + "src0": [ "GENERAL", "UNSIGNED", 1 ] + }, + "genx_line": { + "opc": "ISA_LINE", + "exec_size": [ "EXECSIZE_GE4" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", "FIXED4", "NOIMM", 1 ], + "src1": [ "GENERAL", 2 ] + }, + "genx_load": { + "opc": "ISA_LOAD", + "channel_mask": [ "SAMPLECHMASK", 1 ], + "surface": [ "SURFACE", 2 ], + "U_pixel_address": [ "RAW", 3 ], + "V_pixel_address": [ "RAW", 4 ], + "R_pixel_address": [ "RAW", 5 ], + "dst": [ "RAW", 0 ] + }, + "genx_lrp": { + "opc": "ISA_LRP", + "exec_size": [ "EXECSIZE_GE4" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", "OWALIGNED", "CONTIGUOUS", 0 ], + "src0": [ + "GENERAL", + "OWALIGNED", + "SCALARORCONTIGUOUS", + "NOIMM", + 1 + ], + "src1": [ + "GENERAL", + "OWALIGNED", + "SCALARORCONTIGUOUS", + "NOIMM", + 2 + ], + "src2": [ + "GENERAL", + "OWALIGNED", + "SCALARORCONTIGUOUS", + "NOIMM", + 3 + ] + }, + "genx_pln": { + "opc": "ISA_PLANE", + "exec_size": [ "EXECSIZE_GE8" ], + "pred": [ "IMPLICITPRED" ], + "dst": [ "GENERAL", 0 ], + "src0": [ "GENERAL", "OWALIGNED", "FIXED4", "NOIMM", 1 ], + "src1": [ "GENERAL", "GRFALIGNED", "TWICEWIDTH", "NOIMM", 2 ] + }, + "genx_sample": { + "opc": "ISA_SAMPLE", + "channel_mask": [ "SAMPLECHMASK", 1 ], + "sampler": [ "SAMPLER", 2 ], + "surface": [ "SURFACE", 3 ], + "U_pixel_address": [ "RAW", 4 ], + "V_pixel_address": [ "RAW", 5 ], + "R_pixel_address": [ "RAW", 6 ], + "dst": [ "RAW", 0 ] + }, + "genx_svm_atomic_add": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_ADD" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_sub": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_SUB" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_min": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_MIN" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_max": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_MAX" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + 
"genx_svm_atomic_xchg": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_XCHG" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_and": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_AND" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_or": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_OR" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_xor": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_XOR" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_imin": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_IMIN" ], + "address": [ "URAW", 2 ], + "src0": [ "SRAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "SRAW", 0 ] + }, + "genx_svm_atomic_imax": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_IMAX" ], + "address": [ "URAW", 2 ], + "src0": [ "SRAW", 3 ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 4 ], + "dst": [ "SRAW", 0 ] + }, + "genx_svm_atomic_inc": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_INC" ], + "address": [ "URAW", 2 ], + "src0": [ "NULLRAW" ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 3 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_dec": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_DEC" ], + "address": [ "URAW", 2 ], + "src0": [ "NULLRAW" ], + "src1": [ "NULLRAW" ], + "skip__": [ "TWOADDR", 3 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_atomic_cmpxchg": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_ATOMIC", + "skip": [ "LITERAL", "SVM_ATOMIC" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "sub_opc": [ "LITERAL", "ATOMIC_CMPXCHG" ], + "address": [ "URAW", 2 ], + "src0": [ "URAW", 3 ], + "src1": [ "URAW", 4 ], + "skip__": [ "TWOADDR", 5 ], + "dst": [ "URAW", 0 ] + }, + "genx_svm_block_ld": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_BLOCK_LD", + "sub_opc": [ "LITERAL", "SVM_BLOCK_LD" ], + "log2_owords": [ "LOG2OWORDS", 0 ], + "address": [ 
"GENERAL", "UNSIGNED", 1 ], + "dst": [ "RAW", 0 ] + }, + "genx_svm_block_ld_unaligned": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_BLOCK_LD", + "sub_opc": [ "LITERAL", "SVM_BLOCK_LD" ], + "log2_owords": [ "LOG2OWORDS_PLUS_8", 0 ], + "address": [ "GENERAL", "UNSIGNED", 1 ], + "dst": [ "RAW", 0 ] + }, + "genx_svm_block_st": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_BLOCK_ST", + "sub_opc": [ "LITERAL", "SVM_BLOCK_ST" ], + "log2_owords": [ "LOG2OWORDS", 2 ], + "address": [ "GENERAL", "UNSIGNED", 1 ], + "src": [ "RAW", 2 ] + }, + "genx_svm_gather": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_GATHER", + "sub_opc": [ "LITERAL", "SVM_GATHER" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "block_size_inferred_from_dst": [ "SVMGATHERBLOCKSIZE", 0 ], + "log2_num_blocks": [ "BYTE", 2 ], + "address": [ "URAW", 3 ], + "twoaddr": [ "TWOADDR", 4 ], + "dst": [ "RAW", 0 ] + }, + "genx_svm_gather4_scaled": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_GATHER4SCALED", + "sub_opc": [ "LITERAL", "SVM_GATHER4SCALED" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "channel_mask": [ "BYTE", 2 ], + "scale": [ "SHORT", 3 ], + "address": [ "GENERAL", "UNSIGNED", 4 ], + "offset": [ "URAW", 5 ], + "twoaddr": [ "TWOADDR", 6 ], + "dst": [ "RAW", 0 ] + }, + "genx_svm_scatter": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_SCATTER", + "sub_opc": [ "LITERAL", "SVM_SCATTER" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "block_size_inferred_from_dst": [ "SVMGATHERBLOCKSIZE", 4 ], + "log2_num_blocks": [ "BYTE", 2 ], + "address": [ "URAW", 3 ], + "src": [ "RAW", 4 ] + }, + "genx_svm_scatter4_scaled": { + "opc": "ISA_SVM", + "gen_opc": "ISA_SVM_SVM_SCATTER4SCALED", + "sub_opc": [ "LITERAL", "SVM_SCATTER4SCALED" ], + "exec_size": [ "EXECSIZE_FROM_ARG", 1 ], + "pred": [ "PREDICATION", 1 ], + "channel_mask": [ "BYTE", 2 ], + "scale": [ "SHORT", 3 ], + "address": [ "GENERAL", "UNSIGNED", 4 ], + "offset": [ "URAW", 5 ], + "src": [ "RAW", 6 ] + }, + "genx_predefined_surface": { + "opc": "ISA_MOVS", + "exec_size": [ "EXECSIZE" ], + "dst": [ "SURFACE", 0 ], + "src0": [ "INT", 1 ], + "OPTIONS": [ "disable" ] + }, + "genx_va_1pixel_convolve": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS__VA_OP_CODE_1PIXEL_CONVOLVE", + "sub_opc": "VA_OP_CODE_1PIXEL_CONVOLVE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "uOffset": [ "GENERAL", 3 ], + "vOffset": [ "GENERAL", 4 ], + "mode": [ "BYTE", 5 ], + "offsets": [ "RAW", 6 ], + "dst": [ "RAW", 0 ] + }, + "genx_va_1pixel_convolve_1x1mode": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS__VA_OP_CODE_1PIXEL_CONVOLVE", + "sub_opc": "VA_OP_CODE_1PIXEL_CONVOLVE", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "uOffset": [ "GENERAL", 3 ], + "vOffset": [ "GENERAL", 4 ], + "mode": [ "LITERAL", 3 ], + "offsets": [ "NULLRAW" ], + "dst": [ "RAW", 0 ] + }, + "genx_va_1d_convolve_vertical": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS__VA_OP_CODE_1D_CONVOLVE_VH", + "sub_opc": "VA_OP_CODE_1D_CONVOLVE_VERTICAL", + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "uOffset": [ "GENERAL", 3 ], + "vOffset": [ "GENERAL", 4 ], + "mode": [ "BYTE", 5 ], + "direction": [ "LITERAL", "VA_V_DIRECTION" ], + "dst": [ "RAW", 0 ] + }, + "genx_va_1d_convolve_horizontal": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS__VA_OP_CODE_1D_CONVOLVE_VH", + "sub_opc": "VA_OP_CODE_1D_CONVOLVE_HORIZONTAL", + "sampler": [ 
"SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "uOffset": [ "GENERAL", 3 ], + "vOffset": [ "GENERAL", 4 ], + "mode": [ "BYTE", 5 ], + "direction": [ "LITERAL", "VA_H_DIRECTION" ], + "dst": [ "RAW", 0 ] + }, + "genx_va_hdc_1d_convolve_horizontal": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS__ISA_HDC_1DCONV_VH", + "sub_opc": [ "LITERAL", "ISA_HDC_1DCONV_H" ], + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "uOffset": [ "GENERAL", 3 ], + "vOffset": [ "GENERAL", 4 ], + "properties": [ "BYTE", 5 ], + "direction": [ "LITERAL", "VA_H_DIRECTION" ], + "dstSurface": [ "SURFACE", 6 ], + "xOffset": [ "GENERAL", 7 ], + "yOffset": [ "GENERAL", 8 ] + }, + "genx_va_hdc_1d_convolve_vertical": { + "opc": "ISA_VA_SKL_PLUS", + "gen_opc": "ISA_VA_SKL_PLUS__ISA_HDC_1DCONV_VH", + "sub_opc": [ "LITERAL", "ISA_HDC_1DCONV_H" ], + "sampler": [ "SAMPLER", 1 ], + "surface": [ "SURFACE", 2 ], + "uOffset": [ "GENERAL", 3 ], + "vOffset": [ "GENERAL", 4 ], + "properties": [ "BYTE", 5 ], + "direction": [ "LITERAL", "VA_V_DIRECTION" ], + "dstSurface": [ "SURFACE", 6 ], + "xOffset": [ "GENERAL", 7 ], + "yOffset": [ "GENERAL", 8 ] + }, + "genx_simdcf_get_em":{ + "opc": "ISA_CMP_E", + "exec_size": [ "EXECSIZE_FROM_ARG", 1], + "src0": "CreateImmOpndFromUInt(ISA_TYPE_UB, 1)", + "src1": "CreateImmOpndFromUInt(ISA_TYPE_UB, 1)", + "dst" : [ "Z_PREDICATE", 0] + } + }, + + "OPCODE_GEN": { + "ISA_VA_SKL_PLUS__ISA_HDC_1DCONV_VH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCConvolve1D", + "sampler", + "surface", + "uOffset", + "vOffset", + "(HDCReturnFormat)properties", + "direction", + "dstSurface", + "xOffset", + "yOffset" + ] + ] + ], + "ISA_VA_SKL_PLUS__VA_OP_CODE_1D_CONVOLVE_VH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAConvolve1D", + "sampler", + "surface", + "uOffset", + "vOffset", + "(CONVExecMode)mode", + "direction", + "dst" + ] + ] + ], + "ISA_VA_SKL_PLUS__VA_OP_CODE_1PIXEL_CONVOLVE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAConvolve1Pixel", + "sampler", + "surface", + "uOffset", + "vOffset", + "(CONV1PixelExecMode)mode", + "offsets", + "dst" + ] + ] + ], + "ISA_MOV": [ + [ "CISA_CALL", + [ "Kernel->AppendVISADataMovementInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr" + ] + ] + ], + "ISA_MOVS": [ + [ "CISA_CALL", + [ "Kernel->AppendVISADataMovementInst", + "opc", + "nullptr", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "(VISA_VectorOpnd*)dst", + "src0", + "nullptr" + ] + ] + ], + "ISA_MEDIA_LD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessMediaLoadStoreInst", + "opc", + "(MEDIA_LD_mod)modifiers", + "surface", + "block_width", + "block_height", + "x_offset", + "y_offset", + "dst", + "(CISA_PLANE_ID)plane" + ] + ] + ], + "ISA_MEDIA_ST": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessMediaLoadStoreInst", + "opc", + "(MEDIA_LD_mod)modifiers", + "surface", + "block_width", + "block_height", + "x_offset", + "y_offset", + "src", + "(CISA_PLANE_ID)plane" + ] + ] + ], + "ISA_OWORD_LD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessOwordLoadStoreInst", + "opc", + "vISA_EMASK_M1", + "surface", + "log2_owords", + "offset", + "dst" + ] + ] + ], + "ISA_OWORD_ST": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessOwordLoadStoreInst", + "opc", + "vISA_EMASK_M1", + "surface", + "log2_owords", + "offset", + "src" + ] + ] + ], + "ISA_DWORD_ATOMIC": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessDwordAtomicInst", + "pred", + "sub_opc", + "false", + "exec_mask", + "exec_size", + "surface", + "offset", + 
"src", + "src1", + "dst" + ] + ] + ], + "ISA_MAD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "src2" + ] + ] + ], + "ISA_SETP": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASetP", + "exec_mask", + "exec_size", + "dst", + "src0" + ] + ] + ], + "ISA_FMINMAX": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAMinMaxInst", + "(CISA_MIN_MAX_SUB_OPCODE)flag_for_max", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1" + ] + ] + ], + "ISA_POW": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_ADDR_ADD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAAddrAddInst", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1" + ] + ] + ], + "ISA_3D_SAMPLE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISA3dSampler", + "(VISASampler3DSubOpCode)sampling3d_opcode", + "false", + "false", + "false", + "pred", + "exec_mask", + "exec_size", + "static_cast(channel_mask)", + "aoffimmi_value", + "sampler", + "surface", + "dst", + "number_of_additional_operands", + "raw_operands" + ] + ] + ], + "ISA_SQRT": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_RSQRT": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_SQRTM": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_INV": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_LOG": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_EXP": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_SCATTER_SCALED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessScatterScaledInst", + "opc", + "pred", + "exec_mask", + "exec_size", + "(VISA_SVM_Block_Num)log2_num_blocks", + "surface", + "global_offset", + "element_offset", + "src" + ] + ] + ], + "ISA_SCATTER4_SCALED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessGather4Scatter4ScaledInst", + "opc", + "pred", + "exec_mask", + "exec_size", + "convertChannelMaskToVisaType(channel_mask)", + "surface", + "global_offset", + "element_offset", + "src" + ] + ] + ], + "ISA_SCATTER4_TYPED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessGather4Scatter4TypedInst", + "opc", + "pred", + "convertChannelMaskToVisaType(channel_mask)", + "exec_mask", + "exec_size", + "surface", + "U_pixel_address", + "V_pixel_address", + "R_pixel_address", + "LOD", + "src" + ] + ] + ], + "ISA_GATHER_SCALED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessScatterScaledInst", + "opc", + "pred", + "exec_mask", + "exec_size", + "(VISA_SVM_Block_Num)log2_num_blocks", + "surface", + "global_offset", + "element_offset", + 
"dst" + ] + ] + ], + "ISA_GATHER4_SCALED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessGather4Scatter4ScaledInst", + "opc", + "pred", + "exec_mask", + "exec_size", + "convertChannelMaskToVisaType(channel_mask)", + "surface", + "global_offset", + "element_offset", + "dst" + ] + ] + ], + "ISA_GATHER4_TYPED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASurfAccessGather4Scatter4TypedInst", + "opc", + "pred", + "convertChannelMaskToVisaType(channel_mask)", + "exec_mask", + "exec_size", + "surface", + "U_pixel_address", + "V_pixel_address", + "R_pixel_address", + "LOD", + "dst" + ] + ] + ], + "ISA_3D_TYPED_ATOMIC": [ + [ "CISA_CALL", + [ "Kernel->AppendVISA3dTypedAtomic", + "sub_opc", + "sub_opc & (1<<5)", + "pred", + "exec_mask", + "exec_size", + "surface", + "U", + "V", + "R", + "LOD", + "src0", + "src1", + "dst" + ] + ] + ], + "ISA_SAD2ADD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "src2" + ] + ] + ], + "ISA_SAD2": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_WAIT": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAWaitInst", + "mask" + ] + ] + ], + "ISA_AVS": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAMEAVS", + "surface", + "sampler", + "(VISAChannelMask)channel_mask", + "U_pixel_address", + "V_pixel_address", + "deltaU", + "deltaV", + "u2d", + "v2d", + "groupID", + "verticalBlockNumber", + "(OutputFormatControl)output_format_control", + "(AVSExecMode)execMode", + "IEFByPass", + "dst" + ] + ] + ], + "ISA_SAMPLE_UNORM": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASISampleUnorm", + "surface", + "sampler", + "(VISAChannelMask)channel_mask", + "U_pixel_address", + "V_pixel_address", + "deltaU", + "deltaV", + "dst", + "getChannelOutputFormat(channel_mask)" + ] + ] + ], + "ISA_SIN": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_COS": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_AVG": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_FENCE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASyncInst", + "opc", + "mask" + ] + ] + ], + "ISA_ADD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_LZD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_RAW_SEND": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAMiscRawSend", + "pred", + "exec_mask", + "exec_size", + "modifier_sendc", + "extended_message_descriptor", + "numsrc", + "numdst", + "desc", + "src", + "dst" + ] + ] + ], + "ISA_RAW_SENDS": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAMiscRawSends", + "pred", + "exec_mask", + "exec_size", + "modifier_sendc", + "FFID", + "extended_message_descriptor", + "numsrc", + "numsrc2", + "numdst", + "desc", + "src", + "src2", + "dst", + "false" 
+ ] + ] + ], + "ISA_RNDD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_RNDE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_RNDU": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_RNDZ": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_MUL": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_MULH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_SHL": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_ROL": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_ROR": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_BFE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "src2", + "nullptr" + ] + ] + ], + "ISA_BFI": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "src2", + "src3" + ] + ] + ], + "ISA_VA_MINMAX_FOPCODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAMinMax", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "Min_Max_Enable", + "Destination" + ] + ] + ], + "ISA_VA_MINMAXFILTER_FOPCODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAMinMaxFilter", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(OutputFormatControl)output_size", + "(MMFExecMode)return_data_format", + "Min_Max_Enable", + "Destination" + ] + ] + ], + "ISA_VA_Centroid_FOPCODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVACentroid", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "vSize", + "Destination" + ] + ] + ], + "ISA_VA_BoolCentroid_FOPCODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVABooleanCentroid", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "vSize", + "hSize", + "Destination" + ] + ] + ], + "ISA_VA_SKL_PLUS_ISA_HDC_1PIXELCONV": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCConvolve1Pixel", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(HDCReturnFormat)pixel_size", + "offsets", + "destination_surface", + "destination_x_offset", + "destination_y_offset" + ] + ] + ], + 
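+ // Each OPCODE_GEN entry maps a (possibly compound) opcode to the vISA kernel builder call that + // emits the instruction; the names inside the call refer to the variables produced by the argument + // generators of the intrinsic being translated (see ARGUMENTS_GEN below and the description in + // cisa_gen_intrinsics.py). Lines containing "//" are stripped by that script before JSON parsing.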
"ISA_VA_SKL_PLUS_ISA_HDC_CONV": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCConvolve", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(HDCReturnFormat)(properties & 0xf)", + "(CONVHDCRegionSize)(properties >> 4)", + "destination_surface", + "destination_x_offset", + "destination_y_offset" + ] + ] + ], + "ISA_VA_SKL_PLUS_ISA_HDC_LBPCORRELATION": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCLBPCorrelation", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "horizontal_disparity", + "destination_surface", + "destination_x_offset", + "destination_y_offset" + ] + ] + ], + "ISA_VA_SKL_PLUS_ISA_HDC_LBPCREATION": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCLBPCreation", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(LBPCreationMode)mode", + "destination_surface", + "destination_x_offset", + "destination_y_offset" + ] + ] + ], + "ISA_VA_SKL_PLUS_ISA_HDC_MMF": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCMinMaxFilter", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(HDCReturnFormat)return_data_format", + "(MMFEnableMode)minmax_enable_mode", + "destination_surface", + "destination_x_offset", + "destination_y_offset" + ] + ] + ], + "ISA_VA_SKL_PLUS_VA_OP_CODE_CORRELATION_SEARCH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVACorrelationSearch", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "normalized_vertical_origin", + "normalized_horizontal_origin", + "x_direction_size", + "y_direction_size", + "x_direction_search_size", + "y_direction_search_size", + "Destination" + ] + ] + ], + "ISA_VA_SKL_PLUS_VA_OP_CODE_FLOOD_FILL": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAFloodFill", + "Is8Connect", + "pixel_mask_horizontal_direction", + "pixel_mask_vertical_direction_left", + "pixel_mask_vertical_direction_right", + "loop_count", + "Destination" + ] + ] + ], + "ISA_VA_SKL_PLUS_VA_OP_CODE_LBP_CORRELATION": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVALBPCorrelation", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "horizontal_disparity", + "Destination" + ] + ] + ], + "ISA_VA_SKL_PLUS_VA_OP_CODE_LBP_CREATION": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVALBPCreation", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(LBPCreationMode)mode", + "Destination" + ] + ] + ], + "ISA_3D_LOAD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISA3dLoad", + "(VISASampler3DSubOpCode)sampling3d_opcode", + "false", + "pred", + "exec_mask", + "exec_size", + "convertChannelMaskToVisaType(channel_mask)", + "aoffimmi_value", + "surface", + "dst", + "number_of_additional_operands", + "raw_operands" + ] + ] + ], + "ISA_FRC": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_VA_Convolve_FOPCODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAConvolve", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(CONVExecMode)(properties & 0x3)", + "((properties >> 4) & 0x1)", + "dst" + ] + ] + ], + "ISA_VA_ERODE_FOPCODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAErodeDilate", + "VA_DILATE", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "(EDExecMode)properties", + "dst" + ] + ] + ], + "ISA_VA_SKL_PLUS_ISA_HDC_ERODE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCErodeDilate", + 
"VA_ERODE", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "dstSurface", + "xOffset", + "yOffset" + ] + ] + ], + "ISA_VA_SKL_PLUS_ISA_HDC_DILATE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAVAHDCErodeDilate", + "VA_DILATE", + "sampler", + "surface", + "normalized_x_co_ordinate", + "normalized_y_co_ordinate", + "dstSurface", + "xOffset", + "yOffset" + ] + ] + ], + "ISA_BARRIER": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASyncInst", + "opc", + "0" + ] + ] + ], + "ISA_YIELD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASyncInst", + "opc", + "0" + ] + ] + ], + "ISA_SAMPLR_CACHE_FLUSH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASyncInst", + "opc", + "0" + ] + ] + ], + "ISA_SBARRIER": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASplitBarrierInst", + "signal_flag != 0" + ] + ] + ], + "ISA_BF_CVT": [ + [ "CISA_CALL", + [ "Kernel->AppendVISADataMovementInst", + "opc", + "nullptr", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr" + ] + ] + ], + "ISA_BFREV": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_CBIT": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_DIVM": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_DP2": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_DP3": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_DP4": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_DP4A": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "src2" + ] + ] + ], + "ISA_DPH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_FBH": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_FBL": [ + [ "CISA_CALL", + [ "Kernel->AppendVISALogicOrShiftInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "nullptr", + "nullptr", + "nullptr" + ] + ] + ], + "ISA_LINE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_LOAD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASILoad", + "surface", + "convertChannelMaskToVisaType(channel_mask & 0xf)", + "(channel_mask >> 4) & 0x3", + "U_pixel_address", + "V_pixel_address", + "R_pixel_address", + "dst" + ] + ] + ], + "ISA_SAMPLE": [ + [ 
"CISA_CALL", + [ "Kernel->AppendVISASISample", + "vISA_EMASK_M1", + "surface", + "sampler", + "convertChannelMaskToVisaType(channel_mask)", + "(channel_mask >> 4) & 0x3", + "U_pixel_address", + "V_pixel_address", + "R_pixel_address", + "dst" + ] + ] + ], + "ISA_LRP": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "src2" + ] + ] + ], + "ISA_PLANE": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAArithmeticInst", + "opc", + "pred", + "Mod & MODIFIER_SAT", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1", + "nullptr" + ] + ] + ], + "ISA_SVM_SVM_ATOMIC": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmAtomicInst", + "pred", + "exec_mask", + "exec_size", + "sub_opc", + "32", + "address", + "src0", + "src1", + "dst" + ] + ] + ], + "ISA_SVM_SVM_BLOCK_LD": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmBlockLoadInst", + "VISA_Oword_Num(log2_owords & 0x7)", + "(log2_owords & 8)", + "address", + "dst" + ] + ] + ], + "ISA_SVM_SVM_BLOCK_ST": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmBlockStoreInst", + "VISA_Oword_Num(log2_owords)", + "(log2_owords & 8)", + "address", + "src" + ] + ] + ], + "ISA_SVM_SVM_GATHER": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmGatherInst", + "pred", + "exec_mask", + "exec_size", + "(VISA_SVM_Block_Type)block_size_inferred_from_dst", + "(VISA_SVM_Block_Num)log2_num_blocks", + "address", + "dst" + ] + ] + ], + "ISA_SVM_SVM_GATHER4SCALED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmGather4ScaledInst", + "pred", + "exec_mask", + "exec_size", + "convertChannelMaskToVisaType(channel_mask)", + "address", + "offset", + "dst" + ] + ] + ], + "ISA_SVM_SVM_SCATTER": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmScatterInst", + "pred", + "exec_mask", + "exec_size", + "(VISA_SVM_Block_Type)block_size_inferred_from_dst", + "(VISA_SVM_Block_Num)log2_num_blocks", + "address", + "src" + ] + ] + ], + "ISA_SVM_SVM_SCATTER4SCALED": [ + [ "CISA_CALL", + [ "Kernel->AppendVISASvmScatter4ScaledInst", + "pred", + "exec_mask", + "exec_size", + "convertChannelMaskToVisaType(channel_mask)", + "address", + "offset", + "src" + ] + ] + ], + "ISA_CMP_E": [ + [ "CISA_CALL", + [ "Kernel->AppendVISAComparisonInst", + "opc", + "exec_mask", + "exec_size", + "dst", + "src0", + "src1" + ] + ] + ] + }, + "ARGUMENTS_GEN": { + "EXECSIZE": "GetExecSize(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_GE2": "GetExecSize(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_GE4": "GetExecSize(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_GE8": "GetExecSize(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_NOT2": "GetExecSize(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_NOMASK": "GetExecSize(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_FROM_ARG": "GetExecSizeFromArg(II::ArgInfo({args}), &exec_mask)", + "EXECSIZE_FROM_BYTE": "GetExecSizeFromByte(II::ArgInfo({args}), &exec_mask)", + "NULLRAW": "CreateNullRawOperand(II::ArgInfo({args}))", + "MEDIAHEIGHT": "GetMediaHeght(II::ArgInfo({args}))", + "IMPLICITPRED": "CreateImplicitPredication(II::ArgInfo({args}))", + "GENERAL": "CreateOperand(II::ArgInfo({args}))", + "ADDRESS": "CreateAddressOperand(II::ArgInfo({args}))", + "RAW": "CreateRawOperand(II::ArgInfo({args}))", + "URAW": "CreateRawOperand(II::ArgInfo({args} | II::RAW_UNSIGNED))", + "SRAW": "CreateRawOperand(II::ArgInfo({args} | II::RAW_SIGNED))", + "SURFACE": "CreateSurfaceOperand(II::ArgInfo({args}))", + "SAMPLER": "CreateSamplerOperand(II::ArgInfo({args}))", + "PREDICATION": 
"CreatePredication(II::ArgInfo({args}))", + "PREDICATE": "GetPredicateVar(II::ArgInfo({args}))", + "Z_PREDICATE": "GetZeroedPredicateVar(II::ArgInfo({args}))", + "BYTE": "GetUnsignedValue(II::ArgInfo({args}))", + "SHORT": "GetUnsignedValue(II::ArgInfo({args}))", + "INT": "GetUnsignedValue(II::ArgInfo({args}))", + "LOG2OWORDS": "GetOwords(II::ArgInfo({args}))", + "LOG2OWORDS_PLUS_8": "GetOwords(II::ArgInfo({args})) + 8", + "SVMGATHERBLOCKSIZE": "GetSvmGatherBlockSize(II::ArgInfo({args}))", + "TWOADDR": "ProcessTwoAddr(II::ArgInfo({args}))", + "CONSTVI1ASI32": "ConstVi1Asi32(II::ArgInfo({args}))", + "ARGCOUNT": "GetArgCount(II::ArgInfo({args}))", + "NUMGRFS": "GetNumGrfs(II::ArgInfo({args}))", + "SAMPLECHMASK": "GetSampleChMask(II::ArgInfo({args}))", + "RAW_OPERANDS": ["VISA_RawOpnd* {dst}[16]", "CreateRawOperands(II::ArgInfo({args}), {dst})"], + "LITERAL": "{value1}", + "ISBARRIER": "HasBarrier = true", + "SKIP": null + } +} diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.py b/IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.py new file mode 100755 index 000000000000..2e099df34b8c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXCodeGen/Utils/cisa_gen_intrinsics.py @@ -0,0 +1,230 @@ +#!/usr/bin/python3 +""" +Usage: cisa_gen_intrinsics.py + +This script gets intrinsics description from JSON file specified by argument +and generates two files GenXIntrinsicInfoTable.inc and GenXIntrinsicsBuildMap.inc into +path specified by argument. + +JSON file must contain following mandatory fields: INTRINSICS, OPCODE_GEN and ARGUMENTS_GEN. + +*** Field INTRINSICS + Contains description of all intrinsics. Each intrinsic is described in following format: + intrinsic_name : { + opc: VISA opcode corresponding to the intrinsic + gen_opc: optional field, it aims to distinguish generators of complex opcodes which may + contain sub-opcode field + OPTIONS: list of intrinsics options. Currently, supported only 'disable' value, which means + that intrinsic will be skipped at all. + : see description below + } + + Each argument is a [key: list] format, where key is a name of Argument, list is a command + for generator. + First field of generator command is a generator name, it tells how to generate code for + fetching an argument value. Each argument generator is described in ARGUMENTS_GEN map. + + For example: + "Surface": ["GENERAL", "UNSIGNED", 10], + Here GENERAL is generator name by which will be determined (from "ARGUMENTS_GEN") what code + to generate for getting argument value. + Generated code: + auto Surface = CreateOperand(II::ArgInfo(UNSIGNED | 10)); + or for GenXIntrinsicInfoTable.inc: + GENERAL | UNSIGNED | 10, + + To add new intrinsic you need to add new description into INTRINSICS map. If it contains + opcode which is absent in opcode_map you also need to add item for new opcode to OPCODE_GEN. 
+ + For example, let's add a new intrinsic with a new opcode and one new argument generator (NEW_PREDICATION): + "INTRINSICS": + "genx_new": { + "opc": "ISA_NEW", + "exec_size": ["EXECSIZE_FROM_ARG", 1], + "pred": ["NEW_PREDICATION", 1], + "DataOrder": ["BYTE", 5], + "Surface": ["GENERAL", "UNSIGNED", 10], + "DstData": ["RAW", 0], + "Src1Data": ["NULLRAW"] + }, + "OPCODE_GEN": + ISA_NEW: "CISA_CALL(Kernel->AppendNew(exec_size, pred, DataOrder, Src1Data, DstData, Surface));" + "ARGUMENTS_GEN": + "NEW_PREDICATION": "CreateNewPredication(II::ArgInfo({args}))", + We also need to add a new function or lambda named CreateNewPredication to GenXCisaBuilder.cpp. + +*** Field ARGUMENTS_GEN + This field is only needed to generate the CISA building code (GenXIntrinsicsBuildMap.inc). + Pattern keys that can be used inside a generator: + args - string with the arguments that are passed to the ArgInfo constructor. + value1 - first value in the argument list, needed for the LITERAL generator + dst - name of the variable to which the argument value will be assigned + +*** Field OPCODE_GEN + This field is only needed to generate the CISA building code (GenXIntrinsicsBuildMap.inc). + The final part of the generated code for a single intrinsic is a call to the Finalizer function that + builds the instruction itself, so each item of this map simply maps an opcode to its build function. + An opcode need not be a real VISA opcode; for example, ISA_VA_SKL_PLUS uses different build functions + with different signatures depending on its sub-opcode, so compound opcodes exist for such cases. +""" + +import sys +import re +import json +from collections import OrderedDict + + +HEADER = '''/****************************************************************************** + * AUTOGENERATED FILE, DO NOT EDIT! + * Generated by GenXUtilBuild project + */ +''' + +def open_and_delete_comments(dscr_filename): + with open(dscr_filename, "r") as jsonfile: + data = jsonfile.readlines() + jsonwithoutcomments = filter(lambda line: "//" not in line, data) + stringjson = "".join(jsonwithoutcomments) + return stringjson + +def generate(dscr_filename, out_path): + special_keys = ('gen_opc', 'OPTIONS') + descr = json.loads(open_and_delete_comments(dscr_filename), object_pairs_hook=OrderedDict) + opcode_gen = descr['OPCODE_GEN'] + arguments_gen = descr['ARGUMENTS_GEN'] + intrinsics = descr['INTRINSICS'] + + # Convert list to function call string + # Example: [ Func, arg1, arg2] to Func(arg1, arg2) + def gen2str(value): + if isinstance(value, list): + args = [] + for v in value[1:]: + args.append(gen2str(v)) + return "{}({})".format(value[0], ', '.join(args)) + return str(value) + + # Recursively search regex in lists + def gen_search(value, regex): + if isinstance(value, list): + for v in value: + if gen_search(v, regex): + return True + return False + return bool(re.search(regex, value)) + + def isstrinst(opc_gen): + isalt = True + if sys.version_info[0] >= 3: + isalt = isinstance(opc_gen, bytes) + else: + isalt = isinstance(opc_gen, unicode) + return bool(isinstance(opc_gen, str) or isalt) + + with open(out_path + '/GenXIntrinsicInfoTable.inc', 'w') as file: + file.write(HEADER) + for name, intr in intrinsics.items(): + if 'OPTIONS' in intr and 'disable' in intr['OPTIONS']: + continue + if name == "fma": + file.write('Intrinsic::{},\n'.format(name)) + else: + file.write('GenXIntrinsic::{},\n'.format(name)) + for key, value in intr.items(): + if key in special_keys: + continue + elif key == 'opc': + file.write('LITERAL | {},\n'.format(value)) + elif isinstance(value, list):
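+ # A list-valued descriptor such as ["GENERAL", "UNSIGNED", 1] is emitted into + # GenXIntrinsicInfoTable.inc as a single OR-ed word, e.g. "GENERAL | UNSIGNED | 1,"; + # the RAW_OPERANDS marker is meta-information for the build map only and is filtered out here.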
+ file.write('{},\n'.format(' | '.join([str(x) for x in value if x != 'RAW_OPERANDS']))) + else: + # skip other + pass + file.write('END,\n\n') + + + with open(out_path + '/GenXIntrinsicsBuildMap.inc', 'w') as file: + file.write(HEADER) + file.write('switch(IntrinID) {\n\n') + + for name, intr in intrinsics.items(): + gen_opc = intr.get('gen_opc') + if not gen_opc: + gen_opc = intr['opc'] + + opc_gen = opcode_gen.get(gen_opc) + if not opc_gen: + print(intr) + raise RuntimeError("Instruction generator not found") + if isstrinst(opc_gen): + opc_gen = [opc_gen] + assert isinstance(opc_gen, list) + + if 'OPTIONS' in intr and 'disable' in intr['OPTIONS']: + continue + + if name == "fma": + file.write(' case llvm::Intrinsic::' + name + ': {\n') + else: + file.write(' case llvm::GenXIntrinsic::' + name + ': {\n') + + for key, value in intr.items(): + if key in special_keys: + continue + + # no_assign means that there is no variable that need to be assigned + no_assign = key in ('twoaddr', 'nobarrier') + + # skip items that are not exist in generator string + if not no_assign and not gen_search(opc_gen, r'\b%s\b'%key): + continue + + if key == 'opc': + replace = value + elif isinstance(value, list): + replace = arguments_gen.get(value[0]) + if not replace: + print(value) + raise RuntimeError('Key not found!') + if not replace: + continue + context = { 'value1': value[1] if len(value) > 1 else None, 'dst': key, + 'args': '{}'.format(' | ').join( + ['II::' + x if isstrinst(x) + else str(x) for x in value if x != 'RAW_OPERANDS']) } + if isinstance(replace, list): + replace = [x.format(**context) for x in replace] + else: + replace = replace.format(**context) + else: + replace = value + assert replace, 'Unknown token' + + if isinstance(replace, list): + for replace_item in replace: + file.write(' ' + replace_item + ';\n') + else: + assign = '' if no_assign else 'auto ' + key + ' = ' + file.write(' ' + assign + replace + ';\n') + + for g in opc_gen: + file.write(' ' + gen2str(g) + ';\n') + file.write(' } break;\n\n') + + file.write(''' default: + CI->print(errs()); + errs() << '\\n'; + report_fatal_error("Unsupported intrinsic!"); + break; +}''') + +def main(): + if len(sys.argv) > 1 and sys.argv[1] == '--help': + print(__doc__) + sys.exit(0) + assert len(sys.argv) > 2, "Missing arguments! Usage: cisa_gen_intrinsics.py " + generate(sys.argv[1], sys.argv[2]) + +if __name__ == '__main__': + main() diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/ConstantFoldingGenX.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/ConstantFoldingGenX.cpp new file mode 100644 index 000000000000..7be7ca29da15 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/ConstantFoldingGenX.cpp @@ -0,0 +1,285 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file defines a routine for folding a GenX intrinsic call into a constant. +// +//===----------------------------------------------------------------------===// + +#include "vc/GenXOpts/GenXAnalysis.h" +#include "vc/GenXOpts/Utils/CMRegion.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "genx-constantfolding" + +using namespace llvm; + +/*********************************************************************** + * canConstantFoldGenXIntrinsic : Return true if it is even possible to fold + * a call to the specified GenX intrinsic + */ +bool llvm::canConstantFoldGenXIntrinsic(unsigned IID) +{ + switch (IID) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + // The wrregion case specifically excludes genx_wrconstregion + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_all: + case GenXIntrinsic::genx_any: + return true; + } + return false; +} + +/*********************************************************************** + * constantFoldRdRegion : attempt to constant fold rdregion + */ +static Constant *constantFoldRdRegion(Type *RetTy, + ArrayRef Operands, + const CMRegion &R, const DataLayout *DL) { + Constant *Input = Operands[GenXIntrinsic::GenXRegion::OldValueOperandNum]; + // The input can be a ConstantExpr if we are being called from + // CallAnalyzer. + if (isa(Input)) + return nullptr; + // If the input value is undef, just return undef. + if (isa(Input)) + return UndefValue::get(RetTy); + // Parse the region parameters. + unsigned WholeNumElements = Input->getType()->getVectorNumElements(); + auto OffsetC = dyn_cast( + Operands[GenXIntrinsic::GenXRegion::RdIndexOperandNum]); + if (!OffsetC) + return nullptr; + int RetElemSize = RetTy->getScalarType()->getPrimitiveSizeInBits() / 8; + if (!RetElemSize) { + assert(RetTy->getScalarType()->isPointerTy() && + RetTy->getScalarType()->getPointerElementType()->isFunctionTy()); + RetElemSize = DL->getTypeSizeInBits(RetTy) / 8; + } + unsigned Offset = 0; + if (!isa(OffsetC->getType())) + Offset = dyn_cast(OffsetC)->getZExtValue() / RetElemSize; + else + assert(OffsetC->getType()->getVectorNumElements() == R.NumElements); + if (Offset >= WholeNumElements) + return UndefValue::get(RetTy); // out of range index + if (!isa(RetTy)) + return Input->getAggregateElement(Offset); + // Gather the elements of the region being read. 
+ SmallVector Values; + unsigned RowIdx = Offset; + unsigned Idx = RowIdx; + unsigned NextRow = R.Width; + for (unsigned i = 0; i != R.NumElements; ++i) { + if (i == NextRow) { + NextRow += R.Width; + RowIdx += R.VStride; + Idx = RowIdx; + } + if (isa(OffsetC->getType())) { + auto EltOffset = + dyn_cast(OffsetC->getAggregateElement(i))->getZExtValue(); + EltOffset = EltOffset / + (RetTy->getScalarType()->getPrimitiveSizeInBits() / 8); + Idx += EltOffset; + } + if (Idx >= WholeNumElements) + // push undef value if idx is out of bounds + Values.push_back(UndefValue::get(RetTy->getScalarType())); + else + // Get the element value and push it into Values. + Values.push_back(Input->getAggregateElement(Idx)); + Idx += R.Stride; + } + return ConstantVector::get(Values); +} + +/*********************************************************************** + * constantFoldWrRegion : attempt to constant fold Wrregion + */ +static Constant *constantFoldWrRegion(Type *RetTy, + ArrayRef Operands, + const CMRegion &R, const DataLayout *DL) { + Constant *OldValue = Operands[GenXIntrinsic::GenXRegion::OldValueOperandNum]; + Constant *NewValue = Operands[GenXIntrinsic::GenXRegion::NewValueOperandNum]; + // The inputs can be ConstantExpr if we are being called from + // CallAnalyzer. + if (isa(OldValue) || isa(NewValue)) + return nullptr; + assert(RetTy == OldValue->getType()); + auto OffsetC = + dyn_cast(Operands[GenXIntrinsic::GenXRegion::WrIndexOperandNum]); + if (!OffsetC) + return nullptr; // allow for but do not const fold when index is vector + int RetElemSize = RetTy->getScalarType()->getPrimitiveSizeInBits() / 8; + if (!RetElemSize) { + assert(RetTy->getScalarType()->isPointerTy() && + RetTy->getScalarType()->getPointerElementType()->isFunctionTy()); + RetElemSize = DL->getTypeSizeInBits(RetTy) / 8; + } + unsigned Offset = OffsetC->getSExtValue() / RetElemSize; + if (isa(OldValue) && R.isContiguous() && (Offset == 0)) { + // If old value is undef and new value is splat, and the result vector + // is no bigger than 2 GRFs, then just return a splat of the right type. + Constant *Splat = NewValue; + if (isa(NewValue->getType())) + Splat = NewValue->getSplatValue(); + if (Splat) + if (RetTy->getPrimitiveSizeInBits() <= 2 * 32 * 8) + return ConstantVector::getSplat(RetTy->getVectorNumElements(), Splat); + // If new value fills the whole vector, just return the new value. + if (NewValue->getType() == RetTy) + return NewValue; + } + unsigned WholeNumElements = RetTy->getVectorNumElements(); + // Gather the elements of the old value. + SmallVector Values; + for (unsigned i = 0; i != WholeNumElements; ++i) + Values.push_back(OldValue->getAggregateElement(i)); + // Insert the elements of the new value. 
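+ // Same row/column walk as in constantFoldRdRegion, but writing: the elements of the new value are + // scattered into the copy of the old value starting at Offset, stepping by Stride within a row and + // by VStride across rows; if an index falls out of bounds the values collected so far are returned + // unchanged.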
+ if (Offset >= Values.size()) + return UndefValue::get(RetTy); // out of range index + if (!isa(NewValue->getType())) + Values[Offset] = NewValue; + else { + unsigned RowIdx = Offset; + unsigned Idx = RowIdx; + unsigned NextRow = R.Width; + for (unsigned i = 0; i != R.NumElements; ++i) { + if (i == NextRow) { + NextRow += R.Width; + RowIdx += R.VStride; + Idx = RowIdx; + } + if (Idx >= WholeNumElements) + // return collected values even if idx is out of bounds + return ConstantVector::get(Values); + Values[Idx] = NewValue->getAggregateElement(i); + Idx += R.Stride; + } + } + return ConstantVector::get(Values); +} + +/*********************************************************************** + * constantFoldAll : constant fold llvm.genx.all + * constantFoldAny : constant fold llvm.genx.any + */ +static Constant *constantFoldAll(Type *RetTy, Constant *In) +{ + if (In->isAllOnesValue()) + return Constant::getAllOnesValue(RetTy); + return Constant::getNullValue(RetTy); +} +static Constant *constantFoldAny(Type *RetTy, Constant *In) +{ + if (!In->isNullValue()) + return Constant::getAllOnesValue(RetTy); + return Constant::getNullValue(RetTy); +} + +/*********************************************************************** + * ConstantFoldGenXIntrinsic : attempt to constant fold a call to the + * specified GenX intrinsic with the specified arguments, returning null if + * unsuccessful + */ +Constant *llvm::ConstantFoldGenXIntrinsic(unsigned IID, Type *RetTy, + ArrayRef Operands, ImmutableCallSite CS, const DataLayout *DL) +{ + Instruction *I = const_cast(CS.getInstruction()); + switch (IID) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: { + CMRegion R(I); + return constantFoldRdRegion(RetTy, Operands, R, DL); + } + // The wrregion case specifically excludes genx_wrconstregion + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: { + CMRegion R(I); + return constantFoldWrRegion(RetTy, Operands, R, DL); + } + case GenXIntrinsic::genx_all: + return constantFoldAll(RetTy, Operands[0]); + case GenXIntrinsic::genx_any: + return constantFoldAny(RetTy, Operands[0]); + } + return nullptr; +} + +/*********************************************************************** + * ConstantFoldGenX : attempt to constant fold genx intrinsics including + * its arguments, returning null if unsuccessful. + */ +Constant *llvm::ConstantFoldGenX(Instruction *I, const DataLayout &DL) { + LLVM_DEBUG(dbgs() << "Trying to fold " << *I << "\n"); + auto IID = GenXIntrinsic::getGenXIntrinsicID(I); + if (!canConstantFoldGenXIntrinsic(IID)) { + LLVM_DEBUG(dbgs() << "Fail: not a genx intrinsic\n"); + return nullptr; + } + + CallSite CS{I}; + auto CheckConst = [](const Use &A) { + Value *V = A.get(); + bool IsConst = isa(V); + if (!IsConst) + LLVM_DEBUG(dbgs() << "Fail: operand " << *V << " is not a constant\n"); + return IsConst; + }; + if (!std::all_of(CS.arg_begin(), CS.arg_end(), CheckConst)) + return nullptr; + + SmallVector ConstantArgs; + ConstantArgs.reserve(CS.arg_size()); + auto FoldOperand = [&DL](const Use &A) { + auto *C = cast(A.get()); + Constant *Folded = ConstantFoldConstant(C, DL); + if (Folded) + LLVM_DEBUG(dbgs() << "Folded operand " << *C << " to " << *Folded + << "\n"); + return Folded ? 
Folded : C; + }; + std::transform(CS.arg_begin(), CS.arg_end(), std::back_inserter(ConstantArgs), + FoldOperand); + + Constant *Folded = ConstantFoldGenXIntrinsic( + IID, CS.getFunctionType()->getReturnType(), ConstantArgs, CS, &DL); + if (Folded) + LLVM_DEBUG(dbgs() << "Successfully constant folded intruction to " + << *Folded << "\n"); + else + LLVM_DEBUG(dbgs() << "Failed to constant fold instruction\n"); + return Folded; +} diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/InstructionSimplifyGenX.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/InstructionSimplifyGenX.cpp new file mode 100644 index 000000000000..34fd453c9cf0 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMAnalysis/InstructionSimplifyGenX.cpp @@ -0,0 +1,269 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// This file defines a routine for simplifying a GenX intrinsic call to a +// constant or one of the operands. This is for cases where not all operands +// are constant; the constant operand cases are handled in ConstantFoldGenX.cpp. +// +//===----------------------------------------------------------------------===// + +#include "vc/GenXOpts/GenXAnalysis.h" +#include "vc/GenXOpts/GenXOpts.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/PassSupport.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" + +#define DEBUG_TYPE "genx-simplify" + +using namespace llvm; + +/*********************************************************************** + * SimplifyGenXIntrinsic : given a GenX intrinsic and a set of arguments, + * see if we can fold the result. + * + * ConstantFoldingGenX.cpp handles pure constant folding cases. This code + * only handles cases where not all operands are constant, but we can do + * some folding anyway. + * + * If this call could not be simplified, returns null. 
+ */ +Value *llvm::SimplifyGenXIntrinsic(unsigned IID, Type *RetTy, Use *ArgBegin, + Use *ArgEnd) { + switch (IID) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + // Identity rdregion can be simplified to its "old value" input. + if (RetTy + == ArgBegin[GenXIntrinsic::GenXRegion::OldValueOperandNum]->getType()) { + unsigned NumElements = RetTy->getVectorNumElements(); + unsigned Width = cast( + ArgBegin[GenXIntrinsic::GenXRegion::RdWidthOperandNum]) + ->getZExtValue(); + auto IndexV = dyn_cast( + ArgBegin[GenXIntrinsic::GenXRegion::RdIndexOperandNum]); + if (!IndexV) + return nullptr; + unsigned Index = 0; + if (!isa(IndexV->getType())) + Index = dyn_cast(IndexV)->getZExtValue() + / (RetTy->getScalarType()->getPrimitiveSizeInBits() / 8); + else + return nullptr; + if ((Index == 0 || Index >= NumElements) && + (Width == NumElements || Width == cast(ArgBegin[ + GenXIntrinsic::GenXRegion::RdVStrideOperandNum])->getSExtValue())) + if (NumElements == 1 || cast(ArgBegin[ + GenXIntrinsic::GenXRegion::RdStrideOperandNum])->getSExtValue()) + return ArgBegin[GenXIntrinsic::GenXRegion::OldValueOperandNum]; + } + // rdregion with splatted constant input can be simplified to a constant of + // the appropriate type, ignoring the possibly variable index. + if (auto C = dyn_cast( + ArgBegin[GenXIntrinsic::GenXRegion::OldValueOperandNum])) + if (auto Splat = C->getSplatValue()) { + if (auto VT = dyn_cast(RetTy)) + return ConstantVector::getSplat(VT->getNumElements(), Splat); + return Splat; + } + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + // The wrregion case specifically excludes genx_wrconstregion. + // Identity wrregion can be simplified to its "new value" input. + if (RetTy + == ArgBegin[GenXIntrinsic::GenXRegion::NewValueOperandNum]->getType()) { + if (auto CMask = dyn_cast(ArgBegin[ + GenXIntrinsic::GenXRegion::PredicateOperandNum])) { + if (CMask->isAllOnesValue()) { + unsigned NumElements = RetTy->getVectorNumElements(); + unsigned Width = cast( + ArgBegin[GenXIntrinsic::GenXRegion::WrWidthOperandNum]) + ->getZExtValue(); + auto IndexV = dyn_cast( + ArgBegin[GenXIntrinsic::GenXRegion::WrIndexOperandNum]); + if (!IndexV) + return nullptr; + unsigned Index = 0; + if (!isa(IndexV->getType())) + Index = dyn_cast(IndexV)->getZExtValue() + / (RetTy->getScalarType()->getPrimitiveSizeInBits() / 8); + else + return nullptr; + if ((Index == 0 || Index >= NumElements) && + (Width == NumElements || Width == cast(ArgBegin[ + GenXIntrinsic::GenXRegion::WrVStrideOperandNum])->getSExtValue())) + if (NumElements == 1 || cast(ArgBegin[ + GenXIntrinsic::GenXRegion::WrStrideOperandNum])->getSExtValue()) + return ArgBegin[GenXIntrinsic::GenXRegion::NewValueOperandNum]; + } + } + } + // Wrregion with constant 0 predicate can be simplified to its "old value" + // input. + if (auto CMask = dyn_cast(ArgBegin[ + GenXIntrinsic::GenXRegion::PredicateOperandNum])) + if (CMask->isNullValue()) + return ArgBegin[GenXIntrinsic::GenXRegion::OldValueOperandNum]; + // Wrregion writing a value that has just been read out of the same + // region in the same vector can be simplified to its "old value" input. + // This works even if the predicate is not all true. 
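The code that follows implements this read-back test. As a standalone sketch of the condition (hypothetical helper operating on the two calls directly, not part of the patch): the wrregion is redundant when its new value is a rdregion of the same old value and the four region operands, vstride, width, stride and index, are pairwise identical.

// Sketch only; assumes the surrounding file's `using namespace llvm;`.
// The four region operands are laid out consecutively starting at
// {Wr,Rd}VStrideOperandNum in the order vstride, width, stride, index.
static bool writesBackSameRegion(CallInst *Wr, CallInst *Rd) {
  for (unsigned i = 0; i != 4; ++i)
    if (Wr->getArgOperand(GenXIntrinsic::GenXRegion::WrVStrideOperandNum + i) !=
        Rd->getArgOperand(GenXIntrinsic::GenXRegion::RdVStrideOperandNum + i))
      return false;
  return true;
}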
+ if (auto RdR = dyn_cast(ArgBegin[ + GenXIntrinsic::GenXRegion::NewValueOperandNum])) { + if (auto RdRFunc = RdR->getCalledFunction()) { + Value *OldVal = ArgBegin[GenXIntrinsic::GenXRegion::OldValueOperandNum]; + if ((GenXIntrinsic::getGenXIntrinsicID(RdRFunc) == + GenXIntrinsic::genx_rdregioni || + GenXIntrinsic::getGenXIntrinsicID(RdRFunc) == + GenXIntrinsic::genx_rdregionf) && + RdR->getArgOperand(GenXIntrinsic::GenXRegion::OldValueOperandNum) + == OldVal) { + // Check the region parameters match between the rdregion and + // wrregion. There are 4 region parameters: vstride, width, stride, + // index. + bool CanSimplify = true; + for (unsigned i = 0; i != 4; ++i) { + if (ArgBegin[GenXIntrinsic::GenXRegion::WrVStrideOperandNum + i] + != RdR->getArgOperand( + GenXIntrinsic::GenXRegion::RdVStrideOperandNum + i)) { + CanSimplify = false; + break; + } + } + if (CanSimplify) + return OldVal; + } + } + } + break; + case GenXIntrinsic::genx_wrpredregion: + // wrpredregion with undef "new value" input is simplified to the "old + // value" input. + if (isa(ArgBegin[1])) + return ArgBegin[0]; + break; + } + return nullptr; +} + +/*********************************************************************** + * SimplifyGenX : given a GenX related instruction, see if we can fold + * the result. + * + * ConstantFoldingGenX.cpp handles pure constant folding cases. This code + * also handles cases where not all operands are constant. + * + * If this instruction could not be simplified, returns null. + */ +Value *llvm::SimplifyGenX(CallInst *I) { + CallSite CS{I}; + Value *V = CS.getCalledValue(); + Type *Ty = V->getType(); + if (auto *PTy = dyn_cast(Ty)) + Ty = PTy->getElementType(); + auto *FTy = cast(Ty); + auto *F = dyn_cast(V); + if (!F) + return nullptr; + + LLVM_DEBUG(dbgs() << "Trying to simplify " << *I << "\n"); + auto GenXID = GenXIntrinsic::getGenXIntrinsicID(F); + if (Value *Ret = SimplifyGenXIntrinsic(GenXID, FTy->getReturnType(), + CS.arg_begin(), CS.arg_end())) { + LLVM_DEBUG(dbgs() << "Simplified to " << *Ret << "\n"); + return Ret; + } + + LLVM_DEBUG(dbgs() << "Failed to simplify, trying to constant fold\n"); + Constant *C = ConstantFoldGenX(I, I->getModule()->getDataLayout()); + if (C) + LLVM_DEBUG(dbgs() << "Successfully folded to " << *C << "\n"); + else + LLVM_DEBUG(dbgs() << "Failed to constant fold instruction\n"); + return C; +} + +namespace llvm { +void initializeGenXSimplifyPass(PassRegistry &); +} + +namespace { +class GenXSimplify : public FunctionPass { +public: + static char ID; + + GenXSimplify() : FunctionPass(ID) { + initializeGenXSimplifyPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + } + + bool runOnFunction(Function &F) override; +}; +} // namespace + +bool GenXSimplify::runOnFunction(Function &F) { + const DataLayout &DL = F.getParent()->getDataLayout(); + bool Changed = false; + for (auto &BB : F) { + for (auto I = BB.begin(); I != BB.end();) { + Instruction *Inst = &*I++; + if (auto *CI = dyn_cast(Inst)) { + if (GenXIntrinsic::isGenXIntrinsic(CI)) { + if (Value *V = SimplifyGenX(CI)) { + CI->replaceAllUsesWith(V); + CI->eraseFromParent(); + Changed = true; + } + continue; + } + } + + if (Value *V = SimplifyInstruction(Inst, DL)) { + Inst->replaceAllUsesWith(V); + Inst->eraseFromParent(); + Changed = true; + } + } + } + return Changed; +} + +char GenXSimplify::ID = 0; +INITIALIZE_PASS(GenXSimplify, "genx-simplify", + "simplify genx specific instructions", false, false) + +FunctionPass 
*llvm::createGenXSimplifyPass() { return new GenXSimplify; } diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/GenXPacketize.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/GenXPacketize.cpp new file mode 100644 index 000000000000..50a49fba17df --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/GenXPacketize.cpp @@ -0,0 +1,1757 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +/// GenXPacketize +/// ------------- +/// +/// - Vectorize the SIMT functions +/// +/// - Vectorize the generic function called by the SIMT functions +/// +/// - Replace generic control-flow with SIMD control-flow +/// +//===----------------------------------------------------------------------===// + +#include "PacketBuilder.h" + +#include "llvmWrapper/Support/Alignment.h" + +#include "vc/GenXOpts/Utils/CMRegion.h" + +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/GenXIntrinsics/GenXSimdCFLowering.h" +#include "llvm/IR/Dominators.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Utils/Cloning.h" + +#include +#include +#include +#include + +using namespace pktz; + +namespace llvm { + +/// Packetizing SIMT functions +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// a) Look for functions with attributes CMGenXSIMT +/// If no such function, end the pass +/// +/// b) sort functions in call-graph topological order +/// find those generic functions called by the SIMT functions +/// find all the possible widthes those functions should be vectorized to +/// +/// c) find those uniform function arguments +/// arguments for non-SIMT functions are uniform +/// arguments for SIMT-entry are uniform +/// arguments for SIMT-functions are uniform if it is only defined by +/// callers' uniform argument. +/// +/// d) Run reg2mem pass to remove phi-nodes +/// This is because we need to generate simd-control-flow +/// after packetization. simd-control-flow lowering cannot handle phi-node. 
+/// +/// e) for uniform arguments +/// Mark the allocas for those arguments as uniform +/// Mark the load/store for those allocas as uniform +/// +/// f) vectorize generic functions to its SIMT width, callee first +/// - create the vector prototype +/// - clone the function-body into the vector prototype +/// - vectorize the function-body +/// - note: original function is kept because it may be used outside SIMT +/// +/// g) vectorize SIMT-entry functions +/// - no change of function arguments +/// - no cloning, direct-vectorization on the function-body +/// +/// h) SIMD-control-flow lowering +/// +/// i) run mem2reg pass to create SSA +/// +/// j) CMABI pass to remove global Execution-Mask +/// +class GenXPacketize : public ModulePass { +public: + static char ID; + explicit GenXPacketize() : ModulePass(ID) {} + ~GenXPacketize() { releaseMemory(); } + virtual StringRef getPassName() const override { return "GenX Packetize"; } + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequiredID(BreakCriticalEdgesID); + }; + bool runOnModule(Module &M) override; + void releaseMemory() override { + ReplaceMap.clear(); + UniformArgs.clear(); + UniformInsts.clear(); + FuncOrder.clear(); + FuncVectors.clear(); + FuncMap.clear(); + } + +private: + void findFunctionVectorizationOrder(Module *M); + + Value *getPacketizeValue(Value *OrigValue); + Value *getUniformValue(Value *OrigValue); + Function *getVectorIntrinsic(Module *M, unsigned id, std::vector &ArgTy); + Value *packetizeConstant(Constant *pConstant); + Value *packetizeGenXIntrinsic(Instruction *pInst); + Value *packetizeLLVMIntrinsic(Instruction *pInst); + Value *packetizeLLVMInstruction(Instruction *pInst); + Value *packetizeInstruction(Instruction *pInst); + + void replaceAllUsesNoTypeCheck(Value *pInst, Value *pNewInst); + void removeDeadInstructions(Function &F); + void fixupLLVMIntrinsics(Function &F); + + Function *vectorizeSIMTFunction(Function *F, unsigned Width); + bool vectorizeSIMTEntry(Function &F); + + bool isUniformIntrinsic(unsigned id); + void findUniformArgs(Function &F); + void findUniformInsts(Function &F); + + void lowerControlFlowAfter(std::vector &SIMTFuncs); + GlobalVariable *findGlobalExecMask(); + +private: + Module *M; + PacketBuilder *B; + + // track already packetized values + ValueToValueMapTy ReplaceMap; + + /// uniform set for arguments + std::set UniformArgs; + /// uniform set for alloca, load, store, and GEP + std::set UniformInsts; + /// sort function in caller-first order + std::vector FuncOrder; + /// map: function ==> a set of vectorization width + std::map> FuncVectors; + /// Map: original function and vectorization width ==> vectorized version + std::map, Function *> FuncMap; + + const DataLayout *DL; +}; + +bool GenXPacketize::runOnModule(Module &Module) { + M = &Module; + // find all the SIMT enntry-functions + std::vector ForkFuncs; + for (auto &F : M->getFunctionList()) { + if (F.hasFnAttribute("CMGenxSIMT")) { + uint32_t Width = 0; + F.getFnAttribute("CMGenxSIMT").getValueAsString().getAsInteger(0, Width); + if (Width > 1) { + assert(Width == 8 || Width == 16 || Width == 32); + ForkFuncs.push_back(&F); + } + } + } + if (ForkFuncs.empty()) + return false; + + // sort functions in order, also find those functions that are used in + // the SIMT mode, therefore need whole-function vectorization. 
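Everything in runOnModule keys off the "CMGenxSIMT" string attribute scanned above: its value carries the SIMD width, and only widths of 8, 16 or 32 trigger packetization. A minimal sketch of how such an entry could be marked and queried (hypothetical helpers, not part of the patch):

#include "llvm/IR/Function.h"
#include <string>

// Illustrative only: tag a function as a SIMT entry of the given width.
static void markSimtEntry(llvm::Function &F, unsigned Width) {
  F.addFnAttr("CMGenxSIMT", std::to_string(Width));
}

// Illustrative only: read the width back; 0 or 1 means "leave it scalar".
static unsigned simtWidth(const llvm::Function &F) {
  unsigned Width = 0;
  if (F.hasFnAttribute("CMGenxSIMT"))
    F.getFnAttribute("CMGenxSIMT").getValueAsString().getAsInteger(0, Width);
  return Width;
}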
+ findFunctionVectorizationOrder(M); + + unsigned NumFunc = FuncOrder.size(); + // find uniform arguments + UniformArgs.clear(); + for (unsigned i = 0; i < NumFunc; ++i) { + auto F = FuncOrder[i]; + findUniformArgs(*F); + } + + // perform reg-to-mem to remove phi before packetization + // because we need to generate simd-control-flow after packetization + // we then perform mem-to-reg after generating simd-control-flow. + std::unique_ptr DemotePass(createDemoteRegisterToMemoryPass()); + for (auto &F : M->getFunctionList()) { + DemotePass->runOnFunction(F); + } + + UniformInsts.clear(); + + DL = &(M->getDataLayout()); + B = new PacketBuilder(M); + std::vector SIMTFuncs; + // Process those functions called in the SIMT mode + for (int i = NumFunc - 1; i >= 0; --i) { + auto F = FuncOrder[i]; + auto iter = FuncVectors.find(F); + if (iter != FuncVectors.end()) { + auto WV = iter->second; + for (auto W : WV) { + auto VF = vectorizeSIMTFunction(F, W); + auto Key = std::pair(F, W); + FuncMap.insert( + std::pair, Function *>(Key, VF)); + SIMTFuncs.push_back(VF); + } + } + } + + // vectorize SIMT entry-functions + bool Modified = false; + for (auto F : ForkFuncs) { + Modified |= vectorizeSIMTEntry(*F); + SIMTFuncs.push_back(&(*F)); + } + + delete B; + + // lower the SIMD control-flow + lowerControlFlowAfter(SIMTFuncs); + + return Modified; +} + +/*************************************************************************** + * vectorize a functions that is used in the fork-region + */ +Function *GenXPacketize::vectorizeSIMTFunction(Function *F, unsigned Width) { + assert(!F->hasFnAttribute("CMGenxSIMT")); + B->SetTargetWidth(Width); + + // vectorize the argument and return types + std::vector ArgTypes; + for (const Argument &I : F->args()) { + if (UniformArgs.count(&I)) + ArgTypes.push_back(I.getType()); + else if (I.getType()->isPointerTy()) { + // FIXME: check the pointer defined by an argument or an alloca + // [N x float]* should packetize to [N x <8 x float>]* + auto VTy = PointerType::get( + B->GetVectorType(I.getType()->getPointerElementType()), + I.getType()->getPointerAddressSpace()); + ArgTypes.push_back(VTy); + } else { + ArgTypes.push_back(B->GetVectorType(I.getType())); + } + } + Type *RetTy = B->GetVectorType(F->getReturnType()); + // Create a new function type... + assert(!F->isVarArg()); + FunctionType *FTy = FunctionType::get(RetTy, ArgTypes, false); + + // Create the vector function prototype + StringRef VecFName = F->getName(); + const char *Suffix[] = {".vec00", ".vec08", ".vec16", ".vec24", ".vec32"}; + Function *ClonedFunc = + Function::Create(FTy, GlobalValue::InternalLinkage, + VecFName + Suffix[Width / 8], F->getParent()); + ClonedFunc->setCallingConv(F->getCallingConv()); + ClonedFunc->setAttributes(F->getAttributes()); + ClonedFunc->setAlignment(IGCLLVM::getAlign(F->getAlignment())); + + // then use CloneFunctionInto + ValueToValueMapTy ArgMap; + Function::arg_iterator ArgI = ClonedFunc->arg_begin(); + for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + I != E; ++I) { + ArgI->setName(I->getName()); // Copy the name over... 
+ ArgMap[I] = ArgI; // Add mapping to ValueMap + if (UniformArgs.count(I)) { // bookkeep the uniform set + UniformArgs.insert(ArgI); + } + ArgI++; + } + SmallVector returns; + ClonedCodeInfo CloneInfo; + CloneFunctionInto(ClonedFunc, F, ArgMap, true, returns, Suffix[Width / 8], + &CloneInfo); + + ReplaceMap.clear(); + // find uniform instructions related to uniform arguments + findUniformInsts(*ClonedFunc); + + // vectorize instructions in the fork-regions + for (auto I = ClonedFunc->begin(), E = ClonedFunc->end(); I != E; ++I) { + BasicBlock *BB = &*I; + for (auto &I : BB->getInstList()) { + if (!UniformInsts.count(&I)) { + Value *pPacketizedInst = packetizeInstruction(&I); + ReplaceMap[&I] = pPacketizedInst; + } else { + for (int i = 0, n = I.getNumOperands(); i < n; ++i) { + Value *OrigValue = I.getOperand(i); + auto iter = ReplaceMap.find(OrigValue); + if (iter != ReplaceMap.end() && iter->second != OrigValue) { + I.setOperand(i, iter->second); + } + } + } + } + } + + removeDeadInstructions(*ClonedFunc); + + return ClonedFunc; +} + +/*************************************************************************** + * vectorize a SIMT-entry function + */ +bool GenXPacketize::vectorizeSIMTEntry(Function &F) { + assert(F.hasFnAttribute("CMGenxSIMT")); + + // find uniform instructions related to uniform arguments + findUniformInsts(F); + + uint32_t Width = 0; + F.getFnAttribute("CMGenxSIMT").getValueAsString().getAsInteger(0, Width); + + B->SetTargetWidth(Width); + + ReplaceMap.clear(); + + B->IRB()->SetInsertPoint(&F.getEntryBlock(), F.getEntryBlock().begin()); + + // vectorize instructions in the fork-regions + for (auto I = F.begin(), E = F.end(); I != E; ++I) { + BasicBlock *BB = &*I; + for (auto &I : BB->getInstList()) { + if (!UniformInsts.count(&I)) { + Value *pPacketizedInst = packetizeInstruction(&I); + ReplaceMap[&I] = pPacketizedInst; + } else { + for (int i = 0, n = I.getNumOperands(); i < n; ++i) { + Value *OrigValue = I.getOperand(i); + auto iter = ReplaceMap.find(OrigValue); + if (iter != ReplaceMap.end() && iter->second != OrigValue) { + I.setOperand(i, iter->second); + } + } + } + } + } + + removeDeadInstructions(F); + // a SIMT entry is always inlined after vectorization + // This is required in order to handle structure argument, + // for example, generated from lambda capture. + if (F.hasFnAttribute(Attribute::NoInline)) + F.removeFnAttr(Attribute::NoInline); + F.addFnAttr(Attribute::AlwaysInline); + F.removeFnAttr("CMGenxSIMT"); + F.setLinkage(GlobalValue::InternalLinkage); + + return true; +} + +/************************************************************************ + * findFunctionVectorizationOrder : calculate the order we want to visit + * functions, such that a function is not visited until all its callees + * have been visited. Also if a function is called directly or indirectly + * in the SIMT mode, add it to the list that need vectorization + */ +// Call graph node +struct CGNode { + Function *F; + std::set UnvisitedCallers; + std::set Callees; +}; + +void GenXPacketize::findFunctionVectorizationOrder(Module *M) { + // First build the call graph. + // We roll our own call graph here, because it is simpler than the general + // case supported by LLVM's call graph analysis (CM does not support + // recursion or function pointers), and we want to modify it (using the + // UnvisitedCallers set) when we traverse it. 
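The structure built here is effectively Kahn's topological sort with callers as predecessors: a function is emitted only once every one of its callers has been emitted. A minimal sketch of the same traversal on a plain adjacency map (hypothetical integer IDs instead of Function pointers, and without the width propagation the real loops also perform):

#include <map>
#include <set>
#include <vector>

// Sketch only: order IDs so that each one appears after all of its callers.
static std::vector<int>
callerFirstOrder(const std::map<int, std::set<int>> &Callees,
                 std::map<int, unsigned> UnvisitedCallers) {
  std::vector<int> Order;
  for (const auto &Node : UnvisitedCallers)
    if (Node.second == 0)                      // no callers: start of the order
      Order.push_back(Node.first);
  for (unsigned i = 0; i != Order.size(); ++i) {
    auto It = Callees.find(Order[i]);
    if (It == Callees.end())
      continue;
    for (int Callee : It->second)
      if (--UnvisitedCallers[Callee] == 0)     // last caller just visited
        Order.push_back(Callee);
  }
  return Order;
}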
+ std::map CallGraph; + for (auto mi = M->begin(), me = M->end(); mi != me; ++mi) { + Function *F = &*mi; + if (F->empty()) + continue; + + fixupLLVMIntrinsics(*F); + + // For each defined function: for each use (a call), add it to our + // UnvisitedCallers set, and add us to its Callees set. + // We are ignoring an illegal non-call use of a function; someone + // else can spot and diagnose that later. + // If the function has no callers, then add it straight in to FuncOrder. + CGNode *CGN = &CallGraph[F]; + CGN->F = F; + if (F->use_empty()) { + FuncOrder.push_back(F); + continue; + } + for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) { + if (auto CI = dyn_cast(ui->getUser())) { + BasicBlock *Blk = CI->getParent(); + Function *Caller = Blk->getParent(); + CGNode *CallerNode = &CallGraph[Caller]; + CallerNode->F = Caller; + CGN->UnvisitedCallers.insert(CallerNode); + CallerNode->Callees.insert(CGN); + // find the vectorization width of callee + auto CallerVectorIter = FuncVectors.find(Caller); + if (CallerVectorIter != FuncVectors.end()) { + auto CalleeVectorIter = FuncVectors.find(F); + if (CalleeVectorIter != FuncVectors.end()) + CalleeVectorIter->second.insert(CallerVectorIter->second.begin(), + CallerVectorIter->second.end()); + else + FuncVectors.insert(std::pair>( + F, CallerVectorIter->second)); + } else if (Caller->hasFnAttribute("CMGenxSIMT")) { + uint32_t width = 0; + Caller->getFnAttribute("CMGenxSIMT") + .getValueAsString() + .getAsInteger(0, width); + if (width > 1) { + auto CalleeVectorIter = FuncVectors.find(F); + if (CalleeVectorIter != FuncVectors.end()) + CalleeVectorIter->second.insert(width); + else { + std::set WidthSet; + WidthSet.insert(width); + FuncVectors.insert( + std::pair>(F, WidthSet)); + } + } + } + } + } + } + // Run through the visit order. For each function, remove it from each + // callee's UnvisitedCallers set, and, if now empty, add the callee to + // the end of the visit order. 
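Besides ordering, the walk described above also pushes vectorization widths down the call graph: a callee has to be cloned for every width any of its SIMT callers runs at, so the caller's width set is merged into the callee's. A tiny sketch of that merge, mirroring what the loop below does with FuncVectors (hypothetical integer IDs, not part of the patch):

#include <map>
#include <set>

// Sketch only: the callee inherits the union of its caller's SIMD widths.
static void propagateWidths(std::map<int, std::set<unsigned>> &FuncVectors,
                            int Caller, int Callee) {
  auto It = FuncVectors.find(Caller);
  if (It == FuncVectors.end())
    return; // caller itself is not vectorized
  FuncVectors[Callee].insert(It->second.begin(), It->second.end());
}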
+ for (unsigned i = 0; i != FuncOrder.size(); ++i) { + CGNode *CGN = &CallGraph[FuncOrder[i]]; + for (auto ci = CGN->Callees.begin(), ce = CGN->Callees.end(); ci != ce; + ++ci) { + CGNode *Callee = *ci; + Callee->UnvisitedCallers.erase(CGN); + if (Callee->UnvisitedCallers.empty()) + FuncOrder.push_back(Callee->F); + // find the vectorization width of callee + auto CallerVectorIter = FuncVectors.find(CGN->F); + if (CallerVectorIter != FuncVectors.end()) { + auto CalleeVectorIter = FuncVectors.find(Callee->F); + if (CalleeVectorIter != FuncVectors.end()) + CalleeVectorIter->second.insert(CallerVectorIter->second.begin(), + CallerVectorIter->second.end()); + else + FuncVectors.insert( + std::make_pair(Callee->F, CallerVectorIter->second)); + } + } + } +} + +void GenXPacketize::findUniformArgs(Function &F) { + auto iter = FuncVectors.find(&F); + if (iter == FuncVectors.end()) { + // non-simt function or simt-entry function + for (const Argument &I : F.args()) + UniformArgs.insert(&I); + } else { + // simt functions that needs whole-function vectorization + for (const Argument &I : F.args()) { + bool IsUniform = true; + // check every call-site + for (User *U : F.users()) { + if (CallInst *CI = dyn_cast(U)) { + auto Def = CI->getArgOperand(I.getArgNo()); + if (Argument *DA = dyn_cast(Def)) { + if (!UniformArgs.count(DA)) { + IsUniform = false; + break; + } + } else { + IsUniform = false; + break; + } + } else { + IsUniform = false; + break; + } + } + if (IsUniform) + UniformArgs.insert(&I); + } + } +} + +bool GenXPacketize::isUniformIntrinsic(unsigned id) { + switch (id) { + case GenXIntrinsic::genx_get_color: + case GenXIntrinsic::genx_get_hwid: + case GenXIntrinsic::genx_get_scoreboard_bti: + case GenXIntrinsic::genx_get_scoreboard_deltas: + case GenXIntrinsic::genx_get_scoreboard_depcnt: + case GenXIntrinsic::genx_local_id: + case GenXIntrinsic::genx_local_id16: + case GenXIntrinsic::genx_local_size: + case GenXIntrinsic::genx_group_count: + case GenXIntrinsic::genx_group_id_x: + case GenXIntrinsic::genx_group_id_y: + case GenXIntrinsic::genx_group_id_z: + case GenXIntrinsic::genx_predefined_surface: + case GenXIntrinsic::genx_barrier: + case GenXIntrinsic::genx_sbarrier: + case GenXIntrinsic::genx_cache_flush: + case GenXIntrinsic::genx_fence: + case GenXIntrinsic::genx_wait: + case GenXIntrinsic::genx_yield: + case GenXIntrinsic::genx_print_buffer: + case GenXIntrinsic::genx_r0: + case GenXIntrinsic::genx_sr0: + case GenXIntrinsic::genx_timestamp: + case GenXIntrinsic::genx_thread_x: + case GenXIntrinsic::genx_thread_y: + return true; + default: + break; + } + return false; +} + +void GenXPacketize::findUniformInsts(Function &F) { + // global variable load is uniform + for (auto &Global : M->getGlobalList()) { + for (auto UI = Global.use_begin(), UE = Global.use_end(); UI != UE; ++UI) { + if (auto LD = dyn_cast(UI->getUser())) { + UniformInsts.insert(LD); + } + } + } + // some intrinsics are always uniform + for (auto &FD : M->getFunctionList()) { + if (FD.isDeclaration()) { + if (isUniformIntrinsic(GenXIntrinsic::getGenXIntrinsicID(&FD))) { + for (auto UI = FD.use_begin(), UE = FD.use_end(); UI != UE; ++UI) { + if (auto Inst = dyn_cast(UI->getUser())) { + UniformInsts.insert(Inst); + } + } + } + } + } + // first find out all the uniform alloca to store those uniform arguments + std::stack uvset; + for (const Argument &I : F.args()) { + if (!UniformArgs.count(&I)) + continue; + for (auto UI = I.user_begin(), E = I.user_end(); UI != E; ++UI) { + const Value *use = (*UI); + if (auto LI = 
dyn_cast(use)) { + UniformInsts.insert(LI); + } else if (auto GEP = dyn_cast(use)) { + if (GEP->getPointerOperand() == &I) { + UniformInsts.insert(GEP); + uvset.push((Value *)GEP); + } + } else if (auto SI = dyn_cast(use)) { + if (SI->getPointerOperand() == &I) + UniformInsts.insert(SI); + else { + auto PI = SI->getPointerOperand(); + if (auto AI = dyn_cast(PI)) { + UniformInsts.insert(AI); + uvset.push((Value *)AI); + } + } + } else if (auto CI = dyn_cast(use)) { + if (Function *Callee = CI->getCalledFunction()) { + if (GenXIntrinsic::isVLoadStore(Callee)) { + UniformInsts.insert(CI); + } + } + } + } + } + + // then find the uniform loads and stores in fork-region + while (!uvset.empty()) { + Value *Def = uvset.top(); + uvset.pop(); + for (auto UI = Def->user_begin(), E = Def->user_end(); UI != E; ++UI) { + Value *use = (*UI); + if (auto UseI = dyn_cast(use)) { + if (isa(UseI)) { + UniformInsts.insert(UseI); + } else if (auto LI = dyn_cast(UseI)) { + UniformInsts.insert(UseI); + if (LI->getType()->isPointerTy()) + uvset.push(UseI); + } else if (auto GEP = dyn_cast(UseI)) { + if (GEP->hasAllConstantIndices()) { + uvset.push(UseI); + UniformInsts.insert(UseI); + } + } + } + } + } + return; +} + +Value *GenXPacketize::getPacketizeValue(Value *OrigValue) { + auto iter = ReplaceMap.find(OrigValue); + if (iter != ReplaceMap.end()) { + return iter->second; + } else if (auto C = dyn_cast(OrigValue)) { + return packetizeConstant(C); + } else if (auto A = dyn_cast(OrigValue)) { + if (UniformArgs.count(A)) + return B->VBROADCAST(OrigValue, OrigValue->getName()); + // otherwise the argument should have been in the right vector form + ReplaceMap[OrigValue] = OrigValue; + return OrigValue; + } else if (auto Inst = dyn_cast(OrigValue)) { + // need special handling for alloca + if (auto AI = dyn_cast(OrigValue)) { + // this is not a uniform alloca + if (!UniformInsts.count(Inst)) { + Type *VecType = B->GetVectorType(AI->getAllocatedType()); + auto V = B->ALLOCA(VecType, nullptr, AI->getName()); + V->removeFromParent(); + V->insertBefore(Inst); + ReplaceMap[OrigValue] = V; + return V; + } + ReplaceMap[OrigValue] = OrigValue; + return OrigValue; + } else if (UniformInsts.count(Inst)) { + auto V = B->VBROADCAST(OrigValue); + return V; + } + } + + report_fatal_error("Could not find packetized value!"); + + return nullptr; +} + +// this is used on operands that are expected to be uniform +Value *GenXPacketize::getUniformValue(Value *OrigValue) { + if (auto G = dyn_cast(OrigValue)) + return G; + if (auto C = dyn_cast(OrigValue)) + return C; + if (auto A = dyn_cast(OrigValue)) { + if (UniformArgs.count(A)) { + return A; + } + } + if (auto A = dyn_cast(OrigValue)) { + if (UniformInsts.count(A)) { + return A; + } + } + auto VV = getPacketizeValue(OrigValue); + return B->VEXTRACT(VV, (uint64_t)0, OrigValue->getName()); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Returns the equivalent vector intrinsic for the input scalar +/// intrinsic +Function *GenXPacketize::getVectorIntrinsic(Module *M, unsigned id, + std::vector &ArgTy) +{ + if (id == Intrinsic::fma) { + return Intrinsic::getDeclaration(M, (Intrinsic::ID)id, ArgTy[0]); + } else if (id == Intrinsic::pow) { + // for some reason, passing the 2 vector input args to the pow declaration + // results in a malformed vectored pow intrinsic. Forcing the expected + // vector input here. 
+ return Intrinsic::getDeclaration(M, (Intrinsic::ID)id, B->mSimdFP32Ty); + } else if ((id == Intrinsic::maxnum) || (id == Intrinsic::minnum)) { + return Intrinsic::getDeclaration(M, (Intrinsic::ID)id, ArgTy[0]); + } else { + return GenXIntrinsic::getAnyDeclaration(M, id, ArgTy); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Determines if instruction is an llvm intrinsic (which may include +/// x86 intrinsics +static bool IsLLVMIntrinsic(Instruction *pInst) { + if (isa(pInst)) { + CallInst *call = cast(pInst); + Function *f = call->getCalledFunction(); + assert(f); + return f->isIntrinsic(); + } + return false; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Packetize a scalar constant +Value *GenXPacketize::packetizeConstant(Constant *pConstant) { + if (isa(pConstant)) { + return UndefValue::get(B->GetVectorType(pConstant->getType())); + } else { + return B->VBROADCAST(pConstant); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Packetize an LLVM intrinsic. Generally this means replacing +/// a scalar intrinsic function call with a vectored equivalent. +Value *GenXPacketize::packetizeLLVMIntrinsic(Instruction *pInst) { + Module *M = B->mpModule; + + B->IRB()->SetInsertPoint(pInst); + CallInst *pCall = cast(pInst); + Function *f = pCall->getCalledFunction(); + assert(f && f->isIntrinsic()); + auto id = GenXIntrinsic::getAnyIntrinsicID(f); + + // packetize intrinsic operands + std::vector vectorArgTys; + std::vector packetizedArgs; + for (auto &operand : pCall->arg_operands()) { + auto VV = getPacketizeValue(operand.get()); + packetizedArgs.push_back(VV); + vectorArgTys.push_back(VV->getType()); + } + + // override certain intrinsics + Value *pNewCall; + switch (id) { + case Intrinsic::log2: + pNewCall = B->VLOG2PS(packetizedArgs[0]); + break; + case Intrinsic::exp2: + pNewCall = B->VEXP2PS(packetizedArgs[0]); + break; + default: { + Function *newF = getVectorIntrinsic(M, id, vectorArgTys); + pNewCall = CallInst::Create(newF, packetizedArgs, "", pCall); + } + } + return pNewCall; +} + +Value *GenXPacketize::packetizeLLVMInstruction(Instruction *pInst) { + Value *pReplacedInst = nullptr; + B->IRB()->SetInsertPoint(pInst); + // packetize a call + if (auto CI = dyn_cast(pInst)) { + auto F = CI->getCalledFunction(); + auto FMI = FuncMap.find(std::pair(F, B->mVWidth)); + if (FMI != FuncMap.end()) { + std::vector ArgOps; + auto VF = FMI->second; + for (Argument &Arg : VF->args()) { + auto i = Arg.getArgNo(); + if (UniformArgs.count(&Arg)) + ArgOps.push_back(getUniformValue(CI->getArgOperand(i))); + else + ArgOps.push_back(getPacketizeValue(CI->getArgOperand(i))); + } + pReplacedInst = CallInst::Create(VF, ArgOps, CI->getName(), CI); + return pReplacedInst; + } else + assert(false); + } + uint32_t opcode = pInst->getOpcode(); + + switch (opcode) { + case Instruction::AddrSpaceCast: + case Instruction::BitCast: { + // packetize the bitcast source + Value *pPacketizedSrc = getPacketizeValue(pInst->getOperand(0)); + Type *pPacketizedSrcTy = pPacketizedSrc->getType(); + + // packetize dst type + Type *pReturnTy; + if (pInst->getType()->isPointerTy()) { + // two types of pointers, * or + Type *pDstScalarTy = pInst->getType()->getPointerElementType(); + + if (pPacketizedSrc->getType()->isVectorTy()) { + // + Type *pDstPtrTy = PointerType::get( + pDstScalarTy, pInst->getType()->getPointerAddressSpace()); + uint32_t numElems = 
pPacketizedSrcTy->getVectorNumElements(); + pReturnTy = VectorType::get(pDstPtrTy, numElems); + } else { + // * + pReturnTy = + PointerType::get(B->GetVectorType(pDstScalarTy), + pInst->getType()->getPointerAddressSpace()); + } + } else { + pReturnTy = B->GetVectorType(pInst->getType()); + } + + pReplacedInst = + B->CAST((Instruction::CastOps)opcode, pPacketizedSrc, pReturnTy); + break; + } + + case Instruction::GetElementPtr: { + GetElementPtrInst *pGepInst = cast(pInst); + auto pBase = pGepInst->getPointerOperand(); + Value *pVecSrc = nullptr; + if (dyn_cast(pBase)) + pVecSrc = pBase; + else if (dyn_cast(pBase)) + pVecSrc = pBase; + else if (dyn_cast(pBase) && + UniformInsts.count(dyn_cast(pBase))) + pVecSrc = pBase; + else + pVecSrc = getPacketizeValue(pBase); + + if (!isa(pVecSrc)) { + // just packetize the GEP to a vector GEP. + SmallVector vecIndices; + for (uint32_t i = 0; i < pGepInst->getNumIndices(); ++i) { + vecIndices.push_back(getPacketizeValue(pGepInst->getOperand(1 + i))); + } + pReplacedInst = B->GEPA(pVecSrc, vecIndices); + } else { + if (pGepInst->hasAllConstantIndices()) { + // SOA GEP with scalar src and constant indices, result will be * Ex. gep [4 x <8 x float>]*, 0, 0 --> <8 x float>* + SmallVector vecIndices; + for (uint32_t i = 0; i < pGepInst->getNumIndices(); ++i) { + vecIndices.push_back(pGepInst->getOperand(1 + i)); + } + pReplacedInst = B->GEPA(pVecSrc, vecIndices); + } else { + //// SOA GEP with non-uniform indices. Need to vector GEP to each SIMD + /// lane. + /// Result will be + SmallVector vecIndices; + for (uint32_t i = 0; i < pGepInst->getNumIndices(); ++i) { + vecIndices.push_back(getPacketizeValue(pGepInst->getOperand(1 + i))); + } + + // Step to the SIMD lane + if (B->mVWidth == 8) { + vecIndices.push_back(B->C({0, 1, 2, 3, 4, 5, 6, 7})); + } else if (B->mVWidth == 16) { + vecIndices.push_back( + B->C({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15})); + } else { + report_fatal_error("Unsupported SIMD width."); + } + + pReplacedInst = B->GEPA(pVecSrc, vecIndices); + } + } + break; + } + + case Instruction::Load: { + LoadInst *pLoadInst = cast(pInst); + Value *pSrc = pLoadInst->getPointerOperand(); + Value *pVecSrc = getPacketizeValue(pSrc); + auto LI = cast(pInst); + if (pVecSrc == pSrc) + pReplacedInst = pInst; + else if (pVecSrc->getType()->isVectorTy()) { + assert(pVecSrc->getType()->getVectorElementType()->isPointerTy()); + auto Align = LI->getAlignment(); + pReplacedInst = B->MASKED_GATHER(pVecSrc, Align); + } else { + auto Align = LI->getAlignment(); + pReplacedInst = B->ALIGNED_LOAD(pVecSrc, Align); + } + break; + } + + case Instruction::Store: { + StoreInst *pStoreInst = cast(pInst); + Value *pVecDstPtrs = getPacketizeValue(pStoreInst->getPointerOperand()); + Value *pVecSrc = getPacketizeValue(pStoreInst->getOperand(0)); + if (pVecDstPtrs->getType()->isVectorTy()) { + assert(pVecDstPtrs->getType()->getVectorElementType()->isPointerTy()); + auto Align = cast(pInst)->getAlignment(); + pReplacedInst = B->MASKED_SCATTER(pVecSrc, pVecDstPtrs, Align); + } else { + pReplacedInst = B->STORE(pVecSrc, pVecDstPtrs); + } + break; + } + + case Instruction::ExtractElement: { + auto OldVec = pInst->getOperand(0); + auto Vec = getPacketizeValue(OldVec); + auto Idx = pInst->getOperand(1); + auto N = OldVec->getType()->getVectorNumElements(); + auto ElemType = pInst->getType(); + auto VecDstTy = VectorType::get(ElemType, B->mVWidth); + // create an read-region + CMRegion R(VecDstTy); + if (ConstantInt *CI = dyn_cast(Idx)) { + R.Offset = 
CI->getSExtValue() * ElemType->getPrimitiveSizeInBits() / 8; + R.Indirect = nullptr; + } else { + R.Offset = 0; + auto NBits = Idx->getType()->getIntegerBitWidth() / 8; + auto MulCType = IntegerType::getIntNTy(M->getContext(), NBits); + auto MulC = + ConstantInt::get(MulCType, ElemType->getPrimitiveSizeInBits() / 8); + R.Indirect = B->MUL(Idx, MulC); + } + R.NumElements = B->mVWidth; + R.Width = B->mVWidth; + R.Stride = N; + R.VStride = 0; + pReplacedInst = + R.createRdRegion(Vec, pInst->getName(), pInst /*InsertBefore*/, + pInst->getDebugLoc(), true /*AllowScalar*/); + break; + } + + case Instruction::InsertElement: { + auto OldVec = pInst->getOperand(0); + auto Vec = getPacketizeValue(OldVec); + auto ElmVec = getPacketizeValue(pInst->getOperand(1)); + auto Idx = pInst->getOperand(2); + auto N = OldVec->getType()->getVectorNumElements(); + auto ElemType = pInst->getOperand(1)->getType(); + // create an write-region + CMRegion R(Vec->getType()); + if (ConstantInt *CI = dyn_cast(Idx)) { + R.Offset = CI->getSExtValue() * ElemType->getPrimitiveSizeInBits() / 8; + R.Indirect = nullptr; + } else { + R.Offset = 0; + auto NBits = Idx->getType()->getIntegerBitWidth() / 8; + auto MulCType = IntegerType::getIntNTy(M->getContext(), NBits); + auto MulC = + ConstantInt::get(MulCType, ElemType->getPrimitiveSizeInBits() / 8); + R.Indirect = B->MUL(Idx, MulC); + } + R.NumElements = B->mVWidth; + R.Width = B->mVWidth; + R.Stride = N; + R.VStride = 0; + pReplacedInst = + R.createWrRegion(Vec, ElmVec, pInst->getName(), pInst /*InsertBefore*/, + pInst->getDebugLoc()); + break; + } + + case Instruction::Br: { + // any conditional branches with vectored conditions need to preceded with + // a genx_simdcf_any to ensure we branch iff all lanes are set + BranchInst *pBranch = cast(pInst); + if (pBranch->isConditional()) { + Value *vCondition = getPacketizeValue(pBranch->getCondition()); + llvm::Function *NewFn = GenXIntrinsic::getGenXDeclaration( + B->mpModule, + GenXIntrinsic::genx_simdcf_any, + vCondition->getType()); + llvm::CallInst *NewTest = CallInst::Create(NewFn, vCondition, "", pInst); + NewTest->setName("exit.cond.mask.test"); + pBranch->setCondition(NewTest); + } + pReplacedInst = pBranch; + break; + } + + case Instruction::PHI: { + Type *vecType = B->GetVectorType(pInst->getType()); + pInst->mutateType(vecType); + pReplacedInst = pInst; + break; + } + + case Instruction::Alloca: { + AllocaInst *pAllocaInst = cast(pInst); + Type *pVecType = B->GetVectorType(pAllocaInst->getAllocatedType()); + Value *pReturn = B->ALLOCA(pVecType, nullptr, pInst->getName()); + pReplacedInst = pReturn; + break; + } + + case Instruction::ShuffleVector: { + auto Src1 = pInst->getOperand(0); + auto Src2 = pInst->getOperand(1); + auto Mask = pInst->getOperand(2); + if (Src1->getType()->getVectorNumElements() == 1 && + Mask->getType()->getVectorNumElements() == 1) { + if (cast(Mask)->isAllOnesValue()) + pReplacedInst = getPacketizeValue(Src2); + else + pReplacedInst = getPacketizeValue(Src1); + } else + report_fatal_error( + "ShuffleVector should've been replaced by Scalarizer."); + break; + } + + case Instruction::IntToPtr: { + IntToPtrInst *pIntToPtrInst = cast(pInst); + Value *pVecSrc = getPacketizeValue(pInst->getOperand(0)); + Type *pVecDestTy = VectorType::get(pIntToPtrInst->getDestTy(), B->mVWidth); + pReplacedInst = B->INT_TO_PTR(pVecSrc, pVecDestTy); + break; + } + + case Instruction::Select: { + Value *pVecCond = getPacketizeValue(pInst->getOperand(0)); + Value *pTrueSrc = getPacketizeValue(pInst->getOperand(1)); + 
Value *pFalseSrc = getPacketizeValue(pInst->getOperand(2)); + + if (!pTrueSrc->getType()->isPointerTy()) { + // simple select packetization + pReplacedInst = B->SELECT(pVecCond, pTrueSrc, pFalseSrc); + } else { + // vector struct input, need to loop over components and build up new + // struct allocation + Value *pAlloca = B->ALLOCA( + B->GetVectorType(pInst->getType()->getPointerElementType())); + uint32_t numElems = + pInst->getType()->getPointerElementType()->getArrayNumElements(); + + for (uint32_t i = 0; i < numElems; ++i) { + Value *pTrueSrcElem = B->LOAD(pTrueSrc, {0, i}); + Value *pFalseSrcElem = B->LOAD(pFalseSrc, {0, i}); + + // mask store true components + Value *pGep = B->GEP(pAlloca, {0, i}); + B->MASKED_STORE(pTrueSrcElem, pGep, 4, pVecCond); + + // store false components to inverted mask + B->MASKED_STORE(pFalseSrcElem, pGep, 4, B->NOT(pVecCond)); + } + pReplacedInst = pAlloca; + } + break; + } + + case Instruction::Ret: { + ReturnInst *pRet = cast(pInst); + if (pRet->getReturnValue() != nullptr) { + Value *pReturn = getPacketizeValue(pRet->getReturnValue()); + ReturnInst *pNewRet = B->RET(pReturn); + pReplacedInst = pNewRet; + } else { + pReplacedInst = pInst; + } + + break; + } + + default: { + // for the rest of the instructions, vectorize the instruction type as + // well as its args + Type *vecType = B->GetVectorType(pInst->getType()); + pInst->mutateType(vecType); + + for (Use &op : pInst->operands()) { + op.set(getPacketizeValue(op.get())); + } + pReplacedInst = pInst; + } + } + + return pReplacedInst; +} + +Value *GenXPacketize::packetizeGenXIntrinsic(Instruction *inst) { + B->IRB()->SetInsertPoint(inst); + + if (auto CI = dyn_cast_or_null(inst)) { + if (Function *Callee = CI->getCalledFunction()) { + auto IID = GenXIntrinsic::getGenXIntrinsicID(Callee); + Value *replacement = nullptr; + // some intrinsics are uniform therefore should not get here + assert(!isUniformIntrinsic(IID)); + switch (IID) { + case GenXIntrinsic::genx_line: + case GenXIntrinsic::genx_pln: + case GenXIntrinsic::genx_dp2: + case GenXIntrinsic::genx_dp3: + case GenXIntrinsic::genx_dp4: + case GenXIntrinsic::genx_ssdp4a: + case GenXIntrinsic::genx_sudp4a: + case GenXIntrinsic::genx_usdp4a: + case GenXIntrinsic::genx_uudp4a: + case GenXIntrinsic::genx_ssdp4a_sat: + case GenXIntrinsic::genx_sudp4a_sat: + case GenXIntrinsic::genx_usdp4a_sat: + case GenXIntrinsic::genx_uudp4a_sat: + case GenXIntrinsic::genx_dph: + case GenXIntrinsic::genx_transpose_ld: + case GenXIntrinsic::genx_oword_ld: + case GenXIntrinsic::genx_oword_ld_unaligned: + case GenXIntrinsic::genx_oword_st: + case GenXIntrinsic::genx_svm_block_ld: + case GenXIntrinsic::genx_svm_block_ld_unaligned: + case GenXIntrinsic::genx_svm_block_st: + case GenXIntrinsic::genx_load: + case GenXIntrinsic::genx_3d_load: + case GenXIntrinsic::genx_3d_sample: + case GenXIntrinsic::genx_avs: + case GenXIntrinsic::genx_sample: + case GenXIntrinsic::genx_sample_unorm: + case GenXIntrinsic::genx_simdcf_any: + case GenXIntrinsic::genx_simdcf_goto: + case GenXIntrinsic::genx_simdcf_join: + case GenXIntrinsic::genx_simdcf_predicate: + case GenXIntrinsic::genx_rdpredregion: + case GenXIntrinsic::genx_wrconstregion: + case GenXIntrinsic::genx_wrpredregion: + case GenXIntrinsic::genx_wrpredpredregion: + case GenXIntrinsic::genx_output: + case GenXIntrinsic::genx_va_1d_convolve_horizontal: + case GenXIntrinsic::genx_va_1d_convolve_vertical: + case GenXIntrinsic::genx_va_1pixel_convolve: + case GenXIntrinsic::genx_va_1pixel_convolve_1x1mode: + case 
GenXIntrinsic::genx_va_bool_centroid: + case GenXIntrinsic::genx_va_centroid: + case GenXIntrinsic::genx_va_convolve2d: + case GenXIntrinsic::genx_va_correlation_search: + case GenXIntrinsic::genx_va_dilate: + case GenXIntrinsic::genx_va_erode: + case GenXIntrinsic::genx_va_flood_fill: + case GenXIntrinsic::genx_va_hdc_1d_convolve_horizontal: + case GenXIntrinsic::genx_va_hdc_1d_convolve_vertical: + case GenXIntrinsic::genx_va_hdc_1pixel_convolve: + case GenXIntrinsic::genx_va_hdc_convolve2d: + case GenXIntrinsic::genx_va_hdc_dilate: + case GenXIntrinsic::genx_va_hdc_erode: + case GenXIntrinsic::genx_va_hdc_lbp_correlation: + case GenXIntrinsic::genx_va_hdc_lbp_creation: + case GenXIntrinsic::genx_va_hdc_minmax_filter: + case GenXIntrinsic::genx_va_lbp_correlation: + case GenXIntrinsic::genx_va_lbp_creation: + case GenXIntrinsic::genx_va_minmax: + case GenXIntrinsic::genx_va_minmax_filter: + case GenXIntrinsic::genx_media_ld: + case GenXIntrinsic::genx_media_st: + case GenXIntrinsic::genx_raw_send: + case GenXIntrinsic::genx_raw_send_noresult: + case GenXIntrinsic::genx_raw_sends: + case GenXIntrinsic::genx_raw_sends_noresult: + report_fatal_error("Unsupported genx intrinsic in SIMT mode."); + return nullptr; + case GenXIntrinsic::genx_dword_atomic_add: + case GenXIntrinsic::genx_dword_atomic_sub: + case GenXIntrinsic::genx_dword_atomic_min: + case GenXIntrinsic::genx_dword_atomic_max: + case GenXIntrinsic::genx_dword_atomic_xchg: + case GenXIntrinsic::genx_dword_atomic_and: + case GenXIntrinsic::genx_dword_atomic_or: + case GenXIntrinsic::genx_dword_atomic_xor: + case GenXIntrinsic::genx_dword_atomic_imin: + case GenXIntrinsic::genx_dword_atomic_imax: + case GenXIntrinsic::genx_dword_atomic_fmin: + case GenXIntrinsic::genx_dword_atomic_fmax: + { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Args[] = {Src0, BTI, Src2, Src3, Src4}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType(), Src2->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_dword_atomic_inc: + case GenXIntrinsic::genx_dword_atomic_dec: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Args[] = {Src0, BTI, Src2, Src3}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_dword_atomic_fcmpwr: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Args[] = {Src0, BTI, Src2, Src3, Src4, Src5}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = 
{RetTy, Src0->getType(), Src2->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_dword_atomic_cmpxchg: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Args[] = {Src0, BTI, Src2, Src3, Src4, Src5}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_svm_gather: { + Value *Predicate = getPacketizeValue(CI->getOperand(0)); + Value *NBlk = CI->getOperand(1); + assert(isa(NBlk)); + Value *Addr = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Args[] = {Predicate, NBlk, Addr, Src3}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Predicate->getType(), Addr->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_svm_scatter: { + Value *Predicate = getPacketizeValue(CI->getOperand(0)); + Value *NBlk = CI->getOperand(1); + assert(isa(NBlk)); + Value *Addr = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Args[] = {Predicate, NBlk, Addr, Src3}; + // store, no return type + Type *Tys[] = {Predicate->getType(), Addr->getType(), Src3->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_svm_gather4_scaled: { + Value *Predicate = getPacketizeValue(CI->getOperand(0)); + Value *ChMask = CI->getOperand(1); + assert(isa(ChMask)); + Value *Scale = CI->getOperand(2); + assert(isa(Scale)); + Value *Addr = getUniformValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Args[] = {Predicate, ChMask, Scale, Addr, Src4, Src5}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Predicate->getType(), Src4->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_svm_scatter4_scaled: { + Value *Predicate = getPacketizeValue(CI->getOperand(0)); + Value *ChMask = CI->getOperand(1); + assert(isa(ChMask)); + Value *Scale = CI->getOperand(2); + assert(isa(Scale)); + Value *Addr = getUniformValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Args[] = {Predicate, ChMask, Scale, Addr, Src4, Src5}; + // store no return type + Type *Tys[] = {Predicate->getType(), 
Addr->getType(), Src4->getType(), + Src5->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_gather4_typed: { + Value *ChMask = CI->getOperand(0); + assert(isa(ChMask)); + Value *Predicate = getPacketizeValue(CI->getOperand(1)); + Value *BTI = getUniformValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Src6 = getPacketizeValue(CI->getOperand(6)); + Value *Args[] = {ChMask, Predicate, BTI, Src3, Src4, Src5, Src6}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Predicate->getType(), Src3->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_scatter4_typed: { + Value *ChMask = CI->getOperand(0); + assert(isa(ChMask)); + Value *Predicate = getPacketizeValue(CI->getOperand(1)); + Value *BTI = getUniformValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Src6 = getPacketizeValue(CI->getOperand(6)); + Value *Args[] = {ChMask, Predicate, BTI, Src3, Src4, Src5, Src6}; + // store no return type + Type *Tys[] = {Predicate->getType(), Src3->getType(), Src6->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_scatter4_scaled: + case GenXIntrinsic::genx_scatter_scaled: { + Value *Predicate = getPacketizeValue(CI->getOperand(0)); + Value *NBlk = CI->getOperand(1); // or channel mask for scatter4 + assert(isa(NBlk)); + Value *Scale = CI->getOperand(2); + assert(isa(Scale)); + Value *BTI = getUniformValue(CI->getOperand(3)); + Value *GOff = getUniformValue(CI->getOperand(4)); + Value *ElemOffsets = getPacketizeValue(CI->getOperand(5)); + Value *InData = getPacketizeValue(CI->getOperand(6)); + Value *Args[] = {Predicate, NBlk, Scale, BTI, + GOff, ElemOffsets, InData}; + // no return value for store + Type *Tys[] = {Args[0]->getType(), Args[5]->getType(), + Args[6]->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_gather4_scaled: + case GenXIntrinsic::genx_gather_scaled: { + Value *Predicate = getPacketizeValue(CI->getOperand(0)); + Value *NBlk = CI->getOperand(1); // or channel mask for gather4 + assert(isa(NBlk)); + Value *Scale = CI->getOperand(2); + assert(isa(Scale)); + Value *BTI = getUniformValue(CI->getOperand(3)); + Value *GOff = getUniformValue(CI->getOperand(4)); + Value *ElemOffsets = getPacketizeValue(CI->getOperand(5)); + Value *InData = getPacketizeValue(CI->getOperand(6)); + Value *Args[] = {Predicate, NBlk, Scale, BTI, + GOff, ElemOffsets, InData}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Args[0]->getType(), Args[5]->getType()}; + auto Decl = 
GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_gather4_scaled2: + case GenXIntrinsic::genx_gather_scaled2: { + Value *NBlk = CI->getOperand(0); + assert(isa(NBlk)); + Value *Scale = CI->getOperand(1); + assert(isa(Scale)); + Value *BTI = getUniformValue(CI->getOperand(2)); + Value *GOff = getUniformValue(CI->getOperand(3)); + Value *ElemOffsets = getPacketizeValue(CI->getOperand(4)); + Value *Args[] = {NBlk, Scale, BTI, GOff, ElemOffsets}; + Type *RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, ElemOffsets->getType()}; + Function *Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } + case GenXIntrinsic::genx_lane_id: { + assert((CI->getType()->getIntegerBitWidth() == 32) && + "Expected to return 32-bit integer."); + if (B->mVWidth == 8) { + std::initializer_list l = {0, 1, 2, 3, 4, 5, 6, 7}; + replacement = B->C(l); + } else if (B->mVWidth == 16) { + std::initializer_list l = {0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15}; + replacement = B->C(l); + } else if (B->mVWidth == 32) { + std::initializer_list l = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}; + replacement = B->C(l); + } else + assert(false); + return replacement; + } break; + case GenXIntrinsic::genx_rdregionf: + case GenXIntrinsic::genx_rdregioni: { + // packetize intrinsic operands + const DebugLoc &DL = CI->getDebugLoc(); + auto OrigV0 = CI->getOperand(0); + CMRegion R(CI); + assert(R.Width == 1); + if (OrigV0->getType()->getVectorNumElements() == 1) { + replacement = getPacketizeValue(OrigV0); + } else { + R.NumElements = B->mVWidth; + if (R.Indirect) { + R.Indirect = getPacketizeValue(R.Indirect); + } + replacement = R.createRdRegion(getPacketizeValue(OrigV0), + CI->getName(), CI, DL); + } + return replacement; + } break; + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrregioni: { + auto NewV0 = CI->getOperand(1); + const DebugLoc &DL = CI->getDebugLoc(); + CMRegion R(CI); + assert(isa(NewV0->getType())); + assert(NewV0->getType()->getVectorNumElements() == 1); + auto NewV1 = getPacketizeValue(NewV0); + R.NumElements = B->mVWidth; + if (R.Indirect) { + R.Indirect = getPacketizeValue(R.Indirect); + } + replacement = + R.createWrRegion(CI->getOperand(0), NewV1, CI->getName(), CI, DL); + return replacement; + } break; + case GenXIntrinsic::genx_untyped_atomic_add: + case GenXIntrinsic::genx_untyped_atomic_sub: + case GenXIntrinsic::genx_untyped_atomic_min: + case GenXIntrinsic::genx_untyped_atomic_max: + case GenXIntrinsic::genx_untyped_atomic_xchg: + case GenXIntrinsic::genx_untyped_atomic_and: + case GenXIntrinsic::genx_untyped_atomic_or: + case GenXIntrinsic::genx_untyped_atomic_xor: + case GenXIntrinsic::genx_untyped_atomic_imin: + case GenXIntrinsic::genx_untyped_atomic_imax: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *GOFF = getUniformValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Args[] = {Src0, BTI, GOFF, Src3, Src4, Src5}; + auto RetTy = 
B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_untyped_atomic_inc: + case GenXIntrinsic::genx_untyped_atomic_dec: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *GOFF = getUniformValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Args[] = {Src0, BTI, GOFF, Src3, Src4}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_untyped_atomic_cmpxchg: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *GOFF = getUniformValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Src6 = getPacketizeValue(CI->getOperand(6)); + Value *Args[] = {Src0, BTI, GOFF, Src3, Src4, Src5, Src6}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + + case GenXIntrinsic::genx_typed_atomic_add: + case GenXIntrinsic::genx_typed_atomic_sub: + case GenXIntrinsic::genx_typed_atomic_min: + case GenXIntrinsic::genx_typed_atomic_max: + case GenXIntrinsic::genx_typed_atomic_xchg: + case GenXIntrinsic::genx_typed_atomic_and: + case GenXIntrinsic::genx_typed_atomic_or: + case GenXIntrinsic::genx_typed_atomic_xor: + case GenXIntrinsic::genx_typed_atomic_imin: + case GenXIntrinsic::genx_typed_atomic_imax: + case GenXIntrinsic::genx_typed_atomic_fmin: + case GenXIntrinsic::genx_typed_atomic_fmax: + { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Src6 = getPacketizeValue(CI->getOperand(6)); + Value *Args[] = {Src0, BTI, Src2, Src3, Src4, Src5, Src6}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType(), Src3->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_typed_atomic_inc: + case GenXIntrinsic::genx_typed_atomic_dec: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Args[] = {Src0, BTI, Src2, Src3, 
Src4, Src5}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType(), Src2->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + case GenXIntrinsic::genx_typed_atomic_fcmpwr: + case GenXIntrinsic::genx_typed_atomic_cmpxchg: { + Value *Src0 = getPacketizeValue(CI->getOperand(0)); + Value *BTI = getUniformValue(CI->getOperand(1)); + Value *Src2 = getPacketizeValue(CI->getOperand(2)); + Value *Src3 = getPacketizeValue(CI->getOperand(3)); + Value *Src4 = getPacketizeValue(CI->getOperand(4)); + Value *Src5 = getPacketizeValue(CI->getOperand(5)); + Value *Src6 = getPacketizeValue(CI->getOperand(6)); + Value *Src7 = getPacketizeValue(CI->getOperand(7)); + Value *Args[] = {Src0, BTI, Src2, Src3, Src4, Src5, Src6, Src7}; + auto RetTy = B->GetVectorType(CI->getType()); + Type *Tys[] = {RetTy, Src0->getType(), Src4->getType()}; + auto Decl = GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + replacement = CallInst::Create(Decl, Args, CI->getName(), CI); + cast(replacement)->setDebugLoc(CI->getDebugLoc()); + return replacement; + } break; + // default llvm-intrinsic packetizing rule should work for svm atomics + default: + break; + } + } + } + return nullptr; +} + +/// - map old instruction to new in case we revisit the old instruction +Value *GenXPacketize::packetizeInstruction(Instruction *pInst) { + // determine instruction type and call its packetizer + Value *pResult = packetizeGenXIntrinsic(pInst); + if (!pResult) { + if (IsLLVMIntrinsic(pInst)) + pResult = packetizeLLVMIntrinsic(pInst); + else + pResult = packetizeLLVMInstruction(pInst); + } + + if (pResult) { + if (pInst->getName() != "") { + pResult->setName(pInst->getName()); + } + + // Copy any metadata to new instruction + if (pResult != pInst && isa(pResult)) { + cast(pResult)->copyMetadata(*pInst); + } + } + + return pResult; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Replace all uses but avoid any type checking as instructions +/// maybe in a partial bad state. +/// @param pInst - old instruction we're replacing. +/// @param pNewInst - new instruction +void GenXPacketize::replaceAllUsesNoTypeCheck(Value *pInst, Value *pNewInst) { + SmallVector users; + SmallVector opNum; + + for (auto &U : pInst->uses()) { + users.push_back(U.getUser()); + opNum.push_back(U.getOperandNo()); + } + for (uint32_t i = 0; i < users.size(); ++i) { + users[i]->setOperand(opNum[i], pNewInst); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Remove replaced instructions. DCE will not remove calls, etc. +/// So we have to remove these manually. +void GenXPacketize::removeDeadInstructions(Function &F) { + SmallVector unused; + for (auto RMI : ReplaceMap) { + if (RMI.first != RMI.second) { + if (Instruction *UnusedInst = + (Instruction *)dyn_cast(RMI.first)) { + unused.push_back(UnusedInst); + } + } + } + for (auto UnusedInst : unused) { + UnusedInst->replaceAllUsesWith(UndefValue::get(UnusedInst->getType())); + UnusedInst->eraseFromParent(); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief LLVM optimizes certain operations and replaces with general C +/// functions instead +/// of llvm intrinsics (sqrtf() instead of llvm.sqrt() for example). 
We +/// convert these back to known llvm intrinsics before packetization, +/// which are handled natively +/// @param F - function to analyze +void GenXPacketize::fixupLLVMIntrinsics(Function &F) { + std::unordered_set removeSet; + + for (auto &BB : F.getBasicBlockList()) { + for (auto &I : BB.getInstList()) { + if (isa(I)) { + CallInst *pCallInst = cast(&I); + Function *pFunc = pCallInst->getCalledFunction(); + if (pFunc) { + if (pFunc->getName().startswith("sqrt")) { + B->IRB()->SetInsertPoint(&I); + Value *pSqrt = B->VSQRTPS(pCallInst->getOperand(0)); + pCallInst->replaceAllUsesWith(pSqrt); + removeSet.insert(pCallInst); + } else if (pFunc->getName().startswith("fabs")) { + B->IRB()->SetInsertPoint(&I); + Value *pFabs = B->FABS(pCallInst->getOperand(0)); + pCallInst->replaceAllUsesWith(pFabs); + removeSet.insert(pCallInst); + } else if (pFunc->getName().startswith("exp2")) { + B->IRB()->SetInsertPoint(&I); + Value *pExp2 = B->EXP2(pCallInst->getOperand(0)); + pCallInst->replaceAllUsesWith(pExp2); + removeSet.insert(pCallInst); + } else if (pFunc->getName().equals("ldexpf")) { + B->IRB()->SetInsertPoint(&I); + Value *pArg = pCallInst->getOperand(0); + Value *pExp = pCallInst->getOperand(1); + + // replace ldexp with arg * 2^exp = arg * (2 << arg) + Value *pShift = B->SHL(B->C(1), pExp); + pShift = B->UI_TO_FP(pShift, B->mFP32Ty); + Value *pResult = B->FMUL(pArg, pShift); + pCallInst->replaceAllUsesWith(pResult); + removeSet.insert(pCallInst); + } + } + } + } + } + + for (auto *pInst : removeSet) { + pInst->eraseFromParent(); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief find the global ExecMask variable if exists in order to lower +/// CM SIMD control-flow representation after packetization +GlobalVariable *GenXPacketize::findGlobalExecMask() { + // look for the global EMask variable if exists + for (auto &Global : M->getGlobalList()) { + auto Ty = Global.getType()->getElementType(); + if (Ty->isVectorTy() && + Ty->getVectorNumElements() == CMSimdCFLower::MAX_SIMD_CF_WIDTH) { + auto ElemTy = Ty->getVectorElementType(); + if (ElemTy->isIntegerTy() && ElemTy->getIntegerBitWidth() == 1) { + // so far the type is right, then check the use + for (auto EMUI = Global.use_begin(), EMUE = Global.use_end(); + EMUI != EMUE; ++EMUI) { + if (auto LD = dyn_cast(EMUI->getUser())) { + for (auto UI = LD->user_begin(), E = LD->user_end(); UI != E; + ++UI) { + const Value *LocalUse = (*UI); + if (auto CI = dyn_cast_or_null(LocalUse)) { + if (Function *Callee = CI->getCalledFunction()) { + if (GenXIntrinsic::getGenXIntrinsicID(Callee) == + GenXIntrinsic::genx_simdcf_goto) + return &Global; + } + } + } + } + } + } + } + } + return nullptr; +} +////////////////////////////////////////////////////////////////////////// +/// @brief lower CM SIMD control-flow representation after packetization +/// +void GenXPacketize::lowerControlFlowAfter(std::vector &SIMTFuncs) { + auto EMVar = findGlobalExecMask(); + // create one if we cannot find one. + if (!EMVar) { + auto EMTy = VectorType::get(Type::getInt1Ty(M->getContext()), + CMSimdCFLower::MAX_SIMD_CF_WIDTH); + EMVar = new GlobalVariable(*M, EMTy, false /*isConstant*/, + GlobalValue::InternalLinkage, + Constant::getAllOnesValue(EMTy), "EM"); + } + CMSimdCFLower CFL(EMVar); + // Derive an order to process functions such that a function is visited + // after anything that calls it. 
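  // The loop just below walks SIMTFuncs from back to front; that satisfies the
  // requirement above provided the vector records callees before the functions
  // that call them (the fill order is established by the caller that builds
  // SIMTFuncs).
  //
  // Illustrative sketch only -- not part of this pass: a conventional way to
  // derive such an order from scratch is a post-order walk of the call graph,
  // which emits callees first, so visiting it in reverse sees every caller
  // before its callees.
  //
  //   #include "llvm/ADT/PostOrderIterator.h"
  //   #include "llvm/Analysis/CallGraph.h"
  //
  //   std::vector<Function *> Order;           // callees end up first
  //   CallGraph CG(*M);
  //   for (CallGraphNode *N : post_order(&CG))
  //     if (Function *F = N->getFunction())    // skip the external root node
  //       Order.push_back(F);
  //   for (auto I = Order.rbegin(), E = Order.rend(); I != E; ++I)
  //     CFL.processFunction(*I);               // callers before callees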
+ int n = SIMTFuncs.size(); + for (int i = n - 1; i >= 0; --i) + CFL.processFunction(SIMTFuncs[i]); +} + +// foward declare the initializer +void initializeGenXPacketizePass(PassRegistry &); + +} // namespace llvm + +using namespace llvm; + +char GenXPacketize::ID = 0; +INITIALIZE_PASS_BEGIN(GenXPacketize, "GenXPacketize", "GenXPacketize", false, + false) +INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges) +INITIALIZE_PASS_END(GenXPacketize, "GenXPacketize", "GenXPacketize", false, + false) + +namespace llvm { +ModulePass *createGenXPacketizePass() { + initializeGenXPacketizePass(*PassRegistry::getPassRegistry()); + return new GenXPacketize(); +} +} // namespace llvm diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.cpp new file mode 100644 index 000000000000..c09ff72d3b1b --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.cpp @@ -0,0 +1,209 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "PacketBuilder.h" + +using namespace llvm; + +namespace pktz +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Contructor for Builder. + /// @param pJitMgr - JitManager which contains modules, function passes, etc. 
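  /// @param pModule - LLVM module the packetized code is built into (this is
  ///                  the parameter the definition below actually takes)
  /// @param width   - target SIMD width in lanes, forwarded to SetTargetWidth()
  ///
  /// Rough usage sketch (illustrative only; M and ScalarTy are placeholders):
  ///   PacketBuilder B(M, /*width=*/16);         // one builder per module
  ///   Type *VecTy = B.GetVectorType(ScalarTy);  // SOA-packetized type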
+ PacketBuilder::PacketBuilder(Module *pModule, uint32_t width) + { + mVWidth16 = 16; + mpModule = static_cast(pModule); + + // Built in types: scalar + LLVMContext& Ctx = getContext(); + mpIRBuilder = new IGCLLVM::IRBuilder<>(Ctx); + mVoidTy = Type::getVoidTy(Ctx); + mFP16Ty = Type::getHalfTy(Ctx); + mFP32Ty = Type::getFloatTy(Ctx); + mFP32PtrTy = PointerType::get(mFP32Ty, 0); + mDoubleTy = Type::getDoubleTy(Ctx); + mInt1Ty = Type::getInt1Ty(Ctx); + mInt8Ty = Type::getInt8Ty(Ctx); + mInt16Ty = Type::getInt16Ty(Ctx); + mInt32Ty = Type::getInt32Ty(Ctx); + mInt8PtrTy = PointerType::get(mInt8Ty, 0); + mInt16PtrTy = PointerType::get(mInt16Ty, 0); + mInt32PtrTy = PointerType::get(mInt32Ty, 0); + mInt64Ty = Type::getInt64Ty(Ctx); + + mSimd4FP64Ty = VectorType::get(mDoubleTy, 4); + + // Built in types: simd16 + mSimd16Int1Ty = VectorType::get(mInt1Ty, mVWidth16); + mSimd16Int16Ty = VectorType::get(mInt16Ty, mVWidth16); + mSimd16Int32Ty = VectorType::get(mInt32Ty, mVWidth16); + mSimd16Int64Ty = VectorType::get(mInt64Ty, mVWidth16); + mSimd16FP16Ty = VectorType::get(mFP16Ty, mVWidth16); + mSimd16FP32Ty = VectorType::get(mFP32Ty, mVWidth16); + + mSimd32Int8Ty = VectorType::get(mInt8Ty, 32); + + if (sizeof(uint32_t*) == 4) + { + mIntPtrTy = mInt32Ty; + mSimd16IntPtrTy = mSimd16Int32Ty; + } + else + { + assert(sizeof(uint32_t*) == 8); + mIntPtrTy = mInt64Ty; + mSimd16IntPtrTy = mSimd16Int64Ty; + } + // Built in types: target simd + SetTargetWidth(width); + + } + + void PacketBuilder::SetTargetWidth(uint32_t width) + { + mVWidth = width; + + mSimdInt1Ty = VectorType::get(mInt1Ty, mVWidth); + mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth); + mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth); + mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth); + mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth); + mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); + if (sizeof(uint32_t*) == 4) + { + mSimdIntPtrTy = mSimdInt32Ty; + } + else + { + assert(sizeof(uint32_t*) == 8); + mSimdIntPtrTy = mSimdInt64Ty; + } + } + + /// @brief Mark this alloca as temporary to avoid hoisting later on + void PacketBuilder::SetTempAlloca(Value* inst) + { + AllocaInst* pAlloca = dyn_cast(inst); + assert(pAlloca && "Unexpected non-alloca instruction"); + MDNode* N = MDNode::get(getContext(), MDString::get(getContext(), "is_temp_alloca")); + pAlloca->setMetadata("is_temp_alloca", N); + } + + bool PacketBuilder::IsTempAlloca(Value* inst) + { + AllocaInst* pAlloca = dyn_cast(inst); + assert(pAlloca && "Unexpected non-alloca instruction"); + + return (pAlloca->getMetadata("is_temp_alloca") != nullptr); + } + + // Returns true if able to find a call instruction to mark + bool PacketBuilder::SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName) + { + CallInst* pCallInstr = dyn_cast(inst); + if (pCallInstr) + { + MDNode* N = MDNode::get(getContext(), MDString::get(getContext(), mdName)); + pCallInstr->setMetadata(mdName, N); + return true; + } + else + { + // Follow use def chain back up + for (Use& u : inst->operands()) + { + Instruction* srcInst = dyn_cast(u.get()); + if (srcInst) + { + if (SetNamedMetaDataOnCallInstr(srcInst, mdName)) + { + return true; + } + } + } + } + + return false; + } + + bool PacketBuilder::HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName) + { + CallInst* pCallInstr = dyn_cast(inst); + + if (!pCallInstr) + { + return false; + } + + return (pCallInstr->getMetadata(mdName) != nullptr); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Packetizes the 
type. Assumes SOA conversion. + Type* PacketBuilder::GetVectorType(Type* pType) + { + if (pType->isVoidTy()) + return pType; + + if (pType->isVectorTy()) + { + uint32_t vectorSize = pType->getVectorNumElements(); + Type* pElemType = pType->getVectorElementType(); + Type* pVecType = VectorType::get(pElemType, vectorSize*mVWidth); + return pVecType; + } + + // [N x float] should packetize to [N x <8 x float>] + if (pType->isArrayTy()) + { + uint32_t arraySize = pType->getArrayNumElements(); + Type* pArrayType = pType->getArrayElementType(); + Type* pVecArrayType = GetVectorType(pArrayType); + Type* pVecType = ArrayType::get(pVecArrayType, arraySize); + return pVecType; + } + + // {float,int} should packetize to {<8 x float>, <8 x int>} + if (pType->isAggregateType()) + { + uint32_t numElems = pType->getStructNumElements(); + SmallVector vecTypes; + for (uint32_t i = 0; i < numElems; ++i) + { + Type* pElemType = pType->getStructElementType(i); + Type* pVecElemType = GetVectorType(pElemType); + vecTypes.push_back(pVecElemType); + } + Type* pVecType = StructType::get(getContext(), vecTypes); + return pVecType; + } + + // should packetize to <8 x > + Type* vecType = VectorType::get(pType, mVWidth); + return vecType; + } +} // end of namespace pktz diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.h b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.h new file mode 100644 index 000000000000..5d3586148bb5 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder.h @@ -0,0 +1,340 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +#pragma once +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +#include "llvmWrapper/IR/IRBuilder.h" +#include "llvmWrapper/IR/InstrTypes.h" +#include "llvmWrapper/IR/Module.h" + +#include +#include + +using namespace llvm; + +namespace pktz +{ + struct PacketBuilder + { + public: + PacketBuilder(Module* pModule, uint32_t width = 16); + virtual ~PacketBuilder() + { + if (mpIRBuilder) + delete mpIRBuilder; + } + + IGCLLVM::IRBuilder<>* IRB() { return mpIRBuilder; }; + LLVMContext &getContext() { return mpModule->getContext(); } + + IGCLLVM::Module* mpModule; + IGCLLVM::IRBuilder<>* mpIRBuilder; + + uint32_t mVWidth; // vector width target simd + uint32_t mVWidth16; // vector width simd16 + + // Built in types: scalar + Type* mVoidTy; + Type* mInt1Ty; + Type* mInt8Ty; + Type* mInt16Ty; + Type* mInt32Ty; + Type* mInt64Ty; + Type* mIntPtrTy; + Type* mFP16Ty; + Type* mFP32Ty; + Type* mFP32PtrTy; + Type* mDoubleTy; + Type* mInt8PtrTy; + Type* mInt16PtrTy; + Type* mInt32PtrTy; + + Type* mSimd4FP64Ty; + + // Built in types: target SIMD + Type* mSimdFP16Ty; + Type* mSimdFP32Ty; + Type* mSimdInt1Ty; + Type* mSimdInt16Ty; + Type* mSimdInt32Ty; + Type* mSimdInt64Ty; + Type* mSimdIntPtrTy; + + // Built in types: simd16 + + Type* mSimd16FP16Ty; + Type* mSimd16FP32Ty; + Type* mSimd16Int1Ty; + Type* mSimd16Int16Ty; + Type* mSimd16Int32Ty; + Type* mSimd16Int64Ty; + Type* mSimd16IntPtrTy; + + Type* mSimd32Int8Ty; + + void SetTargetWidth(uint32_t width); + void SetTempAlloca(Value* inst); + bool IsTempAlloca(Value* inst); + bool SetNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName); + bool HasNamedMetaDataOnCallInstr(Instruction* inst, StringRef mdName); + Type* GetVectorType(Type* pType); + void SetMetadata(StringRef s, uint32_t val) + { + llvm::NamedMDNode* metaData = mpModule->getOrInsertNamedMetadata(s); + Constant* cval = IRB()->getInt32(val); + llvm::MDNode* mdNode = llvm::MDNode::get(getContext(), + llvm::ConstantAsMetadata::get(cval)); + if (metaData->getNumOperands()) + { + metaData->setOperand(0, mdNode); + } + else + { + metaData->addOperand(mdNode); + } + } + uint32_t GetMetadata(StringRef s) + { + NamedMDNode* metaData = mpModule->getNamedMetadata(s); + if (metaData) + { + MDNode* mdNode = metaData->getOperand(0); + Metadata* val = mdNode->getOperand(0); + return mdconst::dyn_extract(val)->getZExtValue(); + } + else + { + return 0; + } + } +#include "gen_builder.hpp" +#include "gen_builder_intrin.hpp" +#include "gen_builder_meta.hpp" + + Value* VLOG2PS(Value* src); + Value* VPOW24PS(Value* src); + Value* VEXP2PS(Value* src); + + //#include "PacketBuilder_misc.h" + Constant* C(bool i); + Constant* C(char i); + Constant* C(uint8_t i); + Constant* C(int i); + Constant* C(int64_t i); + Constant* C(uint64_t i); + Constant* C(uint16_t i); + Constant* C(uint32_t i); + Constant* C(float i); + + template + Constant* C(const std::initializer_list& constList) + { + std::vector vConsts; + for (auto i : constList) + { + vConsts.push_back(C((Ty)i)); + } + return ConstantVector::get(vConsts); + } + + template + Constant* CA(LLVMContext& ctx, ArrayRef constList) + { + return ConstantDataArray::get(ctx, constList); + } + + template + Constant* CInc(uint32_t base, uint32_t count) + { + std::vector vConsts; + + for 
(uint32_t i = 0; i < count; i++) + { + vConsts.push_back(C((Ty)base)); + base++; + } + return ConstantVector::get(vConsts); + } + + Constant* PRED(bool pred); + + Value* VIMMED1(int i); + Value* VIMMED1_16(int i); + + Value* VIMMED1(uint32_t i); + Value* VIMMED1_16(uint32_t i); + + Value* VIMMED1(float i); + Value* VIMMED1_16(float i); + + Value* VIMMED1(bool i); + Value* VIMMED1_16(bool i); + + Value* VUNDEF(Type* t); + + Value* VUNDEF_F(); + Value* VUNDEF_F_16(); + + Value* VUNDEF_I(); + Value* VUNDEF_I_16(); + + Value* VUNDEF(Type* ty, uint32_t size); + + Value* VUNDEF_IPTR(); + + Value* VBROADCAST(Value* src, const llvm::Twine& name = ""); + Value* VBROADCAST_16(Value* src); + + Value* VRCP(Value* va, const llvm::Twine& name = ""); + Value* VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY); + + uint32_t IMMED(Value* i); + int32_t S_IMMED(Value* i); + + CallInst* CALL(Value* Callee, const std::initializer_list& args, const llvm::Twine& name = ""); + CallInst* CALL(Value* Callee) + { + return CALLA(Callee); + } + CallInst* CALL(Value* Callee, Value* arg); + CallInst* CALL2(Value* Callee, Value* arg1, Value* arg2); + CallInst* CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3); + + Value* MASK(Value* vmask); + Value* MASK_16(Value* vmask); + + Value* VMASK(Value* mask); + Value* VMASK_16(Value* mask); + + Value* VMOVMSK(Value* mask); + + ////////////////////////////////////////////////////////////////////////// + /// @brief functions that build IR to call x86 intrinsics directly, or + /// emulate them with other instructions if not available on the host + ////////////////////////////////////////////////////////////////////////// + + Value* EXTRACT_16(Value* x, uint32_t imm); + Value* JOIN_16(Value* a, Value* b); + + Value* PSHUFB(Value* a, Value* b); + Value* PMOVSXBD(Value* a); + Value* PMOVSXWD(Value* a); + Value* PMAXSD(Value* a, Value* b); + Value* PMINSD(Value* a, Value* b); + Value* PMAXUD(Value* a, Value* b); + Value* PMINUD(Value* a, Value* b); + Value* VABSPS(Value* a); + Value* FMADDPS(Value* a, Value* b, Value* c); + + Value* ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name = ""); + Value* FCLAMP(Value* src, Value* low, Value* high); + Value* FCLAMP(Value* src, float low, float high); + + Value* VPOPCNT(Value* a); + + Value* VEXTRACTI128(Value* a, Constant* imm8); + Value* VINSERTI128(Value* a, Value* b, Constant* imm8); + + Value* CreateEntryAlloca(Function* pFunc, Type* pType); + Value* CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize); + + uint32_t GetTypeSize(Type* pType); + + // #include "PacketBuilder_mem.h" + public: + typedef enum _JIT_MEM_CLIENT + { + MEM_CLIENT_INTERNAL, + GFX_MEM_CLIENT_FETCH, + GFX_MEM_CLIENT_SAMPLER, + GFX_MEM_CLIENT_SHADER, + } JIT_MEM_CLIENT; + + protected: + virtual Value* OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset); + void AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage); + + public: + virtual Value* GEP(Value* Ptr, Value* Idx, Type* Ty = nullptr, const Twine& Name = ""); + virtual Value* GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name = ""); + virtual Value* GEP(Value* ptr, const std::initializer_list& indexList, Type* Ty = nullptr); + virtual Value* GEP(Value* ptr, const std::initializer_list& indexList, Type* Ty = nullptr); + + Value* GEPA(Value* Ptr, ArrayRef IdxList, const Twine& Name = ""); + Value* GEPA(Type* Ty, Value* Ptr, ArrayRef IdxList, const Twine& Name = ""); + + Value* IN_BOUNDS_GEP(Value* ptr, const std::initializer_list& indexList); + 
Value* IN_BOUNDS_GEP(Value* ptr, const std::initializer_list& indexList); + + virtual LoadInst* LOAD(Value* Ptr, const char* Name, Type* Ty = nullptr, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + virtual LoadInst* LOAD(Value* Ptr, + const Twine& Name = "", + Type* Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + virtual LoadInst* LOAD(Type* Ty, Value* Ptr, const Twine& Name = "", JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + virtual LoadInst* LOAD(Value* Ptr, + bool isVolatile, + const Twine& Name = "", + Type* Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + virtual LoadInst* LOAD(Value* BasePtr, + const std::initializer_list& offset, + const llvm::Twine& Name = "", + Type* Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL); + + virtual CallInst* MASKED_LOAD(Value* Ptr, + unsigned Align, + Value* Mask, + Value* PassThru = nullptr, + const Twine& Name = "", + Type* Ty = nullptr, + JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL) + { + return IRB()->CreateMaskedLoad(Ptr, Align, Mask, PassThru, Name); + } + + LoadInst* LOADV(Value* BasePtr, const std::initializer_list& offset, const llvm::Twine& name = ""); + StoreInst* STORE(Value* Val, Value* BasePtr, const std::initializer_list& offset); + StoreInst* STOREV(Value* Val, Value* BasePtr, const std::initializer_list& offset); + + Value* MEM_ADD(Value* i32Incr, + Value* basePtr, + const std::initializer_list& indices, + const llvm::Twine& name = ""); + }; +} // end of namespace pktz diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_math.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_math.cpp new file mode 100644 index 000000000000..620a146b1521 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_math.cpp @@ -0,0 +1,163 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ + +#include "PacketBuilder.h" + +// need to disable this to use INFINITY and NAN values +#pragma warning(disable : 4756 4056) + +//#include + +namespace pktz +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Computes log2(a) using either scalar log2 function from the runtime + /// or vector approximation + /// @param a - src float vector + Value* PacketBuilder::VLOG2PS(Value* a) + { + Value* result; + + // fast log2 approximation + // log2(x) = (x.ExpPart - 127) + log(1.xFracPart) + Value* asInt = BITCAST(a, mSimdInt32Ty); + Value* b = SUB(AND(ASHR(asInt, 23), 255), VIMMED1(127)); + Value* intermResult = SI_TO_FP(b, mSimdFP32Ty); + + Value* fa = OR(AND(asInt, VIMMED1(0x007FFFFF)), VIMMED1(127 << 23)); + fa = BITCAST(fa, mSimdFP32Ty); + fa = FSUB(fa, VIMMED1(1.0f)); + + // log(x) = (1.4386183024320163f + (-0.640238532500937f + + // 0.20444600983623412f*fx)*fx)*fx; + result = FMUL(fa, VIMMED1(0.20444600983623412f)); + result = FADD(result, VIMMED1(-0.640238532500937f)); + result = FMUL(fa, result); + result = FADD(result, VIMMED1(1.4386183024320163f)); + result = FMUL(result, fa); + result = FADD(result, intermResult); + + // handle bad input + // 0 -> -inf + Value* zeroInput = FCMP_OEQ(a, VIMMED1(0.0f)); + result = SELECT(zeroInput, VIMMED1(-INFINITY), result); + + // -F -> NAN + Value* negInput = FCMP_OLT(a, VIMMED1(0.0f)); + result = SELECT(negInput, VIMMED1(NAN), result); + + // inf -> inf + Value* infInput = FCMP_OEQ(a, VIMMED1(INFINITY)); + result = SELECT(infInput, VIMMED1(INFINITY), result); + + // NAN -> NAN + Value* nanInput = FCMP_UNO(a, a); + result = SELECT(nanInput, VIMMED1(NAN), result); + + result->setName("log2."); + return result; + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Computes a^2.4 using either scalar pow function from the runtime + /// or vector approximation + /// @param a - src float vector + Value* PacketBuilder::VPOW24PS(Value* a) + { + Value* result; + // approximation algorithm from + // http://stackoverflow.com/questions/6475373/optimizations-for-pow-with-const-non-integer-exponent + // computes a^2.4 with approximately 5% overestimate. 
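    // (Illustrative aside, not taken from the original comments: the code
    //  below leans on the exponent-bit trick.  For a positive float x the
    //  integer view of its bits is roughly
    //      bits(x) ~= 2^23 * (log2(x) + 127),
    //  so scaling the bit pattern by 24/10 and reinterpreting the result as a
    //  float yields approximately x^2.4 once the exponent bias has been
    //  pre-compensated -- which is what correctionFactor folds in.)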
+ // can reduce the error further with a few more terms + + const float expnum = 24; + const float expden = 10; + const float coeffnum = 1.0f; + const float coeffden = 1.0f; + + Value* correctionFactor = + VIMMED1(exp2f(127.f * expden / expnum - 127.f) * + powf(1.f * coeffnum / coeffden, 1.0f * expden / expnum)); + + result = FMUL(a, correctionFactor); + result = SI_TO_FP(BITCAST(result, mSimdInt32Ty), mSimdFP32Ty); + result = FMUL(result, VIMMED1(1.f * expnum / expden)); + result = BITCAST(FP_TO_SI(result, mSimdInt32Ty), mSimdFP32Ty); + + result->setName("pow24."); + return result; + } + +#define EXP_POLY_DEGREE 3 + +#define POLY0(x, c0) VIMMED1(c0) +#define POLY1(x, c0, c1) FADD(FMUL(POLY0(x, c1), x), VIMMED1(c0)) +#define POLY2(x, c0, c1, c2) FADD(FMUL(POLY1(x, c1, c2), x), VIMMED1(c0)) +#define POLY3(x, c0, c1, c2, c3) FADD(FMUL(POLY2(x, c1, c2, c3), x), VIMMED1(c0)) +#define POLY4(x, c0, c1, c2, c3, c4) FADD(FMUL(POLY3(x, c1, c2, c3, c4), x), VIMMED1(c0)) +#define POLY5(x, c0, c1, c2, c3, c4, c5) FADD(FMUL(POLY4(x, c1, c2, c3, c4, c5), x), VIMMED1(c0)) + + ////////////////////////////////////////////////////////////////////////// + /// @brief Computes 2^x using either scalar pow function from the runtime + /// or vector approximation + /// @param a - src float vector + Value* PacketBuilder::VEXP2PS(Value* a) + { + Value* result; + + // fast exp2 taken from here: + // http://jrfonseca.blogspot.com/2008/09/fast-sse2-pow-tables-or-polynomials.html + + a = VMINPS(a, VIMMED1(129.0f)); + a = VMAXPS(a, VIMMED1(-126.99999f)); + + Value* ipart = FP_TO_SI(FSUB(a, VIMMED1(0.5f)), mSimdInt32Ty); + Value* fpart = FSUB(a, SI_TO_FP(ipart, mSimdFP32Ty)); + Value* expipart = BITCAST(SHL(ADD(ipart, VIMMED1(127)), 23), mSimdFP32Ty); +#if EXP_POLY_DEGREE == 5 + Value* expfpart = POLY5(fpart, + 9.9999994e-1f, + 6.9315308e-1f, + 2.4015361e-1f, + 5.5826318e-2f, + 8.9893397e-3f, + 1.8775767e-3f); +#elif EXP_POLY_DEGREE == 4 + Value* expfpart = POLY4( + fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f); +#elif EXP_POLY_DEGREE == 3 + Value* expfpart = + POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f); +#elif EXP_POLY_DEGREE == 2 + Value* expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f); +#else +#error +#endif + result = FMUL(expipart, expfpart, "exp2."); + + return result; + } +} diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_mem.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_mem.cpp new file mode 100644 index 000000000000..c77bb1ed6d7c --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_mem.cpp @@ -0,0 +1,172 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "PacketBuilder.h" + +#include + +namespace pktz +{ + void PacketBuilder::AssertMemoryUsageParams(Value* ptr, JIT_MEM_CLIENT usage) + { + assert( + ptr->getType() != mInt64Ty && + "Address appears to be GFX access. Requires translation through BuilderGfxMem."); + } + + Value* PacketBuilder::GEP(Value* Ptr, Value* Idx, Type* Ty, const Twine& Name) + { + return IRB()->CreateGEP(Ptr, Idx, Name); + } + + Value* PacketBuilder::GEP(Type* Ty, Value* Ptr, Value* Idx, const Twine& Name) + { + return IRB()->CreateGEP(Ty, Ptr, Idx, Name); + } + + Value* PacketBuilder::GEP(Value* ptr, const std::initializer_list& indexList, Type* Ty) + { + std::vector indices; + for (auto i : indexList) + indices.push_back(i); + return GEPA(ptr, indices); + } + + Value* PacketBuilder::GEP(Value* ptr, const std::initializer_list& indexList, Type* Ty) + { + std::vector indices; + for (auto i : indexList) + indices.push_back(C(i)); + return GEPA(ptr, indices); + } + + Value* PacketBuilder::GEPA(Value* Ptr, ArrayRef IdxList, const Twine& Name) + { + return IRB()->CreateGEP(Ptr, IdxList, Name); + } + + Value* PacketBuilder::GEPA(Type* Ty, Value* Ptr, ArrayRef IdxList, const Twine& Name) + { + return IRB()->CreateGEP(Ty, Ptr, IdxList, Name); + } + + Value* PacketBuilder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list& indexList) + { + std::vector indices; + for (auto i : indexList) + indices.push_back(i); + return IN_BOUNDS_GEP(ptr, indices); + } + + Value* PacketBuilder::IN_BOUNDS_GEP(Value* ptr, const std::initializer_list& indexList) + { + std::vector indices; + for (auto i : indexList) + indices.push_back(C(i)); + return IN_BOUNDS_GEP(ptr, indices); + } + + LoadInst* PacketBuilder::LOAD(Value* Ptr, const char* Name, Type* Ty, JIT_MEM_CLIENT usage) + { + AssertMemoryUsageParams(Ptr, usage); + return IRB()->CreateLoad(Ptr, Name); + } + + LoadInst* PacketBuilder::LOAD(Value* Ptr, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage) + { + AssertMemoryUsageParams(Ptr, usage); + return IRB()->CreateLoad(Ptr, Name); + } + + LoadInst* PacketBuilder::LOAD(Type* Ty, Value* Ptr, const Twine& Name, JIT_MEM_CLIENT usage) + { + AssertMemoryUsageParams(Ptr, usage); + return IRB()->CreateLoad(Ty, Ptr, Name); + } + + LoadInst* + PacketBuilder::LOAD(Value* Ptr, bool isVolatile, const Twine& Name, Type* Ty, JIT_MEM_CLIENT usage) + { + AssertMemoryUsageParams(Ptr, usage); + return IRB()->CreateLoad(Ptr, isVolatile, Name); + } + + LoadInst* PacketBuilder::LOAD(Value* basePtr, + const std::initializer_list& indices, + const llvm::Twine& name, + Type* Ty, + JIT_MEM_CLIENT usage) + { + std::vector valIndices; + for (auto i : indices) + valIndices.push_back(C(i)); + return PacketBuilder::LOAD(GEPA(basePtr, valIndices), name); + } + + LoadInst* PacketBuilder::LOADV(Value* basePtr, + const std::initializer_list& indices, + const llvm::Twine& name) + { + std::vector valIndices; + for (auto i : indices) + valIndices.push_back(i); + return LOAD(GEPA(basePtr, valIndices), name); + } + + 
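    // Usage sketch for the helpers above (illustrative only; pState and the
    // field index are made-up names; ADD comes from the generated builder
    // headers, while C() and these LOAD/STORE overloads are declared in
    // PacketBuilder.h):
    //
    //   Value    *pState = ...;                   // pointer to some struct
    //   LoadInst *pVal   = LOAD(pState, {0, 3});  // GEP {0, 3}, then load
    //   STORE(ADD(pVal, C(1)), pState, {0, 3});   // bump the field, store back
    //
    // MEM_ADD further below wraps exactly this read-modify-write pattern.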
StoreInst* + PacketBuilder::STORE(Value* val, Value* basePtr, const std::initializer_list& indices) + { + std::vector valIndices; + for (auto i : indices) + valIndices.push_back(C(i)); + return STORE(val, GEPA(basePtr, valIndices)); + } + + StoreInst* + PacketBuilder::STOREV(Value* val, Value* basePtr, const std::initializer_list& indices) + { + std::vector valIndices; + for (auto i : indices) + valIndices.push_back(i); + return STORE(val, GEPA(basePtr, valIndices)); + } + + Value* PacketBuilder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant* offset) + { + return GEP(base, offset); + } + + Value* PacketBuilder::MEM_ADD(Value* i32Incr, + Value* basePtr, + const std::initializer_list& indices, + const llvm::Twine& name) + { + Value* i32Value = LOAD(GEP(basePtr, indices), name); + Value* i32Result = ADD(i32Value, i32Incr); + return STORE(i32Result, GEP(basePtr, indices)); + } + +} diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_misc.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_misc.cpp new file mode 100644 index 000000000000..131d9e5dd40b --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/PacketBuilder_misc.cpp @@ -0,0 +1,503 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +#include "PacketBuilder.h" + +//#include + +namespace pktz +{ + Constant* PacketBuilder::C(bool i) { return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); } + + Constant* PacketBuilder::C(char i) { return ConstantInt::get(IRB()->getInt8Ty(), i); } + + Constant* PacketBuilder::C(uint8_t i) { return ConstantInt::get(IRB()->getInt8Ty(), i); } + + Constant* PacketBuilder::C(int i) { return ConstantInt::get(IRB()->getInt32Ty(), i); } + + Constant* PacketBuilder::C(int64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); } + + Constant* PacketBuilder::C(uint16_t i) { return ConstantInt::get(mInt16Ty, i); } + + Constant* PacketBuilder::C(uint32_t i) { return ConstantInt::get(IRB()->getInt32Ty(), i); } + + Constant* PacketBuilder::C(uint64_t i) { return ConstantInt::get(IRB()->getInt64Ty(), i); } + + Constant* PacketBuilder::C(float i) { return ConstantFP::get(IRB()->getFloatTy(), i); } + + Constant* PacketBuilder::PRED(bool pred) + { + return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 
1 : 0)); + } + + Value* PacketBuilder::VIMMED1(int i) + { + return ConstantVector::getSplat(mVWidth, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1_16(int i) + { + return ConstantVector::getSplat(mVWidth16, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1(uint32_t i) + { + return ConstantVector::getSplat(mVWidth, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1_16(uint32_t i) + { + return ConstantVector::getSplat(mVWidth16, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1(float i) + { + return ConstantVector::getSplat(mVWidth, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1_16(float i) + { + return ConstantVector::getSplat(mVWidth16, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1(bool i) + { + return ConstantVector::getSplat(mVWidth, cast(C(i))); + } + + Value* PacketBuilder::VIMMED1_16(bool i) + { + return ConstantVector::getSplat(mVWidth16, cast(C(i))); + } + + Value* PacketBuilder::VUNDEF_IPTR() { return UndefValue::get(VectorType::get(mInt32PtrTy, mVWidth)); } + + Value* PacketBuilder::VUNDEF(Type* t) { return UndefValue::get(VectorType::get(t, mVWidth)); } + + Value* PacketBuilder::VUNDEF_I() { return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); } + + Value* PacketBuilder::VUNDEF_I_16() { return UndefValue::get(VectorType::get(mInt32Ty, mVWidth16)); } + + Value* PacketBuilder::VUNDEF_F() { return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); } + + Value* PacketBuilder::VUNDEF_F_16() { return UndefValue::get(VectorType::get(mFP32Ty, mVWidth16)); } + + Value* PacketBuilder::VUNDEF(Type* ty, uint32_t size) + { + return UndefValue::get(VectorType::get(ty, size)); + } + + Value* PacketBuilder::VBROADCAST(Value* src, const llvm::Twine& name) + { + // check if src is already a vector + if (src->getType()->isVectorTy()) + { + if (auto CV = dyn_cast(src)) { + if (CV->getSplatValue()) { + return VECTOR_SPLAT(mVWidth*src->getType()->getVectorNumElements(), + CV->getSplatValue(), name); + } + } + return src; + } + + return VECTOR_SPLAT(mVWidth, src, name); + } + + Value* PacketBuilder::VBROADCAST_16(Value* src) + { + // check if src is already a vector + if (src->getType()->isVectorTy()) + { + return src; + } + + return VECTOR_SPLAT(mVWidth16, src); + } + + uint32_t PacketBuilder::IMMED(Value* v) + { + assert(isa(v)); + ConstantInt* pValConst = cast(v); + return pValConst->getZExtValue(); + } + + int32_t PacketBuilder::S_IMMED(Value* v) + { + assert(isa(v)); + ConstantInt* pValConst = cast(v); + return pValConst->getSExtValue(); + } + + CallInst* PacketBuilder::CALL(Value* Callee, + const std::initializer_list& argsList, + const llvm::Twine& name) + { + std::vector args; + for (auto arg : argsList) + args.push_back(arg); + return CALLA(Callee, args, name); + } + + CallInst* PacketBuilder::CALL(Value* Callee, Value* arg) + { + std::vector args; + args.push_back(arg); + return CALLA(Callee, args); + } + + CallInst* PacketBuilder::CALL2(Value* Callee, Value* arg1, Value* arg2) + { + std::vector args; + args.push_back(arg1); + args.push_back(arg2); + return CALLA(Callee, args); + } + + CallInst* PacketBuilder::CALL3(Value* Callee, Value* arg1, Value* arg2, Value* arg3) + { + std::vector args; + args.push_back(arg1); + args.push_back(arg2); + args.push_back(arg3); + return CALLA(Callee, args); + } + + Value* PacketBuilder::VRCP(Value* va, const llvm::Twine& name) + { + return FDIV(VIMMED1(1.0f), va, name); // 1 / a + } + + Value* PacketBuilder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value*& vX, Value*& vY) + { + Value* vOut = FMADDPS(vA, vX, vC); + vOut = FMADDPS(vB, 
vY, vOut); + return vOut; + } + + Value* PacketBuilder::EXTRACT_16(Value* x, uint32_t imm) + { + if (imm == 0) + { + return VSHUFFLE(x, UndefValue::get(x->getType()), {0, 1, 2, 3, 4, 5, 6, 7}); + } + else + { + return VSHUFFLE(x, UndefValue::get(x->getType()), {8, 9, 10, 11, 12, 13, 14, 15}); + } + } + + Value* PacketBuilder::JOIN_16(Value* a, Value* b) + { + return VSHUFFLE(a, b, {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief convert x86 mask to llvm mask + Value* PacketBuilder::MASK(Value* vmask) + { + Value* src = BITCAST(vmask, mSimdInt32Ty); + return ICMP_SLT(src, VIMMED1(0)); + } + + Value* PacketBuilder::MASK_16(Value* vmask) + { + Value* src = BITCAST(vmask, mSimd16Int32Ty); + return ICMP_SLT(src, VIMMED1_16(0)); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief convert llvm mask to x86 mask + Value* PacketBuilder::VMASK(Value* mask) { return S_EXT(mask, mSimdInt32Ty); } + + Value* PacketBuilder::VMASK_16(Value* mask) { return S_EXT(mask, mSimd16Int32Ty); } + + /// @brief Convert llvm mask to integer + Value* PacketBuilder::VMOVMSK(Value* mask) + { + assert(mask->getType()->getVectorElementType() == mInt1Ty); + uint32_t numLanes = mask->getType()->getVectorNumElements(); + Value* i32Result; + if (numLanes == 8) + { + i32Result = BITCAST(mask, mInt8Ty); + } + else if (numLanes == 16) + { + i32Result = BITCAST(mask, mInt16Ty); + } + else + { + assert("Unsupported vector width"); + i32Result = BITCAST(mask, mInt8Ty); + } + return Z_EXT(i32Result, mInt32Ty); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VPSHUFB operation in LLVM IR. If not + /// supported on the underlying platform, emulate it + /// @param a - 256bit SIMD(32x8bit) of 8bit integer values + /// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values + /// Byte masks in lower 128 lane of b selects 8 bit values from lower + /// 128bits of a, and vice versa for the upper lanes. If the mask + /// value is negative, '0' is inserted. + Value* PacketBuilder::PSHUFB(Value* a, Value* b) + { + Value* res; + Constant* cB = dyn_cast(b); + assert(cB); + // number of 8 bit elements in b + uint32_t numElms = cast(cB->getType())->getNumElements(); + // output vector + Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms)); + + // insert an 8 bit value from the high and low lanes of a per loop iteration + numElms /= 2; + for (uint32_t i = 0; i < numElms; i++) + { + ConstantInt* cLow128b = cast(cB->getAggregateElement(i)); + ConstantInt* cHigh128b = cast(cB->getAggregateElement(i + numElms)); + + // extract values from constant mask + char valLow128bLane = (char)(cLow128b->getSExtValue()); + char valHigh128bLane = (char)(cHigh128b->getSExtValue()); + + Value* insertValLow128b; + Value* insertValHigh128b; + + // if the mask value is negative, insert a '0' in the respective output position + // otherwise, lookup the value at mask position (bits 3..0 of the respective mask + // byte) in a and insert in output vector + insertValLow128b = + (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF))); + insertValHigh128b = (valHigh128bLane < 0) + ? 
C((char)0) + : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms)); + + vShuf = VINSERT(vShuf, insertValLow128b, i); + vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms)); + } + res = vShuf; + return res; + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VPSHUFB operation (sign extend 8 8bit values to 32 + /// bits)in LLVM IR. If not supported on the underlying platform, emulate it + /// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values. Only + /// lower 8 values are used. + Value* PacketBuilder::PMOVSXBD(Value* a) + { + // VPMOVSXBD output type + Type* v8x32Ty = VectorType::get(mInt32Ty, 8); + // Extract 8 values from 128bit lane and sign extend + return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Generate a VPSHUFB operation (sign extend 8 16bit values to 32 + /// bits)in LLVM IR. If not supported on the underlying platform, emulate it + /// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values. + Value* PacketBuilder::PMOVSXWD(Value* a) + { + // VPMOVSXWD output type + Type* v8x32Ty = VectorType::get(mInt32Ty, 8); + // Extract 8 values from 128bit lane and sign extend + return S_EXT(VSHUFFLE(a, a, C({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty); + } + + Value* PacketBuilder::PMAXSD(Value* a, Value* b) + { + Value* cmp = ICMP_SGT(a, b); + return SELECT(cmp, a, b); + } + + Value* PacketBuilder::PMINSD(Value* a, Value* b) + { + Value* cmp = ICMP_SLT(a, b); + return SELECT(cmp, a, b); + } + + Value* PacketBuilder::PMAXUD(Value* a, Value* b) + { + Value* cmp = ICMP_UGT(a, b); + return SELECT(cmp, a, b); + } + + Value* PacketBuilder::PMINUD(Value* a, Value* b) + { + Value* cmp = ICMP_ULT(a, b); + return SELECT(cmp, a, b); + } + + // Helper function to create alloca in entry block of function + Value* PacketBuilder::CreateEntryAlloca(Function* pFunc, Type* pType) + { + auto saveIP = IRB()->saveIP(); + IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin()); + Value* pAlloca = ALLOCA(pType); + if (saveIP.isSet()) + IRB()->restoreIP(saveIP); + return pAlloca; + } + + Value* PacketBuilder::CreateEntryAlloca(Function* pFunc, Type* pType, Value* pArraySize) + { + auto saveIP = IRB()->saveIP(); + IRB()->SetInsertPoint(&pFunc->getEntryBlock(), pFunc->getEntryBlock().begin()); + Value* pAlloca = ALLOCA(pType, pArraySize); + if (saveIP.isSet()) + IRB()->restoreIP(saveIP); + return pAlloca; + } + + Value* PacketBuilder::VABSPS(Value* a) + { + Value* asInt = BITCAST(a, mSimdInt32Ty); + Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); + return result; + } + + Value* PacketBuilder::ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name) + { + Value* lowCmp = ICMP_SLT(src, low); + Value* ret = SELECT(lowCmp, low, src); + + Value* highCmp = ICMP_SGT(ret, high); + ret = SELECT(highCmp, high, ret, name); + + return ret; + } + + Value* PacketBuilder::FCLAMP(Value* src, Value* low, Value* high) + { + Value* lowCmp = FCMP_OLT(src, low); + Value* ret = SELECT(lowCmp, low, src); + + Value* highCmp = FCMP_OGT(ret, high); + ret = SELECT(highCmp, high, ret); + + return ret; + } + + Value* PacketBuilder::FCLAMP(Value* src, float low, float high) + { + Value* result = VMAXPS(src, VIMMED1(low)); + result = VMINPS(result, VIMMED1(high)); + + return result; + } + + Value* PacketBuilder::FMADDPS(Value* a, Value* b, Value* c) + { + Value* vOut; + + vOut = FADD(FMUL(a, b), c); + return vOut; 
+ } + + ////////////////////////////////////////////////////////////////////////// + /// @brief pop count on vector mask (e.g. <8 x i1>) + Value* PacketBuilder::VPOPCNT(Value* a) { return POPCNT(VMOVMSK(a)); } + + ////////////////////////////////////////////////////////////////////////// + /// @brief C functions called by LLVM IR + ////////////////////////////////////////////////////////////////////////// + + Value* PacketBuilder::VEXTRACTI128(Value* a, Constant* imm8) + { + bool flag = !imm8->isZeroValue(); + SmallVector idx; + for (unsigned i = 0; i < mVWidth / 2; i++) + { + idx.push_back(C(flag ? i + mVWidth / 2 : i)); + } + return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); + } + + Value* PacketBuilder::VINSERTI128(Value* a, Value* b, Constant* imm8) + { + bool flag = !imm8->isZeroValue(); + SmallVector idx; + for (unsigned i = 0; i < mVWidth; i++) + { + idx.push_back(C(i)); + } + Value* inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); + + SmallVector idx2; + for (unsigned i = 0; i < mVWidth / 2; i++) + { + idx2.push_back(C(flag ? i : i + mVWidth)); + } + for (unsigned i = mVWidth / 2; i < mVWidth; i++) + { + idx2.push_back(C(flag ? i + mVWidth / 2 : i)); + } + return VSHUFFLE(a, inter, ConstantVector::get(idx2)); + } + + uint32_t PacketBuilder::GetTypeSize(Type* pType) + { + if (pType->isStructTy()) + { + uint32_t numElems = pType->getStructNumElements(); + Type* pElemTy = pType->getStructElementType(0); + return numElems * GetTypeSize(pElemTy); + } + + if (pType->isArrayTy()) + { + uint32_t numElems = pType->getArrayNumElements(); + Type* pElemTy = pType->getArrayElementType(); + return numElems * GetTypeSize(pElemTy); + } + + if (pType->isIntegerTy()) + { + uint32_t bitSize = pType->getIntegerBitWidth(); + return bitSize / 8; + } + + if (pType->isFloatTy()) + { + return 4; + } + + if (pType->isHalfTy()) + { + return 2; + } + + if (pType->isDoubleTy()) + { + return 8; + } + + assert(false && "Unimplemented type."); + return 0; + } +} diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/README.md b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/README.md new file mode 100644 index 000000000000..05580087b002 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/README.md @@ -0,0 +1 @@ +# Packetizer diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.cpp new file mode 100644 index 000000000000..d33b97707eb5 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.cpp @@ -0,0 +1,900 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "WIAnalysis.hpp" + +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/InitializePasses.h" +#include +#include +#include +#include + +#include "llvmWrapper/IR/InstrTypes.h" + + +#include +#include +#include + +using namespace llvm; + +static cl::opt PrintWiaCheck("print-wia-check", cl::init(true), + cl::Hidden, + cl::desc("Debug wia-check analysis")); + +namespace pktz { + +WIAnalysis::WIAnalysis() : FunctionPass(ID) { + initializeWIAnalysisPass(*PassRegistry::getPassRegistry()); +} + +const unsigned int WIAnalysis::MinIndexBitwidthToPreserve = 16; + +void WIAnalysis::print(raw_ostream &OS, const Module *) const { + DenseMap BBIDs; + int id = 0; + for (Function::iterator I = m_func->begin(), E = m_func->end(); I != E; + ++I, ++id) { + BasicBlock *BB = &*I; + BBIDs[BB] = id; + } + + OS << "WIAnalysis: " << m_func->getName().str() << "\n"; + + OS << "Args: \n"; + for (Function::arg_iterator I = m_func->arg_begin(), E = m_func->arg_end(); + I != E; ++I) { + Value *AVal = &*I; + DenseMap::const_iterator dep_it = + m_deps.find(AVal); + if (dep_it != m_deps.end()) + OS << " " << "STRIDE:" << dep_it->second << " " << *AVal << "\n"; + else + OS << " unknown " << *AVal << "\n"; + } + OS << "\n"; + + for (Function::iterator I = m_func->begin(), E = m_func->end(); I != E; ++I) { + BasicBlock *BB = &*I; + OS << "BB:" << BBIDs[BB]; + if (BB->hasName()) + OS << " " << BB->getName(); + OS << " ; preds ="; + bool isFirst = true; + for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) { + BasicBlock *pred = *PI; + OS << ((isFirst) ? " " : ", ") << "BB:" << BBIDs[pred] << " "; + if (pred->hasName()) + OS << pred->getName(); + isFirst = false; + } + OS << "\n"; + for (BasicBlock::iterator it = BB->begin(), ie = BB->end(); it != ie; + ++it) { + Instruction *I = &*it; + DenseMap::const_iterator dep_it = + m_deps.find(I); + if (dep_it != m_deps.end()) { + OS << " " << "STRIDE:" << dep_it->second << " " << *I; + } else { + OS << " unknown " << *I; + } + if (I->isTerminator()) { + auto TI = cast(I); + OS << " ["; + for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) { + BasicBlock *succ = TI->getSuccessor(i); + OS << " BB:" << BBIDs[succ]; + } + OS << " ]"; + } + OS << "\n"; + } + OS << "\n"; + } +} + +bool WIAnalysis::runOnFunction(Function &F) { + + if (!F.hasFnAttribute("CMGenxSIMT")) + return false; + m_func = &F; + DT = &getAnalysis().getDomTree(); + PDT = &getAnalysis().getPostDomTree(); + + m_deps.clear(); + m_changed1.clear(); + m_changed2.clear(); + m_pChangedNew = &m_changed1; + m_pChangedOld = &m_changed2; + m_ctrlBranches.clear(); + + initDependency(&F); + + inst_iterator it = inst_begin(F); + inst_iterator e = inst_end(F); + for (; it != e; ++it) { + calculate_dep(&*it); + } + + // Recursively check if WI-dep changes and if so reclaculates + // the WI-dep and marks the users for re-checking. + // This procedure is guranteed to converge since WI-dep can only + // become less unifrom (uniform->consecutive->ptr->stride->random). 
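+ // For example, a value derived from genx_lane_id() starts at stride 1;
+ // multiplying it by a constant 4 gives stride 4, and anything the analysis
+ // cannot track collapses to RANDOM (stride >= 1024). Each value can only
+ // move in that direction, so the worklist below terminates.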
+ updateDeps();
+
+ if (PrintWiaCheck) {
+ print(dbgs());
+ }
+ return false;
+}
+
+void WIAnalysis::updateDeps() {
+ // As long as we have values to update
+ while (!m_pChangedNew->empty()) {
+ // swap between changedSet pointers - recheck the newChanged (now old)
+ std::swap(m_pChangedNew, m_pChangedOld);
+ // clear the newChanged set so it will be filled with the users of
+ // instructions whose WI-dep changed during the current iteration
+ m_pChangedNew->clear();
+
+ // update all changed values
+ std::vector<const Value *>::iterator it = m_pChangedOld->begin();
+ std::vector<const Value *>::iterator e = m_pChangedOld->end();
+ for (; it != e; ++it) {
+ // recalculate the dependency value of each changed value
+ calculate_dep(*it);
+ }
+ }
+}
+
+bool WIAnalysis::isInstructionSimple(const Instruction *inst) {
+ // avoid changing cb load to sampler load, since sampler load
+ // has longer latency.
+ if (isa(inst)) {
+ return false;
+ }
+
+ if (isa(inst) || isa(inst) ||
+ isa(inst) || isa(inst)) {
+ return true;
+ }
+ return false;
+}
+
+void WIAnalysis::initDependency(llvm::Function *pF) {
+ llvm::Function::arg_iterator ai, ae;
+ ai = pF->arg_begin();
+ ae = pF->arg_end();
+
+ // add all kernel function args as uniform
+ for (; ai != ae; ++ai) {
+ incUpdateDepend(ai, WIAnalysis::UNIFORM);
+ }
+}
+
+bool WIAnalysis::validDepend(const llvm::Value *val) {
+ return (m_deps.find(val) != m_deps.end());
+}
+
+WIAnalysis::WIDependancy WIAnalysis::whichDepend(const Value *val) {
+ assert(m_pChangedNew->empty() && "set should be empty before query");
+ assert(val && "Bad value");
+ if (m_deps.find(val) == m_deps.end()) {
+ // We expect all instructions to be in the map. Otherwise take the safe
+ // path and return RANDOM in release builds (assert in debug). For a
+ // non-instruction (argument, constant) return UNIFORM.
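+ // e.g. a kernel argument or a ConstantInt that was never entered into the
+ // map is reported as UNIFORM here, while an unmapped instruction is
+ // treated as RANDOM.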
+ bool isInst = isa(val); + if (isInst) { + return WIAnalysis::RANDOM; + } + return WIAnalysis::UNIFORM; + } + return m_deps[val]; +} + +bool WIAnalysis::stayUniformIfUsedAt(const Value *val, BasicBlock *use_blk) { + const Instruction *inst = dyn_cast(val); + // if it is a function argument, no problem to use it anywhere inside the + // function + if (!inst) { + return true; + } + if (m_deps.find(inst) == m_deps.end()) { + assert(0 && "trouble, don't have a record"); + return true; + } + if (m_deps[inst] != WIAnalysis::UNIFORM) { + return true; + } + const BasicBlock *def_blk = inst->getParent(); + if (m_ctrlBranches.find(def_blk) == m_ctrlBranches.end()) { + return true; + } + if (m_ctrlBranches.find(use_blk) != m_ctrlBranches.end()) { + return false; + } + // every controlling branch of the def block has to be in the set of + // controlling branches for the use-blk + for (SmallPtrSet::iterator + I = m_ctrlBranches[def_blk].begin(), + E = m_ctrlBranches[def_blk].end(); + I != E; ++I) { + if (!m_ctrlBranches[use_blk].count(*I)) { + return false; + } + } + return true; +} + +void WIAnalysis::invalidateDepend(const Value *val) { + if (m_deps.find(val) != m_deps.end()) { + m_deps.erase(val); + } +} + +bool WIAnalysis::isControlFlowUniform(const Function *F) { + assert(F && "Bad Function"); + + /// Place out-masks + for (Function::const_iterator it = F->begin(), e = F->end(); it != e; ++it) { + WIAnalysis::WIDependancy dep = whichDepend(it->getTerminator()); + if (dep != WIAnalysis::UNIFORM) { + // Found a branch which diverges on the input + return false; + } + } + // All branches are uniform + return true; +} + +WIAnalysis::WIDependancy WIAnalysis::getDependency(const Value *val) { + + if (m_deps.find(val) == m_deps.end()) { + // Make sure that constants are not added in the map. + if (!isa(val)) { + return WIAnalysis::UNIFORM; + } + // Don't expect this happens, let's assert in debug build! + assert(false && "Dependence for 'val' should bave been set already!"); + m_deps[val] = WIAnalysis::UNIFORM; + } + return m_deps[val]; +} + +bool WIAnalysis::hasDependency(const Value *val) { + + if (!isa(val) && !isa(val)) { + return true; + } + return (m_deps.count(val) > 0); +} + +void WIAnalysis::calculate_dep(const Value *val) { + assert(val && "Bad value"); + + // Not an instruction, must be a constant or an argument + // Could this vector type be of a constant which + // is not uniform ? + assert(isa(val) && + "Could we reach here with non instruction value?"); + + const Instruction *inst = dyn_cast(val); + assert(inst && "This Value is not an Instruction"); + + bool hasOriginal = hasDependency(inst); + WIDependancy orig; + // We only calculate dependency on unset instructions if all their operands + // were already given dependency. This is good for compile time since these + // instructions will be visited again after the operands dependency is set. + // An exception are phi nodes since they can be the ancestor of themselves in + // the def-use chain. Note that in this case we force the phi to have the + // pre-header value already calculated. + if (!hasOriginal) { + unsigned int unsetOpNum = 0; + for (unsigned i = 0; i < inst->getNumOperands(); ++i) { + if (!hasDependency(inst->getOperand(i))) + unsetOpNum++; + } + if (isa(inst)) { + // We do not calculate PhiNode with all incoming values unset. 
+ // + // This seems right as we don't expect a phi that only depends upon other + // phi's (if it happens, those phis form a cycle dependency) so any phi's + // calculation will eventually be triggered from calculating a non-phi one + // which the phi depends upon. + if (unsetOpNum == inst->getNumOperands()) + return; + } else { + // We do not calculate non-PhiNode instruction that have unset operands + if (unsetOpNum > 0) + return; + } + orig = WIAnalysis::UNIFORM; + } else { + orig = m_deps[inst]; + // if inst is already marked random, it cannot get better + if (orig == WIAnalysis::RANDOM) { + return; + } + } + + WIDependancy dep = orig; + + // LLVM does not have compile time polymorphisms + // TODO: to make things faster we may want to sort the list below according + // to the order of their probability of appearance. + if (const BinaryOperator *BI = dyn_cast(inst)) + dep = calculate_dep(BI); + else if (const CallInst *CI = dyn_cast(inst)) + dep = calculate_dep(CI); + else if (isa(inst)) + dep = calculate_dep_simple(inst); + else if (isa(inst)) + dep = calculate_dep_simple(inst); + else if (const GetElementPtrInst *GEP = dyn_cast(inst)) + dep = calculate_dep(GEP); + else if (isa(inst)) + dep = calculate_dep_simple(inst); + else if (isa(inst)) + dep = calculate_dep_simple(inst); + else if (const PHINode *Phi = dyn_cast(inst)) + dep = calculate_dep(Phi); + else if (isa(inst)) + dep = calculate_dep_simple(inst); + else if (isa(inst)) + dep = RANDOM; // calculate_dep_simple(inst); + else if (inst->isTerminator()) + dep = calculate_dep(inst); + else if (const SelectInst *SI = dyn_cast(inst)) + dep = calculate_dep(SI); + else if (const AllocaInst *AI = dyn_cast(inst)) + dep = calculate_dep(AI); + else if (const CastInst *CI = dyn_cast(inst)) + dep = calculate_dep(CI); + else if (isa(inst)) + dep = calculate_dep_simple(inst); + else if (const LoadInst *LI = dyn_cast(inst)) + dep = calculate_dep(LI); + else if (const VAArgInst *VAI = dyn_cast(inst)) + dep = calculate_dep(VAI); + + // If the value was changed in this calculation + if (!hasOriginal || dep != orig) { + // Save the new value of this instruction + updateDepMap(inst, dep); + // divergent branch, trigger updates due to control-dependence + if (inst->isTerminator() && dep != WIAnalysis::UNIFORM) { + update_cf_dep(inst); + } + } +} + +void WIAnalysis::update_cf_dep(const Instruction *inst) { + BasicBlock *blk = const_cast(inst->getParent()); + BasicBlock *ipd = PDT->getNode(blk)->getIDom()->getBlock(); + // a branch can have NULL immediate post-dominator when a function + // has multiple exits in llvm-ir + // compute influence region and the partial-joins + assert(inst->isTerminator() && "Expected terminator inst"); + BranchInfo br_info(cast(inst), ipd); + // debug: dump influence region and partial-joins + // br_info.print(ods()); + + // check dep-type for every phi in the full join + if (ipd) { + updatePHIDepAtJoin(ipd, &br_info); + } + // check dep-type for every phi in the partial-joins + for (SmallPtrSet::iterator + join_it = br_info.partial_joins.begin(), + join_e = br_info.partial_joins.end(); + join_it != join_e; ++join_it) { + updatePHIDepAtJoin(*join_it, &br_info); + } + + // walk through all the instructions in the influence-region + // update the dep-type based upon its uses + DenseSet::iterator blk_it = br_info.influence_region.begin(); + DenseSet::iterator blk_e = br_info.influence_region.end(); + for (; blk_it != blk_e; ++blk_it) { + BasicBlock *def_blk = *blk_it; + // add the branch into the controlling-branch set of 
the block + // if the block is in the influence-region, and not a partial join + bool is_join = (br_info.partial_joins.count(def_blk) > 0); + if (!is_join) { + m_ctrlBranches[def_blk].insert(inst); + } + // An insight that can speed up the search process is that all the in-region + // values that are used outside must dominate TI. Therefore, instead of + // searching every basic blocks in the influence region, we only search the + // dominators of the current branch + if (def_blk != blk && + !DT->dominates(DT->getNode(def_blk), DT->getNode(blk))) { + continue; + } + for (BasicBlock::iterator I = def_blk->begin(), E = def_blk->end(); I != E; + ++I) { + Instruction *defi = &(*I); + if (hasDependency(defi) && getDependency(defi) == WIAnalysis::RANDOM) { + continue; + } + // look at the uses + Value::use_iterator use_it = defi->use_begin(); + Value::use_iterator use_e = defi->use_end(); + for (; use_it != use_e; ++use_it) { + Instruction *user = dyn_cast((*use_it).getUser()); + assert(user); + BasicBlock *user_blk = user->getParent(); + PHINode *phi = dyn_cast(user); + if (phi) { + // another place we assume all critical edges have been split and + // phi-move will be placed on the blocks created on those + user_blk = phi->getIncomingBlock(*use_it); + } + if (user_blk == def_blk) { + // local def-use, not related to control-dependence + continue; // skip + } + if (user_blk == br_info.full_join || + br_info.partial_joins.count(user_blk) || + !br_info.influence_region.count(user_blk)) { + updateDepMap(defi, WIAnalysis::RANDOM); + // break out of the use loop + // since def is changed to RANDOM, all uses will be changed later + break; + } + } // end of usei loop + } // end of defi loop within a block + } // end of influence-region block loop +} + +void WIAnalysis::updatePHIDepAtJoin(BasicBlock *blk, BranchInfo *brInfo) { + for (BasicBlock::iterator I = blk->begin(), E = blk->end(); I != E; ++I) { + Instruction *defi = &(*I); + PHINode *phi = dyn_cast(defi); + if (!phi) { + break; + } + if (hasDependency(phi) && getDependency(phi) == WIAnalysis::RANDOM) { + continue; + } + Value *trickySrc = nullptr; + for (unsigned predIdx = 0; predIdx < phi->getNumOperands(); ++predIdx) { + Value *srcVal = phi->getOperand(predIdx); + Instruction *defi = dyn_cast(srcVal); + if (defi && brInfo->influence_region.count(defi->getParent())) { + updateDepMap(phi, WIAnalysis::RANDOM); + break; + } else { + // if the src is an immed, or an argument, or defined outside, + // think about the phi-move that can be placed in the incoming block. + // this phi should be random if we have two different src-values like + // that. this is one place where we assume all critical edges have been + // split + BasicBlock *predBlk = phi->getIncomingBlock(predIdx); + if (brInfo->influence_region.count(predBlk)) { + if (!trickySrc) { + trickySrc = srcVal; + } else if (trickySrc != srcVal) { + updateDepMap(phi, WIAnalysis::RANDOM); + break; + } + } + } + } + } +} + +void WIAnalysis::updateDepMap(const Instruction *inst, + WIAnalysis::WIDependancy dep) { + // Save the new value of this instruction + m_deps[inst] = dep; + // Register for update all of the dependent values of this updated + // instruction. 
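+ // Each user is appended to m_pChangedNew; the next updateDeps() iteration
+ // re-visits it and recomputes its dependency via calculate_dep().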
+ Value::const_user_iterator it = inst->user_begin();
+ Value::const_user_iterator e = inst->user_end();
+ for (; it != e; ++it) {
+ m_pChangedNew->push_back(*it);
+ }
+}
+
+WIAnalysis::WIDependancy
+WIAnalysis::calculate_dep_simple(const Instruction *I) {
+ // simply check that all operands are uniform; if so return UNIFORM,
+ // else RANDOM
+ const unsigned nOps = I->getNumOperands();
+ for (unsigned i = 0; i < nOps; ++i) {
+ const Value *op = I->getOperand(i);
+ WIAnalysis::WIDependancy dep = getDependency(op);
+ if (dep != WIAnalysis::UNIFORM) {
+ return WIAnalysis::RANDOM;
+ }
+ }
+ return WIAnalysis::UNIFORM;
+}
+
+WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const LoadInst *inst) {
+ return calculate_dep_simple(inst);
+}
+
+WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const BinaryOperator *inst) {
+ // Calculate the dependency type for each of the operands
+ Value *op0 = inst->getOperand(0);
+ Value *op1 = inst->getOperand(1);
+
+ WIAnalysis::WIDependancy dep0 = getDependency(op0);
+ WIAnalysis::WIDependancy dep1 = getDependency(op1);
+
+ // For any binary operation, uniform operands give a uniform result
+ if (WIAnalysis::UNIFORM == dep0 && WIAnalysis::UNIFORM == dep1) {
+ return WIAnalysis::UNIFORM;
+ }
+
+ // FIXME:: assumes that the X value does not cross the +/- border - risky !!!
+ // The pattern (and (X, C)), where C preserves the lower k bits of the value,
+ // is often used to truncate 64-bit numbers. We assume that the index
+ // properties are not hurt by this.
+ if (inst->getOpcode() == Instruction::And) {
+ ConstantInt *C0 = dyn_cast<ConstantInt>(inst->getOperand(0));
+ ConstantInt *C1 = dyn_cast<ConstantInt>(inst->getOperand(1));
+ // Use either of the constants. Instcombine places constants on Op1
+ // so try Op1 first.
+ if (C1 || C0) {
+ ConstantInt *C = C1 ? C1 : C0;
+ WIAnalysis::WIDependancy dep = C1 ? dep0 : dep1;
+ // Cannot look at bit pattern of huge integers.
+ if (C->getBitWidth() < 65) {
+ uint64_t val = C->getZExtValue();
+ uint64_t ptr_mask = (1 << MinIndexBitwidthToPreserve) - 1;
+ // Zero all bits above the lower k bits that we are interested in
+ val &= (ptr_mask);
+ // Make sure that all of the remaining bits are active
+ if (val == ptr_mask) {
+ return dep;
+ }
+ }
+ }
+ }
+
+ // FIXME:: assumes that the X value does not cross the +/- border - risky !!!
+ // The pattern (ashr (shl X, C), C) is used to truncate 64-bit numbers.
+ // The shift amount C must leave at least MinIndexBitwidthToPreserve bits
+ // of the original number.
+ if (inst->getOpcode() == Instruction::AShr) {
+ BinaryOperator *SHL = dyn_cast<BinaryOperator>(inst->getOperand(0));
+ // We also allow an add of a uniform value between the ashr and shl
+ // instructions, since instcombine creates this pattern when adding a
+ // constant. The shl forces all low bits to be zero, so there can be no
+ // carry to the high bits due to the addition. Addition with a uniform
+ // value preserves the WI-dep.
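+ // e.g. for %s = shl i64 %x, 32 followed by %t = ashr i64 %s, 32, the pair
+ // keeps the low 32 bits of %x; since 64 - 32 >= MinIndexBitwidthToPreserve,
+ // the dependency of %x is returned unchanged for %t.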
+ if (SHL && SHL->getOpcode() == Instruction::Add) { + Value *addedVal = SHL->getOperand(1); + if (getDependency(addedVal) == WIAnalysis::UNIFORM) { + SHL = dyn_cast(SHL->getOperand(0)); + } + } + + if (SHL && SHL->getOpcode() == Instruction::Shl) { + ConstantInt *c_ashr = dyn_cast(inst->getOperand(1)); + ConstantInt *c_shl = dyn_cast(SHL->getOperand(1)); + const IntegerType *AshrTy = cast(inst->getType()); + if (c_ashr && c_shl && c_ashr->getZExtValue() == c_shl->getZExtValue()) { + // If wordWidth - shift_width >= 32 bits + if ((AshrTy->getBitWidth() - c_shl->getZExtValue()) >= + MinIndexBitwidthToPreserve) { + // return the dep of the original X + return getDependency(SHL->getOperand(0)); + } + } + } + } + + if (dep0 == WIAnalysis::RANDOM || dep1 == WIAnalysis::RANDOM) { + return WIAnalysis::RANDOM; + } + // stride computation + switch (inst->getOpcode()) { + // Addition simply adds the stride value, except for ptr_consecutive + // which is promoted to strided. + // Another exception is when we subtract the tid: 1 - X which turns the + // tid order to random. + case Instruction::Add: { + int stride = dep0 + dep1; + return clampDepend(stride); + } + case Instruction::Sub: { + int stride = dep0 - dep1; + return clampDepend(stride); + } + case Instruction::Mul: + if (const ConstantInt* ConstOpnd = dyn_cast(op0)) { + const int c = (int)ConstOpnd->getSExtValue(); + return clampDepend(c*dep1); + } + else if (const ConstantInt* ConstOpnd = dyn_cast(op1)) { + const int c = (int)ConstOpnd->getSExtValue(); + return clampDepend(c*dep0); + } + break; + case Instruction::Shl: + if (const ConstantInt* ConstOpnd = dyn_cast(op1)) { + const int c = (int)ConstOpnd->getSExtValue(); + return clampDepend(dep0<getCalledFunction()) { + switch (GenXIntrinsic::getGenXIntrinsicID(Callee)) { + case GenXIntrinsic::genx_lane_id: + return (WIAnalysis::WIDependancy)1; + default: + break; + } + } + + return WIAnalysis::RANDOM; +} + +WIAnalysis::WIDependancy +WIAnalysis::calculate_dep(const GetElementPtrInst *inst) { + // running over the all indices argumets except for the last + // here we assume the pointer is the first operand + unsigned num = inst->getNumIndices(); + for (unsigned i = 1; i < num; ++i) { + const Value *op = inst->getOperand(i); + WIAnalysis::WIDependancy dep = getDependency(op); + if (dep != WIAnalysis::UNIFORM) { + return WIAnalysis::RANDOM; + } + } + const Value *opPtr = inst->getOperand(0); + WIAnalysis::WIDependancy ptrDep = getDependency(opPtr); + + const Value *lastInd = inst->getOperand(num); + WIAnalysis::WIDependancy lastIndDep = getDependency(lastInd); + // \todo + return clampDepend((int)ptrDep + (int)lastIndDep); +} + +WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const PHINode *inst) { + unsigned num = inst->getNumIncomingValues(); + bool foundFirst = 0; + WIDependancy totalDep; + + for (unsigned i = 0; i < num; ++i) { + Value *op = inst->getIncomingValue(i); + if (hasDependency(op)) { + if (!foundFirst) { + totalDep = getDependency(op); + } else if (totalDep != getDependency(op)) { + totalDep = WIAnalysis::RANDOM; + } + foundFirst = 1; + } + } + + assert(foundFirst && + "We should not reach here with All incoming values are unset"); + + return totalDep; +} + +WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const Instruction *inst) { + // Instruction has no return value + // Just need to know if this inst is uniform or not + // because we may want to avoid predication if the control flows + // in the function are uniform... 
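+ // Only the terminator's own condition matters here; how a divergent branch
+ // affects the values inside its influence region is handled separately in
+ // update_cf_dep().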
+ switch (inst->getOpcode()) { + case Instruction::Br: { + const BranchInst *brInst = cast(inst); + if (brInst->isConditional()) { + // Conditional branch is uniform, if its condition is uniform + Value *op = brInst->getCondition(); + WIAnalysis::WIDependancy dep = getDependency(op); + if (WIAnalysis::UNIFORM == dep) { + return WIAnalysis::UNIFORM; + } + return WIAnalysis::RANDOM; + } + // Unconditional branch is non TID-dependent + return WIAnalysis::UNIFORM; + } + // Return instructions are unconditional + case Instruction::Ret: + return WIAnalysis::UNIFORM; + case Instruction::Unreachable: + return WIAnalysis::UNIFORM; + case Instruction::IndirectBr: + return WIAnalysis::RANDOM; + // TODO: Define the dependency requirements of indirectBr + case Instruction::Switch: + return WIAnalysis::RANDOM; + // TODO: Should this depend only on the condition, like branch? + default: + return WIAnalysis::RANDOM; + } +} + +WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const SelectInst *inst) { + Value *op0 = inst->getOperand(0); // mask + WIAnalysis::WIDependancy dep0 = getDependency(op0); + if (WIAnalysis::UNIFORM == dep0) { + Value *op1 = inst->getOperand(1); + Value *op2 = inst->getOperand(2); + WIAnalysis::WIDependancy dep1 = getDependency(op1); + WIAnalysis::WIDependancy dep2 = getDependency(op2); + if (dep1 == dep2) + return dep1; + } + return WIAnalysis::RANDOM; +} + +WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const AllocaInst *inst) { + // \todo + return WIAnalysis::RANDOM; +} + +WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const CastInst *inst) { + Value *op0 = inst->getOperand(0); + WIAnalysis::WIDependancy dep0 = getDependency(op0); + + // independent remains independent + if (WIAnalysis::UNIFORM == dep0) + return dep0; + + switch (inst->getOpcode()) { + case Instruction::SExt: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::AddrSpaceCast: + case Instruction::UIToFP: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::SIToFP: + return dep0; + case Instruction::BitCast: + case Instruction::ZExt: + return WIAnalysis::RANDOM; + // FIXME:: assumes that the value does not cross the +/- border - risky !!!! 
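+ // e.g. a trunc from i64 to i32 keeps at least MinIndexBitwidthToPreserve
+ // (16) bits of the index, so the operand's dependency survives; a trunc
+ // down to i8 does not, and becomes RANDOM.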
+ case Instruction::Trunc: { + const Type *destType = inst->getDestTy(); + const IntegerType *intType = dyn_cast(destType); + if (intType && (intType->getBitWidth() >= MinIndexBitwidthToPreserve)) { + return dep0; + } + return WIAnalysis::RANDOM; + } + default: + assert(false && "no such opcode"); + // never get here + return WIAnalysis::RANDOM; + } +} + +WIAnalysis::WIDependancy WIAnalysis::calculate_dep(const VAArgInst *inst) { + assert(false && "Are we supporting this ??"); + return WIAnalysis::RANDOM; +} + +BranchInfo::BranchInfo(const IGCLLVM::TerminatorInst *inst, const BasicBlock *ipd) + : cbr(inst), full_join(ipd) { + assert(cbr == inst->getParent()->getTerminator() && "block terminator mismatch"); + assert(cbr->getNumSuccessors() == 2 && "only for cbr with two successors"); + + std::set f_set, t_set; + std::stack work_set; + if (cbr->getSuccessor(0) != full_join) { + work_set.push(cbr->getSuccessor(0)); + while (!work_set.empty()) { + BasicBlock *cur_blk = work_set.top(); + work_set.pop(); + f_set.insert(cur_blk); + influence_region.insert(cur_blk); + for (succ_iterator SI = succ_begin(cur_blk), E = succ_end(cur_blk); + SI != E; ++SI) { + BasicBlock *succ_blk = (*SI); + if (succ_blk != full_join && !f_set.count(succ_blk)) { + work_set.push(succ_blk); + } + } + } + } + if (cbr->getSuccessor(1) != full_join) { + work_set.push(cbr->getSuccessor(1)); + while (!work_set.empty()) { + BasicBlock *cur_blk = work_set.top(); + work_set.pop(); + t_set.insert(cur_blk); + influence_region.insert(cur_blk); + if (f_set.count(cur_blk)) { + partial_joins.insert(cur_blk); + } + for (succ_iterator SI = succ_begin(cur_blk), E = succ_end(cur_blk); + SI != E; ++SI) { + BasicBlock *succ_blk = (*SI); + if (succ_blk != full_join && !t_set.count(succ_blk)) { + work_set.push(succ_blk); + } + } + } + } +} + +void BranchInfo::print(raw_ostream &OS) const { + OS << "\nCBR: " << *cbr; + OS << "\nIPD: "; + if (full_join) { + full_join->print(OS); + } + OS << "\nPartial Joins:"; + SmallPtrSet::iterator join_it = partial_joins.begin(); + SmallPtrSet::iterator join_e = partial_joins.end(); + for (; join_it != join_e; ++join_it) { + BasicBlock *cur_blk = *join_it; + OS << "\n "; + cur_blk->print(OS); + } + OS << "\nInfluence Region:"; + DenseSet::const_iterator blk_it = influence_region.begin(); + DenseSet::const_iterator blk_e = influence_region.end(); + for (; blk_it != blk_e; ++blk_it) { + BasicBlock *cur_blk = *blk_it; + OS << "\n "; + cur_blk->print(OS); + } + OS << "\n"; +} + +char WIAnalysis::ID = 0; // LLVM uses address of ID as the actual ID. 
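+
+// Illustrative sketch only: a client pass would typically consume this
+// analysis as below ("MyPacketizePass" is a hypothetical name, not part of
+// this patch). It requests WIAnalysis in getAnalysisUsage() and then queries
+// whichDepend() per value once the analysis has run:
+//
+//   void MyPacketizePass::getAnalysisUsage(AnalysisUsage &AU) const {
+//     AU.addRequired<WIAnalysis>();
+//   }
+//
+//   bool MyPacketizePass::runOnFunction(Function &F) {
+//     WIAnalysis &WIA = getAnalysis<WIAnalysis>();
+//     for (BasicBlock &BB : F)
+//       for (Instruction &I : BB)
+//         if (WIA.whichDepend(&I) == WIAnalysis::UNIFORM)
+//           ; // value is identical across SIMD lanes; may stay scalar
+//     return false;
+//   }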
+ +FunctionPass *createWIAnalysisPass() { return new WIAnalysis(); } + +} // end of namespace pktz + +using namespace pktz; + +#define PASS_FLAG "wi-analysis" +#define PASS_DESCRIPTION "WIAnalysis provides work item dependency info" +#define PASS_CFG_ONLY true +#define PASS_ANALYSIS true +INITIALIZE_PASS_BEGIN(WIAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, + PASS_ANALYSIS) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_END(WIAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, + PASS_ANALYSIS) diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.hpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.hpp new file mode 100644 index 000000000000..4d0f7fe51879 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/WIAnalysis.hpp @@ -0,0 +1,265 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llvmWrapper/IR/InstrTypes.h" + +#include + +namespace llvm { +// foward declare the initializer +void initializeWIAnalysisPass(PassRegistry &); +} // namespace llvm + +namespace pktz { +/// @Brief, given a conditional branch and its immediate post dominator, +/// find its influence-region and partial joins within the influence region +class BranchInfo { +public: + BranchInfo(const IGCLLVM::TerminatorInst *inst, const llvm::BasicBlock *ipd); + + void print(llvm::raw_ostream &OS) const; + void dump() const { print(llvm::dbgs()); } + + const IGCLLVM::TerminatorInst *cbr; + const llvm::BasicBlock *full_join; + llvm::DenseSet influence_region; + llvm::SmallPtrSet partial_joins; + llvm::BasicBlock *fork_blk; +}; + +/// @brief Work Item Analysis class used to provide information on +/// individual instructions. The analysis class detects values which +/// depend in work-item and describe their dependency. +/// The algorithm used is recursive and new instructions are updated +/// according to their operands (which are already calculated). 
+/// @Author: Nadav Rotem, who wrote the original code for OCL vectorizer +/// +/// @Author: Gang Chen, adopt it for IGC, +/// - extend it to handle the divergent SIMD control-flow +/// - support GFX-specific intrinsic +class WIAnalysis : public llvm::FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid + + WIAnalysis(); + + ~WIAnalysis() {} + + /// @brief LLVM llvm::Function pass entry + /// @param F llvm::Function to transform + /// @return True if changed + virtual bool runOnFunction(llvm::Function &F); + + /// @brief Update dependency relations between all values + void updateDeps(); + + /// @brief backward update dependency based upon use + void backwardUpdate(); + + /// @brief initialize value dependence + void initDependency(llvm::Function *pF); + + /// @brief describes the type of dependency on the work item + enum WIDependancy { + UNIFORM = 0, /// All elements in vector are constant + // stride-value between 1 and 1023 + RANDOM = 1024, /// if stride >= 1024, treat as random + }; + + /// The WIAnalysis follows pointer arithmetic + /// and Index arithmetic when calculating dependency + /// properties. If a part of the index is lost due to + /// a transformation, it is acceptable. + /// This constant decides how many bits need to be + /// preserved before we give up on the analysis. + static const unsigned int MinIndexBitwidthToPreserve; + + /// @brief Returns true if the analysis has a dependency + // for the instruction, false otherwise + /// @param val llvm::Value to test + /// @return Validity of dependency + bool validDepend(const llvm::Value *val); + + /// @brief Returns the type of dependency the instruction has on + /// the work-item + /// @param val llvm::Value to test + /// @return Dependency kind + WIDependancy whichDepend(const llvm::Value *val); + + /// @brief Inform analysis that instruction was invalidated + /// as pointer may later be reused + /// @param val llvm::Value to invalidate + void invalidateDepend(const llvm::Value *val); + + /// incremental update of the dep-map on individual value + /// without propagation. Exposed for later pass. + void incUpdateDepend(const llvm::Value *val, WIDependancy dep) { + m_deps[val] = dep; + } + + /// check if a value stay uniform when we add a use in the given block + /// If the value is not uniform to begin with, query returns true. + bool stayUniformIfUsedAt(const llvm::Value *val, llvm::BasicBlock *blk); + + /// check if a value is defined inside divergent control-flow + bool insideDivergentCF(const llvm::Value *val) { + return ( + llvm::isa(val) && + m_ctrlBranches.find(llvm::cast(val)->getParent()) != + m_ctrlBranches.end()); + } + + /// @brief Checks if all of the control flow in the analyzed function is + /// uniform. + /// @param F function to check + /// @return True if masks are needed + bool isControlFlowUniform(const llvm::Function *F); + + virtual void releaseMemory() { + m_deps.clear(); + m_changed1.clear(); + m_changed2.clear(); + m_ctrlBranches.clear(); + m_backwardList.clear(); + } + + /// print - print m_deps in human readable form + virtual void print(llvm::raw_ostream &OS, const llvm::Module * = 0) const; + void dump() const { print(llvm::dbgs()); } + +private: + /*! \name Dependency Calculation Functions + * \{ */ + /// @brief Calculate the dependency type for the instruction + /// @param inst Instruction to inspect + /// @return Type of dependency. 
+ void calculate_dep(const llvm::Value *val); + WIDependancy calculate_dep(const llvm::BinaryOperator *inst); + WIDependancy calculate_dep(const llvm::CallInst *inst); + WIDependancy calculate_dep(const llvm::GetElementPtrInst *inst); + WIDependancy calculate_dep(const llvm::PHINode *inst); + WIDependancy calculate_dep(const llvm::Instruction *inst); + WIDependancy calculate_dep(const llvm::SelectInst *inst); + WIDependancy calculate_dep(const llvm::AllocaInst *inst); + WIDependancy calculate_dep(const llvm::CastInst *inst); + WIDependancy calculate_dep(const llvm::VAArgInst *inst); + WIDependancy calculate_dep(const llvm::LoadInst *inst); + /*! \} */ + + WIDependancy clampDepend(int stride) { + if (stride < 0 || stride >= RANDOM) + return RANDOM; + return (WIDependancy)stride; + } + /// @brief do the trivial checking WI-dep + /// @param I instruction to check + /// @return Dependency type. Returns Uniform if all operands are + /// Uniform, Random otherwise + WIDependancy calculate_dep_simple(const llvm::Instruction *I); + + /// @brief update the WI-dep from a divergent branch, + /// affected instructions are added to m_pChangedNew + /// @param the divergent branch + void update_cf_dep(const llvm::Instruction *TI); + + /// @check phi divergence at a join-blk due to a divergent branch + void updatePHIDepAtJoin(llvm::BasicBlock *blk, BranchInfo *brInfo); + + void updateDepMap(const llvm::Instruction *inst, + WIAnalysis::WIDependancy dep); + + /// @brief Provide known dependency type for requested value + /// @param val llvm::Value to examine + /// @return Dependency type. Returns Uniform for unknown type + WIDependancy getDependency(const llvm::Value *val); + + /// @brief return true if there is calculated dependency type for requested + /// value + /// @param val llvm::Value to examine + /// @return true if value has dependency type, false otherwise. + bool hasDependency(const llvm::Value *val); + + /// @brief return true if all uses of this value are marked UNIFORM + bool allUsesUniform(const llvm::Value *val); + + /// @brief return true is the instruction is simple and making it random is + /// cheap + bool isInstructionSimple(const llvm::Instruction *inst); + + /// @brief LLVM Interface + /// @param AU Analysis + /// WIAnalysis requires dominator and post dominator analysis + /// WIAnalysis also requires BreakCriticalEdge because it assumes that + /// potential phi-moves will be placed at those blocks + virtual void getAnalysisUsage(llvm::AnalysisUsage &AU) const { + // Analysis pass preserve all + AU.setPreservesAll(); + + AU.addRequired(); + AU.addRequired(); + } + +private: + /// Stores an updated list of all dependencies + llvm::DenseMap m_deps; + /// for each block, store the list of diverging branches that affect it + llvm::DenseMap> + m_ctrlBranches; + + /// Iteratively one set holds the changed from the previous iteration and + /// the other holds the new changed values from the current iteration. 
+ std::vector m_changed1; + std::vector m_changed2; + /// ptr to m_changed1, m_changed2 + std::vector *m_pChangedOld; + std::vector *m_pChangedNew; + + std::vector m_backwardList; + + llvm::Function *m_func = nullptr; + llvm::DominatorTree *DT = nullptr; + llvm::PostDominatorTree *PDT = nullptr; +}; + +} // end of namespace pktz + diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder.hpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder.hpp new file mode 100644 index 000000000000..67d310fdf826 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder.hpp @@ -0,0 +1,1035 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// @file gen_builder.hpp +// +// @brief auto-generated file +// +// DO NOT EDIT +// +// Generation Command Line: +// gen_llvm_ir_macros.py +// --input +// /cygdrive/d/cm-llvm/llvm/include/llvm/IR/IRBuilder.h +// --output-dir +// . 
+// --gen_h +// --gen_meta_h +// --gen_intrin_h +// +//============================================================================ +// clang-format off +#pragma once + +//============================================================================ +// Auto-generated Builder IR Wrappers +//============================================================================ +GlobalVariable* GLOBAL_STRING(StringRef Str, const Twine &Name = "", unsigned AddressSpace = 0) +{ + return IRB()->CreateGlobalString(Str, Name, AddressSpace); +} + +CallInst* MEMSET(Value *Ptr, Value *Val, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateMemSet(Ptr, Val, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); +} + +CallInst* MEMSET(Value *Ptr, Value *Val, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateMemSet(Ptr, Val, Size, Align, isVolatile, TBAATag, ScopeTag, NoAliasTag); +} + +CallInst* MEMCOPY(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateMemCpy(Dst, Align, Src, Align, Size, isVolatile, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); +} + +CallInst* MEMCOPY(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateMemCpy(Dst, Align, Src, Align, Size, isVolatile, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); +} + +CallInst* ELEMENT_UNORDERED_ATOMIC_MEM_CPY(Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, uint64_t Size, uint32_t ElementSize, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateElementUnorderedAtomicMemCpy(Dst, DstAlign, Src, SrcAlign, Size, ElementSize, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); +} + +CallInst* ELEMENT_UNORDERED_ATOMIC_MEM_CPY(Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, Value *Size, uint32_t ElementSize, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateElementUnorderedAtomicMemCpy(Dst, DstAlign, Src, SrcAlign, Size, ElementSize, TBAATag, TBAAStructTag, ScopeTag, NoAliasTag); +} + +CallInst* MEMMOVE(Value *Dst, Value *Src, uint64_t Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateMemMove(Dst, Align, Src, Align, Size, isVolatile, TBAATag, ScopeTag, NoAliasTag); +} + +CallInst* MEMMOVE(Value *Dst, Value *Src, Value *Size, unsigned Align, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr) +{ + return IRB()->CreateMemMove(Dst, Align, Src, Align, Size, isVolatile, TBAATag, ScopeTag, NoAliasTag); +} + +CallInst* FADD_REDUCE(Value *Acc, Value *Src) +{ + return IRB()->CreateFAddReduce(Acc, Src); +} + +CallInst* FMUL_REDUCE(Value *Acc, Value *Src) +{ + return IRB()->CreateFMulReduce(Acc, Src); +} + +CallInst* ADD_REDUCE(Value *Src) +{ + return IRB()->CreateAddReduce(Src); +} + +CallInst* MUL_REDUCE(Value *Src) +{ + return 
IRB()->CreateMulReduce(Src); +} + +CallInst* AND_REDUCE(Value *Src) +{ + return IRB()->CreateAndReduce(Src); +} + +CallInst* OR_REDUCE(Value *Src) +{ + return IRB()->CreateOrReduce(Src); +} + +CallInst* XOR_REDUCE(Value *Src) +{ + return IRB()->CreateXorReduce(Src); +} + +CallInst* INT_MAX_REDUCE(Value *Src, bool IsSigned = false) +{ + return IRB()->CreateIntMaxReduce(Src, IsSigned); +} + +CallInst* INT_MIN_REDUCE(Value *Src, bool IsSigned = false) +{ + return IRB()->CreateIntMinReduce(Src, IsSigned); +} + +CallInst* FP_MAX_REDUCE(Value *Src, bool NoNaN = false) +{ + return IRB()->CreateFPMaxReduce(Src, NoNaN); +} + +CallInst* FP_MIN_REDUCE(Value *Src, bool NoNaN = false) +{ + return IRB()->CreateFPMinReduce(Src, NoNaN); +} + +CallInst* LIFETIME_START(Value *Ptr, ConstantInt *Size = nullptr) +{ + return IRB()->CreateLifetimeStart(Ptr, Size); +} + +CallInst* LIFETIME_END(Value *Ptr, ConstantInt *Size = nullptr) +{ + return IRB()->CreateLifetimeEnd(Ptr, Size); +} + +CallInst* INVARIANT_START(Value *Ptr, ConstantInt *Size = nullptr) +{ + return IRB()->CreateInvariantStart(Ptr, Size); +} + +CallInst* MASKED_STORE(Value *Val, Value *Ptr, unsigned Align, Value *Mask) +{ + return IRB()->CreateMaskedStore(Val, Ptr, Align, Mask); +} + +CallInst* MASKED_GATHER(Value *Ptrs, unsigned Align, Value *Mask = nullptr, Value *PassThru = nullptr, const Twine& Name = "") +{ + return IRB()->CreateMaskedGather(Ptrs, Align, Mask, PassThru, Name); +} + +CallInst* MASKED_SCATTER(Value *Val, Value *Ptrs, unsigned Align, Value *Mask = nullptr) +{ + return IRB()->CreateMaskedScatter(Val, Ptrs, Align, Mask); +} + +CallInst* ASSUMPTION(Value *Cond) +{ + return IRB()->CreateAssumption(Cond); +} + +CallInst* GC_STATEPOINT_CALL(uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, ArrayRef CallArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = "") +{ + return IRB()->CreateGCStatepointCall(ID, NumPatchBytes, ActualCallee, CallArgs, DeoptArgs, GCArgs, Name); +} + +CallInst* GC_STATEPOINT_CALL(uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, uint32_t Flags, ArrayRef CallArgs, ArrayRef TransitionArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = "") +{ + return IRB()->CreateGCStatepointCall(ID, NumPatchBytes, ActualCallee, Flags, CallArgs, TransitionArgs, DeoptArgs, GCArgs, Name); +} + +CallInst* GC_STATEPOINT_CALL(uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, ArrayRef CallArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = "") +{ + return IRB()->CreateGCStatepointCall(ID, NumPatchBytes, ActualCallee, CallArgs, DeoptArgs, GCArgs, Name); +} + +InvokeInst* GC_STATEPOINT_INVOKE(uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef InvokeArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = "") +{ + return IRB()->CreateGCStatepointInvoke(ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, InvokeArgs, DeoptArgs, GCArgs, Name); +} + +InvokeInst* GC_STATEPOINT_INVOKE(uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, uint32_t Flags, ArrayRef InvokeArgs, ArrayRef TransitionArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = "") +{ + return IRB()->CreateGCStatepointInvoke(ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, Flags, InvokeArgs, TransitionArgs, DeoptArgs, GCArgs, Name); +} + +InvokeInst* GC_STATEPOINT_INVOKE(uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest, 
ArrayRef InvokeArgs, ArrayRef DeoptArgs, ArrayRef GCArgs, const Twine &Name = "") +{ + return IRB()->CreateGCStatepointInvoke(ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, InvokeArgs, DeoptArgs, GCArgs, Name); +} + +CallInst* GC_RESULT(Instruction *Statepoint, Type *ResultType, const Twine &Name = "") +{ + return IRB()->CreateGCResult(Statepoint, ResultType, Name); +} + +CallInst* GC_RELOCATE(Instruction *Statepoint, int BaseOffset, int DerivedOffset, Type *ResultType, const Twine &Name = "") +{ + return IRB()->CreateGCRelocate(Statepoint, BaseOffset, DerivedOffset, ResultType, Name); +} + +CallInst* BINARY_INTRINSIC(Intrinsic::ID ID, Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateBinaryIntrinsic(ID, LHS, RHS, nullptr, Name); +} + +CallInst* MIN_NUM(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateMinNum(LHS, RHS, Name); +} + +CallInst* MAX_NUM(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateMaxNum(LHS, RHS, Name); +} + +ReturnInst* RET_VOID() +{ + return IRB()->CreateRetVoid(); +} + +ReturnInst* RET(Value *V) +{ + return IRB()->CreateRet(V); +} + +ReturnInst* AGGREGATE_RET(Value *const *retVals, unsigned N) +{ + return IRB()->CreateAggregateRet(retVals, N); +} + +BranchInst* BR(BasicBlock *Dest) +{ + return IRB()->CreateBr(Dest); +} + +BranchInst* COND_BR(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights = nullptr, MDNode *Unpredictable = nullptr) +{ + return IRB()->CreateCondBr(Cond, True, False, BranchWeights, Unpredictable); +} + +BranchInst* COND_BR(Value *Cond, BasicBlock *True, BasicBlock *False, Instruction *MDSrc) +{ + return IRB()->CreateCondBr(Cond, True, False, MDSrc); +} + +SwitchInst* SWITCH(Value *V, BasicBlock *Dest, unsigned NumCases = 10, MDNode *BranchWeights = nullptr, MDNode *Unpredictable = nullptr) +{ + return IRB()->CreateSwitch(V, Dest, NumCases, BranchWeights, Unpredictable); +} + +IndirectBrInst* INDIRECT_BR(Value *Addr, unsigned NumDests = 10) +{ + return IRB()->CreateIndirectBr(Addr, NumDests); +} + +InvokeInst* INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef Args = None, const Twine &Name = "") +{ + return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Args, Name); +} + +InvokeInst* INVOKE(Value *Callee, BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef Args, ArrayRef OpBundles, const Twine &Name = "") +{ + return IRB()->CreateInvoke(Callee, NormalDest, UnwindDest, Args, OpBundles, Name); +} + +ResumeInst* RESUME(Value *Exn) +{ + return IRB()->CreateResume(Exn); +} + +CleanupReturnInst* CLEANUP_RET(CleanupPadInst *CleanupPad, BasicBlock *UnwindBB = nullptr) +{ + return IRB()->CreateCleanupRet(CleanupPad, UnwindBB); +} + +CatchSwitchInst* CATCH_SWITCH(Value *ParentPad, BasicBlock *UnwindBB, unsigned NumHandlers, const Twine &Name = "") +{ + return IRB()->CreateCatchSwitch(ParentPad, UnwindBB, NumHandlers, Name); +} + +CatchPadInst* CATCH_PAD(Value *ParentPad, ArrayRef Args, const Twine &Name = "") +{ + return IRB()->CreateCatchPad(ParentPad, Args, Name); +} + +CleanupPadInst* CLEANUP_PAD(Value *ParentPad, ArrayRef Args = None, const Twine &Name = "") +{ + return IRB()->CreateCleanupPad(ParentPad, Args, Name); +} + +CatchReturnInst* CATCH_RET(CatchPadInst *CatchPad, BasicBlock *BB) +{ + return IRB()->CreateCatchRet(CatchPad, BB); +} + +UnreachableInst* UNREACHABLE() +{ + return IRB()->CreateUnreachable(); +} + +Value* ADD(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ 
+ return IRB()->CreateAdd(LHS, RHS, Name, HasNUW, HasNSW); +} + +Value* NSW_ADD(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateNSWAdd(LHS, RHS, Name); +} + +Value* NUW_ADD(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateNUWAdd(LHS, RHS, Name); +} + +Value* FADD(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFAdd(LHS, RHS, Name, FPMathTag); +} + +Value* SUB(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ + return IRB()->CreateSub(LHS, RHS, Name, HasNUW, HasNSW); +} + +Value* NSW_SUB(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateNSWSub(LHS, RHS, Name); +} + +Value* NUW_SUB(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateNUWSub(LHS, RHS, Name); +} + +Value* FSUB(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFSub(LHS, RHS, Name, FPMathTag); +} + +Value* MUL(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ + return IRB()->CreateMul(LHS, RHS, Name, HasNUW, HasNSW); +} + +Value* NSW_MUL(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateNSWMul(LHS, RHS, Name); +} + +Value* NUW_MUL(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateNUWMul(LHS, RHS, Name); +} + +Value* FMUL(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFMul(LHS, RHS, Name, FPMathTag); +} + +Value* UDIV(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateUDiv(LHS, RHS, Name, isExact); +} + +Value* EXACT_U_DIV(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateExactUDiv(LHS, RHS, Name); +} + +Value* SDIV(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateSDiv(LHS, RHS, Name, isExact); +} + +Value* EXACT_S_DIV(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateExactSDiv(LHS, RHS, Name); +} + +Value* FDIV(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFDiv(LHS, RHS, Name, FPMathTag); +} + +Value* UREM(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateURem(LHS, RHS, Name); +} + +Value* SREM(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateSRem(LHS, RHS, Name); +} + +Value* FREM(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFRem(LHS, RHS, Name, FPMathTag); +} + +Value* SHL(Value *LHS, Value *RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ + return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); +} + +Value* SHL(Value *LHS, const APInt &RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ + return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); +} + +Value* SHL(Value *LHS, uint64_t RHS, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ + return IRB()->CreateShl(LHS, RHS, Name, HasNUW, HasNSW); +} + +Value* LSHR(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateLShr(LHS, RHS, Name, isExact); +} + +Value* LSHR(Value *LHS, const APInt &RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateLShr(LHS, RHS, Name, isExact); +} + +Value* LSHR(Value *LHS, uint64_t RHS, const Twine &Name = "", bool isExact = false) +{ + return 
IRB()->CreateLShr(LHS, RHS, Name, isExact); +} + +Value* ASHR(Value *LHS, Value *RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateAShr(LHS, RHS, Name, isExact); +} + +Value* ASHR(Value *LHS, const APInt &RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateAShr(LHS, RHS, Name, isExact); +} + +Value* ASHR(Value *LHS, uint64_t RHS, const Twine &Name = "", bool isExact = false) +{ + return IRB()->CreateAShr(LHS, RHS, Name, isExact); +} + +Value* AND(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateAnd(LHS, RHS, Name); +} + +Value* AND(Value *LHS, const APInt &RHS, const Twine &Name = "") +{ + return IRB()->CreateAnd(LHS, RHS, Name); +} + +Value* AND(Value *LHS, uint64_t RHS, const Twine &Name = "") +{ + return IRB()->CreateAnd(LHS, RHS, Name); +} + +Value* OR(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateOr(LHS, RHS, Name); +} + +Value* OR(Value *LHS, const APInt &RHS, const Twine &Name = "") +{ + return IRB()->CreateOr(LHS, RHS, Name); +} + +Value* OR(Value *LHS, uint64_t RHS, const Twine &Name = "") +{ + return IRB()->CreateOr(LHS, RHS, Name); +} + +Value* XOR(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateXor(LHS, RHS, Name); +} + +Value* XOR(Value *LHS, const APInt &RHS, const Twine &Name = "") +{ + return IRB()->CreateXor(LHS, RHS, Name); +} + +Value* XOR(Value *LHS, uint64_t RHS, const Twine &Name = "") +{ + return IRB()->CreateXor(LHS, RHS, Name); +} + +Value* BINOP(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateBinOp(Opc, LHS, RHS, Name, FPMathTag); +} + +Value* NEG(Value *V, const Twine &Name = "", bool HasNUW = false, bool HasNSW = false) +{ + return IRB()->CreateNeg(V, Name, HasNUW, HasNSW); +} + +Value* NSW_NEG(Value *V, const Twine &Name = "") +{ + return IRB()->CreateNSWNeg(V, Name); +} + +Value* NUW_NEG(Value *V, const Twine &Name = "") +{ + return IRB()->CreateNUWNeg(V, Name); +} + +Value* FNEG(Value *V, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFNeg(V, Name, FPMathTag); +} + +Value* NOT(Value *V, const Twine &Name = "") +{ + return IRB()->CreateNot(V, Name); +} + +AllocaInst* ALLOCA(Type *Ty, unsigned AddrSpace, Value *ArraySize = nullptr, const Twine &Name = "") +{ + return IRB()->CreateAlloca(Ty, ArraySize, Name, AddrSpace /* IGCLLVM wrapper interface */); +} + +AllocaInst* ALLOCA(Type *Ty, Value *ArraySize = nullptr, const Twine &Name = "") +{ + return IRB()->CreateAlloca(Ty, ArraySize, Name); +} + +StoreInst* STORE(Value *Val, Value *Ptr, bool isVolatile = false) +{ + return IRB()->CreateStore(Val, Ptr, isVolatile); +} + +LoadInst* ALIGNED_LOAD(Value *Ptr, unsigned Align, const char *Name) +{ + return IRB()->CreateAlignedLoad(Ptr, Align, Name); +} + +LoadInst* ALIGNED_LOAD(Value *Ptr, unsigned Align, const Twine &Name = "") +{ + return IRB()->CreateAlignedLoad(Ptr, Align, Name); +} + +LoadInst* ALIGNED_LOAD(Value *Ptr, unsigned Align, bool isVolatile, const Twine &Name = "") +{ + return IRB()->CreateAlignedLoad(Ptr, Align, isVolatile, Name); +} + +StoreInst* ALIGNED_STORE(Value *Val, Value *Ptr, unsigned Align, bool isVolatile = false) +{ + return IRB()->CreateAlignedStore(Val, Ptr, Align, isVolatile); +} + +FenceInst* FENCE(AtomicOrdering Ordering, SyncScope::ID SSID = SyncScope::System, const Twine &Name = "") +{ + return IRB()->CreateFence(Ordering, SSID, Name); +} + +AtomicCmpXchgInst* ATOMIC_CMP_XCHG(Value *Ptr, Value *Cmp, 
Value *New, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID = SyncScope::System) +{ + return IRB()->CreateAtomicCmpXchg(Ptr, Cmp, New, SuccessOrdering, FailureOrdering, SSID); +} + +AtomicRMWInst* ATOMIC_RMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, AtomicOrdering Ordering, SyncScope::ID SSID = SyncScope::System) +{ + return IRB()->CreateAtomicRMW(Op, Ptr, Val, Ordering, SSID); +} + +Value* IN_BOUNDS_GEP(Value *Ptr, ArrayRef IdxList, const Twine &Name = "") +{ + return IRB()->CreateInBoundsGEP(Ptr, IdxList, Name); +} + +Value* IN_BOUNDS_GEP(Type *Ty, Value *Ptr, ArrayRef IdxList, const Twine &Name = "") +{ + return IRB()->CreateInBoundsGEP(Ty, Ptr, IdxList, Name); +} + +Value* IN_BOUNDS_GEP(Type *Ty, Value *Ptr, Value *Idx, const Twine &Name = "") +{ + return IRB()->CreateInBoundsGEP(Ty, Ptr, Idx, Name); +} + +Value* CONST_GEP1_32(Value *Ptr, unsigned Idx0, const Twine &Name = "") +{ + return IRB()->CreateConstGEP1_32(Ptr, Idx0, Name); +} + +Value* CONST_GEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name = "") +{ + return IRB()->CreateConstGEP1_32(Ty, Ptr, Idx0, Name); +} + +Value* CONST_IN_BOUNDS_GEP1_32(Type *Ty, Value *Ptr, unsigned Idx0, const Twine &Name = "") +{ + return IRB()->CreateConstInBoundsGEP1_32(Ty, Ptr, Idx0, Name); +} + +Value* CONST_GEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name = "") +{ + return IRB()->CreateConstGEP2_32(Ty, Ptr, Idx0, Idx1, Name); +} + +Value* CONST_IN_BOUNDS_GEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name = "") +{ + return IRB()->CreateConstInBoundsGEP2_32(Ty, Ptr, Idx0, Idx1, Name); +} + +Value* CONST_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name = "") +{ + return IRB()->CreateConstGEP1_64(Ptr, Idx0, Name); +} + +Value* CONST_IN_BOUNDS_GEP1_64(Value *Ptr, uint64_t Idx0, const Twine &Name = "") +{ + return IRB()->CreateConstInBoundsGEP1_64(Ptr, Idx0, Name); +} + +Value* CONST_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name = "") +{ + return IRB()->CreateConstGEP2_64(Ptr, Idx0, Idx1, Name); +} + +Value* CONST_IN_BOUNDS_GEP2_64(Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name = "") +{ + return IRB()->CreateConstInBoundsGEP2_64(Ptr, Idx0, Idx1, Name); +} + +Value* STRUCT_GEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name = "") +{ + return IRB()->CreateStructGEP(Ty, Ptr, Idx, Name); +} + +Value* GLOBAL_STRING_PTR(StringRef Str, const Twine &Name = "", unsigned AddressSpace = 0) +{ + return IRB()->CreateGlobalStringPtr(Str, Name, AddressSpace); +} + +Value* TRUNC(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateTrunc(V, DestTy, Name); +} + +Value* Z_EXT(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateZExt(V, DestTy, Name); +} + +Value* S_EXT(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateSExt(V, DestTy, Name); +} + +Value* Z_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateZExtOrTrunc(V, DestTy, Name); +} + +Value* S_EXT_OR_TRUNC(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateSExtOrTrunc(V, DestTy, Name); +} + +Value* FP_TO_UI(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateFPToUI(V, DestTy, Name); +} + +Value* FP_TO_SI(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateFPToSI(V, DestTy, Name); +} + +Value* UI_TO_FP(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateUIToFP(V, DestTy, Name); +} + 
+Value* SI_TO_FP(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateSIToFP(V, DestTy, Name); +} + +Value* FP_TRUNC(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateFPTrunc(V, DestTy, Name); +} + +Value* FP_EXT(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateFPExt(V, DestTy, Name); +} + +Value* PTR_TO_INT(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreatePtrToInt(V, DestTy, Name); +} + +Value* INT_TO_PTR(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateIntToPtr(V, DestTy, Name); +} + +Value* BITCAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateBitCast(V, DestTy, Name); +} + +Value* ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateAddrSpaceCast(V, DestTy, Name); +} + +Value* Z_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateZExtOrBitCast(V, DestTy, Name); +} + +Value* S_EXT_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateSExtOrBitCast(V, DestTy, Name); +} + +Value* TRUNC_OR_BIT_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateTruncOrBitCast(V, DestTy, Name); +} + +Value* CAST(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateCast(Op, V, DestTy, Name); +} + +Value* POINTER_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreatePointerCast(V, DestTy, Name); +} + +Value* POINTER_BIT_CAST_OR_ADDR_SPACE_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreatePointerBitCastOrAddrSpaceCast(V, DestTy, Name); +} + +Value* INT_CAST(Value *V, Type *DestTy, bool isSigned, const Twine &Name = "") +{ + return IRB()->CreateIntCast(V, DestTy, isSigned, Name); +} + +Value* BIT_OR_POINTER_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateBitOrPointerCast(V, DestTy, Name); +} + +Value* FP_CAST(Value *V, Type *DestTy, const Twine &Name = "") +{ + return IRB()->CreateFPCast(V, DestTy, Name); +} + +Value* ICMP_EQ(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpEQ(LHS, RHS, Name); +} + +Value* ICMP_NE(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpNE(LHS, RHS, Name); +} + +Value* ICMP_UGT(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpUGT(LHS, RHS, Name); +} + +Value* ICMP_UGE(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpUGE(LHS, RHS, Name); +} + +Value* ICMP_ULT(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpULT(LHS, RHS, Name); +} + +Value* ICMP_ULE(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpULE(LHS, RHS, Name); +} + +Value* ICMP_SGT(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpSGT(LHS, RHS, Name); +} + +Value* ICMP_SGE(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpSGE(LHS, RHS, Name); +} + +Value* ICMP_SLT(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpSLT(LHS, RHS, Name); +} + +Value* ICMP_SLE(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmpSLE(LHS, RHS, Name); +} + +Value* FCMP_OEQ(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpOEQ(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_OGT(Value *LHS, Value *RHS, const Twine &Name = "", MDNode 
*FPMathTag = nullptr) +{ + return IRB()->CreateFCmpOGT(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_OGE(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpOGE(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_OLT(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpOLT(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_OLE(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpOLE(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_ONE(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpONE(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_ORD(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpORD(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_UNO(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpUNO(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_UEQ(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpUEQ(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_UGT(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpUGT(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_UGE(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpUGE(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_ULT(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpULT(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_ULE(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpULE(LHS, RHS, Name, FPMathTag); +} + +Value* FCMP_UNE(Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmpUNE(LHS, RHS, Name, FPMathTag); +} + +Value* ICMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreateICmp(P, LHS, RHS, Name); +} + +Value* FCMP(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateFCmp(P, LHS, RHS, Name, FPMathTag); +} + +PHINode* PHI(Type *Ty, unsigned NumReservedValues, const Twine &Name = "") +{ + return IRB()->CreatePHI(Ty, NumReservedValues, Name); +} + +CallInst* CALLA(Value *Callee, ArrayRef Args = None, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateCall(Callee, Args, Name, FPMathTag); +} + +CallInst* CALLA(FunctionType *FTy, Value *Callee, ArrayRef Args, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateCall(FTy, Callee, Args, Name, FPMathTag); +} + +CallInst* CALLA(Value *Callee, ArrayRef Args, ArrayRef OpBundles, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateCall(Callee, Args, OpBundles, Name, FPMathTag); +} + +CallInst* CALLA(Function *Callee, ArrayRef Args, const Twine &Name = "", MDNode *FPMathTag = nullptr) +{ + return IRB()->CreateCall(Callee, Args, Name, FPMathTag); +} + +Value* SELECT(Value *C, Value *True, Value *False, const Twine &Name = "", Instruction *MDFrom = nullptr) +{ + return IRB()->CreateSelect(C, True, False, Name, MDFrom); +} + +VAArgInst* VA_ARG(Value *List, Type *Ty, const Twine &Name = "") +{ + return IRB()->CreateVAArg(List, Ty, Name); +} + +Value* VEXTRACT(Value *Vec, Value *Idx, const Twine &Name = 
"") +{ + return IRB()->CreateExtractElement(Vec, Idx, Name); +} + +Value* VEXTRACT(Value *Vec, uint64_t Idx, const Twine &Name = "") +{ + return IRB()->CreateExtractElement(Vec, Idx, Name); +} + +Value* VINSERT(Value *Vec, Value *NewElt, Value *Idx, const Twine &Name = "") +{ + return IRB()->CreateInsertElement(Vec, NewElt, Idx, Name); +} + +Value* VINSERT(Value *Vec, Value *NewElt, uint64_t Idx, const Twine &Name = "") +{ + return IRB()->CreateInsertElement(Vec, NewElt, Idx, Name); +} + +Value* VSHUFFLE(Value *V1, Value *V2, Value *Mask, const Twine &Name = "") +{ + return IRB()->CreateShuffleVector(V1, V2, Mask, Name); +} + +Value* VSHUFFLE(Value *V1, Value *V2, ArrayRef IntMask, const Twine &Name = "") +{ + return IRB()->CreateShuffleVector(V1, V2, IntMask, Name); +} + +Value* EXTRACT_VALUE(Value *Agg, ArrayRef Idxs, const Twine &Name = "") +{ + return IRB()->CreateExtractValue(Agg, Idxs, Name); +} + +Value* INSERT_VALUE(Value *Agg, Value *Val, ArrayRef Idxs, const Twine &Name = "") +{ + return IRB()->CreateInsertValue(Agg, Val, Idxs, Name); +} + +LandingPadInst* LANDING_PAD(Type *Ty, unsigned NumClauses, const Twine &Name = "") +{ + return IRB()->CreateLandingPad(Ty, NumClauses, Name); +} + +Value* IS_NULL(Value *Arg, const Twine &Name = "") +{ + return IRB()->CreateIsNull(Arg, Name); +} + +Value* IS_NOT_NULL(Value *Arg, const Twine &Name = "") +{ + return IRB()->CreateIsNotNull(Arg, Name); +} + +Value* PTR_DIFF(Value *LHS, Value *RHS, const Twine &Name = "") +{ + return IRB()->CreatePtrDiff(LHS, RHS, Name); +} + +Value* INVARIANT_GROUP_BARRIER(Value *Ptr) +{ + return IRB()->CreateLaunderInvariantGroup(Ptr); +} + +Value* VECTOR_SPLAT(unsigned NumElts, Value *V, const Twine &Name = "") +{ + return IRB()->CreateVectorSplat(NumElts, V, Name); +} + +Value* EXTRACT_INTEGER(const DataLayout &DL, Value *From, IntegerType *ExtractedTy, uint64_t Offset, const Twine &Name) +{ + return IRB()->CreateExtractInteger(DL, From, ExtractedTy, Offset, Name); +} + +CallInst* ALIGNMENT_ASSUMPTION(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue = nullptr) +{ + return IRB()->CreateAlignmentAssumption(DL, PtrValue, Alignment, OffsetValue); +} + +CallInst* ALIGNMENT_ASSUMPTION(const DataLayout &DL, Value *PtrValue, Value *Alignment, Value *OffsetValue = nullptr) +{ + return IRB()->CreateAlignmentAssumption(DL, PtrValue, Alignment, OffsetValue); +} + + // clang-format on diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_intrin.hpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_intrin.hpp new file mode 100644 index 000000000000..d19a1bb3dfcc --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_intrin.hpp @@ -0,0 +1,172 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// @file gen_builder_intrin.hpp +// +// @brief auto-generated file +// +// DO NOT EDIT +// +// Generation Command Line: +// gen_llvm_ir_macros.py +// --input +// /cygdrive/d/cm-llvm/llvm/include/llvm/IR/IRBuilder.h +// --output-dir +// . +// --gen_h +// --gen_meta_h +// --gen_intrin_h +// +//============================================================================ +// clang-format off +#pragma once + +//============================================================================ +// Auto-generated llvm intrinsics +//============================================================================ +Value* CTTZ(Value* a, Value* flag, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::cttz, args); + return CALL(pFunc, std::initializer_list{a, flag}, name); +} + +Value* CTLZ(Value* a, Value* flag, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::ctlz, args); + return CALL(pFunc, std::initializer_list{a, flag}, name); +} + +Value* VSQRTPS(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::sqrt, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* STACKSAVE(const llvm::Twine& name = "") +{ + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::stacksave); + return CALL(pFunc, std::initializer_list{}, name); +} + +Value* STACKRESTORE(Value* a, const llvm::Twine& name = "") +{ + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::stackrestore); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* VMINPS(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::minnum, args); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* VMAXPS(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::maxnum, args); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* DEBUGTRAP(const llvm::Twine& name = "") +{ + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::debugtrap); + return CALL(pFunc, std::initializer_list{}, name); +} + +Value* POPCNT(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::ctpop, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* LOG2(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::log2, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + 
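+// Editorial sketch, hand-written rather than emitted by gen_llvm_ir_macros.py:
+// each wrapper above resolves the type-overloaded LLVM intrinsic with
+// Intrinsic::getDeclaration and funnels the call through CALL(). Spelled out for
+// ctlz, building the i1 "zero-is-undef" flag explicitly instead of taking it as a
+// Value* (the helper name CTLZ_FLAG is hypothetical):
+Value* CTLZ_FLAG(Value* a, bool zeroIsUndef, const llvm::Twine& name = "")
+{
+    // ctlz is overloaded on its operand type, so the declaration is looked up
+    // with that type.
+    SmallVector<Type*, 1> args;
+    args.push_back(a->getType());
+    Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::ctlz, args);
+    // Second operand: i1 flag selecting "result is undefined for a zero input".
+    Value* flag = ConstantInt::get(Type::getInt1Ty(a->getContext()), zeroIsUndef ? 1 : 0);
+    return CALL(pFunc, std::initializer_list<Value*>{a, flag}, name);
+}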
+Value* FABS(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::fabs, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* EXP2(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::exp2, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* COS(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::cos, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* SIN(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::sin, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* FLOOR(Value* a, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::floor, args); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* POW(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector args; + args.push_back(a->getType()); + Function* pFunc = Intrinsic::getDeclaration(mpModule, Intrinsic::pow, args); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + + // clang-format on diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_meta.hpp b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_meta.hpp new file mode 100644 index 000000000000..34d692069f7a --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMPacketize/gen_builder_meta.hpp @@ -0,0 +1,244 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ +// +// @file gen_builder_meta.hpp +// +// @brief auto-generated file +// +// DO NOT EDIT +// +// Generation Command Line: +// gen_llvm_ir_macros.py +// --input +// /cygdrive/d/cm-llvm/llvm/include/llvm/IR/IRBuilder.h +// --output-dir +// . 
+// --gen_h +// --gen_meta_h +// --gen_intrin_h +// +//============================================================================ +// clang-format off +#pragma once + +//============================================================================ +// Auto-generated meta intrinsics +//============================================================================ +Value* VGATHERPD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(src->getType()); + argTypes.push_back(pBase->getType()); + argTypes.push_back(indices->getType()); + argTypes.push_back(mask->getType()); + argTypes.push_back(scale->getType()); + FunctionType* pFuncTy = FunctionType::get(src->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VGATHERPD", pFuncTy)); + return CALL(pFunc, std::initializer_list{src, pBase, indices, mask, scale}, name); +} + +Value* VGATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(src->getType()); + argTypes.push_back(pBase->getType()); + argTypes.push_back(indices->getType()); + argTypes.push_back(mask->getType()); + argTypes.push_back(scale->getType()); + FunctionType* pFuncTy = FunctionType::get(src->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VGATHERPS", pFuncTy)); + return CALL(pFunc, std::initializer_list{src, pBase, indices, mask, scale}, name); +} + +Value* VGATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(src->getType()); + argTypes.push_back(pBase->getType()); + argTypes.push_back(indices->getType()); + argTypes.push_back(mask->getType()); + argTypes.push_back(scale->getType()); + FunctionType* pFuncTy = FunctionType::get(src->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VGATHERDD", pFuncTy)); + return CALL(pFunc, std::initializer_list{src, pBase, indices, mask, scale}, name); +} + +Value* VRCPPS(Value* a, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VRCPPS", pFuncTy)); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* VROUND(Value* a, Value* rounding, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(rounding->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VROUND", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, rounding}, name); +} + +Value* BEXTR_32(Value* src, Value* control, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(src->getType()); + argTypes.push_back(control->getType()); + FunctionType* pFuncTy = FunctionType::get(src->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.BEXTR_32", pFuncTy)); + return CALL(pFunc, std::initializer_list{src, control}, name); +} + +Value* VPSHUFB(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(b->getType()); + FunctionType* pFuncTy = 
FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VPSHUFB", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* VPERMD(Value* a, Value* idx, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(idx->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VPERMD", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, idx}, name); +} + +Value* VPERMPS(Value* idx, Value* a, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(idx->getType()); + argTypes.push_back(a->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VPERMPS", pFuncTy)); + return CALL(pFunc, std::initializer_list{idx, a}, name); +} + +Value* VCVTPD2PS(Value* a, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + FunctionType* pFuncTy = FunctionType::get(VectorType::get(mFP32Ty, a->getType()->getVectorNumElements()), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VCVTPD2PS", pFuncTy)); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* VCVTPH2PS(Value* a, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + FunctionType* pFuncTy = FunctionType::get(VectorType::get(mFP32Ty, a->getType()->getVectorNumElements()), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VCVTPH2PS", pFuncTy)); + return CALL(pFunc, std::initializer_list{a}, name); +} + +Value* VCVTPS2PH(Value* a, Value* round, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(round->getType()); + FunctionType* pFuncTy = FunctionType::get(mSimdInt16Ty, argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VCVTPS2PH", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, round}, name); +} + +Value* VHSUBPS(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(b->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VHSUBPS", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* VPTESTC(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(b->getType()); + FunctionType* pFuncTy = FunctionType::get(mInt32Ty, argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VPTESTC", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* VPTESTZ(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(b->getType()); + FunctionType* pFuncTy = FunctionType::get(mInt32Ty, argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VPTESTZ", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* VFMADDPS(Value* a, Value* b, Value* c, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + 
argTypes.push_back(b->getType()); + argTypes.push_back(c->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VFMADDPS", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b, c}, name); +} + +Value* VPHADDD(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(b->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.VPHADDD", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* PDEP32(Value* a, Value* b, const llvm::Twine& name = "") +{ + SmallVector argTypes; + argTypes.push_back(a->getType()); + argTypes.push_back(b->getType()); + FunctionType* pFuncTy = FunctionType::get(a->getType(), argTypes, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.PDEP32", pFuncTy)); + return CALL(pFunc, std::initializer_list{a, b}, name); +} + +Value* RDTSC(const llvm::Twine& name = "") +{ + FunctionType* pFuncTy = FunctionType::get(mInt64Ty, {}, false); + Function* pFunc = cast(mpModule->getOrInsertFunction("meta.intrinsic.RDTSC", pFuncTy)); + return CALL(pFunc, std::initializer_list{}, name); +} + + // clang-format on diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMABI.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMABI.cpp new file mode 100644 index 000000000000..02d3d7f9e86f --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMABI.cpp @@ -0,0 +1,1942 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +//===----------------------------------------------------------------------===// +// +/// CMABI +/// ----- +/// +/// This pass fixes ABI issues for the genx backend. Currently, it +/// +/// - transforms pass by pointer argument into copy-in and copy-out; +/// +/// - localizes global scalar or vector variables into copy-in and copy-out; +/// +/// - passes bool arguments as i8 (matches cm-icl's hehavior). 
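+///
+/// For instance (illustrative only), the last bullet changes a kernel signature
+/// roughly as follows; a trunc named "tobool" is inserted at the kernel entry so
+/// existing users still see an i1 value:
+///
+/// .. code-block:: text
+///
+///   ; before
+///   define dllexport void @k(i1 %flag)
+///
+///   ; after
+///   define dllexport void @k(i8 %flag) {
+///     %tobool = trunc i8 %flag to i1
+///     ...
+///   }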
+/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "cmabi" + +#include "llvmWrapper/Support/Alignment.h" + +#include "llvm/ADT/DenseMap.h" +#include "vc/GenXOpts/GenXOpts.h" +#include "vc/GenXOpts/Utils/GenXSTLExtras.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/CallSite.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" + +#include "llvmWrapper/Analysis/CallGraph.h" + +#include + +using namespace llvm; + +using LocalizationLimitT = int32_t; +static constexpr auto LocalizeAll = std::numeric_limits::max(); +static cl::opt + LocalizationLimit("cm-abi-issues-localization-limit", + cl::desc("maximum size (in bytes) used to localize global variables"), + cl::init(LocalizeAll)); + +STATISTIC(NumArgumentsTransformed, "Number of pointer arguments transformed"); +STATISTIC(NumArgumentsDead , "Number of dead pointer args eliminated"); + +namespace llvm { +void initializeCMABIPass(PassRegistry &); +void initializeCMLowerVLoadVStorePass(PassRegistry &); +} + +/// Localizing global variables +/// ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// +/// General idea of localizing global variables into locals. Globals used in +/// different kernels get a seperate copy and they are always invisiable to +/// other kernels and we can safely localize all globals used (including +/// indirectly) in a kernel. For example, +/// +/// .. code-block:: text +/// +/// @gv1 = global <8 x float> zeroinitializer, align 32 +/// @gv2 = global <8 x float> zeroinitializer, align 32 +/// @gv3 = global <8 x float> zeroinitializer, align 32 +/// +/// define dllexport void @f0() { +/// call @f1() +/// call @f2() +/// call @f3() +/// } +/// +/// define internal void @f1() { +/// ; ... +/// store <8 x float> %splat1, <8 x float>* @gv1, align 32 +/// } +/// +/// define internal void @f2() { +/// ; ... +/// store <8 x float> %splat2, <8 x float>* @gv2, align 32 +/// } +/// +/// define internal void @f3() { +/// %1 = <8 x float>* @gv1, align 32 +/// %2 = <8 x float>* @gv2, align 32 +/// %3 = fadd <8 x float> %1, <8 x float> %2 +/// store <8 x float> %3, <8 x float>* @gv3, align 32 +/// } +/// +/// will be transformed into +/// +/// .. 
code-block:: text +/// +/// define dllexport void @f0() { +/// %v1 = alloca <8 x float>, align 32 +/// %v2 = alloca <8 x float>, align 32 +/// %v3 = alloca <8 x float>, align 32 +/// +/// %0 = load <8 x float> * %v1, align 32 +/// %1 = { <8 x float> } call @f1_transformed(<8 x float> %0) +/// %2 = extractvalue { <8 x float> } %1, 0 +/// store <8 x float> %2, <8 x float>* %v1, align 32 +/// +/// %3 = load <8 x float> * %v2, align 32 +/// %4 = { <8 x float> } call @f2_transformed(<8 x float> %3) +/// %5 = extractvalue { <8 x float> } %4, 0 +/// store <8 x float> %5, <8 x float>* %v1, align 32 +/// +/// %6 = load <8 x float> * %v1, align 32 +/// %7 = load <8 x float> * %v2, align 32 +/// %8 = load <8 x float> * %v3, align 32 +/// +/// %9 = { <8 x float>, <8 x float>, <8 x float> } +/// call @f3_transformed(<8 x float> %6, <8 x float> %7, <8 x float> %8) +/// +/// %10 = extractvalue { <8 x float>, <8 x float>, <8 x float> } %9, 0 +/// store <8 x float> %10, <8 x float>* %v1, align 32 +/// %11 = extractvalue { <8 x float>, <8 x float>, <8 x float> } %9, 1 +/// store <8 x float> %11, <8 x float>* %v2, align 32 +/// %12 = extractvalue { <8 x float>, <8 x float>, <8 x float> } %9, 2 +/// store <8 x float> %12, <8 x float>* %v3, align 32 +/// } +/// +/// All callees will be updated accordingly, E.g. f1_transformed becomes +/// +/// .. code-block:: text +/// +/// define internal { <8 x float> } @f1_transformed(<8 x float> %v1) { +/// %0 = alloca <8 x float>, align 32 +/// store <8 x float> %v1, <8 x float>* %0, align 32 +/// ; ... +/// store <8 x float> %splat1, <8 x float>* @0, align 32 +/// ; ... +/// %1 = load <8 x float>* %0, align 32 +/// %2 = insertvalue { <8 x float> } undef, <8 x float> %1, 0 +/// ret { <8 x float> } %2 +/// } +/// +namespace { + +// \brief Collect necessary information for global variable localization. +class LocalizationInfo { +public: + typedef SetVector GlobalSetTy; + + explicit LocalizationInfo(Function *F) : Fn(F) {} + LocalizationInfo() : Fn(0) {} + + Function *getFunction() const { return Fn; } + bool empty() const { return Globals.empty(); } + GlobalSetTy &getGlobals() { return Globals; } + + // \brief Add a global. + void addGlobal(GlobalVariable *GV) { + Globals.insert(GV); + } + + // \brief Add all globals from callee. + void addGlobals(LocalizationInfo &LI) { + Globals.insert(LI.getGlobals().begin(), LI.getGlobals().end()); + } + + void setArgIndex(GlobalVariable *GV, unsigned ArgIndex) { + assert(!IndexMap.count(GV)); + IndexMap[GV] = ArgIndex; + } + unsigned getArgIndex(GlobalVariable *GV) const { + assert(IndexMap.count(GV)); + return IndexMap.lookup(GV); + } + +private: + // \brief The function being analyzed. + Function *Fn; + + // \brief Global variables that are used directly or indirectly. + GlobalSetTy Globals; + + // This map keeps track of argument index for a global variable. + SmallDenseMap IndexMap; +}; + +// Diagnostic information for error/warning for overlapping arg +class DiagnosticInfoOverlappingArgs : public DiagnosticInfo { +private: + std::string Description; + StringRef Filename; + unsigned Line; + unsigned Col; + static int KindID; + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } +public: + // Initialize from an Instruction and an Argument. 
+ DiagnosticInfoOverlappingArgs(Instruction *Inst, + const Twine &Desc, DiagnosticSeverity Severity = DS_Error); + void print(DiagnosticPrinter &DP) const override; + + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; +int DiagnosticInfoOverlappingArgs::KindID = 0; + + + +struct CMABI : public CallGraphSCCPass { + static char ID; + + CMABI() : CallGraphSCCPass(ID) { + initializeCMABIPass(*PassRegistry::getPassRegistry()); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + CallGraphSCCPass::getAnalysisUsage(AU); + } + + virtual bool runOnSCC(CallGraphSCC &SCC); + + virtual bool doInitialization(CallGraph &CG); + virtual bool doFinalization(CallGraph &CG); + +private: + unsigned int const MaxCallSites = 5; + + CallGraphNode *ProcessNode(CallGraphNode *CGN); + + // Fix argument passing for kernels. + CallGraphNode *TransformKernel(Function *F); + + // Major work is done in this method. + CallGraphNode *TransformNode(Function *F, + SmallPtrSet &ArgsToTransform, + LocalizationInfo &LI); + + // \brief Create allocas for globals and replace their uses. + void LocalizeGlobals(LocalizationInfo &LI); + + // \brief Compute the localized global variables for each function. + void AnalyzeGlobals(CallGraph &CG); + + // \brief Returns the localization info associated to a function. + LocalizationInfo &getLocalizationInfo(Function *F) { + if (!GlobalInfo.count(F)) { + LocalizationInfo *LI = new LocalizationInfo(F); + LocalizationInfoObjs.push_back(LI); + GlobalInfo[F] = LI; + return *LI; + } + return *GlobalInfo[F]; + } + + void addDirectGlobal(Function *F, GlobalVariable *GV) { + getLocalizationInfo(F).addGlobal(GV); + } + + // \brief Add all globals from callee to caller. + void addIndirectGlobal(Function *F, Function *Callee) { + getLocalizationInfo(F).addGlobals(getLocalizationInfo(Callee)); + } + + // Return true if pointer type argument arg appears in a + // store instruction. This helps decide whether it is safe + // to convert ptr arg to byvalue arg. Latter can be passed + // in GRF. + bool IsPtrArgModified(Value * Arg); + + // \brief Diagnose illegal overlapping by-ref args. + void diagnoseOverlappingArgs(CallInst *CI); + + // This map captures all global variables to be localized. + SmallDenseMap GlobalInfo; + + // Kernels in the module being processed. + SmallPtrSet Kernels; + + // Already visited functions. + SmallPtrSet AlreadyVisited; + + // LocalizationInfo objects created. + SmallVector LocalizationInfoObjs; +}; + +} // namespace + +// Currently weight of the global defines by its size +static int calcGVWeight(const GlobalVariable &GV, const DataLayout &DL) { + return DL.getTypeAllocSize(GV.getValueType()); +} + +/* selectGlobalsToLocalize - chooses which globals to localize + * Returns vector of pointers to such globals. 
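+ *
+ * A quick worked illustration of the weight-bounded selection described below
+ * (numbers invented for the example): with globals of 4, 8, 64 and 128 bytes and
+ * a 100-byte bound, the three lightest globals are localized (4 + 8 + 64 = 76
+ * bytes) and the 128-byte one stays a real global, since adding it would push
+ * the running sum past the bound.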
+ * + * Algorithm: exclude globals that definitely should not be localized + * sort globals by weight, choose first smallest ones, sum of which is under \p + * Bound + * + * \p Globals - range of globals to choose from + * \p Bound - bound not to overcome + * \p ExcludePred - functor : GVRef -> bool, true if global should not be + * localized \p WeightCalculator - functor : GVRef -> decltype(Bound), returns + * weight of global + */ +template +auto selectGlobalsToLocalize(ForwardRange Globals, T Bound, + ExcludePredT ExcludePred, + WeightCalculatorT WeightCalculator) + -> std::vector> { + assert(Bound >= 0 && "bound must be nonnegative"); + using GVPtr = genx::ranges::range_pointer_t; + using GVRef = genx::ranges::range_reference_t; + if (Bound == 0) + return std::vector(); + + // filter out those, that we must exclude + auto Unexcluded = make_filter_range( + Globals, [ExcludePred](GVRef GV) { return !ExcludePred(GV); }); + using GVWithWeightT = std::pair; + + if (Bound == LocalizeAll) { + std::vector ToLocalize; + transform(Unexcluded, std::back_inserter(ToLocalize), + [](GVRef GV) { return &GV; }); + return ToLocalize; + } + + std::vector ToLocalizeWithWeight; + transform(Unexcluded, std::back_inserter(ToLocalizeWithWeight), + [WeightCalculator](GVRef GV) { + return std::make_pair(&GV, WeightCalculator(GV)); + }); + + // sort globals by weight + std::sort(ToLocalizeWithWeight.begin(), ToLocalizeWithWeight.end(), + [](GVWithWeightT LHS, GVWithWeightT RHS) { + return LHS.second < RHS.second; + }); + + // filter max number of lightest ones, which weight sum is under the bound + auto FirstNotToLocalize = genx::upper_partial_sum_bound( + ToLocalizeWithWeight.begin(), ToLocalizeWithWeight.end(), Bound, + [](decltype(Bound) Base, GVWithWeightT Inc) { + return Base + Inc.second; + }); + + // collect them back to ToLocalize + std::vector ToLocalize; + ToLocalize.reserve(FirstNotToLocalize - ToLocalizeWithWeight.begin()); + std::transform(ToLocalizeWithWeight.begin(), FirstNotToLocalize, + std::back_inserter(ToLocalize), + [](GVWithWeightT GV) { return GV.first; }); + + return ToLocalize; +} + +bool CMABI::doInitialization(CallGraph &CG) { + // Analyze global variable usages and for each function attaches global + // variables to be copy-in and copy-out. + AnalyzeGlobals(CG); + + auto getValue = [](Metadata *M) -> Value * { + if (auto VM = dyn_cast(M)) + return VM->getValue(); + return nullptr; + }; + + // Collect all CM kernels from named metadata. + if (NamedMDNode *Named = + CG.getModule().getNamedMetadata(genx::FunctionMD::GenXKernels)) { + assert(Named); + for (unsigned I = 0, E = Named->getNumOperands(); I != E; ++I) { + MDNode *Node = Named->getOperand(I); + if (Function *F = + dyn_cast_or_null(getValue(Node->getOperand(0)))) + Kernels.insert(F); + } + } + + // no change. + return false; +} + +bool CMABI::doFinalization(CallGraph &CG) { + bool Changed = false; + for (Module::global_iterator I = CG.getModule().global_begin(); + I != CG.getModule().global_end(); + /*empty*/) { + GlobalVariable *GV = &*I++; + if (GV->use_empty()) { + GV->eraseFromParent(); + Changed = true; + } + } + + for (LocalizationInfo *Obj : LocalizationInfoObjs) + delete Obj; + + return Changed; +} + +bool CMABI::runOnSCC(CallGraphSCC &SCC) { + bool Changed = false, LocalChange; + + // Diagnose overlapping by-ref args. 
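+  // (Illustrative, hypothetical CM source: a call such as
+  //   foo(v.select<8,1>(0), v.select<8,1>(4))
+  // where both arguments are taken by reference and one of them is written inside
+  // foo violates the copy-in/copy-out restriction documented before TransformNode
+  // below; such call sites are what gets reported here.)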
+  for (auto i = SCC.begin(), e = SCC.end(); i != e; ++i) {
+    Function *F = (*i)->getFunction();
+    if (!F || F->empty())
+      continue;
+    for (auto ui = F->use_begin(), ue = F->use_end(); ui != ue; ++ui) {
+      auto CI = dyn_cast<CallInst>(ui->getUser());
+      if (CI && CI->getNumArgOperands() == ui->getOperandNo())
+        diagnoseOverlappingArgs(CI);
+    }
+  }
+
+  // Iterate until we stop transforming from this SCC.
+  do {
+    LocalChange = false;
+    for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) {
+      if (CallGraphNode *CGN = ProcessNode(*I)) {
+        LocalChange = true;
+        SCC.ReplaceNode(*I, CGN);
+      }
+    }
+    Changed |= LocalChange;
+  } while (LocalChange);
+
+  return Changed;
+}
+
+// Sometimes a phi has a GEP (or some other constant expression) as an incoming
+// value. While the GEP's operands are constants, that is fine: the GEP itself is
+// a constant too. But once we replace those constants with locals, the GEP
+// becomes a regular instruction that would sit right before the phi, which is
+// invalid IR. This helper moves such an instruction to the end of the
+// corresponding incoming block, just before its terminator.
+static void fixPhiUseIssue(Instruction *Inst) {
+  auto PhiUse = cast<PHINode>(Inst->use_begin()->getUser());
+  auto InstOpNoInPhi = Inst->use_begin()->getOperandNo();
+  assert(Inst->getParent() == PhiUse->getParent());
+  Inst->removeFromParent();
+  Inst->insertBefore(PhiUse->getIncomingBlock(InstOpNoInPhi)->getTerminator());
+}
+
+// Replace uses of global variables with the corresponding allocas within a
+// specified function.
+//
+// Returns a vector of instructions with a phi use that have to be fixed up
+// afterwards (see fixPhiUseIssue above).
+static std::vector<Instruction *>
+replaceUsesWithinFunction(SmallDenseMap<Value *, Value *> &GlobalsToReplace,
+                          Function *F) {
+  std::vector<Instruction *> PhiUseIssueInsts;
+  for (auto I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+    Instruction *Inst = &*I;
+    for (unsigned i = 0, e = Inst->getNumOperands(); i < e; ++i) {
+      auto Iter = GlobalsToReplace.find(Inst->getOperand(i));
+      if (Iter != GlobalsToReplace.end())
+        Inst->setOperand(i, Iter->second);
+    }
+    if (Inst->getNumUses() == 1) {
+      auto PhiUse = dyn_cast<PHINode>(Inst->use_begin()->getUser());
+      if (PhiUse && Inst->getParent() == PhiUse->getParent()) {
+        PhiUseIssueInsts.push_back(Inst);
+      }
+    }
+  }
+  return PhiUseIssueInsts;
+}
+
+// \brief Create allocas for globals directly used in this kernel and
+// replace all uses.
+void CMABI::LocalizeGlobals(LocalizationInfo &LI) {
+  const LocalizationInfo::GlobalSetTy &Globals = LI.getGlobals();
+  typedef LocalizationInfo::GlobalSetTy::const_iterator IteratorTy;
+
+  SmallDenseMap<Value *, Value *> GlobalsToReplace;
+  Function *Fn = LI.getFunction();
+  for (IteratorTy I = Globals.begin(), E = Globals.end(); I != E; ++I) {
+    GlobalVariable *GV = (*I);
+    LLVM_DEBUG(dbgs() << "Localizing global: " << *GV);
+
+    Instruction &FirstI = *Fn->getEntryBlock().begin();
+    Type *ElemTy = GV->getType()->getElementType();
+    AllocaInst *Alloca = new AllocaInst(ElemTy, 0 /*AddressSpace*/,
+                                        GV->getName() + ".local", &FirstI);
+    Alloca->setAlignment(MaybeAlign(GV->getAlignment()));
+    if (!isa<UndefValue>(GV->getInitializer()))
+      new StoreInst(GV->getInitializer(), Alloca, &FirstI);
+
+    GlobalsToReplace.insert(std::make_pair(GV, Alloca));
+  }
+
+  // Replace all uses of the globals within this function.
+  auto PhiUseIssueInsts = replaceUsesWithinFunction(GlobalsToReplace, Fn);
+
+  for (auto InstWithPhiUse : PhiUseIssueInsts) {
+    fixPhiUseIssue(InstWithPhiUse);
+  }
+}
+
+CallGraphNode *CMABI::ProcessNode(CallGraphNode *CGN) {
+  Function *F = CGN->getFunction();
+
+  // Nothing to do for declarations or already visited functions.
+ if (!F || F->isDeclaration() || AlreadyVisited.count(F)) + return 0; + + // Variables to be localized. + LocalizationInfo &LI = getLocalizationInfo(F); + + // This is a kernel. + if (Kernels.count(F)) { + // Localize globals for kernels. + if (!LI.getGlobals().empty()) + LocalizeGlobals(LI); + + // Check whether there are i1 or vxi1 kernel arguments. + for (auto AI = F->arg_begin(), AE = F->arg_end(); AI != AE; ++AI) + if (AI->getType()->getScalarType()->isIntegerTy(1)) + return TransformKernel(F); + + // No changes to this kernel's prototype. + return 0; + } + + // Non-kernels, only transforms module locals. + if (!F->hasLocalLinkage()) + return 0; + + SmallVector PointerArgs; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; ++I) + if (I->getType()->isPointerTy()) + PointerArgs.push_back(I); + + // Check if there is any pointer arguments or globals to localize. + if (PointerArgs.empty() && LI.empty()) + return 0; + + // Check transformable arguments. + SmallPtrSet ArgsToTransform; + for (unsigned i = 0, e = PointerArgs.size(); i != e; ++i) { + Argument *PtrArg = PointerArgs[i]; + Type *ArgTy = cast(PtrArg->getType())->getElementType(); + + // Only transform to simple types. + if ((F->getNumUses() > MaxCallSites || ArgTy->isVectorTy() || IsPtrArgModified(PtrArg)) && + (ArgTy->isIntOrIntVectorTy() || ArgTy->isFPOrFPVectorTy())) + ArgsToTransform.insert(PtrArg); + } + + if (ArgsToTransform.empty() && LI.empty()) + return 0; + + return TransformNode(F, ArgsToTransform, LI); +} + +// check for typical inst sequences passing arg as a base +// of store-like intrinsics +static bool checkSinkToMemIntrinsic(Instruction *Inst) { + auto *CI = dyn_cast(Inst); + if (CI && (GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_svm_scatter || + GenXIntrinsic::getAnyIntrinsicID(CI->getCalledFunction()) == + GenXIntrinsic::genx_scatter_scaled)) + return true; + for (auto *U : Inst->users()) { + if (isa(U) || isa(U) || + isa(U) || isa(U)) + return checkSinkToMemIntrinsic(cast(U)); + } + return false; +} + +bool CMABI::IsPtrArgModified(Value *Arg) { + // Arg is a ptr to a vector type. If data is written using a + // store, then return true. This means copy-in/copy-out are + // needed as caller may use the updated value. If no data is + // ever stored in Arg then return false. It is safe to + // convert the parameter to pass-by-value in GRF. + // This is a recursive function. + for (const auto &U : Arg->users()) { + if (auto *I = dyn_cast(U)) { + if (isa(U)) + return true; + else if (isa(U) || isa(U)) + return IsPtrArgModified(U); + else if (isa(U)) + return checkSinkToMemIntrinsic(I); + } + } + return false; +} + +// \brief Fix argument passing for kernels: i1 -> i8. 
+CallGraphNode *CMABI::TransformKernel(Function *F) { + assert(F->getReturnType()->isVoidTy()); + LLVMContext &Context = F->getContext(); + + AttributeList AttrVec; + const AttributeList &PAL = F->getAttributes(); + + // First, determine the new argument list + SmallVector ArgTys; + unsigned ArgIndex = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++ArgIndex) { + Type *ArgTy = I->getType(); + // Change i1 to i8 and vxi1 to vxi8 + if (ArgTy->getScalarType()->isIntegerTy(1)) { + Type *Ty = IntegerType::get(F->getContext(), 8); + if (ArgTy->isVectorTy()) + ArgTys.push_back(VectorType::get(Ty, ArgTy->getVectorNumElements())); + else + ArgTys.push_back(Ty); + } else { + // Unchanged argument + AttributeSet attrs = PAL.getParamAttributes(ArgIndex); + if (attrs.hasAttributes()) { + AttrBuilder B(attrs); + AttrVec = AttrVec.addParamAttributes(Context, ArgTys.size(), B); + } + ArgTys.push_back(I->getType()); + } + } + + FunctionType *NFTy = FunctionType::get(F->getReturnType(), ArgTys, false); + assert((NFTy != F->getFunctionType()) && + "type out of sync, expect bool arguments"); + + // Add any function attributes. + AttributeSet FnAttrs = PAL.getFnAttributes(); + if (FnAttrs.hasAttributes()) { + AttrBuilder B(FnAttrs); + AttrVec = AttrVec.addAttributes(Context, AttributeList::FunctionIndex, B); + } + + // Create the new function body and insert it into the module. + Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); + NF->setAttributes(AttrVec); + LLVM_DEBUG(dbgs() << "CMABI: Transforming to:" << *NF << "\n" << "From: " << *F); + F->getParent()->getFunctionList().insert(F->getIterator(), NF); + NF->takeName(F); + NF->setSubprogram(F->getSubprogram()); // tranfer debug-info + NF->setCallingConv(F->getCallingConv()); + + // Since we have now created the new function, splice the body of the old + // function right into the new function. + NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); + + // Loop over the argument list, transferring uses of the old arguments over to + // the new arguments, also transferring over the names as well. + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), + I2 = NF->arg_begin(); + I != E; ++I, ++I2) { + // For an unmodified argument, move the name and users over. + if (!I->getType()->getScalarType()->isIntegerTy(1)) { + I->replaceAllUsesWith(I2); + I2->takeName(I); + } else { + Instruction *InsertPt = &*(NF->begin()->begin()); + Instruction *Conv = new TruncInst(I2, I->getType(), "tobool", InsertPt); + I->replaceAllUsesWith(Conv); + I2->takeName(I); + } + } + + CallGraph &CG = getAnalysis().getCallGraph(); + CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF); + + // Update the metadata entry. + if (F->hasDLLExportStorageClass()) + NF->setDLLStorageClass(F->getDLLStorageClass()); + + auto getValue = [](Metadata *M) -> Value * { + if (auto VM = dyn_cast(M)) + return VM->getValue(); + return nullptr; + }; + + // Scan the CM kernel metadata and replace with NF. + if (NamedMDNode *Named = + CG.getModule().getNamedMetadata(genx::FunctionMD::GenXKernels)) { + for (unsigned I = 0, E = Named->getNumOperands(); I != E; ++I) { + MDNode *Node = Named->getOperand(I); + if (F == dyn_cast_or_null(getValue(Node->getOperand(0)))) + Node->replaceOperandWith(genx::KernelMDOp::FunctionRef, ValueAsMetadata::get(NF)); + } + } + + // Now that the old function is dead, delete it. If there is a dangling + // reference to the CallgraphNode, just leave the dead function around. 
+ NF_CGN->stealCalledFunctionsFrom(CG[F]); + CallGraphNode *CGN = CG[F]; + if (CGN->getNumReferences() == 0) + delete CG.removeFunctionFromModule(CGN); + else + F->setLinkage(Function::ExternalLinkage); + + return NF_CGN; +} + +// \brief Actually performs the transformation of the specified arguments, and +// returns the new function. +// +// Note this transformation does change the semantics as a C function, due to +// possible pointer aliasing. But it is allowed as a CM function. +// +// The pass-by-reference scheme is useful to copy-out values from the +// subprogram back to the caller. It also may be useful to convey large inputs +// to subprograms, as the amount of parameter conveying code will be reduced. +// There is a restriction imposed on arguments passed by reference in order to +// allow for an efficient CM implementation. Specifically the restriction is +// that for a subprogram that uses pass-by-reference, the behavior must be the +// same as if we use a copy-in/copy-out semantic to convey the +// pass-by-reference argument; otherwise the CM program is said to be erroneous +// and may produce incorrect results. Such errors are not caught by the +// compiler and it is up to the user to guarantee safety. +// +// The implication of the above stated restriction is that no pass-by-reference +// argument that is written to in a subprogram (either directly or transitively +// by means of a nested subprogram call pass-by-reference argument) may overlap +// with another pass-by-reference parameter or a global variable that is +// referenced in the subprogram; in addition no pass-by-reference subprogram +// argument that is referenced may overlap with a global variable that is +// written to in the subprogram. +// +CallGraphNode *CMABI::TransformNode(Function *F, + SmallPtrSet &ArgsToTransform, + LocalizationInfo &LI) { + // Computing a new prototype for the function. E.g. + // + // i32 @foo(i32, <8 x i32>*) becomes {i32, <8 x i32>} @bar(i32, <8 x i32>) + // + FunctionType *FTy = F->getFunctionType(); + SmallVector RetTys; + if (!FTy->getReturnType()->isVoidTy()) + RetTys.push_back(FTy->getReturnType()); + auto SkipHeuristic = (F->getNumUses() > MaxCallSites); + + // Keep track of parameter attributes for the arguments that we are *not* + // transforming. For the ones we do transform, parameter attributes are lost. + AttributeList AttrVec; + const AttributeList &PAL = F->getAttributes(); + LLVMContext &Context = F->getContext(); + + // First, determine the new argument list + SmallVector Params; + SmallPtrSet CopyInOutNeeded; + unsigned ArgIndex = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++ArgIndex) { + if (!ArgsToTransform.count(I)) { + // Unchanged argument + AttributeSet attrs = PAL.getParamAttributes(ArgIndex); + if (attrs.hasAttributes()) { + AttrBuilder B(attrs); + AttrVec = AttrVec.addParamAttributes(Context, Params.size(), B); + } + Params.push_back(I->getType()); + } else if (I->use_empty()) { + // Delete unused arguments + ++NumArgumentsDead; + } else { + // Use the element type as the new argument type. 
+ Params.push_back(I->getType()->getPointerElementType()); + + if (IsPtrArgModified(I) || SkipHeuristic) { + CopyInOutNeeded.insert(I); + RetTys.push_back(I->getType()->getPointerElementType()); + } + + ++NumArgumentsTransformed; + } + } + + typedef LocalizationInfo::GlobalSetTy::iterator IteratorTy; + for (IteratorTy I = LI.getGlobals().begin(), E = LI.getGlobals().end(); + I != E; ++I) { + GlobalVariable *GV = *I; + // Store the index information of this global variable. + LI.setArgIndex(GV, Params.size()); + + Type *PointeeTy = GV->getType()->getPointerElementType(); + Params.push_back(PointeeTy); + RetTys.push_back(PointeeTy); + } + + // Add any function attributes. + AttributeSet FnAttrs = PAL.getFnAttributes(); + if (FnAttrs.hasAttributes()) { + AttrBuilder B(FnAttrs); + AttrVec = AttrVec.addAttributes(Context, AttributeList::FunctionIndex, B); + } + + // Construct the new function type using the new arguments. + llvm::Type *RetTy = StructType::get(Context, RetTys); + FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg()); + + // Create the new function body and insert it into the module. + Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); + NF->setAttributes(AttrVec); + LLVM_DEBUG(dbgs() << "CMABI: Transforming to:" << *NF << "\n" << "From: " << *F); + F->getParent()->getFunctionList().insert(F->getIterator(), NF); + NF->takeName(F); + NF->setCallingConv(F->getCallingConv()); + + // Get a new callgraph node for NF. + CallGraph &CG = getAnalysis().getCallGraph(); + CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF); + + std::vector DirectUsers; + + for (auto U: F->users()) { + if (isa(U)) + DirectUsers.push_back(U); + } + + // Loop over all of the callers of the function, transforming the call sites + // to pass in the loaded pointers. + for (auto U: DirectUsers) { + CallSite CS(U); + assert(CS.getCalledFunction() == F); + Instruction *Call = CS.getInstruction(); + const AttributeList &CallPAL = CS.getAttributes(); + + SmallVector Args; + AttributeList NewAttrVec; + + // Loop over the operands, inserting loads in the caller. + CallSite::arg_iterator AI = CS.arg_begin(); + ArgIndex = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++AI, ++ArgIndex) { + if (!ArgsToTransform.count(I)) { + // Unchanged argument + AttributeSet attrs = CallPAL.getParamAttributes(ArgIndex); + if (attrs.hasAttributes()) { + AttrBuilder B(attrs); + NewAttrVec = NewAttrVec.addParamAttributes(Context, Args.size(), B); + } + Args.push_back(*AI); + } else if (!I->use_empty()) { + LoadInst *Load = new LoadInst(*AI, (*AI)->getName() + ".val", Call); + Args.push_back(Load); + } + } + + // Push any varargs arguments on the list. + for (; AI != CS.arg_end(); ++AI, ++ArgIndex) { + AttributeSet attrs = CallPAL.getParamAttributes(ArgIndex); + if (attrs.hasAttributes()) { + AttrBuilder B(attrs); + NewAttrVec = NewAttrVec.addParamAttributes(Context, Args.size(), B); + } + Args.push_back(*AI); + } + + // Push any localized globals. + for (IteratorTy I = LI.getGlobals().begin(), E = LI.getGlobals().end(); + I != E; ++I) { + GlobalVariable *GV = *I; + LoadInst *Load = new LoadInst(GV, GV->getName() + ".val", Call); + Args.push_back(Load); + } + + // Add any function attributes. 
+ if (CallPAL.hasAttributes(AttributeList::FunctionIndex)) { + AttrBuilder B(CallPAL.getFnAttributes()); + NewAttrVec = NewAttrVec.addAttributes(Context, AttributeList::FunctionIndex, B); + } + + if (isa(Call)) + llvm_unreachable("InvokeInst not supported"); + + CallInst *New = CallInst::Create(NF, Args, "", Call); + New->setCallingConv(CS.getCallingConv()); + New->setAttributes(NewAttrVec); + if (cast(Call)->isTailCall()) + New->setTailCall(); + New->setDebugLoc(Call->getDebugLoc()); + + // Update the callgraph to know that the callsite has been transformed. + auto CalleeNode = static_cast( + CG[Call->getParent()->getParent()]); + CalleeNode->replaceCallEdge(CallSite(Call), New, NF_CGN); + + unsigned Index = 0; + IRBuilder<> Builder(Call); + + New->takeName(Call); + if (!F->getReturnType()->isVoidTy()) + Call->replaceAllUsesWith(Builder.CreateExtractValue(New, Index++, "ret")); + + // Loop over the operands, and copy out all pass by reference values. + AI = CS.arg_begin(); + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++AI) { + // Unused arguments are already eliminated from the call sites. + if (ArgsToTransform.count(I) && !I->use_empty() && + CopyInOutNeeded.count(I)) { + Value *OutVal = Builder.CreateExtractValue(New, Index++); + Builder.CreateStore(OutVal, *AI); + } + } + // Loop over localized globals, and copy out all globals. + for (IteratorTy I = LI.getGlobals().begin(), E = LI.getGlobals().end(); + I != E; ++I) { + GlobalVariable *GV = *I; + Value *OutVal = Builder.CreateExtractValue(New, Index++); + Builder.CreateStore(OutVal, GV); + } + assert(Index == New->getType()->getStructNumElements() && "type out of sync"); + + // Remove the old call from the function, reducing the use-count of F. + Call->eraseFromParent(); + } + + // Since we have now created the new function, splice the body of the old + // function right into the new function. + NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); + + // Allocas used for transformed arguments. + SmallVector Allocas; + + // Loop over the argument list, transferring uses of the old arguments over to + // the new arguments, also transferring over the names as well. + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(), + I2 = NF->arg_begin(); + I != E; ++I) { + // For an unmodified argument, move the name and users over. + if (!ArgsToTransform.count(I)) { + I->replaceAllUsesWith(I2); + I2->takeName(I); + ++I2; + continue; + } + + if (I->use_empty()) + continue; + + // Otherwise, we transformed this argument. + // + // In the callee, we create an alloca, and store each of the new incoming + // arguments into the alloca. + Instruction *InsertPt = &*(NF->begin()->begin()); + Type *AgTy = I->getType()->getPointerElementType(); + AllocaInst *TheAlloca = new AllocaInst(AgTy, 0, "", InsertPt); + Instruction * NewInst = TheAlloca; + if (I->getType()->getPointerAddressSpace() != 0) { + // Insert addrspace cast + auto AddrSpaceCast = new AddrSpaceCastInst( + TheAlloca, AgTy->getPointerTo(I->getType()->getPointerAddressSpace()), + ""); + AddrSpaceCast->insertAfter(TheAlloca); + NewInst = AddrSpaceCast; + } + if (CopyInOutNeeded.count(I)) + Allocas.push_back(TheAlloca); + + I2->setName(I->getName()); + new StoreInst(I2++, NewInst, InsertPt); + + // Anything that used the arg should now use the alloca. + I->replaceAllUsesWith(NewInst); + NewInst->takeName(I); + } + + // Collect all globals and their corresponding allocas. 
+ SmallDenseMap GlobalsToReplace; + + // Loop over globals and transfer uses of globals over to new arguments. + for (IteratorTy I = LI.getGlobals().begin(), E = LI.getGlobals().end(); + I != E; ++I) { + GlobalVariable *GV = *I; + + Instruction *InsertPt = &*(NF->begin()->begin()); + Type *AgTy = GV->getType()->getPointerElementType(); + AllocaInst *TheAlloca = new AllocaInst(AgTy, 0, "", InsertPt); + Allocas.push_back(TheAlloca); + + auto ArgIter = NF->arg_begin(); + std::advance(ArgIter, LI.getArgIndex(GV)); + ArgIter->setName(GV->getName() + ".in"); + new StoreInst(ArgIter, TheAlloca, InsertPt); + + TheAlloca->setName(GV->getName() + ".local"); + GlobalsToReplace.insert(std::make_pair(GV, TheAlloca)); + } + // Replaces all globals uses within this new function. + replaceUsesWithinFunction(GlobalsToReplace, NF); + + // Fix all return instructions since we have changed the return type. + Type *NFRetTy = NF->getReturnType(); + for (inst_iterator I = inst_begin(NF), E = inst_end(NF); I != E; /* empty */) { + Instruction *Inst = &*I++; + if (ReturnInst *RI = dyn_cast(Inst)) { + IRBuilder<> Builder(RI); + + // Create new return value, which is a struct type. + Value *RetVal = UndefValue::get(NFRetTy); + unsigned Index = 0; + + if (!F->getReturnType()->isVoidTy()) { + Value *RV = RI->getReturnValue(); + assert(RV && RV->getType()->isSingleValueType() && "type unexpected"); + RetVal = Builder.CreateInsertValue(RetVal, RV, Index++); + } + for (unsigned i = 0, e = Allocas.size(); i < e; ++i) { + Value *V = Builder.CreateLoad(Allocas[i]); + RetVal = Builder.CreateInsertValue(RetVal, V, Index++); + } + + StructType *ST = cast(NFRetTy); + assert(ST->getNumElements() == Index && "type out of sync"); + (void)ST; + + // Return the final struct by value. + Builder.CreateRet(RetVal); + RI->eraseFromParent(); + } + } + + // It turns out sometimes llvm will recycle function pointers which confuses + // this pass. We delete its localization info and mark this function as + // already visited. + GlobalInfo.erase(F); + AlreadyVisited.insert(F); + + NF_CGN->stealCalledFunctionsFrom(CG[F]); + + // Now that the old function is dead, delete it. If there is a dangling + // reference to the CallgraphNode, just leave the dead function around. + CallGraphNode *CGN = CG[F]; + if (CGN->getNumReferences() == 0) + delete CG.removeFunctionFromModule(CGN); + else + F->setLinkage(Function::ExternalLinkage); + + return NF_CGN; +} + +static void breakConstantVector(unsigned i, Instruction *CurInst, + Instruction *InsertPt) { + ConstantVector *CV = cast(CurInst->getOperand(i)); + + // Splat case. + if (auto S = dyn_cast_or_null(CV->getSplatValue())) { + // Turn element into an instruction + auto Inst = S->getAsInstruction(); + Inst->setDebugLoc(CurInst->getDebugLoc()); + Inst->insertBefore(InsertPt); + Type *NewTy = VectorType::get(Inst->getType(), 1); + Inst = CastInst::Create(Instruction::BitCast, Inst, NewTy, "", CurInst); + Inst->setDebugLoc(CurInst->getDebugLoc()); + + // Splat this value. + IRBuilder<> Builder(InsertPt); + Value *NewVal = Builder.CreateVectorSplat(CV->getNumOperands(), Inst); + + // Update i-th operand with newly created splat. 
+    CurInst->setOperand(i, NewVal);
+  }
+
+  SmallVector<Value *, 8> Vals;
+  bool HasConstExpr = false;
+  for (unsigned j = 0, N = CV->getNumOperands(); j < N; ++j) {
+    Value *Elt = CV->getOperand(j);
+    if (auto CE = dyn_cast<ConstantExpr>(Elt)) {
+      auto Inst = CE->getAsInstruction();
+      Inst->setDebugLoc(CurInst->getDebugLoc());
+      Inst->insertBefore(InsertPt);
+      Vals.push_back(Inst);
+      HasConstExpr = true;
+    } else
+      Vals.push_back(Elt);
+  }
+
+  if (HasConstExpr) {
+    Value *Val = UndefValue::get(CV->getType());
+    IRBuilder<> Builder(InsertPt);
+    for (unsigned j = 0, N = CV->getNumOperands(); j < N; ++j)
+      Val = Builder.CreateInsertElement(Val, Vals[j], j);
+    CurInst->setOperand(i, Val);
+  }
+}
+
+static void breakConstantExprs(Function *F) {
+  for (po_iterator<BasicBlock *> i = po_begin(&F->getEntryBlock()),
+                                 e = po_end(&F->getEntryBlock());
+       i != e; ++i) {
+    BasicBlock *BB = *i;
+    // The effect of this loop is that we process the instructions in reverse
+    // order, and we re-process anything inserted before the instruction
+    // being processed.
+    for (Instruction *CurInst = BB->getTerminator(); CurInst;) {
+      PHINode *PN = dyn_cast<PHINode>(CurInst);
+      for (unsigned i = 0, e = CurInst->getNumOperands(); i < e; ++i) {
+        auto InsertPt = PN ? PN->getIncomingBlock(i)->getTerminator() : CurInst;
+        Value *Op = CurInst->getOperand(i);
+        if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op)) {
+          Instruction *NewInst = CE->getAsInstruction();
+          NewInst->setDebugLoc(CurInst->getDebugLoc());
+          NewInst->insertBefore(CurInst);
+          CurInst->setOperand(i, NewInst);
+        } else if (isa<ConstantVector>(Op))
+          breakConstantVector(i, CurInst, InsertPt);
+      }
+      CurInst = CurInst == &BB->front() ? nullptr : CurInst->getPrevNode();
+    }
+  }
+}
+
+// For each function, compute the list of globals that need to be passed as
+// copy-in and copy-out arguments.
+void CMABI::AnalyzeGlobals(CallGraph &CG) {
+  Module &M = CG.getModule();
+  // Assuming the device module is self-contained, give global variables and
+  // functions internal linkage so that GlobalDCE can remove them if they are
+  // not used anywhere in the module.
+  for (auto &Global : M.getGlobalList()) {
+    if (!Global.isDeclaration())
+      Global.setLinkage(GlobalValue::InternalLinkage);
+  }
+  for (auto &F : M.getFunctionList()) {
+    // __cm_intrinsic_impl_* functions may be used to emulate mul/div etc.
+    if (GenXIntrinsic::getAnyIntrinsicID(&F) ==
+            GenXIntrinsic::not_any_intrinsic &&
+        !F.getName().contains("__cm_intrinsic_impl_") &&
+        !F.isDeclaration() && !F.hasDLLExportStorageClass())
+      F.setLinkage(GlobalValue::InternalLinkage);
+  }
+  // No global variables.
+  if (M.global_empty())
+    return;
+
+  // Store functions in a SetVector to keep order and make searching efficient.
+ SetVector Funcs; + for (auto I = scc_begin(&CG), IE = scc_end(&CG); I != IE; ++I) { + const std::vector &SCCNodes = *I; + for (const CallGraphNode *Node : SCCNodes) { + Function *F = Node->getFunction(); + if (F != nullptr && !F->isDeclaration()) { + Funcs.insert(F); + breakConstantExprs(F); + } + } + } + auto PrintIndexChecker = [](Use &IUI) { + CallInst *CI = dyn_cast(IUI.getUser()); + if (!CI) + return false; + Function *Callee = CI->getCalledFunction(); + if (!Callee) + return false; + unsigned IntrinID = GenXIntrinsic::getAnyIntrinsicID(Callee); + return (IntrinID == GenXIntrinsic::genx_print_format_index); + }; + auto UsesPrintChecker = [PrintIndexChecker](const Use &UI) { + auto *User = UI.getUser(); + return std::any_of(User->use_begin(), User->use_end(), PrintIndexChecker); + }; + const auto &DL = M.getDataLayout(); + auto ToLocalize = selectGlobalsToLocalize( + M.globals(), LocalizationLimit.getValue(), + [UsesPrintChecker](const GlobalVariable &GV) { + // don't localize global constant format string if it's used by print_index intrinsic + bool UsesPrintIndex = std::any_of(GV.use_begin(), GV.use_end(), UsesPrintChecker); + return (GV.hasAttribute(genx::FunctionMD::GenXVolatile) || + UsesPrintIndex); + }, + [&DL](const GlobalVariable &GV) { return calcGVWeight(GV, DL); }); + for (auto I = Funcs.begin(), E = Funcs.end(); I != E; ++I) { + Function *Fn = *I; + LLVM_DEBUG(dbgs() << "Visiting " << Fn->getName() << "\n"); + + // Collect globals used directly. + for (GlobalVariable *GV : ToLocalize) { + for (Value::use_iterator UI = GV->use_begin(), UE = GV->use_end(); + UI != UE; ++UI) { + Instruction *Inst = dyn_cast(UI->getUser()); + // not used in this function. + if (!Inst || Inst->getParent()->getParent() != Fn) + continue; + + // Find the global being used and populate this info. + for (unsigned i = 0, e = Inst->getNumOperands(); i < e; ++i) { + Value *Op = Inst->getOperand(i); + if (GlobalVariable *GV = dyn_cast(Op)) + addDirectGlobal(Fn, GV); + } + } + } + + // Collect globals used indirectly. + for (inst_iterator II = inst_begin(Fn), IE = inst_end(Fn); II != IE; ++II) { + Instruction *Inst = &*II; + // Ignore InvokeInst. + if (CallInst *CI = dyn_cast(Inst)) { + // Ignore indirect calls + if (Function *Callee = CI->getCalledFunction()) { + // Collect all globals from its callee. + if (!Callee->isDeclaration()) + addIndirectGlobal(Fn, Callee); + } + } + } + } +} + +/*********************************************************************** + * diagnoseOverlappingArgs : attempt to diagnose overlapping by-ref args + * + * The CM language spec says you are not allowed a call with two by-ref args + * that overlap. This is to give the compiler the freedom to implement with + * copy-in copy-out semantics or with an address register. + * + * This function attempts to diagnose code that breaks this restriction. For + * pointer args to the call, it attempts to track how values are loaded using + * the pointer (assumed to be an alloca of the temporary used for copy-in + * copy-out semantics), and how those values then get propagated through + * wrregions and stores. If any vector element in a wrregion or store is found + * that comes from more than one pointer arg, it is reported. + * + * This ignores variable index wrregions, and only traces through instructions + * with the same debug location as the call, so does not work with -g0. 
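+ *
+ * As a purely illustrative sketch (the value names below are invented for
+ * this comment, not taken from any real kernel), the kind of chain the pass
+ * traces for two by-ref args whose copy-out regions touch the same elements
+ * of a matrix %m looks like:
+ *
+ *   %a = load <8 x float>, <8 x float>* %arg1.ref      ; elements tagged arg 1
+ *   %b = load <8 x float>, <8 x float>* %arg2.ref      ; elements tagged arg 2
+ *   %w0 = call <16 x float> @llvm.genx.wrregionf(<16 x float> %m.old, <8 x float> %a, i32 0, i32 8, i32 1, i16 0, i32 undef, i1 true)
+ *   %w1 = call <16 x float> @llvm.genx.wrregionf(<16 x float> %w0, <8 x float> %b, i32 0, i32 8, i32 1, i16 16, i32 undef, i1 true)
+ *   store <16 x float> %w1, <16 x float>* %m.addr
+ *
+ * The first wrregion writes elements 0..7 with values tagged as arg 1; the
+ * second starts at byte offset 16 (element 4) and writes elements 4..11 with
+ * values tagged as arg 2, so elements 4..7 end up tagged by both args and an
+ * overlap is reported.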
+ */ +void CMABI::diagnoseOverlappingArgs(CallInst *CI) +{ + LLVM_DEBUG(dbgs() << "diagnoseOverlappingArgs " << *CI << "\n"); + auto DL = CI->getDebugLoc(); + if (!DL) + return; + std::map> ValMap; + SmallVector WorkList; + std::set InWorkList; + std::set> Reported; + // Using ArgIndex starting at 1 so we can reserve 0 to mean "element does not + // come from any by-ref arg". + for (unsigned ArgIndex = 1, NumArgs = CI->getNumArgOperands(); + ArgIndex <= NumArgs; ++ArgIndex) { + Value *Arg = CI->getOperand(ArgIndex - 1); + if (!Arg->getType()->isPointerTy()) + continue; + LLVM_DEBUG(dbgs() << "arg " << ArgIndex << ": " << *Arg << "\n"); + // Got a pointer arg. Find its loads (with the same debug loc). + for (auto ui = Arg->use_begin(), ue = Arg->use_end(); ui != ue; ++ui) { + auto LI = dyn_cast(ui->getUser()); + if (!LI || LI->getDebugLoc() != DL) + continue; + LLVM_DEBUG(dbgs() << " " << *LI << "\n"); + // For a load, create a map entry that says that every vector element + // comes from this arg. + unsigned NumElements = 1; + if (auto VT = dyn_cast(LI->getType())) + NumElements = VT->getNumElements(); + auto Entry = &ValMap[LI]; + Entry->resize(NumElements, ArgIndex); + // Add its users (with the same debug location) to the work list. + for (auto ui = LI->use_begin(), ue = LI->use_end(); ui != ue; ++ui) { + auto Inst = cast(ui->getUser()); + if (Inst->getDebugLoc() == DL) + if (InWorkList.insert(Inst).second) + WorkList.push_back(Inst); + } + } + } + // Process the work list. + while (!WorkList.empty()) { + auto Inst = WorkList.back(); + WorkList.pop_back(); + InWorkList.erase(Inst); + LLVM_DEBUG(dbgs() << "From worklist: " << *Inst << "\n"); + Value *Key = nullptr; + SmallVector TempVector; + SmallVectorImpl *VectorToMerge = nullptr; + if (auto SI = dyn_cast(Inst)) { + // Store: set the map entry using the store pointer as the key. It might + // be an alloca of a local variable, or a global variable. + // Strictly speaking this is not properly keeping track of what is being + // merged using load-wrregion-store for a non-SROAd local variable or a + // global variable. Instead it is just merging at the store itself, which + // is good enough for our purposes. + Key = SI->getPointerOperand(); + VectorToMerge = &ValMap[SI->getValueOperand()]; + } else if (auto BC = dyn_cast(Inst)) { + // Bitcast: calculate the new map entry. + Key = BC; + uint64_t OutElementSize = + BC->getType()->getScalarType()->getPrimitiveSizeInBits(); + uint64_t InElementSize = BC->getOperand(0) + ->getType() + ->getScalarType() + ->getPrimitiveSizeInBits(); + int LogRatio = countTrailingZeros(OutElementSize, ZB_Undefined) - + countTrailingZeros(InElementSize, ZB_Undefined); + auto OpndEntry = &ValMap[BC->getOperand(0)]; + if (!LogRatio) + VectorToMerge = OpndEntry; + else if (LogRatio > 0) { + // Result element type is bigger than input element type, so there are + // fewer result elements. Just use an arbitrarily chosen non-zero entry + // of the N input elements to set the 1 result element. + assert(!(OpndEntry->size() & ((1U << LogRatio) - 1))); + for (unsigned i = 0, e = OpndEntry->size(); i != e; i += 1U << LogRatio) { + unsigned FoundArgIndex = 0; + for (unsigned j = 0; j != 1U << LogRatio; ++j) + FoundArgIndex = std::max(FoundArgIndex, (unsigned)(*OpndEntry)[i + j]); + TempVector.push_back(FoundArgIndex); + } + VectorToMerge = &TempVector; + } else { + // Result element type is smaller than input element type, so there are + // multiple result elements per input element. 
+ for (unsigned i = 0, e = OpndEntry->size(); i != e; ++i) + for (unsigned j = 0; j != 1U << -LogRatio; ++j) + TempVector.push_back((*OpndEntry)[i]); + VectorToMerge = &TempVector; + } + } else if (auto CI = dyn_cast(Inst)) { + if (auto CF = CI->getCalledFunction()) { + switch (GenXIntrinsic::getGenXIntrinsicID(CF)) { + default: + break; + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrregioni: + // wrregion: As long as it is constant index, propagate the argument + // indices into the appropriate elements of the result. + if (auto IdxC = dyn_cast(CI->getOperand( + GenXIntrinsic::GenXRegion::WrIndexOperandNum))) { + unsigned Idx = 0; + if (!IdxC->isNullValue()) { + auto IdxCI = dyn_cast(IdxC); + if (!IdxCI) { + LLVM_DEBUG(dbgs() << "Ignoring variable index wrregion\n"); + break; + } + Idx = IdxCI->getZExtValue(); + } + Idx /= (CI->getType()->getScalarType()->getPrimitiveSizeInBits() / 8U); + // First copy the "old value" input to the map entry. + auto OpndEntry = &ValMap[CI->getOperand( + GenXIntrinsic::GenXRegion::OldValueOperandNum)]; + auto Entry = &ValMap[CI]; + Entry->clear(); + Entry->insert(Entry->begin(), OpndEntry->begin(), OpndEntry->end()); + // Then copy the "new value" elements according to the region. + TempVector.resize(CI->getType()->getVectorNumElements(), 0); + int VStride = cast(CI->getOperand( + GenXIntrinsic::GenXRegion::WrVStrideOperandNum))->getSExtValue(); + unsigned Width = cast(CI->getOperand( + GenXIntrinsic::GenXRegion::WrWidthOperandNum))->getZExtValue(); + int Stride = cast(CI->getOperand( + GenXIntrinsic::GenXRegion::WrStrideOperandNum))->getSExtValue(); + OpndEntry = &ValMap[CI->getOperand( + GenXIntrinsic::GenXRegion::NewValueOperandNum)]; + unsigned NumElements = OpndEntry->size(); + if (!NumElements) + break; + for (unsigned RowIdx = Idx, Row = 0, Col = 0, + NumRows = NumElements / Width;; Idx += Stride, ++Col) { + if (Col == Width) { + Col = 0; + if (++Row == NumRows) + break; + Idx = RowIdx += VStride; + } + TempVector[Idx] = (*OpndEntry)[Row * Width + Col]; + } + VectorToMerge = &TempVector; + Key = CI; + } + break; + } + } + } + if (!VectorToMerge) + continue; + auto Entry = &ValMap[Key]; + LLVM_DEBUG(dbgs() << "Merging :"; + for (unsigned i = 0; i != VectorToMerge->size(); ++i) + dbgs() << " " << (unsigned)(*VectorToMerge)[i]; + dbgs() << "\ninto " << Key->getName() << ":"; + for (unsigned i = 0; i != Entry->size(); ++i) + dbgs() << " " << (unsigned)(*Entry)[i]; + dbgs() << "\n"); + if (Entry->empty()) + Entry->insert(Entry->end(), VectorToMerge->begin(), VectorToMerge->end()); + else { + assert(VectorToMerge->size() == Entry->size()); + for (unsigned i = 0; i != VectorToMerge->size(); ++i) { + unsigned ArgIdx1 = (*VectorToMerge)[i]; + unsigned ArgIdx2 = (*Entry)[i]; + if (ArgIdx1 && ArgIdx2 && ArgIdx1 != ArgIdx2) { + LLVM_DEBUG(dbgs() << "By ref args overlap: args " << ArgIdx1 << " and " << ArgIdx2 << "\n"); + if (ArgIdx1 > ArgIdx2) + std::swap(ArgIdx1, ArgIdx2); + if (Reported.insert(std::pair(ArgIdx1, ArgIdx2)) + .second) { + // Not already reported. 
+            DiagnosticInfoOverlappingArgs Err(CI, "by reference arguments "
+                + Twine(ArgIdx1) + " and " + Twine(ArgIdx2) + " overlap",
+                DS_Error);
+            Inst->getContext().diagnose(Err);
+          }
+        }
+        (*Entry)[i] = std::max((*Entry)[i], (*VectorToMerge)[i]);
+      }
+    }
+    LLVM_DEBUG(dbgs() << "giving:";
+      for (unsigned i = 0; i != Entry->size(); ++i)
+        dbgs() << " " << (unsigned)(*Entry)[i];
+      dbgs() << "\n");
+    if (Key == Inst) {
+      // Not the case that we have a store and we are using the pointer as
+      // the key. In the other cases that do a merge (bitcast and wrregion),
+      // add users to the work list as long as they have the same debug loc.
+      for (auto ui = Inst->use_begin(), ue = Inst->use_end(); ui != ue; ++ui) {
+        auto User = cast<Instruction>(ui->getUser());
+        if (User->getDebugLoc() == DL)
+          if (InWorkList.insert(User).second)
+            WorkList.push_back(User);
+      }
+    }
+  }
+}
+
+/***********************************************************************
+ * DiagnosticInfoOverlappingArgs initializer from Instruction
+ *
+ * If the Instruction has a DebugLoc, then that is used for the error
+ * location.
+ * Otherwise, the location is unknown.
+ */
+DiagnosticInfoOverlappingArgs::DiagnosticInfoOverlappingArgs(Instruction *Inst,
+    const Twine &Desc, DiagnosticSeverity Severity)
+    : DiagnosticInfo(getKindID(), Severity), Line(0), Col(0)
+{
+  auto DL = Inst->getDebugLoc();
+  if (DL) {
+    Filename = DL.get()->getFilename();
+    Line = DL.getLine();
+    Col = DL.getCol();
+  }
+  Description = Desc.str();
+}
+
+/***********************************************************************
+ * DiagnosticInfoOverlappingArgs::print : print the error/warning message
+ */
+void DiagnosticInfoOverlappingArgs::print(DiagnosticPrinter &DP) const
+{
+  std::string Loc(
+        (Twine(!Filename.empty() ? Filename : "<unknown>")
+        + ":" + Twine(Line)
+        + (!Col ? Twine() : Twine(":") + Twine(Col))
+        + ": ")
+      .str());
+  DP << Loc << Description;
+}
+
+
+char CMABI::ID = 0;
+INITIALIZE_PASS_BEGIN(CMABI, "cmabi", "Fix ABI issues for the genx backend", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(CMABI, "cmabi", "Fix ABI issues for the genx backend", false, false)
+
+Pass *llvm::createCMABIPass() { return new CMABI(); }
+
+namespace {
+
+// A well-formed pattern for passing an argument by reference.
+//
+// (Alloca)
+// %argref1 = alloca <8 x float>, align 32
+//
+// (CopyInRegion/CopyInStore)
+// %rdr = tail call <8 x float> @llvm.genx.rdregionf(<960 x float> %m, i32 0, i32 8, i32 1, i16 0, i32 undef)
+// call void @llvm.genx.vstore(<8 x float> %rdr, <8 x float>* %argref)
+//
+// (CopyOutRegion/CopyOutLoad)
+// %ld = call <8 x float> @llvm.genx.vload(<8 x float>* %argref)
+// %wr = call <960 x float> @llvm.genx.wrregionf(<960 x float> %m, <8 x float> %ld, i32 0, i32 8, i32 1, i16 0, i32 undef, i1 true)
+//
+struct ArgRefPattern {
+  // Alloca of this reference argument.
+  AllocaInst *Alloca;
+
+  // The copy-in value.
+  CallInst *CopyInRegion;
+  CallInst *CopyInStore;
+
+  // The copy-out value.
+  CallInst *CopyOutLoad;
+  CallInst *CopyOutRegion;
+
+  // Load and store instructions on the arg alloca.
+  SmallVector<CallInst *, 8> VLoads;
+  SmallVector<CallInst *, 8> VStores;
+
+  explicit ArgRefPattern(AllocaInst *AI)
+      : Alloca(AI), CopyInRegion(nullptr), CopyInStore(nullptr),
+        CopyOutLoad(nullptr), CopyOutRegion(nullptr) {}
+
+  // Match a copy-in and copy-out pattern. Return true on success.
+ bool match(DominatorTree &DT, PostDominatorTree &PDT); + void process(); +}; + +struct CMLowerVLoadVStore : public FunctionPass { + static char ID; + CMLowerVLoadVStore() : FunctionPass(ID) { + initializeCMLowerVLoadVStorePass(*PassRegistry::getPassRegistry()); + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.setPreservesCFG(); + } + + virtual bool runOnFunction(Function &F) override; + +private: + bool promoteAllocas(Function &F); + bool lowerLoadStore(Function &F); +}; + +} // namespace + +char CMLowerVLoadVStore::ID = 0; +INITIALIZE_PASS_BEGIN(CMLowerVLoadVStore, "CMLowerVLoadVStore", + "Lower CM reference vector loads and stores", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_END(CMLowerVLoadVStore, "CMLowerVLoadVStore", + "Lower CM reference vector loads and stores", false, false) + + +bool CMLowerVLoadVStore::runOnFunction(Function &F) { + bool Changed = false; + Changed |= promoteAllocas(F); + Changed |= lowerLoadStore(F); + return Changed; +} + +// Lower remaining vector load/store intrinsic calls into normal load/store +// instructions. +bool CMLowerVLoadVStore::lowerLoadStore(Function &F) { + auto M = F.getParent(); + DenseMap AllocaMap; + // collect all the allocas that store the address of genx-volatile variable + for (auto& G : M->getGlobalList()) { + if (!G.hasAttribute("genx_volatile")) + continue; + std::vector WL; + for (auto UI = G.user_begin(); UI != G.user_end();) { + auto U = *UI++; + WL.push_back(U); + } + + while (!WL.empty()) { + auto Inst = WL.back(); + WL.pop_back(); + if (auto CE = dyn_cast(Inst)) { + for (auto UI = CE->user_begin(); UI != CE->user_end();) { + auto U = *UI++; + WL.push_back(U); + } + } + else if (auto CI = dyn_cast(Inst)) { + for (auto UI = CI->user_begin(); UI != CI->user_end();) { + auto U = *UI++; + WL.push_back(U); + } + } + else if (auto SI = dyn_cast(Inst)) { + auto Ptr = SI->getPointerOperand()->stripPointerCasts(); + if (auto PI = dyn_cast(Ptr)) { + AllocaMap[PI] = &G; + } + } + } + } + + // lower all vload/vstore into normal load/store. + std::vector ToErase; + for (Instruction &Inst : instructions(F)) { + if (GenXIntrinsic::isVLoadStore(&Inst)) { + auto *Ptr = Inst.getOperand(0); + if (GenXIntrinsic::isVStore(&Inst)) + Ptr = Inst.getOperand(1); + auto AS0 = cast(Ptr->getType())->getAddressSpace(); + Ptr = Ptr->stripPointerCasts(); + auto GV = dyn_cast(Ptr); + if (GV) { + if (!GV->hasAttribute("genx_volatile")) + GV = nullptr; + } + else if (auto LI = dyn_cast(Ptr)) { + auto PV = LI->getPointerOperand()->stripPointerCasts(); + if (auto PI = dyn_cast(PV)) { + if (AllocaMap.find(PI) != AllocaMap.end()) { + GV = AllocaMap[PI]; + } + } + } + if (GV == nullptr) { + // change to load/store + IRBuilder<> Builder(&Inst); + if (GenXIntrinsic::isVStore(&Inst)) + Builder.CreateStore(Inst.getOperand(0), Inst.getOperand(1)); + else { + auto LI = Builder.CreateLoad(Inst.getOperand(0), Inst.getName()); + LI->setDebugLoc(Inst.getDebugLoc()); + Inst.replaceAllUsesWith(LI); + } + ToErase.push_back(&Inst); + } + else { + // change to vload/vstore that has the same address space as + // the global-var in order to clean up unnecessary addr-cast. 
+ auto AS1 = GV->getType()->getAddressSpace(); + if (AS0 != AS1) { + IRBuilder<> Builder(&Inst); + if (GenXIntrinsic::isVStore(&Inst)) { + auto PtrTy = cast(Inst.getOperand(1)->getType()); + PtrTy = PointerType::get(PtrTy->getElementType(), AS1); + auto PtrCast = Builder.CreateAddrSpaceCast(Inst.getOperand(1), PtrTy); + Type* Tys[] = { Inst.getOperand(0)->getType(), + PtrCast->getType() }; + Value* Args[] = { Inst.getOperand(0), PtrCast }; + Function* Fn = GenXIntrinsic::getGenXDeclaration( + F.getParent(), GenXIntrinsic::genx_vstore, Tys); + Builder.CreateCall(Fn, Args, Inst.getName()); + } + else { + auto PtrTy = cast(Inst.getOperand(0)->getType()); + PtrTy = PointerType::get(PtrTy->getElementType(), AS1); + auto PtrCast = Builder.CreateAddrSpaceCast(Inst.getOperand(0), PtrTy); + Type* Tys[] = { Inst.getType(), PtrCast->getType() }; + Function* Fn = GenXIntrinsic::getGenXDeclaration( + F.getParent(), GenXIntrinsic::genx_vload, Tys); + Value* VLoad = Builder.CreateCall(Fn, PtrCast, Inst.getName()); + Inst.replaceAllUsesWith(VLoad); + } + ToErase.push_back(&Inst); + } + } + } + } + + for (auto Inst : ToErase) { + Inst->eraseFromParent(); + } + + return !ToErase.empty(); +} + +static bool isBitCastForLifetimeMarker(Value *V) { + if (!V || !isa(V)) + return false; + for (auto U : V->users()) { + unsigned IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(U); + if (IntrinsicID != Intrinsic::lifetime_start && + IntrinsicID != Intrinsic::lifetime_end) + return false; + } + return true; +} + +// Check whether two values are bitwise identical. +static bool isBitwiseIdentical(Value *V1, Value *V2) { + assert(V1 && V2 && "null value"); + if (V1 == V2) + return true; + if (BitCastInst *BI = dyn_cast(V1)) + V1 = BI->getOperand(0); + if (BitCastInst *BI = dyn_cast(V2)) + V2 = BI->getOperand(0); + + // Special case arises from vload/vstore. + if (GenXIntrinsic::isVLoad(V1) && GenXIntrinsic::isVLoad(V2)) { + auto L1 = cast(V1); + auto L2 = cast(V2); + // Check if loading from the same location. + if (L1->getOperand(0) != L2->getOperand(0)) + return false; + + // Check if this pointer is local and only used in vload/vstore. + Value *Addr = L1->getOperand(0); + if (!isa(Addr)) + return false; + for (auto UI : Addr->users()) { + if (isa(UI)) { + for (auto U : UI->users()) { + unsigned IntrinsicID = GenXIntrinsic::getAnyIntrinsicID(U); + if (IntrinsicID != Intrinsic::lifetime_start && + IntrinsicID != Intrinsic::lifetime_end) + return false; + } + } else { + if (!GenXIntrinsic::isVLoadStore(UI)) + return false; + } + } + + // Check if there is no store to the same location in between. + if (L1->getParent() != L2->getParent()) + return false; + BasicBlock::iterator I = L1->getParent()->begin(); + for (; &*I != L1 && &*I != L2; ++I) + /*empty*/; + assert(&*I == L1 || &*I == L2); + auto IEnd = (&*I == L1) ? L2->getIterator() : L1->getIterator(); + for (; I != IEnd; ++I) { + Instruction *Inst = &*I; + if (GenXIntrinsic::isVStore(Inst) && Inst->getOperand(1) == Addr) + return false; + } + + // OK. + return true; + } + + // Cannot prove. + return false; +} + +bool ArgRefPattern::match(DominatorTree &DT, PostDominatorTree &PDT) { + assert(Alloca); + if (Alloca->use_empty()) + return false; + + // check if all users are load/store. 
+ SmallVector Loads; + SmallVector Stores; + for (auto U : Alloca->users()) + if (GenXIntrinsic::isVLoad(U)) + Loads.push_back(cast(U)); + else if (GenXIntrinsic::isVStore(U)) + Stores.push_back(cast(U)); + else if (isBitCastForLifetimeMarker(U)) + continue; + else + return false; + + if (Loads.empty() || Stores.empty()) + return false; + + // find a unique store that dominates all other users if exists. + auto Cmp = [&](CallInst *L, CallInst *R) { return DT.dominates(L, R); }; + CopyInStore = *std::min_element(Stores.begin(), Stores.end(), Cmp); + CopyInRegion = dyn_cast(CopyInStore->getArgOperand(0)); + if (!CopyInRegion || !CopyInRegion->hasOneUse() || !GenXIntrinsic::isRdRegion(CopyInRegion)) + return false; + + for (auto SI : Stores) + if (SI != CopyInStore && !Cmp(CopyInStore, SI)) + return false; + for (auto LI : Loads) + if (LI != CopyInStore && !Cmp(CopyInStore, LI)) + return false; + + // find a unique load that post-dominates all other users if exists. + auto PostCmp = [&](CallInst *L, CallInst *R) { + BasicBlock *LBB = L->getParent(); + BasicBlock *RBB = R->getParent(); + if (LBB != RBB) + return PDT.dominates(LBB, RBB); + + // Loop through the basic block until we find L or R. + BasicBlock::const_iterator I = LBB->begin(); + for (; &*I != L && &*I != R; ++I) + /*empty*/; + + return &*I == R; + }; + CopyOutLoad = *std::min_element(Loads.begin(), Loads.end(), PostCmp); + + // Expect copy-out load has one or zero use. It is possible there + // is no use as the region becomes dead after this subroutine call. + // + if (!CopyOutLoad->use_empty()) { + if (!CopyOutLoad->hasOneUse()) + return false; + CopyOutRegion = dyn_cast(CopyOutLoad->user_back()); + if (!GenXIntrinsic::isWrRegion(CopyOutRegion)) + return false; + } + + for (auto SI : Stores) + if (SI != CopyOutLoad && !PostCmp(CopyOutLoad, SI)) + return false; + for (auto LI : Loads) + if (LI != CopyOutLoad && !PostCmp(CopyOutLoad, LI)) + return false; + + // Ensure read-in and write-out to the same region. It is possible that region + // collasping does not simplify region accesses completely. + // Probably we should assert on region descriptors. + if (CopyOutRegion && + !isBitwiseIdentical(CopyInRegion->getOperand(0), + CopyOutRegion->getOperand(0))) + return false; + + // It should be OK to rewrite all loads and stores into the argref. + VLoads.swap(Loads); + VStores.swap(Stores); + return true; +} + +void ArgRefPattern::process() { + // 'Spill' the base region into memory during rewriting. + IRBuilder<> Builder(Alloca); + Function *RdFn = CopyInRegion->getCalledFunction(); + assert(RdFn); + Type *BaseAllocaTy = RdFn->getFunctionType()->getParamType(0); + AllocaInst *BaseAlloca = Builder.CreateAlloca(BaseAllocaTy, nullptr, + Alloca->getName() + ".refprom"); + + Builder.SetInsertPoint(CopyInRegion); + Builder.CreateStore(CopyInRegion->getArgOperand(0), BaseAlloca); + + if (CopyOutRegion) { + Builder.SetInsertPoint(CopyOutRegion); + CopyOutRegion->setArgOperand(0, Builder.CreateLoad(BaseAlloca)); + } + + // Rewrite all stores. + for (auto ST : VStores) { + Builder.SetInsertPoint(ST); + Value *OldVal = Builder.CreateLoad(BaseAlloca); + // Always use copy-in region arguments as copy-out region + // arguments do not dominate this store. 
+ auto M = ST->getParent()->getParent()->getParent(); + Value *Args[] = {OldVal, + ST->getArgOperand(0), + CopyInRegion->getArgOperand(1), // vstride + CopyInRegion->getArgOperand(2), // width + CopyInRegion->getArgOperand(3), // hstride + CopyInRegion->getArgOperand(4), // offset + CopyInRegion->getArgOperand(5), // parent width + ConstantInt::getTrue(Type::getInt1Ty(M->getContext()))}; + auto ID = OldVal->getType()->isFPOrFPVectorTy() ? GenXIntrinsic::genx_wrregionf + : GenXIntrinsic::genx_wrregioni; + Type *Tys[] = {Args[0]->getType(), Args[1]->getType(), Args[5]->getType(), + Args[7]->getType()}; + auto WrFn = GenXIntrinsic::getGenXDeclaration(M, ID, Tys); + Value *NewVal = Builder.CreateCall(WrFn, Args); + Builder.CreateStore(NewVal, BaseAlloca); + ST->eraseFromParent(); + } + + // Rewrite all loads + for (auto LI : VLoads) { + if (LI->use_empty()) + continue; + + Builder.SetInsertPoint(LI); + Value *SrcVal = Builder.CreateLoad(BaseAlloca); + SmallVector Args(CopyInRegion->arg_operands()); + Args[0] = SrcVal; + Value *Val = Builder.CreateCall(RdFn, Args); + LI->replaceAllUsesWith(Val); + LI->eraseFromParent(); + } +} + +// Allocas that are used in reference argument passing may be promoted into the +// base region. +bool CMLowerVLoadVStore::promoteAllocas(Function &F) { + auto &DT = getAnalysis().getDomTree(); + auto &PDT = getAnalysis().getPostDomTree(); + bool Modified = false; + + SmallVector Allocas; + for (auto &Inst : F.front().getInstList()) { + if (auto AI = dyn_cast(&Inst)) + Allocas.push_back(AI); + else + break; + } + + for (auto AI : Allocas) { + ArgRefPattern ArgRef(AI); + if (ArgRef.match(DT, PDT)) { + ArgRef.process(); + Modified = true; + } + } + + return Modified; +} + +Pass *llvm::createCMLowerVLoadVStorePass() { return new CMLowerVLoadVStore; } diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMImpParam.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMImpParam.cpp new file mode 100644 index 000000000000..67ef5b5541e7 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMImpParam.cpp @@ -0,0 +1,701 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+
+
+======================= end_copyright_notice ==================================*/
+
+//===----------------------------------------------------------------------===//
+//
+/// CMImpParam
+/// ----------
+///
+/// As well as the explicit kernel args declared in the CM kernel function,
+/// certain implicit args are also passed. These fall into two categories:
+///
+/// 1. fields set up in r0 by the hardware, depending on which dispatch method
+///    is being used (e.g. media walker);
+///
+/// 2. implicit args set up along with the explicit args in CURBE by the CM
+///    runtime.
+///
+/// The r0 implicit args are represented in LLVM IR by special intrinsics, and
+/// the GenX backend maps these onto the corresponding reserved vISA registers.
+///
+/// The CM runtime implicit args in (2) above were, in vISA 3.2 and earlier,
+/// also represented by special LLVM intrinsics and reserved vISA registers.
+/// Because they are specific to the CM runtime, and not any other user of
+/// vISA, vISA 3.3 removed them; instead they are handled much like other
+/// kernel args in the input table.
+///
+/// The *kind* byte in the input table has two fields:
+///
+/// * the *category* field, saying whether the input is general/surface/etc.;
+///
+/// * the *provenance* field, saying whether the input is an explicit one from
+///   the CM source, or an implicit one generated by this pass. This is a
+///   protocol agreed between the CM compiler (in fact this pass) and the CM
+///   runtime.
+///
+/// Within the CM compiler, the vISA input table for a kernel is represented by
+/// an array of kind bytes, one per argument of the kernel function.
+///
+/// Clang codegen still generates the special intrinsics for these CM runtime
+/// implicit args. It is the job of this CMImpParam pass to transform those
+/// intrinsics:
+///
+/// * where the intrinsic for a CM runtime implicit arg is used somewhere:
+///
+///   - a global variable is created for it;
+///
+///   - for any kernel that uses the implicit arg (or can reach a subroutine that
+///     uses it), the implicit arg is added to the input table in the kernel
+///     metadata and as an extra arg to the definition of the kernel itself,
+///     and its value is stored into the global variable;
+///
+/// * each use of the intrinsic for a CM runtime implicit arg is transformed into
+///   a load of the corresponding global variable.
+///
+/// Like any other global variable, the subsequent CMABI pass turns the global
+/// variable for an implicit arg into local variable(s) passed into subroutines
+/// if necessary.
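+///
+/// As an illustrative sketch (the names below are invented for this comment,
+/// not copied from a real module), a use of the local-size intrinsic such as
+///
+///   %ls = call <3 x i32> @llvm.genx.local.size.v3i32()
+///
+/// is rewritten into a load of an internal global
+///
+///   %ls = load <3 x i32>, <3 x i32>* @__imparg_llvm.genx.local.size
+///
+/// and every kernel that (transitively) needs that value gains one extra
+/// argument, whose value is stored to @__imparg_llvm.genx.local.size on entry
+/// to the kernel, with a matching implicit *kind* byte appended to the
+/// kernel's argument metadata.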
+/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "cmimpparam" +#include "vc/GenXOpts/GenXOpts.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/InitializePasses.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include + +using namespace llvm; + +namespace llvm { +void initializeCMImpParamPass(PassRegistry &); +} // namespace llvm + +namespace { + +class ImplicitUseInfo { +public: + typedef std::set ImplicitSetTy; + + explicit ImplicitUseInfo(Function *F) : Fn(F) {} + ImplicitUseInfo() : Fn(nullptr) {} + + Function *getFunction() const { return Fn; } + + bool empty() const { return Implicits.empty(); } + ImplicitSetTy &getImplicits() { return Implicits; } + const ImplicitSetTy &getImplicits() const { return Implicits; } + + // \brief Add an implicit arg intrinsic + void addImplicit(unsigned IID) { Implicits.insert(IID); } + + void merge(const ImplicitUseInfo &IUI) { + Implicits.insert(IUI.Implicits.begin(), IUI.Implicits.end()); + } + + void dump() const { print(dbgs()); } + + void print(raw_ostream &OS, unsigned depth = 0) const { + for (auto IID : Implicits) { + OS.indent(depth) << GenXIntrinsic::getAnyName(IID, None) << "\n"; + } + } + +private: + // \brief The function being analyzed + Function *Fn; + + // \brief Implicit arguments used + ImplicitSetTy Implicits; +}; + +struct CMImpParam : public ModulePass { + static char ID; + bool IsCmRT; + + CMImpParam(bool isCmRT = true) : ModulePass(ID), IsCmRT(isCmRT) { + initializeCMImpParamPass(*PassRegistry::getPassRegistry()); + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + } + + virtual StringRef getPassName() const { return "CM Implicit Params"; } + + virtual bool runOnModule(Module &M); + + void dump() const { print(dbgs()); } + virtual void print(raw_ostream &OS, const Module *M = nullptr) const; + +private: + void replaceWithGlobal(CallInst *CI, unsigned IID); + bool AnalyzeImplicitUse(Module &M); + void MergeImplicits(ImplicitUseInfo &implicits, Function *F); + void PropagateImplicits(Function *F, Module &M, + ImplicitUseInfo &implicits); + CallGraphNode *ProcessKernel(Function *F); + + static Value *getValue(Metadata *M) { + if (auto VM = dyn_cast(M)) + return VM->getValue(); + return nullptr; + } + + // Convert to implicit thread payload related intrinsics. 
+ bool ConvertToOCLPayload(Module &M); + + uint32_t MapToKind(unsigned IID) { + using namespace genx; + switch (IID) { + default: + return KernelMetadata::AK_NORMAL; + case GenXIntrinsic::genx_print_buffer: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_OCL_PRINTF_BUFFER; + case GenXIntrinsic::genx_local_size: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_LOCAL_SIZE; + case GenXIntrinsic::genx_local_id: + case GenXIntrinsic::genx_local_id16: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_LOCAL_ID; + case GenXIntrinsic::genx_group_count: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_GROUP_COUNT; + case GenXIntrinsic::genx_get_scoreboard_deltas: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_SB_DELTAS; + case GenXIntrinsic::genx_get_scoreboard_bti: + return KernelMetadata::AK_SURFACE | KernelMetadata::IMP_SB_BTI; + case GenXIntrinsic::genx_get_scoreboard_depcnt: + return KernelMetadata::AK_SURFACE | KernelMetadata::IMP_SB_DEPCNT; + case GenXIntrinsic::genx_local_id_x: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_OCL_LOCAL_ID_X; + case GenXIntrinsic::genx_local_id_y: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_OCL_LOCAL_ID_Y; + case GenXIntrinsic::genx_local_id_z: + return KernelMetadata::AK_NORMAL | KernelMetadata::IMP_OCL_LOCAL_ID_Z; + case GenXIntrinsic::genx_group_or_local_size: + return KernelMetadata::AK_NORMAL | + KernelMetadata::IMP_OCL_GROUP_OR_LOCAL_SIZE; + } + return KernelMetadata::AK_NORMAL; + } + + // \brief Returns the implicit use info associated with a function + ImplicitUseInfo &getImplicitUseInfo(Function *F) { + if (!ImplicitsInfo.count(F)) { + ImplicitUseInfo *IUI = new ImplicitUseInfo(F); + ImplicitsInfoObjs.push_back(IUI); + ImplicitsInfo[F] = IUI; + return *IUI; + } + return *ImplicitsInfo[F]; + } + + // \brief Returns the implict use info associated with a function (kernel) + // and also creates a new one that represents the total implicits for the + // kernel as a whole (stored in a different object) + ImplicitUseInfo &getImplicitUseInfoKernel(Function *F) { + assert(Kernels.count(F)); + + if (KernelsInfo.count(F)) { + // Kernel already processed + return *KernelsInfo[F]; + } + + ImplicitUseInfo *IUI = new ImplicitUseInfo(F); + ImplicitsInfoObjs.push_back(IUI); + KernelsInfo[F] = IUI; + + if (ImplicitsInfo.count(F)) { + IUI->merge(*ImplicitsInfo[F]); + } + + return *IUI; + } + + const ImplicitUseInfo *implicitUseInfoKernelExist(Function *F) const { + if (KernelsInfo.count(F)) { + auto CI = KernelsInfo.find(F); + return CI->second; + } + + return nullptr; + } + + void addImplicit(Function *F, unsigned IID) { + getImplicitUseInfo(F).addImplicit(IID); + } + + GlobalVariable *getIIDGlobal(Function *F, unsigned IID) { + if (GlobalsMap.count(IID)) + return GlobalsMap[IID]; + + Type * Ty = getIntrinRetType(F->getContext(), IID); + assert(Ty); + GlobalVariable *NewVar = new GlobalVariable( + *F->getParent(), Ty, false, + GlobalVariable::InternalLinkage, + Constant::getNullValue(Ty), + "__imparg_" + GenXIntrinsic::getAnyName(IID, None)); + GlobalsMap[IID] = NewVar; + + return NewVar; + } + + Type *getIntrinRetType(LLVMContext &Context, unsigned IID) { + switch (IID) { + case GenXIntrinsic::genx_print_buffer: + return llvm::Type::getInt64Ty(Context); + case GenXIntrinsic::genx_local_id: + case GenXIntrinsic::genx_local_size: + case GenXIntrinsic::genx_group_count: + return llvm::VectorType::get(llvm::Type::getInt32Ty(Context), 3); + case GenXIntrinsic::genx_local_id16: + return 
llvm::VectorType::get(llvm::Type::getInt16Ty(Context), 3); + default: + // Should be able to extract the type from the intrinsic + // directly as no overloading is required (if it is then + // you need to define specific type in a case statement above) + FunctionType *FTy = dyn_cast_or_null( + GenXIntrinsic::getAnyType(Context, IID)); + if (FTy) + return FTy->getReturnType(); + } + return nullptr; + } + + // This map captures all implicit uses to be transformed + SmallDenseMap ImplicitsInfo; + + // This map captures all implicit uses that are required for a kernel + // (includes sub function uses) + SmallDenseMap KernelsInfo; + + // All kernels (entry points) in module being processed + SmallPtrSet Kernels; + + // Already visited functions + SmallPtrSet AlreadyVisited; + + // ImplicitUseInfo objects created + SmallVector ImplicitsInfoObjs; + + // Functions that contain implicit arg intrinsics + SmallPtrSet ContainImplicit; + + // GlobalVariables that have been created for an intrinsic + SmallDenseMap GlobalsMap; +}; + +} // namespace + +bool CMImpParam::runOnModule(Module &M) { + bool changed = false; + + // Apply necessary changes if kernels are compiled for OpenCL runtime. + changed |= ConvertToOCLPayload(M); + + // Analyze functions for implicit use intrinsic invocation + changed |= AnalyzeImplicitUse(M); + + // Collect all CM kernels from named metadata and also traverse the call graph + // to determine what the total implicit uses are for the top level kernels + if (NamedMDNode *Named = M.getNamedMetadata(genx::FunctionMD::GenXKernels)) { + for (unsigned I = 0, E = Named->getNumOperands(); I != E; ++I) { + MDNode *Node = Named->getOperand(I); + if (auto F = dyn_cast_or_null( + getValue(Node->getOperand(genx::KernelMDOp::FunctionRef)))) { + Kernels.insert(F); + AlreadyVisited.clear(); + ImplicitUseInfo &implicits = getImplicitUseInfoKernel(F); + PropagateImplicits(F, M, implicits); + // for OCL/L0 RT we should unconditionally add + // implicit PRIVATE_BASE argument which is not supported on CM RT + if (!implicits.empty() || !IsCmRT) { + ProcessKernel(F); + changed |= true; + } + } + } + } + for (ImplicitUseInfo *Obj : ImplicitsInfoObjs) + delete Obj; + + return changed; +} + +// Replace the given instruction with a load from a global +void CMImpParam::replaceWithGlobal(CallInst *CI, unsigned IID) { + GlobalVariable *GV = getIIDGlobal(CI->getParent()->getParent(), IID); + LoadInst *Load = new LoadInst(GV, GV->getName() + ".val", CI); + CI->replaceAllUsesWith(Load); +} + +// For each function, see if it uses an intrinsic that in turn requires an +// implicit kernel argument +// (such as llvm.genx.local.size) +bool CMImpParam::AnalyzeImplicitUse(Module &M) { + bool changed = false; + + for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { + Function *Fn = &*I; + LLVM_DEBUG(dbgs() << "AnalyzeImplicitUse visiting " << Fn->getName() << "\n"); + + bool implicitUse = false; + + SmallVector ToErase; + + // FIXME I think this should scan function declarations to find the implicit + // arg intrinsics, then scan their uses, instead of scanning the whole code + // to find calls to them. 
+ for (inst_iterator II = inst_begin(Fn), IE = inst_end(Fn); II != IE; ++II) { + Instruction *Inst = &*II; + if (CallInst *CI = dyn_cast(Inst)) { + if (Function *Callee = CI->getCalledFunction()) { + auto IID = GenXIntrinsic::getAnyIntrinsicID(Callee); + if (GenXIntrinsic::isAnyNonTrivialIntrinsic(IID)) { + switch (IID) { + case GenXIntrinsic::genx_local_size: + case GenXIntrinsic::genx_local_id: + case GenXIntrinsic::genx_local_id16: + case GenXIntrinsic::genx_group_count: + case GenXIntrinsic::genx_get_scoreboard_deltas: + case GenXIntrinsic::genx_get_scoreboard_bti: + case GenXIntrinsic::genx_get_scoreboard_depcnt: + case GenXIntrinsic::genx_local_id_x: + case GenXIntrinsic::genx_local_id_y: + case GenXIntrinsic::genx_local_id_z: + case GenXIntrinsic::genx_group_or_local_size: + case GenXIntrinsic::genx_print_buffer: + LLVM_DEBUG(dbgs() << "AnalyzeImplicitUse found " + << GenXIntrinsic::getGenXName((GenXIntrinsic::ID)IID, None)); + addImplicit(Fn, IID); + implicitUse = true; + + // Replace the intrinsic with a load of a global at this point + replaceWithGlobal(CI, IID); + ToErase.push_back(CI); + changed = true; + break; + default: + // Ignore (default added to prevent compiler warnings) + break; + } + } + } + } + } + + for (auto CI : ToErase) + CI->eraseFromParent(); + + // Mark this function as containing an implicit use intrinsic + if (implicitUse) + ContainImplicit.insert(Fn); + } + + return changed; +} + +// Convert to implicit thread payload related intrinsics. +bool CMImpParam::ConvertToOCLPayload(Module &M) { + // Check if this kernel is compiled for OpenCL runtime. + bool DoConversion = false; + + if (NamedMDNode *Named = M.getNamedMetadata(genx::FunctionMD::GenXKernels)) { + for (unsigned I = 0, E = Named->getNumOperands(); I != E; ++I) { + MDNode *Node = Named->getOperand(I); + auto F = dyn_cast_or_null( + getValue(Node->getOperand(genx::KernelMDOp::FunctionRef))); + if (F && (F->hasFnAttribute(genx::FunctionMD::OCLRuntime) || !IsCmRT)) { + DoConversion = true; + break; + } + } + } + + if (!DoConversion) + return false; + + bool Changed = false; + auto getFn = [=, &M](unsigned ID, Type *Ty) { + return M.getFunction(GenXIntrinsic::getAnyName(ID, Ty)); + }; + + // Convert genx_local_id -> zext(genx_local_id16) + Type *Ty32 = VectorType::get(Type::getInt32Ty(M.getContext()), 3); + Type *Ty16 = VectorType::get(Type::getInt16Ty(M.getContext()), 3); + if (auto LIDFn = getFn(GenXIntrinsic::genx_local_id, Ty32)) { + Function *LID16 = GenXIntrinsic::getGenXDeclaration( + &M, GenXIntrinsic::genx_local_id16, Ty16); + for (auto UI = LIDFn->user_begin(); UI != LIDFn->user_end();) { + auto UInst = dyn_cast(*UI++); + if (UInst) { + IRBuilder<> Builder(UInst); + Value *Val = Builder.CreateCall(LID16); + Val = Builder.CreateZExt(Val, Ty32); + UInst->replaceAllUsesWith(Val); + UInst->eraseFromParent(); + Changed = true; + } + } + } + return Changed; +} + +// Merge implicit uses from the supplied function with implicit set passed in +void CMImpParam::MergeImplicits(ImplicitUseInfo &implicits, Function *F) { + assert(ImplicitsInfo.count(F) && "Function not found in implicits info map"); + auto IInfo = ImplicitsInfo[F]; + implicits.merge(*IInfo); +} + +// Determine if the named function uses any functions tagged with implicit use +// in the call-graph +void CMImpParam::PropagateImplicits(Function *F, Module &M, + ImplicitUseInfo &implicits) { + // Traverse the call graph from the Kernel to determine what implicits are + // used + CallGraph &CG = getAnalysis().getCallGraph(); + + // If this node 
has already been processed then return immediately + if (AlreadyVisited.count(F)) + return; + + // Add this node to the already visited list + AlreadyVisited.insert(F); + + // Start the traversal + CallGraphNode *N = CG[F]; + // Inspect all children (recursive) + for (auto Children : *N) { + auto Func = Children.second->getFunction(); + // Does this function have implicit arg use? + if (ContainImplicit.count(Func)) { + // Yes - add the implicits it contains to the set so far + MergeImplicits(implicits, Func); + } + // Also recursively process children of this node + PropagateImplicits(Func, M, implicits); + } +} + +// Process a kernel - loads from a global (and the globals) have already been +// added if required elsewhere (in doInitialization) +// We've already determined that this is a kernel and that it requires some +// implicit arguments adding +CallGraphNode *CMImpParam::ProcessKernel(Function *F) { + LLVMContext &Context = F->getContext(); + + assert(Kernels.count(F) && "ProcessKernel invoked on non-kernel CallGraphNode"); + + AttributeList AttrVec; + const AttributeList &PAL = F->getAttributes(); + + // Determine the new argument list + SmallVector ArgTys; + + // First transfer all the explicit arguments from the old kernel + unsigned ArgIndex = 0; + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++ArgIndex) { + ArgTys.push_back(I->getType()); + AttributeSet attrs = PAL.getParamAttributes(ArgIndex); + if (attrs.hasAttributes()) { + AttrBuilder B(attrs); + AttrVec = AttrVec.addParamAttributes(Context, ArgIndex, B); + } + } + + bool UsesImplicits = KernelsInfo.count(F) > 0; + + // Now add all the implicit arguments + if (UsesImplicits) { + ImplicitUseInfo *IUI = KernelsInfo[F]; + for (unsigned IID : IUI->getImplicits()) { + ArgTys.push_back(getIntrinRetType(Context, IID)); + // TODO: Might need to also add any attributes from the intrinsic at some + // point + } + } + if (!IsCmRT) { + // PRIVATE_BASE arg + ArgTys.push_back(Type::getInt64Ty(F->getContext())); + } + + FunctionType *NFTy = FunctionType::get(F->getReturnType(), ArgTys, false); + assert((NFTy != F->getFunctionType()) && + "type out of sync, expect bool arguments)"); + + // Add any function attributes + AttributeSet FnAttrs = PAL.getFnAttributes(); + if (FnAttrs.hasAttributes()) { + AttrBuilder B(FnAttrs); + AttrVec = AttrVec.addAttributes(Context, AttributeList::FunctionIndex, B); + } + + // Create new function body and insert into the module + Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName()); + NF->setAttributes(AttrVec); + LLVM_DEBUG(dbgs() << "CMImpParam: Transforming to: " << *NF << "\n" << "From: " + << *F); + F->getParent()->getFunctionList().insert(F->getIterator(), NF); + NF->takeName(F); + NF->setSubprogram(F->getSubprogram()); // tranfer debug-info + + // Now to splice the body of the old function into the new function + NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList()); + + // Loop over the argument list, transferring uses of the old arguments to the + // new arguments, also tranferring over the names as well + Function::arg_iterator I2 = NF->arg_begin(); + for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E; + ++I, ++I2) { + I->replaceAllUsesWith(I2); + I2->takeName(I); + } + + // Get the insertion point ready for stores to globals + Instruction &FirstI = *NF->getEntryBlock().begin(); + llvm::SmallVector ImpKinds; + + if (UsesImplicits) { + ImplicitUseInfo *IUI = KernelsInfo[F]; + for (unsigned IID : 
IUI->getImplicits()) { + // We known that for each IID implicit we've already added an arg + // Rename the arg to something more meaningful here + assert(I2 != NF->arg_end() && + "fewer parameters for new function than expected"); + I2->setName("__arg_" + GenXIntrinsic::getAnyName(IID, None)); + + // Also insert a new store at the start of the function to the global + // variable used for this implicit argument intrinsic + assert(GlobalsMap.count(IID) && + "no global associated with this imp arg intrinsic"); + new StoreInst(I2, GlobalsMap[IID], &FirstI); + + // Prepare the kinds that will go into the metadata + ImpKinds.push_back(MapToKind(IID)); + + ++I2; + } + } + if (!IsCmRT) { + I2->setName("privBase"); + ImpKinds.push_back(genx::KernelMetadata::AK_NORMAL | + genx::KernelMetadata::IMP_OCL_PRIVATE_BASE); + } + + CallGraph &CG = getAnalysis().getCallGraph(); + CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF); + + if (F->hasDLLExportStorageClass()) + NF->setDLLStorageClass(F->getDLLStorageClass()); + // Scan the CM kernel metadata and replace with NF + if (NamedMDNode *Named = + CG.getModule().getNamedMetadata(genx::FunctionMD::GenXKernels)) { + for (unsigned I = 0, E = Named->getNumOperands(); I != E; ++I) { + MDNode *Node = Named->getOperand(I); + if (auto VM = dyn_cast_or_null( + Node->getOperand(genx::KernelMDOp::FunctionRef))) { + if (F == VM->getValue()) { + Node->replaceOperandWith(genx::KernelMDOp::FunctionRef, ValueAsMetadata::get(NF)); + llvm::SmallVector ArgKinds; + + // Create a new MDNode of Kinds + // First add all the current Kinds for explicit operands + MDNode *TypeNode = + dyn_cast(Node->getOperand(genx::KernelMDOp::ArgKinds)); + assert(TypeNode); + for (unsigned i = 0; i < TypeNode->getNumOperands(); ++i) + ArgKinds.push_back(TypeNode->getOperand(i)); + for (uint32_t Kind : ImpKinds) + ArgKinds.push_back(ValueAsMetadata::getConstant( + ConstantInt::get(Type::getInt32Ty(Context), Kind))); + llvm::MDNode *Kinds = llvm::MDNode::get(Context, ArgKinds); + Node->replaceOperandWith(genx::KernelMDOp::ArgKinds, Kinds); + } + } + } + } + + // Now that the old function is dead, delete it. 
If there is a dangling
+  // reference to the CallGraphNode, just leave the dead function around.
+  NF_CGN->stealCalledFunctionsFrom(CG[F]);
+  CallGraphNode *CGN = CG[F];
+  if (CGN->getNumReferences() == 0)
+    delete CG.removeFunctionFromModule(CGN);
+  else
+    F->setLinkage(Function::ExternalLinkage);
+
+  return NF_CGN;
+}
+
+void CMImpParam::print(raw_ostream &OS, const Module *M) const {
+  OS << "Kernels : \n";
+
+  for (auto Func : Kernels) {
+    OS.indent(4) << Func->getName() << "\n";
+
+    const ImplicitUseInfo *IUI = implicitUseInfoKernelExist(Func);
+    if (IUI)
+      IUI->print(OS, 8);
+  }
+
+  OS << "Functions with implicit arg intrinsics : \n";
+
+  for (auto FuncInfoPair : ImplicitsInfo) {
+    OS.indent(4) << FuncInfoPair.first->getName() << "\n";
+
+    FuncInfoPair.second->print(OS, 8);
+  }
+}
+
+
+char CMImpParam::ID = 0;
+INITIALIZE_PASS_BEGIN(CMImpParam, "cmimpparam",
+                      "Transformations required to support implicit arguments",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(CMImpParam, "cmimpparam",
+                    "Transformations required to support implicit arguments",
+                    false, false)
+
+Pass *llvm::createCMImpParamPass(bool IsCMRT) { return new CMImpParam(IsCMRT); }
diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMKernelArgOffset.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMKernelArgOffset.cpp
new file mode 100644
index 000000000000..50d927561369
--- /dev/null
+++ b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMKernelArgOffset.cpp
@@ -0,0 +1,621 @@
+/*===================== begin_copyright_notice ==================================
+
+Copyright (c) 2017 Intel Corporation
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+======================= end_copyright_notice ==================================*/
+
+//===----------------------------------------------------------------------===//
+//
+/// CMKernelArgOffset
+/// -----------------
+///
+/// This pass determines the offset of each CM kernel argument, and adds it to
+/// the kernel metadata.
+///
+/// This pass also changes the linkage type for kernels, functions, and globals,
+/// assuming that functions and globals have no external exposure; if they end
+/// up unused, the later GlobalDCE pass can therefore delete them.
+///
+/// A CM kernel has metadata containing, amongst other things, an array of
+/// *kind* bytes, one byte per kernel argument, that will be output in the vISA
+/// kernel input table. This pass calculates the offset of each kernel argument,
+/// and adds an array to the kernel metadata containing the calculated offsets.
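+///
+/// As a rough illustration (the metadata layout is simplified here and the
+/// numbers are invented for this comment), a kernel with two i32 arguments
+/// could end up with an offsets node such as
+///
+///   !{i32 32, i32 36}
+///
+/// attached to its entry in !genx.kernels, meaning the first explicit argument
+/// is placed at byte offset 32 (the start of r1 for a 32-byte GRF) and the
+/// second immediately after it.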
+/// +/// Argument offsets start at 32, as r0 is reserved by the various thread +/// dispatch mechanisms. +/// +/// The pass attempts to calculate the kernel argument offsets in a way that +/// minimizes space wasted by holes. +/// +/// The arguments are processed in three sets, with each (non-empty) set +/// starting in a new GRF: +/// +/// 1. explicit kernel arguments (i.e. ones that appeared in the CM source); +/// +/// 2. implicit kernel (non-thread) arguments; +/// +/// 3. implicit thread arguments. +/// +/// These three sets need to be allocated as three separate chunks of whole GRF +/// registers in that order by the CM runtime. In theory, the CM runtime can +/// cope with the compiler creating a different ordering, but to do so it needs +/// to create its own ordering and insert mov instructions at the start of the +/// kernel, which is suboptimal. However, I am not clear whether that mechanism +/// works, and it has not been tested. +/// +/// There is a compiler option that can be used to disable argument re-ordering. +/// This is for developers who are using the output asm files directly and want +/// to control the argument order explicitly. The option is +/// -enable-kernel-arg-reordering but is typically invoked as -mllvm +/// -enable-kernel-arg-reordering=false (the default is true) +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "cmkernelargoffset" + +#include "llvmWrapper/Support/Alignment.h" + +#include "vc/GenXOpts/GenXOpts.h" +#include "vc/GenXOpts/Utils/KernelInfo.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/GenXIntrinsics/GenXMetadata.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +static cl::opt + EnableKernelArgReordering("enable-kernel-arg-reordering", cl::init(true), + cl::Hidden, + cl::desc("Enable kernel argument reordering")); + +namespace llvm { +void initializeCMKernelArgOffsetPass(PassRegistry &); +} + +namespace { + +struct GrfParamZone { + unsigned Start; + unsigned End; + GrfParamZone(unsigned s, unsigned e) : Start(s), End(e){}; +}; + +// Diagnostic information for error/warning from this pass. +class DiagnosticInfoCMKernelArgOffset : public DiagnosticInfoOptimizationBase { +private: + static int KindID; + static int getKindID() { + if (KindID == 0) + KindID = llvm::getNextAvailablePluginDiagnosticKind(); + return KindID; + } + +public: + static void emit(Instruction *Inst, StringRef Msg, + DiagnosticSeverity Severity = DS_Error); + DiagnosticInfoCMKernelArgOffset(DiagnosticSeverity Severity, + const Function &Fn, const DebugLoc &DLoc, + StringRef Msg) + : DiagnosticInfoOptimizationBase((DiagnosticKind)getKindID(), Severity, + /*PassName=*/nullptr, Msg, Fn, DLoc) {} + // This kind of message is always enabled, and not affected by -rpass. + virtual bool isEnabled() const override { return true; } + static bool classof(const DiagnosticInfo *DI) { + return DI->getKind() == getKindID(); + } +}; +int DiagnosticInfoCMKernelArgOffset::KindID = 0; + +// CMKernelArgOffset pass +class CMKernelArgOffset : public ModulePass { + genx::KernelMetadata *KM = nullptr; + + // Emit code for OCL runtime. 
+  bool OCLCodeGen = false;
+
+public:
+  static char ID;
+  CMKernelArgOffset(unsigned GrfByteSize = 32, bool OCLCodeGen = false)
+      : ModulePass(ID), OCLCodeGen(OCLCodeGen), GrfByteSize(GrfByteSize) {
+    initializeCMKernelArgOffsetPass(*PassRegistry::getPassRegistry());
+    GrfMaxCount = 256;
+    GrfStartOffset = GrfByteSize;
+    GrfEndOffset = 128 * GrfByteSize;
+  }
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {}
+  virtual StringRef getPassName() const { return "CM kernel arg offset"; }
+  virtual bool runOnModule(Module &M);
+
+private:
+  void processKernel(MDNode *Node);
+  void processKernelOnOCLRT(MDNode *Node, Function *F);
+
+  static Value *getValue(Metadata *M) {
+    if (auto VM = dyn_cast<ValueAsMetadata>(M))
+      return VM->getValue();
+    return nullptr;
+  }
+
+  // Check whether there is an input/output argument attribute.
+  void checkArgKinds(Function *F) {
+    assert(KM && KM->isKernel());
+    for (unsigned i = 0, e = KM->getNumArgs(); i != e; ++i) {
+      auto IOKind = KM->getArgInputOutputKind(i);
+      // If there is an input/output attribute, the compiler must not freely
+      // reorder arguments.
+      if (IOKind != genx::KernelMetadata::IO_Normal) {
+        EnableKernelArgReordering = false;
+        break;
+      }
+    }
+  }
+
+  // Re-layout the thread payload for the OpenCL runtime.
+  bool enableOCLCodeGen() const { return OCLCodeGen; }
+
+  // Update the offset MD node.
+  void updateOffsetMD(MDNode *KernelMD,
+                      SmallDenseMap<Argument *, unsigned> &PlacedArgs) {
+    assert(KM);
+    Function *F = dyn_cast_or_null<Function>(
+        getValue(KernelMD->getOperand(genx::KernelMDOp::FunctionRef)));
+    assert(F && "nullptr kernel");
+
+    // All arguments now have offsets. Update the metadata node containing the
+    // offsets.
+    assert(F->arg_size() == KM->getNumArgs() &&
+           "Mismatch between metadata for kernel and number of args");
+    SmallVector<Metadata *, 8> ArgOffsets;
+    auto I32Ty = Type::getInt32Ty(F->getContext());
+    for (auto ai = F->arg_begin(), ae = F->arg_end(); ai != ae; ++ai) {
+      Argument *Arg = &*ai;
+      ArgOffsets.push_back(ValueAsMetadata::getConstant(
+          ConstantInt::get(I32Ty, PlacedArgs[Arg])));
+    }
+    MDNode *OffsetsNode = MDNode::get(F->getContext(), ArgOffsets);
+    KernelMD->replaceOperandWith(genx::KernelMDOp::ArgOffsets, OffsetsNode);
+
+    // Give an error on too many arguments.
+    if (ArgOffsets.size() >= GrfMaxCount)
+      DiagnosticInfoCMKernelArgOffset::emit(&F->front().front(),
+                                            "Too many kernel arguments");
+  }
+
+  unsigned GrfByteSize;
+  unsigned GrfMaxCount;
+  unsigned GrfStartOffset;
+  unsigned GrfEndOffset;
+};
+
+} // namespace
+
+char CMKernelArgOffset::ID = 0;
+
+INITIALIZE_PASS_BEGIN(CMKernelArgOffset, "cmkernelargoffset",
+                      "CM kernel arg offset determination", false, false)
+INITIALIZE_PASS_END(CMKernelArgOffset, "cmkernelargoffset",
+                    "CM kernel arg offset determination", false, false)
+
+Pass *llvm::createCMKernelArgOffsetPass(unsigned GrfByteSize, bool OCLCodeGen) {
+  return new CMKernelArgOffset(GrfByteSize, OCLCodeGen);
+}
+
+/***********************************************************************
+ * runOnModule : run the CM kernel arg offset pass
+ */
+bool CMKernelArgOffset::runOnModule(Module &M) {
+  NamedMDNode *Named = M.getNamedMetadata(genx::FunctionMD::GenXKernels);
+  if (!Named)
+    return false;
+
+  // Process each kernel in the CM kernel metadata.
+ for (unsigned i = 0, e = Named->getNumOperands(); i != e; ++i) { + MDNode *KernelNode = Named->getOperand(i); + if (KernelNode) + processKernel(KernelNode); + } + + return true; +} + +/*********************************************************************** + * processKernel : process one kernel + * + * Enter: Node = metadata node for one kernel + * + * See GenXMetadata.h for complete list of kernel metadata + */ +void CMKernelArgOffset::processKernel(MDNode *Node) { + Function *F = dyn_cast_or_null( + getValue(Node->getOperand(genx::KernelMDOp::FunctionRef))); + if (!F) + return; + + // change the linkage attribute for the kernel + F->setDLLStorageClass(llvm::GlobalValue::DLLExportStorageClass); + + genx::KernelMetadata KM(F); + this->KM = &KM; + checkArgKinds(F); + + // Layout kernel arguments differently if to run on OpenCL runtime. + if (enableOCLCodeGen()) { + return processKernelOnOCLRT(Node, F); + } + + auto getTypeSizeInBytes = [=](Type *Ty) { + const DataLayout &DL = F->getParent()->getDataLayout(); + if (auto PT = dyn_cast(Ty)) + return DL.getPointerTypeSize(Ty); + return static_cast(Ty->getPrimitiveSizeInBits() / 8); + }; + + // setup kernel inputs, optionally reordering the assigned offsets for + // improved packing where appropriate. The reordering algorithm replicates + // that used in the legacy Cm compiler, as certain media walker applications + // seem sensitive to the way the kernel inputs are laid out. + SmallDenseMap PlacedArgs; + unsigned Offset = 0; + if (EnableKernelArgReordering /*DoReordering*/) { + // Reorder kernel input arguments. Arguments are placed in size order, + // largest first (then in natural argument order where arguments are the + // same size). Each argument is placed at the lowest unused suitably + // aligned offset. So, in general big arguments are placed first with the + // smaller arguments being fit opportunistically into the gaps left + // between arguments placed earlier. + // + // Arguments that are at least one GRF in size must be aligned to a GRF + // boundary. Arguments smaller than a GRF must not cross a GRF boundary. + // + // FreeZones describes unallocated portions of the kernel input space, + // and is list of non-overlapping start-end pairs, ordered lowest first. + // Initially it consists of a single pair that describes the whole space + + SmallVector FreeZones; + FreeZones.push_back(GrfParamZone(GrfStartOffset, GrfEndOffset)); + + // Repeatedly iterate over the arguments list, each time looking for the + // largest one that hasn't been processed yet. + // But ignore implicit args for now as they want to go after all the others. + + do { + Argument *BestArg = nullptr; + unsigned BestSize; + unsigned BestElemSize; + + auto ArgKinds = KM.getArgKinds(); + auto Kind = ArgKinds.begin(); + for (Function::arg_iterator i = F->arg_begin(), e = F->arg_end(); i != e; + ++i, ++Kind) { + Argument *Arg = &*i; + if (*Kind & 0xf8) + continue; // implicit arg + + if (PlacedArgs.find(Arg) != PlacedArgs.end()) + // Already done this one. + continue; + + Type *Ty = Arg->getType(); + unsigned Bytes = getTypeSizeInBytes(Ty); + + if (BestArg == nullptr || BestSize < Bytes) { + BestArg = Arg; + BestSize = Bytes; + BestElemSize = getTypeSizeInBytes(Ty->getScalarType()); + } + } + + if (BestArg == nullptr) + // All done. + break; + + // The best argument in this cycle has been found. Search FreeZones for + // a suitably sized and aligned gap. 
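+      //
+      // Illustrative walk-through (editor's note, numbers assume the default
+      // GrfByteSize of 32): for explicit arguments of 32, 8 and 4 bytes, the
+      // 32-byte argument is placed first at offset 32, the 8-byte one at
+      // offset 64 and the 4-byte one at offset 72, so the remaining free zone
+      // (later used for the implicit arguments) starts at offset 76.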
+ + unsigned Align; + + if (BestSize > GrfByteSize) + Align = GrfByteSize; + else + Align = BestElemSize; + + auto zi = FreeZones.begin(); + auto ze = FreeZones.end(); + + unsigned Start = 0, End = 0; + + for (; zi != ze; ++zi) { + GrfParamZone &Zone = *zi; + + Start = alignTo(Zone.Start, Align); + End = Start + BestSize; + + if ((Start % GrfByteSize) != 0 && + (Start / GrfByteSize) != (End - 1) / GrfByteSize) { + Start = alignTo(Zone.Start, GrfByteSize); + End = Start + BestSize; + } + + if (End <= Zone.End) + // Found one. This should never fail unless we have too many + // parameters to start with. + break; + } + + assert(zi != ze && + "unable to allocate argument offset (too many arguments?)"); + + // Exclude the found block from the free zones list. This may require + // that the found zone be split in two if the start of the block is + // not suitably aligned. + + GrfParamZone &Zone = *zi; + + if (Zone.Start == Start) + Zone.Start = End; + else { + unsigned NewEnd = Zone.End; + Zone.End = Start; + ++zi; + FreeZones.insert(zi, GrfParamZone(End, NewEnd)); + } + + PlacedArgs[BestArg] = Start; + } while (true); + // Now process the implicit args. First get the offset at the start of the + // last free zone. Process the implicit kernel args first, then the + // implicit thread args. + Offset = FreeZones.back().Start; + for (int WantThreadImplicit = 0; WantThreadImplicit != 2; + ++WantThreadImplicit) { + bool FirstThreadImplicit = WantThreadImplicit; + auto ArgKinds = KM.getArgKinds(); + auto Kind = ArgKinds.begin(); + for (Function::arg_iterator i = F->arg_begin(), e = F->arg_end(); i != e; + ++i, ++Kind) { + Argument *Arg = &*i; + if (!(*Kind & 0xf8)) + continue; // not implicit arg + int IsThreadImplicit = (*Kind >> 3) == 3; // local_id + if (WantThreadImplicit != IsThreadImplicit) + continue; + Type *Ty = Arg->getType(); + unsigned Bytes = Ty->getPrimitiveSizeInBits() / 8U; + unsigned Align = Ty->getScalarSizeInBits() / 8U; + // If this is the first thread implicit arg, put it in a new GRF. + if (FirstThreadImplicit) + Align = GrfByteSize; + FirstThreadImplicit = false; + Offset = alignTo(Offset, Align); + if ((Offset & (GrfByteSize - 1)) + Bytes > GrfByteSize) { + // GRF align if arg would cross GRF boundary + Offset = alignTo(Offset, GrfByteSize); + } + PlacedArgs[Arg] = Offset; + Offset += Bytes; + } + } + } else { + // No argument reordering. Arguments are placed at increasing offsets + // in their natural order, aligned according to their type. + // + // Again, arguments that are at least one GRF in size must be aligned to + // a GRF boundary. Arguments smaller than a GRF must not cross a GRF + // boundary. + + // kernel input start offset + auto &DL = F->getParent()->getDataLayout(); + Offset = GrfStartOffset; + + // Place an argument and update offset. + // Arguments larger than a GRF must be at least GRF-aligned. Arguments + // smaller than a GRF may not cross GRF boundaries. This means that + // arguments cross a GRF boundary must be GRF aligned. 
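+    //
+    // For example (editor's illustration, assuming GrfByteSize == 32): an
+    // 8-byte vector<float, 2> argument with the running Offset at 58 is first
+    // aligned to 60 by its 4-byte element alignment; bytes 60..67 would then
+    // straddle the GRF boundary at 64, so placeArg below bumps it to offset 64.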
+ auto placeArg = [&](Argument *Arg, unsigned ByteSize, unsigned Align) { + Offset = alignTo(Offset, Align); + unsigned StartGRF = Offset / GrfByteSize; + unsigned EndGRF = (Offset + ByteSize - 1) / GrfByteSize; + if (StartGRF != EndGRF) + Offset = alignTo(Offset, GrfByteSize); + PlacedArgs[Arg] = Offset; + Offset += ByteSize; + }; + + for (auto &Arg : F->args()) { + Type *Ty = Arg.getType(); + unsigned Bytes = 0, Alignment = 0; + if (Ty->isPointerTy()) { + Bytes = DL.getPointerTypeSize(Ty); + Alignment = IGCLLVM::getAlignmentValue( + DL.getPointerABIAlignment(Ty->getPointerAddressSpace())); + } else { + Bytes = Ty->getPrimitiveSizeInBits() / 8; + Alignment = IGCLLVM::getAlignmentValue(Ty->getScalarSizeInBits() / 8); + } + placeArg(&Arg, Bytes, Alignment); + } + } + + // Update the offset MD node. + updateOffsetMD(Node, PlacedArgs); + + this->KM = nullptr; +} + +/*********************************************************************** + * DiagnosticInfoCMKernelArgOffset::emit : emit an error or warning + */ +void DiagnosticInfoCMKernelArgOffset::emit(Instruction *Inst, StringRef Msg, + DiagnosticSeverity Severity) { + DiagnosticInfoCMKernelArgOffset Err(Severity, *Inst->getParent()->getParent(), + Inst->getDebugLoc(), Msg); + Inst->getContext().diagnose(Err); +} + +void CMKernelArgOffset::processKernelOnOCLRT(MDNode *Node, Function *F) { + assert(KM); + // Assign BTI values. + { + unsigned Idx = 0; + auto ArgKinds = KM->getArgKinds(); + auto Kind = ArgKinds.begin(); + for (auto &Arg : F->args()) { + if (*Kind == genx::KernelMetadata::AK_SAMPLER || + *Kind == genx::KernelMetadata::AK_SURFACE) { + int32_t BTI = KM->getBTI(Idx); + assert(BTI >= 0 && "unassigned BTI"); + + Type *Ty = Arg.getType(); + if (Ty->isPointerTy()) { + SmallVector ToErase; + + assert(Arg.hasOneUse() && "invalid surface input"); + auto ArgUse = Arg.use_begin()->getUser(); + assert(isa(ArgUse) && "invalid surface input usage"); + ToErase.push_back(cast(ArgUse)); + + for (auto ui = ArgUse->use_begin(), ue = ArgUse->use_end(); ui != ue; + ++ui) { + auto User = cast(ui->getUser()); + User->replaceAllUsesWith( + ConstantInt::get(User->getType(), BTI)); + ToErase.push_back(User); + } + + for (auto i = ToErase.rbegin(), e = ToErase.rend(); i != e; ++i) + (*i)->eraseFromParent(); + ToErase.clear(); + } else { + Arg.replaceAllUsesWith(ConstantInt::get(Arg.getType(), BTI)); + } + } + ++Kind, ++Idx; + } + } + + SmallDenseMap PlacedArgs; + { + // OpenCL SIMD8 thread payloads are organized as follows: + // + // 0 1 2 3 4 5 6 7 + // R0: GX GY GZ + // R1: LIDx LIDy LIDz + // + unsigned Offset = GrfStartOffset; + + unsigned ThreadPayloads[] = { + Offset // R1, local_id_x, local_id_y, local_id_z + }; + auto getImpOffset = [&](genx::KernelArgInfo AI) -> int { + if (AI.isLocalIDs()) + return ThreadPayloads[0]; + return -1; + }; + + // Starting offsets for non-implicit arguments. + Offset += 1 * GrfByteSize; + + // Place an argument and update offset. + // Arguments larger than a GRF must be at least GRF-aligned. Arguments + // smaller than a GRF may not cross GRF boundaries. This means that + // arguments cross a GRF boundary must be GRF aligned. + auto placeArg = [&](Argument *Arg, unsigned ByteSize, unsigned Align) { + Offset = alignTo(Offset, Align); + unsigned StartGRF = Offset / GrfByteSize; + unsigned EndGRF = (Offset + ByteSize - 1) / GrfByteSize; + if (StartGRF != EndGRF) + Offset = alignTo(Offset, GrfByteSize); + PlacedArgs[Arg] = Offset; + Offset += ByteSize; + }; + + // First scan, assign implicit arguments. 
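+    //
+    // Editor's note, for illustration (assuming GrfByteSize == 32): the
+    // local-ID argument is pinned to R1 (byte offset 32) via getImpOffset,
+    // while the remaining implicit arguments (local size, group count, print
+    // buffer, private base) are packed by placeArg from offset 64 (R2)
+    // onwards, ahead of the explicit arguments handled by the second scan.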
+ auto ArgKinds = KM->getArgKinds(); + auto Kind = ArgKinds.begin(); + for (auto &Arg : F->args()) { + genx::KernelArgInfo AI(*Kind++); + int ImpOffset = getImpOffset(AI); + if (ImpOffset > 0) { + PlacedArgs[&Arg] = ImpOffset; + continue; + } + + if (AI.isLocalSize() || AI.isGroupCount() || AI.isPrintBuffer() || + AI.isPrivateBase()) { + unsigned Bytes = Arg.getType()->getPrimitiveSizeInBits() / 8; + unsigned Align = Arg.getType()->getScalarSizeInBits() / 8; + placeArg(&Arg, Bytes, Align); + } + } + + // Second scan, assign normal arguments. + Kind = ArgKinds.begin(); + unsigned Idx = 0; + for (auto &Arg : F->args()) { + genx::KernelArgInfo AI(*Kind++); + bool IsBuffer = KM->isBufferType(Idx++); + + // Skip alaready assigned arguments. + if (PlacedArgs.count(&Arg)) + continue; + + // image/sampler arguments do not allocate vISA inputs + // buffer arguments do allocate unused vISA inputs + if (!AI.isNormalCategory() && !IsBuffer) { + PlacedArgs[&Arg] = genx::KernelMetadata::SKIP_OFFSET_VAL; + continue; + } + + Type *Ty = Arg.getType(); + auto &DL = F->getParent()->getDataLayout(); + unsigned Alignment = 0; + unsigned Bytes = 0; + if (IsBuffer) { + // Buffer is treated as stateless global pointer! + Bytes = DL.getPointerSize(); + Alignment = IGCLLVM::getAlignmentValue(DL.getPointerABIAlignment(0)); + } else if (Ty->isPointerTy()) { + Bytes = DL.getPointerTypeSize(Ty); + Alignment = IGCLLVM::getAlignmentValue( + DL.getPointerABIAlignment(Ty->getPointerAddressSpace())); + } else { + Bytes = Ty->getPrimitiveSizeInBits() / 8; + Alignment = IGCLLVM::getAlignmentValue(Ty->getScalarSizeInBits() / 8); + } + placeArg(&Arg, Bytes, Alignment); + } + } + + updateOffsetMD(Node, PlacedArgs); +} diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMRegion.cpp b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMRegion.cpp new file mode 100644 index 000000000000..4fcb90e732cc --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMTrans/CMRegion.cpp @@ -0,0 +1,925 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ + +======================= end_copyright_notice ==================================*/ +// +// Implementation of methods for CMRegion class +// +//===----------------------------------------------------------------------===// + +#include "vc/GenXOpts/Utils/CMRegion.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" + +using namespace llvm; + +// Find the datalayout if possible. +const DataLayout *GetDL(Value *V) { + if (auto Inst = dyn_cast_or_null(V)) + return &Inst->getParent()->getParent()->getParent()->getDataLayout(); + if (auto Arg = dyn_cast_or_null(V)) + return &Arg->getParent()->getParent()->getDataLayout(); + return nullptr; +} + +/*********************************************************************** + * Region constructor from a type + */ +CMRegion::CMRegion(Type *Ty, const DataLayout *DL) + : ElementBytes(0), NumElements(1), VStride(0), Width(1), + Stride(1), Offset(0), Indirect(0), IndirectIdx(0), Mask(0), + ParentWidth(0) +{ + assert(!Ty->isAggregateType() && + "cannot create region based on an aggregate type"); + ElementTy = Ty; + if (VectorType *VT = dyn_cast(ElementTy)) { + ElementTy = VT->getElementType(); + NumElements = VT->getNumElements(); + Width = NumElements; + } + if (DL) { + unsigned BitSize = DL->getTypeSizeInBits(ElementTy); + ElementBytes = alignTo<8>(BitSize) / 8; + } else { + unsigned BitSize = ElementTy->getPrimitiveSizeInBits(); + ElementBytes = alignTo<8>(BitSize) / 8; + assert(ElementBytes && "Cannot compute element size without data layout"); + } +} + +/*********************************************************************** + * Region constructor from a value + */ +CMRegion::CMRegion(Value *V, const DataLayout *DL) + : CMRegion(V->getType(), DL ? DL : GetDL(V)) {} + +/*********************************************************************** + * Region constructor from a rd/wr region and its BaleInfo + * This also works with rdpredregion and wrpredregion, with Offset in + * bits rather than bytes, and with ElementBytes set to 1. + */ +CMRegion::CMRegion(Instruction *Inst, bool WantParentWidth) + : ElementBytes(0), NumElements(1), VStride(1), Width(1), + Stride(1), Offset(0), Indirect(0), IndirectIdx(0), Mask(0), + ParentWidth(0) +{ + // Determine where to get the subregion value from and which arg index + // the region parameters start at. 
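+  //
+  // Reminder (editor's note): the parameters follow the usual GenX
+  // <vstride;width,stride> region form. For example, on a 16-element vector a
+  // region with VStride = 8, Width = 4, Stride = 2 and offset 0 describes two
+  // rows of four elements: {0,2,4,6} and {8,10,12,14}.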
+ unsigned ArgIdx = 0; + Value *Subregion = 0; + assert(isa(Inst)); + switch (GenXIntrinsic::getGenXIntrinsicID(Inst)) { + case GenXIntrinsic::genx_rdpredregion: + NumElements = Inst->getType()->getVectorNumElements(); + Width = NumElements; + Offset = cast(Inst->getOperand(1))->getZExtValue(); + ElementBytes = 1; + return; + case GenXIntrinsic::genx_wrpredregion: + NumElements = Inst->getOperand(1)->getType()->getVectorNumElements(); + Width = NumElements; + Offset = cast(Inst->getOperand(2))->getZExtValue(); + ElementBytes = 1; + return; + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: + ArgIdx = 1; + // The size/type of the region is given by the return value: + Subregion = Inst; + break; + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrconstregion: + ArgIdx = 2; + // The size/type of the region is given by the "subregion value to + // write" operand: + Subregion = Inst->getOperand(1); + // For wrregion, while we're here, also get the mask. We set mask to NULL + // if the mask operand is constant 1 (i.e. not predicated). + Mask = Inst->getOperand(GenXIntrinsic::GenXRegion::PredicateOperandNum); + if (auto C = dyn_cast(Mask)) + if (C->isAllOnesValue()) + Mask = 0; + break; + default: + assert(0); + } + // Get the region parameters. + assert(Subregion); + ElementTy = Subregion->getType(); + if (VectorType *VT = dyn_cast(ElementTy)) { + ElementTy = VT->getElementType(); + NumElements = VT->getNumElements(); + } + ElementBytes = ElementTy->getPrimitiveSizeInBits() / 8; + if (ElementTy->getPrimitiveSizeInBits()) + ElementBytes = ElementBytes ? ElementBytes : 1; + VStride = cast(Inst->getOperand(ArgIdx))->getSExtValue(); + Width = cast(Inst->getOperand(ArgIdx + 1))->getSExtValue(); + Stride = cast(Inst->getOperand(ArgIdx + 2))->getSExtValue(); + ArgIdx += 3; + // Get the start index. + Value *V = Inst->getOperand(ArgIdx); + assert(V->getType()->getScalarType()->isIntegerTy(16) && + "region index must be i16 or vXi16 type"); + +#if 0 // _DEBUG + // In one transform, this check does not work in the middle of transformation + if (VectorType *VT = dyn_cast(V->getType())) + assert(VT->getNumElements() * Width == NumElements && + "vector region index size mismatch"); +#endif + + if (ConstantInt *CI = dyn_cast(V)) + Offset = CI->getSExtValue(); // Constant index. + else { + Indirect = V; // Index is variable; assume no baled in add. + // For a variable index, get the parent width arg. + ConstantInt *PW = dyn_cast(Inst->getOperand(ArgIdx + 1)); + if (PW) + ParentWidth = PW->getZExtValue(); + } + // We do some trivial legalization here. The legalization pass does not + // make these changes; instead we do them here so they are not permanently + // written back into the IR but are made on the fly each time some other + // pass uses this code to get the region info. + if (NumElements == 1) { + Width = Stride = 1; + VStride = 0; + } else { + if (NumElements <= Width) { + Width = NumElements; + VStride = 0; + } else if ((unsigned)VStride == Width * Stride) { + // VStride == Width * Stride, so we can canonicalize to a 1D region, + // but only if not indirect or not asked to preserve parentwidth, + // and never if multi-indirect. + if (!Indirect + || (!isa(Indirect->getType()) && !WantParentWidth)) { + Width = NumElements; + VStride = 0; + ParentWidth = 0; + } + } else if (Width == 1) { + // We can turn a 2D width 1 region into a 1D region, but if it is + // indirect it invalidates ParentWidth. 
So only do it if not asked + // to keep ParentWidth. Also we cannot do it if it is multi-indirect. + if (!Indirect + || (!isa(Indirect->getType()) && !WantParentWidth)) { + Width = NumElements; + Stride = VStride; + VStride = 0; + ParentWidth = 0; + } + } + if (Stride == 0 && Width == NumElements) { + // Canonical scalar region. + Width = 1; + VStride = 0; + } + } +} + +/*********************************************************************** + * Region constructor from bitmap of which elements to set + * + * Enter: Bits = bitmap of which elements to set + * ElementBytes = bytes per element + * + * It is assumed that Bits represents a legal 1D region. + */ +CMRegion::CMRegion(unsigned Bits, unsigned ElementBytes) + : ElementBytes(ElementBytes), ElementTy(0), NumElements(1), VStride(1), + Width(1), Stride(1), Offset(0), Indirect(0), IndirectIdx(0), Mask(0) +{ + assert(Bits); + Offset = countTrailingZeros(Bits, ZB_Undefined); + Bits >>= Offset; + Offset *= ElementBytes; + if (Bits != 1) { + Stride = countTrailingZeros(Bits & ~1, ZB_Undefined); + NumElements = Width = countPopulation(Bits); + } +} + +/*********************************************************************** + * CMRegion::getSubregion : modify Region struct for a subregion + * + * Enter: StartIdx = start index of subregion (in elements) + * Size = size of subregion (in elements) + * + * This does not modify the Mask; the caller needs to do that separately. + */ +void CMRegion::getSubregion(unsigned StartIdx, unsigned Size) +{ + if (Indirect && isa(Indirect->getType())) { + // Vector indirect (multi indirect). Set IndirectIdx to the index of + // the start element in the vector indirect. + IndirectIdx = StartIdx / Width; + StartIdx %= Width; + } + int AddOffset = StartIdx / Width * VStride; + AddOffset += StartIdx % Width * Stride; + AddOffset *= ElementBytes; + Offset += AddOffset; + if (!(StartIdx % Width) && !(Size % Width)) { + // StartIdx is at the start of a row and Size is a whole number of + // rows. + } else if (StartIdx % Width + Size > Width) { + // The subregion goes over a row boundary. This can only happen if there + // is only one row split and it is exactly in the middle. + VStride += (Size / 2 - Width) * Stride; + Width = Size / 2; + } else { + // Within a single row. 
+ Width = Size; + VStride = Size * Stride; + } + NumElements = Size; +} + +/*********************************************************************** + * CMRegion::createRdRegion : create rdregion intrinsic from "this" Region + * + * Enter: Input = vector value to extract subregion from + * Name = name for new instruction + * InsertBefore = insert new inst before this point + * DL = DebugLoc to give the new instruction + * AllowScalar = true to return scalar if region is size 1 + * + * Return: newly created instruction + */ +Instruction *CMRegion::createRdRegion(Value *Input, const Twine &Name, + Instruction *InsertBefore, const DebugLoc &DL, bool AllowScalar) +{ + assert(ElementBytes && "not expecting i1 element type"); + auto OffsetInElem = Offset / ElementBytes; + (void)OffsetInElem; + assert(OffsetInElem >= 0 && + OffsetInElem < Input->getType()->getVectorNumElements() && + "initial offset is out of range of input vector"); + + Value *StartIdx = getStartIdx(Name, InsertBefore, DL); + IntegerType *I32Ty = Type::getInt32Ty(Input->getContext()); + Value *ParentWidthArg = UndefValue::get(I32Ty); + if (Indirect) + ParentWidthArg = ConstantInt::get(I32Ty, ParentWidth); + Value *Args[] = { // Args to new rdregion: + Input, // input to original rdregion + ConstantInt::get(I32Ty, VStride), // vstride + ConstantInt::get(I32Ty, Width), // width + ConstantInt::get(I32Ty, Stride), // stride + StartIdx, // start index (in bytes) + ParentWidthArg // parent width (if variable start index) + }; + Type *ElTy = cast(Args[0]->getType())->getElementType(); + Type *RegionTy; + if (NumElements != 1 || !AllowScalar) + RegionTy = VectorType::get(ElTy, NumElements); + else + RegionTy = ElTy; + Module *M = InsertBefore->getParent()->getParent()->getParent(); + auto IID = ElTy->isFloatingPointTy() + ? GenXIntrinsic::genx_rdregionf : GenXIntrinsic::genx_rdregioni; + Function *Decl = getGenXRegionDeclaration(M, IID, RegionTy, Args); + Instruction *NewInst = CallInst::Create(Decl, Args, Name, InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; +} + +/*********************************************************************** + * CMRegion::createWrRegion : create wrregion instruction for subregion + * CMRegion::createWrConstRegion : create wrconstregion instruction for subregion + * + * Enter: OldVal = vector value to insert subregion into (can be undef) + * Input = subregion value to insert (can be scalar, as long as + * region size is 1) + * Name = name for new instruction + * InsertBefore = insert new inst before this point + * DL = DebugLoc to give any new instruction + * + * Return: The new wrregion instruction. However, if it would have had a + * predication mask of all 0s, it is omitted and OldVal is returned + * instead. + */ +Value *CMRegion::createWrRegion(Value *OldVal, Value *Input, + const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL) +{ + return createWrCommonRegion(OldVal->getType()->isFPOrFPVectorTy() + ? 
GenXIntrinsic::genx_wrregionf : GenXIntrinsic::genx_wrregioni, + OldVal, Input, + Name, InsertBefore, DL); +} + +Value *CMRegion::createWrConstRegion(Value *OldVal, Value *Input, + const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL) +{ + assert(!Indirect); + assert(!Mask); + assert(isa(Input)); + return createWrCommonRegion(GenXIntrinsic::genx_wrconstregion, OldVal, Input, + Name, InsertBefore, DL); +} + +Value *CMRegion::createWrCommonRegion(GenXIntrinsic::ID IID, Value *OldVal, Value *Input, + const Twine &Name, Instruction *InsertBefore, const DebugLoc &DL) +{ + assert(ElementBytes && "not expecting i1 element type"); + if (isa(Input->getType())) + assert(NumElements == Input->getType()->getVectorNumElements() && + "input value and region are inconsistent"); + else + assert(NumElements == 1 && "input value and region are inconsistent"); + assert(OldVal->getType()->getScalarType() == + Input->getType()->getScalarType() && + "scalar type mismatch"); + Value *StartIdx = getStartIdx(Name, InsertBefore, DL); + IntegerType *I32Ty = Type::getInt32Ty(Input->getContext()); + Value *ParentWidthArg = UndefValue::get(I32Ty); + if (Indirect) + ParentWidthArg = ConstantInt::get(I32Ty, ParentWidth); + // Get the mask value. If R.Mask is 0, then the wrregion is unpredicated + // and we just use constant 1. + Value *MaskArg = Mask; + if (!MaskArg) + MaskArg = ConstantInt::get(Type::getInt1Ty(Input->getContext()), 1); + // Build the wrregion. + Value *Args[] = { // Args to new wrregion: + OldVal, // original vector + Input, // value to write into subregion + ConstantInt::get(I32Ty, VStride), // vstride + ConstantInt::get(I32Ty, Width), // width + ConstantInt::get(I32Ty, Stride), // stride + StartIdx, // start index (in bytes) + ParentWidthArg, // parent width (if variable start index) + MaskArg // mask + }; + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = getGenXRegionDeclaration(M, IID, nullptr, Args); + Instruction *NewInst = CallInst::Create(Decl, Args, Name, InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; +} + +/*********************************************************************** + * CMRegion::createRdPredRegion : create rdpredregion instruction + * CMRegion::createRdPredRegionOrConst : create rdpredregion instruction, or + * simplify to constant + * + * Enter: Input = vector value to extract subregion from + * Index = start index of subregion + * Size = size of subregion + * Name = name for new instruction + * InsertBefore = insert new inst before this point + * DL = DebugLoc to give any new instruction + * + * Return: The new rdpredregion instruction + * + * Unlike createRdRegion, this is a static method in Region, because you pass + * the region parameters (the start index and size) directly into this method. 
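+ *
+ * For example (editor's sketch; the mangled intrinsic name depends on the
+ * overloaded types): createRdPredRegion(P, 8, 16, "sub", InsertPt, DL) on a
+ * 32-wide predicate P builds a call along the lines of
+ * @llvm.genx.rdpredregion.v16i1.v32i1(P, 8), extracting elements 8..23.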
+ */ +Instruction *CMRegion::createRdPredRegion(Value *Input, unsigned Index, + unsigned Size, const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL) +{ + Type *I32Ty = Type::getInt32Ty(InsertBefore->getContext()); + Value *Args[] = { // Args to new rdpredregion call: + Input, // input predicate + ConstantInt::get(I32Ty, Index) // start offset + }; + auto RetTy = VectorType::get(Args[0]->getType()->getScalarType(), Size); + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = getGenXRegionDeclaration(M, GenXIntrinsic::genx_rdpredregion, + RetTy, Args); + Instruction *NewInst = CallInst::Create(Decl, Args, Name, InsertBefore); + NewInst->setDebugLoc(DL); + if (NewInst->getName() == "phitmp18.i.i.split0") + dbgs() << "wobble\n"; + return NewInst; +} + +/*********************************************************************** +* GetConstantSubvector : get a contiguous region from a vector constant +*/ +static Constant *GetConstantSubvector(Constant *V, + unsigned StartIdx, unsigned Size) +{ + Type *ElTy = cast(V->getType())->getElementType(); + Type *RegionTy = VectorType::get(ElTy, Size); + if (isa(V)) + V = UndefValue::get(RegionTy); + else if (isa(V)) + V = ConstantAggregateZero::get(RegionTy); + else { + SmallVector Val; + for (unsigned i = 0; i != Size; ++i) + Val.push_back(V->getAggregateElement(i + StartIdx)); + V = ConstantVector::get(Val); + } + return V; +} + +Value *CMRegion::createRdPredRegionOrConst(Value *Input, unsigned Index, + unsigned Size, const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL) +{ + if (auto C = dyn_cast(Input)) + return GetConstantSubvector(C, Index, Size); + return createRdPredRegion(Input, Index, Size, Name, InsertBefore, DL); +} + +/*********************************************************************** + * CMRegion::createWrPredRegion : create wrpredregion instruction + * + * Enter: OldVal = vector value to insert subregion into (can be undef) + * Input = subregion value to insert + * Index = start index of subregion + * Name = name for new instruction + * InsertBefore = insert new inst before this point + * DL = DebugLoc to give any new instruction + * + * Return: The new wrpredregion instruction + * + * Unlike createWrRegion, this is a static method in Region, because you pass + * the only region parameter (the start index) directly into this method. 
+ */ +Instruction *CMRegion::createWrPredRegion(Value *OldVal, Value *Input, + unsigned Index, const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL) +{ + IntegerType *I32Ty = Type::getInt32Ty(Input->getContext()); + Value *Args[] = { // Args to new wrpredregion: + OldVal, // original vector + Input, // value to write into subregion + ConstantInt::get(I32Ty, Index), // start index + }; + Module *M = InsertBefore->getParent()->getParent()->getParent(); + Function *Decl = getGenXRegionDeclaration(M, GenXIntrinsic::genx_wrpredregion, + nullptr, Args); + Instruction *NewInst = CallInst::Create(Decl, Args, Name, InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; +} + +/*********************************************************************** + * CMRegion::createWrPredPredRegion : create wrpredpredregion instruction + * + * Enter: OldVal = vector value to insert subregion into (can be undef) + * Input = subregion value to insert + * Index = start index of subregion + * Pred = predicate for the write region + * Name = name for new instruction + * InsertBefore = insert new inst before this point + * DL = DebugLoc to give any new instruction + * + * Return: The new wrpredpredregion instruction + * + * Unlike createWrRegion, this is a static method in Region, because you pass + * the only region parameter (the start index) directly into this method. + */ +Instruction *CMRegion::createWrPredPredRegion(Value *OldVal, Value *Input, + unsigned Index, Value *Pred, const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL) +{ + Type *Tys[] = { OldVal->getType(), Input->getType() }; + Function *CalledFunc = GenXIntrinsic::getGenXDeclaration( + InsertBefore->getParent()->getParent()->getParent(), + GenXIntrinsic::genx_wrpredpredregion, Tys); + Value *Args[] = { OldVal, Input, + ConstantInt::get(Type::getInt32Ty(InsertBefore->getContext()), Index), + Pred }; + auto NewInst = CallInst::Create(CalledFunc, Args, "", InsertBefore); + NewInst->setDebugLoc(DL); + return NewInst; +} + +/*********************************************************************** + * setRegionCalledFunc : for an existing rdregion/wrregion call, modify + * its called function to match its operand types + * + * This is used in GenXLegalization after modifying a wrregion operand + * such that its type changes. The called function then needs to change + * because it is decorated with overloaded types. 
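+ *
+ * For example (editor's illustration): if legalization narrows the
+ * "subregion value to write" operand of a wrregion from <8 x i32> to
+ * <4 x i32>, the previously resolved declaration no longer matches, and this
+ * helper re-resolves the call to a genx.wrregioni declaration mangled with
+ * the new operand types.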
+ */ +void CMRegion::setRegionCalledFunc(Instruction *Inst) +{ + auto CI = cast(Inst); + SmallVector Opnds; + for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) + Opnds.push_back(CI->getOperand(i)); + Function *Decl = getGenXRegionDeclaration( + Inst->getParent()->getParent()->getParent(), + GenXIntrinsic::getGenXIntrinsicID(Inst), + Inst->getType(), Opnds); + CI->setOperand(CI->getNumArgOperands(), Decl); +} + +/*********************************************************************** + * getRegionDeclaration : get the function declaration for a region intrinsic + * + * Enter: M = Module + * IID = intrinsic ID + * RetTy = return type (can be 0 if return type not overloaded) + * Args = array of operands so we can determine overloaded types + * + * Return: the Function + */ +Function *CMRegion::getGenXRegionDeclaration(Module *M, + GenXIntrinsic::ID IID, Type *RetTy, ArrayRef Args) +{ + switch (IID) { + case GenXIntrinsic::genx_rdregioni: + case GenXIntrinsic::genx_rdregionf: { + Type *Tys[] = { RetTy, Args[0]->getType(), Args[4]->getType() }; + return GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + } + case GenXIntrinsic::genx_wrregioni: + case GenXIntrinsic::genx_wrregionf: + case GenXIntrinsic::genx_wrconstregion: { + Type *Tys[] = { Args[0]->getType(), Args[1]->getType(), + Args[5]->getType(), Args[7]->getType() }; + return GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + } + case GenXIntrinsic::genx_rdpredregion: { + Type *Tys[] = { RetTy, Args[0]->getType() }; + return GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + } + case GenXIntrinsic::genx_wrpredregion: { + Type *Tys[] = { Args[0]->getType(), Args[1]->getType() }; + return GenXIntrinsic::getGenXDeclaration(M, IID, Tys); + } + default: + llvm_unreachable("unrecognized region intrinsic ID"); + } + return nullptr; +} + +/*********************************************************************** + * getStartIdx : get the LLVM IR Value for the start index of a region + * + * This is common code used by both createRdRegion and createWrRegion. + */ +Value *CMRegion::getStartIdx(const Twine &Name, Instruction *InsertBefore, + const DebugLoc &DL) +{ + IntegerType *I16Ty = Type::getInt16Ty(InsertBefore->getContext()); + if (!Indirect) + return ConstantInt::get(I16Ty, Offset); + // Deal with indirect (variable index) region. + if (auto VT = dyn_cast(Indirect->getType())) { + if (VT->getNumElements() != NumElements) { + // We have a vector indirect and we need to take a subregion of it. + CMRegion IdxRegion(Indirect); + IdxRegion.getSubregion(IndirectIdx, NumElements / Width); + Indirect = IdxRegion.createRdRegion(Indirect, + Name + ".multiindirect_idx_subregion", InsertBefore, DL); + IndirectIdx = 0; + } + } + Value *Index = Indirect; + if (Offset) { + Constant *OffsetVal = ConstantInt::get(I16Ty, Offset); + if (auto VT = dyn_cast(Indirect->getType())) + OffsetVal = ConstantVector::getSplat(VT->getNumElements(), OffsetVal); + auto BO = BinaryOperator::Create(Instruction::Add, Index, OffsetVal, + Name + ".indirect_idx_add", InsertBefore); + BO->setDebugLoc(DL); + Index = BO; + } + return Index; +} + +/*********************************************************************** + * isSimilar : compare two regions to see if they have the same region + * parameters other than start offset, also allowing element type to + * be different + */ +bool CMRegion::isSimilar(const CMRegion &R2) const +{ + if (ElementBytes == R2.ElementBytes) + return isStrictlySimilar(R2); + // Change the element type to match, so we can compare the regions. 
+  CMRegion R = R2;
+  if (!R.changeElementType(ElementTy))
+    return false;
+  return isStrictlySimilar(R);
+}
+
+BitVector CMRegion::getAccessBitMap(int MinTrackingOffset) const {
+  // Construct the bitmap for a single row.
+  BitVector RowBitMap(getRowLength());
+  for (unsigned i = 0; i < Width; i++) {
+    RowBitMap <<= (Stride * ElementBytes);
+    RowBitMap.set(0, ElementBytes);
+  }
+  // Apply the row bitmap to the whole region bitmap,
+  // exactly NumRows times.
+  BitVector BitMap(getLength());
+  unsigned NumRows = NumElements / Width;
+  if (NumRows != 1) {
+    for (unsigned i = 0; i < NumRows; i++) {
+      BitMap <<= (VStride * ElementBytes);
+      BitMap |= RowBitMap;
+    }
+  } else
+    BitMap = std::move(RowBitMap);
+  // Adjust the mask according to the min tracking
+  // offset for comparison.
+  assert(Offset >= MinTrackingOffset);
+  unsigned Diff = Offset - MinTrackingOffset;
+  if (Diff) {
+    BitMap.resize(BitMap.size() + Diff);
+    BitMap <<= Diff;
+  }
+  return BitMap;
+}
+
+// overlap : compare two regions to see whether they overlap each other.
+bool CMRegion::overlap(const CMRegion &R2) const {
+  // To be conservative, if either of them is indirect, assume they overlap.
+  if (Indirect || R2.Indirect)
+    return true;
+  // To be conservative, if different masks are used, assume they overlap.
+  if (Mask != R2.Mask)
+    return true;
+  // Check offsets of regions for intersection.
+  int MaxOffset = std::max(Offset, R2.Offset);
+  int MinEndOffset = std::min(Offset + getLength(), R2.Offset + R2.getLength());
+  if (MaxOffset > MinEndOffset)
+    return false;
+  // Check overlapping using bit masks.
+  int MinOffset = std::min(Offset, R2.Offset);
+  BitVector Mask1 = getAccessBitMap(MinOffset);
+  BitVector Mask2 = R2.getAccessBitMap(MinOffset);
+  // If there are any common bits then these regions overlap.
+  return Mask1.anyCommon(Mask2);
+}
+
+/***********************************************************************
+ * CMRegion::isContiguous : test whether a region is contiguous
+ */
+bool CMRegion::isContiguous() const {
+  return (Width == 1 || Stride == 1) &&
+         (Width == NumElements || VStride == static_cast<int>(Width));
+}
+
+/***********************************************************************
+ * CMRegion::isWhole : test whether a region covers exactly the whole of the
+ *    given type, allowing for the element type being different
+ */
+bool CMRegion::isWhole(Type *Ty) const
+{
+  return isContiguous() && NumElements * ElementBytes * 8
+      == Ty->getPrimitiveSizeInBits();
+}
+
+/***********************************************************************
+ * evaluateConstantRdRegion : evaluate rdregion with constant input
+ */
+Constant *CMRegion::evaluateConstantRdRegion(Constant *Input, bool AllowScalar)
+{
+  assert(!Indirect);
+  if (NumElements != 1)
+    AllowScalar = false;
+  if (Constant *SV = Input->getSplatValue()) {
+    if (AllowScalar)
+      return SV;
+    return ConstantVector::getSplat(NumElements, SV);
+  }
+  auto VT = cast<VectorType>(Input->getType());
+  SmallVector<Constant *, 8> Values;
+  Constant *Undef = UndefValue::get(AllowScalar
+      ? ElementTy : VectorType::get(ElementTy, NumElements));
+  if (isa<UndefValue>(Input))
+    return Undef;
+  unsigned RowIdx = Offset / ElementBytes;
+  unsigned Idx = RowIdx;
+  unsigned NextRow = Width;
+  for (unsigned i = 0; i != NumElements; ++i) {
+    if (i == NextRow) {
+      RowIdx += VStride;
+      Idx = RowIdx;
+    }
+    if (Idx >= VT->getNumElements())
+      return Undef; // out of range index
+    // Get the element value and push it into Values.
+ if (ConstantDataVector *CDV = dyn_cast(Input)) + Values.push_back(CDV->getElementAsConstant(Idx)); + else { + auto CV = cast(Input); + Values.push_back(CV->getOperand(Idx)); + } + Idx += Stride; + } + if (AllowScalar) + return Values[0]; + return ConstantVector::get(Values); +} + +/*********************************************************************** + * evaluateConstantWrRegion : evaluate wrregion with constant inputs + */ +Constant *CMRegion::evaluateConstantWrRegion(Constant *OldVal, Constant *NewVal) +{ + assert(!Indirect); + SmallVector Vec; + for (unsigned i = 0, e = OldVal->getType()->getVectorNumElements(); + i != e; ++i) + Vec.push_back(OldVal->getAggregateElement(i)); + unsigned Off = Offset / ElementBytes, Row = Off; + auto NewVT = dyn_cast(NewVal->getType()); + unsigned NewNumEls = !NewVT ? 1 : NewVT->getNumElements(); + for (unsigned i = 0;;) { + if (Off >= Vec.size()) + return UndefValue::get(OldVal->getType()); // out of range + Vec[Off] = !NewVT ? NewVal : NewVal->getAggregateElement(i); + if (++i == NewNumEls) + break; + if (i % Width) { + Off += Stride; + continue; + } + Row += VStride; + Off = Row; + } + return ConstantVector::get(Vec); +} + +/*********************************************************************** + * CMRegion::changeElementType : change element type of the region + * + * Return: true if succeeded, false if failed (nothing altered) + */ +bool CMRegion::changeElementType(Type *NewElementType) +{ + assert(Offset % ElementBytes == 0 && "Impossible offset (in bytes) for data type"); + unsigned NewElementBytes = NewElementType->getPrimitiveSizeInBits() / 8U; + if (NewElementType->getPrimitiveSizeInBits()) + NewElementBytes = NewElementBytes ? NewElementBytes : 1; + if (NewElementBytes == ElementBytes) { + // No change in element size + ElementTy = NewElementType; + return true; + } + int Ratio = NewElementBytes/ElementBytes; + if (Ratio >= 1) { + // Trying to make the element size bigger. + if (Width & ((1 * Ratio) - 1)) + return false; // width misaligned + if (Stride != 1) + return false; // rows not contiguous + if (Offset % NewElementBytes != 0) + return false; + NumElements = NumElements / Ratio; + Width = Width / Ratio; + VStride = VStride / Ratio; + if (Width == 1) { + // Width is now 1, so turn it into a 1D region. + Stride = VStride; + VStride = 0; + Width = NumElements; + } + ElementTy = NewElementType; + ElementBytes = NewElementBytes; + return true; + } + // Trying to make the element size smaller. + Ratio = ElementBytes / NewElementBytes;; + if (Stride == 1 || Width == 1) { + // Row contiguous. + Stride = 1; + NumElements *= Ratio; + Width *= Ratio; + VStride *= Ratio; + ElementTy = NewElementType; + ElementBytes = NewElementBytes; + return true; + } + if (!is2D()) { + // 1D and not contiguous. Turn it into a 2D region. + VStride = Stride * Ratio; + Stride = 1; + Width = 1 * Ratio; + NumElements *= Ratio; + ElementTy = NewElementType; + ElementBytes = NewElementBytes; + return true; + } + return false; +} + +/*********************************************************************** + * CMRegion::append : append region AR to this region + * + * Return: true if succeeded (this region modified) + * false if not possible to append (this region in indeterminate state) + * + * This succeeds even if it leaves this region in an illegal state where + * it has a non-integral number of rows. After doing a sequence of appends, + * the caller needs to check that the resulting region is legal by calling + * isWholeNumRows(). 
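+ *
+ * For example (editor's illustration, i32 elements, offsets in bytes):
+ * appending a 1D row <0;4,1> at offset 32 to a 1D region <0;4,1> at offset 0
+ * gives the 8-element 2D region <8;4,1>(0), since the second row starts
+ * 8 elements (32 bytes) after the first.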
+ */ +bool CMRegion::append(CMRegion AR) +{ + assert(AR.isWholeNumRows()); + if (Indirect != AR.Indirect) + return false; + unsigned ARNumRows = AR.NumElements / AR.Width; + // Consider each row of AR separately. + for (unsigned ARRow = 0; ARRow != ARNumRows; + ++ARRow, AR.Offset += AR.VStride * AR.ElementBytes) { + if (NumElements == Width) { + // This region is currently 1D. + if (NumElements == 1) + Stride = (AR.Offset - Offset) / ElementBytes; + else if (AR.Width != 1 && Stride != AR.Stride) + return false; // Mismatched stride. + int NextOffset = Offset + Width * Stride * ElementBytes; + if (AR.Offset == NextOffset) { + // AR is a continuation of the same single row. + Width += AR.Width; + NumElements = Width; + continue; + } + // AR is the start (or whole) of a second row. + if (AR.Width > Width) + return false; // AR row is bigger than this row. + VStride = (AR.Offset - Offset) / ElementBytes; + NumElements += AR.Width; + continue; + } + // This region is already 2D. + unsigned ExtraBit = NumElements % Width; + int NextOffset = Offset + ((VStride * (NumElements / Width)) + + ExtraBit) * ElementBytes; + if (NextOffset != AR.Offset) + return false; // Mismatched next offset. + if (AR.Width > Width - ExtraBit) + return false; // Too much to fill whole row, or remainder of row after + // existing extra bit. + if (AR.Width != 1 && AR.Stride != Stride) + return false; // Mismatched stride. + NumElements += AR.Width; + } + return true; +} + +/*********************************************************************** + * Region debug dump/print + */ +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void CMRegion::dump() const +{ + errs() << *this << "\n"; +} +#endif + +void CMRegion::print(raw_ostream &OS) const +{ + OS << *VectorType::get(ElementTy, NumElements) << " <" + << VStride << ";" << Width << "," << Stride << ">("; + if (Indirect) { + OS << Indirect->getName(); + if (auto VT = dyn_cast(Indirect->getType())) + OS << "<" << VT->getNumElements() << ">(" << IndirectIdx << ")"; + OS << " + "; + } + OS << Offset << ")"; + if (Indirect && ParentWidth) + OS << " {parentwidth=" << ParentWidth << "}"; + if (Mask) + OS << " {mask=" << *Mask << "}"; +} + diff --git a/IGC/VectorCompiler/lib/GenXOpts/CMakeLists.txt b/IGC/VectorCompiler/lib/GenXOpts/CMakeLists.txt new file mode 100644 index 000000000000..9e99bece3df0 --- /dev/null +++ b/IGC/VectorCompiler/lib/GenXOpts/CMakeLists.txt @@ -0,0 +1,20 @@ +set(OPT_SOURCES + CMAnalysis/InstructionSimplifyGenX.cpp + CMAnalysis/ConstantFoldingGenX.cpp + CMTrans/CMABI.cpp + CMTrans/CMImpParam.cpp + CMTrans/CMKernelArgOffset.cpp + CMTrans/CMRegion.cpp + CMPacketize/GenXPacketize.cpp + CMPacketize/PacketBuilder.cpp + CMPacketize/PacketBuilder_math.cpp + CMPacketize/PacketBuilder_mem.cpp + CMPacketize/PacketBuilder_misc.cpp + CMPacketize/WIAnalysis.cpp +) + +add_library(VCTransforms ${OPT_SOURCES}) +target_link_libraries(VCTransforms + VCHeaders + LLVMGenXIntrinsics + ) diff --git a/IGC/VectorCompiler/lib/Support/CMakeLists.txt b/IGC/VectorCompiler/lib/Support/CMakeLists.txt new file mode 100644 index 000000000000..6ec90da356db --- /dev/null +++ b/IGC/VectorCompiler/lib/Support/CMakeLists.txt @@ -0,0 +1,11 @@ +set(SUPPORT_SOURCES + Status.cpp + Options.cpp + ) + +add_library(VCSupport ${SUPPORT_SOURCES}) +target_link_libraries(VCSupport + VCHeaders + LLVMSupport + LLVMOption + ) diff --git a/IGC/VectorCompiler/lib/Support/Options.cpp b/IGC/VectorCompiler/lib/Support/Options.cpp new file mode 100644 index 000000000000..a342d26e9a86 --- /dev/null +++ 
b/IGC/VectorCompiler/lib/Support/Options.cpp @@ -0,0 +1,62 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "vc/Support/Options.h" + +#include + +using namespace vc::options; +using namespace llvm::opt; + +#define PREFIX(NAME, VALUE) static const char *const NAME[] = VALUE; +#include "vc/Support/Options.inc" +#undef PREFIX + +static const OptTable::Info InfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + {PREFIX, NAME, HELPTEXT, METAVAR, OPT_##ID, Option::KIND##Class, \ + PARAM, FLAGS, OPT_##GROUP, OPT_##ALIAS, ALIASARGS, VALUES}, +#include "vc/Support/Options.inc" +#undef OPTION +}; + +namespace { +class VCOptTable : public OptTable { +public: + VCOptTable() : OptTable(InfoTable) { + OptTable &Opt = *this; + (void)Opt; +#define OPTTABLE_ARG_INIT +#include "vc/Support/Options.inc" +#undef OPTTABLE_ARG_INIT + } +}; +} // namespace + +static const VCOptTable OptionsTable; + +const OptTable &vc::getOptTable() { return OptionsTable; } diff --git a/IGC/VectorCompiler/lib/Support/Status.cpp b/IGC/VectorCompiler/lib/Support/Status.cpp new file mode 100644 index 000000000000..94c5d3a15e40 --- /dev/null +++ b/IGC/VectorCompiler/lib/Support/Status.cpp @@ -0,0 +1,150 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "vc/Support/Status.h" + +#include "vc/Support/StatusCode.h" +#include "vc/Support/StatusTraits.h" + +#include "llvm/Support/ErrorHandling.h" + +#include +#include + +namespace { +class vc_error_category : public std::error_category { +public: + const char *name() const noexcept override; + std::string message(int condition) const override; +}; +} // namespace + +const char *vc_error_category::name() const noexcept { + return "vector compiler"; +} + +std::string vc_error_category::message(int condition) const { + using namespace vc; + + switch (static_cast(condition)) { + case errc::dynamic_load_fail: + return ErrorTraits::getMessage(); + case errc::symbol_not_found: + return ErrorTraits::getMessage(); + case errc::bad_spirv: + return ErrorTraits::getMessage(); + case errc::bad_bitcode: + return ErrorTraits::getMessage(); + case errc::invalid_module: + return ErrorTraits::getMessage(); + case errc::target_machine_not_created: + return ErrorTraits::getMessage(); + case errc::not_vc_codegen: + return ErrorTraits::getMessage(); + case errc::invalid_api_option: + return ErrorTraits::getMessage(); + case errc::invalid_internal_option: + return ErrorTraits::getMessage(); + } + llvm_unreachable("Unknown error code"); +} + +static vc_error_category vc_err_category; + +namespace vc { + +const std::error_category &err_category() noexcept { return vc_err_category; } + +// DynLoadError {{ +char DynLoadError::ID = 0; + +void DynLoadError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage() << ": " << Message; +} +// }} + +// SymbolLookupError {{ +char SymbolLookupError::ID = 0; + +void SymbolLookupError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage() << ": symbol '" + << Symbol << "' was not found in '" << Library << "'"; +} +// }} + +// BadSpirvError {{ +char BadSpirvError::ID = 0; + +void BadSpirvError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage() << ": " << Message; +} +// }} + +// BadBitcodeError {{ +char BadBitcodeError::ID = 0; + +void BadBitcodeError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage() << ": " << Message; +} +// }} + +// InvalidModuleError {{ +char InvalidModuleError::ID = 0; + +void InvalidModuleError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage(); +} +// }} + +// TargetMachineError {{ +char TargetMachineError::ID = 0; + +void TargetMachineError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage(); +} +// }} + +// NotVCError {{ +char NotVCError::ID = 0; + +void NotVCError::log(llvm::raw_ostream &OS) const { + OS << ErrorTraits::getMessage(); +} +// }} + +// OptionErrorCommon {{ +char OptionError::ID = 0; + +void OptionError::log(llvm::raw_ostream &OS) const { + if (IsInternal) + OS << ErrorTraits::getMessage(); + else + OS << ErrorTraits::getMessage(); + OS << ": " << BadOption; +} +// }} + +} // namespace vc diff --git a/IGC/VectorCompiler/spirv-patches-new/0001-Add-common-OCL-address-spaces-for-VectorCompute-glob.patch b/IGC/VectorCompiler/spirv-patches-new/0001-Add-common-OCL-address-spaces-for-VectorCompute-glob.patch new file mode 100644 index 000000000000..80fab282a3a7 --- /dev/null +++ 
b/IGC/VectorCompiler/spirv-patches-new/0001-Add-common-OCL-address-spaces-for-VectorCompute-glob.patch @@ -0,0 +1,40 @@ +From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001 +From: nrudenko +Date: Wed, 24 Jun 2020 16:58:30 +0300 +Subject: [PATCH 1/4] Add common OCL address spaces for VectorCompute globals + This commit allows to use UniformConstant and CrossWorkgroup storage classes + for VectorCompute globals + +--- + lib/SPIRV/VectorComputeUtil.cpp | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/lib/SPIRV/VectorComputeUtil.cpp b/lib/SPIRV/VectorComputeUtil.cpp +index 2c68aa5..26d424f 100755 +--- a/lib/SPIRV/VectorComputeUtil.cpp ++++ b/lib/SPIRV/VectorComputeUtil.cpp +@@ -120,6 +120,10 @@ getVCGlobalVarStorageClass(SPIRAddressSpace AddressSpace) noexcept { + return StorageClassPrivate; + case SPIRAS_Local: + return StorageClassWorkgroup; ++ case SPIRAS_Global: ++ return StorageClassCrossWorkgroup; ++ case SPIRAS_Constant: ++ return StorageClassUniformConstant; + default: + assert(false && "Unexpected address space"); + return StorageClassPrivate; +@@ -133,6 +137,10 @@ getVCGlobalVarAddressSpace(SPIRVStorageClassKind StorageClass) noexcept { + return SPIRAS_Private; + case StorageClassWorkgroup: + return SPIRAS_Local; ++ case StorageClassCrossWorkgroup: ++ return SPIRAS_Global; ++ case StorageClassUniformConstant: ++ return SPIRAS_Constant; + default: + assert(false && "Unexpected storage class"); + return SPIRAS_Private; +-- +2.17.1 + diff --git a/IGC/VectorCompiler/spirv-patches-new/0002-Add-DecorationFuncParamKindINTEL-and-DecorationFuncP.patch b/IGC/VectorCompiler/spirv-patches-new/0002-Add-DecorationFuncParamKindINTEL-and-DecorationFuncP.patch new file mode 100644 index 000000000000..9998533a22c4 --- /dev/null +++ b/IGC/VectorCompiler/spirv-patches-new/0002-Add-DecorationFuncParamKindINTEL-and-DecorationFuncP.patch @@ -0,0 +1,198 @@ +From ac92d249ea1177c7bee2b2aa7861267f775ff0db Mon Sep 17 00:00:00 2001 +From: nrudenko +Date: Thu, 4 Jun 2020 16:34:15 +0300 +Subject: [PATCH 2/4] Add DecorationFuncParamKindINTEL and + DecorationFuncParamDescINTEL + +Change-Id: Ic90237386532736588c558c7479370e48af7ce87 +--- + lib/SPIRV/SPIRVReader.cpp | 12 ++++++++++++ + lib/SPIRV/SPIRVWriter.cpp | 13 +++++++++++++ + lib/SPIRV/VectorComputeUtil.h | 2 ++ + lib/SPIRV/libSPIRV/SPIRVDecorate.cpp | 12 ++++++++++++ + lib/SPIRV/libSPIRV/SPIRVDecorate.h | 9 +++++++++ + lib/SPIRV/libSPIRV/SPIRVEnum.h | 2 ++ + lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h | 2 ++ + lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h | 2 ++ + lib/SPIRV/libSPIRV/spirv.hpp | 2 ++ + 9 files changed, 56 insertions(+) + +diff --git a/lib/SPIRV/SPIRVReader.cpp b/lib/SPIRV/SPIRVReader.cpp +index 210c741..8dc5784 100644 +--- a/lib/SPIRV/SPIRVReader.cpp ++++ b/lib/SPIRV/SPIRVReader.cpp +@@ -3362,6 +3362,18 @@ bool SPIRVToLLVM::transVectorComputeMetadata(SPIRVFunction *BF) { + std::to_string(Kind)); + F->addAttribute(ArgNo + 1, Attr); + } ++ if (BA->hasDecorate(DecorationFuncParamKindINTEL, 0, &Kind)) { ++ Attribute Attr = Attribute::get(*Context, kVCMetadata::VCArgumentKind, ++ std::to_string(Kind)); ++ F->addAttribute(ArgNo + 1, Attr); ++ } ++ if (BA->hasDecorate(DecorationFuncParamDescINTEL)) { ++ auto Desc = ++ BA->getDecorationStringLiteral(DecorationFuncParamDescINTEL).front(); ++ Attribute Attr = ++ Attribute::get(*Context, kVCMetadata::VCArgumentDesc, Desc); ++ F->addAttribute(ArgNo + 1, Attr); ++ } + } + + // Do not add float control if there is no any +diff --git a/lib/SPIRV/SPIRVWriter.cpp b/lib/SPIRV/SPIRVWriter.cpp 
+index 3f569ff..670ba1a 100644 +--- a/lib/SPIRV/SPIRVWriter.cpp ++++ b/lib/SPIRV/SPIRVWriter.cpp +@@ -567,6 +567,19 @@ void LLVMToSPIRV::transVectorComputeMetadata(Function *F) { + .getAsInteger(0, Kind); + BA->addDecorate(DecorationFuncParamIOKind, Kind); + } ++ if (Attrs.hasAttribute(ArgNo + 1, kVCMetadata::VCArgumentKind)) { ++ SPIRVWord Kind; ++ Attrs.getAttribute(ArgNo + 1, kVCMetadata::VCArgumentKind) ++ .getValueAsString() ++ .getAsInteger(0, Kind); ++ BA->addDecorate(DecorationFuncParamKindINTEL, Kind); ++ } ++ if (Attrs.hasAttribute(ArgNo + 1, kVCMetadata::VCArgumentDesc)) { ++ StringRef Desc = ++ Attrs.getAttribute(ArgNo + 1, kVCMetadata::VCArgumentDesc) ++ .getValueAsString(); ++ BA->addDecorate(new SPIRVDecorateFuncParamDescAttr(BA, Desc.str())); ++ } + } + } + +diff --git a/lib/SPIRV/VectorComputeUtil.h b/lib/SPIRV/VectorComputeUtil.h +index 08d2129..f215b2d 100755 +--- a/lib/SPIRV/VectorComputeUtil.h ++++ b/lib/SPIRV/VectorComputeUtil.h +@@ -116,6 +116,8 @@ const static char VCSLMSize[] = "VCSLMSize"; + const static char VCGlobalVariable[] = "VCGlobalVariable"; + const static char VCVolatile[] = "VCVolatile"; + const static char VCByteOffset[] = "VCByteOffset"; ++const static char VCArgumentKind[] = "VCArgumentKind"; ++const static char VCArgumentDesc[] = "VCArgumentDesc"; + } // namespace kVCMetadata + + /////////////////////////////////////////////////////////////////////////////// +diff --git a/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp b/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp +index 4af7e2c..3e8d4fe 100644 +--- a/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp ++++ b/lib/SPIRV/libSPIRV/SPIRVDecorate.cpp +@@ -104,6 +104,9 @@ void SPIRVDecorate::encode(spv_ostream &O) const { + case DecorationUserSemantic: + SPIRVDecorateUserSemanticAttr::encodeLiterals(Encoder, Literals); + break; ++ case DecorationFuncParamDescINTEL: ++ SPIRVDecorateFuncParamDescAttr::encodeLiterals(Encoder, Literals); ++ break; + default: + Encoder << Literals; + } +@@ -130,6 +133,9 @@ void SPIRVDecorate::decode(std::istream &I) { + case DecorationUserSemantic: + SPIRVDecorateUserSemanticAttr::decodeLiterals(Decoder, Literals); + break; ++ case DecorationFuncParamDescINTEL: ++ SPIRVDecorateFuncParamDescAttr::decodeLiterals(Decoder, Literals); ++ break; + default: + Decoder >> Literals; + } +@@ -149,6 +155,9 @@ void SPIRVMemberDecorate::encode(spv_ostream &O) const { + case DecorationUserSemantic: + SPIRVDecorateUserSemanticAttr::encodeLiterals(Encoder, Literals); + break; ++ case DecorationFuncParamDescINTEL: ++ SPIRVDecorateFuncParamDescAttr::encodeLiterals(Encoder, Literals); ++ break; + default: + Encoder << Literals; + } +@@ -172,6 +181,9 @@ void SPIRVMemberDecorate::decode(std::istream &I) { + case DecorationUserSemantic: + SPIRVDecorateUserSemanticAttr::decodeLiterals(Decoder, Literals); + break; ++ case DecorationFuncParamDescINTEL: ++ SPIRVDecorateFuncParamDescAttr::decodeLiterals(Decoder, Literals); ++ break; + default: + Decoder >> Literals; + } +diff --git a/lib/SPIRV/libSPIRV/SPIRVDecorate.h b/lib/SPIRV/libSPIRV/SPIRVDecorate.h +index ea816ba..23eca12 100644 +--- a/lib/SPIRV/libSPIRV/SPIRVDecorate.h ++++ b/lib/SPIRV/libSPIRV/SPIRVDecorate.h +@@ -407,6 +407,15 @@ public: + : SPIRVDecorateStrAttrBase(TheTarget, AnnotateString) {} + }; + ++class SPIRVDecorateFuncParamDescAttr ++ : public SPIRVDecorateStrAttrBase { ++public: ++ // Complete constructor for UserSemantic decoration ++ SPIRVDecorateFuncParamDescAttr(SPIRVEntry *TheTarget, ++ const std::string &AnnotateString) ++ : SPIRVDecorateStrAttrBase(TheTarget, 
AnnotateString) {} ++}; ++ + class SPIRVDecorateMergeINTELAttr : public SPIRVDecorate { + public: + // Complete constructor for MergeINTEL decoration +diff --git a/lib/SPIRV/libSPIRV/SPIRVEnum.h b/lib/SPIRV/libSPIRV/SPIRVEnum.h +index 0b65093..c653016 100644 +--- a/lib/SPIRV/libSPIRV/SPIRVEnum.h ++++ b/lib/SPIRV/libSPIRV/SPIRVEnum.h +@@ -392,6 +392,8 @@ template <> inline void SPIRVMap::init() { + {CapabilityVectorComputeINTEL}); + ADD_VEC_INIT(DecorationFuncParamIOKind, {CapabilityVectorComputeINTEL}); + ADD_VEC_INIT(DecorationStackCallINTEL, {CapabilityVectorComputeINTEL}); ++ ADD_VEC_INIT(DecorationFuncParamKindINTEL, {CapabilityVectorComputeINTEL}); ++ ADD_VEC_INIT(DecorationFuncParamDescINTEL, {CapabilityVectorComputeINTEL}); + } + + template <> inline void SPIRVMap::init() { +diff --git a/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h b/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h +index 09b9b8a..78c7925 100644 +--- a/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h ++++ b/lib/SPIRV/libSPIRV/SPIRVIsValidEnum.h +@@ -422,6 +422,8 @@ inline bool isValid(spv::Decoration V) { + case DecorationReferencedIndirectlyINTEL: + case DecorationVectorComputeFunctionINTEL: + case DecorationStackCallINTEL: ++ case DecorationFuncParamKindINTEL: ++ case DecorationFuncParamDescINTEL: + case DecorationVectorComputeVariableINTEL: + case DecorationGlobalVariableOffsetINTEL: + case DecorationFuncParamIOKind: +diff --git a/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h b/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h +index 867c9c1..077b662 100644 +--- a/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h ++++ b/lib/SPIRV/libSPIRV/SPIRVNameMapEnum.h +@@ -361,6 +361,8 @@ template <> inline void SPIRVMap::init() { + add(DecorationReferencedIndirectlyINTEL, "ReferencedIndirectlyINTEL"); + add(DecorationVectorComputeFunctionINTEL, "VectorComputeFunctionINTEL"); + add(DecorationStackCallINTEL, "StackCallINTEL"); ++ add(DecorationFuncParamKindINTEL, "FuncParamKindINTEL"); ++ add(DecorationFuncParamDescINTEL, "FuncParamDescINTEL"); + add(DecorationVectorComputeVariableINTEL, "VectorComputeVariableINTEL"); + add(DecorationGlobalVariableOffsetINTEL, "GlobalVariableOffsetINTEL"); + add(DecorationFuncParamIOKind, "FuncParamIOKind"); +diff --git a/lib/SPIRV/libSPIRV/spirv.hpp b/lib/SPIRV/libSPIRV/spirv.hpp +index d0f5f75..bfc92ef 100644 +--- a/lib/SPIRV/libSPIRV/spirv.hpp ++++ b/lib/SPIRV/libSPIRV/spirv.hpp +@@ -475,6 +475,8 @@ enum Decoration { + DecorationRestrictPointerEXT = 5355, + DecorationAliasedPointer = 5356, + DecorationAliasedPointerEXT = 5356, ++ DecorationFuncParamKindINTEL = 9624, ++ DecorationFuncParamDescINTEL = 9625, + DecorationReferencedIndirectlyINTEL = 5602, + DecorationSideEffectsINTEL = 5608, + DecorationVectorComputeVariableINTEL = 5624, +-- +2.17.1 + diff --git a/IGC/VectorCompiler/spirv-patches-new/0003-Add-SPIRVDLL-and-VCExport.patch b/IGC/VectorCompiler/spirv-patches-new/0003-Add-SPIRVDLL-and-VCExport.patch new file mode 100644 index 000000000000..a3a466a90230 --- /dev/null +++ b/IGC/VectorCompiler/spirv-patches-new/0003-Add-SPIRVDLL-and-VCExport.patch @@ -0,0 +1,216 @@ +From b727c486069844db240b3dead3fe92064b840724 Mon Sep 17 00:00:00 2001 +From: nrudenko +Date: Thu, 4 Jun 2020 15:20:43 +0300 +Subject: [PATCH 3/4] Add SPIRVDLL and VCExport + +Change-Id: I8a541ad383b18fd1b14e75f431e034dc10db6817 +--- + lib/SPIRV/CMakeLists.txt | 41 +++++++++++++++++- + lib/SPIRV/VCExport.cpp | 89 ++++++++++++++++++++++++++++++++++++++++ + lib/SPIRV/VCExport.h | 28 +++++++++++++ + 3 files changed, 157 insertions(+), 1 deletion(-) + mode change 100644 => 100755 
lib/SPIRV/CMakeLists.txt + create mode 100755 lib/SPIRV/VCExport.cpp + create mode 100755 lib/SPIRV/VCExport.h + +diff --git a/lib/SPIRV/CMakeLists.txt b/lib/SPIRV/CMakeLists.txt +old mode 100644 +new mode 100755 +index 92ba12a..5f5b072 +--- a/lib/SPIRV/CMakeLists.txt ++++ b/lib/SPIRV/CMakeLists.txt +@@ -1,4 +1,5 @@ +-add_llvm_library(LLVMSPIRVLib ++set(SPIRV_SOURCES ++ VCExport.cpp + LLVMToSPIRVDbgTran.cpp + Mangler/FunctionDescriptor.cpp + Mangler/Mangler.cpp +@@ -34,6 +35,10 @@ add_llvm_library(LLVMSPIRVLib + libSPIRV/SPIRVStream.cpp + libSPIRV/SPIRVType.cpp + libSPIRV/SPIRVValue.cpp ++) ++ ++add_llvm_library(LLVMSPIRVLib ++ ${SPIRV_SOURCES} + LINK_COMPONENTS + Analysis + BitWriter +@@ -44,6 +49,31 @@ add_llvm_library(LLVMSPIRVLib + intrinsics_gen + ) + ++# --- mock: add_llvm_library(SPIRVDLL MODULE --- ++# unfortunately this do not work for llvm build system as is so some magic below ++ ++add_library(SPIRVDLL MODULE ++ ${SPIRV_SOURCES} ++) ++ ++llvm_update_compile_flags(SPIRVDLL) ++ ++add_dependencies(SPIRVDLL intrinsics_gen LLVMAnalysis LLVMBitWriter LLVMCore LLVMSupport LLVMTransformUtils) ++target_link_libraries(SPIRVDLL LLVMAnalysis LLVMBitWriter LLVMCore LLVMSupport LLVMTransformUtils) ++ ++install(TARGETS SPIRVDLL ++ EXPORT LLVMExports ++ LIBRARY DESTINATION lib ++ COMPONENT SPIRVDLL) ++ ++add_llvm_install_targets(install-SPIRVDLL ++ DEPENDS SPIRVDLL ++ COMPONENT SPIRVDLL) ++ ++set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS SPIRVDLL) ++ ++# --- end mock --- ++ + target_include_directories(LLVMSPIRVLib + PRIVATE + ${LLVM_INCLUDE_DIRS} +@@ -52,3 +82,12 @@ target_include_directories(LLVMSPIRVLib + ${CMAKE_CURRENT_SOURCE_DIR}/libSPIRV + ${CMAKE_CURRENT_SOURCE_DIR}/Mangler + ) ++ ++target_include_directories(SPIRVDLL ++ PRIVATE ++ ${LLVM_INCLUDE_DIRS} ++ ${LLVM_SPIRV_INCLUDE_DIRS} ++ ${CMAKE_CURRENT_SOURCE_DIR} ++ ${CMAKE_CURRENT_SOURCE_DIR}/libSPIRV ++ ${CMAKE_CURRENT_SOURCE_DIR}/Mangler ++) +diff --git a/lib/SPIRV/VCExport.cpp b/lib/SPIRV/VCExport.cpp +new file mode 100755 +index 0000000..e8893e1 +--- /dev/null ++++ b/lib/SPIRV/VCExport.cpp +@@ -0,0 +1,89 @@ ++//===- VCExport.cpp - dll interface for SPIRV implementation -*- C++ -*----===// ++// ++// The LLVM/SPIR-V Translator ++// ++//===----------------------------------------------------------------------===// ++// ++// This file implements dll interface of SPIRV translator ++// ++//===----------------------------------------------------------------------===// ++ ++#include ++#include ++#include ++#include ++ ++#include "LLVMSPIRVLib.h" ++#include "SPIRVInternal.h" ++#include "VCExport.h" ++#include "llvm/Bitcode/BitcodeReader.h" ++#include "llvm/Bitcode/BitcodeWriter.h" ++#include "llvm/IR/Module.h" ++#include "llvm/IR/Verifier.h" ++#include "llvm/Support/MemoryBuffer.h" ++ ++SPIRV::TranslatorOpts GetTranslatorOpts() { ++ std::map ExtensionNamesMap; ++#define _STRINGIFY(X) #X ++#define STRINGIFY(X) _STRINGIFY(X) ++#define EXT(X) ExtensionNamesMap[STRINGIFY(X)] = ExtensionID::X; ++#include "LLVMSPIRVExtensions.inc" ++#undef EXT ++#undef STRINGIFY ++#undef _STRINGIFY ++ ++ SPIRV::TranslatorOpts::ExtensionsStatusMap ExtensionsStatus; ++ // Set the initial state: ++ // - during SPIR-V consumption, assume that any known extension is allowed. ++ // - during SPIR-V generation, assume that any known extension is disallowed. ++ // - during conversion to/from SPIR-V text representation, assume that any ++ // known extension is allowed. 
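++  // The loop below then simply marks every known extension as enabled for
++  // this translator build.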
++ for (const auto &It : ExtensionNamesMap) ++ ExtensionsStatus[It.second] = true; ++ SPIRV::TranslatorOpts Opts(VersionNumber::MaximumVersion, ExtensionsStatus); ++ Opts.setFPContractMode(SPIRV::FPContractMode::On); ++ Opts.setDesiredBIsRepresentation(SPIRV::BIsRepresentation::SPIRVFriendlyIR); ++ return Opts; ++} ++ ++int spirv_read_verify_module( ++ const char *pIn, size_t InSz, ++ void (*OutSaver)(const char *pOut, size_t OutSize, void *OutUserData), ++ void *OutUserData, void (*ErrSaver)(const char *pErrMsg, void *ErrUserData), ++ void *ErrUserData) { ++ LLVMContext Context; ++ StringRef SpirvInput = StringRef(pIn, InSz); ++ std::istringstream IS(SpirvInput.str()); ++ ++ std::unique_ptr M; ++ { ++ llvm::Module *SpirM; ++ std::string ErrMsg; ++ auto Opts = GetTranslatorOpts(); ++ // This returns true on success... ++ bool Status = llvm::readSpirv(Context, Opts, IS, SpirM, ErrMsg); ++ if (!Status) { ++ std::ostringstream OSS; ++ OSS << "spirv_read_verify: readSpirv failed: " << ErrMsg; ++ ErrSaver(OSS.str().c_str(), ErrUserData); ++ return -1; ++ } ++ ++ Status = llvm::verifyModule(*SpirM); ++ if (Status) { ++ ErrSaver("spirv_read_verify: verify Module failed", ErrUserData); ++ return -1; ++ } ++ ++ M.reset(SpirM); ++ } ++ ++ llvm::SmallVector CloneBuffer; ++ llvm::raw_svector_ostream CloneOstream(CloneBuffer); ++ WriteBitcodeToFile(*M, CloneOstream); ++ ++ assert(CloneBuffer.size() > 0); ++ ++ OutSaver(CloneBuffer.data(), CloneBuffer.size(), OutUserData); ++ return 0; ++} +diff --git a/lib/SPIRV/VCExport.h b/lib/SPIRV/VCExport.h +new file mode 100755 +index 0000000..3b989ed +--- /dev/null ++++ b/lib/SPIRV/VCExport.h +@@ -0,0 +1,28 @@ ++//===- VCExport.h - Adding possibility to build spirv as a dll -*- C++ -*-===// ++// ++// The LLVM/SPIR-V Translator ++// ++//===----------------------------------------------------------------------===// ++// ++// This file is kind of a temporal solution ++// We need to live in separate DLL while IGC default SPIRV is not ready ++// ++//===----------------------------------------------------------------------===// ++ ++#ifndef SPIRV_VCEXPORT_H ++#define SPIRV_VCEXPORT_H ++ ++#ifdef _WIN32 ++#define __EXPORT__ __declspec(dllexport) ++#else ++#define __EXPORT__ __attribute__((visibility("default"))) ++#endif ++ ++// Returns zero on success. 
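++// Typical call (hypothetical caller, not part of this patch):
++//   spirv_read_verify_module(spv.data(), spv.size(),
++//                            saveBitcode, &bitcodeBuffer,
++//                            saveError, &errorMessage);
++// On success OutSaver receives the serialized LLVM bitcode exactly once and
++// zero is returned; on failure ErrSaver receives a diagnostic message and a
++// negative value is returned.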
++extern "C" __EXPORT__ int spirv_read_verify_module( ++ const char *pIn, size_t InSz, ++ void (*OutSaver)(const char *pOut, size_t OutSize, void *OutUserData), ++ void *OutUserData, void (*ErrSaver)(const char *pErrMsg, void *ErrUserData), ++ void *ErrUserData); ++ ++#endif // SPIRV_VCEXPORT_H +-- +2.17.1 + diff --git a/IGC/VectorCompiler/spirv-patches-new/0004-Remove-LLVMSPIRVLib-from-targets-Rename-tool-llvm-sp.patch b/IGC/VectorCompiler/spirv-patches-new/0004-Remove-LLVMSPIRVLib-from-targets-Rename-tool-llvm-sp.patch new file mode 100644 index 000000000000..b9deb556bf91 --- /dev/null +++ b/IGC/VectorCompiler/spirv-patches-new/0004-Remove-LLVMSPIRVLib-from-targets-Rename-tool-llvm-sp.patch @@ -0,0 +1,107 @@ +From 7be7da38da84bd1a5af4e881f8ff3d0a590b8326 Mon Sep 17 00:00:00 2001 +From: nrudenko +Date: Thu, 11 Jun 2020 15:58:34 +0300 +Subject: [PATCH 4/4] Remove LLVMSPIRVLib from targets Rename tool llvm-spirv + to llvm-spirv-vc + +--- + lib/SPIRV/CMakeLists.txt | 21 --------------------- + test/CMakeLists.txt | 12 ------------ + tools/llvm-spirv/CMakeLists.txt | 8 ++------ + 3 files changed, 2 insertions(+), 39 deletions(-) + +diff --git a/lib/SPIRV/CMakeLists.txt b/lib/SPIRV/CMakeLists.txt +index 5f5b072..7a54f61 100755 +--- a/lib/SPIRV/CMakeLists.txt ++++ b/lib/SPIRV/CMakeLists.txt +@@ -37,18 +37,6 @@ set(SPIRV_SOURCES + libSPIRV/SPIRVValue.cpp + ) + +-add_llvm_library(LLVMSPIRVLib +- ${SPIRV_SOURCES} +- LINK_COMPONENTS +- Analysis +- BitWriter +- Core +- Support +- TransformUtils +- DEPENDS +- intrinsics_gen +-) +- + # --- mock: add_llvm_library(SPIRVDLL MODULE --- + # unfortunately this do not work for llvm build system as is so some magic below + +@@ -74,15 +62,6 @@ set_property(GLOBAL APPEND PROPERTY LLVM_EXPORTS SPIRVDLL) + + # --- end mock --- + +-target_include_directories(LLVMSPIRVLib +- PRIVATE +- ${LLVM_INCLUDE_DIRS} +- ${LLVM_SPIRV_INCLUDE_DIRS} +- ${CMAKE_CURRENT_SOURCE_DIR} +- ${CMAKE_CURRENT_SOURCE_DIR}/libSPIRV +- ${CMAKE_CURRENT_SOURCE_DIR}/Mangler +-) +- + target_include_directories(SPIRVDLL + PRIVATE + ${LLVM_INCLUDE_DIRS} +diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt +index 3348c03..99c3a63 100644 +--- a/test/CMakeLists.txt ++++ b/test/CMakeLists.txt +@@ -66,14 +66,6 @@ if(NOT LLVM_SPIRV_BUILD_EXTERNAL) + endif(NOT LLVM_SPIRV_BUILD_EXTERNAL) + + +-add_lit_testsuite(check-llvm-spirv "Running the LLVM-SPIRV regression tests" +- ${CMAKE_CURRENT_BINARY_DIR} +- ARGS +- --verbose +- DEPENDS +- ${LLVM_SPIRV_TEST_DEPS} +- llvm-spirv +-) + + # to enable a custom test target on cmake below 3.11 + # starting with 3.11 "test" is only reserved if ENABLE_TESTING(ON) +@@ -82,9 +74,5 @@ if(LLVM_SPIRV_BUILD_EXTERNAL) + if(POLICY CMP0037 AND ${CMAKE_VERSION} VERSION_LESS "3.11.0") + cmake_policy(SET CMP0037 OLD) + endif(POLICY CMP0037 AND ${CMAKE_VERSION} VERSION_LESS "3.11.0") +- add_custom_target(test +- DEPENDS +- check-llvm-spirv +- ) + cmake_policy(POP) + endif(LLVM_SPIRV_BUILD_EXTERNAL) +diff --git a/tools/llvm-spirv/CMakeLists.txt b/tools/llvm-spirv/CMakeLists.txt +index 9aa96d9..3130b92 100644 +--- a/tools/llvm-spirv/CMakeLists.txt ++++ b/tools/llvm-spirv/CMakeLists.txt +@@ -8,17 +8,13 @@ set(LLVM_LINK_COMPONENTS + TransformUtils + ) + +-add_llvm_tool(llvm-spirv ++add_llvm_tool(llvm-spirv-vc + llvm-spirv.cpp + # llvm_setup_rpath messes with the rpath making llvm-spirv not executable from the build directory + NO_INSTALL_RPATH + ) + +-if (LLVM_SPIRV_BUILD_EXTERNAL) +- target_link_libraries(llvm-spirv PRIVATE LLVMSPIRVLib) +-endif() +- 
+-target_include_directories(llvm-spirv ++target_include_directories(llvm-spirv-vc + PRIVATE + ${LLVM_INCLUDE_DIRS} + ${LLVM_SPIRV_INCLUDE_DIRS} +-- +2.17.1 + diff --git a/IGC/VectorCompiler/tests/vctest_config.yml b/IGC/VectorCompiler/tests/vctest_config.yml new file mode 100644 index 000000000000..3e9f97e61bee --- /dev/null +++ b/IGC/VectorCompiler/tests/vctest_config.yml @@ -0,0 +1,2 @@ +--- +version: 64 diff --git a/IGC/VectorCompiler/unittests/CMakeLists.txt b/IGC/VectorCompiler/unittests/CMakeLists.txt new file mode 100644 index 000000000000..d57a09609259 --- /dev/null +++ b/IGC/VectorCompiler/unittests/CMakeLists.txt @@ -0,0 +1,9 @@ +add_custom_target(GenXUnitTests) +set_target_properties(GenXUnitTests PROPERTIES FOLDER "GenXTests") + +function(add_genx_unittest test_dirname) + add_unittest(GenXUnitTests ${test_dirname} ${ARGN}) +endfunction() + +add_subdirectory(SPIRVConversions) +add_subdirectory(Regions) diff --git a/IGC/VectorCompiler/unittests/Regions/CMakeLists.txt b/IGC/VectorCompiler/unittests/Regions/CMakeLists.txt new file mode 100644 index 000000000000..11e7cd4e101b --- /dev/null +++ b/IGC/VectorCompiler/unittests/Regions/CMakeLists.txt @@ -0,0 +1,17 @@ +set(LLVM_LINK_COMPONENTS + Core + Support + CodeGen + GenXCodeGen + GenXOpts + ) + +add_genx_unittest(RegionsTests + OverlapTest.cpp + ) + + +target_include_directories(RegionsTests PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/../../lib/GenXCodeGen") +target_link_libraries(RegionsTests PRIVATE LLVMTestingSupport) + + diff --git a/IGC/VectorCompiler/unittests/Regions/OverlapTest.cpp b/IGC/VectorCompiler/unittests/Regions/OverlapTest.cpp new file mode 100644 index 000000000000..8f95de6470a1 --- /dev/null +++ b/IGC/VectorCompiler/unittests/Regions/OverlapTest.cpp @@ -0,0 +1,81 @@ +//===- llvm/unittest/GenXIntrinsics/GenXIntrinsicsTest.cpp - --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
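+//
+// The test below builds pairs of genx::Region descriptors (Offset, Width,
+// NumElements, Stride, VStride) and checks Region::overlap() on overlapping
+// and on disjoint regions.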
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" + +#include "GenXRegion.h" + +#include "gtest/gtest.h" + +using namespace llvm; + +namespace { +TEST(GenXCodeGen, RegionOverlapping) { + LLVMContext Context; + + genx::Region R1(VectorType::get(Type::getDoubleTy(Context), 16)); + R1.VStride = 0; + R1.NumElements = R1.Width = 16; + R1.Stride = 1; + R1.Offset = 128; + genx::Region R2(VectorType::get(Type::getDoubleTy(Context), 8)); + R2.VStride = 0; + R2.NumElements = R2.Width = 8; + R2.Stride = 1; + R2.Offset = 192; + EXPECT_EQ(R1.overlap(R2), true); + R2.Offset = 256; + EXPECT_EQ(R2.overlap(R1), false); + + genx::Region R3(VectorType::get(Type::getInt32Ty(Context), 4)); + R3.VStride = 2; + R3.NumElements = 8; + R3.Width = 1; + R3.Stride = 0; + R3.Offset = 0; + genx::Region R4(R3); + EXPECT_EQ(R3.overlap(R4), true); + R4.Offset = R4.ElementBytes; + EXPECT_EQ(R3.overlap(R4), false); + R4.Offset = R4.ElementBytes * 2; + EXPECT_EQ(R3.overlap(R4), true); + R4.Offset = 6; + EXPECT_EQ(R3.overlap(R4), true); + + genx::Region R5(VectorType::get(Type::getInt16Ty(Context), 4)); + R5.VStride = 8; + R5.NumElements = 4; + R5.Width = 2; + R5.Stride = 1; + R5.Offset = 0; + genx::Region R6(R5); + R6.Offset = R6.ElementBytes; + EXPECT_EQ(R5.overlap(R6), true); + R6.Offset = R6.ElementBytes * 2; + EXPECT_EQ(R5.overlap(R6), false); + + genx::Region R7(VectorType::get(Type::getDoubleTy(Context), 128)); + R7.VStride = 32; + R7.NumElements = 128; + R7.Width = 8; + R7.Stride = 2; + R7.Offset = 0; + genx::Region R8(VectorType::get(Type::getInt32Ty(Context), 256)); + R8.VStride = 1; + R8.Width = R8.NumElements = 128; + R8.Stride = 4; + R8.Offset = R7.ElementBytes; + EXPECT_EQ(R7.overlap(R8), false); + R8.Offset--; + EXPECT_EQ(R7.overlap(R8), true); +} + +} // namespace diff --git a/IGC/VectorCompiler/unittests/SPIRVConversions/CMakeLists.txt b/IGC/VectorCompiler/unittests/SPIRVConversions/CMakeLists.txt new file mode 100644 index 000000000000..f9a678d76937 --- /dev/null +++ b/IGC/VectorCompiler/unittests/SPIRVConversions/CMakeLists.txt @@ -0,0 +1,16 @@ +set(LLVM_LINK_COMPONENTS + Core + Support + CodeGen + GenXIntrinsics + SPIRVLib + ) + +add_genx_unittest(SPIRVConversionsTests + SPIRVConversionsTest.cpp + ) + +target_include_directories(SPIRVConversionsTests PRIVATE ${SPIRV_INCLUDE_DIR}) +target_link_libraries(SPIRVConversionsTests PRIVATE LLVMTestingSupport) + + diff --git a/IGC/VectorCompiler/unittests/SPIRVConversions/SPIRVConversionsTest.cpp b/IGC/VectorCompiler/unittests/SPIRVConversions/SPIRVConversionsTest.cpp new file mode 100644 index 000000000000..b08ee54c4db8 --- /dev/null +++ b/IGC/VectorCompiler/unittests/SPIRVConversions/SPIRVConversionsTest.cpp @@ -0,0 +1,255 @@ +/*===================== begin_copyright_notice ================================== + +Copyright (c) 2017 Intel Corporation + +Permission is hereby granted, free of charge, to any person obtaining a +copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be included +in all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +======================= end_copyright_notice ==================================*/ + +#include "llvm/ADT/StringRef.h" +#include "llvm/GenXIntrinsics/GenXIntrinsics.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/Error.h" + +#include "LLVMSPIRVLib.h" +#include "llvm/Target/TargetMachine.h" + +#include "gtest/gtest.h" + + +#include +#include + +using namespace llvm; + +namespace { + +static GenXIntrinsic::ID BeginGenXID = llvm::GenXIntrinsic::genx_3d_load; +static GenXIntrinsic::ID EndGenXID = llvm::GenXIntrinsic::genx_zzzzend; + +// Currently returns some fixed types. +Type *generateAnyType(Intrinsic::IITDescriptor::ArgKind AK, LLVMContext &Ctx) { + using namespace Intrinsic; + + switch (AK) { + case IITDescriptor::AK_Any: + case IITDescriptor::AK_AnyInteger: + return Type::getInt32Ty(Ctx); + case IITDescriptor::AK_AnyFloat: + return Type::getDoubleTy(Ctx); + case IITDescriptor::AK_AnyPointer: + return Type::getInt32PtrTy(Ctx); + case IITDescriptor::AK_AnyVector: + return VectorType::get(Type::getInt32Ty(Ctx), 8); + } + llvm_unreachable("All types should be handled"); +} + +void generateOverloadedTypes(GenXIntrinsic::ID Id, LLVMContext &Ctx, + SmallVectorImpl &Tys) { + using namespace Intrinsic; + + SmallVector Table; + GenXIntrinsic::getIntrinsicInfoTableEntries(Id, Table); + + for (unsigned i = 0, e = Table.size(); i != e; ++i) { + auto Desc = Table[i]; + if (Desc.Kind != IITDescriptor::Argument) + continue; + + size_t ArgNum = Desc.getArgumentNumber(); + Tys.resize(std::max(ArgNum + 1, Tys.size())); + + Tys[ArgNum] = generateAnyType(Desc.getArgumentKind(), Ctx); + } +} + +static std::string ty2s(Type* ty) { + std::string type_str; + llvm::raw_string_ostream rso(type_str); + ty->print(rso, true); + return rso.str(); +} +static std::string k2s(std::map& s, + Attribute::AttrKind kkk) { + for (const auto& i: s) { + if (i.second == kkk) + return i.first; + } + return "n/a"; +} +class SpirvConvertionsTest : public testing::Test { +protected: + void SetUp() override { + M_.reset(new Module("Test_Module", Ctx_)); + M_->setTargetTriple("spir64-unknown-unknown"); + } + + void TearDown() override { + M_.reset(); + } + + Module* Retranslate(LLVMContext& ctx, std::string& err) { + err.clear(); + std::stringstream ss; + writeSpirv(M_.get(), ss, err); + + if (!err.empty()) + return nullptr; + + std::string s_sv_ir = ss.str(); + std::istrstream ir_stream(s_sv_ir.data(), s_sv_ir.size()); + + Module* result = nullptr; + readSpirv(ctx, ir_stream, result, err); + + if (!err.empty()) + return nullptr; + + return result; + } + + LLVMContext Ctx_; + std::unique_ptr M_; + std::set FN_; +}; + +TEST_F(SpirvConvertionsTest, IntrinsicAttrs) { + Type *FArgTy[] = {Type::getInt32PtrTy(Ctx_)}; + FunctionType *FT = FunctionType::get(Type::getVoidTy(Ctx_), FArgTy, false); + Function *F = Function::Create(FT, Function::ExternalLinkage, "", M_.get()); + BasicBlock *BB = BasicBlock::Create(Ctx_, "", F); + + IRBuilder<> Builder(BB); + + for (unsigned id = 
BeginGenXID; id < EndGenXID; ++id) {
+    GenXIntrinsic::ID XID = static_cast<GenXIntrinsic::ID>(id);
+
+    SmallVector Tyss;
+    generateOverloadedTypes(XID, Ctx_, Tyss);
+
+    Function* f = GenXIntrinsic::getGenXDeclaration(M_.get(), XID, Tyss);
+    SmallVector Args;
+    for (Type* ty: f->getFunctionType()->params()) {
+      Value* arg = llvm::Constant::getNullValue(ty);
+      Args.push_back(arg);
+
+      FN_.insert(f->getName().str());
+      /*
+      std::cout << "name: " << f->getName().str() << "\n";
+      Type* aty = arg->getType();
+      std::cout << " param_type: " << ty2s(ty) << ' ' << (void*)ty << "\n";
+      std::cout << " arg_type: " << ty2s(aty) << ' ' << (void*)aty << "\n";
+      */
+    }
+    Builder.CreateCall(f, Args);
+  }
+  llvm::Error merr = M_->materializeAll();
+  if (merr)
+    FAIL() << "materialization of the module resulted in failure: " << merr << "\n";
+
+  std::string err;
+  LLVMContext C;
+  Module* M = Retranslate(C, err);
+  if (!M) {
+    FAIL() << "failure during retranslation: " << err << "\n";
+    return;
+  }
+
+  // M_->dump();
+  // M->dump();
+
+  for (const std::string& fname : FN_) {
+    // std::cout << "processing <" << fname << ">" << "\n";
+    Function* fl = M->getFunction(fname);
+    Function* fr = M_->getFunction(fname);
+
+    if (!fl)
+      FAIL() << "could not find <" << fname << "> in the converted Module\n";
+    if (!fr)
+      FAIL() << "could not find <" << fname << "> in the original Module\n";
+
+    // fl->getAttributes().dump();
+    // fr->getAttributes().dump();
+
+    for (unsigned i = Attribute::None; i < Attribute::EndAttrKinds; ++i) {
+      Attribute::AttrKind att = (Attribute::AttrKind)i;
+      EXPECT_TRUE(fl->hasFnAttribute(att) == fr->hasFnAttribute(att));
+    }
+  }
+}
+
+TEST_F(SpirvConvertionsTest, FunctionAttrs) {
+
+  // TODO: think about how one can test all attributes. Right now the problem
+  // is that I don't know how to differentiate between attributes which require
+  // a value from those that don't.
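+  // Consequently, only attribute kinds that take no associated value are
+  // listed below.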
+  std::map<std::string, Attribute::AttrKind> kinds = {
+    { "Convergent", Attribute::Convergent },
+    { "NoReturn", Attribute::NoReturn },
+    { "NoInline", Attribute::NoInline },
+    { "NoUnwind", Attribute::NoUnwind },
+    { "ReadNone", Attribute::ReadNone },
+    { "SafeStack", Attribute::SafeStack },
+    { "WriteOnly", Attribute::WriteOnly },
+  };
+  for (const auto& k : kinds) {
+    Type *FArgTy[] = {Type::getInt32PtrTy(Ctx_)};
+    FunctionType *FT = FunctionType::get(Type::getVoidTy(Ctx_), FArgTy, false);
+    Function* test_f =
+        Function::Create(FT, Function::ExternalLinkage, k.first, M_.get());
+    for (unsigned i = Attribute::None; i < Attribute::EndAttrKinds; ++i) {
+      if (test_f->hasFnAttribute((Attribute::AttrKind)i)) {
+        test_f->removeFnAttr((Attribute::AttrKind)i);
+      }
+    }
+    test_f->addFnAttr(k.second);
+    BasicBlock *aux_BB = BasicBlock::Create(Ctx_, "", test_f);
+    IRBuilder<> aux_Builder(aux_BB);
+  }
+
+  std::string err;
+  LLVMContext C;
+  Module* M = Retranslate(C, err);
+  if (!M) {
+    FAIL() << "failure during retranslation: " << err << "\n";
+    return;
+  }
+  for (const auto& k : kinds) {
+    Function* fl = M->getFunction(k.first);
+    Function* fr = M_->getFunction(k.first);
+    for (unsigned i = Attribute::None; i < Attribute::EndAttrKinds; ++i) {
+      Attribute::AttrKind att = (Attribute::AttrKind)i;
+      if ((fl->hasFnAttribute(att) != fr->hasFnAttribute(att))) {
+        FAIL() << "Attribute mismatch for <" << k.first << "> at attr:" <<
+          i << " (" << k2s(kinds, att) << ")\n";
+      }
+    }
+  }
+  // M_->dump();
+  // M->dump();
+}
+
+
+} // namespace
diff --git a/IGC/common/igc_flags.def b/IGC/common/igc_flags.def
index e517fbbde644..106b5bb6dd6b 100644
--- a/IGC/common/igc_flags.def
+++ b/IGC/common/igc_flags.def
@@ -407,3 +407,5 @@ DECLARE_IGC_REGKEY(bool, ApplyConservativeRastWAHeader, true, "Apply WaConservat
 DECLARE_IGC_GROUP("OGL Frontend")
     DECLARE_IGC_REGKEY(bool, OGLMinimumDump, false, "Minimum dump for testing - first and last .ll, .cos and compiler output", true)
+DECLARE_IGC_GROUP("VectorCompiler Options")
+    DECLARE_IGC_REGKEY(bool, VCOptimizeNone, false, "Same as -optimize=none in vector compiler options", true)